Module: Excite::TokenFeatures

Included in:
CRFParser
Defined in:
lib/excite/token_features.rb

Defined Under Namespace

Modules: DictFlags

Constant Summary collapse

# Directory that contains this source file; used below to locate the bundled dictionaries.
DIR =
File.dirname(__FILE__)
# Dictionary data read from "#{DIR}/resources/dicts" by TokenFeatures.read_dict_files.
# dict_status looks tokens up here by their lcnp form, then by their downcased raw form,
# and the *Name feature methods bit-test the result against DictFlags constants.
# NOTE(review): presumably a Hash of String => Integer bit flags - confirm in read_dict_files.
DICT =
TokenFeatures.read_dict_files("#{DIR}/resources/dicts")
# Maps a lowercased node name to the coarser node type reported by tag_name.
# Names missing from this table are reported by tag_name as 'other'.
NODE_TYPES_BY_NAME =
{
  'div'=>'div',
  'p'=>'p',
  'ul'=>'div', # lump with div - higher-level structure
  'li'=>'li',
  'tr'=>'div', # lump with div - higher-level structure
  'td'=>'td',
  'span'=>'span',
  'font'=>'span',
  'em'=>'em',
  'i'=>'em',
  'strong'=>'strong',
  'b'=>'strong',
  'u'=>'u',
  'h1'=>'h',
  'h2'=>'h',
  'h3'=>'h',
  'h4'=>'h',
  'h5'=>'h',
  'h6'=>'h',
  'a'=>'a',
  'br'=>'br',
  '#document-fragment'=>'unknown' # the actual tag wasn't captured in the fragment we're parsing
}

Instance Method Summary collapse

Instance Method Details

#capitalization(toks, idx, author_names = nil) ⇒ Object



86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/excite/token_features.rb', line 86

# Classifies the capitalization pattern of the token's +np+ form.
#
# toks         - indexable collection of tokens responding to +np+
# idx          - position of the token to classify
# author_names - unused by this method
#
# Returns "singleCap" (exactly one uppercase letter), "InitCap" (an uppercase
# letter followed by at least one lowercase letter), "AllCap" (only uppercase
# letters) or "others".
def capitalization(toks, idx, author_names=nil)
  word = toks[idx].np
  # Regexp#=== keeps the case/when semantics: a nil or non-matching word simply falls through.
  return "singleCap" if /^[[:upper:]]$/ === word
  return "InitCap" if /^[[:upper:]][[:lower:]]+/ === word
  return "AllCap" if /^[[:upper:]]+$/ === word
  "others"
end

#clear ⇒ Object



47
48
49
50
51
52
# File 'lib/excite/token_features.rb', line 47

# Resets every memoized value held in instance variables (editor flag,
# chapter flag, dictionary lookups and the proceeding flag) back to nil.
#
# Returns nil.
def clear
  @possible_editor = @possible_chapter = @dict_status = @is_proceeding = nil
end

#dict_status(toks, idx, author_names = nil) ⇒ Object



203
204
205
206
# File 'lib/excite/token_features.rb', line 203

# Returns the DICT entry for the token at +idx+, caching one result per index
# in @dict_status.
#
# The lookup tries the token's +lcnp+ form first, then its downcased +raw+
# form, and falls back to 0 when neither is present in DICT.
#
# toks         - indexable collection of tokens responding to +lcnp+ and +raw+
# idx          - position of the token
# author_names - unused by this method
def dict_status(toks, idx, author_names=nil)
  @dict_status ||= Array.new(toks.length)
  cached = @dict_status[idx]
  return cached if cached

  tok = toks[idx]
  @dict_status[idx] = DICT[tok.lcnp] || DICT[tok.raw.downcase] || 0
end

#first_1_char(toks, idx, author_names = nil) ⇒ Object



67
# File 'lib/excite/token_features.rb', line 67

# Returns the first character of the token's +raw+ text.
# author_names is unused by this method.
def first_1_char(toks, idx, author_names=nil)
  toks[idx].raw.slice(0, 1)
end

#first_2_chars(toks, idx, author_names = nil) ⇒ Object



68
# File 'lib/excite/token_features.rb', line 68

# Returns up to the first two characters of the token's +raw+ text.
# author_names is unused by this method.
def first_2_chars(toks, idx, author_names=nil)
  toks[idx].raw.slice(0, 2)
end

#first_3_chars(toks, idx, author_names = nil) ⇒ Object



69
# File 'lib/excite/token_features.rb', line 69

# Returns up to the first three characters of the token's +raw+ text.
# author_names is unused by this method.
def first_3_chars(toks, idx, author_names=nil)
  toks[idx].raw.slice(0, 3)
end

#first_4_chars(toks, idx, author_names = nil) ⇒ Object



70
# File 'lib/excite/token_features.rb', line 70

# Returns up to the first four characters of the token's +raw+ text.
# author_names is unused by this method.
def first_4_chars(toks, idx, author_names=nil)
  toks[idx].raw.slice(0, 4)
end

#first_5_chars(toks, idx, author_names = nil) ⇒ Object



71
# File 'lib/excite/token_features.rb', line 71

# Returns up to the first five characters of the token's +raw+ text.
# author_names is unused by this method.
def first_5_chars(toks, idx, author_names=nil)
  toks[idx].raw.slice(0, 5)
end

#firstName(toks, idx, author_names = nil) ⇒ Object



198
199
200
201
# File 'lib/excite/token_features.rb', line 198

# Reports whether the token looks like a first name.
#
# The token counts as a first name when +author_names+ is given and its first
# element equals the token's +lcnp+ form, or when the token's dictionary
# status has the DictFlags::FIRST_NAME bit set.
#
# Returns 'firstName' or 'noFirstName'.
def firstName(toks, idx, author_names=nil)
  if author_names && author_names.first == toks[idx].lcnp
    'firstName'
  elsif (dict_status(toks, idx) & DictFlags::FIRST_NAME) > 0
    'firstName'
  else
    'noFirstName'
  end
end

#is_in(toks, idx, author_names = nil) ⇒ Object

TODO remove duplication with possible_chapter



143
144
145
146
147
148
149
150
# File 'lib/excite/token_features.rb', line 143

# Reports whether the token at +idx+ is an "in" that appears to introduce a
# containing work.
#
# The token qualifies when it is neither the first nor the last token, its
# +lcnp+ form is 'in', the previous token's part of speech is one of
# pp/ppr/ppc/pps, and the next token either has a part of speech of
# ppl/ppc/pps or an +np+ form starting with an uppercase letter.
#
# author_names is unused by this method.
#
# Returns "inBook" or "notInBook".
def is_in(toks, idx, author_names=nil)
  return "notInBook" unless idx > 0 && idx < (toks.length-1)
  return "notInBook" unless toks[idx].lcnp == 'in'

  before = toks[idx-1]
  after = toks[idx+1]
  return "notInBook" unless ['pp','ppr','ppc','pps'].include?(before.part_of_speech)

  introduces = ['ppl','ppc','pps'].include?(after.part_of_speech) || after.np =~ /^[[:upper:]]/
  introduces ? "inBook" : "notInBook"
end

#last_1_char(toks, idx, author_names = nil) ⇒ Object



73
# File 'lib/excite/token_features.rb', line 73

# Returns the last character of the token's +raw+ text.
#
# For an empty +raw+ the slice is nil, so the method falls back to +raw+
# itself (the empty string). This matches last_2_chars..last_4_chars, which
# use the same fallback, instead of returning nil.
#
# author_names is unused by this method.
def last_1_char(toks, idx, author_names=nil)
  text = toks[idx].raw
  text[-1,1] || text
end

#last_2_chars(toks, idx, author_names = nil) ⇒ Object



74
# File 'lib/excite/token_features.rb', line 74

# Returns the last two characters of the token's +raw+ text, or the whole
# +raw+ text when the two-character slice is nil (text shorter than two).
# author_names is unused by this method.
def last_2_chars(toks, idx, author_names=nil)
  text = toks[idx].raw
  text.slice(-2, 2) || text
end

#last_3_chars(toks, idx, author_names = nil) ⇒ Object



75
# File 'lib/excite/token_features.rb', line 75

# Returns the last three characters of the token's +raw+ text, or the whole
# +raw+ text when the three-character slice is nil (text shorter than three).
# author_names is unused by this method.
def last_3_chars(toks, idx, author_names=nil)
  text = toks[idx].raw
  text.slice(-3, 3) || text
end

#last_4_chars(toks, idx, author_names = nil) ⇒ Object



76
# File 'lib/excite/token_features.rb', line 76

# Returns the last four characters of the token's +raw+ text, or the whole
# +raw+ text when the four-character slice is nil (text shorter than four).
# author_names is unused by this method.
def last_4_chars(toks, idx, author_names=nil)
  text = toks[idx].raw
  text.slice(-4, 4) || text
end

#last_char(toks, idx, author_names = nil) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/excite/token_features.rb', line 54

# Classifies the final character of the token's +raw+ text.
#
# Returns 'a' for a lowercase letter, 'A' for an uppercase letter, the
# Integer 0 (not a String) for an ASCII digit, and otherwise the character
# itself (nil when +raw+ is empty).
#
# author_names is unused by this method.
def last_char(toks, idx, author_names=nil)
  final = toks[idx].raw[-1,1]
  return 'a' if /[[:lower:]]/ === final
  return 'A' if /[[:upper:]]/ === final
  return 0 if /[0-9]/ === final
  final
end

#lastName(toks, idx, author_names = nil) ⇒ Object



193
194
195
196
# File 'lib/excite/token_features.rb', line 193

# Reports whether the token looks like a last name.
#
# The token counts as a last name when +author_names+ is given and its last
# element equals the token's +lcnp+ form, or when the token's dictionary
# status has the DictFlags::LAST_NAME bit set.
#
# Returns 'lastName' or 'noLastName'.
def lastName(toks, idx, author_names=nil)
  if author_names && author_names.last == toks[idx].lcnp
    'lastName'
  elsif (dict_status(toks, idx) & DictFlags::LAST_NAME) > 0
    'lastName'
  else
    'noLastName'
  end
end

#location(toks, idx, author_names = nil) ⇒ Object



152
153
154
# File 'lib/excite/token_features.rb', line 152

# Returns the token's relative position in +toks+ scaled to tenths:
# idx divided by the number of tokens, times ten, rounded to an Integer.
#
# toks         - collection responding to +length+
# idx          - position of the token
# author_names - unused by this method
def location(toks, idx, author_names=nil)
  ((idx.to_f / toks.length) * 10).round
end

#location_in_node(toks, idx, author_names = nil) ⇒ Object



239
240
241
# File 'lib/excite/token_features.rb', line 239

# Returns the token's relative position inside its node scaled to tenths:
# the token's +idx_in_node+ divided by its +node_token_count+, times ten,
# rounded to an Integer.
#
# author_names is unused by this method.
def location_in_node(toks, idx, author_names=nil)
  tok = toks[idx]
  fraction = tok.idx_in_node.to_f / tok.node_token_count
  (fraction * 10).round
end

#monthName(toks, idx, author_names = nil) ⇒ Object



189
190
191
# File 'lib/excite/token_features.rb', line 189

# Reports whether the token's dictionary status has the
# DictFlags::MONTH_NAME bit set.
#
# author_names is unused by this method.
#
# Returns 'monthName' or 'noMonthName'.
def monthName(toks, idx, author_names=nil)
  flags = dict_status(toks, idx)
  if (flags & DictFlags::MONTH_NAME) > 0
    'monthName'
  else
    'noMonthName'
  end
end

#numbers(toks, idx, author_names = nil) ⇒ Object



99
100
101
102
103
104
105
106
107
108
109
# File 'lib/excite/token_features.rb', line 99

# Classifies the numeric content of a token. Checks run in order and the
# first match wins:
#
#   raw has digit-hyphen-digit                   => "possiblePage"
#   raw is 19xx/20xx surrounded by non-digits    => "year"
#   np is exactly 19xx/20xx                      => "year"
#   np is one, two or three digits               => "1dig", "2dig", "3dig"
#   np is any other all-digit string             => "4+dig"
#   np is digits followed by th/st/nd/rd         => "ordinal"
#   np contains a digit anywhere                 => "hasDig"
#   otherwise                                    => "nonNum"
#
# author_names is unused by this method.
def numbers(toks, idx, author_names=nil)
  tok = toks[idx]
  if tok.raw =~ /[0-9]\-[0-9]/
    "possiblePage"
  elsif tok.raw =~ /^\D*(19|20)[0-9][0-9]\D*$/
    "year"
  else
    case tok.np
    when /^(19|20)[0-9][0-9]$/   then "year"
    when /^[0-9]$/               then "1dig"
    when /^[0-9][0-9]$/          then "2dig"
    when /^[0-9][0-9][0-9]$/     then "3dig"
    when /^[0-9]+$/              then "4+dig"
    when /^[0-9]+(th|st|nd|rd)$/ then "ordinal"
    when /[0-9]/                 then "hasDig"
    else "nonNum"
    end
  end
end

#part_of_speech(toks, idx, author_names = nil) ⇒ Object



243
244
245
# File 'lib/excite/token_features.rb', line 243

# Returns the +part_of_speech+ value of the token at +idx+, unchanged.
# author_names is unused by this method.
def part_of_speech(toks, idx, author_names=nil)
  tok = toks[idx]
  tok.part_of_speech
end

#placeName(toks, idx, author_names = nil) ⇒ Object



185
186
187
# File 'lib/excite/token_features.rb', line 185

# Reports whether the token's dictionary status has the
# DictFlags::PLACE_NAME bit set.
#
# author_names is unused by this method.
#
# Returns 'placeName' or 'noPlaceName'.
def placeName(toks, idx, author_names=nil)
  flags = dict_status(toks, idx)
  if (flags & DictFlags::PLACE_NAME) > 0
    'placeName'
  else
    'noPlaceName'
  end
end

#possible_chapter(toks, idx = nil, author_names = nil) ⇒ Object

If there is a possible editor entry and an “in” preceded by punctuation, this citation may be a book chapter.

ignores idx



126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/excite/token_features.rb', line 126

# Reports whether the token sequence as a whole looks like a book chapter.
#
# A chapter is assumed when some token that is neither first nor last has the
# +lcnp+ form 'in', its previous token's part of speech is one of
# pp/ppr/ppc/pps, and either possible_editor reports 'possibleEditors' or the
# next token's part of speech is one of ppl/ppc/pps.
#
# The answer is memoized in @possible_chapter so the scan over all tokens
# runs once per token sequence rather than once per call. The cached value is
# returned until the instance variable is reset to nil, which +clear+ does.
#
# toks         - indexable collection of tokens responding to +lcnp+ and
#                +part_of_speech+
# idx          - ignored
# author_names - ignored
#
# Returns "possibleChapter" or "noChapter".
def possible_chapter(toks, idx=nil, author_names=nil)
  return @possible_chapter unless @possible_chapter.nil?

  has_editor = possible_editor(toks) == 'possibleEditors'
  last_idx = toks.length - 1
  has_chapter = toks.each_with_index.any? do |t, i|
    # Only an interior "in" token can have both a previous and a next neighbour.
    next false unless i > 0 && i < last_idx && t.lcnp == 'in'

    prev_is_separator = ['pp','ppr','ppc','pps'].include?(toks[i-1].part_of_speech)
    next_is_separator = ['ppl','ppc','pps'].include?(toks[i+1].part_of_speech)
    prev_is_separator && (has_editor || next_is_separator)
  end
  @possible_chapter = has_chapter ? "possibleChapter" : "noChapter"
end

#possible_editor(toks, idx = nil, author_names = nil) ⇒ Object

ignores idx



112
113
114
115
116
117
118
119
120
# File 'lib/excite/token_features.rb', line 112

# Reports whether any token's +lcnp+ form is one of the editor keywords
# (ed, editor, editors, eds, edited).
#
# The answer is memoized in @possible_editor and reused until that instance
# variable is nil again.
#
# toks         - collection of tokens responding to +lcnp+
# idx          - ignored
# author_names - ignored
#
# Returns "possibleEditors" or "noEditors".
def possible_editor(toks, idx=nil, author_names=nil)
  return @possible_editor unless @possible_editor.nil?

  editor_words = %w(ed editor editors eds edited)
  mentions_editor = toks.any? { |t| editor_words.include?(t.lcnp) }
  @possible_editor = mentions_editor ? "possibleEditors" : "noEditors"
end

#possible_volume(toks, idx, author_names = nil) ⇒ Object



163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# File 'lib/excite/token_features.rb', line 163

# Labels the token at +idx+ as 'volume', 'issue' or 'noVolume' by consulting
# the possible_vol_with_str, possible_issue_with_str, possible_vol_with_parens
# and possible_vol_with_colon helpers (defined elsewhere) on this token and on
# up to three preceding positions. The checks run in the order written and
# the first one that succeeds decides the label.
#
# NOTE(review): idx-1..idx-3 may be negative near the start of +toks+; this
# method passes them through unchanged - confirm the helpers handle that.
#
# author_names is unused by this method.
def possible_volume(toks, idx, author_names=nil)
  return 'volume' if possible_vol_with_str(toks, idx)
  return 'issue' if possible_vol_with_str(toks, idx-1) && possible_issue_with_str(toks, idx)

  if possible_vol_with_str(toks, idx-2) && possible_issue_with_str(toks, idx-1) && possible_issue_with_str(toks, idx)
    return 'issue'
  end

  return 'volume' if possible_vol_with_parens(toks, idx)
  return 'issue' if (1..3).any? { |back| possible_vol_with_parens(toks, idx-back) }
  return 'volume' if possible_vol_with_colon(toks, idx)

  'noVolume'
end

#publisherName(toks, idx, author_names = nil) ⇒ Object



181
182
183
# File 'lib/excite/token_features.rb', line 181

# Reports whether the token's dictionary status has the
# DictFlags::PUBLISHER_NAME bit set.
#
# author_names is unused by this method.
#
# Returns 'publisherName' or 'noPublisherName'.
def publisherName(toks, idx, author_names=nil)
  flags = dict_status(toks, idx)
  if (flags & DictFlags::PUBLISHER_NAME) > 0
    'publisherName'
  else
    'noPublisherName'
  end
end

#punct(toks, idx, author_names = nil) ⇒ Object



156
157
158
159
160
161
# File 'lib/excite/token_features.rb', line 156

# Classifies the punctuation of a token. Checks run in order and the first
# match wins:
#
#   raw contains two hyphens                         => "multiHyphen"
#   raw has a letter and ends with a hyphen          => "truncated"
#   raw has a letter and ends with a period          => "abbrev"
#   np differs from raw                              => "hasPunct"
#   otherwise                                        => "others"
#
# author_names is unused by this method.
def punct(toks, idx, author_names=nil)
  tok = toks[idx]
  text = tok.raw
  if text =~ /\-.*\-/
    "multiHyphen"
  elsif text =~ /[[:alpha:]].*\-$/
    "truncated"
  elsif text =~ /[[:alpha:]].*\.$/
    "abbrev"
  elsif tok.np != text
    "hasPunct"
  else
    "others"
  end
end

#tag_name(toks, idx, author_names = nil) ⇒ Object



233
234
235
236
237
# File 'lib/excite/token_features.rb', line 233

# Returns the coarse node type for the token at +idx+.
#
# When the token's node answers true to +text?+, the parent's name is used;
# otherwise the node's own name. The name is downcased and looked up in
# NODE_TYPES_BY_NAME, with 'other' returned for names missing from the table.
#
# author_names is unused by this method.
def tag_name(toks, idx, author_names=nil)
  node = toks[idx].node
  element = node.text? ? node.parent : node
  node_type = NODE_TYPES_BY_NAME[element.name.downcase]
  node_type || 'other'
end

#toklcnp(toks, idx, author_names = nil) ⇒ Object



78
79
80
81
82
83
84
# File 'lib/excite/token_features.rb', line 78

# Returns the token's +lcnp+ form, or the placeholder "EMPTY" when that form
# answers true to +blank?+.
#
# author_names is unused by this method.
def toklcnp(toks, idx, author_names=nil)
  word = toks[idx].lcnp
  word.blank? ? "EMPTY" : word
end