Module: Excite::TokenFeatures

Included in:: CRFParser

Defined in:: lib/excite/token_features.rb

Defined Under Namespace

Constant Summary collapse

# Directory that contains this source file; used below to locate resources/dicts.
DIR =

File.dirname(__FILE__)

# Dictionary data loaded once by TokenFeatures.read_dict_files from "#{DIR}/resources/dicts".
# dict_status looks tokens up in it by their lcnp text and by their downcased raw text.
DICT =

TokenFeatures.read_dict_files("#{DIR}/resources/dicts")

# Maps a lower-cased HTML tag name to the coarser node type reported by tag_name.
# Several tags are deliberately lumped together (e.g. all headings become 'h').
# Frozen because it is a shared lookup table that is only ever read.
NODE_TYPES_BY_NAME = {
  'div'=>'div',
  'p'=>'p',
  'ul'=>'div', # lump with div - higher-level structure
  'li'=>'li',
  'tr'=>'div', # lump with div - higher-level structure
  'td'=>'td',
  'span'=>'span',
  'font'=>'span',
  'em'=>'em',
  'i'=>'em',
  'strong'=>'strong',
  'b'=>'strong',
  'u'=>'u',
  'h1'=>'h',
  'h2'=>'h',
  'h3'=>'h',
  'h4'=>'h',
  'h5'=>'h',
  'h6'=>'h',
  'a'=>'a',
  'br'=>'br',
  '#document-fragment'=>'unknown' # the actual tag wasn't captured in the fragment we're parsing
}.freeze

Instance Method Summary collapse

Instance Method Details

#capitalization(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 86

# Classifies the capitalization pattern of the token's +np+ text.
#
# @param toks [Array] token list; each token must respond to +np+
# @param idx [Integer] index of the token to classify
# @param author_names [Object, nil] not used by this feature
# @return [String] "singleCap", "InitCap", "AllCap" or "others"
def capitalization(toks, idx, author_names=nil)
  text = toks[idx].np
  # Rules are tried top to bottom; the first matching pattern decides the label.
  rules = [
    [/^[[:upper:]]$/,            "singleCap"],
    [/^[[:upper:]][[:lower:]]+/, "InitCap"],
    [/^[[:upper:]]+$/,           "AllCap"]
  ]
  hit = rules.find { |pattern, _label| pattern === text }
  hit ? hit.last : "others"
end

#clear ⇒ `Object`

# File 'lib/excite/token_features.rb', line 47

# Resets every per-token-list cache held in instance variables so the next
# token list starts from a clean state.
#
# @return [nil]
def clear
  @possible_editor = @possible_chapter = @dict_status = @is_proceeding = nil
end

#dict_status(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 203

# Looks up the dictionary entry for one token, caching the answer per index
# in @dict_status (reset by #clear).
#
# The lookup tries the token's +lcnp+ first, then its downcased +raw+ text,
# and falls back to 0 when neither is in DICT. The value is ANDed with
# DictFlags constants by the *Name features, so it is presumably a bit mask.
#
# @param toks [Array] token list; tokens must respond to +lcnp+ and +raw+
# @param idx [Integer] index of the token to look up
# @param author_names [Object, nil] not used by this feature
# @return [Object] the DICT value for the token, or 0
def dict_status(toks, idx, author_names=nil)
  @dict_status ||= Array.new(toks.length)
  known = @dict_status[idx]
  return known if known

  token = toks[idx]
  @dict_status[idx] = DICT[token.lcnp] || DICT[token.raw.downcase] || 0
end

#first_1_char(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 67

# Returns the first character of the token's raw text.
#
# @param toks [Array] token list; tokens must respond to +raw+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] up to 1 leading character of +raw+
def first_1_char(toks, idx, author_names=nil)
  toks[idx].raw[0,1]
end

#first_2_chars(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 68

# Returns the first two characters of the token's raw text (fewer if it is shorter).
#
# @param toks [Array] token list; tokens must respond to +raw+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] up to 2 leading characters of +raw+
def first_2_chars(toks, idx, author_names=nil)
  toks[idx].raw[0,2]
end

#first_3_chars(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 69

# Returns the first three characters of the token's raw text (fewer if it is shorter).
#
# @param toks [Array] token list; tokens must respond to +raw+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] up to 3 leading characters of +raw+
def first_3_chars(toks, idx, author_names=nil)
  toks[idx].raw[0,3]
end

#first_4_chars(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 70

# Returns the first four characters of the token's raw text (fewer if it is shorter).
#
# @param toks [Array] token list; tokens must respond to +raw+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] up to 4 leading characters of +raw+
def first_4_chars(toks, idx, author_names=nil)
  toks[idx].raw[0,4]
end

#first_5_chars(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 71

# Returns the first five characters of the token's raw text (fewer if it is shorter).
#
# @param toks [Array] token list; tokens must respond to +raw+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] up to 5 leading characters of +raw+
def first_5_chars(toks, idx, author_names=nil)
  toks[idx].raw[0,5]
end

#firstName(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 198

# Reports whether the token looks like a first name.
#
# A token counts as a first name when its +lcnp+ equals the first entry of
# +author_names+ (if given), or when its dictionary status has the
# DictFlags::FIRST_NAME bit set.
#
# @param toks [Array] token list; tokens must respond to +lcnp+
# @param idx [Integer] index of the token
# @param author_names [Array, nil] optional known author name parts
# @return [String] 'firstName' or 'noFirstName'
def firstName(toks, idx, author_names=nil)
  if author_names && author_names.first == toks[idx].lcnp
    'firstName'
  elsif (dict_status(toks, idx) & DictFlags::FIRST_NAME) > 0
    'firstName'
  else
    'noFirstName'
  end
end

#is_in(toks, idx, author_names = nil) ⇒ `Object`

TODO remove duplication with possible_chapter

# File 'lib/excite/token_features.rb', line 143

# Reports whether the token is an "in" that appears to introduce a containing book.
#
# The token must not be the first or last one, its +lcnp+ must be 'in', the
# previous token's part of speech must be one of pp/ppr/ppc/pps, and the next
# token must either have part of speech ppl/ppc/pps or an +np+ that starts
# with an upper-case letter.
#
# @param toks [Array] token list; tokens must respond to +lcnp+, +np+ and +part_of_speech+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] "inBook" or "notInBook"
def is_in(toks, idx, author_names=nil)
  interior = idx > 0 && idx < (toks.length-1)
  return "notInBook" unless interior && toks[idx].lcnp == 'in'
  return "notInBook" unless ['pp','ppr','ppc','pps'].include?(toks[idx-1].part_of_speech)

  following = toks[idx+1]
  if ['ppl','ppc','pps'].include?(following.part_of_speech) || following.np =~ /^[[:upper:]]/
    "inBook"
  else
    "notInBook"
  end
end

#last_1_char(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 73

# Returns the last character of the token's raw text.
#
# @param toks [Array] token list; tokens must respond to +raw+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String, nil] the final character of +raw+; nil when +raw+ is empty
def last_1_char(toks, idx, author_names=nil)
  toks[idx].raw[-1,1]
end

#last_2_chars(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 74

# Returns the last two characters of the token's raw text.
#
# @param toks [Array] token list; tokens must respond to +raw+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] the final 2 characters, or the whole +raw+ when it is shorter
def last_2_chars(toks, idx, author_names=nil)
  # A negative start beyond the string's length yields nil, hence the fallback.
  toks[idx].raw[-2,2] || toks[idx].raw
end

#last_3_chars(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 75

# Returns the last three characters of the token's raw text.
#
# @param toks [Array] token list; tokens must respond to +raw+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] the final 3 characters, or the whole +raw+ when it is shorter
def last_3_chars(toks, idx, author_names=nil)
  # A negative start beyond the string's length yields nil, hence the fallback.
  toks[idx].raw[-3,3] || toks[idx].raw
end

#last_4_chars(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 76

# Returns the last four characters of the token's raw text.
#
# @param toks [Array] token list; tokens must respond to +raw+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] the final 4 characters, or the whole +raw+ when it is shorter
def last_4_chars(toks, idx, author_names=nil)
  # A negative start beyond the string's length yields nil, hence the fallback.
  toks[idx].raw[-4,4] || toks[idx].raw
end

#last_char(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 54

# Generalizes the final character of the token's raw text into a character class.
#
# @param toks [Array] token list; tokens must respond to +raw+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String, Integer, nil] 'a' for a lower-case letter, 'A' for an
#   upper-case letter, the Integer 0 (not a String) for a digit, otherwise the
#   character itself (nil when +raw+ is empty)
def last_char(toks, idx, author_names=nil)
  final = toks[idx].raw[-1,1]
  return 'a' if /[[:lower:]]/ === final
  return 'A' if /[[:upper:]]/ === final
  return 0 if /[0-9]/ === final

  final
end

#lastName(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 193

# Reports whether the token looks like a last name.
#
# A token counts as a last name when its +lcnp+ equals the last entry of
# +author_names+ (if given), or when its dictionary status has the
# DictFlags::LAST_NAME bit set.
#
# @param toks [Array] token list; tokens must respond to +lcnp+
# @param idx [Integer] index of the token
# @param author_names [Array, nil] optional known author name parts
# @return [String] 'lastName' or 'noLastName'
def lastName(toks, idx, author_names=nil)
  if author_names && author_names.last == toks[idx].lcnp
    'lastName'
  elsif (dict_status(toks, idx) & DictFlags::LAST_NAME) > 0
    'lastName'
  else
    'noLastName'
  end
end

#location(toks, idx, author_names = nil) ⇒ `Object`



152
153
154

# File 'lib/excite/token_features.rb', line 152

# Relative position of the token within the whole token list, expressed in
# tenths: idx / toks.length, multiplied by 10 and rounded to an Integer.
#
# @param toks [Array] token list (only its length is used)
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [Integer] rounded position in tenths
def location(toks, idx, author_names=nil)
  ((idx.to_f / toks.length) * 10).round
end

#location_in_node(toks, idx, author_names = nil) ⇒ `Object`



239
240
241

# File 'lib/excite/token_features.rb', line 239

# Relative position of the token within its node, expressed in tenths:
# idx_in_node / node_token_count, multiplied by 10 and rounded to an Integer.
#
# @param toks [Array] token list; tokens must respond to +idx_in_node+ and +node_token_count+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [Integer] rounded position in tenths
def location_in_node(toks, idx, author_names=nil)
  token = toks[idx]
  fraction = token.idx_in_node.to_f / token.node_token_count
  (fraction * 10).round
end

#monthName(toks, idx, author_names = nil) ⇒ `Object`



189
190
191

# File 'lib/excite/token_features.rb', line 189

# Reports whether the token's dictionary status has the DictFlags::MONTH_NAME bit set.
#
# @param toks [Array] token list, passed through to dict_status
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] 'monthName' or 'noMonthName'
def monthName(toks, idx, author_names=nil)
  flagged = (dict_status(toks, idx) & DictFlags::MONTH_NAME) > 0
  if flagged
    'monthName'
  else
    'noMonthName'
  end
end

#numbers(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 99

# Classifies the numeric content of a token.
#
# Checks run in a fixed order and the first match wins: a digit-hyphen-digit
# sequence in +raw+ (page range), a 19xx/20xx year in +raw+ or +np+, then
# digit-count buckets, ordinals and "contains any digit", all tested on +np+.
#
# @param toks [Array] token list; tokens must respond to +raw+ and +np+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] one of "possiblePage", "year", "1dig", "2dig", "3dig",
#   "4+dig", "ordinal", "hasDig", "nonNum"
def numbers(toks, idx, author_names=nil)
  token = toks[idx]
  raw = token.raw
  np = token.np

  if raw =~ /[0-9]\-[0-9]/
    "possiblePage"
  elsif raw =~ /^\D*(19|20)[0-9][0-9]\D*$/ || np =~ /^(19|20)[0-9][0-9]$/
    "year"
  elsif np =~ /^[0-9]$/
    "1dig"
  elsif np =~ /^[0-9][0-9]$/
    "2dig"
  elsif np =~ /^[0-9][0-9][0-9]$/
    "3dig"
  elsif np =~ /^[0-9]+$/
    # One to three digits were handled above, so this is four or more.
    "4+dig"
  elsif np =~ /^[0-9]+(th|st|nd|rd)$/
    "ordinal"
  elsif np =~ /[0-9]/
    "hasDig"
  else
    "nonNum"
  end
end

#part_of_speech(toks, idx, author_names = nil) ⇒ `Object`



243
244
245

# File 'lib/excite/token_features.rb', line 243

# Exposes the token's own +part_of_speech+ value as a feature.
#
# @param toks [Array] token list; tokens must respond to +part_of_speech+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [Object] whatever the token reports as its part of speech
def part_of_speech(toks, idx, author_names=nil)
  token = toks[idx]
  token.part_of_speech
end

#placeName(toks, idx, author_names = nil) ⇒ `Object`



185
186
187

# File 'lib/excite/token_features.rb', line 185

# Reports whether the token's dictionary status has the DictFlags::PLACE_NAME bit set.
#
# @param toks [Array] token list, passed through to dict_status
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] 'placeName' or 'noPlaceName'
def placeName(toks, idx, author_names=nil)
  flagged = (dict_status(toks, idx) & DictFlags::PLACE_NAME) > 0
  if flagged
    'placeName'
  else
    'noPlaceName'
  end
end

#possible_chapter(toks, idx = nil, author_names = nil) ⇒ `Object`

If there is a possible editor entry and “IN” is preceded by punctuation, this citation may be a book chapter.

ignores idx

# File 'lib/excite/token_features.rb', line 126

# Reports whether the token list as a whole looks like a book-chapter citation.
#
# A chapter is suspected when some interior token has +lcnp+ 'in', the token
# before it has part of speech pp/ppr/ppc/pps, and either the list has a
# possible editor (see possible_editor) or the token after it has part of
# speech ppl/ppc/pps.
#
# The answer is the same for every token, so it is cached in @possible_chapter
# until #clear resets it; without the cache the whole list would be rescanned
# on every call.
#
# @param toks [Array] token list; tokens must respond to +lcnp+ and +part_of_speech+
# @param idx [Integer, nil] ignored
# @param author_names [Object, nil] not used by this feature
# @return [String] "possibleChapter" or "noChapter"
def possible_chapter(toks, idx=nil, author_names=nil)
  return @possible_chapter unless @possible_chapter.nil?

  has_editor = possible_editor(toks) == 'possibleEditors'
  last_index = toks.length - 1
  has_chapter = toks.each_with_index.any? do |t, i|
    # Only an interior "in" can have both a preceding and a following token.
    next false unless i > 0 && i < last_index && t.lcnp == 'in'

    prev_is_separator = ['pp','ppr','ppc','pps'].include?(toks[i-1].part_of_speech)
    next_is_separator = ['ppl','ppc','pps'].include?(toks[i+1].part_of_speech)
    prev_is_separator && (has_editor || next_is_separator)
  end
  @possible_chapter = has_chapter ? "possibleChapter" : "noChapter"
end

#possible_editor(toks, idx = nil, author_names = nil) ⇒ `Object`

ignores idx

# File 'lib/excite/token_features.rb', line 112

# Reports whether any token in the list is an editor marker
# (ed, editor, editors, eds, edited — compared against each token's +lcnp+).
#
# The answer is cached in @possible_editor until #clear resets it.
#
# @param toks [Array] token list; tokens must respond to +lcnp+
# @param idx [Integer, nil] ignored
# @param author_names [Object, nil] not used by this feature
# @return [String] "possibleEditors" or "noEditors"
def possible_editor(toks, idx=nil, author_names=nil)
  return @possible_editor unless @possible_editor.nil?

  markers = %w(ed editor editors eds edited)
  mentions_editor = toks.any? { |t| markers.include?(t.lcnp) }
  @possible_editor = mentions_editor ? "possibleEditors" : "noEditors"
end

#possible_volume(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 163

# Classifies the token as a volume number, an issue number, or neither, by
# consulting the possible_vol_* / possible_issue_* helpers on this token and
# on up to three preceding positions. Checks run in a fixed order and the
# first one that succeeds decides the label.
#
# NOTE(review): idx-1 .. idx-3 become negative near the start of the list;
# whether that wraps around to the end depends on how the helpers treat
# negative indices — confirm.
#
# @param toks [Array] token list, passed through to the helpers
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] 'volume', 'issue' or 'noVolume'
def possible_volume(toks, idx, author_names=nil)
  # Explicit volume marker on this token.
  return 'volume' if possible_vol_with_str(toks, idx)
  # Issue directly after a volume marker, or one further token along.
  return 'issue' if possible_vol_with_str(toks, idx-1) && possible_issue_with_str(toks, idx)
  return 'issue' if possible_vol_with_str(toks, idx-2) && possible_issue_with_str(toks, idx-1) && possible_issue_with_str(toks, idx)
  # Parenthesised form: the volume itself, then up to three following tokens count as issue.
  return 'volume' if possible_vol_with_parens(toks, idx)
  return 'issue' if [1, 2, 3].any? { |back| possible_vol_with_parens(toks, idx - back) }
  # Colon form.
  return 'volume' if possible_vol_with_colon(toks, idx)

  'noVolume'
end

#publisherName(toks, idx, author_names = nil) ⇒ `Object`



181
182
183

# File 'lib/excite/token_features.rb', line 181

# Reports whether the token's dictionary status has the DictFlags::PUBLISHER_NAME bit set.
#
# @param toks [Array] token list, passed through to dict_status
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] 'publisherName' or 'noPublisherName'
def publisherName(toks, idx, author_names=nil)
  flagged = (dict_status(toks, idx) & DictFlags::PUBLISHER_NAME) > 0
  if flagged
    'publisherName'
  else
    'noPublisherName'
  end
end

#punct(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 156

# Classifies the punctuation pattern of a token.
#
# Checks run in order and the first match wins: two or more hyphens in +raw+,
# a letter followed eventually by a trailing hyphen, a letter followed
# eventually by a trailing period, then any difference between +np+ and +raw+.
#
# @param toks [Array] token list; tokens must respond to +raw+ and +np+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] "multiHyphen", "truncated", "abbrev", "hasPunct" or "others"
def punct(toks, idx, author_names=nil)
  token = toks[idx]
  raw = token.raw

  if raw =~ /\-.*\-/
    "multiHyphen"
  elsif raw =~ /[[:alpha:]].*\-$/
    "truncated"
  elsif raw =~ /[[:alpha:]].*\.$/
    "abbrev"
  elsif token.np != token.raw
    "hasPunct"
  else
    "others"
  end
end

#tag_name(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 233

# Maps the token's enclosing markup node to one of the coarse node types in
# NODE_TYPES_BY_NAME, or 'other' when the tag is not listed there.
#
# For a text node the parent's name is used; otherwise the node's own name.
# The name is downcased before the lookup.
#
# @param toks [Array] token list; tokens must respond to +node+, and the node
#   to +text?+, +name+ and +parent+ (presumably a Nokogiri node — confirm)
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] the mapped node type, or 'other'
def tag_name(toks, idx, author_names=nil)
  node = toks[idx].node
  element = node.text? ? node.parent : node
  NODE_TYPES_BY_NAME.fetch(element.name.downcase) { 'other' }
end

#toklcnp(toks, idx, author_names = nil) ⇒ `Object`

# File 'lib/excite/token_features.rb', line 78

# Returns the token's +lcnp+ text, substituting the placeholder "EMPTY" when
# that text is blank (as decided by +blank?+).
#
# @param toks [Array] token list; tokens must respond to +lcnp+
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this feature
# @return [String] the +lcnp+ value or "EMPTY"
def toklcnp(toks, idx, author_names=nil)
  text = toks[idx].lcnp
  text.blank? ? "EMPTY" : text
end

Module: Excite::TokenFeatures

Defined Under Namespace

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#capitalization(toks, idx, author_names = nil) ⇒ Object

#clear ⇒ Object

#dict_status(toks, idx, author_names = nil) ⇒ Object

#first_1_char(toks, idx, author_names = nil) ⇒ Object

#first_2_chars(toks, idx, author_names = nil) ⇒ Object

#first_3_chars(toks, idx, author_names = nil) ⇒ Object

#first_4_chars(toks, idx, author_names = nil) ⇒ Object

#first_5_chars(toks, idx, author_names = nil) ⇒ Object

#firstName(toks, idx, author_names = nil) ⇒ Object

#is_in(toks, idx, author_names = nil) ⇒ Object

#last_1_char(toks, idx, author_names = nil) ⇒ Object

#last_2_chars(toks, idx, author_names = nil) ⇒ Object

#last_3_chars(toks, idx, author_names = nil) ⇒ Object

#last_4_chars(toks, idx, author_names = nil) ⇒ Object

#last_char(toks, idx, author_names = nil) ⇒ Object

#lastName(toks, idx, author_names = nil) ⇒ Object

#location(toks, idx, author_names = nil) ⇒ Object

#location_in_node(toks, idx, author_names = nil) ⇒ Object

#monthName(toks, idx, author_names = nil) ⇒ Object

#numbers(toks, idx, author_names = nil) ⇒ Object

#part_of_speech(toks, idx, author_names = nil) ⇒ Object

#placeName(toks, idx, author_names = nil) ⇒ Object

#possible_chapter(toks, idx = nil, author_names = nil) ⇒ Object

#possible_editor(toks, idx = nil, author_names = nil) ⇒ Object

#possible_volume(toks, idx, author_names = nil) ⇒ Object

#publisherName(toks, idx, author_names = nil) ⇒ Object

#punct(toks, idx, author_names = nil) ⇒ Object

#tag_name(toks, idx, author_names = nil) ⇒ Object

#toklcnp(toks, idx, author_names = nil) ⇒ Object