Class: NGramPrefixDictionary

Inherits:

NER

Object
NER
NGramPrefixDictionary

show all

Defined in: lib/rbbt/ner/ngram_prefix_dictionary.rb

Overview

This code was adapted from Ashish Tendulkar (ASK MARTIN)

Constant Summary collapse

STOP_LETTERS =

%w(\' " ( ) { } [ ] - ? ! < ; : > . ,)

STOP_LETTER_CHAR_VALUES =

STOP_LETTERS.collect{|l| l[0]} + ["\n", "\r", " "].collect{|l| l[0]}

LETTER_REGEXP =

Regexp.compile(/[#{Regexp.quote((STOP_LETTERS + ["\n", "\r", " "]) * "")}]/)

Instance Attribute Summary collapse

#case_insensitive ⇒ Object

Returns the value of attribute case_insensitive.
#index ⇒ Object

Returns the value of attribute index.
#type ⇒ Object

Returns the value of attribute type.

Class Method Summary collapse

Instance Method Summary collapse

#initialize(file, type = nil, case_insensitive = false) ⇒ NGramPrefixDictionary constructor

A new instance of NGramPrefixDictionary.
#match(text) ⇒ Object

Methods inherited from NER

#entities, #extract

Constructor Details

#initialize(file, type = nil, case_insensitive = false) ⇒ `NGramPrefixDictionary`

Returns a new instance of NGramPrefixDictionary.

# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 128

# Builds the n-gram prefix index from a lexicon and stores it in @index.
#
# The lexicon may be given as a TSV or Hash (indexed through process_hash),
# or as a Path, a filename or a StreamIO (indexed through process_stream).
#
# @param file [TSV, Hash, Path, String, StreamIO] the lexicon source
# @param type [Object, nil] stored in @type
# @param case_insensitive [Boolean] stored in @case_insensitive and forwarded
#   to the indexing method, which downcases names when it is true
# @raise [RuntimeError] when +file+ is none of the supported kinds
def initialize(file, type = nil, case_insensitive = false)
  @type = type
  @case_insensitive = case_insensitive
  case
  when TSV === file || Hash === file
    Log.debug("Ngram Prefix Dictionary. Loading of lexicon hash started.")
    @index = NGramPrefixDictionary.process_hash(file, case_insensitive)
  when Path === file
    Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
    @index = NGramPrefixDictionary.process_stream(file.open, case_insensitive)
  when Misc.is_filename?(file)
    Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
    # The flag must be forwarded here as in every other branch; without it a
    # lexicon given by filename was always indexed case-sensitively.
    @index = NGramPrefixDictionary.process_stream(Open.open(file), case_insensitive)
  when StreamIO === file
    Log.debug("Ngram Prefix Dictionary. Loading of lexicon stream started.")
    @index = NGramPrefixDictionary.process_stream(file, case_insensitive)
  else
    raise "Format of lexicon not understood: #{file.inspect}"
  end

  Log.debug("Ngram Prefix Dictionary. Loading done.")
end

Instance Attribute Details

#case_insensitive ⇒ `Object`

Returns the value of attribute case_insensitive.



127
128
129

# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 127

# Returns @case_insensitive, the flag assigned from the constructor's
# case_insensitive argument.
def case_insensitive
  @case_insensitive
end

#index ⇒ `Object`

Returns the value of attribute index.



127
128
129

# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 127

# Returns @index, the n-gram index the constructor builds with
# process_hash or process_stream.
def index
  @index
end

#type ⇒ `Object`

Returns the value of attribute type.



127
128
129

# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 127

# Returns @type, the value assigned from the constructor's type argument.
def type
  @type
end

Class Method Details

.match(index, text) ⇒ `Object`

# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 84

# Scans +text+ from left to right and collects the indexed names found in it.
#
# Positions holding a character listed in STOP_LETTER_CHAR_VALUES are skipped.
# At any other position the next three characters (stripped) are looked up in
# +index+; the first entry under that n-gram for which fast_start_with
# succeeds is recorded and the scan resumes right after the matched name.
# When nothing matches, the scan jumps to just past the next character
# matched by LETTER_REGEXP, or stops if there is none.
#
# @param index [Hash] n-gram => list of [name, code] pairs
# @param text [String, nil] text to scan
# @return [Array<Array>] one [name, code, character_offset] triple per match;
#   empty when +text+ is nil or empty
def self.match(index, text)
  return [] if text.nil? || text.empty?

  matches = []

  text_offset = 0
  text_length = text.length
  while text_offset && text_offset < text_length
    if STOP_LETTER_CHAR_VALUES.include? text[text_offset]
      text_offset += 1
      next
    end
    ngram = text.slice(text_offset, 3).strip
    # fast_start_with takes a byte offset, while text_offset counts characters.
    text_byte_offset = text_offset == 0 ? 0 : text[0..text_offset-1].bytesize

    found = nil
    if index.include? ngram
      diff = text_length - text_offset
      # Match with entries
      index[ngram].each do |name, code|
        # A zero-length name, if matched, would leave text_offset unchanged
        # and the loop would never terminate.
        next if name.empty?
        if name.length <= diff && fast_start_with(text, name, text_byte_offset)
          found = [name.dup, code, text_offset]
          break
        end
      end
    end

    if found.nil?
      text_offset = text.index(LETTER_REGEXP, text_offset)
      text_offset += 1 unless text_offset.nil?
    else
      matches << found
      text_offset += found.first.length
    end
  end

  matches
end

.process_hash(hash, case_insensitive = false) ⇒ `Object`

# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 64

# Builds the n-gram index from a code => names mapping.
#
# Each name is filed under the stripped form of its first three characters.
# Nil and empty names are skipped, as process_stream already does for empty
# fields, and a scalar value is treated as a single name.
#
# @param hash [#each, #through] mapping of code to a name or a list of names;
#   iterated with #through when available, otherwise with #each
# @param case_insensitive [Boolean] downcase names before indexing
# @return [Hash{String => Array<Array>}] n-gram => list of [name, code] pairs
def self.process_hash(hash, case_insensitive = false)
  index = {}

  hash.monitor = true if hash.respond_to? :monitor
  hash.unnamed = true if hash.respond_to? :unnamed
  method = hash.respond_to?(:through) ? :through : :each

  hash.send(method) do |code, names|
    Array(names).each do |name|
      next if name.nil? || name.empty?
      name = name.downcase if case_insensitive
      ngram = name[0..2].strip
      index[ngram] ||= []
      index[ngram] << [name, code]
    end
  end

  index
end

.process_stream(stream, case_insensitive = false) ⇒ `Object`

# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 46

# Builds the n-gram index from a line-oriented lexicon stream.
#
# Every line is split on tabs and "|"; the first non-empty field is the code
# and the remaining non-empty fields are its names. Each name is filed under
# the stripped form of its first three characters.
#
# @param stream [#gets] lexicon source, read until #gets returns nil
# @param case_insensitive [Boolean] downcase names before indexing
# @return [Hash{String => Array<Array>}] n-gram => list of [name, code] pairs
def self.process_stream(stream, case_insensitive = false)
  index = {}

  while line = stream.gets
    # gets keeps the line terminator; without chomp the last name of every
    # line would be stored with a trailing newline.
    names = line.chomp.split(/\t|\|/).reject(&:empty?)
    code = names.shift

    names.each do |name|
      name = name.downcase if case_insensitive
      ngram = name[0..2].strip
      index[ngram] ||= []
      index[ngram] << [name, code]
    end
  end

  index
end

Instance Method Details

#match(text) ⇒ `Object`

# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 151

# Finds the dictionary names occurring in +text+ and returns them as named
# entities.
#
# The class-level match is run on +text+ (downcased when case_insensitive is
# set) and each [name, code, offset] triple is passed to NamedEntity.setup
# with this dictionary's type. In case-insensitive mode each match is then
# replaced with the slice of the original text at m.range (presumably the
# span covered by the entity), so the result keeps the input's casing.
#
# @param text [String, nil] text to scan
# @return [Array] the set-up matches; empty when +text+ is nil
def match(text)
  # The class-level match already returns [] for nil; guard here so the
  # case-insensitive path does not fail on text.downcase first.
  return [] if text.nil?

  search_text = case_insensitive ? text.downcase : text
  matches = NGramPrefixDictionary.match(index, search_text).collect do |name, code, offset|
    NamedEntity.setup(name, :offset => offset, :entity_type => type, :code => code)
  end

  matches.each { |m| m.replace(text[m.range]) } if case_insensitive
  matches
end