Class: NGramPrefixDictionary
- Defined in:
- lib/rbbt/ner/ngram_prefix_dictionary.rb
Overview
This code was adapted from an implementation by Ashish Tendulkar (attribution details still to be confirmed with Martin).
Constant Summary collapse
- STOP_LETTERS =
%w(\' " ( ) { } [ ] - ? ! < ; : > . ,)
- STOP_LETTER_CHAR_VALUES =
STOP_LETTERS.collect{|l| l[0]} + ["\n", "\r", " "].collect{|l| l[0]}
- LETTER_REGEXP =
Regexp.compile(/[#{Regexp.quote((STOP_LETTERS + ["\n", "\r", " "]) * "")}]/)
Instance Attribute Summary collapse
-
#case_insensitive ⇒ Object
Returns the value of attribute case_insensitive.
-
#index ⇒ Object
Returns the value of attribute index.
-
#type ⇒ Object
Returns the value of attribute type.
Class Method Summary collapse
- .match(index, text) ⇒ Object
- .process_hash(hash, case_insensitive = false) ⇒ Object
- .process_stream(stream, case_insensitive = false) ⇒ Object
Instance Method Summary collapse
-
#initialize(file, type = nil, case_insensitive = false) ⇒ NGramPrefixDictionary
constructor
A new instance of NGramPrefixDictionary.
- #match(text) ⇒ Object
Methods inherited from NER
Constructor Details
#initialize(file, type = nil, case_insensitive = false) ⇒ NGramPrefixDictionary
Returns a new instance of NGramPrefixDictionary.
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 128
#
# Builds the n-gram index from a lexicon and remembers the entity type and
# case-sensitivity flag used later by #match.
#
# The lexicon is dispatched on, in this order: a TSV or Hash (indexed with
# process_hash), a Path (opened with file.open), anything accepted by
# Misc.is_filename? (opened with Open.open), or a StreamIO (read directly).
#
# @param file [TSV, Hash, Path, String, StreamIO] the lexicon source
# @param type [Object, nil] entity type attached to every match
# @param case_insensitive [Boolean] index (and later match) lower-cased names
# @raise [RuntimeError] when the lexicon is none of the supported formats
def initialize(file, type = nil, case_insensitive = false)
  @type = type
  @case_insensitive = case_insensitive

  # NOTE(review): streams opened below are not explicitly closed here.
  case
  when (TSV === file or Hash === file)
    Log.debug("Ngram Prefix Dictionary. Loading of lexicon hash started.")
    @index = NGramPrefixDictionary.process_hash(file, case_insensitive)
  when Path === file
    Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
    @index = NGramPrefixDictionary.process_stream(file.open, case_insensitive)
  when Misc.is_filename?(file)
    Log.debug("Ngram Prefix Dictionary. Loading of lexicon file started: #{ file }.")
    # The flag must be forwarded here as in every other branch; otherwise the
    # index keeps original-case names while #match searches downcased text.
    @index = NGramPrefixDictionary.process_stream(Open.open(file), case_insensitive)
  when StreamIO === file
    Log.debug("Ngram Prefix Dictionary. Loading of lexicon stream started.")
    @index = NGramPrefixDictionary.process_stream(file, case_insensitive)
  else
    raise "Format of lexicon not understood: #{file.inspect}"
  end

  Log.debug("Ngram Prefix Dictionary. Loading done.")
end
Instance Attribute Details
#case_insensitive ⇒ Object
Returns the value of attribute case_insensitive.
127 128 129 |
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 127
#
# @return [Object] the case-insensitivity flag stored in @case_insensitive
def case_insensitive
  @case_insensitive
end
#index ⇒ Object
Returns the value of attribute index.
127 128 129 |
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 127
#
# @return [Object] the n-gram index stored in @index
def index
  @index
end
#type ⇒ Object
Returns the value of attribute type.
127 128 129 |
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 127
#
# @return [Object] the entity type stored in @type
def type
  @type
end
Class Method Details
.match(index, text) ⇒ Object
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 84
#
# Scans +text+ from left to right and collects dictionary hits.
#
# At each position that is not a stop character, the (stripped) next three
# characters are looked up in +index+; the first listed name that fits in the
# remaining text and passes fast_start_with is taken as the match, and the
# scan resumes right after it. When nothing matches, the scan jumps to just
# past the next character matching LETTER_REGEXP.
#
# @param index [Hash] n-gram => list of [name, code] pairs
# @param text [String, nil] text to scan
# @return [Array<Array>] [name, code, character_offset] triples, in text order
def self.match(index, text)
  return [] if text.nil? || text.empty?

  matches = []
  text_offset = 0
  text_length = text.length

  while text_offset && text_offset < text_length
    if STOP_LETTER_CHAR_VALUES.include? text[text_offset]
      text_offset += 1
      next
    end

    ngram = text.slice(text_offset, 3).strip
    found = nil

    if index.include? ngram
      diff = text_length - text_offset
      # fast_start_with is handed a byte offset (it is defined outside this
      # block; presumably it compares raw bytes). Only computed on an index hit.
      text_byte_offset = text[0, text_offset].bytesize

      index[ngram].each do |name, code|
        # An empty name would advance the offset by zero and loop forever.
        next if name.empty?
        next if name.length > diff

        if fast_start_with(text, name, text_byte_offset)
          found = [name.dup, code, text_offset]
          break
        end
      end
    end

    if found.nil?
      # No hit here: resume just past the next separator, or stop if none.
      text_offset = text.index(LETTER_REGEXP, text_offset)
      text_offset += 1 unless text_offset.nil?
    else
      matches << found
      text_offset += found.first.length
    end
  end

  matches
end
.process_hash(hash, case_insensitive = false) ⇒ Object
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 64
#
# Builds the n-gram index from a code => names mapping.
#
# Each name is filed under its first three characters (stripped). Nil and
# empty names are skipped, matching process_stream, which drops empty fields.
# A nil or single (non-array) names value is tolerated.
#
# @param hash [Hash, #through] mapping of code to a list of names; iterated
#   with +through+ when the object responds to it, otherwise with +each+
# @param case_insensitive [Boolean] downcase names before indexing
# @return [Hash] n-gram => list of [name, code] pairs
def self.process_hash(hash, case_insensitive = false)
  index = {}

  hash.monitor = true if hash.respond_to? :monitor
  hash.unnamed = true if hash.respond_to? :unnamed
  method = hash.respond_to?(:through) ? :through : :each

  hash.send(method) do |code, names|
    Array(names).each do |name|
      next if name.nil? || name.empty?

      name = name.downcase if case_insensitive
      ngram = name[0..2].strip
      (index[ngram] ||= []) << [name, code]
    end
  end

  index
end
.process_stream(stream, case_insensitive = false) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 46
#
# Builds the n-gram index from a line-oriented lexicon stream.
#
# Each line is split on tabs and pipes; the first non-empty field is the code
# and the remaining non-empty fields are its names. Every name is filed under
# its first three characters (stripped).
#
# @param stream [#gets] lexicon source, read until +gets+ returns nil
# @param case_insensitive [Boolean] downcase names before indexing
# @return [Hash] n-gram => list of [name, code] pairs
def self.process_stream(stream, case_insensitive = false)
  index = {}

  while line = stream.gets
    # gets keeps the line terminator; without chomp the last name on every
    # line would be indexed with a trailing newline and never match in text.
    names = line.chomp.split(/\t|\|/).reject { |n| n.empty? }
    code = names.shift

    names.each do |name|
      name = name.downcase if case_insensitive
      ngram = name[0..2].strip
      (index[ngram] ||= []) << [name, code]
    end
  end

  index
end
Instance Method Details
#match(text) ⇒ Object
151 152 153 154 155 156 157 158 159 160 161 162 |
# File 'lib/rbbt/ner/ngram_prefix_dictionary.rb', line 151
#
# Finds dictionary entries in +text+ and wraps each hit with
# NamedEntity.setup, passing its offset, this dictionary's type and the
# lexicon code.
#
# In case-insensitive mode the search runs on a downcased copy of the text,
# and each result is then replaced with the original-case slice taken from
# +text+ at the match's range.
#
# @param text [String, nil] text to scan; nil or empty yields no matches
# @return [Array] the values returned by NamedEntity.setup, in text order
def match(text)
  # The class-level matcher tolerates nil; guard here too so that
  # text.downcase is never called on nil in case-insensitive mode.
  return [] if text.nil? || text.empty?

  searched = case_insensitive ? text.downcase : text

  matches = NGramPrefixDictionary.match(index, searched).collect do |name, code, offset|
    NamedEntity.setup(name, :offset => offset, :entity_type => type, :code => code)
  end

  matches.each { |m| m.replace(text[m.range]) } if case_insensitive
  matches
end