Class: RegExpNER

Inherits:
NER
  • Object
show all
Includes:
SimpleDSL
Defined in:
lib/rbbt/ner/regexpNER.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from NER

#entities, #extract

Constructor Details

#initialize(regexps = {}) ⇒ RegExpNER

Returns a new instance of RegExpNER.



76
77
78
# File 'lib/rbbt/ner/regexpNER.rb', line 76

# Builds the recognizer from an initial collection of regexps.
#
# The entries of +regexps+ are copied, in iteration order, into a new Array
# stored in @regexps; iterating a Hash yields [key, value] pairs.
#
# @param regexps [#collect] initial entries (defaults to an empty Hash)
def initialize(regexps = {})
  @regexps = regexps.map { |entry| entry }
end

Instance Attribute Details

#regexps ⇒ Object

Returns the value of attribute regexps.



75
76
77
# File 'lib/rbbt/ner/regexpNER.rb', line 75

# Returns @regexps: the collection built by #initialize, appended to by
# #__define_regexp_hook and #add_regexp, and handed to
# RegExpNER.match_regexp_hash by #match.
def regexps
  @regexps
end

#split_on_matches ⇒ Object

Returns the value of attribute split_on_matches.



75
76
77
# File 'lib/rbbt/ner/regexpNER.rb', line 75

# Returns @split_on_matches, the value #match forwards as the third argument
# of RegExpNER.match_regexp_hash.
# NOTE(review): nothing in the visible code assigns this variable; presumably
# a matching writer exists alongside this reader — confirm.
def split_on_matches
  @split_on_matches
end

Class Method Details

.match_regexp(text, regexp, type = nil) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/rbbt/ner/regexpNER.rb', line 7

# Scans +text+ left to right for non-overlapping occurrences of +regexp+ and
# returns the matched strings, each passed through NamedEntity.setup.
#
# What is reported for a hit depends on the groups in +regexp+:
# * the regexp has named groups: the whole match, with a :code made of
#   "name=value" pairs joined by ";"
# * the first numbered group took part in the match: the text of that group
# * otherwise (no groups, or the first group did not participate): the
#   whole match
#
# Every reported hit is set up with :offset (in characters, relative to the
# start of the +text+ passed in) and :entity_type (+type+).
#
# Zero-length hits are never reported. The scan simply steps past them, so
# regexps that can match the empty string (e.g. /a*/ or a lookahead)
# terminate instead of looping forever.
#
# @param text [String] text to scan
# @param regexp [Regexp] pattern to look for
# @param type [Object, nil] value stored as :entity_type on every hit
# @return [Array<String>] the hits, in order of appearance
def self.match_regexp(text, regexp, type = nil)
  matches = []
  start = 0 # characters of the original text already consumed

  while (matchdata = text.match(regexp))
    code = nil
    if matchdata.named_captures.any?
      group = 0
      code = matchdata.named_captures.collect{|k,v| [k,v] * "=" } * ";"
    elsif matchdata[1]
      # Only use the first group when it actually participated; a nil first
      # capture would otherwise have no text and no offset.
      group = 1
    else
      group = 0
    end

    match = matchdata[group]
    offset, eend = matchdata.offset(group)

    if match.empty?
      # Nothing to report. Stop at the end of the text, otherwise make sure
      # at least one character is consumed so the loop always progresses.
      break if text.empty?
      eend = 1 if eend.zero?
    else
      if code
        NamedEntity.setup(match, :offset => start + offset, :entity_type => type, :code => code)
      else
        NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
      end
      matches << match
    end

    # Continue right after the reported text (or the skipped position).
    text = text[eend..-1]
    start += eend
  end

  matches
end

.match_regexp_hash(text, regexp_hash, split_on_matches = false) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/rbbt/ner/regexpNER.rb', line 57

# Runs every regexp of +regexp_hash+ over +text+ and collects the hits.
#
# +regexp_hash+ is iterated as (type, regexps) pairs; a value that is not an
# Array is wrapped into a one-element list. For each pair the text is cut
# into chunks with Segment.split — around the hits collected so far when
# +split_on_matches+ is truthy, around nothing otherwise — and each chunk is
# searched with match_regexp_list. The offset of every hit is shifted by the
# offset of the chunk it was found in before it is collected.
#
# @param text [String] text to search
# @param regexp_hash [#each] (type, regexp or list of regexps) pairs
# @param split_on_matches [Boolean] split the text around earlier hits
# @return [Array] all hits, in the order they were found
def self.match_regexp_hash(text, regexp_hash, split_on_matches = false)
  found = []

  regexp_hash.each do |type, regexps|
    regexps = [regexps] unless regexps.is_a?(Array)
    already_matched = split_on_matches ? found : []

    Segment.split(text, already_matched).each do |chunk|
      base = chunk.offset
      match_regexp_list(chunk, regexps, type, split_on_matches).each do |entity|
        entity.offset = entity.offset + base
        found << entity
      end
    end
  end

  found
end

.match_regexp_list(text, regexp_list, type = nil, split_on_matches = false) ⇒ Object



42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/rbbt/ner/regexpNER.rb', line 42

# Applies each regexp of +regexp_list+, in order, to +text+ and collects the
# hits reported by match_regexp.
#
# For every regexp the text is cut into chunks with Segment.split(text, [])
# and each chunk is searched separately; the offset of every hit is shifted
# by the offset of its chunk before it is collected.
#
# NOTE: +split_on_matches+ is accepted for interface compatibility but has no
# effect on the result. The previous implementation computed a conditional
# split and immediately overwrote it with the unconditional one; only the
# discarded computation has been removed here.
#
# @param text [String] text (or chunk) to search
# @param regexp_list [#each] regexps to apply
# @param type [Object, nil] passed to match_regexp for every hit
# @param split_on_matches [Boolean] currently unused (see note above)
# @return [Array] all hits, grouped by regexp in list order
def self.match_regexp_list(text, regexp_list, type = nil, split_on_matches = false)
  matches = []

  regexp_list.each do |regexp|
    Segment.split(text, []).each do |chunk|
      match_regexp(chunk, regexp, type).each do |match|
        match.offset += chunk.offset
        matches << match
      end
    end
  end

  matches
end

Instance Method Details

#__define_regexp_hook(name, regexp, *args) ⇒ Object



84
85
86
# File 'lib/rbbt/ner/regexpNER.rb', line 84

# Appends the pair [name, regexp] to @regexps and returns @regexps.
# Any further arguments are accepted and ignored.
# NOTE(review): #define_regexp passes this method's name to load_config, so
# it is presumably invoked by that helper — confirm.
#
# @param name [Object] first element of the stored pair
# @param regexp [Object] second element of the stored pair
def __define_regexp_hook(name, regexp, *args)
  @regexps.push([name, regexp])
end

#add_regexp(list = {}) ⇒ Object



92
93
94
# File 'lib/rbbt/ner/regexpNER.rb', line 92

# Appends every entry of +list+ to @regexps and returns @regexps.
#
# Iterating a Hash yields [key, value] pairs, so the stored entries have the
# same shape as the ones #initialize builds from a Hash.
#
# The block passed to #collect is required: without it #collect returns an
# Enumerator, which Array#concat rejects with a TypeError.
#
# @param list [#collect] entries to add (defaults to an empty Hash)
# @return [Array] the updated @regexps
def add_regexp(list = {})
  @regexps.concat(list.collect { |entry| entry })
end

#define_regexp(*args, &block) ⇒ Object



88
89
90
# File 'lib/rbbt/ner/regexpNER.rb', line 88

# Forwards its arguments and block to load_config, with the hook name
# "__define_regexp_hook" as the first argument, and returns whatever
# load_config returns.
# NOTE(review): load_config is not defined in the visible code; presumably it
# comes from the included SimpleDSL module — confirm.
def define_regexp(*args, &block)
  hook_name = "__define_regexp_hook"
  load_config(hook_name, *args, &block)
end

#match(text) ⇒ Object



96
97
98
99
100
101
# File 'lib/rbbt/ner/regexpNER.rb', line 96

# Finds the entities in +text+ using the regexps stored in @regexps.
#
# The search itself is delegated to RegExpNER.match_regexp_hash, called with
# @regexps and @split_on_matches. Each hit is then passed through
# NamedEntity.setup again with its own offset and type, and with its code —
# or the hit itself when it has no code — as :code.
#
# @param text [String] text to search
# @return [Array] one NamedEntity.setup result per hit, in the same order
def match(text)
  RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches).map do |entity|
    offset = entity.offset
    type   = entity.type
    code   = entity.code || entity
    NamedEntity.setup(entity, :offset => offset, :type => type, :code => code)
  end
end

#token_score(*args) ⇒ Object



80
81
82
# File 'lib/rbbt/ner/regexpNER.rb', line 80

# Returns the constant 1, whatever arguments it is called with.
def token_score(*args)
  1
end