Class: Excite::CRFParser

Inherits:
Object
  • Object
show all
Includes:
Postprocessor, Preprocessor, TokenFeatures
Defined in:
lib/excite/crfparser.rb

Constant Summary collapse

DIR =
File.dirname(__FILE__)
TAGGED_REFERENCES =
"#{DIR}/resources/trainingdata/tagged_references.txt"
TAGGED_HTML_REFERENCES =
"#{DIR}/resources/trainingdata/tagged_html_references.txt"
TRAINING_DATA =
"#{DIR}/resources/trainingdata/training_data.txt"
MODEL_FILE =
"#{DIR}/resources/model"
HTML_MODEL_FILE =
"#{DIR}/resources/html_model"
TEMPLATE_FILE =
"#{DIR}/resources/parsCit.template"
HTML_TEMPLATE_FILE =
"#{DIR}/resources/html.template"
CONFIG_FILE =
"#{DIR}/../../config/parscit_features.yml"

Constants included from Preprocessor

Preprocessor::CLEANUP_RULES_FILE, Preprocessor::MARKER_TYPES

Constants included from TokenFeatures

TokenFeatures::DICT, TokenFeatures::NODE_TYPES_BY_NAME

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Postprocessor

#join_multi_word_names, #method_missing, #normalize, #normalize_author, #normalize_author_name, #normalize_date, #normalize_fields, #normalize_pages, #normalize_title, #normalize_volume, #pairable_quote_chars, #repair_and_tokenize_author_text

Methods included from Preprocessor

#cleanup_rules, #normalize_citation, #normalize_cite_text, #segment_citations, #split_citations_by_marker

Methods included from TokenFeatures

#capitalization, #clear, #dict_status, #firstName, #first_1_char, #first_2_chars, #first_3_chars, #first_4_chars, #first_5_chars, #is_in, #lastName, #last_1_char, #last_2_chars, #last_3_chars, #last_4_chars, #last_char, #location, #location_in_node, #monthName, #numbers, #part_of_speech, #placeName, #possible_chapter, #possible_editor, #possible_volume, #publisherName, #punct, #tag_name, #toklcnp

Constructor Details

#initialize(mode = :string) ⇒ CRFParser

Feature functions must be performed in alphabetical order, since later functions may depend on earlier ones. TODO: This is confusing and depends on the current set of features.



32
33
34
35
36
37
38
39
# File 'lib/excite/crfparser.rb', line 32

# Builds a parser for the given mode and loads that mode's feature
# configuration from CONFIG_FILE.
#
# @feature_order keeps the configured feature names (as symbols) in the
# order they appear in the config; @token_features holds the same names
# sorted alphabetically before conversion to symbols.
#
# @param mode [Symbol] its string form selects the section of the YAML config
# @raise [RuntimeError] if the config has no section for +mode+
def initialize(mode=:string)
  @mode = mode

  # Block form guarantees the config file handle is closed, even if YAML
  # parsing raises.
  config = File.open(CONFIG_FILE, 'r') { |f| YAML::load(f) }
  hsh = config[mode.to_s]
  raise "Unknown mode: #{mode}" if hsh.nil?

  names = hsh["feature_order"]
  @feature_order = names.map(&:to_sym)
  @token_features = names.sort.map(&:to_sym)
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method in the included module Excite::Postprocessor.

Instance Attribute Details

#feature_order ⇒ Object (readonly)

Returns the value of attribute feature_order.



12
13
14
# File 'lib/excite/crfparser.rb', line 12

# Reader for @feature_order, which #initialize fills with the configured
# feature names as symbols, in the order listed in the config.
def feature_order
  @feature_order
end

#token_features ⇒ Object (readonly)

Returns the value of attribute token_features.



13
14
15
# File 'lib/excite/crfparser.rb', line 13

# Reader for @token_features, which #initialize fills with the configured
# feature names sorted alphabetically and converted to symbols.
def token_features
  @token_features
end

Class Method Details

.strip_punct(str) ⇒ Object



77
78
79
# File 'lib/excite/crfparser.rb', line 77

# Returns a copy of +str+ with every character that is not a letter or
# digit (POSIX [:alnum:]) removed.
def self.strip_punct(str)
  # Runs of non-alphanumerics are removed in one substitution each.
  str.gsub(/[^[:alnum:]]+/, '')
end

Instance Method Details

#default_model_file ⇒ Object



261
262
263
264
265
266
267
268
269
# File 'lib/excite/crfparser.rb', line 261

# Picks the model path matching the parser mode: MODEL_FILE for :string,
# HTML_MODEL_FILE for :html.
#
# @raise [RuntimeError] for any other mode
def default_model_file
  case @mode
  when :string then MODEL_FILE
  when :html then HTML_MODEL_FILE
  else raise "Unknown mode: #{@mode}"
  end
end

#default_tagged_references ⇒ Object



251
252
253
254
255
256
257
258
259
# File 'lib/excite/crfparser.rb', line 251

# Picks the tagged-reference training file matching the parser mode:
# TAGGED_REFERENCES for :string, TAGGED_HTML_REFERENCES for :html.
#
# @raise [RuntimeError] for any other mode
def default_tagged_references
  case @mode
  when :string then TAGGED_REFERENCES
  when :html then TAGGED_HTML_REFERENCES
  else raise "Unknown mode: #{@mode}"
  end
end

#default_template_file ⇒ Object



271
272
273
274
275
276
277
278
279
# File 'lib/excite/crfparser.rb', line 271

# Picks the CRF template path matching the parser mode: TEMPLATE_FILE for
# :string, HTML_TEMPLATE_FILE for :html.
#
# @raise [RuntimeError] for any other mode
def default_template_file
  case @mode
  when :string then TEMPLATE_FILE
  when :html then HTML_TEMPLATE_FILE
  else raise "Unknown mode: #{@mode}"
  end
end

#eval_crfpp(feat_seq, model) ⇒ Object



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/excite/crfparser.rb', line 60

# Feeds a feature sequence to +model+ and collects its labelling.
#
# Each feature vector is joined with spaces, stripped, and added to the
# (freshly cleared) model; the model is then asked to parse.
#
# @param feat_seq [Array<Array>] one feature vector per token
# @param model tagger responding to clear, add, parse, y2(i), prob(i) and prob
# @return [Array] [tags, model.prob, probs] where tags holds model.y2(i) for
#   each position and probs maps each tag to the product of model.prob(i)
#   over the positions that received that tag
# @raise [RuntimeError] if the model rejects a line or fails to parse
def eval_crfpp(feat_seq, model)
  model.clear
  feat_seq.each do |vec|
    raise unless model.add(vec.join(" ").strip)
  end
  raise unless model.parse

  tags = []
  probs = {}
  (0...feat_seq.length).each do |idx|
    tag = model.y2(idx)
    tags << tag
    # Start each tag's running product at 1 the first time it is seen.
    probs[tag] = (probs[tag] || 1) * model.prob(idx)
  end

  [tags, model.prob, probs]
end

#html_str_2_tokens(str) ⇒ Object



168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/excite/crfparser.rb', line 168

# Tokenizes an HTML string by walking its parsed fragment: text nodes are
# expanded via html_text_node_2_tokens, <br> elements become a single
# Token.for_br token, and every other node is ignored.
#
# @param str [String] HTML markup
# @return [Array] tokens in traversal order
def html_str_2_tokens(str)
  # A space is inserted after every '>' so strings in separate tags always
  # end up as separate tokens, even when the HTML is bad.
  spaced = str.gsub('>', '> ')
  fragment = Nokogiri::HTML.fragment(spaced)

  collected = []
  fragment.traverse do |node|
    if node.text?
      collected.concat(html_text_node_2_tokens(node))
    elsif node.name == 'br'
      collected << Token.for_br(node)
    end
  end
  collected
end

#html_text_node_2_tokens(node) ⇒ Object



182
183
184
185
186
187
188
189
# File 'lib/excite/crfparser.rb', line 182

# Tokenizes the HTML-unescaped text of a single node and tells each token
# which node it came from, its index within that node, and how many
# tokens the node produced.
#
# @param node object responding to #text
# @return [Array] the tokens, or [] when the unescaped text is blank
def html_text_node_2_tokens(node)
  unescaped = CGI.unescapeHTML(node.text)
  return [] if unescaped.blank?

  text_str_2_tokens(unescaped).tap do |toks|
    total = toks.length
    toks.each_with_index { |tok, position| tok.is_in_node!(node, position, total) }
  end
end

#model ⇒ Object



41
42
43
# File 'lib/excite/crfparser.rb', line 41

# Lazily builds and memoizes the CRF++ tagger, pointing it at
# default_model_file with the "-v 1" option.
def model
  return @model if @model

  @model = CRFPP::Tagger.new("-m #{default_model_file} -v 1")
end

#normalize_input_author(str) ⇒ Object



81
82
83
84
# File 'lib/excite/crfparser.rb', line 81

# Turns a presumed-author string into a list of comparable name parts:
# whitespace-split, lowercased, stripped of non-alphanumerics, keeping
# only parts longer than two characters.
#
# @param str [String, nil]
# @return [Array<String>, nil] nil when +str+ is blank
def normalize_input_author(str)
  return nil if str.blank?

  cleaned = str.split.map { |part| self.class.strip_punct(part.downcase) }
  cleaned.select { |part| part.length > 2 }
end

#parse(str, presumed_author = nil) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/excite/crfparser.rb', line 45

# Parses one citation string into labelled fields.
#
# The string is converted to tokens and features, tagged by the CRF model,
# and the tokens of each tag are joined and stripped into one field value.
# The fields are then passed to normalize_fields and the untouched input
# is stored under 'raw_string'.
#
# @param str [String] citation text
# @param presumed_author [String, nil] forwarded to str_2_features
# @return [Array] [fields hash, overall probability, per-tag probabilities]
def parse(str, presumed_author=nil)
  original = str.dup

  tokens, feats = str_2_features(str, false, presumed_author)
  labels, overall_prob, tag_probs = eval_crfpp(feats, model)

  fields = {}
  labels.each_with_index do |label, idx|
    fields[label] ||= []
    # NOTE(review): for idx 0 the "previous" token is tokens[-1], i.e. the
    # last token — confirm Token#for_join tolerates that.
    fields[label] << tokens[idx].for_join(tokens[idx - 1])
  end
  fields.keys.each { |label| fields[label] = fields[label].join('').strip }

  normalize_fields(fields)
  fields['raw_string'] = original
  [fields, overall_prob, tag_probs]
end

#prepare_token_data(raw_string, training = false) ⇒ Object



86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# File 'lib/excite/crfparser.rb', line 86

# Tokenizes a citation string and, in training mode, attaches the label of
# the enclosing tag to every token.
#
# In training mode +raw_string+ is tagged text. Each tag's unescaped inner
# HTML is tokenized on its own to record where that label starts (measured
# in characters of concatenated token text); the untagged text is then
# rebuilt (one tag per line) and tokenized as a whole, and labels are
# assigned by walking the tokens and switching label whenever the running
# concatenated length reaches the next recorded boundary.
#
# In :html mode leading and trailing tokens whose part_of_speech is 'br'
# are dropped. Finally self.clear is called before returning.
#
# @param raw_string [String] plain citation text, or tagged text when training
# @param training [Boolean] whether +raw_string+ carries label tags
# @return [Array] the tokens (labelled when training)
# @raise [RuntimeError] on an unrecognized label, when token boundaries
#   overshoot a label boundary, or when a label is left unused
def prepare_token_data(raw_string, training=false)
  if training
    # raw_string is rebuilt below, so keep the tagged input for error messages.
    tagged_input = raw_string
    tags = tagged_string_2_tags(raw_string.strip)

    labels, raw_string, joined_tokens = [], '', ''
    tags.each do |tag|
      raw = CGI.unescapeHTML(tag.inner_html)

      label = tag.name
      if label.present? && !recognized_labels.include?(label)
        raise "Invalid label #{label} for:\n#{tagged_input}"
      end

      toks = str_2_tokens(raw)

      # Record the offset (in joined token characters) where this label starts.
      labels << [label, joined_tokens.length]
      joined_tokens += toks.map(&:raw).join
      raw_string += "\n#{raw}"
    end
  end

  tokens = str_2_tokens(raw_string.strip)

  if training
    joined_tokens = ''
    label, _ = labels.shift
    next_label, end_idx = labels.shift unless labels.empty?

    tokens.each do |tok|
      tok.label = label
      joined_tokens += tok.raw
      if joined_tokens.length == end_idx
        label = next_label
        next_label, end_idx = labels.shift unless labels.empty?
      elsif !labels.empty? && joined_tokens.length > end_idx
        # labels being non-empty guarantees end_idx is set; checking it first
        # avoids comparing an Integer with nil when only one label exists.
        raise "Tokens do not match labels"
      end
    end
    raise "Unused label" unless labels.empty?
  end

  if @mode == :html
    # drop leading and trailing <br>s; needs to be done after labels are applied
    is_br = lambda { |t| t.part_of_speech == 'br' }
    tokens = tokens.drop_while(&is_br).reverse.drop_while(&is_br).reverse
  end

  self.clear

  tokens
end

#recognized_labels ⇒ Object



158
159
160
161
162
163
164
165
166
# File 'lib/excite/crfparser.rb', line 158

# Lists the tag names accepted as labels in training data for the current
# mode. :string and :html share most labels; :string additionally has
# "tech", :html additionally has "workid", "link" and "bullet".
#
# @return [Array<String>] a fresh array; empty for any other mode
def recognized_labels
  case @mode
  when :string
    %w[author title editor booktitle date journal volume institution pages location publisher note tech]
  when :html
    %w[author title editor booktitle date journal volume institution pages location publisher note workid link bullet]
  else
    []
  end
end

#str_2_features(raw_string, training = false, presumed_author = nil) ⇒ Object

calculate features on the full citation string



198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# File 'lib/excite/crfparser.rb', line 198

# Calculates features on the full citation string.
#
# For every token, each function named in @token_features is invoked (in
# that list's order) with (tokens, index, author_names); the results are
# then emitted in @feature_order. A feature row is the token's raw text,
# followed by the ordered feature values, followed by the token's label
# when training.
#
# @param raw_string [String] citation text (tagged text when training)
# @param training [Boolean] forwarded to prepare_token_data
# @param presumed_author [String, nil] normalized via normalize_input_author
# @return [Array] [tokens, feature rows]
# @raise [RuntimeError] when training and a token has no label
def str_2_features(raw_string, training=false, presumed_author=nil)
  tokens = prepare_token_data(raw_string, training)
  author_names = normalize_input_author(presumed_author)

  rows = tokens.each_with_index.map do |tok, idx|
    raise "All tokens must be labeled" if training && tok.label.nil?

    computed = {}
    @token_features.each do |name|
      computed[name] = self.send(name, tokens, idx, author_names)
    end

    row = [tok.raw]
    @feature_order.each { |name| row << computed[name] }
    row << tok.label if training
    row
  end

  [tokens, rows]
end

#str_2_tokens(str) ⇒ Object



148
149
150
151
152
153
154
155
156
# File 'lib/excite/crfparser.rb', line 148

# Tokenizes +str+ with the tokenizer for the current mode
# (html_str_2_tokens for :html, text_str_2_tokens for :string) and removes
# tokens that report themselves as empty.
#
# @param str [String]
# @return [Array] non-empty tokens
# @raise [RuntimeError] for any other mode (previously this surfaced as a
#   NoMethodError on nil)
def str_2_tokens(str)
  toks =
    case @mode
    when :html then html_str_2_tokens(str)
    when :string then text_str_2_tokens(str)
    else raise "Unknown mode: #{@mode}"
    end

  toks.reject { |t| t.empty? }
end

#tagged_string_2_tags(str) ⇒ Object



142
143
144
145
146
# File 'lib/excite/crfparser.rb', line 142

# Parses tagged text as an XML fragment and returns its top-level element
# nodes. The text is wrapped in a <string> element first so there is a
# single parent to select; text nodes between the tags are discarded.
#
# @param str [String] text containing XML-style tags
# @return [Array] the non-text children of the wrapper element
def tagged_string_2_tags(str)
  wrapped = "<string>#{str}</string>"
  wrapper = Nokogiri::XML.fragment(wrapped).css('string')
  wrapper.children.reject { |child| child.text? }
end

#tagger ⇒ Object



138
139
140
# File 'lib/excite/crfparser.rb', line 138

# Lazily builds and memoizes the EngTagger instance used for tagging text.
def tagger
  return @tagger if @tagger

  @tagger = EngTagger.new
end

#text_str_2_tokens(text) ⇒ Object



191
192
193
194
195
# File 'lib/excite/crfparser.rb', line 191

# Tokenizes plain text: the text is normalized, run through the tagger,
# and each resulting tag becomes a Token built from the tag's text and
# the tag's name.
#
# @param text [String]
# @return [Array<Token>]
def text_str_2_tokens(text)
  tagged = tagger.add_tags(normalize_citation(text))

  # Only '&' is escaped before the XML parse. The angle brackets in the
  # tagger output are left alone because they are the markup that
  # tagged_string_2_tags reads.
  # NOTE(review): angle brackets already present in the input are not
  # escaped here — confirm they are handled upstream.
  escaped = tagged.gsub('&','&amp;')

  tagged_string_2_tags(escaped).map { |tag| Token.new(tag.text, tag.name) }
end

#train(tagged_refs = nil, model = nil, template = nil, training_data = nil) ⇒ Object



238
239
240
241
242
243
244
245
246
247
248
249
# File 'lib/excite/crfparser.rb', line 238

# Trains a CRF model by shelling out to the crf_learn executable.
#
# Missing arguments fall back to the mode-specific defaults. When no
# training-data path is given, TRAINING_DATA is used and is regenerated
# from +tagged_refs+ via write_training_file first; when a path is given
# it is passed to crf_learn as-is.
#
# @param tagged_refs [String, nil] tagged references file
# @param model [String, nil] output model path
# @param template [String, nil] CRF template path
# @param training_data [String, nil] pre-built training data file
# @return [String] captured stdout of the command (stdout is redirected to
#   stderr by the command line itself)
def train(tagged_refs=nil, model=nil, template=nil, training_data=nil)
  require 'shellwords'

  tagged_refs ||= default_tagged_references
  model ||= default_model_file
  template ||= default_template_file

  if training_data.nil?
    training_data = TRAINING_DATA
    write_training_file(tagged_refs, training_data)
  end

  # Escape each path so spaces or shell metacharacters in it cannot split
  # or alter the command line.
  args = [template, training_data, model].map { |path| Shellwords.escape(path) }
  `crf_learn #{args.join(' ')} -f3 1>&2`
end

#write_training_file(tagged_refs = nil, training_data = TRAINING_DATA) ⇒ Object



221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# File 'lib/excite/crfparser.rb', line 221

# Converts a file of tagged references into CRF training data.
#
# Each input line is stripped and turned into feature rows via
# str_2_features in training mode; every row is written as one
# space-joined line, and a blank line follows each reference.
#
# @param tagged_refs [String, nil] input path; defaults to
#   default_tagged_references
# @param training_data [String] output path (overwritten)
# @return [nil]
def write_training_file(tagged_refs=nil, training_data=TRAINING_DATA)
  tagged_refs ||= default_tagged_references

  # Block forms close both files even when feature extraction raises.
  # The input is opened first so a missing input never truncates the output.
  File.open(tagged_refs, 'r') do |fin|
    File.open(training_data, 'w') do |fout|
      fin.each_line do |l|
        _, data = str_2_features(l.strip, true)
        data.each { |line| fout.write("#{line.join(" ")}\n") }
        fout.write("\n")
      end
    end
  end

  nil
end