Module: Excite::Postprocessor

Included in:
CRFParser
Defined in:
lib/excite/postprocessor.rb

Instance Method Summary collapse

Dynamic Method Handling

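This module handles dynamic methods through the method_missing method: any normalize_<field> call that has no dedicated implementation is forwarded to the generic #normalize.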

#method_missing(m, *args, &block) ⇒ Object



12
13
14
15
16
17
18
19
20
# File 'lib/excite/postprocessor.rb', line 12

# Routes +normalize_<field>+ calls that have no dedicated implementation to
# the generic #normalize, passing the field name as the key.
#
# Only names of the exact form +normalize_<field>+ (with a non-empty field)
# are handled here. Everything else goes to +super+, so a mistyped method
# name raises NoMethodError instead of being normalized under a nil key.
#
# m     - the missing method name
# args  - forwarded to #normalize after the field name
# block - forwarded to +super+ when the call is not handled here
def method_missing(m, *args, &block)
  if m.to_s =~ /\Anormalize_(.+)\z/m
    normalize(Regexp.last_match(1), *args)
  else
    super
  end
end

# Keeps respond_to? consistent with the names method_missing accepts.
def respond_to_missing?(m, include_private = false)
  !(m.to_s =~ /\Anormalize_.+\z/m).nil? || super
end

Instance Method Details

#join_multi_word_names(author_text) ⇒ Object

Insert underscores to join name particles to the word that follows them, e.g. "Jon de Groote" -> "Jon de_Groote".



217
218
219
220
221
# File 'lib/excite/postprocessor.rb', line 217

# Attaches a name particle (van, von, der, den, de, di, le, el; matched
# case-insensitively at a word boundary) to the word after it by replacing
# the single whitespace character that follows the particle with "_".
#
# Returns a new string; author_text itself is not modified.
def join_multi_word_names(author_text)
  author_text.gsub(/\b((?:van|von|der|den|de|di|le|el))\s/i, '\1_')
end

#normalize(key, hsh) ⇒ Object

Default normalization function for all fields that do not have their own normalization method. Strips any leading and/or trailing non-alphanumeric characters (punctuation and whitespace).



25
26
27
28
# File 'lib/excite/postprocessor.rb', line 25

# Default normalizer: strips leading and trailing non-alphanumeric
# characters from hsh[key], modifying the stored string in place.
#
# key - the field name to normalize
# hsh - the citation hash holding the field
#
# Does nothing (and returns nil) when the field is absent or its value
# cannot be edited with gsub!, e.g. nil or an Integer.
#
# Returns the result of the final gsub!, i.e. the string when trailing
# characters were removed and nil otherwise, so callers should read the
# normalized value from hsh[key] rather than from the return value.
def normalize(key, hsh)
  value = hsh[key]
  return unless value.respond_to?(:gsub!)

  value.gsub!(/^[^[:alnum:]]+/, '')
  value.gsub!(/[^[:alnum:]]+$/, '')
end

#normalize_author(hsh) ⇒ Object

Tries to split the author tokens into individual author names and then normalizes these names individually. Stores the list of author names under the 'authors' key (when at least one name was found) and returns the citation hash.



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/excite/postprocessor.rb', line 79

# Splits the raw 'author' field into individual names.
#
# The field is tokenized by repair_and_tokenize_author_text and the tokens
# are grouped into names:
#   * "&" / "and" closes the name collected so far;
#   * the first token of a name is always kept, even if it ends in a comma;
#   * any later token ending in a comma closes the name.
# Each group is passed through normalize_author_name.
#
# Stores the resulting list under hsh['authors'] when it is non-empty, runs
# the default normalization on hsh['author'], and returns hsh.
def normalize_author(hsh)
  tokens = repair_and_tokenize_author_text(hsh['author'])
  authors = []
  pending = []
  at_name_start = true

  tokens.each do |tok|
    if tok =~ /^(&|and)$/i
      # Conjunction: flush whatever has been collected and start over.
      authors << normalize_author_name(pending) unless pending.empty?
      pending = []
      at_name_start = true
    elsif at_name_start
      pending << tok
      at_name_start = false
    else
      pending << tok
      if tok =~ /,$/
        authors << normalize_author_name(pending)
        pending = []
        at_name_start = true
      end
    end
  end

  # Trailing name: only this one is filtered for "-" / blank results.
  unless pending.empty?
    trailing = normalize_author_name(pending).strip
    authors << trailing unless trailing == "-" || trailing.blank?
  end

  hsh['authors'] = authors unless authors.empty?
  normalize('author', hsh)
  hsh
end

#normalize_author_name(auth_toks) ⇒ Object

Tries to normalize an individual author name into the form “First Middle Last”, without punctuation.



227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# File 'lib/excite/postprocessor.rb', line 227

# Builds a single cleaned-up name string from the tokens of one author.
#
# Steps, in order:
#   1. join the tokens with spaces;
#   2. drop the last comma that still has text after it, keeping the two
#      halves in their original order;
#   3. turn ".-" into "-", replace remaining commas/periods with spaces,
#      collapse runs of spaces and strip;
#   4. if the result is a multi-character word followed only by single
#      characters (optionally hyphenated pairs such as "J-P"), move that
#      first word to the end ("Smith J K" -> "J K Smith");
#   5. strip leading/trailing characters outside A-Z, a-z, 0-9.
#
# Returns '' for an empty token list.
def normalize_author_name(auth_toks)
  return '' if auth_toks.empty?

  name = auth_toks.join(" ")
  name = "#{$1} #{$2}" if name =~ /(.+),\s*(.+)/
  name = name.gsub(/\.\-/, '-').gsub(/[\,\.]/, ' ').gsub(/  +/, ' ').strip

  if name =~ /^[^\s][^\s]+(\s+[^\s]|\s+[^\s]\-[^\s])+$/
    # Surname followed by initials: rotate the surname to the back.
    name = name.split(/\s+/).rotate.join(" ")
  end

  name.gsub(/^[^A-Za-z0-9]+/, '').gsub(/[^A-Za-z0-9]+$/, '')
end

#normalize_date(hsh) ⇒ Object



121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/excite/postprocessor.rb', line 121

# Reduces the 'date' field to a year.
#
# Takes the first run of four digits in hsh['date'] as the candidate year
# and accepts it when it is no more than three years past the current year.
# On success the Integer year is stored under both 'year' and 'date'.
# Otherwise 'date' is set to nil and 'year' is left untouched.
#
# Returns hsh.
def normalize_date(hsh)
  year = nil
  if hsh['date'] =~ /(\d{4})/
    candidate = Regexp.last_match(1).to_i
    year = candidate if candidate <= Time.now.year + 3
  end

  hsh['year'] = year if year
  hsh['date'] = year
  hsh
end

#normalize_fields(citation_hsh) ⇒ Object



7
8
9
10
# File 'lib/excite/postprocessor.rb', line 7

# Runs the normalizer for every field present in the citation hash by
# calling normalize_<key>(citation_hsh) for each key.
#
# The key list is captured before iterating because individual normalizers
# add keys to the hash (normalize_author adds 'authors', for instance), and
# a Hash cannot gain keys while it is being iterated directly.
#
# Returns citation_hsh.
def normalize_fields(citation_hsh)
  field_names = citation_hsh.keys
  field_names.each do |field|
    send("normalize_#{field}", citation_hsh)
  end
  citation_hsh
end

#normalize_pages(hsh) ⇒ Object

Normalizes page fields into the form "start--end" (two hyphens). If the page field does not appear to be in a standard form, does nothing.



154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/excite/postprocessor.rb', line 154

# Normalizes the 'pages' field.
#
# First, a value of the form "vol.issue (year):pp" is unpacked: the volume
# goes to 'volume', the optional issue to 'number', the optional four-digit
# year to 'year' (as a String), and only the part after the colon stays in
# 'pages'. Whitespace is allowed before the parenthesised year and around
# the colon, so "23.4 (1999):123-130" is recognised as well as the compact
# "23.4(1999):123-130".
#
# Then 'pages' is rewritten as "start--end" when it contains two numbers, or
# as the bare number when it contains one. Any other value, including a
# missing field, is left alone.
#
# Returns hsh.
def normalize_pages(hsh)
  # "vol.issue (year):pp"
  case hsh['pages']
  when /(\d+) (?: \.(\d+))? \s* (?: \( (\d\d\d\d) \))? \s* : \s* (\d.*)/x
    volume, number, year, pages = Regexp.last_match.captures
    hsh['volume'] = volume
    hsh['number'] = number if number
    hsh['year'] = year if year
    hsh['pages'] = pages
  end

  case hsh['pages']
  when /(\d+)[^\d]+(\d+)/
    hsh['pages'] = "#{$1}--#{$2}"
  when /(\d+)/
    hsh['pages'] = $1
  end
  hsh
end

#normalize_title(hsh) ⇒ Object

Normalizes the title in three steps: (1) strip leading numerals; (2) if the real title is quoted inside this string, try to extract it; (3) if the title has at least 2 words before a newline, period, or open paren, strip everything after it. TODO: could do better with knowledge of prepositions and names - maybe we just need a second model?



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/excite/postprocessor.rb', line 34

# Cleans up the 'title' field in three passes and then applies the default
# normalization.
#
#   1. Remove one leading enumerator: digits, a roman numeral, or a single
#      letter, followed by "." or ")". Only the first pattern that changes
#      the string is applied.
#   2. If the title opens with a quote character and at least two quote
#      characters from the same family occur, drop the opening quote and
#      everything from the last such quote to the end (when text follows it).
#   3. While there are two whitespace-separated words followed by a newline,
#      or by a period that precedes whitespace or "(", cut the string at the
#      last occurrence of that separator character.
#
# The result is stored back in hsh['title']. The return value is whatever
# normalize returns, not the hash.
def normalize_title(hsh)
  title = hsh['title'].strip

  leading_enumerators = [
    /^[0-9]+[.)](\s+|(?=["'”’´‘“`'A-Z]))/i,                                    # initial numbers + punctuation + space or a quote or a capital letter
    /^C{0,3}(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])[.)](\s+|(?=["'”’´‘“`'A-Z]))/i,    # initial roman numerals
    /^[A-Z][.)](\s+|(?=["'”’´‘“`'A-Z]))/i                                      # initial single letter
  ]
  # gsub! returns nil when nothing changed, so any? stops at the first
  # pattern that actually stripped something.
  leading_enumerators.any? { |pattern| title.gsub!(pattern, '') }

  opening = title.match(/^(["'”’´‘“`'])/)
  if opening
    quote_char = opening[1]
    pairable = pairable_quote_chars(quote_char)

    if title.scan(/[#{pairable}]/).size > 1
      title.gsub!(/^#{quote_char}/, '')
      title.gsub!(/[#{pairable}][^#{pairable}]+$/, '')
    end
  end

  while (hit = title.match(/\S+\s+\S+.*(\n|\.(?=\s|\()).*/))
    # The separator always sits after at least two words, so cut is > 0.
    cut = title.rindex(hit[1])
    title = title[0, cut]
  end

  hsh['title'] = title
  normalize('title', hsh)
end

#normalize_volume(hsh) ⇒ Object



137
138
139
140
141
142
143
144
145
146
147
148
# File 'lib/excite/postprocessor.rb', line 137

# Extracts volume (and, when present, issue number) from the 'volume' field.
#
# Two numbers separated by non-digits, as in "23(2)" or "Vol. 23, No. 3",
# are stored as 'volume' and 'number' respectively. A lone number is taken
# to be the volume. A value without digits is left unchanged.
#
# Returns hsh.
def normalize_volume(hsh)
  raw = hsh['volume']
  if raw =~ /\D*(\d+)\D+(\d+)/i
    hsh['volume'], hsh['number'] = Regexp.last_match(1), Regexp.last_match(2)
  elsif raw =~ /(\d+)/
    # Only one number available: assume it is the volume.
    hsh['volume'] = Regexp.last_match(1)
  end
  hsh
end

#pairable_quote_chars(quote_char) ⇒ Object



68
69
70
71
72
# File 'lib/excite/postprocessor.rb', line 68

# Returns the family of quote characters that may close a quotation opened
# with quote_char: the double-quote family for double quotes and the
# single-quote/backtick family for the rest.
#
# A character that belongs to neither family is returned unchanged, so the
# result is always a String of candidate characters and never the internal
# list of families.
def pairable_quote_chars(quote_char)
  families = [%{"”“}, %{’'`‘´'}]
  families.find { |chars| chars.include?(quote_char) } || quote_char
end

#repair_and_tokenize_author_text(author_text) ⇒ Object



173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# File 'lib/excite/postprocessor.rb', line 173

# Cleans up a raw author string and splits it into tokens.
#
# Clean-up (performed in place on author_text, so the caller's string is
# modified): drop "et al" and everything after it; remove parenthesised and
# bracketed text, including unbalanced remnants; turn ";" into ","; put a
# space after every comma; replace ":" with a space; delete assorted symbol
# characters. Name particles are then joined to the following word by
# join_multi_word_names, which returns a new string.
#
# Tokenization: the text is split on whitespace. A token without any letter
# or "&" discards everything collected so far when it appears in the first
# half of the token list; in the second half it is kept and ends the scan.
# Suffixes such as jr/sr/phd/md/esq that follow a comma-terminated token,
# and tokens made of two or more of the letters I, V, X, are dropped.
#
# Returns the Array of remaining tokens.
def repair_and_tokenize_author_text(author_text)
  # Repair obvious parse errors and weird notations.
  author_text.sub!(/et\.? al\.?.*$/, '')
  # FIXME: the ParseCit pattern below appears to do the Wrong Thing and is
  # therefore left out:
  # author_text.sub!(/^.*?[a-zA-Z][a-zA-Z]+\. /, '')
  cleanups = [
    [/\(.*?\)/, ''],
    [/^.*?\)\.?/, ''],
    [/\(.*?$/, ''],
    [/\[.*?\]/, ''],
    [/^.*?\]\.?/, ''],
    [/\[.*?$/, ''],
    [/;/, ','],
    [/,/, ', '],
    [/\:/, ' '],
    [/[\:\"\<\>\/\?\{\}\[\]\+\=\(\)\*\^\%\$\#\@\!\~\_]/, '']
  ]
  # Order matters: each substitution sees the result of the previous one.
  cleanups.each { |pattern, replacement| author_text.gsub!(pattern, replacement) }
  author_text = join_multi_word_names(author_text)

  raw_tokens = author_text.split(/\s+/)
  halfway = raw_tokens.length / 2
  kept = []
  stop_after_this = false
  raw_tokens.each_with_index do |tok, idx|
    if tok !~ /[A-Za-z&]/
      if idx < halfway
        # Junk near the front: forget what came before it.
        kept = []
        next
      end
      stop_after_this = true
    end

    trailing_suffix = tok =~ /^(jr|sr|ph\.?d|m\.?d|esq)\.?\,?$/i && kept.last =~ /\,$/
    next if trailing_suffix || tok =~ /^[IVX][IVX]+\.?\,?$/

    kept << tok
    break if stop_after_this
  end
  kept
end