Module: Excite::Postprocessor

Included in: CRFParser

Defined in: lib/excite/postprocessor.rb

Instance Method Summary collapse

#join_multi_word_names(author_text) ⇒ Object

Insert underscores to join name particles.
#method_missing(m, *args, &block) ⇒ Object
#normalize(key, hsh) ⇒ Object

Default normalization function for all fields that do not have their own normalization. Strips any leading and/or trailing punctuation and space.
#normalize_author(hsh) ⇒ Object

Tries to split the author tokens into individual author names and then normalizes these names individually.
#normalize_author_name(auth_toks) ⇒ Object

Tries to normalize an individual author name into the form “First Middle Last”, without punctuation.
#normalize_date(hsh) ⇒ Object
#normalize_fields(citation_hsh) ⇒ Object
#normalize_pages(hsh) ⇒ Object

Normalizes page fields into the form “start--end” (two ASCII hyphens).
#normalize_title(hsh) ⇒ Object

Strips leading numerals. If the real title is quoted inside this string, tries to extract it. If the title has at least 2 words before a newline, a period or an open paren, strips everything after. TODO: could do better with knowledge of prepositions, names - maybe we just need a second model?
#normalize_volume(hsh) ⇒ Object
#pairable_quote_chars(quote_char) ⇒ Object
#repair_and_tokenize_author_text(author_text) ⇒ Object

Dynamic Method Handling

This module handles dynamic methods through the method_missing method.

#method_missing(m, *args, &block) ⇒ `Object`

# File 'lib/excite/postprocessor.rb', line 12

# Routes calls to undefined +normalize_<field>+ methods to the generic
# #normalize, so fields without a dedicated normalizer still get cleaned up.
#
# Only names of the exact form +normalize_<field>+ are intercepted. Other
# names that merely start with "normalize" (for example +normalizer+) used to
# be forwarded to #normalize with a nil key; they now go to +super+ and raise
# NoMethodError like any other unknown method.
#
# @param m [Symbol] name of the missing method
# @param args [Array] arguments of the original call, passed on after the field name
# @param block [Proc, nil] block of the original call (not passed to #normalize)
# @return [Object] whatever #normalize returns
# @raise [NoMethodError] when +m+ is not a +normalize_<field>+ name
def method_missing(m, *args, &block)
  if m.to_s =~ /\Anormalize_(.*)\z/m
    normalize($1, *args)
  else
    super
  end
end

# Keeps respond_to? in agreement with the dynamic dispatch done by
# #method_missing: every +normalize_<field>+ name is reported as supported.
def respond_to_missing?(m, include_private = false)
  m.to_s.start_with?('normalize_') || super
end

Instance Method Details

#join_multi_word_names(author_text) ⇒ `Object`

Insert underscores to join name particles, e.g. Jon de Groote → Jon de_Groote

# File 'lib/excite/postprocessor.rb', line 217

# Glues a name particle (van, von, der, den, de, di, le, el; any letter case)
# to the word that follows it by replacing the single whitespace character
# after the particle with an underscore: "Jon de Groote" -> "Jon de_Groote".
#
# Returns a new String; +author_text+ itself is left untouched.
def join_multi_word_names(author_text)
  particle_then_space = /\b((?:van|von|der|den|de|di|le|el))\s/i
  author_text.gsub(particle_then_space, '\1_')
end

#normalize(key, hsh) ⇒ `Object`

Default normalization function for all fields that do not have their own normalization. Strips any leading and/or trailing punctuation and space.

# File 'lib/excite/postprocessor.rb', line 25

# Default normalizer for fields without a dedicated normalize_<field> method.
# Removes every leading and trailing non-alphanumeric character (punctuation,
# whitespace, ...) from hsh[key], modifying the stored String in place.
#
# The patterns are anchored with \A and \z: in Ruby ^ and $ match at every
# line boundary, which used to strip punctuation around embedded newlines too.
# Values that are nil or not Strings (e.g. an Integer year) are left alone
# instead of raising NoMethodError.
#
# @param key [String] field name to clean
# @param hsh [Hash] citation hash holding the field
# @return [String, nil] the stored String when trailing characters were
#   removed, nil otherwise; callers should not rely on this value
def normalize(key, hsh)
  value = hsh[key]
  return nil unless value.respond_to?(:sub!)

  value.sub!(/\A[^[:alnum:]]+/, '')
  value.sub!(/[^[:alnum:]]+\z/, '')
end

#normalize_author(hsh) ⇒ `Object`

Tries to split the author tokens into individual author names and then normalizes these names individually. Returns a list of author names.

# File 'lib/excite/postprocessor.rb', line 79

# Splits hsh['author'] into individual author names and stores the normalized
# names under hsh['authors'] (the key is only set when at least one name was
# found). hsh['author'] itself is then cleaned with #normalize.
#
# Splitting rules, applied to the tokens from #repair_and_tokenize_author_text:
# * "&" / "and" (any case) ends the current author;
# * a token ending in "," ends the current author, unless it is the first
#   token of that author (so "Smith, J.," keeps "Smith," and "J.," together).
#
# Every collected name is stripped and dropped when it is blank or a lone "-".
# This filter used to apply only to the final author and relied on
# ActiveSupport's String#blank?; it is now plain Ruby and applied uniformly.
# A nil author value is ignored instead of raising.
#
# @param hsh [Hash] citation hash; reads 'author', may write 'authors'
# @return [Hash] the same hash
def normalize_author(hsh)
  str = hsh['author']
  return hsh if str.nil?

  authors = []
  current_auth = []

  # Turns the tokens gathered so far into one name, keeps it when it is
  # meaningful, and starts collecting a fresh author.
  flush = lambda do
    unless current_auth.empty?
      name = normalize_author_name(current_auth).strip
      authors << name unless name == '-' || name !~ /[^[:space:]]/
    end
    current_auth = []
  end

  repair_and_tokenize_author_text(str).each do |tok|
    if tok =~ /^(&|and)$/i
      flush.call
    else
      starts_name = current_auth.empty?
      current_auth << tok
      # A trailing comma on the first token separates surname from initials,
      # not one author from the next.
      flush.call if !starts_name && tok =~ /,$/
    end
  end
  flush.call

  hsh['authors'] = authors unless authors.empty?
  normalize('author', hsh)
  hsh
end

#normalize_author_name(auth_toks) ⇒ `Object`

Tries to normalize an individual author name into the form “First Middle Last”, without punctuation.

# File 'lib/excite/postprocessor.rb', line 227

# Normalizes the tokens of one author into "First Middle Last" form without
# punctuation.
#
# Steps:
# 1. join the tokens with spaces and drop the last comma that has text on
#    both sides ("Smith, J." -> "Smith J.");
# 2. turn ".-" into "-", replace remaining commas/periods with spaces and
#    collapse runs of spaces;
# 3. if the result is a multi-character word followed only by single
#    characters or X-Y initial pairs ("Smith J", "Sartre J-P"), move that
#    first word to the end ("J Smith");
# 4. strip non-alphanumeric characters from both ends.
#
# Step 4 uses the Unicode-aware [[:alnum:]] class (as #normalize does). The
# previous ASCII-only class removed accented letters at the edges of a name,
# e.g. "Émile Zola" became "mile Zola".
#
# @param auth_toks [Array<String>] tokens belonging to a single author
# @return [String] the normalized name, '' for an empty token list
def normalize_author_name(auth_toks)
  return '' if auth_toks.empty?

  str = auth_toks.join(' ')
  str = "#{$1} #{$2}" if str =~ /(.+),\s*(.+)/
  str.gsub!(/\.\-/, '-')
  str.gsub!(/[\,\.]/, ' ')
  str.gsub!(/  +/, ' ')
  str.strip!

  # "Surname I I" pattern: rotate the surname behind the initials.
  if str =~ /^[^\s][^\s]+(\s+[^\s]|\s+[^\s]\-[^\s])+$/
    surname, *initials = str.split(/\s+/)
    str = (initials << surname).join(' ')
  end

  str.sub!(/\A[^[:alnum:]]+/, '')
  str.sub!(/[^[:alnum:]]+\z/, '')
  str
end

#normalize_date(hsh) ⇒ `Object`

# File 'lib/excite/postprocessor.rb', line 121

# Reduces hsh['date'] to a year.
#
# The first run of four digits in the date is read as a year. It is accepted
# when it is no more than three years past the current year; an accepted year
# is stored as an Integer under both 'year' and 'date'. When no acceptable
# year is found, 'date' is set to nil and 'year' is not touched.
#
# @param hsh [Hash] citation hash; reads 'date', writes 'date' and maybe 'year'
# @return [Hash] the same hash
def normalize_date(hsh)
  accepted = nil
  if hsh['date'] =~ /(\d{4})/
    candidate = $1.to_i
    accepted = candidate if candidate <= Time.now.year + 3
  end

  hsh['year'] = accepted unless accepted.nil?
  hsh['date'] = accepted
  hsh
end

#normalize_fields(citation_hsh) ⇒ `Object`

# File 'lib/excite/postprocessor.rb', line 7

# Runs the normalize_<key> method for every key of the citation hash, passing
# the hash itself so each normalizer can rewrite its field.
#
# The key list is captured before the first normalizer runs, so keys added by
# a normalizer during this pass are not themselves normalized.
#
# @param citation_hsh [Hash] field name => raw value
# @return [Hash] the same hash
def normalize_fields(citation_hsh)
  field_names = citation_hsh.keys
  field_names.each do |field|
    send("normalize_#{field}", citation_hsh)
  end
  citation_hsh
end

#normalize_pages(hsh) ⇒ `Object`

Normalizes page fields into the form “start--end” (two ASCII hyphens). If the page field does not appear to be in a standard form, does nothing.

# File 'lib/excite/postprocessor.rb', line 154

# Normalizes hsh['pages'] into "start--end" (or a single page number).
#
# First, a compound "vol.issue (year):pp" value is unpacked: the volume goes
# to 'volume', the optional issue to 'number', the optional four-digit year to
# 'year', and only the part after the colon stays in 'pages'. The pattern is
# written with the /x flag, under which literal spaces in the regex are
# ignored, so optional whitespace is matched explicitly with \s*. Without it,
# "12 (1999): 33-44" was not recognised and ended up as pages "12--1999".
#
# Then the first two numbers in 'pages' become "start--end"; a value with a
# single number becomes that number. Values without digits (or a missing
# 'pages' entry) are left unchanged.
#
# @param hsh [Hash] citation hash; reads and writes 'pages', may write
#   'volume', 'number' and 'year' (all as Strings)
# @return [Hash] the same hash
def normalize_pages(hsh)
  # "vol.issue (year):pp"
  case hsh['pages']
  when /(\d+) (?: \.(\d+))? \s* (?: \( (\d\d\d\d) \))? \s* : \s* (\d.*)/x
    hsh['volume'] = $1
    hsh['number'] = $2 if $2
    hsh['year'] = $3 if $3
    hsh['pages'] = $4
  end

  case hsh['pages']
  when /(\d+)[^\d]+(\d+)/
    hsh['pages'] = "#{$1}--#{$2}"
  when /(\d+)/
    hsh['pages'] = $1
  end
  hsh
end

#normalize_title(hsh) ⇒ `Object`

Strips leading numerals. If the real title is quoted inside this string, tries to extract it. If the title has at least 2 words before a newline, a period or an open paren, strips everything after. TODO: could do better with knowledge of prepositions, names - maybe we just need a second model?

# File 'lib/excite/postprocessor.rb', line 34

# Cleans up hsh['title']: works on a stripped copy, stores the result back
# under 'title' and finally runs the generic #normalize on it.
#
# 1. Removes one leading enumerator ("12.", "iv)", "a." ...).
# 2. If the remainder opens with a quote character and holds at least two
#    quote characters of the same family, drops the opening quote and cuts
#    from a later quote of that family that still has text after it.
# 3. Repeatedly truncates titles that have at least two words followed by a
#    newline, or by a period that precedes whitespace or "(".
#
# Returns whatever normalize('title', hsh) returns, not the hash.
def normalize_title(hsh)
  str = hsh['title'].strip

  # Each pattern needs "." or ")" after the enumerator, then either whitespace
  # or (via lookahead, not consumed) a quote character or a letter. Because of
  # the /i flag the A-Z in the lookahead also accepts lower-case letters.
  numeral_regexes = [
    /^[0-9]+[.)](\s+|(?=["'”’´‘“`'A-Z]))/i,                                    # initial numbers + punctuation + space or a quote or a capital letter
    /^C{0,3}(L?X{0,3}|X[LC])(V?I{0,3}|I[VX])[.)](\s+|(?=["'”’´‘“`'A-Z]))/i,    # initial roman numerals
    /^[A-Z][.)](\s+|(?=["'”’´‘“`'A-Z]))/i                                      # initial single letter
  ]

  # Only the first pattern that actually removes something is applied
  # (gsub! returns nil when nothing matched).
  numeral_regexes.each do |regex|
    if str.gsub!(regex, '')
      break
    end
  end

  # Title starts with a quote character: try to unwrap the quoted part.
  if (m = str.match /^(["'”’´‘“`'])/)
    quote_char = m[1]
    # String of characters treated as the same quote family as quote_char.
    pairable = pairable_quote_chars(quote_char)

    # Needs at least an opening and one more quote of the family.
    if str.scan(/[#{pairable}]/).length >= 2
      str.gsub!(/^#{quote_char}/, '')
      # Removes a family quote plus the (non-empty) quote-free text after it,
      # up to the end of the line. A quote that is the very last character is
      # not matched here.
      str.gsub!(/[#{pairable}][^#{pairable}]+$/, '')
    end
  end

  # m[1] is the captured "\n" or "."; rindex finds the LAST occurrence of
  # that character in the string, which is not necessarily the one the regex
  # matched. Everything from that position on is dropped, and the loop repeats
  # until the pattern no longer matches.
  while (m = str.match /\S+\s+\S+.*(\n|\.(?=\s|\()).*/)
    i = str.rindex m[1]
    str = str[0..i-1]
  end

  hsh['title'] = str
  normalize('title',hsh)
end

#normalize_volume(hsh) ⇒ `Object`

# File 'lib/excite/postprocessor.rb', line 137

# Extracts volume (and, when present, issue number) from hsh['volume'].
#
# When the value contains two numbers separated by non-digits -- "23(2)",
# "Vol. 23, No. 3" -- the first becomes 'volume' and the second 'number'.
# With a single number, that number becomes 'volume'. Values without digits
# are left untouched. Extracted values are Strings.
#
# @param hsh [Hash] citation hash; reads and writes 'volume', may write 'number'
# @return [Hash] the same hash
def normalize_volume(hsh)
  raw = hsh['volume']

  if raw =~ /\D*(\d+)\D+(\d+)/i
    found = Regexp.last_match
    hsh['volume'] = found[1]
    hsh['number'] = found[2]
  elsif raw =~ /(\d+)/
    # Only one number: hope that it is the volume.
    hsh['volume'] = Regexp.last_match(1)
  end

  hsh
end

#pairable_quote_chars(quote_char) ⇒ `Object`

# File 'lib/excite/postprocessor.rb', line 68

# Returns the family of quote characters that may pair with +quote_char+:
# the double-quote family for a double quote, the single-quote/apostrophe/
# backtick family for one of those.
#
# A character that belongs to neither family is returned unchanged, i.e. it
# pairs only with itself. Previously the method fell off the end of +each+ in
# that case and returned the Array of both families, which is not usable where
# callers expect a String of characters.
#
# @param quote_char [String] a single quote character
# @return [String] the characters that can open or close a quote begun with +quote_char+
def pairable_quote_chars(quote_char)
  families = [%{"”“}, %{’'`‘´'}]
  families.find { |chars| chars.include?(quote_char) } || quote_char
end

#repair_and_tokenize_author_text(author_text) ⇒ `Object`

# File 'lib/excite/postprocessor.rb', line 173

# Cleans an author string and splits it into whitespace-separated tokens.
#
# NOTE: the clean-up is done with sub!/gsub!, so the String passed in is
# modified in place (the underscore-joining step works on a copy).
#
# Clean-up, in order: cut everything from "et al" onwards; drop parenthesised
# and bracketed parts, including unbalanced leftovers; turn ";" into ",";
# put a space after each comma; replace ":" with a space; delete assorted
# symbol characters; join name particles via #join_multi_word_names.
#
# Token filtering:
# * a token with no ASCII letter or "&" found in the first half of the token
#   list discards everything collected so far (itself included); found later,
#   it is kept as the final token and scanning stops;
# * suffixes such as Jr/Sr/PhD/MD/Esq directly after a token ending in ",",
#   and runs of two or more I/V/X characters, are skipped.
#
# @param author_text [String] raw author field (mutated)
# @return [Array<String>] the retained tokens
def repair_and_tokenize_author_text(author_text)
  # Repair obvious parse errors and weird notations.
  author_text.sub!(/et\.? al\.?.*$/, '')
  # FIXME: maybe I'm mis-understanding Perl regular expressions, but
  # this pattern from ParseCit appears to do the Wrong Thing:
  # author_text.sub!(/^.*?[a-zA-Z][a-zA-Z]+\. /, '')
  [
    [/\(.*?\)/, ''],
    [/^.*?\)\.?/, ''],
    [/\(.*?$/, ''],
    [/\[.*?\]/, ''],
    [/^.*?\]\.?/, ''],
    [/\[.*?$/, ''],
    [/;/, ','],
    [/,/, ', '],
    [/\:/, ' '],
    [/[\:\"\<\>\/\?\{\}\[\]\+\=\(\)\*\^\%\$\#\@\!\~\_]/, '']
  ].each do |pattern, replacement|
    author_text.gsub!(pattern, replacement)
  end

  raw_tokens = join_multi_word_names(author_text).split(/\s+/)
  midpoint = raw_tokens.length / 2
  kept = []

  raw_tokens.each_with_index do |token, index|
    unless token =~ /[A-Za-z&]/
      if index < midpoint
        # Junk near the front: forget what was gathered and move on.
        kept = []
        next
      end
      # Junk in the back half: keep it as the last token and stop.
      kept << token
      break
    end

    titled_suffix = token =~ /^(jr|sr|ph\.?d|m\.?d|esq)\.?\,?$/i && kept.last =~ /\,$/
    roman_suffix = token =~ /^[IVX][IVX]+\.?\,?$/
    kept << token unless titled_suffix || roman_suffix
  end

  kept
end

Module: Excite::Postprocessor

Instance Method Summary collapse

Dynamic Method Handling

#method_missing(m, *args, &block) ⇒ Object

Instance Method Details

#join_multi_word_names(author_text) ⇒ Object

#normalize(key, hsh) ⇒ Object

#normalize_author(hsh) ⇒ Object

#normalize_author_name(auth_toks) ⇒ Object

#normalize_date(hsh) ⇒ Object

#normalize_fields(citation_hsh) ⇒ Object

#normalize_pages(hsh) ⇒ Object

#normalize_title(hsh) ⇒ Object

#normalize_volume(hsh) ⇒ Object

#pairable_quote_chars(quote_char) ⇒ Object

#repair_and_tokenize_author_text(author_text) ⇒ Object