Module: MathMetadata

Defined in: lib/math_metadata_lookup/lookup.rb,
lib/math_metadata_lookup/site.rb,
lib/math_metadata_lookup/tools.rb,
lib/math_metadata_lookup/author.rb,
lib/math_metadata_lookup/entity.rb,
lib/math_metadata_lookup/result.rb,
lib/math_metadata_lookup/article.rb,
lib/math_metadata_lookup/sites/mr.rb,
lib/math_metadata_lookup/reference.rb,
lib/math_metadata_lookup/sites/zbl.rb,
lib/math_metadata_lookup/sites/dmlcz.rb,
lib/math_metadata_lookup/sites/bas-bg.rb,
lib/math_metadata_lookup/sites/cedram.rb,
lib/math_metadata_lookup/sites/numdam.rb

Overview

vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2

Defined Under Namespace

Classes: Article, Author, BasBg, CEDRAM, DMLCZ, Entity, Lookup, MR, NUMDAM, Reference, Result, Site, ZBL

Constant Summary collapse

# List that starts out empty.
# NOTE(review): presumably filled at load time by the site classes
# (MR, ZBL, ...) registering themselves -- confirm against site.rb.
SITES =

[]

# Maps the character that follows the backslash in a LaTeX accent command
# (e.g. the "'" in \'e) to the Unicode combining mark that is appended to
# the accented letter. Frozen so the shared lookup table cannot be changed
# by accident at runtime.
ACCENT_REPL = {
  "`" => "\u0300", # grave accent
  "'" => "\u0301", # acute accent
  "^" => "\u0302", # circumflex
  '"' => "\u0308", # umlaut or dieresis
  "~" => "\u0303", # tilde
  "H" => "\u030b", # long Hungarian umlaut (double acute)
  "c" => "\u0327", # cedilla
  "=" => "\u0304", # macron accent
  "." => "\u0307", # dot over the letter
  "r" => "\u030a", # ring over the letter
  "u" => "\u0306", # breve over the letter
  "v" => "\u030c"  # caron/hacek ("v") over the letter
}.freeze

Class Method Summary collapse

Class Method Details

.latex_to_utf8(s) ⇒ `Object`

# File 'lib/math_metadata_lookup/tools.rb', line 102

# Replaces LaTeX accent commands in +s+ with the composed Unicode letter.
#
# Recognised forms are \Xa, \X{a} and \X{\a}, where X is a key of
# ACCENT_REPL and a is a single ASCII letter. Commands whose accent
# character is not in ACCENT_REPL are left exactly as they were.
#
# @param s [String] text that may contain LaTeX accent commands
# @return [String] a new string with the known accents converted
def latex_to_utf8( s )
  s.gsub( /\\(.)(?:([a-zA-Z])|\{([a-zA-Z])\}|\{\\([a-zA-Z])\})/ ) do |command|
    hit = Regexp.last_match
    combining = ACCENT_REPL[hit[1]]
    # Unknown accent character: keep the original text untouched.
    next command unless combining

    # Exactly one of the three alternatives captured the base letter.
    letter = hit[2] || hit[3] || hit[4]
    Unicode.normalize_KC( letter + combining )
  end
end

.levenshtein_distance(s1, s2) ⇒ `Object`

# File 'lib/math_metadata_lookup/tools.rb', line 11

# Scores how alike two strings are, based on their Levenshtein edit distance.
#
# Despite the name, the return value is a similarity, not a distance:
# 1 - (edit distance / length of the longer string), so equal strings give
# 1.0 and strings with nothing in common give 0.0.
#
# @param s1 [String]
# @param s2 [String]
# @return [Float] similarity between 0.0 and 1.0
def levenshtein_distance( s1, s2 )
  return 1.0 if s1 == s2

  left = s1.split(//u)
  right = s2.split(//u)
  rows = left.size
  cols = right.size

  # grid[i][j] holds the edit distance between the first i characters of s1
  # and the first j characters of s2; row 0 and column 0 are the base cases.
  grid = Array.new(rows + 1) { |i| [i] + Array.new(cols, 0) }
  grid[0] = (0..cols).to_a

  1.upto(cols) do |j|
    1.upto(rows) do |i|
      grid[i][j] =
        if left[i - 1] == right[j - 1]
          grid[i - 1][j - 1]
        else
          # cheapest of deletion, insertion and substitution
          [
            grid[i - 1][j] + 1,
            grid[i][j - 1] + 1,
            grid[i - 1][j - 1] + 1
          ].min
        end
    end
  end

  1 - (grid[rows][cols].to_f / [rows, cols].max)
end

.normalize_mscs(mscs) ⇒ `Object`



46
47
48

# File 'lib/math_metadata_lookup/tools.rb', line 46

# Splits and cleans a list of classification-code strings.
#
# Each entry is split on "," and ";", anything that looks like an HTML tag
# is removed, and the first run of characters that is neither whitespace
# nor a parenthesis is kept. A piece that contains no such run produces
# nil in the result.
#
# @param mscs [Enumerable<String>] raw code strings
# @return [Array<String, nil>] one element per split piece
def normalize_mscs( mscs )
  pieces = mscs.flat_map { |entry| entry.split(/,|;/) }
  pieces.map do |piece|
    without_tags = piece.gsub(/<.*?>/,'')
    without_tags[/\s*\(?([^\s\)\(]+)\)?\s*/, 1]
  end
end

.normalize_name(name) ⇒ `Object`

# File 'lib/math_metadata_lookup/tools.rb', line 51

# Normalizes a personal name so that differently formatted spellings can be
# compared.
#
# Steps, in order: LaTeX accents are converted (latex_to_utf8), the text is
# passed through String#to_ascii, the first "Jr." is dropped, the first
# single-letter initial is dropped, and the remaining dotted initials are
# spaced out ("N.M." becomes "N. M.").
#
# NOTE(review): String#to_ascii is not core Ruby; presumably it comes from a
# transliteration gem loaded elsewhere in the project -- confirm.
#
# @param name [#to_s] the raw name; nil is treated as an empty string
# @return [String] the normalized name
def normalize_name( name )
  # only latin chars
  trans = latex_to_utf8(name.to_s)
  trans = trans.to_ascii

  # remove Jr.
  trans.sub! %r{\bjr\.(\b|$)}i, ' '

  # remove abbr.: Rakosnik, J. => Rakosnik,
  trans.sub! %r{(\W|^)\w\.}i, ' '

  # transform: Surname, N.M. => Surname, N. M.
  # Must be the destructive gsub!: plain gsub returns a new string, and the
  # result was previously thrown away, so this step never took effect.
  trans.gsub!( /([^\s,])?\.([^\s,])/, '\1. \2' )

  #MathMetadata.remove_punctuation(trans)
  trans
end

.normalize_range(range) ⇒ `Object`



41
42
43

# File 'lib/math_metadata_lookup/tools.rb', line 41

# Rewrites a page range so it uses a plain hyphen as separator.
#
# Every en dash and every pair of hyphens ("--") becomes a single "-".
#
# @param range [#to_s] the range as given; nil becomes ""
# @return [String] the range with normalized separators
def normalize_range( range )
  text = range.to_s
  text.gsub( /–|--/ ) { '-' }
end

.normalize_text(s) ⇒ `Object`

# File 'lib/math_metadata_lookup/tools.rb', line 77

# Reduces free text (for example a title) to a canonical form for comparison.
#
# Steps, in order: LaTeX accents are converted (latex_to_utf8), the text is
# passed through String#to_ascii and downcased, punctuation is removed
# (remove_punctuation), runs of non-word characters become one space, the
# words "the", "a" and "of" are dropped, and whitespace is collapsed and
# trimmed.
#
# NOTE(review): String#to_ascii is not core Ruby; presumably it comes from a
# transliteration gem loaded elsewhere in the project -- confirm.
#
# @param s [#to_s] the raw text; nil is treated as an empty string
# @return [String] the normalized text
def normalize_text( s )
  str = latex_to_utf8(s.to_s)
  str = str.to_ascii.downcase
  str = remove_punctuation(str)
  str = str.gsub(%r{\W+}, ' ')
  # The result has to be assigned: this used to be a bare, non-destructive
  # gsub whose return value was discarded, so no stop word was ever removed.
  str = str.gsub(%r{(?: the| a| of|^a|^the|^of)\s+}i, ' ')
  str.gsub(%r{\s+}, ' ').strip
end

.remove_punctuation(s) ⇒ `Object`

# File 'lib/math_metadata_lookup/tools.rb', line 70

# Strips dots and commas that end a token.
#
# A run of "." and "," is removed when it follows a word character or a
# whitespace character and is itself followed by a space or the end of a
# line. Dots inside a token (as in "3.14") are kept. The result is trimmed.
#
# @param s [String] the text to clean
# @return [String] a new string without the trailing punctuation
def remove_punctuation( s )
  after_words = s.gsub( %r{(\w)[.,]+( |$)}i, '\1 ' )
  after_gaps = after_words.gsub( %r{(\s)[.,]+( |$)}i, '\1 ' )
  after_gaps.strip
end

Module: MathMetadata

Overview

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.latex_to_utf8(s) ⇒ Object

.levenshtein_distance(s1, s2) ⇒ Object

.normalize_mscs(mscs) ⇒ Object

.normalize_name(name) ⇒ Object

.normalize_range(range) ⇒ Object

.normalize_text(s) ⇒ Object

.remove_punctuation(s) ⇒ Object

.latex_to_utf8(s) ⇒ `Object`

.levenshtein_distance(s1, s2) ⇒ `Object`

.normalize_mscs(mscs) ⇒ `Object`

.normalize_name(name) ⇒ `Object`

.normalize_range(range) ⇒ `Object`

.normalize_text(s) ⇒ `Object`

.remove_punctuation(s) ⇒ `Object`