Class: LanguageDetector

Inherits:

Object

Object
LanguageDetector

show all

Defined in: lib/language_detector.rb

Defined Under Namespace

Classes: Profile

Class Method Summary collapse

Instance Method Summary collapse

#detect(text) ⇒ Object
#initialize(type = 'tc') ⇒ LanguageDetector constructor

Supports two n-gram databases: fm (built from scratch from the texts included with the gem) and tc (the textcat n-gram database).
#load_model(name) ⇒ Object

Constructor Details

#initialize(type = 'tc') ⇒ `LanguageDetector`

Supports two ngram databases:

fm - built from scratch texts included with gem
tc - textcat ngram database



13
14
15

# File 'lib/language_detector.rb', line 13

# Creates a detector backed by one of the bundled n-gram models.
#
# type - model suffix handed to #load_model: 'tc' (textcat n-gram
#        database, the default) or 'fm' (built from the texts included
#        with the gem).
def initialize(type = 'tc')
  loaded = load_model(type)
  @profiles = loaded
end

Class Method Details

.train_fm ⇒ `Object`

# File 'lib/language_detector.rb', line 65

# Builds the "fm" model: creates one LanguageDetector::Profile per row of
# the training table below and dumps the resulting list as YAML to
# model-fm.yml in the same directory as this source file.
#
# Each row is [language tag, training file, encoding, language name].
# Only the training file and the language name are passed to Profile.new;
# the tag and the encoding are informational.
def self.train_fm
  # For a full list of ISO 639 language tags visit:
  # http://www.loc.gov/standards/iso639-2/englangn.html
  # http://www.loc.gov/standards/iso639-2/php/English_list.php

  # LARGE profiles follow:

  # NOTE: These profiles were taken from the "World War II" node on
  # Wikipedia with the 'lang' and ?action=raw URI, which results in a UTF-8
  # encoded file. If we need to get more profile data for a language this
  # is always a good source of data.
  #
  # http://en.wikipedia.org/wiki/World_War_II
  # EU corpus: http://wt.jrc.it/lt/Acquis/
  #

  training_data = [
    [ "ar", "ar-utf8.txt", "utf8", "arabic" ],
    [ "bg", "bg-utf8.txt", "utf8", "bulgarian" ],
    [ "cs", "cs-utf8.txt", "utf8", "czech" ],
    [ "da", "da-utf8.txt", "utf8", "danish" ],
    [ "de", "de-utf8.txt", "utf8", "german" ],
    [ "el", "el-utf8.txt", "utf8", "greek" ],
    [ "en", "en-utf8.txt", "utf8", "english" ],
    [ "et", "et-utf8.txt", "utf8", "estonian" ],
    [ "es", "es-utf8.txt", "utf8", "spanish" ],
    [ "fa", "fa-utf8.txt", "utf8", "farsi" ],
    [ "fi", "fi-utf8.txt", "utf8", "finnish" ],
    [ "fr", "fr-utf8.txt", "utf8", "french" ],
    [ "ga", "ga-utf8.txt", "utf8", "irish" ],
    [ "he", "he-utf8.txt", "utf8", "hebrew" ],
    [ "hi", "hi-utf8.txt", "utf8", "hindi" ],
    [ "hr", "hr-utf8.txt", "utf8", "croatian" ],
    [ "it", "it-utf8.txt", "utf8", "italian" ],
    [ "ja", "ja-utf8.txt", "utf8", "japanese" ],
    [ "ko", "ko-utf8.txt", "utf8", "korean" ],
    [ "hu", "hu-utf8.txt", "utf8", "hungarian" ],
    # NOTE(review): "tk" is the ISO 639-1 tag for Turkmen; Turkish is "tr".
    # Left as-is because the training file name is built from it and the
    # tag itself is not used when building the profile.
    [ "tk", "tk-utf8.txt", "utf8", "turkish" ],
    [ "nl", "nl-utf8.txt", "utf8", "dutch" ],
    [ "no", "no-utf8.txt", "utf8", "norwegian" ],
    [ "pl", "pl-utf8.txt", "utf8", "polish" ],
    [ "pt", "pt-utf8.txt", "utf8", "portuguese" ],
    [ "ro", "ro-utf8.txt", "utf8", "romanian" ],
    [ "ru", "ru-utf8.txt", "utf8", "russian" ],
    [ "sl", "sl-utf8.txt", "utf8", "slovenian" ],
    [ "sv", "sv-utf8.txt", "utf8", "swedish" ],
    [ "th", "th-utf8.txt", "utf8", "thai" ],
    # Spelling corrected from "ukraninan"; this string becomes the profile
    # name, so the model has to be regenerated to pick it up.
    [ "uk", "uk-utf8.txt", "utf8", "ukrainian" ],
    [ "vi", "vi-utf8.txt", "utf8", "vietnamese" ],
    [ "zh", "zh-utf8.txt", "utf8", "chinese" ]
    # id (indonesian)
    # ku (kurdish)
    # lt (lithuanian)
    # lv (latvian)
    # mk (macedonian)
    # ms (malay)
    # sr (serbian)
    # my (burmese)
    # [ "fy", "fy-utf8.txt", "utf8", "frisian" ],
    # [ "io", "io-utf8.txt", "utf8", "ido" ],
    # [ "is", "is-utf8.txt", "utf8", "icelandic" ],
  ]

  # One profile per row, in table order.
  profiles = training_data.map do |_tag, file, _encoding, name|
    LanguageDetector::Profile.new(:name => name, :file => file)
  end

  puts 'saving model...'
  filename = File.expand_path(File.join(File.dirname(__FILE__), "model-fm.yml"))
  File.open(filename, 'w') { |f| YAML.dump(profiles, f) }
end

.train_tc ⇒ `Object`

# File 'lib/language_detector.rb', line 34

# Builds the "tc" model from the textcat n-gram files matching
# textcat_ngrams/*.lm (resolved against the current working directory)
# and dumps the resulting list of profiles as YAML to model-tc.yml in the
# same directory as this source file.
#
# For every .lm file, each line that starts with a character that is
# neither a digit nor whitespace contributes its first tab-separated field
# as an n-gram. N-grams are numbered 1, 2, 3, ... in file order, and that
# number is stored as the n-gram's value. The profile name is the part of
# the file's base name before the first '-'.
def self.train_tc
  # Sorting happens on the path without its ".lm" suffix so the profile
  # order in the saved model stays the same as before.
  languages = Dir.glob("textcat_ngrams/*.lm").map { |path| path.sub(/\.lm\z/, '') }.sort

  profiles = languages.map do |language|
    ngrams = {}
    rank = 1

    # Block form guarantees the file is closed even if reading raises.
    File.open("#{language}.lm", "r") do |lm|
      lm.each_line do |raw|
        line = raw.chomp
        next unless line =~ /^[^0-9\s]+/

        ngrams[line.split(/\t/).first] = rank
        rank += 1
      end
    end

    profile = LanguageDetector::Profile.new(:name => File.basename(language).split('-').first)
    profile.ngrams = ngrams
    profile
  end

  puts 'saving model...'
  filename = File.expand_path(File.join(File.dirname(__FILE__), "model-tc.yml"))
  File.open(filename, 'w') { |f| YAML.dump(profiles, f) }
end

Instance Method Details

#detect(text) ⇒ `Object`

# File 'lib/language_detector.rb', line 17

# Returns the name of the loaded profile closest to the given text.
#
# A profile is built from +text+ and every profile in @profiles computes
# its distance to it via #compute_distance; the profile with the smallest
# distance wins. When several profiles share the smallest distance, the
# first one in @profiles is chosen.
#
# text - the text to classify, passed to Profile.new as :text.
#
# Returns the winning profile's name, or nil when no profiles are loaded.
def detect(text)
  sample = LanguageDetector::Profile.new(:text => text)

  # min_by keeps the first minimum it sees, matching a strict "<" scan.
  closest = @profiles.min_by { |profile| profile.compute_distance(sample) }

  # Guard against an empty profile list instead of failing on nil.
  closest && closest.name
end

#load_model(name) ⇒ `Object`

# File 'lib/language_detector.rb', line 139

# Loads the profile list stored in model-<name>.yml, located in the same
# directory as this source file, stores it in @profiles and returns it.
#
# name - model suffix used to build the file name (for example 'tc' or 'fm').
#
# The model files are written with YAML.dump of Profile objects, so they
# contain Ruby object tags. Where YAML.load_file is a safe load, those tags
# are rejected; unsafe_load_file is therefore used when available.
# NOTE(review): this deserializes arbitrary objects and is only appropriate
# for the model files shipped next to this source file - confirm +name+
# never comes from untrusted input.
def load_model(name)
  filename = File.expand_path(File.join(File.dirname(__FILE__), "model-#{name}.yml"))
  @profiles =
    if YAML.respond_to?(:unsafe_load_file)
      YAML.unsafe_load_file(filename)
    else
      YAML.load_file(filename)
    end
end

Class: LanguageDetector

Defined Under Namespace

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(type = 'tc') ⇒ LanguageDetector

Class Method Details

.train_fm ⇒ Object

.train_tc ⇒ Object

Instance Method Details

#detect(text) ⇒ Object

#load_model(name) ⇒ Object

#initialize(type = 'tc') ⇒ `LanguageDetector`

.train_fm ⇒ `Object`

.train_tc ⇒ `Object`

#detect(text) ⇒ `Object`

#load_model(name) ⇒ `Object`