Class: Scylla::Generator

Inherits:
Object
Defined in:
lib/scylla/generator.rb

Constant Summary

NONLATIN =
["bg","ar","ru","zh","ja","he","kn","ko","mr","hi","th","fa","el","uk"]

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false, delimiter = "[[classifier_delimiter]]") ⇒ Generator

dirtext: The location of the source training text files
minsize: The minimum size of the ngrams that you would like to store



# File 'lib/scylla/generator.rb', line 12

def initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false, delimiter = "[[classifier_delimiter]]")
  @dirtext = dirtext
  @dirlm   = dirlm
  @minsize = minsize
  @delimiter = delimiter
end
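
A minimal construction sketch (the directory paths and the require line are assumptions, not part of the gem's documentation; with no arguments the DEFAULT_SOURCE_DIR and DEFAULT_TARGET_DIR constants are used):

require 'scylla'

# Read training text from ./corpus, write .lm files to ./lms,
# and keep only ngrams longer than one character (minsize = 1).
generator = Scylla::Generator.new("./corpus", "./lms", 1)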

Instance Attribute Details

#delimiter ⇒ Object

Returns the value of attribute delimiter.



# File 'lib/scylla/generator.rb', line 8

def delimiter
  @delimiter
end

#dirlm ⇒ Object

Returns the value of attribute dirlm.



# File 'lib/scylla/generator.rb', line 8

def dirlm
  @dirlm
end

#dirtext ⇒ Object

Returns the value of attribute dirtext.



# File 'lib/scylla/generator.rb', line 8

def dirtext
  @dirtext
end

#minsize ⇒ Object

Returns the value of attribute minsize.



# File 'lib/scylla/generator.rb', line 8

def minsize
  @minsize
end

Instance Method Details

#clean(string) ⇒ Object



# File 'lib/scylla/generator.rb', line 83

def clean(string)
  delimit = string.index(@delimiter)
  string = string[0, delimit] if delimit
  string = Sanitize.clean(string)
  string = CGI.unescapeHTML(string)
  string = Unicode::downcase(string)
  string.gsub!(/(?:http|https):\/\/[a-z0-9]+(?:[\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(?:(?::[0-9]{1,5})?\/[^\s]*)?/, "")
  string.gsub!(/[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}/, "")
  string.gsub!(/[\*\^><!\"#\$%&\'\(\)\*\+:;,._\/=\?@\{\}\[\]|\-\n\r0-9]/," ")
  latin, nonlatin = string.scan(/[a-z]/), string.scan(/[\p{L}&&[^a-z]]/)
  string.gsub!(/[a-zA-Z]/, "") if !latin.empty? && !nonlatin.empty? && nonlatin.size/(latin.size*1.0) > 0.5
  string.strip.split(" ").join(" ")
end
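
A usage sketch of the cleaning pipeline: clean cuts the string at the delimiter, strips HTML, URLs, e-mail addresses, digits and most punctuation, downcases the result and collapses whitespace. The output shown is illustrative, since the exact result depends on Sanitize's behaviour.

require 'scylla'

generator = Scylla::Generator.new
generator.clean("Visit <b>http://example.com</b> or email me@example.com!")
# => "visit or email"   (illustrative)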

#create_lm(input, frequencies = false) ⇒ Object

Creates a language map for a given input string. The frequencies flag specifies whether the method should return the frequencies of the ngrams or simply an array of ngrams in sorted order.



# File 'lib/scylla/generator.rb', line 100

def create_lm(input, frequencies = false)
  input = clean(input)
  ngram = Hash.new
  input.split(/[\d\s\[\]]/).each do |word|
    word = "_" + word + "_";
    len = word.size
    for i in 0..word.size
      for j in (1..3)
        next unless word[i,j]
        ngram[word[i,j]] ||= 0
        ngram[word[i,j]] += 1 if (len > (j - 1))
      end
      len = len - 1
    end
  end
  ngram.each_key do |key|
    ngram.delete(key) if key.size <= @minsize
  end
  ngram = ngram.sort {|a,b| b[1] <=> a[1]}
  return ngram if frequencies
  sorted = []
  ngram.each {|key| sorted << key[0]}
  return sorted
end
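
A usage sketch (the require line is an assumption; output is truncated):

require 'scylla'

generator = Scylla::Generator.new
# With frequencies: sorted [ngram, count] pairs, most frequent first.
generator.create_lm("hello world", true)   # => [["_", 4], ["l", 3], ["o", 2], ...]
# Without frequencies: just the ngrams, in the same order.
generator.create_lm("hello world")         # => ["_", "l", "o", ...]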

#get_wiki(locale, article) ⇒ Object



# File 'lib/scylla/generator.rb', line 47

def get_wiki(locale,article)
  Wikipedia.Configure {
    domain "#{locale}.wikipedia.org"
    path   'w/api.php'
  }
  p article
  page = Wikipedia.find( article )
  value = page.raw_data['query']['pages'].values.first['revisions'].first.fetch('*')
  value = value.force_encoding("UTF-8").chars.select {|c| c.valid_encoding?}.join
  value = value.gsub(/\{\{(.*?)\}\}/,"")
  value = value.gsub(/\[\[(.+?)\]\]/m,"")
  value = value.gsub(/\{\{(.+?)\}\}/m,"")
  value = value.gsub(/\{(.+?)\}/m,"")
  value = value.gsub(/\[(.+?)\]/m,"")
  value = Sanitize.clean(value)
  value = value.gsub(/[a-zA-Z]/,"") if NONLATIN.include?(locale)
  clean(value)
end

#get_wikis ⇒ Object



# File 'lib/scylla/generator.rb', line 36

def get_wikis
  require 'wikipedia'
  locales = Scylla::Resources.locales
  locales.each do |key, value|
    text = get_wiki(value[0],value[1])
    textname = File.join(@dirtext, "#{key}.txt")
    File.delete(textname) if File.exists?(textname)
    File.open(textname, 'w') { |f| f.write(text) }
  end
end

#train ⇒ Object

Loads all the .txt files in the specified source training text folder and creates language maps using ngram frequencies. The maps are stored in lib/scylla/lms as .lm files.



# File 'lib/scylla/generator.rb', line 22

def train
  languages = Dir.glob(@dirlm + "/*.lm")
  languages.each {|l| File.delete(l) }
  locales = Scylla::Resources.locales
  get_wikis
  locales.each do |key, value|
    path = File.join(@dirtext, "#{key}.txt")
    text = ""
    File.open(path).each { |line| text += " " + line }
    write_lm(text, key)
    File.delete(path)
  end
end
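
A usage sketch (directories are hypothetical; train downloads a fresh Wikipedia article per locale via get_wikis, so it needs network access):

require 'scylla'

generator = Scylla::Generator.new("./corpus", "./lms")
# Deletes any existing .lm files in ./lms, downloads one article per
# configured locale into ./corpus, and regenerates a .lm file per language.
generator.train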

#write_lm(text, language) ⇒ Object

Builds a language map from the given training text and writes it as a .lm file for the given language in lib/scylla/lms



# File 'lib/scylla/generator.rb', line 68

def write_lm(text, language)
  p "Creating language map for #{language}"
  lm = create_lm(text, true)
  lmname = File.join(@dirlm, "#{language}.lm")
  File.delete(lmname) if File.exists?(lmname)
  File.open(lmname, 'w') do |f|
    i = 0
    lm.each do |freq|
      break if i == 400
      f.write(freq[0] + "\t" + freq[1].to_s + "\n")
      i += 1
    end
  end
end
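
A usage sketch (paths and the language name are hypothetical):

require 'scylla'

generator = Scylla::Generator.new("./corpus", "./lms")
text = File.read("./corpus/english.txt")
# Writes ./lms/english.lm: up to 400 lines of "ngram<TAB>count",
# most frequent ngram first.
generator.write_lm(text, "english")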