Class: Scylla::Generator

Inherits:
Object
Defined in:
lib/scylla/generator.rb

Constant Summary

NONLATIN =
["bg","ar","ru","zh","ja","he","kn","ko","mr","hi","th","fa","el","uk"]

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false, delimiter = "[[classifier_delimiter]]") ⇒ Generator

dirtext: The location of the source training text files
minsize: The minimum size of the ngrams that you would like to store



# File 'lib/scylla/generator.rb', line 12

def initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false, delimiter = "[[classifier_delimiter]]")
  @dirtext = dirtext
  @dirlm   = dirlm
  @minsize = minsize
  @delimiter = delimiter
end
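
A minimal construction sketch (the directory paths and the require line are assumptions, not part of the gem's documentation; with no arguments the DEFAULT_SOURCE_DIR and DEFAULT_TARGET_DIR constants are used):

require 'scylla'

# Read training text from ./corpus, write .lm files to ./lms,
# and keep only ngrams longer than one character (minsize = 1).
generator = Scylla::Generator.new("./corpus", "./lms", 1)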

Instance Attribute Details

#delimiter ⇒ Object

Returns the value of attribute delimiter.



# File 'lib/scylla/generator.rb', line 8

def delimiter
  @delimiter
end

#dirlm ⇒ Object

Returns the value of attribute dirlm.



# File 'lib/scylla/generator.rb', line 8

def dirlm
  @dirlm
end

#dirtext ⇒ Object

Returns the value of attribute dirtext.



# File 'lib/scylla/generator.rb', line 8

def dirtext
  @dirtext
end

#minsize ⇒ Object

Returns the value of attribute minsize.



# File 'lib/scylla/generator.rb', line 8

def minsize
  @minsize
end

Instance Method Details

#clean(string) ⇒ Object



# File 'lib/scylla/generator.rb', line 83

def clean(string)
  delimit = string.index(@delimiter)
  string = string[0, delimit] if delimit
  string = Sanitize.clean(string)
  string = CGI.unescapeHTML(string)
  string = Unicode::downcase(string)
  string.gsub!(/(?:http|https):\/\/[a-z0-9]+(?:[\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(?:(?::[0-9]{1,5})?\/[^\s]*)?/, "")
  string.gsub!(/[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}/, "")
  string.gsub!(/[\*\^><!\"#\$%&\'\(\)\*\+:;,._\/=\?@\{\}\[\]|\-\n\r0-9]/," ")
  latin, nonlatin = string.scan(/[a-z]/), string.scan(/[\p{L}&&[^a-z]]/)
  string.gsub!(/[a-zA-Z]/, "") if !latin.empty? && !nonlatin.empty? && nonlatin.size/(latin.size*1.0) > 0.5
  string.strip.split(" ").join(" ")
end
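
A usage sketch of the cleaning pipeline: clean cuts the string at the delimiter, strips HTML, URLs, e-mail addresses, digits and most punctuation, downcases the result and collapses whitespace. The output shown is illustrative, since the exact result depends on Sanitize's behaviour.

require 'scylla'

generator = Scylla::Generator.new
generator.clean("Visit <b>http://example.com</b> or email me@example.com!")
# => "visit or email"   (illustrative)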

#create_lm(input, frequencies = false) ⇒ Object

Creates a language map for a given input string. The frequencies flag specifies whether the method should return the frequencies of the ngrams or simply an array of ngrams in sorted order.



# File 'lib/scylla/generator.rb', line 100

def create_lm(input, frequencies = false)
  input = clean(input)
  ngram = Hash.new
  input.split(/[\d\s\[\]]/).each do |word|
    word = "_" + word + "_";
    len = word.size
    for i in 0..word.size
      for j in (1..3)
        next unless word[i,j]
        ngram[word[i,j]] ||= 0
        ngram[word[i,j]] += 1 if (len > (j - 1))
      end
      len = len - 1
    end
  end
  ngram.each_key do |key|
    ngram.delete(key) if key.size <= @minsize
  end
  ngram = ngram.sort {|a,b| b[1] <=> a[1]}
  return ngram if frequencies
  sorted = []
  ngram.each {|key| sorted << key[0]}
  return sorted
end
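
A usage sketch (the require line is an assumption; output is truncated):

require 'scylla'

generator = Scylla::Generator.new
# With frequencies: sorted [ngram, count] pairs, most frequent first.
generator.create_lm("hello world", true)   # => [["_", 4], ["l", 3], ["o", 2], ...]
# Without frequencies: just the ngrams, in the same order.
generator.create_lm("hello world")         # => ["_", "l", "o", ...]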

#get_wiki(locale, article) ⇒ Object



# File 'lib/scylla/generator.rb', line 47

def get_wiki(locale,article)
  Wikipedia.Configure {
    domain "#{locale}.wikipedia.org"
    path   'w/api.php'
  }
  p article
  page = Wikipedia.find( article )
  value = page.raw_data['query']['pages'].values.first['revisions'].first.fetch('*')
  value = value.force_encoding("UTF-8").chars.select {|c| c.valid_encoding?}.join
  value = value.gsub(/\{\{(.*?)\}\}/,"")
  value = value.gsub(/\[\[(.+?)\]\]/m,"")
  value = value.gsub(/\{\{(.+?)\}\}/m,"")
  value = value.gsub(/\{(.+?)\}/m,"")
  value = value.gsub(/\[(.+?)\]/m,"")
  value = Sanitize.clean(value)
  value = value.gsub(/[a-zA-Z]/,"") if NONLATIN.include?(locale)
  clean(value)
end

#get_wikis ⇒ Object



# File 'lib/scylla/generator.rb', line 36

def get_wikis
  require 'wikipedia'
  locales = Scylla::Resources.locales
  locales.each do |key, value|
    text = get_wiki(value[0],value[1])
    textname = File.join(@dirtext, "#{key}.txt")
    File.delete(textname) if File.exists?(textname)
    File.open(textname, 'w') { |f| f.write(text) }
  end
end

#train ⇒ Object

Loads all the .txt files in the specified source training text folder and creates language maps using ngram frequencies. The maps are stored in lib/scylla/lms as .lm files.



# File 'lib/scylla/generator.rb', line 22

def train
  languages = Dir.glob(@dirlm + "/*.lm")
  languages.each {|l| File.delete(l) }
  locales = Scylla::Resources.locales
  get_wikis
  locales.each do |key, value|
    path = File.join(@dirtext, "#{key}.txt")
    text = ""
    File.open(path).each { |line| text += " " + line }
    write_lm(text, key)
    File.delete(path)
  end
end
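
A usage sketch (directories are hypothetical; train downloads a fresh Wikipedia article per locale via get_wikis, so it needs network access):

require 'scylla'

generator = Scylla::Generator.new("./corpus", "./lms")
# Deletes any existing .lm files in ./lms, downloads one article per
# configured locale into ./corpus, and regenerates a .lm file per language.
generator.train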

#write_lm(text, language) ⇒ Object

Builds a language map from the given training text and writes it as a .lm file for the given language in lib/scylla/lms



# File 'lib/scylla/generator.rb', line 68

def write_lm(text, language)
  p "Creating language map for #{language}"
  lm = create_lm(text, true)
  lmname = File.join(@dirlm, "#{language}.lm")
  File.delete(lmname) if File.exists?(lmname)
  File.open(lmname, 'w') do |f|
    i = 0
    lm.each do |freq|
      break if i == 400
      f.write(freq[0] + "\t" + freq[1].to_s + "\n")
      i += 1
    end
  end
end
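
A usage sketch (paths and the language name are hypothetical):

require 'scylla'

generator = Scylla::Generator.new("./corpus", "./lms")
text = File.read("./corpus/english.txt")
# Writes ./lms/english.lm: up to 400 lines of "ngram<TAB>count",
# most frequent ngram first.
generator.write_lm(text, "english")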