Class: Scylla::Generator
- Inherits:
-
Object
- Object
- Scylla::Generator
- Defined in:
- lib/scylla/generator.rb
Constant Summary collapse
- NONLATIN =
["bg","ar","ru","zh","ja","he","kn","ko","mr","hi","th","fa","el","uk"]
Instance Attribute Summary collapse
-
#delimiter ⇒ Object
Returns the value of attribute delimiter.
-
#dirlm ⇒ Object
Returns the value of attribute dirlm.
-
#dirtext ⇒ Object
Returns the value of attribute dirtext.
-
#minsize ⇒ Object
Returns the value of attribute minsize.
Instance Method Summary collapse
- #clean(string) ⇒ Object
-
#create_lm(input, frequencies = false) ⇒ Object
Creates a language map for a given input string.
- #get_wiki(locale, article) ⇒ Object
- #get_wikis ⇒ Object
-
#initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false, delimiter = "[[classifier_delimiter]]") ⇒ Generator
constructor
dirtext: The location of the source training text files. minsize: The minimum size of the ngrams that you would like to store.
-
#train ⇒ Object
Loads all the .txt files in the specified source training text folder and creates language maps using ngram frequencies.
-
#write_lm(text, language) ⇒ Object
Builds a language map from the given text and writes it to a .lm file named after the language in the dirlm directory.
Constructor Details
#initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false, delimiter = "[[classifier_delimiter]]") ⇒ Generator
dirtext: The location of the source training text files. minsize: The minimum size of the ngrams that you would like to store.
12 13 14 15 16 17 |
# File 'lib/scylla/generator.rb', line 12
# Builds a generator and records its configuration in instance variables.
#
# dirtext::   directory holding the source training .txt files
# dirlm::     directory whose .lm files are deleted by train and written by write_lm
# minsize::   n-grams whose length is <= this value are dropped by create_lm
# silent::    accepted for interface compatibility; this method neither stores nor uses it
# delimiter:: marker string; clean discards everything from its first occurrence onwards
def initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false, delimiter = "[[classifier_delimiter]]")
  @dirtext, @dirlm, @minsize, @delimiter = dirtext, dirlm, minsize, delimiter
end
Instance Attribute Details
#delimiter ⇒ Object
Returns the value of attribute delimiter.
8 9 10 |
# File 'lib/scylla/generator.rb', line 8 def delimiter @delimiter end |
#dirlm ⇒ Object
Returns the value of attribute dirlm.
8 9 10 |
# File 'lib/scylla/generator.rb', line 8 def dirlm @dirlm end |
#dirtext ⇒ Object
Returns the value of attribute dirtext.
8 9 10 |
# File 'lib/scylla/generator.rb', line 8 def dirtext @dirtext end |
#minsize ⇒ Object
Returns the value of attribute minsize.
8 9 10 |
# File 'lib/scylla/generator.rb', line 8 def minsize @minsize end |
Instance Method Details
#clean(string) ⇒ Object
83 84 85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/scylla/generator.rb', line 83
# Normalises raw text before n-gram extraction and returns the cleaned string.
#
# Steps, in order:
# 1. truncate at the first occurrence of @delimiter, if present
# 2. pass through Sanitize.clean, CGI.unescapeHTML and Unicode::downcase
# 3. remove URLs and e-mail addresses
# 4. replace punctuation, digits and line breaks with spaces
# 5. when the text mixes a-z letters with other letters and the other letters
#    number more than half the a-z count, remove the ASCII letters entirely
# 6. collapse runs of spaces and trim both ends
def clean(string)
  cut = string.index(@delimiter)
  string = string[0, cut] if cut

  string = Unicode::downcase(CGI.unescapeHTML(Sanitize.clean(string)))

  string.gsub!(/(?:http|https):\/\/[a-z0-9]+(?:[\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(?:(?::[0-9]{1,5})?\/[^\s]*)?/, "")
  string.gsub!(/[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,4}/, "")
  string.gsub!(/[\*\^><!\"#\$%&\'\(\)\*\+:;,._\/=\?@\{\}\[\]|\-\n\r0-9]/," ")

  latin = string.scan(/[a-z]/)
  nonlatin = string.scan(/[\p{L}&&[^a-z]]/)
  unless latin.empty? || nonlatin.empty?
    # Ratio of other-script letters to a-z letters decides whether a-z is noise.
    string.gsub!(/[a-zA-Z]/, "") if nonlatin.size/(latin.size*1.0) > 0.5
  end

  string.strip.split(" ").join(" ")
end
#create_lm(input, frequencies = false) ⇒ Object
Creates a language map for a given input string. The frequencies boolean specifies whether the method should return the ngrams together with their frequencies, or simply an array of ngrams in sorted order.
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
# File 'lib/scylla/generator.rb', line 100
# Creates a language map (n-gram profile) for the given input string.
#
# The input is passed through clean, split on digits, whitespace and square
# brackets, and every resulting word is padded with "_" on both sides. All
# substrings of length 1 to 3 of each padded word are counted.
#
# input::       raw text to profile
# frequencies:: when truthy, returns [ngram, count] pairs; otherwise returns
#               only the n-grams
#
# Both forms are ordered by descending count. N-grams whose length is
# <= @minsize are discarded.
def create_lm(input, frequencies = false)
  counts = Hash.new(0)

  clean(input).split(/[\d\s\[\]]/).each do |token|
    padded = "_#{token}_"
    (0...padded.size).each do |start|
      (1..3).each do |width|
        # Only count windows that fit entirely inside the padded word.
        break if start + width > padded.size
        counts[padded[start, width]] += 1
      end
    end
  end

  # reject! filters in one pass instead of deleting keys while iterating.
  counts.reject! { |gram, _count| gram.size <= @minsize }

  # Same comparator as before, so ordering among equal counts is unchanged.
  ranked = counts.sort { |a, b| b[1] <=> a[1] }
  frequencies ? ranked : ranked.map { |pair| pair[0] }
end
#get_wiki(locale, article) ⇒ Object
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/scylla/generator.rb', line 47
# Fetches one Wikipedia article and returns its text prepared for training.
#
# locale::  subdomain used to build "<locale>.wikipedia.org"
# article:: article title handed to Wikipedia.find
#
# The first revision's raw content is forced to UTF-8 with invalid characters
# dropped, brace- and bracket-delimited spans are removed, and the result goes
# through Sanitize.clean. For locales listed in NONLATIN all ASCII letters are
# removed as well. The final value is whatever clean returns for that text.
def get_wiki(locale,article)
  Wikipedia.Configure {
    domain "#{locale}.wikipedia.org"
    path 'w/api.php'
  }
  p article

  page = Wikipedia.find( article )
  revision = page.raw_data['query']['pages'].values.first['revisions'].first
  text = revision.fetch('*').force_encoding("UTF-8")
  text = text.chars.select(&:valid_encoding?).join

  # Applied strictly in this order; later patterns rely on earlier removals.
  markup = [
    /\{\{(.*?)\}\}/,
    /\[\[(.+?)\]\]/m,
    /\{\{(.+?)\}\}/m,
    /\{(.+?)\}/m,
    /\[(.+?)\]/m
  ]
  markup.each { |pattern| text = text.gsub(pattern,"") }

  text = Sanitize.clean(text)
  text = text.gsub(/[a-zA-Z]/,"") if NONLATIN.include?(locale)
  clean(text)
end
#get_wikis ⇒ Object
36 37 38 39 40 41 42 43 44 45 |
# File 'lib/scylla/generator.rb', line 36
# Fetches one training text per locale and stores it in @dirtext.
#
# For every entry of Scylla::Resources.locales, value[0] and value[1] are
# passed to get_wiki as (locale, article), and the returned text is written
# to "<key>.txt" inside @dirtext, replacing any existing file of that name.
def get_wikis
  require 'wikipedia'
  Scylla::Resources.locales.each do |key, value|
    text = get_wiki(value[0], value[1])
    textname = File.join(@dirtext, "#{key}.txt")
    # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
    File.delete(textname) if File.exist?(textname)
    File.open(textname, 'w') { |f| f.write(text) }
  end
end
#train ⇒ Object
Loads all the .txt files in the specified source training text folder
and creates language maps using ngram frequencies. The maps are stored in lib/scylla/lms as .lm files
22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/scylla/generator.rb', line 22
# Rebuilds every language map from freshly downloaded training texts.
#
# Deletes all existing .lm files in @dirlm, calls get_wikis to populate
# @dirtext, then for each key of Scylla::Resources.locales reads
# "<key>.txt", passes its content to write_lm and deletes the text file.
def train
  Dir.glob(@dirlm + "/*.lm").each { |stale| File.delete(stale) }

  locales = Scylla::Resources.locales
  get_wikis

  locales.each do |key, _value|
    path = File.join(@dirtext, "#{key}.txt")
    # Each line is prefixed with a space, matching the previous output format.
    # File.foreach closes the file before it is deleted below, and << appends
    # in place instead of allocating a new string per line.
    text = "".dup
    File.foreach(path) { |line| text << " " << line }
    write_lm(text, key)
    File.delete(path)
  end
end
#write_lm(text, language) ⇒ Object
Builds a language map from the given text and writes it to a .lm file named after the language in the dirlm directory.
68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/scylla/generator.rb', line 68
# Builds the language map for +text+ and writes it to "<language>.lm" in @dirlm.
#
# text::     training text, handed to create_lm with frequencies enabled
# language:: name used for the progress message and the output file name
#
# At most the first 400 entries returned by create_lm are written, one per
# line, as "<ngram>\t<count>". An existing file of the same name is replaced.
def write_lm(text, language)
  p "Creating language map for #{language}"
  lm = create_lm(text, true)
  lmname = File.join(@dirlm, "#{language}.lm")
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  File.delete(lmname) if File.exist?(lmname)
  File.open(lmname, 'w') do |f|
    lm.first(400).each { |gram, count| f.write("#{gram}\t#{count}\n") }
  end
end