Class: WhatLanguage
- Inherits:
-
Object
- Object
- WhatLanguage
- Defined in:
- lib/whatlanguage.rb
Constant Summary collapse
- VERSION =
'1.0.3'
- HASHERS =
[ lambda { |item| Digest::SHA1.digest(item.downcase.strip).unpack("VV") } , lambda { |item| Digest::SHA2.digest(item.downcase.strip).unpack("xxxxVVVVVVV") } ]
- BITFIELD_WIDTH =
2_000_000
- @@data =
{}
Class Method Summary collapse
Instance Method Summary collapse
-
#initialize(options = {}) ⇒ WhatLanguage
constructor
A new instance of WhatLanguage.
- #language(text) ⇒ Object
-
#process_text(text) ⇒ Object
Very inefficient method for now..
- #reinit(options = {}) ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ WhatLanguage
Returns a new instance of WhatLanguage.
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
# File 'lib/whatlanguage.rb', line 16

# Builds a detector, loading one filter per ".lang" file into the shared
# @@data cache. Languages already present in the cache are not reloaded.
#
# options - either the symbol :large or a Hash; when it is :large or a Hash
#           with a truthy :large entry, filters are read from the "lang-lg"
#           folder, otherwise from "lang". Both folders are resolved relative
#           to the parent of this file's directory.
#
# Each language file starts with a 4-byte integer (unpacked with "I") that
# selects the entry of HASHERS to use; the remainder of the file is handed to
# BloominSimple.from_dump.
def initialize(options = {})
  large = options == :large || (options.kind_of?(Hash) && options[:large])
  folder_name = large ? "lang-lg" : "lang"
  languages_folder = File.join(File.dirname(__FILE__), "..", folder_name)

  # Guard against the shared cache having been cleared to nil.
  @@data ||= {}

  Dir.entries(languages_folder).grep(/\.lang/).each do |filename|
    # The cache key is the first run of word characters in the file name.
    key = filename[/\w+/].to_sym
    next if @@data[key]

    # Block form so the file handle is closed even if loading fails.
    File.open(File.join(languages_folder, filename), 'rb') do |lfile|
      hash_type = lfile.read(4).unpack("I")[0]
      @@data[key] = BloominSimple.from_dump(lfile.read, &HASHERS[hash_type])
    end
  end
end
Class Method Details
.filter_from_dictionary(filename, outfilename, options = {}) ⇒ Object
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
# File 'lib/whatlanguage.rb', line 66

# Builds a filter from a word list and writes it to disk.
#
# filename    - path of the dictionary file; every line is added to the filter.
# outfilename - path of the file to write (opened in binary mode).
# options     - the symbol :large or a Hash. With :large (or a Hash whose
#               :large entry is truthy) the filter size is 10 times the number
#               of lines in the dictionary and HASHERS[1] is used; otherwise
#               the size is BITFIELD_WIDTH and HASHERS[0] is used.
#
# The output starts with the hasher index packed with "I", followed by the
# filter's dump. Returns nil.
def self.filter_from_dictionary(filename, outfilename, options = {})
  # Only index into options when it is a Hash, matching the check in #initialize.
  large = options == :large || (options.kind_of?(Hash) && options[:large])

  # Block forms guarantee both files are closed, even when an error is raised.
  File.open(filename) do |infile|
    if large
      lines = 0
      infile.each { |_word| lines += 1 }
      size = 10 * lines
      hasher_id = 1
    else
      size = BITFIELD_WIDTH
      hasher_id = 0
    end

    bf = BloominSimple.new(size, &HASHERS[hasher_id])
    # The large branch has consumed the file while counting lines; start over.
    infile.rewind
    infile.each { |word| bf.add(word) }

    File.open(outfilename, "wb") do |outfile|
      outfile.write([hasher_id].pack("I"))
      outfile.write(bf.dump)
    end
  end

  nil
end
Instance Method Details
#language(text) ⇒ Object
62 63 64 |
# File 'lib/whatlanguage.rb', line 62

# Returns the key with the highest score in process_text(text), or nil when
# text cannot be split into words or no language scored at all.
#
# Errors raised while scoring are no longer swallowed: a blanket `rescue nil`
# made genuine failures indistinguishable from "no language detected".
def language(text)
  # Non-text input yields nil rather than an error.
  return nil unless text.respond_to?(:split)

  best = process_text(text).max_by { |_lang, hits| hits }
  # max_by returns nil for an empty result set.
  best && best.first
end
#process_text(text) ⇒ Object
Very inefficient method for now, but it still beats the non-Bloom alternatives. Switch to a better bit-comparison technique later.
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
# File 'lib/whatlanguage.rb', line 40

# Scores text against every loaded language filter.
#
# The text is split on whitespace and each word is downcased; a language's
# count goes up by one for every word its filter reports as included.
# Returns a Hash (default value 0) mapping language keys to those counts.
# Scanning may stop before the end of the text once one language clearly leads.
def process_text(text)
  tallies = Hash.new(0)
  words = text.split.map { |token| token.downcase }

  words.each_with_index do |word, index|
    @@data.each_key do |lang|
      tallies[lang] += 1 if @@data[lang].includes?(word)
    end

    # Every fourth word, check whether we have a really convincing result..
    # if so, exit early.
    seen = index + 1
    next unless (seen % 4).zero? && tallies.size > 1

    leader, runner_up = tallies.sort_by { |_lang, hits| hits }.reverse[0..1]
    lead_hits = leader[1]
    next_hits = runner_up[1]

    # The thresholds below may need some tweaking one day..
    decisive = (lead_hits > next_hits * 2) || (lead_hits - next_hits > 25)
    break if lead_hits > 4 && decisive

    #break if seen > 100
  end

  tallies
end
#reinit(options = {}) ⇒ Object
33 34 35 36 |
# File 'lib/whatlanguage.rb', line 33

# Discards every cached language filter and loads them again.
#
# options - forwarded unchanged to #initialize (for example :large).
#
# The cache is reset to an empty Hash rather than nil because #initialize
# indexes into @@data. The caller's options are passed through as given;
# writing `initialize(options = {})` would overwrite them with an empty Hash.
def reinit(options = {})
  @@data = {}
  initialize(options)
end