Class: WhatLanguage

Inherits:
Object
  • Object
show all
Defined in:
lib/whatlanguage.rb

Constant Summary collapse

# Library version string.
VERSION =
'1.0.3'
# Hash functions selectable by index. The index is the hasher id that
# filter_from_dictionary writes as the first 4 bytes of a filter file and
# that #initialize reads back to pick the matching lambda.
# Each lambda lowercases and strips the item, digests it, and unpacks the
# digest into 32-bit little-endian unsigned integers ("V").
HASHERS =
[ 
   # id 0: two integers taken from the start of the SHA1 digest.
   lambda { |item| Digest::SHA1.digest(item.downcase.strip).unpack("VV") } ,
   # id 1: skips the first four digest bytes ("xxxx"), then takes seven integers from the SHA2 digest.
   lambda { |item| Digest::SHA2.digest(item.downcase.strip).unpack("xxxxVVVVVVV") }
]
# Size handed to BloominSimple.new when a non-large filter is built.
BITFIELD_WIDTH =
2_000_000
# Loaded filters keyed by language symbol; a class variable, so it is shared by all instances.
@@data =
{}

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ WhatLanguage

Returns a new instance of WhatLanguage.



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/whatlanguage.rb', line 16

# Loads every language filter found in the bundled language folder into the
# shared @@data cache. Languages already present in the cache are skipped.
#
# Each filter file starts with a 4-byte native-endian unsigned int ("I")
# holding the index into HASHERS; the remainder is the filter dump.
#
# @param options [Symbol, Hash] pass :large, or a Hash with a truthy :large
#   entry, to read filters from "lang-lg"; anything else reads from "lang"
def initialize(options = {})
  large = options == :large || (options.kind_of?(Hash) && options[:large])
  folder_name = large ? "lang-lg" : "lang"
  languages_folder = File.join(File.dirname(__FILE__), "..", folder_name)

  # Guard against the cache having been reset to nil before this call.
  @@data ||= {}

  Dir.entries(languages_folder).grep(/\.lang/).each do |filename|
    key = filename[/\w+/].to_sym
    next if @@data[key]

    # Block form so the handle is closed even if loading the dump raises.
    File.open(File.join(languages_folder, filename), 'rb') do |lfile|
      hasher_id = lfile.read(4).unpack("I")[0]
      @@data[key] = BloominSimple.from_dump(lfile.read, &HASHERS[hasher_id])
    end
  end
end

Class Method Details

.filter_from_dictionary(filename, outfilename, options = {}) ⇒ Object



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/whatlanguage.rb', line 66

# Builds a Bloom filter from a dictionary file and writes it to +outfilename+.
#
# Every line of the dictionary is added to the filter as one entry. The output
# file begins with the hasher id packed as a native-endian unsigned int ("I"),
# followed by the filter dump.
#
# @param filename [String] path of the dictionary to read
# @param outfilename [String] path of the filter file to write (binary mode)
# @param options [Symbol, Hash] pass :large, or a Hash with a truthy :large
#   entry, to size the filter at 10x the line count and use HASHERS[1];
#   anything else uses BITFIELD_WIDTH and HASHERS[0]
# @return [nil]
def self.filter_from_dictionary(filename, outfilename, options = {})
  # Only index into options when it is a Hash; Symbol#[] with a Symbol
  # argument raises TypeError.
  large = options == :large || (options.kind_of?(Hash) && options[:large])

  if large
    size = 10 * File.foreach(filename).count
    hasher_id = 1
  else
    size = BITFIELD_WIDTH
    hasher_id = 0
  end

  bf = BloominSimple.new(size, &HASHERS[hasher_id])
  # File.foreach opens and closes the dictionary itself.
  File.foreach(filename) { |word| bf.add(word) }

  # Block form closes the output file even if dumping or writing raises.
  File.open(outfilename, "wb") do |outfile|
    outfile.write([hasher_id].pack("I"))
    outfile.write(bf.dump)
  end
  nil
end

Instance Method Details

#language(text) ⇒ Object



62
63
64
# File 'lib/whatlanguage.rb', line 62

# Returns the language with the highest tally for +text+.
#
# Best-effort: any StandardError raised along the way (including there being
# no tallies at all) yields nil instead of propagating.
#
# @param text [String] text to classify
# @return [Object, nil] the key of the top-scoring entry from #process_text, or nil
def language(text)
  tallies = process_text(text)
  winner = tallies.max_by { |_lang, hits| hits }
  winner.first
rescue StandardError
  nil
end

#process_text(text) ⇒ Object

Very inefficient method for now, but it still beats the non-Bloom alternatives. Change to a better bit-comparison technique later.



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/whatlanguage.rb', line 40

# Tallies, per language, how many whitespace-separated words of +text+ are
# reported as present by that language's filter in @@data.
#
# After every fourth word, once at least two languages have a tally, the scan
# stops early if the leader has more than 4 hits and either more than double
# the runner-up's hits or a lead of more than 25.
#
# @param text [String] text to analyse
# @return [Hash] language key => hit count (missing keys default to 0)
def process_text(text)
  tallies = Hash.new(0)

  text.split.each_with_index do |token, index|
    word = token.downcase
    @@data.each do |lang, filter|
      tallies[lang] += 1 if filter.includes?(word)
    end

    # Only evaluate the early-exit rule on every fourth word.
    words_seen = index + 1
    next unless (words_seen % 4).zero? && tallies.size > 1

    # Only the two highest counts matter for the decision.
    best, runner_up = tallies.values.max(2)

    # Thresholds are heuristic and may need tuning.
    break if best > 4 && (best > runner_up * 2 || best - runner_up > 25)
  end

  tallies
end

#reinit(options = {}) ⇒ Object



33
34
35
36
# File 'lib/whatlanguage.rb', line 33

# Discards every cached language filter and loads them again.
#
# @param options [Symbol, Hash] forwarded unchanged to #initialize
# @return [Object] whatever #initialize returns
def reinit(options = {})
  # Reset to an empty Hash rather than nil: #initialize indexes into @@data.
  @@data = {}
  # Forward the caller's options; writing `options = {}` here would discard them.
  initialize(options)
end