Module: Charles::Misc
- Defined in:
- lib/charles/misc.rb
Constant Summary collapse
- UNICODE_CONVERSIONS =
{ "8230" => '...', "8194" => ' ', "8195" => ' ', "8201" => ' ', "8211" => '-', "8216" => '\'', "8217" => '\'', "8220" => '"', "8221" => '"' }
- TRANSLATED_CONVERSIONS =
UNICODE_CONVERSIONS.map {|k, v| [[k.to_i].pack('U*'), v] }
Class Method Summary collapse
- .analyzer(type = :all_stop_words) ⇒ Object
- .analyzer_all_stop_words ⇒ Object
- .analyzer_no_stop_words ⇒ Object
- .compare_strings(a, b) ⇒ Object
- .compare_strings_single_side(a, b) ⇒ Object
- .normalize_string(string) ⇒ Object
- .normalize_unicode_characters(string) ⇒ Object
- .string_to_clean_tokens(string, type = :all_stop_words) ⇒ Object
- .string_to_clean_tokens_string(string, type = :all_stop_words) ⇒ Object
- .string_to_tokens(string, type = :all_stop_words) ⇒ Object
- .string_to_tokens_raw(string, type = :all_stop_words) ⇒ Object
Class Method Details
.analyzer(type = :all_stop_words) ⇒ Object
17 18 19 20 |
# File 'lib/charles/misc.rb', line 17
# Returns the analyzer for +type+, building it on first use.
#
# The analyzer is produced by calling the method named "analyzer_<type>" on
# this object and is then cached in @analyzer under +type+, so later calls
# with the same +type+ return the cached object.
#
# @param type [Symbol] suffix selecting the analyzer_* builder to call
# @return [Object] the value returned by the matching analyzer_* method
def self.analyzer(type = :all_stop_words)
  cache = (@analyzer ||= {})
  cache[type] ||= send("analyzer_#{type}")
end
.analyzer_all_stop_words ⇒ Object
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# File 'lib/charles/misc.rb', line 21
# Builds a Ferret StandardAnalyzer whose stop-word list is the union of the
# Ferret stop-word constants listed below.
#
# Reference for the constants: http://blackwinter.github.com/ferret/classes/Ferret/Analysis.html
#
# @return [Ferret::Analysis::StandardAnalyzer]
def self.analyzer_all_stop_words
  word_lists = [
    Ferret::Analysis::EXTENDED_ENGLISH_STOP_WORDS,
    Ferret::Analysis::FULL_FRENCH_STOP_WORDS,
    Ferret::Analysis::FULL_SPANISH_STOP_WORDS,
    Ferret::Analysis::FULL_PORTUGUESE_STOP_WORDS,
    Ferret::Analysis::FULL_ITALIAN_STOP_WORDS,
    Ferret::Analysis::FULL_GERMAN_STOP_WORDS,
    Ferret::Analysis::FULL_DUTCH_STOP_WORDS,
    Ferret::Analysis::FULL_SWEDISH_STOP_WORDS,
    Ferret::Analysis::FULL_NORWEGIAN_STOP_WORDS,
    Ferret::Analysis::FULL_DANISH_STOP_WORDS,
    Ferret::Analysis::FULL_RUSSIAN_STOP_WORDS,
    Ferret::Analysis::FULL_FINNISH_STOP_WORDS
  ]
  # Fold the lists together with |, left to right, exactly like a chained
  # `a | b | c` expression.
  stop_words = word_lists.inject { |merged, list| merged | list }
  # NOTE(review): the second argument is presumably Ferret's "lowercase" flag;
  # confirm against the StandardAnalyzer documentation.
  Ferret::Analysis::StandardAnalyzer.new(stop_words, true)
end
.analyzer_no_stop_words ⇒ Object
37 38 39 |
# File 'lib/charles/misc.rb', line 37
# Builds a Ferret StandardAnalyzer with an empty stop-word list, so no word
# is filtered out as a stop word.
#
# @return [Ferret::Analysis::StandardAnalyzer]
def self.analyzer_no_stop_words
  stop_words = []
  # NOTE(review): presumably Ferret's "lowercase" flag; confirm against the
  # StandardAnalyzer documentation.
  second_argument = true
  Ferret::Analysis::StandardAnalyzer.new(stop_words, second_argument)
end
.compare_strings(a, b) ⇒ Object
3 4 5 |
# File 'lib/charles/misc.rb', line 3
# Symmetric comparison of two strings: scores +a+ against +b+, then +b+
# against +a+, and returns the mean of the two one-sided scores.
#
# Array#mean is not part of core Ruby; it is presumably provided elsewhere in
# the project or by a dependency (verify).
#
# @param a [String]
# @param b [String]
# @return [Object] the result of Array#mean over the two one-sided scores
def self.compare_strings(a, b)
  forward = compare_strings_single_side(a, b)
  backward = compare_strings_single_side(b, a)
  [forward, backward].mean
end
.compare_strings_single_side(a, b) ⇒ Object
6 7 8 9 10 11 12 |
# File 'lib/charles/misc.rb', line 6
# One-sided comparison: indexes +a+ as the only document of a throwaway
# Ferret index, runs +b+ against it as a query, and returns the search's
# max_score.
#
# The index is always closed before returning (also when indexing or the
# search raises), so repeated calls do not accumulate open indexes.
#
# @param a [String] text stored in the :content field of the temporary index
# @param b [String] text used as the query, after stripping query-parser
#   special characters
# @return [Object] max_score as reported by the search result
def self.compare_strings_single_side(a, b)
  index = Ferret::Index::Index.new
  begin
    index.field_infos.add_field(:content, :store => :no, :boost => 1)
    index << { :content => a }
    # Remove special characters used by the Ferret query parser:
    # http://www.davebalmain.com/api/classes/Ferret/QueryParser.html,
    # http://www.regular-expressions.info/charclass.html
    query = b.gsub(/[:()\[\]{}!+"~^\-|<>=*?\\]/, '')
    index.search(query).max_score
  ensure
    index.close
  end
end
.normalize_string(string) ⇒ Object
63 64 65 66 |
# File 'lib/charles/misc.rb', line 63
# Normalizes a string in three steps: collapses whitespace runs to a single
# space and trims the ends, replaces typographic characters via
# normalize_unicode_characters, then decodes HTML entities.
#
# The HTMLEntities decoder is created once and cached in @htmlentities.
#
# @param string [String]
# @return [String] the decoded, normalized text
def self.normalize_string(string)
  @htmlentities ||= HTMLEntities.new
  # \302\240 is the UTF-8 byte sequence of U+00A0 (no-break space), which \s
  # does not cover.
  collapsed = string.gsub(/[\s\302\240]+/, ' ')
  trimmed = collapsed.strip
  plain = normalize_unicode_characters(trimmed)
  @htmlentities.decode(plain)
end
.normalize_unicode_characters(string) ⇒ Object
79 80 81 82 |
# File 'lib/charles/misc.rb', line 79
# Replaces every character listed in TRANSLATED_CONVERSIONS with its plain
# replacement string.
#
# A mutable +string+ is modified in place and returned (unchanged behavior).
# A frozen +string+ cannot be modified with gsub!, so a copy is converted and
# returned instead of raising.
#
# @param string [String]
# @return [String] +string+ itself, or a converted copy when +string+ is frozen
def self.normalize_unicode_characters(string)
  target = string.frozen? ? string.dup : string
  TRANSLATED_CONVERSIONS.each do |unicode_char, replacement|
    target.gsub!(unicode_char, replacement)
  end
  target
end
.string_to_clean_tokens(string, type = :all_stop_words) ⇒ Object
49 50 51 52 53 |
# File 'lib/charles/misc.rb', line 49
# Tokenizes +string+ with string_to_tokens and drops every token that
# contains a digit.
#
# @param string [String]
# @param type [Symbol] analyzer type, forwarded to string_to_tokens
# @return [Array] the tokens without any digit-bearing entries
def self.string_to_clean_tokens(string, type = :all_stop_words)
  string_to_tokens(string, type).reject { |token| token.match(/\d/) }
end
.string_to_clean_tokens_string(string, type = :all_stop_words) ⇒ Object
54 55 56 |
# File 'lib/charles/misc.rb', line 54
# Same as string_to_clean_tokens, but returns the tokens joined into one
# string separated by single spaces.
#
# @param string [String]
# @param type [Symbol] analyzer type, forwarded to string_to_clean_tokens
# @return [String]
def self.string_to_clean_tokens_string(string, type = :all_stop_words)
  clean_tokens = string_to_clean_tokens(string, type)
  clean_tokens.join(' ')
end
.string_to_tokens(string, type = :all_stop_words) ⇒ Object
46 47 48 |
# File 'lib/charles/misc.rb', line 46
# Tokenizes +string+ and returns the +text+ of each token produced by
# string_to_tokens_raw.
#
# @param string [String]
# @param type [Symbol] analyzer type, forwarded to string_to_tokens_raw
# @return [Array] one entry per token, in stream order
def self.string_to_tokens(string, type = :all_stop_words)
  raw_tokens = string_to_tokens_raw(string, type)
  raw_tokens.map(&:text)
end
.string_to_tokens_raw(string, type = :all_stop_words) ⇒ Object
41 42 43 44 45 |
# File 'lib/charles/misc.rb', line 41
# Runs +string+ through the analyzer selected by +type+ and collects every
# token object the resulting stream yields.
#
# @param string [String]
# @param type [Symbol] analyzer type, forwarded to analyzer
# @return [Array] the raw token objects, in stream order
def self.string_to_tokens_raw(string, type = :all_stop_words)
  stream = analyzer(type).token_stream('', string)
  collected = []
  # Keep reading until the stream returns nil/false.
  token = stream.next
  while token
    collected << token
    token = stream.next
  end
  collected
end