Module: Charles::Misc

Defined in:
lib/charles/misc.rb

Constant Summary collapse

# Maps the decimal code point (written as a string) of a typographic Unicode
# character to the plain ASCII text that should replace it.
# Frozen so the shared lookup table cannot be modified by accident at runtime.
UNICODE_CONVERSIONS = {
  "8230" => '...', # U+2026 horizontal ellipsis
  "8194" => ' ',   # U+2002 en space
  "8195" => ' ',   # U+2003 em space
  "8201" => ' ',   # U+2009 thin space
  "8211" => '-',   # U+2013 en dash
  "8216" => '\'',  # U+2018 left single quotation mark
  "8217" => '\'',  # U+2019 right single quotation mark
  "8220" => '"',   # U+201C left double quotation mark
  "8221" => '"'    # U+201D right double quotation mark
}.freeze

# Array of [character, replacement] pairs derived from UNICODE_CONVERSIONS:
# each code point is packed into the actual UTF-8 character it denotes so the
# pairs can be handed straight to String#gsub.
TRANSLATED_CONVERSIONS =
  UNICODE_CONVERSIONS.map { |code, replacement| [[code.to_i].pack('U*'), replacement] }.freeze

Class Method Summary collapse

Class Method Details

.analyzer(type = :all_stop_words) ⇒ Object



17
18
19
20
# File 'lib/charles/misc.rb', line 17

# Returns the analyzer registered under +type+, building it on first use.
#
# The analyzer is produced by calling the sibling builder method named
# "analyzer_<type>" and is then cached per type in @analyzer, so later calls
# with the same type return the same object.
#
# type - name suffix of the builder method to use (default: :all_stop_words).
#
# Raises NoMethodError when no "analyzer_<type>" method exists.
def self.analyzer(type = :all_stop_words)
  @analyzer = {} unless @analyzer
  cached = @analyzer[type]
  return cached if cached

  @analyzer[type] = send("analyzer_#{type}")
end

.analyzer_all_stop_words ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/charles/misc.rb', line 21

# Builds a StandardAnalyzer whose stop-word list is the union of the stop-word
# lists Ferret ships for twelve languages (extended English plus the "full"
# lists for French, Spanish, Portuguese, Italian, German, Dutch, Swedish,
# Norwegian, Danish, Russian and Finnish).
#
# Stop-word constants are documented at
# http://blackwinter.github.com/ferret/classes/Ferret/Analysis.html
def self.analyzer_all_stop_words
  word_lists = [
    Ferret::Analysis::EXTENDED_ENGLISH_STOP_WORDS,
    Ferret::Analysis::FULL_FRENCH_STOP_WORDS,
    Ferret::Analysis::FULL_SPANISH_STOP_WORDS,
    Ferret::Analysis::FULL_PORTUGUESE_STOP_WORDS,
    Ferret::Analysis::FULL_ITALIAN_STOP_WORDS,
    Ferret::Analysis::FULL_GERMAN_STOP_WORDS,
    Ferret::Analysis::FULL_DUTCH_STOP_WORDS,
    Ferret::Analysis::FULL_SWEDISH_STOP_WORDS,
    Ferret::Analysis::FULL_NORWEGIAN_STOP_WORDS,
    Ferret::Analysis::FULL_DANISH_STOP_WORDS,
    Ferret::Analysis::FULL_RUSSIAN_STOP_WORDS,
    Ferret::Analysis::FULL_FINNISH_STOP_WORDS
  ]
  # Fold the lists together with Array#| so duplicates across languages
  # appear only once, in first-seen order.
  combined = word_lists.inject { |merged, words| merged | words }
  # NOTE(review): the second argument (true) is presumably Ferret's
  # lowercasing flag -- confirm against the StandardAnalyzer documentation.
  Ferret::Analysis::StandardAnalyzer.new(combined, true)
end

.analyzer_no_stop_words ⇒ Object



37
38
39
# File 'lib/charles/misc.rb', line 37

# Builds a StandardAnalyzer with an empty stop-word list, so no word is
# discarded as a stop word.
def self.analyzer_no_stop_words
  stop_words = []
  # NOTE(review): the second argument (true) is presumably Ferret's
  # lowercasing flag -- confirm against the StandardAnalyzer documentation.
  Ferret::Analysis::StandardAnalyzer.new(stop_words, true)
end

.compare_strings(a, b) ⇒ Object



3
4
5
# File 'lib/charles/misc.rb', line 3

# Scores how similar two strings are, symmetrically.
#
# Runs compare_strings_single_side in both directions (a indexed and searched
# with b, then b indexed and searched with a) and returns the mean of the two
# scores.
#
# NOTE(review): Array#mean is not part of Ruby's core library; it is assumed
# to be provided elsewhere in the project or by a dependency.
def self.compare_strings(a, b)
  forward = compare_strings_single_side(a, b)
  backward = compare_strings_single_side(b, a)
  [forward, backward].mean
end

.compare_strings_single_side(a, b) ⇒ Object



6
7
8
9
10
11
12
# File 'lib/charles/misc.rb', line 6

# Scores string +b+ as a search query against a throw-away index containing
# only string +a+.
#
# A fresh Ferret index is created on every call, +a+ is added as the single
# document (field :content, not stored, boost 1) and +b+ is run as the query.
# Returns the +max_score+ reported by the search result.
#
# Characters that the Ferret query parser treats as syntax are stripped from
# +b+ first so arbitrary text can be used as a query. See
# http://www.davebalmain.com/api/classes/Ferret/QueryParser.html and
# http://www.regular-expressions.info/charclass.html
def self.compare_strings_single_side(a, b)
  query_syntax = /[:()\[\]{}!+"~^\-|<>=*?\\]/
  index = Ferret::Index::Index.new
  index.field_infos.add_field(:content, :store => :no, :boost => 1)
  index << { :content => a }
  results = index.search(b.gsub(query_syntax, ''))
  results.max_score
end

.normalize_string(string) ⇒ Object



63
64
65
66
# File 'lib/charles/misc.rb', line 63

# Cleans up +string+ for comparison and returns the cleaned copy.
#
# Steps, in order:
# 1. every run of characters matched by /[\s\302\240]+/ (whitespace plus the
#    bytes \302\240, the UTF-8 encoding of a non-breaking space) becomes a
#    single space, and leading/trailing whitespace is stripped;
# 2. typographic Unicode characters are replaced via
#    normalize_unicode_characters;
# 3. HTML entities are decoded with a memoized HTMLEntities instance.
def self.normalize_string(string)
  decoder = (@htmlentities ||= HTMLEntities.new)
  squeezed = string.gsub(/[\s\302\240]+/, ' ').strip
  plain = normalize_unicode_characters(squeezed)
  decoder.decode(plain)
end

.normalize_unicode_characters(string) ⇒ Object



79
80
81
82
# File 'lib/charles/misc.rb', line 79

# Replaces typographic Unicode characters (ellipsis, special spaces, en dash,
# curly quotes) in +string+ with their plain ASCII equivalents, using the
# [character, replacement] pairs in TRANSLATED_CONVERSIONS.
#
# An unfrozen +string+ is modified in place and returned, as before. A frozen
# +string+ (for example a frozen literal) used to make gsub! raise; it is now
# left untouched and a converted copy is returned instead.
#
# string - the String to convert.
#
# Returns the converted String.
def self.normalize_unicode_characters(string)
  # gsub! cannot operate on a frozen receiver, so work on a copy in that case.
  string = string.dup if string.frozen?
  TRANSLATED_CONVERSIONS.each do |character, replacement|
    string.gsub!(character, replacement)
  end
  string
end

.string_to_clean_tokens(string, type = :all_stop_words) ⇒ Object



49
50
51
52
53
# File 'lib/charles/misc.rb', line 49

# Tokenizes +string+ with string_to_tokens and drops every token that
# contains at least one digit.
#
# string - the text to tokenize.
# type   - analyzer type passed through to string_to_tokens
#          (default: :all_stop_words).
#
# Returns an Array of the remaining token strings.
def self.string_to_clean_tokens(string, type = :all_stop_words)
  string_to_tokens(string, type).reject { |word| word =~ /\d/ }
end

.string_to_clean_tokens_string(string, type = :all_stop_words) ⇒ Object



54
55
56
# File 'lib/charles/misc.rb', line 54

# Same as string_to_clean_tokens, but returns the surviving tokens joined
# into one String separated by single spaces.
#
# string - the text to tokenize.
# type   - analyzer type passed through to string_to_clean_tokens
#          (default: :all_stop_words).
def self.string_to_clean_tokens_string(string, type = :all_stop_words)
  clean_tokens = string_to_clean_tokens(string, type)
  clean_tokens.join(' ')
end

.string_to_tokens(string, type = :all_stop_words) ⇒ Object



46
47
48
# File 'lib/charles/misc.rb', line 46

# Tokenizes +string+ and returns the text of each token.
#
# string - the text to tokenize.
# type   - analyzer type passed through to string_to_tokens_raw
#          (default: :all_stop_words).
#
# Returns an Array holding the +text+ of every raw token, in stream order.
def self.string_to_tokens(string, type = :all_stop_words)
  raw_tokens = string_to_tokens_raw(string, type)
  raw_tokens.map(&:text)
end

.string_to_tokens_raw(string, type = :all_stop_words) ⇒ Object



41
42
43
44
45
# File 'lib/charles/misc.rb', line 41

# Runs +string+ through the analyzer selected by +type+ and collects every
# token object the resulting token stream yields.
#
# The stream is opened with an empty field name ('') and drained by calling
# +next+ until it returns nil/false.
#
# string - the text to tokenize.
# type   - analyzer type passed to analyzer (default: :all_stop_words).
#
# Returns an Array of the token objects, in stream order.
def self.string_to_tokens_raw(string, type = :all_stop_words)
  stream = analyzer(type).token_stream('', string)
  collected = []
  current = stream.next
  while current
    collected << current
    current = stream.next
  end
  collected
end