Module: DocumentCache
- Defined in:
- lib/document-cache.rb
Class Method Summary
- .add(document) ⇒ Object
- .clean(sentence) ⇒ Object
- .documents ⇒ Object
- .extract_matching_words(search, sentence) ⇒ Object
- .find_examples_for(search, count = 1) ⇒ Object
- .find_matches_by_grepping(search, sentences) ⇒ Object
- .find_matches_by_stemming(search, sentences) ⇒ Object
- .find_matches_in(filenames, search, count) ⇒ Object
- .frequency_list ⇒ Object
- .stemmed_frequency_list ⇒ Object
Class Method Details
.add(document) ⇒ Object
8 9 10 11 |
# File 'lib/document-cache.rb', line 8
# Writes +document+ to a new file inside the cache directory. The file name
# is whatever UUID.new.generate returns.
#
# @param document [String] content to store (presumably plain text — confirm against callers)
# @return [Integer] number of bytes written
def self.add(document)
  cache_path = "#{TAUConfig.cache_dir}/#{UUID.new.generate}"
  File.write(cache_path, document)
end
.clean(sentence) ⇒ Object
57 58 59 |
# File 'lib/document-cache.rb', line 57
# Trims leading/trailing whitespace from +sentence+ and appends a period.
#
# @param sentence [String] raw sentence fragment
# @return [String] the trimmed fragment followed by "."
def self.clean(sentence)
  trimmed = sentence.strip
  "#{trimmed}."
end
.documents ⇒ Object
49 50 51 |
# File 'lib/document-cache.rb', line 49
# Lists every entry directly inside the cache directory.
#
# @return [Array<String>] paths matching "<cache_dir>/*"
def self.documents
  pattern = "#{TAUConfig.cache_dir}/*"
  Dir.glob(pattern)
end
.extract_matching_words(search, sentence) ⇒ Object
61 62 63 64 65 |
# File 'lib/document-cache.rb', line 61
# Returns the words of +sentence+ that match +search+.
#
# The stemming matcher is consulted first; the grepping matcher is only used
# when stemming finds nothing.
#
# @param search [String] term to look for
# @param sentence [String] single sentence to inspect
# @return [Array<String>, nil] matched words, or nil when neither matcher hits
def self.extract_matching_words(search, sentence)
  stemmed = find_matches_by_stemming(search, [sentence])
  if stemmed.empty?
    find_matches_by_grepping(search, [sentence]).values.first
  else
    stemmed.values.first
  end
end
.find_examples_for(search, count = 1) ⇒ Object
53 54 55 |
# File 'lib/document-cache.rb', line 53
# Searches all cached documents for example sentences matching +search+.
#
# @param search [String] term to look for
# @param count [Integer] number of examples wanted (defaults to 1)
# @return [Object] whatever find_matches_in returns for the cached documents
def self.find_examples_for(search, count = 1)
  cached_files = documents
  find_matches_in(cached_files, search, count)
end
.find_matches_by_grepping(search, sentences) ⇒ Object
23 24 25 26 27 28 |
# File 'lib/document-cache.rb', line 23
# Finds sentences that contain +search+ as a plain substring.
#
# @param search [String] substring to look for
# @param sentences [Enumerable<String>] candidate sentences
# @return [Hash{String => Array<String>}] cleaned sentence => [search] for
#   every sentence that includes the substring
def self.find_matches_by_grepping(search, sentences)
  sentences.each_with_object({}) do |sentence, found|
    next unless sentence.include?(search)

    found[clean(sentence)] = [search]
  end
end
.find_matches_by_stemming(search, sentences) ⇒ Object
13 14 15 16 17 18 19 20 21 |
# File 'lib/document-cache.rb', line 13
# Finds sentences containing a word whose stem equals the stem of +search+.
#
# Sentences are split on whitespace and every word is stemmed with
# VocabularyChest.stem before comparison.
#
# @param search [String] term whose stem is looked for
# @param sentences [Enumerable<String>] candidate sentences
# @return [Hash{String => Array}] cleaned sentence => matching words, each
#   passed through VocabularyChest.sanitize; sentences without a hit are omitted
def self.find_matches_by_stemming(search, sentences)
  target_stem = VocabularyChest.stem(search)
  sentences.each_with_object({}) do |sentence, result|
    hits = sentence.split(" ").select { |word| VocabularyChest.stem(word) == target_stem }
    next if hits.empty?

    result[clean(sentence)] = hits.map { |hit| VocabularyChest.sanitize(hit) }
  end
end
.find_matches_in(filenames, search, count) ⇒ Object
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
# File 'lib/document-cache.rb', line 30
# Collects up to +count+ sentences matching +search+ from the given files.
#
# All files are scanned with find_matches_by_stemming first; the
# find_matches_by_grepping pass only runs if the stemming pass did not
# already yield +count+ sentences. Each file is split into sentences on
# ".", "?", "!" and newlines. Scanning stops as soon as exactly +count+
# sentences have been collected.
#
# @param filenames [Array<String>] paths of the files to scan
# @param search [String] term to look for
# @param count [Integer] maximum number of sentences to return
# @return [Hash] merged matcher results (sentence => matched words), holding
#   at most +count+ entries; empty when +count+ is zero or negative
def self.find_matches_in(filenames, search, count)
  matches = {}
  # A non-positive limit can never be satisfied, and a negative one would
  # otherwise make the trimming loop below spin forever on an empty hash.
  return matches if count <= 0

  [:find_matches_by_stemming, :find_matches_by_grepping].each do |matcher|
    filenames.each do |filename|
      sentences = File.read(filename).split(/[\.?!\n]/)
      matches.merge!(send(matcher, search, sentences))
      # When a merge overshoots the limit, the oldest entries are discarded
      # so that the most recently found sentences are kept.
      matches.shift while matches.size > count
      return matches if matches.size == count
    end
  end
  matches
end
.frequency_list ⇒ Object
67 68 69 70 71 72 73 |
# File 'lib/document-cache.rb', line 67
# Counts how often each whitespace-separated word occurs across all cached
# documents and lists the words that occur at least twice.
#
# Each document is read with File.read (so its handle is closed) and split
# on its own, which keeps the last word of one file from fusing with the
# first word of the next.
#
# @return [Array<Array(String, Integer)>] [word, count] pairs, highest count first
def self.frequency_list
  counts = Hash.new(0)
  documents.each do |path|
    File.read(path).split(" ").each { |word| counts[word] += 1 }
  end
  counts.reject! { |_word, count| count < 2 }
  counts.sort_by { |_word, count| count }.reverse
end
.stemmed_frequency_list ⇒ Object
75 76 77 78 79 80 81 82 |
# File 'lib/document-cache.rb', line 75
# Counts how often each word stem occurs across all cached documents and
# lists the stems that occur at least twice.
#
# Each document is read with File.read (so its handle is closed) and split
# on its own, which keeps the last word of one file from fusing with the
# first word of the next. Every word is reduced with VocabularyChest.stem
# before counting.
#
# @return [Array<Array>] [stem, count] pairs, highest count first
def self.stemmed_frequency_list
  counts = Hash.new(0)
  documents.each do |path|
    File.read(path).split(" ").each do |word|
      counts[VocabularyChest.stem(word)] += 1
    end
  end
  counts.reject! { |_stem, count| count < 2 }
  counts.sort_by { |_stem, count| count }.reverse
end