Class: DocumentCache
- Inherits: Object
- Inheritance hierarchy: Object → DocumentCache
- Defined in:
- lib/document-cache.rb
Instance Method Summary collapse
- #add(document) ⇒ Object
- #clean(sentence) ⇒ Object
- #clear ⇒ Object
- #documents ⇒ Object
- #extract_matching_words(search, sentence) ⇒ Object
- #find_examples_for(search, count = 1) ⇒ Object
- #find_matches_by_grepping(search, sentences) ⇒ Object
- #find_matches_by_stemming(search, sentences) ⇒ Object
- #find_matches_in(filenames, search, count) ⇒ Object
- #frequency_list ⇒ Object
- #initialize ⇒ DocumentCache (constructor): A new instance of DocumentCache.
- #stemmed_frequency_list ⇒ Object
Constructor Details
#initialize ⇒ DocumentCache
Returns a new instance of DocumentCache.
8 9 10 |
# File 'lib/document-cache.rb', line 8
#
# Builds the cache object and gives it a fresh VocabularyChest, stored in
# @chest, which the stemming-based methods of this class call into.
def initialize
  @chest = VocabularyChest.new
end
Instance Method Details
#add(document) ⇒ Object
12 13 14 15 |
# File 'lib/document-cache.rb', line 12
#
# Stores a document as a new file inside TAUConfig.cache_dir. The file name
# is whatever UUID.new.generate returns.
#
# @param document the content to write (passed straight to IO#write)
# @return the result of IO#write for the new file
def add(document)
  path = "#{TAUConfig.cache_dir}/#{UUID.new.generate}"
  File.open(path, 'w') do |out|
    out.write(document)
  end
end
#clean(sentence) ⇒ Object
65 66 67 |
# File 'lib/document-cache.rb', line 65
#
# Normalizes a sentence fragment: surrounding whitespace is stripped and a
# single trailing period is appended.
#
# @param sentence [String] raw sentence text
# @return [String] the stripped sentence followed by "."
def clean(sentence)
  "#{sentence.strip}."
end
#clear ⇒ Object
57 58 59 |
# File 'lib/document-cache.rb', line 57
#
# Empties the cache by removing every path reported by #documents.
# FileUtils.rm_rf is used, so directories are removed recursively and
# missing paths are ignored.
#
# @return [Array<String>] the list of paths that was iterated
def clear
  documents.each do |path|
    FileUtils.rm_rf(path)
  end
end
#documents ⇒ Object
53 54 55 |
# File 'lib/document-cache.rb', line 53
#
# Lists the entries directly inside the cache directory.
#
# @return [Array<String>] paths matching "<TAUConfig.cache_dir>/*"
def documents
  Dir.glob("#{TAUConfig.cache_dir}/*")
end
#extract_matching_words(search, sentence) ⇒ Object
69 70 71 72 73 |
# File 'lib/document-cache.rb', line 69
#
# Finds the words in a single sentence that match the search term.
# Stem-based matching is tried first; plain substring matching is used only
# when stemming finds nothing.
#
# @param search [String] the term to look for
# @param sentence [String] the sentence to inspect
# @return [Array, nil] the matched words, or nil when neither strategy
#   matches
def extract_matching_words(search, sentence)
  by_stem = find_matches_by_stemming(search, [sentence])
  return by_stem.values.first unless by_stem.empty?

  by_grep = find_matches_by_grepping(search, [sentence])
  by_grep.values.first
end
#find_examples_for(search, count = 1) ⇒ Object
61 62 63 |
# File 'lib/document-cache.rb', line 61
#
# Searches every cached document for sentences matching the search term.
#
# @param search [String] the term to look for
# @param count [Integer] maximum number of matches wanted (defaults to 1)
# @return [Hash] whatever #find_matches_in returns for the cached documents
def find_examples_for(search, count = 1)
  find_matches_in(documents, search, count)
end
#find_matches_by_grepping(search, sentences) ⇒ Object
27 28 29 30 31 32 |
# File 'lib/document-cache.rb', line 27
#
# Substring-based matcher: keeps every sentence that contains the search
# term verbatim.
#
# @param search [String] the term to look for
# @param sentences [Array<String>] candidate sentences
# @return [Hash] cleaned sentence => [search] for each sentence that
#   includes the search term
def find_matches_by_grepping(search, sentences)
  sentences.each_with_object({}) do |sentence, result|
    result[clean(sentence)] = [search] if sentence.include?(search)
  end
end
#find_matches_by_stemming(search, sentences) ⇒ Object
17 18 19 20 21 22 23 24 25 |
# File 'lib/document-cache.rb', line 17
#
# Stem-based matcher: a sentence matches when at least one of its
# space-separated words has the same @chest.stem value as the search term.
#
# @param search [String] the term to look for
# @param sentences [Array<String>] candidate sentences
# @return [Hash] cleaned sentence => matching words, each passed through
#   @chest.sanitize
def find_matches_by_stemming(search, sentences)
  target = @chest.stem(search)
  sentences.each_with_object({}) do |sentence, result|
    hits = sentence.split(" ").select { |word| @chest.stem(word) == target }
    next if hits.empty?

    result[clean(sentence)] = hits.map { |hit| @chest.sanitize(hit) }
  end
end
#find_matches_in(filenames, search, count) ⇒ Object
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# File 'lib/document-cache.rb', line 34
#
# Collects up to +count+ matching sentences from the given files.
#
# All files are scanned with the stemming matcher first, then again with
# the substring matcher. The search stops as soon as exactly +count+
# matches are held.
#
# @param filenames [Array<String>] paths of the files to scan
# @param search [String] the term to look for
# @param count [Integer] number of matches wanted
# @return [Hash] cleaned sentence => matched words, at most +count+ entries
def find_matches_in(filenames, search, count)
  found = {}
  [:find_matches_by_stemming, :find_matches_by_grepping].each do |strategy|
    filenames.each do |path|
      File.open(path) do |io|
        # Sentences are delimited by '.', '?', '!' or a newline.
        sentences = io.read.split(/[\.?!\n]/)
        # merge! overwrites on key collision, so a later match for the same
        # cleaned sentence replaces the earlier word list.
        found.merge!(send(strategy, search, sentences))
        # Hash#shift drops the oldest entry, so the newest matches are kept.
        found.shift while found.size > count
        return found if found.size == count
      end
    end
  end
  found
end
#frequency_list ⇒ Object
75 76 77 78 79 80 81 |
# File 'lib/document-cache.rb', line 75
#
# Counts how often each whitespace-separated word occurs across all cached
# documents and returns the words seen at least twice, most frequent first.
#
# Each document is read with File.read, which closes the file handle, and
# is tokenized on its own. The last word of one document therefore can never
# fuse with the first word of the next.
#
# @return [Array<Array(String, Integer)>] [word, count] pairs with
#   count >= 2, sorted by descending count
def frequency_list
  counts = Hash.new(0)
  documents.each do |path|
    File.read(path).split(" ").each { |word| counts[word] += 1 }
  end
  # Words that occur only once are dropped.
  counts.reject! { |_word, count| count < 2 }
  counts.sort_by { |_word, count| count }.reverse
end
#stemmed_frequency_list ⇒ Object
83 84 85 86 87 88 89 90 |
# File 'lib/document-cache.rb', line 83
#
# Like a word frequency list, but words are first reduced with @chest.stem
# so that words sharing a stem are counted together. Only stems seen at
# least twice are returned, most frequent first.
#
# Each document is read with File.read, which closes the file handle, and
# is tokenized on its own. The last word of one document therefore can never
# fuse with the first word of the next.
#
# @return [Array<Array>] [stem, count] pairs with count >= 2, sorted by
#   descending count
def stemmed_frequency_list
  counts = Hash.new(0)
  documents.each do |path|
    File.read(path).split(" ").each { |word| counts[@chest.stem(word)] += 1 }
  end
  # Stems that occur only once are dropped.
  counts.reject! { |_stem, count| count < 2 }
  counts.sort_by { |_stem, count| count }.reverse
end