Module: DocumentCache

Defined in:
lib/document-cache.rb

Class Method Summary collapse

Class Method Details

.add(document) ⇒ Object



8
9
10
11
# File 'lib/document-cache.rb', line 8

# Stores +document+ as a new file inside the configured cache directory.
#
# The file name comes from UUID.new.generate (presumably unique per call;
# confirm against the UUID library in use).
#
# @param document [#to_s] content to write
# @return [Integer] number of bytes written
def self.add(document)
  cache_dir = TAUConfig::cache_dir
  target = "#{cache_dir}/#{UUID.new.generate}"
  File.write(target, document)
end

.clean(sentence) ⇒ Object



57
58
59
# File 'lib/document-cache.rb', line 57

# Normalises a sentence fragment: surrounding whitespace is stripped and a
# single full stop is appended.
#
# @param sentence [String] raw sentence text
# @return [String] a new string; the argument is not modified
def self.clean(sentence)
  trimmed = sentence.strip
  "#{trimmed}."
end

.documents ⇒ Object



49
50
51
# File 'lib/document-cache.rb', line 49

# Lists the documents currently held in the cache directory.
#
# Only regular files are returned. The other methods in this module open and
# read every entry, which would raise on a sub-directory, so directories
# matched by the glob are filtered out here.
#
# @return [Array<String>] paths of the files directly inside TAUConfig::cache_dir
def self.documents
  Dir["#{TAUConfig::cache_dir}/*"].select { |path| File.file?(path) }
end

.extract_matching_words(search, sentence) ⇒ Object



61
62
63
64
65
# File 'lib/document-cache.rb', line 61

# Finds the words in a single +sentence+ that match +search+.
#
# Stem-based matching is tried first; substring matching is used only when
# stemming produced no entry at all.
#
# @param search [String] term to look for
# @param sentence [String] sentence to inspect
# @return [Array, nil] the first value of the matcher's result hash, or nil
#   when neither matcher produced an entry
def self.extract_matching_words(search, sentence)
  by_stem = find_matches_by_stemming(search, [sentence])
  chosen = by_stem.empty? ? find_matches_by_grepping(search, [sentence]) : by_stem
  chosen.values.first
end

.find_examples_for(search, count = 1) ⇒ Object



53
54
55
# File 'lib/document-cache.rb', line 53

# Looks for example sentences containing +search+ across every cached
# document, by handing the result of +documents+ to +find_matches_in+.
#
# @param search [String] term to look for
# @param count [Integer] number of examples wanted (defaults to 1)
# @return [Object] whatever +find_matches_in+ returns
def self.find_examples_for(search, count = 1)
  cached_files = documents
  find_matches_in(cached_files, search, count)
end

.find_matches_by_grepping(search, sentences) ⇒ Object



23
24
25
26
27
28
# File 'lib/document-cache.rb', line 23

# Substring matcher: keeps every sentence that literally contains +search+
# (String#include?, so the comparison is case-sensitive).
#
# @param search [String] term to look for
# @param sentences [Array<String>] candidate sentences
# @return [Hash{String => Array<String>}] cleaned sentence => [search], in
#   the order the sentences were encountered
def self.find_matches_by_grepping(search, sentences)
  sentences.each_with_object({}) do |sentence, found|
    next unless sentence.include?(search)

    found[clean(sentence)] = [search]
  end
end

.find_matches_by_stemming(search, sentences) ⇒ Object



13
14
15
16
17
18
19
20
21
# File 'lib/document-cache.rb', line 13

# Stem matcher: keeps every sentence that has at least one word whose stem
# (per VocabularyChest::stem) equals the stem of +search+.
#
# Words are obtained by splitting each sentence on whitespace; the matching
# words are passed through VocabularyChest::sanitize before being stored.
#
# @param search [String] term to look for
# @param sentences [Array<String>] candidate sentences
# @return [Hash{String => Array}] cleaned sentence => sanitized matching
#   words, in the order the sentences were encountered
def self.find_matches_by_stemming(search, sentences)
  wanted_stem = VocabularyChest::stem(search)

  sentences.each_with_object({}) do |sentence, result|
    hits = sentence.split(" ").select { |word| VocabularyChest::stem(word) == wanted_stem }
    next if hits.empty?

    result[clean(sentence)] = hits.map { |hit| VocabularyChest::sanitize(hit) }
  end
end

.find_matches_in(filenames, search, count) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/document-cache.rb', line 30

# Collects up to +count+ example sentences for +search+ from the given files.
#
# Two passes are made over +filenames+: first with the stem matcher, then,
# if that did not yield +count+ entries, with the substring matcher. Results
# of both passes accumulate in the same hash, so a sentence found by both
# ends up with the value of the later pass. The search stops as soon as
# exactly +count+ entries are held.
#
# Each file is read and split into sentences (on '.', '?', '!' and newline)
# at most once; the second pass reuses the sentences from the first.
#
# @param filenames [Array<String>] paths of the files to search
# @param search [String] term to look for
# @param count [Integer] maximum number of entries to return
# @return [Hash{String => Array}] cleaned sentence => matching words; empty
#   when +count+ is zero or negative
def self.find_matches_in filenames, search, count
  matches = {}
  # Nothing can be collected for a non-positive count. Without this guard a
  # negative count would spin forever in the trimming loop below, because
  # shifting an empty hash never brings its size under a negative number.
  return matches if count <= 0

  # Lazily filled cache so the second matcher pass does not re-read files.
  sentences_by_file = Hash.new do |cache, filename|
    cache[filename] = File.read(filename).split(/[\.?!\n]/)
  end

  [:find_matches_by_stemming, :find_matches_by_grepping].each do |matcher|
    filenames.each do |filename|
      matches.merge!(send(matcher, search, sentences_by_file[filename]))

      # When a merge overshoots, the oldest entries are the ones dropped.
      matches.shift while matches.size > count
      return matches if matches.size == count
    end
  end

  matches
end

.frequency_list ⇒ Object



67
68
69
70
71
72
73
# File 'lib/document-cache.rb', line 67

# Counts how often each whitespace-separated word occurs across all cached
# documents.
#
# Files are read with File.read (which closes the handle) and split one at a
# time, so the last word of one document can never fuse with the first word
# of the next. Words seen fewer than two times are dropped.
#
# @return [Array<Array(String, Integer)>] [word, count] pairs, highest count
#   first
def self.frequency_list
  words = documents.flat_map { |filename| File.read(filename).split(" ") }
  counts = words.each_with_object(Hash.new(0)) { |word, totals| totals[word] += 1 }
  counts.reject! { |_word, total| total < 2 }
  counts.sort_by { |_word, total| total }.reverse
end

.stemmed_frequency_list ⇒ Object



75
76
77
78
79
80
81
82
# File 'lib/document-cache.rb', line 75

# Counts how often each word stem (per VocabularyChest::stem) occurs across
# all cached documents.
#
# Files are read with File.read (which closes the handle) and split one at a
# time, so the last word of one document can never fuse with the first word
# of the next. Stems seen fewer than two times are dropped.
#
# @return [Array<Array>] [stem, count] pairs, highest count first
def self.stemmed_frequency_list
  stems = documents.flat_map do |filename|
    File.read(filename).split(" ").map { |word| VocabularyChest::stem(word) }
  end
  counts = stems.each_with_object(Hash.new(0)) { |stem, totals| totals[stem] += 1 }
  counts.reject! { |_stem, total| total < 2 }
  counts.sort_by { |_stem, total| total }.reverse
end