Class: DocumentCache

Inherits:
Object
  • Object
show all
Defined in:
lib/document-cache.rb

Instance Method Summary collapse

Constructor Details

#initialize ⇒ DocumentCache

Returns a new instance of DocumentCache.



8
9
10
# File 'lib/document-cache.rb', line 8

# Creates the cache with a fresh VocabularyChest held in @chest.
# The stemming-based methods of this class call @chest.stem and
# @chest.sanitize on that object.
def initialize
  @chest = VocabularyChest.new
end

Instance Method Details

#add(document) ⇒ Object



12
13
14
15
# File 'lib/document-cache.rb', line 12

# Writes +document+ to a new file inside TAUConfig::cache_dir. The file
# name is a freshly generated UUID, so each call creates a separate file.
#
# @param document [String, #to_s] content to store
# @return [Integer] number of bytes written
def add(document)
	target_path = "#{TAUConfig::cache_dir}/#{UUID.new.generate}"
	File.write(target_path, document)
end

#clean(sentence) ⇒ Object



65
66
67
# File 'lib/document-cache.rb', line 65

# Trims leading and trailing whitespace from +sentence+ and appends a
# period.
#
# @param sentence [String] raw sentence fragment
# @return [String] the stripped sentence followed by "."
def clean(sentence)
	"#{sentence.strip}."
end

#clear ⇒ Object



57
58
59
# File 'lib/document-cache.rb', line 57

# Deletes every path returned by #documents using FileUtils.rm_rf.
#
# @return [Array<String>] the list of paths that was iterated over
def clear
  documents.each do |cached_path|
    FileUtils.rm_rf(cached_path)
  end
end

#documents ⇒ Object



53
54
55
# File 'lib/document-cache.rb', line 53

# Lists the entries directly inside TAUConfig::cache_dir by globbing
# "<cache_dir>/*".
#
# @return [Array<String>] matching paths (empty when nothing matches)
def documents
	pattern = "#{TAUConfig::cache_dir}/*"
	Dir.glob(pattern)
end

#extract_matching_words(search, sentence) ⇒ Object



69
70
71
72
73
# File 'lib/document-cache.rb', line 69

# Returns the matched words for a single sentence. Stemming-based
# matching is tried first; substring matching is used only when
# stemming finds nothing.
#
# @param search [String] word to look for
# @param sentence [String] sentence to inspect
# @return [Array<String>, nil] matched words, or nil when neither
#   matcher finds anything
def extract_matching_words(search, sentence)
	stemmed = find_matches_by_stemming(search, [sentence])
	if stemmed.empty?
		find_matches_by_grepping(search, [sentence]).values.first
	else
		stemmed.values.first
	end
end

#find_examples_for(search, count = 1) ⇒ Object



61
62
63
# File 'lib/document-cache.rb', line 61

# Searches all cached documents for sentences matching +search+ by
# delegating to #find_matches_in with the paths from #documents.
#
# @param search [String] word to look for
# @param count [Integer] number of matches wanted (defaults to 1)
# @return [Object] whatever #find_matches_in returns
def find_examples_for(search, count = 1)
	find_matches_in(documents, search, count)
end

#find_matches_by_grepping(search, sentences) ⇒ Object



27
28
29
30
31
32
# File 'lib/document-cache.rb', line 27

# Finds sentences that contain +search+ as a plain substring.
#
# @param search [String] text to look for
# @param sentences [Array<String>] candidate sentences
# @return [Hash{String => Array<String>}] maps each matching sentence
#   (after #clean) to a one-element array holding +search+
def find_matches_by_grepping(search, sentences)
	sentences.each_with_object({}) do |sentence, found|
		next unless sentence.include?(search)

		found[clean(sentence)] = [search]
	end
end

#find_matches_by_stemming(search, sentences) ⇒ Object



17
18
19
20
21
22
23
24
25
# File 'lib/document-cache.rb', line 17

# Finds sentences containing a word whose stem (per @chest.stem) equals
# the stem of +search+.
#
# @param search [String] word to look for
# @param sentences [Array<String>] candidate sentences
# @return [Hash{String => Array}] maps each matching sentence (after
#   #clean) to its matching words, each passed through @chest.sanitize
def find_matches_by_stemming(search, sentences)
	target_stem = @chest.stem(search)
	sentences.each_with_object({}) do |sentence, found|
		hits = sentence.split(" ").select { |word| @chest.stem(word) == target_stem }
		next if hits.empty?

		found[clean(sentence)] = hits.map { |word| @chest.sanitize(word) }
	end
end

#find_matches_in(filenames, search, count) ⇒ Object



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/document-cache.rb', line 34

# Collects up to +count+ matching sentences from the given files.
#
# Two passes are made over +filenames+: first with
# #find_matches_by_stemming, then with #find_matches_by_grepping. Each
# file is read and split into sentences on ".", "?", "!" and newlines.
# Results from both passes accumulate in one hash. The search stops as
# soon as exactly +count+ entries are held.
#
# @param filenames [Array<String>] paths of the files to scan
# @param search [String] word to look for
# @param count [Integer] number of matches wanted
# @return [Hash] sentence => matched words; holds fewer than +count+
#   entries when the files run out, and is empty when +count+ is not
#   positive
def find_matches_in(filenames, search, count)
	matches = {}
	# A non-positive count can never be satisfied. Without this guard a
	# negative count made the trimming loop below spin forever, because
	# an empty hash can never shrink to a negative size.
	return matches if count <= 0

	[:find_matches_by_stemming, :find_matches_by_grepping].each do |matcher|
		filenames.each do |filename|
			sentences = File.read(filename).split(/[\.?!\n]/)
			matches.merge!(send(matcher, search, sentences))

			# Hash#shift removes the oldest entry, so when a file yields
			# too many matches the most recently added ones are kept.
			matches.shift while matches.size > count
			return matches if matches.size == count
		end
	end

	matches
end

#frequency_list ⇒ Object



75
76
77
78
79
80
81
# File 'lib/document-cache.rb', line 75

# Counts whitespace-separated words across all cached documents and
# returns those that occur at least twice, most frequent first.
#
# Each file is read with File.read (which closes the handle) and split
# on its own, so the last word of one file is never fused with the
# first word of the next.
#
# @return [Array<Array(String, Integer)>] [word, count] pairs in
#   descending count order; the order among equal counts is unspecified
def frequency_list
	words = documents.flat_map { |path| File.read(path).split(" ") }
	counts = words.each_with_object(Hash.new(0)) { |word, totals| totals[word] += 1 }
	counts.reject! { |_word, count| count < 2 }
	counts.sort_by { |_word, count| count }.reverse
end

#stemmed_frequency_list ⇒ Object



83
84
85
86
87
88
89
90
# File 'lib/document-cache.rb', line 83

# Counts word stems (per @chest.stem) across all cached documents and
# returns those that occur at least twice, most frequent first.
#
# Each file is read with File.read (which closes the handle) and split
# on its own, so the last word of one file is never fused with the
# first word of the next before stemming.
#
# @return [Array<Array>] [stem, count] pairs in descending count order;
#   the order among equal counts is unspecified
def stemmed_frequency_list
	stems = documents.flat_map do |path|
		File.read(path).split(" ").map { |word| @chest.stem(word) }
	end
	counts = stems.each_with_object(Hash.new(0)) { |stem, totals| totals[stem] += 1 }
	counts.reject! { |_stem, count| count < 2 }
	counts.sort_by { |_stem, count| count }.reverse
end