7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
# File 'lib/corpus/utils.rb', line 7
# Loads the movie review corpus and splits it into training and test sets.
#
# The corpus lives under +test/corpora/movie_reviews+, resolved relative to
# this file. When the extracted +movie_reviews/neg+ directory exists, every
# +*.txt+ file in +movie_reviews/neg+ and +movie_reviews/pos+ is read.
# Otherwise the entries of +movie_reviews.zip+ are read through +::Archive+,
# and each entry is also written below the corpus directory, which makes the
# directory branch available on later calls.
#
# Per classification, documents are assigned in load order: the first
# <tt>count * 3 / 4</tt> (integer division) go to +:training+, the rest to
# +:test+.
#
# @return [Hash{Symbol => Array<Hash>}] +:training+ and +:test+ arrays whose
#   elements look like <tt>{ :classification => Symbol, :text => String }</tt>,
#   with +:classification+ being +:negative+ or +:positive+
def self.load_movie_reviews
  base_dir = File.join(File.dirname(__FILE__), '..', '..', 'test', 'corpora', 'movie_reviews')
  classification_count = { :negative => 0, :positive => 0 }
  movie_review_corpus = []

  if File.directory?(File.join(base_dir, 'movie_reviews', 'neg'))
    ['negative', 'positive'].each do |classification|
      # Directory names are the first three letters of the label: neg / pos.
      dir = File.join(base_dir, 'movie_reviews', classification[0, 3])
      # Sorted so the training/test split does not depend on the order in
      # which the filesystem happens to list the files.
      Dir[dir + "/*.txt"].sort.each do |filename|
        movie_review_corpus << { :classification => classification.to_sym, :text => File.read(filename) }
        classification_count[classification.to_sym] += 1
      end
    end
  else
    ::Archive.read_open_filename(File.join(base_dir, 'movie_reviews.zip')) do |ar|
      # next_header returns nil once the archive is exhausted.
      while (entry = ar.next_header)
        name = entry.pathname
        # Entries ending in a slash are directories: create them and move on.
        if name =~ /\/$/
          FileUtils.mkdir_p File.join(base_dir, name)
          next
        end

        # Any entry whose path does not contain "neg" is treated as positive.
        classification = name =~ /neg/ ? :negative : :positive
        text = ar.read_data
        movie_review_corpus << { :classification => classification, :text => text }

        file = File.join(base_dir, name)
        # The archive is not guaranteed to carry an explicit directory entry
        # for every file, so make sure the parent exists before writing.
        FileUtils.mkdir_p File.dirname(file)
        File.open(file, 'w') { |f| f.write(text) }
        classification_count[classification] += 1
      end
    end
  end

  movie_review = { :training => [], :test => [] }
  count = { :positive => 0, :negative => 0 }
  movie_review_corpus.each do |doc|
    label = doc[:classification]
    count[label] += 1
    # Documents past the first three quarters of their class are held out.
    if count[label] > classification_count[label] * 3 / 4
      movie_review[:test] << doc
    else
      movie_review[:training] << doc
    end
  end
  movie_review
end
|