Class: Gulp::Document

Inherits:
Object
  • Object
show all
Defined in:
lib/gulp/document.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(name, corpus) ⇒ Document

Returns a new instance of Document.



5
6
7
8
9
10
11
12
# File 'lib/gulp/document.rb', line 5

# Sets up an empty, not-yet-finalized document.
#
# @param name [Object] identifier of the document; stored as-is in @name
# @param corpus [Object] collaborator stored as-is in @corpus
def initialize(name, corpus)
  # Collaborators handed in by the caller.
  @corpus = corpus
  @name = name

  # Counters start empty: no words seen, no phrases recorded.
  # NOTE: phrase counts live in a plain Hash; Gulp::DataStore.new('document')
  # is a commented-out alternative in the original source.
  @phrase_counts = {}
  @word_count = 0

  # Text may be added until the document is finalized.
  @finalized = false
  @extractor = Gulp::PhraseExtractor.new
end

Instance Attribute Details

#corpus ⇒ Object (readonly)

Returns the value of attribute corpus.



3
4
5
# File 'lib/gulp/document.rb', line 3

# Reader for @corpus, the corpus object passed to the constructor.
# Returns the stored object itself, not a copy.
def corpus
  @corpus
end

#name ⇒ Object (readonly)

Returns the value of attribute name.



3
4
5
# File 'lib/gulp/document.rb', line 3

# Reader for @name, the document name passed to the constructor.
# process! passes this value to File.open, and it is the key used when
# asking the corpus whether the document was already processed.
def name
  @name
end

#phrase_counts ⇒ Object (readonly)

Returns the value of attribute phrase_counts.



3
4
5
# File 'lib/gulp/document.rb', line 3

# Reader for @phrase_counts: a Hash (initialized empty) in which add_text
# keeps, per phrase, the number of times that phrase has been extracted.
# Returns the live internal object, so mutations by callers are visible here.
def phrase_counts
  @phrase_counts
end

#word_count ⇒ Object (readonly)

Returns the value of attribute word_count.



3
4
5
# File 'lib/gulp/document.rb', line 3

# Reader for @word_count: starts at 0 and grows by the word count the
# extractor reports on every add_text call.
def word_count
  @word_count
end

Instance Method Details

#add_text(text) ⇒ Object



39
40
41
42
43
44
45
46
47
48
# File 'lib/gulp/document.rb', line 39

# Runs +text+ through the phrase extractor and folds the result into this
# document's running totals.
#
# @param text [Object] handed unchanged to @extractor.extract, which must
#   return a two-element result: a word count and an enumerable of phrases
# @raise [RuntimeError] when the document has already been finalized
# @return [Object] whatever +each+ returns for the extracted phrases
#   (the phrase list itself when it is an Array)
def add_text(text)
  raise "cannot add text once finalized" if finalized?

  words_seen, extracted = @extractor.extract(text)
  @word_count += words_seen

  # Tally every occurrence; phrases not seen before start from zero.
  extracted.each do |phrase|
    @phrase_counts[phrase] = (@phrase_counts[phrase] || 0) + 1
  end
end

#add_to_corpus! ⇒ Object



28
29
30
31
32
33
34
35
36
37
# File 'lib/gulp/document.rb', line 28

# Finalizes the document and reports it to the corpus, unless the corpus
# already knows this document.
#
# When the document was already processed nothing happens: it is not
# marked finalized and nil is returned. Otherwise every distinct phrase
# bumps the corpus' per-phrase document count once, and the document name
# is marked as processed.
#
# @return [Object, nil] the result of @corpus.mark_as_processed!, or nil
#   when the call was skipped
def add_to_corpus!
  return if already_processed?

  @finalized = true
  # Keys only: a phrase counts once per document however often it occurs.
  @phrase_counts.each_key { |phrase| @corpus.increment_phrase_document_count(phrase) }
  @corpus.mark_as_processed!(name)
end

#already_processed? ⇒ Boolean

Returns:

  • (Boolean)


20
21
22
# File 'lib/gulp/document.rb', line 20

# Asks the corpus whether a document with this name has already been
# processed. The answer comes entirely from @corpus; no local state is
# consulted besides the name.
def already_processed?
  @corpus.already_processed?(name)
end

#finalized? ⇒ Boolean

Returns:

  • (Boolean)


24
25
26
# File 'lib/gulp/document.rb', line 24

# Whether the document has been finalized. The flag starts out false and
# is set to true by add_to_corpus! when it actually runs; add_text refuses
# to accept more text once it is set.
def finalized?
  @finalized
end

#number_of_unique_phrases ⇒ Object



50
51
52
# File 'lib/gulp/document.rb', line 50

# Number of distinct phrases recorded so far, i.e. the number of entries
# in phrase_counts (each phrase is stored once, with its occurrence count).
def number_of_unique_phrases
  phrase_counts.size
end

#phrases ⇒ Object



54
55
56
57
58
# File 'lib/gulp/document.rb', line 54

# Wraps every recorded phrase in a Phrase object.
#
# Each Phrase is built from this document, the phrase itself and its
# occurrence count, in the iteration order of phrase_counts. A fresh list
# is built on every call.
#
# @return [Array] one Phrase per entry in phrase_counts
def phrases
  phrase_counts.map { |text, occurrences| Phrase.new(self, text, occurrences) }
end

#process! ⇒ Object



14
15
16
17
18
# File 'lib/gulp/document.rb', line 14

# Parses the file at +name+ with a Nokogiri SAX parser whose handler is an
# XMLTextExtractor bound to this document.
#
# The file is opened with the block form of File.open so the handle is
# closed as soon as parsing ends, including when the parser raises.
# Opening it without a block would leave the descriptor open until
# garbage collection.
#
# @raise [SystemCallError] if the file named by +name+ cannot be opened
# @return [Object] self, so calls can be chained
def process!
  handler = XMLTextExtractor.new(self)
  parser = Nokogiri::XML::SAX::Parser.new(handler)
  File.open(name) { |file| parser.parse(file) }
  self
end