Class: Parts::Treebank

Inherits:
Object
  • Object
show all
Defined in:
lib/parts/treebank.rb

Instance Method Summary collapse

Constructor Details

#initialize(path = "#{File.dirname(__FILE__)}/treebank3.2.txt") ⇒ Treebank

Returns a new instance of Treebank.



3
4
5
6
7
8
# File 'lib/parts/treebank.rb', line 3

# Creates a treebank and immediately populates it from the corpus at +path+.
#
# @param path [String] location of the tagged corpus file; defaults to the
#   "treebank3.2.txt" file in the same directory as this source file.
def initialize(path = "#{File.dirname(__FILE__)}/treebank3.2.txt")
  # Every sentence is kept as an array of word-tag pairs, i.e.
  # [{:word => w1, :tag => t1}, ..., {:word => wn, :tag => tn}].
  @sentences = []
  load(path)
end

Instance Method Details

#load(path) ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/parts/treebank.rb', line 14

# Reads the tagged corpus at +path+ and appends its sentences to @sentences.
#
# Every line is split on whitespace into tokens of the form "word/tag". The
# text after the last usable slash is the tag, everything before it is the
# word (so words may themselves contain slashes). Tokens that do not have
# that shape are skipped. A token whose tag is "." closes the current
# sentence; the closing token itself is not stored. Sentences may span
# several lines. Words are stored downcased.
#
# @param path [String] path of the corpus file to read
# @return [nil]
def load(path)
  # Anchored, with a single quantifier per group: captures the same word and
  # tag as a nested-quantifier pattern would, but cannot backtrack
  # exponentially on tokens that contain no slash.
  token_pattern = %r{\A(.+)/(.+)\z}
  sentence = []

  File.open(path, "r") do |file|
    file.each_line do |line|
      line.split(' ').each do |part|
        md = token_pattern.match(part)
        next unless md

        word = md[1]
        tag = md[2]
        if tag == "."
          @sentences << sentence unless sentence.empty?
          sentence = []
        else
          sentence << {:word => word.downcase, :tag => tag}
        end
      end
    end
  end

  # Keep a final sentence that reaches end of file without a closing
  # "."-tagged token instead of silently discarding it.
  @sentences << sentence unless sentence.empty?
  nil
end

#sentences ⇒ Object



10
11
12
# File 'lib/parts/treebank.rb', line 10

# Gives access to the sentences collected so far.
#
# @return [Array, nil] the object held in @sentences. This is the stored
#   object itself, not a copy.
def sentences
  @sentences
end