Class: Wapiti::Dataset

Inherits:
Object
  • Object
show all
Extended by:
Forwardable
Includes:
Comparable, Enumerable
Defined in:
lib/wapiti/dataset.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(sequences = []) ⇒ Dataset

Returns a new instance of Dataset.



52
53
54
# File 'lib/wapiti/dataset.rb', line 52

# Creates a dataset wrapping the given collection of sequences.
#
# The collection is stored as-is (no copy is made), so later in-place
# changes such as #concat are visible through the object passed in.
#
# @param sequences [Array] the sequences to hold; defaults to a new empty Array
def initialize(sequences = [])
  @sequences = sequences
end

Instance Attribute Details

#sequences ⇒ Object (readonly)

Returns the value of attribute sequences.



11
12
13
# File 'lib/wapiti/dataset.rb', line 11

# Read-only accessor for the collection of sequences held by this dataset.
#
# @return [Object] whatever was passed to the constructor (an Array by default)
def sequences
  @sequences
end

Class Method Details

.open(path, format: File.extname(path), **opts) ⇒ Object



41
42
43
44
45
46
47
48
49
# File 'lib/wapiti/dataset.rb', line 41

# Reads the file at +path+ as UTF-8 and hands its content to +parse+.
#
# When the (case-insensitive) format is '.xml' or 'xml' the content is first
# wrapped in a REXML::Document; any other format is passed on as a plain
# String.
#
# @param path [String] location of the file to read
# @param format [String] format name or file extension; defaults to the
#   extension of +path+
# @param opts [Hash] extra options forwarded unchanged to +parse+
# @return [Object] whatever +parse+ returns
def open(path, format: File.extname(path), **opts)
  input = File.read(path, encoding: 'utf-8')
  xml_requested = ['.xml', 'xml'].include?(format.downcase)
  source = xml_requested ? REXML::Document.new(input) : input
  parse(source, **opts)
end

.parse(dataset, separator: /(?:\r?\n){2,}/, **opts) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/wapiti/dataset.rb', line 15

# Builds a dataset from one of three supported input representations.
#
# * Array: each element is one sequence, itself a list of token entries.
#   For every entry, element 0 is split on whitespace into the token value
#   followed by its observations, element 1 (converted with +to_s+) is the
#   label and element 2 is the score.
# * String: the text is split on +separator+, each chunk is handed to
#   Sequence.parse together with +opts+, and empty results are dropped.
# * REXML::Document: every 'dataset/sequence' element becomes one sequence;
#   the stripped text of each child element is split on +opts[:spacer]+
#   (whitespace by default) and every piece becomes a token labelled with
#   the child element's name.
#
# @param dataset [Array, String, REXML::Document] the data to convert
# @param separator [Regexp, String] boundary between sequences in String
#   input; defaults to two or more consecutive line breaks
# @param opts [Hash] forwarded to Sequence.parse for String input; only
#   +:spacer+ is consulted for XML input
# @return [Dataset] the newly created dataset
# @raise [ArgumentError] if +dataset+ is none of the supported types
def parse(dataset, separator: /(?:\r?\n){2,}/, **opts)
  case dataset
  when Array
    new(dataset.map { |seq|
      Sequence.new(seq.map { |tk|
        value, *obs = tk[0].split(/\s+/)
        Token.new value, label: tk[1].to_s, observations: obs, score: tk[2]
      })
    })
  when String
    new(dataset.split(separator).map { |seq|
      Sequence.parse(seq, **opts)
    }.reject(&:empty?))
  when REXML::Document
    new(dataset.elements.to_a('dataset/sequence').map { |seq|
      Sequence.new(seq.elements.to_a.map { |sgm|
        sgm.text.strip.split(opts[:spacer] || /\s+/).map { |tk|
          Token.new tk, label: sgm.name
        }
      }.flatten)
    })
  else
    # Report the class of the argument that was actually passed in; the
    # parameter is named +dataset+, there is no +input+ in this scope.
    raise ArgumentError, "unknown input type: #{dataset.class}"
  end
end

Instance Method Details

#&(other) ⇒ Object



110
111
112
# File 'lib/wapiti/dataset.rb', line 110

# Set intersection: a new dataset holding only the sequences present in
# both this dataset and +other+.
#
# @param other [#sequences] the dataset to intersect with
# @return [Dataset] a new dataset; neither operand is modified
def &(other)
  shared = sequences & other.sequences
  Dataset.new(shared)
end

#+(other) ⇒ Object



98
99
100
# File 'lib/wapiti/dataset.rb', line 98

# Concatenation: a new dataset holding this dataset's sequences followed by
# those of +other+.
#
# @param other [#sequences] the dataset to append
# @return [Dataset] a new dataset; neither operand is modified
def +(other)
  combined = sequences + other.sequences
  Dataset.new(combined)
end

#-(other) ⇒ Object



102
103
104
# File 'lib/wapiti/dataset.rb', line 102

# Set difference: a new dataset holding the sequences of this dataset that
# do not occur in +other+.
#
# @param other [#sequences] the dataset whose sequences are removed
# @return [Dataset] a new dataset; neither operand is modified
def -(other)
  remaining = sequences - other.sequences
  Dataset.new(remaining)
end

#<=>(other) ⇒ Object



77
78
79
# File 'lib/wapiti/dataset.rb', line 77

# Three-way comparison used by Comparable.
#
# Only datasets are comparable with each other; the result is that of
# comparing the two sequence collections.
#
# @param other [Object] the object to compare with
# @return [Integer, nil] the comparison of both sequence collections, or nil
#   when +other+ is not a Dataset
def <=>(other)
  return nil unless Dataset === other

  sequences <=> other.sequences
end

#concat(other) ⇒ Object



81
82
83
84
# File 'lib/wapiti/dataset.rb', line 81

# Appends the sequences of +other+ to this dataset in place.
#
# @param other [#sequences] the dataset whose sequences are appended
# @return [self] the receiver, to allow chaining
def concat(other)
  tap { sequences.concat(other.sequences) }
end

#each(&block) ⇒ Object



56
57
58
59
60
61
62
63
# File 'lib/wapiti/dataset.rb', line 56

# Iterates over the sequences of this dataset.
#
# @yield [sequence] each sequence in turn
# @return [self, Enumerator] the receiver when a block is given, otherwise
#   an enumerator over the same iteration
def each(&block)
  return to_enum unless block_given?

  sequences.each(&block)
  self
end

#eql?(other) ⇒ Boolean

Returns:

  • (Boolean)


73
74
75
# File 'lib/wapiti/dataset.rb', line 73

# Strict equality as used for Hash keys.
#
# Matching hash values alone are not enough: a non-dataset object (for
# example a plain Array holding the same sequences) can produce the same
# hash, so +other+ must also be a Dataset. This mirrors the type guard
# in #<=>.
#
# @param other [Object] the object to compare with
# @return [Boolean] true if +other+ is a Dataset with the same hash value
def eql?(other)
  Dataset === other && hash == other.hash
end

#hash ⇒ Object



69
70
71
# File 'lib/wapiti/dataset.rb', line 69

# Hash value of this dataset, taken directly from its sequence collection.
#
# @return [Object] the result of calling +hash+ on the sequences
def hash
  sequences.hash
end

#inspect ⇒ Object



153
154
155
# File 'lib/wapiti/dataset.rb', line 153

def inspect
  "#<Wapiti::Dataset sequences={#{size}}>"
end

#labels ⇒ Object



65
66
67
# File 'lib/wapiti/dataset.rb', line 65

# Collects every distinct label used by any token in the dataset.
#
# @return [Array] the unique labels across all sequences, sorted
def labels
  labels_per_sequence = map do |sequence|
    sequence.map(&:label).uniq
  end

  labels_per_sequence.flatten.uniq.sort
end

#sample(n = 1, **opts) ⇒ Object



86
87
88
# File 'lib/wapiti/dataset.rb', line 86

# Draws a random selection of sequences into a new dataset.
#
# @param n [Integer] number of sequences to draw; defaults to 1
# @param opts [Hash] forwarded to the +sample+ call on the sequences
#   (e.g. +random:+ for Array#sample)
# @return [Dataset] a new dataset wrapping the sampled sequences
def sample(n = 1, **opts)
  picked = sequences.sample(n, **opts)
  Dataset.new(picked)
end

#save(path, format: File.extname(path), **opts) ⇒ Object



140
141
142
143
144
145
146
147
148
149
150
151
# File 'lib/wapiti/dataset.rb', line 140

def save(path, format: File.extname(path), **opts)
  output = case format.downcase
    when '.txt', 'txt'
      to_s(**opts)
    when '.xml', 'xml'
      to_xml(**opts)
    else
      raise ArgumentError, "unknown format: '#{format}'"
    end

  File.write(path, output, encoding: 'utf-8', mode: 'w')
end

#slice(start, length = 1) ⇒ Object



90
91
92
93
94
95
96
# File 'lib/wapiti/dataset.rb', line 90

# Extracts part of the dataset into a new dataset.
#
# @param start [Integer, Range] index of the first sequence, or a range of
#   indices
# @param length [Integer] number of sequences to take when +start+ is not a
#   Range; defaults to 1 and is ignored for ranges
# @return [Dataset] a new dataset with the selected sequences; empty when
#   the requested position lies outside the dataset
def slice(start, length = 1)
  selected =
    if Range === start
      sequences.slice(start)
    else
      sequences.slice(start, length)
    end

  # Array#slice answers nil for an out-of-range start; fall back to an
  # empty collection so the resulting dataset is still usable.
  Dataset.new(selected || [])
end

#to_a(**opts) ⇒ Object



122
123
124
# File 'lib/wapiti/dataset.rb', line 122

# Converts the dataset into a plain array by calling +to_a+ on every
# sequence.
#
# @param opts [Hash] forwarded to each sequence's +to_a+
# @return [Array] one entry per sequence
def to_a(**opts)
  map do |sequence|
    sequence.to_a(**opts)
  end
end

#to_s(separator: "\n\n", **opts) ⇒ Object



114
115
116
# File 'lib/wapiti/dataset.rb', line 114

def to_s(separator: "\n\n", **opts)
  map { |sq| sq.to_s(**opts) }.join(separator)
end

#to_txt(separator: "\n", **opts) ⇒ Object



118
119
120
# File 'lib/wapiti/dataset.rb', line 118

# Renders the dataset using every sequence's +to_sentence+ output, joined
# by +separator+.
#
# @param separator [String] text placed between sequences; defaults to a
#   single line break
# @param opts [Hash] forwarded to each sequence's +to_sentence+
# @return [String]
def to_txt(separator: "\n", **opts)
  sentences = map { |sequence| sequence.to_sentence(**opts) }
  sentences.join(separator)
end

#to_xml(**opts) ⇒ Object



126
127
128
129
130
131
132
133
134
# File 'lib/wapiti/dataset.rb', line 126

# Serializes the dataset with Builder::XmlMarkup: an XML declaration
# followed by a <dataset> element into which every sequence writes itself
# via its own +to_xml+.
#
# @param opts [Hash] forwarded to Builder::XmlMarkup.new
# @return [Object] the result of the builder's +dataset+ call
def to_xml(**opts)
  builder = Builder::XmlMarkup.new(**opts)
  builder.instruct!
  builder.dataset { |root| each { |sequence| sequence.to_xml(root) } }
end

#to_yml(**opts) ⇒ Object



136
137
138
# File 'lib/wapiti/dataset.rb', line 136

# Builds a list with the +to_h+ result of every sequence. Note that the
# result is a plain Ruby structure; no YAML text is produced here.
#
# @param opts [Hash] forwarded to each sequence's +to_h+
# @return [Array] one entry per sequence
def to_yml(**opts)
  map do |sequence|
    sequence.to_h(**opts)
  end
end

#|(other) ⇒ Object



106
107
108
# File 'lib/wapiti/dataset.rb', line 106

# Set union: a new dataset holding the sequences of both operands without
# duplicates.
#
# @param other [#sequences] the dataset to unite with
# @return [Dataset] a new dataset; neither operand is modified
def |(other)
  united = sequences | other.sequences
  Dataset.new(united)
end