Class: Wapiti::Dataset
- Inherits:
- Object (hierarchy: Object → Wapiti::Dataset)
- Extended by:
- Forwardable
- Includes:
- Comparable, Enumerable
- Defined in:
- lib/wapiti/dataset.rb
Instance Attribute Summary collapse
- #sequences ⇒ Object (readonly)
Returns the value of attribute sequences.
Class Method Summary collapse
- .open(path, format: File.extname(path), **opts) ⇒ Object
- .parse(dataset, separator: /(?:\r?\n){2,}/, **opts) ⇒ Object
Instance Method Summary collapse
- #&(other) ⇒ Object
- #+(other) ⇒ Object
- #-(other) ⇒ Object
- #<=>(other) ⇒ Object
- #concat(other) ⇒ Object
- #each(&block) ⇒ Object
- #eql?(other) ⇒ Boolean
- #hash ⇒ Object
- #initialize(sequences = []) ⇒ Dataset (constructor)
A new instance of Dataset.
- #inspect ⇒ Object
- #labels ⇒ Object
- #sample(n = 1, **opts) ⇒ Object
- #save(path, format: File.extname(path), **opts) ⇒ Object
- #slice(start, length = 1) ⇒ Object
- #to_a(**opts) ⇒ Object
- #to_s(separator: "\n\n", **opts) ⇒ Object
- #to_txt(separator: "\n", **opts) ⇒ Object
- #to_xml(**opts) ⇒ Object
- #to_yml(**opts) ⇒ Object
- #|(other) ⇒ Object
Constructor Details
#initialize(sequences = []) ⇒ Dataset
Returns a new instance of Dataset.
52 53 54 |
# File 'lib/wapiti/dataset.rb', line 52
#
# Creates a dataset around the given collection of sequences.
#
# The argument is stored as-is (no copy is made), so later mutations of the
# passed object are visible through #sequences.
#
# @param sequences [Object] the collection to wrap; defaults to a new empty
#   array (presumably an Array of sequence objects -- confirm with callers)
def initialize(sequences = [])
  @sequences = sequences
end
Instance Attribute Details
#sequences ⇒ Object (readonly)
Returns the value of attribute sequences.
11 12 13 |
# File 'lib/wapiti/dataset.rb', line 11
#
# Read-only accessor for the wrapped collection of sequences.
#
# @return [Object] the value of the @sequences instance variable
def sequences
  @sequences
end
Class Method Details
.open(path, format: File.extname(path), **opts) ⇒ Object
41 42 43 44 45 46 47 48 49 |
# File 'lib/wapiti/dataset.rb', line 41
#
# Reads the file at +path+ as UTF-8 and hands its content to .parse.
#
# When the (case-insensitive) format is '.xml' or 'xml' the content is first
# wrapped in a REXML::Document; any other format is passed on as a String.
#
# @param path [String] file to read
# @param format [String] format selector; defaults to the extension of +path+
# @param opts [Hash] forwarded unchanged to .parse
# @return [Object] whatever .parse returns
def open(path, format: File.extname(path), **opts)
  content = File.read(path, encoding: 'utf-8')
  xml = ['.xml', 'xml'].include?(format.downcase)

  parse(xml ? REXML::Document.new(content) : content, **opts)
end
.parse(dataset, separator: /(?:\r?\n){2,}/, **opts) ⇒ Object
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/wapiti/dataset.rb', line 15
#
# Builds a new dataset from one of three supported input shapes.
#
# * Array: each element is a sequence, each sequence element a token tuple.
#   tk[0] is split on whitespace into the token value followed by its
#   observations; tk[1] (via to_s) becomes the label and tk[2] the score.
# * String: split on +separator+, each chunk is handed to Sequence.parse
#   (with +opts+), and sequences that report empty? are dropped.
# * REXML::Document: every 'dataset/sequence' element becomes a Sequence; the
#   stripped text of each child element is split on opts[:spacer] (default:
#   whitespace) and each piece becomes a Token labelled with the child
#   element's name.
#
# @param dataset [Array, String, REXML::Document] the raw input
# @param separator [Regexp, String] sequence delimiter for String input;
#   defaults to two or more consecutive line breaks
# @param opts [Hash] forwarded to Sequence.parse for String input; :spacer is
#   read for XML input
# @return [Object] the result of calling +new+ with the built sequences
# @raise [ArgumentError] if +dataset+ is none of the supported types
def parse(dataset, separator: /(?:\r?\n){2,}/, **opts)
  case dataset
  when Array
    new(dataset.map { |seq|
      Sequence.new(seq.map { |tk|
        value, *obs = tk[0].split(/\s+/)
        Token.new value, label: tk[1].to_s, observations: obs, score: tk[2]
      })
    })
  when String
    new(dataset.split(separator).map { |seq|
      Sequence.parse(seq, **opts)
    }.reject(&:empty?))
  when REXML::Document
    new(dataset.elements.to_a('dataset/sequence').map { |seq|
      Sequence.new(seq.elements.to_a.map { |sgm|
        sgm.text.strip.split(opts[:spacer] || /\s+/).map { |tk|
          Token.new tk, label: sgm.name
        }
      }.flatten)
    })
  else
    # Report the class of the actual argument; the former reference to an
    # undefined local `input` raised NameError instead of ArgumentError.
    raise ArgumentError, "unknown input type: #{dataset.class}"
  end
end
Instance Method Details
#&(other) ⇒ Object
110 111 112 |
# File 'lib/wapiti/dataset.rb', line 110
#
# Intersection: a new Dataset built from `sequences & other.sequences`.
#
# @param other [#sequences] the dataset to intersect with
# @return [Dataset] a new dataset; neither operand is modified here
def &(other)
  shared = sequences & other.sequences
  Dataset.new(shared)
end
#+(other) ⇒ Object
98 99 100 |
# File 'lib/wapiti/dataset.rb', line 98
#
# Concatenation: a new Dataset built from `sequences + other.sequences`.
#
# @param other [#sequences] the dataset whose sequences are appended
# @return [Dataset] a new dataset; use #concat to append in place instead
def +(other)
  combined = sequences + other.sequences
  Dataset.new(combined)
end
#-(other) ⇒ Object
102 103 104 |
# File 'lib/wapiti/dataset.rb', line 102
#
# Difference: a new Dataset built from `sequences - other.sequences`.
#
# @param other [#sequences] the dataset whose sequences are removed
# @return [Dataset] a new dataset; neither operand is modified here
def -(other)
  remaining = sequences - other.sequences
  Dataset.new(remaining)
end
#<=>(other) ⇒ Object
77 78 79 |
# File 'lib/wapiti/dataset.rb', line 77
#
# Orders datasets by comparing their sequences with <=>.
#
# @param other [Object] the object to compare against
# @return [Integer, nil] the result of `sequences <=> other.sequences`, or
#   nil when +other+ is not a Dataset
def <=>(other)
  return nil unless Dataset === other

  sequences <=> other.sequences
end
#concat(other) ⇒ Object
81 82 83 84 |
# File 'lib/wapiti/dataset.rb', line 81
#
# Appends the other dataset's sequences to this dataset in place by calling
# `concat` on the underlying sequences collection.
#
# @param other [#sequences] the dataset whose sequences are appended
# @return [self] the receiver, to allow chaining
def concat(other)
  tap { sequences.concat(other.sequences) }
end
#each(&block) ⇒ Object
56 57 58 59 60 61 62 63 |
# File 'lib/wapiti/dataset.rb', line 56
#
# Iterates over the sequences.
#
# @yield [sequence] each element of #sequences in turn
# @return [self, Enumerator] the receiver when a block is given, otherwise
#   the enumerator produced by +to_enum+
def each(&block)
  return to_enum unless block_given?

  sequences.each(&block)
  self
end
#eql?(other) ⇒ Boolean
73 74 75 |
# File 'lib/wapiti/dataset.rb', line 73
#
# Hash-key equality: true only when +other+ is a Dataset whose #hash matches
# this dataset's #hash.
#
# The type guard keeps the relation symmetric. Without it, any object whose
# hash happens to equal ours (for example the raw sequences array itself)
# would be reported as eql?, while that object would not consider a Dataset
# eql? in return.
#
# @param other [Object] the object to compare against
# @return [Boolean]
def eql?(other)
  Dataset === other && hash == other.hash
end
#hash ⇒ Object
69 70 71 |
# File 'lib/wapiti/dataset.rb', line 69
#
# Hash code of the dataset, delegated to the sequences collection so that the
# value depends on #sequences rather than on object identity.
#
# @return [Object] the result of `sequences.hash`
def hash
  sequences.hash
end
#inspect ⇒ Object
153 154 155 |
# File 'lib/wapiti/dataset.rb', line 153
#
# Short developer-facing description that shows only the value of +size+
# instead of dumping every sequence.
#
# @return [String] e.g. "#<Wapiti::Dataset sequences={3}>"
def inspect
  "#<Wapiti::Dataset sequences={#{size}}>"
end
#labels ⇒ Object
65 66 67 |
# File 'lib/wapiti/dataset.rb', line 65
#
# Collects the distinct labels used anywhere in the dataset.
#
# Each sequence contributes the unique +label+ values of its elements; the
# per-sequence lists are then flattened, de-duplicated and sorted.
#
# @return [Array] sorted, duplicate-free list of labels
def labels
  per_sequence = map { |seq| seq.map(&:label).uniq }
  per_sequence.flatten.uniq.sort
end
#sample(n = 1, **opts) ⇒ Object
86 87 88 |
# File 'lib/wapiti/dataset.rb', line 86
#
# Draws a sample of sequences and wraps it in a new Dataset.
#
# @param n [Integer] number of sequences to draw, passed to `sequences.sample`
# @param opts [Hash] forwarded unchanged to `sequences.sample`
# @return [Dataset] a new dataset containing the sampled sequences
def sample(n = 1, **opts)
  picked = sequences.sample(n, **opts)
  Dataset.new(picked)
end
#save(path, format: File.extname(path), **opts) ⇒ Object
140 141 142 143 144 145 146 147 148 149 150 151 |
# File 'lib/wapiti/dataset.rb', line 140
#
# Serializes the dataset and writes it to +path+ as UTF-8, using write mode
# 'w'.
#
# The (case-insensitive) format selects the serializer: '.txt'/'txt' uses
# #to_s and '.xml'/'xml' uses #to_xml. The error is raised before anything is
# written.
#
# @param path [String] destination file
# @param format [String] format selector; defaults to the extension of +path+
# @param opts [Hash] forwarded unchanged to the chosen serializer
# @return [Object] the return value of File.write
# @raise [ArgumentError] if the format is not one of the supported values
def save(path, format: File.extname(path), **opts)
  kind = format.downcase

  output =
    if ['.txt', 'txt'].include?(kind)
      to_s(**opts)
    elsif ['.xml', 'xml'].include?(kind)
      to_xml(**opts)
    else
      raise ArgumentError, "unknown format: '#{format}'"
    end

  File.write(path, output, encoding: 'utf-8', mode: 'w')
end
#slice(start, length = 1) ⇒ Object
90 91 92 93 94 95 96 |
# File 'lib/wapiti/dataset.rb', line 90
#
# Returns a new Dataset holding a contiguous portion of the sequences.
#
# @param start [Integer, Range] start index, or a Range selecting the portion
# @param length [Integer] number of sequences to take; ignored when +start+
#   is a Range
# @return [Dataset] a new dataset; empty when the requested portion lies
#   outside the available sequences
def slice(start, length = 1)
  picked =
    if Range === start
      sequences.slice(start)
    else
      sequences.slice(start, length)
    end

  # Array#slice answers nil for an out-of-range start. Fall back to an empty
  # list so the result is never a Dataset wrapping nil.
  Dataset.new(picked || [])
end
#to_a(**opts) ⇒ Object
122 123 124 |
# File 'lib/wapiti/dataset.rb', line 122
#
# Converts the dataset to an array by calling +to_a+ on every sequence.
#
# @param opts [Hash] forwarded unchanged to each sequence's +to_a+
# @return [Array] one entry per sequence
def to_a(**opts)
  map { |sequence| sequence.to_a(**opts) }
end
#to_s(separator: "\n\n", **opts) ⇒ Object
114 115 116 |
# File 'lib/wapiti/dataset.rb', line 114
#
# Renders every sequence with +to_s+ and joins the results.
#
# @param separator [String] placed between sequences; defaults to a blank
#   line ("\n\n")
# @param opts [Hash] forwarded unchanged to each sequence's +to_s+
# @return [String]
def to_s(separator: "\n\n", **opts)
  rendered = map { |sequence| sequence.to_s(**opts) }
  rendered.join(separator)
end
#to_txt(separator: "\n", **opts) ⇒ Object
118 119 120 |
# File 'lib/wapiti/dataset.rb', line 118
#
# Renders every sequence with +to_sentence+ and joins the results.
#
# @param separator [String] placed between sequences; defaults to "\n"
# @param opts [Hash] forwarded unchanged to each sequence's +to_sentence+
# @return [String]
def to_txt(separator: "\n", **opts)
  sentences = map { |sequence| sequence.to_sentence(**opts) }
  sentences.join(separator)
end
#to_xml(**opts) ⇒ Object
126 127 128 129 130 131 132 133 134 |
# File 'lib/wapiti/dataset.rb', line 126
#
# Serializes the dataset as XML: an XML declaration followed by a <dataset>
# element into which each sequence writes itself via its own +to_xml+.
#
# @param opts [Hash] forwarded unchanged to Builder::XmlMarkup.new
# @return [Object] the return value of the builder's +dataset+ call
def to_xml(**opts)
  builder = Builder::XmlMarkup.new(**opts)
  builder.instruct!
  builder.dataset do |node|
    each { |sequence| sequence.to_xml(node) }
  end
end
#to_yml(**opts) ⇒ Object
136 137 138 |
# File 'lib/wapiti/dataset.rb', line 136
#
# Converts every sequence with +to_h+.
#
# Despite the name, no YAML text is produced here: the result is the plain
# array of per-sequence +to_h+ values.
#
# @param opts [Hash] forwarded unchanged to each sequence's +to_h+
# @return [Array] one entry per sequence
def to_yml(**opts)
  map { |sequence| sequence.to_h(**opts) }
end