Class: AnyStyle::Finder
Instance Attribute Summary
Attributes inherited from ParserCore
#features, #model, #mtime, #normalizers, #options
Instance Method Summary
collapse
-
#expand(dataset) ⇒ Object
-
#find(input, format: options[:format], **opts) ⇒ Object
-
#format_hash(dataset, **opts) ⇒ Object
-
#format_references(dataset, **opts) ⇒ Object
-
#initialize(options = {}) ⇒ Finder
constructor
A new instance of Finder.
-
#label(input, layout: true, crop: false, **opts) ⇒ Object
-
#prepare(input, layout: options[:layout], crop: false, pdftotext: options[:pdftotext], pdfinfo: options[:pdfinfo], **opts) ⇒ Object
-
#save_each(dataset, dir: '.', tagged: false, **opts) ⇒ Object
Methods inherited from ParserCore
#check, instance, #learn, load, #load_model, #normalize, #reload, #stale?, #train
canonize, count, display_chars, display_width, indent, nnum, page_break?, scrub, strip_html, transliterate
Constructor Details
#initialize(options = {}) ⇒ Finder
Returns a new instance of Finder.
Instance Method Details
#expand(dataset) ⇒ Object
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
# File 'lib/anystyle/finder.rb', line 31
# Fills in the observations of every line in every document of +dataset+.
#
# Each document is walked through the enumerator returned by its +each+;
# the items it yields are destructured into a line, a line number, a page
# and a page number. Every entry of +features+ is asked to +observe+ the
# line's value together with that positional context, and the combined
# (fully flattened) results are assigned to +line.observations+.
#
# @param dataset a collection of documents responding to +each+
# @return whatever +dataset.each+ returns
def expand(dataset)
  dataset.each do |document|
    document.each.with_index do |(line, line_no, page, page_no), position|
      observed = features.map.with_index do |feature, feature_no|
        feature.observe(
          line.value,
          page: page,
          pages: document.pages,
          seq: document,
          pn: page_no,
          ln: line_no,
          fn: feature_no,
          idx: position
        )
      end

      # A feature may return nested arrays; observations are stored flat.
      line.observations = observed.flatten
    end
  end
end
|
#find(input, format: options[:format], **opts) ⇒ Object
48
49
50
51
52
53
54
55
56
57
58
59
|
# File 'lib/anystyle/finder.rb', line 48
# Labels +input+ and renders the result in the requested output format.
#
# @param input passed through unchanged to #label
# @param format [#to_sym] one of :references, :ref, :hash or :wapiti;
#   defaults to +options[:format]+
# @param opts forwarded both to #label and to the selected formatter
# @return the labelled dataset for :wapiti, otherwise the result of
#   #format_references (:references, :ref) or #format_hash (:hash)
# @raise [ArgumentError] if the format is not one of the supported names
def find(input, format: options[:format], **opts)
  kind = format.to_sym

  # Reject unsupported formats up front so nothing is labelled needlessly.
  unless %i[references ref hash wapiti].include?(kind)
    raise ArgumentError, "unknown format '#{format}'"
  end

  labelled = label(input, **opts)

  case kind
  when :hash then format_hash(labelled, **opts)
  when :wapiti then labelled
  else format_references(labelled, **opts)
  end
end
|
61
62
63
|
# File 'lib/anystyle/finder.rb', line 61
# Converts every document of +dataset+ by calling its +to_h+.
#
# @param dataset a collection of documents responding to +map+
# @param opts keyword options handed to each document's +to_h+
# @return the mapped collection, one +to_h+ result per document
def format_hash(dataset, **opts)
  dataset.map do |document|
    document.to_h(**opts)
  end
end
|
65
66
67
|
# File 'lib/anystyle/finder.rb', line 65
# Collects the result of calling +references+ on every document of +dataset+.
#
# @param dataset a collection of documents responding to +map+
# @param opts keyword options handed to each document's +references+
# @return the mapped collection, one +references+ result per document
def format_references(dataset, **opts)
  dataset.map do |document|
    document.references(**opts)
  end
end
|
#label(input, layout: true, crop: false, **opts) ⇒ Object
69
70
71
72
73
74
75
|
# File 'lib/anystyle/finder.rb', line 69
# Prepares +input+, runs the model over it and applies the model's output
# to each prepared document.
#
# @param input forwarded to #prepare
# @param layout forwarded to #prepare (defaults to +true+)
# @param crop forwarded to #prepare (defaults to +false+)
# @param opts forwarded to #prepare and to +model.label+
# @return [Wapiti::Dataset] built from the result of calling +label+ on each
#   prepared document with the model output found at the same index
def label(input, layout: true, crop: false, **opts)
  prepared = prepare(input, layout: layout, crop: crop, **opts)
  tags = model.label(prepared, **opts)

  # Documents and model output are matched up by position.
  labelled = prepared.map.with_index do |document, position|
    document.label(tags[position])
  end

  Wapiti::Dataset.new(labelled)
end
|
#prepare(input, layout: options[:layout], crop: false, pdftotext: options[:pdftotext], pdfinfo: options[:pdfinfo], **opts) ⇒ Object
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
# File 'lib/anystyle/finder.rb', line 77
# Turns +input+ into something the parent implementation of +prepare+ can
# work with, then delegates to it.
#
# A String is opened with +Document.open+; an Array has each element opened
# with +Document.open+ and the results wrapped in a +Wapiti::Dataset+; any
# other input is passed on untouched.
#
# @param input [String, Array, Object] the source(s) to prepare
# @param layout passed to +Document.open+ (defaults to +options[:layout]+)
# @param crop passed to +Document.open+ (defaults to +false+)
# @param pdftotext passed to +Document.open+ (defaults to +options[:pdftotext]+)
# @param pdfinfo passed to +Document.open+ (defaults to +options[:pdfinfo]+)
# @param opts passed both to +Document.open+ and to the parent +prepare+
# @return the result of the parent +prepare+
def prepare(input,
            layout: options[:layout],
            crop: false,
            pdftotext: options[:pdftotext],
            pdfinfo: options[:pdfinfo],
            **opts)
  # The named keywords only concern document loading; the parent
  # implementation receives just the remaining +opts+.
  doc_opts = {
    layout: layout,
    crop: crop,
    pdftotext: pdftotext,
    pdfinfo: pdfinfo
  }.merge(opts)

  dataset =
    case input
    when String
      Document.open(input, **doc_opts)
    when Array
      Wapiti::Dataset.new(input.map { |source| Document.open(source, **doc_opts) })
    else
      input
    end

  super(dataset, **opts)
end
|
#save_each(dataset, dir: '.', tagged: false, **opts) ⇒ Object
94
95
96
97
98
99
100
|
# File 'lib/anystyle/finder.rb', line 94
# Writes every document of +dataset+ to its own file inside +dir+.
#
# The file name is the document path's basename without its extension, or
# the document's index in the dataset when the document has no path. The
# extension is 'ttx' when +tagged+ is truthy and 'txt' otherwise.
#
# @param dataset a collection of documents responding to +path+ and +to_s+
# @param dir [String] target directory (defaults to '.')
# @param tagged selects the extension and is passed on to +to_s+
# @param opts additional keyword options passed to each document's +to_s+
# @return whatever +dataset.each.with_index+ returns
def save_each(dataset, dir: '.', tagged: false, **opts)
  extension = tagged ? 'ttx' : 'txt'

  dataset.each.with_index do |document, position|
    stem =
      if document.path.nil?
        position
      else
        File.basename(document.path, File.extname(document.path))
      end

    target = File.join(dir, "#{stem}.#{extension}")
    File.write(target, document.to_s(tagged: tagged, **opts))
  end
end
|