Class: AnyStyle::Finder

Inherits:
ParserCore
Defined in:
lib/anystyle/finder.rb

Instance Attribute Summary

Attributes inherited from ParserCore

#features, #model, #normalizers, #options

Instance Method Summary

Methods inherited from ParserCore

#check, instance, #learn, load, #load_model, #normalize, #train

Methods included from StringUtils

canonize, count, display_chars, display_width, indent, nnum, page_break?, scrub, strip_html, transliterate

Constructor Details

#initialize(options = {}) ⇒ Finder

Returns a new instance of Finder.


17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/anystyle/finder.rb', line 17

# Sets up a finder by delegating to the parent initializer and then
# installing the list of feature extractors in +@features+.
#
# @param options [Hash] forwarded unchanged to the parent initializer;
#   +options[:dictionary]+, when truthy, is handed to the Words feature
#   in place of +Dictionary.instance+
def initialize(options = {})
  super(options)

  # An explicitly supplied dictionary wins over the shared instance.
  dictionary = options[:dictionary] || Dictionary.instance

  @features = [
    Feature::Line.new,
    Feature::Category.new(strip: true),
    Feature::Words.new(dictionary: dictionary),
    Feature::Indent.new,
    Feature::Ref.new,
    # Two position features: one keyed on :page/:ln, one on :pages/:pn.
    Feature::Position.new(seq: :page, idx: :ln),
    Feature::Position.new(seq: :pages, idx: :pn)
  ]
end

Instance Method Details

#expand(dataset) ⇒ Object


31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/anystyle/finder.rb', line 31

# Runs every feature over every line of every document and stores the
# flattened results in each line's +observations+.
#
# Each document is expected to yield +[line, ln, page, pn]+ tuples from
# +each+ and to respond to +pages+; each line must respond to +value+
# and +observations=+.
#
# @param dataset [#each] collection of documents
# @return [Object] whatever +dataset.each+ returns
def expand(dataset)
  dataset.each do |document|
    document.each.with_index do |(line, ln, page, pn), idx|
      observed = features.map.with_index do |feature, fn|
        feature.observe(
          line.value,
          page: page,
          pages: document.pages,
          seq: document,
          pn: pn,
          ln: ln,
          fn: fn,
          idx: idx
        )
      end

      # Features may return nested arrays; flatten all levels.
      line.observations = observed.flatten
    end
  end
end

#find(input, format: options[:format], **opts) ⇒ Object


48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/anystyle/finder.rb', line 48

# Labels the input and renders the result in the requested format.
#
# @param input [Object] passed straight to +label+
# @param format [#to_sym] one of +:references+ / +:ref+, +:hash+ or
#   +:wapiti+; defaults to +options[:format]+
# @param opts [Hash] extra keywords forwarded to +label+ and to the
#   formatter
# @return [Object] the formatter's result, or the labelled dataset
#   itself for +:wapiti+
# @raise [ArgumentError] if the format is not recognised
def find(input, format: options[:format], **opts)
  kind = format.to_sym

  # Reject unknown formats before doing any labelling work.
  raise ArgumentError, "unknown format '#{format}'" unless %i[references ref hash wapiti].include?(kind)

  labelled = label(input, **opts)
  return labelled if kind == :wapiti
  return format_hash(labelled, **opts) if kind == :hash

  format_references(labelled, **opts)
end

#format_hash(dataset, **opts) ⇒ Object


61
62
63
# File 'lib/anystyle/finder.rb', line 61

# Converts every document of the dataset with +to_h+.
#
# @param dataset [#map] collection of documents
# @param opts [Hash] keywords forwarded to each document's +to_h+
# @return [Array] one +to_h+ result per document
def format_hash(dataset, **opts)
  dataset.map do |document|
    document.to_h(**opts)
  end
end

#format_references(dataset, **opts) ⇒ Object


65
66
67
# File 'lib/anystyle/finder.rb', line 65

# Collects the +references+ of every document in the dataset.
#
# @param dataset [#map] collection of documents
# @param opts [Hash] keywords forwarded to each document's +references+
# @return [Array] one +references+ result per document
def format_references(dataset, **opts)
  dataset.map do |document|
    document.references(**opts)
  end
end

#label(input, layout: true, crop: false, **opts) ⇒ Object


69
70
71
72
73
74
75
# File 'lib/anystyle/finder.rb', line 69

# Prepares the input, labels it with the model and applies the model's
# output to each prepared document.
#
# @param input [Object] passed to +prepare+
# @param layout [Boolean] forwarded to +prepare+ (defaults to +true+)
# @param crop [Boolean] forwarded to +prepare+ (defaults to +false+)
# @param opts [Hash] extra keywords forwarded to both +prepare+ and
#   +model.label+
# @return [Wapiti::Dataset] a dataset built from each document's
#   +label+ result
def label(input, layout: true, crop: false, **opts)
  dataset = prepare(input, layout: layout, crop: crop, **opts)
  output = model.label(dataset, **opts)

  # Pair the i-th document with the i-th entry of the model output.
  labelled = dataset.map.with_index do |document, position|
    document.label(output[position])
  end

  Wapiti::Dataset.new(labelled)
end

#prepare(input, layout: options[:layout], crop: false, pdftotext: options[:pdftotext], pdfinfo: options[:pdfinfo], **opts) ⇒ Object


77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/anystyle/finder.rb', line 77

# Normalises the input into documents and hands it to the parent
# implementation of +prepare+.
#
# * a String is opened with +Document.open+
# * an Array has each element opened with +Document.open+ and the
#   results wrapped in a +Wapiti::Dataset+
# * anything else is passed through untouched
#
# @param input [String, Array, Object] what to prepare
# @param layout [Object] forwarded to +Document.open+; defaults to
#   +options[:layout]+
# @param crop [Object] forwarded to +Document.open+; defaults to +false+
# @param pdftotext [Object] forwarded to +Document.open+; defaults to
#   +options[:pdftotext]+
# @param pdfinfo [Object] forwarded to +Document.open+; defaults to
#   +options[:pdfinfo]+
# @param opts [Hash] extra keywords forwarded to +Document.open+ and to
#   the parent +prepare+
# @return [Object] the parent implementation's result
def prepare(input,
            layout: options[:layout],
            crop: false,
            pdftotext: options[:pdftotext],
            pdfinfo: options[:pdfinfo],
            **opts)
  open_opts = { layout: layout, crop: crop, pdftotext: pdftotext, pdfinfo: pdfinfo, **opts }

  prepared =
    case input
    when String
      Document.open(input, **open_opts)
    when Array
      Wapiti::Dataset.new(input.map { |source| Document.open(source, **open_opts) })
    else
      input
    end

  # Only the extra keywords go to the parent; the document-opening
  # keywords have already been consumed above.
  super(prepared, **opts)
end

#save_each(dataset, dir: '.', tagged: false, **opts) ⇒ Object


94
95
96
97
98
99
100
# File 'lib/anystyle/finder.rb', line 94

# Writes every document of the dataset to its own file inside +dir+.
#
# The file name is the document's path without directory and extension,
# or the document's index in the dataset when it has no path. The
# extension is +ttx+ for tagged output and +txt+ otherwise.
#
# @param dataset [#each] collection of documents responding to +path+
#   and +to_s+
# @param dir [String] target directory (defaults to the current one)
# @param tagged [Boolean] selects the extension and is forwarded to
#   each document's +to_s+
# @param opts [Hash] extra keywords forwarded to each document's +to_s+
# @return [Object] whatever +dataset.each+ returns
def save_each(dataset, dir: '.', tagged: false, **opts)
  extension = tagged ? 'ttx' : 'txt'

  dataset.each.with_index do |document, position|
    stem =
      if document.path.nil?
        position
      else
        File.basename(document.path, File.extname(document.path))
      end

    target = File.join(dir, "#{stem}.#{extension}")
    File.write(target, document.to_s(tagged: tagged, **opts))
  end
end