Class: AnyStyle::Document

Inherits:
Wapiti::Sequence
  • Object
show all
Extended by:
PDFUtils
Includes:
StringUtils
Defined in:
lib/anystyle/document.rb

Constant Summary collapse

# Case-insensitive pattern that #references matches against the text of a
# section's heading lines to decide whether the section is a reference list.
REFSECT =
/references|referenzen|cited|bibliogra|secondary sources|literatur/i

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from PDFUtils

pdf_info, pdf_page_size, pdf_to_text

Methods included from StringUtils

canonize, count, display_chars, display_width, indent, nnum, page_break?, scrub, strip_html, transliterate

Instance Attribute Details

#info ⇒ Object

Returns the value of attribute info


50
51
52
# File 'lib/anystyle/document.rb', line 50

# Reader for the +info+ attribute.
#
# @return [Object] the current value of @info (nil when never assigned)
def info
  @info
end

#meta ⇒ Object

Returns the value of attribute meta


50
51
52
# File 'lib/anystyle/document.rb', line 50

# Reader for the +meta+ attribute.
#
# @return [Object] the current value of @meta (nil when never assigned)
def meta
  @meta
end

#pages ⇒ Object

Returns the value of attribute pages


50
51
52
# File 'lib/anystyle/document.rb', line 50

# Reader for the +pages+ attribute.
#
# @return [Object] the current value of @pages (nil when never assigned)
def pages
  @pages
end

#path ⇒ Object

Returns the value of attribute path


50
51
52
# File 'lib/anystyle/document.rb', line 50

# Reader for the +path+ attribute.
#
# @return [Object] the current value of @path (nil when never assigned)
def path
  @path
end

#tokens ⇒ Object Also known as: lines

Returns the value of attribute tokens


50
51
52
# File 'lib/anystyle/document.rb', line 50

# Reader for the +tokens+ attribute; the same reader is also exposed
# under the name +lines+.
#
# @return [Object] the current value of @tokens (nil when never assigned)
def tokens
  @tokens
end

Class Method Details

.open(path, format: File.extname(path), tagged: false, **opts) ⇒ Object

Raises:

  • (ArgumentError)

20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/anystyle/document.rb', line 20

# Opens the file at +path+ and parses its text into a new document.
#
# The reader is chosen from +format+ (compared case-insensitively):
# '.pdf' is converted with pdf_to_text, '.ttx' is read as tagged text and
# '.txt' is read as plain text.
#
# @param path [String] location of the file to open
# @param format [String] file extension including the leading dot;
#   defaults to the extension of +path+
# @param tagged [Boolean] whether the input carries "label | text" tags;
#   forced to true for '.ttx'
# @param opts [Hash] forwarded to the PDF helpers; a truthy :parse_meta or
#   :parse_info additionally triggers pdf_meta / pdf_info
# @return [Object] the parsed document with path, meta and info assigned
# @raise [ArgumentError] if the path is tainted, the file does not exist,
#   or the format is not one of the supported extensions
def open(path, format: File.extname(path), tagged: false, **opts)
  # Object#tainted? was removed in Ruby 3.2, so only consult it on
  # interpreters that still provide it.
  if path.respond_to?(:tainted?) && path.tainted?
    raise ArgumentError, "cannot open tainted path: '#{path}'"
  end
  raise ArgumentError,
    "document not found: '#{path}'" unless File.exist?(path)

  path = File.absolute_path(path)

  # meta and info stay nil unless the PDF branch assigns them.
  case format.to_s.downcase
  when '.pdf'
    meta = pdf_meta path, **opts if opts[:parse_meta]
    info = pdf_info path, **opts if opts[:parse_info]
    input = pdf_to_text path, **opts
  when '.ttx'
    tagged = true
    input = File.read(path, encoding: 'utf-8')
  when '.txt'
    input = File.read(path, encoding: 'utf-8')
  else
    # Without this branch +input+ would stay nil and parse would fail with
    # a NoMethodError that hides the real cause.
    raise ArgumentError, "unsupported format: '#{format}'"
  end

  doc = parse input, tagged: tagged
  doc.path = path
  doc.meta = meta
  doc.info = info
  doc
end

.parse(string, delimiter: /\r?\n/, tagged: false) ⇒ Object


9
10
11
12
13
14
15
16
17
18
# File 'lib/anystyle/document.rb', line 9

# Splits +string+ into lines and wraps each one in a Wapiti::Token, then
# builds a new instance from the resulting token list.
#
# In tagged mode every line is expected to look like "label | text". A line
# whose label part is empty inherits the label of the closest preceding
# labelled line.
#
# @param string [String] the raw document text
# @param delimiter [Regexp, String] line separator passed to String#split
# @param tagged [Boolean] whether lines carry a "label | " prefix
# @return [Object] whatever +new+ returns for the token list
def parse(string, delimiter: /\r?\n/, tagged: false)
  current_label = ''
  tokens = string.split(delimiter).map do |line|
    text = line
    if tagged
      label, text = line.split(/\s*\| /, 2)
      # A blank line splits into [] and leaves label nil; treat that like
      # an empty label instead of failing on nil.empty?.
      current_label = label unless label.nil? || label.empty?
    end
    # text is nil when the line was blank or had no "| " separator; hand
    # the token an empty string rather than nil in that case.
    Wapiti::Token.new text.to_s, label: current_label.to_s
  end
  new(tokens)
end

Instance Method Details

#each ⇒ Object


65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/anystyle/document.rb', line 65

# Walks every line of every page.
#
# With a block, yields (line, line index within its page, page, page index)
# and returns self. Without a block, returns an enumerator from to_enum.
def each
  return to_enum unless block_given?

  pages.each.with_index do |current_page, page_no|
    current_page.lines.each.with_index do |current_line, line_no|
      yield current_line, line_no, current_page, page_no
    end
  end

  self
end

#each_section(skip: ['meta']) ⇒ Object


78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/anystyle/document.rb', line 78

# Groups the document's lines into sections and yields each one as a
# two-element array [head, body].
#
# Lines labelled 'title' form the head. A title line that follows 'ref' or
# 'text' content closes the current section and starts the next one. Lines
# labelled 'ref' or 'text' always go into the body; lines with any other
# label go into the body unless the label is listed in +skip+.
#
# @param skip [Array<String>] labels to leave out of section bodies
# @yieldparam section [Array] a [head, body] pair of line arrays
# @return [self, Enumerator] self when a block is given, otherwise an
#   enumerator over the sections
def each_section(skip: ['meta'])
  # Forward +skip+ so the block-less form filters exactly like the block
  # form instead of silently falling back to the default.
  return to_enum(:each_section, skip: skip) unless block_given?

  head = []
  body = []
  seen_content = false

  lines.each do |ln|
    case ln.label
    when 'title'
      if seen_content
        yield [head, body]
        head, body, seen_content = [ln], [], false
      else
        head << ln
      end
    when 'ref', 'text'
      body << ln
      seen_content = true
    else
      body << ln unless skip.include?(ln.label)
    end
  end

  # NOTE(review): a trailing body that never saw a title line is dropped
  # here; confirm that title-less documents are meant to yield no sections.
  yield [head, body] unless head.empty?

  self
end

#include_references?(rc, tc) ⇒ Boolean

Returns:

  • (Boolean)

169
170
171
# File 'lib/anystyle/document.rb', line 169

# Decides whether a section's reference lines should be kept.
#
# A section qualifies when it has more than 10 reference lines, or when
# reference and text lines together exceed 20 and references make up more
# than 0.2 of the text-line count.
#
# @param rc [Integer] number of 'ref' lines in the section
# @param tc [Integer] number of 'text' lines in the section
# @return [Boolean]
def include_references?(rc, tc)
  return true if rc > 10
  return false unless (rc + tc) > 20

  (rc.to_f / tc) > 0.2
end

#inspect ⇒ Object


194
195
196
# File 'lib/anystyle/document.rb', line 194

# Short debugging representation showing the value of #size in braces.
#
# @return [String]
def inspect
  format('#<AnyStyle::Document lines={%s}>', size)
end

#label(other) ⇒ Object


109
110
111
112
113
114
115
116
117
118
# File 'lib/anystyle/document.rb', line 109

# Builds a copy of this document whose tokens keep this document's line
# values but take label, observations and score from the token at the same
# index in +other+.
#
# @param other [#[]] indexable collection of tokens supplying the labels
# @return [Object] a duplicate of self with freshly built tokens
def label(other)
  doc = dup

  relabeled = lines.each_with_index.map do |line, idx|
    source = other[idx]
    Wapiti::Token.new(
      line.value,
      label: source.label.to_s,
      # Copy the observations so the new token does not share the array.
      observations: source.observations.dup,
      score: source.score
    )
  end

  doc.tokens = relabeled
  doc
end

#line_counts ⇒ Object


53
54
55
# File 'lib/anystyle/document.rb', line 53

# Lazily created counter table stored in @line_counts.
#
# The hash is built with a default of 0, so reading a missing key yields 0
# without storing it.
# NOTE(review): what is being counted is not visible here — presumably
# occurrences of line text; confirm against the code that fills it.
#
# @return [Hash] the memoised counter hash
def line_counts
  @line_counts ||= Hash.new(0)
end

#nnum_counts ⇒ Object


57
58
59
# File 'lib/anystyle/document.rb', line 57

# Lazily created counter table stored in @nnum_counts.
#
# The hash is built with a default of 0, so reading a missing key yields 0
# without storing it.
# NOTE(review): presumably keyed by the result of StringUtils#nnum —
# confirm against the code that fills it.
#
# @return [Hash] the memoised counter hash
def nnum_counts
  @nnum_counts ||= Hash.new(0)
end

#references(normalize_blocks: false, **opts) ⇒ Object


147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# File 'lib/anystyle/document.rb', line 147

# Collects the document's references via Refs.parse.
#
# Without +normalize_blocks+ all lines are parsed in one go. With it, the
# document is processed section by section: sections with no 'ref' lines
# are ignored, and a section is only used when one of its heading lines
# matches REFSECT or include_references? accepts its ref/text line counts.
# Each accepted body is passed through Refs.normalize! before parsing.
#
# @param normalize_blocks [Boolean] enable the per-section processing
# @param opts [Hash] accepted for signature compatibility; not used here
# @return [Array] the parsed references
def references(normalize_blocks: false, **opts)
  return Refs.parse(lines).to_a unless normalize_blocks

  each_section.each_with_object([]) do |(head, body), collected|
    ref_count = body.count { |tk| tk.label == 'ref' }
    next if ref_count.zero?

    text_count = body.count { |tk| tk.label == 'text' }
    in_ref_section = head.any? { |tk| tk.value =~ REFSECT }

    # Sections with only a few ref lines are skipped unless their heading
    # marks them as a reference section.
    next unless in_ref_section || include_references?(ref_count, text_count)

    # Reference sections get the wider normalisation window.
    Refs.normalize! body, max_win_size: in_ref_section ? 6 : 2
    collected.concat Refs.parse(body).to_a
  end
end

#sections(delimiter: "\n", spacer: ' ', **opts) ⇒ Object


173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/anystyle/document.rb', line 173

# Renders every section from each_section as a hash with :title and :text.
#
# Heading lines are passed through display_chars, left-stripped,
# Unicode-normalised and joined with +spacer+. Body lines are passed
# through display_chars, Unicode-normalised and joined with +delimiter+.
#
# @param delimiter [String] separator between body lines
# @param spacer [String] separator between heading lines
# @param opts [Hash] accepted for signature compatibility; not used here
# @return [Array<Hash>] one { title:, text: } hash per section
def sections(delimiter: "\n", spacer: ' ', **opts)
  each_section.map do |(head, body)|
    heading_parts = head.map { |tk| display_chars(tk.value).lstrip.unicode_normalize }
    body_parts = body.map { |tk| display_chars(tk.value).unicode_normalize }

    { title: heading_parts.join(spacer), text: body_parts.join(delimiter) }
  end
end

#title(delimiter: " ", **opts) ⇒ Object


186
187
188
189
190
191
192
# File 'lib/anystyle/document.rb', line 186

# Joins the values of the first uninterrupted run of 'title' lines.
#
# Lines before the first title line are ignored, and collection stops at
# the first non-title line after it.
#
# @param delimiter [String] separator placed between the title lines
# @param opts [Hash] accepted for signature compatibility; not used here
# @return [String] the joined title, or '' when there is no title line
def title(delimiter: " ", **opts)
  parts = []

  lines.each do |ln|
    if ln.label == 'title'
      parts << ln.value
    elsif !parts.empty?
      # The first title run has ended; later title lines are not included.
      break
    end
  end

  parts.join(delimiter)
end

#to_a(encode: true, **opts) ⇒ Object


133
134
135
# File 'lib/anystyle/document.rb', line 133

# Delegates to the superclass implementation of to_a, with +encode+
# defaulting to true here.
#
# @param encode [Boolean] forwarded to super as the encode: keyword
# @param opts [Hash] any further keywords, forwarded unchanged
# @return [Object] whatever the superclass to_a returns
def to_a(encode: true, **opts)
  super(encode: encode, **opts)
end

#to_h(**opts) ⇒ Object


137
138
139
140
141
142
143
144
145
# File 'lib/anystyle/document.rb', line 137

# Summarises the document as a hash with the keys :info, :meta, :sections,
# :title and :references (in that order).
#
# @param opts [Hash] forwarded unchanged to #sections, #title and
#   #references
# @return [Hash]
def to_h(**opts)
  summary = {}
  summary[:info] = info
  summary[:meta] = meta
  summary[:sections] = sections(**opts)
  summary[:title] = title(**opts)
  summary[:references] = references(**opts)
  summary
end

#to_s(delimiter: "\n", encode: false, tagged: false, **opts) ⇒ Object


120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/anystyle/document.rb', line 120

# Serialises the document to a string.
#
# In tagged mode each line is written as a 14-character label column
# (padded with spaces, longer labels cut off), followed by "| " and the
# line's value. A label equal to the previous line's label is left blank.
# Otherwise the call is handed to the superclass with expanded: false.
#
# @param delimiter [String] separator placed between lines
# @param encode [Boolean] forwarded to super in untagged mode
# @param tagged [Boolean] whether to emit the "label | value" format
# @param opts [Hash] further keywords forwarded to super in untagged mode
# @return [String]
def to_s(delimiter: "\n", encode: false, tagged: false, **opts)
  unless tagged
    return super(delimiter: delimiter, encode: encode, tagged: tagged, expanded: false, **opts)
  end

  previous = nil
  rows = lines.map do |ln|
    shown = ln.label == previous ? '' : ln.label
    previous = ln.label
    # Appending 14 spaces and cutting at 14 characters gives a fixed-width
    # label column.
    format('%.14s| %s', "#{shown}              ", ln.value)
  end

  rows.join(delimiter)
end