Class: AnyStyle::Document
- Inherits:
-
Wapiti::Sequence
- Object
- Wapiti::Sequence
- AnyStyle::Document
show all
- Extended by:
- PDFUtils
- Includes:
- StringUtils
- Defined in:
- lib/anystyle/document.rb
Constant Summary
collapse
- REFSECT =
/references|referenzen|cited|bibliogra|secondary sources|literatur/i
Instance Attribute Summary collapse
Class Method Summary
collapse
Instance Method Summary
collapse
Methods included from PDFUtils
pdf_info, pdf_page_size, pdf_to_text
Methods included from StringUtils
canonize, count, display_chars, display_width, indent, nnum, page_break?, scrub, strip_html, transliterate
Instance Attribute Details
#info ⇒ Object
Returns the value of attribute info.
48
49
50
|
# File 'lib/anystyle/document.rb', line 48
# Reader for the +info+ attribute.
#
# @return [Object, nil] whatever is currently stored in @info
def info; @info; end
|
#meta ⇒ Object
Returns the value of attribute meta.
48
49
50
|
# File 'lib/anystyle/document.rb', line 48
# Reader for the +meta+ attribute.
#
# @return [Object, nil] whatever is currently stored in @meta
def meta; @meta; end
|
#pages ⇒ Object
Returns the value of attribute pages.
48
49
50
|
# File 'lib/anystyle/document.rb', line 48
# Reader for the +pages+ attribute.
#
# @return [Object, nil] whatever is currently stored in @pages
def pages; @pages; end
|
#path ⇒ Object
Returns the value of attribute path.
48
49
50
|
# File 'lib/anystyle/document.rb', line 48
# Reader for the +path+ attribute.
#
# @return [Object, nil] whatever is currently stored in @path
def path; @path; end
|
#tokens ⇒ Object
Also known as:
lines
Returns the value of attribute tokens.
48
49
50
|
# File 'lib/anystyle/document.rb', line 48
# Reader for the +tokens+ attribute.
#
# @return [Object, nil] whatever is currently stored in @tokens
def tokens; @tokens; end
|
Class Method Details
.open(path, format: File.extname(path), tagged: false, **opts) ⇒ Object
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
# File 'lib/anystyle/document.rb', line 20
# Reads the file at +path+ and parses its content into a document.
#
# The format decides how the text is obtained:
# * '.pdf' - text comes from +pdf_to_text+; +pdf_meta+ / +pdf_info+ are
#   called only when opts[:parse_meta] / opts[:parse_info] are truthy.
# * '.ttx' - the file is read as UTF-8 and parsed as tagged input.
# * '.txt' - the file is read as UTF-8.
#
# @param path [String] location of the file to open
# @param format [String] file extension including the leading dot;
#   compared case-insensitively
# @param tagged [Boolean] passed on to +parse+ (forced to true for '.ttx')
# @param opts [Hash] extra options, forwarded to the PDF helpers
# @return [Object] the value returned by +parse+, with path, meta and
#   info assigned
# @raise [ArgumentError] if the file does not exist or the format is not
#   one of the supported extensions
def open(path, format: File.extname(path), tagged: false, **opts)
  raise ArgumentError,
    "document not found: '#{path}'" unless File.exist?(path)

  path = File.absolute_path(path)
  meta = nil
  info = nil

  case format.to_s.downcase
  when '.pdf'
    meta = pdf_meta path, **opts if opts[:parse_meta]
    info = pdf_info path, **opts if opts[:parse_info]
    input = pdf_to_text path, **opts
  when '.ttx'
    tagged = true
    input = File.read(path, encoding: 'utf-8')
  when '.txt'
    input = File.read(path, encoding: 'utf-8')
  else
    # Without this branch +input+ stays nil and the failure surfaces later
    # as an unrelated NoMethodError while parsing.
    raise ArgumentError, "unsupported document format: '#{format}'"
  end

  doc = parse input, tagged: tagged
  doc.path = path
  doc.meta = meta
  doc.info = info
  doc
end
|
.parse(string, delimiter: /\r?\n/, tagged: false) ⇒ Object
9
10
11
12
13
14
15
16
17
18
|
# File 'lib/anystyle/document.rb', line 9
# Builds a document from raw text, creating one Wapiti::Token per line.
#
# When +tagged+ is set, each line is split into a label column and a text
# column at the first "|" separator. A line whose label column is empty
# inherits the label of the closest preceding labelled line.
#
# @param string [String] the text to split into lines
# @param delimiter [String, Regexp] line separator handed to String#split
# @param tagged [Boolean] whether lines carry a "label | text" prefix
# @return [Object] the result of calling +new+ with the token array
def parse(string, delimiter: /\r?\n/, tagged: false)
  current_label = ''

  tokens = string.split(delimiter).map do |line|
    if tagged
      label, line = line.split(/\s*\|(?: |$)/, 2)
      # An empty line splits into [], leaving +label+ nil; calling
      # +empty?+ on it used to raise NoMethodError.
      current_label = label unless label.nil? || label.empty?
      # No separator (or an empty line) means there is no text part.
      line ||= ''
    end

    Wapiti::Token.new line, label: current_label.to_s
  end

  new(tokens)
end
|
Instance Method Details
#each ⇒ Object
63
64
65
66
67
68
69
70
71
72
73
74
|
# File 'lib/anystyle/document.rb', line 63
# Walks every line of every page in order.
#
# @yield [line, line_no, page, page_no] the line, its zero-based index
#   within the page, the page, and the page's zero-based index
# @return [self, Enumerator] self when a block is given, otherwise the
#   result of +to_enum+
def each
  return to_enum unless block_given?

  pages.each.with_index do |page, page_no|
    page.lines.each.with_index do |line, line_no|
      yield line, line_no, page, page_no
    end
  end

  self
end
|
#each_section(skip: ['meta']) ⇒ Object
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
|
# File 'lib/anystyle/document.rb', line 76
# Groups the document's lines into sections made of heading lines and
# body lines.
#
# Consecutive 'title' lines accumulate into the current heading. Once a
# 'ref' or 'text' line has been seen, the next 'title' line closes the
# current section and starts a new one.
#
# @param skip [Array<String>] labels whose lines are left out of section
#   bodies; 'title', 'ref' and 'text' lines are never skipped
# @yield [section] a two-element array +[head, body]+ of line objects
# @return [self, Enumerator] self when a block is given, otherwise an
#   enumerator over the sections
def each_section(skip: ['meta'])
  # Forward +skip+ so the enumerator form applies the caller's filter
  # instead of silently falling back to the default.
  return to_enum(:each_section, skip: skip) unless block_given?

  head = []
  body = []
  seen_content = false

  lines.each do |ln|
    case ln.label
    when 'title'
      if seen_content
        yield [head, body]
        head, body, seen_content = [ln], [], false
      else
        head << ln
      end
    when 'ref', 'text'
      body << ln
      seen_content = true
    else
      body << ln unless skip.include?(ln.label)
    end
  end

  # Flush the final section. Content without a heading is emitted too,
  # matching how a heading-less section is emitted when a title follows
  # it; otherwise a document with no title lines would yield nothing.
  yield [head, body] if seen_content || !head.empty?

  self
end
|
#include_references?(rc, tc) ⇒ Boolean
167
168
169
|
# File 'lib/anystyle/document.rb', line 167
# Decides from line counts alone whether a section's references should be
# included.
#
# @param rc [Numeric] number of 'ref' lines
# @param tc [Numeric] number of 'text' lines
# @return [Boolean] true when there are more than 10 ref lines, or when
#   the section has more than 20 ref+text lines and the ref-to-text ratio
#   exceeds 0.2
def include_references?(rc, tc)
  return true if rc > 10

  (rc + tc) > 20 && (rc.to_f / tc) > 0.2
end
|
#inspect ⇒ Object
192
193
194
|
# File 'lib/anystyle/document.rb', line 192
# Short developer-facing description containing the value of +size+.
#
# @return [String]
def inspect
  format('#<AnyStyle::Document lines={%s}>', size)
end
|
#label(other) ⇒ Object
107
108
109
110
111
112
113
114
115
116
|
# File 'lib/anystyle/document.rb', line 107
# Returns a copy of this document whose tokens keep the values of the
# current lines but take label, observations and score from the entry at
# the same index in +other+.
#
# @param other [#[]] indexable collection of labelled tokens
# @return [Object] a +dup+ of the receiver with its tokens replaced
def label(other)
  copy = dup

  copy.tokens = lines.each_with_index.map do |line, idx|
    source = other[idx]

    Wapiti::Token.new line.value,
      label: source.label.to_s,
      # Duplicated so the new token does not share the array with +other+.
      observations: source.observations.dup,
      score: source.score
  end

  copy
end
|
#line_counts ⇒ Object
51
52
53
|
# File 'lib/anystyle/document.rb', line 51
# Lazily created counter hash; missing keys read as 0.
#
# @return [Hash] the same hash instance on every call
def line_counts
  @line_counts = Hash.new(0) unless @line_counts
  @line_counts
end
|
#nnum_counts ⇒ Object
55
56
57
|
# File 'lib/anystyle/document.rb', line 55
# Lazily created counter hash; missing keys read as 0.
#
# @return [Hash] the same hash instance on every call
def nnum_counts
  @nnum_counts = Hash.new(0) unless @nnum_counts
  @nnum_counts
end
|
#references(normalize_blocks: false, **opts) ⇒ Object
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
|
# File 'lib/anystyle/document.rb', line 145
# Collects the references found in the document.
#
# Without +normalize_blocks+ all lines are handed to Refs.parse in one go.
# With it, each section is inspected separately: a section contributes
# only if it holds at least one 'ref' line and either one of its heading
# lines matches REFSECT or +include_references?+ approves its ref/text
# line counts. Contributing sections are passed through Refs.normalize!
# (window 6 under a matching heading, 2 otherwise) before being parsed.
#
# @param normalize_blocks [Boolean] enable the per-section handling
# @param opts [Hash] accepted but not used here
# @return [Array] the parsed references
def references(normalize_blocks: false, **opts)
  return Refs.parse(lines).to_a unless normalize_blocks

  each_section.each_with_object([]) do |(head, body), refs|
    ref_count = body.count { |tk| tk.label == 'ref' }
    next if ref_count.zero?

    text_count = body.count { |tk| tk.label == 'text' }
    under_ref_heading = head.any? { |tk| tk.value =~ REFSECT }
    next unless under_ref_heading || include_references?(ref_count, text_count)

    Refs.normalize! body, max_win_size: (under_ref_heading ? 6 : 2)
    refs.concat Refs.parse(body).to_a
  end
end
|
#sections(delimiter: "\n", spacer: ' ', **opts) ⇒ Object
171
172
173
174
175
176
177
178
179
180
181
182
|
# File 'lib/anystyle/document.rb', line 171
# Renders every section as a hash of plain strings.
#
# Heading and body values go through +display_chars+ and Unicode
# normalization; heading values are additionally left-stripped.
#
# @param delimiter [String] joins the body lines of a section
# @param spacer [String] joins the heading lines of a section
# @param opts [Hash] accepted but not used here
# @return [Array<Hash>] one +{ title:, text: }+ hash per section
def sections(delimiter: "\n", spacer: ' ', **opts)
  each_section.map do |(head, body)|
    heading = head.map { |tk| display_chars(tk.value).lstrip.unicode_normalize }
    content = body.map { |tk| display_chars(tk.value).unicode_normalize }

    { title: heading.join(spacer), text: content.join(delimiter) }
  end
end
|
#title(delimiter: " ", **opts) ⇒ Object
184
185
186
187
188
189
190
|
# File 'lib/anystyle/document.rb', line 184
# Joins the values of the first run of consecutive 'title' lines.
#
# @param delimiter [String] placed between the title line values
# @param opts [Hash] accepted but not used here
# @return [String] the joined title; empty when there is no 'title' line
def title(delimiter: " ", **opts)
  from_first_title = lines.drop_while { |ln| ln.label != 'title' }
  heading = from_first_title.take_while { |ln| ln.label == 'title' }

  heading.map { |ln| ln.value }.join(delimiter)
end
|
#to_a(encode: true, **opts) ⇒ Object
131
132
133
|
# File 'lib/anystyle/document.rb', line 131
# Delegates to the superclass implementation, with +encode+ defaulting to
# true here.
#
# @param encode [Boolean] forwarded to super
# @param opts [Hash] further options forwarded to super
# @return [Object] whatever the superclass +to_a+ returns
def to_a(encode: true, **opts)
  super(**opts, encode: encode)
end
|
#to_h(**opts) ⇒ Object
135
136
137
138
139
140
141
142
143
|
# File 'lib/anystyle/document.rb', line 135
# Summarises the document as a hash.
#
# @param opts [Hash] forwarded unchanged to +sections+, +title+ and
#   +references+
# @return [Hash] with the keys :info, :meta, :sections, :title and
#   :references, in that order
def to_h(**opts)
  summary = {}
  summary[:info] = info
  summary[:meta] = meta
  summary[:sections] = sections(**opts)
  summary[:title] = title(**opts)
  summary[:references] = references(**opts)
  summary
end
|
#to_s(delimiter: "\n", encode: false, tagged: false, **opts) ⇒ Object
118
119
120
121
122
123
124
125
126
127
128
129
|
# File 'lib/anystyle/document.rb', line 118
# Serialises the document as text.
#
# In tagged mode every line is written as "label| value", where the label
# column is cut off after 14 characters and is left empty when the label
# repeats that of the line before. Otherwise the call is handed to the
# superclass with +expanded: false+.
#
# @param delimiter [String] separator placed between lines
# @param encode [Boolean] forwarded to super in untagged mode
# @param tagged [Boolean] selects the tagged format
# @param opts [Hash] forwarded to super in untagged mode
# @return [String]
def to_s(delimiter: "\n", encode: false, tagged: false, **opts)
  unless tagged
    return super(delimiter: delimiter, encode: encode, tagged: tagged, expanded: false, **opts)
  end

  previous = nil

  rows = lines.map do |ln|
    # Only the first line of a run carries its label.
    shown = ln.label == previous ? '' : ln.label
    previous = ln.label
    '%.14s| %s' % ["#{shown} ", ln.value]
  end

  rows.join(delimiter)
end
|