Class: LLT::Tokenizer
- Inherits:
-
Object
show all
- Includes:
- Constants::Abbreviations, Core::Serviceable, Helpers::Metrical, Greek
- Defined in:
- lib/llt/tokenizer.rb,
lib/llt/tokenizer/greek.rb,
lib/llt/tokenizer/worker.rb,
lib/llt/tokenizer/version.rb,
lib/llt/tokenizer/version_info.rb
Defined Under Namespace
Modules: Greek
Classes: VersionInfo, Worker
Constant Summary
collapse
- PUNCTUATION =
/&(?:amp|quot|apos|lt|gt);|([\.\?,!;\-:"'”&\(\)\[\]†<>᾽·])\1*/
- XML_TAG =
/<\/?.+?>/
- ABBREVIATIONS =
covers abbreviated Roman praenomen like Ti. in Ti. Claudius Nero covers Roman date expression like a. d. V. Kal. Apr.
/^(#{ALL_ABBRS_PIPED})$/
- APOSTROPHE_WORDS =
covers a list of words which are abbreviated with a ‘ like satin’ for satisne
/^(#{APOSTROPHES_PIPED})$/
- WORDS_ENDING_WITH_QUE =
/^((un.{1,3})?[qc]u[aei].*que|qu[ao]que|itaque|atque|ut[er].*que|.*cumque|pler(.{1,2}|[oa]rum)que|denique|undique|usque)$/i
- WORDS_ENDING_WITH_NE =
generalize these words and start to look for them in the db, especially for adverbs
/^(omne|sine|bene|paene|iuvene|siccine)$/i
- WORDS_ENDING_WITH_VE =
formerly had neve and sive, which we split now
/^()$/i
- ENCLITICS =
laetusque to -que laetus, in eoque to -que in eo, honestumne to -ne honestum
but
uterque, institutione, sive et al. remain
iuvene might come as a surprise in these lists - it’s a hack, but special because it has ve and ne - both would get split. Such words might be so rare that we postpone proper handling for now
%w{ que ne ve c }
- ENCLITICS_MAP =
{
/^(nec)$/i => 'c',
/^(ne|se)u$/i => 'u',
/^(nisi)$/i => 'si',
/^(οὐ|μή|εἰ)τε$/i => 'τε',
/^(οὐ|μή)δε$/i => 'δε',
}
- MERGE_WORDS =
[ %w{ quam diu }, ['non', /null.{1,4}$/] ]
- ABBR_NAME_WITH_DOT =
/^(#{NAMES_PIPED})\.$/
- ROMAN_DATE_EXPR_WITH_DOT =
/^(#{DATES_PIPED})\.$/
- PUNCT_ITSELF =
Regexp.new("^(?:#{PUNCTUATION.source})$")
- VERSION =
"0.0.8"
Constants included
from Greek
Greek::ALL, Greek::CONS, Greek::CONSONANTS, Greek::PLAIN_VOWELS, Greek::SPIRITUS_ASPER, Greek::SPIRITUS_ASPER_WITH_ACUTE, Greek::SPIRITUS_ASPER_WITH_CIRCUMFLEX, Greek::SPIRITUS_ASPER_WITH_GRAVE, Greek::SPIRITUS_LENIS, Greek::SPIRITUS_LENIS_WITH_ACUTE, Greek::SPIRITUS_LENIS_WITH_CIRCUMFLEX, Greek::SPIRITUS_LENIS_WITH_GRAVE, Greek::SPIRITUS_WITH_IOTA, Greek::STARTING_VOWELS, Greek::VOWELS, Greek::VOWELS_WITH_ACUTE, Greek::VOWELS_WITH_CIRCUMFLEX, Greek::VOWELS_WITH_GRAVE, Greek::VOWELS_WITH_IOTA, Greek::VOWELS_WITH_SPIRITUS
Instance Attribute Summary collapse
Class Method Summary
collapse
Instance Method Summary
collapse
-
#create_tokens ⇒ Object
-
#enclitic(val) ⇒ Object
-
#find_abbreviations_and_join_strings ⇒ Object
%w{ Atque M . Cicero mittit } to %w{ Atque M. Cicero mittit }.
-
#is_a_mergable_pair?(x, y) ⇒ Boolean
-
#is_que?(element) ⇒ Boolean
-
#led_by_preposition?(index) ⇒ Boolean
-
#lookup(string, type, column, inflection_class = 3) ⇒ Object
-
#make_frequent_corrections ⇒ Object
-
#merge_what_needs_merging ⇒ Object
-
#merge_words(pair, i, to_delete) ⇒ Object
-
#ne_corrections ⇒ Object
-
#open_xml_tag?(str) ⇒ Boolean
-
#original_word(i) ⇒ Object
-
#preliminary ⇒ Object
-
#put_xml_attributes_back_together(elements) ⇒ Object
-
#que_corrections ⇒ Object
-
#raise_id ⇒ Object
-
#reset_id ⇒ Object
-
#reverse_splittings(indices) ⇒ Object
-
#setup(text, options = {}, worker = []) ⇒ Object
-
#setup_worker(worker) ⇒ Object
This is here for two reasons: 1) easier test setup, when a preliminary result shall be further evaluated.
-
#shift_range(shifting_enabled) ⇒ Object
-
#split_and_space_text ⇒ Object
-
#split_enklitika_and_change_their_position ⇒ Object
-
#split_enklitikon(encl, restrictors) ⇒ Object
-
#split_frequent_enclitics ⇒ Object
-
#split_with_force ⇒ Object
-
#to_be_shifted_que_indices ⇒ Object
-
#tokenize(text, add_to: nil, **options) ⇒ Object
-
#ve_corrections ⇒ Object
Methods included from Greek
#contains_krasis, #greek_apostrophe, #krasis, #split_krasis
Instance Attribute Details
#default_options ⇒ Object
Returns the value of attribute default_options.
23
24
25
|
# File 'lib/llt/tokenizer.rb', line 23
def default_options
  # Plain reader for the per-instance default options hash.
  # NOTE(review): presumably populated via Core::Serviceable from the
  # class-level .default_options — not visible here, confirm in caller.
  @default_options
end
|
Class Method Details
.default_options ⇒ Object
25
26
27
28
29
30
31
32
33
34
35
36
|
# File 'lib/llt/tokenizer.rb', line 25
def self.default_options
{
shifting: true,
enclitics_marker: '-',
merging: true,
indexing: true,
splitting: true,
xml: false,
krasis_marker: '-'
}
end
|
Instance Method Details
#enclitic(val) ⇒ Object
201
202
203
|
# File 'lib/llt/tokenizer.rb', line 201
def enclitic(val)
  # Renders a split-off enclitic with the configured marker,
  # e.g. 'que' -> '-que' with the default '-' marker.
  [@enclitics_marker, val].join
end
|
#find_abbreviations_and_join_strings ⇒ Object
%w{ Atque M . Cicero mittit } to %w{ Atque M. Cicero mittit }
138
139
140
141
142
143
144
145
146
147
148
149
|
# File 'lib/llt/tokenizer.rb', line 138
def find_abbreviations_and_join_strings
  # Re-joins tokens that the punctuation split tore apart:
  # %w{ Atque M . Cicero } -> %w{ Atque M. Cicero }, apostrophe words
  # like satin' and Greek elisions likewise.
  removed = []
  @worker.each_with_index do |token, idx|
    follower = @worker[idx + 1]
    joinable = (follower == '.' && token =~ ABBREVIATIONS) ||
               (follower == "'" && token =~ APOSTROPHE_WORDS) ||
               greek_apostrophe(follower, token)
    next unless joinable
    # prepend mutates the follower in place, so only the bare token
    # at idx needs deleting afterwards
    @worker[idx + 1] = follower.prepend(token)
    # earlier deletions shift later indices left, hence - removed.size
    removed << (idx - removed.size)
  end
  removed.each { |idx| @worker.delete_at(idx) }
end
|
#is_a_mergable_pair?(x, y) ⇒ Boolean
353
354
355
356
357
|
# File 'lib/llt/tokenizer.rb', line 353
def is_a_mergable_pair?(x, y)
  # MERGE_WORDS holds [matcher_for_first, matcher_for_second] pairs;
  # === lets plain strings and regexps serve interchangeably as matchers.
  MERGE_WORDS.any? do |first_matcher, second_matcher|
    first_matcher === x.downcase && second_matcher === y
  end
end
|
#is_que?(element) ⇒ Boolean
251
252
253
|
# File 'lib/llt/tokenizer.rb', line 251
def is_que?(element)
  # True once the token is a split-off que carrying the enclitic
  # marker (e.g. '-que' with the default marker).
  enclitic('que') == element
end
|
#led_by_preposition?(index) ⇒ Boolean
255
256
257
|
# File 'lib/llt/tokenizer.rb', line 255
def led_by_preposition?(index)
  # True when the token before index is one of the prepositions in/ad/ob
  # (case-insensitive). Guard against index 0: @worker[-1] would otherwise
  # wrap around and inspect the LAST token instead of a predecessor.
  return false if index < 1
  # \A..\z anchor the whole token (the original ^..$ are per-line anchors);
  # match? avoids allocating MatchData and returns a real boolean.
  @worker[index - 1].to_s.match?(/\A(in|ad|ob)\z/i)
end
|
#lookup(string, type, column, inflection_class = 3) ⇒ Object
321
322
323
324
325
326
327
328
|
# File 'lib/llt/tokenizer.rb', line 321
def lookup(string, type, column, inflection_class = 3)
  # Queries the stem database for a candidate reading.
  # Personae keep their capitalization; all other stems are queried downcased.
  # Array() lets callers pass a single inflection class or a list of them.
  stem = type == :persona ? string : string.downcase
  query = {
    type: type,
    stem_type: column,
    stem: stem,
    restrictions: { type: :inflection_class, values: Array(inflection_class) }
  }
  @db.look_up_stem(query)
end
|
#make_frequent_corrections ⇒ Object
225
226
227
228
229
230
231
232
|
# File 'lib/llt/tokenizer.rb', line 225
def make_frequent_corrections
  # Runs the reversal passes for wrongly split -ne/-ve words, then the
  # -que shifting pass, in this fixed order.
  %i[ne_corrections ve_corrections que_corrections].each { |pass| send(pass) }
end
|
#merge_what_needs_merging ⇒ Object
345
346
347
348
349
350
351
|
# File 'lib/llt/tokenizer.rb', line 345
def merge_what_needs_merging
  # Walks adjacent token pairs and merges those listed in MERGE_WORDS
  # (e.g. quam + diu). Indices are collected first — deleting while
  # iterating would skip elements.
  doomed = []
  @worker.each_overlapping_pair.each_with_index do |pair, idx|
    merge_words(pair, idx, doomed) if is_a_mergable_pair?(*pair)
  end
  doomed.each { |idx| @worker.delete_at(idx) }
end
|
#merge_words(pair, i, to_delete) ⇒ Object
359
360
361
362
|
# File 'lib/llt/tokenizer.rb', line 359
def merge_words(pair, i, to_delete)
  # Glues the second token onto the first (in-place mutation) and marks
  # the second one for deletion. Earlier scheduled deletions shift later
  # indices left, hence the - to_delete.size offset.
  left, right = pair
  left << right
  to_delete << (i + 1 - to_delete.size)
end
|
#ne_corrections ⇒ Object
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
|
# File 'lib/llt/tokenizer.rb', line 259
def ne_corrections
  # Undoes over-eager -ne splitting: if the supposed host word plus 'n'
  # resolves to a real stem (e.g. an abl. of a third-declension noun or an
  # o-declension vocative), the split is reversed via reverse_splittings.
  corrections = []
  @worker.each_with_index do |token, idx|
    next unless token == enclitic('ne')
    orig_el = original_word(idx)
    entries = []
    entries += lookup(orig_el, :noun, :nom) if orig_el =~ /io$/
    entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/
    entries += lookup(orig_el + "n", :noun, :stem, [3, 33])
    entries += lookup(orig_el + "n", :noun, :stem, 2)
    entries += lookup(orig_el + "n", :adjective, :stem, [1, 3])
    entries += lookup(orig_el + "n", :persona, :stem, 2)
    # each prior correction removes one token, hence - corrections.size
    corrections << (idx - corrections.size) if entries.any?(&:third_decl_with_possible_ne_abl?)
    corrections << (idx - corrections.size) if entries.any?(&:o_decl_with_possible_ne_voc?)
  end
  reverse_splittings(corrections)
end
|
#open_xml_tag?(str) ⇒ Boolean
123
124
125
|
# File 'lib/llt/tokenizer.rb', line 123
def open_xml_tag?(str)
  # True for a tag fragment like '<grc' that starts but does not finish a tag.
  # The original used '&!' (non-short-circuiting Boolean#&); '&& !' is the
  # idiomatic, short-circuiting equivalent with identical truth table here.
  str.start_with?('<') && !str.end_with?('>')
end
|
#original_word(i) ⇒ Object
310
311
312
313
314
315
316
317
318
319
|
# File 'lib/llt/tokenizer.rb', line 310
def original_word(i)
  # Returns the host word a split-off enclitic at index i belongs to.
  # With shifting the enclitic sits BEFORE its host (look ahead);
  # without shifting it sits after it (look behind).
  offset = @shifting ? 1 : -1
  @worker[i + offset]
end
|
#preliminary ⇒ Object
398
399
400
|
# File 'lib/llt/tokenizer.rb', line 398
def preliminary
  # Exposes the current tokenization state as a plain Array —
  # useful for inspecting intermediate results (see #setup_worker).
  @worker.to_a
end
|
#put_xml_attributes_back_together(elements) ⇒ Object
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
|
# File 'lib/llt/tokenizer.rb', line 103
def put_xml_attributes_back_together(elements)
  # The punctuation split tears apart XML tags that contain attributes
  # (quotes/brackets count as punctuation). This rejoins everything from an
  # unfinished '<...' fragment up to the next element ending in '>'.
  # NOTE: the published listing fused the assignment and the if on one line
  # and orphaned the else branch; reconstructed here as valid Ruby.
  as = ArrayScanner.new(elements)
  loop do
    last = as.look_behind.to_s # to_s: look_behind is nil on the first pass
    if open_xml_tag?(last)
      # peek_until excludes the closing element itself, hence + 1
      number_of_xml_elements = as.peek_until do |el|
        el.end_with?('>')
      end.size + 1
      number_of_xml_elements.times do
        last << ' ' << as.current
        elements.delete_at(as.pos)
      end
    else
      as.forward(1)
    end
    break if as.eoa?
  end
end
|
#que_corrections ⇒ Object
234
235
236
237
238
239
240
241
242
|
# File 'lib/llt/tokenizer.rb', line 234
def que_corrections
  # Only relevant when shifting is enabled: a -que that follows a
  # preposition is moved one slot to the left (back in front of it).
  return unless @shifting
  to_be_shifted_que_indices.each do |idx|
    @worker.insert(idx - 1, @worker.delete_at(idx))
  end
end
|
#raise_id ⇒ Object
388
389
390
391
392
393
394
395
396
|
# File 'lib/llt/tokenizer.rb', line 388
def raise_id
  # Advances the running token id. Without indexing there is no counter,
  # so a truthy placeholder is returned instead.
  return true unless @indexing
  @id += 1
end
|
#reset_id ⇒ Object
384
385
386
|
# File 'lib/llt/tokenizer.rb', line 384
def reset_id
  # Resets the token counter to 0 when indexing, clears it otherwise.
  # The original contained a redundant nested assignment
  # (@id = (@id = 0)); a single assignment has the identical effect.
  @id = @indexing ? 0 : nil
end
|
#reverse_splittings(indices) ⇒ Object
330
331
332
333
334
335
336
337
338
|
# File 'lib/llt/tokenizer.rb', line 330
def reverse_splittings(indices)
  # Undoes a wrong enclitic split: removes the split-off token, strips the
  # enclitics marker from it and glues it back onto its host word.
  # The host must be looked up BEFORE the deletion shifts positions.
  indices.each do |idx|
    host = original_word(idx)
    fragment = @worker.delete_at(idx).delete(@enclitics_marker)
    host << fragment
  end
end
|
#setup(text, options = {}, worker = []) ⇒ Object
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
# File 'lib/llt/tokenizer.rb', line 55
def setup(text, options = {}, worker = [])
  # Initializes all per-run state from the incoming text and options.
  @text = text
  evaluate_metrical_presence(@text)
  # each option falls back to the class defaults inside parse_option
  %i[enclitics_marker merging shifting splitting indexing xml krasis_marker].each do |opt|
    instance_variable_set("@#{opt}", parse_option(opt, options))
  end
  @worker = setup_worker(worker)
  @shift_range = shift_range(@shifting)
end
|
#setup_worker(worker) ⇒ Object
This is here for two reasons:
1) easier test setup, when a preliminary result shall be further evaluated
2) more importantly adding a level of indirection, when
the given text holds metrical information. It adds a
substitute implementation for the worker array, but only
if it's needed - which should perform better, when there
are no metrics involved (the default case)
81
82
83
84
85
86
87
88
89
90
91
92
|
# File 'lib/llt/tokenizer.rb', line 81
def setup_worker(worker)
  # A pre-built worker (handy in tests, when a preliminary result is
  # evaluated further) short-circuits construction. Otherwise the text is
  # split; the Worker wrapper is only paid for when metrical information
  # is present — the plain Array suffices in the default case.
  return worker if worker.any?
  elements = split_and_space_text
  put_xml_attributes_back_together(elements) if @xml
  metrical? ? Worker.new(elements, @enclitics_marker) : elements
end
|
#shift_range(shifting_enabled) ⇒ Object
94
95
96
|
# File 'lib/llt/tokenizer.rb', line 94
def shift_range(shifting_enabled)
  # Offset used when computing insertion points for split enclitics:
  # 0 when shifting puts them before the host, 1 when they trail it.
  if shifting_enabled
    0
  else
    1
  end
end
|
#split_and_space_text ⇒ Object
98
99
100
101
|
# File 'lib/llt/tokenizer.rb', line 98
def split_and_space_text
  # Pads every punctuation hit (and XML tags when enabled) with spaces,
  # then splits on whitespace so punctuation becomes its own token.
  pattern = PUNCTUATION
  pattern = Regexp.union(XML_TAG, pattern) if @xml
  @text.gsub(pattern, ' \0 ').split
end
|
#split_enklitika_and_change_their_position ⇒ Object
170
171
172
173
174
|
# File 'lib/llt/tokenizer.rb', line 170
def split_enklitika_and_change_their_position
  # Orchestrates enclitic handling: forced splits (-que/-ne/-ve), then the
  # frequent special cases (ENCLITICS_MAP), then the correction passes.
  # NOTE: the published listing fused the last two calls into
  # `split_frequent_enclitics make_frequent_corrections`, which would pass
  # one zero-arity method's result to the other and raise ArgumentError —
  # they are two independent statements.
  split_with_force
  split_frequent_enclitics
  make_frequent_corrections
end
|
#split_enklitikon(encl, restrictors) ⇒ Object
186
187
188
189
190
191
192
193
194
195
196
197
198
199
|
# File 'lib/llt/tokenizer.rb', line 186
def split_enklitikon(encl, restrictors)
  # Splits the given enclitic off every token ending in it, unless the
  # token matches restrictors (words that merely LOOK like they carry the
  # enclitic, e.g. itaque). (?<=\w) demands a host character, so a bare
  # 'que' token is never split.
  tail = /(?<=\w)#{encl}$/
  insertion_points = []
  @worker.each_with_index do |token, idx|
    next unless token.match(tail) && restrictors !~ token
    token.slice!(tail)
    # every earlier insertion shifts later positions right, plus the
    # shifting-dependent offset
    insertion_points << (idx + insertion_points.size + @shift_range)
  end
  insertion_points.each { |pos| @worker.insert(pos, enclitic(encl)) }
end
|
#split_frequent_enclitics ⇒ Object
212
213
214
215
216
217
218
219
220
221
222
223
|
# File 'lib/llt/tokenizer.rb', line 212
def split_frequent_enclitics
  # Splits the irregular frequent forms in ENCLITICS_MAP (nec -> ne -c,
  # neu/seu -> -u, nisi -> ni -si, Greek τε/δε) by slicing the enclitic
  # tail off the token and inserting it as its own marked token.
  insertions = []
  @worker.each_with_index do |token, idx|
    ENCLITICS_MAP.each do |pattern, encl|
      next unless token.match(pattern)
      token.slice!(-encl.length, encl.length)
      insertions << [encl, idx + insertions.size + @shift_range]
    end
  end
  insertions.each { |encl, pos| @worker.insert(pos, enclitic(encl)) }
end
|
#split_with_force ⇒ Object
176
177
178
179
180
181
182
183
184
|
# File 'lib/llt/tokenizer.rb', line 176
def split_with_force
  # Forced splitting for que/ne/ve; the last ENCLITICS entry ('c') is
  # excluded here — it is handled through ENCLITICS_MAP instead. Each
  # enclitic has a WORDS_ENDING_WITH_* constant of false positives.
  ENCLITICS[0...-1].each do |encl|
    restrictors = self.class.const_get("WORDS_ENDING_WITH_#{encl.upcase}")
    split_enklitikon(encl, restrictors)
  end
end
|
#to_be_shifted_que_indices ⇒ Object
244
245
246
247
248
249
|
# File 'lib/llt/tokenizer.rb', line 244
def to_be_shifted_que_indices
  # Collects the indices of marked -que tokens that directly follow a
  # preposition — those need to be moved by que_corrections.
  indices = []
  @worker.each_with_index do |element, index|
    indices << index if is_que?(element) && led_by_preposition?(index)
  end
  indices
end
|
#tokenize(text, add_to: nil, **options) ⇒ Object
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
|
# File 'lib/llt/tokenizer.rb', line 38
def tokenize(text, add_to: nil, **options)
  # Main entry point: tokenizes a Latin/Greek string into Token objects.
  # Raises ArgumentError for non-String input; returns [] for empty input.
  # When add_to responds to <<, the result is also appended to it.
  # NOTE: the published listing fused `merge_what_needs_merging if @merging`
  # and `tokens = create_tokens` onto one line; they are two statements.
  raise ArgumentError.new("The argument passed must be a String") unless text.is_a?(String)
  return [] if text.empty?

  setup(text, options)
  find_abbreviations_and_join_strings
  split_krasis if @splitting
  split_enklitika_and_change_their_position if @splitting
  merge_what_needs_merging if @merging
  tokens = create_tokens

  add_to << tokens if add_to.respond_to?(:<<)
  tokens
end
|
#ve_corrections ⇒ Object
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
|
# File 'lib/llt/tokenizer.rb', line 287
def ve_corrections
  # Undoes over-eager -ve splitting: if host + 'v' (or + 've' for verbs)
  # resolves to any known stem, the split is reversed.
  corrections = []
  @worker.each_with_index do |token, idx|
    next unless token == enclitic('ve')
    orig_el = original_word(idx)
    entries = lookup(orig_el + 'v', :adjective, :stem, 1) +
              lookup(orig_el + 'v', :adjective, :stem, 3) +
              lookup(orig_el + 'v', :noun, :stem, [2, 33, 5]) +
              lookup(orig_el + 'v', :persona, :stem, 3) +
              lookup(orig_el + 've', :verb, :pr, 2) +
              lookup(orig_el + 'v', :verb, :pr, [3, 5])
    # each prior correction removes one token, hence - corrections.size
    corrections << (idx - corrections.size) if entries.any?
  end
  reverse_splittings(corrections)
end
|