Class: LLT::Tokenizer

Inherits:
Object
Includes:
Constants::Abbreviations, Core::Serviceable, Helpers::Metrical, Greek
Defined in:
lib/llt/tokenizer.rb,
lib/llt/tokenizer/greek.rb,
lib/llt/tokenizer/worker.rb,
lib/llt/tokenizer/version.rb,
lib/llt/tokenizer/version_info.rb

Defined Under Namespace

Modules: Greek
Classes: VersionInfo, Worker

Constant Summary

PUNCTUATION =
/&(?:amp|quot|apos|lt|gt);|([\.\?,!;\-:"'”&\(\)\[\]†<>᾽·])\1*/
XML_TAG =
/<\/?.+?>/
ABBREVIATIONS =

Covers abbreviated Roman praenomina like Ti. in Ti. Claudius Nero, as well as Roman date expressions like a. d. V. Kal. Apr.

/^(#{ALL_ABBRS_PIPED})$/
APOSTROPHE_WORDS =

Covers words that are abbreviated with an apostrophe, like satin’ for satisne.

/^(#{APOSTROPHES_PIPED})$/
WORDS_ENDING_WITH_QUE =

neque taken out!

/^((un.{1,3})?[qc]u[aei].*que|qu[ao]que|itaque|atque|ut[er].*que|.*cumque|pler(.{1,2}|[oa]rum)que|denique|undique|usque)$/i
WORDS_ENDING_WITH_NE =

Generalize these words and start to look them up in the db, especially for adverbs.

/^(omne|sine|bene|paene|iuvene|siccine)$/i
WORDS_ENDING_WITH_VE =

Formerly contained neve and sive, which are now split.

/^()$/i
ENCLITICS =

laetusque to -que laetus, in eoque to -que in eo, honestumne to -ne honestum,

but

uterque, institutione, sive et al. remain untouched.

iuvene might come as a surprise in these lists - it’s a hack, but a special case because it contains both ve and ne, and both would get split. Such words might be so rare that we postpone proper handling for now.

%w{ que ne ve c }
ENCLITICS_MAP =
{
  /^(nec)$/i => 'c',
  /^(ne|se)u$/i => 'u',
  /^(nisi)$/i => 'si',
  /^(οὐ|μή|εἰ)τε$/i => 'τε',
  /^(οὐ|μή)δε$/i => 'δε',
}
MERGE_WORDS =
[ %w{ quam diu }, ['non', /null.{1,4}$/] ]
ABBR_NAME_WITH_DOT =
/^(#{NAMES_PIPED})\.$/
ROMAN_DATE_EXPR_WITH_DOT =
/^(#{DATES_PIPED})\.$/
PUNCT_ITSELF =
Regexp.new("^(?:#{PUNCTUATION.source})$")
VERSION =
"0.0.8"

Constants included from Greek

Greek::ALL, Greek::CONS, Greek::CONSONANTS, Greek::PLAIN_VOWELS, Greek::SPIRITUS_ASPER, Greek::SPIRITUS_ASPER_WITH_ACUTE, Greek::SPIRITUS_ASPER_WITH_CIRCUMFLEX, Greek::SPIRITUS_ASPER_WITH_GRAVE, Greek::SPIRITUS_LENIS, Greek::SPIRITUS_LENIS_WITH_ACUTE, Greek::SPIRITUS_LENIS_WITH_CIRCUMFLEX, Greek::SPIRITUS_LENIS_WITH_GRAVE, Greek::SPIRITUS_WITH_IOTA, Greek::STARTING_VOWELS, Greek::VOWELS, Greek::VOWELS_WITH_ACUTE, Greek::VOWELS_WITH_CIRCUMFLEX, Greek::VOWELS_WITH_GRAVE, Greek::VOWELS_WITH_IOTA, Greek::VOWELS_WITH_SPIRITUS

Instance Attribute Summary

Class Method Summary

Instance Method Summary

Methods included from Greek

#contains_krasis, #greek_apostrophe, #krasis, #split_krasis

Instance Attribute Details

#default_options ⇒ Object (readonly)

Returns the value of attribute default_options.



# File 'lib/llt/tokenizer.rb', line 23

def default_options
  @default_options
end

Class Method Details

.default_options ⇒ Object



# File 'lib/llt/tokenizer.rb', line 25

def self.default_options
  {
    shifting: true,
    enclitics_marker: '-',
    merging: true,
    indexing: true,
    splitting: true,
    xml: false,
    #for Greek
    krasis_marker: '-'
  }
end
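
Any of these can be overridden per call, because #tokenize passes its keyword options on to #setup. A minimal sketch, assuming `tokenizer` is an already wired-up instance:

tokenizer.tokenize('arma virumque cano', enclitics_marker: '+', shifting: false)
# the enclitic is now marked as "+que" and stays behind its host word
# ("arma virum +que cano") instead of being shifted in front of it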

Instance Method Details

#create_tokens ⇒ Object



# File 'lib/llt/tokenizer.rb', line 370

def create_tokens
  # the call to #to_a retrieves (and aligns) optional metrical data
  reset_id
  @worker.to_a.map! do |el|
    case el
    when XML_TAG                  then Token::XmlTag.new(el)
    when ABBR_NAME_WITH_DOT       then raise_id and Token::Filler.new(el, @id)
    when ROMAN_DATE_EXPR_WITH_DOT then raise_id and Token::Filler.new(el, @id)
    when PUNCT_ITSELF             then raise_id and Token::Punctuation.new(el, @id)
    else                               raise_id and Token::Word.new(el, @id)
    end
  end
end
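
A small, hypothetical example of the mapping (ids assume the default indexing: true):

# a worker of ["Cicero", "venit", "."] yields
#   Token::Word.new("Cicero", 1),
#   Token::Word.new("venit", 2),
#   Token::Punctuation.new(".", 3)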

#enclitic(val) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 201

def enclitic(val)
  "#{@enclitics_marker}#{val}"
end

#find_abbreviations_and_join_strings ⇒ Object

%w{ Atque M . Cicero mittit } to %w{ Atque M. Cicero mittit }



# File 'lib/llt/tokenizer.rb', line 138

def find_abbreviations_and_join_strings
  arr = []
  @worker.each_with_index do |e, i|
    n = @worker[i + 1]
    if (n == '.' && e =~ ABBREVIATIONS) || (n == "'" && e =~ APOSTROPHE_WORDS) || greek_apostrophe(n,e)
      @worker[i + 1] = n.prepend(e)
      arr << (i - arr.size)
    end
  end

  arr.each { |i| @worker.delete_at(i) }
end

#is_a_mergable_pair?(x, y) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/llt/tokenizer.rb', line 353

def is_a_mergable_pair?(x, y)
  # x, i.e. quam in quamdiu, needs to be downcased, as it could be in a
  # sentence's first position
  MERGE_WORDS.any? { |a, b| a === x.downcase && b === y  }
end
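
The `===` comparison is what allows MERGE_WORDS to mix plain strings with regular expressions: String#=== tests equality, Regexp#=== tests a match. A minimal illustration:

MERGE_WORDS.any? { |a, b| a === 'quam' && b === 'diu' }    # => true
MERGE_WORDS.any? { |a, b| a === 'non'  && b === 'nullos' } # => true, /null.{1,4}$/ === 'nullos'
MERGE_WORDS.any? { |a, b| a === 'quam' && b === 'ob' }     # => false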

#is_que?(element) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/llt/tokenizer.rb', line 251

def is_que?(element)
  element == enclitic('que')
end

#led_by_preposition?(index) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/llt/tokenizer.rb', line 255

def led_by_preposition?(index)
  @worker[index - 1] =~ /^(in|ad|ob)$/i # and others
end

#lookup(string, type, column, inflection_class = 3) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 321

def lookup(string, type, column, inflection_class = 3)
  string = (type == :persona ? string : string.downcase)
  query = {
            type: type, stem_type: column, stem: string,
            restrictions: { type: :inflection_class, values: Array(inflection_class) }
          }
  @db.look_up_stem(query)
end

#make_frequent_corrections ⇒ Object



# File 'lib/llt/tokenizer.rb', line 225

def make_frequent_corrections
  # uses db lookups
  # # TODO 27.11.13 14:15 by LFDM
  # Implement caching here
  ne_corrections
  ve_corrections
  que_corrections
end

#merge_what_needs_merging ⇒ Object

quam diu to quamdiu



# File 'lib/llt/tokenizer.rb', line 345

def merge_what_needs_merging
  to_delete = []
  @worker.each_overlapping_pair.each_with_index do |pair, i|
    merge_words(pair, i, to_delete) if is_a_mergable_pair?(*pair)
  end
  to_delete.each { |i| @worker.delete_at(i) }
end
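
In effect, adjacent worker elements that form a MERGE_WORDS pair are concatenated in place, and the now redundant second element is deleted afterwards. A sketch with illustrative surrounding words:

# @worker before: ["nescio", "quam", "diu", "maneam"]
# @worker after:  ["nescio", "quamdiu", "maneam"]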

#merge_words(pair, i, to_delete) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 359

def merge_words(pair, i, to_delete)
  pair.first << pair.last
  to_delete  << (i + 1 - to_delete.size)
end

#ne_corrections ⇒ Object



# File 'lib/llt/tokenizer.rb', line 259

def ne_corrections
  corrections = []
  @worker.each_with_index do |w, i|
    if w == enclitic('ne')
      orig_el = original_word(i)

      entries = []
      entries += lookup(orig_el, :noun, :nom)           if orig_el =~ /io$/   # actio-ne ratio-ne
      entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/    # Plato-ne Cicero-ne Solo-ne
      entries += lookup(orig_el + "n", :noun, :stem, [3, 33])  # fortitudi-ne ratio-ne libidi-ne homi-ne fi-ne agmi-ne iuve-ne ig-ne
      entries += lookup(orig_el + "n", :noun, :stem, 2)                       # domi-ne
      entries += lookup(orig_el + "n", :adjective, :stem, [1,3])              # communis commune, or bonus

      entries += lookup(orig_el + "n", :persona, :stem, 2)                    # Pauli-ne

      if entries.any?(&:third_decl_with_possible_ne_abl?)
        corrections << i - corrections.size
      end

      if entries.any?(&:o_decl_with_possible_ne_voc?)
        corrections << i - corrections.size
      end
    end
  end

  reverse_splittings(corrections)
end
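
The strategy is split first, ask the database later: every candidate has already been cut to '-ne' by the brute-force splitting, and the lookups check whether the remainder is really a stem whose -ne form is a word of its own (ablatives like ratione, vocatives like domine); if so, the split is reversed. A rough sketch of the flow for ratione, using the stem types and inflection classes from the lookups above:

# after splitting (shifting enabled):
#   @worker == [..., "-ne", "ratio", ...]
# original_word(i)                        # => "ratio"
# lookup("ration", :noun, :stem, [3, 33]) # finds a third declension stem, so
#                                         #   third_decl_with_possible_ne_abl? is true
# reverse_splittings then restores "ratione" and drops the "-ne" element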

#open_xml_tag?(str) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/llt/tokenizer.rb', line 123

def open_xml_tag?(str)
  str.start_with?('<') &! str.end_with?('>')
end

#original_word(i) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 310

def original_word(i)
  # there are two possible scenarios at this point
  # with shifting enabled:
  #         i  i + 1
  #   arma que virum
  # with shifting disabled:
  #        i - 1  i
  #   arma virum que
  @worker[i + (@shifting ? 1 : -1)]
end
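
For arma virumque the two layouts look like this (cf. #split_enklitikon, which inserts the marked enclitic either in front of or behind its host, depending on @shift_range):

# shifting enabled:   @worker == ["arma", "-que", "virum"]  # i at "-que", host at i + 1
# shifting disabled:  @worker == ["arma", "virum", "-que"]  # i at "-que", host at i - 1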

#preliminary ⇒ Object



# File 'lib/llt/tokenizer.rb', line 398

def preliminary
  @worker.to_a
end

#put_xml_attributes_back_together(elements) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 103

def put_xml_attributes_back_together(elements)
  as = ArrayScanner.new(elements)
  loop do
    last = as.look_behind.to_s # catch nil
    if open_xml_tag?(last)
      number_of_xml_elements = as.peek_until do |el|
        el.end_with?('>')
      end.size + 1

      number_of_xml_elements.times do
        last << ' ' << as.current
        elements.delete_at(as.pos)
      end
    else
      as.forward(1)
    end
    break if as.eoa?
  end
end
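
split_and_space_text splits on whitespace, so a tag that carries attributes arrives in several pieces; this pass glues every piece back onto the element that opened the tag. A sketch with a hypothetical tag:

# elements before: ["<foreign", "lang=\"grc\">", "λόγος", "</foreign>"]
# elements after:  ["<foreign lang=\"grc\">", "λόγος", "</foreign>"]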

#que_corrections ⇒ Object



# File 'lib/llt/tokenizer.rb', line 234

def que_corrections
  # this is used only in rare cases like in eoque
  # which needs a shift to -que in eo
  if @shifting
    to_be_shifted_que_indices.each do |i|
      @worker.insert(i - 1, @worker.delete_at(i))
    end
  end
end
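
With the default shifting this turns the brute-force split of eoque back into a readable prepositional phrase (the "-que in eo" example from the ENCLITICS documentation):

# "in eoque" after split_with_force: @worker == ["in", "-que", "eo"]
# to_be_shifted_que_indices          # => [1], since "-que" is led by "in"
# after the shift:                   @worker == ["-que", "in", "eo"]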

#raise_id ⇒ Object



# File 'lib/llt/tokenizer.rb', line 388

def raise_id
  if @indexing
    @id += 1
  else
    # need to return true because this is used as first part
    # of an and construction
    true
  end
end

#reset_id ⇒ Object



# File 'lib/llt/tokenizer.rb', line 384

def reset_id
  @id = (@indexing ? 0 : nil)
end

#reverse_splittings(indices) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 330

def reverse_splittings(indices)
  indices.each do |i|
    # need to retrieve the orig word before the splitted var is
    # assigned, as it deletes something in the worker
    ow = original_word(i)
    splitted  = @worker.delete_at(i).delete(@enclitics_marker)
    ow << splitted
  end
end

#setup(text, options = {}, worker = []) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 55

def setup(text, options = {}, worker = [])
  @text = text
  evaluate_metrical_presence(@text)
  @enclitics_marker = parse_option(:enclitics_marker, options)
  @merging          = parse_option(:merging, options)
  @shifting         = parse_option(:shifting, options)
  @splitting        = parse_option(:splitting, options)
  @indexing         = parse_option(:indexing, options)
  @xml              = parse_option(:xml, options)
  #for Greek
  @krasis_marker    = parse_option(:krasis_marker, options)
  @worker = setup_worker(worker)
  @shift_range = shift_range(@shifting)
end

#setup_worker(worker) ⇒ Object

This is here for two reasons:

1) easier test setup, when a preliminary result is to be evaluated further

2) more importantly, it adds a level of indirection when the given
   text holds metrical information: a substitute implementation for
   the worker array is used, but only when it is actually needed -
   which should perform better when no metrics are involved (the
   default case)


# File 'lib/llt/tokenizer.rb', line 81

def setup_worker(worker)
  return worker if worker.any?

  elements = split_and_space_text
  put_xml_attributes_back_together(elements) if @xml

  if metrical?
    Worker.new(elements, @enclitics_marker)
  else
    elements
  end
end

#shift_range(shifting_enabled) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 94

def shift_range(shifting_enabled)
  shifting_enabled ? 0 : 1
end

#split_and_space_text ⇒ Object



# File 'lib/llt/tokenizer.rb', line 98

def split_and_space_text
  regex = @xml ? Regexp.union(XML_TAG, PUNCTUATION) : PUNCTUATION
  @text.gsub(regex, ' \0 ').split
end
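
'\0' is the whole match, so every punctuation mark (and, with xml: true, every XML tag) is padded with spaces and becomes an element of its own after the plain whitespace split:

text = 'Arma, virumque cano.'
text.gsub(PUNCTUATION, ' \0 ')        # => "Arma ,  virumque cano . "
text.gsub(PUNCTUATION, ' \0 ').split  # => ["Arma", ",", "virumque", "cano", "."]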

#split_enklitika_and_change_their_position ⇒ Object



# File 'lib/llt/tokenizer.rb', line 170

def split_enklitika_and_change_their_position
  split_with_force
  split_frequent_enclitics # like latin c, ve or greek te, de
  make_frequent_corrections
end

#split_enklitikon(encl, restrictors) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 186

def split_enklitikon(encl, restrictors)
  # needs a word character in front - ne itself should be contained
  regexp = /(?<=\w)#{encl}$/

  indices = []
  @worker.each_with_index do |token, i|
    if token.match(regexp) && restrictors !~ token
      token.slice!(regexp)
      indices << (i + indices.size + @shift_range)
    end
  end

  indices.each { |i| @worker.insert(i, enclitic(encl)) }
end
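
The index bookkeeping accounts for the elements that will be inserted later: indices.size grows by one for every split already recorded, and @shift_range decides whether the marked enclitic ends up in front of (shifting) or behind (no shifting) its host. For example:

# @worker == ["arma", "virumque", "cano"], encl == "que", shifting enabled:
#   "virumque" at i == 1 is sliced to "virum", indices == [1]
#   inserting "-que" at 1 gives ["arma", "-que", "virum", "cano"]
# with shifting disabled (@shift_range == 1) the insert lands at 2:
#   ["arma", "virum", "-que", "cano"]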

#split_frequent_enclitics ⇒ Object



# File 'lib/llt/tokenizer.rb', line 212

def split_frequent_enclitics
  container = []
  @worker.each_with_index do |token, i|
    ENCLITICS_MAP.each do |regex, encl|
      if token.match(regex)
        token.slice!(-encl.length, encl.length)
        container << [encl, (i + container.size + @shift_range)]
      end
    end
  end
  container.each { |encl, i| @worker.insert(i, enclitic(encl)) }
end
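
ENCLITICS_MAP drives this pass: a token matching one of the keys loses the mapped ending via slice!, and the detached piece is re-inserted as a marked enclitic, e.g.:

# "nec"  matches /^(nec)$/i     => sliced to "ne", "-c"  is inserted
# "neu"  matches /^(ne|se)u$/i  => sliced to "ne", "-u"  is inserted
# "οὐδε" matches /^(οὐ|μή)δε$/i => sliced to "οὐ", "-δε" is inserted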

#split_with_force ⇒ Object



# File 'lib/llt/tokenizer.rb', line 176

def split_with_force
  # uses brute force at first
  # the restrictor regexps handle only obvious cases

  # don't use c here atm
  ENCLITICS[0..-2].each do |encl|
    split_enklitikon(encl, self.class.const_get("WORDS_ENDING_WITH_#{encl.upcase}"))
  end
end

#to_be_shifted_que_indices ⇒ Object



# File 'lib/llt/tokenizer.rb', line 244

def to_be_shifted_que_indices
  # double shifts would properly fail, but they might never happen
  @worker.each_with_index.each_with_object([]) do |(element, index), accumulator|
    accumulator << index if is_que?(element) && led_by_preposition?(index)
  end
end

#tokenize(text, add_to: nil, **options) ⇒ Object

Raises:

  • (ArgumentError)


# File 'lib/llt/tokenizer.rb', line 38

def tokenize(text, add_to: nil, **options)
  raise ArgumentError.new("The argument passed must be a String") unless text.is_a?(String)
  return [] if text.empty?

  setup(text, options)

  find_abbreviations_and_join_strings
  #for Greek
  split_krasis if @splitting
  split_enklitika_and_change_their_position if @splitting
  merge_what_needs_merging if @merging # quam diu => quamdiu
  tokens = create_tokens

  add_to << tokens if add_to.respond_to?(:<<)
  tokens
end
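
A minimal end-to-end sketch, assuming `tokenizer` is an LLT::Tokenizer instance set up through Core::Serviceable with a stem database available for the db-backed corrections:

tokens = tokenizer.tokenize('Arma virumque cano.')
# with the defaults (shifting, '-' marker, indexing) this yields five tokens:
#   the words "Arma", "-que", "virum", "cano" (ids 1-4)
#   and the final "." as a Token::Punctuation (id 5)

tokenizer.tokenize('In eoque erat.', shifting: false)
# keeps "-que" behind "eo" instead of shifting it in front of "in eo"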

#ve_corrections ⇒ Object



# File 'lib/llt/tokenizer.rb', line 287

def ve_corrections
  corrections = []
  @worker.each_with_index do |w, i|
    if w == enclitic('ve')
      orig_el = original_word(i)

      entries = []
      entries += lookup(orig_el + 'v',  :adjective, :stem, 1)
      entries += lookup(orig_el + 'v',  :adjective, :stem, 3)
      entries += lookup(orig_el + 'v',  :noun,      :stem, [2, 33, 5])
      entries += lookup(orig_el + 'v',  :persona,   :stem, 3)
      entries += lookup(orig_el + 've', :verb,      :pr,   2)
      entries += lookup(orig_el + 'v',  :verb,      :pr,   [3, 5]) # not sure if such a word of 5 exists

      if entries.any?
        corrections << i - corrections.size
      end
    end
  end

  reverse_splittings(corrections)
end