Class: LLT::Tokenizer

Inherits:
Object
Includes:
Constants::Abbreviations, Core::Serviceable, Helpers::Metrical, Greek
Defined in:
lib/llt/tokenizer.rb,
lib/llt/tokenizer/greek.rb,
lib/llt/tokenizer/worker.rb,
lib/llt/tokenizer/version.rb,
lib/llt/tokenizer/version_info.rb

Defined Under Namespace

Modules: Greek
Classes: VersionInfo, Worker

Constant Summary

PUNCTUATION =
/&(?:amp|quot|apos|lt|gt);|([\.\?,!;\-:"'”&\(\)\[\]†<>᾽·])\1*/
XML_TAG =
/<\/?.+?>/
ABBREVIATIONS =

Covers abbreviated Roman praenomina like Ti. in Ti. Claudius Nero, as well as Roman date expressions like a. d. V. Kal. Apr.

/^(#{ALL_ABBRS_PIPED})$/
APOSTROPHE_WORDS =

Covers words that are abbreviated with an apostrophe, like satin’ for satisne.

/^(#{APOSTROPHES_PIPED})$/
WORDS_ENDING_WITH_QUE =

neque taken out!

/^((un.{1,3})?[qc]u[aei].*que|qu[ao]que|itaque|atque|ut[er].*que|.*cumque|pler(.{1,2}|[oa]rum)que|denique|undique|usque)$/i
WORDS_ENDING_WITH_NE =

Generalize these words and start to look them up in the db, especially for adverbs.

/^(omne|sine|bene|paene|iuvene|siccine)$/i
WORDS_ENDING_WITH_VE =

Formerly contained neve and sive, which are now split.

/^()$/i
ENCLITICS =

laetusque to -que laetus, in eoque to -que in eo, honestumne to -ne honestum,

but

uterque, institutione, sive et al. remain untouched.

iuvene might come as a surprise in these lists - it’s a hack, but a special case because it contains both ve and ne, and both would get split. Such words might be so rare that we postpone proper handling for now.

%w{ que ne ve c }
ENCLITICS_MAP =
{
  /^(nec)$/i => 'c',
  /^(ne|se)u$/i => 'u',
  /^(nisi)$/i => 'si',
  /^(οὐ|μή|εἰ)τε$/i => 'τε',
  /^(οὐ|μή)δε$/i => 'δε',
}
MERGE_WORDS =
[ %w{ quam diu }, ['non', /null.{1,4}$/] ]
ABBR_NAME_WITH_DOT =
/^(#{NAMES_PIPED})\.$/
ROMAN_DATE_EXPR_WITH_DOT =
/^(#{DATES_PIPED})\.$/
PUNCT_ITSELF =
Regexp.new("^(?:#{PUNCTUATION.source})$")
VERSION =
"0.0.8"

Constants included from Greek

Greek::ALL, Greek::CONS, Greek::CONSONANTS, Greek::PLAIN_VOWELS, Greek::SPIRITUS_ASPER, Greek::SPIRITUS_ASPER_WITH_ACUTE, Greek::SPIRITUS_ASPER_WITH_CIRCUMFLEX, Greek::SPIRITUS_ASPER_WITH_GRAVE, Greek::SPIRITUS_LENIS, Greek::SPIRITUS_LENIS_WITH_ACUTE, Greek::SPIRITUS_LENIS_WITH_CIRCUMFLEX, Greek::SPIRITUS_LENIS_WITH_GRAVE, Greek::SPIRITUS_WITH_IOTA, Greek::STARTING_VOWELS, Greek::VOWELS, Greek::VOWELS_WITH_ACUTE, Greek::VOWELS_WITH_CIRCUMFLEX, Greek::VOWELS_WITH_GRAVE, Greek::VOWELS_WITH_IOTA, Greek::VOWELS_WITH_SPIRITUS

Instance Attribute Summary

Class Method Summary

Instance Method Summary

Methods included from Greek

#contains_krasis, #greek_apostrophe, #krasis, #split_krasis

Instance Attribute Details

#default_options ⇒ Object (readonly)

Returns the value of attribute default_options.



# File 'lib/llt/tokenizer.rb', line 23

def default_options
  @default_options
end

Class Method Details

.default_options ⇒ Object



# File 'lib/llt/tokenizer.rb', line 25

def self.default_options
  {
    shifting: true,
    enclitics_marker: '-',
    merging: true,
    indexing: true,
    splitting: true,
    xml: false,
    #for Greek
    krasis_marker: '-'
  }
end
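
Any of these can be overridden per call, because #tokenize passes its keyword options on to #setup. A minimal sketch, assuming `tokenizer` is an already wired-up instance:

tokenizer.tokenize('arma virumque cano', enclitics_marker: '+', shifting: false)
# the enclitic is now marked as "+que" and stays behind its host word
# ("arma virum +que cano") instead of being shifted in front of it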

Instance Method Details

#create_tokens ⇒ Object



# File 'lib/llt/tokenizer.rb', line 370

def create_tokens
  # the call to #to_a retrieves (and aligns) optional metrical data
  reset_id
  @worker.to_a.map! do |el|
    case el
    when XML_TAG                  then Token::XmlTag.new(el)
    when ABBR_NAME_WITH_DOT       then raise_id and Token::Filler.new(el, @id)
    when ROMAN_DATE_EXPR_WITH_DOT then raise_id and Token::Filler.new(el, @id)
    when PUNCT_ITSELF             then raise_id and Token::Punctuation.new(el, @id)
    else                               raise_id and Token::Word.new(el, @id)
    end
  end
end
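
A small, hypothetical example of the mapping (ids assume the default indexing: true):

# a worker of ["Cicero", "venit", "."] yields
#   Token::Word.new("Cicero", 1),
#   Token::Word.new("venit", 2),
#   Token::Punctuation.new(".", 3)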

#enclitic(val) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 201

def enclitic(val)
  "#{@enclitics_marker}#{val}"
end

#find_abbreviations_and_join_strings ⇒ Object

%w{ Atque M . Cicero mittit } to %w{ Atque M. Cicero mittit }



# File 'lib/llt/tokenizer.rb', line 138

def find_abbreviations_and_join_strings
  arr = []
  @worker.each_with_index do |e, i|
    n = @worker[i + 1]
    if (n == '.' && e =~ ABBREVIATIONS) || (n == "'" && e =~ APOSTROPHE_WORDS) || greek_apostrophe(n,e)
      @worker[i + 1] = n.prepend(e)
      arr << (i - arr.size)
    end
  end

  arr.each { |i| @worker.delete_at(i) }
end

#is_a_mergable_pair?(x, y) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/llt/tokenizer.rb', line 353

def is_a_mergable_pair?(x, y)
  # x, i.e. quam in quamdiu, needs to be downcased, as it could be in a
  # sentence's first position
  MERGE_WORDS.any? { |a, b| a === x.downcase && b === y  }
end
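
The `===` comparison is what allows MERGE_WORDS to mix plain strings with regular expressions: String#=== tests equality, Regexp#=== tests a match. A minimal illustration:

MERGE_WORDS.any? { |a, b| a === 'quam' && b === 'diu' }    # => true
MERGE_WORDS.any? { |a, b| a === 'non'  && b === 'nullos' } # => true, /null.{1,4}$/ === 'nullos'
MERGE_WORDS.any? { |a, b| a === 'quam' && b === 'ob' }     # => false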

#is_que?(element) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/llt/tokenizer.rb', line 251

def is_que?(element)
  element == enclitic('que')
end

#led_by_preposition?(index) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/llt/tokenizer.rb', line 255

def led_by_preposition?(index)
  @worker[index - 1] =~ /^(in|ad|ob)$/i # and others
end

#lookup(string, type, column, inflection_class = 3) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 321

def lookup(string, type, column, inflection_class = 3)
  string = (type == :persona ? string : string.downcase)
  query = {
            type: type, stem_type: column, stem: string,
            restrictions: { type: :inflection_class, values: Array(inflection_class) }
          }
  @db.look_up_stem(query)
end

#make_frequent_corrections ⇒ Object



# File 'lib/llt/tokenizer.rb', line 225

def make_frequent_corrections
  # uses db lookups
  # # TODO 27.11.13 14:15 by LFDM
  # Implement caching here
  ne_corrections
  ve_corrections
  que_corrections
end

#merge_what_needs_merging ⇒ Object

quam diu to quamdiu



# File 'lib/llt/tokenizer.rb', line 345

def merge_what_needs_merging
  to_delete = []
  @worker.each_overlapping_pair.each_with_index do |pair, i|
    merge_words(pair, i, to_delete) if is_a_mergable_pair?(*pair)
  end
  to_delete.each { |i| @worker.delete_at(i) }
end
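
In effect, adjacent worker elements that form a MERGE_WORDS pair are concatenated in place, and the now redundant second element is deleted afterwards. A sketch with illustrative surrounding words:

# @worker before: ["nescio", "quam", "diu", "maneam"]
# @worker after:  ["nescio", "quamdiu", "maneam"]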

#merge_words(pair, i, to_delete) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 359

def merge_words(pair, i, to_delete)
  pair.first << pair.last
  to_delete  << (i + 1 - to_delete.size)
end

#ne_corrections ⇒ Object



# File 'lib/llt/tokenizer.rb', line 259

def ne_corrections
  corrections = []
  @worker.each_with_index do |w, i|
    if w == enclitic('ne')
      orig_el = original_word(i)

      entries = []
      entries += lookup(orig_el, :noun, :nom)           if orig_el =~ /io$/   # actio-ne ratio-ne
      entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/    # Plato-ne Cicero-ne Solo-ne
      entries += lookup(orig_el + "n", :noun, :stem, [3, 33])  # fortitudi-ne ratio-ne libidi-ne homi-ne fi-ne agmi-ne iuve-ne ig-ne
      entries += lookup(orig_el + "n", :noun, :stem, 2)                       # domi-ne
      entries += lookup(orig_el + "n", :adjective, :stem, [1,3])              # communis commune, or bonus

      entries += lookup(orig_el + "n", :persona, :stem, 2)                    # Pauli-ne

      if entries.any?(&:third_decl_with_possible_ne_abl?)
        corrections << i - corrections.size
      end

      if entries.any?(&:o_decl_with_possible_ne_voc?)
        corrections << i - corrections.size
      end
    end
  end

  reverse_splittings(corrections)
end
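
The strategy is split first, ask the database later: every candidate has already been cut to '-ne' by the brute-force splitting, and the lookups check whether the remainder is really a stem whose -ne form is a word of its own (ablatives like ratione, vocatives like domine); if so, the split is reversed. A rough sketch of the flow for ratione, using the stem types and inflection classes from the lookups above:

# after splitting (shifting enabled):
#   @worker == [..., "-ne", "ratio", ...]
# original_word(i)                        # => "ratio"
# lookup("ration", :noun, :stem, [3, 33]) # finds a third declension stem, so
#                                         #   third_decl_with_possible_ne_abl? is true
# reverse_splittings then restores "ratione" and drops the "-ne" element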

#open_xml_tag?(str) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/llt/tokenizer.rb', line 123

def open_xml_tag?(str)
  str.start_with?('<') &! str.end_with?('>')
end

#original_word(i) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 310

def original_word(i)
  # there are two possible scenarios at this point
  # with shifting enabled:
  #         i  i + 1
  #   arma que virum
  # with shifting disabled:
  #        i - 1  i
  #   arma virum que
  @worker[i + (@shifting ? 1 : -1)]
end
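
For arma virumque the two layouts look like this (cf. #split_enklitikon, which inserts the marked enclitic either in front of or behind its host, depending on @shift_range):

# shifting enabled:   @worker == ["arma", "-que", "virum"]  # i at "-que", host at i + 1
# shifting disabled:  @worker == ["arma", "virum", "-que"]  # i at "-que", host at i - 1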

#preliminary ⇒ Object



# File 'lib/llt/tokenizer.rb', line 398

def preliminary
  @worker.to_a
end

#put_xml_attributes_back_together(elements) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 103

def put_xml_attributes_back_together(elements)
  as = ArrayScanner.new(elements)
  loop do
    last = as.look_behind.to_s # catch nil
    if open_xml_tag?(last)
      number_of_xml_elements = as.peek_until do |el|
        el.end_with?('>')
      end.size + 1

      number_of_xml_elements.times do
        last << ' ' << as.current
        elements.delete_at(as.pos)
      end
    else
      as.forward(1)
    end
    break if as.eoa?
  end
end
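
split_and_space_text splits on whitespace, so a tag that carries attributes arrives in several pieces; this pass glues every piece back onto the element that opened the tag. A sketch with a hypothetical tag:

# elements before: ["<foreign", "lang=\"grc\">", "λόγος", "</foreign>"]
# elements after:  ["<foreign lang=\"grc\">", "λόγος", "</foreign>"]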

#que_corrections ⇒ Object



# File 'lib/llt/tokenizer.rb', line 234

def que_corrections
  # this is used only in rare cases like in eoque
  # which needs a shift to -que in eo
  if @shifting
    to_be_shifted_que_indices.each do |i|
      @worker.insert(i - 1, @worker.delete_at(i))
    end
  end
end
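
With the default shifting this turns the brute-force split of eoque back into a readable prepositional phrase (the "-que in eo" example from the ENCLITICS documentation):

# "in eoque" after split_with_force: @worker == ["in", "-que", "eo"]
# to_be_shifted_que_indices          # => [1], since "-que" is led by "in"
# after the shift:                   @worker == ["-que", "in", "eo"]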

#raise_id ⇒ Object



# File 'lib/llt/tokenizer.rb', line 388

def raise_id
  if @indexing
    @id += 1
  else
    # need to return true because this is used as first part
    # of an and construction
    true
  end
end

#reset_id ⇒ Object



# File 'lib/llt/tokenizer.rb', line 384

def reset_id
  @id = (@indexing ? 0 : nil)
end

#reverse_splittings(indices) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 330

def reverse_splittings(indices)
  indices.each do |i|
    # need to retrieve the orig word before the splitted var is
    # assigned, as it deletes something in the worker
    ow = original_word(i)
    splitted  = @worker.delete_at(i).delete(@enclitics_marker)
    ow << splitted
  end
end

#setup(text, options = {}, worker = []) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 55

def setup(text, options = {}, worker = [])
  @text = text
  evaluate_metrical_presence(@text)
  @enclitics_marker = parse_option(:enclitics_marker, options)
  @merging          = parse_option(:merging, options)
  @shifting         = parse_option(:shifting, options)
  @splitting        = parse_option(:splitting, options)
  @indexing         = parse_option(:indexing, options)
  @xml              = parse_option(:xml, options)
  #for Greek
  @krasis_marker    = parse_option(:krasis_marker, options)
  @worker = setup_worker(worker)
  @shift_range = shift_range(@shifting)
end

#setup_worker(worker) ⇒ Object

This is here for two reasons:

1) easier test setup, when a preliminary result is to be evaluated further

2) more importantly, it adds a level of indirection when the given
   text holds metrical information: a substitute implementation for
   the worker array is used, but only when it is actually needed -
   which should perform better when no metrics are involved (the
   default case)


# File 'lib/llt/tokenizer.rb', line 81

def setup_worker(worker)
  return worker if worker.any?

  elements = split_and_space_text
  put_xml_attributes_back_together(elements) if @xml

  if metrical?
    Worker.new(elements, @enclitics_marker)
  else
    elements
  end
end

#shift_range(shifting_enabled) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 94

def shift_range(shifting_enabled)
  shifting_enabled ? 0 : 1
end

#split_and_space_text ⇒ Object



# File 'lib/llt/tokenizer.rb', line 98

def split_and_space_text
  regex = @xml ? Regexp.union(XML_TAG, PUNCTUATION) : PUNCTUATION
  @text.gsub(regex, ' \0 ').split
end
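
'\0' is the whole match, so every punctuation mark (and, with xml: true, every XML tag) is padded with spaces and becomes an element of its own after the plain whitespace split:

text = 'Arma, virumque cano.'
text.gsub(PUNCTUATION, ' \0 ')        # => "Arma ,  virumque cano . "
text.gsub(PUNCTUATION, ' \0 ').split  # => ["Arma", ",", "virumque", "cano", "."]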

#split_enklitika_and_change_their_position ⇒ Object



# File 'lib/llt/tokenizer.rb', line 170

def split_enklitika_and_change_their_position
  split_with_force
  split_frequent_enclitics # like latin c, ve or greek te, de
  make_frequent_corrections
end

#split_enklitikon(encl, restrictors) ⇒ Object



# File 'lib/llt/tokenizer.rb', line 186

def split_enklitikon(encl, restrictors)
  # needs a word character in front - ne itself should be contained
  regexp = /(?<=\w)#{encl}$/

  indices = []
  @worker.each_with_index do |token, i|
    if token.match(regexp) && restrictors !~ token
      token.slice!(regexp)
      indices << (i + indices.size + @shift_range)
    end
  end

  indices.each { |i| @worker.insert(i, enclitic(encl)) }
end
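
The index bookkeeping accounts for the elements that will be inserted later: indices.size grows by one for every split already recorded, and @shift_range decides whether the marked enclitic ends up in front of (shifting) or behind (no shifting) its host. For example:

# @worker == ["arma", "virumque", "cano"], encl == "que", shifting enabled:
#   "virumque" at i == 1 is sliced to "virum", indices == [1]
#   inserting "-que" at 1 gives ["arma", "-que", "virum", "cano"]
# with shifting disabled (@shift_range == 1) the insert lands at 2:
#   ["arma", "virum", "-que", "cano"]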

#split_frequent_enclitics ⇒ Object



# File 'lib/llt/tokenizer.rb', line 212

def split_frequent_enclitics
  container = []
  @worker.each_with_index do |token, i|
    ENCLITICS_MAP.each do |regex, encl|
      if token.match(regex)
        token.slice!(-encl.length, encl.length)
        container << [encl, (i + container.size + @shift_range)]
      end
    end
  end
  container.each { |encl, i| @worker.insert(i, enclitic(encl)) }
end
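
ENCLITICS_MAP drives this pass: a token matching one of the keys loses the mapped ending via slice!, and the detached piece is re-inserted as a marked enclitic, e.g.:

# "nec"  matches /^(nec)$/i     => sliced to "ne", "-c"  is inserted
# "neu"  matches /^(ne|se)u$/i  => sliced to "ne", "-u"  is inserted
# "οὐδε" matches /^(οὐ|μή)δε$/i => sliced to "οὐ", "-δε" is inserted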

#split_with_force ⇒ Object



# File 'lib/llt/tokenizer.rb', line 176

def split_with_force
  # uses brute force at first
  # the restrictor regexps handle only obvious cases

  # don't use c here atm
  ENCLITICS[0..-2].each do |encl|
    split_enklitikon(encl, self.class.const_get("WORDS_ENDING_WITH_#{encl.upcase}"))
  end
end

#to_be_shifted_que_indices ⇒ Object



# File 'lib/llt/tokenizer.rb', line 244

def to_be_shifted_que_indices
  # double shifts would properly fail, but they might never happen
  @worker.each_with_index.each_with_object([]) do |(element, index), accumulator|
    accumulator << index if is_que?(element) && led_by_preposition?(index)
  end
end

#tokenize(text, add_to: nil, **options) ⇒ Object

Raises:

  • (ArgumentError)


# File 'lib/llt/tokenizer.rb', line 38

def tokenize(text, add_to: nil, **options)
  raise ArgumentError.new("The argument passed must be a String") unless text.is_a?(String)
  return [] if text.empty?

  setup(text, options)

  find_abbreviations_and_join_strings
  #for Greek
  split_krasis if @splitting
  split_enklitika_and_change_their_position if @splitting
  merge_what_needs_merging if @merging # quam diu => quamdiu
  tokens = create_tokens

  add_to << tokens if add_to.respond_to?(:<<)
  tokens
end
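
A minimal end-to-end sketch, assuming `tokenizer` is an LLT::Tokenizer instance set up through Core::Serviceable with a stem database available for the db-backed corrections:

tokens = tokenizer.tokenize('Arma virumque cano.')
# with the defaults (shifting, '-' marker, indexing) this yields five tokens:
#   the words "Arma", "-que", "virum", "cano" (ids 1-4)
#   and the final "." as a Token::Punctuation (id 5)

tokenizer.tokenize('In eoque erat.', shifting: false)
# keeps "-que" behind "eo" instead of shifting it in front of "in eo"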

#ve_corrections ⇒ Object



# File 'lib/llt/tokenizer.rb', line 287

def ve_corrections
  corrections = []
  @worker.each_with_index do |w, i|
    if w == enclitic('ve')
      orig_el = original_word(i)

      entries = []
      entries += lookup(orig_el + 'v',  :adjective, :stem, 1)
      entries += lookup(orig_el + 'v',  :adjective, :stem, 3)
      entries += lookup(orig_el + 'v',  :noun,      :stem, [2, 33, 5])
      entries += lookup(orig_el + 'v',  :persona,   :stem, 3)
      entries += lookup(orig_el + 've', :verb,      :pr,   2)
      entries += lookup(orig_el + 'v',  :verb,      :pr,   [3, 5]) # not sure if such a word of 5 exists

      if entries.any?
        corrections << i - corrections.size
      end
    end
  end

  reverse_splittings(corrections)
end