Class: Spellr::Tokenizer

Inherits:

Object

Object
Spellr::Tokenizer

show all

Defined in: lib/spellr/tokenizer.rb

Instance Attribute Summary collapse

#file ⇒ Object readonly

Returns the value of attribute file.
#filename ⇒ Object readonly

Returns the value of attribute filename.

Instance Method Summary collapse

#each_line_with_stats ⇒ Object

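Yields each line of the file together with its line number and running character and byte offsets, then closes the file (the source carries a `rubocop:disable Metrics/MethodLength` directive).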
#each_term(&block) ⇒ Object
#each_token(skip_term_proc: nil) ⇒ Object
#initialize(file, start_at: nil, skip_key: true) ⇒ Tokenizer constructor

A new instance of Tokenizer.
#map(&block) ⇒ Object
#normalized_terms ⇒ Object
#prepare_line(line, line_number, char_offset, byte_offset) ⇒ Object
#terms ⇒ Object

Collects every term yielded by #each_term into an array (the source is annotated `leftovers:test`).

Constructor Details

#initialize(file, start_at: nil, skip_key: true) ⇒ `Tokenizer`

Returns a new instance of Tokenizer.

# File 'lib/spellr/tokenizer.rb', line 12

# Sets up a tokenizer over +file+.
#
# @param file [IO, StringIO, Object] an IO or StringIO is used as-is; any
#   other value is handed to ::File.new (presumably a path -- confirm with
#   callers). The value is also stored untouched as #filename.
# @param start_at [#line_location, nil] where reading begins; when nil, a
#   ColumnLocation wrapping LineLocation.new(file) is built instead.
# @param skip_key [Boolean] forwarded unchanged to LineTokenizer.new.
def initialize(file, start_at: nil, skip_key: true)
  @filename = file
  @start_at = start_at || ColumnLocation.new(line_location: LineLocation.new(file))
  @file =
    case file
    when StringIO, IO then file
    else ::File.new(file)
    end
  # Seek to the byte offset recorded in the starting location.
  @file.pos = @start_at.line_location.byte_offset

  @line_tokenizer = LineTokenizer.new('', skip_key: skip_key)
end

Instance Attribute Details

#file ⇒ `Object` (readonly)

Returns the value of attribute file.



10
11
12

# File 'lib/spellr/tokenizer.rb', line 10

# Reader for @file, which #initialize sets to the IO/StringIO it was given
# or to a ::File opened from the argument.
def file
  @file
end

#filename ⇒ `Object` (readonly)

Returns the value of attribute filename.



10
11
12

# File 'lib/spellr/tokenizer.rb', line 10

# Reader for @filename, which #initialize sets to its +file+ argument exactly
# as passed in (so it may be an IO/StringIO rather than a path).
def filename
  @filename
end

Instance Method Details

#each_line_with_stats ⇒ `Object`

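Yields each line of the file together with its line number and running character and byte offsets, then closes the file (the source carries a `rubocop:disable Metrics/MethodLength` directive).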

# File 'lib/spellr/tokenizer.rb', line 55

# Walks the file line by line, yielding each line with positional stats.
#
# Line numbers and both offsets are seeded from @start_at.line_location. After
# every yield the character offset grows by the line's length and the byte
# offset by its bytesize. The file is closed when the method exits, including
# when the block raises.
#
# @yieldparam line [String] the raw line as returned by each_line
# @yieldparam line_number [Integer] index counted from the starting line number
# @yieldparam char_offset [Integer] characters consumed before this line
# @yieldparam byte_offset [Integer] bytes consumed before this line
def each_line_with_stats # rubocop:disable Metrics/MethodLength
  origin = @start_at.line_location
  chars_seen = origin.char_offset
  bytes_seen = origin.byte_offset

  numbered_lines = file.each_line.with_index(origin.line_number)
  numbered_lines.each do |text, number|
    yield text, number, chars_seen, bytes_seen

    chars_seen += text.length
    bytes_seen += text.bytesize
  end
ensure
  file.close
end

#each_term(&block) ⇒ `Object`

# File 'lib/spellr/tokenizer.rb', line 29

# Passes +block+ to the per-line tokenizer's each_term for every line of the
# file. Lines for which prepare_tokenizer_for_line returns nil are skipped.
# The file is closed when the method exits, including when the block raises.
def each_term(&block)
  file.each_line do |raw_line|
    tokenizer = prepare_tokenizer_for_line(raw_line)
    next if tokenizer.nil?

    tokenizer.each_term(&block)
  end
ensure
  file.close
end

#each_token(skip_term_proc: nil) ⇒ `Object`

# File 'lib/spellr/tokenizer.rb', line 37

# Yields every token of every line, with each token's +line+ set to the value
# prepare_line builds for the line it came from.
#
# Lines for which prepare_tokenizer_for_line returns nil are skipped.
#
# @param skip_term_proc [#call, nil] forwarded unchanged to the per-line
#   tokenizer's each_token
# @yieldparam token the token produced by the per-line tokenizer
def each_token(skip_term_proc: nil)
  each_line_with_stats do |line, line_number, char_offset, byte_offset|
    tokenizer = prepare_tokenizer_for_line(line)
    next if tokenizer.nil?

    tokenizer.each_token(skip_term_proc: skip_term_proc) do |found|
      # A fresh line object is built for every token, as in the original.
      found.line = prepare_line(line, line_number, char_offset, byte_offset)

      yield found
    end
  end
end

#map(&block) ⇒ `Object`



25
26
27

# File 'lib/spellr/tokenizer.rb', line 25

# Maps +block+ over the tokens produced by #each_token (called with its
# default arguments) and returns the results. Without a block, the result is
# whatever Enumerator#map returns for a nil block.
def map(&block)
  token_stream = to_enum(:each_token)
  token_stream.map(&block)
end

#normalized_terms ⇒ `Object`



69
70
71

# File 'lib/spellr/tokenizer.rb', line 69

# Returns the terms from #each_term after calling spellr_normalize on each,
# with duplicates removed and the result sorted.
def normalized_terms
  normalized = to_enum(:each_term).map { |term| term.spellr_normalize }
  distinct = normalized.uniq
  distinct.sort
end

#prepare_line(line, line_number, char_offset, byte_offset) ⇒ `Object`

# File 'lib/spellr/tokenizer.rb', line 47

# Wraps +line+ in a Token whose location is a ColumnLocation built around a
# LineLocation for #filename, the given line number and the given offsets.
#
# @param line [String] the line text handed to Token.new
# @param line_number [Integer] passed to LineLocation.new
# @param char_offset [Integer] passed to LineLocation.new as char_offset:
# @param byte_offset [Integer] passed to LineLocation.new as byte_offset:
# @return [Token]
def prepare_line(line, line_number, char_offset, byte_offset)
  Token.new(
    line,
    location: ColumnLocation.new(
      line_location: LineLocation.new(
        filename, line_number, char_offset: char_offset, byte_offset: byte_offset
      )
    )
  )
end

#terms ⇒ `Object`

Collects every term yielded by #each_term into an array (the source is annotated `leftovers:test`).



21
22
23

# File 'lib/spellr/tokenizer.rb', line 21

# Returns every term yielded by #each_term, in order, as an array.
def terms # leftovers:test
  to_enum(:each_term).entries
end

Class: Spellr::Tokenizer

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file, start_at: nil, skip_key: true) ⇒ Tokenizer

Instance Attribute Details

#file ⇒ Object (readonly)

#filename ⇒ Object (readonly)

Instance Method Details

#each_line_with_stats ⇒ Object

#each_term(&block) ⇒ Object

#each_token(skip_term_proc: nil) ⇒ Object

#map(&block) ⇒ Object

#normalized_terms ⇒ Object

#prepare_line(line, line_number, char_offset, byte_offset) ⇒ Object

#terms ⇒ Object

#initialize(file, start_at: nil, skip_key: true) ⇒ `Tokenizer`

#file ⇒ `Object` (readonly)

#filename ⇒ `Object` (readonly)

#each_line_with_stats ⇒ `Object`

#each_term(&block) ⇒ `Object`

#each_token(skip_term_proc: nil) ⇒ `Object`

#map(&block) ⇒ `Object`

#normalized_terms ⇒ `Object`

#prepare_line(line, line_number, char_offset, byte_offset) ⇒ `Object`

#terms ⇒ `Object`