Class: Spellr::Tokenizer
- Inherits:
-
Object
- Object
- Spellr::Tokenizer
- Defined in:
- lib/spellr/tokenizer.rb
Instance Attribute Summary collapse
-
#file ⇒ Object
readonly
Returns the value of attribute file.
-
#filename ⇒ Object
readonly
Returns the value of attribute filename.
Instance Method Summary collapse
-
#each_line_with_stats ⇒ Object
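Yields each line of the file with its line number and running character and byte offsets, then closes the file (source comment: rubocop:disable Metrics/MethodLength).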
- #each_term(&block) ⇒ Object
- #each_token(skip_term_proc: nil) ⇒ Object
-
#initialize(file, start_at: nil, skip_key: true) ⇒ Tokenizer
constructor
A new instance of Tokenizer.
- #map(&block) ⇒ Object
- #normalized_terms ⇒ Object
- #prepare_line(line, line_number, char_offset, byte_offset) ⇒ Object
-
#terms ⇒ Object
Returns every term yielded by #each_term as an array (source comment: leftovers:test).
Constructor Details
#initialize(file, start_at: nil, skip_key: true) ⇒ Tokenizer
Returns a new instance of Tokenizer.
12 13 14 15 16 17 18 19 |
# File 'lib/spellr/tokenizer.rb', line 12
#
# Sets up a tokenizer that reads from +file+.
#
# @param file [String, IO, StringIO] an IO or StringIO is used as-is; any
#   other value is handed to ::File.new. The value is also stored, unchanged,
#   as the filename attribute.
# @param start_at [ColumnLocation, nil] starting location; when nil, a
#   ColumnLocation wrapping LineLocation.new(file) is built instead.
# @param skip_key [Boolean] forwarded to LineTokenizer.new.
def initialize(file, start_at: nil, skip_key: true)
  @filename = file
  @start_at = start_at || ColumnLocation.new(line_location: LineLocation.new(file))
  @file = file.is_a?(StringIO) || file.is_a?(IO) ? file : ::File.new(file)
  # Position the stream at the starting location's byte offset.
  @file.pos = @start_at.line_location.byte_offset
  @line_tokenizer = LineTokenizer.new('', skip_key: skip_key)
end
Instance Attribute Details
#file ⇒ Object (readonly)
Returns the value of attribute file.
10 11 12 |
# File 'lib/spellr/tokenizer.rb', line 10
#
# Read-only accessor for the file attribute.
#
# @return [Object] the current value of the @file instance variable
def file
  @file
end
#filename ⇒ Object (readonly)
Returns the value of attribute filename.
10 11 12 |
# File 'lib/spellr/tokenizer.rb', line 10
#
# Read-only accessor for the filename attribute.
#
# @return [Object] the current value of the @filename instance variable
def filename
  @filename
end
Instance Method Details
#each_line_with_stats ⇒ Object
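Yields each line of the file with its line number and running character and byte offsets, then closes the file (source comment: rubocop:disable Metrics/MethodLength).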
55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/spellr/tokenizer.rb', line 55
#
# Walks the file line by line, yielding each line together with its position.
# Offsets start from the values held by @start_at.line_location and grow by
# the length of every line already yielded. The file is closed when iteration
# ends, including when the block raises.
#
# @yieldparam line [String] the line as returned by file.each_line
# @yieldparam line_number [Integer] counts up from the starting line_number
# @yieldparam char_offset [Integer] start char_offset plus String#length of prior lines
# @yieldparam byte_offset [Integer] start byte_offset plus String#bytesize of prior lines
def each_line_with_stats # rubocop:disable Metrics/MethodLength
  char_offset = @start_at.line_location.char_offset
  byte_offset = @start_at.line_location.byte_offset

  file.each_line.with_index(@start_at.line_location.line_number) do |line, line_number|
    yield line, line_number, char_offset, byte_offset

    # Characters and bytes are tracked separately so multi-byte text stays aligned.
    char_offset += line.length
    byte_offset += line.bytesize
  end
ensure
  file.close
end
#each_term(&block) ⇒ Object
29 30 31 32 33 34 35 |
# File 'lib/spellr/tokenizer.rb', line 29
#
# Passes the given block to #each_term of the per-line tokenizer for every
# line of the file. Lines for which prepare_tokenizer_for_line returns nil
# are skipped. The file is closed afterwards, including when the block raises.
#
# @yield each term produced by the per-line tokenizer
def each_term(&block)
  file.each_line do |line|
    prepare_tokenizer_for_line(line)&.each_term(&block)
  end
ensure
  file.close
end
#each_token(skip_term_proc: nil) ⇒ Object
37 38 39 40 41 42 43 44 45 |
# File 'lib/spellr/tokenizer.rb', line 37
#
# Yields every token of every line. Each token has its +line+ set to the
# result of #prepare_line for the line it came from before it is yielded.
# Lines for which prepare_tokenizer_for_line returns nil are skipped.
#
# @param skip_term_proc [Proc, nil] forwarded unchanged to the per-line
#   tokenizer's #each_token
# @yieldparam token the token, with +line+ already assigned
def each_token(skip_term_proc: nil)
  each_line_with_stats do |line, line_number, char_offset, byte_offset|
    prepare_tokenizer_for_line(line)&.each_token(skip_term_proc: skip_term_proc) do |token|
      token.line = prepare_line(line, line_number, char_offset, byte_offset)

      yield token
    end
  end
end
#map(&block) ⇒ Object
25 26 27 |
# File 'lib/spellr/tokenizer.rb', line 25
#
# Maps the given block over every token produced by #each_token.
#
# @yieldparam token each token from #each_token
# @return [Array] the block results; without a block, the enumerator
#   returned by Enumerator#map
def map(&block)
  enum_for(:each_token).map(&block)
end
#normalized_terms ⇒ Object
69 70 71 |
# File 'lib/spellr/tokenizer.rb', line 69
#
# Collects every term from #each_term, calls #spellr_normalize on each,
# removes duplicates and sorts the result.
#
# @return [Array] unique normalized terms in sorted order
def normalized_terms
  enum_for(:each_term).map(&:spellr_normalize).uniq.sort
end
#prepare_line(line, line_number, char_offset, byte_offset) ⇒ Object
47 48 49 50 51 52 53 |
# File 'lib/spellr/tokenizer.rb', line 47
#
# Wraps a raw line in a Token whose location records where the line starts.
#
# @param line [String] the line text, passed to Token.new
# @param line_number [Integer] passed to LineLocation.new
# @param char_offset [Integer] passed to LineLocation.new as char_offset:
# @param byte_offset [Integer] passed to LineLocation.new as byte_offset:
# @return [Token] a token for the line, located by a ColumnLocation built
#   from the new LineLocation
def prepare_line(line, line_number, char_offset, byte_offset)
  line_location = LineLocation.new(
    filename, line_number, char_offset: char_offset, byte_offset: byte_offset
  )
  column_location = ColumnLocation.new(line_location: line_location)
  Token.new(line, location: column_location)
end
#terms ⇒ Object
Returns every term yielded by #each_term as an array (source comment: leftovers:test).
21 22 23 |
# File 'lib/spellr/tokenizer.rb', line 21
#
# Collects everything yielded by #each_term into an array.
#
# @return [Array] the terms in the order #each_term yields them
def terms # leftovers:test
  enum_for(:each_term).to_a
end