Class: StringEater::RubyTokenizerEachChar

Inherits:
Object
  • Object
show all
Defined in:
lib/ruby-tokenizer-each-char.rb

Overview

This tokenizer is very slow, but it illustrates the basic idea behind the C tokenizer.

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.add_field(name, opts = {}) ⇒ Object



13
14
15
16
# File 'lib/ruby-tokenizer-each-char.rb', line 13

# Declare a named field token on this tokenizer class and define a
# reader method that returns the value extracted for that field by
# the most recent tokenize! run.
def self.add_field name, opts={}
  field = StringEater::Token::new_field(name, opts)
  tokens.push(field)
  define_method(name) do
    @extracted_tokens[name]
  end
end

.combine_fields(opts = {}) ⇒ Object



22
23
24
25
26
27
# File 'lib/ruby-tokenizer-each-char.rb', line 22

# Register a combined token spanning from the field named opts[:from]
# to the field named opts[:to], exposed under the reader opts[:as].
def self.combine_fields opts={}
  combined_name = opts[:as]
  from_index = tokens.index { |tok| tok.name == opts[:from] }
  to_index   = tokens.index { |tok| tok.name == opts[:to] }
  combined_tokens.push([combined_name, from_index, to_index])
  define_method(combined_name) do
    @extracted_tokens[combined_name]
  end
end

.combined_tokensObject



9
10
11
# File 'lib/ruby-tokenizer-each-char.rb', line 9

# Lazily-initialized list of [name, from_index, to_index] combiner
# specs registered via combine_fields.
def self.combined_tokens
  @combined_tokens = [] if @combined_tokens.nil?
  @combined_tokens
end

.look_for(tokens) ⇒ Object



18
19
20
# File 'lib/ruby-tokenizer-each-char.rb', line 18

# Append a separator token for the given literal string(s).
# NOTE: the parameter shadows the class-level tokens method, so the
# explicit self receiver below is required.
def self.look_for tokens
  separator = StringEater::Token::new_separator(tokens)
  self.tokens << separator
end

.tokensObject



5
6
7
# File 'lib/ruby-tokenizer-each-char.rb', line 5

# Lazily-initialized ordered list of all tokens (fields and
# separators) declared on this tokenizer class.
def self.tokens
  @tokens = [] if @tokens.nil?
  @tokens
end

Instance Method Details

#combined_tokensObject



33
34
35
# File 'lib/ruby-tokenizer-each-char.rb', line 33

# Per-instance cache of the class-level combiner specs.
def combined_tokens
  @combined_tokens = self.class.combined_tokens if @combined_tokens.nil?
  @combined_tokens
end

#describe_lineObject



43
44
45
46
47
# File 'lib/ruby-tokenizer-each-char.rb', line 43

# Render a one-line, human-readable sketch of the configured tokens:
# each token contributes its literal string if it has one, otherwise
# its name, otherwise the "xxxxxx" placeholder.
#
# Fix: the original wrote t.name.to_s, which is "" (truthy) even when
# name is nil, so the "xxxxxx" fallback was unreachable dead code.
def describe_line
  tokens.inject("") do |desc, t|
    desc << (t.string || (t.name && t.name.to_s) || "xxxxxx")
  end
end

#find_breakpoints(string) ⇒ Object



49
50
51
52
53
54
55
# File 'lib/ruby-tokenizer-each-char.rb', line 49

# Return the unique list of all token boundary offsets for string,
# tokenizing it first if it is not the string most recently processed.
def find_breakpoints string
  tokenize!(string) unless @string == string
  tokens.map(&:breakpoints).flatten.uniq
end

#refresh_tokensObject



37
38
39
40
41
# File 'lib/ruby-tokenizer-each-char.rb', line 37

# Drop the per-instance token caches and rebuild them from the class,
# returning the fresh token list.
def refresh_tokens
  @combined_tokens = nil
  @tokens = nil
  tokens
end

#tokenize!(string, &block) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# File 'lib/ruby-tokenizer-each-char.rb', line 57

# Tokenize string in a single character-by-character pass, recording each
# token's [start, end] offsets in its breakpoints and capturing the text of
# every extractable token into @extracted_tokens. Yields @extracted_tokens
# if a block is given; returns self for chaining.
def tokenize! string, &block
  @string = string
  @extracted_tokens ||= {}
  @extracted_tokens.clear
  # Cache [token_index, literal_string] pairs — only tokens with a concrete
  # string (separators) can be searched for in the input.
  @tokens_to_find ||= tokens.each_with_index.map do |t, i| 
    [i, t.string] if t.string
  end.compact
  # Cache indices of tokens whose text should be captured.
  # NOTE(review): "indeces" is a typo, kept as-is since the ivar name is
  # visible to subclasses/debuggers.
  @tokens_to_extract_indeces ||= tokens.each_with_index.map do |t, i|
    i if t.extract?
  end.compact

  # The first token always starts at offset 0.
  tokens.first.breakpoints[0] = 0

  find_index = 0

  # Scan state: the separator currently being matched (curr_token),
  # its position in the token list, and how far into its literal
  # string we have matched so far (looking_for_index).
  curr_token = @tokens_to_find[find_index]
  curr_token_index = curr_token[0]
  curr_token_length = curr_token[1].length
  looking_for_index = 0
  looking_for = curr_token[1][looking_for_index]

  counter = 0
  string.each_char do |c|
    if c == looking_for
      if looking_for_index == 0
        # entering new token: close the preceding token at this offset
        # and extract its text if requested.
        if curr_token_index > 0
          t = tokens[curr_token_index - 1]
          t.breakpoints[1] = counter
          if t.extract?
            @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
          end
        end
        tokens[curr_token_index].breakpoints[0] = counter
      end
      if looking_for_index >= (curr_token_length - 1)
        # leaving token: the whole separator string has been matched.
        tokens[curr_token_index].breakpoints[1] = counter

        if curr_token_index >= tokens.size-1
          # we're done!
          break
        else
          # The field after this separator starts on the next character.
          tokens[curr_token_index + 1].breakpoints[0] = counter + 1
        end

        # next token
        find_index += 1
        if find_index >= @tokens_to_find.length
          # we're done!
          break
        end
        curr_token = @tokens_to_find[find_index]
        curr_token_index = curr_token[0]
        curr_token_length = curr_token[1].length
        looking_for_index = 0
      else
        # partial match: advance within the separator's literal string.
        looking_for_index += 1
      end
    end
    # Refresh the target character every iteration (also resets the
    # comparison after a partial-match advance or token switch).
    looking_for = curr_token[1][looking_for_index]
    counter += 1
  end

  # The last token runs to the end of the string.
  last_token = tokens.last
  last_token.breakpoints[1] = string.length

  if last_token.extract?
    # NOTE(review): uses an inclusive range (..) while the mid-stream
    # extraction above uses exclusive (...); harmless here because the
    # end offset equals string.length, but worth confirming intent.
    @extracted_tokens[last_token.name] = string[last_token.breakpoints[0]..last_token.breakpoints[1]]
  end

  # Build combined extractions spanning from one token's start to
  # another token's end, as registered by combine_fields.
  combined_tokens.each do |combiner|
    name = combiner[0]
    from = @tokens[combiner[1]].breakpoints[0]
    to = @tokens[combiner[2]].breakpoints[1]
    @extracted_tokens[name] = string[from...to]
  end

  if block_given?
    yield @extracted_tokens
  end

  # return self for chaining
  self
end

#tokensObject



29
30
31
# File 'lib/ruby-tokenizer-each-char.rb', line 29

# Per-instance cache of the class-level token list.
def tokens
  @tokens = self.class.tokens if @tokens.nil?
  @tokens
end