Class: StringEater::RubyTokenizerEachChar

Inherits:
Object
  • Object
show all
Defined in:
lib/ruby-tokenizer-each-char.rb

Overview

This tokenizer is very slow, but it illustrates the basic idea behind the C tokenizer.

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.add_field(name, opts = {}) ⇒ Object



13
14
15
16
# File 'lib/ruby-tokenizer-each-char.rb', line 13

# Declare a named field token on this tokenizer class and define a
# reader method that returns the value extracted for that field by
# the most recent tokenize! run.
def self.add_field name, opts={}
  field = StringEater::Token::new_field(name, opts)
  tokens.push(field)
  define_method(name) do
    @extracted_tokens[name]
  end
end

.combine_fields(opts = {}) ⇒ Object



22
23
24
25
26
27
# File 'lib/ruby-tokenizer-each-char.rb', line 22

# Register a combined token spanning from the field named opts[:from]
# to the field named opts[:to], exposed under the reader opts[:as].
def self.combine_fields opts={}
  combined_name = opts[:as]
  from_index = tokens.index { |tok| tok.name == opts[:from] }
  to_index   = tokens.index { |tok| tok.name == opts[:to] }
  combined_tokens.push([combined_name, from_index, to_index])
  define_method(combined_name) do
    @extracted_tokens[combined_name]
  end
end

.combined_tokensObject



9
10
11
# File 'lib/ruby-tokenizer-each-char.rb', line 9

# Lazily-initialized list of [name, from_index, to_index] combiner
# specs registered via combine_fields.
def self.combined_tokens
  @combined_tokens = [] if @combined_tokens.nil?
  @combined_tokens
end

.look_for(tokens) ⇒ Object



18
19
20
# File 'lib/ruby-tokenizer-each-char.rb', line 18

# Append a separator token for the given literal string(s).
# NOTE: the parameter shadows the class-level tokens method, so the
# explicit self receiver below is required.
def self.look_for tokens
  separator = StringEater::Token::new_separator(tokens)
  self.tokens << separator
end

.tokensObject



5
6
7
# File 'lib/ruby-tokenizer-each-char.rb', line 5

# Lazily-initialized ordered list of all tokens (fields and
# separators) declared on this tokenizer class.
def self.tokens
  @tokens = [] if @tokens.nil?
  @tokens
end

Instance Method Details

#combined_tokensObject



33
34
35
# File 'lib/ruby-tokenizer-each-char.rb', line 33

# Per-instance cache of the class-level combiner specs.
def combined_tokens
  @combined_tokens = self.class.combined_tokens if @combined_tokens.nil?
  @combined_tokens
end

#describe_lineObject



43
44
45
46
47
# File 'lib/ruby-tokenizer-each-char.rb', line 43

# Render a one-line, human-readable sketch of the configured tokens:
# each token contributes its literal string if it has one, otherwise
# its name, otherwise the "xxxxxx" placeholder.
#
# Fix: the original wrote t.name.to_s, which is "" (truthy) even when
# name is nil, so the "xxxxxx" fallback was unreachable dead code.
def describe_line
  tokens.inject("") do |desc, t|
    desc << (t.string || (t.name && t.name.to_s) || "xxxxxx")
  end
end

#find_breakpoints(string) ⇒ Object



49
50
51
52
53
54
55
# File 'lib/ruby-tokenizer-each-char.rb', line 49

# Return the unique list of all token boundary offsets for string,
# tokenizing it first if it is not the string most recently processed.
def find_breakpoints string
  tokenize!(string) unless @string == string
  tokens.map(&:breakpoints).flatten.uniq
end

#refresh_tokensObject



37
38
39
40
41
# File 'lib/ruby-tokenizer-each-char.rb', line 37

# Drop the per-instance token caches and rebuild them from the class,
# returning the fresh token list.
def refresh_tokens
  @combined_tokens = nil
  @tokens = nil
  tokens
end

#tokenize!(string, &block) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# File 'lib/ruby-tokenizer-each-char.rb', line 57

# Tokenize string in a single character-by-character pass, recording each
# token's [start, end] offsets in its breakpoints and capturing the text of
# every extractable token into @extracted_tokens. Yields @extracted_tokens
# if a block is given; returns self for chaining.
def tokenize! string, &block
  @string = string
  @extracted_tokens ||= {}
  @extracted_tokens.clear
  # Cache [token_index, literal_string] pairs — only tokens with a concrete
  # string (separators) can be searched for in the input.
  @tokens_to_find ||= tokens.each_with_index.map do |t, i| 
    [i, t.string] if t.string
  end.compact
  # Cache indices of tokens whose text should be captured.
  # NOTE(review): "indeces" is a typo, kept as-is since the ivar name is
  # visible to subclasses/debuggers.
  @tokens_to_extract_indeces ||= tokens.each_with_index.map do |t, i|
    i if t.extract?
  end.compact

  # The first token always starts at offset 0.
  tokens.first.breakpoints[0] = 0

  find_index = 0

  # Scan state: the separator currently being matched (curr_token),
  # its position in the token list, and how far into its literal
  # string we have matched so far (looking_for_index).
  curr_token = @tokens_to_find[find_index]
  curr_token_index = curr_token[0]
  curr_token_length = curr_token[1].length
  looking_for_index = 0
  looking_for = curr_token[1][looking_for_index]

  counter = 0
  string.each_char do |c|
    if c == looking_for
      if looking_for_index == 0
        # entering new token: close the preceding token at this offset
        # and extract its text if requested.
        if curr_token_index > 0
          t = tokens[curr_token_index - 1]
          t.breakpoints[1] = counter
          if t.extract?
            @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
          end
        end
        tokens[curr_token_index].breakpoints[0] = counter
      end
      if looking_for_index >= (curr_token_length - 1)
        # leaving token: the whole separator string has been matched.
        tokens[curr_token_index].breakpoints[1] = counter

        if curr_token_index >= tokens.size-1
          # we're done!
          break
        else
          # The field after this separator starts on the next character.
          tokens[curr_token_index + 1].breakpoints[0] = counter + 1
        end

        # next token
        find_index += 1
        if find_index >= @tokens_to_find.length
          # we're done!
          break
        end
        curr_token = @tokens_to_find[find_index]
        curr_token_index = curr_token[0]
        curr_token_length = curr_token[1].length
        looking_for_index = 0
      else
        # partial match: advance within the separator's literal string.
        looking_for_index += 1
      end
    end
    # Refresh the target character every iteration (also resets the
    # comparison after a partial-match advance or token switch).
    looking_for = curr_token[1][looking_for_index]
    counter += 1
  end

  # The last token runs to the end of the string.
  last_token = tokens.last
  last_token.breakpoints[1] = string.length

  if last_token.extract?
    # NOTE(review): uses an inclusive range (..) while the mid-stream
    # extraction above uses exclusive (...); harmless here because the
    # end offset equals string.length, but worth confirming intent.
    @extracted_tokens[last_token.name] = string[last_token.breakpoints[0]..last_token.breakpoints[1]]
  end

  # Build combined extractions spanning from one token's start to
  # another token's end, as registered by combine_fields.
  combined_tokens.each do |combiner|
    name = combiner[0]
    from = @tokens[combiner[1]].breakpoints[0]
    to = @tokens[combiner[2]].breakpoints[1]
    @extracted_tokens[name] = string[from...to]
  end

  if block_given?
    yield @extracted_tokens
  end

  # return self for chaining
  self
end

#tokensObject



29
30
31
# File 'lib/ruby-tokenizer-each-char.rb', line 29

# Per-instance cache of the class-level token list.
def tokens
  @tokens = self.class.tokens if @tokens.nil?
  @tokens
end