Class: StringEater::RubyTokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/ruby-tokenizer.rb

Overview

This tokenizer is fairly fast, but not necessarily faster than regexps.

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.add_field(name, opts = {}) ⇒ Object



11
12
13
14
# File 'lib/ruby-tokenizer.rb', line 11

# Register an extractable field token under +name+ and expose a reader
# method of the same name that returns the field's extracted value.
def self.add_field name, opts={}
  field = StringEater::Token::new_field(name, opts)
  tokens.push(field)
  define_method(name) { @extracted_tokens[name] }
end

.combine_fields(opts = {}) ⇒ Object



20
21
22
23
24
25
# File 'lib/ruby-tokenizer.rb', line 20

# Define a combined token spanning from the start of the :from field to
# the end of the :to field, exposed on instances under the :as name.
#
# opts - Hash with :from and :to (names of previously added tokens) and
#        :as (name of the combined accessor to define).
#
# Raises ArgumentError if :from or :to does not name a known token --
# previously a nil index was stored silently and only surfaced later as
# an opaque NoMethodError inside tokenize!.
def self.combine_fields opts={}
  from_token_index = self.tokens.index{|t| t.name == opts[:from]}
  to_token_index = self.tokens.index{|t| t.name == opts[:to]}
  if from_token_index.nil? || to_token_index.nil?
    missing = from_token_index.nil? ? opts[:from] : opts[:to]
    raise ArgumentError, "combine_fields: unknown token name #{missing.inspect}"
  end
  self.combined_tokens << [opts[:as], from_token_index, to_token_index]
  define_method(opts[:as]) {@extracted_tokens[opts[:as]]}
end

.combined_tokensObject



7
8
9
# File 'lib/ruby-tokenizer.rb', line 7

# Class-level registry of combined-token definitions, each stored as a
# [name, from_index, to_index] triple. Lazily initialized.
def self.combined_tokens
  @combined_tokens = [] if @combined_tokens.nil?
  @combined_tokens
end

.look_for(tokens) ⇒ Object



16
17
18
# File 'lib/ruby-tokenizer.rb', line 16

# Register a literal separator to split fields on. The parameter shadows
# the +tokens+ class method, so the registry is reached via self.tokens.
def self.look_for tokens
  separator = StringEater::Token::new_separator(tokens)
  self.tokens << separator
end

.tokensObject



3
4
5
# File 'lib/ruby-tokenizer.rb', line 3

# Class-level ordered registry of tokens (fields and separators).
# Lazily initialized to an empty Array.
def self.tokens
  @tokens = [] if @tokens.nil?
  @tokens
end

Instance Method Details

#combined_tokensObject



31
32
33
# File 'lib/ruby-tokenizer.rb', line 31

# Instance-level cache of the class's combined-token definitions.
def combined_tokens
  @combined_tokens = self.class.combined_tokens if @combined_tokens.nil?
  @combined_tokens
end

#describe_lineObject



41
42
43
44
45
# File 'lib/ruby-tokenizer.rb', line 41

# Build a human-readable template of a line: literal separators appear
# as their string, extractable fields as their name. (The "xxxxxx"
# fallback is effectively unreachable, since name.to_s never yields nil.)
def describe_line
  tokens.reduce("") do |description, token|
    part = token.string || token.name.to_s || "xxxxxx"
    description << part
  end
end

#find_breakpoints(string) ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/ruby-tokenizer.rb', line 47

# Locate each literal separator token in +string+ and record its start
# and end offsets, plus the overall string bounds, into @breakpoints.
# Consecutive breakpoint pairs then delimit the individual tokens.
#
# NOTE(review): @literal_tokens and @breakpoints are memoized, so this
# assumes the token list does not change between calls (use
# refresh_tokens to reset the caches). Relies on #find_end_of (defined
# elsewhere in this file) returning a [match_start, match_end] pair for
# a separator within +string+ -- TODO confirm against its definition.
def find_breakpoints(string)
  @literal_tokens ||= tokens.select{|t| t.string}
  # Two breakpoints per separator, plus the string's own two endpoints.
  @breakpoints ||= Array.new(2*@literal_tokens.size + 2)
  @breakpoints[0] = 0
  @breakpoints[-1] = string.length
  start_point = 0
  @literal_tokens.each_with_index do |t, i|
    # Scanning resumes at start_point so separators are found in order.
    @breakpoints[2*i+1], start_point = find_end_of(t, string, start_point)
    @breakpoints[2*i+2] = start_point
  end
  @breakpoints
end

#refresh_tokensObject



35
36
37
38
39
# File 'lib/ruby-tokenizer.rb', line 35

# Drop the instance-level token caches so they are re-read from the
# class on next access; returns the freshly fetched token list.
def refresh_tokens
  @tokens = nil
  @combined_tokens = nil
  tokens
end

#tokenize!(string, &block) ⇒ Object



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/ruby-tokenizer.rb', line 60

# Tokenize +string+: find separator breakpoints, slice the extractable
# fields (and any combined fields) into @extracted_tokens, and yield
# that Hash to the optional block.
#
# string - the line to tokenize.
# block  - optional; receives the Hash of extracted token values.
#
# Returns self so calls can be chained.
def tokenize! string, &block
  @extracted_tokens ||= {}
  @extracted_tokens.clear
  @tokens_to_extract ||= tokens.select{|t| t.extract?}

  find_breakpoints(string)

  # Assign each token the [start, end] offsets delimited by consecutive
  # breakpoints; stop at whichever list runs out first.
  last_important_bp = [@breakpoints.length, tokens.size].min
  (0...last_important_bp).each do |i|
    tokens[i].breakpoints = [@breakpoints[i], @breakpoints[i+1]]
  end

  @tokens_to_extract.each do |t|
    @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
  end

  # Combined fields span from the start of the :from token to the end of
  # the :to token. Use the tokens accessor (not @tokens directly, as the
  # original did) so the cache is guaranteed to be populated regardless
  # of call order.
  combined_tokens.each do |combiner|
    name = combiner[0]
    from = tokens[combiner[1]].breakpoints[0]
    to = tokens[combiner[2]].breakpoints[1]
    @extracted_tokens[name] = string[from...to]
  end

  yield @extracted_tokens if block_given?

  # return self for chaining
  self
end

#tokensObject



27
28
29
# File 'lib/ruby-tokenizer.rb', line 27

# Instance-level cache of the class's token registry.
def tokens
  @tokens = self.class.tokens if @tokens.nil?
  @tokens
end