Class: StringEater::RubyTokenizer
- Inherits: Object
- Ancestry: Object → StringEater::RubyTokenizer
- Defined in:
- lib/ruby-tokenizer.rb
Overview
This tokenizer is fairly fast, but not necessarily faster than regexps.
Class Method Summary collapse
- .add_field(name, opts = {}) ⇒ Object
- .combine_fields(opts = {}) ⇒ Object
- .combined_tokens ⇒ Object
- .look_for(tokens) ⇒ Object
- .tokens ⇒ Object
Instance Method Summary collapse
- #combined_tokens ⇒ Object
- #describe_line ⇒ Object
- #find_breakpoints(string) ⇒ Object
- #refresh_tokens ⇒ Object
- #tokenize!(string, &block) ⇒ Object
- #tokens ⇒ Object
Class Method Details
.add_field(name, opts = {}) ⇒ Object
# File 'lib/ruby-tokenizer.rb', line 11

# Register an extractable field called +name+ and define a reader
# method of the same name that returns the extracted value after
# tokenize! has run.
def self.add_field name, opts={}
  field = StringEater::Token.new_field(name, opts)
  self.tokens.push(field)
  define_method(name) { @extracted_tokens[name] }
end
.combine_fields(opts = {}) ⇒ Object
# File 'lib/ruby-tokenizer.rb', line 20

# Declare a virtual field named opts[:as] spanning from the start of the
# token named opts[:from] to the end of the token named opts[:to], and
# define a reader method for it (available after tokenize!).
#
# Raises ArgumentError when either endpoint token has not been declared.
# (Previously Array#index returned nil for unknown names and the nil
# indexes were stored silently, only to raise NoMethodError much later
# inside tokenize! — fail fast at declaration time instead.)
def self.combine_fields opts={}
  from_token_index = self.tokens.index { |t| t.name == opts[:from] }
  to_token_index = self.tokens.index { |t| t.name == opts[:to] }
  if from_token_index.nil? || to_token_index.nil?
    raise ArgumentError,
          "combine_fields: unknown token(s) #{opts[:from].inspect}..#{opts[:to].inspect}"
  end
  self.combined_tokens << [opts[:as], from_token_index, to_token_index]
  define_method(opts[:as]) { @extracted_tokens[opts[:as]] }
end
.combined_tokens ⇒ Object
# File 'lib/ruby-tokenizer.rb', line 7

# Class-level registry of combined (virtual) fields, lazily created.
# Each entry is [name, from_token_index, to_token_index].
def self.combined_tokens
  @combined_tokens || (@combined_tokens = [])
end
.look_for(tokens) ⇒ Object
# File 'lib/ruby-tokenizer.rb', line 16

# Append a separator token built from +tokens+ to the class token list.
# (The parameter shadows the class-level tokens accessor, hence the
# explicit self on the receiver side.)
def self.look_for tokens
  separator = StringEater::Token.new_separator(tokens)
  self.tokens.push(separator)
end
.tokens ⇒ Object
# File 'lib/ruby-tokenizer.rb', line 3

# Class-level token registry, lazily created on first access.
def self.tokens
  @tokens || (@tokens = [])
end
Instance Method Details
#combined_tokens ⇒ Object
# File 'lib/ruby-tokenizer.rb', line 31

# Instance-level view of the combined-field definitions; falls back to
# (and caches) the class-level registry on first access.
def combined_tokens
  @combined_tokens || (@combined_tokens = self.class.combined_tokens)
end
#describe_line ⇒ Object
# File 'lib/ruby-tokenizer.rb', line 41

# Build a one-line human-readable description of the tokenization:
# each token contributes its literal string if it has one, its name
# otherwise, and the "xxxxxx" placeholder when it has neither.
#
# Fix: the original wrote (t.string || t.name.to_s || "xxxxxx"); since
# nil.to_s == "" is always truthy, the placeholder branch was
# unreachable and nameless tokens rendered as empty strings. Guard the
# nil name so the placeholder is actually used.
def describe_line
  tokens.inject("") do |desc, t|
    desc << (t.string || (t.name && t.name.to_s) || "xxxxxx")
  end
end
#find_breakpoints(string) ⇒ Object
# File 'lib/ruby-tokenizer.rb', line 47

# Compute and return the array of string indices that delimit tokens in
# +string+. Layout: @breakpoints[0] is 0, @breakpoints[-1] is
# string.length, and each literal token i fills slots 2*i+1 (where its
# match ends/starts — determined by find_end_of) and 2*i+2 (the scan
# position after it).
#
# NOTE(review): @literal_tokens and @breakpoints are memoized, so the
# token set and its size are assumed stable across calls on the same
# instance — call refresh_tokens if the class definition changes.
# find_end_of is defined elsewhere; it presumably returns a
# [breakpoint, next_start] pair — confirm against the full source.
def find_breakpoints(string)
  @literal_tokens ||= tokens.select{|t| t.string}
  @breakpoints ||= Array.new(2*@literal_tokens.size + 2)
  @breakpoints[0] = 0
  @breakpoints[-1] = string.length
  start_point = 0
  @literal_tokens.each_with_index do |t, i|
    @breakpoints[2*i+1], start_point = find_end_of(t, string, start_point)
    @breakpoints[2*i+2] = start_point
  end
  @breakpoints
end
#refresh_tokens ⇒ Object
# File 'lib/ruby-tokenizer.rb', line 35

# Drop the instance-level token caches and rebuild them from the class,
# returning the fresh token list.
def refresh_tokens
  @tokens = nil
  @combined_tokens = nil
  tokens
end
#tokenize!(string, &block) ⇒ Object
# File 'lib/ruby-tokenizer.rb', line 60

# Tokenize +string+: locate the breakpoints, assign a [start, end) pair
# to each token, and extract the substrings for every extractable and
# combined field into @extracted_tokens. Yields the extracted-token
# hash when a block is given. Returns self for chaining.
#
# Fix: the combined-field loop read the @tokens ivar directly, which
# only worked because earlier calls in this method happened to memoize
# it via the tokens accessor; use the accessor consistently so the
# lazy initialization (and any subclass override) is respected.
def tokenize! string, &block
  @extracted_tokens ||= {}
  @extracted_tokens.clear
  @tokens_to_extract ||= tokens.select{|t| t.extract?}

  find_breakpoints(string)

  # Never index past whichever of the two arrays is shorter.
  last_important_bp = [@breakpoints.length, tokens.size].min
  (0...last_important_bp).each do |i|
    tokens[i].breakpoints = [@breakpoints[i], @breakpoints[i+1]]
  end

  @tokens_to_extract.each do |t|
    @extracted_tokens[t.name] = string[t.breakpoints[0]...t.breakpoints[1]]
  end

  # combiner is [name, from_token_index, to_token_index].
  combined_tokens.each do |combiner|
    name = combiner[0]
    from = tokens[combiner[1]].breakpoints[0]
    to = tokens[combiner[2]].breakpoints[1]
    @extracted_tokens[name] = string[from...to]
  end

  yield @extracted_tokens if block_given?

  # return self for chaining
  self
end
#tokens ⇒ Object
# File 'lib/ruby-tokenizer.rb', line 27

# Instance-level view of the token list; falls back to (and caches)
# the class-level registry on first access.
def tokens
  @tokens || (@tokens = self.class.tokens)
end