Class: StringEater::CTokenizer
- Inherits:
-
Object
- Object
- StringEater::CTokenizer
- Defined in:
- lib/c-tokenizer.rb,
ext/string-eater/c-tokenizer.c
Direct Known Subclasses
Class Method Summary collapse
- .add_field(name, opts = {}) ⇒ Object
-
.dup_tokens ⇒ Object
This is very slow, only do it when necessary.
- .look_for(tokens) ⇒ Object
- .tokens ⇒ Object
Instance Method Summary collapse
- #ctokenize!(string, tokens_to_find_indexes, tokens_to_find_strings, tokens_to_extract_indexes, tokens_to_extract_names) ⇒ Object
- #describe_line ⇒ Object
- #do_extra_parsing ⇒ Object
- #extract_all_fields ⇒ Object
- #extract_fields(*fields) ⇒ Object
- #extract_no_fields ⇒ Object
-
#initialize ⇒ CTokenizer
constructor
A new instance of CTokenizer.
-
#refresh_tokens ⇒ Object
This is very slow, only do it once before processing.
- #tokenize!(string, &block) ⇒ Object
- #tokens ⇒ Object
Constructor Details
#initialize ⇒ CTokenizer
Returns a new instance of CTokenizer.
22 23 24 |
# File 'lib/c-tokenizer.rb', line 22

# Build the per-instance token tables as soon as the tokenizer is created.
def initialize
  refresh_tokens
end
Class Method Details
.add_field(name, opts = {}) ⇒ Object
8 9 10 11 |
# File 'lib/c-tokenizer.rb', line 8

# Register a named field token on the class and define a reader method of
# the same name that returns the value extracted for that field by the
# most recent tokenize! call.
def self.add_field(name, opts = {})
  tokens << StringEater::Token.new_field(name, opts)
  define_method(name) { @extracted_tokens[name] }
end
.dup_tokens ⇒ Object
This is very slow; only do it when necessary.
18 19 20 |
# File 'lib/c-tokenizer.rb', line 18

# Deep-copy the class-level token list via a Marshal round trip.
# This is very slow; only do it when necessary.
def self.dup_tokens
  serialized = Marshal.dump(tokens)
  Marshal.load(serialized)
end
.look_for(tokens) ⇒ Object
13 14 15 |
# File 'lib/c-tokenizer.rb', line 13

# Append a separator token for the given string(s).
# The parameter shadows the class-level +tokens+ accessor, so the
# receiver must stay explicit on the left-hand side.
def self.look_for(tokens)
  self.tokens << StringEater::Token.new_separator(tokens)
end
.tokens ⇒ Object
4 5 6 |
# File 'lib/c-tokenizer.rb', line 4

# Lazily-initialized class-level token list shared by add_field/look_for.
def self.tokens
  @tokens ||= []
end
Instance Method Details
#ctokenize!(string, tokens_to_find_indexes, tokens_to_find_strings, tokens_to_extract_indexes, tokens_to_extract_names) ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
# File 'ext/string-eater/c-tokenizer.c', line 12
/*
 * ctokenize!(string, tokens_to_find_indexes, tokens_to_find_strings,
 *            tokens_to_extract_indexes, tokens_to_extract_names) -> Hash
 *
 * Single left-to-right pass over +string+.  The separator strings in
 * tokens_to_find_strings are searched for strictly in order; the text
 * between consecutive separators is a field, and a field whose absolute
 * token index appears in tokens_to_extract_indexes is copied into the
 * returned hash under the matching entry of tokens_to_extract_names
 * (names are indexed by absolute token position, not extraction order —
 * see the curr_token_ix - 1 lookup below).
 *
 * NOTE(review): assumes tokens_to_extract_indexes is non-empty; the
 * unconditional NUM2UINT on its first entry below would raise for nil.
 * The Ruby caller guards this with @have_tokens_to_extract.
 */
static VALUE tokenize_string(VALUE self,
VALUE string,
VALUE tokens_to_find_indexes,
VALUE tokens_to_find_strings,
VALUE tokens_to_extract_indexes,
VALUE tokens_to_extract_names)
{
const char* input_string = StringValueCStr(string);
VALUE extracted_tokens = rb_hash_new();
VALUE curr_token;
unsigned int curr_token_ix;
long n_tokens_to_find = RARRAY_LEN(tokens_to_find_indexes);
size_t str_len = strlen(input_string);
size_t ix;
char c;
char looking_for;
size_t looking_for_len;
size_t looking_for_ix = 0;
long find_ix = 0;
const char* looking_for_token;
unsigned int n_tokens = (unsigned int)RARRAY_LEN(rb_iv_get(self, "@tokens"));
size_t startpoint = 0;
long n_tokens_to_extract = RARRAY_LEN(tokens_to_extract_indexes);
long last_token_extracted_ix = 0;
long next_token_to_extract_ix = NUM2UINT(rb_ary_entry(tokens_to_extract_indexes, last_token_extracted_ix));
/* prime the matcher with the first separator string */
curr_token = rb_ary_entry(tokens_to_find_strings, find_ix);
curr_token_ix = NUM2UINT(rb_ary_entry(tokens_to_find_indexes, find_ix));
looking_for_token = StringValueCStr(curr_token);
looking_for_len = strlen(looking_for_token);
looking_for = looking_for_token[looking_for_ix];
/*
 * NOTE(review): the character matcher below is naive.  Two visible
 * limitations to confirm against real token usage:
 * 1. The "entering new token" branch — including any pending extraction —
 *    fires on ANY match of a separator's first character, even if the rest
 *    of the separator fails to match afterwards (a false start moves
 *    startpoint and can consume an extraction slot early).
 * 2. On a mid-separator mismatch the current character is not re-tested
 *    against the separator's first character, so overlapping matches such
 *    as finding "ab" inside "aab" are missed.
 * Both are only safe when separator strings cannot partially occur inside
 * field data.
 */
for(ix = 0; ix < str_len; ix++)
{
c = input_string[ix];
if(c == looking_for)
{
if(looking_for_ix == 0)
{
/* entering new token */
if(curr_token_ix > 0)
{
/* extract, if necessary */
if((curr_token_ix - 1) == next_token_to_extract_ix)
{
/* advance to the next requested extraction index (-1 = none left) */
last_token_extracted_ix++;
if(last_token_extracted_ix < n_tokens_to_extract)
{
next_token_to_extract_ix = NUM2UINT(rb_ary_entry(tokens_to_extract_indexes, last_token_extracted_ix));
}
else
{
next_token_to_extract_ix = -1;
}
/* field preceding this separator: bytes [startpoint, ix) */
rb_hash_aset(extracted_tokens,
rb_ary_entry(tokens_to_extract_names, curr_token_ix - 1),
rb_usascii_str_new(input_string + startpoint,
ix - startpoint));
}
}
startpoint = ix;
}
/* NOTE(review): assumes separator strings are non-empty;
 * looking_for_len == 0 would underflow this size_t comparison. */
if(looking_for_ix >= looking_for_len - 1)
{
/* leaving token */
if(curr_token_ix >= n_tokens-1)
{
/* last token in the schema is this separator itself */
break;
}
else
{
/* next field begins just past the separator */
startpoint = ix + 1;
}
/* next token */
find_ix++;
if(find_ix >= n_tokens_to_find)
{
/* done! */
break;
}
curr_token = rb_ary_entry(tokens_to_find_strings, find_ix);
curr_token_ix = NUM2UINT(rb_ary_entry(tokens_to_find_indexes, find_ix));
looking_for_token = StringValueCStr(curr_token);
looking_for_len = strlen(looking_for_token);
looking_for_ix = 0;
}
else
{
looking_for_ix++;
}
looking_for = looking_for_token[looking_for_ix];
}
else
{
/* reset token (or just keep looking, which is also fine) */
looking_for_ix = 0;
looking_for = looking_for_token[looking_for_ix];
}
}
/* ix < str_len only when the loop exited via break, i.e. every separator
 * was found; the remainder of the string is then the final token, which
 * is extracted if it was requested. */
curr_token_ix = n_tokens - 1;
if(ix < str_len && curr_token_ix == next_token_to_extract_ix)
{
rb_hash_aset(extracted_tokens,
rb_ary_entry(tokens_to_extract_names, curr_token_ix),
rb_usascii_str_new(input_string + startpoint,
str_len - startpoint));
}
return extracted_tokens;
}
|
#describe_line ⇒ Object
76 77 78 79 80 |
# File 'lib/c-tokenizer.rb', line 76

# Render a one-line schematic of the token sequence.
# NOTE(review): `t.name.to_s` is never nil, so the trailing
# `|| "xxxxxx"` fallback is unreachable — confirm original intent.
def describe_line
  line = ""
  tokens.each do |t|
    line << (t.string || t.name.to_s || "xxxxxx")
  end
  line
end
#do_extra_parsing ⇒ Object
82 83 |
# File 'lib/c-tokenizer.rb', line 82

# Hook for subclasses, called after each tokenize! pass. No-op by default.
def do_extra_parsing
  # intentionally empty
end
#extract_all_fields ⇒ Object
30 31 32 33 34 35 |
# File 'lib/c-tokenizer.rb', line 30

# Mark every named token for extraction, then rebuild the token tables.
def extract_all_fields
  @token_filter = ->(token) { token.opts[:extract] = true if token.name }
  refresh_tokens
end
#extract_fields(*fields) ⇒ Object
44 45 46 47 48 49 |
# File 'lib/c-tokenizer.rb', line 44

# Restrict extraction to exactly the given field names, then rebuild
# the token tables.
def extract_fields(*fields)
  @token_filter = ->(token) { token.opts[:extract] = fields.include?(token.name) }
  refresh_tokens
end
#extract_no_fields ⇒ Object
37 38 39 40 41 42 |
# File 'lib/c-tokenizer.rb', line 37

# Switch off extraction for every named token, then rebuild the token tables.
def extract_no_fields
  @token_filter = ->(token) { token.opts[:extract] = false if token.name }
  refresh_tokens
end
#refresh_tokens ⇒ Object
This is very slow; only do it once before processing.
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/c-tokenizer.rb', line 52

# Rebuild the per-instance lookup tables consumed by the C extension.
# This is very slow; only do it once before processing.
def refresh_tokens
  @tokens = self.class.dup_tokens
  @tokens.each { |t| @token_filter.call(t) } if @token_filter

  find_pairs = []
  tokens.each_with_index do |t, i|
    find_pairs << [i, t.string] if t.string
  end
  @tokens_to_find_indexes = find_pairs.map(&:first)
  @tokens_to_find_strings = find_pairs.map(&:last)

  extract_pairs = []
  tokens.each_with_index do |t, i|
    extract_pairs << [i, t.name] if t.extract?
  end
  @tokens_to_extract_indexes = extract_pairs.map(&:first)
  # The C extension looks names up by absolute token position, so the
  # full name list is kept here, not just the extracted subset.
  @tokens_to_extract_names = tokens.map(&:name)
  @have_tokens_to_extract = !@tokens_to_extract_indexes.empty?
end
#tokenize!(string, &block) ⇒ Object
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/c-tokenizer.rb', line 85

# Run the C tokenizer over +string+, populate @extracted_tokens, invoke
# the do_extra_parsing hook, and yield the extracted hash if a block is
# given. Returns self for chaining — except when nothing is marked for
# extraction, where it returns nil early (preserved original behavior).
def tokenize!(string, &block)
  @string = string
  @extracted_tokens ||= {}
  @extracted_tokens.clear
  return unless @have_tokens_to_extract

  @extracted_tokens = ctokenize!(@string,
                                 @tokens_to_find_indexes,
                                 @tokens_to_find_strings,
                                 @tokens_to_extract_indexes,
                                 @tokens_to_extract_names)
  # extra parsing hook
  do_extra_parsing
  yield @extracted_tokens if block_given?
  # return self for chaining
  self
end
#tokens ⇒ Object
26 27 28 |
# File 'lib/c-tokenizer.rb', line 26

# The instance's (possibly filtered) token list, built by #refresh_tokens.
def tokens
  @tokens
end