Class: StringEater::CTokenizer

Inherits:

Object

Object
StringEater::CTokenizer

show all

Defined in: lib/c-tokenizer.rb,
ext/string-eater/c-tokenizer.c

Direct Known Subclasses

Tokenizer

Class Method Summary collapse

.add_field(name, opts = {}) ⇒ Object
.dup_tokens ⇒ Object

This is very slow; only do it when necessary.
.look_for(tokens) ⇒ Object
.tokens ⇒ Object

Instance Method Summary collapse

#ctokenize!(string, tokens_to_find_indexes, tokens_to_find_strings, tokens_to_extract_indexes, tokens_to_extract_names) ⇒ Object
#describe_line ⇒ Object
#do_extra_parsing ⇒ Object
#extract_all_fields ⇒ Object
#extract_fields(*fields) ⇒ Object
#extract_no_fields ⇒ Object
#initialize ⇒ CTokenizer constructor

A new instance of CTokenizer.
#refresh_tokens ⇒ Object

This is very slow; only do it once, before processing.
#tokenize!(string, &block) ⇒ Object
#tokens ⇒ Object

Constructor Details

#initialize ⇒ `CTokenizer`

Returns a new instance of CTokenizer.



22
23
24

# File 'lib/c-tokenizer.rb', line 22

# Builds a tokenizer instance by immediately calling refresh_tokens, so the
# per-instance token state is populated before the first tokenize! call.
def initialize
  refresh_tokens
end

Class Method Details

.add_field(name, opts = {}) ⇒ `Object`

# File 'lib/c-tokenizer.rb', line 8

# Declares a named field on this tokenizer class.
#
# Appends the token built by StringEater::Token.new_field(name, opts) to the
# class-level token list, then defines an instance reader called +name+ that
# looks +name+ up in the instance's @extracted_tokens.
def self.add_field(name, opts = {})
  self.tokens.push(StringEater::Token.new_field(name, opts))
  define_method(name) do
    @extracted_tokens[name]
  end
end

.dup_tokens ⇒ `Object`

This is very slow; only do it when necessary.



18
19
20

# File 'lib/c-tokenizer.rb', line 18

# Returns a deep copy of the class-level token list, produced by a Marshal
# dump/load round trip so the copy shares no objects with the original.
def self.dup_tokens
  serialized = Marshal.dump(tokens)
  Marshal.load(serialized)
end

.look_for(tokens) ⇒ `Object`



13
14
15

# File 'lib/c-tokenizer.rb', line 13

# Declares a literal separator to search for.
#
# Appends the token built by StringEater::Token.new_separator(tokens) to the
# class-level token list and returns that list.
def self.look_for(tokens)
  self.tokens.push(StringEater::Token.new_separator(tokens))
end

.tokens ⇒ `Object`



4
5
6

# File 'lib/c-tokenizer.rb', line 4

# Returns the class-level token list, creating an empty Array the first time
# it is requested. The same Array object is returned on every later call.
def self.tokens
  @tokens = [] unless @tokens
  @tokens
end

Instance Method Details

#ctokenize!(string, tokens_to_find_indexes, tokens_to_find_strings, tokens_to_extract_indexes, tokens_to_extract_names) ⇒ `Object`

# File 'ext/string-eater/c-tokenizer.c', line 12

/*
 * Scans `string` for the separator strings in tokens_to_find_strings, in
 * order, and returns a Hash of the fields found between them.
 *
 *   tokens_to_find_indexes    - for each separator, its position in @tokens
 *   tokens_to_find_strings    - the separator text, parallel to the above
 *   tokens_to_extract_indexes - positions in @tokens of the fields to
 *                               extract, consumed in order
 *   tokens_to_extract_names   - hash keys, indexed by position in @tokens
 *
 * A field is extracted only once the separator that follows it has been
 * matched in full; the field then spans from the end of the previous
 * separator (or the start of the string) to the start of that match. A
 * trailing field (position n_tokens - 1) is extracted only if the scan
 * stopped before the end of the input, i.e. every separator was found.
 *
 * Extracted values are created with rb_usascii_str_new.
 * NOTE(review): that presumably tags them US-ASCII regardless of the
 * encoding of `string` - confirm whether that is intended.
 *
 * The caller must pass at least one separator and at least one index to
 * extract; the unconditional reads of element 0 below expect that.
 */
static VALUE tokenize_string(VALUE self,
    VALUE string,
    VALUE tokens_to_find_indexes,
    VALUE tokens_to_find_strings,
    VALUE tokens_to_extract_indexes,
    VALUE tokens_to_extract_names)
{
  const char* input_string = StringValueCStr(string);
  VALUE extracted_tokens = rb_hash_new();
  VALUE curr_token;
  unsigned int curr_token_ix;
  long n_tokens_to_find = RARRAY_LEN(tokens_to_find_indexes);
  size_t str_len = strlen(input_string);
  size_t ix;
  char c;
  char looking_for;
  size_t looking_for_len;
  size_t looking_for_ix = 0;
  long find_ix = 0;
  const char*  looking_for_token;
  unsigned int n_tokens = (unsigned int)RARRAY_LEN(rb_iv_get(self, "@tokens"));

  /* start of the field currently being scanned */
  size_t startpoint = 0;
  /* where the separator match currently in progress began */
  size_t match_start = 0;

  long n_tokens_to_extract = RARRAY_LEN(tokens_to_extract_indexes);
  long last_token_extracted_ix = 0;

  long next_token_to_extract_ix = NUM2UINT(rb_ary_entry(tokens_to_extract_indexes, last_token_extracted_ix));

  curr_token = rb_ary_entry(tokens_to_find_strings, find_ix);
  curr_token_ix = NUM2UINT(rb_ary_entry(tokens_to_find_indexes, find_ix));
  looking_for_token = StringValueCStr(curr_token);
  looking_for_len = strlen(looking_for_token);
  looking_for = looking_for_token[looking_for_ix];

  for(ix = 0; ix < str_len; ix++)
  {
    c = input_string[ix];
    if(c == looking_for)
    {
      if(looking_for_ix == 0)
      {
        /* possible start of the separator; nothing is committed until the
         * whole separator has matched */
        match_start = ix;
      }
      if(looking_for_ix >= looking_for_len - 1)
      {
        /* separator fully matched: the field before it is
         * [startpoint, match_start) */
        if(curr_token_ix > 0 &&
            (curr_token_ix - 1) == next_token_to_extract_ix)
        {
          last_token_extracted_ix++;
          if(last_token_extracted_ix < n_tokens_to_extract)
          {
            next_token_to_extract_ix = NUM2UINT(rb_ary_entry(tokens_to_extract_indexes, last_token_extracted_ix));
          }
          else
          {
            /* nothing left to extract */
            next_token_to_extract_ix = -1;
          }
          rb_hash_aset(extracted_tokens,
              rb_ary_entry(tokens_to_extract_names, curr_token_ix - 1),
              rb_usascii_str_new(input_string + startpoint,
                match_start - startpoint));
        }

        /* leaving token */
        if(curr_token_ix >= n_tokens-1)
        {
          /* the last token overall is a separator, so nothing follows it */
          break;
        }
        else
        {
          /* the next field begins right after this separator */
          startpoint = ix + 1;
        }

        /* next token */
        find_ix++;
        if(find_ix >= n_tokens_to_find)
        {
          /* done! */
          break;
        }

        curr_token = rb_ary_entry(tokens_to_find_strings, find_ix);
        curr_token_ix = NUM2UINT(rb_ary_entry(tokens_to_find_indexes, find_ix));
        looking_for_token = StringValueCStr(curr_token);
        looking_for_len = strlen(looking_for_token);
        looking_for_ix = 0;
      }
      else
      {
        looking_for_ix++;
      }
      looking_for = looking_for_token[looking_for_ix];
    }
    else if(looking_for_ix > 0)
    {
      /* partial match failed: rewind so the scan resumes one character
       * after the failed candidate began (the loop increment supplies the
       * +1); otherwise a separator overlapping the failed attempt would be
       * missed */
      ix = match_start;
      looking_for_ix = 0;
      looking_for = looking_for_token[looking_for_ix];
    }
  }

  curr_token_ix = n_tokens - 1;

  /* ix < str_len means the loop exited through a break, i.e. the required
   * separators were all found; only then is a trailing field meaningful */
  if(ix < str_len && curr_token_ix == next_token_to_extract_ix)
  {
    rb_hash_aset(extracted_tokens,
        rb_ary_entry(tokens_to_extract_names, curr_token_ix),
        rb_usascii_str_new(input_string + startpoint,
          str_len - startpoint));
  }

  return extracted_tokens;
}

#describe_line ⇒ `Object`

# File 'lib/c-tokenizer.rb', line 76

# Builds a one-line description of the token layout by concatenating, for each
# token in order, its separator string if it has one, otherwise its name, and
# otherwise the placeholder "xxxxxx".
#
# The name is converted with to_s only when present: nil.to_s is "" (truthy),
# which would otherwise make the placeholder unreachable.
#
# Returns a new String.
def describe_line
  tokens.map do |t|
    t.string || (t.name && t.name.to_s) || "xxxxxx"
  end.join
end

#do_extra_parsing ⇒ `Object`



82
83

# File 'lib/c-tokenizer.rb', line 82

# Post-processing hook invoked by tokenize! after the fields have been
# extracted. This implementation does nothing and returns nil; presumably
# subclasses override it to add their own parsing.
def do_extra_parsing
  nil
end

#extract_all_fields ⇒ `Object`

# File 'lib/c-tokenizer.rb', line 30

# Selects every named token for extraction.
#
# Installs a token filter that sets opts[:extract] to true on each token that
# has a name (tokens without a name are left untouched), then rebuilds the
# instance's token state via refresh_tokens and returns its result.
def extract_all_fields
  @token_filter = lambda do |token|
    next unless token.name

    token.opts[:extract] = true
  end
  refresh_tokens
end

#extract_fields(*fields) ⇒ `Object`

# File 'lib/c-tokenizer.rb', line 44

# Selects exactly the given fields for extraction.
#
# Installs a token filter that sets opts[:extract] on every token to whether
# the token's name is one of +fields+ (so all other tokens are switched off),
# then rebuilds the instance's token state via refresh_tokens and returns its
# result.
def extract_fields(*fields)
  @token_filter = lambda do |token|
    wanted = fields.include?(token.name)
    token.opts[:extract] = wanted
  end
  refresh_tokens
end

#extract_no_fields ⇒ `Object`

# File 'lib/c-tokenizer.rb', line 37

# Deselects every named token.
#
# Installs a token filter that sets opts[:extract] to false on each token that
# has a name (tokens without a name are left untouched), then rebuilds the
# instance's token state via refresh_tokens and returns its result.
def extract_no_fields
  @token_filter = lambda do |token|
    next unless token.name

    token.opts[:extract] = false
  end
  refresh_tokens
end

#refresh_tokens ⇒ `Object`

This is very slow; only do it once, before processing.

# File 'lib/c-tokenizer.rb', line 52

# Rebuilds this instance's token state from the class-level definition.
#
# Takes a fresh copy of the class tokens (self.class.dup_tokens), runs the
# current @token_filter over each token if one is set, and then derives the
# parallel arrays handed to ctokenize!:
#
#   @tokens_to_find_indexes    - positions of tokens that have a string
#   @tokens_to_find_strings    - those strings, in the same order
#   @tokens_to_extract_indexes - positions of tokens whose extract? is truthy
#   @tokens_to_extract_names   - the name of EVERY token, indexed by position
#   @have_tokens_to_extract    - whether any token is selected for extraction
#
# Returns the value of @have_tokens_to_extract.
def refresh_tokens
  @tokens = self.class.dup_tokens
  @tokens.each { |token| @token_filter.call(token) } if @token_filter

  separators = tokens.each_with_index.select { |token, _position| token.string }
  @tokens_to_find_indexes = separators.map { |_token, position| position }
  @tokens_to_find_strings = separators.map { |token, _position| token.string }

  @tokens_to_extract_indexes = tokens.each_index.select do |position|
    tokens[position].extract?
  end
  @tokens_to_extract_names = tokens.map { |token| token.name }

  @have_tokens_to_extract = !@tokens_to_extract_indexes.empty?
end

#tokenize!(string, &block) ⇒ `Object`

# File 'lib/c-tokenizer.rb', line 85

# Tokenizes +string+ and stores the extracted fields on this instance.
#
# Remembers the input in @string, empties the previous result, and - when at
# least one field is selected for extraction - replaces @extracted_tokens with
# the Hash returned by ctokenize!, runs the do_extra_parsing hook, and yields
# the Hash to the block if one was given.
#
# Always returns self so calls can be chained. This includes the case where no
# fields are selected, in which the extraction, the hook and the block are all
# skipped and @extracted_tokens is left empty.
def tokenize!(string, &block)
  @string = string
  @extracted_tokens ||= {}
  @extracted_tokens.clear

  # Nothing selected: skip the C call, but still honour the chaining contract.
  return self unless @have_tokens_to_extract

  @extracted_tokens = ctokenize!(@string,
                                 @tokens_to_find_indexes,
                                 @tokens_to_find_strings,
                                 @tokens_to_extract_indexes,
                                 @tokens_to_extract_names)

  # extra parsing hook
  do_extra_parsing

  yield @extracted_tokens if block_given?

  # return self for chaining
  self
end

#tokens ⇒ `Object`



26
27
28

# File 'lib/c-tokenizer.rb', line 26

# Returns this instance's token list (@tokens), i.e. the per-instance copy
# assigned by refresh_tokens. Returns nil if it has not been assigned yet.
def tokens
  @tokens
end

Class: StringEater::CTokenizer

Direct Known Subclasses

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ CTokenizer

Class Method Details

.add_field(name, opts = {}) ⇒ Object

.dup_tokens ⇒ Object

.look_for(tokens) ⇒ Object

.tokens ⇒ Object

Instance Method Details

#ctokenize!(string, tokens_to_find_indexes, tokens_to_find_strings, tokens_to_extract_indexes, tokens_to_extract_names) ⇒ Object

#describe_line ⇒ Object

#do_extra_parsing ⇒ Object

#extract_all_fields ⇒ Object

#extract_fields(*fields) ⇒ Object

#extract_no_fields ⇒ Object

#refresh_tokens ⇒ Object

#tokenize!(string, &block) ⇒ Object

#tokens ⇒ Object

#initialize ⇒ `CTokenizer`

.add_field(name, opts = {}) ⇒ `Object`

.dup_tokens ⇒ `Object`

.look_for(tokens) ⇒ `Object`

.tokens ⇒ `Object`

#ctokenize!(string, tokens_to_find_indexes, tokens_to_find_strings, tokens_to_extract_indexes, tokens_to_extract_names) ⇒ `Object`

#describe_line ⇒ `Object`

#do_extra_parsing ⇒ `Object`

#extract_all_fields ⇒ `Object`

#extract_fields(*fields) ⇒ `Object`

#extract_no_fields ⇒ `Object`

#refresh_tokens ⇒ `Object`

#tokenize!(string, &block) ⇒ `Object`

#tokens ⇒ `Object`