Class: Tokenizer

Inherits:

Object

Object
Tokenizer

show all

Defined in: lib/tsql_shparser/tsql_tokenizer.rb

Overview

Class to tokenize a given string or file.

Constant Summary collapse

VERSION =

"0.0.1"

Instance Method Summary collapse

#current_token ⇒ Object
#get_next_token ⇒ Object
#initialize(file = nil) ⇒ Tokenizer constructor

A new instance of Tokenizer.
#look_back(m) ⇒ Object
#previous_token ⇒ Object
#tokenize_file(file = nil) ⇒ Object
#tokenize_string(str) ⇒ Object

Split the string into its sub-strings and return an array of triplets.
#unget_token ⇒ Object

Constructor Details

#initialize(file = nil) ⇒ `Tokenizer`

Returns a new instance of Tokenizer.

# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 130

# Builds a tokenizer with an empty token list and the read cursor at 0.
#
# @param file [String, nil] input file path; it is only stored here,
#   nothing is opened or read by the constructor
def initialize(file = nil)
    @tokens     = []
    @position   = 0
    @input_file = file
end

Instance Method Details

#current_token ⇒ `Object`

# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 356

# Returns whatever look_back yields for distance 0, i.e. the entry sitting
# at the read cursor (nil when there is none).
def current_token
  look_back(0)
end

#get_next_token ⇒ `Object`

# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 340

# Hands out the entry at the read cursor as a Token and then moves the
# cursor one step forward.
#
# @return [Token, nil] nil when the cursor is negative or no entry exists
#   at the cursor; the cursor is left untouched in that case
def get_next_token
  entry = @position < 0 ? nil : @tokens[@position]
  return entry unless entry

  # The cursor advances only after the Token has been built successfully.
  Token.new(*entry).tap { @position += 1 }
end

#look_back(m) ⇒ `Object`

# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 349

# Peeks at the entry m slots behind the read cursor without moving the
# cursor (m == 0 is the entry at the cursor itself).
#
# @param m [Integer] how far behind the cursor to look
# @return [Token, nil] nil when the cursor is smaller than m or no entry
#   exists at that slot
def look_back(m)
  return nil if @position < m

  entry = @tokens[@position - m]
  entry && Token.new(*entry)
end

#previous_token ⇒ `Object`

# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 357

# Returns whatever look_back yields for distance 1, i.e. the entry one slot
# behind the read cursor (nil when there is none).
def previous_token
  look_back(1)
end

#tokenize_file(file = nil) ⇒ `Object`

# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 328

# Reads the input file and tokenizes its whole content via tokenize_string.
#
# A path already stored in @input_file takes precedence over the argument
# (the argument is only used when no path has been stored yet). When no
# path is available at all, nothing is read.
#
# Any StandardError raised while reading or tokenizing (missing file,
# unmatched quote, ...) is printed to standard output and swallowed.
# Exceptions outside StandardError (Interrupt, SystemExit, ScriptError, ...)
# are no longer discarded: the former `return` inside `ensure` silently
# dropped those as well.
#
# @param file [String, nil] path of the file to tokenize
# @return [Integer] number of entries in @tokens after the call
def tokenize_file(file=nil)
  @input_file ||= file
  begin
    # File.read returns the same text as IO.readlines(...).join without
    # building a per-line array first.
    tokenize_string(File.read(@input_file)) if @input_file
  rescue StandardError => e
    puts e.to_s
  end
  @tokens.length
end

#tokenize_string(str) ⇒ `Object`

Split the string into its sub-strings and return an array of triplets. Each triplet contains the ending line number, ending column number and the sub-string (token string). A token string may spill over multiple lines.

# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 184

# Splits +str+ into token entries, stores them in @tokens and returns them.
#
# The string is consumed one character at a time by a small state machine.
# At most one of the five "inside" indicators (single-line comment,
# multi-line comment, single-quoted string, double-quoted string, bracketed
# name) is set at a time; while one is set, characters that would normally
# start another construct are treated as plain content. Comment text is
# dropped. Quoted and bracketed text is collected in +qtok+ and handed to
# tok_split with a fourth argument of +true+; everything else is collected
# in +atok+ and handed to tok_split whenever blank space, a line end or the
# start of a quoted/bracketed section is reached.
#
# A doubled single quote ('') inside a single-quoted string is kept as part
# of the string instead of ending it. NOTE(review): doubled `""` and `]]`
# still end their section, exactly as before.
#
# @param str [String] text to tokenize
# @return [Array] the accumulated results of tok_split (defined elsewhere;
#   presumably [line, column, text] triplets - confirm against tok_split)
# @raise [RuntimeError] when the text ends inside a quoted string, a
#   bracketed name or a multi-line comment
def tokenize_string(str)
  #puts str; puts
  stream = str.split('')

  slc = nil      # Single Line Comment indicator
  mlc = nil      # Multi  Line Comment indicator
  sq  = nil      # Single Quote indicator
  dq  = nil      # Double Quote indicator
  bkt = nil      # Bracket indicator
  
  qtok  = ""     # accumulator for quoted-string token 
  atok  = ""     # accumulator for all types of tokens except quoted-string
  qstr  = ""     # Final quoted string 
  tok_arr  = []  # token array
  
  col    = 0     # Column number of the token
  line   = 1     # Line number of the token
  i      = 1     # Current Position in the stream
  prev_c = ''    # Single character look behind
  
  while stream != []
      c = stream.shift
      #puts c
      case 
      when c =~ /[ \t]/
        # Blank space ends the pending plain token unless we are inside a
        # comment, string or bracket.
        unless (slc or mlc or sq or dq or bkt)
          tok_arr += tok_split(line,col,atok)
          atok = ""
        end
      when ((prev_c == '-') and (c == '-'))
        unless (slc or mlc or sq or dq or bkt)
          slc = i
          # The first '-' was already appended to atok; take it back.
          atok.chop!
          #puts "starting a single-line comment @ #{i}"
        end
      when ((prev_c == '/') and (c == '*'))
        unless (slc or mlc or sq or dq or bkt)
            mlc = i
            # The '/' was already appended to atok; take it back.
            atok.chop!
            #puts "starting a  multi-line comment @ #{i}"
        end
      when ((prev_c == '*') and (c == '/'))
        # mlc < i-1 makes sure the '*' is not the one that opened the
        # comment ("/*/" does not close it).
        if (mlc and (mlc < (i-1)))
          mlc = nil
          c = ''
          #puts "  ending a  multi-line comment @ #{i}"
        end
      when ((c == "\r") or (c == "\n"))       
        unless (slc or mlc or sq or dq or bkt)
          tok_arr += tok_split(line,col,atok)
          atok = ""
        end

        (col = 0; line += 1) if (c == "\n")
        # A line end closes a single-line comment; the line-end character
        # itself is discarded.
        if slc       
          slc = nil     
          c = ''
          #puts "  ending a single-line comment @ #{i}"     
        end
      when (c == "'")
        unless (slc or mlc or dq or bkt)
          if sq 
            if stream.first == "'"
              # Doubled quote: an escaped quote inside the literal, not its
              # end. Keep the first quote here and pull the second one off
              # the stream; the shared bookkeeping below appends it to qtok
              # and counts it, so only the extra character is counted here.
              qtok += c
              c = stream.shift
              col += 1
              i += 1
            else
              sq = nil
              qtok += c  
              c = ''
              qstr = qtok              
              #puts "   ending single-quote @ #{i}"
            end
          else
            sq = i
            
            # An N directly before the quote belongs to the literal
            # (N'...'), so move it from atok into qtok.
            if prev_c == 'N'
              qtok = 'N'
              atok.chop!
              temp_pos = col-1
            else
              qtok = ""
              temp_pos = col
            end
            tok_arr += tok_split(line,temp_pos,atok)
            atok = ""              
            #puts " starting single-quote @ #{i}"
          end
        end
      when (c == '"')
        unless (slc or mlc or sq or bkt)
          if dq 
            dq = nil
            qtok += c 
            c = ''
            qstr = qtok
            #puts "   ending double-quote @ #{i}"
          else
            dq = i
            qtok = ""
            tok_arr += tok_split(line,col,atok)
            atok = ""              
            #puts " starting double-quote @ #{i}"
          end
        end
      when (c == '[')
        unless (slc or mlc or sq or dq or bkt)
          bkt = i
          qtok = ""
          tok_arr += tok_split(line,col,atok)
          atok = ""            
          #puts " starting square-bracket @ #{i}"
        end
      when (c == ']')
        if bkt
          bkt = nil  
          qtok += c 
          c = ''
          qstr = qtok
          #puts "   ending square-bracket @ #{i}"
        end
      end

      # Shared bookkeeping: append the character to whichever accumulator is
      # active (c is '' when a branch above already consumed it).
      qtok += c if (sq or dq or bkt)
      atok += c unless (slc or mlc or sq or dq or bkt)
      
      
      prev_c = c
      col += 1 
      i += 1
      
      # A quoted/bracketed section was completed in this iteration: emit it.
      (tok_arr += tok_split(line,col,qstr,true); qstr = "";) if qstr.size > 0 

  end

  # Flush whatever plain text is still pending at the end of the input.
  tok_arr += tok_split(line, col, atok)
  
  raise "#{@input_file} Umatched quoted string at (#{line},#{col})" if (sq or dq or bkt)
  raise "#{@input_file} Incomplete Comment at (#{line},#{col})"     if mlc
  
  @tokens = tok_arr
  
end

#unget_token ⇒ `Object`

# File 'lib/tsql_shparser/tsql_tokenizer.rb', line 359

# Moves the read cursor back by one entry, so the entry most recently
# handed out by get_next_token is handed out again.
#
# The cursor never drops below zero. The previous guard (>= 0) let it
# reach -1, a position from which get_next_token and look_back only ever
# return nil and from which unget_token itself no longer moves.
#
# @return [Integer] the cursor position after the call
def unget_token
  @position -= 1 if @position > 0
  @position
end

Class: Tokenizer

Overview

Constant Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file = nil) ⇒ Tokenizer

Instance Method Details

#current_token ⇒ Object

#get_next_token ⇒ Object

#look_back(m) ⇒ Object

#previous_token ⇒ Object

#tokenize_file(file = nil) ⇒ Object

#tokenize_string(str) ⇒ Object

#unget_token ⇒ Object