Class: SDL4R::Tokenizer

Inherits:

Object

Object
SDL4R::Tokenizer

show all

Defined in: lib/sdl4r/tokenizer.rb

Overview

Tokenizer for SDL.

Because Ruby’s standard IO libraries are not very low-level, this class works line by line. As a result, some token types reflect this line-oriented tokenizing.

The other solution would be to implement a proper tokenizer natively, which I don’t feel like doing right now.

FIXME: implement a way of stacking the errors without raising an error immediately.

Defined Under Namespace

Classes: Matcher

Constant Summary collapse

@@EOL_STRING = A string used at the end of each line in order to trigger the EOL token.

"\n"

@@matcher_sets =

{
  # Matcher sets keyed by tokenizer mode (the modes accepted by #set_mode).
  # Within a mode, #read tries the matchers in the order listed here and stops at
  # the first regex that matches at the scan position, so ORDER IS SIGNIFICANT.
  #
  # Options visible to #read / #unread:
  #   :error         => message raised as a parse error when the matcher matches
  #   :push_back_eol => when the match consumed the whole line, the scanner is
  #                     rewound by the size of the EOL string
  #   :next_mode     => mode associated with the token (applied again by #unread)
  # NOTE(review): :is_node and the process_token hooks are consumed by Matcher,
  # which is not shown here; the hooks appear to strip delimiters from the
  # token text in place - confirm against Matcher.

  # Default mode.
  :top => [
    Matcher.new(:EOL, /\A\n/),
    Matcher.new(:WHITESPACE, /\A\s+/, :push_back_eol => true),
    Matcher.new(:SEMICOLON, /\A;/),
    Matcher.new(:COLON, /\A:/),
    Matcher.new(:EQUAL, /\A=/),
    Matcher.new(:BLOCK_START, /\A\{/),
    Matcher.new(:BLOCK_END, /\A\}/),
    # The alternation is grouped so that \A anchors every alternative, not only "true".
    Matcher.new(:BOOLEAN, /\A(?:true|false|on|off)/),
    Matcher.new(:NULL, /\Anull/),
    Matcher.new(:ONE_LINE_COMMENT, /\A(?:#|--|\/\/).*\Z/, :push_back_eol => true) do
      # Removes the leading comment marker (#, -- or //).
      def process_token(token)
        token.gsub!(/\A(?:#|--|\/\/)/, "")
      end
    end,
    Matcher.new(:INLINE_COMMENT, /\A\/\*[\s\S]*?\*\//) do
      # Removes the surrounding /* and */.
      def process_token(token)
        token.gsub!(/\A\/\*|\*\/\Z/, "")
      end
    end,
    Matcher.new(
      :MULTILINE_COMMENT_START,
      /\A\/\*.*\Z/,
      :next_mode => :multiline_comment,
      :push_back_eol => true) do
      # Removes the opening /*.
      def process_token(token)
        token.gsub!(/\A\/\*/, "")
      end
    end,
    Matcher.new(:CHARACTER, /\A'(?:[^\\']|\\.)'/) do
      # Removes the surrounding single quotes.
      def process_token(token)
        token.gsub!(/\A'|'\Z/, "")
      end
    end,
    Matcher.new(:INLINE_BACKQUOTE_STRING, /\A`[^`]*`/, :is_node => true) do
      # Removes the surrounding backquotes.
      def process_token(token)
        token.gsub!(/\A`|`\Z/, "")
      end
    end,
    Matcher.new(:INLINE_DOUBLE_QUOTE_STRING, /\A"(?:[^\\"]|\\.)*"/) do
      # Removes the surrounding double quotes.
      def process_token(token)
        token.gsub!(/\A"|"\Z/, "")
      end
    end,
    Matcher.new(
      :MULTILINE_BACKQUOTE_STRING_START,
      /\A`[^`]*\Z/,
      :next_mode => :multiline_backquote_string,
      :is_node => true) do
      # Removes the opening backquote.
      def process_token(token)
        token.gsub!(/\A`/, "")
      end
    end,
    Matcher.new(
      :MULTILINE_DOUBLE_QUOTE_STRING_START,
      /\A"(?:[^\\"]|\\\S)*\\\s*\Z/,
      :next_mode => :multiline_double_quote_string,
      :push_back_eol => true) do
      # Removes the opening quote and the trailing line-continuation backslash.
      def process_token(token)
        token.gsub!(/\A"|\\\s*\Z/, "")
      end
    end,
    Matcher.new(:INLINE_BINARY, /\A\[[\sA-Za-z0-9\/=\+]*\]/) do
      # Removes the brackets and all whitespace.
      def process_token(token)
        token.gsub!(/\A\[|\s+|\]\Z/, "")
      end
    end,
    Matcher.new(
      :MULTILINE_BINARY_START, /\A\[[\sA-Za-z0-9\/=\+]*\Z/,
      :next_mode => :multiline_binary,
      :push_back_eol => true) do
      # Removes the opening bracket and all whitespace.
      def process_token(token)
        token.gsub!(/\A\[|\s+/, "")
      end
    end,
    Matcher.new(
      :IDENTIFIER, /\A#{SDL4R::IDENTIFIER_START_CLASS}#{SDL4R::IDENTIFIER_PART_CLASS}*/),
    Matcher.new(:DATE, /\A-?\d+\/\d+\/\d+/, :is_node => true),
    Matcher.new(
      :TIME_OR_TIMESPAN,
      /\A(?:-?\d+d:)?-?\d+:\d+(?::\d+(?:\.\d+)?)?
        (?:-[a-zA-Z\/]+(?:[+-]\d+(?::\d+)?)?)?/ix),
    Matcher.new(:INTEGER, /\A[\+\-]?\d+L/i), # takes precedence on floats
    # the float regex is meant to also catch bad syntaxed floats like "1.2.2" (otherwise, we
    # would not detect this kind of errors easily).
    Matcher.new(
      :FLOAT, /\A[\+\-]?(?:\d+(?:F|D|BD)|\d*\.[\d\.]+(?:F|D|BD)?)/i),
    Matcher.new(:INTEGER, /\A[\+\-]?\d+L?/i),
    Matcher.new(:LINE_CONTINUATION, /\A\\\s*\Z/), # outside of comments, strings, etc
    # Listed last: only reached when no well-formed string matcher above matched.
    Matcher.new(
      :UNCLOSED_DOUBLE_QUOTE_STRING,
      /\A"(?:[^\\"]|\\\S)*/,
      :error => "unclosed string"),
  ],
  # Mode associated with :MULTILINE_COMMENT_START; the end matcher points back to :top.
  :multiline_comment => [
    Matcher.new(:EOL, /\A\n/),
    Matcher.new(:MULTILINE_COMMENT_END, /\A[\s\S]*?\*\//, :next_mode => :top) do
      # Removes the closing */.
      def process_token(token)
        token.gsub!(/\*\/\Z/, "")
      end
    end,
    Matcher.new(:MULTILINE_COMMENT_PART, /\A.+\Z/, :push_back_eol => true)
  ],
  # Mode associated with :MULTILINE_BACKQUOTE_STRING_START; the end matcher points back to :top.
  :multiline_backquote_string => [
    Matcher.new(:EOL, /\A\n/),
    Matcher.new(:MULTILINE_BACKQUOTE_STRING_END, /\A[^`]*`/, :next_mode => :top) do
      # Removes the closing backquote.
      def process_token(token)
        token.gsub!(/`\Z/, "")
      end
    end,
    Matcher.new(:MULTILINE_BACKQUOTE_STRING_PART, /\A[^`]*\Z/)
  ],
  # Mode associated with :MULTILINE_DOUBLE_QUOTE_STRING_START; the end matcher points back to :top.
  :multiline_double_quote_string => [
    Matcher.new(:EOL, /\A\n/),
    Matcher.new(
      :MULTILINE_DOUBLE_QUOTE_STRING_END, /\A(?:[^\\"]|\\\S)*"/, :next_mode => :top) do
      # Removes leading whitespace and the closing quote.
      def process_token(token)
        token.gsub!(/\A\s+|"\Z/, "")
      end
    end,
    Matcher.new(
      :MULTILINE_DOUBLE_QUOTE_STRING_PART,
      /\A(?:[^\\"]|\\\S)*\\\s*\Z/,
      :push_back_eol => true) do
      # Removes leading whitespace and the trailing line-continuation backslash.
      def process_token(token)
        token.gsub!(/\A\s+|\\\s*\Z/, "")
      end
    end,
    # A line that neither closes the string nor ends with a continuation backslash.
    Matcher.new(
      :UNCLOSED_DOUBLE_QUOTE_STRING,
      /\A(?:[^\\"]|\\\S)*\Z/,
      :error => "unclosed multiline string")
  ],
  # Mode associated with :MULTILINE_BINARY_START; the end matcher points back to :top.
  :multiline_binary => [
    Matcher.new(:EOL, /\A\n/),
    Matcher.new(:MULTILINE_BINARY_END, /\A[\sA-Za-z0-9\/=\+]*\]/, :next_mode => :top) do
      # Removes all whitespace and the closing bracket.
      def process_token(token)
        token.gsub!(/\s+|\]\Z/, "")
      end
    end,
    Matcher.new(:MULTILINE_BINARY_PART, /\A[\sA-Za-z0-9\/=\+]*\Z/, :push_back_eol => true) do
      # Removes all whitespace.
      def process_token(token)
        token.gsub!(/\s+/, "")
      end
    end
  ]
}

Instance Method Summary collapse

#initialize(io) ⇒ Tokenizer constructor

A new instance of Tokenizer.
#previous_token_type ⇒ Symbol

The type of the previous Token.
#raise_parse_error(msg = "parse error", line_no = @line_no, pos = @scanner.pos) ⇒ Object
#raise_unexpected_char(msg = "unexpected char") ⇒ Object

Raises a standard “unexpected character” error.
#read ⇒ Symbol

Goes to the next token.
#set_mode(mode) ⇒ self

Sets the current working mode of this Tokenizer.
#token ⇒ String

Text of the current token.
#token_line_no ⇒ Integer

Line number of the current token (only meant for error tracking for the time being).
#token_pos ⇒ Integer

Position of the current token (only meant for error tracking for the time being).
#token_type ⇒ Symbol

Type of the current token (e.g. :WHITESPACE).
#unread ⇒ Object

Unreads the current token.

Constructor Details

#initialize(io) ⇒ `Tokenizer`

Returns a new instance of Tokenizer.

Parameters:

io (IO) —

IO to read from

Raises:

(ArgumentError) —

if io is nil.

# File 'lib/sdl4r/tokenizer.rb', line 238

def initialize io
  raise ArgumentError, 'io' unless io
  @io = io
  @scanner = nil
  @line_no = -1
  set_mode(:top)

  @token = nil
  @pushed_back_token = nil
  @previous_token = nil

  @token_pool = [] # a pool of reusable Tokens
end

Instance Method Details

#previous_token_type ⇒ `Symbol`

Returns the type of the previous Token.

Returns:

(Symbol) —

the type of the previous Token.



370
371
372

# File 'lib/sdl4r/tokenizer.rb', line 370

# @return [Symbol, nil] the type of the previous Token, or nil when no previous
#   token is recorded
def previous_token_type
  return nil unless @previous_token

  @previous_token.type
end

#raise_parse_error(msg = "parse error", line_no = @line_no, pos = @scanner.pos) ⇒ `Object`

Raises:

(SdlParseError)

# File 'lib/sdl4r/tokenizer.rb', line 400

# Raises a SdlParseError located at the given line and position.
#
# The zero-based +line_no+ and +pos+ are passed to the error as one-based values.
# The text of the current line is attached only when +line_no+ is the line
# currently being scanned.
#
# Safe to call before any line has been read (no scanner yet): the position
# then defaults to 0 and no line text is attached.
#
# @param msg [String] error description
# @param line_no [Integer] zero-based line number (defaults to the current line)
# @param pos [Integer] zero-based position in the line (defaults to the scanner position)
# @raise [SdlParseError] always
def raise_parse_error(msg = "parse error", line_no = @line_no, pos = (@scanner ? @scanner.pos : 0))
  line = (@scanner && line_no == @line_no) ? @scanner.string : nil
  raise SdlParseError.new(msg, line_no + 1, pos + 1, line)
end

#raise_unexpected_char(msg = "unexpected char") ⇒ `Object`

Raises a standard “unexpected character” error.



396
397
398

# File 'lib/sdl4r/tokenizer.rb', line 396

# Raises a standard "unexpected character" parse error that quotes the next
# character at the scanner position.
#
# @param msg [String] prefix of the error message
def raise_unexpected_char(msg = "unexpected char")
  offending = @scanner.peek(1)
  raise_parse_error("#{msg}: <#{offending}>")
end

#read ⇒ `Symbol`

Goes to the next token.

Returns:

(Symbol) —

nil if eof has been reached, the current token type otherwise.

# File 'lib/sdl4r/tokenizer.rb', line 312

# Goes to the next token.
#
# @return [Symbol, nil] nil if eof has been reached, the current token type otherwise
def read
  # A token handed back through #unread is served before any new scanning.
  if @pushed_back_token
    read_pushed_back
    return @token.type
  end

  record_previous_token
  @token = nil

  # A line must be fetched before the first read and whenever the current one is used up.
  need_line = @line_no < 0 || @scanner.eos?
  if need_line && !read_line
    # No more input: yield an :EOF token, or nil if the previous token already was :EOF.
    return nil if previous_token_type == :EOF

    eof_pos = @scanner ? @scanner.pos : 0
    @token = Token.new(nil, :EOF, nil, @line_no, eof_pos)
    return @token.type
  end

  start_pos = @scanner.pos
  # First matcher of the current mode that matches at the scan position wins.
  @matcher_set.each do |matcher|
    text = @scanner.scan(matcher.regex)
    next unless text

    failure = matcher.error
    if failure
      raise_parse_error(failure)
    else
      set_matcher_token(matcher, text, start_pos)
      # The match swallowed the end of the line: step back over the EOL string.
      at_line_end = matcher.push_back_eol && @scanner.eos?
      @scanner.pos -= @@EOL_STRING.size if at_line_end
    end
    break
  end

  raise_unexpected_char unless @token

  @token.type
end

#set_mode(mode) ⇒ `self`

Sets the current working mode of this Tokenizer.

Parameters:

mode (Symbol) —
new mode
- :top (normal default mode)
- :multiline_comment
- :multiline_backquote_string
- :multiline_double_quote_string
- :multiline_binary

Returns:

(self)

Raises:

(ArgumentError) —

if the given mode is unknown.

# File 'lib/sdl4r/tokenizer.rb', line 286

# Sets the current working mode of this Tokenizer by selecting the matcher set
# registered under +mode+.
#
# @param mode [Symbol] new mode
# @return [self]
# @raise [ArgumentError] if the given mode is unknown
def set_mode(mode)
  matchers = @@matcher_sets[mode]
  unless matchers
    raise ArgumentError, "unknown tokenizer mode #{mode.to_s}"
  end

  @matcher_set = matchers
  self
end

#token ⇒ `String`

Returns text of the current token.

Returns:

(String) —

text of the current token.



253
254
255

# File 'lib/sdl4r/tokenizer.rb', line 253

# @return [String] text of the current token
def token
  current = @token
  current.text
end

#token_line_no ⇒ `Integer`

Returns the line number of the current token (only meant for error tracking for the time being).

Returns:

(Integer) —

line number of the current token (only meant for error tracking for the time being)



264
265
266

# File 'lib/sdl4r/tokenizer.rb', line 264

# @return [Integer] line number recorded on the current token (only meant for
#   error tracking for the time being)
def token_line_no
  current = @token
  current.line_no
end

#token_pos ⇒ `Integer`

Returns position of the current token (only meant for error tracking for the time being).

Returns:

(Integer) —

position of the current token (only meant for error tracking for the time being)



270
271
272

# File 'lib/sdl4r/tokenizer.rb', line 270

# @return [Integer] position recorded on the current token (only meant for
#   error tracking for the time being)
def token_pos
  current = @token
  current.pos
end

#token_type ⇒ `Symbol`

Returns type of the current token (e.g. :WHITESPACE).

Returns:

(Symbol) —

type of the current token (e.g. :WHITESPACE)



258
259
260

# File 'lib/sdl4r/tokenizer.rb', line 258

# @return [Symbol] type of the current token (e.g. :WHITESPACE)
def token_type
  current = @token
  current.type
end

#unread ⇒ `Object`

Unreads the current token. The previous token becomes the current one

Raises:

(RuntimeError) — if #unread has been called twice in a row (no call to #read)

# File 'lib/sdl4r/tokenizer.rb', line 378

# Unreads the current token. The previous token becomes the current one, and
# the unread token is kept aside to be served by the next call to #read.
#
# If the token that becomes current has a matcher with a +next_mode+, that
# mode is applied again.
#
# Unreading the very first token is allowed: there is no previous token, so
# the current token becomes nil and the mode is left untouched.
#
# @raise [RuntimeError] if #unread has been called twice in a row (no call to #read)
def unread
  raise "only one token can be pushed back" if @pushed_back_token

  @pushed_back_token = @token
  @token = @previous_token

  # We have no memory of what happened before
  @previous_token = nil

  # @token is nil when there was no previous token (e.g. first token unread);
  # tokens such as :EOF carry no matcher either.
  matcher = @token && @token.matcher
  next_mode = matcher && matcher.next_mode
  set_mode(next_mode) if next_mode
end

Class: SDL4R::Tokenizer

Overview

Defined Under Namespace

Constant Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(io) ⇒ Tokenizer

Instance Method Details

#previous_token_type ⇒ Symbol

#raise_parse_error(msg = "parse error", line_no = @line_no, pos = @scanner.pos) ⇒ Object

#raise_unexpected_char(msg = "unexpected char") ⇒ Object

#read ⇒ Symbol

#set_mode(mode) ⇒ self

#token ⇒ String

#token_line_no ⇒ Integer

#token_pos ⇒ Integer

#token_type ⇒ Symbol

#unread ⇒ Object

#initialize(io) ⇒ `Tokenizer`

#previous_token_type ⇒ `Symbol`

#raise_parse_error(msg = "parse error", line_no = @line_no, pos = @scanner.pos) ⇒ `Object`

#raise_unexpected_char(msg = "unexpected char") ⇒ `Object`

#read ⇒ `Symbol`

#set_mode(mode) ⇒ `self`

#token ⇒ `String`

#token_line_no ⇒ `Integer`

#token_pos ⇒ `Integer`

#token_type ⇒ `Symbol`

#unread ⇒ `Object`