Class: SDL4R::Tokenizer
- Inherits:
-
Object
- Object
- SDL4R::Tokenizer
- Defined in:
- lib/sdl4r/tokenizer.rb
Overview
Tokenizer for SDL.
Because Ruby’s standard IO libraries are not very low-level, this class works line by line. As a result, some token types reflect this line-oriented tokenizing.
The other solution would be to implement a proper tokenizer natively, which I don’t feel like doing right now.
FIXME: implement a way of stacking errors instead of raising immediately on the first one.
Defined Under Namespace
Classes: Matcher
Constant Summary collapse
- @@EOL_STRING =
A string used at the end of each line in order to trigger the EOL token.
"\n"
# Matcher tables, indexed by tokenizer mode (the key passed to #set_mode).
#
# Within a mode the matchers are tried in order and the first one whose regex
# matches at the scanner position produces the token, so ordering matters
# (e.g. the "L"-suffixed INTEGER is listed before FLOAT on purpose).
#
# Options seen by the tokenizer:
# * :push_back_eol - when the match runs to the end of the line, #read moves
#   the scanner back by the size of the EOL string so that an EOL token
#   follows.
# * :next_mode - mode associated with the token (#unread switches to it when
#   such a token becomes the current one again).
# * :error - message of the parse error raised by #read when the matcher
#   matches.
#
# A block given to Matcher.new defines +process_token+, which strips
# delimiters from the matched text in place.
#
# NOTE(review): BOOLEAN and NULL are listed before IDENTIFIER and have no
# trailing boundary, so an identifier such as "online" presumably yields a
# BOOLEAN token for "on" first - confirm that the parser copes with this.
@@matcher_sets = {
  :top => [
    Matcher.new(:EOL, /\A\n/),
    Matcher.new(:WHITESPACE, /\A\s+/, :push_back_eol => true),
    Matcher.new(:SEMICOLON, /\A;/),
    Matcher.new(:COLON, /\A:/),
    Matcher.new(:EQUAL, /\A=/),
    Matcher.new(:BLOCK_START, /\A\{/),
    Matcher.new(:BLOCK_END, /\A\}/),
    # The alternatives are grouped so that \A applies to every keyword and
    # not only to "true".
    Matcher.new(:BOOLEAN, /\A(?:true|false|on|off)/),
    Matcher.new(:NULL, /\Anull/),
    Matcher.new(:ONE_LINE_COMMENT, /\A(?:#|--|\/\/).*\Z/, :push_back_eol => true) do
      def process_token(token)
        token.gsub!(/\A(?:#|--|\/\/)/, "")
      end
    end,
    Matcher.new(:INLINE_COMMENT, /\A\/\*[\s\S]*?\*\//) do
      def process_token(token)
        token.gsub!(/\A\/\*|\*\/\Z/, "")
      end
    end,
    Matcher.new(
      :MULTILINE_COMMENT_START, /\A\/\*.*\Z/,
      :next_mode => :multiline_comment, :push_back_eol => true) do
      def process_token(token)
        token.gsub!(/\A\/\*/, "")
      end
    end,
    Matcher.new(:CHARACTER, /\A'(?:[^\\']|\\.)'/) do
      def process_token(token)
        token.gsub!(/\A'|'\Z/, "")
      end
    end,
    Matcher.new(:INLINE_BACKQUOTE_STRING, /\A`[^`]*`/, :is_node => true) do
      def process_token(token)
        token.gsub!(/\A`|`\Z/, "")
      end
    end,
    Matcher.new(:INLINE_DOUBLE_QUOTE_STRING, /\A"(?:[^\\"]|\\.)*"/) do
      def process_token(token)
        token.gsub!(/\A"|"\Z/, "")
      end
    end,
    Matcher.new(
      :MULTILINE_BACKQUOTE_STRING_START, /\A`[^`]*\Z/,
      :next_mode => :multiline_backquote_string, :is_node => true) do
      def process_token(token)
        token.gsub!(/\A`/, "")
      end
    end,
    Matcher.new(
      :MULTILINE_DOUBLE_QUOTE_STRING_START, /\A"(?:[^\\"]|\\\S)*\\\s*\Z/,
      :next_mode => :multiline_double_quote_string, :push_back_eol => true) do
      def process_token(token)
        token.gsub!(/\A"|\\\s*\Z/, "")
      end
    end,
    Matcher.new(:INLINE_BINARY, /\A\[[\sA-Za-z0-9\/=\+]*\]/) do
      def process_token(token)
        token.gsub!(/\A\[|\s+|\]\Z/, "")
      end
    end,
    Matcher.new(
      :MULTILINE_BINARY_START, /\A\[[\sA-Za-z0-9\/=\+]*\Z/,
      :next_mode => :multiline_binary, :push_back_eol => true) do
      def process_token(token)
        token.gsub!(/\A\[|\s+/, "")
      end
    end,
    Matcher.new(
      :IDENTIFIER,
      /\A#{SDL4R::IDENTIFIER_START_CLASS}#{SDL4R::IDENTIFIER_PART_CLASS}*/),
    Matcher.new(:DATE, /\A-?\d+\/\d+\/\d+/, :is_node => true),
    Matcher.new(
      :TIME_OR_TIMESPAN,
      /\A(?:-?\d+d:)?-?\d+:\d+(?::\d+(?:\.\d+)?)? (?:-[a-zA-Z\/]+(?:[+-]\d+(?::\d+)?)?)?/ix),
    Matcher.new(:INTEGER, /\A[\+\-]?\d+L/i), # takes precedence on floats
    # the float regex is meant to also catch bad syntaxed floats like "1.2.2" (otherwise, we
    # would not detect this kind of errors easily).
    Matcher.new(
      :FLOAT, /\A[\+\-]?(?:\d+(?:F|D|BD)|\d*\.[\d\.]+(?:F|D|BD)?)/i),
    Matcher.new(:INTEGER, /\A[\+\-]?\d+L?/i),
    Matcher.new(:LINE_CONTINUATION, /\A\\\s*\Z/), # outside of comments, strings, etc
    Matcher.new(
      :UNCLOSED_DOUBLE_QUOTE_STRING, /\A"(?:[^\\"]|\\\S)*/,
      :error => "unclosed string"),
  ],
  :multiline_comment => [
    Matcher.new(:EOL, /\A\n/),
    Matcher.new(:MULTILINE_COMMENT_END, /\A[\s\S]*?\*\//, :next_mode => :top) do
      def process_token(token)
        token.gsub!(/\*\/\Z/, "")
      end
    end,
    Matcher.new(:MULTILINE_COMMENT_PART, /\A.+\Z/, :push_back_eol => true)
  ],
  :multiline_backquote_string => [
    Matcher.new(:EOL, /\A\n/),
    Matcher.new(:MULTILINE_BACKQUOTE_STRING_END, /\A[^`]*`/, :next_mode => :top) do
      def process_token(token)
        token.gsub!(/`\Z/, "")
      end
    end,
    Matcher.new(:MULTILINE_BACKQUOTE_STRING_PART, /\A[^`]*\Z/)
  ],
  :multiline_double_quote_string => [
    Matcher.new(:EOL, /\A\n/),
    Matcher.new(
      :MULTILINE_DOUBLE_QUOTE_STRING_END, /\A(?:[^\\"]|\\\S)*"/, :next_mode => :top) do
      def process_token(token)
        token.gsub!(/\A\s+|"\Z/, "")
      end
    end,
    Matcher.new(
      :MULTILINE_DOUBLE_QUOTE_STRING_PART, /\A(?:[^\\"]|\\\S)*\\\s*\Z/,
      :push_back_eol => true) do
      def process_token(token)
        token.gsub!(/\A\s+|\\\s*\Z/, "")
      end
    end,
    Matcher.new(
      :UNCLOSED_DOUBLE_QUOTE_STRING, /\A(?:[^\\"]|\\\S)*\Z/,
      :error => "unclosed multiline string")
  ],
  :multiline_binary => [
    Matcher.new(:EOL, /\A\n/),
    Matcher.new(:MULTILINE_BINARY_END, /\A[\sA-Za-z0-9\/=\+]*\]/, :next_mode => :top) do
      def process_token(token)
        token.gsub!(/\s+|\]\Z/, "")
      end
    end,
    Matcher.new(:MULTILINE_BINARY_PART, /\A[\sA-Za-z0-9\/=\+]*\Z/, :push_back_eol => true) do
      def process_token(token)
        token.gsub!(/\s+/, "")
      end
    end
  ]
}
Instance Method Summary collapse
-
#initialize(io) ⇒ Tokenizer
constructor
A new instance of Tokenizer.
-
#previous_token_type ⇒ Symbol
The type of the previous Token.
- #raise_parse_error(msg = "parse error", line_no = @line_no, pos = @scanner.pos) ⇒ Object
-
#raise_unexpected_char(msg = "unexpected char") ⇒ Object
Raises a standard “unexpected character” error.
-
#read ⇒ Symbol
Goes to the next token.
-
#set_mode(mode) ⇒ self
Sets the current working mode of this Tokenizer.
-
#token ⇒ String
Text of the current token.
-
#token_line_no ⇒ Integer
Line number of the current token (only meant for error tracking for the time being).
-
#token_pos ⇒ Integer
Position of the current token (only meant for error tracking for the time being).
-
#token_type ⇒ Symbol
Type of the current token (e.g. :WHITESPACE).
-
#unread ⇒ Object
Unreads the current token.
Constructor Details
#initialize(io) ⇒ Tokenizer
Returns a new instance of Tokenizer.
238 239 240 241 242 243 244 245 246 247 248 249 250 |
# File 'lib/sdl4r/tokenizer.rb', line 238
#
# Builds a tokenizer reading from +io+ and starts it in the :top mode.
#
# @param io the input to tokenize; must not be nil or false
# @raise [ArgumentError] if +io+ is missing
def initialize(io)
  raise(ArgumentError, 'io') unless io

  @io = io
  @line_no = -1 # negative until a first line has been fetched (see #read)

  # Nothing has been scanned, read or pushed back yet.
  @scanner = @token = @pushed_back_token = @previous_token = nil

  @token_pool = [] # a pool of reusable Tokens
  set_mode(:top)
end
Instance Method Details
#previous_token_type ⇒ Symbol
Returns the type of the previous Token.
370 371 372 |
# File 'lib/sdl4r/tokenizer.rb', line 370
#
# Type of the token that preceded the current one.
#
# @return [Symbol, nil] the previous token's type, or nil when no previous
#   token is remembered
def previous_token_type
  return nil unless @previous_token

  @previous_token.type
end
#raise_parse_error(msg = "parse error", line_no = @line_no, pos = @scanner.pos) ⇒ Object
400 401 402 403 |
# File 'lib/sdl4r/tokenizer.rb', line 400
#
# Raises an SdlParseError for the given location.
#
# The scanner does not exist until a first line has been read (for instance
# when the input is empty), so it is never dereferenced while nil: the
# position then defaults to 0 and no source line is attached to the error.
#
# @param msg [String] error message
# @param line_no [Integer] line index as tracked by this tokenizer
#   (reported incremented by one)
# @param pos [Integer] offset within the line (reported incremented by one)
# @raise [SdlParseError] always
def raise_parse_error(msg = "parse error", line_no = @line_no,
                      pos = (@scanner ? @scanner.pos : 0))
  # The text of the line is only known for the line currently being scanned.
  line = (@scanner && line_no == @line_no) ? @scanner.string : nil
  raise SdlParseError.new(msg, line_no + 1, pos + 1, line)
end
#raise_unexpected_char(msg = "unexpected char") ⇒ Object
Raises a standard “unexpected character” error.
396 397 398 |
# File 'lib/sdl4r/tokenizer.rb', line 396
#
# Raises a parse error that quotes the character found at the scanner's
# current position, formatted as "<msg>: <c>".
#
# @param msg [String] prefix of the error message
def raise_unexpected_char(msg = "unexpected char")
  next_char = @scanner.peek(1)
  raise_parse_error("#{msg}: <#{next_char}>")
end
#read ⇒ Symbol
Goes to the next token.
312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 |
# File 'lib/sdl4r/tokenizer.rb', line 312
#
# Advances to the next token.
#
# A token that was pushed back with #unread is served first. Otherwise a new
# line is fetched when needed and the matchers of the current mode are tried
# in order; the first one that matches at the scanner position wins.
#
# @return [Symbol, nil] the type of the new current token; :EOF once when the
#   input is exhausted, then nil on the following call
# @raise a parse error when the winning matcher carries an error message or
#   when no matcher applies
def read
  if @pushed_back_token
    read_pushed_back
    return @token.type
  end

  record_previous_token
  @token = nil

  # A new line is needed at the very beginning or once the current line has
  # been consumed entirely. @scanner is not consulted before the first line.
  line_needed = @line_no < 0 || @scanner.eos?
  if line_needed && !read_line
    return nil if previous_token_type == :EOF

    @token = Token.new(nil, :EOF, nil, @line_no, @scanner ? @scanner.pos : 0)
    return @token.type
  end

  start_pos = @scanner.pos
  text = nil
  winner = @matcher_set.find { |candidate| text = @scanner.scan(candidate.regex) }

  if winner
    if (failure = winner.error)
      raise_parse_error(failure)
    else
      set_matcher_token(winner, text, start_pos)
      # Give the end of line back so that it is seen as a token of its own.
      @scanner.pos -= @@EOL_STRING.size if winner.push_back_eol && @scanner.eos?
    end
  end

  raise_unexpected_char unless @token

  @token.type
end
#set_mode(mode) ⇒ self
Sets the current working mode of this Tokenizer.
286 287 288 289 290 291 |
# File 'lib/sdl4r/tokenizer.rb', line 286
#
# Switches the tokenizer to +mode+, i.e. selects the matcher set registered
# under that key.
#
# @param mode [Symbol] a key of the matcher sets table (e.g. :top)
# @return [self]
# @raise [ArgumentError] if no matcher set is registered for +mode+
def set_mode(mode)
  matchers = @@matcher_sets[mode]
  if matchers
    @matcher_set = matchers
    return self
  end

  raise ArgumentError, "unknown tokenizer mode #{mode.to_s}"
end
#token ⇒ String
Returns text of the current token.
253 254 255 |
# File 'lib/sdl4r/tokenizer.rb', line 253
#
# Text of the current token.
#
# @return [String, nil] the text, or nil when there is no current token
#   (nothing read yet, or #read returned nil at the end of the input)
def token
  @token ? @token.text : nil
end
#token_line_no ⇒ Integer
Returns the line number of the current token (only meant for error tracking for the time being).
264 265 266 |
# File 'lib/sdl4r/tokenizer.rb', line 264
#
# Line number recorded for the current token (only meant for error tracking
# for the time being). Presumably zero-based, since raise_parse_error adds
# one before reporting - TODO confirm.
#
# @return [Integer, nil] the line number, or nil when there is no current
#   token (nothing read yet, or #read returned nil at the end of the input)
def token_line_no
  @token ? @token.line_no : nil
end
#token_pos ⇒ Integer
Returns position of the current token (only meant for error tracking for the time being).
270 271 272 |
# File 'lib/sdl4r/tokenizer.rb', line 270
#
# Position recorded for the current token (only meant for error tracking for
# the time being).
#
# @return [Integer, nil] the position, or nil when there is no current token
#   (nothing read yet, or #read returned nil at the end of the input)
def token_pos
  @token ? @token.pos : nil
end
#token_type ⇒ Symbol
Returns the type of the current token (e.g. :WHITESPACE).
258 259 260 |
# File 'lib/sdl4r/tokenizer.rb', line 258
#
# Type of the current token (e.g. :WHITESPACE).
#
# @return [Symbol, nil] the type, or nil when there is no current token
#   (nothing read yet, or #read returned nil at the end of the input)
def token_type
  @token ? @token.type : nil
end
#unread ⇒ Object
Unreads the current token. The previous token becomes the current one.
378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 |
# File 'lib/sdl4r/tokenizer.rb', line 378
#
# Unreads the current token: it is stored as the pushed-back token and the
# previous token becomes the current one again.
#
# Only one token can be pushed back at a time. When no previous token is
# remembered (e.g. right after the first read), the current token simply
# becomes nil and the mode is left untouched.
#
# @raise [RuntimeError] if a token is already pushed back
def unread
  raise "only one token can be pushed back" if @pushed_back_token

  @pushed_back_token = @token
  @token = @previous_token
  @previous_token = nil # We have no memory of what happened before

  # Switch to the mode attached to the restored token, if it has one.
  # Both the token and its matcher may be missing.
  matcher = @token && @token.matcher
  next_mode = matcher && matcher.next_mode
  set_mode(next_mode) if next_mode
end