Class: RKelly::Tokenizer

- Inherits: Object
- Defined in: lib/rkelly/tokenizer.rb
Constant Summary
- KEYWORDS =
%w{ break case catch continue default delete do else finally for function if in instanceof new return switch this throw try typeof var void while with const true false null debugger }
- RESERVED =
%w{ abstract boolean byte char class double enum export extends final float goto implements import int interface long native package private protected public short static super synchronized throws transient volatile }
- LITERALS =
{
    # Punctuators
    '=='   => :EQEQ,
    '!='   => :NE,
    '==='  => :STREQ,
    '!=='  => :STRNEQ,
    '<='   => :LE,
    '>='   => :GE,
    '||'   => :OR,
    '&&'   => :AND,
    '++'   => :PLUSPLUS,
    '--'   => :MINUSMINUS,
    '<<'   => :LSHIFT,
    '<<='  => :LSHIFTEQUAL,
    '>>'   => :RSHIFT,
    '>>='  => :RSHIFTEQUAL,
    '>>>'  => :URSHIFT,
    '>>>=' => :URSHIFTEQUAL,
    '&='   => :ANDEQUAL,
    '%='   => :MODEQUAL,
    '^='   => :XOREQUAL,
    '|='   => :OREQUAL,
    '+='   => :PLUSEQUAL,
    '-='   => :MINUSEQUAL,
    '*='   => :MULTEQUAL,
    '/='   => :DIVEQUAL,
  }
- KEYWORDS_THAT_IMPLY_DIVISION =
Some keywords can be followed by regular expressions (e.g., return and throw); others can be followed by division.
%w{ this true false null }
- KEYWORDS_THAT_IMPLY_REGEX =
KEYWORDS - KEYWORDS_THAT_IMPLY_DIVISION
- SINGLE_CHARS_THAT_IMPLY_DIVISION =
[')', ']', '}']
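
Together these constants drive the tokenizer's handling of JavaScript's `/` ambiguity: after an expression-ending keyword (this, true, false, null) or a closing ')', ']', '}', a `/` is division, while after the remaining keywords (return, throw, typeof, and so on) it opens a regular expression literal. A minimal sketch of the rule these constants encode, written for illustration only (the library's actual check lives in the private followable_by_regex helper used by #raw_tokens below):

  # Illustrative only: approximates the decision these constants encode.
  def implies_division?(prev_value)
    KEYWORDS_THAT_IMPLY_DIVISION.include?(prev_value) ||
      SINGLE_CHARS_THAT_IMPLY_DIVISION.include?(prev_value)
  end

  implies_division?('this')   # => true;  "this / 2"   divides
  implies_division?('return') # => false; "return /x/" starts a regex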
Instance Method Summary
- #initialize(&block) ⇒ Tokenizer (constructor)
  A new instance of Tokenizer.
- #raw_tokens(string) ⇒ Object
- #tokenize(string) ⇒ Object
Constructor Details
#initialize(&block) ⇒ Tokenizer
Returns a new instance of Tokenizer.
# File 'lib/rkelly/tokenizer.rb', line 58

def initialize(&block)
  @lexemes = []

  token(:COMMENT, /\A\/(?:\*(?:.)*?\*\/|\/[^\n]*)/m)
  token(:STRING, /\A"(?:[^"\\]*(?:\\.[^"\\]*)*)"|\A'(?:[^'\\]*(?:\\.[^'\\]*)*)'/m)

  # A regexp to match floating point literals (but not integer literals).
  token(:NUMBER, /\A\d+\.\d*(?:[eE][-+]?\d+)?|\A\d+(?:\.\d*)?[eE][-+]?\d+|\A\.\d+(?:[eE][-+]?\d+)?/m) do |type, value|
    value.gsub!(/\.(\D)/, '.0\1') if value =~ /\.\w/
    value.gsub!(/\.$/, '.0')      if value =~ /\.$/
    value.gsub!(/^\./, '0.')      if value =~ /^\./
    [type, eval(value)]
  end
  token(:NUMBER, /\A0[xX][\da-fA-F]+|\A0[0-7]*|\A\d+/) do |type, value|
    [type, eval(value)]
  end

  token(:LITERALS,
    Regexp.new(LITERALS.keys.sort_by { |x|
      x.length
    }.reverse.map { |x|
      "\\A#{x.gsub(/([|+*^])/, '\\\\\1')}"
    }.join('|'))
  ) do |type, value|
    [LITERALS[value], value]
  end

  token(:RAW_IDENT, /\A([_\$A-Za-z][_\$0-9A-Za-z]*)/) do |type, value|
    if KEYWORDS.include?(value)
      [value.upcase.to_sym, value]
    elsif RESERVED.include?(value)
      [:RESERVED, value]
    else
      [:IDENT, value]
    end
  end

  # To distinguish regular expressions from comments, we require that
  # regular expressions start with a non-* character (i.e., not look like
  # /*foo*/). Note that we can't depend on the length of the match to
  # correctly distinguish, since `/**/i` is longer if matched as a regular
  # expression than as a comment.
  #
  # Incidentally, we're also not matching empty regular expressions
  # (e.g., // and //g). Here we could depend on match length and priority
  # to determine that these are actually comments, but it turns out to be
  # easier to not match them in the first place.
  token(:REGEXP, /\A\/(?:[^\/\r\n\\*]|\\[^\r\n])[^\/\r\n\\]*(?:\\[^\r\n][^\/\r\n\\]*)*\/[gim]*/)

  token(:S, /\A[\s\r\n]*/m)

  token(:SINGLE_CHAR, /\A./) do |type, value|
    [value, value]
  end
end
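
The constructor registers one lexeme per call to the private token helper, in priority order; #raw_tokens then keeps the longest match, with earlier registrations winning ties. The helper itself is not part of the documented API; a plausible sketch, assuming a Lexeme class that pairs a name and pattern with an optional value-conversion block:

  # Assumed shape of the private helper called above, shown for
  # orientation only: it appends a lexeme to @lexemes, which
  # #raw_tokens later scans in registration order.
  def token(name, pattern, &block)
    @lexemes << Lexeme.new(name, pattern, &block)
  end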
Instance Method Details
#raw_tokens(string) ⇒ Object
# File 'lib/rkelly/tokenizer.rb', line 114

def raw_tokens(string)
  tokens = []
  line_number = 1
  accepting_regexp = true
  while string.length > 0
    longest_token = nil

    @lexemes.each { |lexeme|
      next if lexeme.name == :REGEXP && !accepting_regexp

      match = lexeme.match(string)
      next if match.nil?
      longest_token = match if longest_token.nil?
      next if longest_token.value.length >= match.value.length
      longest_token = match
    }

    if longest_token.name != :S
      accepting_regexp = followable_by_regex(longest_token)
    end

    longest_token.line = line_number
    line_number += longest_token.value.scan(/\n/).length
    string = string.slice(Range.new(longest_token.value.length, -1))
    tokens << longest_token
  end
  tokens
end
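
In short, #raw_tokens is a maximal-munch scanner: it tries every registered lexeme against the head of the string, keeps the longest match (skipping :REGEXP when the previous significant token implies division), tracks line numbers by counting newlines in each consumed value, and returns the token objects themselves. A small usage sketch; the name/value/line accessors match the code above, but treat the exact output as illustrative:

  require 'rkelly'

  tokens = RKelly::Tokenizer.new.raw_tokens("var x = 10;\nreturn /ab/g;")
  tokens.each do |t|
    # Prints one raw lexeme per line, e.g. :RAW_IDENT for "var" and "x",
    # :NUMBER for "10", and :REGEXP (line 2) for /ab/g, since `return`
    # is in KEYWORDS_THAT_IMPLY_REGEX.
    puts [t.name, t.value.inspect, t.line].join("\t")
  end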
#tokenize(string) ⇒ Object
# File 'lib/rkelly/tokenizer.rb', line 110

def tokenize(string)
  raw_tokens(string).map { |x| x.to_racc_token }
end
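
#tokenize is the usual entry point: it scans with #raw_tokens and converts each token via to_racc_token into the pairs a racc-generated parser consumes (keyword and punctuator conversion, numeric eval, and so on happen at this step). A brief usage sketch; the printed pairs are a plausible rendering, not verbatim library output:

  require 'rkelly'

  RKelly::Tokenizer.new.tokenize("a = b / 2;").each { |tok| p tok }
  # Plausibly: [:IDENT, "a"], ["=", "="], [:IDENT, "b"], ["/", "/"],
  # [:NUMBER, 2], [";", ";"], plus [:S, " "] whitespace tokens between.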