Class: RKelly::Tokenizer

Inherits: Object

Defined in: lib/rkelly/tokenizer.rb

Constant Summary

KEYWORDS =
%w{
  break case catch continue default delete do else finally for function
  if in instanceof new return switch this throw try typeof var void while 
  with 

  const true false null debugger
}
RESERVED =
%w{
  abstract boolean byte char class double enum export extends
  final float goto implements import int interface long native package
  private protected public short static super synchronized throws
  transient volatile
}
LITERALS =
{
  # Punctuators
  '=='  => :EQEQ,
  '!='  => :NE,
  '===' => :STREQ,
  '!==' => :STRNEQ,
  '<='  => :LE,
  '>='  => :GE,
  '||'  => :OR,
  '&&'  => :AND,
  '++'  => :PLUSPLUS,
  '--'  => :MINUSMINUS,
  '<<'  => :LSHIFT,
  '<<=' => :LSHIFTEQUAL,
  '>>'  => :RSHIFT,
  '>>=' => :RSHIFTEQUAL,
  '>>>' => :URSHIFT,
  '>>>='=> :URSHIFTEQUAL,
  '&='  => :ANDEQUAL,
  '%='  => :MODEQUAL,
  '^='  => :XOREQUAL,
  '|='  => :OREQUAL,
  '+='  => :PLUSEQUAL,
  '-='  => :MINUSEQUAL,
  '*='  => :MULTEQUAL,
  '/='  => :DIVEQUAL,
}
KEYWORDS_THAT_IMPLY_DIVISION =

Some keywords can be followed by regular expressions (e.g., return and throw). Others can be followed by division.

%w{
  this true false null
}
KEYWORDS_THAT_IMPLY_REGEX =
KEYWORDS - KEYWORDS_THAT_IMPLY_DIVISION
SINGLE_CHARS_THAT_IMPLY_DIVISION =
[')', ']', '}']
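
These constants drive the tokenizer's one-token lookbehind for the ambiguous "/" character: after a token that can end an expression (this, true, ")", "]", "}", and so on), a slash is division; anywhere else it may begin a regular expression literal. A minimal sketch of the difference, assuming the rkelly gem is installed (the exact token structures returned are an implementation detail):

require 'rkelly'

tokenizer = RKelly::Tokenizer.new

# After `return`, a slash begins a regexp, so /x/g lexes as a
# single :REGEXP token.
tokenizer.tokenize('return /x/g')

# After `this` (listed in KEYWORDS_THAT_IMPLY_DIVISION), the same
# slash lexes as the single-character division operator instead.
tokenizer.tokenize('this / 2')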

Instance Method Summary

  #initialize(&block) ⇒ Tokenizer constructor
  #raw_tokens(string) ⇒ Object
  #tokenize(string) ⇒ Object

Constructor Details

#initialize(&block) ⇒ Tokenizer

Returns a new instance of Tokenizer.



# File 'lib/rkelly/tokenizer.rb', line 58

def initialize(&block)
  @lexemes = []

  # Block (/* ... */) and line (// ...) comments.
  token(:COMMENT, /\A\/(?:\*(?:.)*?\*\/|\/[^\n]*)/m)
  # Single- or double-quoted strings with backslash escapes.
  token(:STRING, /\A"(?:[^"\\]*(?:\\.[^"\\]*)*)"|\A'(?:[^'\\]*(?:\\.[^'\\]*)*)'/m)

  # A regexp to match floating point literals (but not integer literals).
  token(:NUMBER, /\A\d+\.\d*(?:[eE][-+]?\d+)?|\A\d+(?:\.\d*)?[eE][-+]?\d+|\A\.\d+(?:[eE][-+]?\d+)?/m) do |type, value|
    # Insert zeros around bare dots ("1.e5" -> "1.0e5", "1." -> "1.0",
    # ".5" -> "0.5") so eval always sees valid Ruby Float syntax.
    value.gsub!(/\.(\D)/, '.0\1') if value =~ /\.\w/
    value.gsub!(/\.$/, '.0') if value =~ /\.$/
    value.gsub!(/^\./, '0.') if value =~ /^\./
    [type, eval(value)]
  end
  # Hex, octal, and decimal integer literals.
  token(:NUMBER, /\A0[xX][\da-fA-F]+|\A0[0-7]*|\A\d+/) do |type, value|
    [type, eval(value)]
  end

  # Sort the punctuators longest-first so that, e.g., '===' is tried
  # before '==' when building the alternation.
  token(:LITERALS,
    Regexp.new(LITERALS.keys.sort_by { |x|
      x.length
    }.reverse.map { |x| "\\A#{x.gsub(/([|+*^])/, '\\\\\1')}" }.join('|')
  )) do |type, value|
    [LITERALS[value], value]
  end

  token(:RAW_IDENT, /\A([_\$A-Za-z][_\$0-9A-Za-z]*)/) do |type,value|
    if KEYWORDS.include?(value)
      [value.upcase.to_sym, value]
    elsif RESERVED.include?(value)
      [:RESERVED, value]
    else
      [:IDENT, value]
    end
  end

  # To distinguish regular expressions from comments, we require that
  # regular expressions start with a non * character (ie, not look like
  # /*foo*/). Note that we can't depend on the length of the match to
  # correctly distinguish, since `/**/i` is longer if matched as a regular
  # expression than as matched as a comment.
  # Incidentally, we're also not matching empty regular expressions
  # (eg, // and //g). Here we could depend on match length and priority to
  # determine that these are actually comments, but it turns out to be
  # easier to not match them in the first place.
  token(:REGEXP, /\A\/(?:[^\/\r\n\\*]|\\[^\r\n])[^\/\r\n\\]*(?:\\[^\r\n][^\/\r\n\\]*)*\/[gim]*/)
  token(:S, /\A[\s\r\n]*/m)

  token(:SINGLE_CHAR, /\A./) do |type, value|
    [value, value]
  end
end
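
Lexemes are tried in registration order, and the longest match wins; on a tie the lexeme registered first takes priority (see #raw_tokens below). One consequence is that multi-character punctuators are never split apart. A small illustration, with the output shape hedged as an assumption about Token's accessors (name and value are the readers used internally by #raw_tokens):

t = RKelly::Tokenizer.new
t.raw_tokens('a === b').map { |tok| [tok.name, tok.value] }
# Expected shape: [[:IDENT, "a"], [:S, " "], [:STREQ, "==="],
#                  [:S, " "], [:IDENT, "b"]]
# '===' lexes as one :STREQ token rather than '==' followed by '='.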

Instance Method Details

#raw_tokens(string) ⇒ Object



# File 'lib/rkelly/tokenizer.rb', line 114

def raw_tokens(string)
  tokens = []
  line_number = 1
  accepting_regexp = true
  while string.length > 0
    longest_token = nil

    # Try every registered lexeme and keep the longest match; on a tie
    # the lexeme registered first wins. REGEXP is skipped entirely when
    # the previous significant token implies division.
    @lexemes.each { |lexeme|
      next if lexeme.name == :REGEXP && !accepting_regexp

      match = lexeme.match(string)
      next if match.nil?
      longest_token = match if longest_token.nil?
      next if longest_token.value.length >= match.value.length
      longest_token = match
    }

    # Whitespace is transparent: only non-whitespace tokens update
    # whether a following slash may begin a regular expression.
    if longest_token.name != :S
      accepting_regexp = followable_by_regex(longest_token)
    end

    # Record the line the token started on, advance the line count past
    # any newlines inside the token, then consume it from the input.
    longest_token.line = line_number
    line_number += longest_token.value.scan(/\n/).length
    string = string.slice(Range.new(longest_token.value.length, -1))
    tokens << longest_token
  end
  tokens
end
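
Note that raw_tokens keeps whitespace (:S) tokens in the stream and stamps every token with the line it started on. A quick sketch of inspecting that metadata (again assuming Token exposes name, value, and line, as used above):

t = RKelly::Tokenizer.new
t.raw_tokens("var a;\nvar b;").each do |tok|
  puts "#{tok.line} #{tok.name} #{tok.value.inspect}"
end
# The tokens of the second statement should report line 2, since
# line_number is advanced by the newlines inside each matched token.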

#tokenize(string) ⇒ Object



# File 'lib/rkelly/tokenizer.rb', line 110

def tokenize(string)
  raw_tokens(string).map { |x| x.to_racc_token }
end
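
tokenize is the parser-facing entry point: it runs raw_tokens and converts each Token via Token#to_racc_token into whatever structure the racc-generated parser consumes. A minimal smoke test, treating the exact token shape as an implementation detail:

RKelly::Tokenizer.new.tokenize('x = 1')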