6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
|
# File 'lib/toylang/lexer/lexer.rb', line 6
# Splits source text into a flat array of two-element token pairs.
#
# Empty lines are removed before scanning. Each token is +[type, value]+:
# * +[:CONSTANT, name]+ for words starting with an uppercase letter
# * +[:NUMBER, value]+  for integer / decimal literals
# * +[:STRING, text]+   for double-quoted strings (no escape handling)
# * +[:INDENT, width]+  when a +:+ at end of line opens a deeper block
# * +[:DEDENT, width]+  once per block closed by a shallower line or by EOF
# * +[:NEWLINE, "\n"]+  between lines that do not open a block
# * +[op, op]+          for multi-character operators and any other single
#   character
# Words starting with a lowercase letter are delegated to
# +tokenize_identifier+, which is expected to return +[token, offset]+.
# Text from +#+ to the end of the line is skipped, as are single spaces.
#
# @param code [String] the source text to tokenize
# @return [Array<Array>] the token pairs in source order
# @raise [RuntimeError] if a block opened with +:+ is not indented deeper
#   than the current level, or if indentation increases without a +:+
def tokenize(code)
  code = code.chomp.gsub(/^$\n/, '')
  tokens = []
  current_indent = 0
  indent_stack = []
  position = 0

  # Alternation is tried left to right, so the three-character operators
  # must come before their two-character prefixes ("<<=" before "<<", etc.);
  # otherwise the compound assignments can never match.
  operator_pattern = %r{\A(
    \*\*=|<<=|>>=|&&=|\|\|=|
    \|\||&&|==|!=|<=|>=|<<|>>|\*\*|
    \+=|-=|\*=|/=|%=|&=|\|=|\^=
  )}x

  while position < code.size
    chunk = code[position..-1]

    if chunk.start_with?('#')
      # Comment: skip up to (not including) the newline so the newline is
      # still seen by the indentation rules on the next pass.
      offset = chunk.index("\n") || chunk.size
    elsif (identifier = chunk[/\A([a-z]\w*)/, 1])
      token, offset = tokenize_identifier(identifier)
      tokens << token
    elsif (constant = chunk[/\A([A-Z]\w*)/, 1])
      tokens << [:CONSTANT, constant]
      offset = constant.size
    elsif (number = chunk[/\A(([0-9]*[.])?[0-9]+)/, 1])
      # NOTE(review): String#to_numeric is not a core Ruby method; it is
      # presumably a project extension -- confirm it is loaded here.
      tokens << [:NUMBER, number.to_numeric]
      offset = number.size
    elsif (string = chunk[/\A"([^"]*)"/, 1])
      tokens << [:STRING, string]
      offset = string.size + 2 # +2 for the surrounding quotes
    elsif (indent = chunk[/\A:\n( +)/m, 1])
      # ":" followed by a newline opens a block; it must be indented deeper.
      if indent.size <= current_indent
        raise "Bad indent level, got #{indent.size} indents, " \
              "expected > #{current_indent}"
      end
      current_indent = indent.size
      indent_stack.push(current_indent)
      tokens << [:INDENT, indent.size]
      offset = indent.size + 2 # +2 for the ":" and the newline
    elsif (indent = chunk[/\A\n( *)/m, 1])
      if indent.size == current_indent
        tokens << [:NEWLINE, "\n"]
      elsif indent.size < current_indent
        # Close every block that is deeper than the new line.
        # NOTE(review): a dedent to a width that matches no open block is
        # not rejected here; current_indent simply drops below it.
        while indent.size < current_indent
          indent_stack.pop
          current_indent = indent_stack.last || 0
          tokens << [:DEDENT, indent.size]
        end
        tokens << [:NEWLINE, "\n"]
      else
        raise "Missing ':'"
      end
      offset = indent.size + 1 # +1 for the newline
    elsif (operator = chunk[operator_pattern, 1])
      tokens << [operator, operator]
      offset = operator.size
    elsif chunk.start_with?(' ')
      offset = 1
    else
      # Any other single character is its own token type.
      value = chunk[0, 1]
      tokens << [value, value]
      offset = 1
    end

    position += offset
  end

  # Close any blocks still open at end of input.
  tokens << [:DEDENT, indent_stack.first || 0] while indent_stack.pop
  tokens
end
|