Class: Graphlyte::Lexer

Inherits:
Object
  • Object
show all
Defined in:
lib/graphlyte/lexer.rb

Overview

Transform a string into a stream of tokens - i.e. lexing

Constant Summary collapse

# Line terminator characters.
LINEFEED =
"\u000a"
CARRIAGE_RETURN =
"\u000d"
NEW_LINE =
[LINEFEED, CARRIAGE_RETURN].freeze
# Horizontal whitespace characters.
HORIZONTAL_TAB =
"\u0009"
SPACE =
"\u0020"
WHITESPACE =
[HORIZONTAL_TAB, SPACE].freeze
# Starts a comment; #ignore_comment_line then skips up to the next NEW_LINE character.
COMMENT_CHAR =
'#'
# String delimiters: one quote for ordinary strings, three for block strings.
DOUBLE_QUOTE =
'"'
BLOCK_QUOTE =
'"""'
# NOTE: despite the name, this is a single backslash; it introduces escape
# sequences inside strings (see #string_character).
BACK_QUOTE =
'\\'
COMMA =
','
UNICODE_BOM =
"\ufeff"
# Characters that #lexical_token consumes without producing a token.
IGNORED =
[UNICODE_BOM, COMMA, *WHITESPACE].freeze
# Lexemes recognised by #punctuator.
PUNCTUATOR =
['!', '$', '&', '(', ')', '...', ':', '=', '@', '[', ']', '{', '|', '}'].freeze
# ASCII letters accepted by #letter?.
LETTERS =
%w[
  A B C D E F G H I J K L M
  N O P Q R S T U V W X Y Z
  a b c d e f g h i j k l m
  n o p q r s t u v w x y z
].freeze
# ASCII decimal digits accepted by #digit?.
DIGITS =
%w[0 1 2 3 4 5 6 7 8 9].freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(source) ⇒ Lexer

Returns a new instance of Lexer.



58
59
60
61
62
63
64
65
# File 'lib/graphlyte/lexer.rb', line 58

# Set up a lexer over +source+ with an empty token list, positioned at the
# first character (index 0, line 1, column 1).
#
# @param source the text to tokenize
def initialize(source)
  @source = source
  @index = 0
  @line, @column = 1, 1
  @lexeme_start_p = Lexing::Position.new(0, 0)
  @tokens = []
end

Instance Attribute Details

#column ⇒ Object

Returns the value of attribute column.



56
57
58
# File 'lib/graphlyte/lexer.rb', line 56

# Current column on the current line. Starts at 1, is advanced by #consume
# and reset to 1 by #next_line!.
def column
  @column
end

#index ⇒ Object

Returns the value of attribute index.



56
57
58
# File 'lib/graphlyte/lexer.rb', line 56

# Offset into #source of the next character to be read. Starts at 0 and is
# advanced by #consume and #seek.
def index
  @index
end

#lexeme_start_p ⇒ Object

Returns the value of attribute lexeme_start_p.



56
57
58
# File 'lib/graphlyte/lexer.rb', line 56

# Position at which the lexeme currently being read started; #tokenize!
# sets it to #current_position before reading each token.
def lexeme_start_p
  @lexeme_start_p
end

#line ⇒ Object

Returns the value of attribute line.



56
57
58
# File 'lib/graphlyte/lexer.rb', line 56

# Current line number. Starts at 1 and is incremented by #next_line!.
def line
  @line
end

#source ⇒ Object (readonly)

Returns the value of attribute source.



55
56
57
# File 'lib/graphlyte/lexer.rb', line 55

# The text being lexed, as given to #initialize.
def source
  @source
end

#tokens ⇒ Object (readonly)

Returns the value of attribute tokens.



55
56
57
# File 'lib/graphlyte/lexer.rb', line 55

# Tokens collected so far; #tokenize! appends to this array.
def tokens
  @tokens
end

Class Method Details

.lex(source) ⇒ Object



67
68
69
70
71
72
# File 'lib/graphlyte/lexer.rb', line 67

# Tokenize +source+ in one call: builds a lexer, runs #tokenize! on it and
# returns the lexer's token list.
#
# @param source the text to tokenize
# @return the tokens produced by the lexer
def self.lex(source)
  new(source).tap(&:tokenize!).tokens
end

Instance Method Details

#after_source_end_location ⇒ Object



86
87
88
# File 'lib/graphlyte/lexer.rb', line 86

# Location attached to the :EOF token that #tokenize! appends; delegates to
# Lexing::Location.eof.
def after_source_end_location
  Lexing::Location.eof
end

#block_chars_raw ⇒ Object



215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# File 'lib/graphlyte/lexer.rb', line 215

# Collect the raw pieces of a block string, consuming input up to and
# including the closing BLOCK_QUOTE.
#
# Returns an Array of strings. Calls #lex_error with 'Unterminated string'
# when the input ends before a closing BLOCK_QUOTE has been consumed.
def block_chars_raw
  chars = []
  terminated = false

  # Every pass first tries to consume the closing delimiter; `terminated`
  # records whether the loop stopped for that reason rather than at EOF.
  until eof? || (terminated = consume(BLOCK_QUOTE))
    # A backslash followed by a triple quote contributes a literal triple quote.
    chars << BLOCK_QUOTE if consume("\\#{BLOCK_QUOTE}")
    # Keep any double quotes found here as content.
    # NOTE(review): when the escaped triple quote above is immediately followed
    # by the closing delimiter, this loop looks like it would swallow the
    # delimiter as content — confirm.
    chars << '"' while consume(DOUBLE_QUOTE)
    # string_character returns nil at a double quote or at end of input.
    # NOTE(review): it also runs #escaped_character on any backslash, so
    # ordinary escapes are interpreted inside block strings — confirm intended.
    while (char = string_character(block_string: true))
      chars << char
    end
  end

  lex_error('Unterminated string') unless terminated

  chars
end

#block_string_content ⇒ Object



190
191
192
193
194
195
196
197
198
199
200
# File 'lib/graphlyte/lexer.rb', line 190

# Read the body of a block string and return its value with the common
# leading indentation removed.
#
# Leading and trailing blank lines are dropped (see #chomp_lines) and the
# remaining lines are joined with LINEFEED.
#
# @return [String] the de-indented block string value
def block_string_content
  lines = chomp_lines(block_chars_raw.join.lines)

  # Blank lines carry no indentation information. Counting them would force
  # the margin to 0 whenever the block contains an empty line, leaving every
  # other line indented.
  left_margin = lines.reject { _1.strip.empty? }
                     .map { |line| line[/\A */].length }
                     .min || 0

  # A whitespace-only line may be shorter than the margin; slicing then
  # yields nil, which is rendered as an empty line.
  lines.map { _1[left_margin..].to_s }.join(LINEFEED)
end

#chomp_lines(lines) ⇒ Object

Strip leading and trailing blank lines, and remove the line terminator from the end of each line



203
204
205
# File 'lib/graphlyte/lexer.rb', line 203

# Remove the trailing line terminator from every line, then drop the blank
# lines at the start and at the end of the list.
#
# @param lines [Array<String>] lines, possibly still ending in a terminator
# @return [Array<String>] the trimmed list of lines
def chomp_lines(lines)
  unterminated = lines.map(&:chomp)
  without_leading = strip_leading_blank_lines(unterminated)
  strip_trailing_blank_lines(without_leading)
end

#consume(str = nil) ⇒ Object



243
244
245
246
247
248
249
250
251
# File 'lib/graphlyte/lexer.rb', line 243

# Advance past the next piece of input and return it.
#
# Without an argument the single lookahead character is consumed. With
# +str+, the input is consumed only when it starts with +str+; otherwise
# nothing changes and nil is returned. Both #index and #column move forward
# by the length of what was consumed.
#
# @param str [String, nil] exact text to consume, or nil for one character
# @return [String, nil] the consumed text, or nil when +str+ did not match
def consume(str = nil)
  if str
    return unless match(str)

    taken = str
  else
    taken = lookahead
  end

  width = taken.length
  self.index += width
  self.column += width
  taken
end

#current_location ⇒ Object



253
254
255
# File 'lib/graphlyte/lexer.rb', line 253

# Location spanning from the start of the current lexeme (#lexeme_start_p)
# to the current position.
def current_location
  started_at = lexeme_start_p
  ended_at = current_position
  Lexing::Location.new(started_at, ended_at)
end

#current_position ⇒ Object



257
258
259
# File 'lib/graphlyte/lexer.rb', line 257

# Snapshot of the lexer's current #line and #column as a Lexing::Position.
def current_position
  row = line
  col = column
  Lexing::Position.new(row, col)
end

#digit?(char) ⇒ Boolean

Returns:

  • (Boolean)


382
383
384
# File 'lib/graphlyte/lexer.rb', line 382

# Whether +char+ is one of the decimal digits listed in DIGITS.
#
# @param char [String] a single character
# @return [Boolean]
def digit?(char)
  DIGITS.member?(char)
end

#eof? ⇒ Boolean

Returns:

  • (Boolean)


94
95
96
# File 'lib/graphlyte/lexer.rb', line 94

# Whether the whole source has been read, i.e. #index has reached or passed
# the end of #source (the inverse of #source_uncompleted?).
#
# @return [Boolean]
def eof?
  index >= source.length
end

#escaped_character ⇒ Object



160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# File 'lib/graphlyte/lexer.rb', line 160

# Consume the character following a backslash and return the text it stands
# for. `u` delegates to #hex_char; any character that is not a known escape
# is reported through #lex_error.
#
# @return [String] the unescaped character
def escaped_character
  code = consume
  return hex_char if code == 'u'

  simple_escapes = {
    DOUBLE_QUOTE => DOUBLE_QUOTE,
    BACK_QUOTE => BACK_QUOTE,
    '/' => '/',
    'b' => "\b",
    'f' => "\f",
    'n' => LINEFEED,
    'r' => "\r",
    't' => "\t"
  }

  simple_escapes.fetch(code) do
    lex_error("Unexpected escaped character in string: #{code}")
  end
end

#exponent_part ⇒ Object



349
350
351
352
353
354
355
356
357
358
# File 'lib/graphlyte/lexer.rb', line 349

# Read an optional exponent: `e` or `E`, an optional sign, then digits.
#
# @return [Array, nil] `[sign, digits]` where sign is '-', '+' or nil and
#   digits is a String; nil when the input does not continue with e/E
def exponent_part
  return if one_of(%w[e E]).nil?

  sign = one_of(%w[- +])
  lex_error("Expected a digit, got #{lookahead}") unless digit?(lookahead)

  [sign, take_while { |char| digit?(char) }.join]
end

#fractional_part ⇒ Object



341
342
343
344
345
346
347
# File 'lib/graphlyte/lexer.rb', line 341

# Read an optional fractional part: a `.` followed by at least one digit.
#
# @return [Array<String>, nil] the digits after the point, or nil when the
#   input does not continue with a `.`
def fractional_part
  if consume('.')
    lex_error("Expected a digit, got #{lookahead}") unless digit?(lookahead)

    take_while { |char| digit?(char) }
  end
end

#hex_char ⇒ Object



178
179
180
181
182
183
184
185
186
187
188
# File 'lib/graphlyte/lexer.rb', line 178

# Read the four hexadecimal digits of a `\uXXXX` escape and return the
# character they encode.
#
# Any hex digit (0-9, a-f, A-F) is accepted, and the result is built as a
# UTF-8 character so code points above 255 are supported.
#
# @return [String] the decoded character (UTF-8)
# @raise via #lex_error when a digit is not hexadecimal or the code point
#   cannot be represented in UTF-8 (e.g. a lone surrogate)
def hex_char
  digits = Array.new(4) do
    d = consume
    lex_error("Expected a hex digit in unicode escape sequence. Got #{d.inspect}") unless d.match?(/\A\h\z/)

    d
  end
  code = digits.join

  begin
    # Integer#chr without an encoding only handles 0..255.
    code.hex.chr(Encoding::UTF_8)
  rescue RangeError
    lex_error("Invalid unicode escape sequence \\u#{code}")
  end
end

#ignore_comment_line ⇒ Object



386
387
388
389
390
# File 'lib/graphlyte/lexer.rb', line 386

# Skip the rest of a comment: consumes characters up to, but not including,
# the next line terminator or the end of the source.
#
# The end-of-source check matters because #lookahead keeps returning "\0"
# past the end, which is not a line terminator; without it a comment on the
# last line with no trailing newline would never stop being consumed.
#
# @return [nil] comments produce no token
def ignore_comment_line
  take_while { !eof? && !NEW_LINE.include?(_1) }

  nil
end

#letter?(char) ⇒ Boolean

Returns:

  • (Boolean)


374
375
376
# File 'lib/graphlyte/lexer.rb', line 374

# Whether +char+ is one of the ASCII letters listed in LETTERS.
#
# @param char [String] a single character
# @return [Boolean]
def letter?(char)
  LETTERS.member?(char)
end

#lex_error(msg) ⇒ Object

Raises:

  • (LexError)



111
112
113
# File 'lib/graphlyte/lexer.rb', line 111

# Abort lexing with a LexError whose message is +msg+ followed by the
# current line and column.
#
# @param msg [String] description of the problem
# @raise [LexError] always
def lex_error(msg)
  where = "#{line}:#{column}"
  raise LexError, "#{msg} at #{where}"
end

#lexical_token ⇒ Object



279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
# File 'lib/graphlyte/lexer.rb', line 279

# Consume one character and lex whatever starts there.
#
# Ignored characters and comments yield a Production wrapping nil; names,
# strings and numbers yield a Production wrapping the token built by
# #to_token. Any other character is reported through #lex_error.
#
# @return [Production]
def lexical_token
  char = consume
  return Production.new(nil) if IGNORED.include?(char)
  return Production.new(ignore_comment_line) if char == COMMENT_CHAR

  token =
    if name_start?(char)
      to_token(:NAME) { name(char) }
    elsif string_start?(char)
      to_token(:STRING) { string }
    elsif numeric_start?(char)
      to_token(:NUMBER) { number(char) }
    else
      lex_error("Unexpected character: #{char.inspect}")
    end

  Production.new(token)
end

#lookahead(offset = 1) ⇒ Object



98
99
100
101
102
103
# File 'lib/graphlyte/lexer.rb', line 98

# Peek at a character without consuming it.
#
# @param offset [Integer] 1 for the next unread character, 2 for the one
#   after it, and so on
# @return [String] the character, or "\0" when that position is at or past
#   the end of the source
def lookahead(offset = 1)
  position = index + offset - 1
  position < source.length ? source[position] : "\0"
end

#match(str) ⇒ Object



105
106
107
108
109
# File 'lib/graphlyte/lexer.rb', line 105

# Whether the unread input starts with +str+. Nothing is consumed.
#
# @param str [String] text to compare against the lookahead characters
# @return [Boolean] true for an empty +str+
def match(str)
  (1..str.length).all? { |offset| lookahead(offset) == str[offset - 1] }
end

#name(char) ⇒ Object



360
361
362
363
364
# File 'lib/graphlyte/lexer.rb', line 360

# Read the rest of a name whose first character, +char+, was already
# consumed.
#
# @param char [String] the first character of the name
# @return [String] the complete name
def name(char)
  rest = take_while { |c| name_continue?(c) }

  "#{char}#{rest.join}"
end

#name_continue?(char) ⇒ Boolean

Returns:

  • (Boolean)


370
371
372
# File 'lib/graphlyte/lexer.rb', line 370

# Whether +char+ may appear after the first character of a name: a letter,
# an underscore or a digit.
#
# @param char [String] a single character
# @return [Boolean]
def name_continue?(char)
  return true if letter?(char) || underscore?(char)

  digit?(char)
end

#name_start?(char) ⇒ Boolean

Returns:

  • (Boolean)


366
367
368
# File 'lib/graphlyte/lexer.rb', line 366

# Whether +char+ may begin a name: a letter or an underscore.
#
# @param char [String] a single character
# @return [Boolean]
def name_start?(char)
  return true if letter?(char)

  underscore?(char)
end

#next_line! ⇒ Object



298
299
300
301
# File 'lib/graphlyte/lexer.rb', line 298

# Move the position bookkeeping to the start of the following line:
# increments #line and resets #column to 1.
def next_line!
  self.line = line.succ
  self.column = 1
end

#next_token ⇒ Object



261
262
263
# File 'lib/graphlyte/lexer.rb', line 261

# Lex the next piece of input, trying a punctuator first, then a line
# terminator, then any other lexical token.
#
# @return the token of the first production that applied; nil when that
#   production carries no token (e.g. a skipped line or ignored character)
def next_token
  production = punctuator || skip_line || lexical_token
  production.token
end

#number(char) ⇒ Object



326
327
328
329
330
331
332
333
334
335
336
337
338
339
# File 'lib/graphlyte/lexer.rb', line 326

# Read a numeric literal whose first character, +char+, was already
# consumed (either a digit or a leading '-').
#
# @param char [String] the first character of the literal
# @return [Syntax::NumericLiteral] with the integer digits as a String, the
#   fractional digits as a String or nil, the result of #exponent_part, and
#   whether the literal is negated
def number(char)
  negative = char == '-'

  # A leading '-' is recorded through +negated+, not as part of the digits.
  leading = negative ? [] : [char]
  integer_digits = leading + take_while { digit?(_1) }

  fraction = fractional_part
  exponent = exponent_part

  Syntax::NumericLiteral.new(
    negated: negative,
    integer_part: integer_digits.join,
    fractional_part: fraction&.join,
    exponent_part: exponent
  )
end

#numeric_start?(char) ⇒ Boolean

Returns:

  • (Boolean)


307
308
309
310
311
312
313
314
315
316
# File 'lib/graphlyte/lexer.rb', line 307

# Whether +char+ (already consumed) begins a numeric literal.
#
# * '-' does only when a digit follows;
# * '0' does only when no digit follows (no leading zeros);
# * any other digit always does.
#
# @param char [String] a single character
# @return [Boolean]
def numeric_start?(char)
  return DIGITS.include?(lookahead) if char == '-'
  return !DIGITS.include?(lookahead) if char == '0'

  DIGITS.include?(char)
end

#one_of(strings) ⇒ Object



115
116
117
118
119
120
121
# File 'lib/graphlyte/lexer.rb', line 115

# Try to consume each candidate in order and stop at the first success.
#
# @param strings [Array<String>] candidates, in priority order
# @return [String, nil] the candidate that was consumed, or nil if none was
def one_of(strings)
  strings.find { |candidate| consume(candidate) }
end

#punctuator ⇒ Object



265
266
267
268
269
# File 'lib/graphlyte/lexer.rb', line 265

# Try to consume one of the PUNCTUATOR lexemes.
#
# @return [Production, nil] a Production wrapping a :PUNCTUATOR token, or
#   nil when the input does not start with a punctuator
def punctuator
  symbol = one_of(PUNCTUATOR)
  return unless symbol

  token = Lexing::Token.new(:PUNCTUATOR, symbol, current_location)
  Production.new(token)
end

#seek(offset) ⇒ Object



239
240
241
# File 'lib/graphlyte/lexer.rb', line 239

# Move #index by +offset+ characters (negative values move backwards).
# Only the index changes; line and column are left as they are.
#
# @param offset [Integer] number of characters to move by
# @return [Integer] the new index
def seek(offset)
  self.index = index + offset
end

#skip_line ⇒ Object



271
272
273
274
275
276
277
# File 'lib/graphlyte/lexer.rb', line 271

# Consume a line terminator, if one is next, and advance the line counter.
#
# Recognises CR LF, a lone LF and a lone CR. CR LF is tried first so that
# the pair counts as a single line break.
#
# @return [Production, nil] an empty Production when a terminator was
#   consumed, nil otherwise
def skip_line
  terminator = one_of(["#{CARRIAGE_RETURN}#{LINEFEED}", LINEFEED, CARRIAGE_RETURN])
  return unless terminator

  next_line!
  Production.new(nil)
end

#source_uncompleted? ⇒ Boolean

Returns:

  • (Boolean)


90
91
92
# File 'lib/graphlyte/lexer.rb', line 90

# Whether there is still unread input, i.e. #index is before the end of
# #source.
#
# @return [Boolean]
def source_uncompleted?
  source.length > index
end

#string ⇒ Object



123
124
125
126
127
128
129
130
131
132
# File 'lib/graphlyte/lexer.rb', line 123

# Read a string literal whose opening double quote was already consumed.
#
# Two further quotes open a block string; a single further quote closes an
# empty string; anything else is the body of an ordinary string.
#
# @return [String] the string value
def string
  return block_string_content if consume('""')
  return string_content unless lookahead == DOUBLE_QUOTE

  # Exactly one quote follows the opening quote: the empty string.
  consume
  ''
end

#string_character(block_string: false) ⇒ Object



145
146
147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/graphlyte/lexer.rb', line 145

# Read one character of string content.
#
# Stops (returning nil, consuming nothing) at the end of input or at a
# double quote. A backslash hands over to #escaped_character. Line
# terminators are reported through #lex_error unless +block_string+ is set.
#
# @param block_string [Boolean] whether line terminators are allowed
# @return [String, nil] the character read, or nil when content ends here
def string_character(block_string: false)
  return if eof? || lookahead == DOUBLE_QUOTE

  char = consume
  lex_error("Illegal character #{char.inspect}") if NEW_LINE.include?(char) && !block_string

  char == BACK_QUOTE ? escaped_character : char
end

#string_contentObject



134
135
136
137
138
139
140
141
142
143
# File 'lib/graphlyte/lexer.rb', line 134

# Read the body of an ordinary (single-line) string up to and including its
# closing double quote.
#
# @return [String] the string value
# @raise via #lex_error with 'Unterminated string' when no closing quote
#   follows the content
def string_content
  pieces = []
  until (piece = string_character).nil?
    pieces.push(piece)
  end

  closed = consume(DOUBLE_QUOTE)
  lex_error('Unterminated string') unless closed

  pieces.join
end

#string_start?(char) ⇒ Boolean

Returns:

  • (Boolean)


303
304
305
# File 'lib/graphlyte/lexer.rb', line 303

# Whether +char+ opens a string literal, i.e. is a double quote.
#
# @param char [String] a single character
# @return [Boolean]
def string_start?(char)
  char == DOUBLE_QUOTE
end

#strip_leading_blank_lines(lines) ⇒ Object



207
208
209
# File 'lib/graphlyte/lexer.rb', line 207

# Drop the blank lines (empty or whitespace-only) at the start of +lines+.
#
# @param lines [Array<String>]
# @return [Array<String>] a new array starting at the first non-blank line
def strip_leading_blank_lines(lines)
  lines.drop_while { |line| line.match?(/^\s*$/) }
end

#strip_trailing_blank_lines(lines) ⇒ Object



211
212
213
# File 'lib/graphlyte/lexer.rb', line 211

# Drop the blank lines at the end of +lines+ by reversing the list,
# stripping its leading blank lines and reversing it back.
#
# @param lines [Array<String>]
# @return [Array<String>] a new array ending at the last non-blank line
def strip_trailing_blank_lines(lines)
  reversed = lines.reverse
  strip_leading_blank_lines(reversed).reverse
end

#take_while ⇒ Object



232
233
234
235
236
237
# File 'lib/graphlyte/lexer.rb', line 232

# Consume characters for as long as the block returns a truthy value for
# the next unread character, stopping at the end of the source.
#
# The end-of-source check is needed because #lookahead keeps returning "\0"
# past the end; a predicate that accepts "\0" (such as "anything but a
# newline") would otherwise never stop.
#
# @yieldparam char [String] the next unread character
# @return [Array<String>] the consumed characters, in order
def take_while
  chars = []
  chars << consume while !eof? && yield(lookahead)

  chars
end

#to_token(type) ⇒ Object



318
319
320
321
322
323
324
# File 'lib/graphlyte/lexer.rb', line 318

# Build a token of +type+ around the lexeme read by the block.
#
# The first character of the lexeme has already been consumed when this is
# called, so the lexeme starts one character before #index. The block
# consumes the rest and returns the token's value.
#
# @param type [Symbol] token type, e.g. :NAME
# @yieldreturn the parsed value of the token
# @return [Lexing::Token]
def to_token(type)
  first = index - 1
  value = yield
  # After the block, #index is the first character NOT in the lexeme, so
  # the range must exclude it.
  past_end = index

  Lexing::Token.new(type, source[first...past_end], current_location, value: value)
end

#tokenize! ⇒ Object



74
75
76
77
78
79
80
81
82
83
84
# File 'lib/graphlyte/lexer.rb', line 74

# Lex the whole source, appending every token found to #tokens and
# finishing with an :EOF token.
#
# Before each token is read, #lexeme_start_p is set to the current position.
# Productions that yield no token are skipped.
#
# @return the token list
def tokenize!
  while source_uncompleted?
    self.lexeme_start_p = current_position

    found = next_token
    tokens.push(found) if found
  end

  eof_token = Lexing::Token.new(:EOF, nil, after_source_end_location)
  tokens.push(eof_token)
end

#underscore?(char) ⇒ Boolean

Returns:

  • (Boolean)


378
379
380
# File 'lib/graphlyte/lexer.rb', line 378

# Whether +char+ is an underscore; used by #name_start? and #name_continue?.
#
# @param char [String] a single character
# @return [Boolean]
def underscore?(char)
  char == '_'
end