Class: Graphlyte::Lexer
- Inherits:
-
Object
- Object
- Graphlyte::Lexer
- Defined in:
- lib/graphlyte/lexer.rb
Overview
Transform a string into a stream of tokens - i.e. lexing
Constant Summary collapse
- LINEFEED =
"\u000a"
- CARRIAGE_RETURN =
"\u000d"
- NEW_LINE =
[LINEFEED, CARRIAGE_RETURN].freeze
- HORIZONTAL_TAB =
"\u0009"
- SPACE =
"\u0020"
- WHITESPACE =
[HORIZONTAL_TAB, SPACE].freeze
- COMMENT_CHAR =
'#'
- DOUBLE_QUOTE =
'"'
- BLOCK_QUOTE =
'"""'
- BACK_QUOTE =
'\\'
- COMMA =
','
- UNICODE_BOM =
"\ufeff"
- IGNORED =
[UNICODE_BOM, COMMA, *WHITESPACE].freeze
- PUNCTUATOR =
['!', '$', '&', '(', ')', '...', ':', '=', '@', '[', ']', '{', '|', '}'].freeze
- LETTERS =
%w[ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z ].freeze
- DIGITS =
%w[0 1 2 3 4 5 6 7 8 9].freeze
Instance Attribute Summary collapse
-
#column ⇒ Object
Returns the value of attribute column.
-
#index ⇒ Object
Returns the value of attribute index.
-
#lexeme_start_p ⇒ Object
Returns the value of attribute lexeme_start_p.
-
#line ⇒ Object
Returns the value of attribute line.
-
#source ⇒ Object
readonly
Returns the value of attribute source.
-
#tokens ⇒ Object
readonly
Returns the value of attribute tokens.
Class Method Summary collapse
Instance Method Summary collapse
- #after_source_end_location ⇒ Object
- #block_chars_raw ⇒ Object
- #block_string_content ⇒ Object
-
#chomp_lines(lines) ⇒ Object
Strip leading and trailing blank lines, and remove the trailing line terminator from each line.
- #consume(str = nil) ⇒ Object
- #current_location ⇒ Object
- #current_position ⇒ Object
- #digit?(char) ⇒ Boolean
- #eof? ⇒ Boolean
- #escaped_character ⇒ Object
- #exponent_part ⇒ Object
- #fractional_part ⇒ Object
- #hex_char ⇒ Object
- #ignore_comment_line ⇒ Object
-
#initialize(source) ⇒ Lexer
constructor
A new instance of Lexer.
- #letter?(char) ⇒ Boolean
- #lex_error(msg) ⇒ Object
- #lexical_token ⇒ Object
- #lookahead(offset = 1) ⇒ Object
- #match(str) ⇒ Object
- #name(char) ⇒ Object
- #name_continue?(char) ⇒ Boolean
- #name_start?(char) ⇒ Boolean
- #next_line! ⇒ Object
- #next_token ⇒ Object
- #number(char) ⇒ Object
- #numeric_start?(char) ⇒ Boolean
- #one_of(strings) ⇒ Object
- #punctuator ⇒ Object
- #seek(offset) ⇒ Object
- #skip_line ⇒ Object
- #source_uncompleted? ⇒ Boolean
- #string ⇒ Object
- #string_character(block_string: false) ⇒ Object
- #string_content ⇒ Object
- #string_start?(char) ⇒ Boolean
- #strip_leading_blank_lines(lines) ⇒ Object
- #strip_trailing_blank_lines(lines) ⇒ Object
- #take_while ⇒ Object
- #to_token(type) ⇒ Object
- #tokenize! ⇒ Object
- #underscore?(char) ⇒ Boolean
Constructor Details
Instance Attribute Details
#column ⇒ Object
Returns the value of attribute column.
56 57 58 |
# File 'lib/graphlyte/lexer.rb', line 56 def column @column end |
#index ⇒ Object
Returns the value of attribute index.
56 57 58 |
# File 'lib/graphlyte/lexer.rb', line 56 def index @index end |
#lexeme_start_p ⇒ Object
Returns the value of attribute lexeme_start_p.
56 57 58 |
# File 'lib/graphlyte/lexer.rb', line 56 def lexeme_start_p @lexeme_start_p end |
#line ⇒ Object
Returns the value of attribute line.
56 57 58 |
# File 'lib/graphlyte/lexer.rb', line 56 def line @line end |
#source ⇒ Object (readonly)
Returns the value of attribute source.
55 56 57 |
# File 'lib/graphlyte/lexer.rb', line 55 def source @source end |
#tokens ⇒ Object (readonly)
Returns the value of attribute tokens.
55 56 57 |
# File 'lib/graphlyte/lexer.rb', line 55 def tokens @tokens end |
Class Method Details
.lex(source) ⇒ Object
67 68 69 70 71 72 |
# File 'lib/graphlyte/lexer.rb', line 67
# Convenience entry point: lexes +source+ in one call.
#
# @param source the text to tokenize
# @return the token collection of a fresh lexer after `tokenize!` has run
def self.lex(source)
  new(source).tap { |instance| instance.tokenize! }.tokens
end
Instance Method Details
#after_source_end_location ⇒ Object
86 87 88 |
# File 'lib/graphlyte/lexer.rb', line 86 def after_source_end_location Lexing::Location.eof end |
#block_chars_raw ⇒ Object
215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 |
# File 'lib/graphlyte/lexer.rb', line 215 def block_chars_raw chars = [] terminated = false until eof? || (terminated = consume(BLOCK_QUOTE)) chars << BLOCK_QUOTE if consume("\\#{BLOCK_QUOTE}") chars << '"' while consume(DOUBLE_QUOTE) while (char = string_character(block_string: true)) chars << char end end lex_error('Unterminated string') unless terminated chars end |
#block_string_content ⇒ Object
190 191 192 193 194 195 196 197 198 199 200 |
# File 'lib/graphlyte/lexer.rb', line 190
# Reads a block string and returns its value with surrounding blank lines
# removed and the common leading-space indentation stripped.
#
# @return [String] the de-indented lines joined with LINEFEED
def block_string_content
  chars = block_chars_raw
  lines = chomp_lines(chars.join.lines)

  # Blank lines carry no indentation information. Counting them forced the
  # margin to 0 whenever the text contained an empty line.
  left_margin = lines.reject { |line| line.match?(/\A\s*\z/) }
                     .map { |line| line[/\A */].length }
                     .min || 0

  # A whitespace-only line shorter than the margin slices to nil; keep it as
  # an empty line.
  lines.map { |line| line[left_margin..] || '' }.join(LINEFEED)
end
#chomp_lines(lines) ⇒ Object
Strip leading and trailing blank lines, and remove the trailing line terminator from each line.
203 204 205 |
# File 'lib/graphlyte/lexer.rb', line 203
# Removes the line terminator from every line, then drops blank lines from
# the start and from the end of the list.
#
# @param lines [Array<String>]
# @return [Array<String>]
def chomp_lines(lines)
  bare = lines.map { |line| line.chomp }
  without_head = strip_leading_blank_lines(bare)
  strip_trailing_blank_lines(without_head)
end
#consume(str = nil) ⇒ Object
243 244 245 246 247 248 249 250 251 |
# File 'lib/graphlyte/lexer.rb', line 243
# Advances past the upcoming input.
#
# With +str+, consumes it only when the input matches and returns nil
# otherwise; without an argument, consumes the single lookahead character.
# Both `index` and `column` move forward by the consumed length.
#
# @param str [String, nil] exact text to consume, or nil for one character
# @return [String, nil] the consumed text
def consume(str = nil)
  if str
    return unless match(str)

    taken = str
  else
    taken = lookahead
  end

  self.index += taken.length
  self.column += taken.length
  taken
end
#current_location ⇒ Object
253 254 255 |
# File 'lib/graphlyte/lexer.rb', line 253
# Location spanning from the start of the current lexeme to the current
# position.
def current_location
  start_position = lexeme_start_p
  end_position = current_position
  Lexing::Location.new(start_position, end_position)
end
#current_position ⇒ Object
257 258 259 |
# File 'lib/graphlyte/lexer.rb', line 257 def current_position Lexing::Position.new(line, column) end |
#digit?(char) ⇒ Boolean
382 383 384 |
# File 'lib/graphlyte/lexer.rb', line 382 def digit?(char) DIGITS.include?(char) end |
#eof? ⇒ Boolean
94 95 96 |
# File 'lib/graphlyte/lexer.rb', line 94
# True once `index` has reached or passed the end of `source`.
def eof?
  index >= source.length
end
#escaped_character ⇒ Object
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
# File 'lib/graphlyte/lexer.rb', line 160
# Consumes the character following a backslash and returns the character
# the escape stands for. `u` is delegated to `hex_char`; anything that is
# not a known escape is reported through `lex_error`.
def escaped_character
  c = consume
  return hex_char if c == 'u'

  simple_escapes = {
    DOUBLE_QUOTE => DOUBLE_QUOTE,
    BACK_QUOTE => BACK_QUOTE,
    '/' => '/',
    'b' => "\b",
    'f' => "\f",
    'n' => LINEFEED,
    'r' => "\r",
    't' => "\t"
  }

  simple_escapes.fetch(c) { lex_error("Unexpected escaped character in string: #{c}") }
end
#exponent_part ⇒ Object
349 350 351 352 353 354 355 356 357 358 |
# File 'lib/graphlyte/lexer.rb', line 349
# Reads an optional exponent (`e`/`E`, optional sign, digits).
#
# @return [Array(String, String), nil] `[sign, digits]` where sign is "-",
#   "+" or nil; nil when there is no exponent marker
def exponent_part
  marker = one_of(%w[e E])
  return if marker.nil?

  sign = one_of(%w[- +])

  unless digit?(lookahead)
    lex_error("Expected a digit, got #{lookahead}")
  end

  [sign, take_while { |c| digit?(c) }.join]
end
#fractional_part ⇒ Object
341 342 343 344 345 346 347 |
# File 'lib/graphlyte/lexer.rb', line 341
# Reads an optional fractional part: a '.' that must be followed by at
# least one digit.
#
# @return [Array<String>, nil] the digit characters, or nil when the next
#   character is not '.'
def fractional_part
  if consume('.')
    unless digit?(lookahead)
      lex_error("Expected a digit, got #{lookahead}")
    end

    take_while { |c| digit?(c) }
  end
end
#hex_char ⇒ Object
178 179 180 181 182 183 184 185 186 187 188 |
# File 'lib/graphlyte/lexer.rb', line 178
# Reads the four hex digits of a `\uXXXX` escape and returns the character
# with that code point as a UTF-8 string.
#
# @return [String]
# @raise via `lex_error` when a character is not a hex digit, or when the
#   four digits do not name an encodable code point
def hex_char
  digits = Array.new(4) do
    d = consume
    # Explicit character class: the former exclusive range 'a'...'f'
    # wrongly rejected 'f' and 'F'.
    unless d.match?(/\A[0-9a-fA-F]\z/)
      lex_error("Expected a hex digit in unicode escape sequence. Got #{d.inspect}")
    end

    d
  end

  begin
    # Integer#chr without an encoding raises RangeError above 0xFF, so the
    # target encoding has to be named.
    digits.join.hex.chr(Encoding::UTF_8)
  rescue RangeError
    # Code points such as lone surrogates cannot be encoded.
    lex_error("Invalid unicode escape sequence: \\u#{digits.join}")
  end
end
#ignore_comment_line ⇒ Object
386 387 388 389 390 |
# File 'lib/graphlyte/lexer.rb', line 386
# Skips the rest of a comment, stopping before the line terminator or at the
# end of input.
#
# @return [nil] a comment never produces a token
def ignore_comment_line
  # `lookahead` yields "\0" forever once the input is exhausted, so without
  # the eof? guard a comment on the last line (no trailing newline) would
  # never terminate.
  take_while { |char| !eof? && !NEW_LINE.include?(char) }

  nil
end
#letter?(char) ⇒ Boolean
374 375 376 |
# File 'lib/graphlyte/lexer.rb', line 374 def letter?(char) LETTERS.include?(char) end |
#lex_error(msg) ⇒ Object
111 112 113 |
# File 'lib/graphlyte/lexer.rb', line 111
# Raises a LexError whose message is +msg+ followed by the current
# line and column.
#
# @param msg [String] description of the problem
# @raise [LexError] always
def lex_error(msg)
  message = "#{msg} at #{line}:#{column}"
  raise LexError, message
end
#lexical_token ⇒ Object
279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 |
# File 'lib/graphlyte/lexer.rb', line 279
# Consumes one character and dispatches on it: ignored characters and
# comments yield no token, names/strings/numbers are lexed into tokens, and
# anything else is reported through `lex_error`.
#
# @return [Production] wrapping the token, or wrapping nil when nothing was
#   produced
def lexical_token
  first = consume

  token = case
          when IGNORED.include?(first) then nil
          when first == COMMENT_CHAR then ignore_comment_line
          when name_start?(first) then to_token(:NAME) { name(first) }
          when string_start?(first) then to_token(:STRING) { string }
          when numeric_start?(first) then to_token(:NUMBER) { number(first) }
          else lex_error("Unexpected character: #{first.inspect}")
          end

  Production.new(token)
end
#lookahead(offset = 1) ⇒ Object
98 99 100 101 102 103 |
# File 'lib/graphlyte/lexer.rb', line 98
# Peeks at a character without consuming it.
#
# @param offset [Integer] 1 for the next unconsumed character, 2 for the
#   one after it, and so on
# @return [String] the character, or "\0" when the position is at or past
#   the end of `source`
def lookahead(offset = 1)
  position = index + offset - 1
  position < source.length ? source[position] : "\0"
end
#match(str) ⇒ Object
105 106 107 108 109 |
# File 'lib/graphlyte/lexer.rb', line 105
# Tests whether the upcoming input is exactly +str+, without consuming it.
#
# @param str [String]
# @return [Boolean]
def match(str)
  str.each_char.with_index(1).all? do |expected, ahead|
    lookahead(ahead) == expected
  end
end
#name(char) ⇒ Object
360 361 362 363 364 |
# File 'lib/graphlyte/lexer.rb', line 360
# Reads the remainder of a name whose first character was already consumed.
#
# @param char [String] the first character of the name
# @return [String] the complete name
def name(char)
  [char, *take_while { |c| name_continue?(c) }].join
end
#name_continue?(char) ⇒ Boolean
370 371 372 |
# File 'lib/graphlyte/lexer.rb', line 370
# True when +char+ may appear after the first character of a name:
# a letter, an underscore or a digit.
def name_continue?(char)
  name_start?(char) || digit?(char)
end
#name_start?(char) ⇒ Boolean
366 367 368 |
# File 'lib/graphlyte/lexer.rb', line 366
# True when +char+ may begin a name: an underscore or a letter.
def name_start?(char)
  underscore?(char) || letter?(char)
end
#next_line! ⇒ Object
298 299 300 301 |
# File 'lib/graphlyte/lexer.rb', line 298
# Moves the position bookkeeping to the start of the following line.
def next_line!
  self.column = 1
  self.line = line + 1
end
#next_token ⇒ Object
261 262 263 |
# File 'lib/graphlyte/lexer.rb', line 261
# Tries, in order, a punctuator, a line terminator and any other lexical
# token, and returns the token held by the first production that applies.
def next_token
  production = punctuator || skip_line || lexical_token
  production.token
end
#number(char) ⇒ Object
326 327 328 329 330 331 332 333 334 335 336 337 338 339 |
# File 'lib/graphlyte/lexer.rb', line 326
# Reads a numeric literal whose first character was already consumed.
#
# @param char [String] the first character: '-' or a digit
# @return [Syntax::NumericLiteral]
def number(char)
  negative = char == '-'
  # A leading '-' is recorded through `negated`, not as part of the digits.
  leading = negative ? [] : [char]
  integer_digits = leading + take_while { |c| digit?(c) }

  fraction = fractional_part
  exponent = exponent_part

  Syntax::NumericLiteral.new(
    integer_part: integer_digits.join(''),
    fractional_part: fraction&.join(''),
    exponent_part: exponent,
    negated: negative
  )
end
#numeric_start?(char) ⇒ Boolean
307 308 309 310 311 312 313 314 315 316 |
# File 'lib/graphlyte/lexer.rb', line 307
# Decides whether +char+ (already consumed) begins a numeric literal.
#
# A leading zero may not be followed by further digits. The same rule is
# applied after a minus sign, so "-01" is rejected just as "01" is.
#
# @param char [String] the character that was just consumed
# @return [Boolean]
def numeric_start?(char)
  case char
  when '-'
    following = lookahead
    # A sign must be followed by a digit, and "-0" must not continue with
    # another digit.
    DIGITS.include?(following) && !(following == '0' && DIGITS.include?(lookahead(2)))
  when '0'
    !DIGITS.include?(lookahead)
  else
    DIGITS.include?(char)
  end
end
#one_of(strings) ⇒ Object
115 116 117 118 119 120 121 |
# File 'lib/graphlyte/lexer.rb', line 115
# Consumes the first of +strings+ that matches the upcoming input.
#
# @param strings [Array<String>] candidates, tried in order
# @return [String, nil] the consumed candidate, or nil when none matched
def one_of(strings)
  strings.find { |candidate| consume(candidate) }
end
#punctuator ⇒ Object
265 266 267 268 269 |
# File 'lib/graphlyte/lexer.rb', line 265
# Tries to consume one of the PUNCTUATOR strings.
#
# @return [Production, nil] a production holding a :PUNCTUATOR token, or nil
#   when the input does not start with a punctuator
def punctuator
  symbol = one_of(PUNCTUATOR)
  return unless symbol

  token = Lexing::Token.new(:PUNCTUATOR, symbol, current_location)
  Production.new(token)
end
#seek(offset) ⇒ Object
239 240 241 |
# File 'lib/graphlyte/lexer.rb', line 239
# Moves `index` by +offset+ characters; `column` is left untouched.
#
# @param offset [Integer]
def seek(offset)
  self.index = index + offset
end
#skip_line ⇒ Object
271 272 273 274 275 276 277 |
# File 'lib/graphlyte/lexer.rb', line 271
# Consumes one line terminator (CRLF, LF or a lone CR) and advances the
# line counter.
#
# @return [Production, nil] an empty production when a terminator was
#   consumed, nil otherwise
def skip_line
  # CRLF is tried first so it counts as a single terminator. A lone carriage
  # return is accepted too; it used to fall through and be reported as an
  # unexpected character.
  terminator = one_of(["#{CARRIAGE_RETURN}#{LINEFEED}", LINEFEED, CARRIAGE_RETURN])
  return unless terminator

  next_line!
  Production.new(nil)
end
#source_uncompleted? ⇒ Boolean
90 91 92 |
# File 'lib/graphlyte/lexer.rb', line 90
# True while there is still unconsumed input, i.e. `index` is inside `source`.
def source_uncompleted?
  source.length > index
end
#string ⇒ Object
123 124 125 126 127 128 129 130 131 132 |
# File 'lib/graphlyte/lexer.rb', line 123
# Reads a string literal whose opening quote was already consumed and
# returns its value. Handles the empty string, block strings (two further
# quotes) and ordinary strings.
def string
  empty = lookahead == DOUBLE_QUOTE && lookahead(2) != DOUBLE_QUOTE

  if empty
    # The closing quote follows immediately: the empty string.
    consume
    return ''
  end

  # Two more quotes complete the opening delimiter of a block string.
  return block_string_content if consume('""')

  string_content
end
#string_character(block_string: false) ⇒ Object
145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
# File 'lib/graphlyte/lexer.rb', line 145
# Consumes and returns the next piece of string content.
#
# In a regular string a backslash starts an escape sequence and a line
# terminator is illegal. In a block string the text is raw: line terminators
# are allowed and the only escape is a backslash followed by the block
# delimiter, which yields the delimiter itself.
#
# @param block_string [Boolean] whether a block string is being read
# @return [String, nil] the content, or nil at end of input or when the next
#   character is a double quote
def string_character(block_string: false)
  return if eof?
  return if lookahead == DOUBLE_QUOTE

  c = consume

  lex_error("Illegal character #{c.inspect}") if !block_string && NEW_LINE.include?(c)

  return c unless c == BACK_QUOTE
  return escaped_character unless block_string

  # Block strings do not process escape sequences; any other backslash is
  # kept literally instead of being decoded (or rejected) as an escape.
  consume(BLOCK_QUOTE) || c
end
#string_content ⇒ Object
134 135 136 137 138 139 140 141 142 143 |
# File 'lib/graphlyte/lexer.rb', line 134
# Reads the body of a regular string up to and including its closing quote.
#
# @return [String] the string's value
# @raise via `lex_error('Unterminated string')` when no closing quote follows
def string_content
  chars = []

  loop do
    char = string_character
    break unless char

    chars << char
  end

  lex_error('Unterminated string') unless consume(DOUBLE_QUOTE)

  chars.join
end
#string_start?(char) ⇒ Boolean
303 304 305 |
# File 'lib/graphlyte/lexer.rb', line 303 def string_start?(char) char == '"' end |
#strip_leading_blank_lines(lines) ⇒ Object
207 208 209 |
# File 'lib/graphlyte/lexer.rb', line 207
# Returns +lines+ without the blank (empty or whitespace-only) lines at the
# front. The input array is not modified.
#
# @param lines [Array<String>]
# @return [Array<String>]
def strip_leading_blank_lines(lines)
  first_content = lines.index { |line| line !~ /^\s*$/ }
  first_content ? lines.drop(first_content) : []
end
#strip_trailing_blank_lines(lines) ⇒ Object
211 212 213 |
# File 'lib/graphlyte/lexer.rb', line 211
# Returns +lines+ without the blank lines at the end, by reversing the list,
# stripping its leading blank lines and reversing it back.
#
# @param lines [Array<String>]
# @return [Array<String>]
def strip_trailing_blank_lines(lines)
  backwards = lines.reverse
  trimmed = strip_leading_blank_lines(backwards)
  trimmed.reverse
end
#take_while ⇒ Object
232 233 234 235 236 237 |
# File 'lib/graphlyte/lexer.rb', line 232
# Consumes characters for as long as the block accepts the lookahead
# character, stopping at the end of input.
#
# @yieldparam char [String] the next unconsumed character
# @yieldreturn [Boolean] whether to consume it
# @return [Array<String>] the consumed characters
def take_while
  chars = []
  # `lookahead` keeps returning "\0" past the end of the source, so a
  # predicate that accepts "\0" would otherwise consume forever.
  chars << consume until eof? || !yield(lookahead)

  chars
end
#to_token(type) ⇒ Object
318 319 320 321 322 323 324 |
# File 'lib/graphlyte/lexer.rb', line 318
# Builds a token of +type+ around the lexeme read by the block.
#
# The first character of the lexeme has already been consumed by the caller,
# so the lexeme starts one character before the current `index`.
#
# @param type [Symbol] the token type, e.g. :NAME
# @yieldreturn the parsed value of the lexeme
# @return [Lexing::Token]
def to_token(type)
  first = index - 1
  value = yield
  # `index` now points at the first character that was NOT consumed, so the
  # range must exclude it; the inclusive range added one trailing character.
  past_end = index

  Lexing::Token.new(type, source[first...past_end], current_location, value: value)
end
#tokenize! ⇒ Object
74 75 76 77 78 79 80 81 82 83 84 |
# File 'lib/graphlyte/lexer.rb', line 74
# Lexes the whole source, appending every produced token to `tokens` and
# finishing with an :EOF token.
def tokenize!
  until eof?
    # Remember where this lexeme begins so its location can be reported.
    self.lexeme_start_p = current_position

    found = next_token
    tokens << found if found
  end

  eof_token = Lexing::Token.new(:EOF, nil, after_source_end_location)
  tokens << eof_token
end
#underscore?(char) ⇒ Boolean
378 379 380 |
# File 'lib/graphlyte/lexer.rb', line 378 def underscore?(char) char == '_' end |