Class: Graphlyte::Lexer

Inherits:

Object

Object
Graphlyte::Lexer

show all

Defined in: lib/graphlyte/lexer.rb

Overview

Transform a string into a stream of tokens - i.e. lexing

Constant Summary collapse

LINEFEED =

"\u000a"

CARRIAGE_RETURN =

"\u000d"

NEW_LINE =

[LINEFEED, CARRIAGE_RETURN].freeze

HORIZONTAL_TAB =

"\u0009"

SPACE =

"\u0020"

WHITESPACE =

[HORIZONTAL_TAB, SPACE].freeze

COMMENT_CHAR =

'#'

DOUBLE_QUOTE =

'"'

BLOCK_QUOTE =

'"""'

BACK_QUOTE =

'\\'

COMMA =

','

UNICODE_BOM =

"\ufeff"

IGNORED =

[UNICODE_BOM, COMMA, *WHITESPACE].freeze

PUNCTUATOR =

['!', '$', '&', '(', ')', '...', ':', '=', '@', '[', ']', '{', '|', '}'].freeze

LETTERS =

%w[
  A B C D E F G H I J K L M
  N O P Q R S T U V W X Y Z
  a b c d e f g h i j k l m
  n o p q r s t u v w x y z
].freeze

DIGITS =

%w[0 1 2 3 4 5 6 7 8 9].freeze

Instance Attribute Summary collapse

#column ⇒ Object

Returns the value of attribute column.
#index ⇒ Object

Returns the value of attribute index.
#lexeme_start_p ⇒ Object

Returns the value of attribute lexeme_start_p.
#line ⇒ Object

Returns the value of attribute line.
#source ⇒ Object readonly

Returns the value of attribute source.
#tokens ⇒ Object readonly

Returns the value of attribute tokens.

Class Method Summary collapse

.lex(source) ⇒ Object

Instance Method Summary collapse

Constructor Details

#initialize(source) ⇒ `Lexer`

Returns a new instance of Lexer.

# File 'lib/graphlyte/lexer.rb', line 58

# Set up a lexer positioned at the start of +source+ with no tokens yet.
#
# @param source the text to lex (read with #[] and #length by the cursor helpers)
def initialize(source)
  # Input text and the output token buffer.
  @source = source
  @tokens = []

  # Cursor state: offset into the source plus the line/column counters.
  @index = 0
  @column = 1
  @line = 1

  # Placeholder only; #tokenize! overwrites it before each token is scanned.
  @lexeme_start_p = Lexing::Position.new(0, 0)
end

Instance Attribute Details

#column ⇒ `Object`

Returns the value of attribute column.



56
57
58

# File 'lib/graphlyte/lexer.rb', line 56

# Column counter of the cursor. It starts at 1, grows by the length of
# whatever #consume takes, and is reset to 1 by #next_line!.
def column
  @column
end

#index ⇒ `Object`

Returns the value of attribute index.



56
57
58

# File 'lib/graphlyte/lexer.rb', line 56

# Offset into #source of the next unread character. It starts at 0 and is
# advanced by #consume and #seek.
def index
  @index
end

#lexeme_start_p ⇒ `Object`

Returns the value of attribute lexeme_start_p.



56
57
58

# File 'lib/graphlyte/lexer.rb', line 56

# Position recorded by #tokenize! immediately before each token is scanned;
# #current_location uses it as the start of the token's location.
def lexeme_start_p
  @lexeme_start_p
end

#line ⇒ `Object`

Returns the value of attribute line.



56
57
58

# File 'lib/graphlyte/lexer.rb', line 56

# Line counter of the cursor. It starts at 1 and is incremented by #next_line!.
def line
  @line
end

#source ⇒ `Object` (readonly)

Returns the value of attribute source.



55
56
57

# File 'lib/graphlyte/lexer.rb', line 55

# The text being lexed, exactly as it was handed to the constructor.
def source
  @source
end

#tokens ⇒ `Object` (readonly)

Returns the value of attribute tokens.



55
56
57

# File 'lib/graphlyte/lexer.rb', line 55

# Tokens collected so far; #tokenize! appends to this array and finishes it
# with an :EOF token.
def tokens
  @tokens
end

Class Method Details

.lex(source) ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 67

# Convenience entry point: lex +source+ in one call.
#
# Builds a lexer, runs #tokenize! on it and hands back its token list.
#
# @param source the text to lex
# @return the lexer's #tokens after tokenizing
def self.lex(source)
  new(source).tap(&:tokenize!).tokens
end

Instance Method Details

#after_source_end_location ⇒ `Object`



86
87
88

# File 'lib/graphlyte/lexer.rb', line 86

# Location attached to the :EOF token by #tokenize!; simply delegates to
# Lexing::Location.eof.
def after_source_end_location
  Lexing::Location.eof
end

#block_chars_raw ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 215

# Collect the raw pieces of a block string up to (and consuming) the closing
# triple quote.
#
# Each pass first checks for the closing delimiter, then for an escaped
# delimiter, then swallows any run of lone double quotes, and finally reads
# ordinary characters until the next double quote or the end of input.
#
# @return [Array<String>] the collected pieces, in source order
# @raise via #lex_error when the input ends before the closing delimiter
def block_chars_raw
  collected = []
  closed = false

  loop do
    break if eof?

    closed = consume(BLOCK_QUOTE)
    break if closed

    collected << BLOCK_QUOTE if consume("\\#{BLOCK_QUOTE}")
    collected << '"' while consume(DOUBLE_QUOTE)

    loop do
      char = string_character(block_string: true)
      break unless char

      collected << char
    end
  end

  lex_error('Unterminated string') unless closed

  collected
end

#block_string_content ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 190

# Read a block string and return its value with common indentation removed.
#
# Follows the GraphQL BlockStringValue algorithm:
# * the common indent is the smallest indent (spaces or tabs) among the
#   lines after the first that contain something other than whitespace, so
#   blank lines do not collapse the margin to zero;
# * that indent is removed from every line except the first, which sits on
#   the same line as the opening delimiter;
# * leading and trailing blank lines are then dropped (via #chomp_lines) and
#   the rest is joined with LINEFEED.
#
# @return [String] the de-indented block string value
def block_string_content
  lines = block_chars_raw.join.lines.map(&:chomp)

  common_indent = lines.drop(1).filter_map do |line|
    indent = line[/\A[ \t]*/].length
    # Whitespace-only lines do not take part in the margin calculation.
    indent if indent < line.length
  end.min

  if common_indent
    # Lines shorter than the margin slice to nil, hence the to_s.
    lines = lines.take(1) + lines.drop(1).map { |line| line[common_indent..].to_s }
  end

  chomp_lines(lines).join(LINEFEED)
end

#chomp_lines(lines) ⇒ `Object`

Strip the line terminator from each line, then drop leading and trailing blank lines



203
204
205

# File 'lib/graphlyte/lexer.rb', line 203

# Remove the line terminator from every line, then discard blank lines at
# the top and at the bottom of the list.
#
# @param lines [Array<String>] lines, possibly still carrying terminators
# @return [Array<String>] the trimmed list
def chomp_lines(lines)
  unterminated = lines.map(&:chomp)
  without_head = strip_leading_blank_lines(unterminated)

  strip_trailing_blank_lines(without_head)
end

#consume(str = nil) ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 243

# Advance the cursor.
#
# With +str+, the cursor only moves when the upcoming input matches it;
# otherwise nothing is consumed and nil is returned. Without an argument the
# next character (as reported by #lookahead) is taken.
#
# @param str [String, nil] exact text to consume, or nil for one character
# @return [String, nil] what was consumed, or nil when +str+ did not match
def consume(str = nil)
  if str
    return unless match(str)

    taken = str
  else
    taken = lookahead
  end

  width = taken.length
  self.index += width
  self.column += width
  taken
end

#current_location ⇒ `Object`



253
254
255

# File 'lib/graphlyte/lexer.rb', line 253

# Location running from the recorded lexeme start (#lexeme_start_p) to the
# cursor's current position.
def current_location
  Lexing::Location.new(lexeme_start_p, current_position)
end

#current_position ⇒ `Object`



257
258
259

# File 'lib/graphlyte/lexer.rb', line 257

# Snapshot of the cursor as a Lexing::Position built from #line and #column.
def current_position
  Lexing::Position.new(line, column)
end

#digit?(char) ⇒ `Boolean`

Returns:

(Boolean)



382
383
384

# File 'lib/graphlyte/lexer.rb', line 382

# Whether +char+ is one of the entries of DIGITS ('0' through '9').
def digit?(char)
  DIGITS.include?(char)
end

#eof? ⇒ `Boolean`

Returns:

(Boolean)



94
95
96

# File 'lib/graphlyte/lexer.rb', line 94

# Whether the cursor has reached (or passed) the end of the source.
#
# @return [Boolean]
def eof?
  index >= source.length
end

#escaped_character ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 160

# Decode the character that follows a backslash inside a string.
#
# Consumes one character and maps it to the character it stands for. 'u'
# hands over to #hex_char; anything unknown is reported through #lex_error.
#
# @return [String] the decoded character
def escaped_character
  escape = consume
  return hex_char if escape == 'u'

  simple_escapes = {
    DOUBLE_QUOTE => DOUBLE_QUOTE,
    BACK_QUOTE => BACK_QUOTE,
    '/' => '/',
    'b' => "\b",
    'f' => "\f",
    'n' => LINEFEED,
    'r' => "\r",
    't' => "\t"
  }

  simple_escapes.fetch(escape) do
    lex_error("Unexpected escaped character in string: #{escape}")
  end
end

#exponent_part ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 349

# Parse an optional exponent: 'e' or 'E', an optional sign, then digits.
#
# @return [Array, nil] +[sign, digits]+ where sign is '-', '+' or nil and
#   digits is a String; nil when there is no exponent marker
def exponent_part
  if one_of(%w[e E])
    sign = one_of(%w[- +])
    lex_error("Expected a digit, got #{lookahead}") unless digit?(lookahead)

    [sign, take_while { |char| digit?(char) }.join]
  end
end

#fractional_part ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 341

# Parse an optional fractional part: a '.' followed by at least one digit.
#
# @return [Array<String>, nil] the digits after the point, or nil when the
#   next character is not '.'
def fractional_part
  point = consume('.')
  return unless point

  upcoming = lookahead
  lex_error("Expected a digit, got #{upcoming}") unless digit?(upcoming)

  take_while { |char| digit?(char) }
end

#hex_char ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 178

# Decode the four hex digits of a \uXXXX escape into a UTF-8 character.
#
# Consumes exactly four characters. Every one must be a hexadecimal digit
# (0-9, a-f, A-F); the upper bound 'f' is included. The code point is encoded
# as UTF-8 so values above 0xFF are supported and the result can be joined
# with the rest of the string.
#
# @return [String] the decoded character, UTF-8 encoded
# @raise via #lex_error on a non-hex digit or on a code point that cannot be
#   encoded (e.g. a lone surrogate)
def hex_char
  digits = Array.new(4) do
    d = consume
    lex_error("Expected a hex digit in unicode escape sequence. Got #{d.inspect}") unless d.match?(/\A\h\z/)

    d
  end

  code = digits.join
  begin
    code.hex.chr(Encoding::UTF_8)
  rescue RangeError
    # Integer#chr rejects code points that are invalid in UTF-8.
    lex_error("Invalid unicode code point in escape sequence: \\u#{code}")
  end
end

#ignore_comment_line ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 386

# Skip the remainder of a comment line.
#
# Consumes characters up to, but not including, the next line feed or
# carriage return. Scanning also stops at the end of the source: #lookahead
# keeps answering "\0" there, so without the EOF check a comment on the last
# line with no trailing newline would never terminate.
#
# @return [nil] comments produce no token
def ignore_comment_line
  take_while { |char| !eof? && !NEW_LINE.include?(char) }

  nil
end

#letter?(char) ⇒ `Boolean`

Returns:

(Boolean)



374
375
376

# File 'lib/graphlyte/lexer.rb', line 374

# Whether +char+ is one of the entries of LETTERS (ASCII A-Z and a-z).
def letter?(char)
  LETTERS.include?(char)
end

#lex_error(msg) ⇒ `Object`

Raises:

(LexError)



111
112
113

# File 'lib/graphlyte/lexer.rb', line 111

# Abort lexing with a LexError whose message is +msg+ followed by the
# current line and column.
#
# @param msg [String] description of the problem
# @raise [LexError] always
def lex_error(msg)
  raise LexError, format('%s at %s:%s', msg, line, column)
end

#lexical_token ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 279

# Scan one lexical element starting at the next character.
#
# Ignored characters and comments yield a Production wrapping nil. Names,
# strings and numbers yield a Production wrapping the token built by
# #to_token. Any other character is reported through #lex_error.
#
# @return [Production]
def lexical_token
  first = consume
  return Production.new(nil) if IGNORED.include?(first)
  return Production.new(ignore_comment_line) if first == COMMENT_CHAR

  token =
    if name_start?(first)
      to_token(:NAME) { name(first) }
    elsif string_start?(first)
      to_token(:STRING) { string }
    elsif numeric_start?(first)
      to_token(:NUMBER) { number(first) }
    else
      lex_error("Unexpected character: #{first.inspect}")
    end

  Production.new(token)
end

#lookahead(offset = 1) ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 98

# Peek at an upcoming character without moving the cursor.
#
# @param offset [Integer] 1 for the next unread character, 2 for the one
#   after it, and so on
# @return [String] the character, or "\0" when the position lies beyond the
#   end of the source
def lookahead(offset = 1)
  position = index + offset - 1

  position < source.length ? source[position] : "\0"
end

#match(str) ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 105

# Whether the upcoming input is exactly +str+, compared character by
# character through #lookahead. Nothing is consumed.
#
# @param str [String] the text to look for
# @return [Boolean]
def match(str)
  str.each_char.with_index(1).all? do |expected, offset|
    lookahead(offset) == expected
  end
end

#name(char) ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 360

# Read the rest of a name whose first character has already been consumed.
#
# @param char [String] the already-consumed first character
# @return [String] the full name
def name(char)
  rest = take_while { |candidate| name_continue?(candidate) }

  [char, *rest].join
end

#name_continue?(char) ⇒ `Boolean`

Returns:

(Boolean)



370
371
372

# File 'lib/graphlyte/lexer.rb', line 370

# Whether +char+ may appear after the first character of a name: anything
# that can start a name (letter or underscore), or a digit.
#
# @return [Boolean]
def name_continue?(char)
  name_start?(char) || digit?(char)
end

#name_start?(char) ⇒ `Boolean`

Returns:

(Boolean)



366
367
368

# File 'lib/graphlyte/lexer.rb', line 366

# Whether +char+ may begin a name: an underscore or a letter.
#
# @return [Boolean]
def name_start?(char)
  underscore?(char) || letter?(char)
end

#next_line! ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 298

# Move the position counters to the start of the following line: the line
# counter goes up by one and the column goes back to 1.
def next_line!
  self.line = line.succ
  self.column = 1
end

#next_token ⇒ `Object`



261
262
263

# File 'lib/graphlyte/lexer.rb', line 261

# Scan the next element of the source and return its token.
#
# Tries a punctuator first, then a line terminator, then any other lexical
# token; later scanners run only when the earlier ones produced nothing.
#
# @return the token held by the winning production (nil for ignored input)
def next_token
  production = punctuator
  production ||= skip_line
  production ||= lexical_token

  production.token
end

#number(char) ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 326

# Read a numeric literal whose first character has already been consumed.
#
# @param char [String] the consumed first character: a digit, or '-' for a
#   negated number (the sign is not part of the integer digits)
# @return [Syntax::NumericLiteral] integer digits, optional fractional
#   digits, optional exponent and the negation flag
def number(char)
  negative = (char == '-')

  leading = negative ? [] : [char]
  integer_digits = leading + take_while { |candidate| digit?(candidate) }

  # Order matters: the fraction must be read before the exponent.
  fraction = fractional_part
  exponent = exponent_part

  Syntax::NumericLiteral.new(
    integer_part: integer_digits.join(''),
    fractional_part: fraction&.join(''),
    exponent_part: exponent,
    negated: negative
  )
end

#numeric_start?(char) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/graphlyte/lexer.rb', line 307

# Whether +char+ (already consumed) begins a numeric literal.
#
# * '-' counts only when a digit follows it;
# * '0' counts only when no digit follows it (no leading zeros);
# * otherwise +char+ itself must be a digit.
#
# @return [Boolean]
def numeric_start?(char)
  return DIGITS.include?(lookahead) if char == '-'
  return !DIGITS.include?(lookahead) if char == '0'

  DIGITS.include?(char)
end

#one_of(strings) ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 115

# Consume the first entry of +strings+ that matches the upcoming input.
#
# Candidates are tried in order, so longer alternatives must come before any
# of their prefixes.
#
# @param strings [Array<String>] candidates, in priority order
# @return [String, nil] the consumed candidate, or nil when none matched
def one_of(strings)
  strings.find { |candidate| consume(candidate) }
end

#punctuator ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 265

# Try to consume one of the PUNCTUATOR symbols at the cursor.
#
# @return [Production, nil] a production wrapping a :PUNCTUATOR token, or
#   nil when the upcoming input is not a punctuator
def punctuator
  symbol = one_of(PUNCTUATOR)
  return unless symbol

  token = Lexing::Token.new(:PUNCTUATOR, symbol, current_location)
  Production.new(token)
end

#seek(offset) ⇒ `Object`



239
240
241

# File 'lib/graphlyte/lexer.rb', line 239

# Shift the cursor's index by +offset+ characters; line and column are left
# untouched.
#
# @param offset [Integer] number of characters to move (may be negative)
# @return [Integer] the new index
def seek(offset)
  self.index = index + offset
end

#skip_line ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 271

# Consume a line terminator, if one is next, and advance the line counter.
#
# GraphQL recognises three line terminators: CR LF, a lone LF and a lone CR.
# CR LF is tried first so the pair counts as a single line break; the bare
# carriage return is accepted last so it no longer falls through to the
# "Unexpected character" error.
#
# @return [Production, nil] an empty production when a terminator was
#   consumed, nil otherwise
def skip_line
  terminator = one_of(["#{CARRIAGE_RETURN}#{LINEFEED}", LINEFEED, CARRIAGE_RETURN])
  return unless terminator

  next_line!
  Production.new(nil)
end

#source_uncompleted? ⇒ `Boolean`

Returns:

(Boolean)



90
91
92

# File 'lib/graphlyte/lexer.rb', line 90

# Whether there is still unread input, i.e. the source is longer than the
# cursor's index.
#
# @return [Boolean]
def source_uncompleted?
  source.length > index
end

#string ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 123

# Read a string literal whose opening double quote has already been consumed.
#
# * A lone closing quote right away means the empty string.
# * Two further quotes complete a block-string opener.
# * Anything else is an ordinary single-line string.
#
# @return [String] the string's value
def string
  closes_immediately = lookahead == DOUBLE_QUOTE && lookahead(2) != DOUBLE_QUOTE
  if closes_immediately
    consume
    return ''
  end

  return block_string_content if consume('""')

  string_content
end

#string_character(block_string: false) ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 145

# Read one character of string content.
#
# Returns nil at the end of input or when the next character is a double
# quote (the caller decides what the quote means).
#
# Regular strings reject raw line terminators and decode backslash escapes
# through #escaped_character. Block strings are raw: line terminators are
# allowed and a backslash is an ordinary character, the single exception
# being a backslash directly followed by the block delimiter, which yields
# the delimiter itself.
#
# @param block_string [Boolean] whether a block string is being read
# @return [String, nil] the (possibly decoded) character, or nil at a stop
def string_character(block_string: false)
  return if eof?
  return if lookahead == DOUBLE_QUOTE

  c = consume

  if block_string
    # Only \""" is an escape here; every other backslash is kept verbatim.
    escaped_delimiter = c == BACK_QUOTE && consume(BLOCK_QUOTE)
    return escaped_delimiter || c
  end

  lex_error("Illegal character #{c.inspect}") if NEW_LINE.include?(c)

  c == BACK_QUOTE ? escaped_character : c
end

#string_content ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 134

# Read the body of a regular (single-line) string up to and including its
# closing double quote.
#
# @return [String] the decoded content
# @raise via #lex_error when the closing quote is missing
def string_content
  pieces = []
  loop do
    piece = string_character
    break unless piece

    pieces << piece
  end

  lex_error('Unterminated string') unless consume(DOUBLE_QUOTE)

  pieces.join
end

#string_start?(char) ⇒ `Boolean`

Returns:

(Boolean)



303
304
305

# File 'lib/graphlyte/lexer.rb', line 303

# Whether +char+ is a double quote, i.e. the character that opens a string.
def string_start?(char)
  char == '"'
end

#strip_leading_blank_lines(lines) ⇒ `Object`



207
208
209

# File 'lib/graphlyte/lexer.rb', line 207

# Drop the blank lines (empty or whitespace-only) at the start of +lines+.
#
# @param lines [Array<String>]
# @return [Array<String>] a new array beginning at the first non-blank line
def strip_leading_blank_lines(lines)
  blank = /^\s*$/

  lines.drop_while { |line| blank.match?(line) }
end

#strip_trailing_blank_lines(lines) ⇒ `Object`



211
212
213

# File 'lib/graphlyte/lexer.rb', line 211

# Drop the blank lines at the end of +lines+ by reversing the list, reusing
# #strip_leading_blank_lines, and reversing back.
#
# @param lines [Array<String>]
# @return [Array<String>] a new array ending at the last non-blank line
def strip_trailing_blank_lines(lines)
  backwards = lines.reverse
  trimmed = strip_leading_blank_lines(backwards)

  trimmed.reverse
end

#take_while ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 232

# Consume characters for as long as the block approves of the next one.
#
# The block receives the upcoming character (from #lookahead) and decides
# whether it should be consumed. Scanning always stops at the end of the
# source: #lookahead keeps answering "\0" there, so a predicate that accepts
# "\0" would otherwise consume past the end forever.
#
# @yieldparam char [String] the next unread character
# @yieldreturn [Boolean] true to consume it and continue
# @return [Array<String>] the consumed characters, in order
def take_while
  chars = []
  chars << consume until eof? || !yield(lookahead)

  chars
end

#to_token(type) ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 318

# Build a token of +type+ from whatever the block consumes.
#
# The first character of the lexeme has already been consumed when this is
# called, so the lexeme starts one before the current index. After the block
# runs, the index points at the first character that is NOT part of the
# lexeme, hence the exclusive range when slicing the source.
#
# @param type [Symbol] token type, e.g. :NAME, :STRING or :NUMBER
# @yieldreturn the parsed value to attach to the token
# @return [Lexing::Token]
def to_token(type)
  start = index - 1
  value = yield
  finish = index

  Lexing::Token.new(type, source[start...finish], current_location, value: value)
end

#tokenize! ⇒ `Object`

# File 'lib/graphlyte/lexer.rb', line 74

# Lex the whole source, filling #tokens.
#
# Before each token the current position is stored as the lexeme start.
# Scanners that produce nothing (ignored characters, comments, line
# terminators) add no token. An :EOF token is appended last.
#
# @return the token list
def tokenize!
  until eof?
    self.lexeme_start_p = current_position

    scanned = next_token
    tokens << scanned if scanned
  end

  eof_token = Lexing::Token.new(:EOF, nil, after_source_end_location)
  tokens << eof_token
end

#underscore?(char) ⇒ `Boolean`

Returns:

(Boolean)



378
379
380

# File 'lib/graphlyte/lexer.rb', line 378

# Whether +char+ is the underscore character, which is allowed anywhere in a
# name (see #name_start? and #name_continue?).
def underscore?(char)
  char == '_'
end

Class: Graphlyte::Lexer

Overview

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(source) ⇒ Lexer

Instance Attribute Details

#column ⇒ Object

#index ⇒ Object

#lexeme_start_p ⇒ Object

#line ⇒ Object

#source ⇒ Object (readonly)

#tokens ⇒ Object (readonly)

Class Method Details

.lex(source) ⇒ Object

Instance Method Details

#after_source_end_location ⇒ Object

#block_chars_raw ⇒ Object

#block_string_content ⇒ Object

#chomp_lines(lines) ⇒ Object

#consume(str = nil) ⇒ Object

#current_location ⇒ Object

#current_position ⇒ Object

#digit?(char) ⇒ Boolean

#eof? ⇒ Boolean

#escaped_character ⇒ Object

#exponent_part ⇒ Object

#fractional_part ⇒ Object

#hex_char ⇒ Object

#ignore_comment_line ⇒ Object

#letter?(char) ⇒ Boolean

#lex_error(msg) ⇒ Object

#lexical_token ⇒ Object

#lookahead(offset = 1) ⇒ Object

#match(str) ⇒ Object

#name(char) ⇒ Object

#name_continue?(char) ⇒ Boolean

#name_start?(char) ⇒ Boolean

#next_line! ⇒ Object

#next_token ⇒ Object

#number(char) ⇒ Object

#numeric_start?(char) ⇒ Boolean

#one_of(strings) ⇒ Object

#punctuator ⇒ Object

#seek(offset) ⇒ Object

#skip_line ⇒ Object

#source_uncompleted? ⇒ Boolean

#string ⇒ Object

#string_character(block_string: false) ⇒ Object

#string_content ⇒ Object

#string_start?(char) ⇒ Boolean

#strip_leading_blank_lines(lines) ⇒ Object

#strip_trailing_blank_lines(lines) ⇒ Object

#take_while ⇒ Object

#to_token(type) ⇒ Object

#tokenize! ⇒ Object

#underscore?(char) ⇒ Boolean

#initialize(source) ⇒ `Lexer`

#column ⇒ `Object`

#index ⇒ `Object`

#lexeme_start_p ⇒ `Object`

#line ⇒ `Object`

#source ⇒ `Object` (readonly)

#tokens ⇒ `Object` (readonly)

.lex(source) ⇒ `Object`

#after_source_end_location ⇒ `Object`

#block_chars_raw ⇒ `Object`

#block_string_content ⇒ `Object`

#chomp_lines(lines) ⇒ `Object`

#consume(str = nil) ⇒ `Object`

#current_location ⇒ `Object`

#current_position ⇒ `Object`

#digit?(char) ⇒ `Boolean`

#eof? ⇒ `Boolean`

#escaped_character ⇒ `Object`

#exponent_part ⇒ `Object`

#fractional_part ⇒ `Object`

#hex_char ⇒ `Object`

#ignore_comment_line ⇒ `Object`

#letter?(char) ⇒ `Boolean`

#lex_error(msg) ⇒ `Object`

#lexical_token ⇒ `Object`

#lookahead(offset = 1) ⇒ `Object`

#match(str) ⇒ `Object`

#name(char) ⇒ `Object`

#name_continue?(char) ⇒ `Boolean`

#name_start?(char) ⇒ `Boolean`

#next_line! ⇒ `Object`

#next_token ⇒ `Object`

#number(char) ⇒ `Object`

#numeric_start?(char) ⇒ `Boolean`

#one_of(strings) ⇒ `Object`

#punctuator ⇒ `Object`

#seek(offset) ⇒ `Object`

#skip_line ⇒ `Object`

#source_uncompleted? ⇒ `Boolean`

#string ⇒ `Object`

#string_character(block_string: false) ⇒ `Object`

#string_content ⇒ `Object`

#string_start?(char) ⇒ `Boolean`

#strip_leading_blank_lines(lines) ⇒ `Object`

#strip_trailing_blank_lines(lines) ⇒ `Object`

#take_while ⇒ `Object`

#to_token(type) ⇒ `Object`

#tokenize! ⇒ `Object`

#underscore?(char) ⇒ `Boolean`