Class: Riml::Lexer

Inherits:

Object

Object
Riml::Lexer

Includes: Constants

Defined in: lib/riml/lexer.rb

Constant Summary collapse

SINGLE_LINE_COMMENT_REGEX =

/\A[ \t\f]*"(.*)$/

OPERATOR_REGEX =

/\A#{Regexp.union(['||', '&&', '===', '+=', '-=', '.='] + COMPARISON_OPERATORS)}/

INTERPOLATION_REGEX =

/"([^"]*?)(\#\{([^"]*?)\})([^"]*?)"/m

ANCHORED_INTERPOLATION_REGEX =

/\A#{INTERPOLATION_REGEX}/m

INTERPOLATION_SPLIT_REGEX =

/(\#\{.*?\})/m

Constants included from Constants

Constants::BUILTIN_COMMANDS, Constants::BUILTIN_FUNCTIONS, Constants::COMPARISON_OPERATORS, Constants::COMPILED_STRING_LOCATION, Constants::DEFINE_KEYWORDS, Constants::END_KEYWORDS, Constants::IGNORECASE_CAPABLE_OPERATORS, Constants::KEYWORDS, Constants::REGISTERS, Constants::RIML_CLASS_COMMANDS, Constants::RIML_COMMANDS, Constants::RIML_END_KEYWORDS, Constants::RIML_FILE_COMMANDS, Constants::RIML_KEYWORDS, Constants::SPECIAL_VARIABLE_PREFIXES, Constants::SPLAT_LITERAL, Constants::UNKNOWN_LOCATION_INFO, Constants::VIML_COMMANDS, Constants::VIML_END_KEYWORDS, Constants::VIML_KEYWORDS

Instance Attribute Summary collapse

#current_indent ⇒ Object readonly

Returns the value of attribute current_indent.
#filename ⇒ Object readonly

Returns the value of attribute filename.
#ignore_indentation_check ⇒ Object

for REPL.
#lineno ⇒ Object

Returns the value of attribute lineno.
#parser_info ⇒ Object readonly

Returns the value of attribute parser_info.
#prev_token ⇒ Object readonly

Returns the value of attribute prev_token.
#tokens ⇒ Object readonly

Returns the value of attribute tokens.

Instance Method Summary collapse

#initialize(code, filename = nil, parser_info = false) ⇒ Lexer constructor

A new instance of Lexer.
#next_token ⇒ Object

TODO: fix this slow method.
#prev_token_is_keyword?(n = 2) ⇒ Boolean

Checks if any of previous n tokens are keywords.
#tokenize ⇒ Object
#tokenize_chunk ⇒ Object

Constructor Details

#initialize(code, filename = nil, parser_info = false) ⇒ `Lexer`

Returns a new instance of Lexer.

# File 'lib/riml/lexer.rb', line 23

# Builds a lexer over +code+ and resets all scanning state.
#
# @param code [String] source text to tokenize; one trailing line break is
#   dropped before scanning. The caller's string is never modified, so
#   frozen strings are accepted.
# @param filename [String, nil] stored in @filename; falls back to
#   COMPILED_STRING_LOCATION when nil
# @param parser_info [Boolean] stored in @parser_info; when truthy,
#   #next_token records decorated tokens in @tokens
def initialize(code, filename = nil, parser_info = false)
  # Non-mutating `chomp` (instead of `chomp!`): the argument belongs to the
  # caller, and `chomp!` raises FrozenError on frozen input.
  @s = StringScanner.new(code.chomp)
  @filename = filename || COMPILED_STRING_LOCATION
  @parser_info = parser_info
  # array of doubles and triples: [tokenname, tokenval, lineno_to_add(optional)]
  # ex: [[:NEWLINE, "\n"]] OR [[:NEWLINE, "\n", 1]]
  @token_buf = []
  # array of doubles OR triples, depending if `@parser_info` is set to true
  # doubles: [tokenname, tokenval]
  # ex: [[:NEWLINE, "\n"], ...]
  # triples: [tokenname, tokenval, parser_info]
  # ex: [[:NEWLINE, "\n", { :lineno => 1, :filename => 'main.riml' }], ...]
  @tokens = []
  @prev_token = nil
  @lineno = 1
  @current_indent = 0
  @indent_pending = false
  @dedent_pending = false
  @in_function_declaration = false
end

Instance Attribute Details

#current_indent ⇒ `Object` (readonly)

Returns the value of attribute current_indent.



17
18
19

# File 'lib/riml/lexer.rb', line 17

# Reader for @current_indent, which #initialize sets to 0.
def current_indent
  @current_indent
end

#filename ⇒ `Object` (readonly)

Returns the value of attribute filename.



17
18
19

# File 'lib/riml/lexer.rb', line 17

# Reader for @filename: the name passed to #initialize, or
# COMPILED_STRING_LOCATION when none was given.
def filename
  @filename
end

#ignore_indentation_check ⇒ `Object`

for REPL



21
22
23

# File 'lib/riml/lexer.rb', line 21

# Reader for @ignore_indentation_check (intended for the REPL, per the
# upstream note). #initialize does not set it, so it is nil until assigned.
# When truthy, #next_token skips check_indentation at end of input.
def ignore_indentation_check
  @ignore_indentation_check
end

#lineno ⇒ `Object`

Returns the value of attribute lineno.



19
20
21

# File 'lib/riml/lexer.rb', line 19

# Reader for @lineno. It starts at 1 in #initialize and is advanced by
# #next_token and #tokenize_chunk as input is consumed.
def lineno
  @lineno
end

#parser_info ⇒ `Object` (readonly)

Returns the value of attribute parser_info.



17
18
19

# File 'lib/riml/lexer.rb', line 17

# Reader for @parser_info, the flag given to #initialize. When truthy,
# #next_token stores decorated tokens in @tokens.
def parser_info
  @parser_info
end

#prev_token ⇒ `Object` (readonly)

Returns the value of attribute prev_token.



17
18
19

# File 'lib/riml/lexer.rb', line 17

# Reader for @prev_token: nil until #next_token has returned a token, then
# the most recently returned token (only its first two elements when
# @parser_info is set).
def prev_token
  @prev_token
end

#tokens ⇒ `Object` (readonly)

Returns the value of attribute tokens.



17
18
19

# File 'lib/riml/lexer.rb', line 17

# Reader for @tokens, the array that #next_token appends every returned
# token to (starts empty in #initialize).
def tokens
  @tokens
end

Instance Method Details

#next_token ⇒ `Object`

TODO: fix this slow method

# File 'lib/riml/lexer.rb', line 51

# Returns the next token, lexing more input on demand, or nil once both the
# token buffer and the input are exhausted.
#
# Side effects: appends the token to @tokens (decorated when @parser_info
# is set), updates @prev_token, and adds a buffered token's optional third
# element to @lineno.
#
# TODO (upstream): fix this slow method
def next_token
  # Lex chunks until one yields a token or the scanner hits end of input.
  tokenize_chunk while @token_buf.empty? && !@s.eos?

  if @token_buf.empty?
    # Nothing left to hand out: run the final indentation check and stop.
    check_indentation unless ignore_indentation_check
    return nil
  end

  current = @token_buf.shift
  # A triple carries the number of lines to add as its last element.
  @lineno += current.pop if current.size == 3

  if @parser_info
    @tokens << decorate_token(current)
    @prev_token = current.first(2)
    current
  else
    @tokens << current
    @prev_token = current
  end
end

#prev_token_is_keyword?(n = 2) ⇒ `Boolean`

Checks whether any of the previous n tokens are keywords. If one is found, returns the keyword; otherwise returns `false`.

Returns:

(Boolean)

# File 'lib/riml/lexer.rb', line 236

# Looks back over the last +n+ recorded tokens, most recent first, and
# returns the value of the first one found in KEYWORDS.
#
# @param n [Integer] how many trailing tokens to inspect (default 2)
# @return [String, false] the keyword, or false when none is found or
#   when n is not positive
def prev_token_is_keyword?(n = 2)
  return false unless n > 0

  tokens.last(n).reverse_each do |recent|
    next unless recent

    value = recent[1]
    return value if value && KEYWORDS.include?(value)
  end
  false
end

#tokenize ⇒ `Object`

# File 'lib/riml/lexer.rb', line 45

# Drains the lexer by calling #next_token until it returns nil, then
# returns the accumulated @tokens array.
def tokenize
  latest = next_token
  latest = next_token until latest.nil?
  @tokens
end

#tokenize_chunk ⇒ `Object`

# File 'lib/riml/lexer.rb', line 73

# Lexes one chunk of input at the current scanner position and appends any
# resulting tokens to @token_buf. Some chunks (whitespace, comments, line
# continuations) produce no token, which is why #next_token calls this in a
# loop. Patterns are tried in order and the first match wins.
#
# Also updates @lineno for comments, heredocs, continuations and newlines,
# and clears @in_function_declaration at the end of a line.
def tokenize_chunk
  # deal with line continuations
  if cont = @s.scan(/\A\r?\n*[ \t\f]*\\/m)
    @lineno += cont.each_line.to_a.size - 1
    return
  end

  # all lines that start with ':' pass right through unmodified
  if (prev_token.nil? || prev_token[0] == :NEWLINE) && @s.scan(/\A[ \t\f]*:(.*)?$/)
    @token_buf << [:EX_LITERAL, @s[1]]
    return
  end

  if splat_var = @s.scan(/\Aa:\d+/)
    @token_buf << [:SCOPE_MODIFIER, 'a:'] << [:IDENTIFIER, splat_var[2..-1]]
  # the 'n' scope modifier is added by riml
  elsif @s.check(/\A([bwtglsavn]:)(\w|\{)/)
    @token_buf << [:SCOPE_MODIFIER, @s[1]]
    # skip only the two-character modifier; the name is lexed next round
    @s.pos += 2
  elsif scope_modifier_literal = @s.scan(/\A([bwtglsavn]:)/)
    @token_buf << [:SCOPE_MODIFIER_LITERAL, scope_modifier_literal]
  elsif special_var_prefix = (!@s.check(/\A&(\w:)?&/) && @s.scan(/\A(&(\w:)?|\$|@)/))
    @token_buf << [:SPECIAL_VAR_PREFIX, special_var_prefix.strip]
    if special_var_prefix == '@'
      next_char = @s.peek(1)
      if REGISTERS.include?(next_char)
        @token_buf << [:IDENTIFIER, next_char]
        @s.getch
      end
    else
      @expecting_identifier = true
    end
  elsif @s.scan(/\A(function)\(/)
    @token_buf << [:IDENTIFIER, @s[1]]
    # un-consume the '(' so it is lexed as its own token
    @s.pos -= 1
  elsif identifier = @s.check(/\A[a-zA-Z_][\w#]*(\?|!)?/)
    # keyword identifiers
    if KEYWORDS.include?(identifier)
      if identifier.match(/\Afunction/)
        old_identifier = identifier.dup
        identifier.sub!(/function/, "def")
        # account for the length difference now; the rest of the
        # identifier is skipped after the branch below
        @s.pos += (old_identifier.size - identifier.size)
      end

      if DEFINE_KEYWORDS.include?(identifier)
        @in_function_declaration = true
      end

      # strip '?' out of token names and replace '!' with '_bang'
      token_name = identifier.sub(/\?\Z/, "").sub(/!\Z/, "_bang").upcase
      track_indent_level(identifier)

      if VIML_END_KEYWORDS.include?(identifier)
        token_name = :END
      end

      @token_buf << [token_name.to_sym, identifier]

    elsif BUILTIN_COMMANDS.include?(identifier) && @s.peek(identifier.size + 1)[-1, 1] != '('
      @token_buf << [:BUILTIN_COMMAND, identifier]
    elsif RIML_FILE_COMMANDS.include? identifier
      @token_buf << [:RIML_FILE_COMMAND, identifier]
    elsif RIML_CLASS_COMMANDS.include? identifier
      @token_buf << [:RIML_CLASS_COMMAND, identifier]
    elsif VIML_COMMANDS.include?(identifier) && (prev_token.nil? || prev_token[0] == :NEWLINE)
      @s.pos += identifier.size
      until_eol = @s.scan(/.*$/).to_s
      @token_buf << [:EX_LITERAL, identifier << until_eol]
      return
    # method names and variable names
    else
      @token_buf << [:IDENTIFIER, identifier]
    end

    @s.pos += identifier.size

    parse_dict_vals!

  elsif @in_function_declaration && (splat_param = @s.scan(/\A(\.{3}|\*[a-zA-Z_]\w*)/))
    @token_buf << [:SPLAT_PARAM, splat_param]
  # splat in calling context. ex: super(*args) or super(*(args + other_args)) or func('hey', *args)
  elsif !@in_function_declaration && prev_token && @s.check(/\A\*(\w+|\()/)
    @token_buf << [:SPLAT_ARG, @s.getch]
  # integer (octal)
  elsif octal = @s.scan(/\A0[0-7]+/)
    @token_buf << [:NUMBER, octal]
  # integer (hex)
  elsif hex = @s.scan(/\A0[xX][0-9a-fA-F]+/)
    @token_buf << [:NUMBER, hex]
  # integer or float (decimal)
  elsif decimal = @s.scan(/\A[0-9]+(\.[0-9]+([eE][+-]?[0-9]+)?)?/)
    @token_buf << [:NUMBER, decimal]
  elsif interpolation = @s.scan(ANCHORED_INTERPOLATION_REGEX)
    # "hey there, #{name}" = "hey there, " . name
    parts = interpolation[1...-1].split(INTERPOLATION_SPLIT_REGEX)
    handle_interpolation(*parts)
  elsif (single_line_comment = @s.check(SINGLE_LINE_COMMENT_REGEX)) && (prev_token.nil? || prev_token[0] == :NEWLINE)
    # StringScanner#pos is a byte offset, so advance by bytesize rather than
    # by character count; otherwise a comment containing multibyte
    # characters leaves the scanner in the middle of the comment.
    @s.pos += single_line_comment.bytesize
    @s.pos += 1 unless @s.eos? # consume newline
    @lineno += single_line_comment.each_line.to_a.size
  elsif inline_comment = @s.scan(/\A[ \t\f]*"[^"]*?$/)
    @lineno += inline_comment.each_line.to_a.size - 1
  elsif (str = lex_string_double)
    @token_buf << [:STRING_D, str]
  elsif @s.scan(/\A'(([^']|'')*)'/)
    @token_buf << [:STRING_S, @s[1]]
  elsif newlines = @s.scan(/\A([\r\n]+)/)
    # push only 1 newline
    @token_buf << [:NEWLINE, "\n"] unless prev_token && prev_token[0] == :NEWLINE

    # pending indents/dedents
    if @indent_pending
      @indent_pending = false
    elsif @dedent_pending
      @dedent_pending = false
    end
    if @in_function_declaration
      @in_function_declaration = false
    end

    # count line breaks, not characters, so that "\r\n" advances the line
    # number by one instead of two
    @lineno += newlines.scan(/\r\n|\r|\n/).size
  # heredoc
  elsif @s.scan(%r{\A<<(.+?)\r?\n})
    pattern = @s[1]
    @s.check(%r|(.+?\r?\n)(#{Regexp.escape(pattern)})|m)
    heredoc_string = @s[1]
    # byte offsets again: skip the body and the terminator by bytesize
    # (measured before the chomp! below shortens the body)
    @s.pos += (pattern.bytesize + heredoc_string.bytesize)
    heredoc_string.chomp!
    if heredoc_string =~ INTERPOLATION_REGEX || %Q("#{heredoc_string}") =~ INTERPOLATION_REGEX
      parts = heredoc_string.split(INTERPOLATION_SPLIT_REGEX)
      handle_interpolation(*parts)
    else
      @token_buf << [:STRING_D, escape_chars!(heredoc_string)]
    end
    @lineno += heredoc_string.each_line.to_a.size
  # operators of more than 1 char
  elsif operator = @s.scan(OPERATOR_REGEX)
    @token_buf << [operator, operator]
  elsif regexp = @s.scan(%r{\A/.*?[^\\]/})
    @token_buf << [:REGEXP, regexp]
  # whitespaces
  elsif @s.scan(/\A[ \t\f]+/)
  # operators and tokens of single chars, one of: ( ) , . [ ] ! + - = < > /
  else
    value = @s.getch
    if value == '|'
      @token_buf << [:NEWLINE, "\n"]
    else
      @token_buf << [value, value]
    end
    # if we encounter `funcCall().`, the next character must be treated as
    # a dictionary retrieval operation, not a string concatenation
    # operation.
    # However, if we see `funcCall().l:localVar`, we know it must be a
    # string concatenation operation.
    #
    # Inspect the third upcoming character: comparing the whole three
    # character peek against ':' could never be equal, so the `.l:` case
    # was never detected. A ']' always attempts dictionary parsing.
    if value == ']' || (value == ')' && @s.peek(1) == '.' && @s.peek(3)[2, 1] != ':')
      parse_dict_vals!
    end
  end
end

Class: Riml::Lexer

Constant Summary collapse

Constants included from Constants

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(code, filename = nil, parser_info = false) ⇒ Lexer

Instance Attribute Details

#current_indent ⇒ Object (readonly)

#filename ⇒ Object (readonly)

#ignore_indentation_check ⇒ Object

#lineno ⇒ Object

#parser_info ⇒ Object (readonly)

#prev_token ⇒ Object (readonly)

#tokens ⇒ Object (readonly)