Class: RubyLexer

Inherits:

Object

Object
RubyLexer

Defined in:: lib/ruby_lexer.rb

Constant Summary collapse

ESC_RE =

/\\([0-7]{1,3}|x[0-9a-fA-F]{1,2}|M-[^\\]|(C-|c)[^\\]|[^0-7xMCc])/

EOF =

:eof_haha!

STR_FUNC_BORING = ruby constants for strings (should this be moved somewhere else?)

0x00

STR_FUNC_ESCAPE = TODO: remove and replace with REGEXP

0x01

STR_FUNC_EXPAND =

0x02

STR_FUNC_REGEXP =

0x04

STR_FUNC_AWORDS =

0x08

STR_FUNC_SYMBOL =

0x10

STR_FUNC_INDENT = <<-HEREDOC

0x20

STR_SQUOTE =

STR_FUNC_BORING

STR_DQUOTE =

STR_FUNC_BORING | STR_FUNC_EXPAND

STR_XQUOTE =

STR_FUNC_BORING | STR_FUNC_EXPAND

STR_REGEXP =

STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND

STR_SSYM =

STR_FUNC_SYMBOL

STR_DSYM =

STR_FUNC_SYMBOL | STR_FUNC_EXPAND

TOKENS =

{
  "!"   => :tBANG,
  "!="  => :tNEQ,
  "!~"  => :tNMATCH,
  ","   => :tCOMMA,
  ".."  => :tDOT2,
  "..." => :tDOT3,
  "="   => :tEQL,
  "=="  => :tEQ,
  "===" => :tEQQ,
  "=>"  => :tASSOC,
  "=~"  => :tMATCH,
}

Instance Attribute Summary collapse

#cmdarg ⇒ Object

Returns the value of attribute cmdarg.
#command_start ⇒ Object

Returns the value of attribute command_start.
#cond ⇒ Object

Returns the value of attribute cond.
#lex_state ⇒ Object

Additional context surrounding tokens that both the lexer and grammar use.
#lex_strterm ⇒ Object

Returns the value of attribute lex_strterm.
#lineno ⇒ Object
#nest ⇒ Object

Returns the value of attribute nest.
#parser ⇒ Object

HACK for very end of lexer…
#src ⇒ Object

Stream of data that yylex examines.
#string_buffer ⇒ Object

Returns the value of attribute string_buffer.
#token ⇒ Object

Last token read via yylex.
#warnings ⇒ Object

What handles warnings.
#yacc_value ⇒ Object

Value of last token which had a value associated with it.

Instance Method Summary collapse

#advance ⇒ Object

How the parser advances to the next token.
#arg_ambiguous ⇒ Object
#comments ⇒ Object
#expr_beg_push(val) ⇒ Object
#fix_arg_lex_state ⇒ Object
#heredoc(here) ⇒ Object

63 lines.
#heredoc_identifier ⇒ Object

51 lines.
#initialize ⇒ RubyLexer constructor

A new instance of RubyLexer.
#int_with_base(base) ⇒ Object
#parse_number ⇒ Object

Parse a number from the input stream.
#parse_quote ⇒ Object

58 lines.
#parse_string(quote) ⇒ Object

65 lines.
#process_token(command_state) ⇒ Object
#rb_compile_error(msg) ⇒ Object
#read_escape ⇒ Object

51 lines.
#regx_options ⇒ Object

15 lines.
#reset ⇒ Object
#tokadd_escape(term) ⇒ Object

20 lines.
#tokadd_string(func, term, paren) ⇒ Object

105 lines.
#unescape(s) ⇒ Object
#warning(s) ⇒ Object
#yylex ⇒ Object

Returns the next token.
#yylex_string ⇒ Object

23 lines.

Constructor Details

#initialize ⇒ `RubyLexer`

Returns a new instance of RubyLexer.

# File 'lib/ruby_lexer.rb', line 219

# Builds a lexer: starts with no buffered comments and a nesting count of
# zero, creates the :cmdarg and :cond StackState trackers, then calls #reset
# for the remaining state.
def initialize
  @comments   = []
  self.nest   = 0
  self.cmdarg = StackState.new(:cmdarg)
  self.cond   = StackState.new(:cond)

  reset
end

Instance Attribute Details

#cmdarg ⇒ `Object`

Returns the value of attribute cmdarg.



3
4
5

# File 'lib/ruby_lexer.rb', line 3

# Returns the value of the cmdarg attribute.
def cmdarg
  return @cmdarg
end

#command_start ⇒ `Object`

Returns the value of attribute command_start.



2
3
4

# File 'lib/ruby_lexer.rb', line 2

# Returns the value of the command_start attribute.
def command_start
  return @command_start
end

#cond ⇒ `Object`

Returns the value of attribute cond.



4
5
6

# File 'lib/ruby_lexer.rb', line 4

# Returns the value of the cond attribute.
def cond
  return @cond
end

#lex_state ⇒ `Object`

Additional context surrounding tokens that both the lexer and grammar use.



11
12
13

# File 'lib/ruby_lexer.rb', line 11

# Additional context surrounding tokens that both the lexer and grammar use.
def lex_state
  return @lex_state
end

#lex_strterm ⇒ `Object`

Returns the value of attribute lex_strterm.



13
14
15

# File 'lib/ruby_lexer.rb', line 13

# Returns the value of the lex_strterm attribute.
def lex_strterm
  return @lex_strterm
end

#lineno ⇒ `Object`



240
241
242

# File 'lib/ruby_lexer.rb', line 240

# Returns the cached line number, filling the cache from src.lineno when
# @lineno is currently nil or false.
def lineno
  @lineno = src.lineno unless @lineno
  @lineno
end

#nest ⇒ `Object`

Returns the value of attribute nest.



5
6
7

# File 'lib/ruby_lexer.rb', line 5

# Returns the value of the nest attribute.
def nest
  return @nest
end

#parser ⇒ `Object`

HACK for very end of lexer… sigh



15
16
17

# File 'lib/ruby_lexer.rb', line 15

# HACK for very end of lexer... sigh
def parser
  return @parser
end

#src ⇒ `Object`

Stream of data that yylex examines.



18
19
20

# File 'lib/ruby_lexer.rb', line 18

# Stream of data that yylex examines.
def src
  return @src
end

#string_buffer ⇒ `Object`

Returns the value of attribute string_buffer.



23
24
25

# File 'lib/ruby_lexer.rb', line 23

# Returns the value of the string_buffer attribute.
def string_buffer
  return @string_buffer
end

#token ⇒ `Object`

Last token read via yylex.



21
22
23

# File 'lib/ruby_lexer.rb', line 21

# Last token read via yylex.
def token
  return @token
end

#warnings ⇒ `Object`

What handles warnings



29
30
31

# File 'lib/ruby_lexer.rb', line 29

# What handles warnings.
def warnings
  return @warnings
end

#yacc_value ⇒ `Object`

Value of last token which had a value associated with it.



26
27
28

# File 'lib/ruby_lexer.rb', line 26

# Value of last token which had a value associated with it.
def yacc_value
  return @yacc_value
end

Instance Method Details

#advance ⇒ `Object`

How the parser advances to the next token.

Returns:

true if not at end of file (EOF).

# File 'lib/ruby_lexer.rb', line 67

# Pulls the next token from yylex and records it as the current token.
#
# Returns true unless the token is RubyLexer::EOF.
# Raises RuntimeError when yylex hands back nil/false.
def advance
  self.token = (tok = yylex)

  raise "yylex returned nil" unless tok

  RubyLexer::EOF != tok
end

#arg_ambiguous ⇒ `Object`



76
77
78

# File 'lib/ruby_lexer.rb', line 76

# Reports, through #warning, that the first argument is ambiguous.
def arg_ambiguous
  warning "Ambiguous first argument. make sure."
end

#comments ⇒ `Object`

# File 'lib/ruby_lexer.rb', line 80

# Returns all buffered comment text joined into one string and empties the
# buffer, so each comment is handed out once.
def comments
  @comments.join.tap { @comments.clear }
end

#expr_beg_push(val) ⇒ `Object`

# File 'lib/ruby_lexer.rb', line 86

# Pushes false onto both the cond and cmdarg stacks, switches the lexer to
# :expr_beg, and stores val as the current yacc_value.
def expr_beg_push(val)
  [cond, cmdarg].each { |stack| stack.push false }
  self.lex_state  = :expr_beg
  self.yacc_value = val
end

#fix_arg_lex_state ⇒ `Object`

# File 'lib/ruby_lexer.rb', line 93

# Moves the lexer to :expr_arg when it is currently in :expr_fname or
# :expr_dot, and to :expr_beg from any other state.
def fix_arg_lex_state
  self.lex_state = case lex_state
                   when :expr_fname, :expr_dot then :expr_arg
                   else :expr_beg
                   end
end

#heredoc(here) ⇒ `Object`

63 lines

# File 'lib/ruby_lexer.rb', line 101

# Lexes the next piece of a here document.
#
# here is the [:heredoc, eos, func, last_line] array built by
# heredoc_identifier: eos is the terminator text, func the STR_FUNC_* flag
# set, and last_line the remainder of the line that introduced the heredoc.
#
# Returns :tSTRING_END when the terminator line is reached (yacc_value is
# the terminator), :tSTRING_DVAR / :tSTRING_DBEG when an interpolation starts,
# and :tSTRING_CONTENT otherwise (yacc_value is the text with "\r" removed).
#
# Raises (via rb_compile_error) when the input ends before the terminator.
def heredoc here # 63 lines
  _, eos, func, last_line = here

  indent  = (func & STR_FUNC_INDENT) != 0
  expand  = (func & STR_FUNC_EXPAND) != 0
  # The terminator is literal text: escape it so identifiers that contain
  # regexp metacharacters (e.g. <<"a.b") only match themselves.
  eos_pat = Regexp.escape eos
  eos_re  = indent ? /[ \t]*#{eos_pat}(\r?\n|\z)/ : /#{eos_pat}(\r?\n|\z)/
  err_msg = "can't match #{eos_re.inspect} anywhere in "

  rb_compile_error err_msg if
    src.eos?

  # Terminator at the start of a line: push the saved rest-of-line back and
  # finish the string.
  if src.beginning_of_line? && src.scan(eos_re) then
    src.unread_many last_line # TODO: figure out how to remove this
    self.yacc_value = eos
    return :tSTRING_END
  end

  self.string_buffer = []

  if expand then
    case
    when src.scan(/#[$@]/) then
      src.pos -= 1 # FIX omg stupid
      self.yacc_value = src.matched
      return :tSTRING_DVAR
    when src.scan(/#[{]/) then
      self.yacc_value = src.matched
      return :tSTRING_DBEG
    when src.scan(/#/) then
      string_buffer << '#'
    end

    # Collect content line by line until the terminator is consumed.
    until src.scan(eos_re) do
      c = tokadd_string func, "\n", nil

      rb_compile_error err_msg if
        c == RubyLexer::EOF

      if c != "\n" then
        # tokadd_string stopped before the end of the line: hand back what
        # has been collected so far.
        self.yacc_value = string_buffer.join.delete("\r")
        return :tSTRING_CONTENT
      else
        string_buffer << src.scan(/\n/)
      end

      rb_compile_error err_msg if
        src.eos?
    end

    # tack on a NL after the heredoc token - FIX NL should not be needed
    src.unread_many(eos + "\n") # TODO: remove this... stupid stupid stupid
  else
    # No interpolation: copy whole lines until the terminator is next.
    until src.check(eos_re) do
      string_buffer << src.scan(/.*(\n|\z)/)
      rb_compile_error err_msg if
        src.eos?
    end
  end

  self.lex_strterm = [:heredoc, eos, func, last_line]
  self.yacc_value = string_buffer.join.delete("\r")

  return :tSTRING_CONTENT
end

#heredoc_identifier ⇒ `Object`

51 lines

# File 'lib/ruby_lexer.rb', line 166

# Scans a here-document identifier at the current position of src (yylex
# calls this right after scanning "<<") and primes lex_strterm for the
# heredoc body.
#
# Returns :tXSTRING_BEG for a backtick-quoted identifier, :tSTRING_BEG for any
# other identifier, or nil when nothing that looks like an identifier follows.
# Raises (via rb_compile_error) for an unterminated quoted identifier.
def heredoc_identifier # 51 lines
  term, func = nil, STR_FUNC_BORING
  self.string_buffer = []

  case
  # Quoted identifier: group 1 is the optional '-', group 2 the quote
  # character, group 3 the identifier text up to the matching quote.
  when src.scan(/(-?)(['"`])(.*?)\2/) then
    term = src[2]
    unless src[1].empty? then
      func |= STR_FUNC_INDENT
    end
    func |= case term
            when "\'" then
              STR_SQUOTE
            when '"' then
              STR_DQUOTE
            else
              STR_XQUOTE
            end
    string_buffer << src[3]
  # An opening quote that the branch above could not pair with a closing one.
  when src.scan(/-?(['"`])(?!\1*\Z)/) then
    rb_compile_error "unterminated here document identifier"
  # Bare identifier: treated like a double-quoted one.
  when src.scan(/(-?)(\w+)/) then
    term = '"'
    func |= STR_DQUOTE
    unless src[1].empty? then
      func |= STR_FUNC_INDENT
    end
    string_buffer << src[2]
  else
    return nil
  end

  # Cut the remainder of the current line (through its newline) out of the
  # underlying string, leaving a single "\n" in its place, and step over that
  # newline. The removed text is kept in `line` and stored in lex_strterm.
  if src.check(/.*\n/) then
    # TODO: think about storing off the char range instead
    line = src.string[src.pos, src.matched_size]
    src.string[src.pos, src.matched_size] = "\n"
    src.extra_lines_added += 1
    src.pos += 1
  else
    line = nil
  end

  self.lex_strterm = [:heredoc, string_buffer.join, func, line]

  if term == '`' then
    self.yacc_value = "`"
    return :tXSTRING_BEG
  else
    self.yacc_value = "\""
    return :tSTRING_BEG
  end
end

#int_with_base(base) ⇒ `Object`

# File 'lib/ruby_lexer.rb', line 228

# Converts the text most recently matched by src into an Integer in the given
# base, stores it in yacc_value, and returns :tINTEGER. A doubled underscore
# in the literal is reported through rb_compile_error.
def int_with_base(base)
  literal = src.matched
  rb_compile_error "Invalid numeric format" if literal.include?("__")
  self.yacc_value = literal.to_i(base)
  :tINTEGER
end

#parse_number ⇒ `Object`

Parse a number from the input stream.

Parameters:

c —

The first character of the number.

Returns:

An int constant which represents a token.

# File 'lib/ruby_lexer.rb', line 250

# Scans a numeric literal at the current position of src and returns its
# token type: :tINTEGER (via int_with_base) or :tFLOAT, with the numeric value
# stored in yacc_value. lex_state is set to :expr_end before scanning.
#
# The `when` branches are tried in order, so each error pattern has to stay
# ahead of the more general pattern that would otherwise accept the text.
#
# Raises (via rb_compile_error) on malformed literals.
def parse_number
  self.lex_state = :expr_end

  case
  # Radix prefix with no digits after it.
  when src.scan(/[+-]?0[xbd]\b/) then
    rb_compile_error "Invalid numeric format"
  # Hexadecimal.
  when src.scan(/[+-]?0x[a-f0-9_]+/i) then
    int_with_base(16)
  # Binary.
  when src.scan(/[+-]?0b[01_]+/) then
    int_with_base(2)
  # Explicit decimal (0d prefix).
  when src.scan(/[+-]?0d[0-9_]+/) then
    int_with_base(10)
  # Leading-zero (octal) literal that contains an 8 or 9.
  when src.scan(/[+-]?0[Oo]?[0-7_]*[89]/) then
    rb_compile_error "Illegal octal digit."
  # Octal.
  when src.scan(/[+-]?0[Oo]?[0-7_]+|0[Oo]/) then
    int_with_base(8)
  # Underscore directly before an exponent marker or decimal point.
  when src.scan(/[+-]?[\d_]+_(e|\.)/) then
    rb_compile_error "Trailing '_' in number."
  # Float: digits.digits with optional exponent, or digits with an exponent.
  when src.scan(/[+-]?[\d_]+\.[\d_]+(e[+-]?[\d_]+)?\b|[+-]?[\d_]+e[+-]?[\d_]+\b/i) then
    number = src.matched
    if number =~ /__/ then
      rb_compile_error "Invalid numeric format"
    end
    self.yacc_value = number.to_f
    :tFLOAT
  # A lone zero.
  when src.scan(/[+-]?0\b/) then
    int_with_base(10)
  # Plain decimal integer.
  when src.scan(/[+-]?[\d_]+\b/) then
    int_with_base(10)
  else
    rb_compile_error "Bad number format"
  end
end

#parse_quote ⇒ `Object`

58 lines

# File 'lib/ruby_lexer.rb', line 284

# Scans the type letter and opening delimiter of a %-literal (%Q{}, %w(),
# %r!!, bare %{} ...), stores a [:strterm, flags, closing, opening] array in
# lex_strterm, and returns the matching *_BEG token type.
#
# Raises (via rb_compile_error) for a two-character type, for input that ends
# right after the delimiter, and for an unrecognised type letter.
def parse_quote # 58 lines
  beg, nnd, short_hand, c = nil, nil, false, nil

  if src.scan(/[a-z0-9]{1,2}/i) then # Long-hand (e.g. %Q{}).
    rb_compile_error "unknown type of %string" if src.matched_size == 2
    c, beg, short_hand = src.matched, src.getch, false
  else                               # Short-hand (e.g. %{, %., %!, etc)
    c, beg, short_hand = 'Q', src.getch, true
  end

  if src.eos? or c == RubyLexer::EOF or beg == RubyLexer::EOF then
    rb_compile_error "unterminated quoted string meets end of file"
  end

  # Figure nnd-char.  "\0" is special to indicate beg=nnd and that no nesting?
  nnd = { "(" => ")", "[" => "]", "{" => "}", "<" => ">" }[beg]
  # Not a bracket pair: the delimiter closes itself and the opener becomes "\0".
  nnd, beg = beg, "\0" if nnd.nil?

  # Default yacc_value; the 'Q' branch below overwrites it.
  token_type, self.yacc_value = nil, "%#{c}#{beg}"
  token_type, string_type = case c
                            when 'Q' then
                              ch = short_hand ? nnd : c + beg
                              self.yacc_value = "%#{ch}"
                              [:tSTRING_BEG,   STR_DQUOTE]
                            when 'q' then
                              [:tSTRING_BEG,   STR_SQUOTE]
                            when 'W' then
                              # Skip whitespace before the first word.
                              src.scan(/\s*/)
                              [:tWORDS_BEG,    STR_DQUOTE | STR_FUNC_AWORDS]
                            when 'w' then
                              src.scan(/\s*/)
                              [:tAWORDS_BEG,   STR_SQUOTE | STR_FUNC_AWORDS]
                            when 'x' then
                              [:tXSTRING_BEG,  STR_XQUOTE]
                            when 'r' then
                              [:tREGEXP_BEG,   STR_REGEXP]
                            when 's' then
                              self.lex_state  = :expr_fname
                              [:tSYMBEG,       STR_SSYM]
                            end

  # No branch matched, so the multiple assignment left token_type nil.
  # NOTE(review): "\W" inside a double-quoted literal is just "W", and the
  # list in this message omits 's' even though %s is handled above.
  rb_compile_error "Bad %string type. Expected [Qqwxr\W], found '#{c}'." if
    token_type.nil?

  self.lex_strterm = [:strterm, string_type, nnd, beg]

  return token_type
end

#parse_string(quote) ⇒ `Object`

65 lines

# File 'lib/ruby_lexer.rb', line 333

# Lexes the next piece of a quoted literal described by quote, a
# [:strterm, flags, terminator, opener] array.
#
# Returns one of :tSTRING_END, :tREGEXP_END, :tSPACE, :tSTRING_DVAR,
# :tSTRING_DBEG or :tSTRING_CONTENT. For :tSTRING_CONTENT the collected text
# is placed in yacc_value.
#
# Raises (via rb_compile_error) when input ends inside the literal.
def parse_string(quote) # 65 lines
  _, string_type, term, open = quote

  space = false # FIX: remove these
  func = string_type
  paren = open
  term_re = Regexp.escape term

  # When func is nil these all come out true (nil & n is false, and
  # false != 0), but the guard just below returns before they are used.
  awords = (func & STR_FUNC_AWORDS) != 0
  regexp = (func & STR_FUNC_REGEXP) != 0
  expand = (func & STR_FUNC_EXPAND) != 0

  # func is nil once the word-list branch below has set quote[1] = nil;
  # presumably the same quote array is then passed in again - TODO confirm.
  unless func then # FIX: impossible, prolly needs == 0
    self.lineno = nil
    return :tSTRING_END
  end

  space = true if awords and src.scan(/\s+/)

  # Terminator reached outside any nested delimiter pair.
  if self.nest == 0 && src.scan(/#{term_re}/) then
    if awords then
      # Mark the literal as finished; the next call returns :tSTRING_END.
      quote[1] = nil
      return :tSPACE
    elsif regexp then
      self.yacc_value = self.regx_options
      self.lineno = nil
      return :tREGEXP_END
    else
      self.yacc_value = term
      self.lineno = nil
      return :tSTRING_END
    end
  end

  # Whitespace between words of a word list.
  if space then
    return :tSPACE
  end

  self.string_buffer = []

  if expand
    case
    # "#" followed by "$" or "@": only the "#" is consumed.
    when src.scan(/#(?=[$@])/) then
      return :tSTRING_DVAR
    when src.scan(/#[{]/) then
      return :tSTRING_DBEG
    # A "#" that does not start an interpolation is ordinary content.
    when src.scan(/#/) then
      string_buffer << '#'
    end
  end

  if tokadd_string(func, term, paren) == RubyLexer::EOF then
    rb_compile_error "unterminated string meets end of file"
  end

  self.yacc_value = string_buffer.join


  return :tSTRING_CONTENT
end

#process_token(command_state) ⇒ `Object`

# File 'lib/ruby_lexer.rb', line 1211

# Classifies the identifier-like text held in token and returns its token
# type: :tGVAR, :tCVAR, :tIVAR, :tFID, :tIDENTIFIER, :tCONSTANT, or a keyword
# token id. Also updates lex_state and yacc_value (and command_start for
# `do`).
#
# command_state - truthy when the token begins a command; selects
#                 :expr_cmdarg over :expr_arg for the following state.
def process_token(command_state)

  # Absorb a trailing "!" or "?" (unless followed by "=") into a word token.
  token << src.matched if token =~ /^\w/ && src.scan(/[\!\?](?!=)/)

  result = nil
  last_state = lex_state


  case token
  when /^\$/ then
    self.lex_state, result = :expr_end, :tGVAR
  when /^@@/ then
    self.lex_state, result = :expr_end, :tCVAR
  when /^@/ then
    self.lex_state, result = :expr_end, :tIVAR
  else
    if token =~ /[!?]$/ then
      result = :tFID
    else
      if lex_state == :expr_fname then
        # ident=, not =~ => == or followed by =>
        # TODO test lexing of a=>b vs a==>b
        if src.scan(/=(?:(?![~>=])|(?==>))/) then
          result = :tIDENTIFIER
          token << src.matched
        end
      end

      # Leading capital letter means constant; anything else is an identifier.
      result ||= if token =~ /^[A-Z]/ then
                   :tCONSTANT
                 else
                   :tIDENTIFIER
                 end
    end

    # After a "." the word is looked up as a plain name, never as a keyword.
    unless lex_state == :expr_dot then
      # See if it is a reserved word.
      keyword = Keyword.keyword token

      if keyword then
        state           = lex_state
        self.lex_state  = keyword.state
        self.yacc_value = token

        # `do` maps to one of several tokens depending on the cond/cmdarg
        # stacks and the state the lexer was in before the keyword.
        if keyword.id0 == :kDO then
          self.command_start = true
          return :kDO_COND  if cond.is_in_state
          return :kDO_BLOCK if cmdarg.is_in_state && state != :expr_cmdarg
          return :kDO_BLOCK if state == :expr_endarg
          return :kDO
        end

        return keyword.id0 if state == :expr_beg

        self.lex_state = :expr_beg if keyword.id0 != keyword.id1

        return keyword.id1
      end
    end

    # Not a keyword: pick the state that follows a plain name.
    if (lex_state == :expr_beg || lex_state == :expr_mid ||
        lex_state == :expr_dot || lex_state == :expr_arg ||
        lex_state == :expr_cmdarg) then
      if command_state then
        self.lex_state = :expr_cmdarg
      else
        self.lex_state = :expr_arg
      end
    else
      self.lex_state = :expr_end
    end
  end

  self.yacc_value = token


  # A name the parser's env records as :lvar ends the expression, unless the
  # token followed a ".".
  self.lex_state = :expr_end if
    last_state != :expr_dot && self.parser.env[token.to_sym] == :lvar

  return result
end

#rb_compile_error(msg) ⇒ `Object`

Raises:

(SyntaxError)

# File 'lib/ruby_lexer.rb', line 394

# Raises SyntaxError with msg followed by the current line number and the
# inspected text of the first line remaining in src.
def rb_compile_error(msg)
  line    = lineno
  context = src.rest[/^.*/].inspect
  raise SyntaxError, "#{msg}. near line #{line}: #{context}"
end

#read_escape ⇒ `Object`

51 lines

# File 'lib/ruby_lexer.rb', line 399

# Reads one escape sequence from src (the leading backslash has already been
# consumed by the caller) and returns the string it stands for.
#
# Handles the single-letter escapes, octal (\nnn) and hex (\xnn) constants,
# meta (\M-x) and control (\C-x, \cx) forms including one nested escape, and
# falls back to returning the next character unchanged.
#
# Raises (via rb_compile_error) for a truncated or malformed escape.
def read_escape # 51 lines
  case
  when src.scan(/\\/) then                  # Backslash
    '\\'
  when src.scan(/n/) then                   # newline
    "\n"
  when src.scan(/t/) then                   # horizontal tab
    "\t"
  when src.scan(/r/) then                   # carriage-return
    "\r"
  when src.scan(/f/) then                   # form-feed
    "\f"
  when src.scan(/v/) then                   # vertical tab
    "\13"
  when src.scan(/a/) then                   # alarm(bell)
    "\007"
  when src.scan(/e/) then                   # escape
    "\033"
  when src.scan(/b/) then                   # backspace
    "\010"
  when src.scan(/s/) then                   # space
    " "
  when src.scan(/[0-7]{1,3}/) then          # octal constant
    # Three octal digits can reach 0777 (511); keep only the low byte so
    # Integer#chr cannot raise RangeError.
    (src.matched.to_i(8) & 0xFF).chr
  when src.scan(/x([0-9a-fA-F]{1,2})/) then # hex constant
    src[1].to_i(16).chr
  when src.check(/M-\\[\\MCc]/) then
    src.scan(/M-\\/) # eat it
    c = read_escape
    c[0] = (c[0].ord | 0x80).chr
    c
  when src.scan(/M-(.)/) then
    c = src[1]
    c[0] = (c[0].ord | 0x80).chr
    c
  when src.check(/(C-|c)\\[\\MCc]/) then
    src.scan(/(C-|c)\\/) # eat it
    c = read_escape
    c[0] = (c[0].ord & 0x9f).chr
    c
  when src.scan(/C-\?|c\?/) then
    127.chr
  when src.scan(/(C-|c)(.)/) then
    c = src[2]
    c[0] = (c[0].ord & 0x9f).chr
    c
  when src.scan(/[McCx0-9]/) || src.eos? then
    rb_compile_error("Invalid escape character syntax")
  else
    src.getch
  end
end

#regx_options ⇒ `Object`

15 lines

# File 'lib/ruby_lexer.rb', line 452

# Scans a run of lowercase letters after a regexp terminator and returns the
# recognised option letters (i, x, m, o, n, e, s, u) as a string, or "" when
# there are none. Any other letter is reported through rb_compile_error.
def regx_options # 15 lines
  known, unknown = [], []

  if src.scan(/[a-z]+/)
    known, unknown = src.matched.chars.partition { |ch| "ixmonesu".include?(ch) }
  end

  unless unknown.empty?
    plural = unknown.size > 1 ? "s" : ""
    rb_compile_error("unknown regexp option%s - %s" % [plural, unknown.join.inspect])
  end

  known.join
end

#reset ⇒ `Object`

# File 'lib/ruby_lexer.rb', line 467

# Puts the lexer back into its starting condition: command_start is true and
# lex_strterm, token, yacc_value, @src and @lex_state are all nil.
def reset
  self.command_start = true
  self.lex_strterm = self.token = self.yacc_value = nil

  @src = @lex_state = nil
end

#tokadd_escape(term) ⇒ `Object`

20 lines

# File 'lib/ruby_lexer.rb', line 482

# Copies one backslash escape from src into string_buffer verbatim (the
# escape is not interpreted). A backslash-newline pair is dropped, and a
# meta/control prefix followed by another backslash recurses to copy the
# nested escape as well.
#
# Raises (via rb_compile_error) when the escape is malformed.
def tokadd_escape term # 20 lines
  if src.scan(/\\\n/)
    # just ignore
  elsif src.scan(/\\([0-7]{1,3}|x[0-9a-fA-F]{1,2})/)
    string_buffer << src.matched
  elsif src.scan(/\\([MC]-|c)(?=\\)/)
    string_buffer << src.matched
    tokadd_escape term
  elsif src.scan(/\\([MC]-|c)(.)/)
    string_buffer << src.matched
  elsif src.scan(/\\[McCx]/)
    rb_compile_error "Invalid escape character syntax"
  elsif src.scan(/\\(.)/m)
    string_buffer << src.matched
  else
    rb_compile_error "Invalid escape character syntax"
  end
end

#tokadd_string(func, term, paren) ⇒ `Object`

105 lines

# File 'lib/ruby_lexer.rb', line 502

# Appends string content from src to string_buffer until something the
# caller must handle is reached: the terminator at nesting depth 0,
# whitespace in word-list mode, the start of an interpolation, or end of
# input.
#
# func  - STR_FUNC_* flag set describing the literal.
# term  - terminator string.
# paren - opening delimiter for nestable literals, or nil.
#
# Returns the last chunk handled, or RubyLexer::EOF when src is exhausted.
def tokadd_string(func, term, paren) # 105 lines
  awords = (func & STR_FUNC_AWORDS) != 0
  escape = (func & STR_FUNC_ESCAPE) != 0
  expand = (func & STR_FUNC_EXPAND) != 0
  regexp = (func & STR_FUNC_REGEXP) != 0
  symbol = (func & STR_FUNC_SYMBOL) != 0

  paren_re = paren.nil? ? nil : Regexp.new(Regexp.escape(paren))
  term_re  = Regexp.new(Regexp.escape(term))

  until src.eos? do
    c = nil
    handled = true
    case
    # Terminator outside any nested pair: step back one position so the
    # caller sees it again, and stop.
    when self.nest == 0 && src.scan(term_re) then
      src.pos -= 1
      break
    # Nested opener / closer: track depth; the text is appended below.
    when paren_re && src.scan(paren_re) then
      self.nest += 1
    when src.scan(term_re) then
      self.nest -= 1
    # Word list: whitespace ends the current word.
    when awords && src.scan(/\s/) then
      src.pos -= 1
      break
    # "#" that begins an interpolation: leave it for the caller.
    when expand && src.scan(/#(?=[\$\@\{])/) then
      src.pos -= 1
      break
    when expand && src.scan(/#(?!\n)/) then
      # do nothing
    when src.check(/\\/) then
      case
      when awords && src.scan(/\\\n/) then
        string_buffer << "\n"
        next
      when awords && src.scan(/\\\s/) then
        c = ' '
      when expand && src.scan(/\\\n/) then
        next
      # Regexp literals keep their escapes verbatim.
      when regexp && src.check(/\\/) then
        self.tokadd_escape term
        next
      # Interpolating literals translate the escape.
      when expand && src.scan(/\\/) then
        c = self.read_escape
      when src.scan(/\\\n/) then
        # do nothing
      when src.scan(/\\\\/) then
        string_buffer << '\\' if escape
        c = '\\'
      when src.scan(/\\/) then
        # Keep the backslash unless it escapes the terminator or the opener.
        unless src.scan(term_re) || paren.nil? || src.scan(paren_re) then
          string_buffer << "\\"
        end
      else
        handled = false
      end
    else
      handled = false
    end # case

    unless handled then

      # Take the longest run of characters that are not special for this
      # literal, or failing that a single character.
      t = Regexp.escape term
      x = Regexp.escape(paren) if paren && paren != "\000"
      re = if awords then
             /[^#{t}#{x}\#\0\\\n\ ]+|./ # |. to pick up whatever
           else
             /[^#{t}#{x}\#\0\\]+|./
           end

      src.scan re
      c = src.matched

      rb_compile_error "symbol cannot contain '\\0'" if symbol && c =~ /\0/
    end # unless handled

    # Branches that did not set c append whatever the last scan matched.
    c ||= src.matched
    string_buffer << c
  end # until

  c ||= src.matched
  c = RubyLexer::EOF if src.eos?


  return c
end

#unescape(s) ⇒ `Object`

# File 'lib/ruby_lexer.rb', line 588

# Translates the body of a backslash escape (the text after the backslash)
# into the string it represents.
#
# s - escape body such as "n", "101", "x41", "M-a" or "C-a".
#
# Returns the unescaped string; an unrecognised body is returned unchanged.
# Raises (via rb_compile_error) for a truncated meta/control/hex escape or a
# bare digit that is not valid octal.
def unescape s

  r = {
    "a"    => "\007",
    "b"    => "\010",
    "e"    => "\033",
    "f"    => "\f",
    "n"    => "\n",
    "r"    => "\r",
    "s"    => " ",
    "t"    => "\t",
    "v"    => "\13",
    "\\"   => '\\',
    "\n"   => "",
    "C-\?" => 127.chr,
    "c\?"  => 127.chr,
  }[s]

  return r if r

  case s
  when /^[0-7]{1,3}/ then
    # Three octal digits can reach 0777 (511); keep only the low byte so
    # Integer#chr cannot raise RangeError.
    ($&.to_i(8) & 0xFF).chr
  when /^x([0-9a-fA-F]{1,2})/ then
    $1.to_i(16).chr
  when /^M-(.)/ then
    ($1[0].ord | 0x80).chr
  when /^(C-|c)(.)/ then
    ($2[0].ord & 0x9f).chr
  when /^[McCx0-9]/ then
    rb_compile_error("Invalid escape character syntax")
  else
    s
  end
end

#warning(s) ⇒ `Object`



624
625
626

# File 'lib/ruby_lexer.rb', line 624

# Warning hook; currently discards the message s and returns nil.
def warning(s)
  nil # do nothing for now
end

#yylex ⇒ `Object`

Returns the next token. Also sets yy_val if needed.

Returns:

Description of the Returned Value

# File 'lib/ruby_lexer.rb', line 633

def yylex # 826 lines

  c = ''
  space_seen = false
  command_state = false
  src = self.src

  self.token = nil
  self.yacc_value = nil

  return yylex_string if lex_strterm

  command_state = self.command_start
  self.command_start = false

  last_state = lex_state

  loop do # START OF CASE
    if src.scan(/[\ \t\r\f\v]/) then # \s - \n + \v
      space_seen = true
      next
    elsif src.check(/[^a-zA-Z]/) then
      if src.scan(/\n|#/) then
        self.lineno = nil
        c = src.matched
        if c == '#' then
          src.pos -= 1

          while src.scan(/\s*#.*(\n+|\z)/) do
            @comments << src.matched.gsub(/^ +#/, '#').gsub(/^ +$/, '')
          end

          if src.eos? then
            return RubyLexer::EOF
          end
        end

        # Replace a string of newlines with a single one
        src.scan(/\n+/)

        if [:expr_beg, :expr_fname,
            :expr_dot, :expr_class].include? lex_state then
          next
        end

        self.command_start = true
        self.lex_state = :expr_beg
        return :tNL
      elsif src.scan(/[\]\)\}]/) then
        cond.lexpop
        cmdarg.lexpop
        self.lex_state = :expr_end
        self.yacc_value = src.matched
        result = {
          ")" => :tRPAREN,
          "]" => :tRBRACK,
          "}" => :tRCURLY
        }[src.matched]
        return result
      elsif src.scan(/\.\.\.?|,|![=~]?/) then
        self.lex_state = :expr_beg
        tok = self.yacc_value = src.matched
        return TOKENS[tok]
      elsif src.check(/\./) then
        if src.scan(/\.\d/) then
          rb_compile_error "no .<digit> floating literal anymore put 0 before dot"
        elsif src.scan(/\./) then
          self.lex_state = :expr_dot
          self.yacc_value = "."
          return :tDOT
        end
      elsif src.scan(/\(/) then
        result = :tLPAREN2
        self.command_start = true

        if lex_state == :expr_beg || lex_state == :expr_mid then
          result = :tLPAREN
        elsif space_seen then
          if lex_state == :expr_cmdarg then
            result = :tLPAREN_ARG
          elsif lex_state == :expr_arg then
            warning("don't put space before argument parentheses")
            result = :tLPAREN2
          end
        end

        self.expr_beg_push "("

        return result
      elsif src.check(/\=/) then
        if src.scan(/\=\=\=|\=\=|\=~|\=>|\=(?!begin\b)/) then
          self.fix_arg_lex_state
          tok = self.yacc_value = src.matched
          return TOKENS[tok]
        elsif src.scan(/\=begin(?=\s)/) then
          # @comments << '=' << src.matched
          @comments << src.matched

          unless src.scan(/.*?\n=end( |\t|\f)*[^(\n|\z)]*(\n|\z)/m) then
            @comments.clear
            rb_compile_error("embedded document meets end of file")
          end

          @comments << src.matched

          next
        else
          raise "you shouldn't be able to get here"
        end
      elsif src.scan(/\"(#{ESC_RE}|#(#{ESC_RE}|[^\{\#\@\$\"\\])|[^\"\\\#])*\"/o) then
        self.yacc_value = src.matched[1..-2].gsub(ESC_RE) { unescape $1 }
        self.lex_state = :expr_end
        return :tSTRING
      elsif src.scan(/\"/) then # FALLBACK
        self.lex_strterm = [:strterm, STR_DQUOTE, '"', "\0"] # TODO: question this
        self.yacc_value = "\""
        return :tSTRING_BEG
      elsif src.scan(/\@\@?\w*/) then
        self.token = src.matched

        rb_compile_error "`#{token}` is not allowed as a variable name" if
          token =~ /\@\d/

        return process_token(command_state)
      elsif src.scan(/\:\:/) then
        if (lex_state == :expr_beg ||
            lex_state == :expr_mid ||
            lex_state == :expr_class ||
            (lex_state.is_argument && space_seen)) then
          self.lex_state = :expr_beg
          self.yacc_value = "::"
          return :tCOLON3
        end

        self.lex_state = :expr_dot
        self.yacc_value = "::"
        return :tCOLON2
      elsif lex_state != :expr_end && lex_state != :expr_endarg && src.scan(/:([a-zA-Z_]\w*(?:[?!]|=(?!>))?)/) then
        self.yacc_value = src[1]
        self.lex_state = :expr_end
        return :tSYMBOL
      elsif src.scan(/\:/) then
        # ?: / then / when
        if (lex_state == :expr_end || lex_state == :expr_endarg||
            src.check(/\s/)) then
          self.lex_state = :expr_beg
          self.yacc_value = ":"
          return :tCOLON
        end

        case
        when src.scan(/\'/) then
          self.lex_strterm = [:strterm, STR_SSYM, src.matched, "\0"]
        when src.scan(/\"/) then
          self.lex_strterm = [:strterm, STR_DSYM, src.matched, "\0"]
        end

        self.lex_state = :expr_fname
        self.yacc_value = ":"
        return :tSYMBEG
      elsif src.check(/[0-9]/) then
        return parse_number
      elsif src.scan(/\[/) then
        result = src.matched

        if lex_state == :expr_fname || lex_state == :expr_dot then
          self.lex_state = :expr_arg
          case
          when src.scan(/\]\=/) then
            self.yacc_value = "[]="
            return :tASET
          when src.scan(/\]/) then
            self.yacc_value = "[]"
            return :tAREF
          else
            rb_compile_error "unexpected '['"
          end
        elsif lex_state == :expr_beg || lex_state == :expr_mid then
          result = :tLBRACK
        elsif lex_state.is_argument && space_seen then
          result = :tLBRACK
        end

        self.expr_beg_push "["

        return result
      elsif src.scan(/\'(\\.|[^\'])*\'/) then
        self.yacc_value = src.matched[1..-2].gsub(/\\\\/, "\\").gsub(/\\'/, "'")
        self.lex_state = :expr_end
        return :tSTRING
      elsif src.check(/\|/) then
        if src.scan(/\|\|\=/) then
          self.lex_state = :expr_beg
          self.yacc_value = "||"
          return :tOP_ASGN
        elsif src.scan(/\|\|/) then
          self.lex_state = :expr_beg
          self.yacc_value = "||"
          return :tOROP
        elsif src.scan(/\|\=/) then
          self.lex_state = :expr_beg
          self.yacc_value = "|"
          return :tOP_ASGN
        elsif src.scan(/\|/) then
          self.fix_arg_lex_state
          self.yacc_value = "|"
          return :tPIPE
        end
      elsif src.scan(/\{/) then
        result = if lex_state.is_argument || lex_state == :expr_end then
                   :tLCURLY      #  block (primary)
                 elsif lex_state == :expr_endarg then
                   :tLBRACE_ARG  #  block (expr)
                 else
                   :tLBRACE      #  hash
                 end

        self.expr_beg_push "{"

        return result
      elsif src.scan(/[+-]/) then
        sign = src.matched
        utype, type = if sign == "+" then
                        [:tUPLUS, :tPLUS]
                      else
                        [:tUMINUS, :tMINUS]
                      end

        if lex_state == :expr_fname || lex_state == :expr_dot then
          self.lex_state = :expr_arg
          if src.scan(/@/) then
            self.yacc_value = "#{sign}@"
            return utype
          else
            self.yacc_value = sign
            return type
          end
        end

        if src.scan(/\=/) then
          self.lex_state = :expr_beg
          self.yacc_value = sign
          return :tOP_ASGN
        end

        if (lex_state == :expr_beg || lex_state == :expr_mid ||
            (lex_state.is_argument && space_seen && !src.check(/\s/))) then
          if lex_state.is_argument then
            arg_ambiguous
          end

          self.lex_state = :expr_beg
          self.yacc_value = sign

          if src.check(/\d/) then
            if utype == :tUPLUS then
              return self.parse_number
            else
              return :tUMINUS_NUM
            end
          end

          return utype
        end

        self.lex_state = :expr_beg
        self.yacc_value = sign
        return type
      elsif src.check(/\*/) then
        if src.scan(/\*\*=/) then
          self.lex_state = :expr_beg
          self.yacc_value = "**"
          return :tOP_ASGN
        elsif src.scan(/\*\*/) then
          self.yacc_value = "**"
          self.fix_arg_lex_state
          return :tPOW
        elsif src.scan(/\*\=/) then
          self.lex_state = :expr_beg
          self.yacc_value = "*"
          return :tOP_ASGN
        elsif src.scan(/\*/) then
          result = if lex_state.is_argument && space_seen && src.check(/\S/) then
                     warning("`*' interpreted as argument prefix")
                     :tSTAR
                   elsif lex_state == :expr_beg || lex_state == :expr_mid then
                     :tSTAR
                   else
                     :tSTAR2
                   end
          self.yacc_value = "*"
          self.fix_arg_lex_state

          return result
        end
      elsif src.check(/\</) then
        if src.scan(/\<\=\>/) then
          self.fix_arg_lex_state
          self.yacc_value = "<=>"
          return :tCMP
        elsif src.scan(/\<\=/) then
          self.fix_arg_lex_state
          self.yacc_value = "<="
          return :tLEQ
        elsif src.scan(/\<\<\=/) then
          self.fix_arg_lex_state
          self.lex_state = :expr_beg
          self.yacc_value = "\<\<"
          return :tOP_ASGN
        elsif src.scan(/\<\</) then
          if (! [:expr_end,    :expr_dot,
                 :expr_endarg, :expr_class].include?(lex_state) &&
              (!lex_state.is_argument || space_seen)) then
            tok = self.heredoc_identifier
            if tok then
              return tok
            end
          end

          self.fix_arg_lex_state
          self.yacc_value = "\<\<"
          return :tLSHFT
        elsif src.scan(/\</) then
          self.fix_arg_lex_state
          self.yacc_value = "<"
          return :tLT
        end
      elsif src.check(/\>/) then
        if src.scan(/\>\=/) then
          self.fix_arg_lex_state
          self.yacc_value = ">="
          return :tGEQ
        elsif src.scan(/\>\>=/) then
          self.fix_arg_lex_state
          self.lex_state = :expr_beg
          self.yacc_value = ">>"
          return :tOP_ASGN
        elsif src.scan(/\>\>/) then
          self.fix_arg_lex_state
          self.yacc_value = ">>"
          return :tRSHFT
        elsif src.scan(/\>/) then
          self.fix_arg_lex_state
          self.yacc_value = ">"
          return :tGT
        end
      elsif src.scan(/\`/) then
        self.yacc_value = "`"
        case lex_state
        when :expr_fname then
          self.lex_state = :expr_end
          return :tBACK_REF2
        when :expr_dot then
          self.lex_state = if command_state then
                             :expr_cmdarg
                           else
                             :expr_arg
                           end
          return :tBACK_REF2
        end
        self.lex_strterm = [:strterm, STR_XQUOTE, '`', "\0"]
        return :tXSTRING_BEG
      elsif src.scan(/\?/) then
        if lex_state == :expr_end || lex_state == :expr_endarg then
          self.lex_state = :expr_beg
          self.yacc_value = "?"
          return :tEH
        end

        if src.eos? then
          rb_compile_error "incomplete character syntax"
        end

        if src.check(/\s|\v/) then
          unless lex_state.is_argument then
            c2 = { " " => 's',
                  "\n" => 'n',
                  "\t" => 't',
                  "\v" => 'v',
                  "\r" => 'r',
                  "\f" => 'f' }[src.matched]

            if c2 then
              warning("invalid character syntax; use ?\\" + c2)
            end
          end

          # ternary
          self.lex_state = :expr_beg
          self.yacc_value = "?"
          return :tEH
        elsif src.check(/\w(?=\w)/) then # ternary, also
          self.lex_state = :expr_beg
          self.yacc_value = "?"
          return :tEH
        end

        c = if src.scan(/\\/) then
              self.read_escape
            else
              src.getch
            end
        self.lex_state = :expr_end
        self.yacc_value = c[0].ord & 0xff
        return :tINTEGER
      elsif src.check(/\&/) then
        if src.scan(/\&\&\=/) then
          self.yacc_value = "&&"
          self.lex_state = :expr_beg
          return :tOP_ASGN
        elsif src.scan(/\&\&/) then
          self.lex_state = :expr_beg
          self.yacc_value = "&&"
          return :tANDOP
        elsif src.scan(/\&\=/) then
          self.yacc_value = "&"
          self.lex_state = :expr_beg
          return :tOP_ASGN
        elsif src.scan(/&/) then
          result = if lex_state.is_argument && space_seen &&
                       !src.check(/\s/) then
                     warning("`&' interpreted as argument prefix")
                     :tAMPER
                   elsif lex_state == :expr_beg || lex_state == :expr_mid then
                     :tAMPER
                   else
                     :tAMPER2
                   end

          self.fix_arg_lex_state
          self.yacc_value = "&"
          return result
        end
      elsif src.scan(/\//) then
        if lex_state == :expr_beg || lex_state == :expr_mid then
          self.lex_strterm = [:strterm, STR_REGEXP, '/', "\0"]
          self.yacc_value = "/"
          return :tREGEXP_BEG
        end

        if src.scan(/\=/) then
          self.yacc_value = "/"
          self.lex_state = :expr_beg
          return :tOP_ASGN
        end

        if lex_state.is_argument && space_seen then
          unless src.scan(/\s/) then
            arg_ambiguous
            self.lex_strterm = [:strterm, STR_REGEXP, '/', "\0"]
            self.yacc_value = "/"
            return :tREGEXP_BEG
          end
        end

        self.fix_arg_lex_state
        self.yacc_value = "/"

        return :tDIVIDE
      elsif src.scan(/\^=/) then
        self.lex_state = :expr_beg
        self.yacc_value = "^"
        return :tOP_ASGN
      elsif src.scan(/\^/) then
        self.fix_arg_lex_state
        self.yacc_value = "^"
        return :tCARET
      elsif src.scan(/\;/) then
        self.command_start = true
        self.lex_state = :expr_beg
        self.yacc_value = ";"
        return :tSEMI
      elsif src.scan(/\~/) then
        if lex_state == :expr_fname || lex_state == :expr_dot then
          src.scan(/@/)
        end

        self.fix_arg_lex_state
        self.yacc_value = "~"

        return :tTILDE
      elsif src.scan(/\\/) then
        if src.scan(/\n/) then
          self.lineno = nil
          space_seen = true
          next
        end
        rb_compile_error "bare backslash only allowed before newline"
      elsif src.scan(/\%/) then
        if lex_state == :expr_beg || lex_state == :expr_mid then
          return parse_quote
        end

        if src.scan(/\=/) then
          self.lex_state = :expr_beg
          self.yacc_value = "%"
          return :tOP_ASGN
        end

        if lex_state.is_argument && space_seen && ! src.check(/\s/) then
          return parse_quote
        end

        self.fix_arg_lex_state
        self.yacc_value = "%"

        return :tPERCENT
      elsif src.check(/\$/) then
        if src.scan(/(\$_)(\w+)/) then
          self.lex_state = :expr_end
          self.token = src.matched
          return process_token(command_state)
        elsif src.scan(/\$_/) then
          self.lex_state = :expr_end
          self.token = src.matched
          self.yacc_value = src.matched
          return :tGVAR
        elsif src.scan(/\$[~*$?!@\/\\;,.=:<>\"]|\$-\w?/) then
          self.lex_state = :expr_end
          self.yacc_value = src.matched
          return :tGVAR
        elsif src.scan(/\$([\&\`\'\+])/) then
          self.lex_state = :expr_end
          # Explicit reference to these vars as symbols...
          if last_state == :expr_fname then
            self.yacc_value = src.matched
            return :tGVAR
          else
            self.yacc_value = src[1].to_sym
            return :tBACK_REF
          end
        elsif src.scan(/\$([1-9]\d*)/) then
          self.lex_state = :expr_end
          if last_state == :expr_fname then
            self.yacc_value = src.matched
            return :tGVAR
          else
            self.yacc_value = src[1].to_i
            return :tNTH_REF
          end
        elsif src.scan(/\$0/) then
          self.lex_state = :expr_end
          self.token = src.matched
          return process_token(command_state)
        elsif src.scan(/\$\W|\$\z/) then # TODO: remove?
          self.lex_state = :expr_end
          self.yacc_value = "$"
          return "$"
        elsif src.scan(/\$\w+/)
          self.lex_state = :expr_end
          self.token = src.matched
          return process_token(command_state)
        end
      elsif src.check(/\_/) then
        if src.beginning_of_line? && src.scan(/\__END__(\n|\Z)/) then
          self.lineno = nil
          return RubyLexer::EOF
        elsif src.scan(/\_\w*/) then
          self.token = src.matched
          return process_token(command_state)
        end
      end
    end # END OF CASE

    if src.scan(/\004|\032|\000/) || src.eos? then # ^D, ^Z, EOF
      return RubyLexer::EOF
    else # alpha check
      if src.scan(/\W/) then
        rb_compile_error "Invalid char #{src.matched.inspect} in expression"
      end
    end

    self.token = src.matched if self.src.scan(/\w+/)

    return process_token(command_state)
  end
end

#yylex_string ⇒ `Object`

23 lines

# File 'lib/ruby_lexer.rb', line 1293

# Produces the next token from the pending lex_strterm.
#
# The strterm is handed to #heredoc when its first element is :heredoc and
# to #parse_string otherwise. When the resulting token is :tSTRING_END or
# :tREGEXP_END, lineno and lex_strterm are reset to nil and lex_state
# becomes :expr_end; for any other token the lexer state is left untouched.
#
# Returns the token produced by #heredoc or #parse_string.
def yylex_string
  strterm = lex_strterm
  token   = strterm[0] == :heredoc ? heredoc(strterm) : parse_string(strterm)

  if [:tSTRING_END, :tREGEXP_END].include?(token)
    # A terminating token was produced: clear the string-lexing state.
    self.lineno      = nil
    self.lex_strterm = nil
    self.lex_state   = :expr_end
  end

  token
end

Class: RubyLexer

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ RubyLexer

Instance Attribute Details

#cmdarg ⇒ Object

#command_start ⇒ Object

#cond ⇒ Object

#lex_state ⇒ Object

#lex_strterm ⇒ Object

#lineno ⇒ Object

#nest ⇒ Object

#parser ⇒ Object

#src ⇒ Object

#string_buffer ⇒ Object

#token ⇒ Object

#warnings ⇒ Object

#yacc_value ⇒ Object

Instance Method Details

#advance ⇒ Object

#arg_ambiguous ⇒ Object

#comments ⇒ Object

#expr_beg_push(val) ⇒ Object

#fix_arg_lex_state ⇒ Object

#heredoc(here) ⇒ Object

#heredoc_identifier ⇒ Object

#int_with_base(base) ⇒ Object

#parse_number ⇒ Object

#parse_quote ⇒ Object

#parse_string(quote) ⇒ Object

#process_token(command_state) ⇒ Object

#rb_compile_error(msg) ⇒ Object

#read_escape ⇒ Object

#regx_options ⇒ Object

#reset ⇒ Object

#tokadd_escape(term) ⇒ Object

#tokadd_string(func, term, paren) ⇒ Object

#unescape(s) ⇒ Object

#warning(s) ⇒ Object

#yylex ⇒ Object

#yylex_string ⇒ Object

#initialize ⇒ `RubyLexer`

#cmdarg ⇒ `Object`

#command_start ⇒ `Object`

#cond ⇒ `Object`

#lex_state ⇒ `Object`

#lex_strterm ⇒ `Object`

#lineno ⇒ `Object`

#nest ⇒ `Object`

#parser ⇒ `Object`

#src ⇒ `Object`

#string_buffer ⇒ `Object`

#token ⇒ `Object`

#warnings ⇒ `Object`

#yacc_value ⇒ `Object`

#advance ⇒ `Object`

#arg_ambiguous ⇒ `Object`

#comments ⇒ `Object`

#expr_beg_push(val) ⇒ `Object`

#fix_arg_lex_state ⇒ `Object`

#heredoc(here) ⇒ `Object`

#heredoc_identifier ⇒ `Object`

#int_with_base(base) ⇒ `Object`

#parse_number ⇒ `Object`

#parse_quote ⇒ `Object`

#parse_string(quote) ⇒ `Object`

#process_token(command_state) ⇒ `Object`

#rb_compile_error(msg) ⇒ `Object`

#read_escape ⇒ `Object`

#regx_options ⇒ `Object`

#reset ⇒ `Object`

#tokadd_escape(term) ⇒ `Object`

#tokadd_string(func, term, paren) ⇒ `Object`

#unescape(s) ⇒ `Object`

#warning(s) ⇒ `Object`

#yylex ⇒ `Object`

#yylex_string ⇒ `Object`