Class: TinyGQL::Lexer

Inherits:

Object

Object
TinyGQL::Lexer

show all

Includes:: Literals

Defined in:: lib/tinygql/lexer.rb

Defined Under Namespace

Modules: LeadBytes, Literals

Constant Summary collapse

# A name: an underscore or ASCII letter, then any number of underscores,
# digits or ASCII letters, ending at a word boundary.
IDENTIFIER =

/[_A-Za-z][_0-9A-Za-z]*\b/

# Input skipped before each token by #advance: runs of commas, spaces,
# newlines and tabs, and `#` comments up to the end of the line. The whole
# pattern is wrapped in `*`, so it also matches the empty string.
# NOTE(review): `\c\r` is a control-character escape, not a plain `\r` —
# confirm this is what was intended.
IGNORE =

%r{
  (?:
    [, \c\r\n\t]+ |
    \#.*$
  )*
}x

# Integer part of a number: optional minus sign, then either a single `0`
# or a non-zero digit followed by more digits.
INT =

/[-]?(?:[0]|[1-9][0-9]*)/

# Fractional part of a number: a dot followed by one or more digits.
FLOAT_DECIMAL =

/[.][0-9]+/

# Exponent part of a number: `e` or `E`, optional sign, one or more digits.
FLOAT_EXP =

/[eE][+-]?[0-9]+/

# A complete numeric literal. Capture group 1 holds the fraction and/or
# exponent; #advance reports :FLOAT when that group matched and :INT otherwise.
NUMERIC =

/#{INT}(#{FLOAT_DECIMAL}#{FLOAT_EXP}|#{FLOAT_DECIMAL}|#{FLOAT_EXP})?/

# Reserved words recognised by the lexer. "on" is the only keyword of
# length 2 and "subscription" the only one of length 12, which #advance
# relies on when it identifies those two by match length alone.
KEYWORDS =

[
  "on",
  "fragment",
  "true",
  "false",
  "null",
  "query",
  "mutation",
  "subscription",
  "schema",
  "scalar",
  "type",
  "extend",
  "implements",
  "interface",
  "union",
  "enum",
  "input",
  "directive",
  "repeatable"
].freeze

# Matches any one keyword (alternation built from the sorted list) that is
# immediately followed by a word boundary, so a longer name that merely
# starts with a keyword does not match.
KW_RE =

/#{Regexp.union(KEYWORDS.sort)}\b/

# The three-dot spread punctuator.
ELLIPSIS =

'...'

# Delimiter of a single-line string.
QUOTE =

'"'

# One digit of a \u escape as accepted while lexing.
# NOTE(review): this class admits every ASCII letter, not only the hex
# digits A-F/a-f; VALID_STRING applies the stricter hex-only check later.
UNICODE_DIGIT =

/[0-9A-Za-z]/

# Exactly four UNICODE_DIGITs (the `XXXX` of `\uXXXX`).
FOUR_DIGIT_UNICODE =

/#{UNICODE_DIGIT}{4}/

# Four or more UNICODE_DIGITs wrapped in the LCURLY / RCURLY literals
# (the braced part of `\u{XXXX...}`).
N_DIGIT_UNICODE =

%r{#{LCURLY}#{UNICODE_DIGIT}{4,}#{RCURLY}}x

# A backslash-u escape in either the four-digit or the braced form.
UNICODE_ESCAPE =

%r{\\u(?:#{FOUR_DIGIT_UNICODE}|#{N_DIGIT_UNICODE})}

# A backslash followed by one of: backslash, slash, b, f, n, r, t.
STRING_ESCAPE = # graphql.github.io/graphql-spec/June2018/#sec-String-Value

%r{[\\][\\/bfnrt]}

# Delimiter of a block string.
BLOCK_QUOTE =

'"""'

# A backslash-escaped double quote.
ESCAPED_QUOTE =

/\\"/

# One unit of single-line string content: an escaped quote, any character
# other than a quote or backslash, a unicode escape, or a simple escape.
STRING_CHAR =

/#{ESCAPED_QUOTE}|[^"\\]|#{UNICODE_ESCAPE}|#{STRING_ESCAPE}/

# Sparse array keyed by character ordinal: for every constant in Literals,
# the slot at the ordinal of the constant's value holds the constant's name
# (a Symbol). #advance indexes it with the lead byte of a punctuation token.
PUNCT_LUT =

Literals.constants.each_with_object([]) { |n, o|
  o[Literals.const_get(n).ord] = n
}

# Classification table indexed by a token's first byte. #advance looks up
# the result of String#getbyte here, and a byte ranges over 0..255, so the
# table needs 256 slots (the previous size of 255 left byte 0xFF without an
# entry). Every slot starts as 0; the code that assigns other lead codes is
# not shown in this listing.
LEAD_BYTES =

Array.new(256) { 0 }

# A complete single-line string: an opening QUOTE, any number of
# STRING_CHARs (capture group 1), and a closing QUOTE.
QUOTED_STRING =

%r{#{QUOTE} ((?:#{STRING_CHAR})*) #{QUOTE}}x

# A complete block string: BLOCK_QUOTE, the raw content (capture group 1),
# and a closing BLOCK_QUOTE. Compiled with the extended and multiline
# flags; the alternatives are described by the comments inside the pattern.
BLOCK_STRING =

%r{
    #{BLOCK_QUOTE}
((?: [^"\\]               |  # Any characters that aren't a quote or slash
(?<!") ["]{1,2} (?!") |  # Any quotes that don't have quotes next to them
\\"{0,3}(?!")         |  # A slash followed by <= 3 quotes that aren't followed by a quote
\\                    |  # A slash
"{1,2}(?!")              # 1 or 2 " followed by something that isn't a quote
)*
(?:"")?)
    #{BLOCK_QUOTE}
}xm

# A two-character escape sequence: a backslash followed by one of
# double quote, backslash, slash, b, f, n, r, t.
ESCAPES =

/\\["\\\/bfnrt]/

# Maps each two-character escape sequence (backslash plus one character) to
# the single character it stands for. Frozen, like KEYWORDS, so this shared
# lookup table cannot be modified at runtime.
ESCAPES_REPLACE =

{
  '\\"' => '"',
  "\\\\" => "\\",
  "\\/" => '/',
  "\\b" => "\b",
  "\\f" => "\f",
  "\\n" => "\n",
  "\\r" => "\r",
  "\\t" => "\t",
}.freeze

# A backslash-u escape, case-insensitive. Group 1 captures a four-hex-digit
# code, group 2 the digits of the braced `\u{...}` form (four or more), and
# group 3 an optional immediately following four-digit `\uXXXX` escape, which
# #replace_escaped_characters_in_place uses to combine surrogate pairs.
UTF_8 =

/\\u(?:([\dAa-f]{4})|\{([\da-f]{4,})\})(?:\\u([\dAa-f]{4}))?/i

# Matches an entire string whose every backslash begins either an ESCAPES
# sequence or a UTF_8 escape. Interpolated once (`o` flag).
VALID_STRING =

/\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o

# 32-slot table of keyword token types. #advance indexes it with
# #_hash applied to a key built from the second and third bytes of the
# matched keyword. The :ON and :SUBSCRIPTION tokens are absent because
# #advance returns them by match length before consulting this table.
# nil marks slots that hold no keyword.
KW_LUT =

[:INTERFACE,
:MUTATION,
:EXTEND,
:FALSE,
:ENUM,
:TRUE,
:NULL,
nil,
nil,
nil,
nil,
nil,
nil,
nil,
:QUERY,
nil,
nil,
:REPEATABLE,
:IMPLEMENTS,
:INPUT,
:TYPE,
:SCHEMA,
nil,
nil,
nil,
:DIRECTIVE,
:UNION,
nil,
nil,
:SCALAR,
nil,
:FRAGMENT]

Constants included from Literals

Literals::AMP, Literals::BANG, Literals::COLON, Literals::DIR_SIGN, Literals::EQUALS, Literals::LBRACKET, Literals::LCURLY, Literals::LPAREN, Literals::PIPE, Literals::RBRACKET, Literals::RCURLY, Literals::RPAREN, Literals::VAR_SIGN

Instance Attribute Summary collapse

#start ⇒ Object readonly

Returns the value of attribute start.

Instance Method Summary collapse

#_hash(key) ⇒ Object
#advance ⇒ Object
#done? ⇒ Boolean
#emit_block(value) ⇒ Object
#emit_string(value) ⇒ Object
#initialize(string) ⇒ Lexer constructor

A new instance of Lexer.
#line ⇒ Object
#next_token ⇒ Object
#replace_escaped_characters_in_place(raw_string) ⇒ Object

Replace any escaped unicode or whitespace with the actual characters. To avoid allocating more strings, this modifies the string passed into it.
#string_value ⇒ Object
#token_value ⇒ Object
#trim_whitespace(str) ⇒ Object

Constructor Details

#initialize(string) ⇒ `Lexer`

Returns a new instance of Lexer.

# File 'lib/tinygql/lexer.rb', line 118

# Creates a lexer over +string+.
#
# @param string [String] the source text to tokenize
# @raise [RuntimeError] if +string+ is not valid in its own encoding
def initialize(string)
  # Same exception class as before (RuntimeError), now with a message that
  # says what was wrong instead of the default "unhandled exception".
  unless string.valid_encoding?
    raise "GraphQL source contains an invalid byte sequence for its encoding (#{string.encoding})"
  end

  @string = string
  @scan = StringScanner.new(string)
  @start = nil # no token has been read yet
end

Instance Attribute Details

#start ⇒ `Object` (readonly)

Returns the value of attribute start.



126
127
128

# File 'lib/tinygql/lexer.rb', line 126

# Returns the scanner position recorded by #advance at the beginning of the
# most recent token (after ignored input was skipped), or nil if #advance
# has not yet found a token.
def start
  @start
end

Instance Method Details

#_hash(key) ⇒ `Object`



368
369
370

# File 'lib/tinygql/lexer.rb', line 368

# Reduces an integer key to a bucket number in 0..31 by multiplying it with
# a fixed constant and keeping five bits of the product. #advance uses the
# result as an index into KW_LUT.
#
# @param key [Integer]
# @return [Integer] a value between 0 and 31 inclusive
def _hash(key)
  product = key * 18_592_990
  (product >> 27) & 0x1f
end

#advance ⇒ `Object`

# File 'lib/tinygql/lexer.rb', line 136

# Skips ignorable input and consumes exactly one token, leaving the scanner
# positioned just past it and @start at the token's first byte.
#
# The token's first byte is classified through LEAD_BYTES and each class is
# handled by its own branch below.
#
# @return [Symbol, nil, false] the token type, or +false+ once the input is
#   exhausted. Keyword and punctuation types come from KW_LUT and PUNCT_LUT,
#   whose slots may be nil.
# @raise [RuntimeError] if a leading "." is not followed by two more dots,
#   or if a leading quote does not start a well-formed string
def advance
  @scan.skip(IGNORE)

  return false if @scan.eos?

  @start = @scan.pos

  lead_byte = @string.getbyte(@start)
  lead_code = LEAD_BYTES[lead_byte]

  if lead_code == LeadBytes::PUNCT
    @scan.pos += 1
    PUNCT_LUT[lead_byte]

  elsif lead_code == LeadBytes::KW
    if (len = @scan.skip(KW_RE))
      # These two keywords are the only ones with these lengths.
      return :ON if len == 2
      return :SUBSCRIPTION if len == 12

      # Second 2 bytes are unique, so we'll hash on those
      key = (@string.getbyte(@start + 2) << 8) | @string.getbyte(@start + 1)

      KW_LUT[_hash(key)]
    else
      # Starts like a keyword but is not one: lex it as a plain name.
      @scan.skip(IDENTIFIER)
      :IDENTIFIER
    end

  elsif lead_code == LeadBytes::IDENT
    @scan.skip(IDENTIFIER)
    :IDENTIFIER

  elsif lead_code == LeadBytes::INT
    if @scan.skip(NUMERIC)
      # Group 1 of NUMERIC is the fraction/exponent part.
      @scan[1] ? :FLOAT : :INT
    else
      # The lead byte promised a number but NUMERIC did not match
      # (presumably a "-" with no digit after it — the table contents are
      # not visible here). Previously this returned :INT without consuming
      # anything, so the next call saw the same byte again. Consume it and
      # report it like any other unrecognised byte.
      @scan.pos += 1
      :UNKNOWN_CHAR
    end

  elsif lead_code == LeadBytes::ELLIPSIS
    # 46 is the byte value of "."; both following bytes must be dots too.
    2.times do |i|
      raise unless @string.getbyte(@start + i + 1) == 46
    end
    @scan.pos += 3
    :ELLIPSIS

  elsif lead_code == LeadBytes::STRING
    # Try the block form first: `"""` would otherwise lex as an empty string.
    raise unless @scan.skip(BLOCK_STRING) || @scan.skip(QUOTED_STRING)
    :STRING

  else
    @scan.pos += 1
    :UNKNOWN_CHAR
  end
end

#done? ⇒ `Boolean`

Returns:

(Boolean)



132
133
134

# File 'lib/tinygql/lexer.rb', line 132

# Whether the underlying scanner has reached the end of the input string.
#
# @return [Boolean]
def done?
  @scan.eos?
end

#emit_block(value) ⇒ `Object`

# File 'lib/tinygql/lexer.rb', line 262

# Produces the value of a block string: the raw content is passed through
# #trim_whitespace and the result is then handled by #emit_string.
#
# @param value [String] block string content without its delimiters
# @return [Object] whatever #emit_string returns for the trimmed content
def emit_block(value)
  emit_string(trim_whitespace(value))
end

#emit_string(value) ⇒ `Object`

# File 'lib/tinygql/lexer.rb', line 267

# Validates string content and resolves its escape sequences in place.
#
# The content must be validly encoded and match VALID_STRING before escapes
# are replaced, and must still be validly encoded afterwards; otherwise the
# result of `emit(:BAD_UNICODE_ESCAPE, value)` is returned.
# NOTE(review): no `emit` method appears in this class's method listing —
# confirm it is defined (e.g. privately) before relying on the error path.
#
# @param value [String] string content without delimiters; modified in place
# @return [String, Object] +value+ with escapes replaced, or the result of
#   the `emit` call when the content is rejected
def emit_string(value)
  well_formed = value.valid_encoding? && value.match?(VALID_STRING)
  return emit(:BAD_UNICODE_ESCAPE, value) unless well_formed

  replace_escaped_characters_in_place(value)
  return emit(:BAD_UNICODE_ESCAPE, value) unless value.valid_encoding?

  value
end

#line ⇒ `Object`



128
129
130

# File 'lib/tinygql/lexer.rb', line 128

# Reports the 1-based line number of the scanner's current position.
#
# StringScanner#pos is a byte offset, so the consumed prefix is taken with
# String#byteslice. Slicing by character count (String#[]) reads too far
# whenever multi-byte characters precede the position and can over-count
# newlines.
#
# @return [Integer]
def line
  consumed = @scan.string.byteslice(0, @scan.pos)
  # Count raw bytes so a position that falls inside a multi-byte character
  # cannot make String#count raise on an invalid sequence.
  consumed.force_encoding(Encoding::BINARY).count("\n") + 1
end

#next_token ⇒ `Object`

# File 'lib/tinygql/lexer.rb', line 207

# Reads the next token and pairs its type with its text.
#
# Strings get their processed value from #string_value. The ellipsis and
# the single-character Literals are sliced directly from the bytes just
# behind the scanner position. Every other type uses #token_value.
#
# @return [Array(Symbol, Object), nil] +[type, value]+, or nil when #advance
#   yields nothing
def next_token
  tok = advance
  return unless tok

  val =
    if tok == :STRING
      string_value
    elsif tok == :ELLIPSIS
      @string.byteslice(@scan.pos - 3, 3)
    elsif Literals.constants.include?(tok)
      @string.byteslice(@scan.pos - 1, 1)
    else
      token_value
    end

  [tok, val]
end

#replace_escaped_characters_in_place(raw_string) ⇒ `Object`

Replace any escaped unicode or whitespace with the actual characters. To avoid allocating more strings, this modifies the string passed into it.

# File 'lib/tinygql/lexer.rb', line 224

# Replaces backslash escapes and \u escapes in +raw_string+ with the
# characters they denote, modifying the string in place.
#
# Both kinds of escape are handled in a single left-to-right pass. Running
# them as two consecutive substitutions would unescape twice: `\\u0041`
# (an escaped backslash followed by the text "u0041") would first become
# `\u0041` and then, wrongly, "A".
#
# A leading surrogate escape immediately followed by a trailing surrogate
# escape is combined into one code point; any other adjacent pair yields
# two separate characters.
#
# @param raw_string [String] string content to rewrite; mutated
# @return [nil]
def replace_escaped_characters_in_place(raw_string)
  # `o`: the pattern is assembled from the constants once and then reused.
  raw_string.gsub!(/(?:#{ESCAPES}|#{UTF_8})/o) do |matched_str|
    hex_1 = $1 || $2

    if hex_1.nil?
      # No \u capture took part in the match, so this is a simple escape.
      ESCAPES_REPLACE[matched_str]
    else
      codepoint_1 = hex_1.to_i(16)
      hex_2 = $3

      if hex_2
        codepoint_2 = hex_2.to_i(16)
        if (codepoint_1 >= 0xD800 && codepoint_1 <= 0xDBFF) && # leading surrogate
            (codepoint_2 >= 0xDC00 && codepoint_2 <= 0xDFFF) # trailing surrogate
          # A surrogate pair
          combined = ((codepoint_1 - 0xD800) * 0x400) + (codepoint_2 - 0xDC00) + 0x10000
          [combined].pack('U'.freeze)
        else
          # Two separate code points
          [codepoint_1].pack('U'.freeze) + [codepoint_2].pack('U'.freeze)
        end
      else
        [codepoint_1].pack('U'.freeze)
      end
    end
  end
  nil
end

#string_value ⇒ `Object`

# File 'lib/tinygql/lexer.rb', line 195

# Returns the processed value of the string token just read.
#
# Exactly one delimiter is removed from each end: three quotes for a block
# string, one otherwise. Stripping every leading and trailing quote, as was
# done before, also removed quotes that belong to the content. For example
# the token `"foo\""` lost the quote of its final `\"` and was left with a
# dangling backslash.
#
# @return [Object] the result of #emit_block for block strings, or of
#   #emit_string for single-line strings
def string_value
  str = token_value
  block = str.start_with?('"""')

  # Delimiters are ASCII, so byte arithmetic is safe here.
  quote_len = block ? 3 : 1
  str = str.byteslice(quote_len, str.bytesize - (2 * quote_len))

  if block
    emit_block str
  else
    emit_string str
  end
end

#token_value ⇒ `Object`



191
192
193

# File 'lib/tinygql/lexer.rb', line 191

# Returns the source text of the token most recently read by #advance:
# the bytes from @start up to the scanner's current position.
#
# Using the scanner's last match size instead is only correct for tokens
# consumed by a regexp. Tokens consumed by moving the position directly
# (e.g. :UNKNOWN_CHAR) leave the last match pointing at the ignored input
# skipped before them, which produced the wrong slice.
#
# @return [String]
def token_value
  @string.byteslice(@start, @scan.pos - @start)
end

#trim_whitespace(str) ⇒ `Object`

# File 'lib/tinygql/lexer.rb', line 281

# Normalises the raw content of a block string: removes the indentation
# shared by all lines after the first, then drops leading and trailing
# blank lines.
#
# A line counts as blank when it is empty or holds only spaces and tabs.
# Previously only empty lines were dropped, so a whitespace-only first or
# last line that was longer than the common indentation survived.
#
# Indentation is measured in leading spaces only, and the first line never
# contributes to, or loses, indentation.
#
# @param str [String] block string content without its delimiters
# @return [String] the trimmed content; +str+ itself may be returned when
#   nothing needs to change
def trim_whitespace(str)
  # Early return for the most common cases:
  return "".dup if str == ""

  has_newline = str.include?("\n")
  # A single line can only change if it is blank, which requires it to
  # start with whitespace.
  return str if !has_newline && !str.start_with?(" ", "\t")

  lines = has_newline ? str.split("\n") : [str]
  common_indent = nil

  # find the common whitespace
  lines.each_with_index do |line, idx|
    next if idx == 0

    # The first three patterns are allocation-free shortcuts for the usual
    # indent widths; the fallback counts leading spaces for anything else.
    line_indent =
      if line.match?(/\A  [^ ]/)
        2
      elsif line.match?(/\A    [^ ]/)
        4
      elsif line.match?(/\A[^ ]/)
        0
      else
        line[/\A */].size
      end

    # Lines made up entirely of spaces do not constrain the common indent.
    if line_indent < line.size && (common_indent.nil? || line_indent < common_indent)
      common_indent = line_indent
    end
  end

  # Remove the common whitespace
  if common_indent && common_indent > 0
    lines.each_with_index do |line, idx|
      line.slice!(0, common_indent) unless idx == 0
    end
  end

  # Remove leading & trailing blank lines
  blank = /\A[ \t]*\z/
  lines.shift while !lines.empty? && lines.first.match?(blank)
  lines.pop while !lines.empty? && lines.last.match?(blank)

  # Rebuild the string
  lines.size > 1 ? lines.join("\n") : (lines.first || "".dup)
end

Class: TinyGQL::Lexer

Defined Under Namespace

Constant Summary collapse

Constants included from Literals

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string) ⇒ Lexer

Instance Attribute Details

#start ⇒ Object (readonly)

Instance Method Details

#_hash(key) ⇒ Object

#advance ⇒ Object

#done? ⇒ Boolean

#emit_block(value) ⇒ Object

#emit_string(value) ⇒ Object

#line ⇒ Object

#next_token ⇒ Object

#replace_escaped_characters_in_place(raw_string) ⇒ Object

#string_value ⇒ Object

#token_value ⇒ Object

#trim_whitespace(str) ⇒ Object