Class: GraphQL::Language::Lexer

Inherits: Object
Defined in: lib/graphql/language/lexer.rb

Defined Under Namespace

Modules: ByteFor, Punctuation

Constant Summary

ESCAPES =
/\\["\\\/bfnrt]/
ESCAPES_REPLACE =
{
  '\\"' => '"',
  "\\\\" => "\\",
  "\\/" => '/',
  "\\b" => "\b",
  "\\f" => "\f",
  "\\n" => "\n",
  "\\r" => "\r",
  "\\t" => "\t",
}
UTF_8 =
/\\u(?:([\dAa-f]{4})|\{([\da-f]{4,})\})(?:\\u([\dAa-f]{4}))?/i
VALID_STRING =
/\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o
ESCAPED =
/(?:#{ESCAPES}|#{UTF_8})/o
IGNORE_REGEXP =
%r{
  (?:
    [, \c\r\n\t]+ |
    \#.*$
  )*
}x
IDENTIFIER_REGEXP =
/[_A-Za-z][_0-9A-Za-z]*/
INT_REGEXP =
/-?(?:[0]|[1-9][0-9]*)/
FLOAT_DECIMAL_REGEXP =
/[.][0-9]+/
FLOAT_EXP_REGEXP =
/[eE][+-]?[0-9]+/
NUMERIC_REGEXP =

TODO: FLOAT_EXP_REGEXP should not be allowed to follow INT_REGEXP; integers are not allowed to have exponent parts.

/#{INT_REGEXP}(#{FLOAT_DECIMAL_REGEXP}#{FLOAT_EXP_REGEXP}|#{FLOAT_DECIMAL_REGEXP}|#{FLOAT_EXP_REGEXP})?/
KEYWORDS =
[
  "on",
  "fragment",
  "true",
  "false",
  "null",
  "query",
  "mutation",
  "subscription",
  "schema",
  "scalar",
  "type",
  "extend",
  "implements",
  "interface",
  "union",
  "enum",
  "input",
  "directive",
  "repeatable"
].freeze
KEYWORD_REGEXP =
/#{Regexp.union(KEYWORDS.sort)}\b/
KEYWORD_BY_TWO_BYTES =
[
  :INTERFACE,
  :MUTATION,
  :EXTEND,
  :FALSE,
  :ENUM,
  :TRUE,
  :NULL,
  nil,
  nil,
  nil,
  nil,
  nil,
  nil,
  nil,
  :QUERY,
  nil,
  nil,
  :REPEATABLE,
  :IMPLEMENTS,
  :INPUT,
  :TYPE,
  :SCHEMA,
  nil,
  nil,
  nil,
  :DIRECTIVE,
  :UNION,
  nil,
  nil,
  :SCALAR,
  nil,
  :FRAGMENT
]
PUNCTUATION_NAME_FOR_BYTE =

A sparse array mapping the bytes for each punctuation to a symbol name for that punctuation

Punctuation.constants.each_with_object([]) { |name, arr|
  punct = Punctuation.const_get(name)
  arr[punct.ord] = name
}
QUOTE =
'"'
UNICODE_DIGIT =
/[0-9A-Za-z]/
FOUR_DIGIT_UNICODE =
/#{UNICODE_DIGIT}{4}/
N_DIGIT_UNICODE =
%r{#{Punctuation::LCURLY}#{UNICODE_DIGIT}{4,}#{Punctuation::RCURLY}}x
UNICODE_ESCAPE =
%r{\\u(?:#{FOUR_DIGIT_UNICODE}|#{N_DIGIT_UNICODE})}
STRING_ESCAPE =
%r{[\\][\\/bfnrt]}
BLOCK_QUOTE =
'"""'
ESCAPED_QUOTE =
/\\"/
STRING_CHAR =
/#{ESCAPED_QUOTE}|[^"\\\n\r]|#{UNICODE_ESCAPE}|#{STRING_ESCAPE}/
QUOTED_STRING_REGEXP =
%r{#{QUOTE} (?:#{STRING_CHAR})* #{QUOTE}}x
BLOCK_STRING_REGEXP =
%r{
  #{BLOCK_QUOTE}
  (?: [^"\\]               |  # Any characters that aren't a quote or slash
     (?<!") ["]{1,2} (?!") |  # Any quotes that don't have quotes next to them
     \\"{0,3}(?!")         |  # A slash followed by <= 3 quotes that aren't followed by a quote
     \\                    |  # A slash
     "{1,2}(?!")              # 1 or 2 " followed by something that isn't a quote
  )*
  (?:"")?
  #{BLOCK_QUOTE}
}xm
FIRST_BYTES =

Use this array to check, for a given byte that starts a token, what kind of token it might start.

Array.new(255)
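
For example, here is how a couple of these tables behave (a sketch; it assumes Punctuation::LCURLY is defined as "{", which the N_DIGIT_UNICODE constant above also relies on):

require "graphql"

# NUMERIC_REGEXP captures the decimal/exponent part in group 1;
# #advance uses that capture to distinguish :FLOAT from :INT.
"1.5e3".match(GraphQL::Language::Lexer::NUMERIC_REGEXP)[1] # => ".5e3"

# PUNCTUATION_NAME_FOR_BYTE is a sparse array indexed by byte value:
GraphQL::Language::Lexer::PUNCTUATION_NAME_FOR_BYTE["{".ord] # => :LCURLY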


Constructor Details

#initialize(graphql_str, filename: nil, max_tokens: nil) ⇒ Lexer

Returns a new instance of Lexer.

# File 'lib/graphql/language/lexer.rb', line 6

def initialize(graphql_str, filename: nil, max_tokens: nil)
  if !(graphql_str.encoding == Encoding::UTF_8 || graphql_str.ascii_only?)
    graphql_str = graphql_str.dup.force_encoding(Encoding::UTF_8)
  end
  @string = graphql_str
  @filename = filename
  @scanner = StringScanner.new(graphql_str)
  @pos = nil
  @max_tokens = max_tokens || Float::INFINITY
  @tokens_count = 0
  @finished = false
end
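
A usage sketch: the lexer wraps a GraphQL document string, and max_tokens (when given) caps how many tokens #advance will yield before raising a parse error.

lexer = GraphQL::Language::Lexer.new("{ user { name } }", filename: "query.graphql", max_tokens: 1000)
lexer.finished? # => false (nothing has been scanned yet)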

Instance Attribute Details

#pos ⇒ Object (readonly)

Returns the value of attribute pos.


# File 'lib/graphql/language/lexer.rb', line 23

def pos
  @pos
end

#tokens_count ⇒ Object (readonly)

Returns the value of attribute tokens_count.


# File 'lib/graphql/language/lexer.rb', line 23

def tokens_count
  @tokens_count
end

Class Method Details

.replace_escaped_characters_in_place(raw_string) ⇒ Object

Replace any escaped unicode or whitespace with the actual characters. To avoid allocating more strings, this modifies the string passed into it.

# File 'lib/graphql/language/lexer.rb', line 328

def self.replace_escaped_characters_in_place(raw_string)
  raw_string.gsub!(ESCAPED) do |matched_str|
    if (point_str_1 = $1 || $2)
      codepoint_1 = point_str_1.to_i(16)
      if (codepoint_2 = $3)
        codepoint_2 = codepoint_2.to_i(16)
        if (codepoint_1 >= 0xD800 && codepoint_1 <= 0xDBFF) && # leading surrogate
            (codepoint_2 >= 0xDC00 && codepoint_2 <= 0xDFFF) # trailing surrogate
          # A surrogate pair
          combined = ((codepoint_1 - 0xD800) * 0x400) + (codepoint_2 - 0xDC00) + 0x10000
          [combined].pack('U'.freeze)
        else
          # Two separate code points
          [codepoint_1].pack('U'.freeze) + [codepoint_2].pack('U'.freeze)
        end
      else
        [codepoint_1].pack('U'.freeze)
      end
    else
      ESCAPES_REPLACE[matched_str]
    end
  end
  nil
end
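
A sketch of the in-place behavior; the Ruby literal is single-quoted so the backslash escapes reach the method unprocessed:

str = 'pair: \ud83d\ude00, escape: \n'.dup
GraphQL::Language::Lexer.replace_escaped_characters_in_place(str) # => nil
str # => "pair: 😀, escape: \n" (a real emoji and a real newline now)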

.tokenize(string) ⇒ Object

This is not used during parsing because the parser doesn't actually need tokens.

# File 'lib/graphql/language/lexer.rb', line 355

def self.tokenize(string)
  lexer = GraphQL::Language::Lexer.new(string)
  tokens = []
  while (token_name = lexer.advance)
    new_token = [
      token_name,
      lexer.line_number,
      lexer.column_number,
      lexer.debug_token_value(token_name),
    ]
    tokens << new_token
  end
  tokens
end
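
So for a small document the result is an array of [token_name, line, column, value] tuples, roughly like this (a sketch, assuming Punctuation::LCURLY and RCURLY name "{" and "}"):

GraphQL::Language::Lexer.tokenize("{ a }")
# => [[:LCURLY, 1, 1, "{"], [:IDENTIFIER, 1, 3, "a"], [:RCURLY, 1, 5, "}"]]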

Instance Method Details

#_hash(key) ⇒ Object

This produces a unique integer for bytes 2 and 3 of each keyword string. See https://tenderlovemaking.com/2023/09/02/fast-tokenizers-with-stringscanner.html

# File 'lib/graphql/language/lexer.rb', line 249

def _hash key
  (key * 18592990) >> 27 & 0x1f
end
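
A sketch that checks the perfect-hash property against KEYWORD_BY_TWO_BYTES above ("on" and "subscription" are excluded because #advance dispatches those two by length before consulting the hash):

lexer = GraphQL::Language::Lexer.new("")
GraphQL::Language::Lexer::KEYWORDS.each do |kw|
  next if kw == "on" || kw == "subscription"
  # Pack bytes 2 and 3 the same way #advance does:
  two_bytes = (kw.getbyte(2) << 8) | kw.getbyte(1)
  slot = lexer._hash(two_bytes)
  # e.g. "query" lands in slot 14, which holds :QUERY
  raise "collision!" unless GraphQL::Language::Lexer::KEYWORD_BY_TWO_BYTES[slot] == kw.upcase.to_sym
end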

#advance ⇒ Object

# File 'lib/graphql/language/lexer.rb', line 25

def advance
  @scanner.skip(IGNORE_REGEXP)
  if @scanner.eos?
    @finished = true
    return false
  end
  @tokens_count += 1
  if @tokens_count > @max_tokens
    raise_parse_error("This query is too large to execute.")
  end
  @pos = @scanner.pos
  next_byte = @string.getbyte(@pos)
  next_byte_is_for = FIRST_BYTES[next_byte]
  case next_byte_is_for
  when ByteFor::PUNCTUATION
    @scanner.pos += 1
    PUNCTUATION_NAME_FOR_BYTE[next_byte]
  when ByteFor::NAME
    if len = @scanner.skip(KEYWORD_REGEXP)
      case len
      when 2
        :ON
      when 12
        :SUBSCRIPTION
      else
        pos = @pos

        # Use bytes 2 and 3 as a unique identifier for this keyword
        bytes = (@string.getbyte(pos + 2) << 8) | @string.getbyte(pos + 1)
        KEYWORD_BY_TWO_BYTES[_hash(bytes)]
      end
    else
      @scanner.skip(IDENTIFIER_REGEXP)
      :IDENTIFIER
    end
  when ByteFor::IDENTIFIER
    @scanner.skip(IDENTIFIER_REGEXP)
    :IDENTIFIER
  when ByteFor::NUMBER
    if len = @scanner.skip(NUMERIC_REGEXP)

      if GraphQL.reject_numbers_followed_by_names
        new_pos = @scanner.pos
        peek_byte = @string.getbyte(new_pos)
        next_first_byte = FIRST_BYTES[peek_byte]
        if next_first_byte == ByteFor::NAME || next_first_byte == ByteFor::IDENTIFIER
          number_part = token_value
          name_part = @scanner.scan(IDENTIFIER_REGEXP)
          raise_parse_error("Name after number is not allowed (in `#{number_part}#{name_part}`)")
        end
      end
      # Check for a matched decimal:
      @scanner[1] ? :FLOAT : :INT
    else
      # Attempt to find the part after the `-`
      value = @scanner.scan(/-\s?[a-z0-9]*/i)
      invalid_byte_for_number_error_message = "Expected type 'number', but it was malformed#{value.nil? ? "" : ": #{value.inspect}"}."
      raise_parse_error(invalid_byte_for_number_error_message)
    end
  when ByteFor::ELLIPSIS
    if @string.getbyte(@pos + 1) != 46 || @string.getbyte(@pos + 2) != 46
      raise_parse_error("Expected `...`, actual: #{@string[@pos..@pos + 2].inspect}")
    end
    @scanner.pos += 3
    :ELLIPSIS
  when ByteFor::STRING
    if @scanner.skip(BLOCK_STRING_REGEXP) || @scanner.skip(QUOTED_STRING_REGEXP)
      :STRING
    else
      raise_parse_error("Expected string or block string, but it was malformed")
    end
  else
    @scanner.pos += 1
    :UNKNOWN_CHAR
  end
rescue ArgumentError => err
  if err.message == "invalid byte sequence in UTF-8"
    raise_parse_error("Parse error on bad Unicode escape sequence", nil, nil)
  end
end
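
Driving #advance by hand looks like the loop in .tokenize above; each call returns the next token's name as a symbol, or false at end of input. A minimal sketch:

lexer = GraphQL::Language::Lexer.new("query { name }")
while (token_name = lexer.advance)
  puts [token_name, lexer.line_number, lexer.column_number, lexer.debug_token_value(token_name)].inspect
end
# [:QUERY, 1, 1, "query"]
# [:LCURLY, 1, 7, "{"]
# ... and so on
lexer.finished? # => true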

#column_number ⇒ Object

# File 'lib/graphql/language/lexer.rb', line 168

def column_number
  @scanner.string[0..@pos].split("\n").last.length
end

#debug_token_value(token_name) ⇒ Object

# File 'lib/graphql/language/lexer.rb', line 112

def debug_token_value(token_name)
  if token_name && Lexer::Punctuation.const_defined?(token_name)
    Lexer::Punctuation.const_get(token_name)
  elsif token_name == :ELLIPSIS
    "..."
  elsif token_name == :STRING
    string_value
  elsif @scanner.matched_size.nil?
    @scanner.peek(1)
  else
    token_value
  end
end
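
The fallbacks matter because punctuation and ellipsis tokens are consumed by moving @scanner.pos directly in #advance, so there is no regexp match to slice a value from. A sketch:

lexer = GraphQL::Language::Lexer.new("{")
name = lexer.advance          # => :LCURLY (assuming that Punctuation constant)
lexer.debug_token_value(name) # => "{", read from the Punctuation constant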

#finished? ⇒ Boolean

Returns: (Boolean)
# File 'lib/graphql/language/lexer.rb', line 19

def finished?
  @finished
end

#line_number ⇒ Object

# File 'lib/graphql/language/lexer.rb', line 164

def line_number
  @scanner.string[0..@pos].count("\n") + 1
end

#raise_parse_error(message, line = line_number, col = column_number) ⇒ Object

# File 'lib/graphql/language/lexer.rb', line 172

def raise_parse_error(message, line = line_number, col = column_number)
  raise GraphQL::ParseError.new(message, line, col, @string, filename: @filename)
end

#string_value ⇒ Object

# File 'lib/graphql/language/lexer.rb', line 141

def string_value
  str = token_value
  is_block = str.start_with?('"""')
  if is_block
    str.gsub!(/\A"""|"""\z/, '')
    return Language::BlockString.trim_whitespace(str)
  else
    str.gsub!(/\A"|"\z/, '')

    if !str.valid_encoding? || !str.match?(VALID_STRING)
      raise_parse_error("Bad unicode escape in #{str.inspect}")
    else
      Lexer.replace_escaped_characters_in_place(str)

      if !str.valid_encoding?
        raise_parse_error("Bad unicode escape in #{str.inspect}")
      else
        str
      end
    end
  end
end
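
For example (a sketch; the Ruby source is single-quoted, so \n below is the two-character GraphQL escape, not a newline):

lexer = GraphQL::Language::Lexer.new('"a\nb"')
lexer.advance      # => :STRING
lexer.string_value # => "a\nb" (quotes stripped, escape replaced by a real newline)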

#token_value ⇒ Object

# File 'lib/graphql/language/lexer.rb', line 106

def token_value
  @string.byteslice(@scanner.pos - @scanner.matched_size, @scanner.matched_size)
rescue StandardError => err
  raise GraphQL::Error, "(token_value failed: #{err.class}: #{err.message})"
end