Class: GraphQL::Language::Lexer
- Inherits:
-
Object
- Object
- GraphQL::Language::Lexer
- Defined in:
- lib/graphql/language/lexer.rb
Defined Under Namespace
Modules: ByteFor, Punctuation
Constant Summary collapse
- ESCAPES =
/\\["\\\/bfnrt]/
- ESCAPES_REPLACE =
{ '\\"' => '"', "\\\\" => "\\", "\\/" => '/', "\\b" => "\b", "\\f" => "\f", "\\n" => "\n", "\\r" => "\r", "\\t" => "\t", }
- UTF_8 =
/\\u(?:([\dAa-f]{4})|\{([\da-f]{4,})\})(?:\\u([\dAa-f]{4}))?/i
- VALID_STRING =
/\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o
- ESCAPED =
/(?:#{ESCAPES}|#{UTF_8})/o
- IGNORE_REGEXP =
%r{ (?: [, \c\r\n\t]+ | \#.*$ )* }x
- IDENTIFIER_REGEXP =
/[_A-Za-z][_0-9A-Za-z]*/
- INT_REGEXP =
/-?(?:[0]|[1-9][0-9]*)/
- FLOAT_DECIMAL_REGEXP =
/[.][0-9]+/
- FLOAT_EXP_REGEXP =
/[eE][+-]?[0-9]+/
- NUMERIC_REGEXP =
TODO: FLOAT_EXP_REGEXP should not be allowed to follow INT_REGEXP; integers are not allowed to have exponent parts.
/#{INT_REGEXP}(#{FLOAT_DECIMAL_REGEXP}#{FLOAT_EXP_REGEXP}|#{FLOAT_DECIMAL_REGEXP}|#{FLOAT_EXP_REGEXP})?/
- KEYWORDS =
[ "on", "fragment", "true", "false", "null", "query", "mutation", "subscription", "schema", "scalar", "type", "extend", "implements", "interface", "union", "enum", "input", "directive", "repeatable" ].freeze
- KEYWORD_REGEXP =
/#{Regexp.union(KEYWORDS.sort)}\b/
- KEYWORD_BY_TWO_BYTES =
[ :INTERFACE, :MUTATION, :EXTEND, :FALSE, :ENUM, :TRUE, :NULL, nil, nil, nil, nil, nil, nil, nil, :QUERY, nil, nil, :REPEATABLE, :IMPLEMENTS, :INPUT, :TYPE, :SCHEMA, nil, nil, nil, :DIRECTIVE, :UNION, nil, nil, :SCALAR, nil, :FRAGMENT ]
- PUNCTUATION_NAME_FOR_BYTE =
A sparse array mapping the bytes for each punctuation to a symbol name for that punctuation
Punctuation.constants.each_with_object([]) { |name, arr| punct = Punctuation.const_get(name) arr[punct.ord] = name }
- QUOTE =
'"'
- UNICODE_DIGIT =
/[0-9A-Za-z]/
- FOUR_DIGIT_UNICODE =
/#{UNICODE_DIGIT}{4}/
- N_DIGIT_UNICODE =
%r{#{Punctuation::LCURLY}#{UNICODE_DIGIT}{4,}#{Punctuation::RCURLY}}x
- UNICODE_ESCAPE =
%r{\\u(?:#{FOUR_DIGIT_UNICODE}|#{N_DIGIT_UNICODE})}
- STRING_ESCAPE =
%r{[\\][\\/bfnrt]}
- BLOCK_QUOTE =
'"""'
- ESCAPED_QUOTE =
/\\"/
- STRING_CHAR =
/#{ESCAPED_QUOTE}|[^"\\\n\r]|#{UNICODE_ESCAPE}|#{STRING_ESCAPE}/
- QUOTED_STRING_REGEXP =
%r{#{QUOTE} (?:#{STRING_CHAR})* #{QUOTE}}x
- BLOCK_STRING_REGEXP =
%r{ #{BLOCK_QUOTE} (?: [^"\\] | # Any characters that aren't a quote or slash (?<!") ["]{1,2} (?!") | # Any quotes that don't have quotes next to them \\"{0,3}(?!") | # A slash followed by <= 3 quotes that aren't followed by a quote \\ | # A slash "{1,2}(?!") # 1 or 2 " followed by something that isn't a quote )* (?:"")? #{BLOCK_QUOTE} }xm
- FIRST_BYTES =
Use this array to check, for a given byte that will start a token, what kind of token it might start.
Array.new(255)
Instance Attribute Summary collapse
-
#pos ⇒ Object
readonly
Returns the value of attribute pos.
-
#tokens_count ⇒ Object
readonly
Returns the value of attribute tokens_count.
Class Method Summary collapse
-
.replace_escaped_characters_in_place(raw_string) ⇒ Object
Replace any escaped unicode or whitespace with the actual characters. To avoid allocating more strings, this modifies the string passed into it.
-
.tokenize(string) ⇒ Object
This is not used during parsing because the parser doesn't actually need tokens.
Instance Method Summary collapse
-
#_hash(key) ⇒ Object
This produces a unique integer for bytes 2 and 3 of each keyword string. See https://tenderlovemaking.com/2023/09/02/fast-tokenizers-with-stringscanner.html.
- #advance ⇒ Object
- #column_number ⇒ Object
- #debug_token_value(token_name) ⇒ Object
- #eos? ⇒ Boolean
-
#initialize(graphql_str, filename: nil, max_tokens: nil) ⇒ Lexer
constructor
A new instance of Lexer.
- #line_number ⇒ Object
- #raise_parse_error(message, line = line_number, col = column_number) ⇒ Object
- #string_value ⇒ Object
- #token_value ⇒ Object
Constructor Details
#initialize(graphql_str, filename: nil, max_tokens: nil) ⇒ Lexer
Returns a new instance of Lexer.
6 7 8 9 10 11 12 13 14 15 16 |
# File 'lib/graphql/language/lexer.rb', line 6

# Build a lexer over `graphql_str`.
#
# @param graphql_str [String] the GraphQL source text to tokenize
# @param filename [String, nil] stored and handed to parse errors raised by this lexer
# @param max_tokens [Integer, nil] upper bound on the number of tokens; `nil` means no limit
def initialize(graphql_str, filename: nil, max_tokens: nil)
  # Input that is neither tagged UTF-8 nor pure ASCII is re-tagged as UTF-8.
  # This is done on a copy so the caller's string is not modified.
  unless graphql_str.encoding == Encoding::UTF_8 || graphql_str.ascii_only?
    graphql_str = graphql_str.dup.force_encoding(Encoding::UTF_8)
  end
  @string = graphql_str
  @filename = filename
  @scanner = StringScanner.new(graphql_str)
  # No token has been read yet.
  @pos = nil
  @max_tokens = max_tokens || Float::INFINITY
  @tokens_count = 0
end
Instance Attribute Details
#pos ⇒ Object (readonly)
Returns the value of attribute pos.
22 23 24 |
# File 'lib/graphql/language/lexer.rb', line 22

# Returns the value of attribute pos.
#
# @return [Integer, nil] `nil` until the first token is read; afterwards the
#   scanner position recorded by `advance` at the start of the latest token
def pos
  @pos
end
#tokens_count ⇒ Object (readonly)
Returns the value of attribute tokens_count.
22 23 24 |
# File 'lib/graphql/language/lexer.rb', line 22

# Returns the value of attribute tokens_count.
#
# @return [Integer] starts at 0 and is incremented by `advance` for every token it reads
def tokens_count
  @tokens_count
end
Class Method Details
.replace_escaped_characters_in_place(raw_string) ⇒ Object
Replace any escaped unicode or whitespace with the actual characters. To avoid allocating more strings, this modifies the string passed into it.
324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 |
# File 'lib/graphql/language/lexer.rb', line 324

# Replace any escaped unicode or whitespace with the actual characters.
# To avoid allocating more strings, this modifies the string passed into it.
#
# @param raw_string [String] string contents (without surrounding quotes); mutated in place
# @return [nil] always; the result is the mutated `raw_string`
def self.replace_escaped_characters_in_place(raw_string)
  raw_string.gsub!(ESCAPED) do |matched_str|
    # Groups 1 and 2 are the `\uXXXX` and `\u{X...}` forms; group 3 is an
    # optional `\uXXXX` that immediately follows.
    first_hex = $1 || $2
    second_hex = $3
    if first_hex.nil?
      # Not a unicode escape, so it must be one of the simple escapes.
      ESCAPES_REPLACE[matched_str]
    else
      codepoint_1 = first_hex.to_i(16)
      if second_hex.nil?
        [codepoint_1].pack('U')
      else
        codepoint_2 = second_hex.to_i(16)
        leading_surrogate = codepoint_1 >= 0xD800 && codepoint_1 <= 0xDBFF
        trailing_surrogate = codepoint_2 >= 0xDC00 && codepoint_2 <= 0xDFFF
        if leading_surrogate && trailing_surrogate
          # A surrogate pair: combine both halves into one code point.
          combined = ((codepoint_1 - 0xD800) * 0x400) + (codepoint_2 - 0xDC00) + 0x10000
          [combined].pack('U')
        else
          # Two separate code points
          [codepoint_1].pack('U') + [codepoint_2].pack('U')
        end
      end
    end
  end
  nil
end
.tokenize(string) ⇒ Object
This is not used during parsing because the parser doesn't actually need tokens.
351 352 353 354 355 356 357 358 359 360 361 362 363 364 |
# File 'lib/graphql/language/lexer.rb', line 351

# Lex all of `string` and collect every token.
# This is not used during parsing because the parser doesn't actually need tokens.
#
# @param string [String] GraphQL source text
# @return [Array<Array>] one `[token_name, line_number, column_number, debug_value]`
#   entry per token, in source order
def self.tokenize(string)
  lexer = GraphQL::Language::Lexer.new(string)
  tokens = []
  # `advance` returns false once the input is exhausted.
  while (token_name = lexer.advance)
    tokens << [
      token_name,
      lexer.line_number,
      lexer.column_number,
      lexer.debug_token_value(token_name),
    ]
  end
  tokens
end
Instance Method Details
#_hash(key) ⇒ Object
This produces a unique integer for bytes 2 and 3 of each keyword string. See https://tenderlovemaking.com/2023/09/02/fast-tokenizers-with-stringscanner.html
245 246 247 |
# File 'lib/graphql/language/lexer.rb', line 245

# This produces a unique integer for bytes 2 and 3 of each keyword string.
# See https://tenderlovemaking.com/2023/09/02/fast-tokenizers-with-stringscanner.html
#
# @param key [Integer] the two keyword bytes packed into one integer by `advance`
# @return [Integer] a value in 0..31 (the result is masked with 0x1f), used by
#   `advance` as an index into KEYWORD_BY_TWO_BYTES
def _hash(key)
  # `>>` binds tighter than `&`: multiply, shift, then keep the low five bits.
  (key * 18592990) >> 27 & 0x1f
end
#advance ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
# File 'lib/graphql/language/lexer.rb', line 24

# Skip ignored characters and consume the next token.
#
# Updates `@pos` to the scanner position where the token starts and
# increments `@tokens_count`.
#
# @return [Symbol, false] the name of the token that was consumed, or `false`
#   when the input is exhausted
# @raise [GraphQL::ParseError] (through `raise_parse_error`) when the token
#   limit is exceeded or a number, `...`, or string is malformed
def advance
  @scanner.skip(IGNORE_REGEXP)
  return false if @scanner.eos?

  @tokens_count += 1
  if @tokens_count > @max_tokens
    raise_parse_error("This query is too large to execute.")
  end

  @pos = @scanner.pos
  next_byte = @string.getbyte(@pos)
  # The first byte of a token decides which kind of token it can be.
  next_byte_is_for = FIRST_BYTES[next_byte]
  case next_byte_is_for
  when ByteFor::PUNCTUATION
    @scanner.pos += 1
    PUNCTUATION_NAME_FOR_BYTE[next_byte]
  when ByteFor::NAME
    if (len = @scanner.skip(KEYWORD_REGEXP))
      case len
      when 2
        :ON
      when 12
        :SUBSCRIPTION
      else
        pos = @pos
        # Use bytes 2 and 3 as a unique identifier for this keyword
        bytes = (@string.getbyte(pos + 2) << 8) | @string.getbyte(pos + 1)
        KEYWORD_BY_TWO_BYTES[_hash(bytes)]
      end
    else
      @scanner.skip(IDENTIFIER_REGEXP)
      :IDENTIFIER
    end
  when ByteFor::IDENTIFIER
    @scanner.skip(IDENTIFIER_REGEXP)
    :IDENTIFIER
  when ByteFor::NUMBER
    if @scanner.skip(NUMERIC_REGEXP)
      if GraphQL.reject_numbers_followed_by_names
        new_pos = @scanner.pos
        peek_byte = @string.getbyte(new_pos)
        # `getbyte` returns nil when the number is the last thing in the input;
        # indexing FIRST_BYTES with nil would raise a TypeError.
        next_first_byte = peek_byte && FIRST_BYTES[peek_byte]
        if next_first_byte == ByteFor::NAME || next_first_byte == ByteFor::IDENTIFIER
          number_part = token_value
          name_part = @scanner.scan(IDENTIFIER_REGEXP)
          raise_parse_error("Name after number is not allowed (in `#{number_part}#{name_part}`)")
        end
      end
      # Check for a matched decimal:
      @scanner[1] ? :FLOAT : :INT
    else
      # Attempt to find the part after the `-`
      value = @scanner.scan(/-\s?[a-z0-9]*/i)
      message = "Expected type 'number', but it was malformed#{value.nil? ? "" : ": #{value.inspect}"}."
      raise_parse_error(message)
    end
  when ByteFor::ELLIPSIS
    # 46 is the byte for "."
    if @string.getbyte(@pos + 1) != 46 || @string.getbyte(@pos + 2) != 46
      # @pos is a byte offset, so slice by bytes first and then take up to
      # three characters for the error message.
      actual = @string.byteslice(@pos, @string.bytesize - @pos)[0, 3]
      raise_parse_error("Expected `...`, actual: #{actual.inspect}")
    end
    @scanner.pos += 3
    :ELLIPSIS
  when ByteFor::STRING
    if @scanner.skip(BLOCK_STRING_REGEXP) || @scanner.skip(QUOTED_STRING_REGEXP)
      :STRING
    else
      raise_parse_error("Expected string or block string, but it was malformed")
    end
  else
    @scanner.pos += 1
    :UNKNOWN_CHAR
  end
rescue ArgumentError => err
  if err.message == "invalid byte sequence in UTF-8"
    # Line and column are passed as nil so they are not computed from the
    # invalid string.
    raise_parse_error("Parse error on bad Unicode escape sequence", nil, nil)
  else
    # Any other ArgumentError is unexpected: surface it instead of returning
    # nil, which callers would read as the end of input.
    raise
  end
end
#column_number ⇒ Object
164 165 166 |
# File 'lib/graphql/language/lexer.rb', line 164

# The 1-based column, counted in characters, of the token that starts at `@pos`.
#
# `@pos` comes from the scanner and is a byte offset, so the text before the
# token is taken with `byteslice`. Indexing the string by character with a byte
# offset gives the wrong column once multibyte characters precede the token.
#
# @return [Integer] the column of the current token; when no token has been
#   read yet (`@pos` is nil) the length of the last line of the input, or 0
#   when there is no such line
def column_number
  source = @scanner.string
  # Before the first token there is no position to measure from.
  return source.split("\n").last.to_s.length if @pos.nil?

  before_token = source.byteslice(0, @pos)
  last_newline = before_token.rindex("\n")
  if last_newline
    before_token.length - last_newline
  else
    before_token.length + 1
  end
end
#debug_token_value(token_name) ⇒ Object
108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'lib/graphql/language/lexer.rb', line 108

# A printable value for the token that was just read.
#
# @param token_name [Symbol, nil] a token name as returned by `advance`
# @return [String] the punctuation constant's value for punctuation tokens,
#   "..." for :ELLIPSIS, the result of `string_value` for :STRING, the next
#   character when the scanner has no current match, otherwise `token_value`
def debug_token_value(token_name)
  if token_name && Lexer::Punctuation.const_defined?(token_name)
    return Lexer::Punctuation.const_get(token_name)
  end
  return "..." if token_name == :ELLIPSIS
  return string_value if token_name == :STRING

  # Some tokens are consumed by moving the scanner position directly, which
  # leaves no regexp match to read a value from.
  if @scanner.matched_size.nil?
    @scanner.peek(1)
  else
    token_value
  end
end
#eos? ⇒ Boolean
18 19 20 |
# File 'lib/graphql/language/lexer.rb', line 18

# Whether the underlying scanner has reached the end of the input.
#
# @return [Boolean]
def eos?
  @scanner.eos?
end
#line_number ⇒ Object
160 161 162 |
# File 'lib/graphql/language/lexer.rb', line 160

# The 1-based line of the token that starts at `@pos`.
#
# `@pos` comes from the scanner and is a byte offset, so the text before the
# token is taken with `byteslice`. Indexing the string by character with a byte
# offset can count newlines that come after the token once multibyte
# characters precede it.
#
# @return [Integer] the line of the current token; when no token has been read
#   yet (`@pos` is nil) the whole input is counted
def line_number
  source = @scanner.string
  before_token = @pos ? source.byteslice(0, @pos) : source
  before_token.count("\n") + 1
end
#raise_parse_error(message, line = line_number, col = column_number) ⇒ Object
168 169 170 |
# File 'lib/graphql/language/lexer.rb', line 168

# Raise a parse error for the current input.
#
# @param message [String] the error message
# @param line [Integer, nil] defaults to the current `line_number`
# @param col [Integer, nil] defaults to the current `column_number`
# @raise [GraphQL::ParseError] always; built with the source string and filename
def raise_parse_error(message, line = line_number, col = column_number)
  raise GraphQL::ParseError.new(message, line, col, @string, filename: @filename)
end
#string_value ⇒ Object
137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
# File 'lib/graphql/language/lexer.rb', line 137

# The contents of the string token that was just read, with the quotes removed.
#
# Block strings (`"""..."""`) are passed through
# `Language::BlockString.trim_whitespace`. Quoted strings are validated and
# have their escape sequences replaced with the characters they stand for.
#
# @return [String]
# @raise [GraphQL::ParseError] (through `raise_parse_error`) when a quoted
#   string has an invalid encoding or a bad escape, before or after replacement
def string_value
  str = token_value
  if str.start_with?('"""')
    str.gsub!(/\A"""|"""\z/, '')
    return Language::BlockString.trim_whitespace(str)
  end

  str.gsub!(/\A"|"\z/, '')
  if !str.valid_encoding? || !str.match?(VALID_STRING)
    raise_parse_error("Bad unicode escape in #{str.inspect}")
  else
    Lexer.replace_escaped_characters_in_place(str)
    # Replacing escapes can itself produce bytes that are not valid UTF-8
    # (for example an unpaired surrogate), so validity is checked again.
    if !str.valid_encoding?
      raise_parse_error("Bad unicode escape in #{str.inspect}")
    else
      str
    end
  end
end