Class: TinyGQL::Lexer

Inherits:
Object
  • Object
show all
Includes:
Literals
Defined in:
lib/tinygql/lexer.rb

Defined Under Namespace

Modules: LeadBytes, Literals

Constant Summary collapse

# Matches a name: a letter or underscore followed by any number of letters,
# digits, or underscores, ending at a word boundary.
IDENTIFIER =
/[_A-Za-z][_0-9A-Za-z]*\b/
# Matches any amount (possibly none) of ignorable input: runs of commas and
# whitespace, and `#` comments running to the end of the line. #advance skips
# this before reading each token.
IGNORE =
%r{
  (?:
    [, \c\r\n\t]+ |
    \#.*$
  )*
}x
# Integer part of a number: optional minus sign, then either a single 0 or a
# digit run that does not start with 0.
INT =
/[-]?(?:[0]|[1-9][0-9]*)/
# Fractional part of a number: a dot followed by one or more digits.
FLOAT_DECIMAL =
/[.][0-9]+/
# Exponent part of a number: e/E, optional sign, one or more digits.
FLOAT_EXP =
/[eE][+-]?[0-9]+/
# A full numeric literal. Capture group 1 is set only when a fractional and/or
# exponent part is present; #advance uses that to tell :FLOAT from :INT.
NUMERIC =
/#{INT}(#{FLOAT_DECIMAL}#{FLOAT_EXP}|#{FLOAT_DECIMAL}|#{FLOAT_EXP})?/
# The words the lexer reports as dedicated keyword tokens instead of
# :IDENTIFIER. "on" is the only entry of length 2 and "subscription" the only
# entry of length 12, which #advance relies on to short-circuit those two.
KEYWORDS =
[
  "on",
  "fragment",
  "true",
  "false",
  "null",
  "query",
  "mutation",
  "subscription",
  "schema",
  "scalar",
  "type",
  "extend",
  "implements",
  "interface",
  "union",
  "enum",
  "input",
  "directive",
  "repeatable"
].freeze
# Matches any one of KEYWORDS followed by a word boundary, so a longer name
# that merely starts with a keyword does not match as that keyword.
KW_RE =
/#{Regexp.union(KEYWORDS.sort)}\b/
# The spread punctuator.
ELLIPSIS =
'...'
# Delimiter of a single-line string.
QUOTE =
'"'
# One character allowed inside a \u escape while scanning a string token.
# NOTE(review): this accepts every ASCII letter, not only the hex digits A-F;
# the stricter hex check is applied later via VALID_STRING in #emit_string.
UNICODE_DIGIT =
/[0-9A-Za-z]/
# Exactly four UNICODE_DIGIT characters (the XXXX of \uXXXX).
FOUR_DIGIT_UNICODE =
/#{UNICODE_DIGIT}{4}/
# Four or more UNICODE_DIGIT characters wrapped in LCURLY/RCURLY, which come
# from the included Literals module (their values are not shown here).
N_DIGIT_UNICODE =
%r{#{LCURLY}#{UNICODE_DIGIT}{4,}#{RCURLY}}x
# A \u escape in either the four-digit or the braced form.
UNICODE_ESCAPE =
%r{\\u(?:#{FOUR_DIGIT_UNICODE}|#{N_DIGIT_UNICODE})}
# A backslash followed by one of \ / b f n r t.
STRING_ESCAPE =
%r{[\\][\\/bfnrt]}
# Delimiter of a block string.
BLOCK_QUOTE =
'"""'
# A backslash-escaped double quote.
ESCAPED_QUOTE =
/\\"/
# One unit of single-line string content: an escaped quote, any character
# other than a quote or backslash, a \u escape, or a simple escape.
STRING_CHAR =
/#{ESCAPED_QUOTE}|[^"\\]|#{UNICODE_ESCAPE}|#{STRING_ESCAPE}/
# Sparse array mapping the ordinal of each Literals constant's value to that
# constant's name (a Symbol). #advance indexes it with the token's lead byte
# to get the punctuation token type.
PUNCT_LUT =
Literals.constants.each_with_object([]) { |n, o|
  o[Literals.const_get(n).ord] = n
}
# Table indexed by a token's first byte; #advance compares the entry against
# the LeadBytes constants to pick a scanning branch. Every slot starts as 0;
# the code that assigns the non-zero entries is not shown here.
# NOTE(review): 255 slots cover bytes 0..254, so byte 255 reads nil rather
# than 0 — confirm that is intended.
LEAD_BYTES =
Array.new(255) { 0 }
# A single-line string token: QUOTE, any number of STRING_CHAR units, QUOTE.
# Capture group 1 is the content between the quotes.
QUOTED_STRING =
%r{#{QUOTE} ((?:#{STRING_CHAR})*) #{QUOTE}}x
# A block string token delimited by BLOCK_QUOTE on both sides; multi-line mode
# is enabled. Capture group 1 is the content, which may itself end in two
# quotes directly before the closing delimiter.
BLOCK_STRING =
%r{
    #{BLOCK_QUOTE}
((?: [^"\\]               |  # Any characters that aren't a quote or slash
(?<!") ["]{1,2} (?!") |  # Any quotes that don't have quotes next to them
\\"{0,3}(?!")         |  # A slash followed by <= 3 quotes that aren't followed by a quote
\\                    |  # A slash
"{1,2}(?!")              # 1 or 2 " followed by something that isn't a quote
)*
(?:"")?)
    #{BLOCK_QUOTE}
}xm
# A simple (non-unicode) escape sequence: a backslash followed by one of
# " \ / b f n r t.
ESCAPES =
/\\["\\\/bfnrt]/
# Maps each escape sequence matched by ESCAPES to the character it denotes.
ESCAPES_REPLACE =
{
  '\\"' => '"',
  "\\\\" => "\\",
  "\\/" => '/',
  "\\b" => "\b",
  "\\f" => "\f",
  "\\n" => "\n",
  "\\r" => "\r",
  "\\t" => "\t",
}
# A unicode escape (case-insensitive). Group 1 holds the digits of a
# four-digit \uXXXX form, group 2 the digits of a braced \u{...} form, and
# group 3 the digits of an optional directly-following \uXXXX, which
# #replace_escaped_characters_in_place treats as a possible trailing surrogate.
UTF_8 =
/\\u(?:([\dAa-f]{4})|\{([\da-f]{4,})\})(?:\\u([\dAa-f]{4}))?/i
# Matches a whole string in which every backslash begins either an ESCAPES or
# a UTF_8 sequence; #emit_string rejects content that fails this check.
VALID_STRING =
/\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o
# 32-slot lookup table from #_hash output to keyword token type. #advance
# builds the hash key from the second and third bytes of the matched keyword;
# :ON and :SUBSCRIPTION are handled by length before the lookup and therefore
# have no slot. The position of every entry is significant — do not reorder.
KW_LUT =
[:INTERFACE,
:MUTATION,
:EXTEND,
:FALSE,
:ENUM,
:TRUE,
:NULL,
nil,
nil,
nil,
nil,
nil,
nil,
nil,
:QUERY,
nil,
nil,
:REPEATABLE,
:IMPLEMENTS,
:INPUT,
:TYPE,
:SCHEMA,
nil,
nil,
nil,
:DIRECTIVE,
:UNION,
nil,
nil,
:SCALAR,
nil,
:FRAGMENT]

Constants included from Literals

Literals::AMP, Literals::BANG, Literals::COLON, Literals::DIR_SIGN, Literals::EQUALS, Literals::LBRACKET, Literals::LCURLY, Literals::LPAREN, Literals::PIPE, Literals::RBRACKET, Literals::RCURLY, Literals::RPAREN, Literals::VAR_SIGN

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(string) ⇒ Lexer

Returns a new instance of Lexer.



118
119
120
121
122
123
124
# File 'lib/tinygql/lexer.rb', line 118

# Builds a lexer over +string+.
#
# @param string [String] the source text to tokenize; must be validly encoded
# @raise [RuntimeError] if +string+ contains bytes that are invalid for its encoding
def initialize string
  # Raise with an explicit message: a bare `raise` re-raises whatever is in
  # `$!`, so constructing a lexer from inside a `rescue` clause would surface
  # the unrelated exception being handled instead of this error.
  raise "TinyGQL::Lexer: source string has an invalid encoding" unless string.valid_encoding?

  @string = string
  @scan = StringScanner.new string
  @start = nil
end

Instance Attribute Details

#start ⇒ Object (readonly)

Returns the value of attribute start.



126
127
128
# File 'lib/tinygql/lexer.rb', line 126

# Reader for @start, which #advance sets to the scanner's byte position at the
# beginning of the most recently scanned token (nil before the first call).
def start
  @start
end

Instance Method Details

#_hash(key) ⇒ Object



368
369
370
# File 'lib/tinygql/lexer.rb', line 368

# Maps a keyword key (as built in #advance) to an index in 0..31 for KW_LUT.
#
# @param key [Integer] integer key to hash
# @return [Integer] a 5-bit bucket index
def _hash key
  scrambled = key * 18592990
  # Discard the low 27 bits, then keep the next five.
  (scrambled >> 27) & 0x1f
end

#advance ⇒ Object



136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# File 'lib/tinygql/lexer.rb', line 136

# Skips ignorable input and scans the next token, leaving the scanner
# positioned just past it and @start at its first byte.
#
# @return [Symbol, false] the token type, or false once the input is exhausted
# @raise [RuntimeError] on a malformed "..." or a string that matches neither
#   BLOCK_STRING nor QUOTED_STRING
def advance
  @scan.skip(IGNORE)

  return false if @scan.eos?

  @start = @scan.pos

  # Pick the scanning strategy from the class of the token's first byte.
  lead_byte = @string.getbyte(@start)
  lead_code = LEAD_BYTES[lead_byte]

  if lead_code == LeadBytes::PUNCT
    # Single-byte punctuation: the lead byte alone identifies the token.
    @scan.pos += 1
    PUNCT_LUT[lead_byte]

  elsif lead_code == LeadBytes::KW
    if len = @scan.skip(KW_RE)
      # "on" and "subscription" are the only keywords of these lengths.
      return :ON if len == 2
      return :SUBSCRIPTION if len == 12

      pos = @start

      # Second 2 bytes are unique, so we'll hash on those
      key = (@string.getbyte(pos + 2) << 8) | @string.getbyte(pos + 1)

      KW_LUT[_hash(key)]
    else
      # The lead byte could begin a keyword, but the text is not one.
      @scan.skip(IDENTIFIER)
      :IDENTIFIER
    end

  elsif lead_code == LeadBytes::IDENT
    @scan.skip(IDENTIFIER)
    :IDENTIFIER

  elsif lead_code == LeadBytes::INT
    # NOTE(review): the result of this skip is not checked; confirm NUMERIC
    # always matches for every byte classified as LeadBytes::INT.
    @scan.skip(NUMERIC)
    # Group 1 of NUMERIC is only set when a fraction or exponent was present.
    @scan[1] ? :FLOAT : :INT

  elsif lead_code == LeadBytes::ELLIPSIS
    # The lead byte was a dot; the next two bytes must be dots too (46 == ".").
    # An explicit message is used because a bare `raise` would re-raise `$!`
    # if the lexer is driven from inside a `rescue` clause.
    2.times do |i|
      raise "Expected \"...\" at byte #{@start}" unless @string.getbyte(@start + i + 1) == 46
    end
    @scan.pos += 3
    :ELLIPSIS

  elsif lead_code == LeadBytes::STRING
    # Try the block form first so that `"""` is not read as an empty string.
    raise "Malformed or unterminated string at byte #{@start}" unless @scan.skip(BLOCK_STRING) || @scan.skip(QUOTED_STRING)
    :STRING

  else
    @scan.pos += 1
    :UNKNOWN_CHAR
  end
end

#done? ⇒ Boolean

Returns:

  • (Boolean)


132
133
134
# File 'lib/tinygql/lexer.rb', line 132

# Whether the underlying scanner has reached the end of the source string.
#
# @return [Boolean]
def done?
  @scan.eos?
end

#emit_block(value) ⇒ Object



262
263
264
265
# File 'lib/tinygql/lexer.rb', line 262

# Produces the value of a block string: normalises its indentation and blank
# lines with #trim_whitespace, then hands the result to #emit_string.
#
# @param value [String] block string contents without the delimiters
# @return [Object] whatever #emit_string returns for the trimmed text
def emit_block(value)
  emit_string(trim_whitespace(value))
end

#emit_string(value) ⇒ Object



267
268
269
270
271
272
273
274
275
276
277
278
279
# File 'lib/tinygql/lexer.rb', line 267

# Validates and unescapes the contents of a string token.
#
# The text is checked against VALID_STRING before unescaping, and its encoding
# is checked both before and after, since unescaping can itself produce
# invalid byte sequences.
# NOTE(review): +emit+ is not defined in the visible part of this class —
# confirm it exists before relying on the error paths.
#
# @param value [String] string contents without the quotes; modified in place
# @return [String, Object] the unescaped +value+, or the result of
#   emit(:BAD_UNICODE_ESCAPE, value) when a check fails
def emit_string(value)
  well_formed = value.valid_encoding? && value.match?(VALID_STRING)
  return emit(:BAD_UNICODE_ESCAPE, value) unless well_formed

  replace_escaped_characters_in_place(value)
  return value if value.valid_encoding?

  emit(:BAD_UNICODE_ESCAPE, value)
end

#line ⇒ Object



128
129
130
# File 'lib/tinygql/lexer.rb', line 128

# Returns the 1-based line number of the scanner's current position, counted
# as one more than the number of "\n" characters before it.
#
# @return [Integer]
def line
  # StringScanner#pos is a byte offset, so the prefix must be taken by bytes;
  # String#[] indexes characters and would overshoot after multibyte text.
  @scan.string.byteslice(0, @scan.pos).count("\n") + 1
end

#next_token ⇒ Object



207
208
209
210
211
212
213
214
215
216
217
218
219
220
# File 'lib/tinygql/lexer.rb', line 207

# Scans one token and pairs its type with its text.
#
# @return [Array(Symbol, Object), nil] [type, value], or nil when #advance
#   reports no further token
def next_token
  tok = advance
  return unless tok

  val =
    if tok == :STRING
      string_value
    elsif tok == :ELLIPSIS
      # The three bytes just consumed.
      @string.byteslice(@scan.pos - 3, 3)
    elsif Literals.constants.include?(tok)
      # Punctuation is always the single byte just consumed.
      @string.byteslice(@scan.pos - 1, 1)
    else
      token_value
    end

  [tok, val]
end

#replace_escaped_characters_in_place(raw_string) ⇒ Object

Replace any escaped unicode or whitespace with the actual characters. To avoid allocating more strings, this modifies the string passed into it.



224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# File 'lib/tinygql/lexer.rb', line 224

# Replaces every simple escape (ESCAPES) and unicode escape (UTF_8) in
# +raw_string+ with the character it denotes. To avoid allocating another
# string, the argument itself is modified.
#
# Both kinds of escape are decoded in a single left-to-right pass. Decoding
# them in two passes would turn an escaped backslash followed by "u0041" into
# a fresh "\u0041" escape and then wrongly decode that as well.
#
# @param raw_string [String] string contents to unescape in place
# @return [nil]
def replace_escaped_characters_in_place(raw_string)
  raw_string.gsub!(/#{ESCAPES}|#{UTF_8}/o) do |matched_str|
    # Groups 1-3 belong to UTF_8; they are all nil for a simple escape.
    first_hex = $1 || $2
    second_hex = $3

    if first_hex.nil?
      ESCAPES_REPLACE[matched_str]
    else
      codepoint_1 = first_hex.to_i(16)

      if second_hex
        codepoint_2 = second_hex.to_i(16)
        if (codepoint_1 >= 0xD800 && codepoint_1 <= 0xDBFF) && # leading surrogate
            (codepoint_2 >= 0xDC00 && codepoint_2 <= 0xDFFF) # trailing surrogate
          # A surrogate pair
          combined = ((codepoint_1 - 0xD800) * 0x400) + (codepoint_2 - 0xDC00) + 0x10000
          [combined].pack('U'.freeze)
        else
          # Two separate code points
          [codepoint_1].pack('U'.freeze) + [codepoint_2].pack('U'.freeze)
        end
      else
        [codepoint_1].pack('U'.freeze)
      end
    end
  end
  nil
end

#string_value ⇒ Object



195
196
197
198
199
200
201
202
203
204
205
# File 'lib/tinygql/lexer.rb', line 195

# Returns the value of the string token that was just scanned: removes the
# delimiters, then processes the contents as a block string or a regular
# string depending on whether the token starts with three quotes.
#
# @return [Object] the result of #emit_block or #emit_string
def string_value
  str = token_value
  block = str.start_with?('"""')

  # Remove exactly one delimiter from each end. Stripping every leading and
  # trailing quote would also eat quotes that belong to the content, e.g. the
  # escaped quote at the end of "a\"" or the quote opening the text of
  # """"hi""".
  quote_size = block ? 3 : 1
  str = str.byteslice(quote_size, str.bytesize - (2 * quote_size))

  if block
    emit_block str
  else
    emit_string str
  end
end

#token_value ⇒ Object



191
192
193
# File 'lib/tinygql/lexer.rb', line 191

# Returns the source text of the scanner's most recent match: the bytes that
# end at the current position and span the matched size.
#
# @return [String]
def token_value
  width = @scan.matched_size
  @string.byteslice(@scan.pos - width, width)
end

#trim_whitespace(str) ⇒ Object



281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
# File 'lib/tinygql/lexer.rb', line 281

# Normalises the contents of a block string: strips the indentation shared by
# every line after the first, then drops empty lines at the start and end.
#
# Only space characters count as indentation, and lines consisting solely of
# spaces are ignored when working out the shared indentation.
#
# @param str [String] raw block string contents
# @return [String] the trimmed text; +str+ itself may be returned when there
#   is nothing to trim
def trim_whitespace(str)
  return "".dup if str == ""

  multiline = str.include?("\n")
  # Single-line text with no leading space needs no work at all.
  return str unless multiline || str.start_with?(" ")

  rows = multiline ? str.split("\n") : [str]
  rest = rows.drop(1)

  # Smallest indentation among the non-blank lines after the first.
  shared = nil
  rest.each do |row|
    indent = row[/\A */].size
    next unless indent < row.size

    shared = indent if shared.nil? || indent < shared
  end

  # Strip that indentation; the first line is never touched.
  rest.each { |row| row.slice!(0, shared) } if shared && shared > 0

  # Drop empty lines from both ends.
  rows.shift while rows.first&.empty?
  rows.pop while rows.last&.empty?

  return rows.join("\n") if rows.size > 1

  rows.first || "".dup
end