Class: HexaPDF::Content::Tokenizer
- Defined in:
- lib/hexapdf/content/parser.rb
Overview
More efficient tokenizer for content streams. This tokenizer class works directly on a string and not on an IO.
Changes:
- Since a content stream is usually parsed front to back, a StopIteration error can be raised instead of returning NO_MORE_TOKENS once the end of the string is reached, which avoids costly checks in each iteration. If this behaviour is wanted, pass “raise_on_eos: true” in the constructor.
- Indirect object references are not supported by this tokenizer!
See: PDF2.0 s7.2
Constant Summary
Constants inherited from Tokenizer
Tokenizer::DELIMITER, Tokenizer::NO_MORE_TOKENS, Tokenizer::TOKEN_ARRAY_END, Tokenizer::TOKEN_ARRAY_START, Tokenizer::TOKEN_DICT_END, Tokenizer::TOKEN_DICT_START, Tokenizer::WHITESPACE, Tokenizer::WHITESPACE_MULTI_RE, Tokenizer::WHITESPACE_OR_DELIMITER_RE
Instance Attribute Summary
- #string ⇒ Object (readonly): The string that is tokenized.
Attributes inherited from Tokenizer
Instance Method Summary
- #initialize(string, raise_on_eos: false) ⇒ Tokenizer (constructor): Creates a new tokenizer.
- #next_token ⇒ Object: See: HexaPDF::Tokenizer#next_token.
- #pos ⇒ Object: See: HexaPDF::Tokenizer#pos.
- #pos=(pos) ⇒ Object: See: HexaPDF::Tokenizer#pos=.
- #scan_until(re) ⇒ Object: See: HexaPDF::Tokenizer#scan_until.
Methods inherited from Tokenizer
#next_byte, #next_integer_or_keyword, #next_object, #next_xref_entry, #peek_token, #skip_whitespace
Constructor Details
#initialize(string, raise_on_eos: false) ⇒ Tokenizer
Creates a new tokenizer.
63 64 65 66 67 |
# File 'lib/hexapdf/content/parser.rb', line 63

# Creates a new tokenizer.
#
# string::       The string that is tokenized; it is kept as-is and also wrapped in a
#                StringScanner.
# raise_on_eos:: If +true+, #next_token raises StopIteration once the end of the string is
#                reached instead of returning NO_MORE_TOKENS. Defaults to +false+.
def initialize(string, raise_on_eos: false)
  @string = string
  @raise_on_eos = raise_on_eos
  @ss = StringScanner.new(@string)
end
Instance Attribute Details
#string ⇒ Object (readonly)
The string that is tokenized.
60 61 62 |
# File 'lib/hexapdf/content/parser.rb', line 60

# The string that is tokenized (read-only).
#
# Returns the object stored in @string.
def string
  @string
end
Instance Method Details
#next_token ⇒ Object
See: HexaPDF::Tokenizer#next_token
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
# File 'lib/hexapdf/content/parser.rb', line 85

# See: HexaPDF::Tokenizer#next_token
#
# Skips leading whitespace and comments and then dispatches on the first byte of the next
# token.
#
# Returns the token produced by the matching branch. When the end of the string is reached,
# NO_MORE_TOKENS is returned or, if the tokenizer was created with +raise_on_eos: true+,
# StopIteration is raised.
#
# Raises HexaPDF::MalformedPDFError if a single '>' or a ')' is found where a token should
# start.
def next_token
  # Skip whitespace and any number of consecutive comments iteratively so that a stream
  # consisting of many comment lines cannot exhaust the call stack.
  #
  # Kernel#loop is deliberately not used here: it rescues StopIteration, which would swallow
  # the end-of-string signal raised below.
  while true
    @ss.skip(WHITESPACE_MULTI_RE)
    byte = @string.getbyte(@ss.pos) || -1 # -1 marks the end of the string
    break unless byte == 37 # %

    # A comment extends up to, but not including, the next CR or LF. If there is none, the
    # comment is the last thing in the string.
    unless @ss.skip_until(/(?=[\r\n])/)
      @raise_on_eos ? (raise StopIteration) : (return NO_MORE_TOKENS)
    end
  end

  # Plain integer comparisons are used on purpose; this method is the hot path of the
  # tokenizer.
  if (48 <= byte && byte <= 57) || byte == 45 || byte == 43 || byte == 46 # 0..9 - + .
    parse_number
  elsif (65 <= byte && byte <= 90) || (97 <= byte && byte <= 122) # A..Z a..z
    # Fast path for the common case; every byte not handled below is also parsed as keyword.
    parse_keyword
  elsif byte == 47 # /
    parse_name
  elsif byte == 40 # (
    parse_literal_string
  elsif byte == 60 # <
    if @string.getbyte(@ss.pos + 1) == 60 # <<
      @ss.pos += 2
      TOKEN_DICT_START
    else
      parse_hex_string
    end
  elsif byte == 62 # >
    unless @string.getbyte(@ss.pos + 1) == 62
      raise HexaPDF::MalformedPDFError.new("Delimiter '>' found at invalid position", pos: pos)
    end
    @ss.pos += 2
    TOKEN_DICT_END
  elsif byte == 91 # [
    @ss.pos += 1
    TOKEN_ARRAY_START
  elsif byte == 93 # ]
    @ss.pos += 1
    TOKEN_ARRAY_END
  elsif byte == 41 # )
    raise HexaPDF::MalformedPDFError.new("Delimiter ')' found at invalid position", pos: pos)
  elsif byte == 123 || byte == 125 # { }
    Token.new(@ss.get_byte)
  elsif byte == -1
    @raise_on_eos ? raise(StopIteration) : NO_MORE_TOKENS
  else
    parse_keyword
  end
end
#pos ⇒ Object
See: HexaPDF::Tokenizer#pos
70 71 72 |
# File 'lib/hexapdf/content/parser.rb', line 70

# See: HexaPDF::Tokenizer#pos
#
# Returns the current position of the underlying StringScanner.
def pos
  @ss.pos
end
#pos=(pos) ⇒ Object
See: HexaPDF::Tokenizer#pos=
75 76 77 |
# File 'lib/hexapdf/content/parser.rb', line 75

# See: HexaPDF::Tokenizer#pos=
#
# Sets the position of the underlying StringScanner to +pos+.
def pos=(pos)
  @ss.pos = pos
end
#scan_until(re) ⇒ Object
See: HexaPDF::Tokenizer#scan_until
80 81 82 |
# File 'lib/hexapdf/content/parser.rb', line 80

# See: HexaPDF::Tokenizer#scan_until
#
# Delegates to StringScanner#scan_until with the regular expression +re+ and returns its
# result.
def scan_until(re)
  @ss.scan_until(re)
end