Class: REXML::Parsers::BaseParser

Inherits:
Object
  • Object
show all
Defined in:
lib/rexml/parsers/baseparser.rb

Overview

Using the Pull Parser

This API is experimental, and subject to change.

parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
while parser.has_next?
  res = parser.next
  puts res[1]['att'] if res.start_tag? and res[0] == 'b'
end

See the PullEvent class for information on the content of the results. The data is identical to the arguments passed for the various events to the StreamListener API.

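Notice that a parse error is reported as an event rather than raised immediately, so check each result with error? (the exception is in res[1]):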

parser = PullParser.new( "<a>BAD DOCUMENT" )
while parser.has_next?
  res = parser.next
  raise res[1] if res.error?
end

Nat Price gave me some good ideas for the API.

Constant Summary collapse

LETTER =
'[:alpha:]'
DIGIT =
'[:digit:]'
COMBININGCHAR =

TODO

''
EXTENDER =

TODO

''
NCNAME_STR =
"[#{LETTER}_][-[:alnum:]._#{COMBININGCHAR}#{EXTENDER}]*"
QNAME_STR =
"(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
QNAME =
/(#{QNAME_STR})/
UNAME_STR =

Just for backward compatibility. For example, kramdown uses this. It’s not used in REXML.

"(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
NAMECHAR =
'[\-\w\.:]'
NAME =
"([\\w:]#{NAMECHAR}*)"
NMTOKEN =
"(?:#{NAMECHAR})+"
NMTOKENS =
"#{NMTOKEN}(\\s+#{NMTOKEN})*"
REFERENCE =
"&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
REFERENCE_RE =
/#{REFERENCE}/
DOCTYPE_START =
/\A\s*<!DOCTYPE\s/um
DOCTYPE_END =
/\A\s*\]\s*>/um
ATTRIBUTE_PATTERN =
/\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
COMMENT_START =
/\A<!--/u
COMMENT_PATTERN =
/<!--(.*?)-->/um
CDATA_START =
/\A<!\[CDATA\[/u
CDATA_END =
/\A\s*\]\s*>/um
CDATA_PATTERN =
/<!\[CDATA\[(.*?)\]\]>/um
XMLDECL_START =
/\A<\?xml\s/u
XMLDECL_PATTERN =
/<\?xml\s+(.*?)\?>/um
INSTRUCTION_START =
/\A<\?/u
INSTRUCTION_PATTERN =
/<\?#{NAME}(\s+.*?)?\?>/um
TAG_MATCH =
/\A<((?>#{QNAME_STR}))/um
CLOSE_MATCH =
/\A\s*<\/(#{QNAME_STR})\s*>/um
VERSION =
/\bversion\s*=\s*["'](.*?)['"]/um
ENCODING =
/\bencoding\s*=\s*["'](.*?)['"]/um
STANDALONE =
/\bstandalone\s*=\s*["'](.*?)['"]/um
ENTITY_START =
/\A\s*<!ENTITY/
ELEMENTDECL_START =
/\A\s*<!ELEMENT/um
ELEMENTDECL_PATTERN =
/\A\s*(<!ELEMENT.*?)>/um
SYSTEMENTITY =
/\A\s*(%.*?;)\s*$/um
ENUMERATION =
"\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
NOTATIONTYPE =
"NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
ENUMERATEDTYPE =
"(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
ATTTYPE =
"(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
ATTVALUE =
"(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
DEFAULTDECL =
"(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
ATTDEF =
"\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
ATTDEF_RE =
/#{ATTDEF}/
ATTLISTDECL_START =
/\A\s*<!ATTLIST/um
ATTLISTDECL_PATTERN =
/\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
TEXT_PATTERN =
/\A([^<]*)/um
PUBIDCHAR =

Entity constants

"\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
SYSTEMLITERAL =
%Q{((?:"[^"]*")|(?:'[^']*'))}
PUBIDLITERAL =
%Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
EXTERNALID =
"(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
NDATADECL =
"\\s+NDATA\\s+#{NAME}"
PEREFERENCE =
"%#{NAME};"
ENTITYVALUE =
%Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
PEDEF =
"(?:#{ENTITYVALUE}|#{EXTERNALID})"
ENTITYDEF =
"(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
PEDECL =
"<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
GEDECL =
"<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
ENTITYDECL =
/\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
NOTATIONDECL_START =
/\A\s*<!NOTATION/um
EXTERNAL_ID_PUBLIC =
/\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
EXTERNAL_ID_SYSTEM =
/\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
PUBLIC_ID =
/\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um
EREFERENCE =
/&(?!#{NAME};)/
DEFAULT_ENTITIES =
{
  'gt' => [/&gt;/, '&gt;', '>', />/],
  'lt' => [/&lt;/, '&lt;', '<', /</],
  'quot' => [/&quot;/, '&quot;', '"', /"/],
  "apos" => [/&apos;/, "&apos;", "'", /'/]
}

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(source) ⇒ BaseParser

Returns a new instance of BaseParser.



163
164
165
166
167
168
169
170
171
# File 'lib/rexml/parsers/baseparser.rb', line 163

# Builds a parser over +source+.
#
# source:: the XML input; it is handed to #stream=, which wraps it with
#          SourceFactory.create_from and resets the parser state.
def initialize( source )
  # Must run first: it assigns @source, which is used on the last line.
  self.stream = source
  @listeners = []
  @prefixes = Set.new
  @entity_expansion_count = 0
  # Both limits start from the Security settings and can be overridden per
  # parser through #entity_expansion_limit= and #entity_expansion_text_limit=.
  @entity_expansion_limit = Security.entity_expansion_limit
  @entity_expansion_text_limit = Security.entity_expansion_text_limit
  # NOTE(review): presumably primes the source's read buffer -- confirm
  # against the Source implementation.
  @source.ensure_buffer
end

Instance Attribute Details

#entity_expansion_countObject (readonly)

Returns the value of attribute entity_expansion_count.



178
179
180
# File 'lib/rexml/parsers/baseparser.rb', line 178

# Returns the running entity expansion counter. #initialize starts it at 0,
# and #unnormalize reads it before and after each entity lookup to work out
# how much a single expansion cost.
def entity_expansion_count
  @entity_expansion_count
end

#entity_expansion_limit=(value) ⇒ Object (writeonly)

Sets the attribute entity_expansion_limit

Parameters:

  • value

    the value to set the attribute entity_expansion_limit to.



179
180
181
# File 'lib/rexml/parsers/baseparser.rb', line 179

# Overrides the per-parser entity expansion limit, which #initialize seeds
# from Security.entity_expansion_limit.
# NOTE(review): the code that enforces this limit is not in this listing --
# presumably record_entity_expansion; confirm there.
def entity_expansion_limit=(value)
  @entity_expansion_limit = value
end

#entity_expansion_text_limit=(value) ⇒ Object (writeonly)

Sets the attribute entity_expansion_text_limit

Parameters:

  • value

    the value to set the attribute entity_expansion_text_limit to.



180
181
182
# File 'lib/rexml/parsers/baseparser.rb', line 180

# Overrides the per-parser limit on expanded text size, which #initialize
# seeds from Security.entity_expansion_text_limit. #unnormalize raises once
# the bytesize of its result exceeds this value.
def entity_expansion_text_limit=(value)
  @entity_expansion_text_limit = value
end

#sourceObject (readonly)

Returns the value of attribute source.



177
178
179
# File 'lib/rexml/parsers/baseparser.rb', line 177

# Returns the current input source, i.e. the object that #stream= obtained
# from SourceFactory.create_from.
def source
  @source
end

Instance Method Details

#add_listener(listener) ⇒ Object



173
174
175
# File 'lib/rexml/parsers/baseparser.rb', line 173

# Registers +listener+ with this parser. Every event returned by #pull is
# also passed to each registered listener through listener.receive(event).
#
# Returns the listener list.
def add_listener( listener )
  @listeners.push( listener )
end

#empty?Boolean

Returns true if there are no more events

Returns:

  • (Boolean)


208
209
210
# File 'lib/rexml/parsers/baseparser.rb', line 208

# True when nothing is left to deliver: the source is exhausted and no
# events are waiting on the internal stack.
def empty?
  @source.empty? && @stack.empty?
end

#entity(reference, entities) ⇒ Object



540
541
542
543
544
545
546
547
548
# File 'lib/rexml/parsers/baseparser.rb', line 540

# Looks up +reference+ in +entities+ and returns its unnormalized value.
#
# Returns nil when +entities+ is nil/false or has no value for +reference+.
# A successful lookup is recorded as one entity expansion before the value
# is run through #unnormalize.
def entity( reference, entities )
  replacement = entities ? entities[ reference ] : nil
  return if replacement.nil?

  record_entity_expansion
  unnormalize( replacement, entities )
end

#has_next?Boolean

Returns true if there are more events. Synonymous with !empty?

Returns:

  • (Boolean)


213
214
215
# File 'lib/rexml/parsers/baseparser.rb', line 213

# True while there is still something to deliver: either the source has
# unread data or events are waiting on the internal stack.
def has_next?
  !@source.empty? || !@stack.empty?
end

#normalize(input, entities = nil, entity_filter = nil) ⇒ Object

Escapes all possible entities



551
552
553
554
555
556
557
558
559
560
561
562
563
564
# File 'lib/rexml/parsers/baseparser.rb', line 551

# Escapes +input+ for use as XML text and returns the escaped copy; +input+
# itself is not modified.
#
# input::         the string to escape.
# entities::      optional name => value pairs; each occurrence of a value
#                 in the text is replaced by the reference "&name;".
# entity_filter:: optional collection of entity names that must NOT be
#                 substituted from +entities+.
#
# A bare "&" (one that does not start a "&name;" reference) becomes
# "&amp;", and the characters > < " ' become their predefined references.
def normalize( input, entities=nil, entity_filter=nil )
  # dup rather than clone: clone keeps the frozen flag, which would make
  # the in-place substitutions below fail on a frozen input.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!( EREFERENCE, '&amp;' )
  if entities
    entities.each do |key, value|
      # The filter holds entity names, so test the name of the entity being
      # processed (the block's key).
      next if entity_filter && entity_filter.include?( key )
      copy.gsub!( value, "&#{key};" )
    end
  end
  # Entity values substituted above may have introduced new bare ampersands.
  copy.gsub!( EREFERENCE, '&amp;' )
  # Each DEFAULT_ENTITIES value is [reference regexp, reference, character,
  # character regexp]; replace the raw character with its reference.
  DEFAULT_ENTITIES.each do |_name, forms|
    copy.gsub!( forms[3], forms[1] )
  end
  copy
end

#peek(depth = 0) ⇒ Object

Peek at the depth event in the stack. The first element on the stack is at depth 0. If depth is -1, will parse to the end of the input stream and return the last event, which is always :end_document. Be aware that this causes the stream to be parsed up to the depth event, so you can effectively pre-parse the entire document (pull the entire thing into memory) using this method.



229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/rexml/parsers/baseparser.rb', line 229

# Returns the event at position +depth+ on the stack without consuming it,
# pulling further events from the stream as needed.
#
# depth:: 0 is the head of the stack; -1 pulls until #empty? and returns
#         the last event. Anything below -1 raises.
#
# Newly pulled events are appended to the stack, so they stay available to
# later calls.
def peek depth=0
  raise %Q[Illegal argument "#{depth}"] if depth < -1

  pulled = []
  if depth == -1
    pulled << pull until empty?
  else
    # Re-check the sizes on every pass: pulling may itself change @stack.
    pulled << pull until @stack.size + pulled.size >= depth + 1
  end
  @stack = @stack + pulled unless pulled.empty?
  @stack[depth]
end

#positionObject



198
199
200
201
202
203
204
205
# File 'lib/rexml/parsers/baseparser.rb', line 198

# Returns the source's current position when the source exposes one,
# otherwise 0.
def position
  # FIXME
  return 0 unless @source.respond_to? :position

  @source.position
end

#pullObject

Returns the next event. This is a PullEvent object.



244
245
246
247
248
249
250
251
252
# File 'lib/rexml/parsers/baseparser.rb', line 244

# Returns the next event after handing it to every registered listener.
# Content the source has already parsed is dropped first.
def pull
  @source.drop_parsed_content

  event = pull_event
  @listeners.each { |listener| listener.receive( event ) }
  event
end

#resetObject



187
188
189
190
191
192
193
194
195
196
# File 'lib/rexml/parsers/baseparser.rb', line 187

# Puts the parser back into its initial state: no root seen, no open tags,
# no queued events, no declared entities, and only the "xml" prefix bound.
def reset
  @closed, @have_root, @document_status = nil, false, nil
  @tags, @stack, @entities = [], [], []
  @namespaces = { "xml" => Private::XML_PREFIXED_NAMESPACE }
  @namespaces_restore_stack = []
end

#stream=(source) ⇒ Object



182
183
184
185
# File 'lib/rexml/parsers/baseparser.rb', line 182

# Points the parser at a new input and discards all parsing state.
#
# source:: converted to the parser's source object by
#          SourceFactory.create_from.
def stream=( source )
  @source = SourceFactory.create_from( source )
  # State left over from a previous stream is meaningless for the new one.
  reset
end

#unnormalize(string, entities = nil, filter = nil) ⇒ Object

Unescapes all possible entities



567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
# File 'lib/rexml/parsers/baseparser.rb', line 567

# Returns a copy of +string+ with character and entity references replaced
# by the text they stand for. +string+ itself is not modified.
#
# string::   the text to unescape.
# entities:: optional lookup from entity name to value, consulted through
#            #entity.
# filter::   optional collection of entity names to leave unexpanded.
#
# Raises RuntimeError ("entity expansion has grown too large") when the
# result grows beyond @entity_expansion_text_limit bytes.
def unnormalize( string, entities=nil, filter=nil )
  # NOTE(review): the pattern is defined elsewhere; presumably it matches
  # "\r\n" and bare "\r" so line endings become "\n" -- confirm.
  if string.include?("\r")
    rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
  else
    rv = string.dup
  end
  # Fast path: with no references at all there is nothing to expand.
  matches = rv.scan( REFERENCE_RE )
  return rv if matches.size == 0
  # Numeric character references: the capture starting with "x" is parsed
  # as hexadecimal, anything else as decimal, then encoded as UTF-8.
  # NOTE(review): assumes CHARACTER_REFERENCES captures the text after
  # "&#" -- confirm against its definition.
  rv.gsub!( Private::CHARACTER_REFERENCES ) {
    m=$1
    if m.start_with?("x")
      code_point = Integer(m[1..-1], 16)
    else
      code_point = Integer(m, 10)
    end
    [code_point].pack('U*')
  }
  # REFERENCE_RE has one capture group (the entity name); it is nil for
  # numeric references, so compact! leaves only named entity references.
  matches.collect!{|x|x[0]}.compact!
  if filter
    matches.reject! do |entity_reference|
      filter.include?(entity_reference)
    end
  end
  if matches.size > 0
    # Resolve each distinct entity once; n is how often it occurs.
    matches.tally.each do |entity_reference, n|
      entity_expansion_count_before = @entity_expansion_count
      entity_value = entity( entity_reference, entities )
      if entity_value
        if n > 1
          # The lookup above was counted once. Charge the same cost for
          # each of the remaining n - 1 occurrences replaced below.
          entity_expansion_count_delta =
            @entity_expansion_count - entity_expansion_count_before
          record_entity_expansion(entity_expansion_count_delta * (n - 1))
        end
        re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
        rv.gsub!( re, entity_value )
        if rv.bytesize > @entity_expansion_text_limit
          raise "entity expansion has grown too large"
        end
      else
        # Not user-defined: fall back to the predefined gt/lt/quot/apos,
        # whose entries are [reference regexp, reference, character, ...].
        er = DEFAULT_ENTITIES[entity_reference]
        rv.gsub!( er[0], er[2] ) if er
      end
    end
    # DEFAULT_ENTITIES has no 'amp' entry, so &amp; is handled here, after
    # every other substitution.
    rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' )
  end
  rv
end

#unshift(token) ⇒ Object

Push an event back on the head of the stream. This method has (theoretically) infinite depth.



219
220
221
# File 'lib/rexml/parsers/baseparser.rb', line 219

# Puts +token+ back at the head of the event stack, so it is the event
# found at depth 0 by #peek. Returns the stack.
def unshift token
  @stack.prepend( token )
end