Class: Xsv::SaxParser

Inherits:
Object
  • Object
show all
Defined in:
lib/xsv/sax_parser.rb

Constant Summary collapse

ATTR_REGEX =
/((\p{Alnum}+)="(.*?)")/mn

Instance Method Summary collapse

Instance Method Details

#parse(io) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/xsv/sax_parser.rb', line 9

# Parses an XML document and reports tags and text through callbacks invoked
# on the receiver.
#
# Callbacks:
# * +start_element(name, attributes)+ is called for every tag that is not a
#   closing tag. +attributes+ is +nil+ when the tag contains no space before
#   its +>+; otherwise it is a Hash of Symbol keys to the raw (not
#   entity-decoded) String values matched by ATTR_REGEX.
# * +end_element(name)+ is called for closing tags and for self-closing tags,
#   but only if the receiver responds to it.
# * +characters(text)+ is called with the entity-decoded, non-empty text found
#   in front of a tag, but only if the receiver responds to it.
#
# Any namespace prefix (everything up to and including the first +:+) is
# removed from tag names before they are reported.
#
# @param io [String, #sysread] the complete document as a String (it is
#   duplicated, not modified), or an object that is read in 2048 byte chunks
#   through +sysread+
# @return [nil]
# @raise [Xsv::Error] if the input ends inside a tag
def parse(io)
  responds_to_end_element = respond_to?(:end_element)
  responds_to_characters = respond_to?(:characters)

  state = :look_start
  if io.is_a?(String)
    # Work on a copy: the buffer is consumed destructively with slice! below
    pbuf = io.dup
    eof_reached = true
    must_read = false
  else
    pbuf = String.new(capacity: 8192)
    eof_reached = false
    must_read = true
  end

  loop do
    if must_read
      begin
        pbuf << io.sysread(2048)
      rescue EOFError, TypeError
        # EOFError is thrown by IO, rubyzip returns nil from sysread on EOF
        eof_reached = true
      end

      must_read = false
    end

    if state == :look_start
      if (o = pbuf.index("<"))
        # Everything in front of "<" is character data; chop! drops the "<" itself
        chars = pbuf.slice!(0, o + 1).chop!.force_encoding("utf-8")

        if responds_to_characters && !chars.empty?
          characters(CGI.unescapeHTML(chars))
        end

        state = :look_end
      elsif eof_reached
        # Discard anything after the last tag in the document
        break
      else
        # Continue loop to read more data into the buffer
        must_read = true
        next
      end
    end

    if state == :look_end
      if (o = pbuf.index(">"))
        if (s = pbuf.index(" ")) && s < o
          tag_name = pbuf.slice!(0, s + 1).chop!
          # o - s characters remain up to and including ">". Drop the ">" so
          # that args holds only the text between the tag name and ">",
          # otherwise the self-closing check below can never match.
          args = pbuf.slice!(0, o - s).chop!
        else
          tag_name = pbuf.slice!(0, o + 1).chop!
          args = nil
        end

        # delete_prefix! returns nil when there was no leading "/"
        is_close_tag = tag_name.delete_prefix!("/")

        # Strip XML namespace from tag
        if (offset = tag_name.index(":"))
          tag_name.slice!(0, offset + 1)
        end

        if is_close_tag
          end_element(tag_name) if responds_to_end_element
        else
          if args.nil?
            # Without attributes the "/" of a self-closing tag is still glued
            # to the name ("row/"); remove it so the real name is reported.
            is_self_closing = !tag_name.delete_suffix!("/").nil?
            attribute_buffer = nil
          else
            is_self_closing = args.end_with?("/")
            # Each match is [whole attribute, name, value]
            attribute_buffer = args.scan(ATTR_REGEX).each_with_object({}) do |(_, name, value), buffer|
              buffer[name.to_sym] = value
            end
          end

          start_element(tag_name, attribute_buffer)

          end_element(tag_name) if is_self_closing && responds_to_end_element
        end

        state = :look_start
      elsif eof_reached
        raise Xsv::Error, "Malformed XML document, looking for end of tag beyond EOF"
      else
        # The tag is not complete yet, read more data into the buffer
        must_read = true
      end
    end
  end
end