Class: UnderOs::Parser::HTML

Inherits:
Object
  • Object
show all
Defined in:
lib/under_os/parser/html.rb

Instance Method Summary collapse

Instance Method Details

#close_tag ⇒ Object



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/under_os/parser/html.rb', line 44

# Consumes a closing tag (`</name>`) at the start of the current chunk.
#
# Nodes are popped off the stack until one with the same tag name is found.
# Every node popped on the way that does NOT match is treated as unclosed:
# while something is still left on the stack, its children are appended to
# the children of the new stack top and its own :children / :text entries
# are removed. Afterwards @node points at the new stack top (or nil).
#
# @return [Integer, nil] number of characters consumed, or nil when the
#   chunk does not start with a closing tag
def close_tag
  closing = @chunk.match(/\A<\/([a-z]+)>/)
  return unless closing

  tag_name = closing[1]

  until (current = @stack.pop).nil?
    break if current[:tag] == tag_name
    next  if @stack.empty?

    # unclosed element: hand its children over to the enclosing node
    parent = @stack.last
    parent[:children] += current[:children] || []
    current.delete(:children)
    current.delete(:text)
  end

  @node = @stack.last

  closing[0].size
end

#merge_data_attrs(hash) ⇒ Object



83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/under_os/parser/html.rb', line 83

# Moves every `data-*` entry of the given hash into a nested :data hash.
#
# The `data-` prefix is stripped from the key, the remainder is passed
# through String#camelize and stored as a symbol under hash[:data]. The
# hash is modified in place.
#
# @param hash [Hash] attributes keyed by symbol (or anything responding to #to_s)
# @return [Hash] the same hash instance that was passed in
def merge_data_attrs(hash)
  # take the list of matching keys first, the hash is mutated below
  data_keys = hash.keys.select { |name| name.to_s.starts_with?('data-') }

  data_keys.each do |name|
    hash[:data] ||= {}

    value    = hash.delete(name)
    short_id = name.to_s.gsub(/^data\-/, '').camelize

    hash[:data][short_id.to_sym] = value
  end

  hash
end

#open_tag ⇒ Object



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/under_os/parser/html.rb', line 26

# Consumes an opening tag (`<name ...>`) at the start of the current chunk.
#
# A new node hash ({tag:, attrs:}) is created and stored in @node. It is
# appended to the children of the node on top of the stack, or to @top when
# the stack is empty, and then pushed onto the stack itself.
#
# @return [Integer, nil] number of characters consumed, or nil when the
#   chunk does not start with an opening tag
def open_tag
  opening = @chunk.match(/\A<([a-z]+)([^>]*)>/)
  return unless opening

  @node  = {tag: opening[1], attrs: parse_attrs_in(opening[2])}
  parent = @stack.last

  if parent
    (parent[:children] ||= []) << @node
    parent.delete(:text) # it can have either text or children
  else
    @top << @node
  end

  @stack.push(@node)

  opening[0].size
end

#parse(html) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# File 'lib/under_os/parser/html.rb', line 2

# Parses an HTML string into a list of top-level node hashes.
#
# Comments are removed and whitespace directly after `>` / before `<` is
# dropped before scanning. The input is then consumed chunk by chunk with
# #open_tag, #close_tag and #plain_text, each of which reports how many
# characters it used. Nodes are plain hashes carrying :tag and :attrs and,
# where present, :children and/or :text.
#
# Markup none of the tokenizers recognise (e.g. `<!DOCTYPE html>` or a
# stray `<`) is skipped, so the scan always advances.
#
# Elements still open at the end of the input are handled the same way
# #close_tag handles unclosed elements: their children are handed to the
# enclosing node (or to the top level) and their own :children / :text
# entries are removed.
#
# @param html [String] the markup to parse
# @return [Array<Hash>] the top-level nodes in document order
def parse(html)
  html = html.strip.gsub(/<\!--[\s\S]*?-->/, '').gsub(/>\s+/, '>').gsub(/\s+</, '<')

  [].tap do |top|
    @top   = top
    @stack = []
    @node  = nil
    i      = 0

    while i < html.size
      @chunk = html.slice(i, html.size)

      consumed = open_tag || close_tag || plain_text

      unless consumed
        # Nothing matched, so the chunk starts with a `<` that is neither a
        # known opening nor closing tag. Skip the whole `<...>` construct
        # when there is one, otherwise just the `<`, instead of adding nil.
        unknown  = @chunk[/\A<[^<>]*>/]
        consumed = unknown ? unknown.size : 1
      end

      i += consumed
    end

    # closing all the missing tags, innermost first, so that children of an
    # unclosed element end up as its following siblings instead of being lost
    while node = @stack.pop
      orphans = node.delete(:children) || []
      node.delete(:text)

      (@stack.empty? ? @top : @stack.last[:children]).concat(orphans)
    end
  end
end

#parse_attrs_in(string) ⇒ Object



72
73
74
75
76
77
78
79
80
81
# File 'lib/under_os/parser/html.rb', line 72

# Extracts `name="value"` / `name='value'` pairs from the attribute part of
# an opening tag.
#
# Value conversion:
# * a value equal to its own attribute name (`checked="checked"`) => true
# * the strings 'true' / 'false' => true / false
# * anything else stays a String (an empty value stays '')
#
# The collected hash is passed through #merge_data_attrs before it is
# returned.
#
# @param string [String] raw text between the tag name and the closing `>`
# @return [Hash] attribute names (as symbols) mapped to converted values
def parse_attrs_in(string)
  attrs = {}

  # `*` rather than `+` on both the name tail and the value: one-letter
  # names (x="1") and empty values (class="") must match, otherwise an empty
  # value makes the lazy group run on into the next attribute and swallow it.
  string.scan(/([a-z][a-z_\-\d]*)=('|")(.*?)\2/).each do |name, _quote, raw|
    value = name == raw ? true : raw
    value = true  if value == 'true'
    value = false if value == 'false'

    attrs[name.to_sym] = value
  end

  merge_data_attrs(attrs)
end

#plain_text ⇒ Object



64
65
66
67
68
69
70
# File 'lib/under_os/parser/html.rb', line 64

# Consumes a run of text (everything up to the next `<`) at the start of
# the current chunk and stores it as :text on the node on top of the stack.
# Text found while the stack is empty is consumed but not stored.
#
# @return [Integer, nil] number of characters consumed, or nil when the
#   chunk starts with `<`
def plain_text
  text = @chunk.match(/\A([^<]+)/)
  return unless text

  owner = @stack.last
  owner[:text] = text[1] if owner

  text[0].size
end