Class: Unmarkdown::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/unmarkdown/parser.rb

Constant Summary collapse

# Frozen list of HTML tag names; presumably the elements the parser treats as
# block-level — verify against the usages in lib/unmarkdown/parser.rb.
BLOCK_ELEMENT_NAMES =
%w{h1 h2 h3 h4 h5 h6 blockquote pre hr ul ol li p div}.freeze
# Case-insensitive pattern capturing an http, https or ftp URL, running up to
# the next quote or whitespace character.
# NOTE(review): the constant name is missing from this listing — confirm it
# against lib/unmarkdown/parser.rb.
/((?:https?|ftp):[^'"\s]+)/i.freeze
# Case-insensitive pattern capturing an email-like address: a local part of
# word characters, dots and hyphens, an "@", then dot-separated host labels
# ending in an alphabetic label.
# NOTE(review): the constant name is missing from this listing as well.
%r{([-.\w]+\@[-a-z0-9]+(?:\.[-a-z0-9]+)*\.[a-z]+)}i.freeze

Instance Method Summary collapse

Constructor Details

#initialize(html, options = {}) ⇒ Parser

Returns a new instance of Parser.



9
10
11
12
# File 'lib/unmarkdown/parser.rb', line 9

# Stores the input and settings for a later call to #parse; nothing is
# parsed at construction time.
#
# @param html [String] HTML source to convert (a full document or a fragment)
# @param options [Hash] conversion options, kept exactly as given
def initialize(html, options = {})
  @html, @options = html, options
end

Instance Method Details

#parse ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/unmarkdown/parser.rb', line 14

# Converts the HTML passed to the constructor into Markdown.
#
# The HTML is wrapped in a <body> element when it does not already contain
# one, parsed with Nokogiri, and the children of the body are handed to
# parse_nodes. The resulting text is right-stripped and every run of blank
# lines is collapsed to a single blank line.
#
# @return [String] the generated Markdown
def parse
  # If the HTML fragment starts with a comment, it is ignored. Add an
  # enclosing body tag to ensure everything is included.
  html = @html.include?('<body') ? @html : "<body>#{@html}</body>"

  # Setup document
  doc = Nokogiri::HTML(html)
  doc.encoding = 'UTF-8'

  # Reset bookkeeping so repeated calls start from a clean state
  # (presumably consumed by the list handling in parse_nodes — confirm there).
  @list = []
  @list_position = []

  # Parse the root node recursively
  markdown = parse_nodes(doc.xpath('//body').children)

  # TODO: Strip trailing whitespace

  # Drop trailing whitespace at the end of the document and collapse any run
  # of three or more newlines into exactly one blank line. The previous
  # pattern (/\n{2}+/) only consumed newlines in pairs, so an odd-length run
  # such as "\n\n\n" kept its extra newline.
  markdown.rstrip.gsub(/\n{3,}/, "\n\n")
end