Class: ManBook::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/manbook/parser.rb

Class Method Summary collapse

Class Method Details

.parse(html_file) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/manbook/parser.rb', line 4

# Parses an HTML-rendered man page and builds a Page describing it.
#
# The title is read from the first paragraph after the NAME heading, falling
# back to the document's <title>. The author is read from the first paragraph
# after the AUTHORS heading and is left empty when no such section is found.
#
# @param html_file [String] path of the HTML file to parse
# @return [Page] page with file_name, title and author assigned
def parse(html_file)
  #
  # The way we extract the title is highly dependent on the concrete HTML. Yet, I found no other way
  # to extract the title of a man page in a reliable way.
  #
  doc = Nokogiri::HTML(File.read(html_file))

  # Looks up the text of the first paragraph following a section heading.
  # Two markups are tried: the heading as a <b> inside a wrapper element
  # (the paragraph is then a sibling of that wrapper), and the heading as a
  # plain <h2>. Both title and author use the same blank? test so that a
  # whitespace-only match in the first form still triggers the second.
  section_text = lambda do |heading|
    text = doc.xpath("//b[text() = '#{heading}']/../following-sibling::p[1]/descendant-or-self::text()").to_s
    text = doc.xpath("//h2[text() = '#{heading}']/following-sibling::p[1]/descendant-or-self::text()").to_s if text.blank?
    text
  end

  title = section_text.call('NAME')

  # fall back to document title
  title = doc.xpath("//html/head/title/text()").to_s if title.blank?

  author = section_text.call('AUTHORS')

  Page.new.tap do |page|
    page.file_name = File.basename(html_file)
    # Collapse multi-line paragraph text onto a single line.
    page.title     = title.split("\n").join(' ')
    page.author    = author.split("\n").join(' ')
  end
end