Class: WebpageArchivist::HtmlDocument

Inherits:
Object
  • Object
show all
Defined in:
lib/webpage-archivist/html_document.rb

Overview

Wrapper API around an HTML document parsed with Nokogiri.

Constant Summary collapse

# Finds the charset declared in a <meta> tag; the charset name is capture group 1.
# Matches the legacy http-equiv content-type form as well as <meta charset="...">,
# with double-quoted, single-quoted or unquoted attribute values, case-insensitively.
ENCODING_REGEXP =
/<meta\s[^>]*?charset\s*=\s*["']?([^"'\s;\/>]+)/i
# Shared Iconv instance; the constructor runs the raw content through it
# before searching for the charset declaration with ENCODING_REGEXP.
# NOTE(review): Iconv is no longer part of the Ruby standard library since
# Ruby 2.0 (it lives in the `iconv` gem) — confirm the gem is a declared dependency.
CONVERTER =
Iconv.new('UTF-8//IGNORE//TRANSLIT', 'ASCII//IGNORE//TRANSLIT')

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(content, uri = nil, charset = nil) ⇒ HtmlDocument

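Creates the document by parsing the given content with Nokogiri.

content

the HTML content to parse

uri

optional; passed to Nokogiri::HTML as its second argument

charset

optional; the encoding of the content. When omitted, it is read from a meta tag in the content, if one declares it.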



17
18
19
20
21
22
23
# File 'lib/webpage-archivist/html_document.rb', line 17

# Builds the document by parsing +content+ with Nokogiri.
#
# @param content [String] raw HTML to parse
# @param uri [Object, nil] handed to Nokogiri::HTML as its second argument
# @param charset [String, nil] encoding name; when falsy it is looked up in
#   the content with ENCODING_REGEXP and upcased (nil if nothing matches)
def initialize(content, uri = nil, charset = nil)
  @charset = charset || begin
    # The regexp is applied to a CONVERTER-processed copy, not the raw content.
    found = ENCODING_REGEXP.match(CONVERTER.iconv(content))
    declared = found && found[1]
    declared && declared.upcase
  end
  @content = Nokogiri::HTML(content, uri, @charset)
end

Instance Attribute Details

#charset ⇒ Object (readonly)

Returns the value of attribute charset.



9
10
11
# File 'lib/webpage-archivist/html_document.rb', line 9

# Reader for the charset stored by the constructor: the value passed in, or
# the upcased value found in the content when none was given.
#
# @return [String, nil] nil when no charset was given and none was found
def charset
  @charset
end

#content ⇒ Object (readonly)

Returns the value of attribute content.



9
10
11
# File 'lib/webpage-archivist/html_document.rb', line 9

# Reader for the parsed document stored by the constructor.
#
# @return [Object] the value returned by Nokogiri::HTML for the given content
def content
  @content
end

Instance Method Details

#each_image(&block) ⇒ Object

Calls the block once for each image that has a src attribute. The block receives the image node.



48
49
50
51
52
# File 'lib/webpage-archivist/html_document.rb', line 48

# Iterates over every <img> element that has a src attribute.
#
# @yield [img] called once per matching node
# @yieldparam img [Object] an image node from the document search
# @return [Object] the result of the node collection's #each when a block is
#   given; an Enumerator over the same nodes otherwise
def each_image(&block)
  # Calling without a block used to fail on nil; return an Enumerator instead.
  return enum_for(:each_image) unless block

  content.search('img[@src]').each(&block)
end

#each_link(&block) ⇒ Object

Calls the block once for each link (a element) that has an href attribute. The block receives the link node.



56
57
58
59
60
# File 'lib/webpage-archivist/html_document.rb', line 56

# Iterates over every <a> element that has an href attribute.
#
# @yield [link] called once per matching node
# @yieldparam link [Object] a link node from the document search
# @return [Object] the result of the node collection's #each when a block is
#   given; an Enumerator over the same nodes otherwise
def each_link(&block)
  # Calling without a block used to fail on nil; return an Enumerator instead.
  return enum_for(:each_link) unless block

  content.search('a[@href]').each(&block)
end

#each_script(&block) ⇒ Object

Calls the block once for each script that has a src attribute. The block receives the script node.



40
41
42
43
44
# File 'lib/webpage-archivist/html_document.rb', line 40

# Iterates over every <script> element that has a src attribute.
#
# @yield [script] called once per matching node
# @yieldparam script [Object] a script node from the document search
# @return [Object] the result of the node collection's #each when a block is
#   given; an Enumerator over the same nodes otherwise
def each_script(&block)
  # Calling without a block used to fail on nil; return an Enumerator instead.
  return enum_for(:each_script) unless block

  content.search('script[@src]').each(&block)
end

#each_stylesheet(&block) ⇒ Object

Calls the block once for each stylesheet (link element whose type is "text/css"). The block receives the stylesheet node.



32
33
34
35
36
# File 'lib/webpage-archivist/html_document.rb', line 32

# Iterates over every <link> element whose type attribute is "text/css".
#
# NOTE(review): stylesheets declared only with rel="stylesheet" (no type
# attribute) are not matched by this selector — confirm whether that is intended.
#
# @yield [link] called once per matching node
# @yieldparam link [Object] a stylesheet link node from the document search
# @return [Object] the result of the node collection's #each when a block is
#   given; an Enumerator over the same nodes otherwise
def each_stylesheet(&block)
  # Calling without a block used to fail on nil; return an Enumerator instead.
  return enum_for(:each_stylesheet) unless block

  content.search('link[@type="text/css"]').each(&block)
end

#to_html ⇒ Object

Converts the document to HTML.



26
27
28
# File 'lib/webpage-archivist/html_document.rb', line 26

# Serializes the document by delegating to #to_html on the parsed content.
#
# @return [Object] whatever the parsed content's #to_html returns
def to_html
  content.to_html
end