Class: Snatch::Clean::HTML

Inherits:
Object
  • Object
show all
Defined in:
lib/snatch/clean/html.rb

Defined Under Namespace

Modules: HrefFixMethods, SrcFixMethods

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(doc, working_directory) ⇒ HTML

Returns a new instance of HTML.



49
50
51
52
# File 'lib/snatch/clean/html.rb', line 49

# Builds a cleaner around a parsed document.
#
# @param doc [Object] the document to clean; #update calls `css` and
#   `search` on it, so presumably a Nokogiri-style document (confirm with callers)
# @param working_directory [Object] stored as-is; not read by any method
#   visible in this listing
def initialize(doc, working_directory)
  @doc = doc
  @working_directory = working_directory
end

Instance Attribute Details

#doc ⇒ Object

Returns the value of attribute doc.



47
48
49
# File 'lib/snatch/clean/html.rb', line 47

# Reader for the document passed to the constructor.
#
# @return [Object] the current value of @doc
def doc
  @doc
end

#working_directory ⇒ Object

Returns the value of attribute working_directory.



47
48
49
# File 'lib/snatch/clean/html.rb', line 47

# Reader for the working directory passed to the constructor.
#
# @return [Object] the current value of @working_directory
def working_directory
  @working_directory
end

Class Method Details

.html_encode(string) ⇒ Object



58
59
60
# File 'lib/snatch/clean/html.rb', line 58

# Rewrites a string as hexadecimal HTML numeric character references.
#
# Each character matched by `.` becomes `&#x<hex>;`, where <hex> is the
# lowercase hexadecimal form of its Unicode code point. Line breaks are not
# matched by `.` and are therefore left in place untouched.
#
# @param string [String] text to encode
# @return [String] a new string made of character references
def self.html_encode(string)
  string.gsub(/./) do |char|
    code_point = char.unpack('U').first
    format('&#x%x;', code_point)
  end
end

.update(doc, working_directory) ⇒ Object



54
55
56
# File 'lib/snatch/clean/html.rb', line 54

# Convenience entry point: builds an instance and runs #update on it.
#
# @param doc [Object] document forwarded to the constructor
# @param working_directory [Object] forwarded to the constructor
# @return [Object] whatever the instance-level #update returns
def self.update(doc, working_directory)
  cleaner = new(doc, working_directory)
  cleaner.update
end

.url_encode(string) ⇒ Object



62
63
64
# File 'lib/snatch/clean/html.rb', line 62

# Percent-encodes every character of a string, byte by byte.
#
# Each character matched by `.` is replaced with one `%XX` escape (uppercase
# hex) per byte of that character, so multibyte characters yield several
# escapes (e.g. "é" in UTF-8 becomes "%C3%A9"). Line breaks are not matched
# by `.` and pass through unchanged.
#
# @param string [String] text to encode
# @return [String] a new string made of percent escapes
def self.url_encode(string)
  string.gsub(/./) do |char|
    # Iterate over bytes, not characters: the character count is 1 for a
    # multibyte character, which would drop every byte after the first.
    char.each_byte.map { |byte| format('%%%02X', byte) }.join
  end
end

Instance Method Details

#update ⇒ Object



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/snatch/clean/html.rb', line 66

# Cleans the wrapped document in place.
#
# Steps, in order:
# 1. removes every `base` element and every `meta[name=generator]` element;
# 2. removes every comment node;
# 3. calls each instance method of HrefFixMethods on every `a[href]` node;
# 4. calls each instance method of SrcFixMethods on every `[src]` node.
#
# The node sets in steps 3 and 4 are queried again for every fix method, so
# a fix sees whatever changes earlier fixes made to the document.
#
# @return [Array<Symbol>] the list returned by SrcFixMethods.instance_methods
def update
  @doc.css('base, meta[name=generator]').each(&:remove)

  @doc.search('//comment()').remove

  # Anonymous host object that exposes the mixin's methods for dispatch.
  href_fixer = Class.new { include HrefFixMethods }.new
  HrefFixMethods.instance_methods.each do |fix|
    anchors = @doc.css('a[href]')
    anchors.each { |anchor| href_fixer.send(fix, anchor) }
  end

  src_fixer = Class.new { include SrcFixMethods }.new
  SrcFixMethods.instance_methods.each do |fix|
    sourced_nodes = @doc.css('[src]')
    sourced_nodes.each { |node| src_fixer.send(fix, node) }
  end
end