Module: WebPageArchiver::GeneratorHelpers

Included in:
DataUriHtmlGenerator, InlineHtmlGenerator, MhtmlGenerator
Defined in:
lib/web_page_archiver.rb

Overview

Generic methods to reuse in both the MhtmlGenerator and the InlineHtmlGenerator

Instance Method Summary collapse

Instance Method Details

#content_type(object) ⇒ String

Determines the content type of a file or download

Parameters:

  • object (File, URI)

    to test

Returns:

  • (String)

    mime-type / content type



58
59
60
61
62
63
64
# File 'lib/web_page_archiver.rb', line 58

# Looks up the content type of an already-opened resource.
#
# @param object [File, #meta] an open local file, or any other object
#   exposing a +meta+ hash (the non-File branch reads +meta["content-type"]+)
# @return the first entry MIME::Types reports for the file's path when
#   +object+ is a File, otherwise the value stored under "content-type"
def content_type(object)
  case object
  when File
    # Local files carry no headers, so the type is derived from the path.
    MIME::Types.type_for(object.path).first
  else
    object.meta["content-type"]
  end
end

#download_finished? ⇒ Boolean

Tests whether all the required content has been downloaded

Returns:

  • (Boolean)


87
88
89
# File 'lib/web_page_archiver.rb', line 87

# Reports whether every entry in @contents has a body.
#
# @return [Boolean] true when no entry's :body is nil (also true when
#   @contents is empty), false as soon as one entry is still missing its body
def download_finished?
  @contents.each_value.none? { |entry| entry[:body].nil? }
end

#initialize ⇒ Object



16
17
18
19
20
21
22
23
# File 'lib/web_page_archiver.rb', line 16

def initialize
  @contents = {}
  @src = StringIO.new
  @boundary = "mimepart_#{Digest::MD5.hexdigest(Time.now.to_s)}"
  @threads  = []
  @queue    = Queue.new
  @conf     = { :base64_except=>["html"] }
end

#join_uri(base_filename_or_uri, path) ⇒ String

Creates an absolute URI-string for a resource referenced from the base file or URI

Parameters:

  • base_filename_or_uri (String, URI)

    from where the resource is linked

  • path (String)

    of the resource (relative or absolute) within the parent resource

Returns:

  • (String)

    URI-string



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/web_page_archiver.rb', line 30

# Creates an absolute URI-string for a resource referenced from a base
# document.
#
# The base is classified as remote or local by inspecting it only. Nothing
# is opened or downloaded, so no stream is left unclosed, a local base file
# does not have to exist, and the base is never handed to Kernel#open.
#
# @param base_filename_or_uri [String, File, URI] where the resource is
#   linked from: an http/https/ftp URL string, a URI object, a local file
#   name, or an object responding to +to_path+ (such as an open File)
# @param path [String] location of the resource (relative or absolute)
#   within the parent resource
# @return [String] the joined URI-string; for a local base the "file://"
#   prefix is stripped again so a plain file-system path is returned
# @raise [URI::InvalidURIError] if URI.join cannot parse the base or path
def join_uri(base_filename_or_uri, path)
  base = base_filename_or_uri

  # URI objects and http/https/ftp URL strings are joined directly.
  remote = base.is_a?(URI::Generic) ||
           (base.is_a?(String) && base =~ %r{\A(?:https?|ftp)://}i)
  return URI.join(base, path).to_s if remote

  local_path = base.respond_to?(:to_path) ? base.to_path : base.to_s

  # A drive prefix such as "C:/" would be parsed as a URI scheme, so it is
  # swapped for a placeholder before joining and restored afterwards.
  windows_drive_matcher = /((.*):\/)/
  windows_drive_match_data = local_path.match(windows_drive_matcher)
  if windows_drive_match_data
    local_path = local_path.gsub(windows_drive_matcher, 'WINDOWS.DRIVE/')
  end

  joined = URI.join("file://#{local_path}", path).to_s
  joined = joined.gsub('file://', '').gsub('file:', '')

  if windows_drive_match_data
    joined = joined.gsub('WINDOWS.DRIVE/', windows_drive_match_data[1])
  end
  joined
end

#start_download_thread(num = 5) ⇒ Array<Thread>

Processes the download queue

Parameters:

  • num (Integer) (defaults to: 5)

    number of threads

Returns:

  • (Array<Thread>)

    the ruby-threads opened



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/web_page_archiver.rb', line 70

# Processes the download queue with a pool of worker threads.
#
# Each worker repeatedly takes a key from @queue, skips entries of @contents
# that already have a body, and otherwise opens the entry's :uri and stores
# the data read, the uri and the detected content type back into @contents.
#
# Keys are taken with a non-blocking pop. Checking +empty?+ and then doing a
# blocking +pop+ lets another worker take the last key in between, which
# leaves the loser blocked forever. The opened stream is closed as soon as
# it has been read.
#
# NOTE(review): this still relies on Kernel#open being able to open whatever
# is stored under :uri. From Ruby 3.0 on, open-uri no longer patches
# Kernel#open, so URL entries would need URI.open there. Confirm against the
# Ruby versions this library supports.
#
# @param num [Integer] number of threads to start (defaults to 5)
# @return [Array<Thread>] @threads, including the threads started by this call
def start_download_thread(num=5)
  num.times do
    worker = Thread.start do
      loop do
        begin
          key = @queue.pop(true) # non-blocking: raises ThreadError when empty
        rescue ThreadError
          break
        end
        next if @contents[key][:body] != nil

        uri = @contents[key][:uri]
        # Block form closes the stream once the body has been read.
        open(uri) do |stream|
          @contents[key] = @contents[key].merge({ :body=>stream.read, :uri=> uri, :content_type=> content_type(stream) })
        end
      end
    end
    @threads.push worker
  end
  @threads
end