Class: ImgDl::Parser

Inherits:
Object
  • Object
show all
Includes:
Helper
Defined in:
lib/img_dl/parser.rb

Constant Summary collapse

# Baseline option values; the constructor merges caller-supplied options
# over these.
Default_Options = {
  url_limit_count: nil,
  url_reg: nil,
  image_limit_count: nil,
  image_reg: nil,
  recursive: false,
  prefix: nil,
  interval: 0
}

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Helper

#define_options_helper

Constructor Details

#initialize(url, save_path, options = {}) ⇒ Parser

Returns a new instance of Parser.



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/img_dl/parser.rb', line 16

# Builds a parser that crawls from +url+ and stores images under +save_path+.
#
# @param url [String] starting page; also recorded as the origin URL
# @param save_path [String] directory for downloaded images (created if missing)
# @param options [Hash] overrides merged on top of Default_Options
def initialize(url, save_path, options = {})
  # HTTP client used by #parse to fetch pages.
  @agent = Mechanize.new
  @agent.user_agent_alias = 'Linux Mozilla'

  # Two separate URI objects: the fixed starting point and the page
  # currently being processed.
  @origin_url = URI(url)
  @current_url = URI(url)

  # Hashes with a default of 0.
  # NOTE(review): presumably used to de-duplicate URLs/images — confirm
  # against the helpers that write to them.
  @_urls = Hash.new(0)
  @_imgs = Hash.new(0)

  @save_path = save_path
  FileUtils.mkdir_p(save_path)

  @image_count = 0
  @url_count = 0
  @urls = Queue.new
  @error_urls = Queue.new
  # NOTE(review): the seed URL is enqueued before @options is assigned;
  # confirm enq_urls does not rely on any option helper.
  enq_urls(url)
  @images = Queue.new

  @options = Default_Options.merge(options)
  define_options_helper(@options)

  # Progress counters and state flags read by #parse and #download.
  @downloaded_image_count = 0
  @success_download = 0
  @running = true
  @downloading = true
  @status = "start"
  @dl_status = "ready"
end

Instance Attribute Details

#agent ⇒ Object (readonly)

Returns the value of attribute agent.



13
14
15
# File 'lib/img_dl/parser.rb', line 13

# Reader for @agent, which the constructor sets to a Mechanize instance
# and #parse uses to fetch pages.
def agent
  @agent
end

#dl_status ⇒ Object (readonly)

Returns the value of attribute dl_status.



13
14
15
# File 'lib/img_dl/parser.rb', line 13

# Reader for @dl_status, a human-readable string describing the
# downloader's current step ("ready" after construction, then updated
# throughout #download).
def dl_status
  @dl_status
end

#downloaded_image_count ⇒ Object (readonly)

Returns the value of attribute downloaded_image_count.



13
14
15
# File 'lib/img_dl/parser.rb', line 13

# Reader for @downloaded_image_count: the number of image requests that
# have finished, whether they succeeded or failed (#download increments
# it in both the callback and the errback).
def downloaded_image_count
  @downloaded_image_count
end

#error_urls ⇒ Object (readonly)

Returns the value of attribute error_urls.



13
14
15
# File 'lib/img_dl/parser.rb', line 13

# Reader for @error_urls, a Queue of two-element arrays: the failing URL
# followed by either the raised exception (from #parse) or an error
# message string (from #download).
def error_urls
  @error_urls
end

#image_count ⇒ Object (readonly)

Returns the value of attribute image_count.



13
14
15
# File 'lib/img_dl/parser.rb', line 13

# Reader for @image_count, initialised to 0 by the constructor.
# NOTE(review): presumably incremented as image URLs are discovered —
# the code that updates it is not visible here; confirm.
def image_count
  @image_count
end

#options ⇒ Object (readonly)

Returns the value of attribute options.



13
14
15
# File 'lib/img_dl/parser.rb', line 13

# Reader for @options: Default_Options merged with the options hash
# passed to the constructor.
def options
  @options
end

#origin_url ⇒ Object (readonly)

Returns the value of attribute origin_url.



13
14
15
# File 'lib/img_dl/parser.rb', line 13

# Reader for @origin_url, the URI built from the url given to the
# constructor.
def origin_url
  @origin_url
end

#running ⇒ Object (readonly) Also known as: running?

Returns the value of attribute running.



13
14
15
# File 'lib/img_dl/parser.rb', line 13

# Reader for @running: true from construction until #parse finishes its
# loop and sets it to false. #download polls it (via the running? alias)
# to decide whether more image URLs may still arrive.
def running
  @running
end

#status ⇒ Object (readonly)

Returns the value of attribute status.



13
14
15
# File 'lib/img_dl/parser.rb', line 13

# Reader for @status, a human-readable string describing the parser's
# current step ("start", "get url", "parser complete").
def status
  @status
end

#success_download ⇒ Object (readonly)

Returns the value of attribute success_download.



13
14
15
# File 'lib/img_dl/parser.rb', line 13

# Reader for @success_download: the number of responses whose content
# type identified them as images (incremented in #download just before
# the image is saved).
def success_download
  @success_download
end

#url_count ⇒ Object (readonly)

Returns the value of attribute url_count.



13
14
15
# File 'lib/img_dl/parser.rb', line 13

# Reader for @url_count, initialised to 0 by the constructor.
# NOTE(review): presumably incremented as page URLs are enqueued — the
# code that updates it is not visible here; confirm.
def url_count
  @url_count
end

Instance Method Details

#default_head ⇒ Object



75
76
77
# File 'lib/img_dl/parser.rb', line 75

# Request headers sent with every image download.
#
# The hash is built on first use and the same object is returned on every
# later call.
#
# @return [Hash{String => String}]
def default_head
  return @_default_head if @_default_head

  @_default_head = {
    "USER-AGENT" => "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.52 Safari/537.17",
    "ACCEPT-ENCODING" => "gzip,deflate,sdch",
    "ACCEPT" => '*/*',
    "ACCEPT-CHARSET" => "UTF-8,*;q=0.5",
    "ACCEPT-LANGUAGE" => "zh-CN,zh;q=0.8",
    "connection" => "close"
  }
end

#download ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/img_dl/parser.rb', line 79

# Consumes image URLs from the @images queue and fetches each one with
# EventMachine::HttpRequest inside an EventMachine reactor. Returns only
# after the reactor has been stopped.
#
# Progress is published through @dl_status; per-request outcomes update
# @success_download, @downloaded_image_count and @error_urls.
#
# Relies on helpers defined outside this method: running?,
# image_limit_count? / image_limit_count, download_complete? and
# save_image.
def download
  @dl_status = "start"
  # Number of image URLs taken off the queue so far.
  @_download_image = 0
  EM.run do
    # NOTE(review): this loop executes inside the EM.run block, so the
    # reactor presumably cannot dispatch callbacks until the loop breaks
    # (and `sleep 3` below would stall it as well) — confirm.
    loop do
      # Termination: the parser has finished and either nothing is left to
      # fetch or the configured image limit has been reached.
      if !running? && (@images.empty? || (image_limit_count? && @_download_image >= image_limit_count))
        @dl_status = "all done"
        # Stop the reactor right away when nothing is pending; otherwise
        # the callbacks below stop it once the last request finishes.
        download_complete? and EM.stop
        break
      end
      if @images.empty?
        if running?
          # Parser is still producing: wait, then restart the loop body
          # from the top.
          @dl_status = "wait parser"
          sleep 3
          redo
        else
          # Parser finished between the two checks; loop again so the
          # termination branch above runs.
          next
        end
      end
      @_download_image += 1
      @dl_status = "shift image url"
      image_uri = @images.shift
      @dl_status = "download image #{image_uri}"
      http = EventMachine::HttpRequest.new(image_uri).get head: default_head
      http.callback { |res|
        # Extract the subtype from a content type such as "image/png";
        # $1 is nil when the header is missing or is not image/*.
        res.response_header["CONTENT_TYPE"] =~ /^image\/(\w+)/
          type = $1
        if type
          @success_download += 1
          save_image type,res.response
        else
          # Response arrived but was not an image.
          @error_urls << [image_uri,"image download error"]
        end
        @downloaded_image_count += 1
        @dl_status = "success: download image #{image_uri}"
        download_complete? and EM.stop
      }
      http.errback  { |res|
        # Request failed; it is still counted as a finished download.
        @error_urls << [image_uri,"image download error"]
        @downloaded_image_count += 1
        @dl_status = "failed: download image #{image_uri}"
        download_complete? and EM.stop
      }
    end
  end
  # Reached only after EM.stop has ended the reactor.
  @dl_status = "download complete"
  @downloading = false
end

#parse ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/img_dl/parser.rb', line 48

# Crawl loop: takes page URLs off @urls, fetches each with the Mechanize
# agent, hands the page to parse_images and, when continue? allows it,
# to parse_links.
#
# Every URL that cannot be escaped, parsed or fetched is recorded in
# @error_urls as [url, exception] and skipped, so a single bad link does
# not end the crawl.
#
# @running is cleared in an +ensure+ clause: #download waits for the
# parser while running? is true, so it must be released even when this
# method exits through an unexpected exception.
#
# Relies on helpers defined outside this method: next_parse?, interval,
# parse_images, continue? and parse_links.
def parse
  # URI.escape was removed in Ruby 3.0. It delegated to the RFC 2396
  # parser's #escape, which is called directly here; the constant name
  # depends on the uri library version.
  escaper = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  loop do
    break unless next_parse?
    sleep interval
    @status = "get url"
    url = @urls.shift
    begin
      # Only String-like entries are escaped; other objects pass through.
      url = escaper.escape(url) if url.respond_to?(:gsub)
      @current_url = URI(url)
      page = @agent.get(url)
    rescue StandardError => e
      @error_urls << [url, e]
      puts e
      next
    end
    # Skip responses that are not HTML pages exposing #images.
    next unless page.respond_to?(:images)

    parse_images(page)
    parse_links(page) if continue?
  end
  @status = "parser complete"
ensure
  @running = false
end

#start ⇒ Object



41
42
43
44
45
46
# File 'lib/img_dl/parser.rb', line 41

# Entry point: runs #parse on a background thread and then runs #download
# on the calling thread.
#
# A StandardError raised while spawning the thread or during #download is
# printed with Kernel#p and not re-raised; in that case the exception
# object is the return value. Otherwise the result of #download is
# returned.
def start
  begin
    Thread.new { parse }
    download
  rescue StandardError => error
    p(error)
  end
end