Class: JustCrawl::Register

Inherits:
Object
  • Object
show all
Defined in:
lib/just_crawl/register.rb

Defined Under Namespace

Classes: Result

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Register

Returns a new instance of Register.



7
8
9
10
11
# File 'lib/just_crawl/register.rb', line 7

# Sets up the three page collections the register tracks, all starting
# empty: pages waiting to be handled, pages currently being handled, and
# pages that have been handled.
def initialize
  @unprocessed, @processing, @processed = Set.new, Set.new, Set.new
end

Instance Method Details

#add(pages) ⇒ Object



13
14
15
16
17
18
19
# File 'lib/just_crawl/register.rb', line 13

# Queues every page that the register has not seen before.
#
# A page counts as already seen when it is present in the processed,
# processing or unprocessed set. When the global $verbose flag is truthy,
# one line per newly queued page is printed.
#
# @param pages [#to_set] candidate pages; each is assumed to respond to
#   #url (only called when $verbose is set)
# @return [Set] the unprocessed set, after the new pages were merged in
def add(pages)
  already_known = [@processed, @processing, @unprocessed]
  fresh = already_known.reduce(pages.to_set) { |remaining, seen| remaining - seen }
  fresh.each { |page| puts "  Adding #{page.url}" } if $verbose
  @unprocessed.merge(fresh)
end

#completed(page) ⇒ Object



36
37
38
39
# File 'lib/just_crawl/register.rb', line 36

# Marks a page as done: records it in the processed set and removes it
# from the processing set.
#
# @param page [Object] the page that finished
# @return [Set] the processing set (Set#delete returns its receiver)
def completed(page)
  @processed.add(page)
  @processing.delete(page)
end

#error_pages ⇒ Object



49
50
51
# File 'lib/just_crawl/register.rb', line 49

# Lists the processed pages whose #error is truthy.
#
# @return [Enumerable] the matching pages, as returned by Set#select
def error_pages
  @processed.select { |page| page.error }
end

#errors? ⇒ Boolean

Returns:

  • (Boolean)


53
54
55
# File 'lib/just_crawl/register.rb', line 53

# Tells whether at least one page is reported by #error_pages.
#
# @return [Boolean] true when #error_pages is non-empty
def errors?
  error_pages.size.positive?
end

#finished? ⇒ Boolean

Returns:

  • (Boolean)


41
42
43
# File 'lib/just_crawl/register.rb', line 41

# Tells whether there is nothing left to do: no page is waiting and no
# page is currently being processed.
#
# @return [Boolean]
def finished?
  @unprocessed.empty? && @processing.empty?
end

#next_page ⇒ Object



21
22
23
24
25
26
27
28
29
# File 'lib/just_crawl/register.rb', line 21

# Hands out the next waiting page and moves it into the processing set.
#
# After the move, a warning is printed whenever the processing set holds
# more entries than EM.threadpool_size. The check also runs when no page
# was available.
#
# @return [Object, nil] the page taken from the unprocessed set, or nil
#   when that set is empty
def next_page
  candidate = @unprocessed.first
  @unprocessed.delete(candidate)
  @processing.add(candidate) if candidate

  in_flight = @processing.size
  pool_size = EM.threadpool_size
  if in_flight > pool_size
    puts "WARNING: #{in_flight} pages are being process when EM threadpool only has #{pool_size} threads."
  end

  candidate
end

#no_links_found? ⇒ Boolean

Returns:

  • (Boolean)


68
69
70
# File 'lib/just_crawl/register.rb', line 68

# Tells whether at most one page has been processed.
# NOTE(review): presumably that single page is the crawl's starting page,
# so nothing else was discovered from it; confirm against the caller.
#
# @return [Boolean] true when the processed set holds fewer than two pages
def no_links_found?
  @processed.size < 2
end

#processing_size ⇒ Object



45
46
47
# File 'lib/just_crawl/register.rb', line 45

# Reports how many pages are currently in the processing set.
#
# @return [Integer]
def processing_size
  @processing.length
end

#retry(page) ⇒ Object



31
32
33
34
# File 'lib/just_crawl/register.rb', line 31

# Puts a page back into the unprocessed set and removes it from the
# processing set so it can be handed out again.
#
# @param page [Object] the page to requeue
# @return [Set] the processing set (Set#delete returns its receiver)
def retry(page)
  @unprocessed.add(page)
  @processing.delete(page)
end

#summarize ⇒ Object



57
58
59
60
61
62
63
64
65
66
# File 'lib/just_crawl/register.rb', line 57

# Prints a report of the crawl to standard output.
#
# When #errors? is true, a heading is printed followed by each page from
# #error_pages (via its #to_s). Otherwise a single line with the number of
# processed pages is printed.
#
# @return [Object, nil] the result of iterating #error_pages when there are
#   errors, nil otherwise
def summarize
  unless errors?
    puts "\n#{@processed.size} pages crawled without errors."
    return
  end

  puts "\nPages with errors:"
  error_pages.each { |page| puts page.to_s }
end