Module: NewsCrawler::Storage::URLQueue

Defined in:
lib/news_crawler/storage/url_queue.rb,
lib/news_crawler/storage/url_queue/mongo_storage.rb,
lib/news_crawler/storage/url_queue/url_queue_error.rb,
lib/news_crawler/storage/url_queue/url_queue_engine.rb

Overview

Store and manipulate the URL queue

Defined Under Namespace

Classes: DuplicateURLError, MongoEngine, URLQueueEngine

Constant Summary

ACTION_LIST = [:mark_visited, :mark_processed, :find_unvisited,
               :find_unprocessed, :find_unprocessed_with_depth]
PROCESSED   = 'processed'
PROCESSING  = 'processing'
UNPROCESSED = 'unprocessed'
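
These state constants hold the same strings accepted by the state parameters of mark, mark_all, find_one and find_all below, so they can be passed in place of bare string literals. A minimal sketch, assuming a storage engine has already been selected with set_engine (the module name 'my_module' is hypothetical):

require 'news_crawler/storage/url_queue'

queue = NewsCrawler::Storage::URLQueue
# Equivalent to passing the literal string 'processed'
queue.mark('my_module', 'http://example.com', queue::PROCESSED)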

Class Method Summary

Class Method Details

.add(url, ref_url = '') ⇒ Object

Add a URL to the queue

Parameters:

  • url (String)
  • ref_url (String) (defaults to: '')

    reference URL



# File 'lib/news_crawler/storage/url_queue.rb', line 120

def add(url, ref_url = '')
  url = normalize_url url
  if ref_url != ''
    ref_url = normalize_url ref_url
  end
  @engine.add(url, ref_url)
end
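
A minimal usage sketch, assuming a storage engine has already been configured with set_engine. The URLs are placeholders, and the rescue assumes that DuplicateURLError (defined under this namespace) signals an already-queued URL:

queue = NewsCrawler::Storage::URLQueue

# A scheme-less URL is normalized to "http://..." before being handed to the engine
queue.add('example.com/news/1', 'http://example.com')

begin
  queue.add('http://example.com/news/1')
rescue NewsCrawler::Storage::URLQueue::DuplicateURLError
  # assumed: the URL was already in the queue
end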

.all ⇒ Array

Get all URLs with their status

Returns:

  • (Array)

    URL list



# File 'lib/news_crawler/storage/url_queue.rb', line 136

def all
  @engine.all
end

.clear ⇒ Fixnum

Clear the URL queue

Returns:

  • (Fixnum)

    number of URLs removed



# File 'lib/news_crawler/storage/url_queue.rb', line 130

def clear
  @engine.clear
end

.find_all(module_name, state, max_depth = -1) ⇒ Array

Find all visited URLs with the given module processing state

Parameters:

  • module_name (String)
  • state (String)
  • max_depth (Fixnum) (defaults to: -1)

    maximum URL depth to return (inclusive)

Returns:

  • (Array)

    URL list



# File 'lib/news_crawler/storage/url_queue.rb', line 89

def find_all(module_name, state, max_depth = -1)
  @engine.find_all(module_name, state, max_depth)
end
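
For example, listing every URL a module has already processed, limited to depth 2; a sketch assuming an engine is configured and using a hypothetical module name:

queue = NewsCrawler::Storage::URLQueue

queue.find_all('keyword_tagger', queue::PROCESSED, 2).each do |url|
  puts url
end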

.find_one(module_name, state, max_depth = -1) ⇒ String?

Find one visited URL with the given module processing state

Parameters:

  • module_name (String)
  • state (String)

    one of unprocessed, processing, processed

  • max_depth (Fixnum) (defaults to: -1)

    maximum URL depth to return (inclusive)

Returns:

  • (String, nil)

    URL



# File 'lib/news_crawler/storage/url_queue.rb', line 98

def find_one(module_name, state, max_depth = -1)
  @engine.find_one(module_name, state, max_depth)
end
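
Unlike next_unprocessed below, find_one only looks a URL up; it does not perform the atomic mark-as-processing step. A sketch with a hypothetical module name:

queue = NewsCrawler::Storage::URLQueue

url = queue.find_one('keyword_tagger', queue::UNPROCESSED)
puts "work remaining: #{url}" if url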

.find_unvisited(max_depth = -1) ⇒ Array

Get the list of unvisited URLs

Parameters:

  • max_depth (Fixnum) (defaults to: -1)

    maximum depth of URLs to return

Returns:

  • (Array)

    unvisited URLs, optionally limited to the given maximum depth



# File 'lib/news_crawler/storage/url_queue.rb', line 113

def find_unvisited(max_depth = -1)
  @engine.find_unvisited(max_depth)
end
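
A short sketch fetching unvisited URLs down to depth 1 (pass nothing, i.e. the default -1, for no depth limit):

queue = NewsCrawler::Storage::URLQueue

queue.find_unvisited(1).each do |url|
  # fetch the page here, then record the visit
  queue.mark_visited(url)
end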

.mark(module_name, url, state) ⇒ Object

Set the processing state of a URL for the given module

Parameters:

  • module_name (String)
  • url (String)
  • state (String)

    one of unprocessed, processing, processed



# File 'lib/news_crawler/storage/url_queue.rb', line 71

def mark(module_name, url, state)
  url = normalize_url url
  @engine.mark(module_name, url, state)
end
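
A sketch of a module updating its own processing state for a single URL (module name hypothetical; the state strings match the constants above):

queue = NewsCrawler::Storage::URLQueue

queue.mark('keyword_tagger', 'http://example.com/news/1', queue::PROCESSING)
# ... do the module's work ...
queue.mark('keyword_tagger', 'http://example.com/news/1', queue::PROCESSED)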

.mark_all(module_name, new_state, orig_state = nil) ⇒ Object

Mark all URLs for the given module with a new state

Parameters:

  • module_name (String)
  • new_state (String)

    new state

  • orig_state (String) (defaults to: nil)

    original state



# File 'lib/news_crawler/storage/url_queue.rb', line 80

def mark_all(module_name, new_state, orig_state = nil)
  @engine.mark_all(module_name, new_state, orig_state)
end
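
A typical use is re-queuing work after an interrupted run. The sketch below assumes that orig_state restricts the update to URLs currently in that state, which follows from the parameter description but is not spelled out here:

queue = NewsCrawler::Storage::URLQueue

# Reset everything this module left in 'processing' back to 'unprocessed'
queue.mark_all('keyword_tagger', queue::UNPROCESSED, queue::PROCESSING)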

.mark_all_unvisited ⇒ Object

Mark all URLs as unvisited



# File 'lib/news_crawler/storage/url_queue.rb', line 63

def mark_all_unvisited
  @engine.mark_all_unvisited
end

.mark_visited(url) ⇒ Object

Mark a URL as visited

Parameters:

  • url (String)


# File 'lib/news_crawler/storage/url_queue.rb', line 57

def mark_visited(url)
  url = normalize_url url
  @engine.mark_visited(url)
end
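
mark_visited records the crawl-level visited flag, independent of the per-module states handled by mark. A small sketch:

queue = NewsCrawler::Storage::URLQueue

# The URL is normalized (scheme added if missing) before the engine is updated
queue.mark_visited('example.com/news/1')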

.next_unprocessed(module_name, max_depth = -1) ⇒ String?

Atomically get the next unprocessed URL and mark it as processing

Parameters:

  • module_name (String)
  • max_depth (Fixnum) (defaults to: -1)

    maximum URL depth to return (inclusive)

Returns:

  • (String, nil)

    URL, or nil if no matching URL exists



# File 'lib/news_crawler/storage/url_queue.rb', line 106

def next_unprocessed(module_name, max_depth = -1)
  @engine.next_unprocessed(module_name, max_depth)
end
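
A typical worker-loop sketch: each iteration atomically claims one URL (the call itself marks it 'processing') and marks it 'processed' once the work is done. The module name and processing body are hypothetical:

queue = NewsCrawler::Storage::URLQueue

while (url = queue.next_unprocessed('keyword_tagger'))
  # process the page behind url here ...
  queue.mark('keyword_tagger', url, queue::PROCESSED)
end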

.normalize_url(url) ⇒ Object

Normalize a URL by prepending "http://" when no scheme prefix is present



# File 'lib/news_crawler/storage/url_queue.rb', line 140

def normalize_url(url)
  if (!url.start_with? "http")
    "http://" + url
  else
    url
  end
end
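
From the source above, a bare host or path simply gains an "http://" prefix, while URLs that already start with "http" (including "https") are returned unchanged:

queue = NewsCrawler::Storage::URLQueue

queue.normalize_url('example.com/a')          # => "http://example.com/a"
queue.normalize_url('https://example.com/a')  # => "https://example.com/a"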

.set_engine(engine, *opts) ⇒ Object

Set the URLQueue storage engine

Parameters:

  • engine (Symbol, Object)

    specifies the database engine; pass an object to use a custom engine. This can be:

    • `:mongo`, `:mongodb` for the MongoDB backend

  • opts (Hash)

    options passed to the engine



# File 'lib/news_crawler/storage/url_queue.rb', line 43

def set_engine(engine, *opts)
  if engine.respond_to? :intern
    engine = engine.intern
  end
  engine_class = URLQueueEngine.get_engines[engine]
  if engine_class
    @engine = engine_class.new(*opts)
  else
    @engine = engine
  end
end
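
A configuration sketch: select the bundled MongoDB backend by symbol (a string also works, since it is intern-ed first), or hand in any object that responds to the queue operations used above (add, all, clear, find_all, find_one, find_unvisited, mark, mark_all, mark_all_unvisited, mark_visited, next_unprocessed). Engine options are passed through as-is; their exact form depends on the engine and is not documented here:

require 'news_crawler/storage/url_queue'

# Bundled backend, selected by symbol
NewsCrawler::Storage::URLQueue.set_engine(:mongo)

# Or pass a custom engine object implementing the same interface
# (MyInMemoryQueueEngine is purely hypothetical):
# NewsCrawler::Storage::URLQueue.set_engine(MyInMemoryQueueEngine.new)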