Class: Wgit::Indexer

Inherits: Object
Defined in:
lib/wgit/indexer.rb

Overview

Class which crawls and saves the indexed Documents to a database.

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(database, crawler = Wgit::Crawler.new) ⇒ Indexer

Initialize the Indexer.



# File 'lib/wgit/indexer.rb', line 142

def initialize(database, crawler = Wgit::Crawler.new)
  @db      = database
  @crawler = crawler
end
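
A minimal usage sketch. It assumes a Wgit::Database built from a MongoDB connection string (the exact connection setup may differ between Wgit versions) and the default Wgit::Crawler:

require 'wgit'

# Hypothetical connection string; point this at your own MongoDB instance.
db      = Wgit::Database.new('mongodb://localhost:27017/wgit')
indexer = Wgit::Indexer.new(db)

# A custom crawler can be passed as the second argument if needed.
custom_indexer = Wgit::Indexer.new(db, Wgit::Crawler.new)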

Instance Attribute Details

#crawler ⇒ Object (readonly)

The crawler used to index the WWW.



# File 'lib/wgit/indexer.rb', line 132

def crawler
  @crawler
end

#db ⇒ Object (readonly)

The database instance used to store Urls and Documents.



# File 'lib/wgit/indexer.rb', line 135

def db
  @db
end

Instance Method Details

#index_page(url, insert_externals: true) {|doc| ... } ⇒ Object

Crawls a single webpage and stores it into the database. There is no max download limit, so be careful of large pages. Logs info on the crawl using Wgit.logger as it goes along.

Yields:

  • (doc)

Given the Wgit::Document of the crawled webpage before it's inserted into the database, allowing for prior manipulation. Return nil or false from the block to prevent the document from being saved into the database.



# File 'lib/wgit/indexer.rb', line 271

def index_page(url, insert_externals: true)
  document = @crawler.crawl_url(url) do |doc|
    result = true
    result = yield(doc) if block_given?

    if result && !doc.empty? && write_doc_to_db(doc)
      Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
    end
  end

  @db.url?(url) ? @db.update(url) : @db.insert(url)

  ext_urls = document&.external_links
  if insert_externals && ext_urls
    num_inserted_urls = write_urls_to_db(ext_urls)
    Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
  end

  nil
end
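
A hedged usage sketch for #index_page, assuming `indexer` from the constructor example above and a reachable example url. Returning nil or false from the block vetoes the save:

url = Wgit::Url.new('http://example.com')

indexer.index_page(url, insert_externals: false) do |doc|
  # Only save pages whose <title> mentions 'Example'; returning false
  # here prevents the document from being written to the database.
  doc.title&.include?('Example')
end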

#index_site(url, insert_externals: true, allow_paths: nil, disallow_paths: nil) {|doc| ... } ⇒ Integer

Crawls a single website's pages and stores them into the database. There is no max download limit, so be careful which sites you index. Logs info on the crawl using Wgit.logger as it goes along.

Yields:

  • (doc)

Given the Wgit::Document of each crawled webpage before it's inserted into the database, allowing for prior manipulation. Return nil or false from the block to prevent the document from being saved into the database.



# File 'lib/wgit/indexer.rb', line 231

def index_site(
  url, insert_externals: true, allow_paths: nil, disallow_paths: nil
)
  crawl_opts = { allow_paths: allow_paths, disallow_paths: disallow_paths }
  total_pages_indexed = 0

  ext_urls = @crawler.crawl_site(url, crawl_opts) do |doc|
    result = true
    result = yield(doc) if block_given?

    if result && !doc.empty? && write_doc_to_db(doc)
      total_pages_indexed += 1
      Wgit.logger.info("Crawled and saved internal page: #{doc.url}")
    end
  end

  @db.url?(url) ? @db.update(url) : @db.insert(url)

  if insert_externals && ext_urls
    num_inserted_urls = write_urls_to_db(ext_urls)
    Wgit.logger.info("Found and saved #{num_inserted_urls} external url(s)")
  end

  Wgit.logger.info("Crawled and saved #{total_pages_indexed} docs for the \
site: #{url}")

  total_pages_indexed
end
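
A sketch of #index_site, again assuming `indexer` from the constructor example. The allow_paths value is an illustrative pattern string passed through to the crawler; the return value is the number of pages saved:

url = Wgit::Url.new('http://example.com')

total = indexer.index_site(url, insert_externals: true, allow_paths: 'articles/*') do |doc|
  !doc.text.empty? # skip pages with no extracted text
end

puts "Indexed #{total} page(s) for #{url}"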

#index_www(max_sites: -1, max_data: 1_048_576_000) ⇒ Object

Retrieves uncrawled urls from the database and recursively crawls each site, storing their internal pages into the database and adding their external urls to be crawled later on. Logs info on the crawl using Wgit.logger as it goes along.



# File 'lib/wgit/indexer.rb', line 159

def index_www(max_sites: -1, max_data: 1_048_576_000)
  if max_sites.negative?
    Wgit.logger.info("Indexing until the database has been filled or it \
runs out of urls to crawl (which might be never).")
  end
  site_count = 0

  while keep_crawling?(site_count, max_sites, max_data)
    Wgit.logger.info("Current database size: #{@db.size}")

    uncrawled_urls = @db.uncrawled_urls(limit: 100)

    if uncrawled_urls.empty?
      Wgit.logger.info('No urls to crawl, exiting.')

      return
    end
    Wgit.logger.info("Starting crawl loop for: #{uncrawled_urls}")

    docs_count = 0
    urls_count = 0

    uncrawled_urls.each do |url|
      unless keep_crawling?(site_count, max_sites, max_data)
        Wgit.logger.info("Reached max number of sites to crawl or \
database capacity, exiting.")

        return
      end
      site_count += 1

      site_docs_count = 0
      ext_links = @crawler.crawl_site(url) do |doc|
        if !doc.empty? && write_doc_to_db(doc)
          docs_count += 1
          site_docs_count += 1
        end
      end

      raise 'Error updating url' unless @db.update(url) == 1

      urls_count += write_urls_to_db(ext_links)

      Wgit.logger.info("Crawled and saved #{site_docs_count} docs for the \
site: #{url}")
    end

    Wgit.logger.info("Crawled and saved docs for #{docs_count} url(s) \
overall for this iteration.")
    Wgit.logger.info("Found and saved #{urls_count} external url(s) for \
the next iteration.")

    nil
  end
end
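
A sketch of a full run of #index_www, assuming `db` and `indexer` from the constructor example and that the database has been seeded with at least one url to start from. The limits shown are illustrative: stop after 5 sites or roughly 500MB of stored data, whichever comes first:

# Seed the database with a starting url (skip if uncrawled urls already exist).
db.insert(Wgit::Url.new('http://example.com'))

indexer.index_www(max_sites: 5, max_data: 524_288_000)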

#keep_crawling?(site_count, max_sites, max_data) ⇒ Boolean (protected)

Returns whether or not to keep crawling, based on the current database size and the number of sites crawled so far.



# File 'lib/wgit/indexer.rb', line 303

def keep_crawling?(site_count, max_sites, max_data)
  return false if @db.size >= max_data
  return true  if max_sites.negative?

  site_count < max_sites
end
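
An illustrative look at the decision logic (the method is protected, so #send is used here purely for demonstration). The results assume the database size is still below max_data:

indexer.send(:keep_crawling?, 0,  -1, 1_048_576_000) # => true  (negative max_sites: no limit)
indexer.send(:keep_crawling?, 3,  10, 1_048_576_000) # => true  (3 < 10)
indexer.send(:keep_crawling?, 10, 10, 1_048_576_000) # => false (site limit reached)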

#write_doc_to_db(doc) ⇒ Boolean (protected)

Write the doc to the DB. Note that the unique url index on the documents collection deliberately prevents duplicate inserts.



# File 'lib/wgit/indexer.rb', line 315

def write_doc_to_db(doc)
  @db.insert(doc)
  Wgit.logger.info("Saved document for url: #{doc.url}")

  true
rescue Mongo::Error::OperationFailure
  Wgit.logger.info("Document already exists: #{doc.url}")

  false
end
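
The duplicate handling relies on MongoDB raising Mongo::Error::OperationFailure when the unique url index is violated. A sketch of the resulting return values, assuming the example url is reachable (protected method, #send used for demonstration only):

doc = indexer.crawler.crawl_url(Wgit::Url.new('http://example.com'))

indexer.send(:write_doc_to_db, doc) # => true  (first insert succeeds)
indexer.send(:write_doc_to_db, doc) # => false (duplicate url, insert rejected)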

#write_urls_to_db(urls) ⇒ Integer (protected)

Write the urls to the DB. Note that the unique url index on the urls collection deliberately prevents duplicate inserts.



# File 'lib/wgit/indexer.rb', line 331

def write_urls_to_db(urls)
  count = 0

  return count unless urls.respond_to?(:each)

  urls.each do |url|
    if url.invalid?
      Wgit.logger.info("Ignoring invalid external url: #{url}")
      next
    end

    @db.insert(url)
    count += 1
    Wgit.logger.info("Inserted external url: #{url}")
  rescue Mongo::Error::OperationFailure
    Wgit.logger.info("External url already exists: #{url}")
  end

  count
end