Class: ClearanceJobsComCrawler
- Inherits:
-
Object
- Object
- ClearanceJobsComCrawler
- Includes:
- FailureHandler
- Defined in:
- lib/clearancejobscom/clearance_jobs_com_crawler.rb
Instance Method Summary collapse
-
#collect_links_on_page(page) ⇒ Object
Collect the links on the page.
-
#crawl ⇒ Object
Run the crawler.
- #gen_json ⇒ Object
-
#get_next_page(page_num) ⇒ Object
Get the next page.
-
#get_next_page_url(page_num) ⇒ Object
Get the URL for the next page.
-
#get_page(url) ⇒ Object
Get the page.
-
#get_page_count ⇒ Object
Get the correct total # of pages.
-
#initialize(search_term, requests = nil, cm_hash = nil) ⇒ ClearanceJobsComCrawler
constructor
A new instance of ClearanceJobsComCrawler.
-
#parse_listings(listings) ⇒ Object
Parse the listings on the page.
-
#set_base_url ⇒ Object
Build and return the base URL for the search (also assigns @base_url).
Methods included from FailureHandler
Constructor Details
#initialize(search_term, requests = nil, cm_hash = nil) ⇒ ClearanceJobsComCrawler
Returns a new instance of ClearanceJobsComCrawler.
15 16 17 18 19 20 21 22 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 15

# Build a crawler for clearancejobs.com search results.
#
# @param search_term [String, nil] keyword query; nil crawls all jobs
# @param requests [Object, nil] request helper passed through to get_retry — TODO confirm type
# @param cm_hash [Hash, nil] crawler-manager config forwarded to the reporter
def initialize(search_term, requests = nil, cm_hash = nil)
  @search_term = search_term
  @requests = requests
  @base_url = set_base_url
  # Reporter handles delivery of results to the crawler manager.
  @reporter = HarvesterReporter.new(cm_hash)
end
Instance Method Details
#collect_links_on_page(page) ⇒ Object
Collect the links on the page
70 71 72 73 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 70

# Collect the listing links on a search-results page.
#
# @param page [String] raw HTML of a results page
# @return [Array<String>] href values of the result-title anchors
def collect_links_on_page(page)
  html = Nokogiri::HTML.parse(page)
  anchors = html.css(".cj-search-result-item-title").css("a")
  anchors.map do |anchor|
    anchor['href']
  end
end
#crawl ⇒ Object
Run the crawler
25 26 27 28 29 30 31 32 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 25

# Run the crawler: walk every results page and parse each listing found.
def crawl
  1.upto(get_page_count) do |page_num|
    parse_listings(collect_links_on_page(get_next_page(page_num)))
  end
end
#gen_json ⇒ Object
87 88 89 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 87

# Delegate JSON generation to the reporter.
def gen_json
  @reporter.gen_json
end
#get_next_page(page_num) ⇒ Object
Get the next page
65 66 67 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 65

# Fetch the results page for the given page number.
#
# @param page_num [Integer] 1-based results page index
def get_next_page(page_num)
  get_page(get_next_page_url(page_num))
end
#get_next_page_url(page_num) ⇒ Object
Get the URL for the next page
44 45 46 47 48 49 50 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 44

# Build the URL for the given results page.
# When @base_url already carries a keywords query, pagination params are
# appended with "&"; otherwise the base ends in "?" and needs no separator.
#
# @param page_num [Integer] 1-based results page index
# @return [String] full paginated search URL (25 results per page)
def get_next_page_url(page_num)
  joiner = @base_url.include?("keywords") ? "&" : ""
  "#{@base_url}#{joiner}PAGE=#{page_num}&limit=25"
end
#get_page(url) ⇒ Object
Get the page
53 54 55 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 53

# Fetch a page, delegating retry handling to FailureHandler#get_retry
# (starting at attempt 0).
#
# @param page_url [String] URL to fetch
def get_page(page_url)
  get_retry(page_url, @requests, 0)
end
#get_page_count ⇒ Object
Get the correct total # of pages
58 59 60 61 62 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 58

# Work out the total number of results pages from the first page's
# "#viewing" element (text shaped like "1 - 25 of 1,234" — TODO confirm),
# at 25 results per page.
#
# @return [Integer] page count, rounded up
def get_page_count
  first_page = Nokogiri::HTML.parse(get_next_page(1))
  viewing_text = first_page.css("#viewing").text
  total_results = viewing_text.split(" of ")[1].gsub(",", "").to_i
  (total_results / 25.0).ceil
end
#parse_listings(listings) ⇒ Object
Parse the listings on the page
76 77 78 79 80 81 82 83 84 85 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 76

# Parse the listings on the page and report the successful results.
# Listings whose parser returns a falsy value are dropped.
#
# @param listings [Array<String>] listing URLs collected from a results page
def parse_listings(listings)
  # map + compact replaces manual each/push accumulation.
  found_listings = listings.map do |listing|
    ClearanceJobsComParser.new(listing, get_page(listing), @requests).parse
  end.compact
  # listings.first identifies the batch for the reporter (nil when empty).
  @reporter.report_results(found_listings, listings.first)
end
#set_base_url ⇒ Object
Build and return the base URL for the search (also assigns @base_url)
35 36 37 38 39 40 41 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 35

# Build and assign the base search URL. With no search term it is the bare
# jobs listing; otherwise the term is CGI-escaped into a keywords query.
#
# @return [String] the base URL (also stored in @base_url)
def set_base_url
  # .nil? instead of the == nil anti-idiom; single assignment expression.
  @base_url = if @search_term.nil?
                "https://www.clearancejobs.com/jobs?"
              else
                "https://www.clearancejobs.com/jobs?keywords=" + CGI.escape(@search_term)
              end
end