Class: ClearanceJobsComCrawler
- Inherits:
-
Object
- Object
- ClearanceJobsComCrawler
- Includes:
- FailureHandler
- Defined in:
- lib/clearancejobscom/clearance_jobs_com_crawler.rb
Instance Method Summary collapse
-
#collect_links_on_page(page) ⇒ Object
Collect the links on the page.
-
#crawl ⇒ Object
Run the crawler.
- #gen_json ⇒ Object
-
#get_next_page(page_num) ⇒ Object
Get the next page.
-
#get_next_page_url(page_num) ⇒ Object
Get the URL for the next page.
-
#get_page(url) ⇒ Object
Get the page.
-
#get_page_count ⇒ Object
Get the correct total # of pages.
-
#initialize(search_term, requests = nil, cm_hash = nil) ⇒ ClearanceJobsComCrawler
constructor
A new instance of ClearanceJobsComCrawler.
-
#parse_listings(listings) ⇒ Object
Parse the listings on the page.
-
#set_base_url ⇒ Object
Build and return the base URL for the search (also assigns @base_url).
Methods included from FailureHandler
Constructor Details
#initialize(search_term, requests = nil, cm_hash = nil) ⇒ ClearanceJobsComCrawler
Returns a new instance of ClearanceJobsComCrawler.
15 16 17 18 19 20 21 22 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 15

# Build a crawler for clearancejobs.com search results.
#
# @param search_term [String, nil] keyword query; nil crawls all jobs
# @param requests [Object, nil] request helper passed through to get_retry — TODO confirm type
# @param cm_hash [Hash, nil] crawler-manager config forwarded to the reporter
def initialize(search_term, requests = nil, cm_hash = nil)
  @search_term = search_term
  @requests = requests
  @base_url = set_base_url
  # Reporter handles delivery of results to the crawler manager.
  @reporter = HarvesterReporter.new(cm_hash)
end
Instance Method Details
#collect_links_on_page(page) ⇒ Object
Collect the links on the page
70 71 72 73 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 70

# Collect the listing links on a search-results page.
#
# @param page [String] raw HTML of a results page
# @return [Array<String>] href values of the result-title anchors
def collect_links_on_page(page)
  html = Nokogiri::HTML.parse(page)
  anchors = html.css(".cj-search-result-item-title").css("a")
  anchors.map do |anchor|
    anchor['href']
  end
end
#crawl ⇒ Object
Run the crawler
25 26 27 28 29 30 31 32 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 25

# Run the crawler: walk every results page and parse each listing found.
def crawl
  1.upto(get_page_count) do |page_num|
    parse_listings(collect_links_on_page(get_next_page(page_num)))
  end
end
#gen_json ⇒ Object
87 88 89 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 87

# Delegate JSON generation to the reporter.
def gen_json
  @reporter.gen_json
end
#get_next_page(page_num) ⇒ Object
Get the next page
65 66 67 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 65

# Fetch the results page for the given page number.
#
# @param page_num [Integer] 1-based results page index
def get_next_page(page_num)
  get_page(get_next_page_url(page_num))
end
#get_next_page_url(page_num) ⇒ Object
Get the URL for the next page
44 45 46 47 48 49 50 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 44

# Build the URL for the given results page.
# When @base_url already carries a keywords query, pagination params are
# appended with "&"; otherwise the base ends in "?" and needs no separator.
#
# @param page_num [Integer] 1-based results page index
# @return [String] full paginated search URL (25 results per page)
def get_next_page_url(page_num)
  joiner = @base_url.include?("keywords") ? "&" : ""
  "#{@base_url}#{joiner}PAGE=#{page_num}&limit=25"
end
#get_page(url) ⇒ Object
Get the page
53 54 55 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 53

# Fetch a page, delegating retry handling to FailureHandler#get_retry
# (starting at attempt 0).
#
# @param page_url [String] URL to fetch
def get_page(page_url)
  get_retry(page_url, @requests, 0)
end
#get_page_count ⇒ Object
Get the correct total # of pages
58 59 60 61 62 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 58

# Work out the total number of results pages from the first page's
# "#viewing" element (text shaped like "1 - 25 of 1,234" — TODO confirm),
# at 25 results per page.
#
# @return [Integer] page count, rounded up
def get_page_count
  first_page = Nokogiri::HTML.parse(get_next_page(1))
  viewing_text = first_page.css("#viewing").text
  total_results = viewing_text.split(" of ")[1].gsub(",", "").to_i
  (total_results / 25.0).ceil
end
#parse_listings(listings) ⇒ Object
Parse the listings on the page
76 77 78 79 80 81 82 83 84 85 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 76

# Parse the listings on the page and report the successful results.
# Listings whose parser returns a falsy value are dropped.
#
# @param listings [Array<String>] listing URLs collected from a results page
def parse_listings(listings)
  # map + compact replaces manual each/push accumulation.
  found_listings = listings.map do |listing|
    ClearanceJobsComParser.new(listing, get_page(listing), @requests).parse
  end.compact
  # listings.first identifies the batch for the reporter (nil when empty).
  @reporter.report_results(found_listings, listings.first)
end
#set_base_url ⇒ Object
Build and return the base URL for the search (also assigns @base_url)
35 36 37 38 39 40 41 |
# File 'lib/clearancejobscom/clearance_jobs_com_crawler.rb', line 35

# Build and assign the base search URL. With no search term it is the bare
# jobs listing; otherwise the term is CGI-escaped into a keywords query.
#
# @return [String] the base URL (also stored in @base_url)
def set_base_url
  # .nil? instead of the == nil anti-idiom; single assignment expression.
  @base_url = if @search_term.nil?
                "https://www.clearancejobs.com/jobs?"
              else
                "https://www.clearancejobs.com/jobs?keywords=" + CGI.escape(@search_term)
              end
end