Class: CL_Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/craigslist_scraper.rb

Constant Summary collapse

# Pool of browser User-Agent header values. One entry is picked at random for
# each page fetch so requests do not all present the same browser signature.
# Frozen so the shared pool cannot be mutated by accident at runtime.
USER_AGENT = [
  "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
  "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1"
].freeze
PER_PAGE =

Craigslist paginates search results at 120 items per page. (The USER_AGENT pool above lets requests appear to come from different browsers on a subnet.)

120

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ CL_Scraper

Returns a new instance of CL_Scraper.



9
10
11
12
13
14
# File 'lib/craigslist_scraper.rb', line 9

# Sets up an empty scraper.
#
# @param url [String, nil] base URL of the site to scrape; when nil (or false)
#   @url is left unset
def initialize(url)
  @url = url if url
  @all = []              # one hash per scraped listing
  @menu_hash = {}        # top-level category label => URL
  @submenu_hash = {}     # second-level menu data
end

Instance Attribute Details

#all ⇒ Object

Returns the value of attribute all.



7
8
9
# File 'lib/craigslist_scraper.rb', line 7

# Reader for @all, the array that starts empty in the constructor and that
# scrape_page appends one listing hash to per result row.
def all
  @all
end

#menu_hash ⇒ Object

Returns the value of attribute menu_hash.



7
8
9
# File 'lib/craigslist_scraper.rb', line 7

# Reader for @menu_hash, the hash that scrape_for_sale_categories fills with
# category label => URL entries.
def menu_hash
  @menu_hash
end

#submenu_hash ⇒ Object

Returns the value of attribute submenu_hash.



7
8
9
# File 'lib/craigslist_scraper.rb', line 7

# Reader for @submenu_hash, the hash that scrape_second_level_menus clears and
# rebuilds on every call.
def submenu_hash
  @submenu_hash
end

Instance Method Details

#info_to_sym(attribute) ⇒ Object



88
89
90
91
# File 'lib/craigslist_scraper.rb', line 88

# Derives a hash key from an attribute node: takes the first
# whitespace-separated word of the node's first child text, drops any colons
# and converts it to a Symbol (e.g. "condition: " becomes :condition).
#
# @param attribute [#children] node whose first child responds to #text
# @return [Symbol]
def info_to_sym(attribute)
  first_word = attribute.children[0].text.split(" ").first
  first_word.delete(":").to_sym
end

#noko_page(page = @url) ⇒ Object



93
94
95
# File 'lib/craigslist_scraper.rb', line 93

# Fetches +page+ and parses it with Nokogiri.
#
# A User-Agent header is drawn at random from USER_AGENT for every request.
# The fetch goes through URI.open (open-uri) instead of Kernel#open: since
# Ruby 3.0 Kernel#open no longer handles URLs, and Kernel#open would also run
# a string starting with "|" as a shell command.
#
# @param page [String] URL to fetch; defaults to @url
# @return the document produced by Nokogiri::HTML
def noko_page(page = @url)
  Nokogiri::HTML(URI.open(page, 'User-Agent' => USER_AGENT.sample))
end

#scrape_by_pid(pid_link) ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/craigslist_scraper.rb', line 64

# Scrapes a single listing page and returns its details as a hash.
#
# Always present: :postingbody and :timeago (nil when the page has no
# .timeago element). Each ".attrgroup span" contributes one more entry:
# :year, :other_ads, :venue_date, or a key derived by info_to_sym.
#
# @param pid_link [String] URL of the listing page
# @return [Hash{Symbol => String, nil}]
def scrape_by_pid(pid_link)
  puts "Scraping #{pid_link}"
  listing = noko_page(pid_link)
  item_info = { postingbody: listing.search("#postingbody").text }

  listing.search(".attrgroup span").each do |attribute|
    label = attribute.children[0]
    value = attribute.children[1]

    if value.nil?
      # Special case: a lone child has no associated attrgroup identifier.
      item_info[:year] = label.text
    elsif value.text == "\nmore ads  by this user        "
      # Read the link from this span only, not from the first anchor found
      # anywhere in the attribute group.
      item_info[:other_ads] = attribute.search("a").attribute("href").text
    elsif label.text == "\n                        "
      item_info[:venue_date] = value.text
    else
      item_info[info_to_sym(attribute)] = value.text
    end
  end

  # Safe navigation: a listing without a .timeago element yields nil
  # instead of raising NoMethodError.
  item_info[:timeago] = listing.search(".timeago").first&.text
  item_info
end

#scrape_category(category) ⇒ Object



37
38
39
40
41
42
43
44
45
46
47
# File 'lib/craigslist_scraper.rb', line 37

# Scrapes every result page of a category via scrape_page.
#
# The "s" query parameter is an item offset in steps of PER_PAGE, so the
# first page is offset 0. Offsets therefore run 0, PER_PAGE, 2*PER_PAGE, ...
# up to the last page that can contain a listing; starting at PER_PAGE would
# skip the first page and request one page past the end.
#
# @param category [String] URL of the category search page
# @return [nil]
def scrape_category(category)
  listings = noko_page(category)
  # nil.to_i is 0, so a page without a .totalcount element counts as empty
  # instead of raising.
  num_listings = listings.search(".totalcount").first&.text.to_i
  # Ceiling division; the first page is always visited at least once.
  page_total = [(num_listings + PER_PAGE - 1) / PER_PAGE, 1].max

  page_total.times do |page_index|
    scrape_page("#{category}?s=#{page_index * PER_PAGE}")
    sleep rand(5..8) # Sleep to help avoid CL API from banning IP!
  end
  nil
end

#scrape_for_sale_categories ⇒ Object



16
17
18
19
20
# File 'lib/craigslist_scraper.rb', line 16

# Reads the links under "#center #sss" on the page at @url and records each
# one in @menu_hash as link text => @url + href.
#
# @return [Hash{String => String}] @menu_hash itself, after the update
def scrape_for_sale_categories
  noko_page.search("#center #sss a").each do |link|
    label = link.children.text
    @menu_hash[label] = @url + link.attribute("href").text
  end
  @menu_hash
end

#scrape_page(page_url) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/craigslist_scraper.rb', line 49

# Scrapes one page of search results and appends a hash per result row to
# @all.
#
# Each hash has :pid, :link and :title; :price is added when the row has a
# .result-price element and :location when the .result-hood text is
# non-empty.
#
# @param page_url [String] URL of the results page
# @return the collection of result rows that was iterated
def scrape_page(page_url)
  puts "Scraping #{page_url}"
  noko_page(page_url).search(".rows .result-row").each do |item|
    item_info = {}
    item_info[:pid] = item.attribute("data-pid").text
    item_info[:link] = item.search("a")[1].attribute("href").text

    price_node = item.search(".result-price").first
    # Strip the thousands separator as well as the dollar sign;
    # "1,200".to_i would otherwise stop at the comma and give 1.
    item_info[:price] = price_node.text.delete("$,").to_i if price_node

    item_info[:title] = item.search(".result-title").text.downcase

    hood = item.search(".result-info .result-meta .result-hood").text
    item_info[:location] = hood unless hood.empty?

    @all << item_info
  end
end

#scrape_second_level_menus(main_category) ⇒ Object



22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/craigslist_scraper.rb', line 22

# Rebuilds @submenu_hash from the page at +main_category+.
#
# Every "h3" text (downcased) becomes a key whose value is a hash with one
# entry per ".ul" match: first link text => first link href.
#
# NOTE(review): every header receives entries from every ".ul" match, and
# only the first link of each match is used. Confirm that this is intended.
#
# @param main_category [String] URL of the category page
# @return [Hash] @submenu_hash itself, after the rebuild
def scrape_second_level_menus(main_category)
  @submenu_hash.clear
  page = noko_page(main_category)
  lists = page.search(".ul")
  page.search("h3").each do |header|
    key = header.text.downcase
    entries = (@submenu_hash[key] = {})
    lists.each do |list|
      anchors = list.search("a")
      entries[anchors[0].text] = anchors.attribute("href").text
    end
  end
  @submenu_hash
end