Class: CraigScrape::Listings

Inherits:

Scraper

Object
Scraper
CraigScrape::Listings

show all

Defined in: lib/listings.rb

Overview

Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing

Constant Summary collapse

LABEL =

/^(.+?)[ ]*[\-]?$/

LOCATION =

/^[ ]*\((.*?)\)$/

IMG_TYPE =

/^[ ]*(.+)[ ]*$/

# Matches a listing date header such as " Sat Oct 10 " (case-insensitive).
# Capture 1 is the three-letter month abbreviation, capture 2 the day of the month.
# All twelve months are listed; "Oct" was previously missing from the alternation.
HEADER_DATE =

/^[ ]*(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[ ]+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[ ]+([0-9]{1,2})[ ]*$/i

SUMMARY_DATE =

/^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/

NEXT_PAGE_LINK =

/^[ ]*(?:next [\d]+ postings|Next \>\>)[ ]*$/

XPATH_POST_DATE =

"*[@class='itemdate']"

XPATH_POST_IMGPIC =

"*[@class='itempx']/*[@class='p']"

XPATH_PAGENAV_LINKS =

"//*[@class='ban']//a"

XPATHS_POST_PRICE = There are a couple of places where the price can appear. We search them in this order:

["*[@class='itempp']", "*[@class='itemph']"]

Constants inherited from Scraper

Scraper::HTML_ENCODING, Scraper::HTML_TAG, Scraper::HTTP_HEADERS, Scraper::URL_PARTS

Instance Attribute Summary

Attributes inherited from Scraper

#url

Instance Method Summary collapse

#next_page ⇒ Object

Returns a Listings object of the next_page_url on the current listings object.
#next_page_href ⇒ Object

String, URL Path href-fragment of the next page link.
#next_page_url ⇒ Object

String, Full URL Path of the ‘next page’ link.
#posts ⇒ Object

Array, PostSummary objects found in the listing.

Methods inherited from Scraper

#attributes, #downloaded?, #initialize, #uri

Constructor Details

This class inherits a constructor from CraigScrape::Scraper

Instance Method Details

#next_page ⇒ `Object`

Returns a Listings object of the next_page_url on the current listings object



116
117
118

# File 'lib/listings.rb', line 116

# Returns a Listings object built from next_page_url, or nil when the current
# listing has no next page.
#
# The URL is escaped before being handed to Listings.new. URI.encode (an alias
# of URI.escape) was removed in Ruby 3.0, so the escaping is done through the
# RFC 2396 parser instead, which applies the same rules URI.encode used.
def next_page
  target = next_page_url
  return nil unless target

  # Newer uri releases expose the RFC 2396 parser under its own constant;
  # older ones only provide it as URI::DEFAULT_PARSER.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  CraigScrape::Listings.new parser.escape(target)
end

#next_page_href ⇒ `Object`

String, URL Path href-fragment of the next page link

# File 'lib/listings.rb', line 72

# String, URL path href-fragment of the 'next page' link, or nil when no such
# link can be located. A found href is cached in @next_page_href.
def next_page_href
  return @next_page_href if @next_page_href

  if html.at_xpath(XPATH_PAGENAV_LINKS)
    # Newer page layout: pick the navigation link whose text reads as 'next page'.
    anchor = html.xpath(XPATH_PAGENAV_LINKS).find { |candidate| NEXT_PAGE_LINK.match candidate.content }
  else
    # Older page layout.
    # Category listings keep their 'next 100 postings' link inside the final p tag.
    last_para = html.at 'p:last-of-type'
    trailing = last_para && last_para.at('a')
    anchor = trailing if trailing && NEXT_PAGE_LINK.match(trailing.inner_html)

    # Search listings place their next-page link near the top of the document.
    anchor ||= (html / 'a').find { |a| he_decode(a.inner_html) == '<b>Next>></b>' }

    # Some search pages omit the 'next page' link even though the page-number
    # block shows a further page. In that case take the element following the
    # bolded current page number (nil when nothing follows it).
    unless anchor
      current_page = html % 'div.sh:first-of-type > b:last-of-type'
      anchor = current_page.next_element if current_page && /^[\d]+$/.match(current_page.inner_html)
    end
  end

  # An anchor was found, so remember its href.
  @next_page_href = anchor[:href] if anchor

  @next_page_href
end

#next_page_url ⇒ `Object`

String, Full URL Path of the ‘next page’ link



111
112
113

# File 'lib/listings.rb', line 111

# String, full URL of the 'next page' link, built by passing next_page_href
# through url_from_href. Returns nil when there is no next-page href.
def next_page_url
  return nil unless next_page_href

  url_from_href(next_page_href)
end

#posts ⇒ `Object`

Array, PostSummary objects found in the listing

# File 'lib/listings.rb', line 25

# Array of CraigScrape::Posting objects built from the summaries found in the
# listing. The array is built on first call and cached in @posts.
#
# Calls parse_error! when a parsed summary lacks a non-empty label or href.
def posts
  return @posts if @posts

  current_date = nil
  @posts = []

  # Only p and h4 tags matter, so every element is collected and filtered by name.
  post_tags = html.search('*').select { |node| /^(?:p|h4)$/i.match node.name }

  # The final p is sometimes a 'next XXX postings' link rather than a post; drop it.
  tail_anchor = post_tags.last.at('a') unless post_tags.empty?
  post_tags.pop if tail_anchor && NEXT_PAGE_LINK.match(tail_anchor.inner_html)

  # Walk the tags in document order: h4 tags set the date applied to the p tags after them.
  post_tags.each do |el|
    tag_name = el.name

    if tag_name == 'p'
      summary = parse_summary(el, current_date)

      # Both the label and the href are required and must be non-empty.
      required = [summary[:label], summary[:href]]
      parse_error! unless required.all? { |field| field && field.length > 0 }

      summary[:url] = url_from_href(summary[:href])

      @posts << CraigScrape::Posting.new(summary)
    elsif tag_name == 'h4'
      header = HEADER_DATE.match(he_decode(el.inner_html))

      if header
        # The usual case: the h4 holds a valid date (month and day captures).
        current_date = most_recent_date(header[1], header[2])
      elsif html.at('h4:last-of-type') == el
        # Craigslist sometimes emits h4 tags with nothing relevant inside them.
        # They are ignored unless this is the last h4 on the page; in that case
        # the date is cleared so that it has to be read from the full post instead.
        current_date = nil
      end
    end
  end

  @posts
end