Class: ACLUScraper

Inherits: Object

Inheritance hierarchy: Object → ACLUScraper

show all

Defined in: lib/acluscraper.rb

Instance Method Summary collapse

#initialize(url) ⇒ ACLUScraper constructor

A new instance of ACLUScraper.
#scrapeCase ⇒ Object

Get all the case documents.

Constructor Details

#initialize(url) ⇒ `ACLUScraper`

Returns a new instance of ACLUScraper.

# File 'lib/acluscraper.rb', line 7

# Sets up a scraper for one page.
#
# @param url location that scrapeCase later passes to `open`
def initialize(url)
  # Accumulates one hash per document that scrapeCase processes successfully.
  @casearray = []
  @url = url
end

Instance Method Details

#scrapeCase ⇒ `Object`

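Get all the case documents: download every document linked from the page's tables, extract its metadata and text, and return the collected records as a pretty-printed JSON string.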

# File 'lib/acluscraper.rb', line 13

# Scrapes the page at @url and collects every document linked from its tables.
#
# For each table row containing a link this records the filing date, the link
# text and an absolute URL, downloads the file into public/uploads with wget,
# and asks UploadConvert for the PDF metadata and text. Documents whose
# extraction raises are reported on stderr and left out of the result.
#
# @return [String] pretty-printed JSON of all document hashes in @casearray
def scrapeCase
  # NOTE(review): Kernel#open on a URL relies on open-uri and treats a leading
  # "|" as a command to run, so @url must be trusted. URI.open is the safer
  # spelling on Rubies that provide it - confirm the target Ruby version.
  html = Nokogiri::HTML(open(@url))
  prevdate = ""

  html.css("tbody").each do |table_body|
    table_body.css("tr").each do |row|
      links = row.css("a")
      next if links.empty?

      dochash = {}

      # A date cell holding only a non-breaking space reuses the date of the
      # most recent row that had one.
      date_text = row.css("td")[0].text
      prevdate = date_text.to_s unless date_text == "\u00a0"
      dochash[:date] = prevdate

      dochash[:title] = links.text

      # An anchor without an href gives us nothing to download.
      href = links[0]["href"].to_s.strip
      next if href.empty?

      # Only hrefs that actually begin with a scheme are absolute; everything
      # else is resolved against the ACLU site root.
      dochash[:url] =
        if href.start_with?("http://", "https://")
          href
        else
          "https://www.aclu.org" + href
        end

      # Download the document. The argument-list form of system bypasses the
      # shell, so characters in the scraped URL cannot inject commands.
      unless system("wget", "-P", "public/uploads", dochash[:url])
        warn "ACLUScraper: wget failed for #{dochash[:url]}"
      end
      dochash[:path] = "public/uploads/" + dochash[:url].split("/").last.chomp.strip

      # Extract metadata and text; a failure skips this document only.
      begin
        converter = UploadConvert.new(dochash[:path])
        converter.extractMetadataPDF.each { |key, value| dochash[key] = value }
        dochash[:text] = converter.detectPDFType
        @casearray.push(dochash)
      rescue StandardError => e
        warn "ACLUScraper: skipping #{dochash[:url]}: #{e.class}: #{e.message}"
      end
    end
  end

  JSON.pretty_generate(@casearray)
end

Class: ACLUScraper

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ ACLUScraper

Instance Method Details

#scrapeCase ⇒ Object

#initialize(url) ⇒ `ACLUScraper`

#scrapeCase ⇒ `Object`