Class: Flatfish::Page

Inherits:

ActiveRecord::Base

Object
ActiveRecord::Base
Flatfish::Page

show all

Extended by: Url

Defined in: lib/flatfish/page.rb

Instance Attribute Summary collapse

#cd ⇒ Object

Returns the value of attribute cd.
#data ⇒ Object readonly

Returns the value of attribute data.
#url ⇒ Object readonly

Returns the value of attribute url.

Instance Method Summary collapse

#get_media(url) ⇒ Object

TODO replace w/ find_or_create.
#load_html ⇒ Object

load html from local or web.
#prep ⇒ Object
#process ⇒ Object
#read_in_blob(url) ⇒ Object

read in blob.
#setup(csv, config, schema, host) ⇒ Object

Setup - unpack the vars for the web page to be scraped.
#update_hrefs(css_selector) ⇒ Object

Processes link tags: absolutifies each href and passes media links on for tokenization.
#update_imgs(css_selector) ⇒ Object

Processes image tags: absolutifies image sources and passes internal ones on for tokenization.

Methods included from Url

absolutify, open_url

Instance Attribute Details

#cd ⇒ `Object`

Returns the value of attribute cd.



12
13
14

# File 'lib/flatfish/page.rb', line 12

# Reader for the @cd instance variable.
#
# @return [Object] the current value of @cd
def cd
  @cd
end

#data ⇒ `Object` (readonly)

Returns the value of attribute data.



11
12
13

# File 'lib/flatfish/page.rb', line 11

# Reader for the @data instance variable.
#
# @return [Object] the current value of @data
def data
  @data
end

#url ⇒ `Object` (readonly)

Returns the value of attribute url.



11
12
13

# File 'lib/flatfish/page.rb', line 11

# Reader for the @url instance variable.
#
# @return [Object] the current value of @url
def url
  @url
end

Instance Method Details

#get_media(url) ⇒ `Object`

TODO replace w/ find_or_create

# File 'lib/flatfish/page.rb', line 124

# Looks up the Flatfish::Media record for +url+, creating it when none exists.
# A newly created record has its contents filled from read_in_blob(url).
#
# TODO replace w/ find_or_create
#
# @param url [String] the media url used for lookup and creation
# @return [Object] the existing or newly created Flatfish::Media record
def get_media(url)
  existing = Flatfish::Media.find_by_url(url)
  return existing unless existing.nil?

  Flatfish::Media.create(:url => url) { |record| record.contents = read_in_blob(url) }
end

#load_html ⇒ `Object`

load html from local or web

# File 'lib/flatfish/page.rb', line 50

# Loads the page markup into @doc, preferring a local copy when one exists.
#
# The local path is @local_source followed by @url with @host stripped.
# A local file is used only when a local source is configured, the url is
# not the host itself, and the file exists; it is parsed with Nokogiri::XML.
# Otherwise the page is fetched via Flatfish::Url.open_url and parsed with
# Nokogiri::HTML.
#
# @return [Object] the parsed document that was stored in @doc
def load_html
  # Build the candidate path only when a local source is configured;
  # concatenating onto nil would raise before any nil check could run.
  file = @local_source.nil? ? nil : @local_source + @url.sub(@host, '')

  if file && (@url != @host) && File.exist?(file)
    # NOTE(review): local copies are parsed as XML while fetched pages are
    # parsed as HTML - confirm this difference is intentional.
    # Block form guarantees the handle is closed even if parsing raises.
    @doc = File.open(file) { |f| Nokogiri::XML(f) }
  else
    html = Flatfish::Url.open_url(@url)
    @doc = Nokogiri::HTML(html)
  end
end

#prep ⇒ `Object`

# File 'lib/flatfish/page.rb', line 62

# Builds the data hash for this page from the loaded document and stores it
# in @data.
#
# Each entry of @fields is a '&&'-separated list of css selectors (or -1 to
# skip the column); the markup matched by every selector is concatenated
# under the schema name at the same index. Links and images under each
# selector are rewritten first via update_hrefs / update_imgs.
#
# @return [Hash] @data: 'url', 'title', 'path' plus one key per schema column
def prep
  # prefer the title supplied via csv, fall back to the document's title element
  @title = @doc.title if @title.nil?

  # field => data
  extracted = {}
  @fields.each_with_index do |selectors, idx|
    next if selectors == -1

    column = @schema[idx]
    extracted[column] = ''
    selectors.split('&&').each do |selector|
      update_hrefs(selector)
      update_imgs(selector)

      matched = @doc.css(selector)
      markup =
        if matched.nil?
          ''
        else
          # sub tokens and gnarly MS Quotes
          matched.to_s
                 .gsub("%5BFLATFISH", '[')
                 .gsub("FLATFISH%5D", ']')
                 .gsub(/[”“]/, '"')
                 .gsub(/[‘’]/, "'")
        end
      extracted[column] += markup
    end
  end

  @data = { 'url' => @url, 'title' => @title, 'path' => @path }
  @data.merge!(extracted)
end

#process ⇒ `Object`

# File 'lib/flatfish/page.rb', line 44

# Loads the page markup, then assigns the hash produced by prep to this
# record's attributes.
#
# @return [Object] the value returned by prep
def process
  load_html
  page_data = prep
  self.attributes = page_data
end

#read_in_blob(url) ⇒ `Object`

read in blob

# File 'lib/flatfish/page.rb', line 135

# Reads the raw contents behind +url+.
#
# When @local_source is set, @host in the url is swapped for @local_source
# and, if that path is a regular file, its bytes are returned. In every
# other case the escaped url is fetched through Flatfish::Url.open_url.
#
# @param url [String] absolute url of the resource
# @return [Object] the file bytes, or whatever Flatfish::Url.open_url returns
def read_in_blob(url)
  unless @local_source.nil?
    # assume local file; block form keeps backslashes in @local_source from
    # being interpreted as back-references by String#sub
    file = url.sub(@host) { @local_source }
    # File.file? (rather than exist?) so a directory is never read as a blob
    return File.binread(file) if File.file?(file)
  end

  # URI.escape was removed in Ruby 3.0; the parser's escape is the same routine
  Flatfish::Url.open_url(URI::DEFAULT_PARSER.escape(url))
end

#setup(csv, config, schema, host) ⇒ `Object`

Setup - unpack the vars for the web page to be scraped

csv - an array with all of the page-specific config; it has some key details (where to save images, etc.) that the page has to know. schema - dynamic column headers.

# File 'lib/flatfish/page.rb', line 19

# Unpacks the per-page settings into instance variables and sets the
# credentials on Flatfish::Url.
#
# @param csv [Array] url, path and title in the first three slots, followed
#   by one selector string (or nil) per schema column; selector strings are
#   stripped in place
# @param config [Hash] read for 'local_source', 'basic_auth_user' and
#   'basic_auth_pass'
# @param schema [Object] stored in @schema
# @param host [String] stored in @host
# @return [Hash] the credentials hash assigned to Flatfish::Url.creds
def setup(csv, config, schema, host)
  # parse the csv
  @url = csv[0]
  @path = csv[1]
  @title = csv[2]

  @fields = []
  csv[3..-1].each do |field|
    # nil columns are flagged with -1; everything else is stripped in place
    @fields << (field.nil? ? -1 : field.tap(&:strip!))
  end

  # current directory, we want http://example.com/about/ or http://example.com/home/
  @cd = @url.end_with?('/') ? @url : @url[0..@url.rindex('/')]
  @schema = schema
  @host = host
  @local_source = config['local_source']

  # handle url == host, fix mangled @cd
  @cd = @url + '/' if @url == @host

  auth_pair = [config['basic_auth_user'], config['basic_auth_pass']]
  Flatfish::Url.creds = { :http_basic_authentication => auth_pair }
end

#update_hrefs(css_selector) ⇒ `Object`

Processes link tags: absolutifies each href and passes media links on for tokenization.

# File 'lib/flatfish/page.rb', line 93

# Rewrites the href of every anchor found under +css_selector+ in @doc.
#
# Each href is made absolute against @cd. Links that contain @host and end
# in one of the supported document extensions are fetched via get_media and
# replaced by a "[FLATFISHmedia:<id>FLATFISH]" token; all other links are
# left as the absolutified url. Anchors without an href are skipped.
#
# @param css_selector [String] selector whose descendant anchors are processed
# @return [Object] the collection returned by @doc.css
def update_hrefs(css_selector)
  #TODO finalize list of supported file types
  valid_exts = ['.doc', '.docx', '.pdf', '.pptx', '.ppt', '.xls', '.xlsx']
  # escape the host so '.' only matches a literal dot, not any character
  host_pattern = /#{Regexp.escape(@host.to_s)}/

  @doc.css(css_selector + ' a').each do |a|
    # named anchors and similar carry no href; nothing to rewrite
    next if a['href'].nil?

    href = Flatfish::Url.absolutify(a['href'], @cd)
    if href =~ host_pattern && valid_exts.include?(File.extname(href))
      media = get_media(href)
      href = "[FLATFISHmedia:#{media.id}FLATFISH]"
    end
    a['href'] = href
  end
end

#update_imgs(css_selector) ⇒ `Object`

Processes image tags: absolutifies image sources and passes internal ones on for tokenization.

# File 'lib/flatfish/page.rb', line 109

# Tokenizes the src of internal images found under +css_selector+ in @doc.
#
# Each src is made absolute against @cd; when the result contains @host the
# image is fetched via get_media and its src replaced by a
# "[FLATFISHmedia:<id>FLATFISH]" token. Images without a src, and images
# whose absolutified src does not contain @host, are left unchanged.
#
# @param css_selector [String] selector whose descendant images are processed
# @return [Object] the collection returned by @doc.css
def update_imgs(css_selector)
  # escape the host so '.' only matches a literal dot, not any character
  host_pattern = /#{Regexp.escape(@host.to_s)}/

  @doc.css(css_selector + ' img').each do |img|
    next if img['src'].nil?

    # absolutify and tokenize our images
    src = Flatfish::Url.absolutify(img['src'], @cd)
    next unless src =~ host_pattern

    # get_media reuses an existing record when one already exists
    media = get_media(src)
    img['src'] = "[FLATFISHmedia:#{media.id}FLATFISH]"
  end
end

Class: Flatfish::Page

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Url

Instance Attribute Details

#cd ⇒ Object

#data ⇒ Object (readonly)

#url ⇒ Object (readonly)

Instance Method Details

#get_media(url) ⇒ Object

#load_html ⇒ Object

#prep ⇒ Object

#process ⇒ Object

#read_in_blob(url) ⇒ Object

#setup(csv, config, schema, host) ⇒ Object

#update_hrefs(css_selector) ⇒ Object

#update_imgs(css_selector) ⇒ Object