Class: Flatfish::Page
- Inherits:
-
ActiveRecord::Base
- Object
- ActiveRecord::Base
- Flatfish::Page
- Extended by:
- Url
- Defined in:
- lib/flatfish/page.rb
Instance Attribute Summary collapse
-
#cd ⇒ Object
Returns the value of attribute cd.
-
#data ⇒ Object
readonly
Returns the value of attribute data.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
-
#get_media(url) ⇒ Object
TODO replace w/ find_or_create.
-
#load_html ⇒ Object
load html from local or web.
- #prep ⇒ Object
- #process ⇒ Object
-
#read_in_blob(url) ⇒ Object
read in blob.
-
#setup(csv, config, schema, host) ⇒ Object
Setup - unpack the vars for the web page to be scraped.
-
#update_hrefs(css_selector) ⇒ Object
Processes link tags: absolutifies their hrefs and passes media links on for tokenization.
-
#update_imgs(css_selector) ⇒ Object
Processes image tags: absolutifies image sources and passes internal ones on for tokenization.
Methods included from Url
Instance Attribute Details
#cd ⇒ Object
Returns the value of attribute cd.
12 13 14 |
# File 'lib/flatfish/page.rb', line 12 def cd @cd end |
#data ⇒ Object (readonly)
Returns the value of attribute data.
11 12 13 |
# File 'lib/flatfish/page.rb', line 11 def data @data end |
#url ⇒ Object (readonly)
Returns the value of attribute url.
11 12 13 |
# File 'lib/flatfish/page.rb', line 11 def url @url end |
Instance Method Details
#get_media(url) ⇒ Object
TODO replace w/ find_or_create
124 125 126 127 128 129 130 131 132 |
# File 'lib/flatfish/page.rb', line 124

# Look up the Flatfish::Media record stored for +url+, creating it when none
# exists yet. A newly created record gets its contents from read_in_blob(url).
# TODO replace w/ find_or_create
#
# @param url the media URL (callers pass an already absolutified URL)
# @return the existing or newly created Flatfish::Media record
def get_media(url)
  existing = Flatfish::Media.find_by_url(url)
  return existing unless existing.nil?

  Flatfish::Media.create(:url => url) do |record|
    record.contents = read_in_blob(url)
  end
end
#load_html ⇒ Object
load html from local or web
50 51 52 53 54 55 56 57 58 59 60 |
# File 'lib/flatfish/page.rb', line 50

# Load the page into @doc, from a local mirror when available, else the web.
#
# The local copy is used only when @local_source is configured, the page is
# not the host root itself (its local path would be the mirror directory),
# and the mapped file exists. Otherwise the page is fetched through
# Flatfish::Url.open_url.
#
# @return the parsed document (also stored in @doc)
def load_html
  # Build the local path only when a mirror is configured; concatenating onto
  # a nil @local_source used to raise before the nil check could run.
  file = @local_source + @url.sub(@host, '') unless @local_source.nil?

  if file && @url != @host && File.exist?(file)
    # Block form closes the handle even if parsing raises.
    # NOTE(review): local copies are parsed with Nokogiri::XML while remote
    # pages use Nokogiri::HTML - kept as-is; confirm this is intentional.
    @doc = File.open(file) { |f| Nokogiri::XML(f) }
  else
    html = Flatfish::Url.open_url(@url)
    @doc = Nokogiri::HTML(html)
  end
end
#prep ⇒ Object
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
# File 'lib/flatfish/page.rb', line 62

# Build @data, the hash of values for this page.
#
# For every schema column the matching CSV cell holds one or more CSS
# selectors joined by '&&'. Each selector has its links and images rewritten
# (update_hrefs / update_imgs), then the matched markup is appended to that
# column's string.
#
# @return [Hash] 'url', 'title' and 'path' merged with one entry per schema column
def prep
  # the title from the csv wins; the document's title is only a fallback
  @title = @doc.title if @title.nil?

  # column name => concatenated markup
  extracted = {}
  @fields.each_with_index do |selectors, idx|
    next if selectors == -1 # -1 is the flag setup stores for an empty cell

    column = @schema[idx]
    extracted[column] = ''
    selectors.split('&&').each do |selector|
      update_hrefs(selector)
      update_imgs(selector)

      nodes = @doc.css(selector)
      markup =
        if nodes.nil?
          ''
        else
          # sub tokens and gnarly MS Quotes
          nodes.to_s
               .gsub("%5BFLATFISH", '[')
               .gsub("FLATFISH%5D", ']')
               .gsub(/[”“]/, '"')
               .gsub(/[‘’]/, "'")
        end
      extracted[column] += markup
    end
  end

  @data = { 'url' => @url, 'title' => @title, 'path' => @path }
  @data.merge!(extracted)
end
#process ⇒ Object
44 45 46 47 |
# File 'lib/flatfish/page.rb', line 44

# Load the page, build its data hash with prep, and assign that hash to the
# record's attributes.
#
# @return [Hash] the hash produced by prep
def process
  load_html
  scraped = prep
  self.attributes = scraped
end
#read_in_blob(url) ⇒ Object
read in blob
135 136 137 138 139 140 141 142 143 144 145 |
# File 'lib/flatfish/page.rb', line 135

# Read in the raw bytes behind +url+.
#
# When @local_source is configured and the URL maps onto an existing file
# there (the @host prefix swapped for @local_source), that file is read in
# binary mode. Otherwise the escaped URL is fetched through
# Flatfish::Url.open_url.
#
# @param url [String] absolute URL of the resource
# @return the file's bytes, or whatever Flatfish::Url.open_url returns
def read_in_blob(url)
  unless @local_source.nil?
    # assume local file; block form keeps backslashes in the path literal
    file = url.sub(@host) { @local_source }
    # The path is a String, so it is read via File; binread keeps binary
    # media (pdf, images, ...) byte-exact.
    return File.binread(file) if File.exist?(file)
  end

  # URI.escape was removed in Ruby 3.0; the default parser still provides it.
  Flatfish::Url.open_url(URI::DEFAULT_PARSER.escape(url))
end
#setup(csv, config, schema, host) ⇒ Object
Setup - unpack the vars for the web page to be scraped
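csv - an array with all of the page-specific values (url, path, title, then the selector cells). config - has some key details (where to save images, etc.) that the page has to know. schema - dynamic column headers. host - the site's root URL, which each page URL is compared against.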
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/flatfish/page.rb', line 19

# Setup - unpack the vars for the web page to be scraped.
#
# @param csv    [Array] one row: url, path, title, then one cell of CSS
#   selectors per schema column (nil for an empty cell)
# @param config [Hash] reads 'local_source', 'basic_auth_user' and
#   'basic_auth_pass'
# @param schema [Array] dynamic column headers; prep pairs them with the
#   selector cells by index
# @param host   [String] the site's root URL
# @return [Hash] the credentials hash handed to Flatfish::Url.creds=
def setup(csv, config, schema, host)
  # parse the csv
  @url, @path, @title = csv[0], csv[1], csv[2]

  # Array() tolerates short rows (csv[3..-1] is nil when the row has fewer
  # than three cells). strip instead of strip! leaves the caller's row
  # untouched. -1 flags an empty cell so prep can skip that column.
  @fields = Array(csv[3..-1]).map { |field| field.nil? ? -1 : field.strip }

  @schema = schema
  @host = host
  @local_source = config['local_source']

  # current directory, we want http://example.com/about/ or http://example.com/home/
  @cd =
    if @url == @host
      # url == host: slicing at the last '/' would leave only the scheme
      @url + '/'
    elsif @url[-1, 1] == '/'
      @url
    else
      @url.slice(0..@url.rindex('/'))
    end

  Flatfish::Url.creds = { :http_basic_authentication => [config['basic_auth_user'], config['basic_auth_pass']] }
end
#update_hrefs(css_selector) ⇒ Object
Processes link tags: absolutifies their hrefs and passes media links on for tokenization
93 94 95 96 97 98 99 100 101 102 103 104 105 |
# File 'lib/flatfish/page.rb', line 93

# Processes link tags under +css_selector+: every href is made absolute
# against @cd, and links that sit on @host and end in a supported document
# extension are replaced by a "[FLATFISHmedia:<id>FLATFISH]" token pointing
# at the stored media record.
#
# @param css_selector [String] selector whose descendant <a> tags are rewritten
def update_hrefs(css_selector)
  # TODO finalize list of supported file types
  valid_exts = ['.doc', '.docx', '.pdf', '.pptx', '.ppt', '.xls', '.xlsx']
  # Escape the host so '.' and friends match literally instead of acting as
  # regex metacharacters.
  host_pattern = /#{Regexp.escape(@host.to_s)}/

  @doc.css(css_selector + ' a').each do |a|
    # anchors without an href (e.g. named anchors) have nothing to rewrite
    next if a['href'].nil?

    href = Flatfish::Url.absolutify(a['href'], @cd)
    # downcase so upper-case extensions such as .PDF are recognised too
    if href =~ host_pattern && valid_exts.include?(File.extname(href).downcase)
      media = get_media(href)
      href = "[FLATFISHmedia:#{media.id}FLATFISH]"
    end
    a['href'] = href
  end
end
#update_imgs(css_selector) ⇒ Object
Processes image tags: absolutifies image sources and passes internal ones on for tokenization
109 110 111 112 113 114 115 116 117 118 119 120 121 |
# File 'lib/flatfish/page.rb', line 109

# Processes image tags under +css_selector+: each src is made absolute
# against @cd, and images that sit on @host have their src replaced by a
# "[FLATFISHmedia:<id>FLATFISH]" token pointing at the stored media record.
# Images on other hosts keep their original src.
#
# @param css_selector [String] selector whose descendant <img> tags are rewritten
def update_imgs(css_selector)
  # Escape the host so '.' and friends match literally instead of acting as
  # regex metacharacters; built once rather than per image.
  host_pattern = /#{Regexp.escape(@host.to_s)}/

  @doc.css(css_selector + ' img').each do |img|
    next if img['src'].nil?

    # absolutify and tokenize our images
    src = Flatfish::Url.absolutify(img['src'], @cd)
    next unless src =~ host_pattern

    # get_media reuses the stored record when the image was seen before
    media = get_media(src)
    img['src'] = "[FLATFISHmedia:#{media.id}FLATFISH]"
  end
end