Class: Flatfish::Page

Inherits:
ActiveRecord::Base
  • Object
show all
Extended by:
Url
Defined in:
lib/flatfish/page.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Url

absolutify, open_url

Instance Attribute Details

#cd ⇒ Object

Returns the value of attribute cd.



12
13
14
# File 'lib/flatfish/page.rb', line 12

# Reader for @cd, the page's "current directory" URL (assigned in #setup).
#
# @return [Object] the current value of @cd, or nil if it was never assigned
def cd; @cd; end

#data ⇒ Object (readonly)

Returns the value of attribute data.



11
12
13
# File 'lib/flatfish/page.rb', line 11

# Reader for @data, the hash of extracted page values built by #prep.
#
# @return [Object] the current value of @data, or nil if it was never assigned
def data; @data; end

#url ⇒ Object (readonly)

Returns the value of attribute url.



11
12
13
# File 'lib/flatfish/page.rb', line 11

# Reader for @url, the address of the page being scraped (assigned in #setup).
#
# @return [Object] the current value of @url, or nil if it was never assigned
def url; @url; end

Instance Method Details

#get_media(url) ⇒ Object

TODO replace w/ find_or_create



124
125
126
127
128
129
130
131
132
# File 'lib/flatfish/page.rb', line 124

# Returns the Flatfish::Media record stored for +url+, creating it first
# (and filling +contents+ via #read_in_blob) when no record exists yet.
#
# @param url [String] URL of the media item (callers in this file pass
#   already-absolutified URLs)
# @return [Flatfish::Media] the existing or newly created record
def get_media(url)
  existing = Flatfish::Media.find_by_url(url)
  return existing unless existing.nil?

  # TODO replace w/ find_or_create
  Flatfish::Media.create(:url => url) { |m| m.contents = read_in_blob(url) }
end

#load_html ⇒ Object

load html from local or web



50
51
52
53
54
55
56
57
58
59
60
# File 'lib/flatfish/page.rb', line 50

# Loads the page into @doc, preferring a local copy over the network.
#
# A local copy is used only when @local_source is configured, the page is not
# the site root (@url != @host, which would map to the source directory
# itself) and the mapped file exists. Otherwise the page is fetched with
# Flatfish::Url.open_url.
#
# @return [Object] the parsed document now stored in @doc
def load_html
  # Build the local path only when a local source is configured; a nil
  # @local_source must fall through to the web fetch instead of raising.
  local_file = @local_source && (@local_source + @url.sub(@host, ''))

  if local_file && @url != @host && File.exist?(local_file)
    # NOTE(review): local copies go through the XML parser while remote pages
    # use the HTML parser - confirm this asymmetry is intentional.
    # Block form closes the handle even if parsing raises.
    @doc = File.open(local_file) { |f| Nokogiri::XML(f) }
  else
    @doc = Nokogiri::HTML(Flatfish::Url.open_url(@url))
  end
end

#prep ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/flatfish/page.rb', line 62

# Builds @data: the page's url/title/path plus one HTML string per schema
# column, extracted from @doc with the configured CSS selectors.
#
# Each entry of @fields holds one or more selectors joined by '&&'; the
# markup matched by each selector is concatenated into the column named by
# the @schema entry at the same index. Entries equal to -1 are skipped.
# Links and images under each selector are rewritten (via #update_hrefs and
# #update_imgs) before the markup is serialized.
#
# @return [Hash] the hash stored in @data
def prep
  # The title given in the CSV wins; otherwise use the document's own title.
  @title = @doc.title if @title.nil?

  extracted = {}
  @fields.each_with_index do |selectors, idx|
    next if selectors == -1 # -1 flags a column with no selectors

    column = @schema[idx]
    extracted[column] = ''
    selectors.split('&&').each do |selector|
      update_hrefs(selector)
      update_imgs(selector)

      nodes = @doc.css(selector)
      fragment =
        if nodes.nil?
          ''
        else
          # Restore the URL-encoded token brackets and replace MS "smart"
          # quotes with plain ASCII quotes.
          nodes.to_s
               .gsub("%5BFLATFISH", '[')
               .gsub("FLATFISH%5D", ']')
               .gsub(/[”“]/, '"')
               .gsub(/[‘’]/, "'")
        end
      extracted[column] += fragment
    end
  end

  @data = { 'url' => @url, 'title' => @title, 'path' => @path }
  @data.merge!(extracted)
end

#process ⇒ Object



44
45
46
47
# File 'lib/flatfish/page.rb', line 44

# Loads the page, extracts its fields and assigns the extracted hash to this
# record's attributes.
#
# @return [Hash] the hash produced by #prep
def process
  load_html
  extracted = prep
  self.attributes = extracted
end

#read_in_blob(url) ⇒ Object

read in blob



135
136
137
138
139
140
141
142
143
144
145
# File 'lib/flatfish/page.rb', line 135

# Reads the raw bytes of a media item, preferring a local copy.
#
# The URL is mapped to a local path by replacing @host with @local_source.
# When no local source is configured, or the mapped path is not a regular
# file, the item is fetched with Flatfish::Url.open_url instead.
#
# @param url [String] URL of the media item
# @return [Object] the file's bytes as a binary String for a local copy,
#   otherwise whatever Flatfish::Url.open_url returns
def read_in_blob(url)
  # Only map to a local path when a local source is configured; String#sub
  # raises when given a nil replacement.
  local_file = @local_source && url.sub(@host, @local_source)

  if local_file && File.file?(local_file)
    # Binary read so images/PDFs are not altered by text-mode handling.
    File.binread(local_file)
  else
    # URI.escape was removed in Ruby 3.0; the parser-level escape is the
    # equivalent that still exists.
    Flatfish::Url.open_url(URI::DEFAULT_PARSER.escape(url))
  end
end

#setup(csv, config, schema, host) ⇒ Object

Setup - unpack the vars for the web page to be scraped

csv - an array with all of the page-specific settings. config - has some key details (where to save images, etc.) that the page has to know. schema - dynamic column headers.



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/flatfish/page.rb', line 19

# Unpacks everything the page needs before it can be scraped.
#
# @param csv [Array] one CSV row: url, path, title, then one selector string
#   (or nil) per schema column
# @param config [Hash] reads 'local_source', 'basic_auth_user' and
#   'basic_auth_pass'
# @param schema [Array] column names, matched by index to the selector cells
# @param host [String] the site's host URL
# @return [Hash] the credentials hash assigned to Flatfish::Url.creds
def setup(csv, config, schema, host)
  # parse the csv
  @url, @path, @title = csv[0], csv[1], csv[2]

  # Empty cells become the -1 flag. Non-destructive strip leaves the caller's
  # row untouched (and works on frozen strings); Array() covers rows that
  # have no selector cells at all.
  @fields = Array(csv[3..-1]).map { |field| field.nil? ? -1 : field.strip }

  @schema = schema
  @host = host
  @local_source = config['local_source']

  # current directory, we want http://example.com/about/ or http://example.com/home/
  @cd = if @url.end_with?('/')
          @url
        elsif @url == @host
          # Slicing a bare host at its last '/' would leave only the scheme.
          "#{@url}/"
        else
          @url.slice(0..@url.rindex('/'))
        end

  Flatfish::Url.creds = {:http_basic_authentication => [config['basic_auth_user'], config['basic_auth_pass']]}
end

#update_hrefs(css_selector) ⇒ Object

Processes link tags: absolutifies each href and passes media links on for tokenization.



93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/flatfish/page.rb', line 93

# Rewrites the href of every <a> found under +css_selector+ in @doc.
#
# Each href is made absolute relative to @cd. Links that point at @host and
# end in one of the supported document extensions are stored through
# #get_media and replaced by a "[FLATFISHmedia:<id>FLATFISH]" token.
# Anchors that have no href attribute are left untouched.
#
# @param css_selector [String] selector whose descendant links are processed
# @return [Object] the collection returned by @doc.css
def update_hrefs(css_selector)
  # TODO finalize list of supported file types
  valid_exts = ['.doc', '.docx', '.pdf', '.pptx', '.ppt', '.xls', '.xlsx']
  # Escape the host so '.' and other regex metacharacters match literally.
  host_pattern = /#{Regexp.escape(@host.to_s)}/

  @doc.css(css_selector + ' a').each do |a|
    # Named anchors and the like carry no href; skip them the same way
    # #update_imgs skips images without a src.
    next if a['href'].nil?

    href = Flatfish::Url.absolutify(a['href'], @cd)
    if href =~ host_pattern && valid_exts.include?(File.extname(href))
      media = get_media(href)
      href = "[FLATFISHmedia:#{media.id}FLATFISH]"
    end
    a['href'] = href
  end
end

#update_imgs(css_selector) ⇒ Object

Processes image tags: absolutifies each src and passes internal images on for tokenization.



109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/flatfish/page.rb', line 109

# Tokenizes the internal images found under +css_selector+ in @doc.
#
# Each src is made absolute relative to @cd. Images whose absolute URL points
# at @host are stored through #get_media and their src is replaced by a
# "[FLATFISHmedia:<id>FLATFISH]" token. Images without a src, and images on
# other hosts, are left as they are.
#
# @param css_selector [String] selector whose descendant images are processed
# @return [Object] the collection returned by @doc.css
def update_imgs(css_selector)
  # Escape the host so '.' and other regex metacharacters match literally.
  host_pattern = /#{Regexp.escape(@host.to_s)}/

  @doc.css(css_selector + ' img').each do |img|
    next if img['src'].nil?

    # absolutify and tokenize our images
    src = Flatfish::Url.absolutify(img['src'], @cd)
    next unless src =~ host_pattern

    # get_media reuses the stored record when the image was already seen
    media = get_media(src)
    img['src'] = "[FLATFISHmedia:#{media.id}FLATFISH]"
  end
end