Class: WWMD::Scrape

Inherits:
Object
  • Object
show all
Defined in:
lib/wwmd/page/scrape.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(page = '<>') ⇒ Scrape

create a new scrape object using passed HTML



20
21
22
23
24
25
26
# File 'lib/wwmd/page/scrape.rb', line 20

# Build a scraper around the supplied HTML string.
#
# @param page [String] raw HTML to parse (defaults to the placeholder '<>')
def initialize(page = '<>')
  @page  = page
  @hdoc  = HDOC.parse(@page)
  @links = []
  @debug = false
  @warn  = false
end

Instance Attribute Details

#debugObject

Returns the value of attribute debug.



13
14
15
# File 'lib/wwmd/page/scrape.rb', line 13

# Reader for the debug flag.
#
# @return [Object] current value of +@debug+
def debug; @debug; end

#hdocObject (readonly)

Returns the value of attribute hdoc.



17
18
19
# File 'lib/wwmd/page/scrape.rb', line 17

# Reader for the parsed HTML document (read-only attribute).
#
# @return [Object] current value of +@hdoc+
def hdoc; @hdoc; end

links to javascript includes



16
17
18
# File 'lib/wwmd/page/scrape.rb', line 16

# Reader for links to javascript includes found on the page.
#
# @return [Object] current value of +@jlinks+
def jlinks; @jlinks; end

links found on page



15
16
17
# File 'lib/wwmd/page/scrape.rb', line 15

# Reader for the links collected from the page.
#
# @return [Object] current value of +@links+
def links; @links; end

#warnObject

Returns the value of attribute warn.



14
15
16
# File 'lib/wwmd/page/scrape.rb', line 14

# Reader for the warning flag (note: shadows Kernel#warn on instances).
#
# @return [Object] current value of +@warn+
def warn; @warn; end

Instance Method Details

default reject links (override using reject_links in helper script)



70
71
72
73
74
75
76
77
78
79
80
# File 'lib/wwmd/page/scrape.rb', line 70

# Strip obviously-uninteresting entries from +@links+ in place: nils,
# stylesheets, PDFs, javascript:/mailto: pseudo-links, bracketed
# template placeholders, and bare fragment anchors.
#
# @return [Array, nil] the filtered array, or nil when nothing was removed
def default_reject_links
  @links.reject! do |url|
    next true if url.nil?
    [".css", ".pdf"].include?(url.extname) ||
      url =~ /javascript:/i ||
      url =~ /mailto:/i ||
      url =~ /[\[\]]/ ||
      url =~ /^#/
  end
end

#for_commentsObject

scan page for comment fields



146
147
148
# File 'lib/wwmd/page/scrape.rb', line 146

# Scan the raw page text for HTML comment bodies.
#
# Each scan hit is a one-element capture array; take its first element
# rather than calling Array#to_s on it (Array#to_s stopped joining
# elements in Ruby 1.9, so the old +x.to_s+ produced inspect-style
# strings like '["foo"]').
#
# @return [Array<String>] the text between each <!-- and -->
def for_comments
  @page.scan(/\<!\s*--(.*?)--\s*\>/m).map { |x| x.first.to_s }
end

#for_formsObject

return an array of Form objects for forms on page



83
84
85
86
87
# File 'lib/wwmd/page/scrape.rb', line 83

# Wrap every <form> element on the page in a Form object.
#
# @return [Array<Form>] one entry per form element found
def for_forms
  @hdoc.search("//form").map { |node| Form.new(node) }
end

scrape the page for <script src=""> tags



138
139
140
141
142
143
# File 'lib/wwmd/page/scrape.rb', line 138

# Scrape the page for <script src=""> tags and keep only the entries
# whose filename looks like a .js include.
#
# NOTE(review): +clip+ is a project extension on String — confirm its
# exact trimming semantics against lib/wwmd before relying on this.
#
# @return [Array<String>] src attribute values kept after filtering
def for_javascript_links
  srcs = @hdoc.search("//script[@src]").map { |tag| tag['src'] }
  srcs.reject! { |url| File.extname(url).clip != ".js" }
  return srcs
end

#for_javascript_redirectObject

scrape the page for a script tag that contains a bare location.href tag (to redirect the page)



171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/wwmd/page/scrape.rb', line 171

# Scrape the page for a <script> tag containing a bare location.href
# assignment (a javascript redirect).
#
# @return [String, nil] the redirect target, nil when none is present,
#   or the sentinel string "ERR" when more than one redirect is found
def for_javascript_redirect
  redirs = []
  @hdoc.search("//script").each do |scr|
    # dots escaped: the old /location.href/ matched any character in
    # place of the '.', e.g. "locationXhref"
    scr.inner_html.scan(/.*location\.href\s*=\s*['"]([^'"]+)['"]/i).each { |x| redirs += x }
  end
  if redirs.size > 1
    STDERR.puts "PARSE ERROR: more than one javascript redirect"
    return "ERR"
  end
  return redirs.first if not redirs.empty?
  return nil
end

use xpath searches to get

  • //a href

  • //area href

  • //frame src

  • //iframe src

  • //form action

  • //meta refresh content urls

then get //script tags and regexp out links in javascript function calls from elem.inner_html



98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/wwmd/page/scrape.rb', line 98

# Harvest links from the parsed page into +@links+ using xpath searches:
#
# * //a href, //area href, //frame src, //iframe src, //form action
# * <meta http-equiv="refresh"> content urls
# * onclick handlers and <script> bodies, matched against LINKS_REGEXP
# * anything added by an urls_from_helper override mixed in by the
#   task-specific script
#
# @param reject [Boolean] when true (default) run #reject_links on the
#   collected links before returning. Previously this parameter was
#   accepted but ignored and filtering always ran.
# @return [Array] the accumulated @links
def for_links(reject=true)
  self.urls_from_xpath("//a","href").each      { |url| @links << url } # get <a href=""> elements
  self.urls_from_xpath("//area","href").each   { |url| @links << url } # get <area href=""> elements
  self.urls_from_xpath("//frame","src").each   { |url| @links << url } # get <frame src=""> elements
  self.urls_from_xpath("//iframe","src").each  { |url| @links << url } # get <iframe src=""> elements
  self.urls_from_xpath("//form","action").each { |url| @links << url } # get <form action=""> elements

  # <meta> refresh: keep the url half of content="<delay>=<url>"
  @hdoc.search("//meta").each do |meta|
    next if meta['http-equiv'] != "refresh"
    next if not (content = meta['content'].split(/=/)[1])
    @links << content.strip
  end

  # add urls from onclick handlers
  @hdoc.search("*[@onclick]").each do |onclick|
    LINKS_REGEXP.each do |re|
      self.urls_from_regexp(onclick['onclick'],re).each do |url|
        @links << url
      end
    end
  end

  # add urls_from_regexp (limit to <script> tags (elem.inner_html))
  @hdoc.search("//script").each do |scr|
    LINKS_REGEXP.each do |re|
      self.urls_from_regexp(scr.inner_html,re).each { |url| @links << url }
    end
  end

  # re-define urls_from_helper in what you mix in
  self.urls_from_helper

  # honor the reject flag (fix: it used to be ignored)
  self.reject_links if reject
  return @links
end

#for_meta_refreshObject

scrape the page for a meta refresh tag and return the url from the contents attribute or nil



151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/wwmd/page/scrape.rb', line 151

# Scrape the page for a meta refresh tag and return the url from its
# content attribute, or nil when no refresh tag exists.
#
# @return [String, nil] the refresh target url, nil when absent, or the
#   sentinel string "ERR" on a parse problem
def for_meta_refresh
  # NOTE(review): this comparison is case-sensitive — only 'Refresh'
  # (capital R) is detected; confirm lowercase "refresh" tags are out of scope.
  has_mr = @hdoc.search("//meta").map { |x| x['http-equiv'] }.include?('Refresh')
  if has_mr
    # Takes the piece after the delay from EVERY meta tag that has a
    # content attribute (e.g. "5;URL=/next" -> "URL=/next"); tags whose
    # content has no ";" contribute nil entries here.
    urls = @hdoc.search("//meta[@content]").map { |x| x['content'].split(";",2)[1] }
    if urls.size > 1
      STDERR.puts "PARSE ERROR: more than one meta refresh tag"
      return "ERR"
    end
    # NOTE(review): urls.first is nil when the single content attribute
    # lacked a ";" — this would raise NoMethodError rather than return "ERR".
    k,v = urls.first.split("=",2)
    if k.upcase.strip != "URL"
      STDERR.puts "PARSE ERROR: content attribute of meta refresh does not contain url"
      return "ERR"
    end
    return v.strip
  else
    return nil
  end
end

NEED to move this to external configuration

list of urls we don't care to store in our links list



64
65
66
67
# File 'lib/wwmd/page/scrape.rb', line 64

# Hook point for link filtering; task-specific helper scripts are
# expected to override this. The stock implementation simply delegates
# to #default_reject_links.
def reject_links
  if @warn
    putw "WARN: override reject_links in helper script"
  end
  default_reject_links
end

#reset(page) ⇒ Object

reset this scrape object (called by WWMD::Page)



29
30
31
32
33
# File 'lib/wwmd/page/scrape.rb', line 29

# Re-point this scrape object at fresh HTML (called by WWMD::Page),
# discarding any previously collected links.
#
# @param page [String] the new HTML document to parse
def reset(page)
  @page  = page
  @hdoc  = HDOC.parse(@page)
  @links = []
end

#urls_from_helperObject

define an urls_from_helper method in your task specific script



190
191
192
193
# File 'lib/wwmd/page/scrape.rb', line 190

# No-op hook: define an urls_from_helper override in your task-specific
# script to contribute extra links during #for_links.
#
# @return [nil]
def urls_from_helper
  if @warn
    putw "WARN: Please set an urls_from_helper override in your helper script"
  end
  nil
end

#urls_from_regexp(content, re, split = 0) ⇒ Object

scan the passed string for the configured regular expressions and return them as an array



37
38
39
40
41
42
43
44
45
46
47
# File 'lib/wwmd/page/scrape.rb', line 37

# Scan +content+ with +re+ and return the hits as an array of strings.
# Each raw match is stringified, split on commas, and the +split+-th
# piece (with quotes stripped) is kept; empty or missing pieces are
# skipped.
#
# @param content [String] text to scan (e.g. a script body)
# @param re [Regexp] pattern to apply
# @param split [Integer] which comma-separated piece of the match to keep
# @return [Array<String>] extracted url strings
def urls_from_regexp(content,re,split=0)
  ret = []
  content.scan(re).each do |url|
    # cheat and take split string(,)[split]
    piece = url.to_s.split(',')[split]
    # guard BEFORE gsub: a split index past the last piece used to raise
    # NoMethodError on nil here
    next if piece.nil?
    piece = piece.gsub(/['"]/, '')
    next if piece.empty?
    ret << piece
  end
  ret
end

#urls_from_xpath(xpath, attr) ⇒ Object

xpath search for tags and return the passed attribute

urls_from_xpath("//a","href")


51
52
53
54
55
56
57
58
59
# File 'lib/wwmd/page/scrape.rb', line 51

# Xpath search for tags and return the requested attribute value from
# each hit, stripped of surrounding whitespace.
#
#   urls_from_xpath("//a","href")
#
# @param xpath [String] xpath expression selecting elements
# @param attr [String] attribute name to read from each matched element
# @return [Array<String>] the non-empty attribute values
def urls_from_xpath(xpath,attr)
  ret = []
  @hdoc.search(xpath).each do |elem|
    url = elem[attr]
    # nil guard: callers use unfiltered selectors (e.g. "//a"), so an
    # element may lack the attribute — nil.empty? used to raise here
    next if url.nil? || url.empty?
    ret << url.strip
  end
  return ret
end

#warningsObject

renamed class variable (for backward compat)



185
186
187
# File 'lib/wwmd/page/scrape.rb', line 185

# Backwards-compatible reader for the renamed +warn+ attribute. #:nodoc:
def warnings
  @warn
end