Class: WWMD::Scrape
- Inherits:
-
Object
- Object
- WWMD::Scrape
- Defined in:
- lib/wwmd/page/scrape.rb
Instance Attribute Summary collapse
-
#debug ⇒ Object
Returns the value of attribute debug.
-
#hdoc ⇒ Object
readonly
Returns the value of attribute hdoc.
-
#jlinks ⇒ Object
links to javascript includes.
-
#links ⇒ Object
links found on page.
-
#warn ⇒ Object
Returns the value of attribute warn.
Instance Method Summary collapse
-
#default_reject_links ⇒ Object
default reject links (override using reject_links in helper script).
-
#for_comments ⇒ Object
scan page for comment fields.
-
#for_forms ⇒ Object
return an array of Form objects for forms on page.
-
#for_javascript_links ⇒ Object
scrape the page for <script src=""> tags.
-
#for_javascript_redirect ⇒ Object
scrape the page for a script tag that contains a bare location.href tag (to redirect the page).
-
#for_links(reject = true) ⇒ Object
use xpath searches to get //a href, //area href, //frame src, //iframe src, //form action and //meta refresh content urls, then get //script tags and regexp out links in javascript function calls from elem.inner_html.
-
#for_meta_refresh ⇒ Object
scrape the page for a meta refresh tag and return the url from the content attribute or nil.
-
#initialize(page = '<>') ⇒ Scrape
constructor
create a new scrape object using passed HTML.
-
#reject_links ⇒ Object
NEED to move this to external configuration.
-
#reset(page) ⇒ Object
reset this scrape object (called by WWMD::Page).
-
#urls_from_helper ⇒ Object
define an urls_from_helper method in your task specific script.
-
#urls_from_regexp(content, re, split = 0) ⇒ Object
scan the passed string for the configured regular expressions and return them as an array.
-
#urls_from_xpath(xpath, attr) ⇒ Object
xpath search for tags and return the passed attribute, e.g. urls_from_xpath("//a","href").
-
#warnings ⇒ Object
renamed class variable (for backward compat).
Constructor Details
Instance Attribute Details
#debug ⇒ Object
Returns the value of attribute debug.
13 14 15 |
# File 'lib/wwmd/page/scrape.rb', line 13
#
# Reader for the +debug+ attribute: returns whatever is stored in @debug.
def debug
  @debug
end
#hdoc ⇒ Object (readonly)
Returns the value of attribute hdoc.
17 18 19 |
# File 'lib/wwmd/page/scrape.rb', line 17
#
# Read-only accessor for the +hdoc+ attribute: returns whatever is stored in @hdoc.
def hdoc
  @hdoc
end
#jlinks ⇒ Object
links to javascript includes
16 17 18 |
# File 'lib/wwmd/page/scrape.rb', line 16
#
# Reader for the +jlinks+ attribute (links to javascript includes):
# returns whatever is stored in @jlinks.
def jlinks
  @jlinks
end
#links ⇒ Object
links found on page
15 16 17 |
# File 'lib/wwmd/page/scrape.rb', line 15
#
# Reader for the +links+ attribute (links found on page):
# returns whatever is stored in @links.
def links
  @links
end
#warn ⇒ Object
Returns the value of attribute warn.
14 15 16 |
# File 'lib/wwmd/page/scrape.rb', line 14
#
# Reader for the +warn+ attribute: returns whatever is stored in @warn.
# Note: as an instance method this name takes precedence over Kernel#warn
# for calls made on the object itself.
def warn
  @warn
end
Instance Method Details
#default_reject_links ⇒ Object
default reject links (override using reject_links in helper script)
70 71 72 73 74 75 76 77 78 79 80 |
# File 'lib/wwmd/page/scrape.rb', line 70
#
# Default link filter. Removes, in place, every entry of @links that
# * is nil,
# * has a ".css" or ".pdf" extension,
# * contains "javascript:" or "mailto:" (case-insensitive),
# * contains a square bracket, or
# * has a line beginning with "#".
#
# Returns the result of Array#reject! (nil when nothing was removed).
# Each non-nil entry must respond to #extname, which is not a core String
# method -- presumably a String extension defined elsewhere in the project.
def default_reject_links
  skipped_extensions = [".css", ".pdf"]
  skipped_patterns = [/javascript:/i, /mailto:/i, /[\[\]]/, /^#/]

  @links.reject! do |link|
    next true if link.nil?
    next true if skipped_extensions.include?(link.extname)

    skipped_patterns.any? { |pattern| link =~ pattern }
  end
end
#for_comments ⇒ Object
scan page for comment fields
146 147 148 |
# File 'lib/wwmd/page/scrape.rb', line 146
#
# Scan the raw page text (@page) for HTML comments and return an array with
# the text found between the "<!--" and "-->" delimiters of each comment.
#
# String#scan with a capture group yields one-element arrays; they are
# flattened so callers get plain strings. (Calling #to_s on those arrays
# returns their inspect form, e.g. '["text"]', on Ruby 1.9 and later.)
def for_comments
  @page.scan(/\<!\s*--(.*?)--\s*\>/m).flatten
end
#for_forms ⇒ Object
return an array of Form objects for forms on page
83 84 85 86 87 |
# File 'lib/wwmd/page/scrape.rb', line 83
#
# Build one Form object per node returned by the "//form" search on @hdoc
# and return them as an array, in search order.
def for_forms
  @hdoc.search("//form").map { |form_node| Form.new(form_node) }
end
#for_javascript_links ⇒ Object
scrape the page for <script src=""> tags
138 139 140 141 142 143 |
# File 'lib/wwmd/page/scrape.rb', line 138
#
# Collect the src attribute of every <script src="..."> element in @hdoc and
# return only those whose clipped file extension is exactly ".js".
# String#clip is not a core method -- presumably a project String extension
# that trims a trailing query string; confirm against its definition.
def for_javascript_links
  sources = @hdoc.search("//script[@src]").map { |tag| tag['src'] }
  sources.select { |src| File.extname(src).clip == ".js" }
end
#for_javascript_redirect ⇒ Object
scrape the page for a script tag that contains a bare location.href tag (to redirect the page)
171 172 173 174 175 176 177 178 179 180 181 182 |
# File 'lib/wwmd/page/scrape.rb', line 171
#
# Look through every <script> element of @hdoc for an assignment of a quoted
# string to location.href and return the redirect target.
#
# Returns:
# * nil   -- no such assignment was found
# * "ERR" -- more than one was found (a message is written to STDERR)
# * the quoted url otherwise
#
# The dot in "location.href" is escaped so that only a literal "." matches;
# unescaped it would accept any character (e.g. "locationXhref").
def for_javascript_redirect
  redirs = []
  @hdoc.search("//script").each do |scr|
    scr.inner_html.scan(/.*location\.href\s*=\s*['"]([^'"]+)['"]/i) do |captures|
      redirs.concat(captures)
    end
  end

  if redirs.size > 1
    STDERR.puts "PARSE ERROR: more than one javascript redirect"
    return "ERR"
  end

  # Array#first is nil for an empty list, which covers the "no redirect" case
  redirs.first
end
#for_links(reject = true) ⇒ Object
use xpath searches to get
-
//a href
-
//area href
-
//frame src
-
//iframe src
-
//form action
-
//meta refresh content urls
then get //script tags and regexp out links in javascript function calls from elem.inner_html
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
# File 'lib/wwmd/page/scrape.rb', line 98
#
# Collect urls from the parsed page into @links and return @links.
#
# Sources, in order:
# * href/src/action attributes of <a>, <area>, <frame>, <iframe> and <form>
# * the url part of <meta http-equiv="refresh"> content attributes
# * LINKS_REGEXP matches inside onclick attributes
# * LINKS_REGEXP matches inside <script> bodies
# * whatever urls_from_helper does (meant to be re-defined by a mixin)
#
# reject:: when true (the default) reject_links is run afterwards to drop
#          links we don't care about; pass false to keep every link.
def for_links(reject = true)
  attribute_sources = [
    ["//a",      "href"],   # <a href="">
    ["//area",   "href"],   # <area href="">
    ["//frame",  "src"],    # <frame src="">
    ["//iframe", "src"],    # <iframe src="">
    ["//form",   "action"]  # <form action="">
  ]
  attribute_sources.each do |xpath, attr|
    @links.concat(urls_from_xpath(xpath, attr))
  end

  # <meta> refresh: http-equiv is matched case-insensitively, and content is
  # split on the first "=" only so urls with query strings stay intact
  @hdoc.search("//meta").each do |meta|
    next unless meta['http-equiv'].to_s.casecmp("refresh") == 0
    target = meta['content'].to_s.split(/=/, 2)[1]
    next unless target
    @links << target.strip
  end

  # add urls from onclick handlers
  @hdoc.search("*[@onclick]").each do |elem|
    LINKS_REGEXP.each do |re|
      @links.concat(urls_from_regexp(elem['onclick'], re))
    end
  end

  # add urls_from_regexp (limit to <script> tags (elem.inner_html))
  @hdoc.search("//script").each do |scr|
    LINKS_REGEXP.each do |re|
      @links.concat(urls_from_regexp(scr.inner_html, re))
    end
  end

  # re-define urls_from_helper in what you mix in
  urls_from_helper

  # reject links we don't care about (only when asked to)
  reject_links if reject
  @links
end
#for_meta_refresh ⇒ Object
scrape the page for a meta refresh tag and return the url from the content attribute or nil
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
# File 'lib/wwmd/page/scrape.rb', line 151
#
# Find the <meta http-equiv="refresh"> tag in @hdoc and return the url given
# in its content attribute ("<seconds>; url=<target>").
#
# Returns:
# * nil   -- the page has no meta refresh tag with a content attribute
# * "ERR" -- more than one such tag exists, or the content has no "url="
#            part (a message is written to STDERR in both cases)
# * the stripped url otherwise
#
# Only tags whose http-equiv is "refresh" (any case) are considered, so other
# <meta content="..."> tags (description, keywords, ...) do not count.
def for_meta_refresh
  refresh_tags = @hdoc.search("//meta").select do |meta|
    meta['http-equiv'].to_s.casecmp("refresh") == 0 && meta['content']
  end
  return nil if refresh_tags.empty?

  if refresh_tags.size > 1
    STDERR.puts "PARSE ERROR: more than one meta refresh tag"
    return "ERR"
  end

  # content looks like "5; url=/target": drop the delay, then split key/value
  # on the first "=" only so query strings in the url are preserved
  directive = refresh_tags.first['content'].split(";", 2)[1]
  k, v = directive.to_s.split("=", 2)
  if k.nil? || v.nil? || k.upcase.strip != "URL"
    STDERR.puts "PARSE ERROR: content attribute of meta refresh does not contain url"
    return "ERR"
  end

  v.strip
end
#reject_links ⇒ Object
NEED to move this to external configuration
list of urls we don’t care to store in our links list
64 65 66 67 |
# File 'lib/wwmd/page/scrape.rb', line 64
#
# Hook for dropping links we don't care to store. This stock version emits a
# warning through putw when @warn is set, then delegates to
# default_reject_links and returns its result. Intended to be overridden in a
# helper script.
def reject_links
  if @warn
    putw "WARN: override reject_links in helper script"
  end
  default_reject_links
end
#reset(page) ⇒ Object
reset this scrape object (called by WWMD::Page)
29 30 31 32 33 |
# File 'lib/wwmd/page/scrape.rb', line 29
#
# Point this scrape object at a new page (called by WWMD::Page): stores the
# raw page in @page, re-parses it with HDOC.parse into @hdoc and starts a
# fresh, empty @links list. Returns the new (empty) links array.
def reset(page)
  @page  = page
  @hdoc  = HDOC.parse(page)
  @links = []
end
#urls_from_helper ⇒ Object
define an urls_from_helper method in your task specific script
190 191 192 193 |
# File 'lib/wwmd/page/scrape.rb', line 190
#
# Placeholder meant to be re-defined in a task specific helper script.
# This stock version only emits a warning through putw when @warn is set
# and always returns nil.
def urls_from_helper
  putw("WARN: Please set an urls_from_helper override in your helper script") if @warn
  nil
end
#urls_from_regexp(content, re, split = 0) ⇒ Object
scan the passed string for the configured regular expressions and return them as an array
37 38 39 40 41 42 43 44 45 46 47 |
# File 'lib/wwmd/page/scrape.rb', line 37
#
# Scan +content+ with the regular expression +re+ and return the matched urls
# as an array of strings.
#
# content:: string to scan
# re::      regular expression; may contain capture groups
# split::   each match is treated as a comma separated argument list and the
#           field at this index is taken (default 0, the first field)
#
# Single and double quotes are removed from the chosen field; matches with no
# field at that index, or an empty one, are skipped.
def urls_from_regexp(content, re, split = 0)
  ret = []
  content.scan(re).each do |match|
    # scan yields arrays when re has capture groups; join them into one
    # string (what Array#to_s did on Ruby 1.8) instead of using the inspect
    # form that Array#to_s returns on Ruby 1.9 and later
    field = Array(match).join.split(',')[split]
    next if field.nil?

    add = field.gsub(/['"]/, '')
    ret << add unless add.empty?
  end
  ret
end
#urls_from_xpath(xpath, attr) ⇒ Object
xpath search for tags and return the passed attribute
urls_from_xpath("//a","href")
51 52 53 54 55 56 57 58 59 |
# File 'lib/wwmd/page/scrape.rb', line 51
#
# Search @hdoc with +xpath+ and return the values of attribute +attr+ of the
# matching elements, stripped of surrounding whitespace.
#
#   urls_from_xpath("//a","href")
#
# Elements that lack the attribute, or whose value is empty or whitespace
# only, are skipped.
def urls_from_xpath(xpath, attr)
  ret = []
  @hdoc.search(xpath).each do |elem|
    # elem[attr] is nil when the attribute is absent; to_s makes that ""
    url = elem[attr].to_s.strip
    next if url.empty?
    ret << url
  end
  ret
end
#warnings ⇒ Object
renamed class variable (for backward compat)
185 186 187 |
# File 'lib/wwmd/page/scrape.rb', line 185
#
# Backward-compatible alias kept after a rename: returns the value of @warn.
def warnings #:nodoc:
  @warn
end