Class: RHACK::Page
Overview
Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, … ) ) => Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, …
Direct Known Subclasses
Constant Summary collapse
- @@ignore =
for johnson
/google|_gat|tracker|adver/i
Instance Attribute Summary collapse
-
#body ⇒ Object
(also: #html)
readonly
Returns the value of attribute body.
-
#curl ⇒ Object
readonly
Returns the value of attribute curl.
-
#curl_res ⇒ Object
readonly
Returns the value of attribute curl_res.
-
#data ⇒ Object
(also: #hash)
readonly
Returns the value of attribute data.
-
#doc ⇒ Object
readonly
Returns the value of attribute doc.
-
#failed ⇒ Object
readonly
Returns the value of attribute failed.
-
#js ⇒ Object
readonly
Returns the value of attribute js.
-
#loc ⇒ Object
readonly
Returns the value of attribute loc.
-
#res ⇒ Object
Result of the page processing performed in the frame context.
- #title(full = true) ⇒ Object
Instance Method Summary collapse
- #at(selector_or_node, options = {}) ⇒ Object (also: #first)
-
#dict(hash) ⇒ Object
Hook for writing evenly aligned hash definitions (as they look in the author's Verdana 10px font), e.g.
- #empty? ⇒ Boolean
- #eval_js(frame = nil) ⇒ Object
- #eval_string(str) ⇒ Object
-
#expand_link(link) ⇒ Object
Converts a relative path found on this page into an absolute path.
-
#failed? ⇒ Boolean
override this in a subclass.
- #find(selector_or_nodes, options = {}, &foreach) ⇒ Object (also: #all)
- #flatten_dict(hash) ⇒ Object
-
#form(form = 'form', hash = {}, opts = {}) ⇒ Object
FORMS #.
- #get_link(selector_or_node = 'a', options = {}, &onfound) ⇒ Object (also: #link, #get_href)
-
#get_links(links = 'a') ⇒ Object
(also: #get_hrefs, #links)
def get_src(link=‘img’) begin link = at(link) && at(link).src if link.is String rescue LibXML::XML::Error; nil end expand_link link if link end.
- #get_src(selector_or_node = 'img', options = {}, &onfound) ⇒ Object (also: #src)
-
#get_srcs(links = 'img') ⇒ Object
(also: #srcs)
TODO: make into same form as #get_src and #map.
-
#initialize(obj = '', loc = Hash.new(''), js = is_a?(HtmlPage)&&(Johnson::Runtime.browser||Johnson::Runtime.new)) ⇒ Page
constructor
Frame calls it with no args.
- #inspect ⇒ Object
-
#load_scripts(frame) ⇒ Object
def get_link(link=‘a’) begin link = at(link) && (at(link).href || at(link+‘//a’).href) if link.is String rescue XML::Error; nil end expand_link link if link end.
- #map(selector_or_nodes, options = {}, &mapper) ⇒ Object
- #map_json(selector_or_nodes, options = {}, &mapper) ⇒ Object
-
#parse(opts = {}) ⇒ Object
Override this in a subclass. MUST return self if successful; MAY return false otherwise.
-
#process(curl, opts = {}) ⇒ Object
#process can be overridden in Page subclasses. Frame does not care about the value returned by #process unless it is nil or false.
-
#retry? ⇒ Boolean
override this in a subclass.
- #size ⇒ Object
- #submit(form, frame, hash = {}, opts = {}, &callback) ⇒ Object
-
#text(selector_or_node, options = {}) ⇒ Object
FINDERS PREPROCESSORS #.
- #texts(hash, options = {}) ⇒ Object
- #to_html ⇒ Object
- #to_xml ⇒ Object
- #url ⇒ Object (also: #href)
- #utf! ⇒ Object
Constructor Details
#initialize(obj = '', loc = Hash.new(''), js = is_a?(HtmlPage)&&(Johnson::Runtime.browser||Johnson::Runtime.new)) ⇒ Page
Frame calls it with no args
46 47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/rhack/page.rb', line 46
# Builds a page either from a finished Curl handle / Scout (in which case the
# response is handed to #process) or from a raw body plus a location.
#
# obj - a Curl::Easy, something Scout-like, or the raw body
# loc - a Hash, or anything responding to #parse(:uri)
# js  - the JavaScript runtime to keep in @js (may be false/nil)
def initialize(obj='', loc=Hash.new(''), js=is_a?(HtmlPage)&&(Johnson::Runtime.browser||Johnson::Runtime.new))
  loc = loc.parse(:uri) unless loc.is(Hash)
  @js = js

  unless obj.is(Curl::Easy) || obj.kinda(Scout)
    # Plain construction: just remember what we were given.
    @body = obj
    @loc = loc
    return
  end

  easy = obj.kinda(Scout) ? obj.http : obj
  # just (easy, loc) would pass to #process an opts variable that returns '' on any key
  process(easy, loc.b || {})
end
Instance Attribute Details
#body ⇒ Object (readonly) Also known as: html
Returns the value of attribute body.
36 37 38 |
# File 'lib/rhack/page.rb', line 36 def body @body end |
#curl ⇒ Object (readonly)
Returns the value of attribute curl.
36 37 38 |
# File 'lib/rhack/page.rb', line 36 def curl @curl end |
#curl_res ⇒ Object (readonly)
Returns the value of attribute curl_res.
36 37 38 |
# File 'lib/rhack/page.rb', line 36 def curl_res @curl_res end |
#data ⇒ Object (readonly) Also known as: hash
Returns the value of attribute data.
36 37 38 |
# File 'lib/rhack/page.rb', line 36 def data @data end |
#doc ⇒ Object (readonly)
Returns the value of attribute doc.
36 37 38 |
# File 'lib/rhack/page.rb', line 36 def doc @doc end |
#failed ⇒ Object (readonly)
Returns the value of attribute failed.
36 37 38 |
# File 'lib/rhack/page.rb', line 36 def failed @failed end |
#js ⇒ Object (readonly)
Returns the value of attribute js.
36 37 38 |
# File 'lib/rhack/page.rb', line 36 def js @js end |
#loc ⇒ Object (readonly)
Returns the value of attribute loc.
36 37 38 |
# File 'lib/rhack/page.rb', line 36 def loc @loc end |
#res ⇒ Object
Result of the page processing performed in the frame context.
41 42 43 |
# File 'lib/rhack/page.rb', line 41 def res @res end |
#title(full = true) ⇒ Object
253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 |
# File 'lib/rhack/page.rb', line 253
# Returns the page title.
#
# With full == true the document <title> is returned (and cached in @title);
# when the document has no title, the page URL is used instead and, if a
# <head> exists, a <title> node holding it is prepended there.
# With full == false a shortened variant (cached in @short_title) is returned.
# When there is no usable body (parsed @data present, @failed set, or blank
# @body) the page URL is returned.
def title(full=true)
  return @loc.href unless @data.nil? && !@failed && @body.b

  unless full
    # The short title is derived from the full one, so make sure it exists.
    title(true) unless defined? @title
    @short_title =
      if RUBY_VERSION < '1.9' && @title.cyr? && UTF2ANSI[@title].size > 40
        ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/][0..38]] + '…'
      elsif @title.size > 40
        @title[/.{1,30}\S*/][0..38] + '…'
      else
        @title
      end
    return @short_title
  end

  to_html unless defined? @doc
  return @title = @doc.title if @doc.title.b

  @title = @loc.href
  head = @doc.at('head')
  head.prepend XML::Node('title', @title) if head
  @title
end
Instance Method Details
#at(selector_or_node, options = {}) ⇒ Object Also known as: first
363 364 365 366 367 368 369 370 371 372 373 |
# File 'lib/rhack/page.rb', line 363
# Finds a single node.
#
# selector_or_node - a LibXML::XML::Node (used as is) or a selector passed to __at
# options          - :preprocess is handed to preprocess_search_result; the
#                    whole hash is handed to node_is_missing! when nothing is found
#
# When a node is found, returns the preprocessed result, or the value of the
# given block called with it. Otherwise calls node_is_missing! and returns
# the falsy lookup result.
def at(selector_or_node, options={})
  if selector_or_node and preresult = selector_or_node.is_a?(LibXML::XML::Node) ? selector_or_node : __at(selector_or_node)
    preresult = preprocess_search_result(preresult, options[:preprocess])
    block_given? ? yield(preresult) : preresult
  else
    node_is_missing!(selector_or_node, options)
    preresult
  end
end
#dict(hash) ⇒ Object
Hook for writing evenly aligned hash definitions (as they look in the author's Verdana 10px font), e.g.
  dict key1: value1, …
       key2: value2, ...
285 286 287 |
# File 'lib/rhack/page.rb', line 285
# Returns +hash+ untouched when it already is a Hash; anything else is
# converted with Hash[].
def dict(hash)
  return hash if hash.is_a?(Hash)

  Hash[hash]
end
#empty? ⇒ Boolean
59 60 61 |
# File 'lib/rhack/page.rb', line 59
# True when the page has neither parsed @data nor a non-blank @body.
def empty?
  !(@data || @body.b)
end
#eval_js(frame = nil) ⇒ Object
212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
# File 'lib/rhack/page.rb', line 212
# Evaluates the page's <script> elements in the JS runtime.
#
# First seeds document/window location from @loc. Then, for every script
# node: inline code is evaluated, whatever it left in js[:write_output] is
# inserted after the node, and the node is removed; a script with only a src
# is fetched through frame.get_cached and evaluated, if a frame was given.
def eval_js(frame=nil)
  eval_string "document.location = window.location = #{@loc.to_json}; document.URL = document.baseURI = document.documentURI = location.href; document.domain = location.host;"
  find("script").each do |script|
    L.debug script.text.strip
    inline_source = script.text.strip.b
    if inline_source
      js[:write_output] = ''
      eval_string inline_source
      written = js[:write_output].b
      script.after written if written
      script.remove!
    elsif frame && script.src
      eval_string frame.get_cached(script.src)
    end
  end
end
#eval_string(str) ⇒ Object
229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 |
# File 'lib/rhack/page.rb', line 229
# Evaluates +str+ in the page's JS runtime (created lazily) and returns the
# result of the evaluation.
#
# A Johnson::Error is not propagated: it is logged as a warning, and at debug
# level the offending source is logged with the undefined identifier
# highlighted.
#
# NOTE(review): the extracted listing showed a bare `e.`; `message` is
# restored here as the presumed method name - confirm against the source.
def eval_string(str)
  @js ||= Johnson::Runtime.new
  L.debug "#{@js} evaluating in #{Thread.current}\nmain: #{Thread.main}; carier: #{Curl.carier_thread}"
  begin
    @js.evaluate(str)
  rescue Johnson::Error => e
    L.warn e.message
    L.debug {
      if m = e.message.match(/(\w+) is undefined|([\w.]+) is not a function/)
        # The second capture may contain dots (e.g. "foo.bar"); escape it so
        # they are matched literally instead of as "any character".
        L.clr.hl! str, /\b#{Regexp.escape(m[1] || m[2])}\b/
      end
      "\n\t#{str}"
    }
  end
end
#expand_link(link) ⇒ Object
Converts a relative path found on this page into an absolute path.
309 310 311 312 313 314 315 316 317 318 |
# File 'lib/rhack/page.rb', line 309
# Resolves +link+ against this page's location (@loc) and returns an
# absolute URL string.
#
#   scheme://...  returned unchanged
#   //host/...    prefixed with the page's protocol
#   /path         appended to the site root
#   ?query        appended to the page's own path
#   #fragment     appended to the page's full path
#   anything else resolved relative to the directory of the page's path
def expand_link(link)
  case link
  when /^\w+:\/\// then link
  when /^\/\// then @loc.protocol + ':' + link
  when /^\// then @loc.root + link
  when /^\?/ then File.join(@loc.root, @loc.path) + link
  when /^#/ then File.join(@loc.root, @loc.fullpath) + link
  else File.join @loc.root, File.dirname(@loc.path), link
  end
end
#failed? ⇒ Boolean
override this in a subclass
93 94 95 |
# File 'lib/rhack/page.rb', line 93
# Default failure check, meant to be overridden in subclasses: the response
# counts as failed when it carries an error or its code is not 200.
# Any arguments are accepted and ignored.
def failed?(*)
  @curl_res.error || @curl_res.code != 200
end
#find(selector_or_nodes, options = {}, &foreach) ⇒ Object Also known as: all
376 377 378 379 380 381 382 383 384 385 386 387 |
# File 'lib/rhack/page.rb', line 376
# Finds all matching nodes.
#
# selector_or_nodes - an already found node collection (used as is) or a
#                     selector passed to __find
# options           - :preprocess is handed to preprocess_search_results; the
#                     whole hash is handed to node_is_missing! when nothing is found
# foreach           - optional block run for each preprocessed result
#
# Returns the preprocessed results (or the value of #each when a block is
# given); when nothing matched, calls node_is_missing! and returns the empty
# lookup result.
def find(selector_or_nodes, options={}, &foreach)
  # NOTE(review): core Object#is_a? takes a single class; the two-argument
  # call below presumably relies on a project extension - confirm.
  preresult = selector_or_nodes.is_a?(LibXML::XML::XPath::Object, Array) ? selector_or_nodes : __find(selector_or_nodes)
  if preresult.size > 0
    preresult = preprocess_search_results(preresult, options[:preprocess])
    foreach ? preresult.each(&foreach) : preresult
  else
    node_is_missing!(selector_or_nodes, options)
    preresult
  end
end
#flatten_dict(hash) ⇒ Object
290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 |
# File 'lib/rhack/page.rb', line 290
# Expands compound keys into a flat, symbol-keyed hash.
#
# A String key containing spaces is split on spaces, and an Array key is
# taken element by element; in both cases the i-th part is paired with
# value[i]. Any other key is simply symbolized and keeps its value.
def flatten_dict(hash)
  hash.each_with_object({}) do |(key, value), flat|
    parts =
      if key.is(String) && key[' ']
        key.split(' ')
      elsif key.is(Array)
        key
      end

    if parts
      parts.each_with_index { |part, position| flat[part.to_sym] = value[position] }
    else
      flat[key.to_sym] = value
    end
  end
end
#form(form = 'form', hash = {}, opts = {}) ⇒ Object
FORMS #
445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 |
# File 'lib/rhack/page.rb', line 445
# Builds the request arguments needed to submit a form found on this page.
#
# form - a selector String, :self (the form whose action equals this page's
#        path), or an already found form node
# hash - field values merged over the form's own inputs
# opts - passed through unchanged as the last element of the result
#
# Returns [fields, multipart_flag, action, opts] for a POST form and
# [action_with_query, opts] otherwise.
# Raises LibXML::XML::Error when a selector does not resolve to a <form>.
def form(form='form', hash={}, opts={})
  form = "[action=#{@loc.path.inspect}]" if form == :self
  if form.is String
    form_node = at form
    raise LibXML::XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
  else
    form_node = form
  end
  hash = form_node.inputs_all.merge!(hash)
  action = (form_node.action || @loc.path)
  # A <form> without a method attribute is submitted with GET, so a missing
  # attribute must not blow up on nil.downcase.
  if form_node['method'].to_s.downcase == 'post'
    [hash, form_node.enctype =~ /multipart/, action, opts]
  else
    action = "#{action}#{action['?'] ? '&' : '?'}#{hash.urlencode}" if hash.b
    [action, opts]
  end
end
#get_link(selector_or_node = 'a', options = {}, &onfound) ⇒ Object Also known as: link, get_href
415 416 417 418 419 420 421 422 423 424 425 426 |
# File 'lib/rhack/page.rb', line 415
# Returns the absolute href of the node matched by +selector_or_node+.
#
# When the matched node has no href of its own, the href of an <a> found
# inside it is used instead. If an +onfound+ block is given and an href was
# found, the block is called with it and its value is returned.
#
# NOTE(review): the `expand_link` call was missing from the extracted listing
# (it read `if href href end`); restored by analogy with the legacy
# implementation quoted in this page's docs - confirm against the source.
def get_link(selector_or_node='a', options={}, &onfound)
  at(selector_or_node, options.merge(:preprocess => lambda {|node|
    unless href = node.href
      if node = node.find('a')
        href = node.href
      end
    end
    if href
      expand_link href
    end
  })) {|href| onfound && href ? onfound.call(href) : href}
end
#get_links(links = 'a') ⇒ Object Also known as: get_hrefs, links
def get_src(link='img')
begin
link = at(link) && at(link).src if link.is String
rescue LibXML::XML::Error; nil
end
expand_link link if link
end
497 498 499 500 501 502 503 504 |
# File 'lib/rhack/page.rb', line 497
# Returns the unique absolute hrefs for +links+.
#
# A String is treated as a selector: hrefs of the matched nodes are
# collected, falling back to the <a> descendants of the matches when that
# yields nothing. If the lookup raises LibXML::XML::Error, the string itself
# is treated as the single link. A non-String argument is mapped as given.
#
# NOTE(review): the extracted listing had a no-op `links.map {|link| link}`;
# the `expand_link` call is restored here - confirm against the source.
def get_links(links='a')
  begin
    links = find(links).map {|e| e.href}.b || find(links+'//a').map {|e| e.href} if links.is String
  rescue LibXML::XML::Error
    links = [links]
  end
  links.map {|link| expand_link link}.uniq
end
#get_src(selector_or_node = 'img', options = {}, &onfound) ⇒ Object Also known as: src
406 407 408 409 410 411 412 |
# File 'lib/rhack/page.rb', line 406
# Returns the absolute src of the node matched by +selector_or_node+.
# If an +onfound+ block is given and a src was found, the block is called
# with it and its value is returned.
#
# NOTE(review): the `expand_link` call was missing from the extracted listing
# (it read `if src = node.src src end`); restored by analogy with the legacy
# implementation quoted in this page's docs - confirm against the source.
def get_src(selector_or_node='img', options={}, &onfound)
  at(selector_or_node, options.merge(:preprocess => lambda {|node|
    if src = node.src
      expand_link src
    end
  })) {|src| onfound && src ? onfound.call(src) : src}
end
#get_srcs(links = 'img') ⇒ Object Also known as: srcs
TODO: make into same form as #get_src and #map
479 480 481 482 483 484 485 486 |
# File 'lib/rhack/page.rb', line 479
# Returns the unique absolute src values for +links+.
#
# A String is treated as a selector whose matches' src attributes are
# collected. If the lookup raises LibXML::XML::Error, the string itself is
# treated as the single link. A non-String argument is mapped as given.
#
# TODO: make into same form as #get_src and #map
#
# NOTE(review): the extracted listing had a no-op `links.map {|link| link}`;
# the `expand_link` call is restored here - confirm against the source.
def get_srcs(links='img')
  begin
    links = find(links).map {|e| e.src} if links.is String
  rescue LibXML::XML::Error
    links = [links]
  end
  links.map {|link| expand_link link}.uniq
end
#inspect ⇒ Object
73 74 75 76 77 78 79 80 |
# File 'lib/rhack/page.rb', line 73
# Short human-readable summary of the page.
#
# Pages parsed as json / url params show their size (or "failed to parse")
# and the kind of data. Other pages show "(empty)", or the response header
# (for a failed page) / the short title in guillemets followed by the size,
# plus " js enabled" when both a JS runtime and a document are present.
def inspect
  byte_count = size
  class_name = self.class.name

  if @json || @hash
    payload = @data ? byte_count.bytes : 'failed to parse'
    flavour = @json ? 'json' : 'url params'
    return "<##{class_name} (#{payload}) #{flavour}>"
  end

  description =
    if byte_count == 0
      '(empty)'
    else
      heading = @failed ? @curl_res.header : '«' + title(false) + '»'
      "#{heading} (#{byte_count.bytes})"
    end
  js_note = ' js enabled' if @js && @doc
  "<##{class_name} #{description}#{js_note}>"
end
#load_scripts(frame) ⇒ Object
def get_link(link='a')
begin
link = at(link) && (at(link).href || at(link+'//a').href) if link.is String
rescue XML::Error; nil
end
expand_link link if link
end
516 517 518 |
# File 'lib/rhack/page.rb', line 516
# Fetches every external script of the page through frame.get_cached and
# evaluates each one. Without a frame nothing happens and the falsy +frame+
# value itself is returned.
def load_scripts(frame)
  return frame unless frame

  frame.get_cached(*get_srcs("script[src]")).each { |script| eval_string script }
end
#map(selector_or_nodes, options = {}, &mapper) ⇒ Object
430 431 432 433 434 435 436 |
# File 'lib/rhack/page.rb', line 430
# Finds all matching nodes and transforms each with the +mapper+ block
# (passed to #find as the :preprocess option).
#
# Unless options[:compact] is explicitly false, the result is converted to
# an Array and nils are removed.
def map(selector_or_nodes, options={}, &mapper)
  mapping = find(selector_or_nodes, options.merge(:preprocess => mapper))
  unless options[:compact] == false
    mapping = mapping.to_a.compact
  end
  mapping
end
#map_json(selector_or_nodes, options = {}, &mapper) ⇒ Object
438 439 440 |
# File 'lib/rhack/page.rb', line 438
# Same as #map, with the result wrapped by JsonString().
def map_json(selector_or_nodes, options={}, &mapper)
  JsonString map(selector_or_nodes, options, &mapper)
end
#parse(opts = {}) ⇒ Object
Override this in a subclass. MUST return self if successful; MAY return false otherwise.
105 106 107 108 109 110 111 112 113 114 115 116 |
# File 'lib/rhack/page.rb', line 105
# Parses the response body with the parser selected by +opts+.
#
# The first truthy option among :json, :hash and :xml picks the parser;
# with none of them set the body is parsed as HTML. Always returns self.
def parse(opts={})
  requested = [:json, :hash, :xml].find { |kind| opts[kind] }
  case requested
  when :json then parse_json(opts)
  when :hash then parse_hash(opts)
  when :xml  then parse_xml(opts)
  else            parse_html(opts)
  end
  self
end
#process(curl, opts = {}) ⇒ Object
#process can be overridden in Page subclasses. Frame does not care about the value returned by #process unless it is nil or false.
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
# File 'lib/rhack/page.rb', line 191
# Takes over a finished Curl request: records its effective location, the
# handle and its response, then parses the body.
#
# On a failed response failed! is called first; then, if retry? says so, the
# request is retried and false is returned. False is also returned when
# failed! returned a falsy value. Otherwise the result of #parse is returned.
def process(curl, opts={})
  @loc = curl.last_effective_url.parse(:uri)
  @curl = curl
  @curl_res = curl.res

  if failed?
    proceed_anyway = failed! # false by default
    if retry?
      curl.retry!
      return false # callback will not proceed
    end
    return false unless proceed_anyway # neither callback nor retry will proceed
  end

  L.debug "#{@loc.fullpath} -> #{@curl_res}"
  parse(opts)
end
#retry? ⇒ Boolean
override this in a subclass
98 99 100 |
# File 'lib/rhack/page.rb', line 98 def retry?(*) false end |
#size ⇒ Object
63 64 65 66 67 68 69 70 71 |
# File 'lib/rhack/page.rb', line 63
# Size of the page content: the body size while @data is nil (0 without a
# body), 0 when @data is false, otherwise the size of @data's #inspect string.
def size
  return (@body || '').size if @data.nil?
  return 0 if @data == false

  @data.inspect.size
end
#submit(form, frame, hash = {}, opts = {}, &callback) ⇒ Object
462 463 464 465 466 467 468 469 470 471 472 473 |
# File 'lib/rhack/page.rb', line 462
# Submits a form of this page through +frame+ and returns what frame.exec
# returns.
#
# The Referer header defaults to this page's URL. When the frame is static
# and the form points at another target, the frame is retargeted for the
# request and forcibly pointed back at its previous target afterwards.
def submit(form, frame, hash={}, opts={}, &callback)
  (opts[:headers] ||= {}).Referer ||= @loc.href if @loc
  query = form(form, hash, opts)

  original_target = frame.loc.href
  requested_target = query[2] || query[0]
  retargeted = frame.static && original_target != requested_target

  frame.retarget requested_target if retargeted
  result = frame.exec(*query, &callback)
  frame.retarget original_target, :forced if retargeted
  result
end
#text(selector_or_node, options = {}) ⇒ Object
FINDERS PREPROCESSORS #
393 394 395 396 397 398 |
# File 'lib/rhack/page.rb', line 393
# Returns the stripped text of the node found by #at, or the value of the
# given block called with that text. Returns nil when no node was found.
def text(selector_or_node, options={})
  if node = at(selector_or_node, options)
    txt = node.text.strip
    block_given? ? yield(txt) : txt
  end
end
#texts(hash, options = {}) ⇒ Object
400 401 402 403 404 |
# File 'lib/rhack/page.rb', line 400
# Replaces every value of +hash+ (a selector or node) with the result of
# #text for it, passing +options+ along.
def texts(hash, options={})
  hash.map_values {|selector_or_node|
    text(selector_or_node, options)
  }
end
#to_html ⇒ Object
245 246 247 |
# File 'lib/rhack/page.rb', line 245 def to_html @doc = @body.to_html end |
#to_xml ⇒ Object
249 250 251 |
# File 'lib/rhack/page.rb', line 249 def to_xml @doc = @body.to_xml end |
#url ⇒ Object Also known as: href
86 87 88 |
# File 'lib/rhack/page.rb', line 86 def url @loc.href end |
#utf! ⇒ Object
82 83 84 |
# File 'lib/rhack/page.rb', line 82 def utf! @body.utf! end |