Class: RHACK::Page

Inherits:
Object show all
Defined in:
lib/rhack/page.rb

Overview

Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, … ) ) => Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, …

Direct Known Subclasses

CodeIndiffirentPage, HashPage, HtmlPage, JsonPage, XmlPage

Constant Summary collapse

@@ignore =

for johnson

/google|_gat|tracker|adver/i

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(obj = '', loc = Hash.new(''), js = is_a?(HtmlPage)&&(Johnson::Runtime.browser||Johnson::Runtime.new)) ⇒ Page

Frame calls it with no args



46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/rhack/page.rb', line 46

# Builds a page either from a ready body or from a finished curl request.
#
# obj - a body, stored as-is in @body; or a Curl::Easy / Scout, whose
#       response is handed to #process.
# loc - location of the page; anything that is not a Hash is converted with
#       `parse :uri`. When obj is a curl handle this argument is reused as
#       the options passed to #process.
# js  - JavaScript runtime, stored in @js.
#
# Frame calls it with no args.
def initialize(obj='', loc=Hash.new(''), js=is_a?(HtmlPage)&&(Johnson::Runtime.browser||Johnson::Runtime.new))
  loc = loc.parse(:uri) unless loc.is(Hash)
  @js = js
  unless obj.is(Curl::Easy) || obj.kinda(Scout)
    @body = obj
    @loc = loc
    return
  end
  easy = obj.kinda(Scout) ? obj.http : obj
  # Passing loc itself would give #process an options object that answers ''
  # to any key (see the default value of loc), hence `.b || {}`.
  process(easy, loc.b || {})
end

Instance Attribute Details

#bodyObject (readonly) Also known as: html

Returns the value of attribute body.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns @body. #initialize stores its first argument there when that
# argument is not a curl handle. Listed as read-only, also known as `html`.
def body
  @body
end

#curlObject (readonly)

Returns the value of attribute curl.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns @curl, the curl handle that #process was called with
# (nil until #process has run). Listed as read-only.
def curl
  @curl
end

#curl_resObject (readonly)

Returns the value of attribute curl_res.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns @curl_res, which #process sets to `curl.res`; #failed? reads its
# #error and #code. Listed as read-only.
def curl_res
  @curl_res
end

#dataObject (readonly) Also known as: hash

Returns the value of attribute data.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns @data. #size treats nil as "no parsed data" and false as "nothing
# usable"; presumably filled in by the parse_* methods — confirm there.
# Listed as read-only, also known as `hash`.
def data
  @data
end

#docObject (readonly)

Returns the value of attribute doc.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns @doc, the document produced from the body by #to_html or #to_xml.
# Listed as read-only.
def doc
  @doc
end

#failedObject (readonly)

Returns the value of attribute failed.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns @failed, the flag consulted by #title and #inspect.
# NOTE(review): where it is set is not visible in this listing.
# Listed as read-only.
def failed
  @failed
end

#jsObject (readonly)

Returns the value of attribute js.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns @js, the JavaScript runtime given to #initialize or created lazily
# by #eval_string. Listed as read-only.
def js
  @js
end

#locObject (readonly)

Returns the value of attribute loc.



36
37
38
# File 'lib/rhack/page.rb', line 36

# Returns @loc, the page location: set by #initialize for a ready body, or by
# #process from the last effective URL of the curl handle. Listed as read-only.
def loc
  @loc
end

#resObject

Result of the page processing performed in the frame context.



41
42
43
# File 'lib/rhack/page.rb', line 41

# Returns @res, described in the attribute summary as the result of page
# processing performed in the frame context.
def res
  @res
end

#title(full = true) ⇒ Object



253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
# File 'lib/rhack/page.rb', line 253

# Returns the title of the page, or the page address when a title cannot be
# taken from the body (parsed data present, request failed, or blank body).
#
# full - true:  the title of the parsed document; the body is parsed with
#               #to_html first if @doc is not defined yet. A document without
#               a title gets the page address as its title, which is also
#               prepended to <head> as a <title> node when <head> exists.
#        false: a shortened form of the full title (kept in @short_title)
#               when the title is longer than 40 characters, otherwise the
#               title itself.
def title(full=true)
  return @loc.href unless @data.nil? && !@failed && @body.b

  if full
    to_html unless defined? @doc
    return @title = @doc.title if @doc.title.b

    @title = @loc.href
    @doc.at('head').prepend XML::Node('title', @title) if @doc.at('head')
    return @title
  end

  # the short form is derived from the full one, so make sure it exists
  title true unless defined? @title
  @short_title =
    if RUBY_VERSION < '1.9' and @title.cyr? and UTF2ANSI[@title].size > 40
      # pre-1.9 strings are byte-based: measure and cut in the ANSI encoding
      ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/][0..38]]+''
    elsif @title.size > 40
      @title[/.{1,30}\S*/][0..38]+''
    else
      @title
    end
end

Instance Method Details

#at(selector_or_node, options = {}) ⇒ Object Also known as: first



363
364
365
366
367
368
369
370
371
372
373
# File 'lib/rhack/page.rb', line 363

# Finds a single node.
#
# selector_or_node - a LibXML::XML::Node (used as it is) or a selector that
#                    is resolved with #__at.
# options          - :preprocess is handed to #preprocess_search_result; the
#                    whole hash goes to #node_is_missing! when nothing is found.
#
# Returns the preprocessed result, or the value of the block when one is
# given. When nothing is found, #node_is_missing! is called and the empty
# lookup result (nil/false) is returned.
def at(selector_or_node, options={})
  found = nil
  if selector_or_node
    found = selector_or_node.is_a?(LibXML::XML::Node) ? selector_or_node : __at(selector_or_node)
  end

  unless found
    node_is_missing!(selector_or_node, options)
    return found
  end

  found = preprocess_search_result(found, options[:preprocess])
  block_given? ? yield(found) : found
end

#dict(hash) ⇒ Object

Hook that lets a hash definition be written as evenly aligned lines (in the author's Verdana 10px font), e.g.

    dict key1: value1, ...
         key2: value2, ...


285
286
287
# File 'lib/rhack/page.rb', line 285

# Returns the argument itself when it is already a Hash; anything else is
# converted with Hash[].
def dict(hash)
  return hash if hash.is_a?(Hash)
  Hash[hash]
end

#empty?Boolean

Returns:

  • (Boolean)


59
60
61
# File 'lib/rhack/page.rb', line 59

# True when the page has neither parsed data nor a non-blank body.
def empty?
  !(@data || @body.b)
end

#eval_js(frame = nil) ⇒ Object



212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/rhack/page.rb', line 212

# Runs the scripts of the page in the JavaScript runtime.
#
# The page location is published to the runtime first (document.location,
# document.URL, document.domain, ...). Then every <script> node is visited:
# an inline script is evaluated, whatever it left in js[:write_output] is
# inserted after the node, and the node is removed; a script without inline
# text but with a src is fetched through frame.get_cached and evaluated,
# provided a frame was given.
#
# frame - object responding to #get_cached, used to load external scripts;
#         external scripts are skipped when it is nil.
def eval_js(frame=nil)
  eval_string "document.location = window.location = #{@loc.to_json};
  document.URL = document.baseURI = document.documentURI = location.href;
  document.domain = location.host;"
  find("script").each {|n|
    L.debug n.text.strip
    if text = n.text.strip.b
      # reset the buffer so that only this script's output is collected
      # (presumably filled by document.write — confirm in the runtime setup)
      js[:write_output] = ''
      eval_string text
      if res = js[:write_output].b then n.after res end
      n.remove!
    elsif frame and n.src
      eval_string frame.get_cached expand_link n.src
    end
  }
end

#eval_string(str) ⇒ Object



229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# File 'lib/rhack/page.rb', line 229

# Evaluates a string of JavaScript in the runtime of this page, creating a
# Johnson::Runtime first when the page has none yet.
#
# str - JavaScript source.
#
# Returns whatever @js.evaluate returns. A Johnson::Error is not propagated:
# its message is logged as a warning and the source is sent to the debug log.
def eval_string(str)
  @js ||= Johnson::Runtime.new
  L.debug "#{@js} evaluating in #{Thread.current}\nmain: #{Thread.main}; carier: #{Curl.carier_thread}"
  begin
    @js.evaluate(str)
  rescue Johnson::Error => e
    L.warn e.message
    L.debug {
      # highlight the identifier named by the error inside the source
      # NOTE(review): hl! looks like an in-place modification of str — confirm
      if m = e.message.match(/(\w+) is undefined|([\w.]+) is not a function/)
        L.clr.hl! str, /\b#{m[1] || m[2]}\b/
      end
      "\n\t#{str}"
    }
  end
end

Turns a link that is relative to this page into an absolute URL.



309
310
311
312
313
314
315
316
317
318
# File 'lib/rhack/page.rb', line 309

# Turns a link found on this page into an absolute URL, resolving it
# against @loc.
#
# link - String taken from the page (href, src, form action, ...).
#
# Returns the link unchanged when it already starts with a scheme; otherwise
# the link joined with the protocol, root, path or directory of the current
# location, depending on how it starts (`//`, `/`, `?`, `#`, or a relative path).
def expand_link(link)
  case link
    when /^\w+:\/\// then link
    # A scheme without an authority part (mailto:, javascript:, tel:, data:)
    # is already absolute and must not be glued onto the page directory.
    when /\A[a-z][a-z0-9+.\-]*:/i then link
    when /^\/\// then @loc.protocol + ':' + link
    when /^\// then @loc.root + link
    when /^\?/ then File.join(@loc.root, @loc.path) + link
    when /^#/ then File.join(@loc.root, @loc.fullpath) + link
    else File.join @loc.root, File.dirname(@loc.path), link
  end
end

#failed?Boolean

override this in a subclass

Returns:

  • (Boolean)


93
94
95
# File 'lib/rhack/page.rb', line 93

# Tells whether the request behind this page failed: truthy when the curl
# result carries an error or its code is not 200. Arguments are ignored.
#
# Override this in a subclass.
def failed?(*)
  @curl_res.error || @curl_res.code != 200
end

#find(selector_or_nodes, options = {}, &foreach) ⇒ Object Also known as: all



376
377
378
379
380
381
382
383
384
385
386
387
# File 'lib/rhack/page.rb', line 376

# Finds all matching nodes.
#
# selector_or_nodes - an Array or a LibXML::XML::XPath::Object (used as it
#                     is), or a selector that is resolved with #__find.
# options           - :preprocess is handed to #preprocess_search_results;
#                     the whole hash goes to #node_is_missing! when nothing
#                     is found.
# foreach           - optional block called with every preprocessed result.
#
# Returns the preprocessed results (or what their #each returns when a block
# is given). When nothing is found, #node_is_missing! is called and the empty
# collection is returned.
def find(selector_or_nodes, options={}, &foreach)
  # Core Object#is_a? accepts exactly one class, so every accepted
  # collection type is tested on its own.
  ready_made = selector_or_nodes.is_a?(Array) ||
    selector_or_nodes.is_a?(LibXML::XML::XPath::Object)
  preresult = ready_made ? selector_or_nodes : __find(selector_or_nodes)

  if preresult.size > 0
    preresult = preprocess_search_results(preresult, options[:preprocess])
    foreach ? preresult.each(&foreach) : preresult
  else
    node_is_missing!(selector_or_nodes, options)
    preresult
  end
end

#flatten_dict(hash) ⇒ Object

Maps {'firstname lastname' => tuple} into {:firstname => tuple[0], :lastname => tuple[1]}.



290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# File 'lib/rhack/page.rb', line 290

# Spreads compound keys over their values.
#
# A String key containing spaces is split on them, an Array key is taken
# element by element; the n-th name gets the n-th element of the value.
# Any other key keeps its whole value. All keys end up as symbols.
#
# Returns a new Hash.
def flatten_dict(hash)
  result = {}
  hash.each {|key, value|
    # names packed into a compound key, or nil for a plain key
    names =
      if key.is(String) && key[' '] then key.split(' ')
      elsif key.is(Array) then key
      end
    if names
      names.each_with_index {|name, idx| result[name.to_sym] = value[idx] }
    else
      result[key.to_sym] = value
    end
  }
  result
end

#form(form = 'form', hash = {}, opts = {}) ⇒ Object

FORMS #



445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
# File 'lib/rhack/page.rb', line 445

# Builds the request arguments for submitting a form of this page.
#
# form - a selector String, :self (the form whose action is the path of this
#        page), or a form node.
# hash - values merged over the inputs found in the form.
# opts - passed through as the last element of the result.
#
# Returns [hash, multipart_match, action, opts] for a POST form, or
# [action, opts] with the values url-encoded into the action otherwise.
# Raises LibXML::XML::Error when a selector does not lead to a <form> node.
def form(form='form', hash={}, opts={})
  form = "[action=#{@loc.path.inspect}]" if form == :self
  if form.is String
    form_node = at form
    raise LibXML::XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
  else
    form_node = form
  end
  hash = form_node.inputs_all.merge!(hash)
  # a form without an action submits to the page itself
  action = expand_link(form_node.action || @loc.path)
  # A form without a method attribute is submitted with GET, as in HTML;
  # #to_s keeps the missing attribute (nil) from raising on #downcase.
  if form_node['method'].to_s.downcase == 'post'
    [hash, form_node.enctype =~ /multipart/, action, opts]
  else
    action = "#{action}#{action['?'] ? '&' : '?'}#{hash.urlencode}" if hash.b
    [action, opts]
  end
end


415
416
417
418
419
420
421
422
423
424
425
426
# File 'lib/rhack/page.rb', line 415

# Finds a node with #at and returns the absolute URL of its link.
#
# The href of the node itself is used; when it has none, the href of what
# `node.find('a')` returns is tried instead.
# NOTE(review): this expects the result of node.find('a') to respond to
# #href — confirm that it is a single node here and not a collection.
#
# selector_or_node, options - passed to #at (options gain :preprocess).
# onfound                   - optional block called with the URL, only when
#                             a URL was found.
#
# Returns the expanded URL (or the value of the block), nil without a link.
def get_link(selector_or_node='a', options={}, &onfound)
  href_of = lambda {|node|
    link = node.href
    if !link and (inner = node.find('a'))
      link = inner.href
    end
    expand_link link if link
  }
  at(selector_or_node, options.merge(:preprocess => href_of)) {|url|
    onfound && url ? onfound.call(url) : url
  }
end

def get_src(link=‘img’)

begin
  link = at(link) && at(link).src if link.is String
rescue LibXML::XML::Error; nil
end
expand_link link if link

end



497
498
499
500
501
502
503
504
# File 'lib/rhack/page.rb', line 497

# Collects absolute, de-duplicated URLs.
#
# links - a selector String: hrefs of the found nodes are taken and, when
#         that yields nothing, hrefs of the `<selector>//a` nodes; if the
#         lookup raises LibXML::XML::Error the string itself is treated as
#         the only link. Anything else is taken as a list of links.
#
# Returns an Array of expanded links without duplicates.
def get_links(links='a')
  begin
    if links.is String
      selector = links
      own_hrefs = find(selector).map {|node| node.href }.b
      links = own_hrefs || find(selector+'//a').map {|node| node.href }
    end
  rescue LibXML::XML::Error
    links = [links]
  end
  links.map {|link| expand_link link }.uniq
end

#get_src(selector_or_node = 'img', options = {}, &onfound) ⇒ Object Also known as: src



406
407
408
409
410
411
412
# File 'lib/rhack/page.rb', line 406

# Finds a node with #at and returns the absolute URL of its src.
#
# selector_or_node, options - passed to #at (options gain :preprocess).
# onfound                   - optional block called with the URL, only when
#                             the node has a src.
#
# Returns the expanded URL (or the value of the block), nil without a src.
def get_src(selector_or_node='img', options={}, &onfound)
  expanded_src = lambda {|node|
    source = node.src
    expand_link source if source
  }
  at(selector_or_node, options.merge(:preprocess => expanded_src)) {|url|
    onfound && url ? onfound.call(url) : url
  }
end

#get_srcs(links = 'img') ⇒ Object Also known as: srcs

TODO: make into same form as #get_src and #map



479
480
481
482
483
484
485
486
# File 'lib/rhack/page.rb', line 479

# Collects absolute, de-duplicated src URLs.
#
# links - a selector String: the src of every found node is taken; if the
#         lookup raises LibXML::XML::Error the string itself is treated as
#         the only link. Anything else is taken as a list of links.
#
# Returns an Array of expanded links without duplicates.
def get_srcs(links='img')
  sources = links
  begin
    sources = find(links).map {|node| node.src } if links.is(String)
  rescue LibXML::XML::Error
    sources = [links]
  end
  sources.map {|source| expand_link source }.uniq
end

#inspectObject



73
74
75
76
77
78
79
80
# File 'lib/rhack/page.rb', line 73

def inspect
  sz = size
  if @json or @hash
    "<##{self.class.name} (#{@data ? sz.bytes : 'failed to parse'}) #{@json ? 'json' : 'url params'}>"
  else
    "<##{self.class.name} #{sz == 0 ? '(empty)' : "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{sz.bytes})"}#{' js enabled' if @js and @doc}>"
  end
end

#load_scripts(frame) ⇒ Object

def get_link(link=‘a’)

begin
  link = at(link) && (at(link).href || at(link+'//a').href) if link.is String
rescue XML::Error; nil
end
expand_link link if link

end



516
517
518
# File 'lib/rhack/page.rb', line 516

# Loads every external script of the page (script[src]) through the cache of
# the frame and evaluates each of them.
#
# frame - object responding to #get_cached; nothing is done without it.
#
# Returns the falsy frame as it is, otherwise what #each returns.
def load_scripts(frame)
  return frame unless frame
  sources = get_srcs("script[src]")
  frame.get_cached(*sources).each {|script| eval_string script }
end

#map(selector_or_nodes, options = {}, &mapper) ⇒ Object



430
431
432
433
434
435
436
# File 'lib/rhack/page.rb', line 430

# Finds nodes with #find, using the block as the :preprocess step.
#
# options - passed to #find; unless :compact is exactly false, the result is
#           turned into an Array and nils are dropped.
def map(selector_or_nodes, options={}, &mapper)
  found = find(selector_or_nodes, options.merge(:preprocess => mapper))
  return found if options[:compact] == false
  found.to_a.compact
end

#map_json(selector_or_nodes, options = {}, &mapper) ⇒ Object



438
439
440
# File 'lib/rhack/page.rb', line 438

# Same as #map, with the result wrapped by JsonString.
def map_json(selector_or_nodes, options={}, &mapper)
  mapped = map(selector_or_nodes, options, &mapper)
  JsonString(mapped)
end

#parse(opts = {}) ⇒ Object

Override this in a subclass. It MUST return self if successful and MAY return false otherwise.



105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/rhack/page.rb', line 105

# Parses the body with the parser chosen by the first truthy option among
# :json, :hash and :xml; html is the default.
#
# Override this in a subclass: it MUST return self if successful and MAY
# return false otherwise.
def parse(opts={})
  requested = [:json, :hash, :xml].find {|kind| opts[kind] }
  send("parse_#{requested || :html}", opts)
  self
end

#process(curl, opts = {}) ⇒ Object

We can then alter #process in Page subclasses. Frame does not care about the value returned by #process unless it is nil or false.



191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/rhack/page.rb', line 191

# Takes over a finished curl request: remembers its location, handle and
# result, handles a failed request, and parses the response.
#
# curl - the curl handle; must respond to #last_effective_url, #res, #retry!.
# opts - options passed on to #parse.
#
# Returns false when the request failed and is either retried or not allowed
# to proceed; otherwise the result of #parse.
def process(curl, opts={})
  @loc = curl.last_effective_url.parse :uri
  @curl = curl
  @curl_res = curl.res
  
  if failed?
    should_proceed = failed! # false by default
    if retry?
      curl.retry!
      return false # the callback will not run; the request is retried
    end
    unless should_proceed
      return false # neither the callback nor a retry will happen
    end
  end
  
  L.debug "#{@loc.fullpath} -> #{@curl_res}"
  parse(opts)
end

#retry?Boolean

override this in a subclass

Returns:

  • (Boolean)


98
99
100
# File 'lib/rhack/page.rb', line 98

# Tells #process whether a failed request should be retried.
# Always false here and any arguments are ignored; override this in a subclass.
def retry?(*)
  false
end

#sizeObject



63
64
65
66
67
68
69
70
71
# File 'lib/rhack/page.rb', line 63

# Size of the page content: the size of the body while there is no parsed
# data (@data is nil), 0 when @data is false, otherwise the length of the
# inspected data.
def size
  return (@body || '').size if @data.nil?
  return 0 if @data == false
  @data.inspect.size
end

#submit(form, frame, hash = {}, opts = {}, &callback) ⇒ Object



462
463
464
465
466
467
468
469
470
471
472
473
# File 'lib/rhack/page.rb', line 462

# Submits a form of this page through a frame.
#
# form     - selector, :self or form node, as accepted by #form.
# frame    - frame that executes the request; a static frame is temporarily
#            retargeted when the form points somewhere else.
# hash     - values merged over the inputs of the form.
# opts     - request options; the Referer header defaults to the address of
#            this page when the page has a location.
# callback - passed on to frame.exec.
#
# Returns what frame.exec returns.
def submit(form, frame, hash={}, opts={}, &callback)
  # relies on the headers hash answering to #Referer / #Referer=, which is
  # not core Ruby (presumably a project extension of Hash)
  (opts[:headers] ||= {}).Referer ||= @loc.href if @loc
  query = form(form, hash, opts)

  # the target is the 3rd element of a POST query, the 1st of a GET one
  curr_target, new_target = frame.loc.href, (query[2] || query[0])
  need_retargeting = frame.static && curr_target != new_target
  frame.retarget new_target if need_retargeting
  begin
    frame.exec(*query, &callback)
  ensure
    # restore the original target even when the request raises, so a failed
    # submit does not leave a static frame pointed at the form's target
    frame.retarget curr_target, :forced if need_retargeting
  end
end

#text(selector_or_node, options = {}) ⇒ Object

FINDERS PREPROCESSORS #



393
394
395
396
397
398
# File 'lib/rhack/page.rb', line 393

# Finds a node with #at and returns its text with surrounding whitespace
# stripped, or the value of the block called with that text.
# Returns nil when the node is not found.
def text(selector_or_node, options={})
  node = at(selector_or_node, options)
  return unless node
  stripped = node.text.strip
  block_given? ? yield(stripped) : stripped
end

#texts(hash, options = {}) ⇒ Object



400
401
402
403
404
# File 'lib/rhack/page.rb', line 400

# Replaces every value of the hash (a selector or a node) with the text that
# #text finds for it, using the same options for all of them.
def texts(hash, options={})
  hash.map_values do |target|
    text(target, options)
  end
end

#to_htmlObject



245
246
247
# File 'lib/rhack/page.rb', line 245

# Converts the body with its #to_html and keeps the result in @doc.
# Returns that document.
def to_html
  @doc = @body.to_html
end

#to_xmlObject



249
250
251
# File 'lib/rhack/page.rb', line 249

# Converts the body with its #to_xml and keeps the result in @doc.
# Returns that document.
def to_xml
  @doc = @body.to_xml
end

#urlObject Also known as: href



86
87
88
# File 'lib/rhack/page.rb', line 86

# Returns the href of the page location (@loc). Also known as `href`.
def url
  @loc.href
end

#utf!Object



82
83
84
# File 'lib/rhack/page.rb', line 82

# Calls #utf! on the body and returns its result.
# NOTE(review): presumably converts the body to UTF-8 in place — confirm
# against the String extension that defines #utf!.
def utf!
  @body.utf!
end