Class: RoCrawler::Base
- Inherits:
-
Object
- Object
- RoCrawler::Base
- Includes:
- RoSupport::Debug
- Defined in:
- lib/ro_crawler/base.rb
Instance Method Summary collapse
- #browser_close ⇒ Object
- #crawler(opt = {}) ⇒ Object
- #get_attrs_in(tags, *attr_names) ⇒ Object
- #get_contents ⇒ Object
- #get_home_url(url) ⇒ Object
- #get_html_from(url) ⇒ Object
- #get_link_titles ⇒ Object
- #get_tags_attrs_from(url, tags_selector, *attr_names) ⇒ Object
- #get_tags_from(url, selector) ⇒ Object
- #handle(results) ⇒ Object
-
#handle_accident_error {|@b| ... } ⇒ Object
Handle accidental errors, e.g.
- #handler(&blk) ⇒ Object
- #local? ⇒ Boolean
- #open_browser(driver) ⇒ Object
- #proxy?(port) ⇒ Boolean
- #spider(url, anchor_selector, intr_selector) ⇒ Object
Instance Method Details
#browser_close ⇒ Object
124 125 126 |
# File 'lib/ro_crawler/base.rb', line 124 def browser_close @b.close if @b.respond_to?(:close) end |
#crawler(opt = {}) ⇒ Object
49 50 51 52 53 54 55 56 57 |
# File 'lib/ro_crawler/base.rb', line 49 def crawler(opt={}) if local? if opt[:driver] @b = open_browser opt[:driver] else @b = open_browser :phantomjs end end end |
#get_attrs_in(tags, *attr_names) ⇒ Object
90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
# File 'lib/ro_crawler/base.rb', line 90 def get_attrs_in(tags, *attr_names) result = [] tags.each do |tag| attr_value = [] judge = lambda do |attr_names| attr_names.each do |attr_name| if attr_name == 'text' attr_value << tag.text elsif attr_name == 'inner_html' attr_value << tag.inner_html else attr_value << tag.attribute(attr_name).value end end end if attr_names.length > 1 judge.call(attr_names) elsif attr_names.length == 1 attr_name = attr_names judge.call(attr_name) attr_value = attr_value.join end result << attr_value end eval %Q(return #{result.to_args}) end |
#get_contents ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/ro_crawler/base.rb', line 19 def get_contents ro_raise(err "@link_titles is nil", output: ['@url']) if @link_titles.nil? @link_title_contents = @link_titles.dup @link_title_contents.each do |link_content| if link_content[0][/http/] link = link_content[0] else link = "#{@home_url}#{link_content[0]}" end link_content << intr = get_tags_attrs_from(link, @intr_selector, 'text') unless intr.is_a? String raise_log 'intr must be a string', 'ro_crawler_base.log' end link_content end end |
#get_home_url(url) ⇒ Object
136 137 138 |
# File 'lib/ro_crawler/base.rb', line 136 def get_home_url(url) url[/(http(s)?\:\/\/)(www\.)?.+\.((com)|(org)|(info)|(me)|(net)|(cn))/] end |
#get_html_from(url) ⇒ Object
59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/ro_crawler/base.rb', line 59 def get_html_from(url) html = "" @b ||= crawler get_html = lambda do if url[/http/] Nokogiri::HTML.parse @b.goto(url).html else Nokogiri::HTML.parse File.read(url) end end html = get_html.call end |
#get_link_titles ⇒ Object
15 16 17 |
# File 'lib/ro_crawler/base.rb', line 15 def get_link_titles @link_titles = get_tags_attrs_from(@url, @auchor_selector, 'href', 'text') end |
#get_tags_attrs_from(url, tags_selector, *attr_names) ⇒ Object
119 120 121 122 |
# File 'lib/ro_crawler/base.rb', line 119 def get_tags_attrs_from(url, tags_selector, *attr_names) tags = get_tags_from(url, tags_selector) get_attrs_in(tags, *attr_names) end |
#get_tags_from(url, selector) ⇒ Object
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
# File 'lib/ro_crawler/base.rb', line 74 def get_tags_from(url, selector) p "get tags: #{selector} from url:#{url}" html = get_html_from(url) result = html.css(selector) raise "#{selector} can't find result" if result.nil? if result.respond_to?(:inner_html) tags = result elsif result.is_a?(Nokogiri::XML::NodeSet) tags = [] result.each do |tag| tags << tag end end end |
#handle(results) ⇒ Object
128 129 130 131 132 133 134 |
# File 'lib/ro_crawler/base.rb', line 128 def handle(results) if results.is_a?(Array) results.each do |result| yield result end end end |
#handle_accident_error {|@b| ... } ⇒ Object
Handle accidental errors, e.g. those caused by timeouts
141 142 143 |
# File 'lib/ro_crawler/base.rb', line 141 def handle_accident_error yield @b if @b && block_given? end |
#handler(&blk) ⇒ Object
37 38 39 |
# File 'lib/ro_crawler/base.rb', line 37 def handler(&blk) handle @link_title_contents, &blk end |
#local? ⇒ Boolean
45 46 47 |
# File 'lib/ro_crawler/base.rb', line 45 def local? `google-chrome --version`[/Google Chrome/] ? true : false end |
#open_browser(driver) ⇒ Object
41 42 43 |
# File 'lib/ro_crawler/base.rb', line 41 def open_browser(driver) ::Watir::Browser.new driver end |
#proxy?(port) ⇒ Boolean
145 146 147 |
# File 'lib/ro_crawler/base.rb', line 145 def proxy?(port) !`lsof -i:#{port}`.empty? end |
#spider(url, anchor_selector, intr_selector) ⇒ Object
5 6 7 8 9 10 11 12 13 |
# File 'lib/ro_crawler/base.rb', line 5 def spider(url, anchor_selector, intr_selector) @url = url @home_url = get_home_url(url) @auchor_selector = anchor_selector @intr_selector = intr_selector get_link_titles get_contents browser_close end |