Class: RoCrawler::Base

Inherits:
Object
  • Object
show all
Includes:
RoSupport::Debug
Defined in:
lib/ro_crawler/base.rb

Instance Method Summary collapse

Instance Method Details

#browser_close ⇒ Object



124
125
126
# File 'lib/ro_crawler/base.rb', line 124

# Closes the browser held in @b.
#
# Does nothing when @b does not respond to #close (for instance when @b is
# nil because no browser has been opened).
#
# @return [Object, nil] the result of @b.close, or nil when nothing was closed
def browser_close
  return unless @b.respond_to?(:close)

  @b.close
end

#crawler(opt = {}) ⇒ Object



49
50
51
52
53
54
55
56
57
# File 'lib/ro_crawler/base.rb', line 49

# Opens a browser and stores it in @b, but only when #local? is true.
#
# @param opt [Hash] options; opt[:driver] is handed to #open_browser and
#   :phantomjs is used when it is missing or falsy
# @return [Object, nil] the value assigned to @b, or nil (leaving @b
#   untouched) when #local? is false
def crawler(opt={})
  return unless local?

  driver = opt[:driver] || :phantomjs
  @b = open_browser(driver)
end

#get_attrs_in(tags, *attr_names) ⇒ Object



90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# File 'lib/ro_crawler/base.rb', line 90

# Collects attribute values from every tag in +tags+.
#
# The names 'text' and 'inner_html' read tag.text and tag.inner_html; any
# other name is read with tag.attribute(name).value. With exactly one name
# each tag contributes a single String, with several names each tag
# contributes an Array of values in the order given, and with no names each
# tag contributes an empty Array.
#
# NOTE(review): the collected values are interpolated into a string that is
# passed to eval. Array#to_args is not defined in this file, so whether page
# content is escaped safely cannot be confirmed from here -- verify before
# crawling untrusted pages.
#
# @param tags [#each] the tags to read from
# @param attr_names [Array<String>] attribute names, 'text' or 'inner_html'
# @return [Object] whatever evaluating "return <result.to_args>" produces
def get_attrs_in(tags, *attr_names)
  read_attr = lambda do |tag, name|
    if name == 'text'
      tag.text
    elsif name == 'inner_html'
      tag.inner_html
    else
      tag.attribute(name).value
    end
  end

  single_name = attr_names.length == 1
  result = []
  tags.each do |tag|
    values = attr_names.map { |name| read_attr.call(tag, name) }
    # A lone attribute is flattened to a String instead of a one-item Array.
    result << (single_name ? values.join : values)
  end

  eval %Q(return #{result.to_args})
end

#get_contents ⇒ Object



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/ro_crawler/base.rb', line 19

# Builds @link_title_contents: a copy of every @link_titles entry with the
# introduction text of the linked page appended.
#
# The first element of each entry is used as the link. When it does not
# contain "http" it is prefixed with @home_url. The introduction is fetched
# with #get_tags_attrs_from using @intr_selector and the 'text' attribute.
#
# A missing @link_titles is reported through ro_raise, and an introduction
# that is not a String is reported through raise_log (both helpers are
# defined outside this file).
#
# @return [Array] @link_title_contents
def get_contents
  ro_raise(err "@link_titles is nil", output: ['@url']) if @link_titles.nil?

  # Copy every entry as well as the outer collection: a plain #dup is
  # shallow, so appending below would also grow the entries that
  # @link_titles holds and a second call would append twice.
  @link_title_contents = @link_titles.map(&:dup)
  @link_title_contents.each do |link_content|
    href = link_content[0]
    link = href[/http/] ? href : "#{@home_url}#{href}"

    intr = get_tags_attrs_from(link, @intr_selector, 'text')
    link_content << intr
    unless intr.is_a? String
      raise_log 'intr must be a string', 'ro_crawler_base.log'
    end
  end
end

#get_home_url(url) ⇒ Object



136
137
138
# File 'lib/ro_crawler/base.rb', line 136

# Extracts the site root (scheme plus host, and port when present) of the
# first http or https URL found in +url+.
#
# The match stops at the first "/", "?", "#" or whitespace after the scheme,
# so path segments such as "/files/setup.net.zip" are never mistaken for
# part of the host, and hosts under any top-level domain are accepted.
#
# @param url [String] text containing a URL, e.g. "http://example.com/a/b"
# @return [String, nil] e.g. "http://example.com", or nil when +url+
#   contains no http(s) URL (such as a local file path)
def get_home_url(url)
  url[%r{https?://[^/?#\s]+}]
end

#get_html_from(url) ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/ro_crawler/base.rb', line 59

# Loads +url+ and parses it with Nokogiri::HTML.
#
# A +url+ containing "http" is loaded through the browser in @b; anything
# else is treated as a path and read with File.read. The browser is set up
# through #crawler whenever @b is unset, even for a file path.
#
# NOTE(review): relies on @b.goto(url) returning an object that responds to
# #html -- confirm against the browser library version in use.
#
# @param url [String] an http(s) URL or a local file path
# @return [Object] the result of Nokogiri::HTML.parse
def get_html_from(url)
  @b ||= crawler

  parser = Nokogiri::HTML
  return parser.parse(File.read(url)) unless url[/http/]

  parser.parse(@b.goto(url).html)
end


15
16
17
# File 'lib/ro_crawler/base.rb', line 15

# Fetches the 'href' and 'text' of every tag matched by @auchor_selector on
# @url and stores the result in @link_titles.
#
# @return [Object] the value stored in @link_titles
def get_link_titles
  wanted = ['href', 'text']
  @link_titles = get_tags_attrs_from(@url, @auchor_selector, *wanted)
end

#get_tags_attrs_from(url, tags_selector, *attr_names) ⇒ Object



119
120
121
122
# File 'lib/ro_crawler/base.rb', line 119

# Looks up the tags matching +tags_selector+ at +url+ with #get_tags_from
# and reads the requested attributes from them with #get_attrs_in.
#
# @param url [String] location handed to #get_tags_from
# @param tags_selector [String] selector handed to #get_tags_from
# @param attr_names [Array<String>] attribute names handed to #get_attrs_in
# @return [Object] whatever #get_attrs_in returns
def get_tags_attrs_from(url, tags_selector, *attr_names)
  get_attrs_in(get_tags_from(url, tags_selector), *attr_names)
end

#get_tags_from(url, selector) ⇒ Object



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/ro_crawler/base.rb', line 74

# Returns the tags matching the CSS +selector+ in the document at +url+.
#
# The document is loaded with #get_html_from. A progress line is printed
# with Kernel#p on every call.
#
# @param url [String] location handed to #get_html_from
# @param selector [String] CSS selector handed to the document's #css
# @return [Object] the #css result itself when it responds to #inner_html;
#   otherwise an Array of its nodes when it is a Nokogiri::XML::NodeSet;
#   otherwise nil
# @raise [RuntimeError] when the selector matches nothing
def get_tags_from(url, selector)
  p "get tags: #{selector} from url:#{url}"
  result = get_html_from(url).css(selector)

  # Nokogiri reports "no match" as an empty node set rather than nil, so a
  # nil check alone can never detect a selector that matched nothing.
  nothing_found = result.nil? || (result.respond_to?(:empty?) && result.empty?)
  raise "#{selector} can't find result" if nothing_found

  return result if result.respond_to?(:inner_html)

  result.to_a if result.is_a?(Nokogiri::XML::NodeSet)
end

#handle(results) ⇒ Object



128
129
130
131
132
133
134
# File 'lib/ro_crawler/base.rb', line 128

# Yields every element of +results+ to the given block.
#
# Anything that is not an Array is ignored.
#
# @param results [Array, Object] the collection to walk
# @yield [result] each element of +results+ in order
# @return [Array, nil] +results+ itself, or nil when it is not an Array
def handle(results)
  return unless results.is_a?(Array)

  results.each { |result| yield result }
end

#handle_accident_error {|@b| ... } ⇒ Object

Handles accidental errors, e.g. those caused by a timeout.

Yields:

  • (@b)


141
142
143
# File 'lib/ro_crawler/base.rb', line 141

# Hands the current browser (@b) to the given block.
#
# Nothing happens when no block is given or when @b is unset.
#
# @yield [@b] the current browser
# @return [Object, nil] the block's result, or nil when the block did not run
def handle_accident_error
  return unless block_given?
  return unless @b

  yield @b
end

#handler(&blk) ⇒ Object



37
38
39
# File 'lib/ro_crawler/base.rb', line 37

# Passes @link_title_contents and the given block on to #handle.
#
# @param blk [Proc] block forwarded to #handle
# @return [Object] whatever #handle returns
def handler(&blk)
  handle(@link_title_contents, &blk)
end

#local? ⇒ Boolean

Returns:

  • (Boolean)


45
46
47
# File 'lib/ro_crawler/base.rb', line 45

# Reports whether running `google-chrome --version` prints a line that
# mentions "Google Chrome".
#
# @return [Boolean] true when the command output contains "Google Chrome";
#   false otherwise, including when no google-chrome executable exists
def local?
  `google-chrome --version`.include?('Google Chrome')
rescue Errno::ENOENT
  # Backticks raise instead of returning "" when the executable cannot be
  # found; a predicate should answer false in that case.
  false
end

#open_browser(driver) ⇒ Object



41
42
43
# File 'lib/ro_crawler/base.rb', line 41

# Creates a new Watir browser for the given driver.
#
# @param driver [Object] driver identifier handed to Watir::Browser.new
#   (#crawler passes :phantomjs by default)
# @return [Watir::Browser] the new browser
def open_browser(driver)
  ::Watir::Browser.new(driver)
end

#proxy?(port) ⇒ Boolean

Returns:

  • (Boolean)


145
146
147
# File 'lib/ro_crawler/base.rb', line 145

# Reports whether `lsof -i:<port>` prints anything.
#
# NOTE(review): +port+ is interpolated into the command line as-is; callers
# should only pass trusted values.
#
# @param port [Integer, String] value placed after "-i:" in the lsof call
# @return [Boolean] true when the command produced output, false otherwise
def proxy?(port)
  listing = `lsof -i:#{port}`
  !listing.empty?
end

#spider(url, anchor_selector, intr_selector) ⇒ Object



5
6
7
8
9
10
11
12
13
# File 'lib/ro_crawler/base.rb', line 5

# Runs a full crawl: records the target and selectors, gathers the link
# titles, gathers the contents behind them, then closes the browser.
#
# The browser is closed only when both gathering steps finish without
# raising.
#
# @param url [String] page to crawl; also used to derive @home_url
# @param anchor_selector [String] selector stored in @auchor_selector
# @param intr_selector [String] selector stored in @intr_selector
# @return [Object] whatever #browser_close returns
def spider(url, anchor_selector, intr_selector)
  @url = url
  @home_url = get_home_url(url)
  @auchor_selector, @intr_selector = anchor_selector, intr_selector

  get_link_titles
  get_contents
  browser_close
end