Class: SNCrawler

Inherits:
Object
  • Object
show all
Defined in:
lib/sn_crawler.rb

Overview

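A crawler class that collects RSS/XML feed links from a source page, downloads the news items listed in those feeds and, optionally, stores them in a database through ActiveRecord.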

Examples:

require 'sn_crawler'

url = "http://vnexpress.net/rss"
c = SNCrawler.new(url,"VNE","/channel/item:[title,description,pubDate,link]",nil)
c.get_links(true)
# => ["vnexpress.net/rss/tin-moi-nhat.rss",...]

c.get_news(true)
# => City in Jordan welcomes ISIS
#    Image: []
#    Now inserting City in Jordan welcomes ISIS

Instance Method Summary collapse

Constructor Details

#initialize(source = "", name = "", structure = "", db_conf = {}, limit = 100) ⇒ SNCrawler

Initialize parameters

source => The source URL

name => A short name identifying the source (for example "VNE")

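structure => Where the news items live inside a feed and which tags hold their fields

structure format: /path/to/channel/item_name:[title_tag,description_tag,pubdate_tag,link_tag]

title_tag => name of the tag holding the title of the page

description_tag => name of the tag holding the description of the page

pubdate_tag => name of the tag holding the publication time of the page

link_tag => name of the tag holding the link to the page

For example: /channel/item:[title,description,pubDate,link]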

db_conf => ActiveRecord connection settings (pass nil to run without a database)

limit => The number of news items that you want



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/sn_crawler.rb', line 61

# Build a crawler for one news source.
#
# @param source [String] URL of the page that lists the RSS/XML feeds
# @param name [String] label for the source; stored in @source_name
# @param structure [String] item layout, e.g.
#   "/channel/item:[title,description,pubDate,link]"
# @param db_conf [Hash, String, nil] settings handed to
#   ActiveRecord::Base.establish_connection; nil or empty settings mean
#   "run without a database"
# @param limit [Integer] number of news items wanted; stored in @limit
def initialize(source = "", name = "", structure = "", db_conf = {}, limit = 100)
  ## The rss source's url
  @source = source
  @source_name = name
  @structure = structure
  ## Crawled urls will be stored here
  @url = []
  @agent = Mechanize.new
  ## Establish a connection to the DB server only when settings were given.
  ## The default ({}) used to slip past the nil check and attempt a
  ## connection with an empty configuration.
  no_settings = db_conf.nil? || (db_conf.respond_to?(:empty?) && db_conf.empty?)
  @use_db = !no_settings
  ActiveRecord::Base.establish_connection(db_conf) if @use_db
  @limit = limit
end

Instance Method Details

#clear_url ⇒ Object

Clear urls



154
155
156
# File 'lib/sn_crawler.rb', line 154

# Forget every URL collected so far by replacing the list with a new,
# empty array.
#
# @return [Array] the new empty list
def clear_url
  @url = Array.new
end

#create_table(options = "", verbose = true) ⇒ Object

Create table for our gem

Examples:

c.create_table("engine=MROONGA",true)


86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/sn_crawler.rb', line 86

# Run an ad-hoc migration that creates the sn_news table and then adds a
# unique index on its title column.
#
# @param options [String] passed through as the table's :options
#   (e.g. "engine=MROONGA")
# @param verbose [Boolean] print the error message when a step raises
# @return [Boolean] true when both steps ran without raising, false otherwise
def create_table(options = "", verbose = true)
  ActiveRecord::Migration.class_eval do
    create_table(:sn_news, options: options) do |table|
      table.string   :lang
      table.string   :title
      table.text     :description
      table.text     :content
      table.string   :link
      table.string   :images
      table.datetime :pubtime

      table.timestamps
    end

    add_index(:sn_news, [:title], unique: true, name: "unique_title_on_news")
  end
  true
rescue => error
  # Any failure (for instance the table already existing) is reported, not raised.
  puts "Error(s): #{error}" if verbose
  false
end

#finalize ⇒ Object

Close the ActiveRecord database connection



262
263
264
# File 'lib/sn_crawler.rb', line 262

# Close the ActiveRecord connection, if one is open.
#
# Safe to call on a crawler that never connected to a database: when
# ActiveRecord reports no live connection there is nothing to close and the
# method returns nil instead of raising.
#
# @return [Object, nil] whatever the connection's close returns, or nil
def finalize
  return unless ActiveRecord::Base.connected?

  ActiveRecord::Base.connection.close
end

#get_links(verbose = false) ⇒ Object

Get feed URLs from the source URL



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/sn_crawler.rb', line 116

# Collect feed URLs from the source page into @url.
#
# Fetches @source with the agent and looks at every link whose href matches
# /\.(rss|xml)/. Each href is normalised as follows:
# * hrefs that start with a scheme ("http://", "https://", ...) are kept as is;
# * protocol-relative hrefs ("//host/path") lose the leading slashes;
# * everything else is appended to the host of @source, with a "/" inserted
#   when the href does not start with one (i.e. it is treated as
#   root-relative, not resolved against the source page's directory).
# A URL is stored only if it is at most 50 characters long and not already
# in the list.
#
# @param verbose [Boolean] print the collected list when true
# @return [Array<String>] a copy of the collected URL list
def get_links(verbose = false)
  page = @agent.get(@source)
  source_host = nil

  page.links_with(:href => /\.(rss|xml)/).each do |link|
    href = link.href
    src =
      if href =~ %r{\A[a-z][a-z0-9+.-]*://}i
        # Test for a real scheme prefix; a plain "contains http" check also
        # matched relative paths such as "/feeds/http-news.rss".
        href
      elsif href.start_with?("//")
        href[2..-1]
      else
        source_host ||= URI.parse(@source).host
        source_host + (href.start_with?("/") ? href : "/" + href)
      end

    @url << src unless src.length > 50 || @url.include?(src)
  end

  puts @url.to_s if verbose
  @url.dup
end

#get_news(verbose = false) ⇒ Object

Get news from urls

Note: run this method only after get_links has been run, and (when a database is used) after create_table.



164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# File 'lib/sn_crawler.rb', line 164

# Download every feed in @url and process the news items found in them.
#
# @structure is split on "/" into a channel path and an item description of
# the form "item:[title,description,pubDate,link]". With a leading slash the
# first segment is empty, so "/channel/item:[...]" yields the path
# ".//channel" (presumably evaluated as an XPath by the XML library).
#
# Processing stops as soon as @limit items have been handled; the check is
# made before every item, not only after a whole feed. A failing feed or a
# failing item is reported (when verbose) and skipped, so one broken entry
# no longer discards the rest of its feed.
#
# @param verbose [Boolean] print progress and error messages
# @return [Integer] number of items processed
# @raise [ArgumentError] when @structure does not follow the expected format
def get_news(verbose = false)
  *channel_parts, item_structure = @structure.to_s.split("/")
  item_tag, attribute_list = item_structure.to_s.split(":")
  tags = attribute_list.to_s.delete("[]").split(",").map(&:strip)
  if item_tag.to_s.empty? || tags.length < 4
    raise ArgumentError,
          "structure must look like /path/to/channel/item:[title,description,pubDate,link]"
  end
  channel_path = "." + channel_parts.map { |part| "/" + part }.join

  count = 0
  @url.each do |u|
    break if count >= @limit

    begin
      # The download sits inside the begin so a network failure only skips
      # this feed.
      document = XML::Parser.string(Curl.get(u.to_s).body_str).parse
      ## For each channel processing the data
      document.root.find(channel_path).each do |channel|
        lang_field = channel.find_first('language')
        lang = lang_field.nil? ? "en_US" : lang_field.content.to_s
        items = channel.find(item_tag)
        puts item_tag
        items.each do |item|
          break if count >= @limit

          begin
            store_news_item(item, lang, tags, verbose)
          rescue => e
            puts "Error: #{e}" if verbose
            next
          end
          count += 1
          puts "Got #{count} news" if verbose
        end
        break if count >= @limit
      end
    rescue => e
      puts "Error: #{e}" if verbose
    end
  end

  puts "We got #{count} news today." if verbose
  count
end

# Extract one feed item, download the page it links to and, when a database
# is in use, insert it through SNItem. Extraction errors are left to the
# caller; insertion errors are reported here and swallowed.
def store_news_item(item, lang, tags, verbose)
  title_tag, des_tag, pubdate_tag, link_tag = tags

  title = item.find_first(title_tag).content.delete("'")
  puts title
  description = item.find_first(des_tag).content.delete("'")

  img_url = []
  Nokogiri::HTML(description).search('img').each do |img_tag|
    src = img_tag.attributes['src']
    # An <img> without a src attribute is ignored instead of raising.
    img_url << src.value if src
  end
  puts "Image: #{img_url.to_s}"

  link = item.find_first(link_tag).content.delete(" ")
  pub_date = Time.strptime(item.find_first(pubdate_tag).content, "%A, %d %B %Y %H:%M:%S %Z")
  i_source = Curl.get(link).body_str
  content = Readability::Document.new(i_source).content
  content = content.delete("'").force_encoding("UTF-8")
  puts "Now inserting #{title}"

  unless @use_db
    puts "You do not use DB" if verbose
    return
  end

  time_now = Time.now.strftime("%Y-%m-%d %H:%M:%S")
  begin
    SNItem.create(
      :lang => lang,
      :title => title,
      :description => description,
      :link => link,
      :pubtime => pub_date.strftime("%Y-%m-%d %H:%M:%S"),
      :content => content,
      :images => img_url.to_s,
      :created_at => time_now,
      :updated_at => time_now
    )
  rescue => e
    puts "Error(s): #{e}" if verbose
  end
end
private :store_news_item

#set_url(url = []) ⇒ Object

Set Urls



146
147
148
# File 'lib/sn_crawler.rb', line 146

# Replace the list of feed URLs to crawl.
#
# The list is copied, so later appends to @url (get_links pushes onto it)
# never modify the array the caller passed in. nil becomes an empty list and
# a single URL string becomes a one-element list.
#
# @param url [Array<String>, String, nil] the feed URLs
# @return [Array<String>] the stored list
def set_url(url = [])
  @url = Array(url).dup
end