Class: SNCrawler

Inherits:

Object

Object
SNCrawler

Defined in: lib/sn_crawler.rb

Overview

A crawler class

Examples:

require 'sn_crawler'

url = "http://vnexpress.net/rss"
c = SNCrawler.new(url,"VNE","/channel/item:[title,description,pubDate,link]",nil)
c.get_links(true)
# => ["vnexpress.net/rss/tin-moi-nhat.rss",...]

c.get_news(true)
# => City in Jordan welcomes ISIS
#    Image: []
#    Now inserting City in Jordan welcomes ISIS

Instance Method Summary collapse

#clear_url ⇒ Object

Clear urls.
#create_table(options = "", verbose = true) ⇒ Object

Create table for our gem.
#finalize ⇒ Object
#get_links(verbose = false) ⇒ Object

Get urls from a source url.
#get_news(verbose = false) ⇒ Object

Get news from urls.
#initialize(source = "", name = "", structure = "", db_conf = {}, limit = 100) ⇒ SNCrawler constructor

Initialize parameters.
#set_url(url = []) ⇒ Object

Set Urls.

Constructor Details

#initialize(source = "", name = "", structure = "", db_conf = {}, limit = 100) ⇒ `SNCrawler`

Initialize parameters

source => The source URL

name => A short name identifying the source (for example "VNE")

structure =>

structure format: /path/to/channel/item_name:[title_tag,description_tag,pubdate_tag,link_tag] (four item_attributes_name entries, in this order)

item_attributes_name => title of the page

item_attributes_name => description of the page

item_attributes_name => publication time of the page

item_attributes_name => link to the page

For example: /channel/item:[title,description,pubDate,link]

db_conf => activerecord settings

limit => The number of news items that you want (default: 100)

# File 'lib/sn_crawler.rb', line 61

# Build a crawler for one RSS source.
#
# @param source [String] URL of the page that lists the RSS/XML feeds
# @param name [String] name of the source (kept in @source_name)
# @param structure [String] feed layout, for example
#   "/channel/item:[title,description,pubDate,link]"
# @param db_conf [Hash, String, nil] ActiveRecord connection settings; pass
#   nil (or leave it empty) to run without a database
# @param limit [Integer] number of news items wanted
def initialize(source = "", name = "", structure = "", db_conf = {}, limit = 100)
  ## The rss source's url
  @source = source
  @source_name = name
  @structure = structure
  ## Crawled urls will be stored here
  @url = []
  @agent = Mechanize.new
  @limit = limit
  ## Establish a connection to DB server, but only when settings were given.
  ## The default value ({}) names no adapter, so it is treated like nil
  ## instead of being handed to ActiveRecord.
  no_settings = db_conf.nil? || (db_conf.respond_to?(:empty?) && db_conf.empty?)
  @use_db = !no_settings
  ActiveRecord::Base.establish_connection(db_conf) if @use_db
end

Instance Method Details

#clear_url ⇒ `Object`

Clear urls



154
155
156

# File 'lib/sn_crawler.rb', line 154

# Forget every feed URL collected so far.
#
# @return [Array] the fresh, empty URL list now stored in @url
def clear_url
  @url = Array.new
end

#create_table(options = "", verbose = true) ⇒ `Object`

Create table for our gem

Examples:

c.create_table("engine=MROONGA",true)

# File 'lib/sn_crawler.rb', line 86

# Create the sn_news table (plus a unique index on title) through
# ActiveRecord::Migration.
#
# @param options [String] passed to the migration as the table's :options,
#   for example "engine=MROONGA"
# @param verbose [Boolean] print the error message when creation fails
# @return [Boolean] true when the table and index were created, false when
#   any error was raised along the way
def create_table(options = "", verbose = true)
  ActiveRecord::Migration.class_eval do
    # Inside class_eval this resolves to the migration's own create_table.
    create_table(:sn_news, :options => options) do |table|
      table.string :lang
      table.string :title
      table.text :description
      table.text :content
      table.string :link
      table.string :images
      table.datetime :pubtime

      table.timestamps
    end

    add_index(:sn_news, [:title], :unique => true, :name => "unique_title_on_news")
  end
  true
rescue => error
  puts "Error(s): #{error.to_s}" if verbose
  false
end

#finalize ⇒ `Object`



262
263
264

# File 'lib/sn_crawler.rb', line 262

# Close the ActiveRecord connection.
#
# Does nothing when ActiveRecord reports no open connection (for example
# when the crawler was created with db_conf = nil), instead of raising.
#
# @return [Object, nil] whatever the connection's close returns, or nil when
#   there was nothing to close
def finalize
  ActiveRecord::Base.connection.close if ActiveRecord::Base.connected?
end

#get_links(verbose = false) ⇒ `Object`

Get urls from a source url

# File 'lib/sn_crawler.rb', line 116

# Collect feed URLs from the source page into @url.
#
# Every link on @source whose href matches /\.(rss|xml)/ is considered.
# An href containing "http" is kept verbatim. Any other href is resolved
# against @source and stored without a scheme, as host + path (+ query),
# e.g. "vnexpress.net/rss/tin-moi-nhat.rss". URLs longer than 50 characters
# and URLs already present in @url are skipped.
#
# @param verbose [Boolean] print the collected URL list when done
# @return [nil]
def get_links(verbose = false)
  page = @agent.get(@source)

  page.links_with(:href => /\.(rss|xml)/).each do |link|
    href = link.href
    src =
      if href.include?("http")
        href
      else
        begin
          # Proper relative resolution: "b.xml" found on ".../feeds/index.html"
          # becomes "host/feeds/b.xml" rather than "hostb.xml".
          resolved = URI.join(@source, href)
          query = resolved.query ? "?#{resolved.query}" : ""
          "#{resolved.host}#{resolved.path}#{query}"
        rescue URI::Error
          # href is not a parsable URI (e.g. it contains spaces); fall back to
          # plain concatenation with the source host.
          URI.parse(@source).host + href
        end
      end

    @url << src if src.length <= 50 && !@url.include?(src)
  end

  puts @url.to_s if verbose
end

#get_news(verbose = false) ⇒ `Object`

Get news from urls

Note: run this method only after get_links (and, when using a DB, after create_table).

# File 'lib/sn_crawler.rb', line 164

# Fetch the news items of every feed in @url and, when a DB is in use,
# store each one through SNItem.create.
#
# @structure (e.g. "/channel/item:[title,description,pubDate,link]") is split
# into the channel path, the item tag and four item field tags, in the order
# title, description, publication date, link. For each item the linked page
# is downloaded and its readable content extracted. Apostrophes are removed
# from title, description and content. A channel without a <language>
# element is recorded as "en_US".
#
# Crawling stops as soon as @limit items have been processed. An item counts
# even if saving it failed.
#
# NOTE(review): the item tag, titles, image lists and "Now inserting" lines
# are printed regardless of +verbose+; verbose only adds errors and counters.
#
# @param verbose [Boolean] also print errors and progress counters
# @return [nil]
def get_news(verbose = false)
  count = 0

  ## "/channel/item:[a,b,c,d]" => channel path ".//channel" and item spec
  *channel_parts, item_structure = @structure.split("/")
  channel_path = (["."] + channel_parts).join("/")
  ## Tag names
  item_tag, tag_list = item_structure.split(":")
  title_tag, des_tag, pubdate_tag, link_tag = tag_list.gsub(/(\[|\])/, '').split(",")

  @url.each do |u|
    break if count >= @limit

    request = Curl.get(u.to_s)
    begin
      root_content = XML::Parser.string(request.body_str).parse
      ## For each channel processing the data
      root_content.root.find(channel_path).each do |channel|
        break if count >= @limit

        lang_field = channel.find_first('language')
        lang = lang_field.nil? ? "en_US" : lang_field.content.to_s
        items = channel.find(item_tag)
        puts item_tag
        items.each do |item|
          # Enforce the limit per item, not per feed.
          break if count >= @limit

          title = item.find_first(title_tag).content.gsub("'", "")
          puts title
          description = item.find_first(des_tag).content.gsub("'", "")
          doc = Nokogiri::HTML(description)
          img_url = []
          doc.search('img').each do |img_tag|
            src_attr = img_tag.attributes['src']
            # An <img> without src must not abort the rest of the feed.
            img_url << src_attr.value unless src_attr.nil?
          end
          puts "Image: #{img_url.to_s}"
          link = item.find_first(link_tag).content.gsub(" ", "")
          pub_date = Time.strptime(item.find_first(pubdate_tag).content, "%A, %d %B %Y %H:%M:%S %Z")
          i_source = Curl.get(link).body_str
          content = Readability::Document.new(i_source).content
          content = content.gsub("'", "").force_encoding("UTF-8")
          puts "Now inserting #{title}"
          time_now = Time.now.strftime("%Y-%m-%d %H:%M:%S")
          begin
            if @use_db
              SNItem.create(
                :lang => lang,
                :title => title,
                :description => description,
                :link => link,
                :pubtime => pub_date.strftime("%Y-%m-%d %H:%M:%S"),
                :content => content,
                :images => img_url.to_s,
                :created_at => time_now,
                :updated_at => time_now
              )
            elsif verbose
              puts "You do not use DB"
            end
          rescue => e
            # A failed insert is reported but does not stop the crawl.
            puts "Error(s): #{e.to_s}" if verbose
          end
          count += 1
          puts "Got #{count} news" if verbose
        end
      end
    rescue => e
      # Any parsing/fetching error inside a feed skips the rest of that feed.
      puts "Error: #{e}" if verbose
    end
  end

  puts "We got #{count} news today." if verbose
end

#set_url(url = []) ⇒ `Object`

Set Urls



146
147
148

# File 'lib/sn_crawler.rb', line 146

# Replace the list of feed URLs to crawl.
#
# An Array is stored as-is (the same object, not a copy). nil becomes an
# empty list and a single URL string is wrapped in a one-element list, so
# @url is always something get_news can iterate.
#
# @param url [Array<String>, String, nil] feed URL(s)
# @return [Array] the list now stored in @url
def set_url(url = [])
  @url = Array(url)
end

Class: SNCrawler

Overview

Instance Method Summary collapse

Constructor Details

#initialize(source = "", name = "", structure = "", db_conf = {}, limit = 100) ⇒ SNCrawler

Instance Method Details

#clear_url ⇒ Object

#create_table(options = "", verbose = true) ⇒ Object

#finalize ⇒ Object

#get_links(verbose = false) ⇒ Object

#get_news(verbose = false) ⇒ Object

#set_url(url = []) ⇒ Object