Class: Rspider::MysqlUrlStorage
- Inherits:
-
Object
- Object
- Rspider::MysqlUrlStorage
- Defined in:
- lib/rspider/MysqlUrlStorage.rb
Overview
The class MysqlUrlStorage stores URLs in a MySQL database. For better performance, it creates a UrlStorageCache object to cache URLs in memory.
Instance Attribute Summary collapse
-
#cache ⇒ Object
Returns the value of attribute cache.
Instance Method Summary collapse
-
#<<(url) ⇒ Object
we discover a new url and record it.
- #close ⇒ Object
-
#error(url) ⇒ Object
We hit an error for this URL, so we log it.
-
#initialize(hash, source = "default") ⇒ MysqlUrlStorage
constructor
Param hash is a hash that holds the MySQL host, database name, user and password (keys "host", "db", "user" and "pass"). Param source is the name of the crawling task.
-
#md5(string) ⇒ Object
get the MD5 hash of string param “string”.
-
#pop ⇒ Object
Get a URL to crawl.
-
#visited(url) ⇒ Object
We have crawled a URL, so we record it.
-
#visited?(url) ⇒ Boolean
Asks whether the URL has been visited.
Constructor Details
#initialize(hash, source = "default") ⇒ MysqlUrlStorage
Param hash is a hash that holds the MySQL host, database name, user and password (keys "host", "db", "user" and "pass"). Param source is the name of the crawling task.
26 27 28 29 30 31 |
# File 'lib/rspider/MysqlUrlStorage.rb', line 26
# Opens the MySQL connection and creates the in-memory URL cache.
#
# @param hash [Hash] connection settings, read from the string keys
#   "host", "user", "pass" and "db" (passed to Mysql.new in that order)
# @param source [String] name recorded as the source of the stored URLs
# @raise [MysqlException] if Mysql.new handed back nil
def initialize(hash, source = "default")
  @source = source
  credentials = %w[host user pass db].map { |key| hash[key] }
  @my = Mysql.new(*credentials)
  raise MysqlException if @my.nil?

  @cache = UrlStorageCache.new
end
Instance Attribute Details
#cache ⇒ Object
Returns the value of attribute cache.
23 24 25 |
# File 'lib/rspider/MysqlUrlStorage.rb', line 23
# Reader for the in-memory URL cache held in @cache.
#
# @return [Object] the current value of @cache
def cache
  @cache
end
Instance Method Details
#<<(url) ⇒ Object
we discover a new url and record it
55 56 57 58 59 60 61 62 63 64 65 66 |
# File 'lib/rspider/MysqlUrlStorage.rb', line 55
# Records a newly discovered URL: inserts it into the `urls` table and,
# when the insert succeeds, adds it to the in-memory cache.
#
# Nothing is done when the cache reports the URL as already stored.
# The URL, source and key are escaped through the connection before being
# placed in the statement, so a URL containing quotes cannot break or
# alter the INSERT.
#
# @param url [String] the discovered URL
# @return [nil] always nil, whether or not the row was inserted
def <<(url)
  return nil if @cache.urlStored?(url)

  ukey = md5(url) + @source
  # Unseeded CRC, the same form error, visited and visited? use for lookups.
  crc = Zlib.crc32(url)
  score = Rspider::UrlScorer.score(url)
  sql = "INSERT INTO `urls` (`url`,`source`,`added`,`visited`,`ukey`,`score`,`url_crc32`) " \
        "VALUES ('#{@my.escape_string(url)}','#{@my.escape_string(@source)}'," \
        "'#{Time.now.to_i}','0','#{@my.escape_string(ukey)}','#{score}','#{crc}')"
  begin
    @my.query(sql)
    @cache << url
  rescue Mysql::Error, StandardError
    # Best effort: a failed insert is ignored and the URL stays out of the
    # cache (presumably this is how duplicate rows are tolerated - confirm).
    # Exception itself is deliberately not rescued, so Interrupt and
    # SystemExit still propagate.
  end
  nil
end
#close ⇒ Object
103 104 105 |
# File 'lib/rspider/MysqlUrlStorage.rb', line 103
# Closes the MySQL connection held in @my.
#
# @return [Object] whatever the connection's close call returns
def close
  connection = @my
  connection.close
end
#error(url) ⇒ Object
We hit an error for this URL, so we log it.
96 97 98 99 100 101 102 |
# File 'lib/rspider/MysqlUrlStorage.rb', line 96
# Records a failure for a URL: notifies the cache, then lowers the row's
# score by 3 and increments its error counter in the `urls` table.
#
# The row is located by the CRC32 of the URL plus its key (MD5 of the URL
# followed by the source name). Database errors are not rescued here.
#
# @param url [String] the URL that failed
# @return [Object] the result of the UPDATE query
def error(url)
  @cache.error(url)
  row_key = md5(url) + @source
  checksum = Zlib.crc32(url)
  @my.query(
    "UPDATE `urls` SET score=score-3,errors=errors+1 " \
    "WHERE url_crc32=#{checksum} AND `ukey`='#{row_key}' LIMIT 1"
  )
end
#md5(string) ⇒ Object
get the MD5 hash of string param “string”
33 34 35 36 37 |
# File 'lib/rspider/MysqlUrlStorage.rb', line 33
# Computes the MD5 digest of a string.
#
# @param string [String] the text to hash
# @return [String] the digest as a hexadecimal string
def md5(string)
  Digest::MD5.hexdigest(string)
end
#pop ⇒ Object
Get a URL to crawl.
68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/rspider/MysqlUrlStorage.rb', line 68
# Returns the next URL to crawl.
#
# The in-memory cache is asked first. If it has nothing, one random
# unvisited row for this source is read from the `urls` table.
#
# @return [String, nil] a URL, or nil when the cache is empty and the
#   database has no unvisited URL for this source or the query fails
def pop
  # Earlier query variant, kept for reference:
  # sql="SELECT url FROM `urls` WHERE visited='0' AND `source`='"+@source+"' ORDER BY errors asc,score desc,RAND() LIMIT 1"
  url = @cache.pop
  return url unless url.nil?

  sql = "SELECT url FROM `urls` WHERE visited='0' AND `source`='#{@source}' ORDER BY RAND() LIMIT 1"
  begin
    @my.query(sql).each { |row| return row[0] }
  rescue Mysql::Error
    return nil
  end
  # No row matched: return nil explicitly rather than the value of #each,
  # which is the result-set object and not a URL.
  nil
end
#visited(url) ⇒ Object
We have crawled a URL, so we record it.
83 84 85 86 87 88 89 90 91 92 93 94 |
# File 'lib/rspider/MysqlUrlStorage.rb', line 83
# Marks a URL as crawled: notifies the cache, then stores the current
# Unix timestamp in the row's `visited` column.
#
# The row is located by the CRC32 of the URL plus its key (MD5 of the URL
# followed by the source name).
#
# @param url [String] the URL that has been crawled
# @return [Boolean] true when the UPDATE ran, false on Mysql::Error
def visited(url)
  @cache.visited(url)
  row_key = md5(url) + @source
  checksum = Zlib.crc32(url)
  statement = "UPDATE `urls` SET visited='#{Time.now.to_i}' " \
              "WHERE url_crc32='#{checksum}' AND `ukey`='#{row_key}' LIMIT 1"
  begin
    @my.query(statement)
    true
  rescue Mysql::Error
    false
  end
end
#visited?(url) ⇒ Boolean
Asks whether the URL has been visited.
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/rspider/MysqlUrlStorage.rb', line 39
# Tells whether a URL has already been crawled.
#
# The in-memory cache is checked first. Otherwise the `visited` column of
# the matching row is read; the row is located by the CRC32 of the URL
# plus its key (MD5 of the URL followed by the source name), and a value
# greater than zero counts as visited.
#
# @param url [String] the URL to look up
# @return [true, nil] true when visited; nil when not visited, when no
#   row matches, or when the query fails
def visited?(url)
  return true if @cache.visited?(url)

  ukey = md5(url) + @source
  crc = Zlib.crc32(url)
  # The WHERE clause must start with a condition: a stray AND directly
  # after WHERE is a SQL syntax error and makes every lookup fail.
  sql = "SELECT visited FROM `urls` WHERE url_crc32=#{crc} AND `ukey`='#{ukey}' LIMIT 1"
  begin
    @my.query(sql).each { |row| return true if row[0].to_i > 0 }
    nil
  rescue Mysql::Error
    nil
  end
end