Class: CommonCrawlIndex::Client
- Inherits:
-
Object
- Object
- CommonCrawlIndex::Client
- Defined in:
- lib/common-crawl-index.rb
Constant Summary collapse
- HEADER_OFFSET =
8
- @@settings =
{ :access_key_id => nil, :secret_access_key => nil, :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792" }
Class Method Summary collapse
- .config(settings = {}) ⇒ Object
- .denormalize_url(normalized_url, has_scheme = true) ⇒ Object
- .normalize_url(url, append_scheme = true) ⇒ Object
Instance Method Summary collapse
- #find_by_prefix(url, exact_match = false, &proc_block) ⇒ Object
-
#initialize(access_key_id = nil, secret_access_key = nil, cc_index_path = nil) ⇒ Client
constructor
A new instance of Client.
Constructor Details
#initialize(access_key_id = nil, secret_access_key = nil, cc_index_path = nil) ⇒ Client
Returns a new instance of Client.
20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/common-crawl-index.rb', line 20
#
# Builds a client bound to a single index object in S3 and reads the
# index header.
#
# Every argument falls back to the entry of the same name in the
# class-level @@settings hash when it is nil.
#
# @param access_key_id [String, nil] AWS access key id
# @param secret_access_key [String, nil] AWS secret access key
# @param cc_index_path [String, nil] index location; split on
#   File::SEPARATOR, the first segment must match /^s3/, the third becomes
#   the bucket name and the remaining segments form the object name
# @raise [ArgumentError] when the first path segment does not match /^s3/
def initialize(access_key_id = nil, secret_access_key = nil, cc_index_path = nil)
  @s3 = AWS::S3.new(
    :access_key_id => access_key_id || @@settings[:access_key_id],
    :secret_access_key => secret_access_key || @@settings[:secret_access_key]
  )
  @cc_index_path = cc_index_path || @@settings[:cc_index_path]

  # "s3://bucket/a/b" splits into ["s3:", "", "bucket", "a", "b"]; the empty
  # second element produced by the "//" is discarded.
  proto, _, @bucket_name, *key_parts = @cc_index_path.chomp.split(File::SEPARATOR)
  unless proto =~ /^s3/
    raise ArgumentError, "#{__FILE__}: Unknown S3 Protocol #{proto}"
  end
  @object_name = File.join(key_parts)

  # The first eight bytes are unpacked as two native-endian 32-bit unsigned
  # integers: the block size followed by the index block count.
  # NOTE(review): "L" is native-endian; confirm the index file's byte order.
  header = read((0..7))
  @block_size, @index_block_count = header.unpack("LL")
end
Class Method Details
.config(settings = {}) ⇒ Object
14 15 16 |
# File 'lib/common-crawl-index.rb', line 14
#
# Overlays +settings+ on the class-wide defaults held in @@settings.
# Keys present in +settings+ replace the stored value; all other keys are
# kept as they are.
#
# @param settings [Hash] overrides to merge into the stored settings
# @return [Hash] the merged hash that is now stored in @@settings
def self.config(settings = {})
  merged = @@settings.merge(settings)
  @@settings = merged
end
.denormalize_url(normalized_url, has_scheme = true) ⇒ Object
53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/common-crawl-index.rb', line 53
#
# Turns an index key back into a regular URL: the optional ":scheme" suffix
# becomes the "scheme://" prefix and the dot-separated host labels are put
# back in their usual order.
#
#   denormalize_url("com.example.www/a:http")   #=> "http://www.example.com/a"
#   denormalize_url("com.example.www/a", false) #=> "http://www.example.com/a"
#
# @param normalized_url [String] key with reversed host labels
# @param has_scheme [Boolean] whether the key ends in ":scheme"; when true
#   the text after the LAST ":" is taken as the scheme, so a key that has a
#   port but no scheme suffix must be passed with has_scheme = false
# @return [String] the reconstructed URL
# @raise [ArgumentError] when the reconstructed URL has no host
def self.denormalize_url(normalized_url, has_scheme = true)
  scheme = "http"
  remainder = normalized_url

  if has_scheme
    colon_index = normalized_url.rindex(":")
    # Without a ":" there is no suffix to strip; keep the default scheme
    # instead of failing on nil arithmetic.
    if colon_index
      scheme = normalized_url[(colon_index + 1)..-1]
      remainder = normalized_url[0...colon_index]
    end
  end

  uri = Addressable::URI.parse("#{scheme}://#{remainder}")
  host = uri.host
  if host.nil?
    raise ArgumentError, "normalized URL has no host component: #{normalized_url.inspect}"
  end

  uri.host = host.split(".").reverse.join(".")
  uri.to_s
end
.normalize_url(url, append_scheme = true) ⇒ Object
43 44 45 46 47 48 49 50 51 |
# File 'lib/common-crawl-index.rb', line 43
#
# Converts a URL into index key form: the host labels are reversed, the
# leading "scheme://" is removed and, optionally, ":scheme" is appended.
#
#   normalize_url("http://www.example.com/a")        #=> "com.example.www/a:http"
#   normalize_url("http://www.example.com/a", false) #=> "com.example.www/a"
#
# @param url [String] absolute URL including scheme and host
# @param append_scheme [Boolean] append ":<scheme>" to the result
# @return [String] the normalized key
# @raise [ArgumentError] when the URL has no host component
def self.normalize_url(url, append_scheme = true)
  uri = Addressable::URI.parse(url)
  host = uri && uri.host
  # A URL such as "example.com/a" parses with a nil host; report that
  # clearly instead of failing with NoMethodError on nil.
  raise ArgumentError, "URL has no host component: #{url.inspect}" if host.nil?

  uri.host = host.split(".").reverse.join(".")
  reversed = uri.to_s
  # Keep everything after the first "//", i.e. drop the "scheme://" prefix.
  key = reversed[(reversed.index("//") + 2)..-1]
  key += ":" + uri.scheme if append_scheme
  key
end
Instance Method Details
#find_by_prefix(url, exact_match = false, &proc_block) ⇒ Object
35 36 37 38 39 40 41 |
# File 'lib/common-crawl-index.rb', line 35
#
# Looks up index entries for +url+.
#
# Starting at block 0, repeatedly asks get_next_block_id for the next block
# until the returned id is no longer below @index_block_count, then hands
# that id to get_matching_urls_from_data_blocks along with the caller's
# block.
#
# @param url [String] passed unchanged to both helpers
#   (presumably expected in normalized form; verify against the helpers)
# @param exact_match [Boolean] forwarded to get_matching_urls_from_data_blocks
# @param proc_block [Proc] optional block, forwarded as-is
# @return [Object] whatever get_matching_urls_from_data_blocks returns
def find_by_prefix(url, exact_match = false, &proc_block)
  block_id = 0
  block_id = get_next_block_id(url, block_id) until block_id >= @index_block_count
  get_matching_urls_from_data_blocks(block_id, url, exact_match, &proc_block)
end