Module: Sitemaps
- Defined in:
- lib/sitemaps.rb,
lib/sitemaps/parser.rb,
lib/sitemaps/fetcher.rb,
lib/sitemaps/version.rb
Overview
Discover, fetch and parse XML sitemaps as defined by the `sitemaps.org` spec.
Defined Under Namespace
Modules: Fetcher, Parser
Classes: Entry, Sitemap, Submap
Constant Summary
collapse
- VERSION =
"0.1.1".freeze
Class Method Summary
collapse
-
.fetch(url, fetch: nil, recurse: true, max_entries: nil, &block) ⇒ Object
-
.fetch_recursive(url, fetch, max_entries, &block) ⇒ Object
-
.fetch_single(url, fetch, max_entries, &block) ⇒ Object
-
.parse(source) ⇒ Object
-
.parse_url(url) ⇒ Object
Class Method Details
.fetch(url, fetch: nil, recurse: true, max_entries: nil, &block) ⇒ Object
22
23
24
25
26
27
|
# File 'lib/sitemaps.rb', line 22
# Fetch and parse the sitemap at +url+.
#
# @param url [String, URI] sitemap location; normalized through parse_url
# @param fetch [#call, nil] callable invoked with the URL to obtain the raw
#   sitemap source; when nil/false, Sitemaps::Fetcher.fetch is used
# @param recurse [Boolean] when truthy, delegate to fetch_recursive;
#   otherwise delegate to fetch_single
# @param max_entries [Integer, nil] entry limit handed to the delegate
# @param block optional filter block, forwarded untouched to the delegate
# @return [Object] whatever the chosen delegate returns
def self.fetch(url, fetch: nil, recurse: true, max_entries: nil, &block)
  fetcher = fetch || ->(target) { Sitemaps::Fetcher.fetch(target) }
  location = parse_url(url)

  if recurse
    fetch_recursive(location, fetcher, max_entries, &block)
  else
    fetch_single(location, fetcher, max_entries, &block)
  end
end
|
.fetch_recursive(url, fetch, max_entries, &block) ⇒ Object
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
# File 'lib/sitemaps.rb', line 36
# Fetch the sitemap at +url+ and every sitemap it references, then merge
# all of them into a single Sitemap.
#
# Pending URLs are kept on a stack (last pushed, first fetched). A URL that
# already has a stored result is skipped. Any StandardError raised while
# handling a URL is reported on $stderr and that URL is ignored.
#
# @param url [String, URI] starting location; normalized through parse_url
# @param fetch [#call] callable handed to fetch_single
# @param max_entries [Integer, nil] remaining entry budget; it is reduced by
#   the number of entries of each fetched sitemap and the crawl stops once
#   it drops to zero or below. nil means no limit.
# @param block optional filter block, forwarded to fetch_single
# @return [Sitemap] a new Sitemap whose sitemaps/entries are the
#   concatenation of those of every fetched sitemap, in fetch order
def self.fetch_recursive(url, fetch, max_entries, &block)
  pending = [parse_url(url)]
  fetched = {}

  loop do
    begin
      current = pending.pop
      # A nil pop means the stack is exhausted (or held a nil location).
      break if current.nil?

      previous = fetched[current]
      next unless previous.nil?

      sitemap = fetch_single(current, fetch, max_entries, &block)
      fetched[current] = sitemap
      pending.push(*sitemap.sitemaps.map(&:loc))

      next if max_entries.nil?

      max_entries -= sitemap.entries.length
      break if max_entries <= 0
    rescue => ex
      # Best effort: report the failure and carry on with the next URL.
      $stderr.puts "ERROR FETCHING: #{current}, #{ex.message}, ignoring..."
    end
  end

  combined = Sitemap.new([], [])
  fetched.each_value do |sitemap|
    combined.sitemaps.concat(sitemap.sitemaps)
    combined.entries.concat(sitemap.entries)
  end
  combined
end
|
.fetch_single(url, fetch, max_entries, &block) ⇒ Object
29
30
31
32
33
34
|
# File 'lib/sitemaps.rb', line 29
# Fetch one sitemap and parse it, without following referenced sitemaps.
#
# @param url [String, URI] sitemap location; normalized through parse_url
# @param fetch [#call] callable invoked with the normalized URL; its return
#   value is passed to the parser as the sitemap source
# @param max_entries [Integer, nil] passed to the parser as +max_entries:+
# @param block optional block, passed to the parser as +filter:+
# @return [Object] the result of Sitemaps::Parser.parse
def self.fetch_single(url, fetch, max_entries, &block)
  raw_source = fetch.call(parse_url(url))
  Sitemaps::Parser.parse(raw_source, max_entries: max_entries, filter: block)
end
|
.parse(source) ⇒ Object
18
19
20
|
# File 'lib/sitemaps.rb', line 18
# Parse sitemap +source+ by delegating to Sitemaps::Parser.parse with no
# extra options.
#
# @param source [Object] sitemap source, handed to the parser unchanged
# @return [Object] whatever Sitemaps::Parser.parse returns
def self.parse(source)
  parser = Sitemaps::Parser
  parser.parse(source)
end
|
.parse_url(url) ⇒ Object
70
71
72
73
74
75
|
# File 'lib/sitemaps.rb', line 70
# Coerce +url+ into a URI.
#
# A value that is already a URI is returned as-is. Otherwise, unless the
# value starts with an http:// or https:// scheme, "http://" is prepended
# before parsing.
#
# @param url [String, URI] location to normalize
# @return [URI] the given URI, or the result of URI.parse
# @raise [URI::InvalidURIError] when URI.parse rejects the string
def self.parse_url(url)
  return url if url.is_a?(URI)

  # \A anchors at the very start of the string (^ would also match after an
  # embedded newline), and /i accepts upper-case schemes such as "HTTP://",
  # which would otherwise get a second "http://" prefix.
  url = "http://#{url}" unless url =~ %r{\Ahttps?://}i
  URI.parse(url)
end
|