Class: NHKore::BingScraper
- Inherits:
-
SearchScraper
- Object
- Scraper
- SearchScraper
- NHKore::BingScraper
- Defined in:
- lib/nhkore/search_scraper.rb
Constant Summary
Constants inherited from SearchScraper
SearchScraper::DEFAULT_RESULT_COUNT, SearchScraper::FUTSUU_REGEX, SearchScraper::FUTSUU_SITE, SearchScraper::IGNORE_LINK_REGEX, SearchScraper::YASASHII_REGEX, SearchScraper::YASASHII_SITE
Constants inherited from Scraper
Instance Attribute Summary collapse
-
#regex ⇒ Object
readonly
Returns the value of attribute regex.
-
#site ⇒ Object
readonly
Returns the value of attribute site.
Attributes inherited from Scraper
#kargs, #max_redirects, #max_retries, #redirect_rule, #str_or_io, #url
Class Method Summary collapse
Instance Method Summary collapse
-
#initialize(site, regex: nil, url: nil, **kargs) ⇒ BingScraper
constructor
A new instance of BingScraper.
- #scrape(slinks, page = NextPage.new()) ⇒ Object
- #scrape_html(slinks, page, next_page = NextPage.new()) ⇒ Object
- #scrape_rss(slinks, page, next_page = NextPage.new()) ⇒ Object
Methods inherited from SearchScraper
#fetch_valid_link?, #ignore_link?
Methods inherited from Scraper
#fetch_cookie, #html_doc, #join_url, #open, #open_file, #open_url, #read, #reopen, #rss_doc
Constructor Details
#initialize(site, regex: nil, url: nil, **kargs) ⇒ BingScraper
Returns a new instance of BingScraper.
90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
# File 'lib/nhkore/search_scraper.rb', line 90

# Builds a Bing search scraper for one of the NHK News Web sites.
#
# @param site [Symbol] +:futsuu+ or +:yasashii+; any other value raises.
# @param regex [Regexp,nil] article-link pattern; defaults per site when nil.
# @param url [String,nil] search URL; built via build_url when nil.
# @param kargs [Hash] extra keyword args; +:count+ is consumed by build_url
#   and removed before forwarding the rest to the parent scraper (Open-URI).
def initialize(site,regex: nil,url: nil,**kargs)
  case site
  when :futsuu
    site = FUTSUU_SITE
    regex = FUTSUU_REGEX if regex.nil?
  when :yasashii
    site = YASASHII_SITE
    regex = YASASHII_REGEX if regex.nil?
  else
    raise ArgumentError,"invalid site[#{site}]"
  end

  raise ArgumentError,"empty regex[#{regex}]" if regex.nil?

  @regex = regex
  @site = site

  url = self.class.build_url(site,**kargs) if url.nil?

  # Delete class-specific args (don't pass to Open-URI).
  kargs.delete(:count)

  super(url,**kargs)
end
Instance Attribute Details
#regex ⇒ Object (readonly)
Returns the value of attribute regex.
87 88 89 |
# File 'lib/nhkore/search_scraper.rb', line 87

# @return [Object] the article-link pattern this scraper matches (read-only).
def regex
  @regex
end
#site ⇒ Object (readonly)
Returns the value of attribute site.
88 89 90 |
# File 'lib/nhkore/search_scraper.rb', line 88

# @return [Object] the site (domain) this scraper searches (read-only).
def site
  @site
end
Class Method Details
.build_url(site, count: DEFAULT_RESULT_COUNT, **kargs) ⇒ Object
114 115 116 117 118 119 120 121 122 123 124 |
# File 'lib/nhkore/search_scraper.rb', line 114

# Constructs the Bing search URL restricted to +site+.
#
# @param site [String] domain to restrict results to (Bing +site:+ query).
# @param count [Integer] number of results per page.
# @param kargs [Hash] absorbed so callers may pass extra keyword args.
# @return [String] the full Bing search URL.
def self.build_url(site,count: DEFAULT_RESULT_COUNT,**kargs)
  query = URI.encode_www_form(
    q: "site:#{site}",
    count: count,
  )

  return "https://www.bing.com/search?#{query}"
end
Instance Method Details
#scrape(slinks, page = NextPage.new()) ⇒ Object
126 127 128 129 130 131 132 133 134 |
# File 'lib/nhkore/search_scraper.rb', line 126

# Scrapes search-result links into +slinks+, trying the HTML page first
# and falling back to the RSS feed when no links were found.
#
# @return [NextPage] the next page to scrape (may be empty).
def scrape(slinks,page=NextPage.new())
  next_page,link_count = scrape_html(slinks,page)

  # HTML yielded nothing; retry via the RSS feed.
  scrape_rss(slinks,page,next_page) if link_count <= 0

  return next_page
end
#scrape_html(slinks, page, next_page = NextPage.new()) ⇒ Object
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
# File 'lib/nhkore/search_scraper.rb', line 136

# Scrapes the Bing HTML results page: collects article links into +slinks+
# and tracks the best pagination ("first=") link in +next_page+.
#
# @return [Array(NextPage,Integer)] the next page and how many links were added.
def scrape_html(slinks,page,next_page=NextPage.new())
  doc = html_doc
  link_count = 0

  doc.css('a').each do |anchor|
    href = Util.unspace_web_str(anchor['href'].to_s).downcase

    next if ignore_link?(href)

    if (match = href.match(/first=(\d+)/))
      # Pagination link: keep the smallest "first=" count beyond the current page.
      first_count = match[1].to_i

      if first_count > page.count && (next_page.count < 0 || first_count < next_page.count)
        next_page.count = first_count
        next_page.url = join_url(href)
      end
    elsif href =~ regex && fetch_valid_link?(href)
      # Article link matching this scraper's site pattern.
      slinks.add_link(SearchLink.new(href))
      link_count += 1
    end
  end

  return [next_page,link_count]
end
#scrape_rss(slinks, page, next_page = NextPage.new()) ⇒ Object
164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
# File 'lib/nhkore/search_scraper.rb', line 164

# Scrapes the Bing RSS feed for the current search URL, adding matching
# article links to +slinks+ and computing the next page's "first=" count.
# Does nothing (beyond returning) when scraping a local file.
#
# @return [Array(NextPage,Integer)] the next page and how many links were added.
def scrape_rss(slinks,page,next_page=NextPage.new())
  link_count = 0

  return [next_page,link_count] if @is_file

  # Re-fetch the current search URL as an RSS feed.
  rss_uri = URI(@url)
  Util.replace_uri_query!(rss_uri,format: 'rss')
  self.open(rss_uri)

  doc = rss_doc
  rss_links = []

  doc.items.each do |item|
    link = Util.unspace_web_str(item.link.to_s).downcase

    # Record every link (even ignored ones) for the duplicate-page check below.
    rss_links << link

    next if ignore_link?(link)
    next if link !~ regex || !fetch_valid_link?(link)

    slinks.add_link(SearchLink.new(link))
    link_count += 1
  end

  # For RSS, Bing will keep returning the same links over and over
  # if it's the last page or the "first=" query is the wrong count.
  # Therefore, we have to test the previous RSS links (+page.rss_links+).
  if next_page.empty? && doc.items.length >= 1 && page.rss_links != rss_links
    next_page.count = (page.count < 0) ? 0 : page.count
    next_page.count += doc.items.length
    next_page.rss_links = rss_links

    page_uri = URI(page.url.nil? ? @url : page.url)
    Util.replace_uri_query!(page_uri,first: next_page.count)
    next_page.url = page_uri
  end

  return [next_page,link_count]
end