Class: GScraper::Search::WebQuery
- Includes:
- HasPages
- Defined in:
- lib/gscraper/search/web_query.rb
Constant Summary collapse
- PATH =
Web Search path
'/search'
- RESULTS_PER_PAGE =
Default results per-page
10
- LICENSES =
Web Search licenses
{ '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial|cc_nonderived)' => Licenses::CC_BY_NC_ND, '(cc_publicdomain|cc_attribute|cc_sharealike|cc_nonderived).-(cc_noncommercial)' => Licenses::CC_BY_SA, '(cc_publicdomain|cc_attribute|cc_sharealike|cc_noncommercial).-(cc_nonderived)' => Licenses::CC_BY_NC, '(cc_publicdomain|cc_attribute|cc_sharealike).-(cc_noncommercial|cc_nonderived)' => Licenses::CC_BY }
Constants inherited from Query
Query::DEFAULT_HOST, Query::SUB_DOMAIN
Instance Attribute Summary collapse
-
#filtered ⇒ Object
Filter the search results.
-
#in_format ⇒ Object
Search for results in the format.
-
#inside_domain ⇒ Object
Search for results inside the domain.
-
#not_in_format ⇒ Object
Search for results not in the format.
-
#occurs_within ⇒ Object
Search for results where the query occurs within the area.
-
#outside_domain ⇒ Object
Search for results outside the domain.
-
#region ⇒ Object
Search for results from the region.
-
#results_per_page ⇒ Object
Results per-page.
-
#rights ⇒ Object
Search for results which have the rights.
-
#within_past_day ⇒ Object
Search for results within the past day.
-
#within_past_months ⇒ Object
Search for results within the past months.
-
#within_past_week ⇒ Object
Search for results within the past week.
-
#within_past_year ⇒ Object
Search for results within the past year.
Attributes inherited from Query
#allintext, #allintitle, #allinurl, #define, #exact_phrase, #filetype, #info, #intext, #intitle, #inurl, #language, #link, #numeric_range, #query, #related, #search_host, #site, #with_words, #without_words
Class Method Summary collapse
-
.from_url(url, options = {}) {|query| ... } ⇒ WebQuery
Creates a new Web query from a search URL.
Instance Method Summary collapse
-
#each_sponsored_link {|ad| ... } ⇒ Enumerator
Iterates over the sponsored ads on the first page.
-
#initialize(options = {}) {|query| ... } ⇒ WebQuery
constructor
Creates a new Web query.
-
#page(page_index) ⇒ Page<Result>
Returns a page containing results at the specific page index.
-
#page_url(page_index) ⇒ URI::HTTP
Returns the URL that represents the query at a specific page index.
-
#result_at(index) ⇒ Object
Returns the result at the specified index.
-
#search_url ⇒ URI::HTTP
The URL that represents the query.
-
#sponsored_links ⇒ SponsoredLinks<SponsoredAd>
Returns the sponsored links for the query.
-
#top_result ⇒ Result
Returns the first result on the first page.
-
#top_sponsored_link ⇒ SponsoredAd
Returns the first sponsored ad on the first page of results.
Methods included from HasPages
#[], #each, #each_on_page, #each_on_pages, #each_page, #first_page, #page_cache, #page_index_of, #pages, #result_index_of, #result_offset_of
Methods inherited from Query
#expression, #format_modifier, #format_options
Constructor Details
#initialize(options = {}) {|query| ... } ⇒ WebQuery
Creates a new Web query.
149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
# File 'lib/gscraper/search/web_query.rb', line 149

#
# Creates a new Web query.
#
# @param [Hash] options
#   Options for the query. The keys read directly by this constructor are
#   :results_per_page, :region, :within_past_day, :within_past_week,
#   :within_past_months, :within_past_year, :occurs_within, :rights and
#   :filtered. The whole hash is also passed on to the superclass
#   constructor.
#
# @yield [query]
#   The block is forwarded unchanged to the superclass constructor.
#
def initialize(options={},&block)
  @agent = GScraper.web_agent()

  # falls back to RESULTS_PER_PAGE only when the key is absent
  @results_per_page = options.fetch(:results_per_page,RESULTS_PER_PAGE)

  @region = options[:region]

  # Only one time window is kept. Start with all of them disabled, then
  # enable the first one given, in the order day, week, months, year.
  @within_past_day    = false
  @within_past_week   = false
  @within_past_months = false
  @within_past_year   = false

  if options[:within_past_day]
    @within_past_day = options[:within_past_day]
  elsif options[:within_past_week]
    @within_past_week = options[:within_past_week]
  elsif options[:within_past_months]
    @within_past_months = options[:within_past_months]
  elsif options[:within_past_year]
    @within_past_year = options[:within_past_year]
  end

  @occurs_within = options[:occurs_within]
  @rights = options[:rights]
  @filtered = options[:filtered]

  super(options,&block)
end
Instance Attribute Details
#filtered ⇒ Object
Filter the search results
90 91 92 |
# File 'lib/gscraper/search/web_query.rb', line 90

# Reader for the "filter the search results" setting.
#
# @return [Object] the current value of @filtered
def filtered
  @filtered
end
#in_format ⇒ Object
Search for results in the format
60 61 62 |
# File 'lib/gscraper/search/web_query.rb', line 60

# Reader for the format that results are searched in.
#
# @return [Object] the current value of @in_format
def in_format
  @in_format
end
#inside_domain ⇒ Object
Search for results inside the domain
81 82 83 |
# File 'lib/gscraper/search/web_query.rb', line 81

# Reader for the domain that results are searched inside.
#
# @return [Object] the current value of @inside_domain
def inside_domain
  @inside_domain
end
#not_in_format ⇒ Object
Search for results not in the format
63 64 65 |
# File 'lib/gscraper/search/web_query.rb', line 63

# Reader for the format that results must not be in.
#
# @return [Object] the current value of @not_in_format
def not_in_format
  @not_in_format
end
#occurs_within ⇒ Object
Search for results where the query occurs within the area
78 79 80 |
# File 'lib/gscraper/search/web_query.rb', line 78

# Reader for the area in which the query must occur.
#
# @return [Object] the current value of @occurs_within
def occurs_within
  @occurs_within
end
#outside_domain ⇒ Object
Search for results outside the domain
84 85 86 |
# File 'lib/gscraper/search/web_query.rb', line 84

# Reader for the domain that results are searched outside of.
#
# @return [Object] the current value of @outside_domain
def outside_domain
  @outside_domain
end
#region ⇒ Object
Search for results from the region
57 58 59 |
# File 'lib/gscraper/search/web_query.rb', line 57

# Reader for the region that results are searched from.
#
# @return [Object] the current value of @region
def region
  @region
end
#results_per_page ⇒ Object
Results per-page
54 55 56 |
# File 'lib/gscraper/search/web_query.rb', line 54

# Reader for the number of results per page.
#
# @return [Object] the current value of @results_per_page
def results_per_page
  @results_per_page
end
#rights ⇒ Object
Search for results which have the rights
87 88 89 |
# File 'lib/gscraper/search/web_query.rb', line 87

# Reader for the rights that results must have.
#
# @return [Object] the current value of @rights
def rights
  @rights
end
#within_past_day ⇒ Object
Search for results within the past day
66 67 68 |
# File 'lib/gscraper/search/web_query.rb', line 66

# Reader for the "within the past day" setting.
#
# @return [Object] the current value of @within_past_day
def within_past_day
  @within_past_day
end
#within_past_months ⇒ Object
Search for results within the past months
72 73 74 |
# File 'lib/gscraper/search/web_query.rb', line 72

# Reader for the "within the past months" setting.
#
# @return [Object] the current value of @within_past_months
def within_past_months
  @within_past_months
end
#within_past_week ⇒ Object
Search for results within the past week
69 70 71 |
# File 'lib/gscraper/search/web_query.rb', line 69

# Reader for the "within the past week" setting.
#
# @return [Object] the current value of @within_past_week
def within_past_week
  @within_past_week
end
#within_past_year ⇒ Object
Search for results within the past year
75 76 77 |
# File 'lib/gscraper/search/web_query.rb', line 75

# Reader for the "within the past year" setting.
#
# @return [Object] the current value of @within_past_year
def within_past_year
  @within_past_year
end
Class Method Details
.from_url(url, options = {}) {|query| ... } ⇒ WebQuery
Creates a new Web query from a search URL.
223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 |
# File 'lib/gscraper/search/web_query.rb', line 223

#
# Creates a new Web query from a search URL.
#
# @param [URI::HTTP, String] url
#   The search URL. It is converted with `URI(url.to_s)`.
#
# @param [Hash] options
#   Additional query options. This hash is modified in place: the values
#   decoded from the URL are written into it before it is passed to
#   WebQuery.new.
#
# @yield [query]
#   The block is forwarded to WebQuery.new.
#
# @return [WebQuery]
#   The new query built from the URL parameters.
#
def WebQuery.from_url(url,options={},&block)
  url = URI(url.to_s)
  params = url.query_params

  options[:search_host] = url.host

  options[:results_per_page] = if params['num']
                                 params['num'].to_i
                               else
                                 RESULTS_PER_PAGE
                               end

  options[:query] = params['q']
  options[:exact_phrase] = params['as_epq']
  options[:with_words] = params['as_oq']
  options[:without_words] = params['as_eq']

  options[:language] = params['lr']
  options[:region] = params['cr']

  if params['as_filetype']
    options[:filetype] = params['as_filetype']
  end

  # as_qdr encodes the time window; unknown values set nothing
  case params['as_qdr']
  when 'd'
    options[:within_past_day] = true
  when 'w'
    options[:within_past_week] = true
  when 'm'
    options[:within_past_months] = 1
  when 'm2'
    options[:within_past_months] = 2
  when 'm3'
    options[:within_past_months] = 3
  when 'm6'
    options[:within_past_months] = 6
  when 'y'
    options[:within_past_year] = true
  end

  # if only one bound is present, the missing one becomes 0 (nil.to_i)
  if (params['as_nlo'] || params['as_nhi'])
    options[:numeric_range] = Range.new(
      params['as_nlo'].to_i,
      params['as_nhi'].to_i
    )
  end

  if params['as_occt']
    options[:occurs_within] = params['as_occt'].to_sym
  end

  options[:site] = params['as_sitesearch']

  # nil when as_rights is absent or is not a key of LICENSES
  options[:rights] = LICENSES[params['as_rights']]

  # String key, like every other parameter lookup here and like the
  # 'safe' key written by #search_url
  options[:filtered] = (params['safe'] == 'active')

  if params['as_rq']
    options[:related] = params['as_rq']
  elsif params['as_lq']
    options[:link] = params['as_lq']
  end

  return WebQuery.new(options,&block)
end
Instance Method Details
#each_sponsored_link {|ad| ... } ⇒ Enumerator
Iterates over the sponsored ads on the first page.
499 500 501 |
# File 'lib/gscraper/search/web_query.rb', line 499

#
# Iterates over the sponsored ads on the first page.
#
# @yield [ad]
#   The block is passed to `each` on the value of #sponsored_links.
#
# @return
#   Whatever `sponsored_links.each` returns for the given block.
#
def each_sponsored_link(&block)
  sponsored_links.each(&block)
end
#page(page_index) ⇒ Page<Result>
Returns a page containing results at the specific page index.
387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 |
# File 'lib/gscraper/search/web_query.rb', line 387

#
# Returns a page containing results at the specific page index.
#
# @param [Integer] page_index
#   The index of the page to fetch.
#
# @return [Page<Result>]
#   The page built from the results found in the fetched document.
#
# @raise [Blocked]
#   Raised when the fetched document contains the support link that is
#   checked for below.
#
def page(page_index)
  Page.new do |new_page|
    doc = @agent.get(page_url(page_index))

    # the presence of this support link is treated as "request was blocked"
    if doc.at('//div/a[@href="http://www.google.com/support/bin/answer.py?answer=86640"]')
      raise(Blocked,"Google has temporarily blocked our IP Address",caller)
    end

    results = doc.search('//li[@class="g"]')
    # never read more entries than were asked for per page
    results_length = [@results_per_page, results.length].min

    rank_offset = result_offset_of(page_index)

    results_length.times do |index|
      result = results[index]

      # ranks are 1-based and continue across pages
      rank = rank_offset + (index + 1)
      link = result.at('.//h3/a')
      title = link.inner_text
      # the href is a redirect URL; the target is read from its 'q' parameter
      link_url = URI(link.get_attribute('href')).query_params['q']
      url = URI(link_url)

      summary_text = ''

      if (content = (result.at('.//div[@class="s"]','.//td[@class="j"]//font')))
        content.children.each do |elem|
          # the summary ends at the first <br> element
          break if (!(elem.text?) && elem.name=='br')

          summary_text << elem.inner_text
        end
      end

      cached_url = nil
      similar_url = nil

      if (gl = result.at('.//div[@class="s"]'))
        if (cached_link = gl.at('.//a[1]'))
          cached_url = URI("http://#{search_host}" + cached_link.get_attribute('href'))
        end

        if (similar_link = gl.at('.//a[2]'))
          similar_url = URI("http://#{search_host}" + similar_link.get_attribute('href'))
        end
      end

      new_page << Result.new(rank,title,url,summary_text,cached_url,similar_url)
    end
  end
end
#page_url(page_index) ⇒ URI::HTTP
Returns the URL that represents the query at a specific page index.
369 370 371 372 373 374 375 376 |
# File 'lib/gscraper/search/web_query.rb', line 369

#
# Returns the URL that represents the query at a specific page index.
#
# @param [Integer] page_index
#   The index of the page.
#
# @return [URI::HTTP]
#   The value of #search_url with the 'start' and 'sa' parameters added.
#
def page_url(page_index)
  url = search_url

  # 'start' is the result offset of the requested page
  url.query_params['start'] = result_offset_of(page_index)
  url.query_params['sa'] = 'N'

  return url
end
#result_at(index) ⇒ Object
Returns the result at the specified index.
453 454 455 |
# File 'lib/gscraper/search/web_query.rb', line 453

#
# Returns the result at the specified index.
#
# @param [Integer] index
#   The index of the result. It is translated into a page index with
#   #page_index_of and into an index within that page with #result_index_of.
#
# @return [Object]
#   The element of the fetched page at the computed position.
#
def result_at(index)
  page(page_index_of(index))[result_index_of(index)]
end
#search_url ⇒ URI::HTTP
The URL that represents the query.
294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 |
# File 'lib/gscraper/search/web_query.rb', line 294

#
# The URL that represents the query.
#
# @return [URI::HTTP]
#   A URL on #search_host with path PATH, carrying one query parameter for
#   each option that is set on this query.
#
def search_url
  url = URI::HTTP.build(:host => search_host, :path => PATH)

  # writes the parameter only when the value is neither nil nor false
  set_param = lambda { |param,value|
    url.query_params[param.to_s] = value if value
  }

  set_param.call('num',@results_per_page)
  set_param.call('q',expression)
  set_param.call('as_epq',@exact_phrase)
  set_param.call('as_oq',@with_words)
  set_param.call('as_eq',@without_words)

  set_param.call('lr',@language)
  set_param.call('cr',@region)

  set_param.call('as_filetype',@filetype)

  # only one time window is emitted; the first one set wins
  if @within_past_day
    url.query_params['as_qdr'] = 'd'
  elsif @within_past_week
    url.query_params['as_qdr'] = 'w'
  elsif @within_past_months
    # only 1, 2, 3 and 6 months have a code; other values emit nothing
    case @within_past_months
    when 1
      url.query_params['as_qdr'] = 'm'
    when 2
      url.query_params['as_qdr'] = 'm2'
    when 3
      url.query_params['as_qdr'] = 'm3'
    when 6
      url.query_params['as_qdr'] = 'm6'
    end
  elsif @within_past_year
    url.query_params['as_qdr'] = 'y'
  end

  if @numeric_range.kind_of?(Range)
    url.query_params['as_nlo'] = @numeric_range.begin
    url.query_params['as_nhi'] = @numeric_range.end
  end

  # accepts either the Symbol or the String form of the area
  case @occurs_within
  when :title, 'title'
    url.query_params['as_occt'] = 'title'
  when :body, 'body'
    url.query_params['as_occt'] = 'body'
  when :url, 'url'
    url.query_params['as_occt'] = 'url'
  when :links, 'links'
    url.query_params['as_occt'] = 'links'
  end

  set_param.call('as_sitesearch',@site)

  if @rights
    # LICENSES maps parameter value => license; invert it to look up the
    # parameter value for a license (Hash has no #reverse)
    url.query_params['as_rights'] = LICENSES.invert[@rights]
  end

  if @filtered
    url.query_params['safe'] = 'active'
  end

  return url
end
#sponsored_links ⇒ SponsoredLinks<SponsoredAd>
Returns the sponsored links for the query.
463 464 465 466 467 468 469 470 471 472 473 474 475 |
# File 'lib/gscraper/search/web_query.rb', line 463

#
# Returns the sponsored links for the query.
#
# @return [SponsoredLinks<SponsoredAd>]
#   The ads found in the document fetched from #search_url.
#
def sponsored_links
  SponsoredLinks.new do |links|
    doc = @agent.get(search_url)

    # top and side ads
    doc.search('//h3/a[starts-with(@id,"pa")]').each do |link|
      title = link.inner_text
      # the href is appended to the search host to form the ad URL
      url = URI("http://#{search_host}" + link.get_attribute('href'))

      links << SponsoredAd.new(title,url)
    end
  end
end
#top_result ⇒ Result
Returns the first result on the first page.
443 444 445 |
# File 'lib/gscraper/search/web_query.rb', line 443

#
# Returns the first result on the first page.
#
# @return [Result]
#   The first element of #first_page.
#
def top_result
  first_page.first
end
#top_sponsored_link ⇒ SponsoredAd
Returns the first sponsored ad on the first page of results.
483 484 485 |
# File 'lib/gscraper/search/web_query.rb', line 483

#
# Returns the first sponsored ad on the first page of results.
#
# @return [SponsoredAd]
#   The first element of #sponsored_links.
#
def top_sponsored_link
  # the collection method is #sponsored_links; no #top_sponsored_links
  # method is listed for this class or its ancestors
  sponsored_links.first
end