Class: NHKore::BingScraper

Inherits:
SearchScraper show all
Defined in:
lib/nhkore/search_scraper.rb

Overview

Author:

  • Jonathan Bradley Whited

Since:

  • 0.2.0

Constant Summary

Constants inherited from SearchScraper

SearchScraper::DEFAULT_RESULT_COUNT, SearchScraper::FUTSUU_REGEX, SearchScraper::FUTSUU_SITE, SearchScraper::IGNORE_LINK_REGEX, SearchScraper::YASASHII_REGEX, SearchScraper::YASASHII_SITE

Constants inherited from Scraper

Scraper::DEFAULT_HEADER

Instance Attribute Summary collapse

Attributes inherited from Scraper

#kargs, #max_redirects, #max_retries, #redirect_rule, #str_or_io, #url

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from SearchScraper

#ignore_link?

Methods inherited from Scraper

#fetch_cookie, #html_doc, #join_url, #open, #open_file, #open_url, #read, #reopen, #rss_doc

Constructor Details

#initialize(site, regex: nil, url: nil, **kargs) ⇒ BingScraper

Returns a new instance of BingScraper.

Raises:

  • (ArgumentError)

Since:

  • 0.2.0



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/nhkore/search_scraper.rb', line 74

# Builds a Bing search scraper for one of the NHK News Web sites.
#
# +site+ must be +:futsuu+ or +:yasashii+; it is mapped to the matching
# site URL and (when +regex+ is nil) the matching default link regex.
# Any remaining +kargs+ (minus the class-specific +:count+) are passed
# through to the parent scraper for Open-URI.
#
# @raise [ArgumentError] if +site+ is unknown or no regex could be resolved
def initialize(site,regex: nil,url: nil,**kargs)
  site,default_regex =
    case site
    when :futsuu   then [FUTSUU_SITE,FUTSUU_REGEX]
    when :yasashii then [YASASHII_SITE,YASASHII_REGEX]
    else raise ArgumentError,"invalid site[#{site}]"
    end

  # Only fill in the default when the caller passed nothing (nil check on
  # purpose, so an explicit falsy value would not be silently replaced).
  regex = default_regex if regex.nil?

  raise ArgumentError,"empty regex[#{regex}]" if regex.nil?

  @regex = regex
  @site = site

  url = self.class.build_url(site,**kargs) if url.nil?

  # :count is consumed by build_url only; Open-URI must not see it.
  kargs.delete(:count)

  super(url,**kargs)
end

Instance Attribute Details

#regexObject (readonly)

Since:

  • 0.2.0



71
72
73
# File 'lib/nhkore/search_scraper.rb', line 71

# Regex used to decide which scraped links count as article links.
def regex
  @regex
end

#siteObject (readonly)

Since:

  • 0.2.0



72
73
74
# File 'lib/nhkore/search_scraper.rb', line 72

# Site URL being searched (resolved from the +:futsuu+/+:yasashii+ symbol).
def site
  @site
end

Class Method Details

.build_url(site, count: DEFAULT_RESULT_COUNT, **kargs) ⇒ Object

Since:

  • 0.2.0



98
99
100
101
102
103
104
105
106
107
108
# File 'lib/nhkore/search_scraper.rb', line 98

# Builds the Bing search URL for +site+, requesting +count+ results.
#
# Extra +kargs+ are accepted (so callers can pass a shared options hash)
# but are not used in the URL.
#
# @return [String] an unfrozen Bing search URL with an encoded query
def self.build_url(site,count: DEFAULT_RESULT_COUNT,**kargs)
  query = URI.encode_www_form(
    q: "site:#{site}",
    count: count
  )

  # Interpolated strings are always unfrozen, matching the original's dup.
  return "https://www.bing.com/search?#{query}"
end

Instance Method Details

#scrape(slinks, page = NextPage.new()) ⇒ Object

Since:

  • 0.2.0



110
111
112
113
114
115
116
117
118
# File 'lib/nhkore/search_scraper.rb', line 110

# Scrapes the current Bing results page into +slinks+.
#
# Tries the HTML results first; if no article links were found there,
# falls back to the RSS feed for the same query.
#
# @return [NextPage] the page to fetch next (may be empty)
def scrape(slinks,page=NextPage.new())
  next_page,link_count = scrape_html(slinks,page)

  scrape_rss(slinks,page,next_page) if link_count <= 0

  return next_page
end

#scrape_html(slinks, page, next_page = NextPage.new()) ⇒ Object

Since:

  • 0.2.0



120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/nhkore/search_scraper.rb', line 120

# Scans every anchor in the HTML results page.
#
# Anchors whose href contains a "first=NNN" query are pagination links:
# the smallest count greater than the current page's becomes +next_page+.
# Anchors matching +regex+ are article links and get added to +slinks+.
#
# @return [Array(NextPage,Integer)] the next page and how many article
#   links were added
def scrape_html(slinks,page,next_page=NextPage.new())
  link_count = 0

  html_doc.css('a').each do |anchor|
    href = Util.unspace_web_str(anchor['href'].to_s).downcase

    next if ignore_link?(href)

    pagination = href.match(/first=(\d+)/)

    if pagination
      count = pagination[1].to_i
      # Keep the closest page strictly after the current one.
      closer = (next_page.count < 0) || (count < next_page.count)

      if count > page.count && closer
        next_page.count = count
        next_page.url = join_url(href)
      end
    elsif href =~ regex
      slinks.add_link(SearchLink.new(href))
      link_count += 1
    end
  end

  return [next_page,link_count]
end

#scrape_rss(slinks, page, next_page = NextPage.new()) ⇒ Object

Since:

  • 0.2.0



149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# File 'lib/nhkore/search_scraper.rb', line 149

# Scrapes the RSS version of the Bing results (HTML fallback).
#
# Re-opens the current URL with +format=rss+, harvests article links
# matching +regex+, and computes the next page by advancing the "first="
# count past the items just seen. Does nothing when scraping a local file.
#
# @return [Array(NextPage,Integer)] the next page and how many article
#   links were added
def scrape_rss(slinks,page,next_page=NextPage.new())
  link_count = 0

  # RSS fallback only makes sense for live URLs, not local files.
  return [next_page,link_count] if @is_file

  rss_uri = URI(@url)

  Util.replace_uri_query!(rss_uri,format: 'rss')
  self.open(rss_uri)

  doc = rss_doc
  seen_links = []

  doc.items.each do |item|
    link = Util.unspace_web_str(item.link.to_s).downcase

    seen_links << link

    if !ignore_link?(link) && link =~ regex
      slinks.add_link(SearchLink.new(link))
      link_count += 1
    end
  end

  # For RSS, Bing will keep returning the same links over and over
  # if it's the last page or the "first=" query is the wrong count.
  # Therefore, we have to test the previous RSS links (+page.rss_links+).
  if next_page.empty? && doc.items.length >= 1 && page.rss_links != seen_links
    base = (page.count < 0) ? 0 : page.count

    next_page.count = base + doc.items.length
    next_page.rss_links = seen_links

    uri = URI(page.url.nil? ? @url : page.url)

    Util.replace_uri_query!(uri,first: next_page.count)

    next_page.url = uri
  end

  return [next_page,link_count]
end