Class: NHKore::BingScraper

Inherits:
SearchScraper
Defined in:
lib/nhkore/search_scraper.rb

Constant Summary

Constants inherited from SearchScraper

SearchScraper::DEFAULT_RESULT_COUNT, SearchScraper::FUTSUU_REGEX, SearchScraper::FUTSUU_SITE, SearchScraper::IGNORE_LINK_REGEX, SearchScraper::YASASHII_REGEX, SearchScraper::YASASHII_SITE

Constants inherited from Scraper

Scraper::DEFAULT_HEADER

Instance Attribute Summary

Attributes inherited from Scraper

#kargs, #max_redirects, #max_retries, #redirect_rule, #str_or_io, #url

Class Method Summary

Instance Method Summary

Methods inherited from SearchScraper

#fetch_valid_link?, #ignore_link?

Methods inherited from Scraper

#fetch_cookie, #html_doc, #join_url, #open, #open_file, #open_url, #read, #reopen, #rss_doc

Constructor Details

#initialize(site, regex: nil, url: nil, **kargs) ⇒ BingScraper

Returns a new instance of BingScraper for the given site, which must be :futsuu or :yasashii.

Raises:

  • (ArgumentError) if site is not :futsuu or :yasashii, or if regex resolves to nil


# File 'lib/nhkore/search_scraper.rb', line 90

def initialize(site,regex: nil,url: nil,**kargs)
  case site
  when :futsuu
    regex = FUTSUU_REGEX if regex.nil?
    site = FUTSUU_SITE
  when :yasashii
    regex = YASASHII_REGEX if regex.nil?
    site = YASASHII_SITE
  else
    raise ArgumentError,"invalid site[#{site}]"
  end

  raise ArgumentError,"empty regex[#{regex}]" if regex.nil?

  @regex = regex
  @site = site
  url = self.class.build_url(site,**kargs) if url.nil?

  # Delete class-specific args (don't pass to Open-URI).
  kargs.delete(:count)

  super(url,**kargs)
end
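
A minimal usage sketch (hypothetical; whether the page is fetched eagerly depends on the parent Scraper's initializer, and :count is forwarded to .build_url before being stripped from the Open-URI args):

scraper = NHKore::BingScraper.new(:yasashii)
scraper.site  #=> SearchScraper::YASASHII_SITE
scraper.regex #=> SearchScraper::YASASHII_REGEX

# Request more results per page; :count is consumed by .build_url.
scraper = NHKore::BingScraper.new(:futsuu, count: 50)

NHKore::BingScraper.new(:other)  # raises ArgumentError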

Instance Attribute Details

#regex ⇒ Object (readonly)

Returns the regex used to match article links for the chosen site.



# File 'lib/nhkore/search_scraper.rb', line 87

def regex
  @regex
end

#site ⇒ Object (readonly)

Returns the NHK site being searched (FUTSUU_SITE or YASASHII_SITE).



# File 'lib/nhkore/search_scraper.rb', line 88

def site
  @site
end

Class Method Details

.build_url(site, count: DEFAULT_RESULT_COUNT, **kargs) ⇒ Object



# File 'lib/nhkore/search_scraper.rb', line 114

def self.build_url(site,count: DEFAULT_RESULT_COUNT,**kargs)
  url = ''.dup

  url << 'https://www.bing.com/search?'
  url << URI.encode_www_form(
    q: "site:#{site}",
    count: count
  )

  return url
end
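
For example, the Bing query is restricted to the given site via q=site:... and capped with count ('example.com' below stands in for one of the site constants such as YASASHII_SITE):

NHKore::BingScraper.build_url('example.com', count: 25)
#=> "https://www.bing.com/search?q=site%3Aexample.com&count=25"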

Instance Method Details

#scrape(slinks, page = NextPage.new()) ⇒ Object



# File 'lib/nhkore/search_scraper.rb', line 126

def scrape(slinks,page=NextPage.new())
  next_page,link_count = scrape_html(slinks,page)

  if link_count <= 0
    scrape_rss(slinks,page,next_page)
  end

  return next_page
end
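
A sketch of a full pagination loop (hypothetical; it assumes slinks is nhkore's SearchLinks-style collection responding to #add_link as used above, and that NextPage#empty? is true once no further page was found):

slinks = NHKore::SearchLinks.new  # assumption: any object with #add_link works
scraper = NHKore::BingScraper.new(:yasashii)
page = NHKore::NextPage.new

loop do
  page = scraper.scrape(slinks, page)
  break if page.empty?  # no "first=" anchor and no new RSS items

  # Re-open the scraper on the next page's URL and continue.
  scraper = NHKore::BingScraper.new(:yasashii, url: page.url)
end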

#scrape_html(slinks, page, next_page = NextPage.new()) ⇒ Object



# File 'lib/nhkore/search_scraper.rb', line 136

def scrape_html(slinks,page,next_page=NextPage.new())
  doc = html_doc
  link_count = 0

  anchors = doc.css('a')

  anchors.each do |anchor|
    href = anchor['href'].to_s
    href = Util.unspace_web_str(href).downcase

    next if ignore_link?(href)

    if (md = href.match(/first=(\d+)/))
      count = md[1].to_i

      if count > page.count && (next_page.count < 0 || count < next_page.count)
        next_page.count = count
        next_page.url = join_url(href)
      end
    elsif href =~ regex && fetch_valid_link?(href)
      slinks.add_link(SearchLink.new(href))
      link_count += 1
    end
  end

  return [next_page,link_count]
end
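
The first=(\d+) branch captures Bing's result-offset parameter from pagination anchors and keeps the smallest offset greater than the current page's. For instance, given a hypothetical pagination href:

href = '/search?q=site%3Aexample.com&first=11'
href.match(/first=(\d+)/)[1].to_i  #=> 11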

#scrape_rss(slinks, page, next_page = NextPage.new()) ⇒ Object



# File 'lib/nhkore/search_scraper.rb', line 164

def scrape_rss(slinks,page,next_page=NextPage.new())
  link_count = 0

  if !@is_file
    uri = URI(@url)

    Util.replace_uri_query!(uri,format: 'rss')
    self.open(uri)

    doc = rss_doc
    rss_links = []

    doc.items.each do |item|
      link = item.link.to_s
      link = Util.unspace_web_str(link).downcase

      rss_links << link

      next if ignore_link?(link)
      next if link !~ regex || !fetch_valid_link?(link)

      slinks.add_link(SearchLink.new(link))
      link_count += 1
    end

    # For RSS, Bing will keep returning the same links over and over
    # if it's the last page or the "first=" query is the wrong count.
    # Therefore, we have to test the previous RSS links (+page.rss_links+).
    if next_page.empty? && doc.items.length >= 1 && page.rss_links != rss_links
      next_page.count = (page.count < 0) ? 0 : page.count
      next_page.count += doc.items.length
      next_page.rss_links = rss_links

      uri = URI(page.url.nil? ? @url : page.url)

      Util.replace_uri_query!(uri,first: next_page.count)

      next_page.url = uri
    end
  end

  return [next_page,link_count]
end
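
When the HTML pass yields no article links, #scrape falls back to this RSS pass: it re-requests the same query with format=rss and advances first= by the number of items seen, stopping once Bing repeats the previous page's links. A rough stdlib approximation of what Util.replace_uri_query! does here (illustrative, not nhkore's implementation):

require 'uri'

uri = URI('https://www.bing.com/search?q=site%3Aexample.com')
query = URI.decode_www_form(uri.query).to_h
query['format'] = 'rss'  # or 'first' => next_page.count on later passes
uri.query = URI.encode_www_form(query)
uri.to_s  #=> "https://www.bing.com/search?q=site%3Aexample.com&format=rss"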