Class: NHKore::SearchScraper

Inherits:
Scraper
  • Object
show all
Defined in:
lib/nhkore/search_scraper.rb

Direct Known Subclasses

BingScraper

Constant Summary collapse

# NOTE(review): presumably the default number of search results to request;
# the usage is not visible here -- confirm against callers.
DEFAULT_RESULT_COUNT =
100
# Domain (without subdomain) plus path prefix embedded in FUTSUU_REGEX.
FUTSUU_SITE =
'nhk.or.jp/news/html/'
# Domain (without subdomain) plus path prefix embedded in YASASHII_REGEX.
YASASHII_SITE =
'nhk.or.jp/news/easy/'
# Case-insensitive. Matches a string that starts with one dot-free label
# (the subdomain), a dot, FUTSUU_SITE, and then at least one character
# followed by '.htm' or '.html'. Not anchored at the end, so a trailing
# query string is allowed.
FUTSUU_REGEX =
/\A[^.]+\.#{Regexp.quote(FUTSUU_SITE)}.+\.html?/i.freeze
# Same shape as FUTSUU_REGEX, but built from YASASHII_SITE.
YASASHII_REGEX =
/\A[^.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i.freeze
# Unanchored, extended-mode pattern of page names that are not articles
# (each alternative is illustrated by an example URL). Used by #ignore_link?.
IGNORE_LINK_REGEX =
%r{
  /about\.html?               # https://www3.nhk.or.jp/news/easy/about.html
  |/movieplayer\.html?        # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
  |/audio\.html?              # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
  |/news/easy/index\.html?    # https://www3.nhk.or.jp/news/easy/index.html
  |/disaster_earthquake\.html # https://www3.nhk.or.jp/news/easy/article/disaster_earthquake.html

  # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
  # https://www3.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10012689671000&title=「鬼滅の刃」の映画が台湾でも始まって大勢の人が見に行く
  |/enqform\.html?
}x.freeze

Constants inherited from Scraper

NHKore::Scraper::DEFAULT_HEADER

Instance Attribute Summary

Attributes inherited from Scraper

#kargs, #max_redirects, #max_retries, #redirect_rule, #str_or_io, #url

Instance Method Summary collapse

Methods inherited from Scraper

#fetch_cookie, #html_doc, #join_url, #open, #open_file, #open_url, #read, #reopen, #rss_doc

Constructor Details

#initialize(url, eat_cookie: true, header: {}, **kargs) ⇒ SearchScraper

Search engines are strict, so by default this constructor triggers use of the default HTTP header fields (header: {}) and fetches/sets the cookie (eat_cookie: true).



47
48
49
# File 'lib/nhkore/search_scraper.rb', line 47

# Builds a scraper for a search-engine results page.
#
# Search engines are strict, so the defaults differ from a plain Scraper:
# +header: {}+ triggers use of the default HTTP header fields and
# +eat_cookie: true+ fetches/sets the cookie. All arguments are handed
# to Scraper#initialize unchanged.
#
# @param url [String] URL to scrape
# @param eat_cookie [Boolean] whether to fetch/set the cookie
# @param header [Hash] HTTP header fields; empty means the defaults
# @param kargs [Hash] remaining keyword args forwarded to the superclass
def initialize(url,eat_cookie: true,header: {},**kargs)
  super(url,eat_cookie: eat_cookie,header: header,**kargs)
end

Instance Method Details

#fetch_valid_link?(link) ⇒ Boolean

Returns:

  • (Boolean)


62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/nhkore/search_scraper.rb', line 62

# Checks whether +link+ is worth scraping by sending an HTTP HEAD request.
#
# This is a best-effort probe: only an unparsable URL or an explicit
# 404 response marks the link as invalid. Any other outcome (another
# status code, a network error, a URL that cannot be probed over HTTP)
# returns +true+ so that the caller can still try scraping the article.
#
# @param link [String,URI] the URL to check
# @return [Boolean] +false+ if +link+ cannot be parsed or the server
#   answers 404; +true+ otherwise
def fetch_valid_link?(link)
  uri = begin
    URI(link)
  rescue StandardError
    return false # Bad URL.
  end

  # Only absolute HTTP(S) URLs can be probed. For anything else (relative
  # path, missing host, other scheme) skip the request instead of opening
  # a connection to a nil host/port and relying on the rescue below.
  return true unless uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?

  begin
    resp = Net::HTTP.start(uri.host,uri.port,use_ssl: uri.is_a?(URI::HTTPS)) do |http|
      http.head(uri.request_uri)
    end

    return resp.code != '404'
  rescue StandardError
    # Ignore; try actually scraping the article anyway.
    return true
  end
end

#ignore_link?(link, cleaned: true) ⇒ Boolean

Returns:

  • (Boolean)


51
52
53
54
55
56
57
58
59
# File 'lib/nhkore/search_scraper.rb', line 51

# Decides whether a search-result link should be skipped.
#
# A link is ignored when it is +nil+, empty, or matches IGNORE_LINK_REGEX.
#
# @param link [String,nil] the link to test
# @param cleaned [Boolean] +true+ if +link+ is used as is; when +false+,
#   it is first passed through Util.unspace_web_str and downcased
# @return [Boolean] +true+ if the link should be ignored
def ignore_link?(link,cleaned: true)
  return true if link.nil?

  url = cleaned ? link : Util.unspace_web_str(link).downcase

  return url.empty? || IGNORE_LINK_REGEX.match?(url)
end