Class: NHKore::Scraper

Inherits:
Object
  • Object
show all
Extended by:
AttrBool::Ext
Defined in:
lib/nhkore/scraper.rb

Direct Known Subclasses

ArticleScraper, DictScraper, SearchScraper

Constant Summary collapse

# Default HTTP request header fields.
#
# #initialize merges the caller's +header+ hash over these (caller values win)
# when +header+ is not nil and the target is not a file.
DEFAULT_HEADER =
{
  # Evaluated once, when this constant is defined, so the same value is reused
  # for every request made through this constant.
  # Presumably UserAgents is a collection of browser user-agent strings --
  # confirm against its definition.
  'user-agent' => UserAgents.sample,
  # NOTE(review): the ';' before 'image/webp' and 'application/signed-exchange'
  # looks like a leftover from stripped 'q='/'v=' parameters; a strict parser
  # would treat what follows ';' as a media-type parameter. Confirm whether ','
  # was intended before changing the value sent on the wire.
  'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;image/webp' \
              ',image/apng,*/*;application/signed-exchange',
  # "Do Not Track" request header field.
  'dnt' => '1',
}.freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, eat_cookie: false, header: nil, is_file: false, max_redirects: 3, max_retries: 3, redirect_rule: :strict, str_or_io: nil, **kargs) ⇒ Scraper

max_redirects defaults to 3 for safety (to guard against infinite-redirect-loop attacks).

All URL options: ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html

Pass in header: {} to have the default HTTP header fields set; with the default header: nil, no HTTP header fields are added.

Parameters:

  • eat_cookie (true, false) (defaults to: false)

    true to set the HTTP header field ‘cookie’, which can be an expensive (time-consuming) operation since it opens the URL again, but necessary for some URLs.

  • redirect_rule (nil, :lenient, :strict) (defaults to: :strict)


50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/nhkore/scraper.rb', line 50

# Builds a scraper and immediately opens +url+ (or adopts +str_or_io+ as the
# already-fetched content).
#
# +max_redirects+ defaults to 3 for safety (redirect loops).
#
# @param url the URL, or the file path when +is_file+ is true
# @param eat_cookie [true,false] true to fetch and set the HTTP header field
#   'cookie' before opening; this opens the URL an extra time
# @param header [Hash,nil] HTTP header fields merged over DEFAULT_HEADER;
#   pass +{}+ to send just the defaults, or nil to add no header fields
# @param is_file [true,false] true if +url+ is a local file
# @param max_redirects [Integer,nil] redirect budget used by #open_url
# @param max_retries [Integer,nil] socket-error retry budget used by #open_url
# @param redirect_rule [nil,:lenient,:strict] how strictly redirects are checked
# @param str_or_io content to use instead of opening +url+
# @param kargs extra options forwarded when opening the URL or file
def initialize(url,eat_cookie: false,header: nil,is_file: false,max_redirects: 3,max_retries: 3,
    redirect_rule: :strict,str_or_io: nil,**kargs)
  super()

  # Some sites (Search Engines) hate scrapers, so need HTTP header fields.
  # If this isn't enough, look at googler for more header fields to set:
  # - https://github.com/jarun/googler
  # If necessary, can use Faraday, HTTParty, or RestClient gem and
  #   pass in to str_or_io.
  unless header.nil? || is_file
    # Caller-supplied fields win over the defaults; both land in kargs.
    kargs.update(DEFAULT_HEADER.merge(header))
  end

  @kargs = kargs
  @eat_cookie = eat_cookie
  @is_file = is_file
  @redirect_rule = redirect_rule
  @max_redirects = max_redirects
  @max_retries = max_retries

  # Must run last: opening relies on the state assigned above.
  self.open(url,str_or_io,is_file: is_file)
end

Instance Attribute Details

#kargs ⇒ Object (readonly)

Returns the value of attribute kargs.



34
35
36
# File 'lib/nhkore/scraper.rb', line 34

# Reader for the extra options hash stored by #initialize.
#
# @return the current value of @kargs
def kargs
  return @kargs
end

#max_redirects ⇒ Object

Returns the value of attribute max_redirects.



35
36
37
# File 'lib/nhkore/scraper.rb', line 35

# Reader for the redirect budget consulted by #open_url.
#
# @return the current value of @max_redirects
def max_redirects
  return @max_redirects
end

#max_retries ⇒ Object

Returns the value of attribute max_retries.



36
37
38
# File 'lib/nhkore/scraper.rb', line 36

# Reader for the socket-error retry budget consulted by #open_url.
#
# @return the current value of @max_retries
def max_retries
  return @max_retries
end

#redirect_rule ⇒ Object

Returns the value of attribute redirect_rule.



37
38
39
# File 'lib/nhkore/scraper.rb', line 37

# Reader for the redirect rule (nil, :lenient, or :strict) applied by #open_url.
#
# @return the current value of @redirect_rule
def redirect_rule
  return @redirect_rule
end

#str_or_io ⇒ Object

Returns the value of attribute str_or_io.



38
39
40
# File 'lib/nhkore/scraper.rb', line 38

# Reader for the opened content: an IO-like object, or a String once #read
# has replaced it.
#
# @return the current value of @str_or_io
def str_or_io
  return @str_or_io
end

#url ⇒ Object

Returns the value of attribute url.



39
40
41
# File 'lib/nhkore/scraper.rb', line 39

# Reader for the URL (or file path) that was last opened.
#
# @return the current value of @url
def url
  return @url
end

Instance Method Details



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/nhkore/scraper.rb', line 75

# Opens +url+ once to read its 'set-cookie' response header(s) and, if any
# were sent, stores the resulting cookie value under 'cookie' in @kargs.
#
# @str_or_io is left pointing at the response opened here.
#
# @param url the URL to request cookies from
# @return [self]
def fetch_cookie(url)
  require 'http-cookie'

  open_url(url)

  # Array() turns a missing (nil) header into [].
  set_cookies = Array(@str_or_io.meta['set-cookie'])

  return self if set_cookies.empty?

  jar = HTTP::CookieJar.new
  uri = URI(url)

  set_cookies.each { |set_cookie| jar.parse(set_cookie,uri) }

  @kargs['cookie'] = HTTP::Cookie.cookie_value(jar.cookies(uri))

  return self
end

#html_doc ⇒ Object



96
97
98
# File 'lib/nhkore/scraper.rb', line 96

# Parses the opened content as HTML with Nokogiri.
#
# @return the result of Nokogiri::HTML for @str_or_io
def html_doc
  doc = Nokogiri::HTML(@str_or_io)

  return doc
end

#join_url(relative_url) ⇒ Object



100
101
102
103
104
105
106
107
# File 'lib/nhkore/scraper.rb', line 100

# Resolves +relative_url+ against the URL that was opened.
#
# For a file, don't know what to do.
# It would be unsafe to return something else;
#   for example, it could return a lot of "../../../" to your root dir.
#
# @param relative_url the URL to resolve against @url
# @return [URI,nil] the joined URI, or nil if the scraper opened a file
def join_url(relative_url)
  return @is_file ? nil : URI.join(@url,relative_url)
end

#open(url, str_or_io = nil, is_file: @is_file) ⇒ Object



109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/nhkore/scraper.rb', line 109

# Points the scraper at +url+ and loads its content.
#
# If +str_or_io+ is given, it is adopted as the content and nothing is opened.
# Otherwise the file or URL is opened; for a URL, cookies are fetched first
# when @eat_cookie is set.
#
# @param url the URL, or the file path when +is_file+ is true
# @param str_or_io content to use instead of opening +url+
# @param is_file [true,false] true if +url+ is a local file
# @return [self]
def open(url,str_or_io=nil,is_file: @is_file)
  @url = url
  @str_or_io = str_or_io
  @is_file = is_file

  # Content was supplied by the caller, so there is nothing to fetch.
  return self unless str_or_io.nil?

  if @is_file
    open_file(url)
  else
    fetch_cookie(url) if @eat_cookie
    open_url(url)
  end

  return self
end

#open_file(file) ⇒ Object



126
127
128
129
130
131
132
133
134
# File 'lib/nhkore/scraper.rb', line 126

# Opens a local file as the scraper's content, read as UTF-8 text.
#
# The opened File is stored in @str_or_io and is not closed here.
#
# @param file path of the file to open; also stored as @url
# @return [self]
def open_file(file)
  @url = file
  @is_file = true

  # NHK's website tends to always use UTF-8.
  mode = 'rt:UTF-8'
  @str_or_io = File.open(file,mode,**@kargs)

  return self
end

#open_url(url) ⇒ Object



136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/nhkore/scraper.rb', line 136

# Opens +url+ over the network, following redirects manually so that each
# one can be counted and checked against @redirect_rule, and retrying on
# socket errors.
#
# On success, @str_or_io holds the opened response and @url holds the URL
# that was finally opened (the redirect target, if any redirect was followed).
#
# @param url [String,URI] the URL to open
# @return [self]
# @raise [OpenURI::HTTPRedirect] if the redirect budget is exhausted, or a
#   redirect violates @redirect_rule
# @raise [Http404Error] if the HTTP error text includes '404 Not Found'
# @raise [OpenURI::HTTPError] for any other HTTP error
# @raise [SocketError] if the retry budget is exhausted
def open_url(url)
  # nil or a negative limit means "practically unlimited" (capped at 10,000).
  max_redirects = (@max_redirects.nil? || @max_redirects < 0) ? 10_000 : @max_redirects
  max_retries = (@max_retries.nil? || @max_retries < 0) ? 10_000 : @max_retries

  # Computed once, outside the begin block, so that every redirect is compared
  # against the original URL rather than the previous hop.
  top_uri = URI(url)
  # NOTE(review): Util.domain is defined elsewhere; presumably it reduces a
  # host to its registrable domain -- confirm.
  top_domain = Util.domain(top_uri.host)

  begin
    # Use URI().open() instead of URI.open()/(Kernel.)open() for safety (code-injection attack).
    # Use URI() instead of URI.parse() because url can be a URI (not just a string).
    # redirect: false is passed so that a redirect surfaces as
    # OpenURI::HTTPRedirect, which is handled below.
    @str_or_io = URI(url).open(redirect: false,**@kargs)
    @url = url
  rescue OpenURI::HTTPRedirect => redirect
    redirect_uri = redirect.uri

    # Spend one unit of the redirect budget; give up once it is exhausted.
    if (max_redirects -= 1) < 0
      raise redirect.exception("redirected to URL[#{redirect_uri}]: #{redirect}")
    end

    # Any other rule (e.g. nil) performs no checks on the redirect target.
    case @redirect_rule
    when :lenient,:strict
      # Both rules require the redirect to keep the original scheme.
      if redirect_uri.scheme != top_uri.scheme
        raise redirect.exception("redirect scheme[#{redirect_uri.scheme}] does not match original " \
          "scheme[#{top_uri.scheme}] at redirect URL[#{redirect_uri}]: #{redirect}")
      end

      # :strict additionally requires the redirect to stay on the original domain.
      if @redirect_rule == :strict
        redirect_domain = Util.domain(redirect_uri.host)

        if redirect_domain != top_domain
          raise redirect.exception("redirect domain[#{redirect_domain}] does not match original " \
            "domain[#{top_domain}] at redirect URL[#{redirect_uri}]: #{redirect}")
        end
      end
    end

    # Re-run the begin block against the redirect target.
    url = redirect_uri

    retry
  # Must come after the HTTPRedirect clause, since HTTPRedirect is a subclass of HTTPError.
  rescue OpenURI::HTTPError => e
    msg = "HTTP error[#{e}] at URL[#{url}]"

    # 404 is detected from the error text and raised as its own error class.
    if e.to_s.include?('404 Not Found')
      raise Http404Error,msg
    else
      raise e.exception(msg)
    end
  rescue SocketError => e
    # Spend one unit of the retry budget; give up once it is exhausted.
    if (max_retries -= 1) < 0
      raise e.exception("Socket error[#{e}] at URL[#{url}]")
    end

    # Retries immediately, with no delay between attempts.
    retry
  end

  return self
end

#read ⇒ Object



195
196
197
198
199
# File 'lib/nhkore/scraper.rb', line 195

# Reads the opened content fully, if it has not been read already.
#
# When @str_or_io responds to +read+, it is replaced by the result of reading
# it, so later calls return that same result.
#
# @return the content held in @str_or_io
def read
  if @str_or_io.respond_to?(:read)
    @str_or_io = @str_or_io.read
  end

  return @str_or_io
end

#reopen ⇒ Object



201
202
203
# File 'lib/nhkore/scraper.rb', line 201

# Opens the current @url again via #open, with no content supplied, so the
# file or URL is fetched afresh.
#
# @return the result of #open
def reopen
  reopened = self.open(@url)

  return reopened
end

#rss_doc ⇒ Object



205
206
207
208
209
# File 'lib/nhkore/scraper.rb', line 205

# Parses the opened content as an RSS feed, with validation turned off.
#
# The 'rss' library is loaded lazily, on first use.
#
# @return the result of RSS::Parser.parse for @str_or_io
def rss_doc
  require 'rss'

  feed = RSS::Parser.parse(@str_or_io,validate: false)

  return feed
end