Class: NHKore::Scraper

Inherits:
Object
  • Object
show all
Extended by:
AttrBool::Ext
Defined in:
lib/nhkore/scraper.rb

Overview

Author:

  • Jonathan Bradley Whited

Since:

  • 0.2.0

Direct Known Subclasses

ArticleScraper, DictScraper, SearchScraper

Constant Summary collapse

DEFAULT_HEADER =

Since:

  • 0.2.0

# Default HTTP request header fields. #initialize merges a caller-supplied
# header Hash over these before adding them to the open options.
{
  # Evaluated once, when this constant is defined.
  # NOTE(review): UserAgents is defined elsewhere; presumably a collection of
  #   user-agent strings -- confirm.
  'user-agent' => UserAgents.sample,
  # The value is split over two adjacent string literals joined by the
  # trailing backslash.
  'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;image/webp' \
              ',image/apng,*/*;application/signed-exchange',
  # DNT ("Do Not Track") request header field.
  'dnt' => '1',
}.freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, eat_cookie: false, header: nil, is_file: false, max_redirects: 3, max_retries: 3, redirect_rule: :strict, str_or_io: nil, **kargs) ⇒ Scraper

max_redirects defaults to 3 as a safeguard against endless redirect loops (a possible infinite-loop attack).

All URL options: ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html

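Pass in header: {} to send only the default HTTP header fields (DEFAULT_HEADER). With header: nil (the default), or when is_file is true, no header fields are added.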

Parameters:

  • eat_cookie (true, false) (defaults to: false)

    true to set the HTTP header field ‘cookie’, which can be an expensive (time-consuming) operation since it opens the URL again, but necessary for some URLs.

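  • redirect_rule (nil, :lenient, :strict) (defaults to: :strict)

    :lenient requires every redirect to keep the scheme of the original URL; :strict additionally requires the same domain; nil follows redirects without these checks.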

Since:

  • 0.2.0



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/nhkore/scraper.rb', line 53

# Stores the scraping options and then opens +url+ through #open.
#
# When +header+ is not nil and +is_file+ is false, +header+ is merged over
# DEFAULT_HEADER and the result is added to +kargs+, so the header fields
# travel with the other open options.
#
# @param url [String,URI] what to open; handed to #open unchanged
# @param eat_cookie [true,false] stored in @eat_cookie; #open calls
#   #fetch_cookie first when it is set
# @param header [Hash,nil] extra HTTP header fields; nil adds none,
#   +{}+ adds only DEFAULT_HEADER
# @param is_file [true,false] handed to #open; header fields are skipped when true
# @param max_redirects [Integer,nil] redirect budget read by #open_url
# @param max_retries [Integer,nil] retry budget read by #open_url
# @param redirect_rule [nil,:lenient,:strict] redirect check mode read by #open_url
# @param str_or_io [String,IO,nil] handed to #open unchanged
# @param kargs [Hash] any remaining options; kept in @kargs
def initialize(url,eat_cookie: false,header: nil,is_file: false,max_redirects: 3,max_retries: 3,
    redirect_rule: :strict,str_or_io: nil,**kargs)
  super()

  wants_header = !(header.nil? || is_file)

  if wants_header
    # Some sites (search engines in particular) reject scrapers that send no
    # HTTP header fields. Should these fields not be enough, googler sets
    # more of them and is a good reference:
    # - https://github.com/jarun/googler
    # As a last resort, fetch the page with Faraday, HTTParty, or RestClient
    #   and hand the result to str_or_io instead.
    kargs.update(DEFAULT_HEADER.merge(header))
  end

  @kargs = kargs
  @eat_cookie = eat_cookie
  @is_file = is_file
  @redirect_rule = redirect_rule
  @max_redirects = max_redirects
  @max_retries = max_retries

  self.open(url,str_or_io,is_file: is_file)
end

Instance Attribute Details

#kargs ⇒ Object (readonly)

Since:

  • 0.2.0



37
38
39
# File 'lib/nhkore/scraper.rb', line 37

# @return [Hash] the options stored in @kargs by #initialize
def kargs
  return @kargs
end

#max_redirects ⇒ Object

Since:

  • 0.2.0



38
39
40
# File 'lib/nhkore/scraper.rb', line 38

# @return [Integer,nil] the redirect budget stored in @max_redirects
def max_redirects
  return @max_redirects
end

#max_retries ⇒ Object

Since:

  • 0.2.0



39
40
41
# File 'lib/nhkore/scraper.rb', line 39

# @return [Integer,nil] the retry budget stored in @max_retries
def max_retries
  return @max_retries
end

#redirect_rule ⇒ Object

Since:

  • 0.2.0



40
41
42
# File 'lib/nhkore/scraper.rb', line 40

# @return [nil,:lenient,:strict] the redirect check mode stored in @redirect_rule
def redirect_rule
  return @redirect_rule
end

#str_or_io ⇒ Object

Since:

  • 0.2.0



41
42
43
# File 'lib/nhkore/scraper.rb', line 41

# @return [Object] whatever is currently stored in @str_or_io
#   (the opened IO, or a String once #read has been called)
def str_or_io
  return @str_or_io
end

#url ⇒ Object

Since:

  • 0.2.0



42
43
44
# File 'lib/nhkore/scraper.rb', line 42

# @return [Object] the value stored in @url (the URL or file last opened)
def url
  return @url
end

Instance Method Details

#fetch_cookie(url) ⇒ Object

Since:

  • 0.2.0



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/nhkore/scraper.rb', line 78

# Opens +url+ once with #open_url and, if the response carries any
# 'set-cookie' meta value, stores the resulting cookie string in
# @kargs['cookie'].
#
# @param url [String,URI] the URL to request the cookie from
# @return [self]
def fetch_cookie(url)
  # Loaded lazily; only needed when cookies are requested.
  require 'http-cookie'

  open_url(url)

  # Array() turns a missing (nil) value into [].
  set_cookies = Array(@str_or_io.meta['set-cookie'])

  unless set_cookies.empty?
    cookie_jar = HTTP::CookieJar.new
    cookie_uri = URI(url)

    set_cookies.each { |raw_cookie| cookie_jar.parse(raw_cookie,cookie_uri) }

    cookie_header = HTTP::Cookie.cookie_value(cookie_jar.cookies(cookie_uri))
    @kargs['cookie'] = cookie_header
  end

  return self
end

#html_doc ⇒ Object

Since:

  • 0.2.0



99
100
101
# File 'lib/nhkore/scraper.rb', line 99

# Parses the current content (@str_or_io) with Nokogiri::HTML.
#
# @return [Object] whatever Nokogiri::HTML returns for @str_or_io
def html_doc
  source = @str_or_io

  return Nokogiri::HTML(source)
end

#join_url(relative_url) ⇒ Object

Since:

  • 0.2.0



103
104
105
106
107
108
109
110
# File 'lib/nhkore/scraper.rb', line 103

# Resolves +relative_url+ against the URL this scraper currently holds.
#
# @param relative_url [String,URI] the path or URL to resolve against @url
# @return [URI,nil] the joined URI, or nil when scraping a file
def join_url(relative_url)
  if @is_file
    # There is no sensible base for a local file, and guessing would be
    #   unsafe: a relative path could climb with "../../../" all the way
    #   to the root dir.
    return nil
  else
    return URI.join(@url,relative_url)
  end
end

#open(url, str_or_io = nil, is_file: @is_file) ⇒ Object

Since:

  • 0.2.0



112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/nhkore/scraper.rb', line 112

# Points this scraper at +url+ and loads its content unless +str_or_io+
# already provides it.
#
# @param url [String,URI] the URL, or the file path when +is_file+ is true
# @param str_or_io [String,IO,nil] content to use as-is; nil means load it
# @param is_file [true,false] whether +url+ is a local file
#   (defaults to the current @is_file)
# @return [self]
def open(url,str_or_io=nil,is_file: @is_file)
  @url = url
  @str_or_io = str_or_io
  @is_file = is_file

  # The caller supplied the content, so nothing has to be loaded.
  return self unless str_or_io.nil?

  if @is_file
    open_file(url)
  else
    # The cookie request must happen first so that the real request
    #   below can send it.
    fetch_cookie(url) if @eat_cookie
    open_url(url)
  end

  return self
end

#open_file(file) ⇒ Object

Since:

  • 0.2.0



129
130
131
132
133
134
135
136
137
# File 'lib/nhkore/scraper.rb', line 129

# Opens a local file as the content source and marks this scraper as
# file-based.
#
# @param file [String] path of the file to open; also stored in @url
# @return [self]
def open_file(file)
  @url = file
  @is_file = true

  # NHK's website practically always serves UTF-8, so read text as UTF-8.
  text_mode = 'rt:UTF-8'
  @str_or_io = File.open(file,text_mode,**@kargs)

  return self
end

#open_url(url) ⇒ Object

Since:

  • 0.2.0



139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# File 'lib/nhkore/scraper.rb', line 139

# Opens +url+ with open-uri and stores the result in @str_or_io, handling
# redirects and socket errors itself.
#
# open-uri is told not to follow redirects (+redirect: false+); instead each
# OpenURI::HTTPRedirect is checked against the redirect budget and
# @redirect_rule, and the request is then retried with the redirect URI.
# A SocketError is retried until the retry budget is used up.
#
# @param url [String,URI] the URL to open
# @return [self]
# @raise [OpenURI::HTTPRedirect] when the redirect budget is used up or a
#   redirect breaks @redirect_rule
# @raise [OpenURI::HTTPError] for any other HTTP error, with the URL added
#   to the message
# @raise [SocketError] once the retry budget is used up
def open_url(url)
  # A nil or negative limit is replaced by a budget of 10,000.
  max_redirects = (@max_redirects.nil? || @max_redirects < 0) ? 10_000 : @max_redirects
  max_retries = (@max_retries.nil? || @max_retries < 0) ? 10_000 : @max_retries

  # Computed once, outside the begin block, so every redirect is compared
  # against the URL this method was first called with.
  top_uri = URI(url)
  top_domain = Util.domain(top_uri.host)

  begin
    # Use URI().open() instead of URI.open()/(Kernel.)open() for safety (code-injection attack).
    # Use URI() instead of URI.parse() because url can be a URI (not just a string).
    @str_or_io = URI(url).open(redirect: false,**@kargs)
    # Only reached on success; after a redirect, url is the redirect URI.
    @url = url
  rescue OpenURI::HTTPRedirect => redirect
    redirect_uri = redirect.uri

    # Spend one redirect from the budget; give up once it is exhausted.
    if (max_redirects -= 1) < 0
      raise redirect.exception("redirected to URL[#{redirect_uri}]: #{redirect}")
    end

    # Any rule other than :lenient/:strict (e.g. nil) skips the checks below.
    case @redirect_rule
    when :lenient,:strict
      # Both rules require the redirect to keep the original scheme.
      if redirect_uri.scheme != top_uri.scheme
        raise redirect.exception("redirect scheme[#{redirect_uri.scheme}] does not match original " \
          "scheme[#{top_uri.scheme}] at redirect URL[#{redirect_uri}]: #{redirect}")
      end

      # Only :strict also requires the redirect to stay on the original domain.
      if @redirect_rule == :strict
        redirect_domain = Util.domain(redirect_uri.host)

        if redirect_domain != top_domain
          raise redirect.exception("redirect domain[#{redirect_domain}] does not match original " \
            "domain[#{top_domain}] at redirect URL[#{redirect_uri}]: #{redirect}")
        end
      end
    end

    # Re-run the begin block against the redirect target.
    url = redirect_uri

    retry
  # Must come after HTTPRedirect since a subclass of HTTPError.
  rescue OpenURI::HTTPError => e
    raise e.exception("HTTP error[#{e}] at URL[#{url}]")
  rescue SocketError => e
    # Spend one retry from the budget; give up once it is exhausted.
    if (max_retries -= 1) < 0
      raise e.exception("Socket error[#{e}] at URL[#{url}]")
    end

    retry
  end

  return self
end

#read ⇒ Object

Since:

  • 0.2.0



192
193
194
195
196
# File 'lib/nhkore/scraper.rb', line 192

# Reads the current content fully. If @str_or_io responds to +read+, it is
# replaced by the result of reading it, so later calls return that same
# value without reading again.
#
# @return [Object] the content now stored in @str_or_io
def read
  if @str_or_io.respond_to?(:read)
    @str_or_io = @str_or_io.read
  end

  return @str_or_io
end

#reopen ⇒ Object

Since:

  • 0.2.0



198
199
200
# File 'lib/nhkore/scraper.rb', line 198

# Opens the current @url again through #open.
#
# @return [Object] the result of #open
def reopen
  current_url = @url

  # Explicit receiver: this is the scraper's own #open, not Kernel#open.
  return self.open(current_url)
end

#rss_doc ⇒ Object

Since:

  • 0.2.0



202
203
204
205
206
# File 'lib/nhkore/scraper.rb', line 202

# Parses the current content (@str_or_io) as RSS, with validation turned off.
#
# @return [Object] whatever RSS::Parser.parse returns for @str_or_io
def rss_doc
  # Loaded lazily; only needed when RSS is parsed.
  require 'rss'

  feed = RSS::Parser.parse(@str_or_io,validate: false)

  return feed
end