Class: Grubby

Inherits:
Mechanize
  • Object
show all
Defined in:
lib/grubby.rb,
lib/grubby/version.rb

Defined Under Namespace

Classes: JsonParser, JsonScraper, PageScraper, Scraper

Constant Summary collapse

VERSION =
"1.0.0"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(singleton_journal = nil) ⇒ Grubby

Returns a new instance of Grubby.

Parameters:

  • singleton_journal (Pathname, String) (defaults to: nil)

    Optional journal file to persist the list of resources processed by #singleton. Useful to ensure only-once processing across multiple program runs.



33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/grubby.rb', line 33

# Builds a Grubby agent on top of the Mechanize defaults, then layers on
# history, parser, rate-limit, and singleton-journal configuration.
#
# @param singleton_journal [Pathname, String, nil]
#   Optional journal file used to persist the resources processed by
#   #singleton, so that only-once processing holds across program runs.
def initialize(singleton_journal = nil)
  super()

  # Keep no history.  This avoids "memory leaks", and also keeps a
  # mistakenly blank url from resolving: blank urls resolve relative to
  # the last history entry, so an erroneous `agent.get("")` could
  # otherwise sometimes fetch a page successfully.
  self.max_history = 0

  # By default, do not buffer files of unforeseen content type into
  # memory, in case they are very large.  The "large" threshold is
  # raised, though, to avoid unnecessary writes to disk.
  #
  # References:
  #   - http://docs.seattlerb.org/mechanize/Mechanize/PluggableParser.html
  #   - http://docs.seattlerb.org/mechanize/Mechanize/Download.html
  #   - http://docs.seattlerb.org/mechanize/Mechanize/File.html
  self.max_file_buffer = 1_000_000 # only applies to Mechanize::Download
  pluggable_parser.default = Mechanize::Download
  pluggable_parser["text/plain"] = Mechanize::File
  pluggable_parser["application/json"] = Grubby::JsonParser

  # Rate limiting is configurable via #time_between_requests; start
  # with a reasonable default.
  pre_connect_hooks << proc { send(:sleep_between_requests) }
  self.time_between_requests = 1.0

  # Without a journal path, fall back to the null path.
  @journal =
    if singleton_journal
      singleton_journal.to_pathname.touch_file
    else
      Pathname::NULL
    end

  # Index the journaled keys by purpose, then by key, for quick lookup.
  journaled_keys = SingletonKey.parse_file(@journal)
  @seen = journaled_keys.group_by(&:purpose).transform_values { |sks|
    sks.map(&:key).index_to { true }
  }
end

Instance Attribute Details

#time_between_requests ⇒ Integer, ...

Returns the enforced minimum amount of time to wait between requests, in seconds. If the value is a Range, a random number within the Range is chosen for each request.

Returns:

  • (Integer, Float, Range<Integer>, Range<Float>)

    The enforced minimum amount of time to wait between requests, in seconds. If the value is a Range, a random number within the Range is chosen for each request.



27
28
29
# File 'lib/grubby.rb', line 27

# Reader for the enforced minimum amount of time to wait between
# requests, in seconds.  If the value is a Range, a random number within
# the Range is chosen for each request.
#
# @return [Integer, Float, Range<Integer>, Range<Float>]
def time_between_requests
  @time_between_requests
end

Instance Method Details

#get_mirrored(mirror_uris, parameters = [], referer = nil, headers = {}) ⇒ Mechanize::Page, ...

Calls #get with each of mirror_uris until a successful (“200 OK”) response is received, and returns that #get result. Rescues and logs Mechanize::ResponseCodeError failures for all but the last mirror.

Parameters:

Returns:

Raises:

  • (Mechanize::ResponseCodeError)

    if all mirror_uris fail



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/grubby.rb', line 76

# Calls #get with each of +mirror_uris+, in order, until one of them
# succeeds, and returns that #get result.  A
# Mechanize::ResponseCodeError from any mirror but the last is rescued
# and logged; the error from the last mirror is re-raised.
#
# @param mirror_uris [Array]
#   candidate URIs, tried in order; must not be empty
# @param parameters [Array]
#   passed through to #get unchanged
# @param referer [Object, nil]
#   passed through to #get unchanged
# @param headers [Hash]
#   passed through to #get unchanged
# @return the result of the first #get call that does not raise
# @raise [ArgumentError]
#   if +mirror_uris+ is empty
# @raise [Mechanize::ResponseCodeError]
#   if all +mirror_uris+ fail
def get_mirrored(mirror_uris, parameters = [], referer = nil, headers = {})
  # Fail fast with a clear message rather than handing a nil URI to #get.
  if mirror_uris.empty?
    raise ArgumentError, "mirror_uris must contain at least one URI"
  end

  i = 0
  begin
    get(mirror_uris[i], parameters, referer, headers)
  rescue Mechanize::ResponseCodeError => e
    i += 1
    # Out of mirrors: surface the failure of the last one to the caller.
    raise if i >= mirror_uris.length

    $log.info("Mirror failed with response code #{e.response_code}: #{mirror_uris[i - 1]}")
    $log.debug("Trying next mirror: #{mirror_uris[i]}")
    # Bounded retry: `i` advances on every failure, so at most
    # `mirror_uris.length` attempts are made.
    retry
  end
end

#singleton(target, purpose = "") {|resource| ... } ⇒ Boolean

Ensures only-once processing of the resource indicated by target for the specified purpose. A list of previously-processed resource URIs and content hashes is maintained in the Grubby instance. The given block is called with the fetched resource only if the resource’s URI and the resource’s content hash have not been previously processed under the specified purpose.

Parameters:

Yields:

  • (resource)

    processes the resource

Yield Parameters:

Returns:

  • (Boolean)

    whether the given block was called

Raises:

  • (Mechanize::ResponseCodeError)

    if fetching the resource results in error (see Mechanize#get)



111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# File 'lib/grubby.rb', line 111

# Ensures only-once processing of the resource indicated by +target+ for
# the specified +purpose+.  The given block is called with the fetched
# resource only if none of the resource's keys (original URL, normalized
# URL, final URI, content hash) is reported as already processed.
#
# @param target [#to_absolute_uri]
#   identifies the resource to fetch
# @param purpose [String]
#   namespace under which processed keys are checked and journaled
# @yield [resource] processes the resource
# @yieldparam resource the object returned by #get
# @return [Boolean]
#   whether the given block was called
# @raise [Mechanize::ResponseCodeError]
#   if fetching the resource results in error (see Mechanize#get)
def singleton(target, purpose = "")
  series = []

  # Return an explicit false (not nil) on the early exits, so the result
  # is always a Boolean as documented.
  original_url = target.to_absolute_uri
  return false if skip_singleton?(purpose, original_url.to_s, series)

  url = normalize_url(original_url)
  return false if skip_singleton?(purpose, url.to_s, series)

  $log.info("Fetching #{url}")
  resource = get(url)

  # Non-short-circuit `|` is deliberate: both checks must run even when
  # the first is true (presumably so each key is added to `series` --
  # confirm against skip_singleton?).
  skip = skip_singleton?(purpose, resource.uri.to_s, series) |
    skip_singleton?(purpose, "content hash: #{resource.content_hash}", series)

  yield resource unless skip

  # Journal the keys only after the block has returned without raising.
  series.map { |k| SingletonKey.new(purpose, k) }.append_to_file(@journal)

  !skip
end