Module: ScraperUtils::SpecSupport

Defined in:
lib/scraper_utils/spec_support.rb

Overview

Methods to support specs

Constant Summary collapse

AUSTRALIAN_STATES =
%w[ACT NSW NT QLD SA TAS VIC WA].freeze
STREET_TYPE_PATTERNS =
[
  /\bArcade\b/i,
  /\bAv(e(nue)?)?\b/i,
  /\bB(oulevard|lvd|vd)\b/i,
  /\b(Circuit|Cct)\b/i,
  /\bCir(cle)?\b/i,
  /\bCl(ose)?\b/i,
  /\bC(our|r)?t\b/i,
  /\bChase\b/i,
  /\bCorso\b/i,
  /\bCr(es(cent)?)?\b/i,
  /\bCross\b/i,
  /\bDr((ive)?|v)\b/i,
  /\bEnt(rance)?\b/i,
  /\bEsp(lanade)?\b/i,
  /\bGr(ove)?\b/i,
  /\bH(ighwa|w)y\b/i,
  /\bL(ane?|a)\b/i,
  /\bLoop\b/i,
  /\bM(ews|w)\b/i,
  /\bP(arade|de)\b/i,
  /\bParkway\b/i,
  /\bPl(ace)?\b/i,
  /\bPriv(ate)?\b/i,
  /\bProm(enade)?\b/i,
  /\bQuay\b/i,
  /\bR(oa)?d\b/i,
  /\bR(idge|dg)\b/i,
  /\bRise\b/i,
  /\bSq(uare)?\b/i,
  /\bSt(reet)?\b/i,
  /\bT(erra)?ce\b/i,
  /\bWa?y\b/i
].freeze
AUSTRALIAN_POSTCODES =
/\b\d{4}\b/.freeze
PLANNING_KEYWORDS =
[
  # Building types
  'dwelling', 'house', 'unit', 'building', 'structure', 'facility',
  # Modifications
  'addition', 'extension', 'renovation', 'alteration', 'modification',
  'replacement', 'upgrade', 'improvement',
  # Specific structures
  'carport', 'garage', 'shed', 'pool', 'deck', 'patio', 'pergola',
  'verandah', 'balcony', 'fence', 'wall', 'driveway',
  # Development types
  'subdivision', 'demolition', 'construction', 'development',
  # Services/utilities
  'signage', 'telecommunications', 'stormwater', 'water', 'sewer',
  # Approvals/certificates
  'certificate', 'approval', 'consent', 'permit'
].freeze
PLACEHOLDERS =
[
  /no description/i,
  /not available/i,
  /to be confirmed/i,
  /\btbc\b/i,
  %r{\bn/a\b}i
].freeze

Class Method Summary collapse

Class Method Details

.authority_label(results, prefix: '', suffix: '') ⇒ Object



78
79
80
81
82
83
84
85
86
# File 'lib/scraper_utils/spec_support.rb', line 78

# Derives the single authority_label shared by all records, optionally
# wrapped in a prefix and suffix.
#
# @param results [Array<Hash>, nil] scraped records (may be nil)
# @param prefix [String] text prepended to the label
# @param suffix [String] text appended to the label
# @return [String, nil] the decorated label, or nil when results is nil or
#   contains no authority_label values
# @raise [RuntimeError] when more than one distinct authority_label is present
def self.authority_label(results, prefix: '', suffix: '')
  return nil if results.nil?

  labels = results.map { |result| result['authority_label'] }.compact.uniq
  return nil if labels.empty?
  raise "Expected one authority_label, not #{labels.inspect}" if labels.size > 1

  "#{prefix}#{labels.first}#{suffix}"
end

.bot_protection_detected?(page) ⇒ Boolean

Check if the page response indicates bot protection

Parameters:

  • page (Mechanize::Page)

    The page response to check

Returns:

  • (Boolean)

    True if bot protection is detected



278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
# File 'lib/scraper_utils/spec_support.rb', line 278

# Check if the page response indicates bot protection.
#
# @param page [Mechanize::Page] the page response to check; must respond to
#   +code+ (String HTTP status) and +body+
# @return [Boolean] true if bot protection is detected
def self.bot_protection_detected?(page)
  # 403 Forbidden and 429 Too Many Requests are typical bot-protection statuses
  return true if %w[403 429].include?(page.code)

  return false unless page.body

  # page.body is known non-nil here, so plain downcase suffices
  # (the original used a redundant safe-navigation call)
  body_lower = page.body.downcase

  # Check for common bot protection indicators
  bot_indicators = [
    'recaptcha',
    'cloudflare',
    'are you human',
    'bot detection',
    'security check',
    'verify you are human',
    'access denied',
    'blocked',
    'captcha'
  ].freeze

  bot_indicators.any? { |indicator| body_lower.include?(indicator) }
end

.check_info_url_details(results, percentage, variation, bot_check_expected, &block) ⇒ Object



353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
# File 'lib/scraper_utils/spec_support.rb', line 353

# Checks that sampled records' info_url pages contain the record's
# council_reference, address and description.
#
# Samples records at fibonacci indices (deduplicated) to bound the number of
# pages fetched while still covering early and late records.
#
# @param results [Array<Hash>] records from scraping an authority
# @param percentage [Numeric] min percentage of attribute checks expected to pass
# @param variation [Numeric] extra shortfall allowed in addition to percentage
# @param bot_check_expected [Boolean] when true, pages behind bot protection
#   are skipped rather than validated
# @yield [String] optional block to fetch a url (e.g. to handle terms
#   agreement); defaults to fetch_url_with_redirects
# @raise [UnprocessableRecord] when a fetched page is not a 200 response
# @raise [RuntimeError] when too many attribute checks fail
def self.check_info_url_details(results, percentage, variation, bot_check_expected, &block)
  count = 0
  failed = 0
  # Sample indices 0, 1, 2, 3, 5, 8, ... up to results.size - 1
  fib_indices = ScraperUtils::MathsUtils.fibonacci_series(results.size - 1).uniq

  fib_indices.each do |index|
    record = results[index]
    info_url = record["info_url"]
    puts "Checking info_url[#{index}]: #{info_url} has the expected reference, address and description..."

    page = block_given? ? block.call(info_url) : fetch_url_with_redirects(info_url)

    if bot_check_expected && bot_protection_detected?(page)
      puts "  Bot protection detected - skipping detailed validation"
      next
    end

    raise UnprocessableRecord, "Expected 200 response, got #{page.code}" unless page.code == "200"

    # Normalize whitespace runs so multi-space differences don't cause misses
    page_body = page.body.dup.force_encoding("UTF-8").gsub(/\s\s+/, " ")

    %w[council_reference address description].each do |attribute|
      count += 1
      # HTML-escape the expected value since we search the raw page body
      expected = CGI.escapeHTML(record[attribute]).gsub(/\s\s+/, " ")
      # Fallback 1: strip a leading "DA -" from references; swap the last two
      # address tokens (postcode/state order differs on some sites)
      expected2 = case attribute
                  when 'council_reference'
                    expected.sub(/\ADA\s*-\s*/, '')
                  when 'address'
                    expected.sub(/(\S+)\s+(\S+)\z/, '\2 \1').sub(/,\s*\z/, '') # Handle Lismore post-code/state swap
                  else
                    expected
                  end
      # Fallback 2: drop a trailing state from addresses, then normalize
      # commas and hyphens (compared against a similarly-normalized body)
      expected3 = case attribute
                  when 'address'
                    expected.sub(/\s*,?\s+(VIC|NSW|QLD|SA|TAS|WA|ACT|NT)\z/, '')
                  else
                    expected
                  end.gsub(/\s*,\s*/, ' ').gsub(/\s*-\s*/, '-')
      next if page_body.include?(expected) || page_body.include?(expected2) || page_body.gsub(/\s*,\s*/, ' ').gsub(/\s*-\s*/, '-').include?(expected3)

      failed += 1
      # Only mention fallbacks in the failure message when they differ
      desc2 = expected2 == expected ? '' : " or #{expected2.inspect}"
      desc3 = expected3 == expected ? '' : " or #{expected3.inspect}"
      puts "  Missing: #{expected.inspect}#{desc2}#{desc3}"
      puts "    IN: #{page_body}" if ENV['DEBUG']

      # Fail fast once the pass count can no longer meet the threshold
      min_required = ((percentage.to_f / 100.0) * count - variation).round(0)
      passed = count - failed
      raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})" if passed < min_required
    end
  end

  puts "#{(100.0 * (count - failed) / count).round(1)}% detail checks passed (#{failed}/#{count} failed)!" if count > 0
end

.check_info_url_is_present(results, percentage, variation, &block) ⇒ Object



316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
# File 'lib/scraper_utils/spec_support.rb', line 316

# Spot-checks that sampled records' info_urls respond to a HEAD request
# with a 2xx status. Records are sampled at fibonacci indices to bound
# the number of requests made.
#
# @param results [Array<Hash>] records from scraping an authority
# @param percentage [Numeric] min percentage of checks expected to pass
# @param variation [Numeric] extra shortfall allowed in addition to percentage
# @yield [String] optional block to fetch a url; defaults to fetch_url_head
# @raise [RuntimeError] when too many checks fail
def self.check_info_url_is_present(results, percentage, variation, &block)
  checked = 0
  failures = 0
  sample_indices = ScraperUtils::MathsUtils.fibonacci_series(results.size - 1).uniq

  sample_indices.each do |index|
    info_url = results[index]["info_url"]
    puts "Checking info_url[#{index}]: #{info_url} is present..."

    status =
      begin
        response = block_given? ? block.call(info_url) : fetch_url_head(info_url)
        response.code.to_i
      rescue Mechanize::ResponseCodeError => e
        e.response_code.to_i
      end

    # Bot protection statuses are skipped rather than counted as failures
    if [403, 429].include?(status)
      puts "  Bot protection detected - skipping"
      next
    end

    checked += 1
    if (200..299).cover?(status)
      puts "  OK: #{status}" if ENV['DEBUG']
    else
      failures += 1
      puts "  Failed: #{status}"
      # Fail fast once the pass count can no longer meet the threshold
      min_required = ((percentage.to_f / 100.0) * checked - variation).round(0)
      passing = checked - failures
      raise "Too many failures: #{passing}/#{checked} passed (min required: #{min_required})" if passing < min_required
    end
  end

  puts "#{(100.0 * (checked - failures) / checked).round(1)}% info_url checks passed (#{failures}/#{checked} failed)!" if checked > 0
end

.fetch_url_head(url) ⇒ Object



66
67
68
69
70
# File 'lib/scraper_utils/spec_support.rb', line 66

# Issues a HEAD request for url using a fresh Mechanize agent.
#
# @param url [String] the url to check
# @return [Mechanize::Page] the HEAD response
# FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
def self.fetch_url_head(url)
  Mechanize.new.head(url)
end

.fetch_url_with_redirects(url) ⇒ Object



72
73
74
75
76
# File 'lib/scraper_utils/spec_support.rb', line 72

# Fetches url (following redirects) using a fresh Mechanize agent.
#
# @param url [String] the url to fetch
# @return [Mechanize::Page] the GET response
# FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
def self.fetch_url_with_redirects(url)
  Mechanize.new.get(url)
end

.geocodable?(address, ignore_case: false, known_suburbs: []) ⇒ Boolean

Check if an address is likely to be geocodable by analyzing its format. This is a bit stricter than needed - typically assert >= 75% match

Parameters:

  • address (String)

    The address to check

  • ignore_case (Boolean) (defaults to: false)

    Ignores case which relaxes suburb check

  • known_suburbs (Array<String>) (defaults to: [])

    Known suburbs to detect in address when there is no postcode and no uppercase suburb

Returns:

  • (Boolean)

    True if the address appears to be geocodable.



139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/scraper_utils/spec_support.rb', line 139

# Check if an address is likely to be geocodable by analyzing its format.
# Requires a street type (or unit/lot), a suburb indicator (postcode,
# uppercase suburb, or known suburb) and an Australian state.
# This is a bit stricter than needed - typically assert >= 75% match.
#
# @param address [String] the address to check
# @param ignore_case [Boolean] ignores case which relaxes the suburb check
# @param known_suburbs [Array<String>] known suburbs to detect in the address
#   when there is no postcode and no uppercase suburb
# @return [Boolean] true if the address appears to be geocodable
def self.geocodable?(address, ignore_case: false, known_suburbs: [])
  return false if address.nil? || address.empty?

  candidate = ignore_case ? address.upcase : address

  # State must end the address or appear surrounded by spaces
  state_found = AUSTRALIAN_STATES.any? do |state|
    candidate.end_with?(" #{state}") || candidate.include?(" #{state} ")
  end
  postcode_found = AUSTRALIAN_POSTCODES.match?(address)

  # Using the pre-compiled patterns
  street_type_found = STREET_TYPE_PATTERNS.any? { |pattern| pattern.match?(candidate) }

  # Suburb detection uses the ORIGINAL casing: an all-caps word that is not
  # a state abbreviation, or a caller-supplied known suburb
  all_caps_words = address.scan(/\b[A-Z]{2,}\b/)
  uppercase_suburb_found = all_caps_words.any? { |word| !AUSTRALIAN_STATES.include?(word) }
  known_suburb_found = known_suburbs.any? { |suburb| address.include?(suburb) }

  suburb_or_postcode = postcode_found || uppercase_suburb_found || known_suburb_found

  if ENV["DEBUG"]
    missing = []
    missing << "street type" unless street_type_found
    missing << "postcode/Uppercase suburb/Known suburb" unless suburb_or_postcode
    missing << "state" unless state_found
    puts "  address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
  end

  street_type_found && suburb_or_postcode && state_found
end

.placeholder?(text) ⇒ Boolean

Returns:

  • (Boolean)


173
174
175
# File 'lib/scraper_utils/spec_support.rb', line 173

# True when the text matches any known placeholder pattern
# (e.g. "no description", "TBC", "N/A").
#
# @param text [Object] value to check (converted with to_s)
# @return [Boolean]
def self.placeholder?(text)
  value = text.to_s
  PLACEHOLDERS.any? { |pattern| pattern.match?(value) }
end

.reasonable_description?(text) ⇒ Boolean

Check if this looks like a “reasonable” description. This is a bit stricter than needed - typically assert >= 75% match

Returns:

  • (Boolean)


202
203
204
205
206
207
208
209
210
211
# File 'lib/scraper_utils/spec_support.rb', line 202

# Check if this looks like a "reasonable" description: not a placeholder,
# and either 3+ words long or containing a planning keyword.
# This is a bit stricter than needed - typically assert >= 75% match.
#
# @param text [Object] description to check (converted with to_s)
# @return [Boolean]
def self.reasonable_description?(text)
  return false if placeholder?(text)

  # Long descriptions (3+ words) are assumed reasonable
  word_count = text.to_s.split.size
  return true if word_count >= 3

  # Short descriptions must contain at least one planning keyword
  lowered = text.to_s.downcase
  PLANNING_KEYWORDS.any? { |keyword| lowered.include?(keyword) }
end

.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3, ignore_case: false, known_suburbs: []) ⇒ Object

Validates enough addresses are geocodable

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • percentage (Integer) (defaults to: 50)

    The min percentage of addresses expected to be geocodable (default:50)

  • variation (Integer) (defaults to: 3)

    The variation allowed in addition to percentage (default:3)

  • ignore_case (Boolean) (defaults to: false)

    Ignores case which relaxes suburb check

  • known_suburbs (Array<String>) (defaults to: [])

    Known suburbs to detect in address when there is no postcode and no uppercase suburb

Raises:

  • RuntimeError if insufficient addresses are geocodable



108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/scraper_utils/spec_support.rb', line 108

# Validates enough addresses are geocodable.
#
# @param results [Array<Hash>] the results from scraping an authority
# @param percentage [Integer] min percentage of addresses expected to be geocodable (default: 50)
# @param variation [Integer] variation allowed in addition to percentage (default: 3)
# @param ignore_case [Boolean] ignores case which relaxes the suburb check
# @param known_suburbs [Array<String>] known suburbs to detect in address when
#   there is no postcode and no uppercase suburb
# @return [Integer, nil] count of geocodable unique addresses, or nil when results is empty
# @raise [UnprocessableSite] if insufficient addresses are geocodable
def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3, ignore_case: false, known_suburbs: [])
  return nil if results.empty?

  geocodable = results
               .map { |record| record["address"] }
               .uniq
               .count do |text|
    ok = ScraperUtils::SpecSupport.geocodable? text, known_suburbs: known_suburbs, ignore_case: ignore_case
    if !ok && DebugUtils.verbose?
      # Log message typos fixed: "geocodeable" -> "geocodable", "know" -> "known"
      ScraperUtils::LogUtils.log(
        "Address: #{text.inspect} is not geocodable with #{known_suburbs&.size} known suburbs, ignore_case: #{ignore_case.inspect}"
      )
    end
    ok
  end
  puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
       "(#{(100.0 * geocodable / results.count).round(1)}%)"
  # Minimum count required, but always at least 1
  expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
  unless geocodable >= expected
    raise UnprocessableSite, "Expected at least #{expected} (#{percentage}% - #{variation}) geocodable addresses, got #{geocodable}"
  end
  geocodable
end

.validate_descriptions_are_reasonable!(results, percentage: 50, variation: 3) ⇒ Object

Validates enough descriptions are reasonable

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • percentage (Integer) (defaults to: 50)

    The min percentage of descriptions expected to be reasonable (default:50)

  • variation (Integer) (defaults to: 3)

    The variation allowed in addition to percentage (default:3)

Raises:

  • RuntimeError if insufficient descriptions are reasonable



182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# File 'lib/scraper_utils/spec_support.rb', line 182

# Validates enough descriptions are reasonable.
#
# @param results [Array<Hash>] the results from scraping an authority
# @param percentage [Integer] min percentage of descriptions expected to be reasonable (default: 50)
# @param variation [Integer] variation allowed in addition to percentage (default: 3)
# @return [Integer, nil] count of reasonable unique descriptions, or nil when results is empty
# @raise [UnprocessableSite] if insufficient descriptions are reasonable
def self.validate_descriptions_are_reasonable!(results, percentage: 50, variation: 3)
  return nil if results.empty?

  reasonable_count = results
                     .map { |record| record["description"] }
                     .uniq
                     .count do |text|
    ok = ScraperUtils::SpecSupport.reasonable_description? text
    puts "  description: #{text} is not reasonable" if ENV["DEBUG"] && !ok
    ok
  end
  puts "Found #{reasonable_count} out of #{results.count} unique reasonable descriptions " \
       "(#{(100.0 * reasonable_count / results.count).round(1)}%)"
  # Minimum count required, but always at least 1
  expected = [(percentage.to_f / 100.0) * results.count - variation, 1].max
  unless reasonable_count >= expected
    raise UnprocessableSite, "Expected at least #{expected} (#{percentage}% - #{variation}) reasonable descriptions, got #{reasonable_count}"
  end
  reasonable_count
end

.validate_info_urls_are_present!(results, percentage: 75, variation: 3) {|String| ... } ⇒ Object

Validates that info_urls are present (respond to HEAD request with 200 to 299 status)

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • percentage (Integer) (defaults to: 75)

    The min percentage of detail checks expected to pass (default:75)

  • variation (Integer) (defaults to: 3)

    The variation allowed in addition to percentage (default:3)

Yields:

  • (String)

    Optional block to customize URL fetching (e.g., handle terms agreement)

Raises:

  • RuntimeError if insufficient detail checks pass



248
249
250
251
252
253
254
255
256
# File 'lib/scraper_utils/spec_support.rb', line 248

# Validates that info_urls are present (respond to a HEAD request with a
# 200-299 status), recording HTTP interactions via VCR when it is loaded.
#
# @param results [Array<Hash>] the results from scraping an authority
# @param percentage [Integer] min percentage of checks expected to pass (default: 75)
# @param variation [Integer] variation allowed in addition to percentage (default: 3)
# @yield [String] optional block to customize URL fetching (e.g., handle terms agreement)
# @raise [RuntimeError] if insufficient checks pass
def self.validate_info_urls_are_present!(results, percentage: 75, variation: 3, &block)
  run_checks = -> { check_info_url_is_present(results, percentage, variation, &block) }

  if defined?(VCR)
    VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") { run_checks.call }
  else
    run_checks.call
  end
end

.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: false) {|String| ... } ⇒ Object

Validates that info_urls have expected details (unique URLs with content validation)

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • percentage (Integer) (defaults to: 75)

    The min percentage of detail checks expected to pass (default:75)

  • variation (Integer) (defaults to: 3)

    The variation allowed in addition to percentage (default:3)

  • bot_check_expected (Boolean) (defaults to: false)

    Whether bot protection is acceptable

Yields:

  • (String)

    Optional block to customize URL fetching (e.g., handle terms agreement)

Raises:

  • RuntimeError if insufficient detail checks pass



265
266
267
268
269
270
271
272
273
# File 'lib/scraper_utils/spec_support.rb', line 265

# Validates that info_urls have expected details (unique URLs with content
# validation), recording HTTP interactions via VCR when it is loaded.
#
# @param results [Array<Hash>] the results from scraping an authority
# @param percentage [Integer] min percentage of detail checks expected to pass (default: 75)
# @param variation [Integer] variation allowed in addition to percentage (default: 3)
# @param bot_check_expected [Boolean] whether bot protection is acceptable
# @yield [String] optional block to customize URL fetching (e.g., handle terms agreement)
# @raise [RuntimeError] if insufficient detail checks pass
def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: false, &block)
  run_checks = lambda do
    check_info_url_details(results, percentage, variation, bot_check_expected, &block)
  end

  if defined?(VCR)
    VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") { run_checks.call }
  else
    run_checks.call
  end
end

.validate_page_response(page, bot_check_expected) ⇒ Object

Validate page response, accounting for bot protection

Parameters:

  • page (Mechanize::Page)

    The page response to validate

  • bot_check_expected (Boolean)

    Whether bot protection is acceptable

Raises:

  • RuntimeError if page response is invalid and bot protection not expected



305
306
307
308
309
310
311
312
# File 'lib/scraper_utils/spec_support.rb', line 305

# Validate page response, accounting for bot protection.
#
# @param page [Mechanize::Page] the page response to validate
# @param bot_check_expected [Boolean] whether bot protection is acceptable
# @raise [RuntimeError] if the page response is not 200 and bot protection
#   was not expected/detected
def self.validate_page_response(page, bot_check_expected)
  if bot_check_expected && bot_protection_detected?(page)
    puts "  Bot protection detected - accepting as valid response"
  elsif page.code != "200"
    raise "Expected 200 response from the one expected info_url, got #{page.code}"
  end
end

.validate_unique_references!(records) ⇒ Object

Finds records with duplicate [authority_label, council_reference] keys.

Parameters:

  • records (Array<Hash>)

    All records to check

Raises:

  • (UnprocessableSite) if duplicate [authority_label, council_reference] keys are found


91
92
93
94
95
96
97
98
99
# File 'lib/scraper_utils/spec_support.rb', line 91

# Finds records with duplicate [authority_label, council_reference] keys.
# council_reference comparison is case-insensitive.
#
# @param records [Array<Hash>] all records to check
# @return [nil] when all keys are unique
# @raise [UnprocessableSite] when duplicate keys are found
def self.validate_unique_references!(records)
  groups = records.group_by do |r|
    [r["authority_label"], r["council_reference"]&.downcase]
  end
  duplicates = groups.select { |_k, g| g.size > 1 }
  return if duplicates.empty?

  # Message fixed: the duplicated keys are [authority_label, council_reference]
  # pairs, not just authority labels
  raise UnprocessableSite,
        "Duplicate [authority_label, council_reference] keys: #{duplicates.keys.map(&:inspect).join(', ')}"
end

.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false) {|String| ... } ⇒ Object

Validates that all records use the expected global info_url and it returns 200

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • expected_url (String)

    The expected global info_url for this authority

  • bot_check_expected (Boolean) (defaults to: false)

    Whether bot protection is acceptable

Yields:

  • (String)

    Optional block to customize URL fetching (e.g., handle terms agreement)

Raises:

  • RuntimeError if records don’t use the expected URL or it doesn’t return 200



219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# File 'lib/scraper_utils/spec_support.rb', line 219

# Validates that all records use the expected global info_url and that it
# returns 200 (recording via VCR when it is loaded).
#
# @param results [Array<Hash>] the results from scraping an authority
# @param expected_url [String] the expected global info_url for this authority
# @param bot_check_expected [Boolean] whether bot protection is acceptable
# @yield [String] optional block to customize URL fetching (e.g., handle terms agreement)
# @raise [RuntimeError] if records don't use the expected URL or it doesn't return 200
def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false, &block)
  actual_urls = results.map { |record| record["info_url"] }.uniq

  unless actual_urls.size == 1
    raise "Expected all records to use one info_url '#{expected_url}', found: #{actual_urls.size}"
  end
  unless actual_urls.first == expected_url
    raise "Expected all records to use global info_url '#{expected_url}', found: #{actual_urls.first}"
  end

  puts "Checking the one expected info_url returns 200: #{expected_url}"

  fetch_and_validate = lambda do
    page = block ? block.call(expected_url) : fetch_url_with_redirects(expected_url)
    validate_page_response(page, bot_check_expected)
  end

  if defined?(VCR)
    VCR.use_cassette("#{authority_label(results, suffix: '_')}info_url") { fetch_and_validate.call }
  else
    fetch_and_validate.call
  end
end