Module: ScraperUtils::SpecSupport

Defined in:
lib/scraper_utils/spec_support.rb

Overview

Methods to support specs

Constant Summary collapse

# Australian state/territory abbreviations used to detect a state token in an address.
AUSTRALIAN_STATES =
%w[ACT NSW NT QLD SA TAS VIC WA].freeze
# Case-insensitive patterns for common Australian street-type words and their
# abbreviations (e.g. "St"/"Street", "Rd"/"Road"); an address must match one
# of these (or a unit/lot marker) to be considered geocodable.
STREET_TYPE_PATTERNS =
[
  /\bAv(e(nue)?)?\b/i,
  /\bB(oulevard|lvd)\b/i,
  /\b(Circuit|Cct)\b/i,
  /\bCl(ose)?\b/i,
  /\bC(our|r)?t\b/i,
  /\bCircle\b/i,
  /\bChase\b/i,
  /\bCr(es(cent)?)?\b/i,
  /\bDr((ive)?|v)\b/i,
  /\bEnt(rance)?\b/i,
  /\bGr(ove)?\b/i,
  /\bH(ighwa|w)y\b/i,
  /\bLane\b/i,
  /\bLoop\b/i,
  /\bParkway\b/i,
  /\bPl(ace)?\b/i,
  /\bPriv(ate)?\b/i,
  /\bParade\b/i,
  /\bR(oa)?d\b/i,
  /\bRise\b/i,
  /\bSt(reet)?\b/i,
  /\bSquare\b/i,
  /\bTerrace\b/i,
  /\bWay\b/i
].freeze
# Any standalone four-digit group is treated as a possible Australian postcode.
AUSTRALIAN_POSTCODES =
/\b\d{4}\b/.freeze
# Patterns flagging placeholder descriptions that carry no real content
# (e.g. "TBC", "N/A", "no description").
PLACEHOLDERS =
[
  /no description/i,
  /not available/i,
  /to be confirmed/i,
  /\btbc\b/i,
  %r{\bn/a\b}i
].freeze

Class Method Summary collapse

Class Method Details

.authority_label(results, prefix: '', suffix: '') ⇒ Object



45
46
47
48
49
50
51
52
53
# File 'lib/scraper_utils/spec_support.rb', line 45

# Returns the single authority_label shared by all records, wrapped in the
# given prefix/suffix. Returns nil when results is nil or no labels present.
# @raise [RuntimeError] when more than one distinct label is found
def self.authority_label(results, prefix: '', suffix: '')
  return nil unless results

  labels = results.map { |record| record['authority_label'] }
  labels = labels.compact.uniq
  case labels.size
  when 0
    nil
  when 1
    "#{prefix}#{labels.first}#{suffix}"
  else
    raise "Expected one authority_label, not #{labels.inspect}"
  end
end

.bot_protection_detected?(page) ⇒ Boolean

Check if the page response indicates bot protection

Parameters:

  • page (Mechanize::Page)

    The page response to check

Returns:

  • (Boolean)

    True if bot protection is detected



193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# File 'lib/scraper_utils/spec_support.rb', line 193

# Check whether a page response looks like bot protection.
# @param page [Mechanize::Page] response with #code (String) and #body
# @return [Boolean] true when the status or body suggests bot blocking
def self.bot_protection_detected?(page)
  # 403/429 are the usual bot-protection status codes
  return true if ['403', '429'].include?(page.code)
  return false unless page.body

  content = page.body.downcase
  indicators = ['recaptcha', 'cloudflare', 'are you human', 'bot detection',
                'security check', 'verify you are human', 'access denied',
                'blocked', 'captcha']
  !indicators.find { |needle| content.include?(needle) }.nil?
end

.fetch_url_with_redirects(url) ⇒ Object



39
40
41
42
43
# File 'lib/scraper_utils/spec_support.rb', line 39

# Fetch the given URL with a fresh Mechanize agent (Mechanize follows
# redirects by default).
# FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
def self.fetch_url_with_redirects(url)
  browser = Mechanize.new
  browser.get(url)
end

.geocodable?(address, ignore_case: false) ⇒ Boolean

Check if an address is likely to be geocodable by analyzing its format. This is a bit stricter than needed - typically assert >= 75% match

Parameters:

  • address (String)

    The address to check

Returns:

  • (Boolean)

    True if the address appears to be geocodable.



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/scraper_utils/spec_support.rb', line 78

# Check if an address is likely to be geocodable by analyzing its format.
# This is a bit stricter than needed - typically assert >= 75% match.
# Requires (1) a street type or unit/lot marker, (2) a postcode or an
# uppercase suburb-like word, and (3) an Australian state abbreviation.
# @param address [String] The address to check
# @return [Boolean] True if the address appears to be geocodable.
def self.geocodable?(address, ignore_case: false)
  return false if address.nil? || address.empty?

  candidate = ignore_case ? address.upcase : address

  state_found = AUSTRALIAN_STATES.any? do |state|
    candidate.end_with?(" #{state}") || candidate.include?(" #{state} ")
  end
  postcode_found = AUSTRALIAN_POSTCODES.match?(address)
  street_type_found = STREET_TYPE_PATTERNS.any? { |pattern| pattern.match?(candidate) }
  unit_or_lot_found = /\b(Unit|Lot:?)\s+\d+/i.match?(address)
  # NOTE(review): scans the original address (not the upcased copy), so even
  # with ignore_case: true this requires genuinely uppercase words — presumably
  # intentional, verify against callers.
  capitalised_words = address.scan(/\b[A-Z]{2,}\b/)
  suburb_found = capitalised_words.any? { |word| !AUSTRALIAN_STATES.include?(word) }

  if ENV["DEBUG"]
    missing = []
    missing << "street type / unit / lot" unless street_type_found || unit_or_lot_found
    missing << "postcode/Uppercase suburb" unless postcode_found || suburb_found
    missing << "state" unless state_found
    puts "  address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
  end

  (street_type_found || unit_or_lot_found) && (postcode_found || suburb_found) && state_found
end

.placeholder?(text) ⇒ Boolean

Returns:

  • (Boolean)


115
116
117
# File 'lib/scraper_utils/spec_support.rb', line 115

# True when the text matches any known placeholder pattern (e.g. "TBC", "n/a").
# @return [Boolean]
def self.placeholder?(text)
  value = text.to_s
  PLACEHOLDERS.any? { |pattern| pattern.match?(value) }
end

.reasonable_description?(text) ⇒ Boolean

Check if this looks like a “reasonable” description. This is a bit stricter than needed - typically assert >= 75% match

Returns:

  • (Boolean)


144
145
146
# File 'lib/scraper_utils/spec_support.rb', line 144

# Check if this looks like a "reasonable" description: not a placeholder and
# at least three words. This is a bit stricter than needed - typically
# assert >= 75% match.
# @return [Boolean]
def self.reasonable_description?(text)
  return false if placeholder?(text)

  text.to_s.split.size >= 3
end

.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3) ⇒ Object

Validates enough addresses are geocodable

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • percentage (Integer) (defaults to: 50)

    The min percentage of addresses expected to be geocodable (default:50)

  • variation (Integer) (defaults to: 3)

    The variation allowed in addition to percentage (default:3)

Raises:

  • RuntimeError if insufficient addresses are geocodable



60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/scraper_utils/spec_support.rb', line 60

# Validates enough addresses are geocodable.
# @param results [Array<Hash>] The results from scraping an authority
# @param percentage [Integer] min % of addresses expected geocodable (default: 50)
# @param variation [Integer] extra allowance on top of percentage (default: 3)
# @raise [RuntimeError] if insufficient addresses are geocodable
def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3)
  return nil if results.empty?

  unique_addresses = results.map { |record| record["address"] }.uniq
  geocodable = unique_addresses.count { |text| ScraperUtils::SpecSupport.geocodable? text }
  # NOTE: percentage is reported/thresholded against total results, while the
  # count is over unique addresses only.
  puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
         "(#{(100.0 * geocodable / results.count).round(1)}%)"
  threshold = [((percentage.to_f / 100.0) * results.count - variation), 1].max
  unless geocodable >= threshold
    raise "Expected at least #{threshold} (#{percentage}% - #{variation}) geocodable addresses, got #{geocodable}"
  end
  geocodable
end

.validate_descriptions_are_reasonable!(results, percentage: 50, variation: 3) ⇒ Object

Validates enough descriptions are reasonable

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • percentage (Integer) (defaults to: 50)

    The min percentage of descriptions expected to be reasonable (default:50)

  • variation (Integer) (defaults to: 3)

    The variation allowed in addition to percentage (default:3)

Raises:

  • RuntimeError if insufficient descriptions are reasonable



124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/scraper_utils/spec_support.rb', line 124

# Validates enough descriptions are reasonable.
# @param results [Array<Hash>] The results from scraping an authority
# @param percentage [Integer] min % of descriptions expected reasonable (default: 50)
# @param variation [Integer] extra allowance on top of percentage (default: 3)
# @raise [RuntimeError] if insufficient descriptions are reasonable
def self.validate_descriptions_are_reasonable!(results, percentage: 50, variation: 3)
  return nil if results.empty?

  reasonable = results.map { |record| record["description"] }.uniq.count do |text|
    ok = ScraperUtils::SpecSupport.reasonable_description? text
    puts "  description: #{text} is not reasonable" if ENV["DEBUG"] && !ok
    ok
  end
  # Percentage is reported/thresholded against total results, count is over
  # unique descriptions only.
  puts "Found #{reasonable} out of #{results.count} unique reasonable descriptions " \
         "(#{(100.0 * reasonable / results.count).round(1)}%)"
  threshold = [(percentage.to_f / 100.0) * results.count - variation, 1].max
  unless reasonable >= threshold
    raise "Expected at least #{threshold} (#{percentage}% - #{variation}) reasonable descriptions, got #{reasonable}"
  end
  reasonable
end

.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: false) ⇒ Object

Validates that info_urls have expected details (unique URLs with content validation)

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • percentage (Integer) (defaults to: 75)

    The min percentage of detail checks expected to pass (default:75)

  • variation (Integer) (defaults to: 3)

    The variation allowed in addition to percentage (default:3)

Raises:

  • RuntimeError if insufficient detail checks pass



180
181
182
183
184
185
186
187
188
# File 'lib/scraper_utils/spec_support.rb', line 180

# Validates that info_urls have expected details (unique URLs with content
# validation). Wraps the check in a VCR cassette when VCR is loaded.
# @param results [Array<Hash>] The results from scraping an authority
# @param percentage [Integer] min % of detail checks expected to pass (default: 75)
# @param variation [Integer] extra allowance on top of percentage (default: 3)
# @raise [RuntimeError] if insufficient detail checks pass
def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: false)
  unless defined?(VCR)
    return check_info_url_details(results, percentage, variation, bot_check_expected)
  end

  VCR.use_cassette("#{authority_label(results, suffix: '_')}info_url_details") do
    check_info_url_details(results, percentage, variation, bot_check_expected)
  end
end

.validate_page_response(page, bot_check_expected) ⇒ Object

Validate page response, accounting for bot protection

Parameters:

  • page (Mechanize::Page)

    The page response to validate

  • bot_check_expected (Boolean)

    Whether bot protection is acceptable

Raises:

  • RuntimeError if page response is invalid and bot protection not expected



220
221
222
223
224
225
226
227
# File 'lib/scraper_utils/spec_support.rb', line 220

# Validate page response, accounting for bot protection.
# @param page [Mechanize::Page] The page response to validate
# @param bot_check_expected [Boolean] Whether bot protection is acceptable
# @raise [RuntimeError] if the response is not 200 and bot protection was
#   not expected (or not detected)
def self.validate_page_response(page, bot_check_expected)
  if bot_check_expected && bot_protection_detected?(page)
    puts "  Bot protection detected - accepting as valid response"
    return
  end
  return if page.code == "200"

  raise "Expected 200 response from the one expected info_url, got #{page.code}"
end

.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false) ⇒ Object

Validates that all records use the expected global info_url and it returns 200

Parameters:

  • results (Array<Hash>)

    The results from scraping an authority

  • expected_url (String)

    The expected global info_url for this authority

Raises:

  • RuntimeError if records don’t use the expected URL or it doesn’t return 200



152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# File 'lib/scraper_utils/spec_support.rb', line 152

# Validates that all records use the expected global info_url and it
# returns 200 (wrapped in a VCR cassette when VCR is loaded).
# @param results [Array<Hash>] The results from scraping an authority
# @param expected_url [String] The expected global info_url for this authority
# @raise [RuntimeError] if records don't use the expected URL or it doesn't return 200
def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false)
  info_urls = results.map { |record| record["info_url"] }.uniq

  if info_urls.size != 1
    raise "Expected all records to use one info_url '#{expected_url}', found: #{info_urls.size}"
  end
  if info_urls.first != expected_url
    raise "Expected all records to use global info_url '#{expected_url}', found: #{info_urls.first}"
  end

  puts "Checking the one expected info_url returns 200: #{expected_url}"

  check = lambda do
    page = fetch_url_with_redirects(expected_url)
    validate_page_response(page, bot_check_expected)
  end
  if defined?(VCR)
    VCR.use_cassette("#{authority_label(results, suffix: '_')}one_info_url", &check)
  else
    check.call
  end
end