Class: Wmap::UrlCrawler::AdwareTag
- Inherits:
-
Wmap::UrlCrawler
- Object
- Wmap::UrlCrawler
- Wmap::UrlCrawler::AdwareTag
- Defined in:
- lib/wmap/url_crawler/adware_tag.rb
Constant Summary
Constants inherited from Wmap::UrlCrawler
Crawl_timeout, Max_http_timeout
Constants included from Wmap::Utils::UrlMagic
Wmap::Utils::UrlMagic::Max_http_timeout, Wmap::Utils::UrlMagic::User_agent
Constants included from Wmap::Utils::DomainRoot
Wmap::Utils::DomainRoot::File_ccsld, Wmap::Utils::DomainRoot::File_cctld, Wmap::Utils::DomainRoot::File_gtld, Wmap::Utils::DomainRoot::File_tld
Instance Attribute Summary
Attributes inherited from Wmap::UrlCrawler
#crawl_depth, #crawl_done, #crawl_page_limit, #crawl_start, #data_dir, #discovered_urls_by_crawler, #http_timeout, #max_parallel, #signature_file, #tag_file, #tag_signatures, #tag_store, #user_agent, #verbose, #visited_urls_by_crawler
Instance Method Summary collapse
-
#check_adware(site, use_cache = true) ⇒ Object
Given a site, locate the landing page, then sift out the adware tag if found.
-
#fast_landing(site) ⇒ Object
Given a site, determine the landing url.
-
#find_tags(url) ⇒ Object
Search the page for known tag signatures.
-
#get_desc(url, tag) ⇒ Object
Search the url payload for known tag.
-
#get_ver(url, tag) ⇒ Object
Search the url payload for known tag version identifier.
-
#initialize(params = {}) ⇒ AdwareTag
constructor
Initialize the instance variables.
-
#load_sig_from_file(file, lc = true) ⇒ Object
load the known tag signatures into an instance variable.
-
#load_tag_from_file(file, lc = false) ⇒ Object
load the known tag store cache into an instance variable.
-
#refresh(num = @max_parallel, use_cache = true) ⇒ Object
Refresh adware tag store signatures.
-
#save_to_file!(file_tag = @tag_file, tags = @tag_store) ⇒ Object
(also: #save!)
Save the current tag store hash table into a file.
Methods inherited from Wmap::UrlCrawler
#crawl, #crawl_worker, #crawl_workers, #crawl_workers_on_file, #get_discovered_sites_by_crawler, #pre_crawl, #print_discovered_urls_by_crawler, #save_discovered_urls
Methods included from Wmap::Utils
#cidr_2_ips, #file_2_hash, #file_2_list, #get_nameserver, #get_nameservers, #host_2_ip, #host_2_ips, #is_cidr?, #is_fqdn?, #is_ip?, #list_2_file, #reverse_dns_lookup, #sort_ips, #valid_dns_record?, #zone_transferable?
Methods included from Wmap::Utils::Logger
Methods included from Wmap::Utils::UrlMagic
#create_absolute_url_from_base, #create_absolute_url_from_context, #host_2_url, #is_site?, #is_ssl?, #is_url?, #landing_location, #make_absolute, #normalize_url, #open_page, #redirect_location, #response_code, #response_headers, #url_2_host, #url_2_path, #url_2_port, #url_2_site, #urls_on_same_domain?
Methods included from Wmap::Utils::DomainRoot
#get_domain_root, #get_domain_root_by_ccsld, #get_domain_root_by_cctld, #get_domain_root_by_tlds, #get_sub_domain, #is_domain_root?, #print_ccsld, #print_cctld, #print_gtld
Constructor Details
#initialize(params = {}) ⇒ AdwareTag
Initialize the instance variables
22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/wmap/url_crawler/adware_tag.rb', line 22

# Initialize the instance variables, load the tag signatures and the
# previously saved tag store, and start with an empty landing-page cache.
#
# @param params [Hash] optional settings:
#   :verbose        - print progress messages when true (default false)
#   :data_dir       - directory holding the tag store file
#   :signature_file - path to the tag signature file
#   :tag_file       - path to the tag store file (default: data_dir + 'tag_sites')
def initialize(params = {})
  @verbose = params.fetch(:verbose, false)
  @data_dir = params.fetch(:data_dir, File.dirname(__FILE__) + '/../../../data/')
  Dir.mkdir(@data_dir) unless Dir.exist?(@data_dir)
  # Set default instance variables
  # NOTE(review): @signature_file always keeps the default path, even when
  # params[:signature_file] overrides the file that is actually loaded.
  @signature_file = File.dirname(__FILE__) + '/../../../settings/' + 'tag_signatures'
  file = params.fetch(:signature_file, @signature_file)
  @tag_signatures = load_sig_from_file(file)
  @tag_file = params.fetch(:tag_file, @data_dir + 'tag_sites')
  # Create an empty tag store file when missing; the block form closes the
  # handle immediately instead of leaving it open until garbage collection.
  File.open(@tag_file, "w") {} unless File.exist?(@tag_file)
  # load the known tag store
  load_tag_from_file(@tag_file)
  @landings = {} # cache landing page to reduce redundant browsing
end
Instance Method Details
#check_adware(site, use_cache = true) ⇒ Object
Given a site, locate the landing page, then sift out the adware tag if found
135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
# File 'lib/wmap/url_crawler/adware_tag.rb', line 135

# Given a site, locate the landing page, then sift out the adware tags if found.
#
# @param site [String] the site to check
# @param use_cache [Boolean] when true, a site already present in @tag_store
#   is skipped and an empty Hash is returned
# @return [Hash, nil] { site => [landing_url, tags, versions, descriptions] }
#   where tags/versions/descriptions are "|"-joined strings (descriptions are
#   URL-safe Base64 encoded); an empty Hash when the site is skipped or no tag
#   is found; nil when an exception was rescued
def check_adware(site, use_cache = true)
  puts "Check the site for known Adware tags: #{site}" if @verbose
  record = {}
  if use_cache && @tag_store.key?(site)
    puts "Site entry already exist. Skipping: #{site}" if @verbose
    return record
  end
  url = fast_landing(site)
  # Several sites can share one landing page; reuse the earlier result.
  if @landings.key?(url)
    record[site] = @landings[url]
    return record
  end
  tags = find_tags(url)
  if tags.empty?
    puts "No tag found. Skip site #{site}" if @verbose
    return record
  end
  tag_vers = tags.map { |tag| get_ver(url, tag) }
  tag_descs = tags.map { |tag| Base64.urlsafe_encode64(get_desc(url, tag)) }
  record[site] = [url, tags.join("|"), tag_vers.join("|"), tag_descs.join("|")]
  @landings[url] = [url, tags.join("|"), tag_vers.join("|"), tag_descs.join("|")]
  @tag_store.merge!(record)
  puts "Tag entry loaded: #{record}" if @verbose
  record
rescue => ee
  puts "Exception on method #{__method__}: #{ee}: #{site}" if @verbose
  nil
end
#fast_landing(site) ⇒ Object
Given a site, determine the landing url
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
# File 'lib/wmap/url_crawler/adware_tag.rb', line 169

# Given a site, determine the landing url.
#
# The Wmap::SiteTracker known-sites table is consulted first; only when the
# site is absent there is Wmap::UrlChecker#landing_location called.
#
# @param site [String] the site to resolve
# @return [String, nil] the landing url, or nil when an exception was rescued
def fast_landing(site)
  puts "Locate the landing url for: #{site}" if @verbose
  known_sites = Wmap::SiteTracker.instance.known_sites
  if known_sites.key?(site) # looking into the cache first
    entry = known_sites[site]
    # to_i tolerates a missing or string-typed status code instead of raising
    code = entry['code'].to_i
    redirection = entry['redirection']
    # Follow the stored redirection only for a 3xx entry that actually has
    # one; otherwise the site itself is the landing url.
    redirected = code >= 300 && code < 400 && !redirection.to_s.empty?
    url = redirected ? redirection : site
  else # no cache, then need to do it fresh
    url = Wmap::UrlChecker.new.landing_location(site)
  end
  puts "Landing url found: #{url}" if @verbose
  url
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
#find_tags(url) ⇒ Object
Search the page for known tag signatures. If found return them in an array
193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
# File 'lib/wmap/url_crawler/adware_tag.rb', line 193

# Search the page for known tag signatures. If found return them in an array.
#
# The page text is lower-cased before matching, so the comparison is
# case-insensitive as long as the keys of @tag_signatures are lower case.
# Each matching signature is reported once, in @tag_signatures key order.
#
# @param url [String] the page to fetch through open_page
# @return [Array<String>] matching signature keys; empty on no match or error
def find_tags(url)
  puts "Search and return tags within the url payload: #{url}" if @verbose
  page_text = open_page(url).text.downcase
  # select over the keys yields each signature at most once, even when it
  # occurs on many lines of the page
  @tag_signatures.keys.select { |tag| page_text.include?(tag) }
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  []
end
#get_desc(url, tag) ⇒ Object
Search the url payload for a known tag. If found, return the whole script snippet containing it (the caller, #check_adware, Base64-encodes the result).
275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 |
# File 'lib/wmap/url_crawler/adware_tag.rb', line 275

# Search the url payload for a known tag and return the text of the first
# <script> element that mentions it.
#
# Matching is case-insensitive, consistent with find_tags, which lower-cases
# the page before looking for signatures. Script bodies of 65535 characters
# or more are ignored.
#
# @param url [String] the page to fetch through open_page
# @param tag [String] the tag signature to look for
# @return [String] the raw script text, or "" when not found or on error
def get_desc(url, tag)
  puts "Search and return tag script in url payload: #{url}, #{tag}" if @verbose
  needle = tag.downcase
  snippet = open_page(url).search('script').map(&:text).find do |text|
    text.length < 65535 && text.downcase.include?(needle)
  end
  snippet || ""
rescue => ee
  puts "Exception on method #{__method__}: #{ee}: #{url}: #{tag}" if @verbose
  ""
end
#get_ver(url, tag) ⇒ Object
Search the url payload for known tag version identifier. If found return a string, else empty string.
210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 |
# File 'lib/wmap/url_crawler/adware_tag.rb', line 210

# Search the url payload for a known tag version identifier.
#
# The page is scanned line by line (lower-cased). A line that contains the
# trigger keywords but does not match the extraction pattern is skipped and
# the scan continues, so an unrelated earlier line cannot hide the real one.
#
# @param url [String] the page to fetch through open_page
# @param tag [String] one of "utag.js", "analytics.js", "ga.js", "all.js";
#   any other value prints a notice and yields an empty version
# @return [String] the upper-cased identifier, or "" when none is found or an
#   exception was rescued
def get_ver(url, tag)
  puts "Search and return tag version within the url payload: #{url}, #{tag}" if @verbose
  tag_ver = ""
  doc = open_page(url)
  case tag
  when "utag.js" # sample: ...,"code_release_version":"cb20190312032612",...
    doc.text.each_line do |line|
      my_line = line.downcase
      next unless my_line.include?("code_release_version")
      puts "Extract tag version from line: #{my_line}" if @verbose
      m = my_line.match(/"code_release_version":"(?<ver>[a-z]+\d+)"/)
      next unless m
      tag_ver = m[:ver]
      break
    end
  when "analytics.js"
    # sample #1: ga('create', 'UA-19175804-2', 'knopfdoubleday.com');
    # sample #2: __gaTracker('create', 'UA-121313929-1', 'auto');
    doc.text.each_line do |line|
      my_line = line.downcase
      next unless my_line.include?("ga") && my_line.include?("create")
      puts "Extract tag version from line: #{my_line}" if @verbose
      m = my_line.match(/['"]create['"]\s*,\s*['"](?<ver>\w+-\d+-\d+)['"]\s*,/)
      next unless m
      tag_ver = m[:ver]
      break
    end
  when "ga.js"
    doc.text.each_line do |line|
      my_line = line.downcase
      puts my_line if @verbose
      m = nil
      if my_line.include?("push") && my_line.include?("_setaccount")
        # sample #1: _gaq.push(['_setAccount', 'UA-13205363-65']);
        m = my_line.match(/['"]_setaccount['"]\s*,\s*['"](?<ver>\w+-\d+-\d+)['"]/)
      end
      if m.nil? && my_line.include?("_gettracker")
        # sample #2: var pageTracker = _gat._getTracker("UA-12487327-1");
        puts "Extract tag version from line: #{my_line}" if @verbose
        m = my_line.match(/_gettracker\s*\(\s*['"](?<ver>\w+-\d+-\d+)['"]/)
      end
      next unless m
      tag_ver = m[:ver]
      break
    end
  when "all.js" # sample: appId : '749936668352954',
    doc.text.each_line do |line|
      my_line = line.downcase
      next unless my_line.include?("appid") && my_line.include?(":")
      puts "Extract tag version from line: #{my_line}" if @verbose
      # whitespace around the colon is optional ("appId: '1'," and "appId : '1',")
      m = my_line.match(/appid\s*:\s*['"](?<ver>\d+)['"]\s*,/)
      next unless m
      tag_ver = m[:ver]
      break
    end
  else
    puts "Don't know how to locate Adware Tag version: #{tag}"
    # do nothing
  end
  tag_ver.upcase
rescue => ee
  puts "Exception on method #{__method__}: #{ee}: #{url} : #{tag}" if @verbose
  tag_ver.to_s
end
#load_sig_from_file(file, lc = true) ⇒ Object
load the known tag signatures into an instance variable
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/wmap/url_crawler/adware_tag.rb', line 38

# Load the known tag signatures from a comma-separated file.
#
# Each data line is "signature,description". Blank lines and lines starting
# with '#' are ignored. Lines without both fields, or with an empty
# signature, are skipped rather than aborting the whole load. The first
# occurrence of a signature wins.
#
# @param file [String] path of the signature file
# @param lc [Boolean] when true, lower-case each line before parsing
# @return [Hash{String=>String}, nil] signature => description, or nil when
#   the file cannot be read
def load_sig_from_file(file, lc = true)
  puts "Loading data file: #{file}" if @verbose
  data_store = {}
  # File.foreach closes the file even if the block raises
  File.foreach(file) do |line|
    puts "Processing line: #{line}" if @verbose
    line = line.strip
    next if line.empty? || line.start_with?('#')
    line = line.downcase if lc == true
    key, value = line.split(',')
    next if key.nil? || value.nil? # malformed line: no comma or no signature
    key = key.strip
    # an empty signature would match every page, so drop it
    next if key.empty? || data_store.key?(key)
    data_store[key] = value.strip
  end
  data_store
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
#load_tag_from_file(file, lc = false) ⇒ Object
load the known tag store cache into an instance variable
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
# File 'lib/wmap/url_crawler/adware_tag.rb', line 64

# Load the known tag store cache from a comma-separated file into @tag_store.
#
# Each data line is "site, landing url, tags, versions, descriptions". Blank
# lines and lines starting with '#' are ignored. Lines with fewer than three
# fields are skipped. All four value fields are stripped, so the ", "
# separator written by save_to_file! does not pile up as leading spaces over
# repeated save/load cycles. Missing version/description fields load as "".
#
# @param file [String] path of the tag store file
# @param lc [Boolean] when true, lower-case each line before parsing
# @return [Hash{String=>Array<String>}, nil] the loaded @tag_store, or nil
#   when the file cannot be read
def load_tag_from_file(file, lc = false)
  puts "Loading tag data file: #{file}" if @verbose
  @tag_store = {}
  # File.foreach closes the file even if the block raises
  File.foreach(file) do |line|
    puts "Processing line: #{line}" if @verbose
    line = line.strip
    next if line.empty? || line.start_with?('#')
    line = line.downcase if lc == true
    entry = line.split(',')
    # need at least site, landing url and tags; first occurrence of a site wins
    next if entry.size < 3 || @tag_store.key?(entry[0])
    @tag_store[entry[0]] = [entry[1].strip, entry[2].strip, entry[3].to_s.strip, entry[4].to_s.strip]
  end
  @tag_store
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
#refresh(num = @max_parallel, use_cache = true) ⇒ Object
Refresh adware tag store signatures
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
# File 'lib/wmap/url_crawler/adware_tag.rb', line 107

# Refresh adware tag store signatures by re-running check_adware on every
# site currently held in @tag_store.
#
# NOTE(review): with the default use_cache=true every site is already a key
# of @tag_store, so check_adware skips all of them and nothing is refreshed;
# pass use_cache=false to force a real re-check. Confirm whether the default
# is intended.
#
# @param num [Integer] passed to Parallel.map as :in_processes
# @param use_cache [Boolean] forwarded to check_adware
# @return [Hash, nil] the merged records returned by the workers (empty when
#   the store has no entry); nil when an exception was rescued
def refresh(num = @max_parallel, use_cache = true)
  puts "Add entries to the local cache table from site tracker: " if @verbose
  results = {}
  sites = @tag_store.keys
  if sites.empty?
    puts "Error: no entry is loaded. Please check your list and try again."
    return results
  end
  Parallel.map(sites, :in_processes => num) { |target| check_adware(target, use_cache) }.each do |record|
    # check_adware yields nil when it rescued an exception; skip those
    results.merge!(record) if record
  end
  @tag_store.merge!(results)
  puts "Done loading adware entries."
  results
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
  nil
end
#save_to_file!(file_tag = @tag_file, tags = @tag_store) ⇒ Object Also known as: save!
Save the current tag store hash table into a file
90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
# File 'lib/wmap/url_crawler/adware_tag.rb', line 90

# Save the current tag store hash table into a file.
#
# Writes two '#' header lines followed by one
# "site, landing url, tags, versions, descriptions" line per entry.
# Exceptions are rescued and reported only in verbose mode.
#
# @param file_tag [String] destination path (default: @tag_file)
# @param tags [Hash{String=>Array}] the table to write (default: @tag_store)
# @return [nil]
def save_to_file!(file_tag = @tag_file, tags = @tag_store)
  puts "Saving the current adware tag table from memory to file: #{file_tag} ..." if @verbose
  timestamp = Time.now
  # block form guarantees the handle is closed even when a write fails
  File.open(file_tag, 'w') do |f|
    f.write "# Local tag file created by class #{self.class} method #{__method__} at: #{timestamp}\n"
    f.write "# Site, Landing URL, Detected Adware Tag, Tag Version, Tag Description\n"
    tags.each do |key, val|
      f.write "#{key}, #{val[0]}, #{val[1]}, #{val[2]}, #{val[3]}\n"
    end
  end
  puts "Tag store cache table is successfully saved: #{file_tag}"
rescue => ee
  puts "Exception on method #{__method__}: #{ee}" if @verbose
end