Class: Wmap::SiteTracker

Inherits:
Object
  • Object
show all
Includes:
Singleton, Utils
Defined in:
lib/wmap/site_tracker.rb,
lib/wmap/site_tracker/wp_tracker.rb,
lib/wmap/site_tracker/deactivated_site.rb

Overview

Main class to automatically track the site inventory

Direct Known Subclasses

DeactivatedSite, WpTracker

Defined Under Namespace

Classes: DeactivatedSite, WpTracker

Constant Summary

Constants included from Utils::UrlMagic

Utils::UrlMagic::Max_http_timeout, Utils::UrlMagic::User_agent

Constants included from Utils::DomainRoot

Utils::DomainRoot::File_ccsld, Utils::DomainRoot::File_cctld, Utils::DomainRoot::File_gtld, Utils::DomainRoot::File_tld

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Utils

#cidr_2_ips, #file_2_hash, #file_2_list, #get_nameserver, #get_nameservers, #host_2_ip, #host_2_ips, #is_cidr?, #is_fqdn?, #is_ip?, #list_2_file, #reverse_dns_lookup, #sort_ips, #valid_dns_record?, #zone_transferable?

Methods included from Utils::Logger

#wlog

Methods included from Utils::UrlMagic

#create_absolute_url_from_base, #create_absolute_url_from_context, #host_2_url, #is_site?, #is_ssl?, #is_url?, #landing_location, #make_absolute, #normalize_url, #open_page, #redirect_location, #response_code, #response_headers, #url_2_host, #url_2_path, #url_2_port, #url_2_site, #urls_on_same_domain?

Methods included from Utils::DomainRoot

#get_domain_root, #get_domain_root_by_ccsld, #get_domain_root_by_cctld, #get_domain_root_by_tlds, #get_sub_domain, #is_domain_root?, #print_ccsld, #print_cctld, #print_gtld

Constructor Details

#initialize(params = {}) ⇒ SiteTracker

Set default instance variables



21
22
23
24
25
26
27
28
29
30
31
# File 'lib/wmap/site_tracker.rb', line 21

# Set the default instance variables and load the site store from disk.
#
# @param params [Hash] optional settings:
#   :data_dir     - directory holding the data files (default: ../../data/ relative to this file)
#   :sites_file   - path of the site store file (default: "sites" inside :data_dir)
#   :verbose      - print progress messages when true (default: false)
#   :max_parallel - default process count used by the bulk operations (default: 30)
def initialize(params = {})
	@data_dir=params.fetch(:data_dir, File.dirname(__FILE__)+'/../../data/')
	Dir.mkdir(@data_dir) unless Dir.exist?(@data_dir)
	# File.join yields the same path whether or not :data_dir ends with a slash
	@sites_file=params.fetch(:sites_file, File.join(@data_dir, 'sites'))
	@verbose=params.fetch(:verbose, false)
	@max_parallel=params.fetch(:max_parallel, 30)
	# Create an empty site store on first use; the block form closes the handle immediately
	File.open(@sites_file, "w") {} unless File.exist?(@sites_file)
	# Hash table to hold the site store
	load_site_stores_from_file(@sites_file)
end

Instance Attribute Details

#data_dirObject

Returns the value of attribute data_dir.



18
19
20
# File 'lib/wmap/site_tracker.rb', line 18

# Reader: returns the value of the @data_dir instance variable.
def data_dir
	return @data_dir
end

#known_sitesObject

Returns the value of attribute known_sites.



18
19
20
# File 'lib/wmap/site_tracker.rb', line 18

# Reader: returns the value of the @known_sites instance variable (the site store table).
def known_sites
	return @known_sites
end

#max_parallelObject

Returns the value of attribute max_parallel.



18
19
20
# File 'lib/wmap/site_tracker.rb', line 18

# Reader: returns the value of the @max_parallel instance variable.
def max_parallel
	return @max_parallel
end

#sites_fileObject

Returns the value of attribute sites_file.



18
19
20
# File 'lib/wmap/site_tracker.rb', line 18

# Reader: returns the value of the @sites_file instance variable.
def sites_file
	return @sites_file
end

#verboseObject

Returns the value of attribute verbose.



18
19
20
# File 'lib/wmap/site_tracker.rb', line 18

# Reader: returns the value of the @verbose instance variable.
def verbose
	return @verbose
end

Instance Method Details

#add(site) ⇒ Object

Setter to add site entry to the cache one at a time



118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# File 'lib/wmap/site_tracker.rb', line 118

# Add a single site to the in-memory site store (@known_sites).
#
# The input is stripped, lower-cased and reduced to the standard site format.
# The entry is stored only when its host is trusted: an IP host is checked
# with Wmap::CidrTracker#ip_trusted?, any other host by looking up its domain
# root with Wmap::DomainTracker#domain_known?. A trusted site is probed with
# Wmap::UrlChecker#check and that result becomes the site entry; the site is
# also removed from the DeactivatedSite list when present there, and the
# local host table (Wmap::HostTracker) is updated for the site's host.
#
# @param site [String] site URL; expected format is http://your_website_name/
# @return [Object, nil] the value returned by UrlChecker#check for the added
#   site; nil when the site is already known, is not trusted, or any
#   StandardError is raised (errors are rescued and reported through puts)
def add(site)
	puts "Add entry to the site store: #{site}"
	# Preliminary sanity check
	site=site.strip.downcase unless site.nil?
	if site_known?(site)
		puts  "Site already exists. Skip it: #{site}"
		return nil
	end
	site=normalize_url(site) if is_url?(site)
	site=url_2_site(site) if is_url?(site)
	puts "Site in standard format: #{site}" if @verbose
	raise "Exception on method #{__method__}: invalid site format of #{site}. Expected format is: http://your_website_name/" unless is_site?(site)
	trusted=false
	host=url_2_host(site)
	ip=host_2_ip(host)
	# Additional logic to refresh deactivated site, 02/12/2014
	deact=Wmap::SiteTracker::DeactivatedSite.instance
	# NOTE(review): #delete builds this path as @data_dir + 'deactivated_sites' (no "/") - confirm both resolve to the same file
	deact.sites_file=@data_dir + "/" + "deactivated_sites"
	File.new(deact.sites_file, "w") unless File.exist?(deact.sites_file)
	deact.load_site_stores_from_file
	# only trust either the domain or IP we know
	if is_ip?(host)
		trusted=Wmap::CidrTracker.new(:data_dir=>@data_dir).ip_trusted?(ip)
	else
		root=get_domain_root(host)
		if root.nil?
			raise "Invalid web site format. Please check your record again."
		else
			domain_tracker=Wmap::DomainTracker.instance
			domain_tracker.domains_file=@data_dir + "/" + "domains"
			File.new(domain_tracker.domains_file, "w") unless File.exist?(domain_tracker.domains_file)
			domain_tracker.load_domains_from_file(domain_tracker.domains_file)
			trusted=domain_tracker.domain_known?(root)
			domain_tracker=nil
		end
	end
	# add record only if trusted
	host_tracker = Wmap::HostTracker.instance
	host_tracker.data_dir= @data_dir
	host_tracker.hosts_file = host_tracker.data_dir + "/" + "hosts"
	host_tracker.load_known_hosts_from_file(host_tracker.hosts_file)
	if trusted
		# Add logic to check site status before adding it
		checker=Wmap::UrlChecker.new(:data_dir=>@data_dir).check(site)
		raise "Site is currently down. Skip #{site}" if checker.nil?
		# NOTE(review): this raise fires for every site whose code is 10000, so the
		# https exemption a few lines below can never take effect - confirm intent
		raise "Site is time-out. Skip #{site}" if checker["code"] == 10000
		# Skip the http site if it's un-responsive; for the https we'll keep it because we're interested in analysing the SSL layer later
		if is_https?(site)
			# do nothing
		else
			raise "Site is currently down. Skip #{site}" if checker['code']==10000
		end
		raise "Exception on add method - Fail to resolve the host-name: Host - #{host}, IP - #{ip}. Skip #{site}" unless is_ip?(ip)
		# Update the local host table when necessary
		if is_ip?(host)
			# Case #1: Trusted site contains IP
			if host_tracker.ip_known?(host)
				# Try local reverse DNS lookup first
				puts "Local hosts table lookup for IP: #{ip}" if @verbose
				host=host_tracker.local_ip_2_host(host)
				puts "Host found from the local hosts table for #{ip}: #{host}" if @verbose
				# swap the dotted-quad in the site URL for the host-name just found
				site.sub!(/\d+\.\d+\.\d+\.\d+/,host)
			else
				# Try reverse DNS lookup over Internet as secondary precaution
				puts "Reverse DNS lookup for IP: #{ip}" if @verbose
				host1=ip_2_host(host)
				puts "host1: #{host1}" if @verbose
				if is_fqdn?(host1)
					if host_tracker.domain_known?(host1)
						# replace IP with host-name only if domain root is known
						puts "Host found from the Internet reverse DNS lookup for #{ip}: #{host1}" if @verbose
						host=host1
						site.sub!(/\d+\.\d+\.\d+\.\d+/,host)
					end
				end
			end
			# Adding site for Case #1
			# the site string may have been rewritten above, so re-check for a duplicate key
			raise "Site already exist! Skip #{site}" if @known_sites.key?(site)
			puts "Adding site: #{site}" if @verbose
			@known_sites[site]=Hash.new
			@known_sites[site]=checker
			# a site being (re)added must no longer be listed as deactivated
			if deact.site_known?(site)
				deact.delete(site)
				deact.save!
			end
			puts "Site entry loaded: #{checker}"
			if is_fqdn?(host)
			# Add logic to update the hosts table for case #1 variance
			# -  case that reverse DNS lookup successful
				puts "Update local hosts table for host: #{host}"
				if host_tracker.host_known?(host)
					old_ip=host_tracker.local_host_2_ip(host)
					if old_ip != ip
						host_tracker.refresh(host)
					else
						puts "Host resolve to the same IP #{ip} - no need to update the local host table." if @verbose
					end
				else
					host_tracker.add(host)
				end
			end
		else
			# Case #2: Trusted site contains valid FQDN
			puts "Ading site: #{site}" if @verbose
			@known_sites[site]=Hash.new
			@known_sites[site]=checker
			# a site being (re)added must no longer be listed as deactivated
			if deact.site_known?(site)
				deact.delete(site)
				deact.save!
			end
			puts "Site entry loaded: #{checker}"
			# Add logic to update the hosts table for case #2
			puts "Update local hosts table for host: #{host}"
			if host_tracker.host_known?(host)
				old_ip=host_tracker.local_host_2_ip(host)
				if old_ip != ip
					host_tracker.refresh(host)
				else
					# Skip - no need to update the local hosts table
				end
			else
				host_tracker.add(host)
			end
		end
		deact=nil
		host_tracker=nil
		return checker
	else
		puts "Problem found: untrusted Internet domain or IP. Skip #{site}"
		deact=nil
		# NOTE(review): the host table is saved only on this untrusted path, not after
		# the host_tracker.add/refresh calls on the trusted path - confirm intent
		host_tracker.save!
		host_tracker=nil
		return nil
	end
rescue => ee
	# Every failure (including the deliberate raises above) ends up here and yields nil
	puts "Exception on method #{__method__}: #{ee}"
	checker=nil
	deact=nil
	host_tracker=nil
	return nil
end

#bulk_add(list, num = @max_parallel) ⇒ Object Also known as: adds

Setter to add site entry to the cache in batch (from a list)



274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
# File 'lib/wmap/site_tracker.rb', line 274

# Add a list of sites to the site store, probing them in parallel.
#
# nil and empty-string entries are dropped from the list first. Each remaining
# entry is passed to #add inside Parallel.map; every non-nil, non-empty result
# is collected under its 'url' key and merged into @known_sites.
#
# @param list [Array<String>] site URLs to add
# @param num [Integer] value handed to Parallel.map as :in_processes
# @return [Hash, nil] the newly added entries keyed by URL; nil when an
#   exception is rescued
def bulk_add(list,num=@max_parallel)
	puts "Add entries to the local site store from list:\n #{list}"
	added={}
	list=list - [nil,""]
	if list.size < 1
		puts "Error: no entry is added. Please check your list and try again."
	else
		puts "Start parallel adding on the sites:\n #{list}"
		outcomes=Parallel.map(list, :in_processes => num) { |target| add(target) }
		outcomes.each do |info|
			# skip sites that #add rejected (nil) or that came back empty
			next if info.nil? || info.empty?
			added[info['url']]=info
		end
		@known_sites.merge!(added)
	end
	puts "Done adding site entries."
	if added.size>0
		puts "New entries added: #{added}"
	else
		puts "No new entry added. "
	end
	return added
rescue => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
end

#bulk_delete(list) ⇒ Object Also known as: dels

Setter to delete site entries from the cache in batch (from a list)



349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
# File 'lib/wmap/site_tracker.rb', line 349

# Remove a list of sites from the site store, one entry at a time.
#
# Each entry is converted with url_2_site and handed to #delete; the non-nil
# values returned by #delete are collected.
#
# @param list [Array<String>] site URLs to remove
# @return [Array, nil] the values returned by #delete for the removed sites;
#   nil when the list is empty or an exception is rescued
def bulk_delete(list)
	puts "Delete entries to the local site store from list:\n #{list}" if @verbose
	unless list.size > 0
		puts "Error: no entry is loaded. Please check your list and try again."
		return nil
	end
	removed=[]
	list.each do |entry|
		gone=delete(url_2_site(entry))
		removed.push(gone) unless gone.nil?
	end
	puts "Done deleting sites from the list:\n #{list}"
	return removed
rescue => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
end

#bulk_refresh(list, num = @max_parallel) ⇒ Object Also known as: refreshs

Refresh sites in the site store in batch (from a list)



415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
# File 'lib/wmap/site_tracker.rb', line 415

# Refresh a list of sites in the site store, probing them in parallel.
#
# Each entry is passed to #refresh inside Parallel.map. Afterwards every site
# in the list is dropped from @known_sites and the non-nil, non-empty refresh
# results are merged back in under their 'url' key.
#
# @param list [Array<String>] site URLs to refresh
# @param num [Integer] value handed to Parallel.map as :in_processes
# @return [Hash, nil] the refreshed entries keyed by URL; nil when an
#   exception is rescued
def bulk_refresh(list,num=@max_parallel)
	puts "Refresh entries in the site store from list:\n #{list}" if @verbose
	refreshed={}
	if list.size < 1
		puts "Error: no entry is loaded. Please check your list and try again."
	else
		puts "Start parallel refreshing on the sites:\n #{list}"
		outcomes=Parallel.map(list, :in_processes => num) { |target| refresh(target) }
		outcomes.each do |info|
			refreshed[info['url']]=info unless info.nil? || info.empty?
		end
		# Clean up old entries, by Y.L. 03/30/2015
		list.each { |stale| @known_sites.delete(stale) }
		# Add back fresh entries
		@known_sites.merge!(refreshed)
		puts "Done refresh sites."
	end
	return refreshed
rescue => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
end

#countObject

Count the number of entries in the site store table



92
93
94
95
96
97
# File 'lib/wmap/site_tracker.rb', line 92

# Report how many entries the site store table (@known_sites) holds.
#
# @return [Integer, nil] the entry count; nil when an exception is rescued
def count
	puts "Counting number of entries in the site store table ..."
	total=@known_sites.size
	return total
rescue => ee
	puts "Exception on method #{__method__}: #{ee}"
end

#delete(site) ⇒ Object Also known as: del

Setter to remove entry from the site store one at a time



309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
# File 'lib/wmap/site_tracker.rb', line 309

# Remove one entry from the site store and record it in the DeactivatedSite list.
#
# The site is stripped, lower-cased and converted with url_2_site before the
# lookup. A known site is added to the deactivated list (which is then saved)
# and deleted from @known_sites.
#
# @param site [String] site URL to remove
# @return [Object, nil] the entry removed from @known_sites; nil when the site
#   is unknown or an exception is rescued
def delete(site)
	puts "Remove entry from the site store: #{site} " if @verbose
	# Additional logic to deactivate the site properly, by moving it to the DeactivatedSite list, 02/07/2014
	deact=Wmap::SiteTracker::DeactivatedSite.instance
	# File.join yields a valid path whether or not @data_dir ends with a slash
	deact.sites_file=File.join(@data_dir, 'deactivated_sites')
	# create the file if missing; the block form closes the handle immediately
	File.open(deact.sites_file, "w") {} unless File.exist?(deact.sites_file)
	site=url_2_site(site.strip.downcase)
	unless @known_sites.key?(site)
		puts "Entry not found. Skip #{site}"
		return nil
	end
	deact.add(site,@known_sites[site])
	deact.save!
	removed=@known_sites.delete(site)
	puts "Entry cleared: #{site}"
	return removed
rescue => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
	return nil
end

#file_add(file) ⇒ Object

Setter to add site entry to the cache table in batch (from a file)



261
262
263
264
265
266
267
268
269
270
271
# File 'lib/wmap/site_tracker.rb', line 261

# Add the sites listed in a file to the site store.
#
# The file is read with file_2_list and the resulting list is handed to
# #bulk_add; an empty or nil list skips the bulk call.
#
# @param file [String] path of the file listing the sites
# @return [Hash, nil] the value returned by #bulk_add, or an empty Hash when
#   the file yields no sites; nil when an exception (including the missing
#   file error) is rescued
def file_add(file)
	puts "Add entries to the local site store from file: #{file}"
	unless File.exist?(file)
		raise "File non-exist. Please check your file path and name again: #{file}"
	end
	entries=file_2_list(file)
	loaded=(entries.nil? || entries.empty?) ? Hash.new : bulk_add(entries)
	puts "Done loading file #{file}. "
	return loaded
rescue => ee
	puts "Exception on method #{__method__}: #{ee}"
end

#file_delete(file) ⇒ Object Also known as: file_del

Setter to delete site entries from the cache in batch (from a file)



337
338
339
340
341
342
343
344
345
# File 'lib/wmap/site_tracker.rb', line 337

# Remove the sites listed in a file from the site store.
#
# The file is read with file_2_list and the resulting list is handed to
# #bulk_delete; an empty or nil list skips the bulk call.
#
# @param file [String] path of the file listing the sites
# @return [Array, nil] the value returned by #bulk_delete, or an empty Array
#   when the file yields no sites; nil when an exception (including the
#   missing file error) is rescued
def file_delete(file)
	puts "Delete entries to the local site store from file: #{file}" if @verbose
	raise "File non-exist. Please check your file path and name again: #{file}" unless File.exist?(file)
	sites=file_2_list(file)
	changes=Array.new
	changes=bulk_delete(sites) unless sites.nil? or sites.empty?
	# return explicitly so an empty list yields [] rather than the nil of the skipped modifier
	return changes
rescue => ee
	puts "Exception on method file_delete: #{ee} for file: #{file}" if @verbose
	return nil
end

#file_refresh(file) ⇒ Object

Refresh sites in the site store in batch (from a file)



389
390
391
392
393
394
395
396
397
# File 'lib/wmap/site_tracker.rb', line 389

# Refresh the sites listed in a file.
#
# The file is read with file_2_list and the resulting list is handed to
# #bulk_refresh; an empty or nil list skips the bulk call.
#
# @param file [String] path of the file listing the sites
# @return [Hash, nil] the value returned by #bulk_refresh, or an empty Hash
#   when the file yields no sites; nil when an exception is rescued
def file_refresh(file)
	puts "Refresh entries in the site store from file: #{file}" if @verbose
	entries=file_2_list(file)
	return Hash.new if entries.nil? || entries.empty?
	return bulk_refresh(entries)
rescue => ee
	puts "Exception on method #{__method__}: #{ee} for file: #{file}" if @verbose
end

#get_ext_sitesObject Also known as: get_ext

Retrieve external hosted sites into a list



517
518
519
520
521
522
523
524
525
526
527
528
529
530
# File 'lib/wmap/site_tracker.rb', line 517

# Collect the sites whose 'status' field equals "ext_hosted".
#
# @return [Array<String>, nil] sorted site URLs; nil when an exception is rescued
def get_ext_sites
	puts "getter to retrieve all the external hosted sites. " if @verbose
	hosted=@known_sites.select { |_site, info| info['status']=="ext_hosted" }
	return hosted.keys.sort
rescue Exception => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
	return nil
end

#get_int_sitesObject Also known as: get_int

Retrieve a list of internal hosted site URLs



534
535
536
537
538
539
540
541
542
543
544
545
546
547
# File 'lib/wmap/site_tracker.rb', line 534

# Collect the sites whose 'status' field equals "int_hosted".
#
# @return [Array<String>, nil] sorted site URLs; nil when an exception is rescued
def get_int_sites
	puts "getter to retrieve all the internal hosted sites." if @verbose
	hosted=@known_sites.select { |_site, info| info['status']=="int_hosted" }
	return hosted.keys.sort
rescue Exception => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
	return nil
end

#get_ip_sitesObject

Retrieve a list of sites that contain an IP in the site URL



551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
# File 'lib/wmap/site_tracker.rb', line 551

# Collect the sites whose host part (per url_2_host) is an IP address (per is_ip?).
#
# @return [Array<String>, nil] sorted site URLs; nil when an exception is rescued
def get_ip_sites
	puts "Getter to retrieve sites contain an IP instead of a host-name ." if @verbose
	ip_sites=@known_sites.keys.select { |site| is_ip?(url_2_host(site)) }
	return ip_sites.sort
rescue Exception => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
	return nil
end

#get_prim_uniq_sitesObject Also known as: get_prime

Retrieve the unique sites from the local site store in the primary host format



821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
# File 'lib/wmap/site_tracker.rb', line 821

# Build the list of unique sites, expressed with the primary host-name where one applies.
#
# Starts from #get_uniq_sites and decides per site whether to keep the URL as
# is or to swap its host for the one returned by PrimaryHost#prime. A swap only
# happens when the site's stored IP is a key of the primary host table, the
# host table alias count for that IP is above 1, and the swapped URL is a known
# site with the same stored IP.
#
# @return [Array<String>] one site URL per unique site
# @raise [RuntimeError] when the primed site URL is not in the site store
#   (this method has no active rescue clause; it is commented out at the end)
def get_prim_uniq_sites
	puts "Retrieve and prime unique sites in the site store. " if @verbose
	host_tracker=Wmap::HostTracker.instance
	host_tracker.data_dir=@data_dir
	primary_host_tracker=Wmap::HostTracker::PrimaryHost.instance
	primary_host_tracker.data_dir=@data_dir
	primary_host_tracker.hosts_file = primary_host_tracker.data_dir + "/" + "prime_hosts"
	# NOTE(review): @hosts_file is not assigned anywhere in the visible part of this class,
	# so nil is presumably passed here - confirm whether primary_host_tracker.hosts_file was meant
	primary_host_tracker.known_hosts=primary_host_tracker.load_known_hosts_from_file(@hosts_file)
	# Step 1. Retrieve the unique site list first
	sites=get_uniq_sites
	prim_uniq_sites=Array.new
	# Step 2. Iterate on the unique site list, spit out the site in the primary host format one at a time
	sites.map do |site|
		puts "Work on priming unique site: #{site}" if @verbose
		host=url_2_host(site)
		# case#1, for the IP only site, do nothing (presuming 'refresh_ip_sites' or 'refresh_all' method already take care of the potential discrepancy here).
		if is_ip?(host)
			prim_uniq_sites.push(site)
			next
		end
		ip=@known_sites[site]['ip']
		# case#2, for site with an unique IP, do nothing
		puts "Local hosts table entry count for #{ip}: #{host_tracker.alias[ip]}" if @verbose
		if host_tracker.alias[ip] == 1
			prim_uniq_sites.push(site)
			next
		end
		# case#3, case of multiple IPs for A DNS record, where the site IP may have 0 alias count, do nothing
		if host_tracker.alias[ip] == nil
			prim_uniq_sites.push(site)
			next
		end
		# case#4, for the site has a duplicate IP with others, we try to determine which one is the primary site
		# raise "Error: inconsistency detected on record: #{site}. Please run the following shell command to refresh it first: \n\srefresh #{site}" if tracker1.alias[ip].nil?
		if ( primary_host_tracker.known_hosts.key?(ip) and (host_tracker.alias[ip] > 1) )
			new_host=primary_host_tracker.prime(host)
			puts "Host: #{host}, New host:#{new_host}" if @verbose
			unless host==new_host
				new_site=site.sub(host,new_host)
				raise "Site not found in the site tracking data repository: #{new_site}. You may need to add it into the site store first. Execute the following shell command before trying again: \n\twadd #{new_site}\n" unless @known_sites.key?(new_site)
				new_ip=@known_sites[new_site]['ip']
				if new_ip==ip		# consistency check
					site=new_site
				else
					# TBD - case of multiple IPs for A DNS record
					#raise "Inconsistency found on prime host entrance: #{new_ip}, #{ip}; #{new_site}, #{site}. Please refresh your entries by running the following shell command: \n\s refresh #{new_site}"
				end
			end
		end
		# reached for case#4 only; site is either the original URL or its primed replacement
		prim_uniq_sites.push(site)
	end
	primary_host_tracker=nil
	host_tracker=nil
	return prim_uniq_sites
#rescue => ee
#	puts "Exception on method #{__method__}: #{ee}"
end

#get_redirection_url(site) ⇒ Object

Retrieve redirection URL if available



649
650
651
652
653
654
655
656
657
658
659
660
661
# File 'lib/wmap/site_tracker.rb', line 649

# Look up the stored 'redirection' value of one site.
#
# @param site [String] site URL; it is stripped and lower-cased before the lookup
# @return [Object, nil] the entry's 'redirection' value; nil when the site is
#   unknown or an exception is rescued
def get_redirection_url(site)
	puts "getter to retrieve the redirection URL from the site store." if @verbose
	site=site.strip.downcase
	return @known_sites[site]['redirection'] if @known_sites.key?(site)
	puts "Unknown site: #{site}" if @verbose
	return nil
rescue Exception => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
	return nil
end

#get_redirection_urlsObject

Retrieve a list of redirection URLs from the site store



633
634
635
636
637
638
639
640
641
642
643
644
645
646
# File 'lib/wmap/site_tracker.rb', line 633

# Collect every non-nil 'redirection' value found in the site store.
#
# @return [Array, nil] the sorted redirection values; nil when an exception is rescued
def get_redirection_urls
	puts "getter to retrieve all the redirection URLs from the site store." if @verbose
	targets=@known_sites.keys.map { |site| @known_sites[site]['redirection'] }
	return targets.compact.sort
rescue Exception => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
	return nil
end

#get_ssl_sitesObject

Retrieve a list of the HTTPS (SSL) sites from the site store



618
619
620
621
622
623
624
625
626
627
628
629
630
# File 'lib/wmap/site_tracker.rb', line 618

# Collect the sites in the site store whose URL uses the https scheme.
#
# @return [Array<String>, nil] sorted https site URLs; nil when an exception is rescued
def get_ssl_sites
	puts "getter to retrieve https sites from the site store." if @verbose
	# keep only the keys that start with the https scheme (case-insensitive)
	sites=@known_sites.keys.select { |key| key =~ /\Ahttps:/i }
	sites.sort!
	return sites
rescue Exception => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
	return nil
end

#get_uniq_sitesObject Also known as: uniq_sites

Retrieve a list of unique sites within the known site store



568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
# File 'lib/wmap/site_tracker.rb', line 568

# Collect one site per unique "IP:PORT" pair from the site store.
#
# Sites with response code 10000 or 20000, with a nil/empty 'md5' value, or
# whose host cannot be resolved to an IP are left out. The IP comes from the
# local host table (Wmap::HostTracker#local_host_2_ip) with host_2_ip as the
# fallback. The first site seen for an "IP:PORT" pair wins.
#
# @return [Array<String>, nil] the unique site URLs; nil when an exception is rescued
def get_uniq_sites
	# plain test of @verbose (an assignment here would silently switch verbose mode on)
	puts "Getter to retrieve unique sites containing unique IP:PORT key identifier." if @verbose
	sites=Hash.new
	host_tracker=Wmap::HostTracker.instance
	host_tracker.data_dir=@data_dir
	host_tracker.hosts_file=host_tracker.data_dir + '/' + 'hosts'
	host_tracker.load_known_hosts_from_file
	@known_sites.keys.each do |key|
		port=url_2_port(key).to_s
		host=url_2_host(key)
		md5=@known_sites[key]['md5']
		code=@known_sites[key]['code']
		ip=host_tracker.local_host_2_ip(host)
		ip=host_2_ip(host) if ip.nil?
		# filtering out 'un-reachable' sites
		next if (code == 10000 or code == 20000)
		# filtering out 'empty' sites
		next if (md5.nil? or md5.empty?)
		next if ip.nil?
		id=ip+":"+port
		# filtering out duplicates by 'IP:PORT' key pair
		sites[id]=key unless sites.key?(id)
	end
	host_tracker=nil
	return sites.values
rescue Exception => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
	return nil
end

#is_trusted?(site) ⇒ Boolean

Determine whether the site is trusted, based on the known domains

Returns:

  • (Boolean)


100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/wmap/site_tracker.rb', line 100

# Decide whether a site is trusted by looking up its domain root in the domain tracker.
#
# The host is taken from the URL with url_2_host, reduced with get_domain_root
# and checked with Wmap::DomainTracker#domain_known? after the tracker has
# loaded the "domains" file from @data_dir.
#
# @param site [String] site URL
# @return [Object] the value returned by domain_known?; false when an
#   exception is rescued before that value is obtained
def is_trusted?(site)
	trusted=false
	host=url_2_host(site)
	root=get_domain_root(host)
	domain_tracker=Wmap::DomainTracker.instance
	domain_tracker.data_dir=@data_dir
	domain_tracker.domains_file=@data_dir + "/" + "domains"
	# create the file if missing; the block form closes the handle immediately
	File.open(domain_tracker.domains_file, "w") {} unless File.exist?(domain_tracker.domains_file)
	domain_tracker.load_domains_from_file(domain_tracker.domains_file)
	trusted=domain_tracker.domain_known?(root)
	return trusted
rescue => ee
	puts "Exception on method #{__method__}: #{ee}"
	return trusted
end

#load_site_stores_from_file(file = @sites_file) ⇒ Object

Setter to load the known sites from the site store file into an instance variable



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/wmap/site_tracker.rb', line 34

# Load the site store file into the @known_sites hash table.
#
# Each data line holds up to nine fields separated by tabs or commas:
# site, ip, port, status, server, response code, md5, redirection, timestamp.
# Blank lines, comment lines (starting with '#') and lines without a site
# field are skipped. The file is created empty when it does not exist.
#
# @param file [String] path of the site store file
# @return [Hash, nil] @known_sites, keyed by the lower-cased site URL; nil
#   when an exception is rescued
def load_site_stores_from_file(file=@sites_file)
	puts "Loading the site store data repository from file: #{file} " if @verbose
	@known_sites=Hash.new
	# create the file if missing; the block form closes the handle immediately
	File.open(file, "w") {} unless File.exist?(file)
	# the block form guarantees the handle is closed even if a line fails to parse
	File.open(file, 'r') do |f|
		f.each do |line|
			line=line.chomp.strip
			next if line.empty?
			next if line =~ /^\s*#/
			entry=line.split(%r{\t+|\,})
			# a line made only of separators has no site field; skip it instead of aborting the load
			next if entry[0].nil? or entry[0].empty?
			site=entry[0].downcase
			puts "Loading entry: #{site} - #{entry[1]} - #{entry[3]}" if @verbose
			@known_sites[site]= Hash.new unless @known_sites.key?(site)
			@known_sites[site]['ip']=entry[1]
			@known_sites[site]['port']=entry[2]
			@known_sites[site]['status']=entry[3]
			@known_sites[site]['server']=entry[4]
			@known_sites[site]['code']=entry[5].to_i
			@known_sites[site]['md5']=entry[6]
			@known_sites[site]['redirection']=entry[7]
			@known_sites[site]['timestamp']=entry[8]
		end
	end
	puts "Successfully loading file: #{file}" if @verbose
	return @known_sites
rescue => ee
	puts "Exception on method #{__method__} for file #{file}: #{ee}"
end

Print summary report of all sites URL in the site store



739
740
741
742
743
744
745
746
747
748
# File 'lib/wmap/site_tracker.rb', line 739

# Print every site URL of the site store, sorted, between a header and a footer line.
#
# @return [nil]
def print_all_sites
	puts "\nSummary Report of the site store:"
	@known_sites.keys.sort.each { |site| puts site }
	puts "End of the summary"
rescue => ee
	puts "Exception on method #{__method__} "
end

Print summary report of the externally hosted site URLs in the site store



881
882
883
884
885
886
887
888
# File 'lib/wmap/site_tracker.rb', line 881

# Print the site URLs returned by #get_ext_sites, one per line, under a header.
#
# @return [nil]
def print_ext_sites
	puts "\nSummary Report of the External Hosted Site"
	get_ext_sites.each { |site| puts site }
	return nil
end

Print summary report of internal hosted site URLs



892
893
894
895
896
897
898
899
# File 'lib/wmap/site_tracker.rb', line 892

# Print the site URLs returned by #get_int_sites, one per line, under a header.
#
# @return [nil]
def print_int_sites
	puts "\nSummary Report of the Internal Hosted Site"
	get_int_sites.each { |site| puts site }
	return nil
end

Print summary report on all sites that contain an IP in the site URL



709
710
711
712
713
714
715
716
# File 'lib/wmap/site_tracker.rb', line 709

# Print the site URLs returned by #get_ip_sites, one per line, between a header and a footer.
#
# @return [nil]
def print_ip_sites
	puts "Print sites contain an IP instead of a host-name."
	get_ip_sites.each { |site| puts site }
	puts "End of report. "
rescue => ee
	puts "Exception on method #{__method__} "
end

Retrieve and print specific information of a site in the site store



719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
# File 'lib/wmap/site_tracker.rb', line 719

# Print the stored fields of one site as a single comma-separated line:
# site, ip, port, status, server, code, md5, redirection, timestamp.
#
# @param site [String] site URL; it is stripped before the lookup
# @return [nil] an unknown site is reported through the rescue clause
def print_site(site)
	puts "Site Information Report for: #{site}" if @verbose
	site=site.strip unless site.nil?
	raise "Unknown site: #{site}" unless @known_sites.key?(site)
	columns=%w[ip port status server code md5 redirection timestamp].map do |field|
		"#{@known_sites[site][field]}"
	end
	puts "#{site}," + columns.join(",")
rescue => ee
	puts "Exception on method #{__method__} for #{site}: #{ee}"
end

Print summary report of the HTTPS site URLs in the site store



903
904
905
906
907
908
909
910
# File 'lib/wmap/site_tracker.rb', line 903

# Print the site URLs returned by #get_ssl_sites, one per line, under a header.
#
# @return [nil]
def print_ssl_sites
	puts "\nSummary Report of the HTTPS Sites from the Site Store"
	get_ssl_sites.each { |site| puts site }
	return nil
end

Print summary report of unique sites in the site store



913
914
915
916
917
918
919
920
# File 'lib/wmap/site_tracker.rb', line 913

# Print a column header followed by one #print_site line per site returned by #get_uniq_sites.
#
# @return [Object] the value of iterating the unique site list with each
def print_uniq_sites
	puts "Summary Report for the Unique sites:"
	puts "Website,Primary IP,Port,Hosting Status,Server,Response Code,Site MD5 Finger-print,Site Redirection,Timestamp"
	get_uniq_sites.each { |site| print_site(site) }
end

#refresh(site) ⇒ Object

Setter to refresh the entry in the site store one at a time



370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
# File 'lib/wmap/site_tracker.rb', line 370

# Refresh one entry of the site store by deleting it and adding it again.
#
# @param site [String] site URL; it is stripped and lower-cased before the lookup
# @return [Object, nil] the value returned by #add for a known site; nil when
#   the site is unknown, nil/empty, or an exception is rescued
def refresh(site)
	puts "Refresh the local site store for site: #{site} "
	raise "Invalid site: #{site}" if site.nil? or site.empty?
	site=site.strip.downcase
	unless @known_sites.key?(site)
		puts "Error entry non exist: #{site}"
		return nil
	end
	delete(site)
	fresh=add(site)
	puts "Done refresh entry: #{site}"
	return fresh
rescue => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
	return nil
end

#refresh_allObject

Refresh all site entries in the stores in one shot



448
449
450
451
452
453
454
455
456
457
# File 'lib/wmap/site_tracker.rb', line 448

# Refresh every entry of the site store through #bulk_refresh and merge the results back in.
#
# @return [Object, nil] the value returned by #bulk_refresh; nil when an
#   exception is rescued
def refresh_all
	puts "Refresh all the entries within the local site store ... "
	refreshed=bulk_refresh(@known_sites.keys)
	@known_sites.merge!(refreshed)
	puts "Done refresh all entries."
	return refreshed
rescue => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
end

#refresh_ip_sitesObject

Refresh all site entries in the stores that contains an IP instead of a hostname



460
461
462
463
464
465
466
467
468
469
470
471
# File 'lib/wmap/site_tracker.rb', line 460

# Refresh the entries whose URL holds an IP address, leaving out those whose
# stored response code is 10000 or 20000, and merge the results back in.
#
# @return [Object, nil] the value returned by #bulk_refresh; nil when an
#   exception is rescued
def refresh_ip_sites
	puts "Refresh all entries that contain an IP address instead of a FQDN ... "
	live_sites=get_ip_sites.reject do |site|
		code=@known_sites[site]['code']
		code == 10000 or code == 20000
	end
	refreshed=bulk_refresh(live_sites)
	@known_sites.merge!(refreshed)
	puts "Done refresh IP sites."
	return refreshed
rescue => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
end

#refresh_uniq_sitesObject

Refresh unique sites in the site store only



400
401
402
403
404
405
406
407
408
409
410
411
412
# File 'lib/wmap/site_tracker.rb', line 400

# Refresh only the sites returned by #get_uniq_sites.
#
# @return [Object, nil] the value returned by #bulk_refresh, or an empty Hash
#   when there is no unique site; nil when an exception is rescued
def refresh_uniq_sites
	puts "Refresh unique site entries in the site store. " if @verbose
	uniq=get_uniq_sites
	return bulk_refresh(uniq) if uniq.size > 0
	puts "Error: no entry is refreshed. Please check your site store and try again."
	return Hash.new
rescue => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
end

#resolve_ip_sitesObject

Perform local host table reverse lookup for the IP sites, in hope that the hostname could now be resolved since the site was discovered



664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
# File 'lib/wmap/site_tracker.rb', line 664

# Refresh the IP-only sites whose IP now has a host-name in the local host table.
#
# For each site from #get_ip_sites the IP is looked up with
# Wmap::HostTracker#local_ip_2_host; when a host-name comes back the site is
# passed to #refresh and recorded.
#
# @return [Array<String>, nil] sorted list of the sites handed to #refresh;
#   nil when an exception is rescued
def resolve_ip_sites
	puts "Resolve sites that contain an IP address. Update the site cache table once a hostname is found in the local host table." if @verbose
	resolved=[]
	ip_sites=get_ip_sites
	tracker=Wmap::HostTracker.instance
	tracker.data_dir=@data_dir
	tracker.hosts_file = tracker.data_dir + "/" + "hosts"
	tracker.load_known_hosts_from_file
	ip_sites.each do |site|
		puts "Work on resolve the IP site: #{site}" if @verbose
		ip=url_2_host(site)
		name=tracker.local_ip_2_host(ip)
		unless name.nil?
			puts "Host-name found for IP #{ip}: #{name}" if @verbose
			resolved.push(site)
			refresh(site)
		else
			puts "Can't resolve #{ip} from the local host store. Skip #{site}" if @verbose
		end
	end
	resolved.sort!
	puts "The following sites are now refreshed: #{resolved}" if @verbose
	return resolved
rescue Exception => ee
	puts "Exception on method #{__method__}: #{ee}" if @verbose
end

#save_sites_to_file!(file_sites = @sites_file) ⇒ Object Also known as: save!

Save the current site store hash table into a file



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/wmap/site_tracker.rb', line 73

# Write the site store table to a file, one comma-separated line per trusted site.
#
# Two comment header lines are written first. Sites are written in sorted
# order and only when #is_trusted? accepts them.
#
# @param file_sites [String] path of the file to (over)write
# @return [nil] failures are rescued and reported through puts
def save_sites_to_file!(file_sites=@sites_file)
	puts "Saving the current site store table from memory to file: #{file_sites}"
	timestamp=Time.now
	# the block form closes (and flushes) the file even when a write or trust check raises
	File.open(file_sites, 'w') do |f|
		f.write "# Local site store created by class #{self.class} method #{__method__} at: #{timestamp}\n"
		f.write "# Website,Primary IP,Port,Hosting Status,Server,Response Code,MD5 Finger-print,Redirection,Timestamp\n"
		@known_sites.keys.sort.each do |key|
			next unless is_trusted?(key)
			f.write "#{key},#{@known_sites[key]['ip']},#{@known_sites[key]['port']},#{@known_sites[key]['status']},#{@known_sites[key]['server']},#{@known_sites[key]['code']},#{@known_sites[key]['md5']},#{@known_sites[key]['redirection']},#{@known_sites[key]['timestamp']}\n"
		end
	end
	puts "site store table is successfully saved: #{file_sites}"
rescue => ee
	puts "Exception on method #{__method__}: #{ee}"
end

#save_uniq_sites(file) ⇒ Object Also known as: dump

Retrieve and save unique sites information for the quarterly scan into a plain local file



752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
# File 'lib/wmap/site_tracker.rb', line 752

# Write a flat-file report of the unique sites returned by get_prim_uniq_sites.
#
# The report starts with two header lines, then one comma-separated line per site
# built from that site's record in @known_sites. nil entries in the list are
# skipped; an entry that is not a key of @known_sites aborts the report.
#
# file - path of the report file to (over)write.
#
# Returns true on success, false if any StandardError occurred (the error is
# printed to stdout). On failure the file may contain a partial report.
def save_uniq_sites(file)
	puts "Save unique sites information into a flat file: #{file}\nThis may take a long while as it go through a lengthy self correction check process, please be patient ..."
	prime_sites=get_prim_uniq_sites
	puts "Primary Sites: #{prime_sites}" if @verbose
	# The block form closes the handle even when the unknown-site check below raises.
	File.open(file,"w") do |f|
		f.write "Unique Sites Information Report\n"
		f.write "Site, IP, Port, Server, Hosting, Response Code, MD5, Redirect, Timestamps\n"
		prime_sites.each do |key|
			next if key.nil?
			site=key.strip
			raise "Unknown site: #{site}. You may need to add it into the site store first. Execute the following shell command before trying again: \nwadd #{site}\n" unless @known_sites.key?(site)
			record=@known_sites[site]
			# Column order follows the header line: server comes before hosting status.
			f.write "#{site},#{record['ip']},#{record['port']},#{record['server']},#{record['status']},#{record['code']},#{record['md5']},#{record['redirection']},#{record['timestamp']}\n"
		end
	end
	puts "Done!"
	true  # success
rescue => ee
	puts "Exception on method #{__method__}: #{ee}"
	false # fail
end

#save_uniq_sites_xml(file) ⇒ Object Also known as: dump_xml

Retrieve and save unique sites information for the quarterly scan into an XML file



783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
# File 'lib/wmap/site_tracker.rb', line 783

# Write an XML report of the unique sites returned by get_prim_uniq_sites.
#
# The document is built in memory with Nokogiri::XML::Builder as
# root > websites > site, one site element per entry, populated from that site's
# record in @known_sites. nil entries are skipped; an entry that is not a key of
# @known_sites aborts the build before the output file is opened.
#
# file - path of the XML file to (over)write.
#
# Returns true on success, false if any StandardError occurred (the error is
# printed to stdout).
def save_uniq_sites_xml(file)
	puts "Save unique sites information into XML file: #{file}\nThis may take a long while as it go through lengthy self correctness check, please be patient ..."
	prime_sites=get_prim_uniq_sites
	builder = Nokogiri::XML::Builder.new do |xml|
		xml.root {
			xml.websites {
				prime_sites.each do |key|
					next if key.nil?
					site=key.strip
					raise "Unknown site: #{site}. You may need to add it into the site store first. Execute the following shell command before trying again: \n\twmap #{site}\n" unless @known_sites.key?(site)
					record=@known_sites[site]
					# NOTE(review): the trailing underscores presumably follow Nokogiri's builder
					# convention for tag names that could clash with existing methods - confirm.
					xml.site {
						xml.name site
						xml.ip_ record['ip']
						xml.port_ record['port']
						xml.status_ record['status']
						xml.server_ record['server']
						xml.fingerprint_ record['md5']
						xml.redirection_ record['redirection']
						xml.responsecode_ record['code']
						xml.timestamp_ record['timestamp']
					}
				end
			}
		}
	end
	# Serialize once and reuse the string for both the verbose dump and the file.
	xml_doc=builder.to_xml
	puts xml_doc if @verbose
	# The block form closes the handle even if the write itself fails.
	File.open(file,'w') { |f| f.write(xml_doc) }
	puts "Done!"
	true
rescue => ee
	puts "Exception on method #{__method__}: #{ee}"
	false
end

#search(pattern) ⇒ Object

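Search the site store for sites matching a simple regular expression. Note that the search string is stripped and converted to lower case before it is compiled, so any upper-case character in it (including regular-expression escapes such as \W or \D) is lower-cased; matching itself is case-insensitive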



693
694
695
696
697
698
699
700
701
702
703
704
705
706
# File 'lib/wmap/site_tracker.rb', line 693

# Search the site store keys with a regular expression.
#
# The pattern is stripped and lower-cased before being compiled, so upper-case
# regex escapes in it are lower-cased too; the match is case-insensitive.
#
# pattern - String holding the regular expression source.
#
# Returns an Array of the matching keys of @known_sites (empty when nothing
# matches), or nil when a StandardError occurs, e.g. a nil pattern or an invalid
# regular expression. Non-StandardError exceptions such as Interrupt are not
# swallowed.
def search(pattern)
	puts "Search site store based on the regular expression: #{pattern}" if @verbose
	normalized=pattern.strip.downcase
	# Compile once instead of once per key.
	matcher=/#{normalized}/i
	@known_sites.keys.grep(matcher)
rescue StandardError => ee
	puts "Exception on method search: #{ee}" if @verbose
	nil
end

#site_check(site) ⇒ Object Also known as: check

Quick check of the stored information of a site within the store



505
506
507
508
509
510
511
512
513
# File 'lib/wmap/site_tracker.rb', line 505

# Look up the stored record of a site in the site store.
#
# A non-nil site is stripped and lower-cased, then passed through url_2_site
# before being used as the key into @known_sites.
#
# site - site URL String (may be nil).
#
# Returns the record stored under the converted key, or nil when url_2_site
# yields nil, the key is absent, or an error occurs (errors are printed to
# stdout, including the case where @known_sites is nil).
def site_check(site)
	raise "Web site store not loaded properly! " if @known_sites.nil?
	cleaned = site.nil? ? nil : site.strip.downcase
	store_key = url_2_site(cleaned)
	return nil if store_key.nil?
	@known_sites[store_key]
rescue => ee
	puts "Exception on method #{__method__}: #{ee}"
	nil
end

#site_ip_known?(ip) ⇒ Boolean Also known as: siteip_known?

Quick validation check on whether an IP is already part of the site store

Returns:

  • (Boolean)


486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
# File 'lib/wmap/site_tracker.rb', line 486

# Check whether an IP address is recorded as the 'ip' of any site in the store.
#
# ip - String; trailing newline and surrounding whitespace are removed before
#      the check.
#
# Returns true when some record in @known_sites has an 'ip' equal to the cleaned
# value, false otherwise. Also returns false when is_ip? rejects the input, or
# when an error occurs (the error is printed to stdout).
def site_ip_known?(ip)
	ip=ip.chomp.strip
	return false unless is_ip?(ip)
	@known_sites.each_value.any? { |record| record['ip']==ip }
rescue => ee
	puts "Exception on method #{__method__}: #{ee}"
	false
end

#site_known?(site) ⇒ Boolean Also known as: is_known?

Quick validation of whether a site is already covered by the site store

Returns:

  • (Boolean)


474
475
476
477
478
479
480
481
482
# File 'lib/wmap/site_tracker.rb', line 474

# Check whether a site is already a key of the site store.
#
# A non-nil site is stripped and lower-cased, then passed through url_2_site
# before the lookup in @known_sites.
#
# site - site URL String (may be nil).
#
# Returns true when the converted site is a key of @known_sites, false
# otherwise. Always a strict Boolean: false is also returned when url_2_site
# yields nil or when an error occurs (the error is printed to stdout, including
# the case where @known_sites is nil).
def site_known?(site)
	raise "Web site store not loaded properly! " if @known_sites.nil?
	site=site.strip.downcase unless site.nil?
	site=url_2_site(site)
	# A predicate should answer false, not nil, when there is nothing to look up.
	return false if site.nil?
	@known_sites.key?(site)
rescue => ee
	puts "Error checking web site #{site} against the site store: #{ee}"
	false
end