Class: Metrocot

Inherits:
Object
  • Object
show all
Defined in:
lib/metrocot.rb

Overview

Purpose

This class implements the main Metrocot HTML scanner and a number of handy input scanners (for grabbing times, numbers, or text from HTML). The purpose of the Metrocot is to scan an XML DOM for the patterns specified in the Metrocot pattern language.

Pattern Language

The Metrocot pattern language allows for the following types of patterns:

...

matches anything

+“some string”

matches that string

+/(some|pattern)/ matches that regexp pattern

./HPRICOT_PATH

matches a certain type of dom subtree

SPACE

matches zero or more white spaces

+(PATTERN_A PATTERN_B)

matches PATTERN_A followed by PATTERN_B

PATTERN+

matches one or more occurrences of PATTERN

Usage

0) Create a Metrocot and define the types of fields you want to extract (and their names). 1) Use Hpricot to get the doc’s DOM. 2) Use descend(xpath) to create a NodeScraper rooted at the Hpricot node(s) matching the xpath. 3) Use collect(pattern) to collect all entries found in the HTML which match the Metrocot pattern.

Defined Under Namespace

Modules: Scanners Classes: AnythingPattern, BasePattern, CompositePattern, MatchRange, NodeScraper, OneOrMorePattern, OptSpacePattern, PathPattern, TextPattern

Constant Summary collapse

VERSION =
'1.0.3'

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(scanners) ⇒ Metrocot

Returns a new instance of Metrocot.



1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
# File 'lib/metrocot.rb', line 1154

# Sets up a Metrocot with its table of named scanners.
#
# scanners:: enumerable of name/value pairs. A value that is a Class is
#            instantiated with no arguments; any other value is stored as is.
#
# Also starts with an empty compiled-pattern table and verbose logging off.
def initialize( scanners )

	@scanners = {}
	@compiled_patterns = {}

	scanners.each do |scanner_name, scanner|
		@scanners[scanner_name] = scanner.is_a?( Class ) ? scanner.new : scanner
	end

	# Logging is off at this point, so the log call below prints nothing.
	@verbose = false

	log("scanners: #{@scanners.inspect}")

end

Instance Attribute Details

#verboseObject

Returns the value of attribute verbose.



1151
1152
1153
# File 'lib/metrocot.rb', line 1151

# Reader for the verbose flag: log only prints while this is truthy, and
# initialize sets it to false.
def verbose; @verbose; end

Instance Method Details

#compile_pattern(pattern_s, node_scraper) ⇒ Object



1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
# File 'lib/metrocot.rb', line 1181

# Compiles a Metrocot pattern string into a pattern object.
#
# The text is consumed from left to right. Each element may carry a "name="
# prefix and is one of: a primitive recognised by PathPattern, TextPattern,
# AnythingPattern or OptSpacePattern (tried in that order), a "+" that wraps
# the previous element in a OneOrMorePattern, or a parenthesised group that
# is compiled recursively. Compilation stops at the end of the text or at a
# ")", which is left for the caller that opened the group.
#
# pattern_s::    pattern source text
# node_scraper:: stored on every pattern built here
#
# Returns the only pattern parsed, a CompositePattern when several were
# parsed, or nil when none were. The result's source is set to the consumed
# prefix of pattern_s and the result is recorded in @compiled_patterns under
# pattern_s.
#
# Raises RuntimeError for a misplaced or doubled "+", an empty or unclosed
# group, or text that no pattern type recognises.
def compile_pattern( pattern_s, node_scraper )	

	# NOTE: the cache lookup below is disabled in the original source; results
	# are still written to @compiled_patterns at the end of this method.
#		if @compiled_patterns.key? pattern_s 
#			return @compiled_patterns[ pattern_s ] 
#		end

	s = pattern_s
	patterns = []

	log("compiling: #{s}")
	
	while !s.empty? 

		log("left: #{s}")

		# A ")" closes a nested group; the caller that saw the "(" consumes it.
		break if s[0..0] == ")"

		# Optional "name=" prefix. It is anchored with \A so that only the very
		# start of the remaining text counts; "^" would also match after any
		# newline further into the pattern.
		name = s[/\A(\w+)=/, 1]
		s = s[(name.length + 1) .. -1] if name

		log("after name #{name}: #{s}")

		pattern = nil

		# Try the primitive pattern types in order; the first one whose parse
		# returns a pattern wins.
		[PathPattern, TextPattern, AnythingPattern, OptSpacePattern].each { |pattern_class|
			pattern = pattern_class.parse(s)
			if pattern 
				pattern.metrocot = self
				pattern.node_scraper = node_scraper
				break 
			end
			log "not a #{pattern_class}"
		}

		if pattern 
			# Skip over the text the primitive pattern consumed.
			s = s[pattern.source.size .. -1]
			patterns << pattern
			log("found: #{pattern.description}")
			if name
				log("scanned as: #{name}")
				pattern.name = name.to_sym
			end
			next
		end
	
		if s[0..0] == "+" 
			raise "+ must follow pattern" unless patterns.size > 0
			raise "+ applied twice does not make sense" if patterns[-1].is_a? OneOrMorePattern
			# Replace the previous pattern with a repeating wrapper around it.
			pattern = OneOrMorePattern.new( patterns[-1] ) 
			pattern.metrocot = self
			pattern.node_scraper = node_scraper
			patterns[-1] = pattern
			log("now one or more: #{pattern.repeatee}")
			s = s[1 .. -1]
			next
		end

		if s[0..0] == "(" 
			pattern = compile_pattern( s[1 .. -1], node_scraper )
			# The nested call returns nil when nothing precedes the ")" or the
			# end of the text.
			raise "empty group at: \"#{s[0..10]}...\"" unless pattern
			# The nested source is exactly the text consumed inside the group,
			# so the closing parenthesis must sit right after it.
			close_par_index = pattern.source.size + 1
			found = s[close_par_index..close_par_index].to_s
			unless found == ")"
				found_s = found.empty? ? "end of pattern" : "'#{found}'"
				raise "expected ')' found #{found_s}"
			end
			s = s[close_par_index + 1 .. -1]
			log("found nested: #{pattern.description} \"#{pattern.source}\"")
			patterns << pattern
			if name
				pattern.name = name.to_sym
			end
			next
		end

		raise "unrecognizable pattern: \"#{s[0..10]}...\""
			
	end
	
	# Several elements form a composite; a single element is returned as is;
	# patterns[0] is nil when nothing was parsed.
	pattern = if patterns.size > 1
		CompositePattern.new( patterns )
	else 
		patterns[0]
	end

	if pattern
		pattern.metrocot = self
		pattern.node_scraper = node_scraper
		# Source is the part of pattern_s consumed so far (everything except
		# the unparsed remainder s).
		pattern.source = pattern_s[0 .. (0 - (1 + s.size))] 
	end

	return @compiled_patterns[ pattern_s ] = pattern
	
end

#log(s) ⇒ Object



1137
1138
1139
# File 'lib/metrocot.rb', line 1137

# Writes s to standard output via puts, but only while @verbose is truthy.
# Always returns nil.
def log( s )
	return unless @verbose
	puts( s )
end

#scanner_by_name(name) ⇒ Object

given a Symbol of a scanner (name), return a handle for that scanner declared when the metrocot was created.



1145
1146
1147
# File 'lib/metrocot.rb', line 1145

# Looks up a scanner in the table built by initialize.
#
# name:: the key the scanner was registered under (the page text describes
#        it as a Symbol)
#
# Returns the stored scanner, or whatever @scanners yields for an unknown key.
def scanner_by_name(name)
	@scanners[name]
end

#scrape(doc) ⇒ Object



1176
1177
1178
# File 'lib/metrocot.rb', line 1176

# Creates a NodeScraper for doc that is bound to this Metrocot.
#
# doc:: passed through as the fourth constructor argument
#       (presumably the parsed Hpricot document — confirm against NodeScraper)
#
# The second and third constructor arguments are passed as nil.
def scrape(doc)
	root_scraper = NodeScraper.new(self, nil, nil, doc)
	root_scraper
end