Class: Metrocot::NodeScraper

Inherits:
Object
  • Object
show all
Defined in:
lib/metrocot.rb

Overview

Rooted at a node in the DOM, the node scraper is used to collect all matches of patterns.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(mcot, parent, root, hnode) ⇒ NodeScraper

Returns a new instance of NodeScraper.



891
892
893
894
895
896
897
# File 'lib/metrocot.rb', line 891

# Creates a scraper anchored at +hnode+.
#
# @param mcot   [Object] owning Metrocot instance; must respond to #verbose
# @param parent [Object, nil] scraper this one was created from
# @param root   [Object, nil] top-most scraper; when nil/false this
#   instance becomes its own root
# @param hnode  [Object] DOM node this scraper works on
def initialize( mcot, parent, root, hnode )
  @mcot, @parent, @root, @hnode = mcot, parent, (root || self), hnode
  # The verbosity flag is read once here, not on every log call.
  @verbose = mcot.verbose
end

Instance Attribute Details

#hnodeObject

Returns the value of attribute hnode.



888
889
890
# File 'lib/metrocot.rb', line 888

# Reader for @hnode, the DOM node handed to #initialize as +hnode+.
# #descend calls +search+ on it and #build_hnode_index flattens its subtree.
def hnode
  @hnode
end

#mcotObject

Returns the value of attribute mcot.



888
889
890
# File 'lib/metrocot.rb', line 888

# Reader for @mcot, the object passed to #initialize as +mcot+.
# Within this class it is used for +verbose+, +log+, +compile_pattern+
# and +scanner_by_name+.
def mcot
  @mcot
end

#parentObject

Returns the value of attribute parent.



888
889
890
# File 'lib/metrocot.rb', line 888

# Reader for @parent, stored unchanged from the +parent+ argument of
# #initialize. #descend passes the current scraper as the parent of each
# child scraper it creates.
def parent
  @parent
end

#pattern_classesObject

Returns the value of attribute pattern_classes.



888
889
890
# File 'lib/metrocot.rb', line 888

# Reader for @pattern_classes.
# NOTE(review): none of the methods visible for this class assign
# @pattern_classes, so this presumably returns nil unless it is set
# elsewhere - confirm before relying on it.
def pattern_classes
  @pattern_classes
end

#rootObject

Returns the value of attribute root.



888
889
890
# File 'lib/metrocot.rb', line 888

# Reader for @root. #initialize stores the +root+ argument, or the new
# scraper itself when that argument is nil/false.
def root
  @root
end

#top_part_namesObject

Returns the value of attribute top_part_names.



888
889
890
# File 'lib/metrocot.rb', line 888

# Reader for @top_part_names.
# NOTE(review): #collect_gen builds a *local* variable called
# top_part_names and never assigns the instance variable, so from the
# visible code this reader presumably returns nil - confirm whether the
# instance variable was meant to be set there.
def top_part_names
  @top_part_names
end

#verboseObject

Returns the value of attribute verbose.



888
889
890
# File 'lib/metrocot.rb', line 888

# Reader for @verbose, the value of +mcot.verbose+ captured when the
# scraper was constructed. It gates #log and the pattern dump in #collect_gen.
def verbose
  @verbose
end

Instance Method Details

#build_hnode_indexObject



933
934
935
936
937
938
939
940
941
942
943
# File 'lib/metrocot.rb', line 933

# (Re)builds the three lookup structures describing the subtree under
# +hnode+: the flat node list, node => position, and node => position just
# past the node's subtree. Any previous contents are discarded.
#
# @return [Object, nil] whatever #log returns
def build_hnode_index
  @flattened_hnodes, @hnode_index, @hnode_succ_index = [], {}, {}
  total = flatten_hnodes( 0, hnode )
  log( "built index for #{total} hnodes" )
end

#collect(pattern_s, &block) ⇒ Object

Collects all occurrences of the data matching the pattern by calling the given block for every part of the DOM subtree that matches the pattern. The block can reject a match by returning nil (or false). Any other return value is appended to the list returned at the end.

Unlike collect_hashed(), the block will be given a list of parameter values matching the list of named fields in the pattern.

Example

mcot.scrape(doc).descend( "//ul/li" ) { |li| 
  li.collect( "liker=... \"likes\" likee=..." ) { |likes, liked| 
    [ likes, liked ]
  }
}


1027
1028
1029
# File 'lib/metrocot.rb', line 1027

# Runs #collect_gen in :positional mode and returns its result list.
# In that mode the block receives one argument per named top-level part
# of the pattern; the block itself is forwarded unchanged.
def collect( pattern_s, &block )
	collect_gen( pattern_s, :positional, &block )
end

#collect_gen(pattern_s, call_with, &block) ⇒ Object



970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
# File 'lib/metrocot.rb', line 970

# Shared implementation behind #collect and #collect_hashed.
#
# Compiles +pattern_s+, rebuilds the node index, then repeatedly asks the
# pattern for a match inside the remaining range. Each match is handed to
# the block; a truthy block result is collected and matching resumes after
# the matched sub-range. A nil/false result (or no match) ends the scan.
#
# @param pattern_s [Object] pattern source handed to +@mcot.compile_pattern+
# @param call_with [Symbol] :positional passes one block argument per named
#   top-level part; any other value passes the match map as a single
#   argument. :positional also falls back to the map when the pattern has
#   no named top-level parts.
# @yield matched values (see +call_with+)
# @return [Array] the truthy block results, in match order
def collect_gen( pattern_s, call_with, &block )
  pattern = @mcot.compile_pattern( pattern_s, self )

  # NOTE(review): this is a local; the @top_part_names instance variable
  # behind the reader of the same name is not assigned here - confirm intent.
  top_part_names = []
  if pattern.is_a? CompositePattern
    pattern.parts.each { |part| top_part_names << part.name if part.name }
  end
  log("top part names: #{top_part_names.join(", ")}")

  build_hnode_index
  pattern.dump( 0, $stdout ) if @verbose

  # The calling convention is fixed for the whole scan, so decide it once.
  positional = (call_with == :positional) && !top_part_names.empty?

  results = []
  match_range = MatchRange.new( self, 0, 0, flattened_hnodes.size, 0 )
  until match_range.empty?
    result = nil
    pattern.each_match( match_range, {} ) { |sub_match_range, match_map|
      result = if positional
        block_args = top_part_names.collect { |top_name| match_map[top_name] }
        log("calling pos scan block with: #{block_args.inspect}")
        block.call( *block_args )
      else
        log("calling hash scan block with: #{match_map.inspect}")
        block.call( match_map )
      end

      if result
        results << result
        # Continue scanning after the part of the range just consumed.
        match_range = match_range.following( sub_match_range )
      end

      # The block's verdict is also the value handed back to each_match.
      result
    }

    # No accepted match in this pass: stop instead of looping forever.
    break unless result
  end
  results
end

#collect_hashed(pattern_s, &block) ⇒ Object

Collects all occurrences of the data matching the pattern by calling the given block for every part of the DOM subtree that matches the pattern. The block can reject a match by returning nil (or false). Any other return value is appended to the list returned at the end.

Unlike collect(), the block will be given a map of parameter values keyed by the names of the named fields in the pattern.

Example

mcot.scrape(doc).descend( "//ul/li" ) { |li| 
  li.collect_hashed( "killer=... verb=/(stabbed|shot|strangled)/ victim=... \"(with|using)\" weapon=..." ) { |map|
    Murder.new( map )
  }
}


1048
1049
1050
# File 'lib/metrocot.rb', line 1048

# Runs #collect_gen with call_with set to :map and returns its result list.
# Since :map is not :positional, the block receives the whole match map
# as its single argument; the block itself is forwarded unchanged.
def collect_hashed( pattern_s, &block )
	collect_gen( pattern_s, :map, &block )
end

#descend(path) ⇒ Object



905
906
907
908
909
910
911
912
913
# File 'lib/metrocot.rb', line 905

# Yields a child scraper for every node that +@hnode.search( path )+
# returns and records what the block gives back.
#
# Each child scraper shares this scraper's @mcot and @root and has this
# scraper as its parent.
#
# @param path [Object] search expression handed to +@hnode.search+
# @yield [NodeScraper] scraper rooted at one matching node
# @return [Hash] matching node => block result
def descend( path )
  yielded_by_node = {}
  matches = @hnode.search( path )
  matches.each do |child_node|
    child_scraper = NodeScraper.new( @mcot, self, @root, child_node )
    yielded_by_node[child_node] = yield( child_scraper )
  end
  yielded_by_node
end

#flatten_hnodes(ix, node) ⇒ Object



916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
# File 'lib/metrocot.rb', line 916

# Appends +node+ and, depth-first, its descendants to @flattened_hnodes,
# recording for each node its own position (@hnode_index) and the position
# just past its subtree (@hnode_succ_index).
#
# Children are only visited when +node.elem?+ is truthy and
# +node.children+ is not nil.
#
# @param ix   [Integer] position to give +node+
# @param node [Object] node responding to +elem?+ and +children+
# @return [Integer] first position not used by +node+'s subtree
def flatten_hnodes( ix, node )
  @flattened_hnodes << node
  @hnode_index[node] = ix

  succ = ix + 1
  kids = node.children if node.elem?
  unless kids.nil?
    kids.each { |kid| succ = flatten_hnodes( succ, kid ) }
  end

  @hnode_succ_index[node] = succ
  succ
end

#flattened_hnodesObject



946
947
948
949
950
951
# File 'lib/metrocot.rb', line 946

# Flat list of indexed nodes, built on first use via #build_hnode_index
# when @flattened_hnodes is still nil/false.
#
# @return [Array] the contents of @flattened_hnodes
def flattened_hnodes
  build_hnode_index unless @flattened_hnodes
  @flattened_hnodes
end

#hnode_indexObject



954
955
956
957
958
959
# File 'lib/metrocot.rb', line 954

# Node => position map, built on first use via #build_hnode_index when
# @hnode_index is still nil/false.
#
# @return [Hash] the contents of @hnode_index
def hnode_index
  build_hnode_index unless @hnode_index
  @hnode_index
end

#hnode_succ_indexObject



962
963
964
965
966
967
# File 'lib/metrocot.rb', line 962

# Node => "position just past this node's subtree" map, built on first use.
#
# The map is filled by #build_hnode_index (through #flatten_hnodes); there
# is no separate builder for it, so that is what the lazy path must call.
#
# @return [Hash] the contents of @hnode_succ_index
def hnode_succ_index
  build_hnode_index unless @hnode_succ_index
  @hnode_succ_index
end

#log(s) ⇒ Object



900
901
902
# File 'lib/metrocot.rb', line 900

# Forwards +s+ to +mcot.log+ when this scraper is verbose; otherwise does
# nothing.
#
# @param s [Object] message to log
# @return [Object, nil] result of +mcot.log+, or nil when not verbose
def log( s )
  return unless @verbose
  mcot.log( s )
end

#scanner_by_name(name) ⇒ Object

returns the scanner declared with that name when the metrocot was created



1055
1056
1057
# File 'lib/metrocot.rb', line 1055

# Looks up a scanner by delegating to +mcot.scanner_by_name+.
#
# @param name [Object] scanner name, passed through unchanged
# @return [Object] whatever +mcot.scanner_by_name+ returns
def scanner_by_name( name )
  mcot.scanner_by_name( name )
end