Class: Metrocot::TextPattern

Inherits:
BasePattern show all
Defined in:
lib/metrocot.rb

Overview

Matches a certain text string or regex pattern

Instance Attribute Summary

Attributes inherited from BasePattern

#matched, #metrocot, #name, #node_scraper, #pattern_no, #pred, #source, #succ

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from BasePattern

#default_scanner, #dump, #dump_match_map, #log, #log_match_data, #optional, #with_scanned_match_data

Constructor Details

#initialize(source, text) ⇒ TextPattern

Returns a new instance of TextPattern.



565
566
567
568
# File 'lib/metrocot.rb', line 565

# Creates a text pattern.
#
# source:: passed unchanged to the superclass initializer
# text::   the String or Regexp this pattern searches for; stored in @text
def initialize(source, text)
	super source
	@text = text
end

Class Method Details

.parse(s) ⇒ Object



574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
# File 'lib/metrocot.rb', line 574

# Parses a text pattern literal found at the very start of +s+.
#
# Three forms are recognised:
#
# * <tt>$</tt>        - builds a pattern with source "$" and the regexp /[\r\n]/
# * <tt>/.../</tt>    - a regexp literal; <tt>\/</tt> stands for a literal slash
# * <tt>"..."</tt>    - a string literal; <tt>\"</tt> stands for a literal quote
#
# The first argument handed to +new+ is the exact text that was consumed
# (including the delimiters), the second is the String or Regexp to match.
# An unterminated literal is accepted and runs to the end of +s+.
# +s+ itself is never modified.
#
# Returns the new pattern, or nil when +s+ does not start with one of the
# forms above.
def self.parse( s )

	case s[0, 1]
	when "$"
		self.new( "$", /[\r\n]/ )
	when "/"
		src, body = scan_delimited( s, "/" )
		self.new( src, Regexp.compile( body ) )
	when "\""
		src, body = scan_delimited( s, "\"" )
		self.new( src, body )
	end

end

# Internal helper for .parse: scans a literal that opens with +delimiter+ at
# s[0] and returns [consumed_source, unescaped_body].
#
# A backslash followed by the delimiter contributes just the delimiter to the
# body. A backslash followed by any other character (including a second
# backslash) is copied through as a pair, so an escaped backslash directly
# before the closing delimiter no longer hides that delimiter.
def self.scan_delimited( s, delimiter )

	src = delimiter.dup
	body = ""
	pos = 1

	while pos < s.size

		ch = s[pos, 1]

		if ch == delimiter
			# closing delimiter: part of the consumed source, not of the body
			src << ch
			break
		end

		if ch == "\\" && pos + 1 < s.size
			escaped = s[pos + 1, 1]
			if escaped == delimiter
				body << escaped
			else
				body << ch << escaped
			end
			src << ch << escaped
			pos += 2
		else
			body << ch
			src << ch
			pos += 1
		end

	end

	[src, body]

end

Instance Method Details

#description ⇒ Object



570
571
572
# File 'lib/metrocot.rb', line 570

# Returns a short label for this pattern: the word "text" followed by the
# interpolated @text wrapped in double quotes.
def description
	%(text "#{@text}")
end

#each_match(match_range, match_map) ⇒ Object



652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
# File 'lib/metrocot.rb', line 652

# Searches the text nodes of +match_range+ for occurrences of @text (a String
# or a Regexp) and yields each one in document order.
#
# The inherited each_match is invoked first with the same arguments. For every
# occurrence the block receives match_range.crop(...) narrowed to the matched
# characters, together with the match map supplied by with_scanned_match_data.
#
# Returns the first truthy value produced by with_scanned_match_data, or nil
# once the range is exhausted or no further text node exists in the range.
def each_match( match_range, match_map )

	super(match_range, match_map)

	match_index = match_range.start_index
	match_offset = match_range.start_offset

	hnodes = match_range.hnodes

	# true while the cursor (match_index, match_offset) lies before the end of match_range
	in_range = lambda {
		match_index < match_range.end_index ||
			(match_index == match_range.end_index && match_offset < match_range.end_offset)
	}

	while in_range.call

		# move the cursor forward to the next text node
		while in_range.call && ! hnodes[match_index].text?
			log( "not text: ##{match_index} #{hnodes[match_index].class}" )
			match_index += 1
			match_offset = 0
		end

		unless in_range.call && hnodes[match_index].text?
			log( "no match found" )
			return nil
		end

		# the last node of the range only counts up to end_offset
		hnode_text = if match_index == match_range.end_index
			hnodes[match_index].inner_text[0...match_range.end_offset]
		else
			hnodes[match_index].inner_text
		end

		log( "trying text match on: #{hnode_text[match_offset .. -1]}" )

		next_match_offset = hnode_text.index( @text, match_offset )

		# String#index records the match data when searching with a Regexp.
		# Reading it here (instead of re-matching on a substring) keeps the
		# left context, so look-behinds, \b and \B give the same answer as
		# the search that located the match.
		regexp_match = @text.is_a?( Regexp ) ? Regexp.last_match : nil

		if next_match_offset.nil?
			log( "no match found for #{@text}" )
			match_index += 1
			match_offset = 0
			next
		end

		actual_match = regexp_match ? regexp_match[0] : @text

		log( "next text match at #{match_index}.#{next_match_offset}: #{actual_match}" )

		match_start_offset = next_match_offset
		match_end_offset = match_start_offset + actual_match.size

		if match_end_offset >= hnode_text.size
			log( "matched entire string of #{match_end_offset - match_start_offset} chars" )
		else
			log( "matched first #{match_end_offset - match_start_offset} chars" )
		end

		# NOTE(review): the block parameter deliberately keeps the name match_map,
		# as in the original; on Ruby >= 1.9 it shadows the method argument.
		result = with_scanned_match_data( match_map, actual_match ) { |match_map|
			yield( match_range.crop( match_index, match_start_offset, match_index, match_end_offset), match_map )
		}

		return result if result

		# Always move at least one character forward, otherwise a zero-length
		# match would be found again at the same offset forever.
		match_offset = match_end_offset > match_start_offset ? match_end_offset : match_start_offset + 1

	end

	return nil

end

#priority ⇒ Object



636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
# File 'lib/metrocot.rb', line 636

# Returns this pattern's priority as a negative integer:
#
#   unnamed String  => -2      named String  => -4
#   unnamed Regexp  => -3      named Regexp  => -5
#
# "Named" means +name+ is truthy; anything that is not a String is ranked
# like a Regexp.
def priority
	named_penalty = name ? 2 : 0
	literal_rank = @text.is_a?(String) ? -2 : -3
	literal_rank - named_penalty
end