Class: Sc::NewUriSelector

Inherits:
Selector
Defined in:
lib/scrappy/extractor/selectors/new_uri.rb

Instance Method Summary

Methods inherited from Selector

#select

Methods included from Scrappy::Formats

#format

Instance Method Details

#filter(doc) ⇒ Object



Source listing (lines 3–38 of the file):
# File 'lib/scrappy/extractor/selectors/new_uri.rb', line 3

# Composes a new URI for every candidate value found in +doc+ and returns one
# result hash per candidate.
#
# Candidates are doc[:content][attribute] for each entry of sc.attribute, or
# doc[:value] alone when no attribute is configured. A candidate that already
# starts with "http:" or "https:" is kept as-is and only gets the suffix
# appended. Any other candidate yields prefix + variable + suffix, where the
# variable is:
# * a running counter kept per resolved prefix, when sc.sequence is "true";
# * the candidate passed through String#underscore, when sc.downcase is "true";
# * the candidate passed through String#wikify otherwise.
# (#underscore and #wikify are String helpers defined outside this method.)
#
# @param doc [Hash] fragment being filtered; :content, :value and :uri are read
# @return [Array<Hash>] one hash per candidate with the keys :uri, :content,
#   :value, :attribute and :nofollow
def filter(doc)
  # Pair each candidate value with the attribute it was read from; the
  # attribute is nil when the fragment's own value is used.
  candidates =
    if sc.attribute.first
      sc.attribute.map { |name| [doc[:content][name], name] }
    else
      [[doc[:value], nil]]
    end

  # Counters for sequence-numbered URIs, keyed by resolved prefix. Stored on
  # the instance, so numbering continues across calls.
  @indexes ||= Hash.new(0)

  # A bare "http://" or "https://" prefix is used verbatim. Anything else is
  # resolved against the document URI: merged when it is itself an http(s)
  # URI, appended to the document URI otherwise.
  base = sc.prefix.first.to_s
  unless ["http://", "https://"].include?(base)
    base =
      if base =~ /\Ahttp\:/ || base =~ /\Ahttps\:/
        URI.parse(doc[:uri]).merge(base).to_s
      else
        "#{doc[:uri]}#{base}"
      end
  end
  tail = sc.suffix.first.to_s

  # Links are marked nofollow unless sc.follow is exactly "true".
  skip_follow = sc.follow.first != "true"

  candidates.map do |value, name|
    text = value.to_s

    target =
      if text =~ /\Ahttp\:/ || text =~ /\Ahttps\:/
        "#{text}#{tail}"
      else
        variable =
          if sc.sequence.first.to_s == "true"
            @indexes[base] += 1
          elsif sc.downcase.first.to_s == "true"
            text.underscore
          else
            text.wikify
          end
        "#{base}#{variable}#{tail}"
      end

    { :uri => target, :content => doc[:content], :value => target, :attribute => name, :nofollow => skip_follow }
  end
end