Class: ContentScrapper

Inherits:
Object
  • Object
show all
Defined in:
lib/content_scrapper.rb

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(scrapper_config_file = nil) ⇒ ContentScrapper

Returns a new instance of ContentScrapper.



24
25
26
27
28
# File 'lib/content_scrapper.rb', line 24

# Builds a scrapper with an empty mapping list and, when a configuration
# file is available, evaluates that file in the context of the new instance.
#
# @param scrapper_config_file [String, nil] path of the configuration file;
#   when nil (or false) ContentScrapper.default_config_file is used instead
def initialize(scrapper_config_file = nil)
  @content_mappings = []
  config_file = scrapper_config_file || ContentScrapper.default_config_file
  # Nothing to load when neither an explicit nor a default file is set.
  return if config_file.nil?

  # The path is passed as the second argument so that errors raised by the
  # evaluated code are reported against the configuration file.
  instance_eval(File.read(config_file), config_file)
end

Class Attribute Details

.default ⇒ Object

Returns the value of attribute default.



10
11
12
# File 'lib/content_scrapper.rb', line 10

# Class-level attribute reader: returns whatever is currently stored in the
# @default instance variable.
def default
  @default
end

.default_config_file ⇒ Object

Returns the value of attribute default_config_file.



10
11
12
# File 'lib/content_scrapper.rb', line 10

# Class-level attribute reader: returns whatever is currently stored in the
# @default_config_file instance variable. #initialize falls back to this
# value when it is called without an explicit configuration file.
def default_config_file
  @default_config_file
end

Instance Attribute Details

#content_mappings ⇒ Object (readonly)

Returns the value of attribute content_mappings.



22
23
24
# File 'lib/content_scrapper.rb', line 22

# Read-only accessor for the list of registered mappings. #initialize sets
# it to an empty Array and #content_mapping appends to it.
def content_mappings
  @content_mappings
end

Class Method Details

.create_new_default(*args) ⇒ Object



13
14
15
# File 'lib/content_scrapper.rb', line 13

# Instantiates a scrapper and registers it as the class-wide default.
#
# @param args arguments forwarded unchanged to the constructor
# @return [Object] the newly created instance
def create_new_default(*args)
  instance = new(*args)
  self.default = instance
end

Instance Method Details

#clean_content(content) ⇒ Object



36
37
38
# File 'lib/content_scrapper.rb', line 36

# Runs the configured cleaner over a piece of content.
#
# @param content the content to clean
# @return the result of calling @content_cleaner_block with +content+, or
#   +content+ itself when no cleaner has been configured
def clean_content(content)
  return content if @content_cleaner_block.nil?

  @content_cleaner_block.call(content)
end

#content_mapping(&block) ⇒ Object



30
31
32
33
34
# File 'lib/content_scrapper.rb', line 30

# Registers a new mapping described by the given block.
#
# A fresh ContentMapping is created, the block is evaluated with that
# mapping as +self+, and the mapping is appended to @content_mappings.
#
# @yield configuration code, evaluated in the context of the new mapping
# @return [Array] the list of mappings, including the one just added
def content_mapping(&definition)
  mapping = ContentMapping.new
  mapping.instance_eval(&definition)
  @content_mappings.push(mapping)
end

#loofah_tags(scrap_type) ⇒ Object



47
48
49
50
51
52
# File 'lib/content_scrapper.rb', line 47

# Configures Loofah-based cleaning of scrapped content.
#
# The cleaner is stored in @content_cleaner_block, the variable that
# #clean_content reads and #sanitize_tags also writes. Storing it under any
# other name means #clean_content never applies it.
#
# @param scrap_type the scrubber handed to Loofah's +scrub!+ (presumably one
#   of Loofah's built-in scrubber names such as :prune -- confirm against the
#   Loofah documentation)
# @return [Proc] the lambda that was installed as the content cleaner
def loofah_tags(scrap_type)
  @content_cleaner_block = lambda do |content|
    # Loaded lazily so loofah is only required when this cleaner is used.
    require 'loofah'
    Loofah.document(content).scrub!(scrap_type).to_s
  end
end

#missing_url_matcher(&block) ⇒ Object



77
78
79
# File 'lib/content_scrapper.rb', line 77

# Registers the callback that #scrap_content invokes (with the url) when no
# content mapping matches that url.
#
# @yield [url] handler for urls without a matching mapping
# @return [Proc, nil] the stored handler; nil when called without a block
def missing_url_matcher(&handler)
  @missing_url_matcher_handler_block = handler
end

#rescue_scrapping(&block) ⇒ Object



73
74
75
# File 'lib/content_scrapper.rb', line 73

# Registers the callback that #scrap_content invokes when fetching or
# scrapping a page raises; it is called with the exception and the url.
#
# @yield [exception, url] handler for scrapping failures
# @return [Proc, nil] the stored handler; nil when called without a block
def rescue_scrapping(&handler)
  @scrapping_exception_handler_block = handler
end

#sanitize_tags(&sanitize_settings) ⇒ Object



40
41
42
43
44
45
# File 'lib/content_scrapper.rb', line 40

# Configures Sanitize-based cleaning of scrapped content.
#
# Installs a lambda in @content_cleaner_block (the variable #clean_content
# reads). The settings block is called every time content is cleaned, and
# its result is passed as the second argument to Sanitize.clean.
#
# @yield block returning the settings handed to Sanitize.clean
# @return [Proc] the lambda that was installed as the content cleaner
def sanitize_tags(&sanitize_settings)
  @content_cleaner_block = lambda { |content|
    # Loaded lazily so sanitize is only required when this cleaner is used.
    require 'sanitize'
    Sanitize.clean(content, sanitize_settings.call)
  }
end

#scrap_content(url, options = {}) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/content_scrapper.rb', line 54

# Scraps the content of +url+ using the first content mapping that matches.
#
# The first mapping whose +matches_url?+ accepts the url is used. If it has
# no xpaths the result is nil. Otherwise the page is parsed with Nokogiri
# and handed to the mapping's own +scrap_content+ together with this
# scrapper. When no mapping matches, the handler registered through
# #missing_url_matcher (if any) is called with the url.
#
# Errors raised while fetching, parsing or scrapping are passed to the
# handler registered through #rescue_scrapping (if any) and nil is returned.
# Only StandardError is rescued, so Interrupt, SystemExit and similar
# process-level exceptions propagate to the caller.
#
# @param url the url to scrap
# @param options [Hash] :use_page -- already fetched page content to parse
#   instead of opening +url+
# @return the value returned by the matching mapping, or nil when nothing
#   matched, the mapping has no xpaths, or an error was rescued
def scrap_content(url, options = {})
  mapping = content_mappings.find { |candidate| candidate.matches_url?(url) }

  if mapping.nil?
    @missing_url_matcher_handler_block.call(url) unless @missing_url_matcher_handler_block.nil?
    return nil
  end
  return nil if mapping.content_xpaths_list.empty?

  begin
    # NOTE(review): Kernel.open depends on open-uri's Kernel#open hook to
    # fetch URLs and, on Ruby versions that still support it, runs a
    # subprocess for strings starting with "|". Only pass trusted urls here;
    # consider URI.open once the minimum supported Ruby version allows it.
    doc = Nokogiri::HTML(options[:use_page] || Kernel.open(url))
    mapping.scrap_content(doc, self)
  rescue StandardError => error
    @scrapping_exception_handler_block.call(error, url) unless @scrapping_exception_handler_block.nil?
    nil
  end
end

#set_as_default ⇒ Object



18
19
20
# File 'lib/content_scrapper.rb', line 18

# Registers this instance as the class-wide default by assigning it to
# ContentScrapper.default.
def set_as_default
  ContentScrapper.default = self
end