Class: ContentScrapper
- Inherits:
- Object (hierarchy: Object → ContentScrapper)
- Defined in:
- lib/content_scrapper.rb
Class Attribute Summary collapse
-
.default ⇒ Object
Returns the value of attribute default.
-
.default_config_file ⇒ Object
Returns the value of attribute default_config_file.
Instance Attribute Summary collapse
-
#content_mappings ⇒ Object
readonly
Returns the value of attribute content_mappings.
Class Method Summary collapse
- .create_new_default(*args) ⇒ Object
Instance Method Summary collapse
- #clean_content(content) ⇒ Object
- #content_mapping(&block) ⇒ Object
-
#initialize(scrapper_config_file = nil) ⇒ ContentScrapper
constructor
A new instance of ContentScrapper.
- #loofah_tags(scrap_type) ⇒ Object
- #missing_url_matcher(&block) ⇒ Object
- #rescue_scrapping(&block) ⇒ Object
- #sanitize_tags(&sanitize_settings) ⇒ Object
- #scrap_content(url, options = {}) ⇒ Object
- #set_as_default ⇒ Object
Constructor Details
#initialize(scrapper_config_file = nil) ⇒ ContentScrapper
Returns a new instance of ContentScrapper.
24 25 26 27 28 |
# File 'lib/content_scrapper.rb', line 24
# Builds a scrapper with an empty mapping list and, when a configuration
# file is available, evaluates that file in the context of this instance.
#
# @param scrapper_config_file [String, nil] path of the configuration file;
#   when nil, ContentScrapper.default_config_file is used instead
def initialize(scrapper_config_file = nil)
  @content_mappings = []
  config_file = scrapper_config_file || ContentScrapper.default_config_file
  return if config_file.nil?

  # NOTE(review): the file is executed as Ruby code via instance_eval, so it
  # must come from a trusted location. The path is passed as the second
  # argument so that errors raised by the config report its file name.
  instance_eval(File.read(config_file), config_file)
end
Class Attribute Details
.default ⇒ Object
Returns the value of attribute default.
10 11 12 |
# File 'lib/content_scrapper.rb', line 10
# Reader for the class-level +default+ attribute.
#
# @return [Object] the current value of @default (nil until assigned)
def default
  @default
end
.default_config_file ⇒ Object
Returns the value of attribute default_config_file.
10 11 12 |
# File 'lib/content_scrapper.rb', line 10
# Reader for the class-level +default_config_file+ attribute, which
# #initialize falls back to when no configuration file is passed.
#
# @return [Object] the current value of @default_config_file (nil until assigned)
def default_config_file
  @default_config_file
end
Instance Attribute Details
#content_mappings ⇒ Object (readonly)
Returns the value of attribute content_mappings.
22 23 24 |
# File 'lib/content_scrapper.rb', line 22
# Read-only accessor for the mappings registered through #content_mapping.
#
# @return [Array] the array held in @content_mappings
def content_mappings
  @content_mappings
end
Class Method Details
.create_new_default(*args) ⇒ Object
13 14 15 |
# File 'lib/content_scrapper.rb', line 13
# Instantiates a new object with the given arguments and stores it as the
# class-level default.
#
# @param args [Array] arguments forwarded unchanged to +new+
# @return [Object] the newly created instance
def create_new_default(*args)
  # The explicit receiver is required here: a bare `default = ...` would
  # create a local variable instead of calling the attribute writer.
  self.default = new(*args)
end
Instance Method Details
#clean_content(content) ⇒ Object
36 37 38 |
# File 'lib/content_scrapper.rb', line 36
# Runs the configured cleaner over +content+.
#
# @param content [Object] the content to clean
# @return [Object] +content+ unchanged when no cleaner block has been set,
#   otherwise whatever the cleaner block returns
def clean_content(content)
  return content if @content_cleaner_block.nil?

  @content_cleaner_block.call(content)
end
#content_mapping(&block) ⇒ Object
30 31 32 33 34 |
# File 'lib/content_scrapper.rb', line 30
# Registers a new ContentMapping. The block is evaluated with the new
# mapping as +self+, so it can call the mapping's own methods directly.
#
# @yield configuration block, run via instance_eval on the new mapping
# @return [Array] the list of mappings, including the one just added
def content_mapping(&block)
  mapping = ContentMapping.new
  mapping.instance_eval(&block)
  @content_mappings << mapping
end
#loofah_tags(scrap_type) ⇒ Object
47 48 49 50 51 52 |
# File 'lib/content_scrapper.rb', line 47
# Configures content cleaning through Loofah: the content is parsed as a
# document, scrubbed with +scrap_type+ and converted back to a string.
#
# @param scrap_type [Object] scrubber argument handed to Loofah's +scrub!+
# @return [Proc] the cleaner lambda that was installed
def loofah_tags(scrap_type)
  cleaner = lambda do |content|
    # Loaded lazily so loofah is only needed when this cleaner is used.
    require 'loofah'
    Loofah.document(content).scrub!(scrap_type).to_s
  end
  # #clean_content reads @content_cleaner_block, so the lambda has to be
  # stored there to take effect. @content_scrapper_block is still assigned
  # to stay compatible with anything that reads the original variable.
  @content_scrapper_block = cleaner
  @content_cleaner_block = cleaner
end
#missing_url_matcher(&block) ⇒ Object
77 78 79 |
# File 'lib/content_scrapper.rb', line 77
# Stores the block that #scrap_content calls, with the url, when no
# content mapping matches that url.
#
# @yieldparam url [Object] the url that matched no mapping
# @return [Proc, nil] the stored block (nil when called without a block)
def missing_url_matcher(&block)
  @missing_url_matcher_handler_block = block
end
#rescue_scrapping(&block) ⇒ Object
73 74 75 |
# File 'lib/content_scrapper.rb', line 73
# Stores the block that #scrap_content calls when scrapping a url raises.
#
# @yieldparam exception [Exception] the error that was rescued
# @yieldparam url [Object] the url being scrapped when it was raised
# @return [Proc, nil] the stored block (nil when called without a block)
def rescue_scrapping(&block)
  @scrapping_exception_handler_block = block
end
#sanitize_tags(&sanitize_settings) ⇒ Object
40 41 42 43 44 45 |
# File 'lib/content_scrapper.rb', line 40
# Configures content cleaning through the sanitize gem. The given block is
# called each time content is cleaned and its result is passed to
# Sanitize.clean as the settings argument.
#
# @yieldreturn [Object] settings handed to Sanitize.clean
# @return [Proc] the cleaner lambda that was installed
def sanitize_tags(&sanitize_settings)
  @content_cleaner_block = lambda do |content|
    # Loaded lazily so sanitize is only needed when this cleaner is used.
    require 'sanitize'
    Sanitize.clean(content, sanitize_settings.call)
  end
end
#scrap_content(url, options = {}) ⇒ Object
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
# File 'lib/content_scrapper.rb', line 54
# Scraps the content of +url+ using the first content mapping that matches.
#
# @param url [Object] the url to scrap; it is opened with Kernel.open
#   unless a page is supplied through +options+
# @param options [Hash] optional settings
# @option options [Object] :use_page page content to parse instead of
#   fetching +url+
# @return [Object, nil] the result of the matching mapping's scrap_content;
#   nil when the matching mapping has no xpaths, when scrapping raised, or
#   when no mapping matched
def scrap_content(url, options = {})
  content_mappings.each do |content_mapping|
    next unless content_mapping.matches_url?(url)
    return nil if content_mapping.content_xpaths_list.empty?

    begin
      # NOTE(review): Kernel.open treats strings starting with "|" as a
      # command to run, so +url+ must not come from untrusted input.
      doc = Nokogiri::HTML(options[:use_page] || Kernel.open(url))
      return content_mapping.scrap_content(doc, self)
    rescue StandardError => e
      # Only StandardError is rescued so that interrupts and exit requests
      # still propagate to the caller.
      @scrapping_exception_handler_block.call(e, url) unless @scrapping_exception_handler_block.nil?
      return nil
    end
  end
  @missing_url_matcher_handler_block.call(url) unless @missing_url_matcher_handler_block.nil?
  nil
end
#set_as_default ⇒ Object
18 19 20 |
# File 'lib/content_scrapper.rb', line 18
# Makes this instance the class-level default scrapper.
#
# @return [Object] the receiver, as returned by the assignment
def set_as_default
  ContentScrapper.default = self
end