Class: Newscrapi::Scrapper
- Inherits: Object
- Defined in:
  lib/newscrapi/scrapper.rb,
  lib/newscrapi/testing.rb
Class Attribute Summary
- .default ⇒ Object
  Returns the value of attribute default.
- .default_config_file ⇒ Object
  Returns the value of attribute default_config_file.
Instance Attribute Summary
- #content_mappings ⇒ Object (readonly)
  Returns the value of attribute content_mappings.
- #missing_content_handler_block ⇒ Object (readonly)
  Returns the value of attribute missing_content_handler_block.
- #missing_url_matcher_handler_block ⇒ Object (readonly)
  Returns the value of attribute missing_url_matcher_handler_block.
- #scrapping_exception_handler_block ⇒ Object (readonly)
  Returns the value of attribute scrapping_exception_handler_block.
Class Method Summary
- .create_new_default(*args) ⇒ Object
- .parse_page(obj) ⇒ Object
Instance Method Summary
- #clean_content(content) ⇒ Object
- #content_mapping(&block) ⇒ Object
- #encode_to(encoding = nil) ⇒ Object
- #initialize ⇒ Scrapper (constructor)
  A new instance of Scrapper.
- #loofah_tags(scrap_type) ⇒ Object
- #matching_content_mapper(url) ⇒ Object
- #missing_content(&block) ⇒ Object
- #missing_url_matcher(&block) ⇒ Object
- #old_initialize ⇒ Scrapper
  A new instance of Scrapper.
- #report_to_stderr ⇒ Object
- #rescue_scrapping(&block) ⇒ Object
- #sanitize_tags(&sanitize_settings) ⇒ Object
- #scrap_content(url, options = {}) ⇒ Object
- #set_as_default ⇒ Object
- #testing_report_to_stderr ⇒ Object
Constructor Details
#initialize ⇒ Scrapper
Returns a new instance of Scrapper.
# File 'lib/newscrapi/scrapper.rb', line 33

def initialize(scrapper_config_file = nil)
  @content_mappings = []
  config_file = scrapper_config_file || Newscrapi::Scrapper.default_config_file
  self.instance_eval(File.read(config_file), config_file) unless config_file.nil?
end
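A minimal construction sketch for illustration; the config file path below is hypothetical, and with no argument the scrapper falls back to Newscrapi::Scrapper.default_config_file:

# Load mappings from a scrapper config file (hypothetical path).
scrapper = Newscrapi::Scrapper.new('config/scrapper.rb')

# With no argument, Newscrapi::Scrapper.default_config_file is used if set.
scrapper = Newscrapi::Scrapper.new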
Class Attribute Details
.default ⇒ Object
Returns the value of attribute default.
# File 'lib/newscrapi/scrapper.rb', line 13

def default
  @default
end
.default_config_file ⇒ Object
Returns the value of attribute default_config_file.
# File 'lib/newscrapi/scrapper.rb', line 13

def default_config_file
  @default_config_file
end
Instance Attribute Details
#content_mappings ⇒ Object (readonly)
Returns the value of attribute content_mappings.
# File 'lib/newscrapi/scrapper.rb', line 25

def content_mappings
  @content_mappings
end
#missing_content_handler_block ⇒ Object (readonly)
Returns the value of attribute missing_content_handler_block.
# File 'lib/newscrapi/scrapper.rb', line 25

def missing_content_handler_block
  @missing_content_handler_block
end
#missing_url_matcher_handler_block ⇒ Object (readonly)
Returns the value of attribute missing_url_matcher_handler_block.
# File 'lib/newscrapi/scrapper.rb', line 25

def missing_url_matcher_handler_block
  @missing_url_matcher_handler_block
end
#scrapping_exception_handler_block ⇒ Object (readonly)
Returns the value of attribute scrapping_exception_handler_block.
# File 'lib/newscrapi/scrapper.rb', line 25

def scrapping_exception_handler_block
  @scrapping_exception_handler_block
end
Class Method Details
.create_new_default(*args) ⇒ Object
# File 'lib/newscrapi/scrapper.rb', line 16

def create_new_default(*args)
  self.default = self.new(*args)
end
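A usage sketch: arguments are forwarded to .new, so a (hypothetical) config file path can be passed; the resulting instance becomes the class-wide default:

Newscrapi::Scrapper.create_new_default('config/scrapper.rb') # hypothetical path
Newscrapi::Scrapper.default # => the newly created instance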
.parse_page(obj) ⇒ Object
# File 'lib/newscrapi/scrapper.rb', line 28

def self.parse_page(obj)
  return obj if obj.class == Nokogiri::HTML::Document
  Nokogiri::HTML(obj)
end
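A small sketch of the two accepted inputs: a raw HTML string is parsed with Nokogiri, while an already parsed document is returned untouched:

doc = Newscrapi::Scrapper.parse_page('<html><body><p>Hello</p></body></html>')
Newscrapi::Scrapper.parse_page(doc) # returns doc unchanged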
Instance Method Details
#clean_content(content) ⇒ Object
# File 'lib/newscrapi/scrapper.rb', line 50

def clean_content(content)
  @content_cleaner_block.nil? ? content : @content_cleaner_block.call(content)
end
#content_mapping(&block) ⇒ Object
# File 'lib/newscrapi/scrapper.rb', line 44

def content_mapping(&block)
  new_mapping = Newscrapi::Mapping.new
  new_mapping.instance_eval(&block)
  @content_mappings << new_mapping
end
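A sketch of a mapping declaration as it might appear in a scrapper config file; note that the DSL calls url_pattern and content_at are assumptions about Newscrapi::Mapping's interface, not confirmed by this page:

content_mapping do
  # Hypothetical Newscrapi::Mapping DSL calls.
  url_pattern %r{^https?://(www\.)?example\.com/}
  content_at '//div[@id="article-body"]'
end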
#encode_to(encoding = nil) ⇒ Object
# File 'lib/newscrapi/scrapper.rb', line 39

def encode_to(encoding = nil)
  @encode_to = encoding unless encoding.nil?
  @encode_to
end
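As the source above shows, the method doubles as setter and getter; a brief sketch:

scrapper.encode_to('UTF-8') # store the target encoding
scrapper.encode_to          # => "UTF-8"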
#loofah_tags(scrap_type) ⇒ Object
# File 'lib/newscrapi/scrapper.rb', line 61

def loofah_tags(scrap_type)
  @content_scrapper_block = lambda do |content|
    require 'loofah'
    Loofah.document(content).scrub!(scrap_type).to_s
  end
end
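A usage sketch with one of Loofah's built-in scrubbers (:prune); any scrubber symbol Loofah accepts should work the same way:

scrapper.loofah_tags(:prune) # strip unsafe markup from scraped pages via Loofah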
#matching_content_mapper(url) ⇒ Object
# File 'lib/newscrapi/scrapper.rb', line 68

def matching_content_mapper(url)
  content_mappings.each { |content_mapping| return content_mapping if content_mapping.matches_url?(url) }
  nil
end
#missing_content(&block) ⇒ Object
# File 'lib/newscrapi/scrapper.rb', line 102

def missing_content(&block)
  @missing_content_handler_block = block
end
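A sketch of registering a handler; #scrap_content calls it with the URL whenever scraping yields no content:

scrapper.missing_content do |url|
  STDERR << "empty content for #{url}\n"
end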
#missing_url_matcher(&block) ⇒ Object
# File 'lib/newscrapi/scrapper.rb', line 98

def missing_url_matcher(&block)
  @missing_url_matcher_handler_block = block
end
#old_initialize ⇒ Scrapper
Returns a new instance of Scrapper.
# File 'lib/newscrapi/testing.rb', line 5

def initialize(scrapper_config_file = nil)
  @content_mappings = []
  config_file = scrapper_config_file || Newscrapi::Scrapper.default_config_file
  self.instance_eval(File.read(config_file), config_file) unless config_file.nil?
end
#report_to_stderr ⇒ Object
# File 'lib/newscrapi/scrapper.rb', line 106

def report_to_stderr
  rescue_scrapping do |exception, url|
    STDERR << "error occured during scrapping page #{url}\n"
    STDERR << "#{exception.message}\n"
    STDERR << exception.backtrace.join("\n")
  end
  missing_url_matcher do |url|
    STDERR << "missing matcher for #{url}\n"
  end
  missing_content do |url|
    STDERR << "empty content for #{url}\n"
  end
end
#rescue_scrapping(&block) ⇒ Object
# File 'lib/newscrapi/scrapper.rb', line 94

def rescue_scrapping(&block)
  @scrapping_exception_handler_block = block
end
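A sketch of registering an exception handler; #scrap_content calls it with the raised exception and the URL being scraped:

scrapper.rescue_scrapping do |exception, url|
  STDERR << "failed to scrap #{url}: #{exception.message}\n"
end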
#sanitize_tags(&sanitize_settings) ⇒ Object
# File 'lib/newscrapi/scrapper.rb', line 54

def sanitize_tags(&sanitize_settings)
  @content_cleaner_block = lambda do |content|
    require 'sanitize'
    Sanitize.clean(content, sanitize_settings.call())
  end
end
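A usage sketch: the block must return a configuration hash for the sanitize gem; the whitelist below is only an example:

scrapper.sanitize_tags do
  { :elements => %w[p a em strong], :attributes => { 'a' => %w[href] } }
end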
#scrap_content(url, options = {}) ⇒ Object
# File 'lib/newscrapi/scrapper.rb', line 73

def scrap_content(url, options = {})
  if (content_mapping = matching_content_mapper(url)).nil?
    @missing_url_matcher_handler_block.call(url) unless @missing_url_matcher_handler_block.nil?
    return nil
  end
  return nil if content_mapping.content_xpaths_list.empty?
  begin
    use_page = ensure_encoding(options[:use_page] || Kernel.open(url).read)
    doc = Newscrapi::Scrapper.parse_page(use_page)
    scrapped_content = content_mapping.scrap_content(doc, content_scrapper = self)
    @missing_content_handler_block.call(url) if !@missing_content_handler_block.nil? and scrapped_content.nil?
    return scrapped_content
  rescue Exception
    @scrapping_exception_handler_block.call($!, url) unless @scrapping_exception_handler_block.nil?
    return nil
  end
  nil
end
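A sketch of typical calls (the URL is hypothetical); the :use_page option supplies already downloaded HTML so the URL is not fetched again:

text = scrapper.scrap_content('http://www.example.com/news/1')
text = scrapper.scrap_content('http://www.example.com/news/1', :use_page => downloaded_html)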
#set_as_default ⇒ Object
# File 'lib/newscrapi/scrapper.rb', line 21

def set_as_default
  Newscrapi::Scrapper.default = self
end
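A sketch of making one instance the class-wide default (the config path is hypothetical):

scrapper = Newscrapi::Scrapper.new('config/scrapper.rb')
scrapper.set_as_default
Newscrapi::Scrapper.default # => scrapper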
#testing_report_to_stderr ⇒ Object
# File 'lib/newscrapi/testing.rb', line 12

def testing_report_to_stderr
  rescue_scrapping do |exception, url|
    # extended_exception = Exception.new("error occured during scrapping page #{url}: #{exception.message}")
    # extended_exception.set_backtrace(exception.backtrace)
    raise exception # extended_exception
  end
end