Class: Newscrapi::Scrapper

Inherits:
Object
  • Object
show all
Defined in:
lib/newscrapi/scrapper.rb,
lib/newscrapi/testing.rb

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(scrapper_config_file = nil) ⇒ Scrapper

Returns a new instance of Scrapper.



33
34
35
36
37
# File 'lib/newscrapi/scrapper.rb', line 33

# Builds a scrapper, optionally configured from a Ruby DSL file.
#
# The file is read and run through +instance_eval+, so its contents execute
# as Ruby code with the new scrapper as +self+; only load trusted files.
#
# @param scrapper_config_file [String, nil] path of the configuration file;
#   when it is nil/false, Newscrapi::Scrapper.default_config_file is used,
#   and when that is nil as well nothing is loaded
def initialize(scrapper_config_file = nil)
  @content_mappings = []
  config_file = scrapper_config_file || Newscrapi::Scrapper.default_config_file
  return if config_file.nil?

  # The path is passed as the second argument so errors raised by the
  # configuration are reported against that file.
  instance_eval(File.read(config_file), config_file)
end

Class Attribute Details

.default ⇒ Object

Returns the value of attribute default.



13
14
15
# File 'lib/newscrapi/scrapper.rb', line 13

# Class-level reader for the +default+ attribute.
# NOTE(review): this listing appears to be the rendered form of an attribute
# accessor rather than hand-written code — confirm against scrapper.rb.
#
# @return [Object, nil] the value held in +@default+; nil if never assigned
def default
  @default
end

.default_config_file ⇒ Object

Returns the value of attribute default_config_file.



13
14
15
# File 'lib/newscrapi/scrapper.rb', line 13

# Class-level reader for the +default_config_file+ attribute.
# The constructor on this page falls back to this value when it is called
# without a configuration file.
#
# @return [Object, nil] the value held in +@default_config_file+; nil if never assigned
def default_config_file
  @default_config_file
end

Instance Attribute Details

#content_mappings ⇒ Object (readonly)

Returns the value of attribute content_mappings.



25
26
27
# File 'lib/newscrapi/scrapper.rb', line 25

# Reader for the list of registered content mappings.
# The constructor initializes it to an empty array and #content_mapping
# appends to it.
#
# @return [Array] the mappings in registration order
def content_mappings
  @content_mappings
end

#missing_content_handler_block ⇒ Object (readonly)

Returns the value of attribute missing_content_handler_block.



25
26
27
# File 'lib/newscrapi/scrapper.rb', line 25

# Reader for the handler stored by #missing_content.
#
# @return [Proc, nil] the stored block; nil if none was registered
def missing_content_handler_block
  @missing_content_handler_block
end

#missing_url_matcher_handler_block ⇒ Object (readonly)

Returns the value of attribute missing_url_matcher_handler_block.



25
26
27
# File 'lib/newscrapi/scrapper.rb', line 25

# Reader for the handler stored by #missing_url_matcher.
#
# @return [Proc, nil] the stored block; nil if none was registered
def missing_url_matcher_handler_block
  @missing_url_matcher_handler_block
end

#scrapping_exception_handler_block ⇒ Object (readonly)

Returns the value of attribute scrapping_exception_handler_block.



25
26
27
# File 'lib/newscrapi/scrapper.rb', line 25

# Reader for the handler stored by #rescue_scrapping.
#
# @return [Proc, nil] the stored block; nil if none was registered
def scrapping_exception_handler_block
  @scrapping_exception_handler_block
end

Class Method Details

.create_new_default(*args) ⇒ Object



16
17
18
# File 'lib/newscrapi/scrapper.rb', line 16

# Instantiates the receiver with the given arguments and stores the new
# instance through the +default=+ writer.
#
# @param args [Array] forwarded unchanged to +new+
# @return [Object] the freshly created instance
def create_new_default(*args)
  instance = new(*args)
  self.default = instance
end

.parse_page(obj) ⇒ Object



28
29
30
31
# File 'lib/newscrapi/scrapper.rb', line 28

# Turns +obj+ into a parsed HTML document.
#
# An object that already is a Nokogiri::HTML::Document (or an instance of a
# subclass of it) is returned untouched; anything else is handed to
# Nokogiri::HTML for parsing.
#
# @param obj [Object] an already parsed document, or a page source accepted
#   by Nokogiri::HTML
# @return [Object] +obj+ itself when already parsed, otherwise the result of
#   Nokogiri::HTML(obj)
def self.parse_page(obj)
  # is_a? rather than an exact class comparison, so subclass instances are
  # not passed to the parser a second time.
  return obj if obj.is_a?(Nokogiri::HTML::Document)

  Nokogiri::HTML(obj)
end

Instance Method Details

#clean_content(content) ⇒ Object



50
51
52
# File 'lib/newscrapi/scrapper.rb', line 50

# Runs +content+ through the configured cleaner block, if there is one.
#
# @param content [Object] the scrapped content to clean
# @return [Object] +content+ unchanged when no cleaner block is set,
#   otherwise the cleaner block's result
def clean_content(content)
  cleaner = @content_cleaner_block
  return content if cleaner.nil?

  cleaner.call(content)
end

#content_mapping(&block) ⇒ Object



44
45
46
47
48
# File 'lib/newscrapi/scrapper.rb', line 44

# Registers a new mapping described by +block+.
#
# The block is evaluated with +instance_eval+ on a fresh Newscrapi::Mapping,
# so it runs with that mapping as +self+.
#
# @yield mapping definition, executed in the context of the new mapping
# @return [Array] the list of mappings, including the new one
def content_mapping(&block)
  mapping = Newscrapi::Mapping.new.tap { |fresh| fresh.instance_eval(&block) }
  @content_mappings << mapping
end

#encode_to(encoding = nil) ⇒ Object



39
40
41
42
# File 'lib/newscrapi/scrapper.rb', line 39

# Combined reader/writer for the target encoding.
#
# Called with a non-nil argument it stores that value; called without an
# argument (or with nil) it only reads the stored one.
#
# @param encoding [Object, nil] the encoding to store, or nil to just read
# @return [Object, nil] the encoding currently stored
def encode_to(encoding = nil)
  return @encode_to if encoding.nil?

  @encode_to = encoding
end

#loofah_tags(scrap_type) ⇒ Object



61
62
63
64
65
66
# File 'lib/newscrapi/scrapper.rb', line 61

# Configures content cleaning through Loofah.
#
# The lambda is stored in +@content_cleaner_block+, the variable that
# #clean_content reads and #sanitize_tags also writes, so the Loofah scrubber
# is actually applied. It is stored in +@content_scrapper_block+ as well,
# the variable used previously, in case other code reads it.
#
# @param scrap_type [Object] scrubber passed to Loofah's +scrub!+
# @return [Proc] the lambda that was installed
def loofah_tags(scrap_type)
  @content_cleaner_block = @content_scrapper_block = lambda do |content|
    # Loaded lazily so loofah is only needed when this cleaner is used.
    require 'loofah'
    Loofah.document(content).scrub!(scrap_type).to_s
  end
end

#matching_content_mapper(url) ⇒ Object



68
69
70
71
# File 'lib/newscrapi/scrapper.rb', line 68

# Looks up the mapping responsible for +url+.
#
# @param url [Object] the address handed to each mapping's +matches_url?+
# @return [Object, nil] the first registered mapping whose +matches_url?+
#   is truthy for +url+, or nil when none matches
def matching_content_mapper(url)
  content_mappings.find { |mapping| mapping.matches_url?(url) }
end

#missing_content(&block) ⇒ Object



102
103
104
# File 'lib/newscrapi/scrapper.rb', line 102

# Stores +block+ as the handler for pages that yield no content.
# #scrap_content calls it with the url when the mapping returns nil.
# Calling this without a block stores nil, i.e. removes the handler.
#
# @yieldparam url [Object] the url whose content was empty
# @return [Proc, nil] the stored block
def missing_content(&block)
  @missing_content_handler_block = block
end

#missing_url_matcher(&block) ⇒ Object



98
99
100
# File 'lib/newscrapi/scrapper.rb', line 98

# Stores +block+ as the handler for urls no mapping matches.
# #scrap_content calls it with the url when #matching_content_mapper
# returns nil. Calling this without a block stores nil.
#
# @yieldparam url [Object] the url that had no matching mapping
# @return [Proc, nil] the stored block
def missing_url_matcher(&block)
  @missing_url_matcher_handler_block = block
end

#old_initialize(scrapper_config_file = nil) ⇒ Scrapper

Returns a new instance of Scrapper.

Returns:

  • (Scrapper)

    a new instance of Scrapper



5
6
7
8
9
# File 'lib/newscrapi/testing.rb', line 5

# Sets up an empty mapping list and, when a configuration file is known,
# evaluates it in the context of the new scrapper.
#
# The configuration is executed as Ruby code via +instance_eval+; only load
# files you trust.
#
# @param scrapper_config_file [String, nil] path of the configuration file;
#   Newscrapi::Scrapper.default_config_file is used when it is nil/false
def initialize(scrapper_config_file = nil)
  @content_mappings = []
  config_file = scrapper_config_file
  config_file ||= Newscrapi::Scrapper.default_config_file
  unless config_file.nil?
    dsl_source = File.read(config_file)
    # The path doubles as the file name reported in errors from the DSL.
    instance_eval(dsl_source, config_file)
  end
end

#report_to_stderr ⇒ Object



106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/newscrapi/scrapper.rb', line 106

# Installs handlers that print scrapping problems to standard error.
#
# Three handlers are registered: one for exceptions raised while scrapping,
# one for urls without a matching mapping and one for pages with no content.
# Output goes through +$stderr+, so it follows a redirected error stream.
#
# @return [Object] whatever the final +missing_content+ registration returns
def report_to_stderr
  rescue_scrapping do |exception, url|
    $stderr << "error occured during scrapping page #{url}\n"
    $stderr << "#{exception.message}\n"
    # backtrace is nil for an exception that was built but never raised
    trace = Array(exception.backtrace)
    # End with a newline so the next message starts on its own line.
    $stderr << trace.join("\n") << "\n" unless trace.empty?
  end

  missing_url_matcher do |url|
    $stderr << "missing matcher for #{url}\n"
  end

  missing_content do |url|
    $stderr << "empty content for #{url}\n"
  end
end

#rescue_scrapping(&block) ⇒ Object



94
95
96
# File 'lib/newscrapi/scrapper.rb', line 94

# Stores +block+ as the handler for exceptions raised during scrapping.
# #scrap_content calls it with the rescued exception and the url.
# Calling this without a block stores nil, i.e. removes the handler.
#
# @yieldparam exception [Exception] the rescued exception
# @yieldparam url [Object] the url that was being scrapped
# @return [Proc, nil] the stored block
def rescue_scrapping(&block)
  @scrapping_exception_handler_block = block
end

#sanitize_tags(&sanitize_settings) ⇒ Object



54
55
56
57
58
59
# File 'lib/newscrapi/scrapper.rb', line 54

# Configures content cleaning through the Sanitize library.
#
# The given block is not run here; it is called each time content is
# cleaned and its result is passed to Sanitize.clean as the settings.
#
# @yieldreturn [Object] the settings handed to Sanitize.clean
# @return [Proc] the lambda installed as the content cleaner
def sanitize_tags(&sanitize_settings)
  @content_cleaner_block = ->(content) do
    # Loaded lazily so sanitize is only needed when this cleaner is used.
    require 'sanitize'
    Sanitize.clean(content, sanitize_settings.call)
  end
end

#scrap_content(url, options = {}) ⇒ Object



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/newscrapi/scrapper.rb', line 73

# Scraps the content of the page at +url+ with the first matching mapping.
#
# The handlers registered through #missing_url_matcher, #missing_content and
# #rescue_scrapping are called, when set, for the corresponding situation.
# Errors raised while fetching, parsing or scrapping are passed to the
# exception handler and turned into a nil result. Only StandardError and
# LoadError (the cleaners load their libraries lazily) are handled this way;
# Interrupt, SystemExit and similar are no longer swallowed.
#
# @param url [Object] address of the page; also used to pick the mapping
# @param options [Hash] +:use_page+ may hold an already fetched page, in
#   which case +url+ is not opened
# @return [Object, nil] the mapping's +scrap_content+ result; nil when no
#   mapping matches, the mapping has no xpaths, or an error was handled
def scrap_content(url, options = {})
  content_mapping = matching_content_mapper(url)
  if content_mapping.nil?
    @missing_url_matcher_handler_block.call(url) unless @missing_url_matcher_handler_block.nil?
    return nil
  end
  return nil if content_mapping.content_xpaths_list.empty?

  begin
    use_page = ensure_encoding(options[:use_page] || read_page_source(url))
    doc = Newscrapi::Scrapper.parse_page(use_page)
    scrapped_content = content_mapping.scrap_content(doc, self)

    @missing_content_handler_block.call(url) if scrapped_content.nil? && !@missing_content_handler_block.nil?
    scrapped_content
  rescue StandardError, LoadError => e
    @scrapping_exception_handler_block.call(e, url) unless @scrapping_exception_handler_block.nil?
    nil
  end
end

# Reads the whole body behind +url+ and closes the handle afterwards.
def read_page_source(url)
  require 'open-uri'
  # URI.open is public since Ruby 2.5 and Kernel#open stopped accepting URLs
  # in Ruby 3.0; older rubies still need Kernel#open.
  opener = URI.respond_to?(:open) ? URI : Kernel
  # NOTE(review): a string that is not a URL still falls through to
  # Kernel#open, as before — which may run a command for input starting
  # with "|". Do not pass untrusted urls.
  opener.open(url, &:read)
end
private :read_page_source

#set_as_default ⇒ Object



21
22
23
# File 'lib/newscrapi/scrapper.rb', line 21

# Makes this scrapper the class-wide default by assigning it to
# Newscrapi::Scrapper.default.
#
# @return [Object] the receiver (the value of the assignment)
def set_as_default
  Newscrapi::Scrapper.default = self
end

#testing_report_to_stderr ⇒ Object



12
13
14
15
16
17
18
# File 'lib/newscrapi/testing.rb', line 12

# Test-mode error handling: registers a scrapping exception handler that
# re-raises the exception it receives instead of reporting it, so failures
# surface to the caller.
#
# @return [Object] whatever +rescue_scrapping+ returns
def testing_report_to_stderr
  rescue_scrapping { |exception, _url| raise exception }
end