Class: Newscrapi::Mapping
- Inherits: Object
- Inheritance chain: Object → Newscrapi::Mapping
- Defined in:
- lib/newscrapi/mapping.rb
Instance Attribute Summary
-
#content_xpaths_list ⇒ Object
readonly
Returns the value of attribute content_xpaths_list.
-
#url_pattern_regexp ⇒ Object
readonly
Returns the value of attribute url_pattern_regexp.
Instance Method Summary
- #content_at(content_xpath) ⇒ Object
- #iconv(args) ⇒ Object
-
#initialize ⇒ Mapping
constructor
A new instance of Mapping.
-
#matches_url?(url) ⇒ Boolean
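Matches url against the stored URL pattern (`url =~ @url_pattern_regexp`). Note: the docstring picked up for this method appears to be commented-out code rather than a description: `def suppose_encoding(encoding = nil); return @supposed_encoding if encoding.nil?; @supposed_encoding = encoding; end`.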
- #scrap_content(obj, content_scrapper = nil) ⇒ Object
- #url_pattern(pattern) ⇒ Object
Constructor Details
#initialize ⇒ Mapping
Returns a new instance of Mapping.
8 9 10 |
# File 'lib/newscrapi/mapping.rb', line 8 def initialize @content_xpaths_list = [] end |
Instance Attribute Details
#content_xpaths_list ⇒ Object (readonly)
Returns the value of attribute content_xpaths_list.
6 7 8 |
# File 'lib/newscrapi/mapping.rb', line 6 def content_xpaths_list @content_xpaths_list end |
#url_pattern_regexp ⇒ Object (readonly)
Returns the value of attribute url_pattern_regexp.
6 7 8 |
# File 'lib/newscrapi/mapping.rb', line 6 def url_pattern_regexp @url_pattern_regexp end |
Instance Method Details
#content_at(content_xpath) ⇒ Object
17 18 19 |
# File 'lib/newscrapi/mapping.rb', line 17 def content_at(content_xpath) @content_xpaths_list << content_xpath end |
#iconv(args) ⇒ Object
21 22 23 24 |
# File 'lib/newscrapi/mapping.rb', line 21 def iconv(args) suppose_encoding(args[:from]) convert_to(args[:to]) end |
#matches_url?(url) ⇒ Boolean
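Matches url against the stored URL pattern by evaluating `url =~ @url_pattern_regexp`. For a String url and a Regexp pattern, `=~` yields the offset of the match or nil, so the result is truthy/falsy rather than strictly true/false.
Note: the docstring picked up for this method appears to be commented-out code preceding it in the source, not a description of matches_url?:
def suppose_encoding(encoding = nil)
  return @supposed_encoding if encoding.nil?
  @supposed_encoding = encoding
end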
33 34 35 |
# File 'lib/newscrapi/mapping.rb', line 33 def matches_url?(url) url =~ @url_pattern_regexp end |
#scrap_content(obj, content_scrapper = nil) ⇒ Object
37 38 39 40 41 42 43 44 45 46 47 48 |
# File 'lib/newscrapi/mapping.rb', line 37 def scrap_content(obj, content_scrapper = nil) doc = Newscrapi::Scrapper.parse_page(obj) @content_xpaths_list.each do |content_xpath| content_section = doc.xpath(content_xpath) if content_section.count > 0 content = content_section.to_a.join("\n") content = content_scrapper.clean_content(content) unless content_scrapper.nil? return content end end nil end |
#url_pattern(pattern) ⇒ Object
12 13 14 15 |
# File 'lib/newscrapi/mapping.rb', line 12 def url_pattern(pattern) @url_pattern_regexp = pattern.class == String ? Regexp.compile("^#{Regexp.escape(pattern).gsub('\*','.*')}$") : pattern end |