Class: Newscrapi::Mapping

Inherits:
Object
  • Object
show all
Defined in:
lib/newscrapi/mapping.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Mapping

Returns a new instance of Mapping.



8
9
10
# File 'lib/newscrapi/mapping.rb', line 8

# Builds an empty mapping.
#
# Every instance variable that the methods of this class read is given an
# explicit starting value, so reading one before its setter has run is
# well-defined (and silent under `ruby -w` on interpreters that warn about
# uninitialised instance variables).
def initialize
  @content_xpaths_list = []   # filled by #content_at
  @url_pattern_regexp = nil   # set by #url_pattern
  @supposed_encoding = nil    # set by #suppose_encoding
end

Instance Attribute Details

#content_xpaths_list ⇒ Object (readonly)

Returns the value of attribute content_xpaths_list.



6
7
8
# File 'lib/newscrapi/mapping.rb', line 6

# Read-only accessor for the content XPath list.
#
# @return [Array] the same array object that #initialize creates and
#   #content_at appends to (not a copy)
def content_xpaths_list
  @content_xpaths_list
end

#url_pattern_regexp ⇒ Object (readonly)

Returns the value of attribute url_pattern_regexp.



6
7
8
# File 'lib/newscrapi/mapping.rb', line 6

# Read-only accessor for the pattern stored by #url_pattern.
#
# @return [Object, nil] whatever #url_pattern last stored (a Regexp when it
#   was given a String); nil if #url_pattern has not been called
def url_pattern_regexp
  @url_pattern_regexp
end

Instance Method Details

#content_at(content_xpath) ⇒ Object



17
18
19
# File 'lib/newscrapi/mapping.rb', line 17

# Registers one more candidate location for the page content.
#
# Candidates are kept in registration order; #scrap_content walks them in
# that order and passes each one to the parsed document's `xpath` method.
#
# @param content_xpath [Object] the expression to register (presumably an
#   XPath string — confirm against callers)
# @return [Array] the list of registered expressions, including the new one
def content_at(content_xpath)
  @content_xpaths_list.push(content_xpath)
end

#iconv(args) ⇒ Object



21
22
23
24
# File 'lib/newscrapi/mapping.rb', line 21

# Shorthand that declares the source and target encodings in one call.
#
# The `:from` value is forwarded to #suppose_encoding first, then the `:to`
# value is forwarded to `convert_to` (defined outside this excerpt).
# Note that #suppose_encoding treats nil as a read, so omitting `:from`
# leaves the supposed encoding untouched.
#
# @param args [Hash] must respond to `[]`; keys read are `:from` and `:to`
# @return [Object] whatever `convert_to` returns
def iconv(args)
  source_encoding = args[:from]
  suppose_encoding(source_encoding)

  target_encoding = args[:to]
  convert_to(target_encoding)
end

#matches_url?(url) ⇒ Boolean

# Combined reader/writer for the encoding the source page is assumed to use.
#
# Called without an argument (or with nil) it acts as a reader; called with
# any non-nil value it stores that value and returns it.
#
# @param encoding [Object, nil] the encoding to remember, or nil to read
# @return [Object, nil] the stored encoding
def suppose_encoding(encoding = nil)
  if encoding.nil?
    @supposed_encoding
  else
    @supposed_encoding = encoding
  end
end

Returns:

  • (Boolean)


33
34
35
# File 'lib/newscrapi/mapping.rb', line 33

# Tells whether the given URL is covered by this mapping's pattern.
#
# Returns a strict boolean, as a predicate method should, instead of the
# match offset (Integer) or nil that `=~` yields. Truthiness is unchanged,
# so existing `if mapping.matches_url?(url)` callers behave the same.
#
# @param url [String, nil] the URL to test
# @return [Boolean] true when the URL matches the stored pattern; false when
#   it does not, when url is nil, or when no pattern has been set
def matches_url?(url)
  return false if url.nil? || @url_pattern_regexp.nil?

  !(url =~ @url_pattern_regexp).nil?
end

#scrap_content(obj, content_scrapper = nil) ⇒ Object



37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/newscrapi/mapping.rb', line 37

def scrap_content(obj, content_scrapper = nil)
  doc = Newscrapi::Scrapper.parse_page(obj)
  @content_xpaths_list.each do |content_xpath|
    content_section = doc.xpath(content_xpath)
    if content_section.count > 0
      content = content_section.to_a.join("\n")
      content = content_scrapper.clean_content(content) unless content_scrapper.nil?
      return content
    end
  end
  nil
end

#url_pattern(pattern) ⇒ Object



12
13
14
15
# File 'lib/newscrapi/mapping.rb', line 12

# Sets the pattern that #matches_url? tests URLs against.
#
# A String is treated as a glob: every character is matched literally
# except `*`, which stands for "any run of characters". The resulting
# Regexp is anchored with `\A` / `\z` so it must cover the WHOLE string;
# the line anchors `^` / `$` would let a multi-line input match on just one
# of its lines. Any non-String argument (typically a Regexp) is stored as
# given.
#
# @param pattern [String, Regexp, Object] glob string or ready-made matcher
# @return [Regexp, Object] the value stored in @url_pattern_regexp
def url_pattern(pattern)
  @url_pattern_regexp =
    if pattern.is_a?(String) # is_a? also accepts String subclasses
      # Regexp.escape turns "*" into "\*"; swap that back into a wildcard.
      wildcard_source = Regexp.escape(pattern).gsub('\*', '.*')
      Regexp.new("\\A#{wildcard_source}\\z")
    else
      pattern
    end
end