Class: Extractor

Inherits:

Object

Object
Extractor

show all

Defined in: lib/feed_ninja/extractor.rb

Instance Attribute Summary collapse

#doc ⇒ Object

Returns the value of attribute doc.

Instance Method Summary collapse

Instance Attribute Details

#doc ⇒ `Object`

Returns the value of attribute doc.



5
6
7

# File 'lib/feed_ninja/extractor.rb', line 5

# Reader for the @doc instance variable, which #fetch assigns.
#
# @return [Object, nil] the current value of @doc; nil if nothing has been assigned yet
def doc
  @doc
end

Instance Method Details

#extract_image(base_url, xpath) ⇒ `Object`

# File 'lib/feed_ninja/extractor.rb', line 22

# Collects the image sources matched by +xpath+ in @doc and turns them into
# absolute URL strings.
#
# Sources that already start with http:// or https:// are returned untouched.
# Anything else (relative or root-relative paths) is resolved against
# +base_url+ with URI.join. The previous string concatenation produced a
# double slash after the host, glued the source directly onto the base
# document's file name, and dropped any non-default port.
#
# @param base_url [URI::Generic, String] URL of the page the sources came from
# @param xpath [String] XPath expression selecting the image sources
# @return [Array<String>] one string per match, in match order
def extract_image(base_url, xpath)
  base = base_url.to_s # loop-invariant; URI.join accepts the string form
  @doc.xpath(xpath).map do |picture_src|
    src = picture_src.to_s
    next src if src.start_with?('http://', 'https://')

    LOGGER.debug { "BASE URL IS #{base_url.class}" }
    begin
      URI.join(base, src).to_s
    rescue URI::Error
      # One malformed source must not abort the whole extraction;
      # hand it back unresolved instead of raising.
      src
    end
  end
end

#extract_images(base_url, xpaths) ⇒ `Object`

# File 'lib/feed_ninja/extractor.rb', line 14

# Gathers image URLs for one XPath expression or a list of them.
#
# @param base_url [String, URI::Generic] page URL; converted with Kernel#URI
#   before being handed to #extract_image
# @param xpaths [String, Array<String>, nil] a single expression or several
# @return [Array] the results of #extract_image for every expression,
#   concatenated into one flat list
def extract_images(base_url, xpaths)
  LOGGER.debug { "collecting images for #{xpaths}" }
  Array(xpaths).flat_map do |expression|
    LOGGER.debug { "collecting image:xpath #{expression}" }
    page_uri = URI(base_url)
    extract_image(page_uri, expression)
  end
end

#extract_xml(xpaths) ⇒ `Object`

# File 'lib/feed_ninja/extractor.rb', line 33

# Serialises every node of @doc matched by the given XPath expression(s).
#
# @param xpaths [String, Array<String>, nil] a single expression or several
# @return [Array<String>] +to_s+ of each match, grouped by expression in the
#   order the expressions were given
def extract_xml(xpaths)
  LOGGER.debug { "collecting text" }
  Array(xpaths).flat_map do |expression|
    LOGGER.debug { "collecting text:xpath #{expression}" }
    matches = @doc.xpath(expression)
    matches.map do |node|
      LOGGER.debug { "collecting text:result #{node}" }
      node.to_s
    end
  end
end

#fetch(uri) ⇒ `Object`

# File 'lib/feed_ninja/extractor.rb', line 7

# Opens +uri+ and parses what it yields into @doc.
#
# Uses URI.open (from open-uri) rather than a bare Kernel#open: since
# Ruby 3.0 open-uri no longer patches Kernel#open, so open(url) treats the
# URL as a local file name and fails.
#
# NOTE(review): relies on open-uri and nokogiri being required elsewhere in
# the file — confirm.
#
# @param uri [String, URI::Generic] location of the page to load
# @return [Object] whatever Nokogiri::HTML returns for the opened stream
#   (the same object is stored in @doc)
def fetch(uri)
  URI.open(uri) do |site|
    @doc = Nokogiri::HTML(site)
  end
end

Class: Extractor

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#doc ⇒ Object

Instance Method Details

#extract_image(base_url, xpath) ⇒ Object

#extract_images(base_url, xpaths) ⇒ Object

#extract_xml(xpaths) ⇒ Object

#fetch(uri) ⇒ Object

#doc ⇒ `Object`

#extract_image(base_url, xpath) ⇒ `Object`

#extract_images(base_url, xpaths) ⇒ `Object`

#extract_xml(xpaths) ⇒ `Object`

#fetch(uri) ⇒ `Object`