Class: Webxtractor
- Inherits: Object
- Inheritance chain:
  - Object
  - Webxtractor
- Defined in:
- lib/webxtractor.rb
Class Method Summary collapse
- .get(url = nil) ⇒ Object
- .get_content(element, attribute) ⇒ Object
- .get_tag(page, selector, attribute: nil) ⇒ Object
- .normalize(text = nil) ⇒ Object
- .parse(body) ⇒ Object
Class Method Details
.get(url = nil) ⇒ Object
6 7 8 9 10 |
# File 'lib/webxtractor.rb', line 6
#
# Fetches the document at +url+ and hands its body to +parse+.
#
# @param url [String, nil] address of the page to fetch
# @return [Object, nil] whatever +parse+ returns, or nil when +url+ is
#   nil, empty or whitespace-only
def self.get(url = nil)
  # A blank string parses to a URI::Generic, which has no #read; treat it
  # like a missing URL instead of raising NoMethodError.
  return if url.to_s.strip.empty?

  # NOTE(review): URI#read is provided by open-uri — presumably required at
  # the top of the file; confirm.
  parse(URI.parse(url).read)
end
.get_content(element, attribute) ⇒ Object
41 42 43 44 45 46 47 48 49 |
# File 'lib/webxtractor.rb', line 41
#
# Extracts normalized text from a single element: the value of the named
# attribute when the element has one, otherwise the element's own text.
#
# @param element [#attributes, #text, nil] node to read from
# @param attribute [Object] key looked up in +element.attributes+
# @return [String, nil] result of +normalize+, or nil when +element+ is nil
def self.get_content(element, attribute)
  return if element.nil?

  attr_node = element.attributes[attribute]
  # Fall back to the element text when the lookup yields nothing usable.
  raw = attr_node.respond_to?(:value) ? attr_node.value : element.text
  normalize(raw)
end
.get_tag(page, selector, attribute: nil) ⇒ Object
26 27 28 29 30 31 32 33 |
# File 'lib/webxtractor.rb', line 26
#
# Looks up +selector+ in +page+ and extracts content from the matches.
#
# @param page [#css] document to query
# @param selector [String] selector passed to +page.css+
# @param attribute [Object, nil] forwarded to +get_content+
# @return [Array, Object, nil] an array of +get_content+ results when more
#   than one element matches; otherwise the result for the first match
#   (nil when nothing matches, since +get_content+ returns nil for nil)
def self.get_tag(page, selector, attribute: nil)
  matches = page.css(selector)
  return get_content(matches.first, attribute) unless matches.size > 1

  matches.map { |match| get_content(match, attribute) }
end
.normalize(text = nil) ⇒ Object
35 36 37 38 39 |
# File 'lib/webxtractor.rb', line 35
#
# Collapses every run of whitespace (including line breaks) into a single
# space and trims both ends. The input string is not modified.
#
# @param text [String, nil] text to clean up
# @return [String, nil] the cleaned copy, or nil when +text+ is nil
def self.normalize(text = nil)
  return if text.nil?

  # \s already matches \r and \n, so no separate newline pass is needed
  # (the former one discarded its result anyway).
  text.gsub(/\s+/, " ").strip
end
.parse(body) ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 23 24 |
# File 'lib/webxtractor.rb', line 12
#
# Parses an HTML body and gathers a few page-level fields into an
# OpenStruct.
#
# @param body [String] HTML passed to +Nokogiri::HTML+
# @return [OpenStruct] struct with +title+, +description+, +keywords+ and
#   +h1+, each holding whatever +get_tag+ returns for that selector
def self.parse(body)
  page = Nokogiri::HTML(body)
  result = OpenStruct.new
  result.title = get_tag(page, 'title')
  # NOTE(review): the two attribute names below were missing from the
  # listing (`result. = ...`); `description` and `keywords` are
  # reconstructed from the meta selectors — confirm against the real file.
  result.description = get_tag(page, 'meta[name=description]', attribute: "content")
  result.keywords = get_tag(page, 'meta[name=keywords]', attribute: "content")
  result.h1 = get_tag(page, 'h1')
  result
end