Class: Webxtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/webxtractor.rb

Class Method Summary collapse

Class Method Details

.get(url = nil) ⇒ Object



6
7
8
9
10
# File 'lib/webxtractor.rb', line 6

# Downloads the document at +url+ and hands its body to +parse+.
#
# NOTE(review): relies on the parsed URI responding to +read+ — presumably
# supplied by open-uri; confirm the file requires it.
#
# @param url [String, nil] address of the page to fetch
# @return [Object, nil] whatever +parse+ returns, or nil when +url+ is
#   nil/false (nothing is fetched in that case)
def self.get(url=nil)
  parse(URI.parse(url).read) if url
end

.get_content(element, attribute) ⇒ Object



41
42
43
44
45
46
47
48
49
# File 'lib/webxtractor.rb', line 41

# Reads one piece of content from +element+ and normalizes it.
#
# The entry stored under +attribute+ in +element.attributes+ is used when it
# responds to +value+; in every other case the element's +text+ is used.
#
# @param element [#attributes, #text, nil] node to read from
# @param attribute [Object] key looked up in +element.attributes+
# @return [Object, nil] result of +normalize+, or nil when +element+ is nil
def self.get_content(element, attribute)
  return nil if element.nil?

  attr_node = element.attributes[attribute]
  raw = attr_node.respond_to?(:value) ? attr_node.value : element.text
  normalize(raw)
end

.get_tag(page, selector, attribute: nil) ⇒ Object



26
27
28
29
30
31
32
33
# File 'lib/webxtractor.rb', line 26

# Looks up +selector+ on +page+ and extracts content from the matches.
#
# The return shape depends on the number of matches: more than one match
# gives an Array with one entry per match, otherwise the single result for
# the first match is returned (+get_content+ receives nil when nothing
# matched).
#
# @param page [#css] document queried via +css+
# @param selector [String] selector passed to +page.css+
# @param attribute [Object, nil] forwarded to +get_content+
# @return [Array, Object, nil] see above
def self.get_tag(page, selector, attribute: nil)
  matches = page.css(selector)
  return get_content(matches.first, attribute) unless matches.size > 1

  matches.map { |node| get_content(node, attribute) }
end

.normalize(text = nil) ⇒ Object



35
36
37
38
39
# File 'lib/webxtractor.rb', line 35

# Collapses every run of whitespace in +text+ into a single space and trims
# the ends.
#
# Line breaks (\r\n, \n, \r) need no separate pass: they are matched by \s
# and therefore folded by the same substitution.
#
# @param text [String, nil] raw text
# @return [String, nil] a new cleaned-up string, or nil when +text+ is nil
def self.normalize(text = nil)
  return if text.nil?

  text.gsub(/\s+/, " ").strip
end

.parse(body) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
# File 'lib/webxtractor.rb', line 12

# Parses +body+ with Nokogiri::HTML and collects a few page fields.
#
# Each field is whatever +get_tag+ returns for its selector:
# * +title+            — 'title'
# * +meta_description+ — 'meta[name=description]', attribute "content"
# * +meta_keywords+    — 'meta[name=keywords]', attribute "content"
# * +h1+               — 'h1'
#
# @param body [Object] markup handed to Nokogiri::HTML
# @return [OpenStruct] struct exposing the four fields above
def self.parse(body)
  document = Nokogiri::HTML(body)

  # Hash values are evaluated top to bottom, matching the lookup order of
  # title, description, keywords, h1.
  OpenStruct.new(
    title: get_tag(document, 'title'),
    meta_description: get_tag(document, 'meta[name=description]', attribute: "content"),
    meta_keywords: get_tag(document, 'meta[name=keywords]', attribute: "content"),
    h1: get_tag(document, 'h1')
  )
end