Class: Extractula::Extractor
- Inherits:
-
Object
- Object
- Extractula::Extractor
show all
- Defined in:
- lib/extractula/extractor.rb
Overview
Abstract (more or less) extractor class from which custom extractor classes should descend. Subclasses of Extractula::Extractor will be automatically added to the Extractula module.
Instance Attribute Summary collapse
Class Method Summary
collapse
Instance Method Summary
collapse
Constructor Details
#initialize(url, html) ⇒ Extractor
Returns a new instance of Extractor.
64
65
66
67
|
# File 'lib/extractula/extractor.rb', line 64
# Stores the page URL and markup on the instance, parsing either one when it
# is not already in parsed form.
#
# url  - a Domainatrix::Url, or a value handed to Domainatrix.parse.
# html - a Nokogiri::HTML::Document, or a value handed to Nokogiri::HTML.
def initialize(url, html)
  url = Domainatrix.parse(url) unless url.is_a?(Domainatrix::Url)
  html = Nokogiri::HTML(html) unless html.is_a?(Nokogiri::HTML::Document)

  @url = url
  @html = html
end
|
Instance Attribute Details
#html ⇒ Object
Returns the value of attribute html.
62
63
64
|
# File 'lib/extractula/extractor.rb', line 62
# Reader for the @html instance variable.
def html
  @html
end
|
#url ⇒ Object
Returns the value of attribute url.
62
63
64
|
# File 'lib/extractula/extractor.rb', line 62
# Reader for the @url instance variable.
def url
  @url
end
|
Class Method Details
14
15
16
17
18
19
20
|
# File 'lib/extractula/extractor.rb', line 14
# Tests the given url against the class-level @extractable_domain.
#
# * When @extractable_domain is a Regexp, it is matched (=~) against
#   url.host + url.path, so the result is the match index or nil.
# * Otherwise it is compared to url.domain with ==; when @extractable_domain
#   is nil/false the result is false.
#
# The html argument is not used in this body.
#
# NOTE(review): the method name after `self.` is missing from this listing
# (apparently lost in extraction), which leaves the line invalid Ruby.
# Restore it from lib/extractula/extractor.rb, line 14.
def self. url, html
if @extractable_domain.is_a? Regexp
url.host + url.path =~ @extractable_domain
else
@extractable_domain ? @extractable_domain == url.domain : false
end
end
|
.domain(domain) ⇒ Object
10
11
12
|
# File 'lib/extractula/extractor.rb', line 10
# Class-level setter: records +domain+ in @extractable_domain and returns it.
# The matching code in this class treats a Regexp differently from any other
# value, so either may be supplied.
def self.domain(domain)
  @extractable_domain = domain
end
|
.inherited(subclass) ⇒ Object
6
7
8
|
# File 'lib/extractula/extractor.rb', line 6
# Ruby's subclassing hook: invoked with the newly created subclass, which is
# then passed on to the Extractula module.
#
# NOTE(review): the method name after `Extractula.` is missing from this
# listing (apparently lost in extraction); as written the line would call
# Extractula.subclass. Restore the real call from
# lib/extractula/extractor.rb, line 7.
def self.inherited subclass
Extractula. subclass
end
|
.media_type(type = nil) ⇒ Object
22
23
24
25
|
# File 'lib/extractula/extractor.rb', line 22
# Combined class-level reader/writer for @media_type.
#
# Called with a truthy +type+, stores it and returns it. Called with no
# argument (or nil/false), leaves the stored value alone and returns it.
def self.media_type(type = nil)
  return @media_type unless type

  @media_type = type
end
|
Instance Method Details
#content ⇒ Object
89
90
91
|
# File 'lib/extractula/extractor.rb', line 89
# Returns the result of content_at called with content_path, content_attr and
# content_block.
#
# NOTE(review): the right-hand operand of the trailing `||` (the fallback used
# when content_at returns nil/false) is missing from this listing, apparently
# lost in extraction, which leaves the method invalid Ruby. Restore it from
# lib/extractula/extractor.rb, line 90.
def content
content_at(content_path, content_attr, content_block) ||
end
|
69
70
71
72
73
74
75
76
77
78
79
|
# File 'lib/extractula/extractor.rb', line 69
# Builds an Extractula::ExtractedContent from a hash of this extractor's
# values: url.url, media_type, title, content, summary, image_urls and
# video_embed.
#
# NOTE(review): the method name after `def` is missing from this listing
# (apparently lost in extraction), which leaves the definition invalid Ruby.
# Restore it from lib/extractula/extractor.rb, line 69.
def
Extractula::ExtractedContent.new({
:url => url.url,
:media_type => media_type,
:title => title,
:content => content,
:summary => summary,
:image_urls => image_urls,
:video_embed => video_embed
})
end
|
#image_urls ⇒ Object
97
98
99
100
101
|
# File 'lib/extractula/extractor.rb', line 97
# Searches the document with image_urls_path and passes the matches to
# image_srcs_from. Returns nil when image_urls_path is nil/false.
def image_urls
  return unless image_urls_path

  image_srcs_from(html.search(image_urls_path))
end
|
#media_type ⇒ Object
81
82
83
|
# File 'lib/extractula/extractor.rb', line 81
# Media type for this instance: whatever the class-level media_type returns,
# or 'text' when that is nil/false.
def media_type
  declared = self.class.media_type
  declared || 'text'
end
|
#summary ⇒ Object
93
94
95
|
# File 'lib/extractula/extractor.rb', line 93
# Returns the result of content_at called with summary_path, summary_attr and
# summary_block, in that order.
def summary
  path = summary_path
  attr = summary_attr
  block = summary_block

  content_at(path, attr, block)
end
|
#title ⇒ Object
85
86
87
|
# File 'lib/extractula/extractor.rb', line 85
# Title lookup: first content_at with title_path, title_attr and title_block;
# if that yields nil/false, falls back to content_at("//title").
def title
  configured = content_at(title_path, title_attr, title_block)
  return configured if configured

  content_at("//title")
end
|
#video_embed ⇒ Object
103
104
105
106
107
|
# File 'lib/extractula/extractor.rb', line 103
# Searches the document with video_embed_path and passes the matches to
# embed_code_from. Returns nil when video_embed_path is nil/false.
def video_embed
  return unless video_embed_path

  embed_code_from(html.search(video_embed_path))
end
|