Module: Textract
- Defined in:
- lib/textract.rb,
lib/textract/version.rb more...
Defined Under Namespace
Classes: Client
Constant Summary collapse
- TAG_WHITELIST =
attr_accessor :client
%w[ div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong figure ]
- VERSION =
"0.0.14"
Class Method Summary collapse
- .build_author(article, html) ⇒ Object
-
.generate_hash(text) ⇒ Object
def build_site.
- .get_author(html) ⇒ Object
- .get_og_tags(html, url) ⇒ Object
- .get_page_title(html) ⇒ Object
- .get_text(url, selectors = nil, format = "markdown") ⇒ Object
- .get_twitter(html) ⇒ Object
- .smart_extract(html, description, selectors) ⇒ Object
Class Method Details
permalink .build_author(article, html) ⇒ Object
[View source]
81 82 83 84 85 86 |
# File 'lib/textract.rb', line 81 def self.build_author(article, html) { name: article.author || get_author(html), twitter: get_twitter(html), } end |
permalink .generate_hash(text) ⇒ Object
def build_site
90 91 92 |
# File 'lib/textract.rb', line 90 def self.generate_hash(text) Digest::MD5.hexdigest text end |
permalink .get_author(html) ⇒ Object
[View source]
68 69 70 71 72 73 74 |
# File 'lib/textract.rb', line 68 def self.get_author(html) author = Nokogiri::HTML(html).search('meta[name="author"]') if author.empty? author = Nokogiri::HTML(html).search('meta[property="author"]') end author.attribute('content').value unless author.empty? end |
permalink .get_og_tags(html, url) ⇒ Object
[View source]
19 20 21 22 23 24 25 |
# File 'lib/textract.rb', line 19 def self.get_og_tags(html, url) begin OpenGraph.new(html) rescue OpenGraph.new(url) end end |
permalink .get_page_title(html) ⇒ Object
[View source]
64 65 66 |
# File 'lib/textract.rb', line 64 def self.get_page_title(html) Nokogiri::HTML(html).search('head').search('title').text end |
permalink .get_text(url, selectors = nil, format = "markdown") ⇒ Object
[View source]
15 16 17 |
# File 'lib/textract.rb', line 15 def self.get_text(url, selectors=nil, format="markdown") @client = Client.new(url, selectors, format) end |
permalink .get_twitter(html) ⇒ Object
[View source]
76 77 78 79 |
# File 'lib/textract.rb', line 76 def self.get_twitter(html) twitter = Nokogiri::HTML(html).search('meta[name="twitter:creator"]') twitter.attribute('content').value unless twitter.empty? end |
permalink .smart_extract(html, description, selectors) ⇒ Object
[View source]
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/textract.rb', line 27 def self.smart_extract(html, description, selectors) doc = Nokogiri::HTML html if selectors.nil? article = doc.search('article') else article = doc.search(selectors) end if article.count == 1 article_el = article[0] elsif !description.nil? and article.count == 0 els = [1,2,3] i = 1 until els.count < 2 search_text = description.split(" ")[0..i].join(" ") if search_text.index "'" els = doc.search "[text()*=\"#{search_text}\"]" else els = doc.search "[text()*='#{search_text}']" end i += 1 end if els.count == 1 el = els[0] article_el = el.parent else # do something else if multiple or no matches end else article_el = doc end Readability::Document.new(article_el.to_s, tags: TAG_WHITELIST, attributes: %w[src href], remove_empty_nodes: false, ) end |