Module: Textract
- Defined in:
- lib/textract.rb,
lib/textract/version.rb
Defined Under Namespace
Classes: Client
Constant Summary collapse
- VERSION =
"0.0.6.1"
Class Method Summary collapse
- .generate_hash(text) ⇒ Object
- .get_author(html) ⇒ Object
- .get_og_tags(html) ⇒ Object
- .get_page_title(html) ⇒ Object
- .get_text(url, selectors = nil, format = "markdown") ⇒ Object
attr_accessor :client.
- .smart_extract(html, description, selectors) ⇒ Object
Class Method Details
.generate_hash(text) ⇒ Object
61 62 63 64 |
# File 'lib/textract.rb', line 61 def self.generate_hash(text) # require 'pry'; binding.pry Digest::MD5.hexdigest text end |
.get_author(html) ⇒ Object
56 57 58 59 |
# File 'lib/textract.rb', line 56 def self.get_author(html) author = Nokogiri::HTML(html).search('meta[name="author"]') author.attribute('content').value unless author.empty? end |
.get_og_tags(html) ⇒ Object
15 16 17 |
# File 'lib/textract.rb', line 15 def self.get_og_tags(html) OpenGraph.new(html) end |
.get_page_title(html) ⇒ Object
52 53 54 |
# File 'lib/textract.rb', line 52 def self.get_page_title(html) Nokogiri::HTML(html).search('head').search('title').text end |
.get_text(url, selectors = nil, format = "markdown") ⇒ Object
attr_accessor :client
11 12 13 |
# File 'lib/textract.rb', line 11 def self.get_text(url, selectors=nil, format="markdown") @client = Client.new(url, selectors, format) end |
.smart_extract(html, description, selectors) ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
# File 'lib/textract.rb', line 19 def self.smart_extract(html, description, selectors) doc = Nokogiri::HTML html if selectors.nil? article = doc.search('article') else article = doc.search(selectors) end if article.count == 1 article_el = article[0] elsif !description.nil? and article.count == 0 els = [1,2,3] i = 1 until els.count < 2 search_text = description.split(" ")[0..i].join(" ") els = doc.search "[text()*='#{search_text}']" i += 1 end if els.count == 1 el = els[0] article_el = el.parent else # do something else if multiple or no matches end else article_el = doc end article = Readability::Document.new(article_el.to_s, tags: %w[div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong], attributes: %w[src href], remove_empty_nodes: true, ) end |