Module: Textract

Defined in:
lib/textract.rb,
lib/textract/version.rb
more...

Defined Under Namespace

Classes: Client

Constant Summary collapse

TAG_WHITELIST =
%w[
  div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong
  figure
]
VERSION =
"0.0.14"

Class Method Summary collapse

Class Method Details

.build_author(article, html) ⇒ Object

[View source]

81
82
83
84
85
86
# File 'lib/textract.rb', line 81

# Assembles the author details for an article.
#
# The name is taken from +article.author+ when that is truthy; otherwise it
# is looked up in the page markup via +get_author+. The Twitter entry is
# whatever +get_twitter+ returns for the same markup.
#
# @param article [#author] object responding to +author+
# @param html [String] page markup passed to +get_author+ / +get_twitter+
# @return [Hash] a hash with +:name+ and +:twitter+ keys
def self.build_author(article, html)
  author_name = article.author || get_author(html)
  twitter_handle = get_twitter(html)

  { name: author_name, twitter: twitter_handle }
end

.generate_hash(text) ⇒ Object

[View source]

90
91
92
# File 'lib/textract.rb', line 90

# Computes the MD5 digest of +text+.
#
# @param text [String] the data to digest
# @return [String] the digest as a hexadecimal string
def self.generate_hash(text)
  Digest::MD5.hexdigest(text)
end

.get_author(html) ⇒ Object

[View source]

68
69
70
71
72
73
74
# File 'lib/textract.rb', line 68

# Reads the author name from the page's <meta> tags.
#
# Looks for +meta[name="author"]+ first and falls back to
# +meta[property="author"]+ when the first search finds nothing. The markup
# is parsed once and reused for both searches.
#
# @param html [String] page markup
# @return [String, nil] the value of the tag's +content+ attribute, or nil
#   when no matching tag exists or the matched tag has no +content+ attribute
def self.get_author(html)
  doc = Nokogiri::HTML(html)
  name_meta = doc.search('meta[name="author"]')
  name_meta = doc.search('meta[property="author"]') if name_meta.empty?
  return nil if name_meta.empty?

  # A matching tag without a content attribute yields nil here; calling
  # .value on it directly would raise NoMethodError.
  content = name_meta.attribute('content')
  content.value if content
end

.get_og_tags(html, url) ⇒ Object

[View source]

19
20
21
22
23
24
25
# File 'lib/textract.rb', line 19

# Builds an OpenGraph object for a page.
#
# The markup is tried first; if constructing from +html+ raises a
# StandardError, construction is retried with +url+ instead.
#
# @param html [String] page markup
# @param url [String] page address used as the fallback source
# @return [OpenGraph]
def self.get_og_tags(html, url)
  OpenGraph.new(html)
rescue
  OpenGraph.new(url)
end

.get_page_title(html) ⇒ Object

[View source]

64
65
66
# File 'lib/textract.rb', line 64

# Extracts the page title from the markup.
#
# Searches for +head+ elements, then for +title+ elements beneath them, and
# returns the text of the result.
#
# @param html [String] page markup
# @return [String] the text of the matched +title+ node set
def self.get_page_title(html)
  head = Nokogiri::HTML(html).search('head')
  head.search('title').text
end

.get_text(url, selectors = nil, format = "markdown") ⇒ Object

[View source]

15
16
17
# File 'lib/textract.rb', line 15

# Creates a Client for +url+ and stores it in the +@client+ instance variable.
#
# @param url [String] address handed to Client.new
# @param selectors [Object, nil] passed through to Client.new (defaults to nil)
# @param format [String] passed through to Client.new (defaults to "markdown")
# @return [Client] the newly created client
def self.get_text(url, selectors = nil, format = "markdown")
  @client = Client.new(url, selectors, format)
end

.get_twitter(html) ⇒ Object

[View source]

76
77
78
79
# File 'lib/textract.rb', line 76

# Reads the Twitter creator from the page's +meta[name="twitter:creator"]+ tag.
#
# @param html [String] page markup
# @return [String, nil] the value of the tag's +content+ attribute, or nil
#   when the tag is absent or has no +content+ attribute
def self.get_twitter(html)
  twitter_meta = Nokogiri::HTML(html).search('meta[name="twitter:creator"]')
  return nil if twitter_meta.empty?

  # A matching tag without a content attribute yields nil here; calling
  # .value on it directly would raise NoMethodError.
  content = twitter_meta.attribute('content')
  content.value if content
end

.smart_extract(html, description, selectors) ⇒ Object

[View source]

27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/textract.rb', line 27

# Picks the part of a page that holds the article and wraps it in a
# Readability::Document.
#
# Selection rules, in order:
# 1. Search for +selectors+ (or +article+ when +selectors+ is nil). If
#    exactly one node matches, use it.
# 2. If nothing matched and a +description+ is given, look for the single
#    node whose text contains the opening words of the description and use
#    that node's parent.
# 3. Otherwise (several matches, no description, or no unique description
#    match) use the whole document.
#
# @param html [String] page markup
# @param description [String, nil] text expected to appear in the article
# @param selectors [String, nil] selector for the article container
# @return [Readability::Document] built from the chosen node's +to_s+,
#   restricted to TAG_WHITELIST tags and the +src+ / +href+ attributes
def self.smart_extract(html, description, selectors)
  doc = Nokogiri::HTML(html)
  candidates = doc.search(selectors.nil? ? 'article' : selectors)

  article_el =
    if candidates.count == 1
      candidates[0]
    elsif !description.nil? && candidates.empty?
      match = find_description_match(doc, description)
      # Fall back to the whole document instead of passing nil (i.e. an
      # empty string) on to Readability when no unique node was found.
      match ? match.parent : doc
    else
      doc
    end

  Readability::Document.new(article_el.to_s,
                            tags: TAG_WHITELIST,
                            attributes: %w[src href],
                            remove_empty_nodes: false)
end

# Finds the single node whose text contains the start of +description+.
#
# Begins with the first two words and appends one word at a time while more
# than one node matches. The search stops as soon as the phrase covers the
# whole description, so it always terminates.
#
# @api private
# @param doc [#search] parsed document
# @param description [String] text expected to appear in the article
# @return [Object, nil] the uniquely matching node, or nil when nothing
#   matches, the match stays ambiguous, or the phrase cannot be quoted
def self.find_description_match(doc, description)
  words = description.split(" ")
  return nil if words.empty?

  last = 1
  loop do
    phrase = words[0..last].join(" ")
    has_single = phrase.include?("'")
    # A phrase containing both quote characters cannot be wrapped in either.
    return nil if has_single && phrase.include?('"')

    quote = has_single ? '"' : "'"
    matches = doc.search("[text()*=#{quote}#{phrase}#{quote}]")
    return matches[0] if matches.count == 1
    # Give up when nothing matches or there are no more words to add.
    return nil if matches.empty? || last >= words.length - 1

    last += 1
  end
end