Module: Textract

Defined in:
lib/textract.rb,
lib/textract/version.rb
more...

Defined Under Namespace

Classes: Client

Constant Summary collapse

TAG_WHITELIST =

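Tag names passed as the `tags:` option to Readability::Document.new in .smart_extract. (The text `attr_accessor :client` originally shown here appears to be a stray commented-out source line, not a description of this constant.)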

%w[
  div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong
  figure
]
VERSION =
"0.0.10"

Class Method Summary collapse

Class Method Details

.generate_hash(text) ⇒ Object

[View source]

69
70
71
# File 'lib/textract.rb', line 69

# Computes the MD5 digest of +text+.
#
# @param text [String] content to digest
# @return [String] the digest as a 32-character lowercase hexadecimal string
def self.generate_hash(text)
  Digest::MD5.hexdigest(text)
end

.get_author(html) ⇒ Object

[View source]

64
65
66
67
# File 'lib/textract.rb', line 64

# Reads the author name from the first <meta name="author"> tag in +html+.
#
# @param html [String] HTML source to parse
# @return [String, nil] the tag's +content+ attribute value, or nil when the
#   page has no such tag or the tag carries no +content+ attribute
def self.get_author(html)
  author_meta = Nokogiri::HTML(html).at('meta[name="author"]')
  # Node#[] yields nil for a missing attribute, so a <meta name="author">
  # without +content+ returns nil instead of raising NoMethodError.
  author_meta && author_meta['content']
end

.get_og_tags(html, url) ⇒ Object

[View source]

19
20
21
22
23
24
25
# File 'lib/textract.rb', line 19

# Builds an OpenGraph object from +html+, retrying with +url+ when
# construction from the HTML raises a StandardError.
#
# @param html [String] first argument tried with OpenGraph.new
# @param url [String] argument used for the retry
# @return [OpenGraph]
def self.get_og_tags(html, url)
  OpenGraph.new(html)
rescue StandardError
  OpenGraph.new(url)
end

.get_page_title(html) ⇒ Object

[View source]

60
61
62
# File 'lib/textract.rb', line 60

# Returns the text of the <title> element(s) found under <head> in +html+.
#
# @param html [String] HTML source to parse
# @return [String] the title text; empty when nothing matches
def self.get_page_title(html)
  document = Nokogiri::HTML(html)
  head_nodes = document.search('head')
  title_nodes = head_nodes.search('title')
  title_nodes.text
end

.get_text(url, selectors = nil, format = "markdown") ⇒ Object

[View source]

15
16
17
# File 'lib/textract.rb', line 15

# Creates a Client for +url+, remembers it in the @client instance variable
# of the receiver, and returns it.
#
# @param url [String] forwarded to Client.new
# @param selectors [String, nil] forwarded to Client.new
# @param format [String] forwarded to Client.new
# @return [Client] the newly created client
def self.get_text(url, selectors = nil, format = "markdown")
  @client = Client.new(url, selectors, format)
end

.smart_extract(html, description, selectors) ⇒ Object

[View source]

27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/textract.rb', line 27

# Picks the most likely article container in +html+ and wraps its markup in
# a Readability::Document.
#
# Selection order:
# 1. the single element matched by +selectors+ (or by 'article' when
#    +selectors+ is nil);
# 2. when nothing matched, the parent of the one element whose text contains
#    the leading words of +description+;
# 3. otherwise (several matches, no description, or no unique description
#    match) the whole parsed document.
#
# @param html [String] HTML source of the page
# @param description [String, nil] text whose leading words are searched for
#   in the page when the selector matches nothing
# @param selectors [String, nil] selector for the article container
# @return [Readability::Document]
def self.smart_extract(html, description, selectors)
  doc = Nokogiri::HTML(html)
  candidates = doc.search(selectors.nil? ? 'article' : selectors)

  article_el =
    if candidates.count == 1
      candidates[0]
    elsif candidates.count.zero?
      locate_by_description(doc, description)
    end

  # Without a unique container, hand the whole document to Readability
  # rather than an empty string (nil.to_s).
  article_el ||= doc

  Readability::Document.new(article_el.to_s,
                            tags: TAG_WHITELIST,
                            attributes: %w[src href],
                            remove_empty_nodes: false)
end

# Finds the parent of the single element whose text contains the leading
# words of +description+, or nil when no unique element is found.
#
# The search starts with the first two words and adds one word per attempt
# while more than one element matches. It stops once every word has been
# used, so it always terminates.
#
# @api private
# @param doc [Nokogiri::HTML::Document] parsed page
# @param description [String, nil] text to look for
# @return [Nokogiri::XML::Node, nil]
def self.locate_by_description(doc, description)
  words = description.to_s.split(" ")
  return nil if words.empty?

  # NOTE(review): the search text is interpolated unescaped; a description
  # containing a single quote presumably produces an invalid selector — confirm.
  (1..[words.length - 1, 1].max).each do |last|
    matches = doc.search("[text()*='#{words[0..last].join(' ')}']")
    next if matches.count > 1

    return matches.count == 1 ? matches[0].parent : nil
  end
  nil
end