Class: Textract::Client

Inherits:
Object
  • Object
show all
Defined in:
lib/textract.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, selectors, format) ⇒ Client

Returns a new instance of Client.


75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/textract.rb', line 75

# Fetches a page, extracts its article, and stores the results on the
# instance for the readers and #to_h.
#
# @param url the address handed to Mechanize#get; stored unchanged in @url
# @param selectors forwarded as-is to Textract.smart_extract
# @param format when equal to the string 'markdown' the article content is
#   run through ReverseMarkdown; any other value keeps the content untouched
# @return [Client]
def initialize(url, selectors, format)
  @url = url

  # Fetch the page while identifying as the 'Mac Safari' user-agent alias.
  browser = Mechanize.new
  browser.user_agent_alias = 'Mac Safari'
  @html = browser.get(url).content

  @tags = Textract.get_og_tags(@html)
  @article = Textract.smart_extract(@html, @tags.description, selectors)

  # Missing content becomes an empty string so that @text is never nil.
  @text =
    if @article.content.nil?
      ""
    elsif format == 'markdown'
      ReverseMarkdown.convert @article.content, unknown_tags: :bypass
    else
      @article.content
    end

  @md5 = Textract.generate_hash @text
  # Values from the extracted article / tags win; otherwise fall back to
  # the helpers that read the raw HTML.
  @author = @article.author || Textract.get_author(@html)
  @title = @tags.title || Textract.get_page_title(@html)
end

Instance Attribute Details

#author ⇒ Object (readonly)

Returns the value of attribute author.


73
74
75
# File 'lib/textract.rb', line 73

# Reader for @author. The constructor sets it to the extracted article's
# author, falling back to Textract.get_author(@html) when that value is
# nil or false.
def author
  @author
end

#html ⇒ Object (readonly)

Returns the value of attribute html.


67
68
69
# File 'lib/textract.rb', line 67

# Reader for @html: the content of the response the constructor obtained
# by fetching the url through Mechanize.
def html
  @html
end

#md5 ⇒ Object (readonly)

Returns the value of attribute md5.


72
73
74
# File 'lib/textract.rb', line 72

# Reader for @md5, which the constructor computes by passing @text to
# Textract.generate_hash.
# NOTE(review): presumably an MD5 digest of the text -- confirm against
# Textract.generate_hash.
def md5
  @md5
end

#tags ⇒ Object (readonly)

Returns the value of attribute tags.


69
70
71
# File 'lib/textract.rb', line 69

# Reader for @tags: the object returned by Textract.get_og_tags(@html).
# The constructor reads its #description and #title.
def tags
  @tags
end

#text ⇒ Object (readonly)

Returns the value of attribute text.


71
72
73
# File 'lib/textract.rb', line 71

# Reader for @text. The constructor sets it to "" when the extracted
# article has no content, to the ReverseMarkdown conversion of the content
# when the requested format was 'markdown', and to the content itself
# otherwise.
def text
  @text
end

#title ⇒ Object (readonly)

Returns the value of attribute title.


70
71
72
# File 'lib/textract.rb', line 70

# Reader for @title. The constructor sets it to @tags.title, falling back
# to Textract.get_page_title(@html) when that value is nil or false.
def title
  @title
end

#url ⇒ Object (readonly)

Returns the value of attribute url.


68
69
70
# File 'lib/textract.rb', line 68

# Reader for @url: the url argument given to the constructor, stored
# unchanged.
def url
  @url
end

Instance Method Details

#to_h ⇒ Object


97
98
99
100
101
102
103
104
105
# File 'lib/textract.rb', line 97

# Builds a Hash snapshot of the client's fields.
#
# The keys are the symbols :url, :text, :md5, :author and :title, in that
# order, each mapped to the instance variable of the same name. @html and
# @tags are not included.
#
# @return [Hash]
def to_h
  %i[url text md5 author title].each_with_object({}) do |field, summary|
    summary[field] = instance_variable_get("@#{field}")
  end
end