Class: Textract::Client

Inherits:
Object
  • Object
show all
Defined in:
lib/textract.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, selectors, format) ⇒ Client

Returns a new instance of Client.


82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/textract.rb', line 82

# Downloads the page at +url+, runs tag and article extraction on it and
# stores the results in the instance variables exposed by the readers.
#
# @param url [Object] address of the page to fetch (handed to Mechanize#get)
# @param selectors [Object] forwarded unchanged to Textract.smart_extract
# @param format [String] 'markdown' converts the extracted content with
#   ReverseMarkdown; any other value keeps the content as extracted
def initialize(url, selectors, format)
  @url = url

  browser = Mechanize.new
  browser.user_agent_alias = 'Mac Safari'
  @html = browser.get(url).content

  @tags = Textract.get_og_tags(@html, url)
  # Prefer the URL reported by the page's tags; fall back to the requested one.
  @url = @tags.url || url

  @article = Textract.smart_extract(@html, @tags.description, selectors)
  @text =
    if @article.content.nil?
      # No content extracted: expose an empty string rather than nil.
      ""
    elsif format == 'markdown'
      ReverseMarkdown.convert(@article.content, unknown_tags: :bypass)
    else
      @article.content
    end

  @md5 = Textract.generate_hash(@text)
  @author = @article.author || Textract.get_author(@html)
  @title = @tags.title || Textract.get_page_title(@html)
end

Instance Attribute Details

#author ⇒ Object (readonly)

Returns the value of attribute author.


80
81
82
# File 'lib/textract.rb', line 80

# Reader for @author. The constructor sets it to the extracted article's
# author, falling back to Textract.get_author on the fetched HTML.
#
# @return [Object] the current value of @author
def author
  @author
end

#html ⇒ Object (readonly)

Returns the value of attribute html.


74
75
76
# File 'lib/textract.rb', line 74

# Reader for @html. The constructor sets it to the content of the page
# fetched with Mechanize.
#
# @return [Object] the current value of @html
def html
  @html
end

#md5 ⇒ Object (readonly)

Returns the value of attribute md5.


79
80
81
# File 'lib/textract.rb', line 79

# Reader for @md5. The constructor sets it to the result of
# Textract.generate_hash applied to the extracted text.
#
# @return [Object] the current value of @md5
def md5
  @md5
end

#tags ⇒ Object (readonly)

Returns the value of attribute tags.


76
77
78
# File 'lib/textract.rb', line 76

# Reader for @tags. The constructor sets it to the result of
# Textract.get_og_tags; that object is also queried there for url,
# description and title.
#
# @return [Object] the current value of @tags
def tags
  @tags
end

#text ⇒ Object (readonly)

Returns the value of attribute text.


78
79
80
# File 'lib/textract.rb', line 78

# Reader for @text. The constructor sets it to an empty string when the
# article has no content, to the ReverseMarkdown conversion of the content
# when the requested format is 'markdown', and to the raw content otherwise.
#
# @return [Object] the current value of @text
def text
  @text
end

#title ⇒ Object (readonly)

Returns the value of attribute title.


77
78
79
# File 'lib/textract.rb', line 77

# Reader for @title. The constructor sets it to the title from the page's
# tags, falling back to Textract.get_page_title on the fetched HTML.
#
# @return [Object] the current value of @title
def title
  @title
end

#url ⇒ Object (readonly)

Returns the value of attribute url.


75
76
77
# File 'lib/textract.rb', line 75

# Reader for @url. The constructor sets it to the URL reported by the
# page's tags when present, otherwise to the URL it was given.
#
# @return [Object] the current value of @url
def url
  @url
end

Instance Method Details

#as_json ⇒ Object


105
106
107
# File 'lib/textract.rb', line 105

# Serializes the hash built by #to_h by calling +to_json+ on it.
#
# Note that the result is whatever +to_json+ produces for that hash
# (a JSON document), not a Hash.
#
# @return [Object] the +to_json+ result for #to_h
def as_json
  attributes = to_h
  attributes.to_json
end

#to_h ⇒ Object


109
110
111
112
113
114
115
116
117
# File 'lib/textract.rb', line 109

# Builds a hash snapshot of the url, text, md5, author and title
# instance variables, keyed by symbol in that order. The html and tags
# values are not included.
#
# @return [Hash{Symbol => Object}] the five values keyed by name
def to_h
  %i[url text md5 author title].each_with_object({}) do |name, snapshot|
    snapshot[name] = instance_variable_get("@#{name}")
  end
end