Class: Textract::Client

Inherits:
Object
  • Object
show all
Defined in:
lib/textract.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, selectors, format) ⇒ Client

Returns a new instance of Client.



138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# File 'lib/textract.rb', line 138

# Fetches +url+, extracts the article from the response and derives the
# metadata exposed by this client's readers.
#
# @param url [String] address of the page to fetch
# @param selectors [Object] handed unchanged to Textract.smart_extract
# @param format [String] 'markdown' converts the extracted content with
#   ReverseMarkdown; any other value keeps the content as extracted
def initialize(url, selectors, format)
  @url = url
  agent = Mechanize.new
  agent.user_agent_alias = 'Mac Safari'
  @html = agent.get(url).content
  @tags = Textract.get_og_tags(@html, url)
  # Adopt the URL reported by the tags only when it is an absolute
  # http(s)/ftp(s) URL. \A anchors at the start of the string; ^ would also
  # match at the start of any later line.
  @url = @tags.url if @tags.url.match(%r{\A(http|ftp)s?://})

  @article = Textract.smart_extract(@html, @tags.description, selectors)
  content = @article.content
  @text = if content.nil?
            ""
          elsif format == 'markdown'
            ReverseMarkdown.convert(content, unknown_tags: :bypass)
          else
            content
          end
  @md5 = Textract.generate_hash @text
  @author = Textract.build_author @article, @html
  @site = Textract.build_site @url, @html
  @title = @tags.title || Textract.get_page_title(@html)
  # When a robots.txt page's title is just its text, use the URL as the title.
  # This must be a comparison: the previous `and @title = @text` assigned the
  # text to the title and then always replaced it with the URL.
  @title = @url if @url.end_with?('/robots.txt') && @title == @text
end

Instance Attribute Details

#author ⇒ Object (readonly)

Returns the value of attribute author.



135
136
137
# File 'lib/textract.rb', line 135

# Author value built in the constructor by Textract.build_author from the
# extracted article and the page HTML.
def author
  @author
end

#html ⇒ Object (readonly)

Returns the value of attribute html.



129
130
131
# File 'lib/textract.rb', line 129

# Content of the response the constructor fetched for the requested URL.
def html
  @html
end

#md5 ⇒ Object (readonly)

Returns the value of attribute md5.



134
135
136
# File 'lib/textract.rb', line 134

# Hash of the extracted text, as produced by Textract.generate_hash in the
# constructor.
def md5
  @md5
end

#site ⇒ Object (readonly)

Returns the value of attribute site.



136
137
138
# File 'lib/textract.rb', line 136

# Site value built in the constructor by Textract.build_site from the
# resolved URL and the page HTML.
def site
  @site
end

#tags ⇒ Object (readonly)

Returns the value of attribute tags.



131
132
133
# File 'lib/textract.rb', line 131

# Result of Textract.get_og_tags for the fetched page; the constructor reads
# its url, description and title.
def tags
  @tags
end

#text ⇒ Object (readonly)

Returns the value of attribute text.



133
134
135
# File 'lib/textract.rb', line 133

# Extracted article content: converted to Markdown when the constructor was
# given format 'markdown', and "" when the article has no content.
def text
  @text
end

#title ⇒ Object (readonly)

Returns the value of attribute title.



132
133
134
# File 'lib/textract.rb', line 132

# Page title: the tags' title when present, otherwise the result of
# Textract.get_page_title. The constructor may replace it with the URL for
# robots.txt pages.
def title
  @title
end

#url ⇒ Object (readonly)

Returns the value of attribute url.



130
131
132
# File 'lib/textract.rb', line 130

# URL for the page: the one passed to the constructor, or the URL reported by
# the page's tags when that is an absolute http(s)/ftp(s) URL.
def url
  @url
end

Instance Method Details

#as_json ⇒ Object



167
168
169
# File 'lib/textract.rb', line 167

# Serializes the hash returned by #to_h with #to_json and returns the result.
def as_json
  attributes = to_h
  attributes.to_json
end

#to_h ⇒ Object



171
172
173
174
175
176
177
178
179
180
# File 'lib/textract.rb', line 171

# Builds a Hash of the url, text, md5, author, title and site values, keyed
# by symbol in that order. The html and tags values are not included.
def to_h
  fields = %i[url text md5 author title site]
  fields.each_with_object({}) do |field, result|
    result[field] = instance_variable_get("@#{field}")
  end
end