Class: Textract::Client

Inherits:
Object
  • Object
show all
Defined in:
lib/textract.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, selectors, format) ⇒ Client

Returns a new instance of Client.


103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/textract.rb', line 103

# Fetches +url+, extracts the article from the returned HTML, and populates
# the url, html, tags, text, md5, author and title attributes.
#
# @param url [String] address of the page to fetch
# @param selectors [Object] passed through unchanged to Textract.smart_extract
# @param format [String] 'markdown' converts the extracted content with
#   ReverseMarkdown; any other value keeps the content as extracted
# @return [Client] a new instance of Client
def initialize(url, selectors, format)
  @url = url
  agent = Mechanize.new
  agent.user_agent_alias = 'Mac Safari'
  @html = agent.get(url).content
  @tags = Textract.get_og_tags(@html, url)
  # Prefer the URL reported by the page's tags, but only when it is absolute.
  @url = @tags.url if @tags.url.match(/^(http|ftp)s?:\/\//)

  @article = Textract.smart_extract(@html, @tags.description, selectors)
  content = @article.content
  @text =
    if content.nil?
      ""
    elsif format == 'markdown'
      ReverseMarkdown.convert content, unknown_tags: :bypass
    else
      content
    end
  @md5 = Textract.generate_hash @text
  @author = Textract.build_author @article, @html
  @title = @tags.title || Textract.get_page_title(@html)
  # For a robots.txt URL whose title came out equal to the body text, use the
  # URL as the title instead. This must be a comparison (`==`) joined with
  # `&&`: the previous `and @title = @text` assigned the text to the title and
  # then replaced the title for every robots.txt URL.
  @title = @url if @url.match(/\/robots\.txt$/) && @title == @text
end

Instance Attribute Details

#author ⇒ Object (readonly)

Returns the value of attribute author.


101
102
103
# File 'lib/textract.rb', line 101

# Reader for the author attribute.
#
# @return [Object] the value the constructor stored in @author (the result of
#   Textract.build_author)
def author
  @author
end

#html ⇒ Object (readonly)

Returns the value of attribute html.


95
96
97
# File 'lib/textract.rb', line 95

# Reader for the html attribute.
#
# @return [Object] the value the constructor stored in @html (the content of
#   the response fetched for the URL)
def html
  @html
end

#md5 ⇒ Object (readonly)

Returns the value of attribute md5.


100
101
102
# File 'lib/textract.rb', line 100

# Reader for the md5 attribute.
#
# @return [Object] the value the constructor stored in @md5 (the result of
#   Textract.generate_hash applied to the extracted text)
def md5
  @md5
end

#tags ⇒ Object (readonly)

Returns the value of attribute tags.


97
98
99
# File 'lib/textract.rb', line 97

# Reader for the tags attribute.
#
# @return [Object] the value the constructor stored in @tags (the result of
#   Textract.get_og_tags for the fetched HTML)
def tags
  @tags
end

#text ⇒ Object (readonly)

Returns the value of attribute text.


99
100
101
# File 'lib/textract.rb', line 99

# Reader for the text attribute.
#
# @return [Object] the value the constructor stored in @text: an empty string
#   when the extracted article has no content, the ReverseMarkdown conversion
#   of the content when the format is 'markdown', otherwise the content itself
def text
  @text
end

#title ⇒ Object (readonly)

Returns the value of attribute title.


98
99
100
# File 'lib/textract.rb', line 98

# Reader for the title attribute.
#
# @return [Object] the value the constructor stored in @title: the tags'
#   title, falling back to Textract.get_page_title; for robots.txt URLs the
#   constructor may substitute the URL
def title
  @title
end

#url ⇒ Object (readonly)

Returns the value of attribute url.


96
97
98
# File 'lib/textract.rb', line 96

# Reader for the url attribute.
#
# @return [Object] the value the constructor stored in @url: the URL given to
#   the constructor, or the URL reported by the page's tags when that one
#   starts with an http(s)/ftp(s) scheme
def url
  @url
end

Instance Method Details

#as_json ⇒ Object


131
132
133
# File 'lib/textract.rb', line 131

# Serializes the hash produced by #to_h by calling +to_json+ on it.
#
# @return [Object] whatever +to_json+ returns for that hash
def as_json
  payload = to_h
  payload.to_json
end

#to_h ⇒ Object


135
136
137
138
139
140
141
142
143
# File 'lib/textract.rb', line 135

# Builds a hash of the instance's url, text, md5, author and title, keyed by
# symbol and in that order.
#
# @return [Hash] the five attribute values read from the instance variables
def to_h
  %i[url text md5 author title].each_with_object({}) do |field, result|
    result[field] = instance_variable_get("@#{field}")
  end
end