Class: Yomu
- Inherits:
-
Object
- Object
- Yomu
- Defined in:
- lib/yomu.rb,
lib/yomu/version.rb
Constant Summary collapse
- GEMPATH =
File.dirname(File.dirname(__FILE__))
- JARPATH =
File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.11.jar')
- DEFAULT_SERVER_PORT =
# an arbitrary, but perfectly cromulent, port
9293
- VERSION =
'0.3.2'
- @@server_port =
nil
- @@server_pid =
nil
Class Method Summary collapse
- ._client_read(type, data) ⇒ Object
- ._server_read(_, data) ⇒ Object
-
.kill_server! ⇒ Object
Kills server started by Yomu.server.
-
.read(type, data) ⇒ Object
Read text or metadata from a data buffer.
-
.server(type, custom_port = nil) ⇒ Object
Returns pid of Tika server, started as a new spawned process.
Instance Method Summary collapse
- #creation_date ⇒ Object
-
#data ⇒ Object
Returns the raw/unparsed content of the Yomu document.
-
#html ⇒ Object
Returns the text content of the Yomu document in HTML.
-
#initialize(input) ⇒ Yomu
constructor
Create a new instance of Yomu with a given document.
-
#metadata ⇒ Object
Returns the metadata hash of the Yomu document.
-
#mimetype ⇒ Object
Returns the mimetype object of the Yomu document.
- #path? ⇒ Boolean
-
#stream? ⇒ Boolean
Returns true if the Yomu document was specified from a stream or an object which responds to read.
-
#text ⇒ Object
Returns the text content of the Yomu document.
-
#uri? ⇒ Boolean
Returns true if the Yomu document was specified using a URI.
Constructor Details
#initialize(input) ⇒ Yomu
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
# File 'lib/yomu.rb', line 97 def initialize(input) if input.is_a? String if File.exists? input @path = input elsif input =~ URI::regexp @uri = URI.parse input else raise Errno::ENOENT.new "missing file or invalid URI - #{input}" end elsif input.respond_to? :read @stream = input else raise TypeError.new "can't read from #{input.class.name}" end end |
Class Method Details
._client_read(type, data) ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
# File 'lib/yomu.rb', line 39

# Runs the Tika jar at Yomu::JARPATH as a child process, writes +data+ to it
# and returns everything the process writes back.
#
# @param type [Symbol] :text, :html, :metadata or :mimetype; any other value
#   results in no output switch being passed on the command line.
# @param data [String] the raw document content to hand to Tika.
# @return [String] the output read from the child process.
def self._client_read(type, data)
  switches = { text: '-t', html: '-h', metadata: '-m -j', mimetype: '-m -j' }
  command = "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switches[type]}"

  IO.popen(command, 'r+') do |tika|
    tika.write(data)
    tika.close_write # close our write end so the child sees end of input
    tika.read
  end
end
._server_read(_, data) ⇒ Object
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/yomu.rb', line 59

# Sends +data+ to the server listening on localhost at @@server_port and
# returns the complete response.
#
# @param _ [Object] ignored; the output type is not used by this method.
# @param data [String] the raw document content to send.
# @return [String] everything received from the server before it closed the
#   connection.
def self._server_read(_, data)
  s = TCPSocket.new('localhost', @@server_port)
  file = StringIO.new(data, 'r')

  # Stream the payload in 64 KiB pieces; read(n) returns nil at end of data.
  while (chunk = file.read(65536))
    s.write(chunk)
  end

  # tell Tika that we're done sending data
  s.shutdown(Socket::SHUT_WR)

  resp = ''
  loop do
    chunk = s.recv(65536)
    # Test for nil before empty?: on Ruby 3.3+ recv returns nil (not "") once
    # the peer has closed, and nil.empty? would raise NoMethodError.
    break if chunk.nil? || chunk.empty?
    resp << chunk
  end
  resp
ensure
  # s is nil when TCPSocket.new itself raised.
  s.close if s
end
.kill_server! ⇒ Object
Kills the server started by Yomu.server.
Always run this when you're done, or else Tika might keep running until you kill it manually.
You might try putting your extraction in a begin...rescue...ensure...end block and
putting this method in the ensure block:
Yomu.server(:text)
reports = ["report1.docx", "report2.doc", "report3.pdf"]
begin
my_texts = reports.map{|report_path| Yomu.new(report_path).text }
rescue
ensure
Yomu.kill_server!
end
262 263 264 265 266 267 268 |
# File 'lib/yomu.rb', line 262

# Sends INT to the process whose pid is stored in @@server_pid, then clears
# the stored pid and port. Does nothing when no pid is stored.
#
# @return [nil]
def self.kill_server!
  return unless @@server_pid

  Process.kill('INT', @@server_pid)
  @@server_pid = nil
  @@server_port = nil
end
.read(type, data) ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/yomu.rb', line 24

# Read text or metadata from a data buffer.
#
# Uses the server path when @@server_port is set, otherwise the one-shot
# client path.
#
# @param type [Symbol] :text, :html, :metadata or :mimetype.
# @param data [String] the raw document content.
# @return [String] the raw result for :text and :html.
# @return [Object] the parsed JSON result for :metadata.
# @return [Object, nil] the first MIME::Types match for :mimetype.
# @return [nil] for any other +type+.
def self.read(type, data)
  result = @@server_port ? _server_read(type, data) : _client_read(type, data)

  case type
  when :text, :html
    result
  when :metadata
    JSON.parse(result)
  when :mimetype
    content_type = JSON.parse(result)['Content-Type']
    # Content-Type may be a list; use its first entry, as #mimetype does.
    content_type = content_type.first if content_type.is_a?(Array)
    MIME::Types[content_type].first
  end
end
.server(type, custom_port = nil) ⇒ Object
Returns pid of Tika server, started as a new spawned process.
type: :text, :html, :metadata or :mimetype
custom_port: the port to use, e.g. 9293 (defaults to DEFAULT_SERVER_PORT)
Yomu.server(:text, 9294)
225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 |
# File 'lib/yomu.rb', line 225

# Records the port to use for server reads and, if nothing accepts a
# connection on it, spawns the Tika jar in server mode on that port.
#
# @param type [Symbol] :text, :html, :metadata or :mimetype.
# @param custom_port [Integer, nil] port to use; DEFAULT_SERVER_PORT when nil.
# @return [Integer, nil] the pid of the spawned process, or nil when the
#   connection probe succeeded and nothing was spawned.
def self.server(type, custom_port=nil)
  switch = { text: '-t', html: '-h', metadata: '-m -j', mimetype: '-m -j' }[type]
  @@server_port = custom_port || DEFAULT_SERVER_PORT

  # Probe the port: a successful connect means something is already listening.
  probe = TCPSocket.new('localhost', @@server_port)
  probe.close
rescue Errno::ECONNREFUSED
  command = "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} --server --port #{@@server_port} #{switch}"
  @@server_pid = Process.spawn(command)
  sleep(2) # Give the server 2 seconds to spin up.
  @@server_pid
end
Instance Method Details
#creation_date ⇒ Object
166 167 168 169 170 171 172 173 174 |
# File 'lib/yomu.rb', line 166

# Returns the document's 'Creation-Date' metadata entry parsed as a Time.
#
# The parsed value is cached; a missing entry is not cached, so it is looked
# up again on the next call.
#
# @return [Time, nil] the parsed creation date, or nil when the metadata has
#   no 'Creation-Date' entry.
def creation_date
  return @creation_date if defined? @creation_date

  raw = metadata['Creation-Date']
  @creation_date = Time.parse(raw) if raw
end
#data ⇒ Object
204 205 206 207 208 209 210 211 212 213 214 215 216 |
# File 'lib/yomu.rb', line 204

# Returns the raw/unparsed content of the Yomu document.
#
# The content is loaded on first use: read from the file at @path, fetched
# with Net::HTTP from @uri, or read from @stream, whichever the document was
# created with.
#
# @return [String, nil] the raw content.
def data
  unless defined? @data
    if path?
      @data = File.read(@path)
    elsif uri?
      @data = Net::HTTP.get(@uri)
    elsif stream?
      @data = @stream.read
    end
  end

  @data
end
#html ⇒ Object
129 130 131 132 133 |
# File 'lib/yomu.rb', line 129

# Returns the text content of the Yomu document in HTML.
#
# The result of Yomu.read(:html, data) is cached after the first call.
#
# @return [Object] whatever Yomu.read returns for :html.
def html
  @html = Yomu.read(:html, data) unless defined? @html
  @html
end
#metadata ⇒ Object
140 141 142 143 144 |
# File 'lib/yomu.rb', line 140

# Returns the metadata hash of the Yomu document.
#
# The result of Yomu.read(:metadata, data) is cached after the first call.
#
# @return [Object] whatever Yomu.read returns for :metadata.
def metadata
  return @metadata if defined? @metadata

  @metadata = Yomu.read :metadata, data
end
#mimetype ⇒ Object
152 153 154 155 156 157 158 |
# File 'lib/yomu.rb', line 152

# Returns the mimetype object of the Yomu document.
#
# Looks up the metadata 'Content-Type' entry in MIME::Types and caches the
# first match.
#
# @return [Object, nil] the first MIME::Types match for the content type.
def mimetype
  return @mimetype if defined? @mimetype

  type = metadata['Content-Type']
  # Content-Type may be a list; use its first entry.
  type = type.first if type.is_a?(Array)

  @mimetype = MIME::Types[type].first
end
#path? ⇒ Boolean
176 177 178 |
# File 'lib/yomu.rb', line 176

# Returns true if the Yomu document was specified using a file path.
#
# @return [Boolean] true when @path has been set, false otherwise.
def path?
  # Return a real boolean rather than the String/nil that `defined?` yields.
  instance_variable_defined?(:@path)
end
#stream? ⇒ Boolean
195 196 197 |
# File 'lib/yomu.rb', line 195

# Returns true if the Yomu document was specified from a stream or an object
# which responds to read.
#
# @return [Boolean] true when @stream has been set, false otherwise.
def stream?
  # Return a real boolean rather than the String/nil that `defined?` yields.
  instance_variable_defined?(:@stream)
end