Class: Yomu

Inherits:
Object
  • Object
show all
Defined in:
lib/yomu.rb,
lib/yomu/version.rb

Constant Summary collapse

GEMPATH =
File.dirname(File.dirname(__FILE__))
JARPATH =
File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.11.jar')
DEFAULT_SERVER_PORT =

an arbitrary, but perfectly cromulent, port

9293
VERSION =
'0.3.2'
@@server_port =
nil
@@server_pid =
nil

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input) ⇒ Yomu

Create a new instance of Yomu with a given document.

Using a file path:

Yomu.new 'sample.pages'

Using a URL:

Yomu.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'

From a stream or an object which responds to read:

Yomu.new File.open('sample.pages')


97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/yomu.rb', line 97

def initialize(input)
  if input.is_a? String
    if File.exists? input
      @path = input
    elsif input =~ URI::regexp
      @uri = URI.parse input
    else
      raise Errno::ENOENT.new "missing file or invalid URI - #{input}"
    end
  elsif input.respond_to? :read
    @stream = input
  else
    raise TypeError.new "can't read from #{input.class.name}"
  end
end

Class Method Details

._client_read(type, data) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/yomu.rb', line 39

# Pipe a document through the Tika command-line jar and return its output.
#
# The +java+ helper used to build the command is defined elsewhere in the
# class (not visible in this listing).
#
# @param type [Symbol] :text, :html, :metadata or :mimetype
# @param data [String] raw document contents, written to the child's stdin
# @return [String] everything read back from the child process
def self._client_read(type, data)
  # :metadata and :mimetype share the same switches. Any other type leaves
  # the switch nil, which interpolates as an empty string.
  tika_switch =
    case type
    when :text then '-t'
    when :html then '-h'
    when :metadata, :mimetype then '-m -j'
    end

  command = "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{tika_switch}"

  IO.popen(command, 'r+') do |pipe|
    pipe.write data
    # Close our write end so the child sees end-of-input before we read.
    pipe.close_write
    pipe.read
  end
end

._server_read(_, data) ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/yomu.rb', line 59

# Send a document to the Tika server on localhost and return its reply.
#
# The first argument is ignored; the port comes from @@server_port.
#
# @param _ [Object] unused
# @param data [String] raw document contents
# @return [String] the bytes received from the server
def self._server_read(_, data)
  s = TCPSocket.new('localhost', @@server_port)
  file = StringIO.new(data, 'r')

  # Stream the document in 64 KiB chunks; read returns nil at end of data.
  while (chunk = file.read(65536))
    s.write(chunk)
  end

  # tell Tika that we're done sending data
  s.shutdown(Socket::SHUT_WR)

  # Unfrozen buffer, so appending works under frozen_string_literal too.
  resp = ''.dup
  loop do
    chunk = s.recv(65536)
    # Check nil first: recv returns nil at end of stream on newer Rubies and
    # an empty string on older ones. Calling empty? on nil would raise.
    break if chunk.nil? || chunk.empty?
    resp << chunk
  end
  resp
ensure
  # s is nil when the connection attempt itself raised.
  s.close unless s.nil?
end

.kill_server! ⇒ Object

Kills server started by Yomu.server

Always run this when you're done, or else Tika might keep running until you kill it manually.
You might try putting your extraction in a begin...rescue...ensure...end block and
  putting this method in the ensure block.

Yomu.server(:text)
reports = ["report1.docx", "report2.doc", "report3.pdf"]
begin
  my_texts = reports.map{|report_path| Yomu.new(report_path).text }
rescue
ensure
  Yomu.kill_server!
end


262
263
264
265
266
267
268
# File 'lib/yomu.rb', line 262

# Send INT to the server process recorded in @@server_pid and forget it.
#
# Does nothing when no pid is recorded. The recorded pid and port are
# cleared even if the process has already exited, so that a stale port is
# not kept around.
#
# @return [nil]
def self.kill_server!
  return unless @@server_pid

  begin
    Process.kill('INT', @@server_pid)
  rescue Errno::ESRCH
    # The process is already gone; there is nothing left to signal.
  ensure
    @@server_pid = nil
    @@server_port = nil
  end

  nil
end

.read(type, data) ⇒ Object

Read text or metadata from a data buffer.

data = File.read 'sample.pages'
text = Yomu.read :text, data
metadata = Yomu.read :metadata, data


24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/yomu.rb', line 24

# Extract text, HTML, metadata or the mimetype from a data buffer.
#
# Uses the server path when @@server_port is set, otherwise the
# command-line client.
#
# @param type [Symbol] :text, :html, :metadata or :mimetype
# @param data [String] raw document contents
# @return [Object] the raw output for :text and :html, the parsed JSON for
#   :metadata, the first MIME::Types match for :mimetype, and nil for any
#   other type
def self.read(type, data)
  raw =
    if @@server_port
      _server_read(type, data)
    else
      _client_read(type, data)
    end

  case type
  when :text, :html
    raw
  when :metadata
    JSON.parse(raw)
  when :mimetype
    content_type = JSON.parse(raw)['Content-Type']
    MIME::Types[content_type].first
  end
end

.server(type, custom_port = nil) ⇒ Object

Returns pid of Tika server, started as a new spawned process.

type :html, :text, :metadata or :mimetype
custom_port e.g. 9293

Yomu.server(:text, 9294)


225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# File 'lib/yomu.rb', line 225

# Start a Tika server process unless something already accepts connections
# on the chosen port.
#
# The +java+ helper used to build the command is defined elsewhere in the
# class (not visible in this listing).
#
# @param type [Symbol] :text, :html, :metadata or :mimetype
# @param custom_port [Integer, nil] port to use; falls back to
#   DEFAULT_SERVER_PORT when nil
# @return [Integer, nil] pid of the spawned process, or nil when the port
#   was already accepting connections and nothing was spawned
def self.server(type, custom_port=nil)
  # :metadata and :mimetype share the same switches.
  tika_switch =
    case type
    when :text then '-t'
    when :html then '-h'
    when :metadata, :mimetype then '-m -j'
    end

  @@server_port = custom_port || DEFAULT_SERVER_PORT

  begin
    # Probe the port: a successful connect means a listener already exists.
    probe = TCPSocket.new('localhost', @@server_port)
    probe.close
  rescue Errno::ECONNREFUSED
    command = "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} --server --port #{@@server_port} #{tika_switch}"
    @@server_pid = Process.spawn(command)
    sleep(2) # Give the server 2 seconds to spin up.
    @@server_pid
  end
end

Instance Method Details

#creation_date ⇒ Object



166
167
168
169
170
171
172
173
174
# File 'lib/yomu.rb', line 166

# Returns the document's 'Creation-Date' metadata entry parsed as a Time.
#
# The parsed value is memoized once it has been computed.
#
# @return [Time, nil] nil when the metadata has no 'Creation-Date' entry
def creation_date
  return @creation_date if defined? @creation_date

  # The lookup must go through the metadata hash; a bare ['Creation-Date']
  # is an array literal and is always truthy.
  raw_date = metadata['Creation-Date']
  @creation_date = Time.parse(raw_date) if raw_date
end

#data ⇒ Object

Returns the raw/unparsed content of the Yomu document.

yomu = Yomu.new 'sample.pages'
yomu.data


204
205
206
207
208
209
210
211
212
213
214
215
216
# File 'lib/yomu.rb', line 204

# Returns the raw, unparsed bytes of the document, loading them on first use.
#
# The source is chosen by which of path?, uri? or stream? holds: the file is
# read from disk, fetched with Net::HTTP.get, or read from the stream.
#
# @return [String, nil] the document contents
def data
  unless defined? @data
    if path?
      @data = File.read(@path)
    elsif uri?
      @data = Net::HTTP.get(@uri)
    elsif stream?
      @data = @stream.read
    end
  end

  @data
end

#html ⇒ Object

Returns the text content of the Yomu document in HTML.

yomu = Yomu.new 'sample.pages'
yomu.html


129
130
131
132
133
# File 'lib/yomu.rb', line 129

# Returns the document content as HTML, extracting it on first use.
#
# @return [Object] whatever Yomu.read returns for :html
def html
  @html = Yomu.read(:html, data) unless defined? @html
  @html
end

#metadata ⇒ Object

Returns the metadata hash of the Yomu document.

yomu = Yomu.new 'sample.pages'
yomu.metadata['Content-Type']


140
141
142
143
144
# File 'lib/yomu.rb', line 140

# Returns the document's metadata, extracting it on first use.
#
# @return [Object] whatever Yomu.read returns for :metadata
def metadata
  return @metadata if defined? @metadata

  @metadata = Yomu.read :metadata, data
end

#mimetype ⇒ Object

Returns the mimetype object of the Yomu document.

yomu = Yomu.new 'sample.docx'
yomu.mimetype.content_type #=> 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
yomu.mimetype.extensions #=> ['docx']


152
153
154
155
156
157
158
# File 'lib/yomu.rb', line 152

# Returns the first MIME::Types match for the document's 'Content-Type'
# metadata entry, memoized after the first call.
#
# @return [Object, nil] first element of the MIME::Types lookup
def mimetype
  return @mimetype if defined? @mimetype

  # The entry may be a single value or an Array of values; use the first.
  content_type = metadata['Content-Type']
  content_type = content_type.first if content_type.is_a?(Array)

  @mimetype = MIME::Types[content_type].first
end

#path? ⇒ Boolean

Returns:

  • (Boolean)


176
177
178
# File 'lib/yomu.rb', line 176

# Returns true if the @path instance variable has been set.
#
# Returns a real Boolean; `defined? @path` would yield the String
# "instance-variable" or nil.
#
# @return [Boolean]
def path?
  instance_variable_defined?(:@path)
end

#stream? ⇒ Boolean

Returns true if the Yomu document was specified from a stream or an object which responds to read.

file = File.open('sample.pages')
yomu = Yomu.new file
yomu.stream? #=> true

Returns:

  • (Boolean)


195
196
197
# File 'lib/yomu.rb', line 195

# Returns true if the @stream instance variable has been set.
#
# Returns a real Boolean; `defined? @stream` would yield the String
# "instance-variable" or nil.
#
# @return [Boolean]
def stream?
  instance_variable_defined?(:@stream)
end

#text ⇒ Object

Returns the text content of the Yomu document.

yomu = Yomu.new 'sample.pages'
yomu.text


118
119
120
121
122
# File 'lib/yomu.rb', line 118

# Returns the document's text content, extracting it on first use.
#
# @return [Object] whatever Yomu.read returns for :text
def text
  if defined? @text
    @text
  else
    @text = Yomu.read(:text, data)
  end
end

#uri? ⇒ Boolean

Returns true if the Yomu document was specified using a URI.

yomu = Yomu.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
yomu.uri? #=> true

Returns:

  • (Boolean)


185
186
187
# File 'lib/yomu.rb', line 185

# Returns true if the @uri instance variable has been set.
#
# Returns a real Boolean; `defined? @uri` would yield the String
# "instance-variable" or nil.
#
# @return [Boolean]
def uri?
  instance_variable_defined?(:@uri)
end