Class: Henkei

Inherits:
Object
  • Object
show all
Defined in:
lib/henkei.rb,
lib/henkei/version.rb,
lib/henkei/configuration.rb

Overview

Henkei monkey patch for configuration support

Defined Under Namespace

Classes: Configuration

Constant Summary collapse

GEM_PATH =

rubocop:disable Metrics/ClassLength

File.dirname(File.dirname(__FILE__))
JAR_PATH =
File.join(Henkei::GEM_PATH, 'jar', 'tika-app-2.9.2.jar')
CONFIG_PATH =
File.join(Henkei::GEM_PATH, 'jar', 'tika-config.xml')
CONFIG_WITHOUT_OCR_PATH =
File.join(Henkei::GEM_PATH, 'jar', 'tika-config-without-ocr.xml')
VERSION =
'2.9.2.3'

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input) ⇒ Henkei

Create a new instance of Henkei with a given document.

Using a file path:

Henkei.new 'sample.pages'

Using a URL:

Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'

From a stream or an object which responds to read

Henkei.new File.open('sample.pages')


74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/henkei.rb', line 74

def initialize(input)
  if input.is_a? String
    if File.exist? input
      @path = input
    elsif input =~ URI::DEFAULT_PARSER.make_regexp
      @uri = URI.parse input
    else
      raise Errno::ENOENT, "missing file or invalid URI - #{input}"
    end
  elsif input.respond_to? :read
    @stream = input
  else
    raise TypeError, "can't read from #{input.class.name}"
  end
end

Class Method Details

.configurationObject



5
6
7
# File 'lib/henkei/configuration.rb', line 5

# Returns the memoized Configuration, creating it on first access.
#
# @return [Configuration] the same object on every call once created
def self.configuration
  @configuration = Configuration.new unless @configuration
  @configuration
end

.configure {|configuration| ... } ⇒ Object

Yields:



9
10
11
# File 'lib/henkei/configuration.rb', line 9

# Yields the object returned by +configuration+ to the caller's block.
#
# @yield [configuration] the value returned by +configuration+
# @return [Object] whatever the block returns
def self.configure
  settings = configuration
  yield settings
end

.mimetype(content_type) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
# File 'lib/henkei.rb', line 32

# Resolves a content type string to a mimetype object.
#
# When Henkei.configuration.mime_library is 'mime/types' and MIME::Types is
# defined, a deprecation warning is emitted and the first MIME::Types match is
# returned. Otherwise the MiniMime lookup result is returned, extended with an
# +extensions+ method that wraps its single +extension+ in an Array.
#
# @param content_type [String] content type to look up
# @return [Object, nil] the mimetype object, or nil when the lookup finds nothing
def self.mimetype(content_type)
  if Henkei.configuration.mime_library == 'mime/types' && defined?(MIME::Types)
    warn '[DEPRECATION] `mime/types` is deprecated. Please use `mini_mime` instead. ' \
         'Use Henkei.configure and assign "mini_mime" to `mime_library`.'
    MIME::Types[content_type].first
  else
    # Safe navigation is required: when the lookup yields nil, defining a
    # singleton method on nil would add `extensions` to NilClass process-wide.
    MiniMime.lookup_by_content_type(content_type)&.tap do |info|
      info.define_singleton_method(:extensions) { [extension] }
    end
  end
end

.read(type, data, include_ocr: false, encoding: nil) ⇒ Object

Read text or metadata from a data buffer.

data = File.read 'sample.pages'
text = Henkei.read :text, data
metadata = Henkei.read :metadata, data


50
51
52
53
54
55
56
57
58
# File 'lib/henkei.rb', line 50

# Reads +type+ from a data buffer and post-processes the raw client result.
#
# :text and :html results are returned untouched, :metadata is parsed as JSON,
# and :mimetype parses the JSON and resolves its 'Content-Type' entry through
# Henkei.mimetype. Any other type yields nil after the client call.
#
# @param type [Symbol] one of :text, :html, :metadata, :mimetype
# @param data [Object] buffer handed to client_read unchanged
# @param include_ocr [Boolean] forwarded to client_read
# @param encoding [String, nil] forwarded to client_read
# @return [Object, nil] see above
def self.read(type, data, include_ocr: false, encoding: nil)
  raw = client_read(type, data, include_ocr: include_ocr, encoding: encoding)
  return raw if %i[text html].include?(type)
  return JSON.parse(raw) if type == :metadata

  Henkei.mimetype(JSON.parse(raw)['Content-Type']) if type == :mimetype
end

Instance Method Details

#creation_dateObject

Returns the creation date of the Henkei document as a Time, parsed from the 'dcterms:created' metadata entry, or nil when that entry is absent.

henkei = Henkei.new 'sample.pages'
henkei.creation_date


157
158
159
160
161
162
# File 'lib/henkei.rb', line 157

# Returns the document's creation time, parsed from the 'dcterms:created'
# metadata entry.
#
# The parsed value is memoized; a missing entry is not memoized and yields nil.
#
# @return [Time, nil] parsed creation time, or nil when the entry is absent
def creation_date
  return @creation_date if defined? @creation_date

  # The value must be read from the metadata hash: a bare ['dcterms:created']
  # would be an Array literal, always truthy and not parseable by Time.parse.
  created = metadata['dcterms:created']
  return unless created

  @creation_date = Time.parse(created)
end

#dataObject

Returns the raw/unparsed content of the Henkei document.

henkei = Henkei.new 'sample.pages'
henkei.data


197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/henkei.rb', line 197

# Returns the raw content of the document, loading it on first use.
#
# The source is chosen in order: file path (File.read), URI (Net::HTTP.get),
# then stream (+read+). The loaded value is memoized in @data. When none of
# the predicates is true, nil is returned and nothing is memoized.
#
# @return [String, nil] raw content, or nil when no source is set
def data
  return @data if defined? @data
  return @data = File.read(@path) if path?
  return @data = Net::HTTP.get(@uri) if uri?
  return @data = @stream.read if stream?

  nil
end

#html(include_ocr: false, encoding: nil) ⇒ Object

Returns the text content of the Henkei document in HTML.

henkei = Henkei.new 'sample.pages'
henkei.html

Include OCR results from images (includes embedded images in pages/docx/pdf etc)

henkei.html(include_ocr: true)

Set the output character encoding (e.g. ‘UTF-8’)

henkei.html(encoding: 'UTF-8')


122
123
124
125
126
# File 'lib/henkei.rb', line 122

# Returns the HTML content of the document via Henkei.read :html.
#
# Results are memoized per (include_ocr, encoding) combination, so a call with
# different options is never answered from an earlier call's cached result.
#
# @param include_ocr [Boolean] forwarded to Henkei.read
# @param encoding [String, nil] forwarded to Henkei.read
# @return [Object] the value Henkei.read returns for :html
def html(include_ocr: false, encoding: nil)
  @html_by_options ||= {}
  cache_key = [include_ocr, encoding]
  # key? rather than a truthiness check so a nil result is cached as well.
  return @html_by_options[cache_key] if @html_by_options.key?(cache_key)

  @html_by_options[cache_key] = Henkei.read :html, data, include_ocr: include_ocr, encoding: encoding
end

#metadataObject

Returns the metadata hash of the Henkei document.

henkei = Henkei.new 'sample.pages'
henkei.metadata['Content-Type']


133
134
135
136
137
# File 'lib/henkei.rb', line 133

# Returns the metadata of the document via Henkei.read :metadata.
#
# The result is memoized in @metadata after the first call.
#
# @return [Object] the value Henkei.read returns for :metadata
def metadata
  return @metadata if defined? @metadata

  @metadata = Henkei.read :metadata, data
end

#mimetypeObject

Returns the mimetype object of the Henkei document.

henkei = Henkei.new 'sample.docx'
henkei.mimetype.content_type #=> 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
henkei.mimetype.extensions #=> ['docx']


145
146
147
148
149
150
# File 'lib/henkei.rb', line 145

# Returns the mimetype object of the document.
#
# The 'Content-Type' metadata entry is resolved through Henkei.mimetype; when
# the entry is an Array, its first element is used. The result is memoized.
#
# @return [Object] the value Henkei.mimetype returns for the content type
def mimetype
  return @mimetype if defined? @mimetype

  # Array() turns a single value into a one-element list and leaves an Array
  # as is, so +first+ covers both the single- and multi-valued cases.
  content_type = Array(metadata['Content-Type']).first
  @mimetype = Henkei.mimetype(content_type)
end

#path?Boolean

Returns true if the Henkei document was specified using a file path.

henkei = Henkei.new '/my/document/path/sample.docx'
henkei.path? #=> true

Returns:

  • (Boolean)


169
170
171
# File 'lib/henkei.rb', line 169

# Reports whether a file path is stored for this document.
#
# @return [Boolean] true when @path is truthy, false otherwise
def path?
  @path ? true : false
end

#stream?Boolean

Returns true if the Henkei document was specified from a stream or an object which responds to read.

file = File.open('sample.pages')
henkei = Henkei.new file
henkei.stream? #=> true

Returns:

  • (Boolean)


188
189
190
# File 'lib/henkei.rb', line 188

# Reports whether a stream (an object responding to +read+) is stored for
# this document.
#
# @return [Boolean] true when @stream is truthy, false otherwise
def stream?
  @stream ? true : false
end

#text(include_ocr: false, encoding: nil) ⇒ Object

Returns the text content of the Henkei document.

henkei = Henkei.new 'sample.pages'
henkei.text

Include OCR results from images (includes embedded images in pages/docx/pdf etc)

henkei.text(include_ocr: true)

Set the output character encoding (e.g. ‘UTF-8’)

henkei.text(encoding: 'UTF-8')


103
104
105
106
107
# File 'lib/henkei.rb', line 103

# Returns the text content of the document via Henkei.read :text.
#
# Results are memoized per (include_ocr, encoding) combination, so a call with
# different options is never answered from an earlier call's cached result.
#
# @param include_ocr [Boolean] forwarded to Henkei.read
# @param encoding [String, nil] forwarded to Henkei.read
# @return [Object] the value Henkei.read returns for :text
def text(include_ocr: false, encoding: nil)
  @text_by_options ||= {}
  cache_key = [include_ocr, encoding]
  # key? rather than a truthiness check so a nil result is cached as well.
  return @text_by_options[cache_key] if @text_by_options.key?(cache_key)

  @text_by_options[cache_key] = Henkei.read :text, data, include_ocr: include_ocr, encoding: encoding
end

#uri?Boolean

Returns true if the Henkei document was specified using a URI.

henkei = Henkei.new 'http://svn.apache.org/repos/asf/poi/trunk/test-data/document/sample.docx'
henkei.uri? #=> true

Returns:

  • (Boolean)


178
179
180
# File 'lib/henkei.rb', line 178

# Reports whether a URI is stored for this document.
#
# @return [Boolean] true when @uri is truthy, false otherwise
def uri?
  @uri ? true : false
end