Module: Textractor

Defined in:
lib/textractor.rb,
lib/textractor/version.rb,
lib/textractor/extractors.rb

Defined Under Namespace

Modules: ContentTypeDetector, Extractors

Constant Summary collapse

UnknownContentType =
Class.new(StandardError)
FileNotFound =
Class.new(StandardError)
ContentTypeAlreadyRegistered =
Class.new(StandardError)
ContentTypeNotRegistered =
Class.new(StandardError)
VERSION =
'0.2.0'

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.content_type_detector ⇒ Object

Returns the value of attribute content_type_detector.



23
24
25
# File 'lib/textractor.rb', line 23

# Reader for the detector stored in the +@content_type_detector+ instance
# variable.
#
# @return [Object, nil] the stored detector, or nil when nothing has been
#   assigned to the instance variable yet
def content_type_detector
  @content_type_detector
end

Class Method Details

.clear_registry ⇒ Object



51
52
53
# File 'lib/textractor.rb', line 51

# Drops every registered extractor by pointing the registry at a brand-new,
# empty Hash. The previous Hash object is not mutated, so any reference a
# caller still holds to it keeps its old contents.
#
# @return [Hash] the new, empty registry
def self.clear_registry
  @extractors = Hash.new
end

.content_type_for_path(path) ⇒ Object



26
27
28
# File 'lib/textractor.rb', line 26

# Asks the configured content type detector which content type +path+ has.
#
# @param path [Object] the path handed through to the detector's
#   +content_type_for_path+
# @return [Object] whatever truthy value the detector reports
# @raise [UnknownContentType] when the detector returns nil or false
def self.content_type_for_path(path)
  detected = content_type_detector.content_type_for_path(path)
  return detected if detected

  raise UnknownContentType, "unable to determine content type for #{path}"
end

.extractor_for_content_type(content_type) ⇒ Object



43
44
45
# File 'lib/textractor.rb', line 43

# Looks up the extractor registered for +content_type+.
#
# @param content_type [Object] registry key to look up
# @return [Object] the registered extractor (a class or a block, depending on
#   how it was registered)
# @raise [ContentTypeNotRegistered] when the registry has no truthy entry for
#   +content_type+
def self.extractor_for_content_type(content_type)
  registered = extractors[content_type]
  raise ContentTypeNotRegistered, "#{content_type} is not registered with Textractor" unless registered

  registered
end

.extractors ⇒ Object



47
48
49
# File 'lib/textractor.rb', line 47

# The registry mapping content types to their extractors. It is created
# lazily: the first call (or the first call after the instance variable was
# set to nil/false) stores a fresh empty Hash, and later calls return that
# same object.
#
# @return [Hash] the memoized registry
def self.extractors
  @extractors ||= Hash.new
end

.register_basic_types ⇒ Object



55
56
57
58
59
60
# File 'lib/textractor.rb', line 55

# Registers the built-in extractors for PDF, Word (.doc and .docx) and plain
# text, in that order, through +register_content_type+.
#
# @return [Object] the result of the final +register_content_type+ call
def self.register_basic_types
  basic_types = [
    ["application/pdf", Extractors::PDFExtractor],
    ["application/msword", Extractors::DocExtractor],
    ["application/vnd.openxmlformats-officedocument.wordprocessingml.document", Extractors::DocxExtractor],
    ["text/plain", Extractors::TextExtractor]
  ]

  # map + last keeps the return value equal to that of the last registration.
  basic_types.map { |content_type, extractor| register_content_type(content_type, extractor) }.last
end

.register_content_type(content_type, extractor = nil, &block) ⇒ Object



30
31
32
33
34
35
36
37
# File 'lib/textractor.rb', line 30

# Adds an extractor for +content_type+ to the registry. The extractor may be
# passed as the second argument or as a block; when both are supplied the
# explicit argument is used. When neither is supplied nothing is stored.
#
# @param content_type [Object] registry key
# @param extractor [Object, nil] extractor to store for +content_type+
# @param block [Proc] stored as the extractor when +extractor+ is nil/false
# @return [Object, nil] the stored extractor, or nil when nothing was stored
# @raise [ContentTypeAlreadyRegistered] when the registry already holds a
#   truthy entry for +content_type+
def self.register_content_type(content_type, extractor = nil, &block)
  if extractors[content_type]
    raise ContentTypeAlreadyRegistered, "#{content_type} is already registered"
  end

  handler = extractor || block
  extractors[content_type] = handler if handler
end

.remove_content_type(content_type) ⇒ Object



39
40
41
# File 'lib/textractor.rb', line 39

# Removes the registry entry for +content_type+, if there is one.
#
# @param content_type [Object] registry key to remove
# @return [Object, nil] the extractor that was registered, or nil when the
#   key was absent
def self.remove_content_type(content_type)
  extractors.delete(content_type)
end

.text_from_path(path, options = {}) ⇒ Object

Raises:

(FileNotFound)



10
11
12
13
14
15
16
17
18
19
20
# File 'lib/textractor.rb', line 10

# Extracts the text of the file at +path+ using the extractor registered for
# its content type.
#
# @param path [String] location of the file to read
# @param options [Hash] optional settings
# @option options [Object] :content_type content type to use instead of
#   asking +content_type_for_path+; the detector is only consulted when this
#   key is absent
# @return [Object] whatever the selected extractor returns
# @raise [FileNotFound] when nothing exists at +path+
def self.text_from_path(path, options = {})
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  raise FileNotFound, "no file found at #{path}" unless File.exist?(path)

  content_type = options.fetch(:content_type) { content_type_for_path(path) }
  extractor = extractor_for_content_type(content_type)

  # Extractors registered as blocks are called directly; anything else is
  # treated as a class whose instances respond to #text_from_path.
  if extractor.is_a?(Proc)
    extractor.call(path)
  else
    extractor.new.text_from_path(path)
  end
end