Module: PlainTextExtractorDSL

Included in:
PlainTextExtractor
Defined in:
lib/picolena/templates/lib/plain_text_extractor_dsl.rb

Overview

Defines plain text extractors with a DSL. For example, to convert a “Microsoft Office Word document” to plain text:

# Declares an extractor; the block is instance_eval'ed by #initialize, which
# then hands the new extractor to PlainTextExtractor.add.
PlainTextExtractor.new {
  # File extensions handled by this extractor.
  every :doc, :dot
  # MIME name of the file type.
  as "application/msword"
  # Description of the file type.
  aka "Microsoft Office Word document"
  # One command per platform; the Hash is resolved by command_for_current_platform.
  extract_content_with "antiword SOURCE" => :on_linux, "some other command" => :on_windows
  # Example (expected content, file) pairs, recorded for the specs.
  which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
  or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
}

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#command ⇒ Object (readonly)

Returns the value of attribute command.



13
14
15
# File 'lib/picolena/templates/lib/plain_text_extractor_dsl.rb', line 13

# Content-extraction command of this extractor: whatever extract_content_with
# or extract_content_from_archive_with last stored (a command String, the
# platform-specific command picked from a Hash, a block, or a lambda).
# nil until one of those methods has been called.
def command
  @command
end

#content_and_file_examples ⇒ Object (readonly)

Returns the value of attribute content_and_file_examples.



13
14
15
# File 'lib/picolena/templates/lib/plain_text_extractor_dsl.rb', line 13

# Array of [expected content, example file] pairs. It starts empty in
# #initialize and grows with every which_should_for_example_extract call.
def content_and_file_examples
  @content_and_file_examples
end

#description ⇒ Object (readonly)

Returns the value of attribute description.



13
14
15
# File 'lib/picolena/templates/lib/plain_text_extractor_dsl.rb', line 13

# Description of the handled file type, as given to #aka
# (nil until #aka has been called).
def description
  @description
end

#exts ⇒ Object (readonly)

Returns the value of attribute exts.



13
14
15
# File 'lib/picolena/templates/lib/plain_text_extractor_dsl.rb', line 13

# Extensions accumulated by #every, without duplicates
# (nil until #every has been called).
def exts
  @exts
end

#mime_name ⇒ Object (readonly)

Returns the value of attribute mime_name.



13
14
15
# File 'lib/picolena/templates/lib/plain_text_extractor_dsl.rb', line 13

# MIME name of the handled file type, as given to #as
# (nil until #as has been called).
def mime_name
  @mime_name
end

#thumbnail_command ⇒ Object (readonly)

Returns the value of attribute thumbnail_command.



13
14
15
# File 'lib/picolena/templates/lib/plain_text_extractor_dsl.rb', line 13

# Thumbnail-extraction command stored by extract_thumbnail_with: a command
# String, or the platform-specific command picked from a Hash. nil when
# extract_thumbnail_with was never called or received neither of those.
def thumbnail_command
  @thumbnail_command
end

Instance Method Details

#aka(description) ⇒ Object



30
31
32
# File 'lib/picolena/templates/lib/plain_text_extractor_dsl.rb', line 30

# Stores the description of the file type handled by this extractor
# (for instance "Microsoft Office Word document"). The value can be read back
# through #description, and is also what this method returns.
def aka(description)
  @description = description
end

#as(mime_name) ⇒ Object



26
27
28
# File 'lib/picolena/templates/lib/plain_text_extractor_dsl.rb', line 26

# Stores the MIME name of the file type handled by this extractor
# (for instance "application/msword"). The value can be read back through
# #mime_name, and is also what this method returns.
def as(mime_name)
  @mime_name = mime_name
end

#every(*exts) ⇒ Object



21
22
23
24
# File 'lib/picolena/templates/lib/plain_text_extractor_dsl.rb', line 21

# Adds the given extensions to the ones already declared for this extractor.
# Array union keeps the existing order and drops duplicates, so calling
# +every+ several times accumulates extensions.
# Returns the resulting list.
def every(*exts)
  @exts = (@exts || []) | exts
end

#extract_content_from_archive_with(unpack_command) ⇒ Object

Unpack an archive and extract content from every supported file



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/picolena/templates/lib/plain_text_extractor_dsl.rb', line 76

# Unpack an archive and extract content from every supported file.
#
# Stores in @command a lambda taking the path of the archive (+source+).
# When called, the lambda:
# 1. creates a directory named after source.base26_hash below
#    <tmpdir>/picolena_archive_temp,
# 2. runs +unpack_command+ through silently_execute, after replacing its
#    first SOURCE placeholder with the double-quoted archive path and its
#    first TMPDIR (or TEMPDIR) placeholder with the double-quoted directory,
# 3. calls PlainTextExtractor.extract_content_from on every regular file found
#    below that directory ("---" stands for the content when extraction raises),
# 4. returns the results joined with newlines; each file is introduced by a
#    "##" header holding its path inside the archive, with '/' shown as '>'.
# The per-archive directory is removed afterwards, even if something failed.
#
# unpack_command.dependencies is appended to @dependencies, which is the
# return value of this method.
def extract_content_from_archive_with(unpack_command)
  @command = lambda { |source|
    global_temp_dir = specific_temp_dir = nil
    begin
      global_temp_dir   = File.join(Dir.tmpdir, 'picolena_archive_temp')
      specific_temp_dir = File.join(global_temp_dir, source.base26_hash)
      FileUtils.mkpath(specific_temp_dir)

      # Block form of sub: the replacement is inserted verbatim. With a String
      # replacement, backslash sequences such as "\1" in a path would be
      # interpreted as back-references and silently dropped.
      # NOTE: paths are only wrapped in double quotes, not shell-escaped; a path
      # containing a double quote would still break the command.
      specific_unpack_command = unpack_command.
        sub('SOURCE') { "\"#{source}\"" }.
        sub(/TE?MPDIR/) { "\"#{specific_temp_dir}\"" }
      silently_execute(specific_unpack_command)

      unpacked_files = Dir["#{specific_temp_dir}/**/*"].select { |f| File.file?(f) }
      unpacked_files.map { |filename|
        content = begin
          PlainTextExtractor.extract_content_from(filename)
        rescue StandardError
          # Best effort: one unreadable file must not prevent indexing the others.
          "---"
        end
        ['##' + filename.sub(specific_temp_dir, '').tr('/', '>'), content]
      }.join("\n")
    ensure
      # Only clean up what was actually created, so that a failure during setup
      # is not masked by a second exception raised from this ensure clause.
      if specific_temp_dir && File.exist?(specific_temp_dir)
        FileUtils.remove_entry_secure(specific_temp_dir)
      end
      begin
        FileUtils.rmdir(global_temp_dir) if global_temp_dir
      rescue StandardError
        # The shared directory is left in place when it cannot be removed
        # (e.g. it is not empty).
      end
    end
  }
  (@dependencies ||= []) << unpack_command.dependencies
end

#extract_content_with(command_as_hash_or_string = nil, &block) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
# File 'lib/picolena/templates/lib/plain_text_extractor_dsl.rb', line 53

# Defines how content is extracted, and stores the result in @command:
# * a String is kept as is,
# * a Hash is resolved through command_for_current_platform,
# * otherwise the given block is used.
# Raises a RuntimeError mentioning the extractor description when none of
# these is available. Returns the stored command.
def extract_content_with(command_as_hash_or_string=nil, &block)
  #TODO: Find a better way to manage platforms, and include OS X, Vista, BSD...
  @command =
    if command_as_hash_or_string.is_a?(String)
      command_as_hash_or_string
    elsif command_as_hash_or_string.is_a?(Hash)
      command_for_current_platform(command_as_hash_or_string)
    elsif block
      block
    else
      raise("No command defined for this extractor: #{description}")
    end
end

#extract_thumbnail_with(command_as_hash_or_string = nil, &block) ⇒ Object



65
66
67
68
69
70
71
72
73
# File 'lib/picolena/templates/lib/plain_text_extractor_dsl.rb', line 65

# Defines how a thumbnail is extracted, and stores the result in
# @thumbnail_command:
# * a String is kept as is,
# * a Hash is resolved through command_for_current_platform,
# * anything else (including no argument) stores nil.
# The block is accepted but currently ignored. Returns the stored value.
def extract_thumbnail_with(command_as_hash_or_string=nil, &block)
  #TODO: Don't ignore block and use it as in extract_content_with
  @thumbnail_command =
    if command_as_hash_or_string.is_a?(String)
      command_as_hash_or_string
    elsif command_as_hash_or_string.is_a?(Hash)
      command_for_current_platform(command_as_hash_or_string)
    end
end

#initialize(&block) ⇒ Object



15
16
17
18
19
# File 'lib/picolena/templates/lib/plain_text_extractor_dsl.rb', line 15

# Builds an extractor from a DSL block.
# The list of examples starts empty, the block is evaluated in the context of
# the new extractor (so DSL methods such as +every+ or +aka+ apply to it), and
# the extractor is finally passed to PlainTextExtractor.add.
def initialize(&block)
  @content_and_file_examples = []
  instance_eval(&block)
  PlainTextExtractor.add(self)
end

#which_requires(*dependencies) ⇒ Object



34
35
36
# File 'lib/picolena/templates/lib/plain_text_extractor_dsl.rb', line 34

# Declares the dependencies of this extractor.
# The given list replaces whatever was stored in @dependencies before.
# Returns the stored list.
def which_requires(*dependencies)
  @dependencies = dependencies
end

#which_should_for_example_extract(content, file) ⇒ Object Also known as: or_extract

Used by RSpec to test extractors:

which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
or_extract 'some other stuff inside another pdf file', :from => 'yet_another.pdf'

This spec will pass if ‘basic.pdf’ and ‘yet_another.pdf’ are included in an indexed directory, if every dependency is installed, and if the plain text output from the extractor applied to ‘basic.pdf’ and ‘yet_another.pdf’ includes ‘in a pdf file’ and ‘some other stuff inside another pdf file’, respectively.



44
45
46
# File 'lib/picolena/templates/lib/plain_text_extractor_dsl.rb', line 44

# Records an example for the specs: +content+ is the text expected in the
# extractor output, and +file+ is a Hash whose :from entry names the example
# file. Returns the list of all [content, file name] pairs recorded so far.
def which_should_for_example_extract(content, file)
  example = [content, file[:from]]
  @content_and_file_examples.push(example)
end