Module: ODSExtractor

Defined in:
lib/ods_extractor.rb,
lib/ods_extractor/version.rb

Defined Under Namespace

Classes: CSVOutput, Error, RowOutput, SAXHandler, SheetFilterHandler

Constant Summary collapse

ACCEPT_ALL_SHEETS_PROC =
->(_sheet_name) { true }
PROGRESS_HANDLER_PROC =
->(bytes_read, bytes_remaining) { true }
CHUNK_SIZE =
32 * 1024
VERSION =
"0.1.1"

Class Method Summary collapse

Class Method Details

.extract(input_io:, output_handler:, sheet_names: ACCEPT_ALL_SHEETS_PROC, progress_handler_proc: PROGRESS_HANDLER_PROC) ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/ods_extractor.rb', line 18

# Streams the `content.xml` entry of an ODS (zip) document through a SAX push
# parser, handing the parsed content to `output_handler`.
#
# @param input_io the IO holding the ODS file; it is passed to
#   `ZipKit::FileReader.read_zip_structure` and then to the entry's extractor
# @param output_handler the object wrapped by `ODSExtractor::SAXHandler`
# @param sheet_names the sheet filter handed to `ODSExtractor::SheetFilterHandler`
#   (defaults to `ACCEPT_ALL_SHEETS_PROC`)
# @param progress_handler_proc [#call] called as `call(bytes_read, bytes_remaining)`,
#   once with `(0, uncompressed_size)` before reading starts and then once per
#   inflated chunk, before that chunk is pushed to the parser
# @raise [Error] if the archive has no `content.xml` entry, or if the extractor
#   stops yielding data before it reports EOF
# @return [nil]
def self.extract(input_io:, output_handler:, sheet_names: ACCEPT_ALL_SHEETS_PROC, progress_handler_proc: PROGRESS_HANDLER_PROC)
  # Feed the XML from the extractor directly to the SAX parser
  entries = ZipKit::FileReader.read_zip_structure(io: input_io)
  content_xml_entry = entries.find { |e| e.filename == "content.xml" }

  raise Error, "No `content.xml` found in the ODS file" unless content_xml_entry

  sax_handler = ODSExtractor::SAXHandler.new(output_handler)
  sax_filter = ODSExtractor::SheetFilterHandler.new(sax_handler, sheet_names)

  # Because we do not have a random access IO to the deflated XML inside the zip, but
  # we will be reading the deflated bytes and inflating them ourselves, we can't really
  # use the standard Parser - we need to use the PushParser. The Parser "reads" by itself
  # from the IO it has been given, PushParser can be fed bytes as we inflate them.
  push_parser = Nokogiri::XML::SAX::PushParser.new(sax_filter)
  fed_completely = false

  # The "extract" call reads N bytes, inflates them and then returns them. We do not
  # know how big the inflated data will be before we inflate it, and the libxml2
  # push parser will abort with an error if we force-feed it chunks which are too big.
  # So read small.
  extractor = content_xml_entry.extractor_from(input_io)
  total_size = content_xml_entry.uncompressed_size
  progress_handler_proc.call(0, total_size)
  bytes_read = 0
  until extractor.eof?
    chunk = extractor.extract(CHUNK_SIZE)
    # NOTE(review): presumably the extractor returns nil when the underlying IO runs
    # dry before the compressed stream is complete (truncated archive) - confirm
    # against ZipKit. Fail with our own error instead of a NoMethodError on nil.
    raise Error, "Unexpected end of data while reading `content.xml`" if chunk.nil?

    bytes_read += chunk.bytesize
    progress_handler_proc.call(bytes_read, total_size - bytes_read)
    push_parser << chunk
  end
  fed_completely = true
  # Keep returning nil, as the bare `until` loop used to.
  nil
ensure
  if push_parser
    if fed_completely
      # Normal completion: let any error from closing the document surface.
      push_parser.finish
    else
      begin
        # Another exception is already propagating. Still give the parser a chance
        # to wind down, but do not let an error about the half-fed document replace
        # the exception that actually caused the failure.
        push_parser.finish
      rescue StandardError
        # Deliberately ignored - the original exception is the one worth reporting.
      end
    end
  end
end