Class: Pho::FileManagement::FileSplitter

Inherits:
Object
  • Object
show all
Defined in:
lib/pho/upload.rb

Overview

Supports splitting RDF data files into smaller chunks of N-Triples.

Constant Summary collapse

DEFAULT_CHUNK_SIZE =
10000

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(dir = "/tmp", triples = DEFAULT_CHUNK_SIZE, handler = Pho::FileManagement::StatementHandler.new) ⇒ FileSplitter

Create a file splitter instance

dir:: temporary directory into which split files should be written
triples:: number of triples per split file
handler:: statement handler to allow pre-processing of statements


58
59
60
61
62
63
# File 'lib/pho/upload.rb', line 58

# Build a splitter and remember its three settings as instance state.
#
# dir:: directory that split files are written into (default "/tmp")
# triples:: number of triples per split file (default DEFAULT_CHUNK_SIZE)
# handler:: statement handler applied to statements before they are written
def initialize(
  dir = "/tmp",
  triples = DEFAULT_CHUNK_SIZE,
  handler = Pho::FileManagement::StatementHandler.new
)
  @dir, @triples = dir, triples
  @handler = handler
end

Instance Attribute Details

#dir ⇒ Object (readonly)

Returns the value of attribute dir.



49
50
51
# File 'lib/pho/upload.rb', line 49

# Reader for @dir, the directory that split_file joins chunk file names onto.
def dir
  @dir
end

#handler ⇒ Object (readonly)

Returns the value of attribute handler.



49
50
51
# File 'lib/pho/upload.rb', line 49

# Reader for @handler, the object whose #handle method split_file calls on
# every statement before buffering it for output.
def handler
  @handler
end

#triples ⇒ Object (readonly)

Returns the value of attribute triples.



49
50
51
# File 'lib/pho/upload.rb', line 49

# Reader for @triples, the number of statements split_file collects before
# writing out a chunk file.
def triples
  @triples
end

Instance Method Details

#split_file(filename, format = :ntriples) ⇒ Object

Split a single file, in any parseable RDF format, into smaller chunks of N-Triples. Chunked files are stored in the default temporary directory for this instance.

filename:: name of the file to split
format:: input format, default is :ntriples


71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/pho/upload.rb', line 71

# Split a single RDF file, in any parseable format, into chunk files stored
# in this instance's directory. Every statement is passed through the
# statement handler first. A chunk is written each time the running
# statement count reaches a multiple of the configured chunk size, and any
# remaining statements are written as a final, smaller chunk. Chunk files
# are named "<basename>_<count>.nt", where count is the number of statements
# read so far.
#
# The input file is opened in block form so that its handle is closed when
# splitting completes or an error is raised.
#
# filename:: name of the file to split
# format:: input format, default is :ntriples
def split_file(filename, format=:ntriples)
  basename = File.basename(filename, ".#{filename.split(".").last}")
  count = 0
  stmts = []
  File.open(filename) do |input|
    RDF::Reader.for(format).new(input) do |reader|
      reader.each_statement do |statement|
        count += 1
        stmts << @handler.handle(statement)
        next unless count % @triples == 0

        write_chunk(File.join(@dir, "#{basename}_#{count}.nt"), stmts)
        stmts = []
      end
    end
  end
  # Flush whatever is left over after the last full chunk.
  write_chunk(File.join(@dir, "#{basename}_#{count}.nt"), stmts) unless stmts.empty?
end

# Write the given statements, in order, to a new file at path using
# RDF::Writer.open.
def write_chunk(path, statements)
  RDF::Writer.open(path) do |writer|
    statements.each { |s| writer << s }
  end
end
private :write_chunk

#split_files(list_of_filenames, format = :ntriples) ⇒ Object

Split a list of files into smaller chunks

list_of_filenames:: array of filenames
format:: format of the files, default is :ntriples


103
104
105
106
107
# File 'lib/pho/upload.rb', line 103

# Run split_file over every entry of a list, using the same input format
# for each one.
#
# list_of_filenames:: array of filenames
# format:: format of the files, default is :ntriples
def split_files(list_of_filenames, format=:ntriples)
  list_of_filenames.each { |path| split_file(path, format) }
end