Class: Pho::FileManagement::FileSplitter

Inherits:
Object
  • Object
show all
Defined in:
lib/pho/upload.rb

Overview

Supports splitting RDF data files into smaller chunks of ntriples

Constant Summary collapse

DEFAULT_CHUNK_SIZE =
10000

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(dir = "/tmp", triples = DEFAULT_CHUNK_SIZE, handler = Pho::FileManagement::StatementHandler.new) ⇒ FileSplitter

Create a file splitter instance

dir

temporary directory into which split files should be written

triples

number of triples per split file

handler

statement handler to allow pre-processing of statements



58
59
60
61
62
63
# File 'lib/pho/upload.rb', line 58

# Build a splitter.
#
# dir::     temporary directory into which split files should be written
# triples:: number of triples per split file
# handler:: statement handler to allow pre-processing of statements
def initialize(dir="/tmp", triples=DEFAULT_CHUNK_SIZE,
    handler=Pho::FileManagement::StatementHandler.new)
  @dir, @triples = dir, triples
  @handler = handler
end

Instance Attribute Details

#dir ⇒ Object (readonly)

Returns the value of attribute dir.



49
50
51
# File 'lib/pho/upload.rb', line 49

# Reader for the directory into which split files are written.
def dir; @dir; end

#handler ⇒ Object (readonly)

Returns the value of attribute handler.



49
50
51
# File 'lib/pho/upload.rb', line 49

# Reader for the statement handler applied to each statement before it is written.
def handler; @handler; end

#triples ⇒ Object (readonly)

Returns the value of attribute triples.



49
50
51
# File 'lib/pho/upload.rb', line 49

# Reader for the number of triples written to each split file.
def triples; @triples; end

Instance Method Details

#split_file(filename, format = :ntriples) ⇒ Object

Split a single file in any parseable RDF format into smaller chunks of N-Triples. The chunk files are written to the temporary directory configured for this instance.

filename

name of the file to split

format

input format, default is :ntriples



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/pho/upload.rb', line 71

# Split a single RDF file into smaller N-Triples chunk files.
#
# Every statement read from +filename+ is passed through
# <tt>@handler.handle</tt> and buffered. Each time the running statement
# count reaches a multiple of <tt>@triples</tt> the buffer is written to
# <tt>@dir/<basename>_<count>.nt</tt>; any statements left over once the
# input is exhausted are written to one final chunk file named the same way.
# No file is written when the input yields no statements.
#
# filename:: name of the file to split
# format::   input format, default is :ntriples
def split_file(filename, format=:ntriples)
  # Base name with its last extension removed, e.g. "data.rdf" -> "data".
  basename = File.basename(filename, ".*")
  count = 0
  stmts = []

  # Writes the currently buffered statements to the chunk file named after
  # the running statement count.
  write_chunk = lambda do
    RDF::Writer.open(File.join(@dir, "#{basename}_#{count}.nt")) do |writer|
      stmts.each { |s| writer << s }
    end
  end

  # Block form of File.open so the input handle is closed even when parsing
  # or writing raises part-way through.
  File.open(filename) do |input|
    RDF::Reader.for(format).new(input) do |reader|
      reader.each_statement do |statement|
        count += 1
        stmts << @handler.handle(statement)
        if count % @triples == 0
          write_chunk.call
          stmts = []
        end
      end
    end
  end

  # Flush the final, partially filled chunk.
  write_chunk.call unless stmts.empty?
end

#split_files(list_of_filenames, format = :ntriples) ⇒ Object

Split a list of files into smaller chunks

list_of_filenames

array of filenames

format

format of the files, default is :ntriples



103
104
105
106
107
# File 'lib/pho/upload.rb', line 103

# Split every file in a list into smaller chunks by handing each name,
# together with the shared input format, to #split_file.
#
# list_of_filenames:: array of filenames
# format::            format of the files, default is :ntriples
def split_files(list_of_filenames, format=:ntriples)
  list_of_filenames.each { |path| split_file(path, format) }
end