Class: ExternalArchive

Inherits:
External::Base show all
Defined in:
lib/external_archive.rb

Overview

ExternalArchive provides array-like access to archival data stored on disk. ExternalArchives consist of an IO object and an index of [start, length] pairs which indicate the start position and length of entries in the IO.

Direct Known Subclasses

ExternalArray

Constant Summary

Constants inherited from External::Base

External::Base::TEMPFILE_BASENAME

Instance Attribute Summary collapse

Attributes inherited from External::Base

#io

Attributes included from External::Chunkable

#default_blksize

Attributes included from External::Enumerable

#enumerate_to_a

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from External::Base

#closed?, #dup, #empty?, #eql?, #first, #flush, #inspect, #slice, #to_ary

Methods included from External::Chunkable

#chunk, #default_span, range_begin_and_end, #reverse_chunk, split_range, split_span

Methods included from External::Enumerable

#all?, #any?, #collect, #detect, #each_with_index, #entries, #find, #find_all, #include?, #map, #member?, #select

Constructor Details

#initialize(io = nil, io_index = nil) ⇒ ExternalArchive

Returns a new instance of ExternalArchive.



114
115
116
117
# File 'lib/external_archive.rb', line 114

# Creates a new archive.
#
# io is handed unchanged to the superclass initializer. io_index becomes the
# entry index; a new empty Array is used when io_index is nil or false.
def initialize(io=nil, io_index=nil)
  super(io)
  @io_index = io_index ? io_index : []
end

Instance Attribute Details

#io_indexObject (readonly)

The underlying index of [position, length] arrays indicating where entries in the io are located.



112
113
114
# File 'lib/external_archive.rb', line 112

# Reader for @io_index, the index object assigned in initialize
# (an empty Array when none was supplied).
def io_index
  @io_index
end

Class Method Details

.[](*args) ⇒ Object

Array-like constructor for an ExternalArchive.



34
35
36
37
38
# File 'lib/external_archive.rb', line 34

# Array-like constructor: builds an instance with +new+, appends every
# argument to it through +concat+, and returns that instance.
def [](*args)
  new.tap { |archive| archive.concat(args) }
end

.index_path(path) ⇒ Object

Returns the default io index filepath for path:

ExternalArchive.index_path("/path/to/file.txt")   # => "/path/to/file.index"


44
45
46
# File 'lib/external_archive.rb', line 44

# Returns the default index filepath for path: the path with its extension
# (as reported by File.extname) replaced by '.index'. Returns nil when path
# is nil or false.
def index_path(path)
  return nil unless path

  extension = File.extname(path)
  path.chomp(extension) + '.index'
end

.open(path, mode = "rb", options = {}) ⇒ Object

Initializes an instance of self with File.open(path, mode) as an io. As with File.open, the instance will be passed to the block and closed when the block returns. If no block is given, open returns the new instance.

By default the instance will be initialized with an ExternalIndex io_index, linked to index_path(path). The instance will be automatically reindexed if it is empty but its io is not.

Options (specify using symbols):

io_index

Specifies the io_index manually. A filepath may be provided and it will be used instead of index_path(path). Array and ExternalIndex values are used directly.

reindex

Forces a call to reindex; using auto reindexing, reindex is normally only called when the instance is empty and the instance io is not. (default false)

auto_reindex

Turns on or off auto reindexing (default true)



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/external_archive.rb', line 66

# Opens path with File.open(path, mode) and wraps the file in a new instance
# of self. With a block, the instance is yielded and closed when the block
# finishes, and the block's value is returned; without a block the instance
# itself is returned.
#
# Options (symbol keys):
# :io_index::     the index to use. nil (default) selects index_path(path),
#                 which is created as an empty file if it does not exist.
#                 Array and ExternalIndex values are used directly; any other
#                 value is passed to ExternalIndex.open as a path.
# :reindex::      when true, reindex is always called (default false)
# :auto_reindex:: when true (default), reindex is called if the new instance
#                 is empty while its io has a non-zero length
#
# An Errno::ENOENT raised while opening the io or the index is re-raised
# after closing whatever had been opened.
def open(path, mode="rb", options={})
  options = {
    :io_index => nil,
    :reindex => false,
    :auto_reindex => true
  }.merge(options)
  
  index = options[:io_index]
  if index == nil
    index = index_path(path)
    # File.exists? was removed in Ruby 3.2; File.exist? is available on
    # every Ruby version.
    FileUtils.touch(index) unless File.exist?(index)
  end
  
  begin
    io = path == nil ? nil : File.open(path, mode)
    io_index = case index
    when Array, ExternalIndex then index
    else ExternalIndex.open(index, 'r+', :format => 'II')
    end
  rescue(Errno::ENOENT)
    io.close if io
    io_index.close if io_index
    raise
  end
  
  extarc = new(io, io_index)
  
  # reindex if necessary
  if options[:reindex] || (options[:auto_reindex] && extarc.empty? && extarc.io.length > 0)
    extarc.reindex
  end
  
  if block_given?
    begin
      yield(extarc)
    ensure
      extarc.close
    end
  else
    extarc
  end
end

Instance Method Details

#+(another) ⇒ Object

def *(arg)

not_implemented

end



305
306
307
# File 'lib/external_archive.rb', line 305

# Appends the entries of another to self by delegating to concat, and
# returns what concat returns. Note that self is modified in place.
def +(another)
  concat(another)
end

#<<(obj) ⇒ Object

def -(another)

not_implemented

end



313
314
315
316
# File 'lib/external_archive.rb', line 313

# Appends obj as a new last entry by assigning it at index +length+.
# Returns self so calls can be chained.
def <<(obj)
  position = length
  self[position] = obj
  self
end

#<=>(another) ⇒ Object



318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
# File 'lib/external_archive.rb', line 318

# Compares self with an Array or another ExternalArchive, returning the
# result of Array#<=> on the loaded entries. When the two differ in length,
# only the leading entries of the longer one are loaded; if those are equal
# the longer collection is considered greater.
#
# For archives, equal indexes allow a shortcut: 0 is returned immediately
# when io.quick_compare(another.io) is true. Both ios are flushed before
# entries are read.
#
# Raises TypeError for any other kind of argument.
def <=>(another)
  case another
  when Array
    if another.length < self.length
      # if another is equal to the matching subset of self,
      # then self is obviously the longer array and wins.
      result = (self.to_a(another.length) <=> another)
      result == 0 ? 1 : result
    else
      self.to_a <=> another
    end
  when ExternalArchive
    # Matches every archive, as == does and as the TypeError message below
    # states (ExternalArray, the class previously tested here, is a
    # subclass and therefore still matches).

    # if indexes are equal, additional 
    # 'quick' comparisons are allowed 
    if self.io_index == another.io_index
      
      # equal in comparison if the ios are equal
      return 0 if self.io.quick_compare(another.io)
    end
    
    self.io.flush
    another.io.flush
    
    # should chunk compare
    if another.length > self.length
      result = (self.to_a <=> another.to_a(self.length))
      result == 0 ? -1 : result
    elsif another.length < self.length
      result = (self.to_a(another.length) <=> another.to_a)
      result == 0 ? 1 : result
    else
      self.to_a <=> another.to_a
    end
  else
    raise TypeError.new("can't convert from #{another.class} to ExternalArchive or Array")
  end
end

#==(another) ⇒ Object



356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
# File 'lib/external_archive.rb', line 356

# Returns true when another is an Array or ExternalArchive holding the same
# entries as self; any other kind of object is never equal.
#
# Lengths are compared first. For archives with equal indexes, a result of 0
# from io.sort_compare(another.io) is accepted as equality without loading
# entries; otherwise the entries of both sides are loaded and compared.
def ==(another)
  is_array = Array === another
  return false unless is_array || ExternalArchive === another

  # collections of different length can never be equal
  return false unless length == another.length

  unless is_array
    # equal indexes permit the 'quick' comparison of the ios
    #, (self.io_index.buffer_size/2).ceil) ??
    if io_index == another.io_index
      return true if io.sort_compare(another.io) == 0
    end
  end

  # compare the loaded entries
  to_a == (is_array ? another : another.to_a)
end

#[](input, length = nil) ⇒ Object

Element Reference — Returns the entry at index, or returns an array starting at start and continuing for length entries, or returns an array specified by range. Negative indices count backward from the end of self (-1 is the last element). Returns nil if the index (or starting index) is out of range.

a = ExternalArchive[ "a", "b", "c", "d", "e" ]
a[2] +  a[0] + a[1]    #=> "cab"
a[6]                   #=> nil
a[1, 2]                #=> [ "b", "c" ]
a[1..3]                #=> [ "b", "c", "d" ]
a[4..7]                #=> [ "e" ]
a[6..10]               #=> nil
a[-3, 3]               #=> [ "c", "d", "e" ]
# special cases
a[5]                   #=> nil
a[5, 1]                #=> []
a[5..10]               #=> []


403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
# File 'lib/external_archive.rb', line 403

# Element reference. Looks up input (and length, when given) in io_index and
# reads the located entries from io, converting each with str_to_entry.
#
# * A nil or empty lookup result is returned unchanged (nil or []).
# * A single index without length returns one entry.
# * A range, or a start with length, returns an array of entries; positions
#   whose index entry is nil produce nil.
def [](input, length=nil)
  # ExternalIndex accepts a nil length but Array does not, and the index
  # may be either, so the lookup is made in the matching form
  located = if length == nil
    io_index[input]
  else
    io_index[input, length]
  end

  if located == nil || located.empty?
    # mirror Array: hand back the nil or [] lookup result as-is
    located

  elsif length == nil && !input.kind_of?(Range)
    # exactly one entry was requested
    offset, size = located
    io.pos = offset
    str_to_entry( io.read(size) )

  else
    # several entries were requested; the expected io position is tracked
    # so the io is only repositioned when entries are not contiguous
    cursor = nil
    located.collect do |(offset, size)|
      next if offset == nil

      if cursor != offset
        cursor = offset
        io.pos = cursor
      end
      cursor += size

      str_to_entry( io.read(size) )
    end
  end
end

#[]=(*args) ⇒ Object

Element Assignment — Sets the entry at index, or replaces a subset starting at start and continuing for length entries, or replaces a subset specified by range. Negative indices count backward from the end of self. Inserts elements if length is zero. If nil is used in the second and third form, deletes elements from self. An IndexError is raised if a negative index points past the beginning of self. See also push, and unshift.

a = ExternalArchive.new
a[4] = "4"; a                  #=> [nil, nil, nil, nil, "4"]
a[0, 3] = [ 'a', 'b', 'c' ]; a #=> ["a", "b", "c", nil, "4"]
a[1..2] = [ '1', '2' ]; a      #=> ["a", '1', '2', nil, "4"]
a[0, 2] = "?"; a               #=> ["?", '2', nil, "4"]
a[0..2] = "A"; a               #=> ["A", "4"]
a[-1]   = "Z"; a               #=> ["A", "Z"]
a[1..-1] = nil; a              #=> ["A"]

Raises:

  • (ArgumentError)


456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
# File 'lib/external_archive.rb', line 456

# Element assignment. Accepts (index, value), (start, length, value) or
# (range, value).
#
# New entry data is always appended to the end of io; only io_index is
# rewritten to point at it, so replaced data is not removed from io.
#
# The first argument is passed through convert_to_int (defined elsewhere;
# presumably coerces index-like values to an Integer) and then dispatched on
# its class:
#
# * Integer: a negative index counts back from the end. Without a length the
#   value is written and registered at that index; with a length each of the
#   values is written and the index slice [start, length] is replaced by
#   their locations.
# * Range: converted with split_range and re-dispatched as (start, length).
#
# Raises ArgumentError with fewer than two arguments, IndexError when a
# negative index points before the start of self, RangeError when a range
# starts before the start of self, and TypeError for a nil or otherwise
# unsupported first argument or when a length accompanies a Range.
def []=(*args)
  raise ArgumentError, "wrong number of arguments (1 for 2)" if args.length < 2
  
  one, two, value = args
  if args.length == 2
    value = two 
    two = nil
  end

  one = convert_to_int(one)
  case one
  when Integer
    # Integer rather than Fixnum: the Fixnum constant was removed in
    # Ruby 3.2, and Integer also matches on older versions.
    if one < 0
      one += length
      raise IndexError, "index #{one} out of range" if one  < 0
    end
    
    entry_start = io.length
    io.pos = entry_start
    
    if two == nil
      # simple insertion 
      # (note it is important to write the entry to io 
      # first as a check that io is open for writing)

      entry_length = io.write( entry_to_str(value) )
      io.length += entry_length
      io_index[one] = [entry_start, entry_length]
      
    else
      values = case value
      when Array then value
      when ExternalArchive
        # special case, self will be reading and
        # writing from the same io, producing 
        # incorrect results
        
        # potential to load a huge amount of data
        value == self ? value.to_a : value
      else convert_to_ary(value)
      end
      
      # write each value to self, collecting the indicies
      indicies = []
      values.each do |item|
        entry_length = io.write( entry_to_str(item) )
        indicies << [entry_start, entry_length]
        
        io.length += entry_length
        entry_start += entry_length
      end
      
      # register the indicies
      io_index[one, two] = indicies
    end

  when Range
    raise TypeError, "can't convert Range into Integer" unless two == nil
    start, length, total = split_range(one)
    
    raise RangeError, "#{one} out of range" if start < 0
    self[start, length < 0 ? 0 : length + 1] = value

  when nil
    raise TypeError, "no implicit conversion from nil to integer"
  else
    raise TypeError, "can't convert #{one.class} into Integer"
  end
end

#anotherObject

Returns another instance of self.class; the new instance will be cached if self is cached.



158
159
160
# File 'lib/external_archive.rb', line 158

# Builds a new instance of self.class with a nil io. The new instance gets an
# empty Array index when self is cached, and io_index.another otherwise.
def another
  fresh_index = cached? ? [] : io_index.another
  self.class.new(nil, fresh_index)
end

#at(index) ⇒ Object

Returns entry at index



535
536
537
# File 'lib/external_archive.rb', line 535

# Returns the entry at index; a thin wrapper around self[index].
def at(index)
  self[index]
end

#cache=(input) ⇒ Object

Turns on or off caching by converting io_index to an Array (cache=true) or to an ExternalIndex (cache=false).



127
128
129
130
131
132
133
134
135
136
137
138
139
# File 'lib/external_archive.rb', line 127

# Switches caching on or off.
#
# A truthy input on an uncached archive copies io_index into an Array, closes
# the old index, and uses the Array from then on. A falsy input on a cached
# archive appends a {:format => 'II'} hash to the Array and builds an
# ExternalIndex from its contents. Nothing happens when the archive is
# already in the requested state.
def cache=(input)
  currently_cached = cached?

  if input && !currently_cached
    snapshot = io_index.to_a
    io_index.close
    @io_index = snapshot

  elsif !input && currently_cached
    io_index << {:format => 'II'}
    @io_index = ExternalIndex[*io_index]

  end
end

#cached?Boolean

Returns true if io_index is an Array.

Returns:

  • (Boolean)


120
121
122
# File 'lib/external_archive.rb', line 120

# True when io_index is an Array (or an instance of an Array subclass).
def cached?
  io_index.is_a?(Array)
end

#clearObject

Removes all elements from self.



540
541
542
543
544
# File 'lib/external_archive.rb', line 540

# Removes all entries: truncates io to zero length, clears io_index, and
# returns self.
def clear
  backing_io = io
  backing_io.truncate(0)

  index = io_index
  index.clear

  self
end

#close(path = nil, index_path = self.class.index_path(path), overwrite = false) ⇒ Object

Closes self as in External::Base#close. An index_path may be specified to close io_index as well; when io_index is not an ExternalIndex, one is temporarily created with the current io_index content to ‘close’ and save the index.



145
146
147
148
149
150
151
152
153
154
# File 'lib/external_archive.rb', line 145

# Closes the index and then self.
#
# An ExternalIndex io_index is closed with (index_path, overwrite). For any
# other io_index, a temporary ExternalIndex is built from its contents and
# closed the same way, but only when index_path is not nil. Finally the
# superclass close is called with (path, overwrite) and its result returned.
#
# index_path defaults to self.class.index_path(path).
def close(path=nil, index_path=self.class.index_path(path), overwrite=false)
  if io_index.kind_of?(ExternalIndex)
    io_index.close(index_path, overwrite)
  elsif index_path != nil
    temporary = ExternalIndex[*io_index]
    temporary.close(index_path, overwrite)
  end

  super(path, overwrite)
end

#compactObject



546
547
548
549
550
551
552
553
# File 'lib/external_archive.rb', line 546

# Returns a new archive (built with +another+) holding every entry of self
# that is not nil. self is left unchanged.
def compact
  # TODO - optimize?
  result = another
  each do |item|
    next if item == nil
    result << item
  end
  result
end

#concat(another) ⇒ Object

def compact!

not_implemented

end



559
560
561
562
563
564
565
566
567
# File 'lib/external_archive.rb', line 559

# Appends every entry of another (an Array or ExternalArchive) to the end of
# self and returns self. Raises TypeError for any other argument.
def concat(another)
  unless Array === another || ExternalArchive === another
    raise TypeError, "can't convert #{another.class} into ExternalArchive or Array"
  end

  self[length, another.length] = another
  self
end

#each(&block) ⇒ Object

Calls block once for each element in self, passing that element as a parameter.



612
613
614
615
616
617
# File 'lib/external_archive.rb', line 612

# Yields every entry of self in order: each string produced by each_str is
# converted with str_to_entry before being passed to the block. Returns
# what each_str returns.
def each(&block) # :yield: item
  each_str { |raw| yield str_to_entry(raw) }
end

#each_str(&block) ⇒ Object

Calls block once for each element string in self, passing that string as a parameter.



586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
# File 'lib/external_archive.rb', line 586

# Yields the raw string of every entry, in index order, and returns self.
# An index slot whose start is nil yields an empty string without touching
# io.
def each_str(&block) # :yield: string
  # the expected io position is kept in a local because that is faster
  # than asking io.pos for it
  cursor = nil
  io_index.each do |(offset, size)|
    if offset == nil
      yield("")
    else
      # reposition only when this entry does not directly follow the
      # previously read one
      if cursor != offset
        cursor = offset
        io.pos = cursor
      end

      # where io will be once the entry has been read
      cursor += size

      yield io.read(size)
    end
  end
  self
end

#eachio_index(&block) ⇒ Object

Same as each, but passes the index of the element instead of the element itself.



620
621
622
623
# File 'lib/external_archive.rb', line 620

# Like each, but yields the position of every entry (0 up to length - 1)
# rather than the entry itself. Returns self.
def eachio_index(&block) # :yield: index
  length.times(&block)
  self
end

#entry_to_str(entry) ⇒ Object

Converts an entry into a string. By default this method returns entry.to_s.



172
173
174
# File 'lib/external_archive.rb', line 172

# Serializes entry for writing to io. This implementation returns
# entry.to_s.
def entry_to_str(entry)
  entry.to_s
end

#last(n = nil) ⇒ Object

Returns the last n entries (default 1)



684
685
686
687
688
689
690
# File 'lib/external_archive.rb', line 684

# Without an argument, returns the final entry (self[-1]). With n, returns
# up to n entries from the end of self; the start position is clamped at 0
# when n exceeds length.
def last(n=nil)
  return self[-1] if n.nil?

  start = [length - n, 0].max
  self[start, n]
end

#lengthObject

Returns the number of entries in self



693
694
695
# File 'lib/external_archive.rb', line 693

# Number of entries in self, as reported by io_index.length.
def length 
  io_index.length
end

#push(*obj) ⇒ Object

def pretty_print_cycle(q)

not_implemented

end



725
726
727
728
# File 'lib/external_archive.rb', line 725

# Appends each argument to self, in order, using <<. Returns self.
def push(*obj)
  obj.each { |item| self << item }
  self
end

#reindex_by_regexp(pattern = /\r?\n/, options = {}) ⇒ Object

The speed of reindex_by_regexp is dictated by how fast the underlying code can match the pattern. Under ideal conditions (ie a very simple regexp), it will be as fast as reindex_by_sep.



192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# File 'lib/external_archive.rb', line 192

# Rebuilds the index so that every entry ends with a match of pattern.
#
# Runs inside reset_index. The io is scanned with
# io.scan(span, blksize, carryover_limit); for each scanned string a
# StringScanner repeatedly searches for pattern and registers
# [position, bytes advanced] for every hit. The scan block returns the
# unconsumed size of the string (NOTE(review): presumably io.scan carries
# that remainder into the next chunk — confirm against io.scan).
#
# Options:
# :range_or_span::   span to scan (default io.default_span)
# :blksize::         passed to io.scan (default 8388608)
# :carryover_limit:: passed to io.scan (default 8388608)
#
# A block, if given, is called once per scanned string (for monitoring).
def reindex_by_regexp(pattern=/\r?\n/, options={})
  defaults = {
    :range_or_span => nil,
    :blksize => 8388608,
    :carryover_limit => 8388608
  }
  options = defaults.merge(options)

  reset_index do |io, index|
    span = options[:range_or_span] || io.default_span

    io.scan(span, options[:blksize], options[:carryover_limit]) do |scan_pos, string|
      scanner = StringScanner.new(string)

      loop do
        advanced = scanner.search_full(pattern, true, false)
        # stop on no match, and on a match that does not move the scanner
        break unless advanced && advanced > 0

        index << [scan_pos, advanced]
        scan_pos += advanced
      end

      # allow a block for monitoring
      yield if block_given?
      scanner.rest_size
    end
  end
end

#reindex_by_sep(sep_str = $/, options = {}) ⇒ Object



220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
# File 'lib/external_archive.rb', line 220

# Rebuilds the index by splitting the io on a literal separator string.
#
# Options:
# :sep_regexp::        regexp used to find the separator (default: a regexp
#                      matching sep_str literally)
# :sep_length::        length of the separator (default sep_str.length)
# :entry_follows_sep:: when true each entry begins with a separator instead
#                      of ending with one (default false)
# :exclude_sep::       when true the separator is left out of the indexed
#                      entries (default false)
# :range_or_span::     span to scan (default io.default_span)
# :blksize::           passed to io.scan (default 8388608)
# :carryover_limit::   passed to io.scan (default 8388608)
#
# A block, if given, is called once per scanned string (for monitoring).
#
# NOTE(review): the value returned by io.scan is used as the size of the
# trailing, unscanned remainder — confirm against io.scan.
def reindex_by_sep(sep_str=$/, options={}) 
  sep_str = sep_str.to_s
  options = {
    # The separator is a literal string, so it is escaped before being
    # compiled. Unescaped, a separator such as "." matched every character
    # and "|" matched the empty string, which made the scan loop below
    # spin forever. Callers needing a real pattern can pass :sep_regexp.
    :sep_regexp => Regexp.new(Regexp.escape(sep_str)),
    :sep_length => sep_str.length,
    :entry_follows_sep => false,
    :exclude_sep => false,
    :range_or_span => nil,
    :blksize => 8388608,
    :carryover_limit => 8388608
  }.merge(options)
  
  regexp = options[:sep_regexp]
  sep_length = options[:sep_length]
  entry_follows_sep = options[:entry_follows_sep]
  exclude_sep = options[:exclude_sep]
  
  # selects how a [position, length] pair is derived from a match below
  mode = case
  when !entry_follows_sep && !exclude_sep then 0
  when entry_follows_sep && exclude_sep then 1
  when entry_follows_sep && !exclude_sep then 2
  when !entry_follows_sep && exclude_sep then 3
  end
  
  reset_index do |io, index|
    # calculate default span after reset_index in case any flush needs to happen
    span = options[:range_or_span] || io.default_span
    blksize = options[:blksize]
    carryover_limit = options[:carryover_limit]
    
    remainder = io.scan(span, blksize, carryover_limit) do |scan_pos, string|
      scanner = StringScanner.new(string)
      
      # When the entry follows the separator, the scanner must
      # be set right after the separator for the first entry, so
      # that the search will find the beginning of the next entry.
      if scan_pos == 0 && entry_follows_sep
        scanner.pos = sep_length
        scan_pos = sep_length
      end

      # Scan for entries by looking for the beginning of the
      # next entry, signaling the end of the current entry.
      while advanced = scanner.skip_until(regexp)
      
        # adjust indicies as needed...
        io_index << case mode
        when 0 then [scan_pos, advanced]
        when 2 then [scan_pos-sep_length, advanced]
        else [scan_pos, advanced-sep_length]
        end
        
        scan_pos += advanced
      end
      
      # allow a block for monitoring
      yield if block_given?
      scanner.rest_size
    end
    
    # Unless the io is empty, there will be a remaining entry that 
    # doesn't get scanned when the entry follows the separator.  
    # Add the entry here.
    if entry_follows_sep && io.length != 0
      io_index << if exclude_sep
        [io.length - remainder, remainder]
      else
        [io.length - remainder - sep_length, remainder + sep_length]
      end
    end   
  end
end

#reset_index {|io, io_index| ... } ⇒ Object Also known as: reindex

Clears the io_index, and yields io and the io_index to the block for reindexing. The io is flushed and rewound before being yielded to the block. Returns self

Yields:



179
180
181
182
183
184
185
# File 'lib/external_archive.rb', line 179

# Empties io_index, flushes and rewinds io, then yields (io, io_index) to
# the block, if one is given, so the caller can rebuild the index.
# Returns self.
def reset_index
  io_index.clear

  stream = io
  stream.flush
  stream.rewind

  yield(stream, io_index) if block_given?
  self
end

#reverse_eachObject

:yield: item



765
766
767
768
769
# File 'lib/external_archive.rb', line 765

# Yields the entries of self from last to first: each string produced by
# reverse_each_str is converted with str_to_entry before being passed to
# the block. Returns what reverse_each_str returns.
def reverse_each # :yield: item
  reverse_each_str { |raw| yield(str_to_entry(raw)) }
end

#reverse_each_str(&block) ⇒ Object

def reverse!

not_implemented

end



750
751
752
753
754
755
756
757
758
759
760
761
762
763
# File 'lib/external_archive.rb', line 750

# Yields the raw string of every entry from last to first and returns self.
# Index slots whose start is nil are skipped (nothing is yielded for them).
def reverse_each_str(&block) # :yield: string
  io_index.reverse_each do |(offset, size)|
    unless offset == nil
      # A more optimized approach would read in a chunk
      # of entries and iterate over them?
      io.pos = offset

      yield io.read(size)
    end
  end
  self
end

#sizeObject

Alias for length



784
785
786
# File 'lib/external_archive.rb', line 784

# Alias for length: returns whatever length returns.
def size
  length
end

#str_to_entry(str) ⇒ Object

Converts a string read from io into an entry. By default the string is simply returned.



166
167
168
# File 'lib/external_archive.rb', line 166

# Deserializes a string read from io into an entry. This implementation
# returns str unchanged.
def str_to_entry(str)
  str
end

#to_a(length = self.length) ⇒ Object

def slice!(*args)

not_implemented

end



796
797
798
# File 'lib/external_archive.rb', line 796

# Loads the first length entries of self (all of them by default) as an
# Array. A length of 0 returns a new empty Array without reading anything.
def to_a(length=self.length)
  return [] if length == 0

  self[0, length]
end

#values_at(*selectors) ⇒ Object

Returns an array containing the chars in io corresponding to the given selector(s). The selectors may be either integer indices or ranges



831
832
833
834
835
836
837
# File 'lib/external_archive.rb', line 831

# Returns a new archive (built with +another+) containing self[selector] for
# each selector, in the order given. Whatever self[selector] returns is
# appended as a single element.
def values_at(*selectors)
  selected = another
  selectors.each { |selector| selected << self[selector] }
  selected
end