Class: PEROBS::FlatFile

Inherits:
Object
  • Object
show all
Defined in:
lib/perobs/FlatFile.rb

Overview

The FlatFile class manages the storage file of the FlatFileDB. It contains a sequence of blobs. Each blob consists of a header and the actual blob data bytes.

Constant Summary collapse

INDEX_BTREE_ORDER =

The number of entries in a single BTree node of the index file.

65

Instance Method Summary collapse

Constructor Details

#initialize(dir) ⇒ FlatFile

Create a new FlatFile object for a database in the given path.

Parameters:

  • dir (String)

    Directory path for the database file



47
48
49
50
51
52
# File 'lib/perobs/FlatFile.rb', line 47

# Create a new FlatFile object for a database in the given directory.
# Only the bookkeeping objects are constructed here; the blob file handle
# (@f) starts out as nil.
#
# @param dir [String] Directory path for the database files
def initialize(dir)
  @f = nil
  @db_dir = dir
  # Index that maps object IDs to blob addresses.
  @index = BTree.new(dir, 'index', INDEX_BTREE_ORDER)
  # Bookkeeping for the free space inside the blob file.
  @space_list = SpaceTree.new(dir)
end

Instance Method Details

#check(repair = false) ⇒ Integer

Check (and repair) the FlatFile.

Parameters:

  • repair (Boolean) (defaults to: false)

    True if errors should be fixed.

Returns:

  • (Integer)

    Number of errors found



446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
# File 'lib/perobs/FlatFile.rb', line 446

# Check (and optionally repair) the FlatFile.
#
# The check runs in two phases. First every valid blob is read back,
# uncompressed if needed, verified against its checksum and tested for ID
# uniqueness with the help of a temporary shadow index. Afterwards the
# index and the space list are verified against the blob file.
#
# Every problem that is found is counted, including duplicate IDs and an
# inconsistent index or space list.
#
# @param repair [Boolean] True if errors should be fixed.
# @return [Integer] Number of errors found
def check(repair = false)
  errors = 0
  # Nothing to check when the blob file has not been opened.
  return errors unless @f

  t = Time.now
  PEROBS.log.info "Checking FlatFile database" +
    "#{repair ? ' in repair mode' : ''}..."

  # First check the database blob file. Each entry should be readable and
  # correct and all IDs must be unique. We use a shadow index to keep
  # track of the already found IDs.
  new_index = BTree.new(@db_dir, 'new-index', INDEX_BTREE_ORDER)
  new_index.erase
  new_index.open

  each_blob_header do |pos, header|
    # Deleted entries have no payload that needs to be verified.
    next unless header.is_valid?

    begin
      @f.seek(pos + FlatFileBlobHeader::LENGTH)
      buf = @f.read(header.length)
      # IO#read returns nil when it is called at the end of the file with
      # a positive length, so a missing payload must be treated like a
      # truncated one.
      if buf.nil? || buf.bytesize != header.length
        PEROBS.log.error "Premature end of file in blob with ID " +
          "#{header.id}."
        discard_damaged_blob(header) if repair
        errors += 1
        next
      end

      # Uncompress the data if the compression bit is set in the mark
      # byte.
      if header.is_compressed?
        begin
          buf = Zlib.inflate(buf)
        rescue Zlib::BufError, Zlib::DataError
          PEROBS.log.error "Corrupted compressed block with ID " +
            "#{header.id} found."
          discard_damaged_blob(header) if repair
          errors += 1
          next
        end
      end

      if header.crc && checksum(buf) != header.crc
        PEROBS.log.error "Checksum failure while checking blob " +
          "with ID #{header.id}"
        discard_damaged_blob(header) if repair
        errors += 1
        next
      end
    rescue IOError, SystemCallError => e
      # Operating system level failures are reported as Errno::* errors
      # (SystemCallError), not as IOError, so both are handled here.
      PEROBS.log.fatal "Check of blob with ID #{header.id} failed: " +
        e.message
    end

    # Check if the ID has already been found in the file.
    if (previous_address = new_index.get(header.id))
      PEROBS.log.error "Multiple blobs for ID #{header.id} found. " +
        "Addresses: #{previous_address}, #{pos}"
      # A duplicate ID is an error even when we are not repairing.
      errors += 1
      previous_header = FlatFileBlobHeader.read_at(@f, previous_address,
                                                   header.id)
      if repair
        # We have two blobs with the same ID and we must discard one of
        # them.
        # NOTE(review): the shadow index keeps pointing at the previous
        # address even when that blob is the one being discarded; confirm
        # whether a third duplicate needs the entry to be updated.
        if header.is_outdated?
          discard_damaged_blob(header)
        elsif previous_header.is_outdated?
          discard_damaged_blob(previous_header)
        else
          PEROBS.log.error "None of the blobs with same ID have " +
            "the outdated flag set. Deleting the smaller one."
          discard_damaged_blob(header.length < previous_header.length ?
                               header : previous_header)
        end
        next
      end
    else
      # ID is unique so far. Add it to the shadow index.
      new_index.insert(header.id, pos)
    end
  end
  # We no longer need the new index.
  new_index.close
  new_index.erase

  # Now we check the index data. It must be correct and the entries must
  # match the blob file. All entries in the index must be in the blob file
  # and vice versa.
  begin
    index_ok = @index.check do |id, address|
      has_id_at?(id, address)
    end
    unless index_ok && @space_list.check(self) && cross_check_entries
      # An inconsistent index or space list counts as an error, too.
      errors += 1
      regenerate_index_and_spaces if repair
    end
  rescue PEROBS::FatalError
    errors += 1
    regenerate_index_and_spaces if repair
  end

  sync if repair
  PEROBS.log.info "check_db completed in #{Time.now - t} seconds. " +
    "#{errors} errors found."

  errors
end

#clear_all_marks ⇒ Object

Clear all marks.



339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
# File 'lib/perobs/FlatFile.rb', line 339

# Walk over all blob headers and clear the mark flag of every blob that is
# both valid and marked. The number of cleared marks and the total number
# of visited blobs are logged.
def clear_all_marks
  started_at = Time.now
  PEROBS.log.info "Clearing all marks..."

  seen = 0
  cleared = 0
  each_blob_header do |_pos, header|
    seen += 1
    # Only valid blobs that carry a mark need to be touched.
    next unless header.is_valid? && header.is_marked?

    cleared += 1
    header.clear_mark_flag
  end

  PEROBS.log.info "#{cleared} marks in #{seen} objects cleared in " \
    "#{Time.now - started_at} seconds"
end

#close ⇒ Object

Close the flat file. This method must be called to ensure that all data is really written into the filesystem.



99
100
101
102
103
104
105
106
107
108
109
# File 'lib/perobs/FlatFile.rb', line 99

# Close the space list, the index and, if it is open, the blob file. The
# blob file is flushed and unlocked before it is closed and the handle is
# reset to nil afterwards.
def close
  @space_list.close
  @index.close
  # Nothing more to do when the blob file was never opened.
  return unless @f

  @f.flush
  @f.flock(File::LOCK_UN)
  @f.close
  @f = nil
end

#defragmentize ⇒ Object

Eliminate all the holes in the file. This is an in-place implementation. No additional space will be needed on the file system.



360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
# File 'lib/perobs/FlatFile.rb', line 360

# Eliminate all the holes in the file. This is an in-place implementation.
# Valid blobs are moved towards the start of the file over the space of
# deleted blobs, the index is updated for every moved blob and the file is
# truncated right behind the last valid blob. Since no holes remain, the
# space list is cleared at the end.
def defragmentize
  # Number of bytes of deleted entries seen so far. Every following valid
  # entry is moved towards the file start by this many bytes.
  distance = 0
  new_file_size = 0
  deleted_blobs = 0
  valid_blobs = 0
  t = Time.now
  PEROBS.log.info "Defragmenting FlatFile"
  # Iterate over all entries.
  each_blob_header do |pos, header|
    # Total size of the current entry
    entry_bytes = FlatFileBlobHeader::LENGTH + header.length
    unless header.is_valid?
      deleted_blobs += 1
      distance += entry_bytes
      next
    end

    # We have found a valid entry.
    valid_blobs += 1
    new_pos = pos - distance
    if distance > 0
      begin
        # Read current entry into a buffer
        @f.seek(pos)
        buf = @f.read(entry_bytes)
        # Write the buffer right after the end of the previous entry.
        @f.seek(new_pos)
        @f.write(buf)
        # Update the index with the new position
        @index.insert(header.id, new_pos)
        # Mark the space between the relocated current entry and the
        # next valid entry as deleted space.
        FlatFileBlobHeader.new(@f, @f.pos, 0,
                               distance - FlatFileBlobHeader::LENGTH,
                               0, 0).write
        @f.flush
      rescue IOError, SystemCallError => e
        # Operating system level failures are Errno::* errors
        # (SystemCallError), so they are handled like IOError.
        PEROBS.log.fatal "Error while moving blob for ID #{header.id}: " +
          e.message
      end
    end
    # The file must end right behind the relocated entry. Using the old
    # position here would keep all the reclaimed bytes at the file end.
    new_file_size = new_pos + entry_bytes
  end

  old_file_size = @f.size
  # Avoid a NaN percentage for an empty file.
  reclaimed = old_file_size > 0 ? distance.to_f / old_file_size * 100.0 : 0.0
  PEROBS.log.info "FlatFile defragmented in #{Time.now - t} seconds"
  PEROBS.log.info "#{distance / 1024} KiB/#{deleted_blobs} blobs of " +
    "#{old_file_size / 1024} KiB/#{valid_blobs} blobs or " +
    "#{'%.1f' % reclaimed}% reclaimed"

  @f.flush
  @f.truncate(new_file_size)
  @f.flush
  @space_list.clear

  sync
end

#delete_obj_by_address(addr, id) ⇒ Object

Delete the blob that is stored at the specified address.

Parameters:

  • addr (Integer)

    Address of the blob to delete

  • id (Integer)

    ID of the blob to delete



137
138
139
140
141
142
# File 'lib/perobs/FlatFile.rb', line 137

# Delete the blob that is stored at the specified address. The ID is
# removed from the index, the flags of the blob header are cleared and the
# space of the blob is registered with the space list.
#
# @param addr [Integer] Address of the blob to delete
# @param id [Integer] ID of the blob to delete
def delete_obj_by_address(addr, id)
  @index.remove(id)
  header = FlatFileBlobHeader.read_at(@f, addr, id)
  header.clear_flags
  # Like all other users of the space list, only register spaces that can
  # actually hold data. Zero-length spaces are never added elsewhere.
  @space_list.add_space(addr, header.length) if header.length > 0
end

#delete_obj_by_id(id) ⇒ Boolean

Delete the blob for the specified ID.

Parameters:

  • id (Integer)

    ID of the object to be deleted

Returns:

  • (Boolean)

    True if object was deleted, false otherwise



125
126
127
128
129
130
131
132
# File 'lib/perobs/FlatFile.rb', line 125

# Delete the blob for the specified ID. The address is looked up via
# find_obj_addr_by_id.
#
# @param id [Integer] ID of the object to be deleted
# @return [Boolean] True if object was deleted, false otherwise
def delete_obj_by_id(id)
  addr = find_obj_addr_by_id(id)
  # Unknown IDs have nothing that could be deleted.
  return false unless addr

  delete_obj_by_address(addr, id)
  true
end

#delete_unmarked_objects ⇒ Object

Delete all unmarked objects.



145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# File 'lib/perobs/FlatFile.rb', line 145

# Delete all valid blobs that do not carry the mark flag and defragment
# the file afterwards.
#
# @return [Array] IDs of the deleted objects
def delete_unmarked_objects
  PEROBS.log.info "Deleting unmarked objects..."
  started_at = Time.now

  removed_ids = []
  each_blob_header do |pos, header|
    # Skip entries that are already deleted or that carry a mark.
    next if !header.is_valid? || header.is_marked?

    delete_obj_by_address(pos, header.id)
    removed_ids << header.id
  end
  defragmentize

  PEROBS.log.info "#{removed_ids.length} unmarked objects deleted " \
    "in #{Time.now - started_at} seconds"
  removed_ids
end

#find_obj_addr_by_id(id) ⇒ Integer

Find the address of the object with the given ID.

Parameters:

  • id (Integer)

    ID of the object

Returns:

  • (Integer)

    Offset in the flat file or nil if not found



260
261
262
# File 'lib/perobs/FlatFile.rb', line 260

# Find the address of the object with the given ID by looking it up in the
# index.
#
# @param id [Integer] ID of the object
# @return [Integer, nil] Offset in the flat file or nil if not found
def find_obj_addr_by_id(id)
  @index.get(id)
end

#has_id_at?(id, address) ⇒ Boolean

Returns:

  • (Boolean)


586
587
588
589
# File 'lib/perobs/FlatFile.rb', line 586

# Check whether the blob header stored at the given address is valid and
# belongs to the given ID.
#
# @param id [Integer] Expected ID of the blob
# @param address [Integer] Offset of the blob header in the flat file
# @return [Boolean] True if a valid blob with that ID starts at the address
def has_id_at?(id, address)
  blob = FlatFileBlobHeader.read_at(@f, address)
  in_use = blob.is_valid?
  in_use && blob.id == id
end

#has_space?(address, size) ⇒ Boolean

Returns:

  • (Boolean)


581
582
583
584
# File 'lib/perobs/FlatFile.rb', line 581

# Check whether the blob header stored at the given address describes an
# unused (not valid) blob of exactly the given size.
#
# @param address [Integer] Offset of the blob header in the flat file
# @param size [Integer] Expected length of the free space
# @return [Boolean] True if an unused blob of that size starts at the address
def has_space?(address, size)
  blob = FlatFileBlobHeader.read_at(@f, address)
  unused = !blob.is_valid?
  unused && blob.length == size
end

#inspect ⇒ Object



591
592
593
594
595
596
597
598
599
600
601
602
603
# File 'lib/perobs/FlatFile.rb', line 591

# Generate a human readable dump of all blob headers in the file. For
# valid blobs the raw blob data is included as well.
#
# @return [String] One entry per blob, enclosed in square brackets
def inspect
  # Use an explicitly mutable String since entries are appended in place.
  s = String.new('[')
  each_blob_header do |pos, header|
    s << "{ :pos => #{pos}, :flags => #{header.flags}, " +
         ":length => #{header.length}, :id => #{header.id}, " +
         ":crc => #{header.crc}"
    if header.is_valid?
      # Position the file explicitly at the blob data instead of relying
      # on wherever the header iteration has left the file position.
      @f.seek(pos + FlatFileBlobHeader::LENGTH)
      s << ", :value => #{@f.read(header.length)}"
    end
    s << " }\n"
  end
  s + ']'
end

#is_marked_by_id?(id) ⇒ Boolean

Return true if the object with the given ID is marked, false otherwise.

Parameters:

  • id (Integer)

    ID of the object

Returns:

  • (Boolean)


329
330
331
332
333
334
335
336
# File 'lib/perobs/FlatFile.rb', line 329

# Return true if the object with the given ID is marked, false otherwise.
# IDs that are not in the index are reported as not marked.
#
# @param id [Integer] ID of the object
# @return [Boolean]
def is_marked_by_id?(id)
  addr = find_obj_addr_by_id(id)
  return false unless addr

  FlatFileBlobHeader.read_at(@f, addr, id).is_marked?
end

#mark_obj_by_address(addr, id) ⇒ Object

Mark the object at the specified address.

Parameters:

  • addr (Integer)

    Offset in the file

  • id (Integer)

    ID of the object



323
324
325
# File 'lib/perobs/FlatFile.rb', line 323

# Mark the object at the specified address by setting the mark flag in its
# blob header.
#
# @param addr [Integer] Offset in the file
# @param id [Integer] ID of the object
def mark_obj_by_address(addr, id)
  header = FlatFileBlobHeader.read_at(@f, addr, id)
  header.set_mark_flag
end

#mark_obj_by_id(id) ⇒ Object

Mark the object with the given ID.

Parameters:

  • id (Integer)

    ID of the object



314
315
316
317
318
# File 'lib/perobs/FlatFile.rb', line 314

# Mark the object with the given ID. IDs that are not in the index are
# silently ignored.
#
# @param id [Integer] ID of the object
def mark_obj_by_id(id)
  addr = find_obj_addr_by_id(id)
  mark_obj_by_address(addr, id) if addr
end

#open ⇒ Object

Open the flat file for reading and writing.



55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/perobs/FlatFile.rb', line 55

# Open the flat file for reading and writing. A new blob file is created
# if none exists yet. The file is locked exclusively; the index and the
# space list are opened as well. If opening them raises a FatalError they
# are erased and regenerated from the blob file.
def open
  file_name = File.join(@db_dir, 'database.blobs')
  new_db_created = false
  begin
    if File.exist?(file_name)
      @f = File.open(file_name, 'rb+')
    else
      PEROBS.log.info "New FlatFile database '#{file_name}' created"
      @f = File.open(file_name, 'wb+')
      new_db_created = true
    end
  rescue IOError, SystemCallError => e
    # File.open reports missing directories, permission problems and the
    # like as Errno::* errors (SystemCallError), not as IOError.
    PEROBS.log.fatal "Cannot open FlatFile database #{file_name}: " +
      e.message
  end
  unless @f.flock(File::LOCK_NB | File::LOCK_EX)
    # Don't keep a handle to a file that we could not lock.
    # NOTE(review): this assumes that PEROBS.log.fatal raises and does not
    # return -- confirm.
    @f.close
    @f = nil
    PEROBS.log.fatal "FlatFile database '#{file_name}' is locked by " +
      "another process"
  end

  begin
    @index.open(!new_db_created)
    @space_list.open
  rescue FatalError
    # Ensure that the index is really closed.
    @index.close
    # Erase it completely
    @index.erase
    # Then create it again.
    @index.open

    # Ensure that the spaces list is really closed.
    @space_list.close
    # Erase it completely
    @space_list.erase
    # Then create it again
    @space_list.open

    regenerate_index_and_spaces
  end
end

#read_obj_by_address(addr, id) ⇒ String

Read the object at the specified address.

Parameters:

  • addr (Integer)

    Offset in the flat file

  • id (Integer)

    ID of the data blob

Returns:

  • (String)

    Raw object data



279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# File 'lib/perobs/FlatFile.rb', line 279

# Read the object at the specified address. The blob header must belong to
# the given ID. The data is uncompressed if the header says so and it is
# verified against the checksum stored in the header. All problems are
# reported via PEROBS.log.fatal.
#
# @param addr [Integer] Offset in the flat file
# @param id [Integer] ID of the data blob
# @return [String] Raw object data
def read_obj_by_address(addr, id)
  header = FlatFileBlobHeader.read_at(@f, addr, id)
  if header.id != id
    PEROBS.log.fatal "Database index corrupted: Index for object " +
      "#{id} points to object with ID #{header.id}"
  end

  buf = nil

  begin
    @f.seek(addr + FlatFileBlobHeader::LENGTH)
    buf = @f.read(header.length)
  rescue IOError, SystemCallError => e
    # Operating system level failures are Errno::* errors
    # (SystemCallError), so they are handled like IOError.
    PEROBS.log.fatal "Cannot read blob for ID #{id}: #{e.message}"
  end

  # IO#read returns nil at the end of the file and a shorter String when
  # the file ends inside the blob. Report this as what it is instead of
  # letting it surface as a checksum or decompression problem.
  if buf.nil? || buf.bytesize != header.length
    PEROBS.log.fatal "Premature end of file in blob with ID #{id}."
  end

  # Uncompress the data if the compression bit is set in the flags byte.
  if header.is_compressed?
    begin
      buf = Zlib.inflate(buf)
    rescue Zlib::BufError, Zlib::DataError
      PEROBS.log.fatal "Corrupted compressed block with ID " +
        "#{header.id} found."
    end
  end

  if checksum(buf) != header.crc
    PEROBS.log.fatal "Checksum failure while reading blob ID #{id}"
  end

  buf
end

#read_obj_by_id(id) ⇒ String or nil

Read the object with the given ID.

Parameters:

  • id (Integer)

    ID of the object

Returns:

  • (String or nil)

    Raw object data if found, otherwise nil



267
268
269
270
271
272
273
# File 'lib/perobs/FlatFile.rb', line 267

# Read the object with the given ID. The address is looked up via
# find_obj_addr_by_id.
#
# @param id [Integer] ID of the object
# @return [String, nil] Raw object data if found, otherwise nil
def read_obj_by_id(id)
  addr = find_obj_addr_by_id(id)
  addr ? read_obj_by_address(addr, id) : nil
end

#refresh ⇒ Object

This method iterates over all entries in the FlatFile and removes the entry and inserts it again. This is useful to update all entries in case the storage format has changed.



417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
# File 'lib/perobs/FlatFile.rb', line 417

# Iterate over all entries in the FlatFile, remove each valid entry and
# insert it again. This is useful to update all entries in case the
# storage format has changed. The file is defragmented afterwards.
def refresh
  # This iteration might look scary as we iterate over the entries while
  # we are rearranging them. Re-inserted items may be inserted before or at
  # the current entry and this is fine. They also may be inserted after the
  # current entry and will be re-read again unless they are inserted after
  # the original file end.
  file_size = @f.size
  PEROBS.log.info "Refreshing the DB..."
  t = Time.now
  each_blob_header do |pos, header|
    # Some re-inserted blobs may be inserted after the original file end.
    # No need to process those blobs again. This must be checked before
    # the entry is processed or the first of those blobs would be
    # rewritten a second time.
    break if pos >= file_size

    if header.is_valid?
      buf = read_obj_by_address(pos, header.id)
      delete_obj_by_address(pos, header.id)
      write_obj_by_id(header.id, buf)
    end
  end
  PEROBS.log.info "DB refresh completed in #{Time.now - t} seconds"

  # Reclaim the space saved by compressing entries.
  defragmentize
end

#regenerate_index_and_spaces ⇒ Object

This method clears the index tree and the free space list and regenerates them from the FlatFile.



557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
# File 'lib/perobs/FlatFile.rb', line 557

# Clear the index and the free space list and rebuild both from the blob
# headers found in the FlatFile. When a second valid blob with an already
# indexed ID is found, it is reported, its space is registered as free and
# the blob is discarded.
def regenerate_index_and_spaces
  PEROBS.log.warn "Re-generating FlatFileDB index and space files"
  @index.clear
  @space_list.clear

  each_blob_header do |pos, header|
    if !header.is_valid?
      # Deleted entry: remember it as free space unless it is empty.
      @space_list.add_space(pos, header.length) if header.length > 0
    elsif (first_pos = @index.get(header.id))
      PEROBS.log.error "FlatFile contains multiple blobs for ID " \
        "#{header.id}. First blob is at address #{first_pos}. " \
        "Other blob found at address #{pos}."
      @space_list.add_space(pos, header.length) if header.length > 0
      discard_damaged_blob(header)
    else
      @index.insert(header.id, pos)
    end
  end

  sync
end

#sync ⇒ Object

Force outstanding data to be written to the filesystem.



112
113
114
115
116
117
118
119
120
# File 'lib/perobs/FlatFile.rb', line 112

# Force outstanding data to be written to the filesystem. The blob file is
# flushed first, then the index and the space list are synced. A failing
# flush is reported via PEROBS.log.fatal.
def sync
  begin
    @f.flush
  rescue IOError, SystemCallError => e
    # Write failures of the operating system (e. g. a full disk) are
    # raised as Errno::* errors (SystemCallError), not as IOError.
    PEROBS.log.fatal "Cannot sync flat file database: #{e.message}"
  end
  @index.sync
  @space_list.sync
end

#write_obj_by_id(id, raw_obj) ⇒ Integer

Write the given object into the file. This method never uses in-place updates for existing objects. A new copy is inserted first and only when the insert was successful, the old copy is deleted and the index updated.

Parameters:

  • id (Integer)

    ID of the object

  • raw_obj (String)

    Raw object as String

Returns:

  • (Integer)

    position of the written blob in the blob file



170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
# File 'lib/perobs/FlatFile.rb', line 170

# Write the given object into the file. This method never uses in-place
# updates for existing objects. A new copy is inserted first and only when
# the insert was successful, the old copy is deleted and the index updated.
#
# All sizes that end up in blob headers or in the space list are byte
# counts of the data that is actually written to the file.
#
# @param id [Integer] ID of the object
# @param raw_obj [String] Raw object as String
# @return [Integer] position of the written blob in the blob file
def write_obj_by_id(id, raw_obj)
  # Check if we have already an object with the given ID. We'll mark it as
  # outdated and save the header for later deletion. In case this
  # operation is aborted or interrupted we ensure that we either have the
  # old or the new version available.
  if (old_addr = find_obj_addr_by_id(id))
    old_header = FlatFileBlobHeader.read_at(@f, old_addr)
    old_header.set_outdated_flag
  end

  crc = checksum(raw_obj)

  # If the raw_obj is larger than 256 characters we will compress it to
  # save some space in the database file. For smaller strings the
  # performance impact of compression is not compensated by writing
  # less data to the storage.
  compressed = false
  if raw_obj.length > 256
    raw_obj = Zlib.deflate(raw_obj)
    compressed = true
  end
  # The file stores bytes, not characters. For Strings with multi-byte
  # characters String#length is smaller than the number of bytes written.
  raw_obj_bytes = raw_obj.bytesize

  addr, length = find_free_blob(raw_obj_bytes)
  begin
    if length != -1
      # Just a safeguard so we don't overwrite current data.
      header = FlatFileBlobHeader.read_at(@f, addr)
      if header.length != length
        PEROBS.log.fatal "Length in free list (#{length}) and header " +
          "(#{header.length}) for address #{addr} don't match."
      end
      if raw_obj_bytes > header.length
        PEROBS.log.fatal "Object (#{raw_obj_bytes}) is longer than " +
          "blob space (#{header.length})."
      end
      if header.is_valid?
        PEROBS.log.fatal "Entry at address #{addr} with flags: " +
          "#{header.flags} is already used for ID #{header.id}."
      end
    end
    flags = 1 << FlatFileBlobHeader::VALID_FLAG_BIT
    flags |= (1 << FlatFileBlobHeader::COMPRESSED_FLAG_BIT) if compressed
    if old_addr && old_header.is_marked?
      # This method might be called in the middle of an operation that
      # uses the mark flag. We must ensure that the flag is carried over
      # to the new header.
      flags |= (1 << FlatFileBlobHeader::MARK_FLAG_BIT)
    end
    FlatFileBlobHeader.new(@f, addr, flags, raw_obj_bytes, id, crc).write
    @f.write(raw_obj)
    if length != -1 && raw_obj_bytes < length
      # The new object was not appended and it did not completely fill the
      # free space. So we have to write a new header to mark the remaining
      # empty space.
      unless length - raw_obj_bytes >= FlatFileBlobHeader::LENGTH
        PEROBS.log.fatal "Not enough space to append the empty space " +
          "header (space: #{length} bytes, object: #{raw_obj_bytes} " +
          "bytes)."
      end
      space_address = @f.pos
      space_length = length - FlatFileBlobHeader::LENGTH - raw_obj_bytes
      FlatFileBlobHeader.new(@f, space_address, 0, space_length,
                             0, 0).write
      # Register the new space with the space list.
      @space_list.add_space(space_address, space_length) if space_length > 0
    end

    # Once the blob has been written we can update the index as well.
    @index.insert(id, addr)

    if old_addr
      # If we had an existing object stored for the ID we have to mark
      # this entry as deleted now.
      old_header.clear_flags
      # And register the newly freed space with the space list. Like for
      # all other spaces, empty ones are not registered.
      if old_header.length > 0
        @space_list.add_space(old_addr, old_header.length)
      end
    else
      @f.flush
    end
  rescue IOError, SystemCallError => e
    # Operating system level failures are Errno::* errors
    # (SystemCallError), so they are handled like IOError.
    PEROBS.log.fatal "Cannot write blob for ID #{id} to FlatFileDB: " +
      e.message
  end

  addr
end