Module: CorrectHorseBatteryStaple::Backend::IsamKD::InstanceMethods
- Defined in:
- lib/correct_horse_battery_staple/backend/isam_kd.rb
Instance Method Summary collapse
- #binwrite(*args) ⇒ Object
-
#each(&block) ⇒ Object
some core Enumerable building blocks.
- #file_range_read(file_range = nil) ⇒ Object
- #file_size(file) ⇒ Object
- #file_string ⇒ Object
- #fix_stats(stats) ⇒ Object
-
#get_word_by_idx(n) ⇒ Object
this version is much slower than the other — up to 1.5x the total runtime in some cases.
- #initialize_backend_variables ⇒ Object
-
#inspect ⇒ Object
Show some information about the loaded ISAM file (sizes, offsets, prelude).
-
#len2coord(len) ⇒ Object
make the search space more square by increasing the length of the “word length” axis.
- #load_kdtree ⇒ Object
-
#nth_chunk(n, string) ⇒ Object
return a string representing the nth_record.
- #openmode ⇒ Object
- #pad(size, io) ⇒ Object
- #page_size ⇒ Object
- #parse_prelude ⇒ Object
-
#parse_record(string, index = 0, word = CorrectHorseBatteryStaple::Word.new(:word => ""), length_range = nil) ⇒ Object
Parse a record into a Word object, which can be provided or will otherwise be constructed as needed fourth arg is a length range which can act as a filter; if not satisfied, nil will be returned.
-
#parse_record_into_array(string, index, length_range = nil) ⇒ Object
Parse a record into an array of [word, frequency] IFF the word fits into the length_range or length_range is nil.
-
#percentile_index(percentile, round = true) ⇒ Object
rather than using a StatisticalArray, we do direct indexing into the file/string.
-
#pick(count, options = {}) ⇒ Object
optimized pick - does NOT support :filter, though.
- #pos_of_nth_word_in_file(n) ⇒ Object
-
- #precache(max = -1) ⇒ Object
Format of header:.
- #prelude ⇒ Object
-
#record_percentile_range_read(percentile_range) ⇒ Object
memoize :record_range_read.
- #record_range_for_percentile(range) ⇒ Object
- #record_range_read(record_range = nil) ⇒ Object
-
#records_size ⇒ Object
file I/O.
-
#records_string ⇒ Object
returns a string representing the record-holding portion of the file.
-
#round_up(val, blocksize = page_size) ⇒ Object
many MMUs in default mode and modern highcap drives have 4k pages/blocks.
- #size ⇒ Object
-
#sorted_entries ⇒ Object
we presume that the ISAM file has been sorted.
- #word_length(chunk_string) ⇒ Object
- #write_corpus_to_io(corpus, io = STDOUT) ⇒ Object
- #write_kdtree(corpus, io) ⇒ Object
Instance Method Details
#binwrite(*args) ⇒ Object
118 119 120 121 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 118

# Write +args+ to the output IO in binary mode when the IO supports
# #binwrite, otherwise fall back to a plain #write.
def binwrite(*args)
  if io.respond_to?(:binwrite)
    io.binwrite(*args)
  else
    io.write(*args)
  end
end
#each(&block) ⇒ Object
some core Enumerable building blocks
301 302 303 304 305 306 307 308 309 310 311 312 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 301

# Yield every Word in the record section, in file order, with its
# index and percentile filled in.
#
# BUG FIX: the loop previously ran `while index < max_index` with
# max_index = size - 1, which silently skipped the final record;
# it now iterates over all +size+ records.
def each(&block)
  string = records_string
  index = 0
  while index < size
    word = parse_record(string, index)
    word.index = index
    # percentile of the index-th sorted record, clamped at 0
    word.percentile = [(index - 0.5) / size, 0].max * 100
    yield word
    index += 1
  end
end
#file_range_read(file_range = nil) ⇒ Object
373 374 375 376 377 378 379 380 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 373 def file_range_read(file_range = nil) file_range ||= 0...file_size(@file) pos = @file.tell @file.seek(file_range.first) @file.read(range_count(file_range)) ensure @file.seek(pos) end |
#file_size(file) ⇒ Object
166 167 168 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 166

# Byte size of +file+, working for both IO-like objects that expose
# #size directly (e.g. StringIO) and real File handles (via #stat).
def file_size(file)
  return file.size if file.respond_to?(:size)
  file.stat.size
end
#file_string ⇒ Object
369 370 371 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 369

# Entire file contents as a String: cheap #string access for an
# in-memory StringIO, otherwise a full-range read from disk.
def file_string
  if @file.is_a?(StringIO)
    @file.string
  else
    file_range_read(nil)
  end
end
#fix_stats(stats) ⇒ Object
30 31 32 33 34 35 36 37 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 30

# Replace any NaN values in the stats hash with -1 (NaN is not
# representable in JSON); mutates and returns +stats+.
def fix_stats(stats)
  stats.each_pair do |key, value|
    stats[key] = -1 if value.respond_to?(:nan?) && value.nan?
  end
  stats
end
#get_word_by_idx(n) ⇒ Object
this version is much slower than the other - 1.5x total runtime slower in some cases.
# Fetch the n-th word by seeking directly in the file and reading one
# record's worth of bytes (slower alternative to get_word_by_idx).
def get_word_by_idx_direct(n)
  @file.seek(pos_of_nth_word_in_file(n))
  parse_record(@file.read(@entry_length))
end
291 292 293 294 295 296 297 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 291

# Fetch the n-th word by slicing its chunk out of the in-memory
# record string, annotating it with its index and percentile.
def get_word_by_idx(n)
  word = parse_record(nth_chunk(n, records_string))
  word.index = n
  word.percentile = [(n - 0.5) / size, 0].max * 100
  word
end
#initialize_backend_variables ⇒ Object
23 24 25 26 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 23

# Set backend tunables: the KD-tree length-axis scaling factor and the
# assumed page/block size for padding (4 KiB).
def initialize_backend_variables
  @length_scaling_factor = 15
  @page_size = 4096
end
#inspect ⇒ Object
Show some information about the loaded ISAM file (sizes, offsets, prelude).
214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 214 def inspect super + "\n" + <<INSPECT File size: #{file_size(@file)} Word length: #{@word_length} Frequency bytes: #{@frequency_length} Total record bytes: #{@records_length} Offset of K-D Tree index: #{@offset_index1} Total K-D Tree index bytes: #{file_size(@file) - @offset_index1} K-D Tree Signature: #{file_range_read(@offset_index1..(@offset_index1+3))} Prelude: #{@prelude.map {|k,v| k=="stats" ? "" : " #{k}: #{v}\n" }.join("") } INSPECT end |
#len2coord(len) ⇒ Object
make the search space more square by increasing the length of the “word length” axis
114 115 116 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 114

# Scale a word length onto the KD-tree's length axis so the search
# space is roughly square relative to the 0-100 percentile axis.
def len2coord(len)
  scale = @length_scaling_factor || 10
  len * scale
end
#load_kdtree ⇒ Object
229 230 231 232 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 229

# Seek to the index offset recorded in the prelude and rehydrate the
# persisted K-D tree from the file handle.
def load_kdtree
  @file.seek(@offset_index1)
  KDTree.new(@file)
end
#nth_chunk(n, string) ⇒ Object
return a string representing the nth_record
273 274 275 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 273

# Slice the n-th fixed-size record out of +string+ and return it.
def nth_chunk(n, string)
  offset = @entry_length * n
  string[offset, @entry_length]
end
#openmode ⇒ Object
123 124 125 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 123

# File open mode for writing: binary/ASCII-8BIT on Rubies that support
# IO.binwrite, plain write mode otherwise.
def openmode
  if IO.respond_to?(:binwrite)
    "wb:ASCII-8BIT"
  else
    "w"
  end
end
#pad(size, io) ⇒ Object
94 95 96 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 94

# Write +size+ NUL padding bytes to +io+ (pack "xN" emits N nulls).
def pad(size, io)
  padding = [].pack("x#{size}")
  io.write(padding)
end
#page_size ⇒ Object
39 40 41 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 39

# Page/block size used for padding; defaults to 4 KiB when not set.
def page_size
  return @page_size if @page_size
  4096
end
#parse_prelude ⇒ Object
174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 174 def parse_prelude @file.seek 0 prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH) # byte offset of first record from beginning of file # total length of JSON string (without padding) (@record_offset, @prelude_len) = prelude_buf.unpack("NN") # read more if our initial read didn't slurp in the entire prelude if @prelude_len > prelude_buf.length prelude_buf += @file.read(@prelude_len - prelude_buf.length) end @prelude = JSON.parse( prelude_buf.unpack("@8a#{@prelude_len}")[0] ) || {} # includes prefix length byte @word_length = @prelude["wlen"] || raise(ArgumentError, "Word length is not defined!") # as network byte order int @frequency_length = @prelude["flen"] || 4 # total length of record @entry_length = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!") @offset_index1 = @prelude["offset_index1"] || raise(ArgumentError, "No index offset!") @records_length = @prelude["records_length"] || raise(ArgumentError, "No records length!") @entry_count = @prelude["n"] || raise(ArgumentError, "Number of records not included!") @length_scaling_factor = @prelude["length_scaling_factor"] || 10 load_stats_from_hash(@prelude["stats"]) if @prelude["stats"] @prelude end |
#parse_record(string, index = 0, word = CorrectHorseBatteryStaple::Word.new(:word => ""), length_range = nil) ⇒ Object
Parse a record into a Word object, which can be provided or will otherwise be constructed as needed fourth arg is a length range which can act as a filter; if not satisfied, nil will be returned
258 259 260 261 262 263 264 265 266 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 258

# Parse a record into a Word object, which can be provided or will
# otherwise be constructed as needed.  Fourth arg is a length range
# which can act as a filter; if not satisfied, nil will be returned.
#
# NOTE(review): the local holding the parsed [word, frequency] pair was
# stripped from the extracted listing (`= parse_record_into_array(...)`,
# `word.word = [0]`); restored here as +ary+ — verify against upstream.
def parse_record(string, index=0, word=CorrectHorseBatteryStaple::Word.new(:word => ""), length_range = nil)
  ary = parse_record_into_array(string, index, length_range)
  return nil unless ary
  word.word      = ary[0]
  word.frequency = ary[1]
  word
end
#parse_record_into_array(string, index, length_range = nil) ⇒ Object
Parse a record into an array of [word, frequency] IFF the word fits into the length_range or length_range is nil
241 242 243 244 245 246 247 248 249 250 251 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 241 def parse_record_into_array(string, index, length_range = nil) chunk = nth_chunk(index, string) raise "No chunk for index #{index}" unless chunk actual_word_length = chunk.unpack("C")[0] if !length_range || length_range.include?(actual_word_length) # returns [word, frequency] chunk.unpack("xa#{actual_word_length}@#{@word_length}N") else nil end end |
#percentile_index(percentile, round = true) ⇒ Object
rather than using a StatisticalArray, we do direct indexing into the file/string
402 403 404 405 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 402

# Map a percentile (0-100) onto a record index via direct arithmetic
# rather than a StatisticalArray; returns a rounded Integer unless
# +round+ is false, in which case the raw Float is returned.
def percentile_index(percentile, round=true)
  raw = percentile.to_f / 100 * count + 0.5
  round ? raw.round : raw
end
#pick(count, options = {}) ⇒ Object
optimized pick - does NOT support :filter, though
328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 328

# Optimized pick — does NOT support :filter, though.  Randomly samples
# (word_length, percentile) points and asks the K-D tree for the
# nearest record, accumulating +count+ distinct qualifying words.
# Raises when the criteria cannot be met within the iteration budget.
#
# NOTE(review): every occurrence of the `options` identifier was
# stripped from the extracted listing (`def pick(count, = {})`); it has
# been restored here — verify against the upstream source.
def pick(count, options = {})
  # incompat check
  raise NotImplementedError, "ISAM does not support :filter option" if options[:filter]

  options = {:percentile => 0..100, :word_length => 0..20}.merge(options)

  result = []
  found_indexes = []
  iterations = 0
  # bound the sampling loop so impossible criteria cannot spin forever
  max_iterations = [1000, 4 * count].max

  while (result.size < count && iterations < max_iterations)
    len = random_in_range(options[:word_length])
    pct = random_in_range(options[:percentile])
    word_idx = @kdtree.nearest(len2coord(len), pct)
    unless found_indexes.include?(word_idx)
      found_indexes << word_idx
      word = get_word_by_idx(word_idx)
      # nearest-neighbor can land outside the requested length range;
      # re-check before accepting the word
      if options[:word_length].include?(word.word.length)
        result << word
      else
        STDERR.puts "non-qualifying word: #{word.word.length}"
      end
    end
    iterations += 1
  end

  # validate that we succeeded
  raise "Cannot find #{count} words matching criteria" if result.length < count

  result
end
#pos_of_nth_word_in_file(n) ⇒ Object
277 278 279 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 277

# Absolute byte offset of the n-th record in the file: records start
# at @record_offset and are @entry_length bytes each.
# (Removed a useless `pos =` assignment to an unused local.)
def pos_of_nth_word_in_file(n)
  @record_offset + (n * @entry_length)
end
#precache(max = -1) ⇒ Object
Format of header:
0..3 - OB - offset of body start in bytes; network byte order 4..7 - LP - length of prelude in network byte order 8..OB-1 - P - JSON-encoded prelude hash and space padding OB..EOF - array of fixed size records as described in prelude
Contents of Prelude (after JSON decoding):
P - length of word part of record P - length of frequency part of record (always 4 bytes) P - length of total part of record P - number of records P - field name sorted by (word or frequency) P - corpus statistics P - absolute file offset of KDTree index P - length in bytes of records section, excluding padding P - what length was multiplied by in creating KDTree (usually 15)
Format of record:
1 byte - LW - actual length of word within field P bytes - LW bytes of word (W) + P-LW bytes of padding P (4) bytes - frequency as network byte order long
After record section, there is padding up to the next page_size boundary, and then there is a dumped KDTree which extends to EOF.
160 161 162 163 164 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 160

# Load the whole file into an in-memory StringIO so later reads avoid
# disk I/O.  When +max+ is non-negative and the file is larger than
# +max+ bytes, precaching is skipped.
def precache(max = -1)
  return if max > -1 && file_size(@file) > max
  @file.seek 0
  @file = StringIO.new(@file.read, "r")
end
#prelude ⇒ Object
170 171 172 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 170

# Memoized accessor for the decoded prelude hash; parses it from the
# file on first access.
def prelude
  @prelude ? @prelude : parse_prelude
end
#record_percentile_range_read(percentile_range) ⇒ Object
memoize :record_range_read
395 396 397 398 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 395

# Read the raw record bytes covering a percentile range: translate the
# percentile range into a record byte range, then read it.
def record_percentile_range_read(percentile_range)
  record_range_read(record_range_for_percentile(percentile_range))
end
#record_range_for_percentile(range) ⇒ Object
407 408 409 410 411 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 407 def record_range_for_percentile(range) range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric) (percentile_index(range.begin, false).floor * @entry_length ... percentile_index(range.end, false).ceil * @entry_length) end |
#record_range_read(record_range = nil) ⇒ Object
389 390 391 392 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 389 def record_range_read(record_range = nil) record_range ||= 0...records_size file_range_read((record_range.first + @record_offset)...(range_count(record_range) + @record_offset)) end |
#records_size ⇒ Object
file I/O
365 366 367 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 365 def records_size @records_length end |
#records_string ⇒ Object
returns a string representing the record-holding portion of the file
384 385 386 387 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 384

# Memoized string holding the record section of the file.
def records_string
  @records_string = record_range_read(0 ... records_size) unless @records_string
  @records_string
end
#round_up(val, blocksize = page_size) ⇒ Object
many MMUs in default mode and modern highcap drives have 4k pages/blocks
44 45 46 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 44

# Round +val+ up to the next multiple of +blocksize+, with a minimum
# of one full block (many MMUs and modern drives use 4k pages/blocks).
def round_up(val, blocksize=page_size)
  blocks = (val.to_f / blocksize).ceil
  blocks = 1 if blocks < 1
  blocks * blocksize
end
#size ⇒ Object
314 315 316 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 314

# Number of records, memoized; derived from the record-section size
# when the prelude did not already provide it.
def size
  return @entry_count if @entry_count
  @entry_count = records_size / @entry_length
end
#sorted_entries ⇒ Object
we presume that the ISAM file has been sorted
322 323 324 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 322

# Memoized entries; we presume the ISAM file has been written sorted,
# so no re-sort is performed here.
def sorted_entries
  @sorted_entries = entries unless @sorted_entries
  @sorted_entries
end
#word_length(chunk_string) ⇒ Object
268 269 270 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 268

# Actual length of the word stored in a record chunk: the first byte
# of every record is an unsigned length prefix.
# BUG FIX: unpack("C") returns a one-element Array; index it so the
# Integer is returned, consistent with parse_record_into_array's
# `chunk.unpack("C")[0]`.
def word_length(chunk_string)
  chunk_string.unpack("C")[0]
end
#write_corpus_to_io(corpus, io = STDOUT) ⇒ Object
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 48 def write_corpus_to_io(corpus, io=STDOUT) io.rewind # includes prefix length byte @word_length = corpus.reduce(0) { |m, e| m > e.word.length ? m : e.word.length } + 1 @freq_length = 4 @entry_length = @word_length + @freq_length stats = fix_stats(corpus.stats) corpus_word_count = corpus.length prelude = { "wlen" => @word_length, "flen" => 4, "entrylen" => @word_length + @freq_length, "sort" => "frequency", "n" => corpus_word_count, "stats" => stats, "flags" => 0, "length_scaling_factor" => (@length_scaling_factor || 15), "records_length" => "0000000000", "offset_records" => "0000000000", "offset_index1" => "0000000000", "offset_index2" => "0000000000" } prelude_json_length = prelude.to_json.length prelude["offset_records"] = offset_records = round_up(prelude_json_length+8.0) prelude["records_length"] = records_length = corpus_word_count * prelude["entrylen"] offset_index1 = prelude["offset_records"] + round_up(records_length, page_size) prelude["offset_index1"] = offset_index1 io.write([offset_records, prelude_json_length, prelude.to_json]. pack("NNA#{offset_records-8}")) corpus.each_with_index do |w, index| io.write(s=[w.word.length, w.word, w.frequency].pack("Ca#{@word_length-1}N")) end pad(offset_index1 - (offset_records + records_length), io) write_kdtree(corpus, io) end |
#write_kdtree(corpus, io) ⇒ Object
98 99 100 101 102 103 104 105 106 107 108 109 110 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 98

# Build a K-D tree over (scaled word length, percentile) points, each
# tagged with its record index, and persist it to +io+.
def write_kdtree(corpus, io)
  points = corpus.entries.each_with_index.map do |w, idx|
    [len2coord(w.word.length.to_f), w.percentile.to_f, idx]
  end
  KDTree.new(points).persist(io)
end