Module: CorrectHorseBatteryStaple::Backend::IsamKD::InstanceMethods

Defined in:
lib/correct_horse_battery_staple/backend/isam_kd.rb

Instance Method Summary collapse

Instance Method Details

#binwrite(*args) ⇒ Object



118
119
120
121
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 118

def binwrite(*args)
  method = io.respond_to?(:binwrite) ? :binwrite : :write
  io.send(method, *args)
end

#each(&block) ⇒ Object

some core Enumerable building blocks



301
302
303
304
305
306
307
308
309
310
311
312
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 301

def each(&block)
  string = records_string
  max_index = size - 1
  index = 0
  while index < max_index
    word = parse_record(string, index)
    word.index = index
    word.percentile = [(index-0.5)/size,0].max * 100
    yield word
    index += 1
  end
end

#file_range_read(file_range = nil) ⇒ Object



373
374
375
376
377
378
379
380
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 373

def file_range_read(file_range = nil)
  file_range ||= 0...file_size(@file)
  pos = @file.tell
  @file.seek(file_range.first)
  @file.read(range_count(file_range))
ensure
  @file.seek(pos)
end

#file_size(file) ⇒ Object



166
167
168
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 166

def file_size(file)
  (file.respond_to?(:size) ? file.size : file.stat.size)
end

#file_stringObject



369
370
371
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 369

def file_string
  @file.is_a?(StringIO) ? @file.string : file_range_read(nil)
end

#fix_stats(stats) ⇒ Object



30
31
32
33
34
35
36
37
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 30

def fix_stats(stats)
  stats.each do |k,v|
    if v.respond_to?(:nan?) && v.nan?
      stats[k] = -1
    end
  end
  stats
end

#get_word_by_idx(n) ⇒ Object

this version is much slower than the other - 1.5x total runtime slower in some cases.

def get_word_by_idx_direct(n)

@file.seek(pos_of_nth_word_in_file(n))
chunk = @file.read(@entry_length)
parse_record(chunk)

end



291
292
293
294
295
296
297
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 291

def get_word_by_idx(n)
  chunk = nth_chunk(n, records_string)
  parse_record(chunk).tap do |w|
    w.index      = n
    w.percentile = [(n-0.5)/size,0].max * 100
  end
end

#initialize_backend_variablesObject



23
24
25
26
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 23

def initialize_backend_variables
  @length_scaling_factor = 15
  @page_size = 4096
end

#inspectObject

Show some information about



214
215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 214

def inspect
  super + "\n" + <<INSPECT
File size: #{file_size(@file)}
Word length: #{@word_length}
Frequency bytes: #{@frequency_length}
Total record bytes: #{@records_length}
Offset of K-D Tree index: #{@offset_index1}
Total K-D Tree index bytes: #{file_size(@file) - @offset_index1}
K-D Tree Signature: #{file_range_read(@offset_index1..(@offset_index1+3))}

Prelude:
#{@prelude.map {|k,v| k=="stats" ? "" : "  #{k}: #{v}\n" }.join("") }
INSPECT
end

#len2coord(len) ⇒ Object

make the search space more square by increasing the length of the “word length” axis



114
115
116
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 114

def len2coord(len)
  len * (@length_scaling_factor || 10)
end

#load_kdtreeObject



229
230
231
232
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 229

def load_kdtree
  @file.seek(@offset_index1)
  KDTree.new @file
end

#nth_chunk(n, string) ⇒ Object

return a string representing the nth_record



273
274
275
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 273

def nth_chunk(n, string)
  string[@entry_length * n, @entry_length]
end

#openmodeObject



123
124
125
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 123

def openmode
  IO.respond_to?(:binwrite) ? "wb:ASCII-8BIT" : "w"
end

#pad(size, io) ⇒ Object



94
95
96
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 94

def pad(size, io)
  io.write([].pack("x#{size}"))
end

#page_sizeObject



39
40
41
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 39

def page_size
  @page_size || 4096
end

#parse_preludeObject



174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 174

def parse_prelude
  @file.seek 0
  prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH)

  # byte offset of first record from beginning of file
  # total length of JSON string (without padding)
  (@record_offset, @prelude_len)  = prelude_buf.unpack("NN")

  # read more if our initial read didn't slurp in the entire prelude
  if @prelude_len > prelude_buf.length
    prelude_buf += @file.read(@prelude_len - prelude_buf.length)
  end

  @prelude = JSON.parse( prelude_buf.unpack("@8a#{@prelude_len}")[0] ) || {}

  # includes prefix length byte
  @word_length      = @prelude["wlen"]     || raise(ArgumentError, "Word length is not defined!")

  # as network byte order int
  @frequency_length = @prelude["flen"]     || 4

  # total length of record
  @entry_length     = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!")

  @offset_index1    = @prelude["offset_index1"] || raise(ArgumentError, "No index offset!")

  @records_length   = @prelude["records_length"] || raise(ArgumentError, "No records length!")

  @entry_count      = @prelude["n"] || raise(ArgumentError, "Number of records not included!")

  @length_scaling_factor = @prelude["length_scaling_factor"] || 10
  
  load_stats_from_hash(@prelude["stats"]) if @prelude["stats"]

  @prelude
end

#parse_record(string, index = 0, word = CorrectHorseBatteryStaple::Word.new(:word => ""), length_range = nil) ⇒ Object

Parse a record into a Word object, which can be provided or will otherwise be constructed as needed fourth arg is a length range which can act as a filter; if not satisfied, nil will be returned



258
259
260
261
262
263
264
265
266
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 258

def parse_record(string, index=0,
                 word=CorrectHorseBatteryStaple::Word.new(:word => ""),
                 length_range = nil)
  bare = parse_record_into_array(string, index, length_range)
  return nil unless bare
  word.word = bare[0]
  word.frequency = bare[1]
  word
end

#parse_record_into_array(string, index, length_range = nil) ⇒ Object

Parse a record into an array of [word, frequency] IFF the word fits into the length_range or length_range is nil



241
242
243
244
245
246
247
248
249
250
251
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 241

def parse_record_into_array(string, index, length_range = nil)
  chunk = nth_chunk(index, string)
  raise "No chunk for index #{index}" unless chunk
  actual_word_length = chunk.unpack("C")[0]
  if !length_range || length_range.include?(actual_word_length)
    # returns [word, frequency]
    chunk.unpack("xa#{actual_word_length}@#{@word_length}N")
  else
    nil
  end
end

#percentile_index(percentile, round = true) ⇒ Object

rather than using a StatisticalArray, we do direct indexing into the file/string



402
403
404
405
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 402

def percentile_index(percentile, round=true)
  r = percentile.to_f/100 * count + 0.5
  round ? r.round : r
end

#pick(count, options = {}) ⇒ Object

optimized pick - does NOT support :filter, though

Raises:

  • (NotImplementedError)


328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 328

def pick(count, options = {})
  # incompat check
  raise NotImplementedError, "ISAM does not support :filter option" if options[:filter]

  options = {:percentile  => 0..100,
             :word_length => 0..20}.merge(options)
  
  result = []
  found_indexes = []
  iterations = 0
  max_iterations = [1000, 4 * count].max
  while (result.size < count && iterations < max_iterations)
    len = random_in_range(options[:word_length])
    pct = random_in_range(options[:percentile])
    word_idx = @kdtree.nearest(len2coord(len), pct)
    unless found_indexes.include?(word_idx)
      found_indexes << word_idx
      word = get_word_by_idx(word_idx)
      if options[:word_length].include?(word.word.length)
        result << word
      else
        STDERR.puts "non-qualifying word: #{word.word.length}"
      end
    end
    iterations += 1
  end

  # validate that we succeeded
  raise "Cannot find #{count} words matching criteria" if result.length < count

  result
end

#pos_of_nth_word_in_file(n) ⇒ Object



277
278
279
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 277

def pos_of_nth_word_in_file(n)
  pos = @record_offset + (n * @entry_length)
end

#precache(max = -1)) ⇒ Object

Format of header:

0..3 - OB - offset of body start in bytes; network byte order 4..7 - LP - length of prelude in network byte order 8..OB-1 - P - JSON-encoded prelude hash and space padding OB..EOF - array of fixed size records as described in prelude

Contents of Prelude (after JSON decoding):

P - length of word part of record P - length of frequency part of record (always 4 bytes) P - length of total part of record P - number of records P - field name sorted by (word or frequency) P - corpus statistics P - absolute file offset of KDTree index P - length in bytes of records section, excluding padding P - what length was multiplied by in creating KDTree (usually 15)

Format of record:

2 bytes - LW - actual length of word within field P bytes - LW bytes of word (W) + P-LW bytes of padding P (4) bytes - frequency as network byte order long

After record section, there is padding up to the next page_size boundary, and then there is a dumped KDTree which extends to EOF.



160
161
162
163
164
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 160

def precache(max = -1)
  return if max > -1 && file_size(@file) > max
  @file.seek 0
  @file = StringIO.new @file.read, "r"
end

#preludeObject



170
171
172
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 170

def prelude
  @prelude || parse_prelude
end

#record_percentile_range_read(percentile_range) ⇒ Object

memoize :record_range_read



395
396
397
398
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 395

def record_percentile_range_read(percentile_range)
  record_range = record_range_for_percentile(percentile_range)
  record_range_read(record_range)
end

#record_range_for_percentile(range) ⇒ Object



407
408
409
410
411
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 407

def record_range_for_percentile(range)
  range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric)
  (percentile_index(range.begin, false).floor * @entry_length ...
   percentile_index(range.end,   false).ceil * @entry_length)
end

#record_range_read(record_range = nil) ⇒ Object



389
390
391
392
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 389

def record_range_read(record_range = nil)
  record_range ||= 0...records_size
  file_range_read((record_range.first + @record_offset)...(range_count(record_range) + @record_offset))
end

#records_sizeObject

file I/O



365
366
367
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 365

def records_size
  @records_length
end

#records_stringObject

returns a string representing the record-holding portion of the file



384
385
386
387
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 384

def records_string
  @records_string ||=
    record_range_read(0 ... records_size)
end

#round_up(val, blocksize = page_size) ⇒ Object

many MMUs in default mode and modern highcap drives have 4k pages/blocks



44
45
46
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 44

def round_up(val, blocksize=page_size)
  [(val.to_f/blocksize).ceil, 1].max * blocksize
end

#sizeObject



314
315
316
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 314

def size
  @entry_count ||= records_size / @entry_length
end

#sorted_entriesObject

we presume that the ISAM file has been sorted



322
323
324
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 322

def sorted_entries
  @sorted_entries ||= entries
end

#word_length(chunk_string) ⇒ Object



268
269
270
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 268

def word_length(chunk_string)
  chunk_string.unpack("C")
end

#write_corpus_to_io(corpus, io = STDOUT) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 48

def write_corpus_to_io(corpus, io=STDOUT)
  io.rewind

  # includes prefix length byte
  @word_length = corpus.reduce(0) { |m, e| m > e.word.length ? m : e.word.length } + 1
  @freq_length = 4
  @entry_length = @word_length + @freq_length

  stats = fix_stats(corpus.stats)
  corpus_word_count = corpus.length

  prelude = {
    "wlen"           => @word_length,
    "flen"           => 4,
    "entrylen"       => @word_length + @freq_length,
    "sort"           => "frequency",
    "n"              => corpus_word_count,
    "stats"          => stats,
    "flags"          => 0,
    "length_scaling_factor" => (@length_scaling_factor || 15),
    "records_length" => "0000000000",
    "offset_records" => "0000000000",
    "offset_index1"  => "0000000000",
    "offset_index2"  => "0000000000"
  }

  prelude_json_length = prelude.to_json.length
  prelude["offset_records"] = offset_records = round_up(prelude_json_length+8.0)

  prelude["records_length"] = records_length = corpus_word_count * prelude["entrylen"]
  offset_index1 = prelude["offset_records"] +
    round_up(records_length, page_size)

  prelude["offset_index1"]  = offset_index1

  io.write([offset_records, prelude_json_length, prelude.to_json].
           pack("NNA#{offset_records-8}"))

  corpus.each_with_index do |w, index|
    io.write(s=[w.word.length, w.word, w.frequency].pack("Ca#{@word_length-1}N"))
  end

  pad(offset_index1 - (offset_records + records_length), io)
  write_kdtree(corpus, io)
end

#write_kdtree(corpus, io) ⇒ Object



98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 98

def write_kdtree(corpus, io)
  i = -1
  k = KDTree.new(
                 corpus.entries.map {|w| [
                                          len2coord(w.word.length.to_f),
                                          w.percentile.to_f,
                                          i+=1
                                         ]
                 }
                 )

  k.persist(io)
end