Module: CorrectHorseBatteryStaple::Backend::IsamKD::InstanceMethods
- Defined in:
- lib/correct_horse_battery_staple/backend/isam_kd.rb
Instance Method Summary collapse
- #binwrite(*args) ⇒ Object
-
#each(&block) ⇒ Object
some core Enumerable building blocks.
- #file_range_read(file_range = nil) ⇒ Object
- #file_size(file) ⇒ Object
- #file_string ⇒ Object
- #fix_stats(stats) ⇒ Object
-
#get_word_by_idx(n) ⇒ Object
this version is much slower than the other — up to 1.5x the total runtime in some cases.
- #initialize_backend_variables ⇒ Object
-
#inspect ⇒ Object
Show some information about the loaded ISAM file (sizes, offsets, prelude).
-
#len2coord(len) ⇒ Object
make the search space more square by increasing the length of the “word length” axis.
- #load_kdtree ⇒ Object
-
#nth_chunk(n, string) ⇒ Object
return a string representing the nth_record.
- #openmode ⇒ Object
- #pad(size, io) ⇒ Object
- #page_size ⇒ Object
- #parse_prelude ⇒ Object
-
#parse_record(string, index = 0, word = CorrectHorseBatteryStaple::Word.new(:word => ""), length_range = nil) ⇒ Object
Parse a record into a Word object, which can be provided or will otherwise be constructed as needed fourth arg is a length range which can act as a filter; if not satisfied, nil will be returned.
-
#parse_record_into_array(string, index, length_range = nil) ⇒ Object
Parse a record into an array of [word, frequency] IFF the word fits into the length_range or length_range is nil.
-
#percentile_index(percentile, round = true) ⇒ Object
rather than using a StatisticalArray, we do direct indexing into the file/string.
-
#pick(count, options = {}) ⇒ Object
optimized pick - does NOT support :filter, though.
- #pos_of_nth_word_in_file(n) ⇒ Object
-
- #precache(max = -1) ⇒ Object
Format of header:.
- #prelude ⇒ Object
-
#record_percentile_range_read(percentile_range) ⇒ Object
memoize :record_range_read.
- #record_range_for_percentile(range) ⇒ Object
- #record_range_read(record_range = nil) ⇒ Object
-
#records_size ⇒ Object
file I/O.
-
#records_string ⇒ Object
returns a string representing the record-holding portion of the file.
-
#round_up(val, blocksize = page_size) ⇒ Object
many MMUs in default mode and modern highcap drives have 4k pages/blocks.
- #size ⇒ Object
-
#sorted_entries ⇒ Object
we presume that the ISAM file has been sorted.
- #word_length(chunk_string) ⇒ Object
- #write_corpus_to_io(corpus, io = STDOUT) ⇒ Object
- #write_kdtree(corpus, io) ⇒ Object
Instance Method Details
#binwrite(*args) ⇒ Object
118 119 120 121 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 118

# Write +args+ to the output IO in binary mode when the IO supports
# #binwrite, otherwise fall back to a plain #write.
def binwrite(*args)
  if io.respond_to?(:binwrite)
    io.binwrite(*args)
  else
    io.write(*args)
  end
end
#each(&block) ⇒ Object
some core Enumerable building blocks
301 302 303 304 305 306 307 308 309 310 311 312 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 301

# Yield every Word in the record section, in file order, with its
# index and percentile filled in.
#
# BUG FIX: the loop previously ran `while index < max_index` with
# max_index = size - 1, which silently skipped the final record;
# it now iterates over all +size+ records.
def each(&block)
  string = records_string
  index = 0
  while index < size
    word = parse_record(string, index)
    word.index = index
    # percentile of the index-th sorted record, clamped at 0
    word.percentile = [(index - 0.5) / size, 0].max * 100
    yield word
    index += 1
  end
end
#file_range_read(file_range = nil) ⇒ Object
373 374 375 376 377 378 379 380 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 373 def file_range_read(file_range = nil) file_range ||= 0...file_size(@file) pos = @file.tell @file.seek(file_range.first) @file.read(range_count(file_range)) ensure @file.seek(pos) end |
#file_size(file) ⇒ Object
166 167 168 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 166

# Byte size of +file+, working for both IO-like objects that expose
# #size directly (e.g. StringIO) and real File handles (via #stat).
def file_size(file)
  return file.size if file.respond_to?(:size)
  file.stat.size
end
#file_string ⇒ Object
369 370 371 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 369

# Entire file contents as a String: cheap #string access for an
# in-memory StringIO, otherwise a full-range read from disk.
def file_string
  if @file.is_a?(StringIO)
    @file.string
  else
    file_range_read(nil)
  end
end
#fix_stats(stats) ⇒ Object
30 31 32 33 34 35 36 37 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 30

# Replace any NaN values in the stats hash with -1 (NaN is not
# representable in JSON); mutates and returns +stats+.
def fix_stats(stats)
  stats.each_pair do |key, value|
    stats[key] = -1 if value.respond_to?(:nan?) && value.nan?
  end
  stats
end
#get_word_by_idx(n) ⇒ Object
this version is much slower than the other - 1.5x total runtime slower in some cases.
# Fetch the n-th word by seeking directly in the file and reading one
# record's worth of bytes (slower alternative to get_word_by_idx).
def get_word_by_idx_direct(n)
  @file.seek(pos_of_nth_word_in_file(n))
  parse_record(@file.read(@entry_length))
end
291 292 293 294 295 296 297 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 291

# Fetch the n-th word by slicing its chunk out of the in-memory
# record string, annotating it with its index and percentile.
def get_word_by_idx(n)
  word = parse_record(nth_chunk(n, records_string))
  word.index = n
  word.percentile = [(n - 0.5) / size, 0].max * 100
  word
end
#initialize_backend_variables ⇒ Object
23 24 25 26 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 23

# Set backend tunables: the KD-tree length-axis scaling factor and the
# assumed page/block size for padding (4 KiB).
def initialize_backend_variables
  @length_scaling_factor = 15
  @page_size = 4096
end
#inspect ⇒ Object
Show some information about the loaded ISAM file (sizes, offsets, prelude).
214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 214 def inspect super + "\n" + <<INSPECT File size: #{file_size(@file)} Word length: #{@word_length} Frequency bytes: #{@frequency_length} Total record bytes: #{@records_length} Offset of K-D Tree index: #{@offset_index1} Total K-D Tree index bytes: #{file_size(@file) - @offset_index1} K-D Tree Signature: #{file_range_read(@offset_index1..(@offset_index1+3))} Prelude: #{@prelude.map {|k,v| k=="stats" ? "" : " #{k}: #{v}\n" }.join("") } INSPECT end |
#len2coord(len) ⇒ Object
make the search space more square by increasing the length of the “word length” axis
114 115 116 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 114

# Scale a word length onto the KD-tree's length axis so the search
# space is roughly square relative to the 0-100 percentile axis.
def len2coord(len)
  scale = @length_scaling_factor || 10
  len * scale
end
#load_kdtree ⇒ Object
229 230 231 232 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 229

# Seek to the index offset recorded in the prelude and rehydrate the
# persisted K-D tree from the file handle.
def load_kdtree
  @file.seek(@offset_index1)
  KDTree.new(@file)
end
#nth_chunk(n, string) ⇒ Object
return a string representing the nth_record
273 274 275 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 273

# Slice the n-th fixed-size record out of +string+ and return it.
def nth_chunk(n, string)
  offset = @entry_length * n
  string[offset, @entry_length]
end
#openmode ⇒ Object
123 124 125 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 123

# File open mode for writing: binary/ASCII-8BIT on Rubies that support
# IO.binwrite, plain write mode otherwise.
def openmode
  if IO.respond_to?(:binwrite)
    "wb:ASCII-8BIT"
  else
    "w"
  end
end
#pad(size, io) ⇒ Object
94 95 96 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 94

# Write +size+ NUL padding bytes to +io+ (pack "xN" emits N nulls).
def pad(size, io)
  padding = [].pack("x#{size}")
  io.write(padding)
end
#page_size ⇒ Object
39 40 41 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 39

# Page/block size used for padding; defaults to 4 KiB when not set.
def page_size
  return @page_size if @page_size
  4096
end
#parse_prelude ⇒ Object
174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 174 def parse_prelude @file.seek 0 prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH) # byte offset of first record from beginning of file # total length of JSON string (without padding) (@record_offset, @prelude_len) = prelude_buf.unpack("NN") # read more if our initial read didn't slurp in the entire prelude if @prelude_len > prelude_buf.length prelude_buf += @file.read(@prelude_len - prelude_buf.length) end @prelude = JSON.parse( prelude_buf.unpack("@8a#{@prelude_len}")[0] ) || {} # includes prefix length byte @word_length = @prelude["wlen"] || raise(ArgumentError, "Word length is not defined!") # as network byte order int @frequency_length = @prelude["flen"] || 4 # total length of record @entry_length = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!") @offset_index1 = @prelude["offset_index1"] || raise(ArgumentError, "No index offset!") @records_length = @prelude["records_length"] || raise(ArgumentError, "No records length!") @entry_count = @prelude["n"] || raise(ArgumentError, "Number of records not included!") @length_scaling_factor = @prelude["length_scaling_factor"] || 10 load_stats_from_hash(@prelude["stats"]) if @prelude["stats"] @prelude end |
#parse_record(string, index = 0, word = CorrectHorseBatteryStaple::Word.new(:word => ""), length_range = nil) ⇒ Object
Parse a record into a Word object, which can be provided or will otherwise be constructed as needed fourth arg is a length range which can act as a filter; if not satisfied, nil will be returned
258 259 260 261 262 263 264 265 266 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 258

# Parse a record into a Word object, which can be provided or will
# otherwise be constructed as needed.  Fourth arg is a length range
# which can act as a filter; if not satisfied, nil will be returned.
#
# NOTE(review): the local holding the parsed [word, frequency] pair was
# stripped from the extracted listing (`= parse_record_into_array(...)`,
# `word.word = [0]`); restored here as +ary+ — verify against upstream.
def parse_record(string, index=0, word=CorrectHorseBatteryStaple::Word.new(:word => ""), length_range = nil)
  ary = parse_record_into_array(string, index, length_range)
  return nil unless ary
  word.word      = ary[0]
  word.frequency = ary[1]
  word
end
#parse_record_into_array(string, index, length_range = nil) ⇒ Object
Parse a record into an array of [word, frequency] IFF the word fits into the length_range or length_range is nil
241 242 243 244 245 246 247 248 249 250 251 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 241 def parse_record_into_array(string, index, length_range = nil) chunk = nth_chunk(index, string) raise "No chunk for index #{index}" unless chunk actual_word_length = chunk.unpack("C")[0] if !length_range || length_range.include?(actual_word_length) # returns [word, frequency] chunk.unpack("xa#{actual_word_length}@#{@word_length}N") else nil end end |
#percentile_index(percentile, round = true) ⇒ Object
rather than using a StatisticalArray, we do direct indexing into the file/string
402 403 404 405 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 402

# Map a percentile (0-100) onto a record index via direct arithmetic
# rather than a StatisticalArray; returns a rounded Integer unless
# +round+ is false, in which case the raw Float is returned.
def percentile_index(percentile, round=true)
  raw = percentile.to_f / 100 * count + 0.5
  round ? raw.round : raw
end
#pick(count, options = {}) ⇒ Object
optimized pick - does NOT support :filter, though
328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 328

# Optimized pick — does NOT support :filter, though.  Randomly samples
# (word_length, percentile) points and asks the K-D tree for the
# nearest record, accumulating +count+ distinct qualifying words.
# Raises when the criteria cannot be met within the iteration budget.
#
# NOTE(review): every occurrence of the `options` identifier was
# stripped from the extracted listing (`def pick(count, = {})`); it has
# been restored here — verify against the upstream source.
def pick(count, options = {})
  # incompat check
  raise NotImplementedError, "ISAM does not support :filter option" if options[:filter]

  options = {:percentile => 0..100, :word_length => 0..20}.merge(options)

  result = []
  found_indexes = []
  iterations = 0
  # bound the sampling loop so impossible criteria cannot spin forever
  max_iterations = [1000, 4 * count].max

  while (result.size < count && iterations < max_iterations)
    len = random_in_range(options[:word_length])
    pct = random_in_range(options[:percentile])
    word_idx = @kdtree.nearest(len2coord(len), pct)
    unless found_indexes.include?(word_idx)
      found_indexes << word_idx
      word = get_word_by_idx(word_idx)
      # nearest-neighbor can land outside the requested length range;
      # re-check before accepting the word
      if options[:word_length].include?(word.word.length)
        result << word
      else
        STDERR.puts "non-qualifying word: #{word.word.length}"
      end
    end
    iterations += 1
  end

  # validate that we succeeded
  raise "Cannot find #{count} words matching criteria" if result.length < count

  result
end
#pos_of_nth_word_in_file(n) ⇒ Object
277 278 279 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 277

# Absolute byte offset of the n-th record in the file: records start
# at @record_offset and are @entry_length bytes each.
# (Removed a useless `pos =` assignment to an unused local.)
def pos_of_nth_word_in_file(n)
  @record_offset + (n * @entry_length)
end
#precache(max = -1) ⇒ Object
Format of header:
0..3 - OB - offset of body start in bytes; network byte order 4..7 - LP - length of prelude in network byte order 8..OB-1 - P - JSON-encoded prelude hash and space padding OB..EOF - array of fixed size records as described in prelude
Contents of Prelude (after JSON decoding):
P - length of word part of record P - length of frequency part of record (always 4 bytes) P - length of total part of record P - number of records P - field name sorted by (word or frequency) P - corpus statistics P - absolute file offset of KDTree index P - length in bytes of records section, excluding padding P - what length was multiplied by in creating KDTree (usually 15)
Format of record:
1 byte - LW - actual length of word within field P bytes - LW bytes of word (W) + P-LW bytes of padding P (4) bytes - frequency as network byte order long
After record section, there is padding up to the next page_size boundary, and then there is a dumped KDTree which extends to EOF.
160 161 162 163 164 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 160

# Load the whole file into an in-memory StringIO so later reads avoid
# disk I/O.  When +max+ is non-negative and the file is larger than
# +max+ bytes, precaching is skipped.
def precache(max = -1)
  return if max > -1 && file_size(@file) > max
  @file.seek 0
  @file = StringIO.new(@file.read, "r")
end
#prelude ⇒ Object
170 171 172 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 170

# Memoized accessor for the decoded prelude hash; parses it from the
# file on first access.
def prelude
  @prelude ? @prelude : parse_prelude
end
#record_percentile_range_read(percentile_range) ⇒ Object
memoize :record_range_read
395 396 397 398 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 395

# Read the raw record bytes covering a percentile range: translate the
# percentile range into a record byte range, then read it.
def record_percentile_range_read(percentile_range)
  record_range_read(record_range_for_percentile(percentile_range))
end
#record_range_for_percentile(range) ⇒ Object
407 408 409 410 411 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 407 def record_range_for_percentile(range) range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric) (percentile_index(range.begin, false).floor * @entry_length ... percentile_index(range.end, false).ceil * @entry_length) end |
#record_range_read(record_range = nil) ⇒ Object
389 390 391 392 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 389 def record_range_read(record_range = nil) record_range ||= 0...records_size file_range_read((record_range.first + @record_offset)...(range_count(record_range) + @record_offset)) end |
#records_size ⇒ Object
file I/O
365 366 367 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 365 def records_size @records_length end |
#records_string ⇒ Object
returns a string representing the record-holding portion of the file
384 385 386 387 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 384

# Memoized string holding the record section of the file.
def records_string
  @records_string = record_range_read(0 ... records_size) unless @records_string
  @records_string
end
#round_up(val, blocksize = page_size) ⇒ Object
many MMUs in default mode and modern highcap drives have 4k pages/blocks
44 45 46 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 44

# Round +val+ up to the next multiple of +blocksize+, with a minimum
# of one full block (many MMUs and modern drives use 4k pages/blocks).
def round_up(val, blocksize=page_size)
  blocks = (val.to_f / blocksize).ceil
  blocks = 1 if blocks < 1
  blocks * blocksize
end
#size ⇒ Object
314 315 316 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 314

# Number of records, memoized; derived from the record-section size
# when the prelude did not already provide it.
def size
  return @entry_count if @entry_count
  @entry_count = records_size / @entry_length
end
#sorted_entries ⇒ Object
we presume that the ISAM file has been sorted
322 323 324 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 322

# Memoized entries; we presume the ISAM file has been written sorted,
# so no re-sort is performed here.
def sorted_entries
  @sorted_entries = entries unless @sorted_entries
  @sorted_entries
end
#word_length(chunk_string) ⇒ Object
268 269 270 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 268

# Actual length of the word stored in a record chunk: the first byte
# of every record is an unsigned length prefix.
# BUG FIX: unpack("C") returns a one-element Array; index it so the
# Integer is returned, consistent with parse_record_into_array's
# `chunk.unpack("C")[0]`.
def word_length(chunk_string)
  chunk_string.unpack("C")[0]
end
#write_corpus_to_io(corpus, io = STDOUT) ⇒ Object
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 48 def write_corpus_to_io(corpus, io=STDOUT) io.rewind # includes prefix length byte @word_length = corpus.reduce(0) { |m, e| m > e.word.length ? m : e.word.length } + 1 @freq_length = 4 @entry_length = @word_length + @freq_length stats = fix_stats(corpus.stats) corpus_word_count = corpus.length prelude = { "wlen" => @word_length, "flen" => 4, "entrylen" => @word_length + @freq_length, "sort" => "frequency", "n" => corpus_word_count, "stats" => stats, "flags" => 0, "length_scaling_factor" => (@length_scaling_factor || 15), "records_length" => "0000000000", "offset_records" => "0000000000", "offset_index1" => "0000000000", "offset_index2" => "0000000000" } prelude_json_length = prelude.to_json.length prelude["offset_records"] = offset_records = round_up(prelude_json_length+8.0) prelude["records_length"] = records_length = corpus_word_count * prelude["entrylen"] offset_index1 = prelude["offset_records"] + round_up(records_length, page_size) prelude["offset_index1"] = offset_index1 io.write([offset_records, prelude_json_length, prelude.to_json]. pack("NNA#{offset_records-8}")) corpus.each_with_index do |w, index| io.write(s=[w.word.length, w.word, w.frequency].pack("Ca#{@word_length-1}N")) end pad(offset_index1 - (offset_records + records_length), io) write_kdtree(corpus, io) end |
#write_kdtree(corpus, io) ⇒ Object
98 99 100 101 102 103 104 105 106 107 108 109 110 |
# File 'lib/correct_horse_battery_staple/backend/isam_kd.rb', line 98

# Build a K-D tree over (scaled word length, percentile) points, each
# tagged with its record index, and persist it to +io+.
def write_kdtree(corpus, io)
  points = corpus.entries.each_with_index.map do |w, idx|
    [len2coord(w.word.length.to_f), w.percentile.to_f, idx]
  end
  KDTree.new(points).persist(io)
end