Module: CorrectHorseBatteryStaple::Backend::Isam::InstanceMethods
- Defined in:
- lib/correct_horse_battery_staple/backend/isam.rb
Instance Method Summary collapse
- #binwrite(*args) ⇒ Object
-
#each(&block) ⇒ Object
some core Enumerable building blocks.
- #file_range_read(file_range = nil) ⇒ Object
- #file_size(file) ⇒ Object
- #file_string ⇒ Object
- #fix_stats(stats) ⇒ Object
- #get_word_by_idx(n) ⇒ Object
- #initialize_backend_variables ⇒ Object
-
#inspect ⇒ Object
Show some information about the ISAM file (sizes and prelude contents).
-
#nth_chunk(n, string) ⇒ Object
return a string representing the nth_record.
- #openmode ⇒ Object
- #pad(size, io) ⇒ Object
- #page_size ⇒ Object
- #parse_prelude ⇒ Object
-
#parse_record(string, index = 0, word = CorrectHorseBatteryStaple::Word.new(:word => ""), length_range = nil) ⇒ Object
Parse a record into a Word object, which can be provided or will otherwise be constructed as needed. The fourth argument is a length range which can act as a filter; if it is not satisfied, nil will be returned.
-
#parse_record_into_array(string, index, length_range = nil) ⇒ Object
Parse a record into an array of [word, frequency] IFF the word fits into the length_range or length_range is nil.
-
#percentile_index(percentile, round = true) ⇒ Object
rather than using a StatisticalArray, we do direct indexing into the file/string.
- #pos_of_nth_word_in_file(n) ⇒ Object
-
#precache(max = -1) ⇒ Object
Format of header:.
- #prelude ⇒ Object
-
#record_percentile_range_read(percentile_range) ⇒ Object
memoize :record_range_read.
- #record_range_for_percentile(range) ⇒ Object
- #record_range_read(record_range = nil) ⇒ Object
-
#records_size ⇒ Object
file I/O.
-
#records_string ⇒ Object
returns a string representing the record-holding portion of the file.
-
#round_up(val, blocksize = page_size) ⇒ Object
many MMUs in default mode and modern highcap drives have 4k pages/blocks.
- #size ⇒ Object
-
#sorted_entries ⇒ Object
we presume that the ISAM file has been sorted.
- #word_length(chunk_string) ⇒ Object
- #write_corpus_to_io(corpus, io = STDOUT) ⇒ Object
Instance Method Details
#binwrite(*args) ⇒ Object
92 93 94 95 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 92
# Forwards +args+ to the object returned by +io+, calling its +binwrite+
# method when it responds to one and its +write+ method otherwise.
#
# @param args arguments passed through unchanged to the chosen method
# @return whatever the chosen writer method returns
def binwrite(*args)
  if io.respond_to?(:binwrite)
    io.send(:binwrite, *args)
  else
    io.send(:write, *args)
  end
end
#each(&block) ⇒ Object
some core Enumerable building blocks
257 258 259 260 261 262 263 264 265 266 267 268 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 257
# Yields every record of the records section as a word object, in index
# order. Each yielded word has its +index+ set to the record index and its
# +percentile+ set to <tt>[(index - 0.5) / size, 0].max * 100</tt>.
#
# The loop runs over indexes 0...size; the previous bound of
# <tt>index < size - 1</tt> silently skipped the last record.
#
# @yield [word] the object returned by +parse_record+ for each index
# @return [nil]
def each(&block)
  string = records_string
  total = size
  index = 0
  while index < total
    word = parse_record(string, index)
    word.index = index
    word.percentile = [(index - 0.5) / total, 0].max * 100
    yield word
    index += 1
  end
end
#file_range_read(file_range = nil) ⇒ Object
292 293 294 295 296 297 298 299 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 292
# Reads the bytes of +@file+ covered by +file_range+ and then restores the
# file position that was current on entry.
#
# @param file_range [Range, nil] byte range to read; when nil (or false) the
#   range 0...file_size(@file) is used
# @return the result of <tt>@file.read</tt> for the range's start and
#   +range_count+ length
def file_range_read(file_range = nil)
  file_range = (0...file_size(@file)) unless file_range
  pos = @file.tell
  start = file_range.first
  @file.seek(start)
  length = range_count(file_range)
  @file.read(length)
ensure
  # Put the read position back no matter how the read went.
  @file.seek(pos)
end
#file_size(file) ⇒ Object
140 141 142 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 140
# Returns the size of +file+: its own +size+ when it responds to that
# method, otherwise the +size+ of its +stat+ result.
#
# @param file an object responding to +size+ or to +stat+
# @return the reported size
def file_size(file)
  return file.size if file.respond_to?(:size)

  file.stat.size
end
#file_string ⇒ Object
288 289 290 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 288
# Returns the whole content of +@file+ as a string: the backing string when
# +@file+ is a StringIO, otherwise the result of +file_range_read(nil)+.
def file_string
  return @file.string if @file.is_a?(StringIO)

  file_range_read(nil)
end
#fix_stats(stats) ⇒ Object
27 28 29 30 31 32 33 34 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 27
# Replaces, in place, every value of +stats+ that responds to +nan?+ and is
# NaN with -1, then returns +stats+.
#
# @param stats a key/value collection (iterated with +each+, updated with +[]=+)
# @return the same +stats+ object
def fix_stats(stats)
  stats.each do |key, value|
    next unless value.respond_to?(:nan?) && value.nan?

    stats[key] = -1
  end
  stats
end
#get_word_by_idx(n) ⇒ Object
247 248 249 250 251 252 253 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 247
# Fetches record number +n+ from the records string, parses it, and stamps
# the resulting word with its index and percentile.
#
# @param n record index
# @return the object returned by +parse_record+, with +index+ and
#   +percentile+ assigned
def get_word_by_idx(n)
  word = parse_record(nth_chunk(n, records_string))
  word.index = n
  word.percentile = [(n - 0.5) / size, 0].max * 100
  word
end
#initialize_backend_variables ⇒ Object
22 23 24 25 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 22
# Sets the backend's starting values: a length scaling factor of 15 and a
# page size of 4096.
#
# @return [Integer] 4096 (the value of the last assignment)
def initialize_backend_variables
  @length_scaling_factor = 15
  @page_size = 4096
end
#inspect ⇒ Object
Show some information about the ISAM file (sizes and prelude contents)
189 190 191 192 193 194 195 196 197 198 199 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 189
# Appends a summary to the inherited +inspect+ string: file size, word
# length, frequency bytes, total record bytes, and every prelude entry
# except "stats".
#
# NOTE(review): the heredoc's original line breaks were lost when this page
# was extracted; the layout below is a reconstruction — confirm against the
# real source file.
def inspect
  super + "\n" + <<INSPECT
File size: #{file_size(@file)}
Word length: #{@word_length}
Frequency bytes: #{@frequency_length}
Total record bytes: #{@records_length}
Prelude:
#{@prelude.map {|k,v| k=="stats" ? "" : " #{k}: #{v}\n" }.join("") }
INSPECT
end
#nth_chunk(n, string) ⇒ Object
return a string representing the nth_record
239 240 241 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 239
# Returns the slice of +string+ holding record number +n+: +@entry_length+
# elements starting at <tt>@entry_length * n</tt>.
#
# @param n record index
# @param string the records data to slice
# @return the slice, or whatever <tt>string[start, length]</tt> gives when
#   the start lies outside +string+
def nth_chunk(n, string)
  start = @entry_length * n
  string[start, @entry_length]
end
#openmode ⇒ Object
97 98 99 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 97
# Returns the mode string for opening an output file: "wb:ASCII-8BIT" when
# the IO class responds to +binwrite+, plain "w" otherwise.
#
# @return [String]
def openmode
  return "wb:ASCII-8BIT" if IO.respond_to?(:binwrite)

  "w"
end
#pad(size, io) ⇒ Object
88 89 90 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 88
# Writes +size+ NUL bytes to +io+.
#
# @param size number of padding bytes
# @param io object responding to +write+
# @return whatever <tt>io.write</tt> returns
def pad(size, io)
  # An "x<size>" pack directive on an empty array yields <size> NUL bytes.
  padding = [].pack("x#{size}")
  io.write(padding)
end
#page_size ⇒ Object
36 37 38 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 36
# Returns the configured page size, or 4096 when +@page_size+ is unset.
def page_size
  return @page_size if @page_size

  4096
end
#parse_prelude ⇒ Object
148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 148
# Reads and decodes the file header from +@file+: two network-order 32-bit
# integers (record offset, JSON length) followed by the JSON-encoded prelude
# hash. The decoded values are stored in instance variables.
#
# @return [Hash] the decoded prelude
# @raise [ArgumentError] when "wlen", "entrylen" or "n" is missing
def parse_prelude
  @file.seek 0
  prelude_buf = @file.read(INITIAL_PRELUDE_LENGTH)

  # byte offset of first record from beginning of file
  # total length of JSON string (without padding)
  (@record_offset, @prelude_len) = prelude_buf.unpack("NN")

  # read more if our initial read didn't slurp in the entire prelude.
  # The JSON starts after the two 4-byte header integers, so the buffer must
  # hold 8 + @prelude_len bytes; comparing against @prelude_len alone left
  # the last 8 bytes of a long prelude unread.
  needed = 8 + @prelude_len
  if needed > prelude_buf.length
    prelude_buf += @file.read(needed - prelude_buf.length)
  end

  @prelude = JSON.parse(prelude_buf.unpack("@8a#{@prelude_len}")[0]) || {}

  # includes prefix length byte
  @word_length = @prelude["wlen"] || raise(ArgumentError, "Word length is not defined!")

  # as network byte order int
  @frequency_length = @prelude["flen"] || 4

  # total length of record
  @entry_length = @prelude["entrylen"] || raise(ArgumentError, "Prelude does not include entrylen!")

  @offset_index1 = @prelude["offset_index1"]
  @offset_index2 = @prelude["offset_index2"]

  @entry_count = @prelude["n"] || raise(ArgumentError, "Number of records not included!")

  # fall back to entry size times entry count when the prelude omits it
  @records_length = @prelude["records_length"] || (@entry_length * @entry_count)

  @length_scaling_factor = @prelude["length_scaling_factor"] || 10

  load_stats_from_hash(@prelude["stats"]) if @prelude["stats"]

  @prelude
end
#parse_record(string, index = 0, word = CorrectHorseBatteryStaple::Word.new(:word => ""), length_range = nil) ⇒ Object
Parse a record into a Word object, which can be provided or will otherwise be constructed as needed. The fourth argument is a length range which can act as a filter; if it is not satisfied, nil will be returned.
224 225 226 227 228 229 230 231 232 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 224
# Parses record +index+ of +string+ into +word+, filling in its +word+ and
# +frequency+ attributes.
#
# @param string records data passed on to +parse_record_into_array+
# @param index record index within +string+
# @param word object to populate; a fresh Word is built when omitted
# @param length_range optional filter passed on to +parse_record_into_array+
# @return the populated +word+, or nil when +parse_record_into_array+
#   returns nil (the length filter was not satisfied)
def parse_record(string, index=0, word=CorrectHorseBatteryStaple::Word.new(:word => ""),
                 length_range = nil)
  fields = parse_record_into_array(string, index, length_range)
  return nil unless fields

  # fields is [word, frequency]
  word.word = fields[0]
  word.frequency = fields[1]
  word
end
#parse_record_into_array(string, index, length_range = nil) ⇒ Object
Parse a record into an array of [word, frequency] IFF the word fits into the length_range or length_range is nil
207 208 209 210 211 212 213 214 215 216 217 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 207
# Decodes record +index+ of +string+ into a two-element array.
#
# The record's first byte holds the actual word length; the frequency is a
# 32-bit network-order integer located at byte offset +@word_length+.
#
# @param string records data
# @param index record index within +string+
# @param length_range optional filter; when given, the stored word length
#   must be included in it
# @return [Array, nil] [word, frequency], or nil when the filter rejects
#   the record
# @raise [RuntimeError] when +nth_chunk+ yields nothing for +index+
def parse_record_into_array(string, index, length_range = nil)
  record = nth_chunk(index, string)
  raise "No chunk for index #{index}" unless record

  stored_length = record.unpack("C").first
  return nil if length_range && !length_range.include?(stored_length)

  # skip the length byte, take the word bytes, jump to the frequency field
  record.unpack("xa#{stored_length}@#{@word_length}N")
end
#percentile_index(percentile, round = true) ⇒ Object
rather than using a StatisticalArray, we do direct indexing into the file/string
321 322 323 324 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 321
# Converts a percentile into a record position, computed as
# <tt>percentile / 100 * count + 0.5</tt>.
#
# @param percentile value converted with +to_f+
# @param round when truthy the position is rounded to an Integer
# @return [Integer, Float] the rounded or raw position
def percentile_index(percentile, round=true)
  position = percentile.to_f / 100 * count + 0.5
  return position.round if round

  position
end
#pos_of_nth_word_in_file(n) ⇒ Object
243 244 245 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 243
# Returns the file offset of record number +n+: the record section offset
# plus +n+ whole entries. (The unused local the result was assigned to has
# been removed.)
#
# @param n record index
# @return the offset <tt>@record_offset + n * @entry_length</tt>
def pos_of_nth_word_in_file(n)
  @record_offset + (n * @entry_length)
end
#precache(max = -1) ⇒ Object
Format of header:
0..3 - OB - offset of body start in bytes; network byte order 4..7 - LP - length of prelude in network byte order 8..OB-1 - P - JSON-encoded prelude hash and space padding OB..EOF - array of fixed size records as described in prelude
Contents of Prelude (after JSON decoding):
P["wlen"] - length of word part of record; P["flen"] - length of frequency part of record (always 4 bytes); P["entrylen"] - length of total part of record; P["n"] - number of records; P["sort"] - field name sorted by (word or frequency); P["stats"] - corpus statistics; P["offset_index1"] - absolute file offset of KDTree index; P["records_length"] - length in bytes of records section, excluding padding; P["length_scaling_factor"] - what length was multiplied by in creating KDTree (usually 15)
Format of record:
1 byte - LW - actual length of word within field (the code packs it with "C", and P["wlen"] includes this prefix byte); P["wlen"]-1 bytes - LW bytes of word (W) + P["wlen"]-1-LW bytes of padding; P["flen"] (4) bytes - frequency as network byte order long
After record section, there is padding up to the next page_size boundary, and then there is a dumped KDTree which extends to EOF.
134 135 136 137 138 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 134
# Loads the whole of +@file+ into memory and replaces +@file+ with a
# read-only StringIO over that content.
#
# @param max size limit; when greater than -1 and the file is larger than
#   +max+, nothing is done
# @return [StringIO, nil] the new +@file+, or nil when skipped
def precache(max = -1)
  over_limit = max > -1 && file_size(@file) > max
  return if over_limit

  @file.seek(0)
  contents = @file.read
  @file = StringIO.new(contents, "r")
end
#prelude ⇒ Object
144 145 146 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 144
# Returns the already-parsed prelude, or the result of +parse_prelude+ when
# +@prelude+ is not set.
def prelude
  return @prelude if @prelude

  parse_prelude
end
#record_percentile_range_read(percentile_range) ⇒ Object
memoize :record_range_read
314 315 316 317 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 314
# Reads the records covered by +percentile_range+ by converting it with
# +record_range_for_percentile+ and passing the result to
# +record_range_read+.
#
# @param percentile_range value accepted by +record_range_for_percentile+
# @return the result of +record_range_read+
def record_percentile_range_read(percentile_range)
  record_range_read(record_range_for_percentile(percentile_range))
end
#record_range_for_percentile(range) ⇒ Object
326 327 328 329 330 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 326
# Converts a percentile range into an end-exclusive range of offsets within
# the records section.
#
# The start is the floor of the unrounded +percentile_index+ of the range's
# beginning, the end is the ceiling of that of the range's end, each
# multiplied by +@entry_length+.
#
# @param range [Range, Numeric] a Numeric is widened to
#   (range - 0.5)..(range + 0.5)
# @return [Range]
def record_range_for_percentile(range)
  if range.is_a?(Numeric)
    center = range
    range = (center - 0.5)..(center + 0.5)
  end

  first_offset = percentile_index(range.begin, false).floor * @entry_length
  last_offset = percentile_index(range.end, false).ceil * @entry_length
  first_offset...last_offset
end
#record_range_read(record_range = nil) ⇒ Object
308 309 310 311 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 308
# Reads the part of the records section covered by +record_range+, whose
# positions are relative to the start of the records section.
#
# The range is shifted by +@record_offset+ to get file positions. The end of
# the file range is start + length; it used to be length + offset, which is
# only right for ranges starting at 0.
#
# @param record_range [Range, nil] defaults to 0...records_size
# @return the result of +file_range_read+ for the shifted range
def record_range_read(record_range = nil)
  record_range ||= 0...records_size
  start = record_range.first + @record_offset
  file_range_read(start...(start + range_count(record_range)))
end
#records_size ⇒ Object
file I/O
284 285 286 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 284
# Returns the stored length of the records section (+@records_length+).
def records_size
  @records_length
end
#records_string ⇒ Object
returns a string representing the record-holding portion of the file
303 304 305 306 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 303
# Returns the record-holding portion of the file as a string, reading it on
# first use and caching it in +@records_string+.
def records_string
  return @records_string if @records_string

  @records_string = record_range_read(0...records_size)
end
#round_up(val, blocksize = page_size) ⇒ Object
many MMUs in default mode and modern highcap drives have 4k pages/blocks
41 42 43 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 41
# Rounds +val+ up to a whole number of blocks, with a minimum of one block.
#
# @param val the size to round
# @param blocksize block size; defaults to +page_size+
# @return the block count (at least 1) multiplied by +blocksize+
def round_up(val, blocksize=page_size)
  blocks = (val.to_f / blocksize).ceil
  blocks = 1 if blocks < 1
  blocks * blocksize
end
#size ⇒ Object
270 271 272 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 270
# Returns the number of records: the cached +@entry_count+ when set,
# otherwise +records_size+ divided by +@entry_length+ (then cached).
def size
  return @entry_count if @entry_count

  @entry_count = records_size / @entry_length
end
#sorted_entries ⇒ Object
we presume that the ISAM file has been sorted
278 279 280 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 278
# Returns +entries+, cached in +@sorted_entries+. No sorting is done here:
# the ISAM file is presumed to be sorted already.
def sorted_entries
  return @sorted_entries if @sorted_entries

  @sorted_entries = entries
end
#word_length(chunk_string) ⇒ Object
234 235 236 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 234
# Unpacks the first byte of +chunk_string+ as an unsigned 8-bit integer.
#
# NOTE(review): this returns the one-element Array from +unpack+, not a
# bare Integer, whereas +parse_record_into_array+ takes element [0] of the
# same unpack — confirm whether callers expect the Array.
#
# @param chunk_string [String] a record chunk
# @return [Array] one-element array holding the byte value (nil element
#   for an empty string)
def word_length(chunk_string)
  chunk_string.unpack("C")
end
#write_corpus_to_io(corpus, io = STDOUT) ⇒ Object
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
# File 'lib/correct_horse_battery_staple/backend/isam.rb', line 45
# Serializes +corpus+ to +io+: an 8-byte header (record offset and prelude
# JSON length, both network byte order), the JSON prelude padded with spaces
# up to the record offset, then one fixed-size record per word.
#
# Word sizes are measured in bytes (+bytesize+), not characters, because the
# record layout and the reader are byte-based. With character counts a word
# containing multibyte characters got too small a length prefix and field,
# and was truncated when read back.
#
# @param corpus collection of entries responding to +word+ and +frequency+;
#   it must also respond to +stats+ and +length+
# @param io writable, rewindable stream (defaults to STDOUT)
# @return the result of <tt>corpus.each_with_index</tt>
def write_corpus_to_io(corpus, io=STDOUT)
  io.rewind

  # includes prefix length byte
  @word_length = corpus.reduce(0) { |max, entry| [max, entry.word.bytesize].max } + 1
  @freq_length = 4
  @entry_length = @word_length + @freq_length

  stats = fix_stats(corpus.stats)
  corpus_word_count = corpus.length

  # The "0000000000" placeholders reserve room in the JSON: its length is
  # measured before the real offsets are filled in below.
  prelude = {
    "wlen" => @word_length,
    "flen" => 4,
    "entrylen" => @word_length + @freq_length,
    "sort" => "frequency",
    "n" => corpus_word_count,
    "stats" => stats,
    "flags" => 0,
    "length_scaling_factor" => (@length_scaling_factor || 15),
    "records_length" => "0000000000",
    "offset_records" => "0000000000",
    "offset_index1" => "0000000000",
    "offset_index2" => "0000000000"
  }

  prelude_json_length = prelude.to_json.length
  prelude["offset_records"] = offset_records = round_up(prelude_json_length+8.0)
  prelude["records_length"] = records_length = corpus_word_count * prelude["entrylen"]
  prelude["offset_index1"] = offset_records + round_up(records_length, page_size)

  # "A<n>" space-pads the JSON so the records start exactly at offset_records
  io.write([offset_records, prelude_json_length, prelude.to_json].
           pack("NNA#{offset_records-8}"))

  # length byte, NUL-padded word field, network-order frequency
  record_format = "Ca#{@word_length-1}N"
  corpus.each_with_index do |w, _index|
    io.write([w.word.bytesize, w.word, w.frequency].pack(record_format))
  end
end