Class: IOBlockReader::IOBlockReader

Inherits:
Object
  • Object
show all
Defined in:
lib/ioblockreader/ioblockreader.rb

Overview

Class giving a String-like interface over an IO, reading it by blocks. Very useful to access the content of big files as if it were a String containing the whole file’s content.

Instance Method Summary collapse

Constructor Details

#initialize(io, options = {}) ⇒ IOBlockReader

Constructor

Parameters
  • io (IO): The IO object used to give the String interface

  • options (map<Symbol,Object>): Additional options:

    • :block_size (Fixnum): The block size in bytes used internally. [default = 268435456]

    • :blocks_in_memory (Fixnum): Maximal number of blocks in memory. If it is required to load more blocks than this value for a single operation, this value is ignored. [default = 2]



16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/ioblockreader/ioblockreader.rb', line 16

# Constructor
#
# Parameters::
# * *io* (_IO_): The IO object used to give the String interface
# * *options* (<em>map<Symbol,Object></em>): Additional options:
#   * *:block_size* (_Integer_): The block size in bytes used internally. [default = 268435456]
#   * *:blocks_in_memory* (_Integer_): Maximal number of blocks in memory. [default = 2]
def initialize(io, options = {})
  # Keep a reference on the IO that is wrapped
  @io = io
  # Read the options, falling back on the defaults for the ones that are missing
  requested_block_size = options[:block_size]
  requested_blocks_in_memory = options[:blocks_in_memory]
  @block_size = requested_block_size ? requested_block_size : 268_435_456
  @blocks_in_memory = requested_blocks_in_memory ? requested_blocks_in_memory : 2
  # No block is known yet
  @blocks = Array.new
  # Nothing has been accessed yet: the quick-access cache used by [] starts empty
  @cached_block = nil
  @cached_block_end_offset = nil
end

Instance Method Details

#[](range) ⇒ Object

Get a subset of the data. DO NOT USE NEGATIVE INDEXES.

Parameters
  • range (Fixnum or Range): Range to extract

Result
  • String: The resulting data



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/ioblockreader/ioblockreader.rb', line 36

# Get a subset of the data. DO NOT USE NEGATIVE INDEXES.
#
# Parameters::
# * *range* (_Integer_ or _Range_): Offset of the character, or range of offsets, to extract. An exclusive Range (a...b) does not include offset b.
# Result::
# * _String_: The resulting data
def [](range)
  # Integer is tested instead of Fixnum: Fixnum does not exist anymore in Ruby >= 3.2,
  # and big offsets (formerly Bignum) must follow the same path.
  if range.is_a?(Integer)
    # Use the cache if possible
    if @cached_block && range >= @cached_block.offset && range < @cached_block_end_offset
      return @cached_block.data[range - @cached_block.offset]
    end
    # Separate this case for performance
    single_block_index, offset_in_block = range.divmod(@block_size)
    if (block = @blocks[single_block_index]).nil?
      read_needed_blocks([single_block_index], single_block_index, single_block_index)
      block = @blocks[single_block_index]
    else
      block.touch
    end
    set_cache_block(block)
    return block.data[offset_in_block]
  end

  first_offset = range.first
  last_offset = range.last
  # Honor exclusive ranges (a...b). respond_to? keeps accepting any object that only answers first and last.
  last_offset -= 1 if range.respond_to?(:exclude_end?) && range.exclude_end?
  # Empty range: nothing to extract (and the block computations below would not make sense)
  return '' if last_offset < first_offset

  # Use the cache if possible
  if @cached_block && first_offset >= @cached_block.offset && last_offset < @cached_block_end_offset
    return @cached_block.data[first_offset - @cached_block.offset..last_offset - @cached_block.offset]
  end

  first_block_index, first_offset_in_block = first_offset.divmod(@block_size)
  last_block_index, last_offset_in_block = last_offset.divmod(@block_size)
  if first_block_index == last_block_index
    # Everything is in a single block
    if (block = @blocks[first_block_index]).nil?
      read_needed_blocks([first_block_index], first_block_index, last_block_index)
      block = @blocks[first_block_index]
    else
      block.touch
    end
    set_cache_block(block)
    return block.data[first_offset_in_block..last_offset_in_block]
  end

  # Several blocks are needed: touch the ones already there, and get the indexes of the missing ones
  indexes_needing_loading = []
  (first_block_index..last_block_index).each do |block_index|
    if (block = @blocks[block_index]).nil?
      indexes_needing_loading << block_index
    else
      block.touch
    end
  end
  read_needed_blocks(indexes_needing_loading, first_block_index, last_block_index) unless indexes_needing_loading.empty?
  # Now read across the blocks: end of the first one, all the middle ones, beginning of the last one
  result = @blocks[first_block_index].data[first_offset_in_block..-1].dup
  (first_block_index + 1..last_block_index - 1).each do |block_index|
    result.concat(@blocks[block_index].data)
  end
  result.concat(@blocks[last_block_index].data[0..last_offset_in_block])
  # There are more chances that the last block will be accessed again. Cache this one.
  set_cache_block(@blocks[last_block_index])
  result
end

#each_block(range = 0) ⇒ Object

Iterate over blocks in the data. ! Do not use negative integers in the range.

Parameters
  • range (Range or Fixnum): The boundaries of the iteration, or the starting index [default = 0]

  • Block: Code called for each block encountered

    • Parameters
    • data (String): The data



216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
# File 'lib/ioblockreader/ioblockreader.rb', line 216

# Iterate over blocks in the data.
# ! Do not use negative integers in the range.
#
# Parameters::
# * *range* (_Range_ or _Integer_): The boundaries of the iteration, or the starting index. An exclusive Range (a...b) does not include offset b. [default = 0]
# * _Block_: Code called for each block encountered
#   * Parameters::
#   * *data* (_String_): The data
# Result::
# * _Enumerator_: Only when no code block is given: an Enumerator over the same pieces of data
def each_block(range = 0)
  # Without a code block there is nothing to yield to: give an Enumerator back instead of failing on yield
  return to_enum(:each_block, range) unless block_given?

  # Parse parameters
  begin_offset = range
  end_offset = nil
  if range.is_a?(Range)
    begin_offset = range.first
    end_offset = range.last
    # Honor exclusive ranges (a...b)
    end_offset -= 1 if range.exclude_end?
    # Empty range: nothing to iterate on
    return if end_offset < begin_offset
  end

  current_block_index, begin_offset_in_first_block = begin_offset.divmod(@block_size)
  # Without an end offset, both values stay nil, and the iteration only stops on the last block
  end_offset_block_index, end_offset_in_last_block = end_offset.nil? ? [nil, nil] : end_offset.divmod(@block_size)
  # Make sure first block is loaded
  if (current_block = @blocks[current_block_index]).nil?
    read_needed_blocks([current_block_index], current_block_index, current_block_index)
    current_block = @blocks[current_block_index]
  else
    current_block.touch
  end

  if current_block_index == end_offset_block_index
    # We have a Range in the same block: give the block itself (no copy) when it is entirely covered
    if begin_offset_in_first_block == 0 && end_offset_in_last_block == current_block.data.size - 1
      yield(current_block.data)
    else
      yield(current_block.data[begin_offset_in_first_block..end_offset_in_last_block])
    end
    return
  end

  # We need to loop, but consider first block differently as it might be partially given
  if begin_offset_in_first_block == 0
    yield(current_block.data)
  else
    yield(current_block.data[begin_offset_in_first_block..-1])
  end
  # Now loop on all subsequent blocks, until the end of the Range or the last block is reached
  until current_block.last_block?
    current_block_index += 1
    if (current_block = @blocks[current_block_index]).nil?
      read_needed_blocks([current_block_index], current_block_index, current_block_index)
      current_block = @blocks[current_block_index]
    else
      current_block.touch
    end
    if end_offset_block_index == current_block_index
      # We arrived on the last block of the Range
      if end_offset_in_last_block == current_block.data.size - 1
        yield(current_block.data)
      else
        yield(current_block.data[0..end_offset_in_last_block])
      end
      break
    end
    yield(current_block.data)
  end
end

#get_block_containing_offset(offset = 0) ⇒ Object

Get the block containing a given offset. This method is mainly used to provide some low-level access for processes needing great parsing performance.

Parameters
  • offset (Fixnum): The offset to be accessed [default = 0]

Return
  • String: The block of data containing this offset

  • Fixnum: The beginning offset of this data block

  • Boolean: Is this block the last one?



288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# File 'lib/ioblockreader/ioblockreader.rb', line 288

# Get the block containing a given offset.
# This method is mainly used to provide some low-level access for processes needing great parsing performance.
#
# Parameters::
# * *offset* (_Integer_): The offset to be accessed [default = 0]
# Result::
# * _String_: The block of data containing this offset
# * _Integer_: The beginning offset of this data block
# * _Boolean_: Is this block the last one?
def get_block_containing_offset(offset = 0)
  # Quick path: the block accessed last time already contains this offset
  cache_hit = (@cached_block != nil) && (offset >= @cached_block.offset) && (offset < @cached_block_end_offset)
  if cache_hit
    return [@cached_block.data, @cached_block.offset, @cached_block.last_block?]
  end

  # Cache miss: find which block holds the offset
  wanted_index = offset.divmod(@block_size).first
  wanted_block = @blocks[wanted_index]
  if wanted_block.nil?
    # Not known yet: ask for it, then pick it from the list
    read_needed_blocks([wanted_index], wanted_index, wanted_index)
    wanted_block = @blocks[wanted_index]
  else
    wanted_block.touch
  end
  # Remember this block for the next accesses
  set_cache_block(wanted_block)
  [wanted_block.data, wanted_block.offset, wanted_block.last_block?]
end

#index(token, offset = 0, max_size_regexp = 32) ⇒ Object

Perform a search of a token (or a list of tokens) in the IO. Warning: The token(s) to be found have to be smaller than the block size given to the constructor, otherwise they won’t be found (you’ve been warned!). If you really need to search for tokens bigger than the block size, extract the data using the [] operator first, and then use index on it; this will however make a complete copy of the data in memory prior to searching for the tokens.

Parameters
  • token (String, Regexp or list<Object>): Token to be found. Can be a list of tokens. Please note that using a list of tokens is slower than using a single Regexp.

  • offset (Fixnum): Offset starting the search [optional = 0]

  • max_size_regexp (Fixnum): Maximal number of characters the match should take in case of a Regexp token. Ignored if token is a String. [optional = 32]

Result
  • Fixnum: Index of the token (or the first one found from the given token list), or nil if none found.

  • Fixnum: In case token was an Array, return the index of the matching token in the array, or nil if none found.



104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# File 'lib/ioblockreader/ioblockreader.rb', line 104

# Perform a search of a token (or a list of tokens) in the IO.
# The block containing the starting offset is searched first. Then each following block is searched,
# along with the junction between consecutive blocks, so that a token lying across 2 blocks is found too.
# Warning: The token(s) to be found have to be smaller than the block size given to the constructor, otherwise they won't be found.
#
# Parameters::
# * *token* (_String_, _Regexp_ or <em>list<Object></em>): Token to be found. Can be a list of tokens. Using a list of tokens is slower than using a single Regexp.
# * *offset* (_Integer_): Offset starting the search. A match beginning before this offset is never returned. [optional = 0]
# * *max_size_regexp* (_Integer_): Maximal number of characters the match should take in case of a Regexp token. Ignored if token is a String. [optional = 32]
# Result::
# * _Integer_: Index of the token (or the first one found from the given token list), or nil if none found.
# * _Integer_: In case token was an Array, return the index of the matching token in the array, or nil if none found.
def index(token, offset = 0, max_size_regexp = 32)
  token_is_array = token.is_a?(Array)
  # Search the token(s) in a String from a given position.
  # Gives [index found (or nil), index of the matching token in the list (nil unless a list token matched)].
  # For a list, the token matching at the lowest index wins (the first one of the list on a tie).
  search = lambda do |data, start|
    if token_is_array
      best_index = nil
      best_token_idx = nil
      token.each_with_index do |token2, idx|
        found = data.index(token2, start)
        if found && (best_index.nil? || found < best_index)
          best_index = found
          best_token_idx = idx
        end
      end
      [best_index, best_token_idx]
    else
      [data.index(token, start), nil]
    end
  end

  # Separate the trivial algo for performance reasons: look in the block containing offset first
  current_block_index, offset_in_current_block = offset.divmod(@block_size)
  if (current_block = @blocks[current_block_index]).nil?
    read_needed_blocks([current_block_index], current_block_index, current_block_index)
    current_block = @blocks[current_block_index]
  else
    current_block.touch
  end
  index_in_block, index_matching_token = search.call(current_block.data, offset_in_current_block)

  result = nil
  if index_in_block
    result = current_block.offset + index_in_block
  else
    # We have to search further: across blocks
    # Compute the size of the token to be searched (the biggest one for a list)
    token_size =
      if token_is_array
        token.map { |token2| token2.is_a?(String) ? token2.size : max_size_regexp }.max || 0
      elsif token.is_a?(String)
        token.size
      else
        max_size_regexp
      end
    # Position in the current block before which matches have to be ignored.
    # Only relevant for the block containing offset: following blocks are searched from their beginning.
    min_offset_in_current_block = offset_in_current_block
    # Loop on subsequent blocks to search for token
    until result || current_block.last_block?
      # Check that next block is loaded
      if (next_block = @blocks[current_block_index + 1]).nil?
        read_needed_blocks([current_block_index + 1], current_block_index + 1, current_block_index + 1)
        next_block = @blocks[current_block_index + 1]
      else
        next_block.touch
      end
      index_in_block = nil
      # Get data across the 2 blocks if needed: enough to search for token_size data only
      if token_size > 1
        # Never take more than what the current block contains (slicing past its beginning would give nil)
        tail_size = [token_size - 1, current_block.data.size].min
        tail_start = current_block.data.size - tail_size
        cross_data = current_block.data[tail_start..-1] + next_block.data[0..token_size - 2]
        # Skip the part of the junction lying before the requested offset
        cross_search_start = [min_offset_in_current_block - tail_start, 0].max
        index_in_block, index_matching_token = search.call(cross_data, cross_search_start)
        # assumes next_block.offset is right after the end of the current block - TODO confirm
        result = next_block.offset - tail_size + index_in_block if index_in_block
      end
      next unless result.nil?
      # Search in the next block
      index_in_block, index_matching_token = search.call(next_block.data, 0)
      if index_in_block
        result = next_block.offset + index_in_block
      else
        # Loop on the next block
        current_block_index += 1
        current_block = next_block
        min_offset_in_current_block = 0
      end
    end
  end

  token_is_array ? [result, index_matching_token] : result
end