Class: VCDIFF::Decoder

Inherits:
Object
  • Object
show all
Defined in:
lib/vcdiff.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(dictionary) ⇒ Decoder

Returns a new instance of Decoder.



15
16
17
18
19
20
21
22
23
24
25
# File 'lib/vcdiff.rb', line 15

# Builds a decoder around the dictionary stored at the given path.
#
# The dictionary is read in binary mode. The address caches are sized for
# 4 "near" slots and 3 * 256 "same" slots, and every slot starts at zero.
#
# @param dictionary [String] path of the dictionary file to read
def initialize(dictionary)
  @dictionary = File.read(dictionary, mode: "rb")

  @s_near, @s_same = 4, 3

  # every cache slot starts out as 0
  @near_cache = [0] * @s_near
  @same_cache = [0] * (@s_same * 256)
  @next_slot  = 0
end

Instance Attribute Details

#dictionary ⇒ Object

Returns the value of attribute dictionary.



13
14
15
# File 'lib/vcdiff.rb', line 13

# Reader for the dictionary this decoder was constructed with.
#
# @return [String] the contents that #initialize read from the dictionary path
def dictionary
  @dictionary
end

Instance Method Details

#decode(file) ⇒ Object

Decodes a delta file using the dictionary given to the decoder



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/vcdiff.rb', line 28

# Decodes a delta file using the dictionary given to the decoder.
#
# Only part of the format is handled: the header indicator must be 0 (which
# also rules out custom code tables), and every window must be a VCD_SOURCE
# window.
#
# @param file [String] path of the VCDIFF delta file
# @return [String] the reconstructed target data (binary string)
# @raise [NotImplementedError] if the delta uses an unsupported header
#   indicator, a custom code table, or a window that is not VCD_SOURCE
def decode(file)
  # The block form closes the handle as soon as parsing is done instead of
  # leaking it until GC; "rb" keeps the delta bytes untouched on platforms
  # that translate newlines in text mode.
  # NOTE(review): assumes DeltaFile.read consumes the stream eagerly -- confirm
  delta_file = File.open(file, "rb") { |io| DeltaFile.read(io) }

  if delta_file.header.header_indicator != 0
    raise NotImplementedError, "Header indicator of #{delta_file.header.header_indicator} can't be handled"
  end

  if delta_file.header.custom_codetable?
    raise NotImplementedError, "Unable to handle a custom codetable"
  end

  # there's no simple way to determine the number of windows, since the
  # count isn't given ahead of time, so we'll need to manually iterate
  # through the windows
  window_stream = StringIO.new(delta_file.windows)

  # String.new yields a mutable binary buffer, so appending keeps working
  # even when the file is loaded with frozen string literals
  target_file = String.new

  until window_stream.eof?
    # reads only one window's worth of bytes from the stream
    next_window = VCDIFFWindow.read(window_stream)

    if next_window.compressed_only? || next_window.target_data?
      raise NotImplementedError, "Can only handle VCD_SOURCE windows"
    end

    length, position = next_window.source_data_length, next_window.source_data_position

    # the segment of the dictionary this window is encoded against
    source_window = @dictionary[position, length]

    target_file << process_delta_encoding(source_window, next_window.delta_encoding)
  end

  target_file
end

#process_delta_encoding(source_window, delta_encoding) ⇒ Object

Takes a delta encoding and processes it against the source window.

This corresponds to section 6 of RFC 3284, which outlines processing the instructions, data, and addresses arrays.



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# File 'lib/vcdiff.rb', line 69

# Takes a delta encoding and processes it against the source window.
#
# This corresponds to section 6 of RFC 3284, which outlines processing the
# instructions, data and addresses arrays. Each code table entry holds a
# pair of instructions; both halves are executed, in order. COPY addresses
# refer to the source window followed by the target data decoded so far,
# so a COPY may read bytes that this same window produced earlier.
#
# The address caches are reset on entry, since RFC 3284 (5.1) starts every
# window with fresh caches.
#
# @param source_window [String, nil] source segment the window is encoded
#   against; nil is treated as an empty segment
# @param delta_encoding [#instructions, #add_run_data, #copy_address_data]
#   the three sections of the window, each convertible with #to_a to an
#   array of byte values
# @return [String] the decoded target window (binary string)
# @raise [ArgumentError] if the encoding is malformed: unknown instruction
#   type or address mode, zero-sized ADD/RUN, exhausted data or address
#   section, or a COPY address outside the data decoded so far
def process_delta_encoding(source_window, delta_encoding)
  # to_a is needed here to unwrap the BinData::Array, which doesn't
  # know about method calls like #shift
  instructions      = delta_encoding.instructions.to_a
  add_run_data      = delta_encoding.add_run_data.to_a
  copy_address_data = delta_encoding.copy_address_data.to_a

  code_table = CodeTable::DEFAULT_TABLE

  # every window starts with zeroed address caches
  @near_cache = Array.new(@s_near, 0)
  @same_cache = Array.new(@s_same * 256, 0)
  @next_slot = 0

  source = source_window.to_s

  # the final string for this window (mutable, binary)
  target_window = String.new

  add_run_index = 0

  until instructions.empty?
    # instructions is a sequence of tuples (index, [size1], [size2]),
    # where size1 and size2 existence depends on the instruction entry
    # which _index_ points to.
    index = instructions.shift

    # instruction pair looked up in the code table
    type1, size1, mode1, type2, size2, mode2 = code_table[index]

    # a size of 0 in the table means the real size follows in the stream;
    # both sizes are read before either instruction is executed
    size1 = read_int(instructions) if type1 != CodeTable::NOOP && size1 == 0
    size2 = read_int(instructions) if type2 != CodeTable::NOOP && size2 == 0

    pair = [[type1, size1, mode1]]
    # an entry that carries no second instruction at all is treated like one
    # whose second half is a NOOP
    pair << [type2, size2, mode2] unless type2.nil?

    pair.each do |type, size, mode|
      case type
      when CodeTable::NOOP
        next
      when CodeTable::RUN
        if mode != 0
          warn "Warning: RUN found with mode #{mode} -- value will be ignored"
        end

        if size == 0
          raise ArgumentError, "File contains a RUN instruction of size 0, must be > 0"
        end

        byte = add_run_data[add_run_index]
        if byte.nil?
          raise ArgumentError, "Invalid file format, RUN instruction has no data byte left to repeat"
        end

        # repeat a single byte `size` times
        target_window << ([byte] * size).pack("C*")
        add_run_index += 1
      when CodeTable::ADD
        if mode != 0
          warn "Warning: ADD found with mode #{mode} -- value will be ignored"
        end

        if size == 0
          raise ArgumentError, "File contains an ADD instruction of size 0, must be > 0"
        end

        bytes = add_run_data[add_run_index, size]
        if bytes.nil? || bytes.length < size
          raise ArgumentError, "Invalid file format, ADD instruction needs #{size} bytes but the data section is exhausted"
        end

        target_window << bytes.pack("C*")
        add_run_index += size
      when CodeTable::COPY
        # "here" is the current position in the combined address space:
        # the whole source segment plus everything decoded so far
        here = source.bytesize + target_window.bytesize
        addr = decode_copy_address(mode, here, copy_address_data)

        if addr < 0 || addr >= here
          raise ArgumentError, "Invalid file format, COPY address #{addr} is outside the decoded data (0...#{here})"
        end

        append_copied_bytes(target_window, source, addr, size)

        # now update the "near" and "same" caches.
        if @s_near > 0
          @near_cache[@next_slot] = addr
          @next_slot = (@next_slot + 1) % @s_near
        end

        if @s_same > 0
          @same_cache[addr % (@s_same * 256)] = addr
        end
      else
        raise ArgumentError, "Invalid file format, instruction of #{type} (found at index #{index}) doesn't exist"
      end
    end
  end

  target_window
end

# Decodes the address of one COPY instruction, following the address modes
# of RFC 3284 (5.3):
#
#   * mode 0 (VCD_SELF): the address itself is encoded as an integer
#   * mode 1 (VCD_HERE): "here - addr" is encoded as an integer
#   * near modes [2, s_near+1]: "addr - near[mode-2]" is encoded
#   * same modes [s_near+2, s_near+s_same+1]: a single byte b is encoded,
#     with addr == same[(mode - (s_near+2))*256 + b]
#
# Consumes the encoded address from the front of copy_address_data.
def decode_copy_address(mode, here, copy_address_data)
  case mode
  when 0 # VCD_SELF
    read_int(copy_address_data)
  when 1 # VCD_HERE
    here - read_int(copy_address_data)
  when 2..(@s_near + 1) # near modes
    read_int(copy_address_data) + @near_cache[mode - 2]
  when (@s_near + 2)..(@s_near + @s_same + 1) # same modes
    # address is encoded as a single byte
    b = copy_address_data.shift
    if b.nil?
      raise ArgumentError, "Invalid file format, COPY instruction has no address byte left"
    end

    @same_cache[(mode - (@s_near + 2)) * 256 + b]
  else
    raise ArgumentError, "invalid mode #{mode}"
  end
end

# Appends size bytes to target_window, reading from the address space made
# of source followed by target_window itself, starting at addr. The caller
# guarantees 0 <= addr < source.bytesize + target_window.bytesize, so each
# pass copies at least one byte; a copy that overlaps its own output simply
# repeats what it has just written.
def append_copied_bytes(target_window, source, addr, size)
  position  = addr
  remaining = size

  while remaining > 0
    chunk =
      if position < source.bytesize
        source.byteslice(position, [remaining, source.bytesize - position].min)
      else
        target_window.byteslice(position - source.bytesize, remaining)
      end

    target_window << chunk
    position  += chunk.bytesize
    remaining -= chunk.bytesize
  end
end

private :decode_copy_address, :append_copied_bytes

#read_int(array) ⇒ Object

Shifts a VCDIFF integer off the given array of bytes, modifying the array argument in-place.



200
201
202
203
204
205
206
207
# File 'lib/vcdiff.rb', line 200

# Shifts a VCDIFF integer off the given array of bytes, modifying the array
# argument in-place.
#
# The integer occupies every byte up to and including the first one whose
# most significant bit (bit 7) is 0; those bytes are removed from the front
# of the array and handed to VCDIFFInt for decoding.
#
# @param array [Array<#to_i>] byte values, consumed from the front
# @return [Integer] the decoded value
# @raise [ArgumentError] if no byte with a clear most significant bit is
#   found (empty or truncated input); the array is left untouched
def read_int(array)
  # first index where the MSB = index-7 bit is 0
  zero_msb_index = array.index { |e| e.to_i[7] == 0 }

  # without a terminating byte there is no complete integer to read
  if zero_msb_index.nil?
    raise ArgumentError, "Invalid file format, integer is missing its terminating byte"
  end

  int_bytes = array.shift(zero_msb_index + 1)

  VCDIFFInt.read(int_bytes.pack("C*")).to_i
end