Class: Keep::Manifest

Inherits:
Object
  • Object
show all
Defined in:
lib/arvados/keep.rb

Constant Summary collapse

STREAM_TOKEN_REGEXP =
/^([^\000-\040\\]|\\[0-3][0-7][0-7])+$/
STREAM_NAME_REGEXP =
/^(\.)(\/[^\/]+)*$/
EMPTY_DIR_TOKEN_REGEXP =

Special case: the one file token that is allowed to use ‘.’ as its file name (a zero-length placeholder for an empty directory)

/^0:0:\.$/
FILE_TOKEN_REGEXP =
/^[[:digit:]]+:[[:digit:]]+:([^\000-\040\\]|\\[0-3][0-7][0-7])+$/
FILE_NAME_REGEXP =
/^[[:digit:]]+:[[:digit:]]+:([^\/]+(\/[^\/]+)*)$/
NON_8BIT_ENCODED_CHAR =
/[^\\]\\[4-7][0-7][0-7]/

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(manifest_text) ⇒ Manifest

Class to parse a manifest text and provide common views of that data.



114
115
116
117
# File 'lib/arvados/keep.rb', line 114

# Wrap a manifest text so it can be queried through the instance methods.
#
# manifest_text:: the manifest text; it is stored as given, without being
#                 validated or copied here.
def initialize(manifest_text)
  @text = manifest_text
  # Cache for the per-file summary built by #files; nil until first use.
  @files = nil
end

Class Method Details

.unescape(s) ⇒ Object



140
141
142
143
144
145
146
147
148
149
150
151
152
# File 'lib/arvados/keep.rb', line 140

# Decode the backslash escapes used in a Keep manifest stream or file name.
#
# A doubled backslash becomes a single backslash, and a backslash followed by
# three octal digits becomes the character with that code. All other text is
# copied through unchanged.
#
# s:: the escaped token, or nil.
#
# Returns the decoded string, or nil when +s+ is nil.
def self.unescape(s)
  return nil if s.nil?

  s.gsub(/\\(\\|[0-7]{3})/) do
    escape = Regexp.last_match(1)
    if escape == '\\'
      '\\'
    else
      # Three octal digits: convert the code to a one-character string.
      escape.to_i(8).chr
    end
  end
end

.valid?(manifest) ⇒ Boolean

Returns:

  • (Boolean)


303
304
305
306
307
308
309
310
# File 'lib/arvados/keep.rb', line 303

# Report whether +manifest+ passes validate! without raising.
#
# Returns true when validate! completes, and false when it raises
# ArgumentError. Any other exception raised by validate! is not rescued here.
def self.valid? manifest
  validate! manifest
  true
rescue ArgumentError
  false
end

.validate!(manifest) ⇒ Object

Verify that a given manifest is valid according to dev.arvados.org/projects/arvados/wiki/Keep_manifest_format



252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
# File 'lib/arvados/keep.rb', line 252

# Verify that a manifest text is valid according to
# dev.arvados.org/projects/arvados/wiki/Keep_manifest_format
#
# Every line must hold, separated by single spaces and ended by a newline:
# one stream name token, one or more locator tokens, and one or more file
# tokens.
#
# manifest:: the manifest text to check.
#
# Returns true for an empty or valid manifest.
# Raises ArgumentError describing the first problem found otherwise.
def self.validate! manifest
  raise ArgumentError.new "No manifest found" if !manifest

  return true if manifest.empty?

  raise ArgumentError.new "Invalid manifest: does not end with newline" if !manifest.end_with?("\n")
  line_count = 0
  manifest.each_line do |line|
    line_count += 1

    # Drop the trailing newline before splitting the line into tokens.
    words = line[0..-2].split(/ /)
    raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing stream name" if words.empty?

    count = 0

    # First token: the stream name.
    word = words.shift
    raise ArgumentError.new "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed on stream token #{word.inspect}" if word =~ NON_8BIT_ENCODED_CHAR
    unescaped_word = unescape(word)
    # The unescaped name must also be free of '.' and '..' path components.
    count += 1 if word =~ STREAM_TOKEN_REGEXP and unescaped_word =~ STREAM_NAME_REGEXP and unescaped_word !~ /\/\.\.?(\/|$)/
    raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid stream name #{word.inspect if word}" if count != 1

    # Next: one or more locator tokens.
    count = 0
    word = words.shift
    while word =~ Locator::LOCATOR_REGEXP
      word = words.shift
      count += 1
    end
    raise ArgumentError.new "Manifest invalid for stream #{line_count}: missing or invalid locator #{word.inspect if word}" if count == 0

    # Finally: one or more file tokens, up to the end of the line.
    count = 0
    while word
      # Check every file token, not only the first one, before unescaping it:
      # an octal escape above \377 makes unescape raise RangeError, which
      # callers that rescue ArgumentError (such as valid?) would not catch.
      raise ArgumentError.new "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed on file token #{word.inspect}" if word =~ NON_8BIT_ENCODED_CHAR
      unescaped_word = unescape(word)
      # $~ below refers to the FILE_NAME_REGEXP match; its first group is the
      # file path, which must not contain '.' or '..' components.
      is_file_token = unescaped_word =~ EMPTY_DIR_TOKEN_REGEXP ||
        (word =~ FILE_TOKEN_REGEXP && unescaped_word =~ FILE_NAME_REGEXP && ($~[1].split('/') & ['..', '.']).empty?)
      break unless is_file_token
      word = words.shift
      count += 1
    end

    if word
      raise ArgumentError.new "Manifest invalid for stream #{line_count}: invalid file token #{word.inspect}"
    elsif count == 0
      raise ArgumentError.new "Manifest invalid for stream #{line_count}: no file tokens"
    end

    # Ruby's split() method silently drops trailing empty tokens
    # (which are not allowed by the manifest format) so we have to
    # check trailing spaces manually.
    raise ArgumentError.new "Manifest invalid for stream #{line_count}: trailing space" if line.end_with? " \n"
  end
  true
end

Instance Method Details

#each_file_spec ⇒ Object



166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# File 'lib/arvados/keep.rb', line 166

# Iterate over every file token in the manifest text.
#
# Yields one array per file token:
# [stream_name, start_pos, file_size, file_name]. Any directory part of the
# token's file name is appended to the stream name, so the yielded file_name
# is only the part after the last '/'.
#
# Returns an Enumerator when called without a block, and true otherwise.
def each_file_spec
  return to_enum(__method__) unless block_given?

  @text.each_line do |manifest_line|
    current_stream = nil
    past_locators = false
    manifest_line.scan(/\S+/) do |tok|
      # The first token on a line is the stream name.
      if current_stream.nil?
        current_stream = unescape tok
        next
      end

      # Once one token fails the locator check, it and every later token on
      # the line is handled as a file token.
      past_locators ||= !Locator.valid?(tok)
      next unless past_locators

      offset, length, path = split_file_token(tok)
      # rpartition returns ["", "", path] when path has no '/', so the stream
      # name is extended only when the file name has a directory part.
      subdir, slash, basename = path.rpartition('/')
      yield [current_stream + slash + subdir, offset, length, basename]
    end
  end
  true
end

#each_line ⇒ Object



119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/arvados/keep.rb', line 119

# Iterate over the non-blank lines (streams) of the manifest text.
#
# Yields [stream_name, block_tokens, file_tokens] for each line: the
# unescaped stream name, the leading run of tokens accepted by
# Locator.valid? (left as written), and every remaining token (unescaped).
#
# Returns an Enumerator when called without a block.
def each_line
  return to_enum(__method__) unless block_given?

  @text.each_line do |raw_line|
    name_token, *rest = raw_line.scan(/\S+/)
    # Ignore blank lines
    next if name_token.nil?

    stream = unescape name_token
    # Locators are the leading tokens only; after the first token that is not
    # a locator, everything else on the line counts as a file token.
    locators = rest.take_while { |tok| Locator.valid? tok }
    file_tokens = rest.drop(locators.size).map { |tok| unescape(tok) }
    yield [stream, locators, file_tokens]
  end
end

#exact_file_count?(want_count) ⇒ Boolean

Returns:

  • (Boolean)


230
231
232
# File 'lib/arvados/keep.rb', line 230

# Report whether the manifest describes exactly +want_count+ files.
#
# The count is allowed to run one past +want_count+ so that a manifest with
# more files than wanted is told apart from an exact match without counting
# every file.
def exact_file_count?(want_count)
  counted = files_count(want_count + 1)
  counted == want_count
end

#files ⇒ Object



192
193
194
195
196
197
198
199
200
201
202
203
# File 'lib/arvados/keep.rb', line 192

# List the files in the manifest with their total sizes.
#
# Sizes of all file tokens that share a stream name and file name are added
# together. The result is computed on first use and kept in @files.
#
# Returns an array of [stream_name, file_name, total_size] entries.
def files
  return @files unless @files.nil?

  totals = Hash.new(0)
  each_file_spec { |stream, _, size, name| totals[[stream, name]] += size }
  @files = totals.map { |(stream, name), total| [stream, name, total] }
end

#files_count(stop_after = nil) ⇒ Object



205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# File 'lib/arvados/keep.rb', line 205

# Return the number of files represented in this manifest.
#
# stop_after:: optional limit. When given (and no file list has been cached
#              yet), the manifest is read incrementally and stop_after is
#              returned as soon as that many distinct files have been seen.
#              This avoids parsing the whole manifest when only a small
#              number of files needs to be confirmed.
#
# Zero-length entries named '.' (empty dir placeholders) are never counted.
def files_count(stop_after=nil)
  read_incrementally = !stop_after.nil? && @files.nil?
  unless read_incrementally
    # Full count; skip empty dir placeholders.
    return files.count { |_, name, size| !(name == '.' && size == 0) }
  end

  distinct = {}
  each_file_spec do |stream, _, size, name|
    # Skip empty dir placeholders here as well.
    next if name == "." && size == 0
    distinct[[stream, name]] = true
    return stop_after if distinct.size >= stop_after
  end
  distinct.size
end

#files_size ⇒ Object



225
226
227
228
# File 'lib/arvados/keep.rb', line 225

# Return the total size of all files in this manifest, i.e. the sum of the
# size column of #files (0 when there are no files).
def files_size
  sizes = files.map { |_stream, _name, size| size }
  sizes.inject(0, :+)
end

#has_file?(want_stream, want_file = nil) ⇒ Boolean

Returns:

  • (Boolean)


238
239
240
241
242
243
244
245
246
247
248
# File 'lib/arvados/keep.rb', line 238

# Report whether the manifest contains a given file.
#
# want_stream:: the stream name to look in, or a full path when +want_file+
#               is omitted.
# want_file::   the file name; when nil, +want_stream+ is divided into stream
#               and file name with File.split.
#
# Returns true as soon as a file token with that stream and name is found,
# false if none is.
def has_file?(want_stream, want_file=nil)
  want_stream, want_file = File.split(want_stream) if want_file.nil?

  each_file_spec do |stream, _, _, name|
    return true if stream == want_stream && name == want_file
  end
  false
end

#minimum_file_count?(want_count) ⇒ Boolean

Returns:

  • (Boolean)


234
235
236
# File 'lib/arvados/keep.rb', line 234

# Report whether the manifest describes at least +want_count+ files.
#
# The count is capped at +want_count+, so counting can stop as soon as enough
# files have been seen.
def minimum_file_count?(want_count)
  counted = files_count(want_count)
  counted >= want_count
end

#split_file_token(token) ⇒ Object



158
159
160
161
162
163
164
# File 'lib/arvados/keep.rb', line 158

# Break a manifest file token of the form "start:size:name" into its parts.
#
# token:: the raw (still escaped) file token.
#
# Returns [start_pos, file_size, file_name] with the two numbers converted by
# to_i and the name unescaped. Only the first two colons separate fields, so
# the name itself may contain colons.
# Raises ArgumentError when the token has fewer than three fields.
def split_file_token token
  fields = token.split(':', 3)
  raise ArgumentError.new "Invalid file token '#{token}'" if fields.size < 3

  offset, length, escaped_name = fields
  [offset.to_i, length.to_i, unescape(escaped_name)]
end

#unescape(s) ⇒ Object



154
155
156
# File 'lib/arvados/keep.rb', line 154

# Instance-level convenience wrapper: passes +s+ to the class-level unescape
# of this object's class and returns its result.
def unescape(s)
  self.class.unescape(s)
end