Class: Keep::Manifest
- Inherits:
-
Object
- Object
- Keep::Manifest
- Defined in:
- lib/arvados/keep.rb
Constant Summary collapse
- STREAM_TOKEN_REGEXP =
/^([^\000-\040\\]|\\[0-3][0-7][0-7])+$/
- STREAM_NAME_REGEXP =
/^(\.)(\/[^\/]+)*$/
- EMPTY_DIR_TOKEN_REGEXP =
The one exception in which a file token may have ‘.’ as its name: a zero-length entry at offset 0 (an empty-directory placeholder).
/^0:0:\.$/
- FILE_TOKEN_REGEXP =
/^[[:digit:]]+:[[:digit:]]+:([^\000-\040\\]|\\[0-3][0-7][0-7])+$/
- FILE_NAME_REGEXP =
/^[[:digit:]]+:[[:digit:]]+:([^\/]+(\/[^\/]+)*)$/
- NON_8BIT_ENCODED_CHAR =
/[^\\]\\[4-7][0-7][0-7]/
Class Method Summary collapse
- .unescape(s) ⇒ Object
- .valid?(manifest) ⇒ Boolean
-
.validate!(manifest) ⇒ Object
Verify that a given manifest is valid according to dev.arvados.org/projects/arvados/wiki/Keep_manifest_format.
Instance Method Summary collapse
- #each_file_spec ⇒ Object
- #each_line ⇒ Object
- #exact_file_count?(want_count) ⇒ Boolean
- #files ⇒ Object
- #files_count(stop_after = nil) ⇒ Object
- #files_size ⇒ Object
- #has_file?(want_stream, want_file = nil) ⇒ Boolean
-
#initialize(manifest_text) ⇒ Manifest
constructor
Class to parse a manifest text and provide common views of that data.
- #minimum_file_count?(want_count) ⇒ Boolean
- #split_file_token(token) ⇒ Object
- #unescape(s) ⇒ Object
Constructor Details
#initialize(manifest_text) ⇒ Manifest
Class to parse a manifest text and provide common views of that data.
114 115 116 117 |
# File 'lib/arvados/keep.rb', line 114

# Wrap a manifest text so it can be inspected through the query methods of
# this class. Nothing is parsed here; the text is only stored.
#
# manifest_text - the raw manifest text.
def initialize(manifest_text)
  @text = manifest_text
  # Memoised result of #files; nil means "not computed yet".
  @files = nil
end
Class Method Details
.unescape(s) ⇒ Object
140 141 142 143 144 145 146 147 148 149 150 151 152 |
# File 'lib/arvados/keep.rb', line 140

# Parse backslash escapes in a Keep manifest stream or file name.
#
# An escaped backslash (two backslashes) becomes a single backslash, and a
# backslash followed by three octal digits becomes the character with that
# code. Octal escapes above \377 do not denote a single byte; they are left
# in the output untouched instead of raising RangeError from Integer#chr, so
# that callers can still reject the token through their own validation.
#
# s - the escaped string, or nil.
#
# Returns the decoded string, or nil when s is nil.
def self.unescape(s)
  return nil if s.nil?

  s.gsub(/\\(\\|[0-7]{3})/) do |escape|
    payload = Regexp.last_match(1)
    next '\\' if payload == '\\'

    code = payload.to_i(8)
    # Integer#chr only accepts 0..255 here; keep anything larger verbatim.
    code > 0377 ? escape : code.chr
  end
end
.valid?(manifest) ⇒ Boolean
303 304 305 306 307 308 309 310 |
# File 'lib/arvados/keep.rb', line 303

# Non-raising counterpart of validate!.
#
# manifest - the manifest text to check.
#
# Returns true when validate! accepts the manifest, false when it raises
# ArgumentError. Any other exception propagates to the caller.
def self.valid?(manifest)
  validate!(manifest)
  true
rescue ArgumentError
  false
end
.validate!(manifest) ⇒ Object
Verify that a given manifest is valid according to dev.arvados.org/projects/arvados/wiki/Keep_manifest_format
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 |
# File 'lib/arvados/keep.rb', line 252

# Verify that a given manifest is valid according to
# dev.arvados.org/projects/arvados/wiki/Keep_manifest_format
#
# Every line ("stream") must consist of, separated by single spaces:
#   1. one stream name token,
#   2. one or more block locators,
#   3. one or more file tokens,
# and must end with a newline with no trailing space.
#
# manifest - the manifest text (String) or nil.
#
# Returns true when the manifest is empty or valid.
# Raises ArgumentError describing the first problem found. Escapes that
# cannot be decoded (octal value above \377) are also reported as
# ArgumentError, never as RangeError, so that valid? can rely on rescuing
# ArgumentError alone.
#
# NOTE(review): the token regexps are anchored with ^/$ (per line), while the
# unescaped names may contain an encoded newline -- confirm whether \A/\z
# anchors are wanted in those constants.
def self.validate!(manifest)
  raise ArgumentError, "No manifest found" unless manifest
  return true if manifest.empty?
  raise ArgumentError, "Invalid manifest: does not end with newline" unless manifest.end_with?("\n")

  line_count = 0
  manifest.each_line do |line|
    line_count += 1
    begin
      words = line[0..-2].split(/ /)
      raise ArgumentError, "Manifest invalid for stream #{line_count}: missing stream name" if words.empty?

      # 1. Stream name token.
      word = words.shift
      raise ArgumentError, "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed on stream token #{word.inspect}" if word =~ NON_8BIT_ENCODED_CHAR
      unescaped_word = unescape(word)
      stream_ok = word =~ STREAM_TOKEN_REGEXP &&
                  unescaped_word =~ STREAM_NAME_REGEXP &&
                  unescaped_word !~ /\/\.\.?(\/|$)/
      raise ArgumentError, "Manifest invalid for stream #{line_count}: missing or invalid stream name #{word.inspect if word}" unless stream_ok

      # 2. Block locators: consume the leading run of valid locators.
      count = 0
      word = words.shift
      while word =~ Locator::LOCATOR_REGEXP
        word = words.shift
        count += 1
      end
      raise ArgumentError, "Manifest invalid for stream #{line_count}: missing or invalid locator #{word.inspect if word}" if count == 0

      # 3. File tokens. The >8-bit check is applied to every token, not only
      # to the first one, before the token is unescaped.
      count = 0
      loop do
        raise ArgumentError, "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed on file token #{word.inspect}" if word =~ NON_8BIT_ENCODED_CHAR
        decoded = unescape(word)
        acceptable = decoded =~ EMPTY_DIR_TOKEN_REGEXP ||
                     (word =~ FILE_TOKEN_REGEXP &&
                      decoded =~ FILE_NAME_REGEXP &&
                      (Regexp.last_match(1).split('/') & ['..', '.']).empty?)
        break unless acceptable

        word = words.shift
        count += 1
      end

      if word
        raise ArgumentError, "Manifest invalid for stream #{line_count}: invalid file token #{word.inspect}"
      elsif count == 0
        raise ArgumentError, "Manifest invalid for stream #{line_count}: no file tokens"
      end

      # Ruby's split() method silently drops trailing empty tokens
      # (which are not allowed by the manifest format) so we have to
      # check trailing spaces manually.
      raise ArgumentError, "Manifest invalid for stream #{line_count}: trailing space" if line.end_with? " \n"
    rescue RangeError
      # An escape slipped past NON_8BIT_ENCODED_CHAR (e.g. at the very start
      # of a token) and could not be decoded; report it as a validation error.
      raise ArgumentError, "Manifest invalid for stream #{line_count}: >8-bit encoded chars not allowed"
    end
  end
  true
end
Instance Method Details
#each_file_spec ⇒ Object
166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
# File 'lib/arvados/keep.rb', line 166

# Yield one [stream_name, start_pos, file_size, file_name] array per file
# token in the manifest text.
#
# On each line the first token is the stream name; following tokens are
# skipped while Locator.valid? accepts them, and every token from the first
# non-locator onwards is treated as a file token. When a file name contains
# '/', its directory part is appended to the yielded stream name and only
# the last path component is yielded as the file name.
#
# Returns an Enumerator when called without a block, true otherwise.
def each_file_spec
  return to_enum(__method__) unless block_given?

  @text.each_line do |line|
    stream = nil
    past_locators = false
    line.scan(/\S+/) do |token|
      if stream.nil?
        stream = unescape(token)
        next
      end

      # Once a non-locator has been seen, Locator.valid? is not consulted again.
      past_locators ||= !Locator.valid?(token)
      next unless past_locators

      offset, length, path = split_file_token(token)
      # rpartition gives ["", "", path] when there is no '/', so the stream
      # name is left unchanged in that case.
      dir, slash, base = path.rpartition('/')
      yield [stream + slash + dir, offset, length, base]
    end
  end
  true
end
#each_line ⇒ Object
119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# File 'lib/arvados/keep.rb', line 119

# Yield [stream_name, block_tokens, file_tokens] for every non-blank line of
# the manifest text.
#
# stream_name is the unescaped first token, block_tokens is the run of
# following tokens accepted by Locator.valid? (kept verbatim), and
# file_tokens holds every remaining token, unescaped.
#
# Returns an Enumerator when called without a block.
def each_line
  return to_enum(__method__) unless block_given?

  @text.each_line do |line|
    first, *others = line.scan(/\S+/)
    # Ignore blank lines
    next if first.nil?

    stream_name = unescape(first)
    block_tokens = others.take_while { |tok| Locator.valid?(tok) }
    file_tokens = others.drop(block_tokens.size).map { |tok| unescape(tok) }
    yield [stream_name, block_tokens, file_tokens]
  end
end
#exact_file_count?(want_count) ⇒ Boolean
230 231 232 |
# File 'lib/arvados/keep.rb', line 230

# Report whether the manifest holds exactly want_count files.
#
# Counting is allowed to stop one past want_count: that is enough to tell
# "exactly want_count" apart from "more than want_count".
def exact_file_count?(want_count)
  ceiling = want_count + 1
  files_count(ceiling) == want_count
end
#files ⇒ Object
192 193 194 195 196 197 198 199 200 201 202 203 |
# File 'lib/arvados/keep.rb', line 192

# Return an array of [stream_name, file_name, total_size] entries, one per
# distinct (stream, file) pair, summing the sizes of all file tokens that
# refer to the same pair. The result is computed once and cached in @files.
def files
  return @files unless @files.nil?

  totals = Hash.new(0)
  each_file_spec do |stream, _start, length, name|
    totals[[stream, name]] += length
  end
  @files = totals.map { |(stream, name), total| [stream, name, total] }
end
#files_count(stop_after = nil) ⇒ Object
205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
# File 'lib/arvados/keep.rb', line 205

# Return the number of files represented in this manifest.
#
# If stop_after is provided, files_count will read the manifest
# incrementally, and return immediately when it counts that number of
# files. This can help you avoid parsing the entire manifest if you
# just want to check if a small number of files are specified.
#
# Entries named '.' with size 0 (empty dir placeholders) are never counted.
def files_count(stop_after = nil)
  placeholder = ->(name, size) { name == '.' && size == 0 }

  # With no limit, or when #files has already been computed, use the full list.
  if stop_after.nil? || !@files.nil?
    return files.count { |_, name, size| !placeholder.call(name, size) }
  end

  distinct = {}
  each_file_spec do |stream, _, size, name|
    next if placeholder.call(name, size)

    distinct[[stream, name]] = true
    return stop_after if distinct.size >= stop_after
  end
  distinct.size
end
#files_size ⇒ Object
225 226 227 228 |
# File 'lib/arvados/keep.rb', line 225

# Return the total size of all files in this manifest, i.e. the sum of the
# size field of every entry returned by #files.
def files_size
  total = 0
  files.each { |_stream, _name, size| total += size }
  total
end
#has_file?(want_stream, want_file = nil) ⇒ Boolean
238 239 240 241 242 243 244 245 246 247 248 |
# File 'lib/arvados/keep.rb', line 238

# Report whether the manifest contains the given file.
#
# want_stream - the stream name, or a full path when want_file is omitted
#               (it is then split with File.split).
# want_file   - the file name within the stream (optional).
#
# Returns true as soon as a matching file spec is found, false otherwise.
def has_file?(want_stream, want_file = nil)
  want_stream, want_file = File.split(want_stream) if want_file.nil?

  each_file_spec.any? do |(stream, _, _, name)|
    stream == want_stream && name == want_file
  end
end
#minimum_file_count?(want_count) ⇒ Boolean
234 235 236 |
# File 'lib/arvados/keep.rb', line 234

# Report whether the manifest holds at least want_count files. Counting is
# allowed to stop as soon as want_count files have been seen.
def minimum_file_count?(want_count)
  counted = files_count(want_count)
  counted >= want_count
end
#split_file_token(token) ⇒ Object
158 159 160 161 162 163 164 |
# File 'lib/arvados/keep.rb', line 158

# Split a "start:size:name" file token into its three parts.
#
# token - the raw (still escaped) file token.
#
# Returns [start_pos, file_size, file_name] with the first two converted by
# to_i and the name unescaped. Only the first two colons separate fields, so
# the name itself may contain ':'.
# Raises ArgumentError when the token has fewer than three fields.
def split_file_token(token)
  fields = token.split(':', 3)
  raise ArgumentError, "Invalid file token '#{token}'" if fields.size < 3

  [fields[0].to_i, fields[1].to_i, unescape(fields[2])]
end
#unescape(s) ⇒ Object
154 155 156 |
# File 'lib/arvados/keep.rb', line 154

# Instance-level shortcut for the class method of the same name, so instance
# methods can decode manifest escapes without naming the class.
#
# s - the escaped string (or nil).
def unescape(s)
  self.class.unescape(s)
end