Module: RightData

Defined in: lib/main.rb,
lib/right_data.rb,
lib/FileSystemItem.rb,
lib/FileSystemTree.rb

Defined Under Namespace

Classes: FileSystemItem, FileSystemTree

Constant Summary collapse

BLOCK_SIZE =

1024*8

IGNORE_FILES =

[".DS_Store", ".typeAttributes.dict", "empty-file"]

Class Method Summary collapse

.cache_not_working_on_write(master, master_cache, indexing_function) ⇒ Object
.cache_serializing_on_write(master) ⇒ Object
.check_file_in_image_index(master_index, file_to_check) ⇒ Object
.check_file_in_index(master_index, file_to_check) ⇒ Object
.dup_report(prunable) ⇒ Object

Run this in a directory that is suspected of containing self-duplicate files.
.each_set_of_duplicates(*paths, &block) ⇒ Object
.eliminate_non_duplicates(partition, size, offset) ⇒ Object
.get_block(file, offset) ⇒ Object
.git?(path) ⇒ Boolean
.hello ⇒ Object
.identical_images?(a, b) ⇒ Boolean
.ignore_test(f) ⇒ Object
.index_by_name(*paths) ⇒ Object
.index_by_size(*paths) ⇒ Object
.is_visual_media?(f) ⇒ Boolean

Is this a picture? If so, we’ll be using imagemagick’s compare feature later on.
.prune_image_report(master, prunable) ⇒ Object
.prune_report(master, prunable) ⇒ Object

Run this in a directory (prunable) that is suspected of containing duplicate files that already exist in master.
.repo_report(search_dir) ⇒ Object

Run this on a directory that is suspected of containing unchecked in GIT or SVN repos.
.scan_for_dup(prunable) ⇒ Object
.scan_for_prunable(master, prune, &block) ⇒ Object
.scan_for_prunable_base(master, prune, indexing_function, check_index_function, kind, &block) ⇒ Object

tree = scan_for_prunable(master,prune) { |a,b| puts "#{b.size} : #{a}" }; nil.
.scan_for_prunable_images(master, prune, &block) ⇒ Object
.scan_for_repos(prune, &block) ⇒ Object
.svn?(path) ⇒ Boolean

This is a weak check! Also does nothing to check one svn in another.
.test ⇒ Object

Class Method Details

.cache_not_working_on_write(master, master_cache, indexing_function) ⇒ `Object`

# File 'lib/main.rb', line 113

# Returns the master index, reading it from a YAML cache file when that file
# exists and otherwise building it and writing the cache for next time.
#
# @param master [Object] value handed to +indexing_function+ on a cache miss
# @param master_cache [String] path of the YAML cache file
# @param indexing_function [#call] builds the index when called with +master+
# @return [Object] the index loaded from the cache, or the freshly built one
def self.cache_not_working_on_write(master, master_cache, indexing_function)
  unless File.exist?(master_cache)
    puts "# Master cache not found at #{master_cache}."
    fresh_index = indexing_function.call(master)
    puts "# Writing #{master_cache}."
    File.open(master_cache, "w") { |io| YAML.dump(fresh_index, io) }
    puts "# Wrote #{master_cache}."
    return fresh_index
  end

  puts "# Master cache FOUND at #{master_cache}."
  File.open(master_cache) { |io| YAML.load(io) }
end

.cache_serializing_on_write(master) ⇒ `Object`

# File 'lib/main.rb', line 132

# Returns the size index for +master+, using a Marshal-based cache file
# (+.rightPruneCache+ inside +master+) when present and creating it otherwise.
#
# The cache is a plain concatenation of Marshal records, one +[key, value]+
# pair per record, so it is read back record by record from the IO rather
# than line by line (Marshal output is binary and not newline-delimited).
#
# @param master [String] directory to index; the cache file lives inside it
# @return [Hash] the index, keyed as produced by index_by_size
def self.cache_serializing_on_write(master)
  master_cache = File.join(master, ".rightPruneCache")
  if File.exist?(master_cache)
    puts "# Master cache FOUND at #{master_cache}."
    # NOTE(review): Marshal.load can instantiate arbitrary objects; this is
    # only appropriate while the cache file is written by this tool alone.
    File.open(master_cache, "rb") do |f|
      rval = {}
      until f.eof?
        key, value = Marshal.load(f)
        rval[key] = value
      end
      rval
    end
  else
    puts "# Master cache not found at #{master_cache}."
    master_index = index_by_size(master)
    puts "# Writing #{master_cache}."
    File.open(master_cache, "wb") do |f|
      # Pairs are dumped individually because a Hash with a default proc
      # cannot be marshalled as a whole.
      master_index.each_pair { |k, v| Marshal.dump([k, v], f) }
    end
    puts "# Wrote #{master_cache}."
    # Return the index on a cache miss too (previously the result of puts).
    master_index
  end
end

.check_file_in_image_index(master_index, file_to_check) ⇒ `Object`

# File 'lib/main.rb', line 164

# Looks up +file_to_check+ in a name-keyed index and returns the first master
# file that identical_images? reports as matching it.
#
# @param master_index [Hash] lower-cased basename => list of master paths
# @param file_to_check [String] path of the candidate file
# @return [Array] empty when the file is empty or nothing matches, otherwise
#   a one-element array holding the matching master path
def self.check_file_in_image_index(master_index, file_to_check)
  return [] if File.size(file_to_check).zero? # empty files are never reported

  candidates = master_index[File.basename(file_to_check).downcase] || []
  match = candidates.find { |candidate| identical_images?(candidate, file_to_check) }
  [match].compact # callers expect an array
end

.check_file_in_index(master_index, file_to_check) ⇒ `Object`

# File 'lib/main.rb', line 172

# Finds master files whose content equals +file_to_check+, comparing
# BLOCK_SIZE-sized chunks (via get_block) and dropping candidates as soon as
# a chunk differs.
#
# @param master_index [Hash] file size => list of master paths of that size
# @param file_to_check [String] path of the candidate file
# @return [Array] master paths that matched on every compared block; empty
#   for an empty file or when no candidate survives
def self.check_file_in_index(master_index, file_to_check)
  size = File.size(file_to_check)
  return [] if size == 0 # Ignore empty files

  survivors = master_index[size] || []
  offset = 0
  until survivors.empty? || offset > size
    reference = get_block(file_to_check, offset)
    survivors = survivors.select { |candidate| get_block(candidate, offset) == reference }
    offset += BLOCK_SIZE
  end
  survivors
end

.dup_report(prunable) ⇒ `Object`

Run this in a directory that is suspected of containing self-duplicate files. Compare to: fdupes -r -n prunable



20
21
22

# File 'lib/right_data.rb', line 20

# Reports duplicate files inside +prunable+ by delegating to
# RightData.scan_for_dup.
#
# @param prunable [String] directory suspected of containing duplicates
# @return [Object] whatever RightData.scan_for_dup returns
def self.dup_report(prunable)
  RightData.scan_for_dup(prunable)
end

.each_set_of_duplicates(*paths, &block) ⇒ `Object`

# File 'lib/main.rb', line 48

# Walks +paths+, groups regular files by size and, for every size shared by
# more than one file, narrows the group block by block with
# eliminate_non_duplicates, which yields confirmed duplicate sets to +block+.
#
# @param paths [Array<String>] roots passed to Find.find
# @yield [files] forwarded to eliminate_non_duplicates
# @return [Hash] the size => files table that was built
def self.each_set_of_duplicates(*paths, &block)
  by_size = Hash.new { |hash, key| hash[key] = [] }
  Find.find(*paths) do |path|
    next unless File.file?(path)
    by_size[File.size(path)] << path
  end

  by_size.each_pair do |size, same_size_files|
    next if same_size_files.size <= 1 # a lone file cannot have a duplicate

    candidates = [same_size_files]
    offset = 0
    until candidates.empty? || offset > size
      candidates = eliminate_non_duplicates(candidates, size, offset, &block)
      offset += BLOCK_SIZE
    end
  end
end

.eliminate_non_duplicates(partition, size, offset) ⇒ `Object`

# File 'lib/main.rb', line 64

# Splits each set of same-sized files by the content of the block found at
# +offset+ and keeps only groups that still contain more than one file.
#
# Groups whose compared block reaches the end of the file are yielded as
# confirmed duplicates; the rest are returned for another round.
#
# Files are opened with File.open rather than Kernel#open so that a path
# beginning with "|" is read as a file and never run as a command.
#
# @param partition [Array<Array<String>>] sets of candidate duplicate paths
# @param size [Integer] common size of the files in +partition+
# @param offset [Integer] byte offset of the block to compare
# @yield [files] a group of paths confirmed to be duplicates
# @return [Array<Array<String>>] groups that still need more blocks compared
def self.eliminate_non_duplicates(partition, size, offset)
  possible_duplicates = []
  partition.each do |possible_duplicate_set|
    by_block = possible_duplicate_set.group_by do |f|
      chunk = File.open(f, 'rb') do |file|
        file.seek(offset)
        file.read(BLOCK_SIZE)
      end
      chunk || '' # read returns nil at EOF; group those files together
    end

    by_block.each_value do |files|
      next unless files.size > 1

      if offset + BLOCK_SIZE >= size
        # We know these are duplicates.
        yield files
      else
        # We suspect these are duplicates, but we need to compare
        # more blocks of data.
        possible_duplicates << files
      end
    end
  end
  possible_duplicates
end

.get_block(file, offset) ⇒ `Object`

# File 'lib/main.rb', line 158

# Reads one BLOCK_SIZE-sized chunk of +file+ starting at +offset+.
#
# Uses File.open in binary mode: Kernel#open would treat a path starting
# with "|" as a command to execute, and binary mode matches how
# eliminate_non_duplicates reads its blocks.
#
# @param file [String] path of the file to read
# @param offset [Integer] byte offset to start reading at
# @return [String, nil] up to BLOCK_SIZE bytes, or nil when +offset+ is at
#   or beyond the end of the file
def self.get_block(file, offset)
  File.open(file, 'rb') do |f|
    f.seek(offset)
    f.read(BLOCK_SIZE)
  end
end

.git?(path) ⇒ `Boolean`

Returns:

(Boolean)



318
319
320

# File 'lib/main.rb', line 318

# Tells whether +path+ directly contains a +.git+ directory.
#
# @param path [String] directory to inspect
# @return [Boolean] true when +path/.git+ exists and is a directory
def self.git?(path)
  git_dir = File.join(path, ".git")
  Dir.exist?(git_dir)
end

.hello ⇒ `Object`

# File 'lib/right_data.rb', line 4

def self.hello; "Hi!"; end

.identical_images?(a, b) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/main.rb', line 29

# Decides whether two image files look the same, using ImageMagick's
# +compare+ with the AE (absolute error) metric and a 5% fuzz factor.
# Images count as identical when the reported figure is below 1000.
#
# The command is run as an argument vector (no shell), so quotes, "$(...)"
# or backticks in file names are passed through literally instead of being
# interpreted. Output that does not start with a number (e.g. an error
# message from +compare+) is treated as "not identical"; this tool feeds
# deletion reports, so an unknown result must never read as a match.
#
# @param a [String] path of the first image
# @param b [String] path of the second image
# @return [Boolean] true only when both paths have an image extension and
#   +compare+ reports fewer than 1000 differing pixels
def self.identical_images?(a, b)
  return false unless self.is_visual_media?(a) && self.is_visual_media?(b)

  command = ["compare", "-metric", "AE", "-fuzz", "5%", "-compose", "src", a, b, "/dev/null"]
  # compare writes the metric to stderr, so fold stderr into the captured output.
  output = IO.popen(command, :err => [:child, :out]) { |io| io.read }

  # Accept plain and scientific notation ("1.5e+06"); to_i would truncate
  # the latter to 1 and wrongly report a match.
  differing = output[/\A\s*(\d+(?:\.\d+)?(?:e[+-]?\d+)?)/i, 1]
  return false if differing.nil?

  differing.to_f < 1000

  # TODO Consider checking rotated 90,180,270 degrees and scaled to other image...
rescue SystemCallError => e
  # compare could not be started (not installed, not executable, ...).
  warn "# Could not run compare: #{e.message}"
  false
end

.ignore_test(f) ⇒ `Object`

# File 'lib/main.rb', line 15

# Tells whether +f+ should be skipped when indexing or pruning: listed in
# IGNORE_FILES, a symlink, an empty file, or named *.tmp / *.swp
# (case-insensitive).
#
# @param f [String] path of the file to test
# @return [true, Integer, nil] truthy when the file should be ignored (true,
#   or the match position for the *.tmp / *.swp patterns), nil otherwise
def self.ignore_test(f)
  name = File.basename(f)
  return true if IGNORE_FILES.include?(name)
  return true if File.symlink?(f)
  return true if File.size(f) == 0 # Ignore empty files

  lowered = name.downcase
  lowered =~ /\.tmp$/ || lowered =~ /\.swp$/
end

.index_by_name(*paths) ⇒ `Object`

# File 'lib/main.rb', line 91

# Builds an index of the regular, non-ignored files under +paths+, keyed by
# lower-cased basename.
#
# The number printed at the end counts every entry Find.find visited
# (directories and ignored files included), not just the indexed files.
#
# @param paths [Array<String>] roots passed to Find.find
# @return [Hash] lower-cased basename => list of paths with that name
def self.index_by_name(*paths)
  index = Hash.new { |hash, key| hash[key] = [] }
  visited = 0
  Find.find(*paths) do |entry|
    visited += 1
    next unless File.file?(entry)
    next if ignore_test(entry)
    index[File.basename(entry).downcase] << entry
  end
  puts "# Indexed #{visited} files by name."
  index
end

.index_by_size(*paths) ⇒ `Object`

# File 'lib/main.rb', line 102

# Builds an index of the regular, non-ignored files under +paths+, keyed by
# file size in bytes.
#
# The number printed at the end counts every entry Find.find visited
# (directories and ignored files included), not just the indexed files.
#
# @param paths [Array<String>] roots passed to Find.find
# @return [Hash] file size => list of paths of that size
def self.index_by_size(*paths)
  index = Hash.new { |hash, key| hash[key] = [] }
  visited = 0
  Find.find(*paths) do |entry|
    visited += 1
    next unless File.file?(entry)
    next if ignore_test(entry)
    index[File.size(entry)] << entry
  end
  puts "# Indexed #{visited} files by size."
  index
end

.is_visual_media?(f) ⇒ `Boolean`

Is this a picture? If so, we’ll be using imagemagick’s compare feature later on

Returns:

(Boolean)

# File 'lib/main.rb', line 24

# Tells whether +f+ looks like a picture, judged only by the text after the
# last "." of its lower-cased basename (jpg, jpeg, gif, bmp or png).
#
# @param f [String] path or file name
# @return [Boolean] true when the name ends in one of the image extensions
def self.is_visual_media?(f)
  case File.basename(f).downcase.split(".").last
  when "jpg", "jpeg", "gif", "bmp", "png" then true
  else false
  end
end

.prune_image_report(master, prunable) ⇒ `Object`

# File 'lib/right_data.rb', line 13

# Scans +prunable+ for images already present in +master+ (via
# RightData.scan_for_prunable_images) and asks the resulting tree to report
# itself with the 'rm -rf' command prefix.
#
# @param master [String] directory holding the reference copies
# @param prunable [String] directory suspected of holding duplicates
# @return [nil]
def self.prune_image_report(master, prunable)
  RightData.scan_for_prunable_images(master, prunable).report('rm -rf')
  nil
end

.prune_report(master, prunable) ⇒ `Object`

Run this in a directory (prunable) that is suspected of containing duplicate files that already exist in master. E.g. check a discovered backup drive and whether anything on it is valid

# File 'lib/right_data.rb', line 8

# Scans +prunable+ for files already present in +master+ (via
# RightData.scan_for_prunable) and asks the resulting tree to report itself
# with the 'rm -rf' command prefix.
#
# @param master [String] directory holding the reference copies
# @param prunable [String] directory suspected of holding duplicates
# @return [nil]
def self.prune_report(master, prunable)
  RightData.scan_for_prunable(master, prunable).report('rm -rf')
  nil
end

.repo_report(search_dir) ⇒ `Object`

Run this on a directory that is suspected of containing Git or SVN repositories with changes that were never checked in. Get back a list of all repos, their versions, and whether any files have not been checked in.



26
27
28

# File 'lib/right_data.rb', line 26

# Looks for repositories under +search_dir+ by delegating to
# RightData.scan_for_repos.
#
# @param search_dir [String] directory to search
# @return [Object] whatever RightData.scan_for_repos returns
def self.repo_report(search_dir)
  RightData.scan_for_repos(search_dir)
end

.scan_for_dup(prunable) ⇒ `Object`

# File 'lib/main.rb', line 200

# Prints a shell script fragment for every set of duplicate files found
# under +prunable+: the first file of each set is shown as a comment (the
# copy to keep) and every other file gets an "rm -rf" line.
#
# Escape.shell_command is always given an array of words, and the trailing
# "# dup" marker is appended after escaping so it stays a shell comment
# instead of becoming a quoted argument to rm. The yielded set is not
# modified.
#
# @param prunable [String] directory to scan for self-duplicates
# @return [Object] whatever each_set_of_duplicates returns
def self.scan_for_dup(prunable)
  each_set_of_duplicates(prunable) do |dups|
    keeper, *extras = dups
    puts "# #{Escape.shell_command([keeper])}"
    extras.each do |d|
      puts "#{Escape.shell_command(["rm", "-rf", d])} # dup"
    end
  end
end

.scan_for_prunable(master, prune, &block) ⇒ `Object`

# File 'lib/main.rb', line 215

# Scans +prune+ for files whose content already exists in +master+, using
# the size-keyed index (index_by_size / check_file_in_index) and the "size"
# cache kind.
#
# @param master [String] directory holding the reference copies
# @param prune [String] directory suspected of holding duplicates
# @param block [Proc] forwarded unchanged to scan_for_prunable_base
# @return [Object] whatever scan_for_prunable_base returns
def self.scan_for_prunable(master, prune, &block)
  build_index = proc { |dir| index_by_size(dir) }
  find_matches = proc { |index, file| check_file_in_index(index, file) }
  scan_for_prunable_base(master, prune, build_index, find_matches, "size", &block)
end

.scan_for_prunable_base(master, prune, indexing_function, check_index_function, kind, &block) ⇒ `Object`

tree = scan_for_prunable(master,prune) { |a,b| puts "#{b.size} : #{a}" }; nil

# File 'lib/main.rb', line 222

# Walks the +prune+ directory and marks every file that is ignorable or that
# already exists in +master+, returning the marked tree.
#
# Example:
#   tree = scan_for_prunable(master,prune) { |a,b| puts "#{b.size} : #{a}" }; nil
#
# The master index is obtained through cache_not_working_on_write, using a
# cache file named ".rightPruneCache-<kind>" inside +master+.
#
# @param master [String] directory holding the reference copies
# @param prune [String] directory suspected of holding duplicates
# @param indexing_function [#call] builds the master index from +master+
# @param check_index_function [#call] called with (index, path); returns the
#   list of master files duplicating +path+ (empty when there are none)
# @param kind [String] label used in the cache file name and in messages
# @yield [path, duplicates] called for every file found to have duplicates
# @return [FileSystemItem] the tree rooted at +prune+ with nodes marked
def self.scan_for_prunable_base(master, prune, indexing_function, check_index_function, kind, &block)
  puts "# Ignoring: #{IGNORE_FILES.inspect}"

  master_cache = File.join(master, ".rightPruneCache-#{kind}")
  master_index = cache_not_working_on_write(master, master_cache, indexing_function)

  puts "# Found #{master_index.size} unique #{kind}s."

  # Get prune count for progress updates. Counted in-process (every entry,
  # directories included) so that special characters in the path cannot
  # break or alter a shell command.
  prune_count = 0
  Find.find(prune) { prune_count += 1 } if File.exist?(prune)
  prune_updates = (0..25).inject({}) { |a, i| a[(i * (prune_count / 25)).to_i] = true; a }

  count = 0

  # Recursively compare the files in the filesystem.
  # When a parent node gets a response from all its children
  # that they are dups OR ignorable, that NODE becomes dup_or_ignorable too.
  # This propagates.
  # Then, there is a traversal that grabs all base nodes that are non_dup like:
  # rm -rf /a_path_duped/here     # 14 dups / 9 ignores
  # rm -rf /b_path_duped/way/here # 1 dup
  tree = FileSystemItem.new(prune, :parent => nil)
  # Mark the nodes:
  tree.traverse do |n|
    if prune_updates[count]
      # Multiply before dividing: integer division first would always give 0.
      percent = prune_count.zero? ? 0 : (count * 100) / prune_count
      puts "# [#{count} / #{prune_count}] #{percent}%"
    end
    # Could keep track of empty dirs too...
    if File.directory?(n.path)
      # If empty dir...
      if n.leaf?
        n.ignorable = true
        n.parent.increment_ignorable_children
        next false # Don't bother, no kids
      else
        next true
      end
    end
    count += 1
    if ignore_test(n.path)
      n.ignorable = true
      n.parent.increment_ignorable_children
    else
      duplicates = check_index_function.call(master_index, n.path)
      unless duplicates.empty?
        n.duplicates = duplicates
        n.parent.increment_duplicate_children
        # Honour the caller's block, as shown in the usage example above.
        block.call(n.path, duplicates) unless block.nil?
      end
    end
    true
  end
  puts "# We counted #{count} files. Tree thinks it has #{tree.files}."
  tree
end

.scan_for_prunable_images(master, prune, &block) ⇒ `Object`

# File 'lib/main.rb', line 209

# Scans +prune+ for images that already exist in +master+, using the
# name-keyed index (index_by_name / check_file_in_image_index) and the
# "image" cache kind.
#
# @param master [String] directory holding the reference copies
# @param prune [String] directory suspected of holding duplicates
# @param block [Proc] forwarded unchanged to scan_for_prunable_base
# @return [Object] whatever scan_for_prunable_base returns
def self.scan_for_prunable_images(master, prune, &block)
  build_index = proc { |dir| index_by_name(dir) }
  find_matches = proc { |index, file| check_file_in_image_index(index, file) }
  scan_for_prunable_base(master, prune, build_index, find_matches, "image", &block)
end

.scan_for_repos(prune, &block) ⇒ `Object`

# File 'lib/main.rb', line 321

# Walks +prune+ looking for directories that contain a Subversion or Git
# working copy (see svn? and git?), collects the output of the status/info
# commands run inside each one, prints a summary and returns the details.
#
# Commands are chained with "&&" so that the status command is never run in
# the current directory when the "cd" into the repository fails.
#
# @param prune [String] directory to search
# @param block [Proc] accepted for interface compatibility; not used here
# @return [Hash] repository path => { :kind, :status, :info }; when a
#   directory is both an svn and a git working copy, the git entry wins
def self.scan_for_repos(prune, &block)
  tree = FileSystemTree.new(prune, :parent => nil)
  repos = {}
  # Mark the nodes:
  tree.traverse do |n|
    if File.directory?(n.path)
      cd_cmd = Escape.shell_command(["cd", n.path])
      if svn?(n.path)
        status = `#{cd_cmd} && svn status`
        info   = `#{cd_cmd} && svn info`
        repos[n.path] = { :kind => "svn", :status => status, :info => info }
      end
      if git?(n.path)
        status = `#{cd_cmd} && git status`
        info   = `#{cd_cmd} && git show`
        repos[n.path] = { :kind => "git", :status => status, :info => info }
      end
      !repos[n.path] # recurse only if we DID NOT find a repo
    end
  end
  repos.keys.sort.each do |k|
    puts "Found #{repos[k][:kind]} repo at: #{k}. \n\tStatus: #{repos[k][:status]}"
  end
  repos
end

.svn?(path) ⇒ `Boolean`

This is a weak check! Also does nothing to check one svn in another.

Returns:

(Boolean)



315
316
317

# File 'lib/main.rb', line 315

# Tells whether +path+ directly contains a +.svn+ directory.
# This is a weak check! Also does nothing to check one svn in another.
#
# @param path [String] directory to inspect
# @return [Boolean] true when +path/.svn+ exists and is a directory
def self.svn?(path)
  svn_dir = File.join(path, ".svn")
  Dir.exist?(svn_dir)
end

.test ⇒ `Object`

# File 'lib/main.rb', line 193

# Manual smoke test: runs scan_for_prunable against two hard-coded
# directories and prints "<number of duplicates> : <path>" for each pair the
# scan passes to the block.
#
# @return [Object] whatever scan_for_prunable returns
def self.test
  scan_for_prunable("/Users/jonathan/Dropbox", "/Users/jonathan/Desktop/Old") do |path, matches|
    puts "#{matches.size} : #{path}"
  end
end

Module: RightData

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.cache_not_working_on_write(master, master_cache, indexing_function) ⇒ Object

.cache_serializing_on_write(master) ⇒ Object

.check_file_in_image_index(master_index, file_to_check) ⇒ Object

.check_file_in_index(master_index, file_to_check) ⇒ Object

.dup_report(prunable) ⇒ Object

.each_set_of_duplicates(*paths, &block) ⇒ Object

.eliminate_non_duplicates(partition, size, offset) ⇒ Object

.get_block(file, offset) ⇒ Object

.git?(path) ⇒ Boolean

.hello ⇒ Object

.identical_images?(a, b) ⇒ Boolean

.ignore_test(f) ⇒ Object

.index_by_name(*paths) ⇒ Object

.index_by_size(*paths) ⇒ Object

.is_visual_media?(f) ⇒ Boolean

.prune_image_report(master, prunable) ⇒ Object

.prune_report(master, prunable) ⇒ Object

.repo_report(search_dir) ⇒ Object

.scan_for_dup(prunable) ⇒ Object

.scan_for_prunable(master, prune, &block) ⇒ Object

.scan_for_prunable_base(master, prune, indexing_function, check_index_function, kind, &block) ⇒ Object

.scan_for_prunable_images(master, prune, &block) ⇒ Object

.scan_for_repos(prune, &block) ⇒ Object

.svn?(path) ⇒ Boolean

.test ⇒ Object

.cache_not_working_on_write(master, master_cache, indexing_function) ⇒ `Object`

.cache_serializing_on_write(master) ⇒ `Object`

.check_file_in_image_index(master_index, file_to_check) ⇒ `Object`

.check_file_in_index(master_index, file_to_check) ⇒ `Object`

.dup_report(prunable) ⇒ `Object`

.each_set_of_duplicates(*paths, &block) ⇒ `Object`

.eliminate_non_duplicates(partition, size, offset) ⇒ `Object`

.get_block(file, offset) ⇒ `Object`

.git?(path) ⇒ `Boolean`

.hello ⇒ `Object`

.identical_images?(a, b) ⇒ `Boolean`

.ignore_test(f) ⇒ `Object`

.index_by_name(*paths) ⇒ `Object`

.index_by_size(*paths) ⇒ `Object`

.is_visual_media?(f) ⇒ `Boolean`

.prune_image_report(master, prunable) ⇒ `Object`

.prune_report(master, prunable) ⇒ `Object`

.repo_report(search_dir) ⇒ `Object`

.scan_for_dup(prunable) ⇒ `Object`

.scan_for_prunable(master, prune, &block) ⇒ `Object`

.scan_for_prunable_base(master, prune, indexing_function, check_index_function, kind, &block) ⇒ `Object`

.scan_for_prunable_images(master, prune, &block) ⇒ `Object`

.scan_for_repos(prune, &block) ⇒ `Object`

.svn?(path) ⇒ `Boolean`

.test ⇒ `Object`