Class: Flay

Inherits:

Object

Object
Flay

show all

Defined in:: lib/flay.rb,
lib/flay_erb.rb

Defined Under Namespace

Classes: Erubis, Item, Location

Constant Summary collapse

VERSION = "2.7.0"

DEFAULT_IGNORE = ".flayignore" (source note: "so I can move this to flog wholesale")

MAX_NODE_SIZE = :stopdoc:

MAX_AVG_MASS = prevents exponential blowout

Instance Attribute Summary collapse

#hashes ⇒ Object readonly

Returns the value of attribute hashes.
#identical ⇒ Object

:stopdoc:.
#mass_threshold ⇒ Object

:stopdoc:.
#masses ⇒ Object

:stopdoc:.
#option ⇒ Object readonly

Returns the value of attribute option.
#total ⇒ Object

:stopdoc:.

Class Method Summary collapse

.default_options ⇒ Object

Returns the default options.
.expand_dirs_to_files(*dirs) ⇒ Object

Expands *dirs to all files within that match ruby and rake extensions.
.filter_files(files, ignore = DEFAULT_IGNORE) ⇒ Object

A file filter mechanism similar to, but not as extensive as, .gitignore files:.
.load_plugins ⇒ Object

Loads all flay plugins.
.parse_options(args = ARGV) ⇒ Object

Process options in args, defaulting to ARGV.

Instance Method Summary collapse

#analyze(filter = nil) ⇒ Object

Prune, find identical nodes, and update masses.
#collapse_and_label(ary) ⇒ Object

:nodoc:.
#initialize(option = nil) ⇒ Flay constructor

Create a new instance of Flay with options.
#n_way_diff(*data) ⇒ Object

Output an n-way diff from data.
#pad_with_empty_strings(ary) ⇒ Object

:nodoc:.
#process(*files) ⇒ Object

Process any number of files.
#process_erb(file) ⇒ Object

Process erb and parse the result.
#process_fuzzy(node, difference) ⇒ Object

Process “fuzzy” matches for node.
#process_rb(file) ⇒ Object

Parse a ruby file and return the sexp.
#process_sexp(pt) ⇒ Object

Process a sexp pt.
#prune ⇒ Object

Prunes nodes that aren’t relevant to analysis or are already covered by another node.
#prune_conservatively ⇒ Object

Conservative prune.
#prune_liberally ⇒ Object

Liberal prune.
#report(io = $stdout) ⇒ Object

Output the report.
#sexp_to_rb(sexp) ⇒ Object
#split_and_group(ary) ⇒ Object

:nodoc:.
#summary ⇒ Object

Calculate summary scores on a per-file basis.
#update_masses ⇒ Object

Reset total and recalculate the masses for all nodes in hashes.

Constructor Details

#initialize(option = nil) ⇒ `Flay`

Create a new instance of Flay with options.

# File 'lib/flay.rb', line 209

# Set up a fresh Flay instance.
#
# option:: options Hash; when nil/false the Hash returned by
#          Flay.default_options is used. Its :mass entry becomes the
#          mass threshold.
def initialize option = nil
  @option = option || Flay.default_options
  # every unseen key lazily gets its own empty bucket
  @hashes = Hash.new { |buckets, key| buckets[key] = [] }

  self.total          = 0
  self.masses         = {}
  self.identical      = {}
  self.mass_threshold = @option[:mass]
end

Instance Attribute Details

#hashes ⇒ `Object` (readonly)

Returns the value of attribute hashes.



203
204
205

# File 'lib/flay.rb', line 203

# Reader for @hashes. #initialize creates it as a Hash whose missing keys
# default to a fresh empty Array; #process_sexp appends nodes to the bucket
# keyed by each node's structural_hash.
def hashes
  @hashes
end

#identical ⇒ `Object`

:stopdoc:



202
203
204

# File 'lib/flay.rb', line 202

# Reader for @identical. #initialize sets it to an empty Hash and #analyze
# stores one entry per bucket: whether all nodes in the bucket compare ==
# to the first one.
def identical
  @identical
end

#mass_threshold ⇒ `Object`

:stopdoc:



202
203
204

# File 'lib/flay.rb', line 202

# Reader for @mass_threshold. #initialize seeds it from option[:mass];
# #process_sexp and #process_fuzzy skip nodes whose mass is below it.
def mass_threshold
  @mass_threshold
end

#masses ⇒ `Object`

:stopdoc:



202
203
204

# File 'lib/flay.rb', line 202

# Reader for @masses. Starts as an empty Hash in #initialize; #update_masses
# clears it and stores one mass score per key of #hashes.
def masses
  @masses
end

#option ⇒ `Object` (readonly)

Returns the value of attribute option.



203
204
205

# File 'lib/flay.rb', line 203

# Reader for @option: the options Hash passed to #initialize, or the result
# of Flay.default_options when none was given.
def option
  @option
end

#total ⇒ `Object`

:stopdoc:



202
203
204

# File 'lib/flay.rb', line 202

# Reader for @total. #initialize sets it to 0; #update_masses resets it and
# accumulates the mass score of every bucket into it.
def total
  @total
end

Class Method Details

.default_options ⇒ `Object`

Returns the default options.

# File 'lib/flay.rb', line 31

# Returns a new Hash holding the default value of every option.
# (:mass seeds the mass threshold; the remaining keys are switched on by the
# corresponding command-line flags in parse_options.)
def self.default_options
  {
    diff:    false,
    mass:    16,
    summary: false,
    verbose: false,
    number:  true,
    timeout: 10,
    liberal: false,
    fuzzy:   false,
    only:    nil,
  }
end

.expand_dirs_to_files(*dirs) ⇒ `Object`

Expands *dirs to all files within that match ruby and rake extensions. – REFACTOR: from flog

# File 'lib/flay.rb', line 128

# Expands every directory in +dirs+ (nested arrays allowed) into the files
# below it whose extension is "rb" or one of the names returned by
# Flay.load_plugins. Non-directory arguments are passed through untouched.
# A leading "./" is stripped from every resulting path.
def self.expand_dirs_to_files *dirs
  ext_list = (["rb"] + Flay.load_plugins).join(",")

  expanded = dirs.flatten.map do |path|
    File.directory?(path) ? Dir[File.join(path, "**", "*.{#{ext_list}}")] : path
  end

  # strip "./" from paths
  expanded.flatten.map { |path| path.sub(/^\.\//, "") }
end

.filter_files(files, ignore = DEFAULT_IGNORE) ⇒ `Object`

A file filter mechanism similar to, but not as extensive as, .gitignore files:

* If a pattern does not contain a slash, it is treated as a shell glob.
* If a pattern ends in a slash, it matches on directories (and contents).
* Otherwise, it matches on relative paths.

File.fnmatch is used throughout, so glob patterns work for all 3 types.

# File 'lib/flay.rb', line 153

# Removes every path matched by the ignore patterns from +files+ and returns
# the resulting list.
#
# files::  Array of path strings.
# ignore:: either an object responding to +read+ (an open File, StringIO, ...)
#          or the path of an ignore file. When the path does not exist,
#          +files+ is returned unchanged.
#
# Patterns are newline separated and fall into three groups:
# * no slash        -> File.fnmatch? against the whole path
# * trailing slash  -> matched against File.dirname of each path
# * any other slash -> matched against the path using File::FNM_PATHNAME
def self.filter_files files, ignore = DEFAULT_IGNORE
  ignore_paths = if ignore.respond_to? :read then
                   ignore.read
                 elsif File.exist? ignore then # File.exists? was removed in Ruby 3.2
                   File.read ignore
                 end

  return files unless ignore_paths

  with_slash, globs = ignore_paths.split("\n").partition { |p| p.include? "/" }
  dirs, ifiles      = with_slash.partition { |p| p.end_with? "/" }
  dirs              = dirs.map { |s| s.chomp "/" }

  only_paths = File::FNM_PATHNAME

  files.reject { |f|
    dirs.any?     { |i| File.fnmatch?(i, File.dirname(f), only_paths) } ||
      globs.any?  { |i| File.fnmatch?(i, f) } ||
      ifiles.any? { |i| File.fnmatch?(i, f, only_paths) }
  }
end

.load_plugins ⇒ `Object`

Loads all flay plugins. Files must be named “flay_*.rb”.

# File 'lib/flay.rb', line 179

# Loads all flay plugins (gem files named "flay_*.rb", excluding anything
# matching /flay_task/) and returns the Array of plugin names, i.e. the file
# basenames with the "flay_" prefix and ".rb" suffix removed.
#
# The list is memoized in @@plugins, so the search runs only once.
#
# A plugin that raises LoadError is skipped with a warning. Any other
# StandardError is reported on stderr and whatever was collected so far
# (possibly an empty Array) is returned. The method never returns nil, so
# callers can safely do <tt>["rb"] + Flay.load_plugins</tt>.
def self.load_plugins
  unless defined? @@plugins then
    @@plugins = []

    plugins = Gem.find_files("flay_*.rb").reject { |p| p =~ /flay_task/ }

    plugins.each do |plugin|
      plugin_name = File.basename(plugin, ".rb").sub(/^flay_/, "")
      next if @@plugins.include? plugin_name
      begin
        load plugin
        @@plugins << plugin_name
      rescue LoadError => e
        warn "error loading #{plugin.inspect}: #{e.message}. skipping..."
      end
    end
  end
  @@plugins
rescue => e
  # best effort: plugin discovery must not take the whole run down
  warn "error loading flay plugins: #{e.message}"
  defined?(@@plugins) ? @@plugins : []
end

.parse_options(args = ARGV) ⇒ `Object`

Process options in args, defaulting to ARGV.

# File 'lib/flay.rb', line 48

# Parses command-line switches out of +args+ (default ARGV) and returns the
# resulting options Hash, starting from default_options.
#
# +args+ is modified in place by OptionParser#parse!. Each known extension
# ("rb" plus the plugin names) may add its own switches through an
# "options_<ext>" method when one is defined. A parse error aborts the
# process with the error message followed by the usage text.
def self.parse_options args = ARGV
  settings = self.default_options

  OptionParser.new do |parser|
    parser.banner  = "flay [options] files_or_dirs"
    parser.version = Flay::VERSION

    ["", "Specific options:", ""].each { |line| parser.separator line }

    parser.on("-h", "--help", "Display this help.") do
      puts parser
      exit
    end

    parser.on("-f", "--fuzzy [DIFF]", Integer,
              "Detect fuzzy (copy & paste) duplication (default 1).") do |diff|
      settings[:fuzzy] = diff || 1
    end

    parser.on("-l", "--liberal", "Use a more liberal detection method.") do
      settings[:liberal] = true
    end

    parser.on("-m", "--mass MASS", Integer,
              "Sets mass threshold (default = #{settings[:mass]})") do |mass|
      settings[:mass] = mass.to_i
    end

    parser.on("-#", "Don't number output (helps with diffs)") do |_flag|
      settings[:number] = false
    end

    parser.on("-v", "--verbose", "Verbose. Show progress processing files.") do
      settings[:verbose] = true
    end

    parser.on("-o", "--only NODE", String, "Only show matches on NODE type.") do |node|
      settings[:only] = node.to_sym
    end

    parser.on("-d", "--diff", "Diff Mode. Display N-Way diff for ruby.") do
      settings[:diff] = true
    end

    parser.on("-s", "--summary", "Summarize. Show flay score per file only.") do
      settings[:summary] = true
    end

    parser.on("-t", "--timeout TIME", Integer,
              "Set the timeout. (default = #{settings[:timeout]})") do |seconds|
      settings[:timeout] = seconds.to_i
    end

    known = ["rb"] + Flay.load_plugins

    parser.separator ""
    parser.separator "Known extensions: #{known.join(", ")}"

    # let every extension register its own switches
    known.each do |ext|
      hook = "options_#{ext}"
      send hook, parser, settings if self.respond_to?(hook)
    end

    begin
      parser.parse! args
    rescue => err
      abort "#{err}\n\n#{parser}"
    end
  end

  settings
end

Instance Method Details

#analyze(filter = nil) ⇒ `Object`

Prune, find identical nodes, and update masses.

# File 'lib/flay.rb', line 256

# Prunes the buckets, records which buckets hold identical nodes, refreshes
# the masses, and returns an Array of Item records ordered by descending
# mass (ties broken by file, line and node type of the bucket's first node).
#
# filter:: when given, only buckets whose first node's type equals it are
#          reported.
def analyze filter = nil
  self.prune

  self.hashes.each do |hash, nodes|
    lead = nodes.first
    identical[hash] = nodes[1..-1].all? { |other| other == lead }
  end

  update_masses

  ranked = masses.sort_by do |hash, mass|
    lead = hashes[hash].first
    [-mass, lead.file, lead.line, lead.first.to_s]
  end

  ranked.map { |hash, mass|
    nodes = hashes[hash]
    lead  = nodes.first

    next if filter && !(lead.first == filter)

    # identical buckets carry a "*N" bonus marker
    bonus = identical[hash] ? "*#{nodes.size}" : nil

    locs = nodes.sort_by { |node| [node.file, node.line] }.map { |node|
      Location[node.file, node.line, (:fuzzy if node.modified?)]
    }

    Item[hash, lead.first, bonus, mass, locs]
  }.compact
end

#collapse_and_label(ary) ⇒ `Object`

:nodoc:

# File 'lib/flay.rb', line 474

# Walks the given line lists in parallel. A row whose lines are all equal
# collapses to that line indented by three spaces; otherwise the row becomes
# an Array with every non-empty line prefixed by "<group>: ".
def collapse_and_label ary # :nodoc:
  head = ary[0]
  rest = ary[1..-1]

  head.zip(*rest).map do |row|
    next "   #{row.first}" if row.uniq.size == 1

    row.reject(&:empty?).map { |line| "#{line.group}: #{line}" }
  end
end

#n_way_diff(*data) ⇒ `Object`

Output an n-way diff from data. This is only used if --diff is given.

# File 'lib/flay.rb', line 438

# Builds an n-way diff of the given source strings and returns it as one
# newline-joined String.
#
# Each source is split into lines and tagged with a group letter by
# split_and_group. The leading run of comment lines (those starting with "#")
# and the remaining code lines are then collapsed separately, so that the
# comments of all sources line up with each other and the code lines up with
# the code.
#
# A source consisting only of comment lines is handled too: all of its lines
# count as comments and its code part is empty.
def n_way_diff *data
  comments = []
  codes    = []

  split_and_group(data).each do |lines|
    # take_while never yields nil, unlike find_index on an all-comment source
    leading = lines.take_while { |line| line =~ /^#/ }

    comments << leading
    codes    << lines.drop(leading.size)
  end

  comments = collapse_and_label pad_with_empty_strings comments
  codes    = collapse_and_label pad_with_empty_strings codes

  (comments + codes).flatten.join("\n")
end

#pad_with_empty_strings(ary) ⇒ `Object`

:nodoc:

# File 'lib/flay.rb', line 468

# Returns copies of the given arrays, each right-padded with "" so that all
# of them are as long as the longest one.
def pad_with_empty_strings ary # :nodoc:
  widest = ary.map(&:size).max

  ary.map { |row| row + Array.new(widest - row.size, "") }
end

#process(*files) ⇒ `Object`

Process any number of files.

# File 'lib/flay.rb', line 222

# Parses each of +files+ and feeds the result to process_sexp.
#
# The parser is chosen by file extension ("process_<ext>"); files without an
# extension, or with one no parser exists for, are treated as ruby. A file
# whose parser raises (StandardError or SyntaxError) or returns nothing is
# reported on stderr and skipped.
def process(*files) # TODO: rename from process - should act as SexpProcessor
  files.each do |path|
    warn "Processing #{path}" if option[:verbose]

    kind    = File.extname(path).sub(/^\./, "")
    kind    = "rb" if kind.nil? || kind.empty?
    handler = "process_#{kind}"

    unless respond_to? handler then
      warn "  Unknown file type: #{kind}, defaulting to ruby"
      handler = "process_rb"
    end

    begin
      tree = begin
               send handler, path
             rescue => err
               warn "  #{err.message.strip}"
               warn "  skipping #{path}"
               nil
             end

      process_sexp tree if tree
    rescue SyntaxError => err # not a StandardError, so handled separately
      warn "  skipping #{path}: #{err.message}"
    end
  end
end

#process_erb(file) ⇒ `Object`

Process erb and parse the result. Returns the sexp of the parsed ruby.

# File 'lib/flay_erb.rb', line 13

# Reads the erb template +file+, converts it to ruby source through Erubis
# and returns the result of parsing that source. When parsing fails the
# generated ruby is printed to stderr (verbose mode only) and the error is
# re-raised.
def process_erb file
  ruby = Erubis.new(File.read(file)).src

  begin
    RubyParser.new.process ruby, file
  rescue
    warn ruby if option[:verbose]
    raise
  end
end

#process_fuzzy(node, difference) ⇒ `Object`

Process “fuzzy” matches for node. A fuzzy match is a subset of node up to difference elements less than the original.

# File 'lib/flay.rb', line 341

# Registers "fuzzy" variants of +node+: the node's template combined with
# every subset of its code elements that drops between 1 and +difference+
# elements.
#
# Nodes without code, with more than MAX_NODE_SIZE elements, or whose
# average mass exceeds MAX_AVG_MASS are skipped. A variant is only stored if
# it contains a sub-Sexp, reaches the mass threshold, and its bucket has no
# entry from the same file and line yet.
def process_fuzzy node, difference
  return unless node.has_code?

  avg_mass = node.mass / node.size
  return if node.size > MAX_NODE_SIZE || avg_mass > MAX_AVG_MASS

  tmpl, code = node.split_code
  tmpl.modified = true

  (code.size - 1).downto(code.size - difference) do |count|
    code.combination(count) do |subset|
      candidate = tmpl + subset

      next unless candidate.any? { |sub| Sexp === sub }
      next if candidate.mass < self.mass_threshold

      bucket = self.hashes[candidate.structural_hash]

      # they're already structurally similar, don't bother adding another
      next if bucket.any? { |known|
        known.file == candidate.file && known.line == candidate.line
      }

      bucket << candidate
    end
  end
end

#process_rb(file) ⇒ `Object`

Parse a ruby file and return the sexp.

– TODO: change the system and rename this to parse_rb.

# File 'lib/flay.rb', line 310

# Parses the ruby file +file+ (read in binary mode) with RubyParser, passing
# option[:timeout] along, and returns the parser's result. On Timeout::Error
# a warning is printed and the result of +warn+ is returned instead.
def process_rb file
  source = File.binread(file)
  RubyParser.new.process(source, file, option[:timeout])
rescue Timeout::Error
  warn "TIMEOUT parsing #{file}. Skipping."
end

#process_sexp(pt) ⇒ `Object`

Process a sexp pt.

# File 'lib/flay.rb', line 321

# Walks every sub-node of +pt+ and files each one that contains at least one
# nested Sexp and reaches the mass threshold into the bucket keyed by its
# structural hash. With the :fuzzy option set, fuzzy variants of the node
# are registered as well.
def process_sexp pt
  pt.deep_each do |node|
    if node.any? { |child| Sexp === child } then
      unless node.mass < self.mass_threshold then
        self.hashes[node.structural_hash] << node

        fuzz = option[:fuzzy]
        process_fuzzy node, fuzz if fuzz
      end
    end
  end
end

#prune ⇒ `Object`

Prunes nodes that aren’t relevant to analysis or are already covered by another node.

# File 'lib/flay.rb', line 371

# Drops buckets that cannot be reported (a single node, or nothing but
# modified/fuzzy nodes) and then hands over to the liberal or conservative
# pruning strategy, depending on option[:liberal].
def prune
  # prune trees that aren't duped at all, or that only hold fuzzy copies
  self.hashes.delete_if do |_, nodes|
    nodes.size == 1 || nodes.all?(&:modified?)
  end

  if option[:liberal] then
    prune_liberally
  else
    prune_conservatively
  end
end

#prune_conservatively ⇒ `Object`

Conservative prune. Remove any bucket that is known to contain a subnode element of a node in another bucket.

# File 'lib/flay.rb', line 385

# Conservative prune: collects the structural sub-hashes of the first node
# of every bucket and deletes each bucket whose key is among them, so only
# the biggest matching trees remain.
def prune_conservatively
  covered = {}

  # extract all subtree hashes from the first node of each bucket
  self.hashes.each_value do |nodes|
    nodes.first.all_structural_subhashes.each { |subhash| covered[subhash] = true }
  end

  # nuke subtrees so we show the biggest matching tree possible
  self.hashes.delete_if { |hash, _| covered[hash] }
end

#prune_liberally ⇒ `Object`

Liberal prune. Remove any element from a bucket that is known to be a subnode of another node. Removed by identity.

# File 'lib/flay.rb', line 403

# Liberal prune: removes from each bucket the individual nodes known to be a
# sub-node of another stored node, then deletes every bucket left with at
# most one node.
def prune_liberally
  update_masses

  doomed = Hash.new { |store, key| store[key] = [] }

  # record each subtree by subhash, but skip if subtree mass > parent mass
  self.hashes.each_value do |nodes|
    nodes.each do |node|
      top_mass = self.masses[node.structural_hash]

      node.deep_each do |subnode|
        subhash  = subnode.structural_hash
        sub_mass = self.masses[subhash]

        doomed[subhash] << subnode unless sub_mass && sub_mass > top_mass
      end
    end
  end

  # nuke only individual items by object identity
  self.hashes.each { |hash, bucket| bucket.delete_eql doomed[hash] }

  # nuke buckets we happened to fully empty
  self.hashes.delete_if { |_, bucket| bucket.size <= 1 }
end

#report(io = $stdout) ⇒ `Object`

Output the report. Duh.

# File 'lib/flay.rb', line 504

# Writes the analysis report to +io+ (default $stdout).
#
# The total score always comes first. With option[:summary] only the
# per-file scores follow (highest first). Otherwise every item returned by
# analyze is listed with its locations and, with option[:diff], an n-way
# diff of the matching sources.
def report io = $stdout
  items = analyze option[:only]

  io.puts "Total score (lower is better) = #{self.total}"

  if option[:summary] then
    io.puts

    self.summary.sort_by { |_, score| -score }.each do |file, score|
      io.puts format("%8.2f: %s", score, file)
    end

    return
  end

  items.each.with_index(1) do |item, number|
    prefix = format("%d) ", number) if option[:number]
    match  = item.identical? ? "IDENTICAL" : "Similar"

    io.puts
    io.puts format("%s%s code found in %p (mass%s = %d)",
                   prefix, match, item.name, item.bonus, item.mass)

    item.locations.each_with_index do |loc, idx|
      # locations are lettered A, B, C... so the diff can refer to them
      loc_prefix = format("%s: ", (?A.ord + idx).chr) if option[:diff]
      extra      = " (FUZZY)" if loc.fuzzy?
      io.puts format("  %s%s:%d%s", loc_prefix, loc.file, loc.line, extra)
    end

    next unless option[:diff]

    io.puts

    sources = hashes[item.structural_hash].map do |sexp|
      converter = "sexp_to_#{File.extname(sexp.file).sub(/./, "")}"
      self.respond_to?(converter) ? self.send(converter, sexp) : sexp_to_rb(sexp)
    end

    io.puts n_way_diff(*sources)
  end
end

#sexp_to_rb(sexp) ⇒ `Object`

# File 'lib/flay.rb', line 551

# Converts +sexp+ back to ruby source using a memoized Ruby2Ruby instance,
# working on a deep clone of the sexp. ruby2ruby is loaded lazily; when it
# cannot be loaded, an explanatory message String is returned instead.
def sexp_to_rb sexp
  require "ruby2ruby"
rescue LoadError
  "ruby2ruby is required for diff"
else
  (@r2r ||= Ruby2Ruby.new).process sexp.deep_clone
end

#split_and_group(ary) ⇒ `Object`

:nodoc:

# File 'lib/flay.rb', line 458

# Splits every string of +ary+ into its lines and tags each line, through
# its +group+ writer, with a letter identifying the source it came from
# ("A" for the first string, "B" for the second, ...). Returns one Array of
# lines per input string.
def split_and_group ary # :nodoc:
  ary.each_with_index.map do |text, idx|
    label = (?A.ord + idx).chr

    text.scan(/^.*/).each { |line| line.group = label }
  end
end

#summary ⇒ `Object`

Calculate summary scores on a per-file basis. For --summary.

# File 'lib/flay.rb', line 487

# Returns a Hash (default value 0) mapping each file to its score: the mass
# of every bucket is split evenly across the bucket's nodes and each share
# is credited to the file the node came from.
def summary
  masses.each_with_object(Hash.new(0)) do |(hash, mass), scores|
    bucket = hashes[hash]
    share  = mass.to_f / bucket.size

    bucket.each { |sexp| scores[sexp.file] += share }
  end
end

#update_masses ⇒ `Object`

Reset total and recalculate the masses for all nodes in hashes.

# File 'lib/flay.rb', line 294

# Resets total to 0, empties masses and recomputes both from hashes. A
# bucket scores the mass of its first node times the number of nodes in it,
# multiplied by the node count once more when the bucket is flagged as
# identical.
def update_masses
  self.total = 0
  masses.clear

  self.hashes.each do |hash, nodes|
    count = nodes.size
    score = nodes.first.mass * count
    score *= count if identical[hash]

    masses[hash] = score
    self.total  += score
  end
end

Class: Flay

Defined Under Namespace

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(option = nil) ⇒ Flay

Instance Attribute Details

#hashes ⇒ Object (readonly)

#identical ⇒ Object

#mass_threshold ⇒ Object

#masses ⇒ Object

#option ⇒ Object (readonly)

#total ⇒ Object

Class Method Details

.default_options ⇒ Object

.expand_dirs_to_files(*dirs) ⇒ Object

.filter_files(files, ignore = DEFAULT_IGNORE) ⇒ Object

.load_plugins ⇒ Object

.parse_options(args = ARGV) ⇒ Object

Instance Method Details

#analyze(filter = nil) ⇒ Object

#collapse_and_label(ary) ⇒ Object

#n_way_diff(*data) ⇒ Object

#pad_with_empty_strings(ary) ⇒ Object

#process(*files) ⇒ Object

#process_erb(file) ⇒ Object

#process_fuzzy(node, difference) ⇒ Object

#process_rb(file) ⇒ Object

#process_sexp(pt) ⇒ Object

#prune ⇒ Object

#prune_conservatively ⇒ Object

#prune_liberally ⇒ Object

#report(io = $stdout) ⇒ Object

#sexp_to_rb(sexp) ⇒ Object

#split_and_group(ary) ⇒ Object

#summary ⇒ Object

#update_masses ⇒ Object

#initialize(option = nil) ⇒ `Flay`

#hashes ⇒ `Object` (readonly)

#identical ⇒ `Object`

#mass_threshold ⇒ `Object`

#masses ⇒ `Object`

#option ⇒ `Object` (readonly)

#total ⇒ `Object`

.default_options ⇒ `Object`

.expand_dirs_to_files(*dirs) ⇒ `Object`

.filter_files(files, ignore = DEFAULT_IGNORE) ⇒ `Object`

.load_plugins ⇒ `Object`

.parse_options(args = ARGV) ⇒ `Object`

#analyze(filter = nil) ⇒ `Object`

#collapse_and_label(ary) ⇒ `Object`

#n_way_diff(*data) ⇒ `Object`

#pad_with_empty_strings(ary) ⇒ `Object`

#process(*files) ⇒ `Object`

#process_erb(file) ⇒ `Object`

#process_fuzzy(node, difference) ⇒ `Object`

#process_rb(file) ⇒ `Object`

#process_sexp(pt) ⇒ `Object`

#prune ⇒ `Object`

#prune_conservatively ⇒ `Object`

#prune_liberally ⇒ `Object`

#report(io = $stdout) ⇒ `Object`

#sexp_to_rb(sexp) ⇒ `Object`

#split_and_group(ary) ⇒ `Object`

#summary ⇒ `Object`

#update_masses ⇒ `Object`