Module: Discretizer

Includes:: Consistency, Entropy

Included in:: FSelector::BaseDiscrete, FSelector::CFS_d, FSelector::KS_CCBF, FSelector::ReliefF_d, FSelector::Relief_d

Defined in:: lib/fselector/discretizer.rb

Overview

Discretizes continuous features.

Instance Method Summary collapse

#discretize_by_Chi2!(delta = 0.02)
discretize by Chi2 algorithm.
#discretize_by_ChiMerge!(alpha = 0.10)
discretize by ChiMerge algorithm.
#discretize_by_equal_frequency!(n_interval)
discretize by equal-frequency intervals.
#discretize_by_equal_width!(n_interval)
discretize by equal-width intervals.
#discretize_by_MID!
discretize by Multi-Interval Discretization (MID) algorithm.
#discretize_by_TID!
discretize by Three-Interval Discretization (TID) algorithm.

Methods included from Entropy

#get_conditional_entropy, #get_information_gain, #get_joint_entropy, #get_marginal_entropy, #get_symmetrical_uncertainty

Methods included from Consistency

#get_IR, #get_IR_by_count, #get_IR_by_feature, #get_instance_count

Instance Method Details

#discretize_by_Chi2!(delta = 0.02)

Note:

Chi2 also performs some feature reduction: a feature whose discretization ends up with only one interval is removed. Using delta == 0.02 reproduces exactly the same results as the original Chi2 algorithm.

discretize by Chi2 algorithm

ref: Chi2: Feature Selection and Discretization of Numeric Attributes

Parameters:

delta (Float) (defaults to: 0.02) —
upper bound of data inconsistency rate

# File 'lib/fselector/discretizer.rb', line 159

# Discretize every feature in place with the Chi2 algorithm.
#
# Phase 1 re-runs chi2_init/chi2_merge on all features at one shared
# significance level, starting at 0.5 and lowering it by 0.1 for as long as
# the inconsistency rate returned by chi2_get_inconsistency_rate stays
# within +delta+. Phase 2 then lowers the level feature by feature through
# +try_levels+ until a merge exceeds +delta+ or the levels are exhausted.
# Finally, features whose cut-point list has at most one entry are deleted
# from every sample and discretize_at_cutpoints! is applied to the rest.
#
# @param delta [Float] upper bound of data inconsistency rate
def discretize_by_Chi2!(delta=0.02)    
  # degree of freedom equals one less than number of classes     
  df = get_classes.size-1
      
  #
  # Phase 1
  #
  
  sig_level = 0.5
  sig_level0 = sig_level
  
  inst_cnt = get_instance_count
  # NOTE(review): this initial rate is overwritten inside the loop below
  # before it is ever read
  inconsis_rate = get_IR_by_count(inst_cnt)
 
  # f2bs = {
    # :'sepal-length' => [4.4],
    # :'sepal-width' => [2.0],
    # :'petal-length' => [1.0, 3.0, 5.0],
    # :'petal-width' => [0.1, 1.0, 1.7],
  # }
  
  while true
    chisq = pval2chisq(sig_level, df)
    f2bs = {} # cut points
    
    # merge every feature at the current shared significance level
    each_feature do |f|
      bs, cs, qs = chi2_init(f)
      chi2_merge(bs, cs, qs, chisq)
      
      f2bs[f] = bs
    end
    
    inconsis_rate = chi2_get_inconsistency_rate(inst_cnt, f2bs)
    
    if inconsis_rate <= delta
      # NOTE(review): repeated Float subtraction of 0.1 may not land exactly
      # on 0.2, so the stop test below may fire one step later than the
      # comment suggests - confirm
      sig_level -= 0.1
      # NOTE(review): sig_level0 always mirrors sig_level here, so it is not
      # a record of the last level known to be consistent - confirm intent
      sig_level0 = sig_level
      
      break if sig_level0 <= 0.2 # phase 1 stop at level == 0.2
    else # data inconsistency
      break
    end 
  end
  
  #
  # Phase 2
  #
  
  # per-feature significance levels to try, in decreasing order
  try_levels = [0.1, 0.01, 0.001, 1e-4, 
                1e-5, 1e-6, 1e-7, 1e-8, 
                1e-9, 1e-10, 1e-11, 1e-12]           
  mergeble_fs = []
  f2sig_level = {}
  
  # every feature starts phase 2 as mergeable, at the level phase 1 ended on
  each_feature do |f|
    mergeble_fs << f
    f2sig_level[f] = sig_level0
  end
  
  # the cut points found in phase 1 are discarded; phase 2 rebuilds them
  f2bs = {} # cut points
  
  while not mergeble_fs.empty?
    # NOTE(review): mergeble_fs.delete(f) below runs while this each is
    # iterating the same array, so a feature may be skipped within a pass;
    # the enclosing while repeats until the list is empty - confirm intended
    mergeble_fs.each do |f|
      #pp f
      bs, cs, qs = chi2_init(f)
      chisq_now = pval2chisq(f2sig_level[f], df)
      chi2_merge(bs, cs, qs, chisq_now)
      
      # backup
      bs_bak = nil
      if f2bs.has_key? f
        bs_bak = f2bs[f]
      end
      f2bs[f] = bs
      
      inconsis_rate = chi2_get_inconsistency_rate(inst_cnt, f2bs)
      
      if (inconsis_rate <= delta)
        # try next level
        next_level = chi2_decrease_sig_level(f2sig_level[f], try_levels)
        f2sig_level[f] = next_level
        
        # a falsy next_level is treated as "no level left to try"
        if not next_level # we've tried all levels
          mergeble_fs.delete(f)
        else
          # f2bs[f] already holds bs from the assignment above
          f2bs[f] = bs # record cut points for this level
        end
      else # cause more inconsistency
        # when there is no backup (first attempt for f), the cut points
        # just computed are left in place
        f2bs[f] = bs_bak if bs_bak # restore last cut points
        mergeble_fs.delete(f) # not mergeble
      end
    end
  end
  #pp f2bs
  #pp f2sig_level
  
  # if there is only one interval, remove this feature
  each_sample do |k, s|
    s.delete_if { |f, v| f2bs[f].size <= 1 }
  end
  
  # discretize according to each feature's cut points
  discretize_at_cutpoints!(f2bs)
end

#discretize_by_ChiMerge!(alpha = 0.10)

Note:

data structure will be altered

discretize by ChiMerge algorithm

ref: ChiMerge: Discretization of Numeric Attributes

Parameters:

alpha (Float) (defaults to: 0.10) —
confidence level; the smaller the value, the fewer the intervals

# File 'lib/fselector/discretizer.rb', line 72

# Discretize every feature in place with the ChiMerge algorithm.
#
# Each distinct value of a feature starts as its own interval. The pair of
# adjacent intervals with the smallest chi-squared value (as computed by
# chisq_calc) is merged repeatedly until no pair is left or the smallest
# value exceeds the threshold obtained from pval2chisq(alpha, df).
#
# @note data structure will be altered
# @param alpha [Float] confidence level; the smaller the value, the fewer
#   the intervals
def discretize_by_ChiMerge!(alpha=0.10)
  # degrees of freedom: number of classes minus one
  df = get_classes.size - 1
  threshold = pval2chisq(alpha, df)

  # template holding a zero count for every class
  zero_counts = {}
  each_class { |k| zero_counts[k] = 0.0 }

  f2bs = {}
  each_feature do |f|
    # 1a. one interval per distinct value, lower bounds in ascending order
    bounds = get_feature_values(f).uniq.sort
    counts = bounds.map { zero_counts.dup }

    # 1b. per-class sample counts for each interval
    each_sample do |k, s|
      next unless s.has_key?(f)
      slot = bounds.rindex { |lower| s[f] >= lower }
      counts[slot][k] += 1.0
    end

    # 1c. chi-squared value of every pair of adjacent intervals
    chis = counts.each_cons(2).map { |left, right| chisq_calc(left, right) }

    # 2. merge the first lowest-scoring pair until the stop criterion holds
    until chis.empty? || chis.min > threshold
      lowest = chis.min
      chis.each_with_index do |q, i|
        next unless q == lowest

        # class counts of the interval that replaces i and i+1
        merged = {}
        each_class { |k| merged[k] = counts[i][k] + counts[i + 1][k] }

        # refresh the scores of the neighbours that now border the merged one
        chis[i - 1] = chisq_calc(counts[i - 1], merged) if i > 0
        chis[i + 1] = chisq_calc(merged, counts[i + 2]) if i + 1 < chis.size

        # drop the swallowed boundary, collapse the two count hashes into
        # one, and remove the score of the pair that was just merged
        bounds.delete_at(i + 1)
        counts[i, 2] = [merged]
        chis.delete_at(i)

        # one merge per pass; the scores are re-examined from scratch
        break
      end
    end

    # 3. what is left are the final boundaries of this feature
    f2bs[f] = bounds
  end

  # discretize according to each feature's boundaries
  discretize_at_cutpoints!(f2bs)
end

#discretize_by_equal_frequency!(n_interval)

Note:

data structure will be altered

discretize by equal-frequency intervals

Parameters:

n_interval (Integer) —
desired number of intervals

# File 'lib/fselector/discretizer.rb', line 43

# Discretize every feature in place into equal-frequency intervals.
#
# The sorted values of a feature are walked in strides of
# (number of values / n_interval).round, and the first value of each stride
# becomes a cut point. Because the stride is rounded, the number of
# intervals actually produced can differ from +n_interval+.
#
# @note data structure will be altered
# @param n_interval [Integer] desired number of intervals; values below 1
#   are treated as 1
def discretize_by_equal_frequency!(n_interval)
  n_interval = 1 if n_interval < 1 # at least one interval

  # first determine the boundaries
  f2bs = Hash.new { |h, k| h[k] = [] }
  each_feature do |f|
    fvs = get_feature_values(f).sort
    # number of samples in each interval; clamped to one because a large
    # n_interval rounds the stride down to 0, which used to raise
    # ZeroDivisionError in the modulo test
    ns = [(fvs.size.to_f / n_interval).round, 1].max
    # the first value of every stride opens a new interval
    fvs.each_slice(ns) { |chunk| f2bs[f] << chunk.first }
  end

  # then discretize based on cut points
  discretize_at_cutpoints!(f2bs)
end

#discretize_by_equal_width!(n_interval)

Note:

data structure will be altered

discretize by equal-width intervals

Parameters:

n_interval (Integer) —
desired number of intervals

# File 'lib/fselector/discretizer.rb', line 16

# Discretize every feature in place into equal-width intervals.
#
# The range between the smallest and largest value of a feature is split
# into +n_interval+ intervals of identical width; the lower bound of each
# interval becomes a cut point.
#
# @note data structure will be altered
# @param n_interval [Integer] desired number of intervals; values below 1
#   are treated as 1
def discretize_by_equal_width!(n_interval)
  n_interval = 1 if n_interval < 1 # at least one interval

  # first determine the boundary of each feature
  f2bs = Hash.new { |h, k| h[k] = [] }
  each_feature do |f|
    fvs = get_feature_values(f)
    # a feature without any value has no range to split
    next if fvs.empty?

    fmin, fmax = fvs.minmax
    # fdiv forces floating-point division: with Integer feature values a
    # plain "/" truncates the width and yields unequal intervals
    delta = (fmax - fmin).fdiv(n_interval)

    n_interval.times { |i| f2bs[f] << fmin + i * delta }
  end

  # then discretize based on cut points
  discretize_at_cutpoints!(f2bs)
end

#discretize_by_MID!

Note:

no missing feature value is allowed, and data structure will be altered

discretize by Multi-Interval Discretization (MID) algorithm

ref: Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning

# File 'lib/fselector/discretizer.rb', line 272

# Discretize every feature in place with the Multi-Interval
# Discretization (MID) algorithm.
#
# For each feature the samples are ordered by feature value, the values
# at which the class label changes between neighbouring samples are
# collected as candidate boundaries, and the partition helper is asked to
# fill an array with the accepted cut points.
#
# @note no missing feature value is allowed (the process is aborted
#   otherwise), and data structure will be altered
def discretize_by_MID!
  cut_points_by_feature = {}

  each_feature do |f|
    labels = get_class_labels
    values = get_feature_values(f)

    # every sample must carry a value for this feature
    n = labels.size
    abort "[#{__FILE__}@#{__LINE__}]: \n"+
          "  missing feature value is not allowed!" unless n == values.size

    # reorder labels and values together by ascending feature value
    order = (0...n).to_a.sort { |i, j| values[i] <=> values[j] }
    labels = labels.values_at(*order)
    values = values.values_at(*order)

    # a candidate boundary sits wherever two neighbouring samples of the
    # sorted sequence belong to different classes (see original reference)
    candidates = (0...(n - 1)).select { |i| labels[i] != labels[i + 1] }
                              .map { |i| values[i] }
                              .uniq

    # main algorithm: the helper appends the accepted cut points to chosen
    chosen = []
    partition(labels, values, candidates, chosen)

    cut_points_by_feature[f] = chosen.sort
  end

  # discretize based on cut points
  discretize_at_cutpoints!(cut_points_by_feature)
end

#discretize_by_TID!

Note:

no missing feature value is allowed, and data structure will be altered

discretize by Three-Interval Discretization (TID) algorithm

ref: Filter versus wrapper gene selection approaches in DNA microarray domains

# File 'lib/fselector/discretizer.rb', line 320

# Discretize every feature in place with the Three-Interval
# Discretization (TID) algorithm.
#
# Candidate cut points are the midpoints between adjacent distinct values
# of a feature. Every ordered pair (h1 < h2) of candidates splits the
# samples into three intervals; the pair with the largest score
# sum(n_k * log2(n_k / n_interval)) over intervals and classes is kept.
# The score is at most 0.0, so the search stops as soon as 0.0 is reached.
#
# A feature with fewer than three distinct values offers no pair to test.
# Its candidate list (zero or one cut point) is then used as is, instead of
# raising NoMethodError on nil or emitting [nil, nil].
# NOTE(review): confirm discretize_at_cutpoints! accepts fewer than two
# cut points when called with its second argument set to true.
#
# @note no missing feature value is allowed (the process is aborted
#   otherwise), and data structure will be altered
def discretize_by_TID!
  # cut points for each feature
  f2cp = {}

  each_feature do |f|
    cv = get_class_labels
    fv = get_feature_values(f)

    n = cv.size
    abort "[#{__FILE__}@#{__LINE__}]: \n"+
          "  missing feature value is not allowed!" if n != fv.size

    # sort cv and fv according to ascending order of fv
    sis = (0...n).to_a.sort { |i, j| fv[i] <=> fv[j] }
    cv = cv.values_at(*sis)
    fv = fv.values_at(*sis)

    # candidate cut points are the means of two adjacent distinct values
    bs = fv.uniq.each_cons(2).map { |lo, hi| (lo + hi) / 2.0 }

    # test each pair of cut points
    s_best, h1_best, h2_best = nil, nil, nil

    bs.each_with_index do |h1, i|
      # size of the lowest interval does not depend on h2
      n_h1 = fv.count { |v| v < h1 }.to_f

      bs.drop(i + 1).each do |h2|
        n_h1_h2 = fv.count { |v| v > h1 && v < h2 }.to_f
        n_h2 = fv.count { |v| v > h2 }.to_f

        s = 0.0

        each_class do |k|
          n_h1_k = (0...n).count { |x| fv[x] < h1 && cv[x] == k }.to_f
          n_h1_h2_k = (0...n).count { |x| fv[x] > h1 && fv[x] < h2 && cv[x] == k }.to_f
          n_h2_k = (0...n).count { |x| fv[x] > h2 && cv[x] == k }.to_f

          # empty class/interval cells contribute nothing (avoids log2(0))
          s += n_h1_k * Math.log2(n_h1_k / n_h1) unless n_h1_k.zero?
          s += n_h1_h2_k * Math.log2(n_h1_h2_k / n_h1_h2) unless n_h1_h2_k.zero?
          s += n_h2_k * Math.log2(n_h2_k / n_h2) unless n_h2_k.zero?
        end

        # keep the first pair that reaches the highest score
        if s_best.nil? || s > s_best
          s_best, h1_best, h2_best = s, h1, h2
        end

        break if s_best.zero? # allow early termination at maximum value 0.0
      end

      # s_best is still nil when no pair has been tested yet
      break if s_best && s_best.zero? # allow early termination at maximum value 0.0
    end

    # without any tested pair fall back to the (0 or 1) candidates
    f2cp[f] = s_best ? [h1_best, h2_best] : bs
  end

  # discretize based on cut points
  discretize_at_cutpoints!(f2cp, true)
end