Class: Bioinform::PWM

Inherits:
Object
  • Object
show all
Defined in:
lib/macroape/counting.rb

Instance Method Summary collapse

Instance Method Details

#count_distribution ⇒ Object



73
74
75
# File 'lib/macroape/counting.rb', line 73

def count_distribution
  @count_distribution ||= count_distribution_after_threshold(worst_score)
end

#count_distribution_after_threshold(threshold) ⇒ Object



63
64
65
66
67
68
69
70
71
# File 'lib/macroape/counting.rb', line 63

def count_distribution_after_threshold(threshold)
  return @count_distribution.select{|score, count| score >= threshold}  if @count_distribution
  scores = { 0 => 1 }
  length.times do |column|
    scores.replace recalc_score_hash(scores, @matrix[column], threshold - best_suffix(column + 1))
    raise 'Hash overflow in PWM::ThresholdByPvalue#count_distribution_after_threshold'  if max_hash_size && scores.size > max_hash_size
  end
  scores
end

#count_distribution_under_pvalue(max_pvalue) ⇒ Object



29
30
31
32
33
34
35
36
37
38
# File 'lib/macroape/counting.rb', line 29

def count_distribution_under_pvalue(max_pvalue)
  cnt_distribution = {}
  look_for_count = max_pvalue * vocabulary_volume
  until cnt_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
    cnt_distribution = count_distribution_after_threshold(threshold_gauss_estimation(max_pvalue))
    max_pvalue *=2 # if estimation counted too small amount of words - try to lower threshold estimation by doubling pvalue
  end

  cnt_distribution
end

#counts_by_thresholds(*thresholds) ⇒ Object



90
91
92
93
94
95
# File 'lib/macroape/counting.rb', line 90

def counts_by_thresholds(*thresholds)
  scores = count_distribution_after_threshold(thresholds.min)
  thresholds.map{ |threshold|
    scores.inject(0.0){|sum,(score,count)|  (score >= threshold) ? sum + count : sum}
  }
end

#max_hash_size(*args) ⇒ Object



9
10
11
12
13
14
15
# File 'lib/macroape/counting.rb', line 9

def max_hash_size(*args)
  case args.size
  when 0 then @max_hash_size
  when 1 then max_hash_size!(args.first)
  else raise ArgumentError, '#max_hash_size method can get 0 or 1 argument'
  end
end

#max_hash_size!(new_max_hash_size) ⇒ Object

Sets or gets the size limit of the calculation hash. It is a defence against overuse of CPU resources by inappropriate data.



4
5
6
7
# File 'lib/macroape/counting.rb', line 4

def max_hash_size!(new_max_hash_size)
  @max_hash_size = new_max_hash_size
  self
end

#pvalue_by_threshold(threshold) ⇒ Object



97
98
99
# File 'lib/macroape/counting.rb', line 97

def pvalue_by_threshold(threshold)
  counts_by_thresholds(threshold).first / vocabulary_volume
end

#recalc_score_hash(scores, column, least_sufficient) ⇒ Object



77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/macroape/counting.rb', line 77

def recalc_score_hash(scores, column, least_sufficient)
  new_scores = Hash.new(0)
  scores.each do |score, count|
    4.times do |letter|
      new_score = score + column[letter]
      if new_score >= least_sufficient
        new_scores[new_score] += count * @background[letter]
      end
    end
  end
  new_scores
end

#threshold(pvalue) ⇒ Object



17
18
19
# File 'lib/macroape/counting.rb', line 17

def threshold(pvalue)
  thresholds(pvalue){|_, thresh, _| return thresh }
end

#thresholds(*pvalues) ⇒ Object



21
22
23
24
25
26
27
# File 'lib/macroape/counting.rb', line 21

def thresholds(*pvalues)
  thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
    threshold = thresholds.begin + 0.1 * (thresholds.end - thresholds.begin)
    real_pvalue = counts.end.to_f / vocabulary_volume
    yield pvalue, threshold, real_pvalue
  end
end

#thresholds_by_pvalues(*pvalues) ⇒ Object

Return value: a hash {pvalue => [thresholds, counts]}, where thresholds = left_threshold .. right_threshold (left_threshold < right_threshold) and counts = left_count .. right_count (left_count > right_count).



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/macroape/counting.rb', line 44

def thresholds_by_pvalues(*pvalues)
  sorted_scores = count_distribution_under_pvalue(pvalues.max).sort.reverse
  scores = sorted_scores.map{|score,count| score}
  counts = sorted_scores.map{|score,count| count}
  partial_sums = counts.partial_sums

  results = {}

  pvalue_counts = pvalues.sort.collect_hash{|pvalue| [pvalue, pvalue * vocabulary_volume] }
  pvalue_counts.map do |pvalue,look_for_count|
    ind = partial_sums.index{|sum| sum >= look_for_count}
    minscore, count_at_minscore = scores[ind], partial_sums[ind]
    maxscore, count_at_maxscore = ind > 0  ?  [ scores[ind-1],  partial_sums[ind-1] ]  :  [ best_score + 1.0, 0.0 ]
    results[pvalue] = [(minscore .. maxscore), (count_at_minscore .. count_at_maxscore)]
  end

  results
end