Module: Annotations

Defined in: lib/MARQ/annotations.rb

Defined Under Namespace

Modules: GO, UMLS

Constant Summary collapse

# Bin sizes used to group terms by their number of hits before computing
# permutation p-values. Entries are strings; consumers convert them with to_i.
# Frozen so the shared list cannot be modified by accident.
RANK_SIZE_BINS = %w(1 2 3 4 5 7 10 15 20 30 40 50 65 80 100 125 150 175 200 250 300 350 400 450 500 600 700 800 900 1000 1500 2000 2500 3000).freeze

Class Method Summary collapse

Class Method Details

.annotations(scores, type, pvalue = 0.05, algorithm = :rank) ⇒ `Object`

# File 'lib/MARQ/annotations.rb', line 295

# Collects the annotation terms of every scored experiment and computes term
# enrichment over them.
#
# scores::    Hash of experiment name => {:score => ..., :pvalue => ...}.
#             Each name must match /^(.*?): /; the captured prefix is the
#             dataset and the stripped remainder is the signature name.
# type::      annotation type. A "_direct" or "_inverse" suffix is stripped and
#             selects the "<type>_up" / "<type>_down" annotation directories
#             according to the sign of each experiment's score.
# pvalue::    experiments with info[:pvalue] <= pvalue are considered relevant
#             (only used by the hypergeometric algorithm).
# algorithm:: :rank uses enrichment_rank; anything else uses
#             enrichment_hypergeometric.
#
# Returns [merged_annotations, terms]: merged_annotations maps each experiment
# to its dataset terms followed by its signature terms, and terms is whatever
# the selected enrichment method returns.
def self.annotations(scores, type, pvalue = 0.05, algorithm = :rank)
  annot = {}
  relevant = []

  # NOTE(review): this check runs before the _direct/_inverse suffix is
  # stripped, so "Words_direct" gets the non-Words cutoff - confirm intent.
  hi = (type == "Words") ? 0.05 : 0.5
  dict_options = {:low => 0, :hi => hi, :limit => 100000}

  side = nil
  if type =~ /^(.*)_direct$/
    side = :direct
    type = $1
  elsif type =~ /^(.*)_inverse$/
    side = :inverse
    type = $1
  end

  # Several experiments share one annotation file per dataset; parse each once.
  terms_cache = {}
  scores.each{|experiment, info|
    match   = experiment.match(/^(.*?): /)
    dataset = match[1]
    name    = match.post_match.strip

    subdir =
      if side.nil?
        type
      elsif (side == :direct && info[:score] > 0) || (side == :inverse && info[:score] < 0)
        type + '_up'
      else
        type + '_down'
      end
    term_file = File.join(MARQ.datadir, MARQ.platform_type(dataset).to_s, 'annotations', subdir, dataset)

    if File.exist? term_file
      # Block form closes the file handle once the YAML has been parsed.
      terms_cache[term_file] ||= File.open(term_file) {|file| YAML::load(file) }
      terms = terms_cache[term_file]
      annot[experiment] = {:dataset => (terms[:dataset] || []), :signature => (terms[name] || [])}
    else
      annot[experiment] = {:dataset => [], :signature => []}
    end

    relevant << experiment if info[:pvalue] <= pvalue
  }

  # The enrichment methods read their dictionary settings from
  # options[:dict_options], so the settings have to be wrapped under that key;
  # passed bare they are ignored and the defaults are always used.
  enrichment_options = {:dict_options => dict_options}

  if algorithm == :rank
    ranks = scores.sort{|a,b| compare(a[1],b[1]) }.collect{|p| p[0]}
    terms = enrichment_rank(annot, ranks, enrichment_options)
  else
    terms = enrichment_hypergeometric(annot, relevant, enrichment_options)
  end

  merged_annotations = {}
  annot.each{|key, info|
    merged_annotations[key] = info[:dataset] + info[:signature]
  }
  [merged_annotations, terms]
end

.compare(a, b) ⇒ `Object`

# File 'lib/MARQ/annotations.rb', line 100

# Ordering used to rank experiments: smaller :pvalue first; on equal p-values
# the entry with the larger absolute :score comes first.
#
# a, b:: Hashes with :pvalue and :score entries.
#
# Returns -1, 0 or 1, or nil when the p-values are neither ordered nor equal.
def self.compare(a, b)
  left  = a[:pvalue]
  right = b[:pvalue]

  return -1 if left < right
  return 1 if left > right
  return nil unless left == right

  # Tie on p-value: note the swapped operands, which sort larger |score| first.
  b[:score].abs <=> a[:score].abs
end

.enrichment_hypergeometric(annotations, relevant, options) ⇒ `Object`

# File 'lib/MARQ/annotations.rb', line 227

# Hypergeometric enrichment of annotation terms among the relevant experiments.
#
# annotations:: Hash of experiment => {:dataset => [terms], :signature => [terms]}
# relevant::    list of experiment names considered significant
# options::     Hash; only :dict_options is read and it is forwarded to
#               Dictionary::TF_IDF#best (defaults to low 0, hi 0.5, limit 100000)
#
# Returns a Hash of term => {:relevant => n, :total => n, :pvalue => p}, where
# :total counts occurrences of the term across experiments, :relevant counts
# those occurring in relevant experiments, and :pvalue is the result of
# Annotations.hypergeometric. Dataset-level terms are counted only for the
# first experiment seen of each dataset.
def self.enrichment_hypergeometric(annotations, relevant, options)
  dict_options = {
    :dict_options => {:low => 0, :hi => 0.5, :limit => 100000}
  }.merge(options)[:dict_options]

  # Register the term counts of every experiment in the dictionary. This loop
  # used to iterate over `ranks`, which does not exist in this method and made
  # every call fail with NameError.
  dict = Dictionary::TF_IDF.new
  annotations.each{|experiment, info|
    term_count = {}
    (info[:signature] + info[:dataset]).each{|term|
      term_count[term] ||= 0
      term_count[term] += 1
    }
    dict.add(term_count)
  }

  # Only terms selected by the dictionary are tested for enrichment.
  best = dict.best(dict_options).keys

  terms = {}
  found_datasets = []
  annotations.each{|experiment, info|
    is_relevant = relevant.include? experiment
    dataset     = exp2gds experiment

    candidates = info[:signature]
    # Dataset terms are shared by all experiments of a dataset, so they are
    # added only once, with the first experiment of that dataset.
    unless found_datasets.include? dataset
      found_datasets << dataset
      candidates = candidates + info[:dataset]
    end

    candidates.each{|term|
      next if ! best.include? term
      terms[term] ||= {:relevant => 0, :total => 0}
      terms[term][:total]    += 1
      terms[term][:relevant] += 1 if is_relevant
    }
  }

  total = annotations.keys.length
  list  = relevant.length

  terms.each{|term, info|
    info[:pvalue] = Annotations.hypergeometric(total, info[:total], list, info[:relevant])
  }

  terms
end

.enrichment_rank(annotations, ranks, options = {}) ⇒ `Object`

# File 'lib/MARQ/annotations.rb', line 113

# Rank-based enrichment of annotation terms over an ordered experiment list.
#
# annotations:: Hash of experiment => {:dataset => [terms], :signature => [terms]}
# ranks::       experiment names in rank order; an experiment's index is its rank
# options::     Hash; only :dict_options is read and it is forwarded to
#               Dictionary::TF_IDF#best (defaults to low 0, hi 0.5, limit 100000)
#
# Returns a Hash of term => {:score => s, :hits => n, :pvalue => p} for every
# selected term that falls outside the size-1 bin and has a non-negative score.
def self.enrichment_rank(annotations, ranks, options = {})
  defaults = {:dict_options => {:low => 0, :hi => 0.5, :limit => 100000}}
  dict_options = defaults.merge(options)[:dict_options]

  # Register the term counts of every experiment in the dictionary.
  dict = Dictionary::TF_IDF.new
  ranks.each do |experiment|
    entry = annotations[experiment]
    counts = {}
    (entry[:signature] + entry[:dataset]).each do |term|
      counts[term] = (counts[term] || 0) + 1
    end
    dict.add(counts)
  end

  best = dict.best(dict_options).keys

  # Record, for each selected term, the ranks of the experiments carrying it.
  # Dataset terms count only for the first experiment seen of each dataset.
  positions = {}
  seen_datasets = []
  ranks.each_with_index do |experiment, rank|
    entry = annotations[experiment]
    gds = exp2gds(experiment)

    candidates = entry[:signature]
    unless seen_datasets.include?(gds)
      candidates = candidates + entry[:dataset]
      seen_datasets << gds
    end

    candidates.uniq.each do |term|
      (positions[term] ||= []) << rank if best.include?(term)
    end
  end

  bins = {}
  RANK_SIZE_BINS.each { |size| bins[size.to_i] = [] }
  ordered_sizes = bins.keys.sort

  # Score every term that has hits (nil otherwise) and file its index under
  # the largest bin size that does not exceed its number of hits.
  scores = best.each_with_index.map do |term, pos|
    hits = positions[term]
    if hits
      larger = ordered_sizes.index { |size| hits.length < size }
      # Index -1 wraps around to the last bin, as does the no-larger-bin case.
      bin = larger ? ordered_sizes[larger - 1] : ordered_sizes.last
      bins[bin] << pos

      Score::score(hits, ranks.length, 0)[:score]
    end
  end

  # For each non-empty bin (except size 1) compute the p-values of all its
  # terms at once, using the bin size as the permutation list size.
  info = {}
  bins.each do |size, members|
    next if size == 1 || members.empty?

    # These are the actual scores of the terms in the bin.
    bin_scores = members.map { |pos| scores[pos] || 0 }
    pvalues = Score::pvalues(bin_scores, size, 0, ranks.length)

    members.zip(pvalues).each do |pos, pvalue|
      score = scores[pos]
      next if score < 0

      term = best[pos]
      hits = positions[term] ? positions[term].length : 0
      info[term] = {:score => score, :hits => hits, :pvalue => pvalue}
    end
  end

  info
end

.exp2gds(experiment) ⇒ `Object`

# File 'lib/MARQ/annotations.rb', line 95

# Returns the part of the experiment name before its first colon (the shortest
# prefix matched by /(.*?):/), or nil when the name contains no colon.
def self.exp2gds(experiment)
  found = /(.*?):/.match(experiment)
  found && found[1]
end

Module: Annotations

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.annotations(scores, type, pvalue = 0.05, algorithm = :rank) ⇒ Object

.compare(a, b) ⇒ Object

.enrichment_hypergeometric(annotations, relevant, options) ⇒ Object

.enrichment_rank(annotations, ranks, options = {}) ⇒ Object

.exp2gds(experiment) ⇒ Object

.annotations(scores, type, pvalue = 0.05, algorithm = :rank) ⇒ `Object`

.compare(a, b) ⇒ `Object`

.enrichment_hypergeometric(annotations, relevant, options) ⇒ `Object`

.enrichment_rank(annotations, ranks, options = {}) ⇒ `Object`

.exp2gds(experiment) ⇒ `Object`