Class: Matrix

Inherits:
Object
  • Object
show all
Extended by:
Resource
Defined in:
lib/rbbt/matrix.rb,
lib/rbbt/matrix/barcode.rb,
lib/rbbt/matrix/differential.rb,
lib/rbbt/expression_old/matrix.rb

Constant Summary collapse

MATRIX_DIR =
Matrix.root.find

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(data_file, labels, value_type, format, organism = nil, identifiers = nil) ⇒ Matrix

Returns a new instance of Matrix.



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/rbbt/matrix.rb', line 14

# Builds a matrix description around a data file.
#
# The header of +data_file+ is only parsed when +format+ or +organism+ is
# missing, and at most once even when both defaults are needed.
#
# @param data_file   file holding the values; passed to TSV.parse_header when a default is needed
# @param labels      sample annotations, stored as given
# @param value_type  stored as given
# @param format      key field; defaults to the header's key field, or "ID"
# @param organism    organism code; defaults to the header's namespace,
#                    or Organism.default_code("Hsa")
# @param identifiers stored as given
def initialize(data_file, labels, value_type, format, organism=nil, identifiers=nil)
  @data_file = data_file
  @labels = labels
  @value_type = value_type
  @identifiers = identifiers

  # Shared, lazily-filled header cache for both defaults below.
  header = nil
  read_header = lambda { header ||= TSV.parse_header(@data_file) }

  @format = format || read_header.call.key_field || "ID"
  @organism = organism || read_header.call.namespace || Organism.default_code("Hsa")
end

Class Attribute Details

.matrix_dir ⇒ Object

Returns the value of attribute matrix_dir.



7
8
9
# File 'lib/rbbt/matrix.rb', line 7

# Reader for the matrix_dir attribute: returns whatever is stored in @matrix_dir.
def matrix_dir; @matrix_dir; end

Instance Attribute Details

#channel ⇒ Object

Returns the value of attribute channel.



28
29
30
# File 'lib/rbbt/expression_old/matrix.rb', line 28

# Reader for the channel attribute: returns whatever is stored in @channel.
def channel; @channel; end

#data ⇒ Object

Returns the value of attribute data.



28
29
30
# File 'lib/rbbt/expression_old/matrix.rb', line 28

# Reader for the data attribute: returns whatever is stored in @data.
def data; @data; end

#data_file ⇒ Object

Returns the value of attribute data_file.



13
14
15
# File 'lib/rbbt/matrix.rb', line 13

# Reader for the data_file attribute: returns whatever is stored in @data_file.
def data_file; @data_file; end

#format ⇒ Object

Returns the value of attribute format.



13
14
15
# File 'lib/rbbt/matrix.rb', line 13

# Reader for the format attribute: returns whatever is stored in @format.
def format; @format; end

#identifiers ⇒ Object

Returns the value of attribute identifiers.



13
14
15
# File 'lib/rbbt/matrix.rb', line 13

# Reader for the identifiers attribute: returns whatever is stored in @identifiers.
def identifiers; @identifiers; end

#key_field ⇒ Object

Returns the value of attribute key_field.



28
29
30
# File 'lib/rbbt/expression_old/matrix.rb', line 28

# Reader for the key_field attribute: returns whatever is stored in @key_field.
def key_field; @key_field; end

#labels ⇒ Object

Returns the value of attribute labels.



13
14
15
# File 'lib/rbbt/matrix.rb', line 13

# Reader for the labels attribute: returns whatever is stored in @labels.
def labels; @labels; end

#log2 ⇒ Object

Returns the value of attribute log2.



28
29
30
# File 'lib/rbbt/expression_old/matrix.rb', line 28

# Reader for the log2 attribute: returns whatever is stored in @log2.
def log2; @log2; end

#organism ⇒ Object

Returns the value of attribute organism.



13
14
15
# File 'lib/rbbt/matrix.rb', line 13

# Reader for the organism attribute: returns whatever is stored in @organism.
def organism; @organism; end

#samples ⇒ Object

Returns the value of attribute samples.



32
33
34
# File 'lib/rbbt/matrix.rb', line 32

# Sample names of the matrix.
#
# Uses the memoized @samples when it is set; otherwise takes the +fields+ of
# the header parsed from @data_file and remembers them.
def samples
  return @samples if @samples

  @samples = TSV.parse_header(@data_file).fields
end

#value_type ⇒ Object

Returns the value of attribute value_type.



13
14
15
# File 'lib/rbbt/matrix.rb', line 13

# Reader for the value_type attribute: returns whatever is stored in @value_type.
def value_type; @value_type; end

Class Method Details

.geo_matrix_for(gds, key_field = nil, organism = nil) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/rbbt/expression_old/matrix.rb', line 15

# Builds a Matrix for a GEO dataset.
#
# Looks up the dataset's values and samples files, reads its 'info.yaml' to
# find the platform, and uses the platform's codes file as identifiers.
# The log2 flag is true only when the dataset's :value_type is "count".
#
# @param gds       GEO dataset key, used to index GEO
# @param key_field passed through to Matrix.new
# @param organism  passed through to Matrix.new
# @return [Matrix]
def self.geo_matrix_for(gds, key_field = nil, organism = nil)
  values_file  = GEO[gds].values.produce.find
  samples_file = GEO[gds].samples.produce.find

  info       = GEO[gds]['info.yaml'].produce.yaml
  codes_file = GEO[info[:platform]].codes.produce.find

  needs_log2 = "count" == info[:value_type]

  Matrix.new(values_file, codes_file, samples_file, key_field, organism, needs_log2)
end

Instance Method Details

#activity_cluster(outfile, factor = 2) ⇒ Object



15
16
17
18
19
20
21
22
23
24
# File 'lib/rbbt/matrix/barcode.rb', line 15

# Runs the R function rbbt.GE.activity_cluster from 'barcode.R' on this
# matrix's data file.
#
# The directory of +outfile+ is created first when it does not exist.
#
# @param outfile path handed to R as the output file; may be nil
# @param factor  accepted for interface compatibility; the R call below is
#                given +value_type+ instead and does not use it
# @return whatever R.run returns
def activity_cluster(outfile, factor = 2)

  # File.exists? was removed in Ruby 3.2; File.exist? is the supported spelling.
  FileUtils.mkdir_p File.dirname(outfile) unless outfile.nil? || File.exist?(File.dirname(outfile))
  cmd =<<-EOF
source('#{Rbbt.share.R['barcode.R'].find}')
rbbt.GE.activity_cluster(#{ R.ruby2R self.data_file }, #{ R.ruby2R outfile }, #{R.ruby2R value_type})
  EOF

  R.run(cmd)
end

#average_label(value, field = nil) ⇒ Object



69
70
71
72
73
# File 'lib/rbbt/expression_old/matrix.rb', line 69

# Averages the samples annotated with +value+.
#
# Chains three steps: find the samples matching +value+ (optionally within
# +field+), drop the ones remove_missing filters out, and hand the rest to
# average_samples.
#
# @return the result of average_samples
def average_label(value, field = nil)
  average_samples(remove_missing(find_samples(value, field)))
end

#average_samples(samples) ⇒ Object



51
52
53
54
55
56
57
# File 'lib/rbbt/expression_old/matrix.rb', line 51

# Persists the average of +samples+ and returns the path of the result.
#
# The path is derived from the matrix file and the sample list, under the
# 'averaged_samples' directory of Matrix::MATRIX_DIR. The averaging itself is
# delegated to Expression.average_samples inside Persist.persist.
#
# @param samples sample names to average
# @return the persistence path
def average_samples(samples)
  Persist.persistence_path(
    matrix_file,
    {:dir => File.join(Matrix::MATRIX_DIR, 'averaged_samples')},
    {:samples => samples}
  ).tap do |path|
    Persist.persist(data, :tsv, :file => path, :no_load => true, :check => [matrix_file]) do
      Expression.average_samples(matrix_file, samples)
    end
  end
end

#barcode(outfile, factor = 2) ⇒ Object



4
5
6
7
8
9
10
11
12
13
# File 'lib/rbbt/matrix/barcode.rb', line 4

# Runs the R function rbbt.GE.barcode from 'barcode.R' on this matrix's data
# file.
#
# The directory of +outfile+ is created first when it does not exist.
#
# @param outfile path handed to R as the output file; may be nil
# @param factor  passed to the R function as its third argument
# @return whatever R.run returns
def barcode(outfile, factor = 2)

  # File.exists? was removed in Ruby 3.2; File.exist? is the supported spelling.
  FileUtils.mkdir_p File.dirname(outfile) unless outfile.nil? || File.exist?(File.dirname(outfile))
  cmd =<<-EOF
source('#{Rbbt.share.R['barcode.R'].find}')
rbbt.GE.barcode(#{ R.ruby2R self.data_file }, #{ R.ruby2R outfile }, #{ R.ruby2R factor })
  EOF

  R.run(cmd)
end

#comparison(main, contrast, subsets = nil) ⇒ Object



91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/rbbt/matrix.rb', line 91

# Resolves a main/contrast selection into two lists of sample names.
#
# A selection is either "factor=value", looked up in +subsets+, or a list of
# sample names separated by "|", "," or newlines. When +contrast+ is nil the
# contrast group is the complement of the main group: the other samples of the
# same factor when main was given as "factor=value", otherwise all other
# samples of the matrix.
#
# Names are trimmed and blanks removed before the complement is computed, so
# a selection such as "a, b" does not leave "b" in the contrast group.
#
# @param main     [String] main selection
# @param contrast [String, nil] contrast selection
# @param subsets  [Hash, nil] factor => { value => samples }; defaults to self.subsets
# @return [Array(Array<String>, Array<String>)] main and contrast samples
# @raise [ParameterException] when a "factor=value" selection is not in +subsets+
def comparison(main, contrast, subsets = nil)
  subsets ||= self.subsets

  # Drop nils, trim whitespace and discard blank names.
  tidy = lambda { |names| names.compact.collect { |n| n.strip }.reject { |n| n.empty? } }

  if main.index "="
    main_factor, main_value = main.split "="
    raise ParameterException, "Main selection not understood" if subsets[main_factor].nil? || subsets[main_factor][main_value].nil?
    value = subsets[main_factor][main_value]
    main_samples = String === value ? value.split(',') : value
  else
    main_samples = main.split(/[|,\n]/)
  end
  main_samples = tidy.call(main_samples)

  if contrast
    if contrast.index "="
      contrast_factor, contrast_value = contrast.split "="
      raise ParameterException, "Contrast selection not understood" if subsets[contrast_factor].nil? || subsets[contrast_factor][contrast_value].nil?
      value = subsets[contrast_factor][contrast_value]
      contrast_samples = String === value ? value.split(',') : value
    else
      contrast_samples = contrast.split(/[|,\n]/)
    end
    contrast_samples = tidy.call(contrast_samples)
  elsif subsets && main_factor
    # Complement within the factor used for the main selection.
    factor_samples = subsets[main_factor].values.flatten.collect { |s| s.split ',' }.flatten
    contrast_samples = tidy.call(factor_samples).uniq - main_samples
  else
    # Complement within all samples of the matrix.
    contrast_samples = tidy.call(samples) - main_samples
  end

  [main_samples, contrast_samples]
end

#differential(main, contrast, path = nil) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/rbbt/matrix/differential.rb', line 4

# Runs the differential analysis (rbbt.dm.matrix.differential from 'MA.R')
# between two groups of samples, inside Persist.persist.
#
# @param main     [Array, String] main samples, or a selection for #comparison
# @param contrast [Array, String, nil] contrast samples, or a selection for #comparison
# @param path     optional file passed to Persist.persist as :file
# @return whatever Persist.persist returns
# @raise [RuntimeError] when Persist.persist yields no output file
def differential(main, contrast, path = nil)
  all_samples = self.samples
  if Array === main && Array === contrast
    main_samples, contrast_samples = main, contrast
  else
    main_samples, contrast_samples = comparison main, contrast
  end

  name = data_file =~ /:>/ ? File.basename(data_file) : data_file
  # Only samples actually present in the matrix take part in the analysis.
  main_samples = main_samples & all_samples
  contrast_samples = contrast_samples & all_samples
  Persist.persist(name, :tsv, :persist => :update, :file => path,
                  :other => {:main => main_samples, :contrast => contrast_samples},
                  :prefix => "Diff", :dir => Matrix.matrix_dir.differential, :no_load => true) do |file|

    raise "No output file available for the differential analysis" if file.nil?

    # Both options are currently hard-wired off.
    log2 = false
    two_channel = false
    # File.exists? was removed in Ruby 3.2; File.exist? is the supported spelling.
    FileUtils.mkdir_p File.dirname(file) unless File.exist?(File.dirname(file))

    cmd = <<-EOS

source('#{Rbbt.share.R["MA.R"].find}')

data = rbbt.dm.matrix.differential(#{ R.ruby2R data_file }, 
main = #{R.ruby2R(main_samples)}, 
contrast = #{R.ruby2R(contrast_samples)}, 
log2=#{ R.ruby2R log2 }, 
outfile = #{R.ruby2R file}, 
key.field = #{R.ruby2R format}, 
two.channel = #{R.ruby2R two_channel},
namespace = #{R.ruby2R organism}
)
    EOS

    R.run(cmd, :monitor => true)
  end
end

#find_samples(value, field = nil) ⇒ Object



59
60
61
62
63
# File 'lib/rbbt/expression_old/matrix.rb', line 59

# Lists the samples whose label matches +value+.
#
# Selects entries of +labels+ (restricted to +field+ when given): an Array
# label matches when its flattened contents include +value+, any other label
# when it equals +value+.
#
# @return the keys of the matching entries
def find_samples(value, field = nil)
  matching = labels.select(field) do |_sample, annotation|
    if Array === annotation
      annotation.flatten.include?(value)
    else
      annotation == value
    end
  end
  matching.keys
end

#label_differences(main, contrast = nil, field = nil) ⇒ Object



92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/rbbt/expression_old/matrix.rb', line 92

# Compares the samples labelled +main+ against a contrast group.
#
# The contrast group is the samples labelled +contrast+ when given, otherwise
# every labelled sample that is not in the main group. Both groups go through
# remove_missing before being handed to sample_differences.
#
# @return the result of sample_differences
def label_differences(main, contrast = nil, field = nil)
  every_sample = labels.keys
  selected = find_samples(main, field)
  reference = contrast ? find_samples(contrast, field) : every_sample - selected

  sample_differences(remove_missing(selected), remove_missing(reference))
end

#matrix_file(path = nil) ⇒ Object



41
42
43
44
45
46
47
48
49
# File 'lib/rbbt/expression_old/matrix.rb', line 41

# Persists the loaded matrix and returns the path of the persisted file.
#
# Without an explicit +path+, one is derived from the data and the matrix
# settings under Matrix::MATRIX_DIR. Inside Persist.persist the matrix is
# loaded with Expression.load_matrix; when its key field is
# "Ensembl Gene ID" it is restricted to the organism's sanctioned genes.
#
# @param path optional target file
# @return the path used
def matrix_file(path = nil)
  unless path
    path = Persist.persistence_path(
      data,
      {:dir => Matrix::MATRIX_DIR},
      {:identifiers => identifiers, :labels => labels, :key_field => key_field, :organism => organism}
    )
  end

  Persist.persist(data, :tsv, :file => path, :check => [data], :no_load => true) do
    loaded = Expression.load_matrix(data, identifiers, key_field, organism)
    if loaded.key_field == "Ensembl Gene ID"
      loaded.select(:key => Organism.sanctioned_genes(organism).list)
    else
      loaded
    end
  end

  path
end

#random_forest_importance(main, contrast = nil, field = nil, options = {}) ⇒ Object



125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/rbbt/expression_old/matrix.rb', line 125

# Computes random-forest feature importance for separating the samples
# labelled +main+ from a contrast group, by running R's randomForest.
#
# The contrast group is the samples labelled +contrast+ when given, otherwise
# every labelled sample outside the main group. Both groups go through
# remove_missing. The work runs inside Persist.persist; the R output is read
# back with TSV.open as a :single TSV cast with :to_f.
#
# @param main     label value selecting the main samples
# @param contrast label value selecting the contrast samples, or nil
# @param field    label field used by find_samples
# @param options  :features - list of row names to use (defaults to [])
# @return whatever Persist.persist returns
def random_forest_importance(main, contrast = nil, field = nil, options = {})
  features = Misc.process_options(options, :features) || []

  path = Persist.persistence_path(
    matrix_file,
    {:dir => File.join(Matrix::MATRIX_DIR, 'random_forest_importance')},
    {:main => main, :contrast => contrast, :field => field, :features => features}
  )

  Persist.persist(data, :tsv, :file => path, :no_load => false, :check => [matrix_file]) do
    every_sample = labels.keys
    positives = find_samples(main, field)
    negatives = contrast ? find_samples(contrast, field) : every_sample - positives

    positives = remove_missing(positives)
    negatives = remove_missing(negatives)

    TmpFile.with_file do |result|
      R.run <<-EOF
library(randomForest);
orig = rbbt.tsv('#{matrix_file}');
main = c('#{positives * "', '"}')
contrast = c('#{negatives * "', '"}')
features = c('#{features * "', '"}')

features = intersect(features, rownames(orig));
data = t(orig[features, c(main, contrast)])
data = cbind(data, Class = 0)
data[main, "Class"] = 1

rf = randomForest(factor(Class) ~ ., data, na.action = na.exclude)
rbbt.tsv.write(rf$importance, filename='#{ result }', key.field = '#{@key_field}')
      EOF

      TSV.open(result, :type => :single, :cast => :to_f)
    end
  end
end

#remove_missing(samples) ⇒ Object



65
66
67
# File 'lib/rbbt/expression_old/matrix.rb', line 65

# Keeps only the requested samples that are also in @samples.
#
# The result is the array intersection: ordered as in @samples and without
# duplicates.
def remove_missing(samples)
  known = @samples
  known & samples
end

#sample_differences(main, contrast) ⇒ Object



84
85
86
87
88
89
90
# File 'lib/rbbt/expression_old/matrix.rb', line 84

# Persists the differential analysis between two sample groups and returns
# the path of the result.
#
# The path is derived from the matrix file, both groups and the log2/channel
# settings, under the 'sample_differences' directory of Matrix::MATRIX_DIR.
# The analysis is delegated to Expression.differential inside Persist.persist.
#
# @param main     main sample names
# @param contrast contrast sample names
# @return the persistence path
def sample_differences(main, contrast)
  Persist.persistence_path(
    matrix_file,
    {:dir => File.join(Matrix::MATRIX_DIR, 'sample_differences')},
    {:main => main, :contrast => contrast, :log2 => log2, :channel => channel}
  ).tap do |path|
    Persist.persist(data, :tsv, :file => path, :no_load => true, :check => [matrix_file]) do
      Expression.differential(matrix_file, main, contrast, log2, channel)
    end
  end
end

#signature_set(field, cast = nil) ⇒ Object



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/rbbt/expression_old/matrix.rb', line 107

# Persists a TSV with one signature column per label value and returns its
# path.
#
# For every distinct label value, the differences of that label against the
# rest are computed, +field+ is extracted as a signature and attached to the
# result. A value whose signature fails with a StandardError is logged and
# skipped; anything else (e.g. Interrupt, SystemExit) propagates.
#
# @param field field extracted from each differential result
# @param cast  cast applied when opening the matrix and building signatures
# @return the persistence path
def signature_set(field, cast = nil)
  path = Persist.persistence_path(matrix_file, {:dir => File.join(Matrix::MATRIX_DIR, 'signature_set')}, {:field => field, :cast => cast})
  Persist.persist(data, :tsv, :file => path, :no_load => true, :check => [matrix_file]) do
    # Start from the keys of the matrix only (no fields) and attach columns.
    signatures = TSV.open(matrix_file, :fields => [], :type => :list, :cast => cast)
    labels.values.flatten.uniq.sort.each do |value|
      begin
        s = Signature.tsv_field(label_differences(value), field, cast)
        s.fields = [value]
        signatures.attach s
      rescue StandardError => e
        # Best effort: one failing label must not abort the whole set.
        Log.warn("Signature for #{ value } did not compute: #{ e.message }")
      end
    end
    signatures
  end
  path
end

#subsets ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/rbbt/matrix.rb', line 36

# Groups samples by factor and level, memoized in @subsets.
#
# The shape of the result is { factor => { level => [samples] } }, built from
# @labels:
# * Path - its +tsv+ is read like a TSV
# * TSV  - one row per sample, one field per factor
# * Hash - already { factor => { level => samples } }, where samples is an
#   Array or a comma-separated String
#
# Factors with fewer than two levels, and levels with fewer than two samples,
# are left out of the result.
#
# @return [Hash]
# @raise [RuntimeError] when a Hash entry is neither an Array nor a String
def subsets
  @subsets ||= begin
                 subsets = {}
                 case @labels
                 when Path, TSV
                   labels = Path === @labels ? @labels.tsv : @labels
                   factors = labels.fields
                   labels.through do |sample,values|
                     factors.zip(values).each do |factor,value|
                       subsets[factor] ||= {}
                       subsets[factor][value] ||= []
                       subsets[factor][value] << sample
                     end
                   end
                 when Hash
                   @labels.each do |factor,info|
                     # Key by this factor so that factors stay separate.
                     subsets[factor] ||= {}
                     info.each do |value, level_samples|
                       subsets[factor][value] = case level_samples
                                                when Array
                                                  level_samples
                                                when String
                                                  level_samples.split ','
                                                else
                                                  raise "Format of samples not understood: #{Misc.fingerprint level_samples}"
                                                end
                     end
                   end
                 end

                 clean_subsets = {}
                 subsets.each do |factor,values|
                   next if values.nil? || values.size < 2
                   values.each do |level,level_samples|
                     next if level_samples.nil? || level_samples.length < 2
                     clean_subsets[factor] ||= {}
                     clean_subsets[factor][level] = level_samples
                   end
                 end

                 clean_subsets
               end
end

#to_gene(identifiers = nil) ⇒ Object



126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# File 'lib/rbbt/matrix.rb', line 126

# Builds a new Matrix whose values are keyed by "Ensembl Gene ID".
#
# Inside Persist.persist the data file is loaded with values cast by :to_f and
# its key is changed to "Ensembl Gene ID"; the values gathered for each new
# key are combined as Misc.mean of the non-nil ones.
#
# Identifier sources are gathered, in order, from the +identifiers+ argument,
# @identifiers, the data's own identifiers and the organism's identifiers;
# the list is handed to change_key reversed.
#
# @param identifiers extra identifier source(s), optional
# @return [Matrix] built on the persisted file with the same labels,
#   value type and organism
def to_gene(identifiers = nil)
  # Loaded here so the dependency is only required when this method is used.
  require 'rbbt/tsv/change_id'

  file = Persist.persist(data_file, :tsv, :prefix => "Gene", :dir => Matrix.matrix_dir.values, :no_load => true) do

    values = data_file.tsv(:cast => :to_f)

    sources = [identifiers, @identifiers, values.identifiers, Organism.identifiers(organism)].flatten.compact.uniq

    values.change_key("Ensembl Gene ID", :identifiers => sources.reverse) do |v|
      Misc.mean(v.compact)
    end
  end
  Matrix.new file, labels, value_type, "Ensembl Gene ID", organism
end