Class: OpenTox::RUtil

Inherits:
Object
  • Object
show all
Defined in:
lib/r-util.rb

Constant Summary collapse

@@feats =
{}

Instance Method Summary collapse

Constructor Details

#initialize ⇒ RUtil

Returns a new instance of RUtil.



29
30
31
32
33
34
35
# File 'lib/r-util.rb', line 29

# Prepares the R session used by this object.
#
# Reuses an already assigned @r, otherwise starts a new RinRuby instance.
# The session is then pointed at PACKAGE_DIR as its library path, the names
# of the installed packages are cached in @r_packages, the packages this
# class relies on are installed if missing, and lib/stratification.R from
# the opentox-ruby gem is sourced.
def initialize
  @r ||= RinRuby.new(true, false)
  @r.eval ".libPaths('#{PACKAGE_DIR}')"
  # first column of installed.packages(); consulted by package_installed?
  @r_packages = @r.pull "installed.packages()[,1]"
  # currently not installed: "caret", "smacof", "TunePareto"
  %w[sampling gam vegan].each { |name| install_package(name) }
  gem_root = Gem.loaded_specs['opentox-ruby'].full_gem_path
  stratification_script = File.join(gem_root, 'lib/stratification.R')
  @r.eval "source('#{stratification_script}')"
end

Instance Method Details

#boxplot(files, data, title = "") ⇒ Object

Example: files = ["/tmp/box.svg", "/tmp/box.png"]; data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]; boxplot(files, data, "comparison1")



81
82
83
84
85
86
87
# File 'lib/r-util.rb', line 81

# Draws a boxplot with one box per data series.
#
# @param files [Array<String>] target files, handed to plot_to_files
# @param data [Array<Array>] list of [name, values] pairs; the value lists
#   are transposed into columns, so they must all have the same length
# @param title [String] main title of the plot (inserted verbatim into the
#   R command)
def boxplot(files, data, title="")
  LOGGER.debug("r-util> create boxplot")
  columns = data.collect { |series| series[1] }
  labels = data.collect { |series| series[0].to_s }
  assign_dataframe("boxdata", columns.transpose, nil, labels)
  plot_to_files(files) do |_file|
    # colors 2..(number of series + 1), i.e. one color per box
    @r.eval "boxplot(boxdata,main='#{title}',col=rep(2:#{data.size+1}))"
  end
end

#dataframe_to_dataset(df, metadata = {}, subjectid = nil) ⇒ Object

Converts a dataframe into a dataset (a new dataset is created at the dataset webservice). This is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!).



326
327
328
# File 'lib/r-util.rb', line 326

# Converts a complete R dataframe into a dataset.
#
# Thin wrapper around dataframe_to_dataset_indices that passes nil as the
# last argument (no index selection).
#
# @param df [String] name of the dataframe variable in R
# @param metadata [Hash] forwarded unchanged to dataframe_to_dataset_indices
# @param subjectid [String, nil] forwarded unchanged
# @return whatever dataframe_to_dataset_indices returns
def dataframe_to_dataset( df, metadata={}, subjectid=nil )
  dataframe_to_dataset_indices( df, metadata, subjectid, nil)
end

#dataset_to_dataframe(dataset, missing_value = "NA", subjectid = nil, features = nil) ⇒ Object

The dataset should be loaded completely (use Dataset.find). Duplicates are taken into account, and missing values are replaced with the param <missing_value>. Returns the dataframe variable name in R.



246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
# File 'lib/r-util.rb', line 246

# Converts a dataset into an R dataframe and returns the name of the R
# variable that holds it.
#
# Every compound gets as many rows as the longest value list it has for any
# dataset feature (duplicates); rows are named "<compound>$<index>".
# Columns are the features in sorted order. Empty or missing values are
# replaced with missing_value. Columns of features typed as
# OT.NominalFeature are converted with as.character, all others with
# as.numeric. The feature information of the columns is remembered in
# @@feats under the dataframe name.
#
# @param dataset the dataset to convert; must respond to uri, features,
#   compounds and data_entries
# @param missing_value replacement for empty values
# @param subjectid passed to OpenTox::Feature.find
# @param features [Array, nil] features to use as columns (all dataset
#   features if nil). NOTE: a given array is sorted in place.
# @return [String] name of the dataframe variable in R
def dataset_to_dataframe( dataset, missing_value="NA", subjectid=nil, features=nil )
  LOGGER.debug "r-util> convert dataset to dataframe #{dataset.uri}"

  # count duplicates: number of rows per compound is the maximum number of
  # values the compound has for any feature of the dataset
  num_compounds = {}
  dataset.features.keys.each do |f|
    dataset.compounds.each do |c|
      entries = dataset.data_entries[c]
      if entries
        val = entries[f]
        size = val.nil? ? 1 : val.size
        num_compounds[c] = [num_compounds[c], size].compact.max
      else
        num_compounds[c] = 1
      end
    end
  end

  # use either all, or the provided features, sorting is important as col-index := features
  if features
    features.sort! # in place on purpose: keeps the long-standing behavior for callers
  else
    features = dataset.features.keys.sort
  end

  # one row name per (compound, duplicate index)
  compound_names = []
  dataset.compounds.each do |c|
    num_compounds[c].times { |i| compound_names << "#{c}$#{i}" }
  end

  # values into 2D array, then to dataframe
  d_values = []
  dataset.compounds.each do |c|
    entries = dataset.data_entries[c]
    num_compounds[c].times do |i|
      c_values = features.collect do |f|
        if entries
          val = entries[f]
          v = val.nil? ? "" : val[i].to_s
        else
          # compounds without data entries were given exactly one row above
          raise "wtf" if i>0
          v = ""
        end
        v.size()==0 ? missing_value : v
      end
      d_values << c_values
    end
  end
  df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
  assign_dataframe(df_name,d_values,compound_names,features)

  # set dataframe column types accordingly
  features.each_with_index do |f, index|
    column = index + 1 # R starts at 1
    feat = OpenTox::Feature.find(f,subjectid)
    nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
    converter = nominal ? "as.character" : "as.numeric"
    @r.eval "#{df_name}[,#{column}] <- #{converter}(#{df_name}[,#{column}])"
  end
  #@r.eval "head(#{df_name})"

  # store compounds, and features (including metainformation)
  @@feats[df_name] = {}
  features.each do |f|
    @@feats[df_name][f] = dataset.features[f]
  end
  df_name
end

#double_hist_plot(files, data1, data2, is_numerical, log = false, name1 = "first", name2 = "second", title = "title", xaxis = "x-values") ⇒ Object

Plots a double histogram. data1 and data2 are arrays with values, either numerical or categorical (string values). is_numerical is a boolean flag indicating the value type. log (only valid for numerical values) plots the logarithm of the values.



129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# File 'lib/r-util.rb', line 129

# Plots the value distributions of two data series into one figure.
#
# Numerical data is drawn as two overlaid histograms (via an R helper
# function defined on the fly), categorical data as a grouped barplot of
# the per-category counts.
#
# @param files [Array<String>] target files, handed to plot_to_files
# @param data1 [Array] values of the first series
# @param data2 [Array] values of the second series
# @param is_numerical [Boolean] true for numerical, false for categorical values
# @param log [Boolean] plot the logarithm of the values (numerical only)
# @param name1 [String] label of the first series
# @param name2 [String] label of the second series
# @param title [String] main title (inserted verbatim into the R command)
# @param xaxis [String] x-axis label (numerical plots only)
# @raise [RuntimeError] if log is set for categorical data
def double_hist_plot(files, data1, data2, is_numerical, log=false, name1="first", name2="second", title="title", xaxis="x-values")
  LOGGER.debug("r-util> create double hist plot")
  combined = data1 + data2
  if is_numerical
    # define the R plotting helper, then hand over the raw values
    @r.eval "double_plot <- function(data1, data2, log=FALSE, names=c('data1','data2'), title='title', xlab='x-values')
    {
      if (log)
      {
        data1 <- log(data1)
        data2 <- log(data2)
        xlab = paste('logarithm of',xlab,sep=' ')
      }
      xlims <- round(c(min(c(min(data1),min(data2))),max(c(max(data1),max(data2)))))
      h <- hist(rbind(data1,data2),plot=F)
      h1 <- hist(data1,plot=F,breaks=h$breaks)
      h2 <- hist(data2,plot=F,breaks=h$breaks)
      xlims = c(min(h$breaks),max(h$breaks))
      ylims = c(0,max(h1$counts,h2$counts))
      xaxps = c(min(h$breaks),max(h$breaks),(length(h$breaks)-1))
      plot(h1, col=rgb(1,0,0,2/4), xlim=xlims, xaxp=xaxps, ylim=ylims,
        main=title, xlab=xlab, ylab='counts' )
      plot(h2, col=rgb(0,1,0,2/4), add=T )
      legend('topleft',names,lty=c(1,1),col=c('red','green'))
    }"
    @r.assign("data1",data1)
    @r.assign("data2",data2)
    @r.legend = [name1, name2]
  else
    raise "log not valid for categorial" if log
    # count how often each distinct value occurs in either series
    categories = combined.uniq.sort
    occurrences1 = categories.map { |category| data1.count(category) }
    occurrences2 = categories.map { |category| data2.count(category) }
    @r.data1 = occurrences1
    @r.data2 = occurrences2
    @r.value_names = [name1, name2]
    @r.legend = categories
    @r.eval("data <- cbind(data1,data2)")
  end

  plot_to_files(files) do |_file|
    if is_numerical
      log_flag = log ? "T" : "F"
      @r.eval "double_plot(data1,data2,log=#{log_flag},names=legend,title='#{title}',xlab='#{xaxis}')"
    else
      @r.eval("bp <- barplot(data, beside=T, names.arg=value_names, 
        main='#{title}', col=sort(rep(2:3,length(legend))))") #legend.text=c(legend),
      # print the counts on top of the bars
      @r.eval "text(bp, 0, round(data, 1),cex=1,pos=3)"
    end
  end
end

#feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2, features = nil, subjectid = nil, waiting_task = nil) ⇒ Object

Embeds the feature values of two datasets into 2D and plots them.



91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/r-util.rb', line 91

# Embeds the feature values of two datasets into 2D (Sammon mapping) and
# plots both datasets into one figure.
#
# Both datasets are loaded, converted to R dataframes (missing values are
# replaced with 0), row-bound, embedded with the R function
# plot_pre_process and drawn with the R function plot_split (both are
# presumably provided by the sourced stratification.R; not visible here).
#
# @param files [Array<String>] target files, handed to plot_to_files
# @param dataset_uri1 [String] URI of the first dataset
# @param dataset_uri2 [String] URI of the second dataset
# @param dataset_name1 [String] legend name of the first dataset
# @param dataset_name2 [String] legend name of the second dataset
# @param features [Array, nil] features to embed; if nil, both datasets
#   must have identical features and all of them are used
# @param subjectid passed on to the dataset lookup and conversion
# @param waiting_task optional task object; progress(25/50/75) is reported
# @raise [RuntimeError] if a selected feature is missing in a dataset, if
#   the datasets have different features (and none were selected), or if
#   fewer than two features would be embedded
def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
    features=nil, subjectid=nil, waiting_task=nil)

  LOGGER.debug("r-util> create feature value plot")
  d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
  d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
  if features
    [d1, d2].each do |d|
      features.each { |f| raise "feature not included" unless d.features.keys.include?(f) }
    end
  else
    if d1.features.keys.sort != d2.features.keys.sort
      raise "different\n#{d1.features.keys.sort.to_yaml}\n#{d2.features.keys.sort.to_yaml}"
    end
    features = d1.features.keys
  end
  # check the features that are actually embedded, not all dataset features:
  # a one-element selection must not slip through just because the dataset is wide
  raise "at least two features needed" if features.size<2
  waiting_task.progress(25) if waiting_task

  df1 = dataset_to_dataframe(d1,0,subjectid,features)
  df2 = dataset_to_dataframe(d2,0,subjectid,features)
  waiting_task.progress(50) if waiting_task

  @r.eval "df <- rbind(#{df1},#{df2})"
  # 0 marks rows of the first dataset, 1 rows of the second
  @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
  @r.names = [dataset_name1, dataset_name2]
  LOGGER.debug("r-util> - convert data to 2d")
  #@r.eval "save.image(\"/tmp/image.R\")"
  @r.eval "df.2d <- plot_pre_process(df, method='sammon')"
  waiting_task.progress(75) if waiting_task

  LOGGER.debug("r-util> - plot data")
  plot_to_files(files) do |_file|
    @r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')"
  end
end

#install_package(package) ⇒ Object



53
54
55
56
57
58
# File 'lib/r-util.rb', line 53

# Installs an R package into PACKAGE_DIR unless it is already known to be
# installed (see package_installed?).
#
# @param package [String] name of the R package
# @return [nil] if the package is already installed, otherwise the result
#   of the R eval call
def install_package( package )
  return if package_installed?(package)

  LOGGER.debug "r-util> installing r-package #{package} to #{PACKAGE_DIR}"
  @r.eval "install.packages('#{package}', repos='http://cran.r-project.org', lib='#{PACKAGE_DIR}')"
end

#package_installed?(package) ⇒ Boolean

Returns:

  • (Boolean)


49
50
51
# File 'lib/r-util.rb', line 49

# Tells whether the given package name is contained in @r_packages, the
# package list pulled from R in initialize.
#
# @param package [String] name of the R package
# @return [Boolean]
def package_installed?( package )
  installed = @r_packages
  installed.include?(package)
end

#paired_ttest(array1, array2, significance_level = 0.95) ⇒ Object

Returns the t statistic if the difference is significant, 0 otherwise: <0 -> array1 << array2; 0 -> no significant difference; >0 -> array1 >> array2



63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/r-util.rb', line 63

# Runs a paired t-test in R on two equally long value arrays.
#
# @param array1 [Array] first series (converted with as.numeric in R)
# @param array2 [Array] second series (converted with as.numeric in R)
# @param significance_level [Float] the result counts as significant if the
#   p-value is below 1 - significance_level
# @return the t statistic reported by R if the difference is significant
#   (its sign is that of R's paired statistic for v1 versus v2), otherwise 0
def paired_ttest(array1, array2, significance_level=0.95)
  @r.assign "v1",array1
  @r.assign "v2",array2
  @r.eval "ttest = t.test(as.numeric(v1),as.numeric(v2),paired=T)"
  t_statistic = @r.pull "ttest$statistic"
  p_value = @r.pull "ttest$p.value"
  alpha = 1-significance_level
  alpha > p_value ? t_statistic : 0
end

#quit_r ⇒ Object



37
38
39
40
41
42
43
# File 'lib/r-util.rb', line 37

# Shuts down the R session on a best-effort basis.
#
# On success @r is reset to nil. Any StandardError raised while quitting
# (including the NoMethodError of an already missing session) is swallowed,
# and in that case @r is left untouched.
#
# @return [nil]
def quit_r
  @r.quit
  @r = nil
rescue
  # deliberately ignored: quitting must never fail the caller
end

#r ⇒ Object



45
46
47
# File 'lib/r-util.rb', line 45

# Accessor for the R session object stored in @r (nil once quit_r has
# succeeded).
def r; @r; end

#stratified_k_fold_split(dataset, metadata = {}, missing_values = "NA", num_folds = 10, subjectid = nil, seed = 42, split_features = nil) ⇒ Object

Performs a stratified split of a dataset into k folds according to the feature values. All features are taken into account unless <split_features> is given. Returns two arrays of datasets.



189
190
191
# File 'lib/r-util.rb', line 189

# Stratified k-fold split of a dataset according to its feature values.
#
# Thin wrapper around stratified_split_internal: passes num_folds and nil
# for the split ratio.
#
# @param dataset the dataset to split
# @param metadata [Hash] forwarded unchanged
# @param missing_values replacement for missing values, forwarded unchanged
# @param num_folds [Integer] number of folds
# @param subjectid forwarded unchanged
# @param seed [Integer] random seed, forwarded unchanged
# @param split_features [Array, nil] features to stratify on, forwarded
#   unchanged
# @return whatever stratified_split_internal returns
def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil )
  stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features )
end

#stratified_split(dataset, metadata = {}, missing_values = "NA", pct = 0.3, subjectid = nil, seed = 42, split_features = nil) ⇒ Object

Performs a stratified split of a dataset into two datasets according to the feature values. All features are taken into account unless <split_features> is given. Returns two datasets.



182
183
184
# File 'lib/r-util.rb', line 182

# Stratified split of a dataset into two parts according to its feature
# values.
#
# Thin wrapper around stratified_split_internal: passes nil for the number
# of folds and pct as the split ratio.
#
# @param dataset the dataset to split
# @param metadata [Hash] forwarded unchanged
# @param missing_values replacement for missing values, forwarded unchanged
# @param pct [Float] split ratio, forwarded unchanged
# @param subjectid forwarded unchanged
# @param seed [Integer] random seed, forwarded unchanged
# @param split_features [Array, nil] features to stratify on, forwarded
#   unchanged
# @return whatever stratified_split_internal returns
def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
  stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features )
end