Class: OpenTox::RUtil
- Inherits:
-
Object
- Object
- OpenTox::RUtil
- Defined in:
- lib/r-util.rb
Constant Summary collapse
- @@feats =
{}
Instance Method Summary collapse
-
#boxplot(files, data, title = "") ⇒ Object
Example: files = ["/tmp/box.svg", "/tmp/box.png"]; data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]; boxplot(files, data, "comparison1").
-
#dataframe_to_dataset(df, metadata = {}, subjectid = nil) ⇒ Object
Converts a dataframe into a dataset (a new dataset is created at the dataset webservice). This is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!).
-
#dataset_to_dataframe(dataset, missing_value = "NA", subjectid = nil, features = nil) ⇒ Object
The dataset should be loaded completely (use Dataset.find). Takes duplicates into account, replaces missing values with param <missing_value>, and returns the dataframe variable name in R.
-
#double_hist_plot(files, data1, data2, is_numerical, log = false, name1 = "first", name2 = "second", title = "title", xaxis = "x-values") ⇒ Object
Plots a double histogram. data1 and data2 are arrays with values, either numerical or categorical (string values); is_numerical is a boolean flag indicating the value types; log (only for numerical values) plots the logarithm of the values.
-
#feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2, features = nil, subjectid = nil, waiting_task = nil) ⇒ Object
Embeds the feature values of two datasets into 2D and plots the result.
-
#initialize ⇒ RUtil
constructor
A new instance of RUtil.
- #install_package(package) ⇒ Object
- #package_installed?(package) ⇒ Boolean
-
#paired_ttest(array1, array2, significance_level = 0.95) ⇒ Object
<0 -> array1 << array2; 0 -> no significant difference; >0 -> array1 >> array2.
- #quit_r ⇒ Object
- #r ⇒ Object
-
#stratified_k_fold_split(dataset, metadata = {}, missing_values = "NA", num_folds = 10, subjectid = nil, seed = 42, split_features = nil) ⇒ Object
Stratified split of a dataset into k datasets according to the feature values. All features are taken into account unless <split_features> is given. Returns two arrays of datasets.
-
#stratified_split(dataset, metadata = {}, missing_values = "NA", pct = 0.3, subjectid = nil, seed = 42, split_features = nil) ⇒ Object
Stratified split of a dataset into two datasets according to the feature values. All features are taken into account unless <split_features> is given. Returns two datasets.
Constructor Details
#initialize ⇒ RUtil
Returns a new instance of RUtil.
29 30 31 32 33 34 35 |
# File 'lib/r-util.rb', line 29
# Prepares the R session used by all other methods of this class.
#
# Reuses an existing @r if one is already set, otherwise creates a new
# RinRuby instance. Then points R's library path at PACKAGE_DIR, caches the
# names of the installed R packages in @r_packages, installs the required
# packages that are missing, and sources lib/stratification.R from the
# loaded 'opentox-ruby' gem.
def initialize
  @r ||= RinRuby.new(true, false)
  @r.eval(".libPaths('#{PACKAGE_DIR}')")
  @r_packages = @r.pull("installed.packages()[,1]")

  # package names that were commented out in the original list:
  # "caret", "smacof", "TunePareto"
  %w[sampling gam vegan].each { |package_name| install_package(package_name) }

  gem_root = Gem.loaded_specs['opentox-ruby'].full_gem_path
  stratification_script = File.join(gem_root, 'lib/stratification.R')
  @r.eval("source('#{stratification_script}')")
end
Instance Method Details
#boxplot(files, data, title = "") ⇒ Object
Example: files = ["/tmp/box.svg", "/tmp/box.png"]; data = [ [ :method, [4,4,5,5,4,3,2] ], [ :method2, [1,2,3,4,5,4,6] ], [ :asdf, [9,1,8,0,7,1,6] ] ]; boxplot(files, data, "comparison1")
81 82 83 84 85 86 87 |
# File 'lib/r-util.rb', line 81
# Draws one box per data series into each of the given files.
#
# files - passed through to plot_to_files, e.g. ["/tmp/box.svg", "/tmp/box.png"]
# data  - array of [name, values] pairs; all value arrays must have the same
#         length because they are transposed into rows
# title - main title of the plot; it is interpolated unescaped into a
#         single-quoted R string
def boxplot(files, data, title="")
  LOGGER.debug("r-util> create boxplot")

  # one row per observation, one column per series
  rows = data.map { |series| series[1] }.transpose
  column_names = data.map { |series| series[0].to_s }
  assign_dataframe("boxdata", rows, nil, column_names)

  last_color = data.size + 1
  plot_to_files(files) do |_file|
    @r.eval("boxplot(boxdata,main='#{title}',col=rep(2:#{last_color}))")
  end
end
#dataframe_to_dataset(df, metadata = {}, subjectid = nil) ⇒ Object
Converts a dataframe into a dataset (a new dataset is created at the dataset webservice). This is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!).
326 327 328 |
# File 'lib/r-util.rb', line 326
# Converts a dataframe into a dataset.
#
# Thin wrapper: forwards df, metadata and subjectid to
# dataframe_to_dataset_indices and passes nil as its fourth argument.
#
# df        - the dataframe to convert
# metadata  - metadata forwarded unchanged (defaults to an empty hash)
# subjectid - forwarded unchanged
#
# Returns whatever dataframe_to_dataset_indices returns.
def dataframe_to_dataset( df, metadata={}, subjectid=nil )
  dataframe_to_dataset_indices( df, metadata, subjectid, nil )
end
#dataset_to_dataframe(dataset, missing_value = "NA", subjectid = nil, features = nil) ⇒ Object
The dataset should be loaded completely (use Dataset.find). Takes duplicates into account, replaces missing values with param <missing_value>, and returns the dataframe variable name in R.
246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 |
# File 'lib/r-util.rb', line 246
# Converts a dataset into an R dataframe and returns the name of the
# dataframe variable in R.
#
# dataset       - should be loaded completely (use Dataset.find)
# missing_value - value written into cells that have no (or an empty) value
# subjectid     - passed to OpenTox::Feature.find
# features      - optional list of features to use as columns; defaults to
#                 all features of the dataset. NOTE: a given array is sorted
#                 in place (sort!), so the caller's array is reordered.
#
# A compound gets as many rows as its longest value list (duplicates); the
# rows are named "<compound>$<index>". Columns of nominal features are
# converted with as.character, all others with as.numeric. The feature
# metadata of the used columns is remembered in @@feats under the dataframe
# name.
def dataset_to_dataframe( dataset, missing_value="NA", subjectid=nil, features=nil )
  LOGGER.debug "r-util> convert dataset to dataframe #{dataset.uri}"

  # count duplicates: number of rows needed per compound
  num_compounds = {}
  dataset.features.keys.each do |f|
    dataset.compounds.each do |c|
      if dataset.data_entries[c]
        val = dataset.data_entries[c][f]
        size = val.nil? ? 1 : val.size
        num_compounds[c] = num_compounds[c].nil? ? size : [num_compounds[c], size].max
      else
        num_compounds[c] = 1
      end
    end
  end

  # use either all, or the provided features, sorting is important as col-index := features
  if features
    features.sort!
  else
    features = dataset.features.keys.sort
  end

  # one row name per (compound, duplicate index)
  compound_names = []
  dataset.compounds.each do |c|
    num_compounds[c].times { |i| compound_names << "#{c}$#{i}" }
  end

  # values into 2D array, then to dataframe
  d_values = []
  dataset.compounds.each do |c|
    num_compounds[c].times do |i|
      c_values = []
      features.each do |f|
        if dataset.data_entries[c]
          val = dataset.data_entries[c][f]
          v = val.nil? ? "" : val[i].to_s
        else
          # a compound without data entries was given exactly one row above
          raise "wtf" if i>0
          v = ""
        end
        v = missing_value if v.size()==0
        c_values << v
      end
      d_values << c_values
    end
  end

  # dataframe name: last path segment of the dataset URI without query string
  df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
  assign_dataframe(df_name,d_values,compound_names,features)

  # set dataframe column types accordingly
  features.each_with_index do |f, index|
    f_count = index + 1 # R starts at 1
    feat = OpenTox::Feature.find(f,subjectid)
    nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
    if nominal
      @r.eval "#{df_name}[,#{f_count}] <- as.character(#{df_name}[,#{f_count}])"
    else
      @r.eval "#{df_name}[,#{f_count}] <- as.numeric(#{df_name}[,#{f_count}])"
    end
  end
  #@r.eval "head(#{df_name})"

  # store features (including metainformation)
  @@feats[df_name] = {}
  features.each do |f|
    @@feats[df_name][f] = dataset.features[f]
  end
  df_name
end
#double_hist_plot(files, data1, data2, is_numerical, log = false, name1 = "first", name2 = "second", title = "title", xaxis = "x-values") ⇒ Object
Plots a double histogram. data1 and data2 are arrays with values, either numerical or categorical (string values); is_numerical is a boolean flag indicating the value types; log (only for numerical values) plots the logarithm of the values.
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
# File 'lib/r-util.rb', line 129
# Plots the value distributions of two data series into each of the given
# files.
#
# files        - passed through to plot_to_files
# data1, data2 - arrays of values, either numerical or categorical (strings)
# is_numerical - true: overlaid histograms with shared breaks;
#                false: grouped bar plot of the per-value counts
# log          - plot the logarithm of the values (numerical data only)
# name1, name2 - labels of the two series
# title, xaxis - plot title and x-axis label; both are interpolated
#                unescaped into single-quoted R strings
#
# Raises RuntimeError if log is requested for categorical data.
def double_hist_plot(files, data1, data2, is_numerical, log=false, name1="first", name2="second", title="title", xaxis="x-values")
  LOGGER.debug("r-util> create double hist plot")
  if is_numerical
    # The shared breaks are computed on the concatenation c(data1,data2);
    # rbind() would recycle the shorter vector when the lengths differ.
    @r.eval "double_plot <- function(data1, data2, log=FALSE, names=c('data1','data2'), title='title', xlab='x-values')
    {
      if (log)
      {
        data1 <- log(data1)
        data2 <- log(data2)
        xlab = paste('logarithm of',xlab,sep=' ')
      }
      xlims <- round(c(min(c(min(data1),min(data2))),max(c(max(data1),max(data2)))))
      h <- hist(c(data1,data2),plot=F)
      h1 <- hist(data1,plot=F,breaks=h$breaks)
      h2 <- hist(data2,plot=F,breaks=h$breaks)
      xlims = c(min(h$breaks),max(h$breaks))
      ylims = c(0,max(h1$counts,h2$counts))
      xaxps = c(min(h$breaks),max(h$breaks),(length(h$breaks)-1))
      plot(h1, col=rgb(1,0,0,2/4), xlim=xlims, xaxp=xaxps, ylim=ylims,
        main=title, xlab=xlab, ylab='counts' )
      plot(h2, col=rgb(0,1,0,2/4), add=T )
      legend('topleft',names,lty=c(1,1),col=c('red','green'))
    }"
    @r.assign("data1",data1)
    @r.assign("data2",data2)
    @r.legend = [name1, name2]
  else
    raise "log not valid for categorial" if log
    # count how often each distinct value occurs in either series
    vals = (data1 + data2).uniq.sort
    counts1 = vals.map { |e| data1.count(e) }
    counts2 = vals.map { |e| data2.count(e) }
    @r.data1 = counts1
    @r.data2 = counts2
    @r.value_names = [name1, name2]
    @r.legend = vals
    @r.eval("data <- cbind(data1,data2)")
  end

  plot_to_files(files) do |file|
    if is_numerical
      @r.eval "double_plot(data1,data2,log=#{log ? "T":"F"},names=legend,title='#{title}',xlab='#{xaxis}')"
    else
      @r.eval("bp <- barplot(data, beside=T, names.arg=value_names,
        main='#{title}', col=sort(rep(2:3,length(legend))))") #legend.text=c(legend),
      @r.eval "text(bp, 0, round(data, 1),cex=1,pos=3)"
    end
  end
end
#feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2, features = nil, subjectid = nil, waiting_task = nil) ⇒ Object
Embeds the feature values of two datasets into 2D and plots the result.
91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
# File 'lib/r-util.rb', line 91
# Embeds the feature values of two datasets into 2D (Sammon embedding via
# the R function plot_pre_process) and plots the result into each of the
# given files.
#
# files                        - passed through to plot_to_files
# dataset_uri1, dataset_uri2   - URIs of the datasets, loaded with OpenTox::Dataset.find
# dataset_name1, dataset_name2 - names handed to the R plot function
# features     - optional features to use; every one must be present in
#                both datasets. Defaults to all features, in which case both
#                datasets must have identical feature sets.
# subjectid    - passed to Dataset.find and dataset_to_dataframe
# waiting_task - optional; receives progress(25), progress(50), progress(75)
#
# Raises RuntimeError if a requested feature is missing, if the feature sets
# differ (when no features are given), or if fewer than two features are used.
def feature_value_plot(files, dataset_uri1, dataset_uri2, dataset_name1, dataset_name2,
    features=nil, subjectid=nil, waiting_task=nil)

  LOGGER.debug("r-util> create feature value plot")
  d1 = OpenTox::Dataset.find(dataset_uri1,subjectid)
  d2 = OpenTox::Dataset.find(dataset_uri2,subjectid)
  if features
    [d1, d2].each do |d|
      features.each { |f| raise "feature not included" unless d.features.keys.include?(f) }
    end
  else
    raise "different\n#{d1.features.keys.sort.to_yaml}\n#{d2.features.keys.sort.to_yaml}" if
      (d1.features.keys.sort != d2.features.keys.sort)
    features = d1.features.keys
  end
  # check the features that are actually embedded, not all features of d1
  raise "at least two features needed" if features.size<2
  waiting_task.progress(25) if waiting_task

  # missing values are replaced with 0
  df1 = dataset_to_dataframe(d1,0,subjectid,features)
  df2 = dataset_to_dataframe(d2,0,subjectid,features)
  waiting_task.progress(50) if waiting_task

  @r.eval "df <- rbind(#{df1},#{df2})"
  # 0 marks rows of the first dataset, 1 rows of the second
  @r.eval "split <- c(rep(0,nrow(#{df1})),rep(1,nrow(#{df2})))"
  @r.names = [dataset_name1, dataset_name2]
  LOGGER.debug("r-util> - convert data to 2d")
  #@r.eval "save.image(\"/tmp/image.R\")"
  @r.eval "df.2d <- plot_pre_process(df, method='sammon')"
  waiting_task.progress(75) if waiting_task

  LOGGER.debug("r-util> - plot data")
  plot_to_files(files) do |file|
    @r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')"
  end
end
#install_package(package) ⇒ Object
53 54 55 56 57 58 |
# File 'lib/r-util.rb', line 53
# Installs the given R package into PACKAGE_DIR from the CRAN repository,
# unless package_installed? already reports it as present.
#
# Returns nil when nothing had to be installed, otherwise the result of the
# R evaluation.
def install_package( package )
  return if package_installed?(package)

  LOGGER.debug "r-util> installing r-package #{package} to #{PACKAGE_DIR}"
  @r.eval "install.packages('#{package}', repos='http://cran.r-project.org', lib='#{PACKAGE_DIR}')"
end
#package_installed?(package) ⇒ Boolean
49 50 51 |
# File 'lib/r-util.rb', line 49
# Tells whether the given R package name is contained in @r_packages, the
# package list that the constructor pulled from R.
def package_installed?( package )
  known_packages = @r_packages
  known_packages.include?(package)
end
#paired_ttest(array1, array2, significance_level = 0.95) ⇒ Object
<0 -> array1 << array2; 0 -> no significant difference; >0 -> array1 >> array2
63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/r-util.rb', line 63
# Runs a paired t-test in R on the two arrays (t.test(v1, v2, paired=T)).
#
# Returns the t statistic if the p-value is below 1 - significance_level,
# otherwise 0:
#   <0 -> array1 << array2
#    0 -> no significant difference
#   >0 -> array1 >> array2
def paired_ttest(array1, array2, significance_level=0.95)
  @r.assign("v1", array1)
  @r.assign("v2", array2)
  @r.eval("ttest = t.test(as.numeric(v1),as.numeric(v2),paired=T)")
  statistic = @r.pull("ttest$statistic")
  p_value = @r.pull("ttest$p.value")

  alpha = 1 - significance_level
  alpha > p_value ? statistic : 0
end
#quit_r ⇒ Object
37 38 39 40 41 42 43 |
# File 'lib/r-util.rb', line 37
# Shuts down the R session and clears @r.
#
# Best effort: any StandardError raised while quitting is swallowed, and in
# that case @r keeps its previous value. Always returns nil.
def quit_r
  @r.quit
  @r = nil
rescue
  # deliberately ignored, see above
  nil
end
#r ⇒ Object
45 46 47 |
# File 'lib/r-util.rb', line 45
# Accessor for the underlying R session object held in @r (nil if unset).
def r
  session = @r
  session
end
#stratified_k_fold_split(dataset, metadata = {}, missing_values = "NA", num_folds = 10, subjectid = nil, seed = 42, split_features = nil) ⇒ Object
Stratified split of a dataset into k datasets according to the feature values. All features are taken into account unless <split_features> is given. Returns two arrays of datasets.
189 190 191 |
# File 'lib/r-util.rb', line 189
# Stratified split of a dataset into num_folds folds according to the
# feature values.
#
# Thin wrapper around stratified_split_internal: passes num_folds and nil
# for the percentage argument.
#
# dataset        - the dataset to split
# metadata       - forwarded unchanged (defaults to an empty hash)
# missing_values - forwarded unchanged (default "NA")
# num_folds      - number of folds (default 10)
# subjectid      - forwarded unchanged
# seed           - forwarded unchanged (default 42)
# split_features - features to stratify on; all features are taken into
#                  account unless this is given
#
# Returns whatever stratified_split_internal returns (documented as two
# arrays of datasets).
def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil )
  stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features )
end
#stratified_split(dataset, metadata = {}, missing_values = "NA", pct = 0.3, subjectid = nil, seed = 42, split_features = nil) ⇒ Object
Stratified split of a dataset into two datasets according to the feature values. All features are taken into account unless <split_features> is given. Returns two datasets.
182 183 184 |
# File 'lib/r-util.rb', line 182
# Stratified split of a dataset into two datasets according to the feature
# values.
#
# Thin wrapper around stratified_split_internal: passes pct and nil for the
# number-of-folds argument.
#
# dataset        - the dataset to split
# metadata       - forwarded unchanged (defaults to an empty hash)
# missing_values - forwarded unchanged (default "NA")
# pct            - split percentage (default 0.3)
# subjectid      - forwarded unchanged
# seed           - forwarded unchanged (default 42)
# split_features - features to stratify on; all features are taken into
#                  account unless this is given
#
# Returns whatever stratified_split_internal returns (documented as two
# datasets).
def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil )
  stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features )
end