Module: OpenTox::Algorithm::FeatureSelection
- Includes:
- OpenTox::Algorithm
- Defined in:
- lib/algorithm.rb
Instance Attribute Summary
Attributes included from OpenTox
Class Method Summary
- .rfe(params) ⇒ String
Recursive Feature Elimination using caret.
Methods included from OpenTox::Algorithm
effect, gauss, get_cdk_descriptors, get_jl_descriptors, get_ob_descriptors, isnull_or_singular?, load_ds_csv, min_frequency, numeric?, pc_descriptors, #run, sum_size, #to_rdfxml, zero_variance?
Methods included from OpenTox
#add_metadata, all, #delete, #initialize, #load_metadata, sign_in, text_to_html, #to_rdfxml
Class Method Details
.rfe(params) ⇒ String
Recursive Feature Elimination using caret
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 |
# File 'lib/algorithm.rb', line 516
#
# Recursive Feature Elimination using caret.
#
# Starts an R session through RinRuby, passes it the two CSV paths, and runs
# an R script that:
# * merges the activity table and the feature table on their "SMILES" column,
# * drops feature columns that are all-NA, contain infinite values, or have
#   zero variance,
# * either removes rows containing NA (del_missing) or imputes them with
#   caret's "knnImpute", scaling and centering in both cases,
# * drops columns that contain NaN after preprocessing,
# * runs caret's rfe() with rfFuncs over subset sizes between 30% and 70% of
#   the remaining feature count,
# * writes the "SMILES" column plus the selected feature columns of the
#   original feature table to the result file.
#
# The R session is always shut down before returning, also when the script
# raises, so repeated calls do not accumulate R processes.
#
# @param params [Hash] options, looked up with Symbol keys:
#   * :ds_csv_file - CSV file with a "SMILES" column and the activity column
#   * :prediction_feature - name of the activity column in :ds_csv_file
#   * :fds_csv_file - CSV file with a "SMILES" column and the feature columns;
#     must respond to #sub (e.g. a String)
#   * :del_missing - only the literal +true+ enables row removal; any other
#     value selects imputation
# @return [String] :fds_csv_file with the first "rfe_" replaced by "rfe_R_";
#   this is the file the R script writes the reduced feature table to
def self.rfe(params)
  # Derive the output path first: a missing :fds_csv_file then fails here,
  # before an R process has been spawned.
  r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")

  # A local handle (instead of an instance variable on the module) keeps each
  # call's R session private to that call.
  r = RinRuby.new(false, false)
  begin
    r.ds_csv_file = params[:ds_csv_file].to_s
    r.prediction_feature = params[:prediction_feature].to_s
    r.fds_csv_file = params[:fds_csv_file].to_s
    # R receives a numeric flag; only the literal true switches it on.
    r.del_missing = params[:del_missing] == true ? 1 : 0
    r.f_fds_r = r_result_file.to_s

    # need packs 'randomForest', 'RANN'
    r.eval <<-EOR
      suppressPackageStartupMessages(library('caret'))
      suppressPackageStartupMessages(library('randomForest'))
      suppressPackageStartupMessages(library('RANN'))
      suppressPackageStartupMessages(library('doMC'))
      registerDoMC()
      set.seed(1)

      acts = read.csv(ds_csv_file, check.names=F)
      feats = read.csv(fds_csv_file, check.names=F)
      ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)

      features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
      y = ds[,which(names(ds) == prediction_feature)]

      # assumes a data matrix 'features' and a vector 'y' of target values
      row.names(features)=NULL

      # features with all values missing removed
      na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) )
      features = features[,!names(features) %in% na_col]

      # features with infinite values removed
      inf_col = names ( which ( apply ( features, 2, function(x) any ( is.infinite ( x ) ) ) ) )
      features = features[,!names(features) %in% inf_col]

      # features with zero variance removed
      zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) )
      features = features[,!names(features) %in% zero_var]

      pp = NULL
      if (del_missing) {
        # needed if rows should be removed
        na_ids = apply ( features,1,function(x) any ( is.na ( x ) ) )
        features = features[!na_ids,]
        y = y[!na_ids]
        pp = preProcess(features, method=c("scale", "center"))
      } else {
        # Use imputation if NA's random (only then!)
        pp = preProcess(features, method=c("scale", "center", "knnImpute"))
      }
      features = predict(pp, features)

      # features with nan values removed (sometimes preProcess return NaN values)
      nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) )
      features = features[,!names(features) %in% nan_col]

      # determine subsets
      subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7)
      #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
      #subsets = c(2,3,4,5,7,10,subsets)
      #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30)
      subsets = unique(sort(round(subsets)))
      subsets = subsets[subsets<=dim(features)[2]]
      subsets = subsets[subsets>1]

      # Recursive feature elimination
      rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)

      # read existing dataset and select most useful features
      csv=feats[,c("SMILES", rfProfile$optVariables)]
      write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
    EOR

    r_result_file
  ensure
    # Free the R interpreter on success and on failure alike.
    r.quit
  end
end