Last updated: 2018-02-11

Code version: d53227e

preparation

  1. creat a file by mkdir Breastcancer and put the data file into this folder. in this case, we use matlab function as well, so we use .mat file for the data.

  2. creat a folder to track the code mkdir outlog

wrapper file

R functions

miss_gen = function(Y, percent = 0.2){
  N = dim(Y)[1]
  P = dim(Y)[2]
  colindex = sample(P,(P*percent))
  rowindex = sample(N,(N*percent))
  missindex = list(colindex = colindex, rowindex = rowindex)
  return(missindex)
}

# methods
flashG_wrapper = function(Y, missindex, rank_max,type = "constant"){
  # delete the value in the missing position
  Y_miss = Y
  Y_miss[missindex$rowindex, missindex$colindex] = NA
  data = flashr::flash_set_data(Y_miss)
  g_flash = flashr::flash_add_greedy(data,Kmax=rank_max,ash_param=list(method = "fdr"),var_type = type)
  misshat_flash = g_flash$EL %*% t(g_flash$EF)
  rmse = sqrt(mean((Y[missindex$rowindex, missindex$colindex] - misshat_flash[missindex$rowindex, missindex$colindex])^2))
  return(rmse)
}

flashB_wrapper = function(Y, missindex, rank_max,type = "constant"){
  # delete the value in the missing position
  Y_miss = Y
  Y_miss[missindex$rowindex, missindex$colindex] = NA
  data = flashr::flash_set_data(Y_miss)
  g_flash = flashr::flash_add_greedy(data,Kmax=rank_max,ash_param=list(method = "fdr"),var_type = type)
  b_flash = flash_backfit(data,g_flash)
  misshat_flash = b_flash$EL %*% t(b_flash$EF)
  rmse = sqrt(mean((Y[missindex$rowindex, missindex$colindex] - misshat_flash[missindex$rowindex, missindex$colindex])^2))
  return(rmse)
}

softImpute_wrapper = function(Y, missindex, rank_max){
  # delete the value in the missing position
  Y_miss = Y
  Y_miss[missindex$rowindex, missindex$colindex] = NA
  gsoft = softImpute::softImpute(Y_miss, rank.max = rank_max, lambda = 30)
  misshat_soft =  gsoft$u %*% diag(gsoft$d) %*% t(gsoft$v)
  rmse = sqrt(mean((Y[missindex$rowindex, missindex$colindex] - misshat_soft[missindex$rowindex, missindex$colindex])^2))
  return(rmse)
}

PMD_wrapper = function(Y, missindex, rank_max){
  # delete the value in the missing position
  Y_miss = Y
  Y_miss[missindex$rowindex, missindex$colindex] = NA
  out = PMA::PMD(Y_miss, K = rank_max)
  misshat_PMD =  out$u %*% diag(out$d) %*% t(out$v)
  rmse = sqrt(mean((Y[missindex$rowindex, missindex$colindex] - misshat_PMD[missindex$rowindex, missindex$colindex])^2))
  return(rmse)
}
library(R.matlab)
# call the R functions
source("~/HG/flash/data/CVmissflashr/template/Rfunction.R")
# read the data
# you need to change the path for different data
Y_data = R.matlab::readMat("~/HG/flash/data/CVmissflashr/Breastcancer/example.mat")
Y = Y_data$Y
N = dim(Y)[1]
P = dim(Y)[2]
Y = Y - rowMeans(Y) %*% t(rep(1,P))
result = c()
missindex = miss_gen(Y, percent = 0.1)
# creat a folder for the output and matlab in the folder we make
if(file.exists("NBSFAout")){
  unlink("NBSFAout", recursive= T)
}
system("mkdir NBSFAout")
# write the missing index
writeMat("./NBSFAout/missindex.mat", missindex = missindex)

flashG_rmse = flashG_wrapper(Y, missindex, rank_max = 50)
pmd_rmse  = PMD_wrapper(Y, missindex, rank_max= 50 )
soft_rmse = softImpute_wrapper(Y, missindex, rank_max = 50)
# then call the matlab estiamtion 
# this for the folder of the data
system("matlab -nosplash -nodesktop -r \"addpath(\'~/HG/flash/data/CVmissflashr/template\'); run(\'missvalue.m\');exit;\" ")
g_nsfa = readMat("./NBSFAout/NSFAresult.mat")
nsfa_rmse = as.vector(g_nsfa$rmse.NSFA)

result = c(flashG_rmse, nsfa_rmse,pmd_rmse,soft_rmse)

saveRDS(result, "./NBSFAout/output.rds")

matlab code

addpath('~/HG/flash/data/missingvalue/methods/NBSF/nsfa-master/');
addpath('~/HG/flash/data/missingvalue/methods/NBSF/nsfa-master/utils/');

% you need to change the path and the data file (not centered)
load ../example.mat;
Ycentered=Y-repmat(mean(Y,2),1,size(Y,2));
settings=defaultsettings();
[settings.D,settings.N]=size(Ycentered);
settings.iterations=100;
% now the missing is decide by the this is the missing index.(or you can upload the missing matrix)

file_path = fullfile(pwd,'NBSFAout/missindex.mat');
load(file_path);
mvmask = ones(settings.D,settings.N);
mvmask(missindex.rowindex,missindex.colindex) = 0;

initialsample=init_nsfa(settings);
[finalsample,resultstable]=nsfa(Ycentered,mvmask,initialsample,settings);
Y_hat = (finalsample.G *  finalsample.X);

sse_NSFA = sum(sum((Ycentered(missindex.rowindex,missindex.colindex) - Y_hat(missindex.rowindex,missindex.colindex)).^2));
mse_NSFA = sse_NSFA/(length(missindex.rowindex)*length(missindex.colindex))
rmse_NSFA = sqrt(mse_NSFA)

save_path = fullfile(pwd,'NBSFAout/NSFAresult');
save(save_path,'rmse_NSFA'); 

batch file

#!/bin/bash

#SBATCH --job-name=arrayJob
#SBATCH --output=/home/weidong/HG/flash/data/CVmissflashr/Breastcancer/outlog/arrayJob_%A_%a.out
#SBATCH --error=/home/weidong/HG/flash/data/CVmissflashr/Breastcancer/outlog/arrayJob_%A_%a.err
#SBATCH --array=1-100
#SBATCH --time=06:00:00
#SBATCH --partition=broadwl
#SBATCH --ntasks=1
#SBATCH --mem-per-cpu=6000


######################
# Begin work section #
######################

# Print this sub-job's task ID
cd /home/weidong/HG/flash/data/CVmissflashr/Breastcancer
mkdir test${SLURM_ARRAY_TASK_ID}
cd test${SLURM_ARRAY_TASK_ID}
Rscript --verbose /home/weidong/HG/flash/data/CVmissflashr/Breastcancer/runR.R

run

sbatch Jobs.sbatch

plot the result

T = 100
results = matrix(NA,ncol = 4, nrow = T)
for(i in 1:T){
  test_folder = paste("test", i, sep = "")
  out_folder = "NBSFAout"
  out_file = "output.rds"
  file_name = file.path(test_folder,out_folder,out_file)
  results[i,] = try(readRDS(file_name))
}

Session information

sessionInfo()
R version 3.3.0 (2016-05-03)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.13.3 (unknown)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] reshape2_1.4.3  flashr_0.4-6    workflowr_0.4.0 rmarkdown_1.6  
 [5] ggplot2_2.2.1   R.matlab_3.6.1  softImpute_1.4  Matrix_1.2-11  
 [9] PMA_1.0.9       impute_1.48.0   plyr_1.8.4      ssvd_1.0       

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.14      git2r_0.19.0      R.methodsS3_1.7.1
 [4] R.utils_2.5.0     iterators_1.0.9   tools_3.3.0      
 [7] digest_0.6.13     memoise_1.1.0     evaluate_0.10.1  
[10] tibble_1.3.4      gtable_0.2.0      lattice_0.20-35  
[13] rlang_0.1.6       foreach_1.4.4     rstudioapi_0.6   
[16] curl_2.8.1        yaml_2.1.16       parallel_3.3.0   
[19] gridExtra_2.3     httr_1.3.0        withr_2.1.1      
[22] stringr_1.2.0     knitr_1.18        devtools_1.13.3  
[25] rprojroot_1.2     grid_3.3.0        R6_2.2.2         
[28] flashr2_0.4-0     ashr_2.2-3        magrittr_1.5     
[31] scales_0.4.1      backports_1.1.2   codetools_0.2-15 
[34] htmltools_0.3.6   MASS_7.3-47       colorspace_1.3-2 
[37] labeling_0.3      stringi_1.1.6     lazyeval_0.2.0   
[40] doParallel_1.0.11 munsell_0.4.3     pscl_1.5.2       
[43] truncnorm_1.0-7   SQUAREM_2017.10-1 R.oo_1.21.0      

This R Markdown site was created with workflowr