Template for GTEx example on cluster

Last updated: 2018-02-11

Code version: d53227e

Install the package

1 please git clone the flashr on PPS.

git clone https://github.com/stephenslab/flashr.git

2 install flashr locally

R CMD build flashr

R CMD INSTALL flashr_0.2-2.tar.gz

prepare the data

original data

Please read your original data as a \(N \times P\) matrix in R.

In this case you can create a folder the restore the data as .rds file

mkdir testflashr
cd testflashr
cp gtexEQTL_zscore.rds GTEX/testflashr/

now you have the data matrix in gtexEQTL_zscore.rds

to restore the result and track the errors, you can crest folders

mkdir output
mkdir ourlog

run FLASH

Here I take the GTEx EQTL zscore as an example

creat a .R file

library(ashr)
library(flashr)
load("./gtexEQTL_zscore.rds")
Y = t(zscore)
data = flash_set_data(Y)
f_greedy = flash_add_greedy(data,Kmax=60)
f_greedy_bf = flash_backfit(data,f_greedy)
#f_greedy = flash_add_greedy(data,Kmax=60,var_type = "by_column",ash_param=list(method = "fdr"))
#f_greedy_bf = flash_backfit(data,f_greedy,var_type = "by_column",ash_param=list(method = "fdr"))
saveRDS(f_greedy,file = "./output/gflashvarcol.rds")
saveRDS(f_greedy_bf,file = "./output/bflashvarcol.rds")

here 60 is much larger than the sample size. our method doesn’t restrict \(K < \min(P,N)\). But for this case, 60 is enough. We call this file as flashwrapper.R

creat a .sbatch file

#!/bin/bash

#SBATCH --job-name=arrayJob
#SBATCH --output=/home/weidong/HG/flash/data/GTEX/testflashr/outlog/arrayJob_%A_%a.out
#SBATCH --error=/home/weidong/HG/flash/data/GTEX/testflashr/outlog/arrayJob_%A_%a.err
#SBATCH --array=1
#SBATCH --time=30:00:00
#SBATCH --partition=mstephens
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --mem-per-cpu=8G


######################
# Begin work section #
######################

# Print this sub-job's task ID
cd /home/weidong/HG/flash/data/GTEX/testflashr
Rscript --verbose flashwrapper.R

you can substitute the folder path /home/weidong/HG/flash/data/GTEX/testflashr. and we call this .sbatch file as runflash.sbatch

run

sbatch runflash.sbatch

plot

Get the result from flashr as bflashvarcol.rds (backfitting result).

b_flash = readRDS("../data/GTExdata/res_flashr2/bflashvarcol.rds")
load("../data/GTExdata/gtexEQTL_zscore.rds")
ssY = sum(zscore^2)
K = dim(b_flash$EL)[2] -1
pve = (sapply(seq(1,K),function(x){ sum(b_flash$EL[,x]^2 %*% t(b_flash$EF[,x]^2)) }))/ssY
pve = pmax(round(pve,3),0.001)

dat = read.table('../data/GTExColors.txt', sep = '\t', comment.char = '')
colordata = dat[c(1:6,9:18,21:23,26:30,32,33,35,36,38:53),1:2]

L = b_flash$EL[,1:14]
library(reshape2)
data_L = melt(L)
colnames(data_L) = c("tissue","loading","value")
library(ggplot2)
tissue_color = as.character(colordata[,2])
data_L$tissue = factor(data_L$tissue,levels = 1:44 ,labels = as.character(colordata[,1]) )
data_L$loading = factor(data_L$loading,levels = 1:14 ,labels = paste("Factor",1:14,"; pve:", pve[1:14]))


ggplot(data_L,aes(x = tissue,y = value,fill = factor(tissue) )) +
  geom_bar(stat = "identity",width = 0.6) +
  scale_fill_manual(values=tissue_color) +
  scale_x_discrete(labels = NULL) +
  theme_grey()+
  theme(legend.position="right", legend.text=element_text(size=9), axis.text.y = element_text(size = 5)) + 
  labs(title = "GTEx data", y = "factor values" ,x = "tissues", fill="tissue") +
  facet_wrap(~loading, ncol = 2, scales = "free_y") +
  guides(fill = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.3))

  #ggsave("flashrGTEx1.pdf", width = 8, height = 11)

# the 27th factor is zero
L = b_flash$EL[,15:26]
library(reshape2)
data_L = melt(L)
colnames(data_L) = c("tissue","loading","value")
library(ggplot2)
tissue_color = as.character(colordata[,2])
data_L$tissue = factor(data_L$tissue,levels = 1:44 ,labels = as.character(colordata[,1]) )
data_L$loading = factor(data_L$loading,levels = 1:12 ,labels = paste("Factor",15:26,"; pve:", pve[15:26]))


ggplot(data_L,aes(x = tissue,y = value,fill = factor(tissue) )) +
  geom_bar(stat = "identity",width = 0.6) +
  scale_fill_manual(values=tissue_color) +
  scale_x_discrete(labels = NULL) +
  theme_grey()+
  theme(legend.position="right", legend.text=element_text(size=9), axis.text.y = element_text(size = 5)) + 
  labs(title = "GTEx data", y = "factor values" ,x = "tissues", fill="tissue") +
  facet_wrap(~loading, ncol = 2, scales = "free_y") +
  guides(fill = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.3))

  #ggsave("flashrGTEx2.pdf", width = 8, height = 10)

An example to run SFA on RCC or PPS

go to the folder where the data is.

sfa wrapper

this example is on PPS cluster

SFA.wrapper = function(X, K){
  N = dim(X)[1]
  P = dim(X)[2]

  if(file.exists("SFAout")){
    unlink("SFAout", recursive= T)
  }
  system("mkdir SFAout")

  write.table(X,file="dscsfa.txt",row.names=F,col.names=F)
  # sfa command line on cluster
  # ~/flash/simulation/methods/sfa/src/sfa -gen ./dscsfamix.txt -g 600 -k 1 -n 200 -iter 100 -rand 999 -o ./SFAout/sfa

  # sqrt(mean(( loadings %*% factor - test$L_true%*%t(test$F_true))^2))/sqrt(mean((test$L_true%*%t(test$F_true))^2))
  system(paste("~/flash/simulation/methods/sfa/src/sfa", "-gen", "./dscsfa.txt", "-g", N, "-k", K,
               "-n", P, "-iter", 100, "-rand", 999, "-o", "./SFAout/sfa", sep=" "))

  if(file.info("./SFAout/sfa_F.out")$size == 1){
    return(list(L = 0, F = 0))
  } else{
    Fhat=read.table("./SFAout/sfa_F.out")
    lambda=read.table("./SFAout/sfa_lambda.out")
    Fhat=as.matrix(Fhat)
    lambda=as.matrix(lambda)
    return(list(L = lambda, F = Fhat))
  }

}

SFAmix.wrapper = function(X,K){
  N = dim(X)[1]
  P = dim(X)[2]
  
  write.table(X,file="dscsfamix.txt",row.names=F,col.names=F)
  print(class(X))
  print(dim(X))
  
  if(file.exists("SFAmixout")){
    unlink("SFAmixout", recursive= T)
  }
  system("mkdir SFAmixout")
  
  # ~/mvash/mvsim/SFAmix/SFAmix --nf K --y dscrsfamix.txt --out SFAmixout --sep space
  # this is run on the PPS cluster
  system(paste("~/mvash/mvsim/SFAmix/SFAmix","--nf",K,"--y","dscsfamix.txt","--out",
               "SFAmixout","--sep","space",sep=" "))
  
  alpha=read.table("./SFAmixout/PSI")
  
  if(file.info("./SFAmixout/EX")$size == 1){
    return(list(L = 0, F = 0))
  } else{
    Fhat=read.table("./SFAmixout/EX")
    lambda=read.table("./SFAmixout/LAM")
    Psi=as.vector(alpha)
    Fhat=as.matrix(Fhat)
    lambda=as.matrix(lambda)
    P=dim(lambda)[2]
    n=dim(Fhat)[1]
    return(list(L = Fhat, F = lambda))
  }
  
}

this is sfawrapper.R

creat run_sfa.R file

load("~/flash/dataanalysis/GTeX/EQTLzscore/gtexEQTL_zscore.rds")
setwd("~/flash/dataanalysis/GTeX/EQTLzscore/")
source("./sfawrapper.R")
source("./SFAmixwrapper.R")
Y = zscore
N = dim(Y)[1]
P = dim(Y)[2]
gsfa = SFA.wrapper(Y, 30)
saveRDS(gsfa, file = "~/flash/dataanalysis/GTeX/EQTLzscore/sfaGTExzscore.rds")


Y = t(zscore)
gsfamix = SFAmix.wrapper(Y,30)
saveRDS(gsfamix, file = "~/flash/dataanalysis/GTeX/EQTLzscore/sfamixGTExzscore.rds")

and run it

R CMD BATCH run_sfa.R

remark

For Y = t(zscore), SFA provide NaN result when set K = 30 and K = 26. (all the \(L_i\) and \(F_j\) are NaN). So I use Y = zscore which works in K = 30 and K = 26. In this case, I set K = 30

in this case we can get

> gsfa = SFA.wrapper(Y, 30)
 Setting G to be 16069
 Setting N to be 44
Trying to read in matrix with G=16069 and N =44
Opening matrix ./dscsfa.txt with 16069 rows and 44 columns
Opening matrix ./dscsfa.txt with 16069 rows and 44 columns
read in matrix 16069 by 44
initializing sfa...
G = 16069
N = 44
K = 30

But for SFAmix we should use Y = t(zscore).

The approaches make sure we can get \((\hat{\sigma}_1^2,\cdots,\hat{\sigma}_j^2,\cdots)\) is a 16069 vector.

SFA K=3

g3 = readRDS('../data/GTExdata/SFAres/sfaK_3.rds')
par(mfrow = c(1,3),mar=c(5.1,4.1,4.1,2.1)-1.9)
for(k in 1:3){
  barplot(g3$F[k,],main = paste("factor",k))
}

plot SFA result K = 30

sfares = readRDS("../data/GTExdata/SFAres/sfaGTExzscore.rds")
# the F is K by N matrix! not N by K matrix
par(mfrow = c(2,3),mar=c(5.1,4.1,4.1,2.1)-1.9)
for(k in 1:30){
  barplot(sfares$F[k,],main = paste("factor",k))
}

plot of SFA result with t(Y) K = 10

If you would like to use t(Y) you should use -vn in the sfa command line in order to get \((\hat{\sigma}_1^2,\cdots,\hat{\sigma}_j^2,\cdots)\) is a 16069 vector

you need add -mg to get the mean vector with length of 44

load("../data/GTExdata/gtexEQTL_zscore.rds")
gt10 = readRDS('../data/GTExdata/SFAres/sfaTK_10m.rds')
ssy = sum(zscore^2)
pve = sapply(seq(1:10),function(x){sum((gt10$LF$L[,x] %*% t(gt10$LF$F[x,]))^2)})
pve = pve/ssy

mu = sapply(seq(1:44),function(x){gt10$mu[[x]]})
par(mfrow = c(2,3),mar=c(5.1,4.1,4.1,2.1)-1.9)
barplot(mu,main = "mean vector")
for(k in 1:10){
  barplot(gt10$LF$L[,k],main = paste("factor",k,"pve",round(pve[k],3)))
}

problem in K = 20 and K = 30 in this case.

gsl: lu.c:147: ERROR: matrix is singular
Default GSL error handler invoked.

plot SFAmix result

sfamixres = readRDS("../data/GTExdata/SFAres/sfamixGTExzscore.rds")
dim(sfamixres$L)

[1] 44  1

dim(sfamixres$F)

[1]     1 16069

this is only a rank one matrix. I tried two time. two runs give rank one matrix.

Session information

sessionInfo()

R version 3.3.0 (2016-05-03)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.13.3 (unknown)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] reshape2_1.4.3  flashr_0.4-6    workflowr_0.4.0 rmarkdown_1.6  
 [5] ggplot2_2.2.1   R.matlab_3.6.1  softImpute_1.4  Matrix_1.2-11  
 [9] PMA_1.0.9       impute_1.48.0   plyr_1.8.4      ssvd_1.0       

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.14      git2r_0.19.0      R.methodsS3_1.7.1
 [4] R.utils_2.5.0     iterators_1.0.9   tools_3.3.0      
 [7] digest_0.6.13     memoise_1.1.0     evaluate_0.10.1  
[10] tibble_1.3.4      gtable_0.2.0      lattice_0.20-35  
[13] rlang_0.1.6       foreach_1.4.4     rstudioapi_0.6   
[16] curl_2.8.1        yaml_2.1.16       parallel_3.3.0   
[19] gridExtra_2.3     httr_1.3.0        withr_2.1.1      
[22] stringr_1.2.0     knitr_1.18        devtools_1.13.3  
[25] rprojroot_1.2     grid_3.3.0        R6_2.2.2         
[28] flashr2_0.4-0     ashr_2.2-3        magrittr_1.5     
[31] scales_0.4.1      backports_1.1.2   codetools_0.2-15 
[34] htmltools_0.3.6   MASS_7.3-47       colorspace_1.3-2 
[37] labeling_0.3      stringi_1.1.6     lazyeval_0.2.0   
[40] doParallel_1.0.11 munsell_0.4.3     pscl_1.5.2       
[43] truncnorm_1.0-7   SQUAREM_2017.10-1 R.oo_1.21.0

This R Markdown site was created with workflowr