Last updated: 2018-02-11
Code version: d53227e
1 please git clone the flashr
on PPS.
git clone https://github.com/stephenslab/flashr.git
2 install flashr
locally
R CMD build flashr
R CMD INSTALL flashr_0.2-2.tar.gz
Please read your original data as a \(N \times P\) matrix in R.
In this case you can create a folder the restore the data as .rds
file
mkdir testflashr
cd testflashr
cp gtexEQTL_zscore.rds GTEX/testflashr/
now you have the data matrix in gtexEQTL_zscore.rds
to restore the result and track the errors, you can crest folders
mkdir output
mkdir ourlog
Here I take the GTEx EQTL zscore as an example
library(ashr)
library(flashr)
load("./gtexEQTL_zscore.rds")
Y = t(zscore)
data = flash_set_data(Y)
f_greedy = flash_add_greedy(data,Kmax=60)
f_greedy_bf = flash_backfit(data,f_greedy)
#f_greedy = flash_add_greedy(data,Kmax=60,var_type = "by_column",ash_param=list(method = "fdr"))
#f_greedy_bf = flash_backfit(data,f_greedy,var_type = "by_column",ash_param=list(method = "fdr"))
saveRDS(f_greedy,file = "./output/gflashvarcol.rds")
saveRDS(f_greedy_bf,file = "./output/bflashvarcol.rds")
here 60 is much larger than the sample size. our method doesn’t restrict \(K < \min(P,N)\). But for this case, 60 is enough. We call this file as flashwrapper.R
#!/bin/bash
#SBATCH --job-name=arrayJob
#SBATCH --output=/home/weidong/HG/flash/data/GTEX/testflashr/outlog/arrayJob_%A_%a.out
#SBATCH --error=/home/weidong/HG/flash/data/GTEX/testflashr/outlog/arrayJob_%A_%a.err
#SBATCH --array=1
#SBATCH --time=30:00:00
#SBATCH --partition=mstephens
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --mem-per-cpu=8G
######################
# Begin work section #
######################
# Print this sub-job's task ID
cd /home/weidong/HG/flash/data/GTEX/testflashr
Rscript --verbose flashwrapper.R
you can substitute the folder path /home/weidong/HG/flash/data/GTEX/testflashr
. and we call this .sbatch
file as runflash.sbatch
sbatch runflash.sbatch
Get the result from flashr
as bflashvarcol.rds
(backfitting result).
b_flash = readRDS("../data/GTExdata/res_flashr2/bflashvarcol.rds")
load("../data/GTExdata/gtexEQTL_zscore.rds")
ssY = sum(zscore^2)
K = dim(b_flash$EL)[2] -1
pve = (sapply(seq(1,K),function(x){ sum(b_flash$EL[,x]^2 %*% t(b_flash$EF[,x]^2)) }))/ssY
pve = pmax(round(pve,3),0.001)
dat = read.table('../data/GTExColors.txt', sep = '\t', comment.char = '')
colordata = dat[c(1:6,9:18,21:23,26:30,32,33,35,36,38:53),1:2]
L = b_flash$EL[,1:14]
library(reshape2)
data_L = melt(L)
colnames(data_L) = c("tissue","loading","value")
library(ggplot2)
tissue_color = as.character(colordata[,2])
data_L$tissue = factor(data_L$tissue,levels = 1:44 ,labels = as.character(colordata[,1]) )
data_L$loading = factor(data_L$loading,levels = 1:14 ,labels = paste("Factor",1:14,"; pve:", pve[1:14]))
ggplot(data_L,aes(x = tissue,y = value,fill = factor(tissue) )) +
geom_bar(stat = "identity",width = 0.6) +
scale_fill_manual(values=tissue_color) +
scale_x_discrete(labels = NULL) +
theme_grey()+
theme(legend.position="right", legend.text=element_text(size=9), axis.text.y = element_text(size = 5)) +
labs(title = "GTEx data", y = "factor values" ,x = "tissues", fill="tissue") +
facet_wrap(~loading, ncol = 2, scales = "free_y") +
guides(fill = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.3))
#ggsave("flashrGTEx1.pdf", width = 8, height = 11)
# the 27th factor is zero
L = b_flash$EL[,15:26]
library(reshape2)
data_L = melt(L)
colnames(data_L) = c("tissue","loading","value")
library(ggplot2)
tissue_color = as.character(colordata[,2])
data_L$tissue = factor(data_L$tissue,levels = 1:44 ,labels = as.character(colordata[,1]) )
data_L$loading = factor(data_L$loading,levels = 1:12 ,labels = paste("Factor",15:26,"; pve:", pve[15:26]))
ggplot(data_L,aes(x = tissue,y = value,fill = factor(tissue) )) +
geom_bar(stat = "identity",width = 0.6) +
scale_fill_manual(values=tissue_color) +
scale_x_discrete(labels = NULL) +
theme_grey()+
theme(legend.position="right", legend.text=element_text(size=9), axis.text.y = element_text(size = 5)) +
labs(title = "GTEx data", y = "factor values" ,x = "tissues", fill="tissue") +
facet_wrap(~loading, ncol = 2, scales = "free_y") +
guides(fill = guide_legend(ncol = 1, keyheight = 0.8, keywidth = 0.3))
#ggsave("flashrGTEx2.pdf", width = 8, height = 10)
this example is on PPS cluster
SFA.wrapper = function(X, K){
N = dim(X)[1]
P = dim(X)[2]
if(file.exists("SFAout")){
unlink("SFAout", recursive= T)
}
system("mkdir SFAout")
write.table(X,file="dscsfa.txt",row.names=F,col.names=F)
# sfa command line on cluster
# ~/flash/simulation/methods/sfa/src/sfa -gen ./dscsfamix.txt -g 600 -k 1 -n 200 -iter 100 -rand 999 -o ./SFAout/sfa
# sqrt(mean(( loadings %*% factor - test$L_true%*%t(test$F_true))^2))/sqrt(mean((test$L_true%*%t(test$F_true))^2))
system(paste("~/flash/simulation/methods/sfa/src/sfa", "-gen", "./dscsfa.txt", "-g", N, "-k", K,
"-n", P, "-iter", 100, "-rand", 999, "-o", "./SFAout/sfa", sep=" "))
if(file.info("./SFAout/sfa_F.out")$size == 1){
return(list(L = 0, F = 0))
} else{
Fhat=read.table("./SFAout/sfa_F.out")
lambda=read.table("./SFAout/sfa_lambda.out")
Fhat=as.matrix(Fhat)
lambda=as.matrix(lambda)
return(list(L = lambda, F = Fhat))
}
}
SFAmix.wrapper = function(X,K){
N = dim(X)[1]
P = dim(X)[2]
write.table(X,file="dscsfamix.txt",row.names=F,col.names=F)
print(class(X))
print(dim(X))
if(file.exists("SFAmixout")){
unlink("SFAmixout", recursive= T)
}
system("mkdir SFAmixout")
# ~/mvash/mvsim/SFAmix/SFAmix --nf K --y dscrsfamix.txt --out SFAmixout --sep space
# this is run on the PPS cluster
system(paste("~/mvash/mvsim/SFAmix/SFAmix","--nf",K,"--y","dscsfamix.txt","--out",
"SFAmixout","--sep","space",sep=" "))
alpha=read.table("./SFAmixout/PSI")
if(file.info("./SFAmixout/EX")$size == 1){
return(list(L = 0, F = 0))
} else{
Fhat=read.table("./SFAmixout/EX")
lambda=read.table("./SFAmixout/LAM")
Psi=as.vector(alpha)
Fhat=as.matrix(Fhat)
lambda=as.matrix(lambda)
P=dim(lambda)[2]
n=dim(Fhat)[1]
return(list(L = Fhat, F = lambda))
}
}
this is sfawrapper.R
load("~/flash/dataanalysis/GTeX/EQTLzscore/gtexEQTL_zscore.rds")
setwd("~/flash/dataanalysis/GTeX/EQTLzscore/")
source("./sfawrapper.R")
source("./SFAmixwrapper.R")
Y = zscore
N = dim(Y)[1]
P = dim(Y)[2]
gsfa = SFA.wrapper(Y, 30)
saveRDS(gsfa, file = "~/flash/dataanalysis/GTeX/EQTLzscore/sfaGTExzscore.rds")
Y = t(zscore)
gsfamix = SFAmix.wrapper(Y,30)
saveRDS(gsfamix, file = "~/flash/dataanalysis/GTeX/EQTLzscore/sfamixGTExzscore.rds")
and run it
R CMD BATCH run_sfa.R
For Y = t(zscore)
, SFA provide NaN result when set K = 30
and K = 26
. (all the \(L_i\) and \(F_j\) are NaN). So I use Y = zscore
which works in K = 30
and K = 26
. In this case, I set K = 30
in this case we can get
> gsfa = SFA.wrapper(Y, 30)
Setting G to be 16069
Setting N to be 44
Trying to read in matrix with G=16069 and N =44
Opening matrix ./dscsfa.txt with 16069 rows and 44 columns
Opening matrix ./dscsfa.txt with 16069 rows and 44 columns
read in matrix 16069 by 44
initializing sfa...
G = 16069
N = 44
K = 30
But for SFAmix
we should use Y = t(zscore)
.
The approaches make sure we can get \((\hat{\sigma}_1^2,\cdots,\hat{\sigma}_j^2,\cdots)\) is a 16069
vector.
g3 = readRDS('../data/GTExdata/SFAres/sfaK_3.rds')
par(mfrow = c(1,3),mar=c(5.1,4.1,4.1,2.1)-1.9)
for(k in 1:3){
barplot(g3$F[k,],main = paste("factor",k))
}
sfares = readRDS("../data/GTExdata/SFAres/sfaGTExzscore.rds")
# the F is K by N matrix! not N by K matrix
par(mfrow = c(2,3),mar=c(5.1,4.1,4.1,2.1)-1.9)
for(k in 1:30){
barplot(sfares$F[k,],main = paste("factor",k))
}
If you would like to use t(Y)
you should use -vn
in the sfa
command line in order to get \((\hat{\sigma}_1^2,\cdots,\hat{\sigma}_j^2,\cdots)\) is a 16069
vector
you need add -mg
to get the mean vector with length of 44
load("../data/GTExdata/gtexEQTL_zscore.rds")
gt10 = readRDS('../data/GTExdata/SFAres/sfaTK_10m.rds')
ssy = sum(zscore^2)
pve = sapply(seq(1:10),function(x){sum((gt10$LF$L[,x] %*% t(gt10$LF$F[x,]))^2)})
pve = pve/ssy
mu = sapply(seq(1:44),function(x){gt10$mu[[x]]})
par(mfrow = c(2,3),mar=c(5.1,4.1,4.1,2.1)-1.9)
barplot(mu,main = "mean vector")
for(k in 1:10){
barplot(gt10$LF$L[,k],main = paste("factor",k,"pve",round(pve[k],3)))
}
gsl: lu.c:147: ERROR: matrix is singular
Default GSL error handler invoked.
sfamixres = readRDS("../data/GTExdata/SFAres/sfamixGTExzscore.rds")
dim(sfamixres$L)
[1] 44 1
dim(sfamixres$F)
[1] 1 16069
this is only a rank one matrix. I tried two time. two runs give rank one matrix.
sessionInfo()
R version 3.3.0 (2016-05-03)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.13.3 (unknown)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] reshape2_1.4.3 flashr_0.4-6 workflowr_0.4.0 rmarkdown_1.6
[5] ggplot2_2.2.1 R.matlab_3.6.1 softImpute_1.4 Matrix_1.2-11
[9] PMA_1.0.9 impute_1.48.0 plyr_1.8.4 ssvd_1.0
loaded via a namespace (and not attached):
[1] Rcpp_0.12.14 git2r_0.19.0 R.methodsS3_1.7.1
[4] R.utils_2.5.0 iterators_1.0.9 tools_3.3.0
[7] digest_0.6.13 memoise_1.1.0 evaluate_0.10.1
[10] tibble_1.3.4 gtable_0.2.0 lattice_0.20-35
[13] rlang_0.1.6 foreach_1.4.4 rstudioapi_0.6
[16] curl_2.8.1 yaml_2.1.16 parallel_3.3.0
[19] gridExtra_2.3 httr_1.3.0 withr_2.1.1
[22] stringr_1.2.0 knitr_1.18 devtools_1.13.3
[25] rprojroot_1.2 grid_3.3.0 R6_2.2.2
[28] flashr2_0.4-0 ashr_2.2-3 magrittr_1.5
[31] scales_0.4.1 backports_1.1.2 codetools_0.2-15
[34] htmltools_0.3.6 MASS_7.3-47 colorspace_1.3-2
[37] labeling_0.3 stringi_1.1.6 lazyeval_0.2.0
[40] doParallel_1.0.11 munsell_0.4.3 pscl_1.5.2
[43] truncnorm_1.0-7 SQUAREM_2017.10-1 R.oo_1.21.0
This R Markdown site was created with workflowr