T2D

Count reads

library(RADAR)

samplenames = c("Ctl1","Ctl4","Ctl8","Ctl16","Ctl17","Ctl18","Ctl20","T2D1","T2D4","T2D5","T2D6","T2D7","T2D9","T2D11","T2D12")

RADAR <- countReads(samplenames = samplenames,
                    gtf = "~/Database/genome/hg38/hg38_UCSC.gtf",
                    bamFolder = "~/Rohit_T2D/bam_files",
                    outputDir =  "~/Rohit_T2D",
                    modification = "IP",
                    threads = 25
)

monster <- RADAR

Summary of read count

library(RADAR)
library(qvalue)
#load("~/Rohit_T2D/RADAR.RData")
load("~/Rohit_T2D/RADAR_analysis.RData")
countSummary <- rbind("Input"= colSums(RADAR$reads)[1:length(RADAR$samplenames)], "IP" =colSums(RADAR$reads)[(1:length(RADAR$samplenames))+length(RADAR$samplenames)] )
colnames(countSummary) <- RADAR$samplenames
knitr::kable(countSummary,format = "pandoc" )

	Ctl1	Ctl4	Ctl8	Ctl16	Ctl17	Ctl18	Ctl20	T2D1	T2D4	T2D5	T2D6	T2D7	T2D9	T2D11	T2D12
Input	18320414	17749453	17923796	11508719	15118237	25923296	20302596	16803517	16105448	14146820	17202593	14336611	17758817	19782144	18779123
IP	30287782	25105375	21908869	23197918	21409901	31653712	24753657	27733641	26634012	28457739	22539106	36100747	27434039	22053013	19402847

X <- c(rep("Ctl",7),rep("T2D",8))
RADAR <- normalizeLibrary(RADAR,X)
RADAR <- RADAR::adjustExprLevel(RADAR)
RADAR <- filterBins(RADAR,minCountsCutOff = 15)

RADAR_pos <- RADAR::adjustExprLevel(RADAR[1:12], adjustBy = "pos")
RADAR_pos <- filterBins(RADAR_pos,minCountsCutOff = 15)

INPUT read count

Local INPUT read count

Plot distribution of number of reads in each 50 bp bins

hist(log10(rowMeans(RADAR$reads[rowMeans(RADAR$reads[,grep("input",colnames(RADAR$reads))])>1,grep("input",colnames(RADAR$reads))])  ),xlab = "log10 read count",main = "Distribution of INPUT read count in bins",xlim = c(0,3), col =rgb(0.2,0.2,0.2,0.5),cex.main = 2,cex.axis =2,cex.lab=2)
axis(side = 1, lwd = 2,cex.axis =2)
axis(side = 2, lwd = 2,cex.axis =2)

local read count V.S. geneSum

Compute the within group variability of INPUT geneSum VS INPUT local read count.

var.coef <- function(x){sd(as.numeric(x))/mean(as.numeric(x))}
## filter expressed genes
geneSum <- RADAR$geneSum[rowSums(RADAR$geneSum)>16,]
## within group variability
relative.var <- t( apply(geneSum,1,tapply,RADAR$X,var.coef) )
geneSum.var <- c(relative.var)

## For each gene used above, random sample a 50bp bin within this gene
set.seed(1)
r.bin50 <- tapply(rownames(RADAR$norm.input)[which(RADAR$geneBins$gene %in% rownames(geneSum))],as.character( RADAR$geneBins$gene[which(RADAR$geneBins$gene %in% rownames(geneSum))]) ,function(x){
  n <- sample(1:length(x),1)
  return(x[n])
})

relative.var <- apply(RADAR$norm.input[r.bin50,],1,tapply,RADAR$X,var.coef)
bin50.var <- c(relative.var)
bin50.var <- bin50.var[!is.na(bin50.var)]

## 100bp bins
r.bin100 <- tapply(rownames(RADAR$norm.input)[which(RADAR$geneBins$gene %in% rownames(geneSum))],as.character( RADAR$geneBins$gene[which(RADAR$geneBins$gene %in% rownames(geneSum))]) ,function(x){
  n <- sample(1:(length(x)-2),1)
  return(x[n:(n+1)])
})
relative.var <- lapply(r.bin100, function(x){ tapply( colSums(RADAR$norm.input[x,]),RADAR$X,var.coef ) })
bin100.var <- unlist(relative.var)
bin100.var <- bin100.var[!is.na(bin100.var)]

## 200bp bins
r.bin200 <- tapply(rownames(RADAR$norm.input)[which(RADAR$geneBins$gene %in% rownames(geneSum))],as.character( RADAR$geneBins$gene[which(RADAR$geneBins$gene %in% rownames(geneSum))]) ,function(x){
  if(length(x)>4){
    n <- sample(1:(length(x)-3),1)
    return(x[n:(n+3)])
  }else{
    return(NULL)
  }
})
r.bin200 <- r.bin200[which(!unlist(lapply(r.bin200,is.null)) ) ]
relative.var <- lapply(r.bin200, function(x){ tapply( colSums(RADAR$norm.input[x,]),RADAR$X,var.coef ) })

bin200.var <- unlist(relative.var)
bin200.var <- bin200.var[!is.na(bin200.var)]

relative.var <- data.frame(group=c(rep("geneSum",length(geneSum.var)),rep("bin-50bp",length(bin50.var)),rep("bin-100bp",length(bin100.var)),rep("bin-200bp",length(bin200.var))), variance=c(geneSum.var,bin50.var,bin100.var,bin200.var))

ggplot(relative.var,aes(variance,colour=group))+geom_density()+xlab("Relative variation")+
  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(), axis.line = element_line(colour = "black",size = 1),
                   axis.title.x=element_text(size=20, face="bold", hjust=0.5,family = "arial"),
                   axis.title.y=element_text(size=20, face="bold", vjust=0.4, angle=90,family = "arial"),
                   legend.position = c(0.88,0.88),legend.title=element_blank(),legend.text = element_text(size = 14, face = "bold",family = "arial"),
                   axis.text = element_text(size = 15,face = "bold",family = "arial",colour = "black")  )

Compare MeRIP-seq IP data with regular RNA-seq data

var.coef <- function(x){sd(as.numeric(x))/mean(as.numeric(x))}

ip_coef_var <- t(apply(RADAR$ip_adjExpr_filtered,1,tapply,X,var.coef))
ip_coef_var <- ip_coef_var[!apply(ip_coef_var,1,function(x){return(any(is.na(x)))}),]
#hist(c(ip_coef_var),main = "M14KO mouse liver\n m6A-IP",xlab = "within group coefficient of variation",breaks = 50)
###
gene_coef_var <-  t(apply(RADAR$geneSum,1,tapply,X,var.coef))
gene_coef_var <- gene_coef_var[!apply(gene_coef_var,1,function(x){return(any(is.na(x)))}),]
#hist(c(gene_coef_var),main = "M14KO mouse liver\n RNA-seq",xlab = "within group coefficient of variation",breaks = 50)

coef_var <- list('RNA-seq'=c(gene_coef_var),'m6A-IP'=c(ip_coef_var))
nn<-sapply(coef_var, length)
rs<-cumsum(nn)
re<-rs-nn+1
grp <- factor(rep(names(coef_var), nn), levels=names(coef_var))
coef_var.df <- data.frame(coefficient_var = c(c(gene_coef_var),c(ip_coef_var)),label = grp)

ggplot(data = coef_var.df,aes(coefficient_var,colour = grp))+geom_density()+  theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(), axis.line = element_line(colour = "black",size = 1),
                   axis.title.x=element_text(size=20, face="bold", hjust=0.5,family = "arial"),
                   axis.title.y=element_text(size=20, face="bold", vjust=0.4, angle=90,family = "arial"),
                   legend.position = c(0.9,0.9),legend.title=element_blank(),legend.text = element_text(size = 18, face = "bold",family = "arial"),
                   axis.text = element_text(size = 15,face = "bold",family = "arial",colour = "black")   )+
  xlab("Coefficient of variation")

Check mean variance relationship of the data.

ip_var <- t(apply(RADAR$ip_adjExpr_filtered,1,tapply,X,var))
#ip_var <- ip_coef_var[!apply(ip_var,1,function(x){return(any(is.na(x)))}),]
ip_mean <- t(apply(RADAR$ip_adjExpr_filtered,1,tapply,X,mean))

###
gene_var <-  t(apply(RADAR$geneSum,1,tapply,X,var))
#gene_var <- gene_var[!apply(gene_var,1,function(x){return(any(is.na(x)))}),]
gene_mean <- t(apply(RADAR$geneSum,1,tapply,X,mean))

all_var <- list('RNA-seq'=c(gene_var),'m6A-IP'=c(ip_var))
nn<-sapply(all_var, length)
rs<-cumsum(nn)
re<-rs-nn+1
group <- factor(rep(names(all_var), nn), levels=names(all_var))
all_var.df <- data.frame(variance = c(c(gene_var),c(ip_var)),mean= c(c(gene_mean),c(ip_mean)),label = group)

ggplot(data = all_var.df,aes(x=mean,y=variance,colour = group,shape = group))+geom_point(size = I(0.2))+stat_smooth(se = T,show.legend = F)+stat_smooth(se = F)+theme_bw() +xlab("Mean")+ylab("Variance") +theme(panel.border = element_blank(), panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(), axis.line = element_line(colour = "black",size = 1),
                   axis.title.x=element_text(size=22, face="bold", hjust=0.5,family = "arial"),
                   axis.title.y=element_text(size=22, face="bold", vjust=0.4, angle=90,family = "arial"),
                   legend.position = c(0.85,0.95),legend.title=element_blank(),legend.text = element_text(size = 20, face = "bold",family = "arial"),
                   axis.text = element_text(size = 15,face = "bold",family = "arial",colour = "black") ) +
  scale_x_continuous(limits = c(0,5000))+scale_y_continuous(limits = c(0,4e6))

inputCount <- as.data.frame(RADAR$reads[,1:length(RADAR$samplenames)])
logInputCount <- as.data.frame(log( RADAR$geneSum  )  )

ggplot(logInputCount, aes(x = `Ctl18` , y = `T2D6` ))+geom_point()+geom_abline(intercept = 0,slope = 1, lty = 2, colour = "red")+xlab(paste("Log counts Ctl18") )+ylab(paste("Log counts T2D6") )+theme_classic(base_line_size = 0.8)+theme(axis.title = element_text(size = 12,face = "bold"), axis.text =  element_text(size = 10,face = "bold",colour = "black") )

ggplot(logInputCount, aes(x = `Ctl18` , y = `T2D5` ))+geom_point()+geom_abline(intercept = 0,slope = 1, lty = 2, colour = "red")+xlab(paste("Log counts Ctl18") )+ylab(paste("Log counts T2D5") )+theme_classic(base_line_size = 0.8)+theme(axis.title = element_text(size = 12,face = "bold"), axis.text =  element_text(size = 10,face = "bold",colour = "black") )

PCA analysis

After geting the filtered and normalized IP read count for testing, we can use PCA analysis to check unkown variation.

top_bins <- RADAR$ip_adjExpr_filtered[order(rowMeans(RADAR$ip_adjExpr_filtered))[1:15000],]

We plot PCA colored by Ctl vs T2D

plotPCAfromMatrix(top_bins,group = X)+scale_color_discrete(name = "Group")

(Norm by position)

plotPCAfromMatrix(RADAR_pos$ip_adjExpr_filtered[order(rowMeans(RADAR_pos$ip_adjExpr_filtered))[1:15000],],group = X )+scale_color_discrete(name = "Group")

To check if samples are separated by different sequencing lanes, label the samples by lane batch.

batch <- c("A","A","A","B","B","C","C","A","A","B","A","B","B","A","C")
plotPCAfromMatrix(top_bins,group = paste0(batch))+scale_color_discrete(name = "Batch")

We can see that the samples are clearly separated by batches and we need to account for batch effect in the test. Thus we incorporate batch and gender as known covariates in the test.

Check the PCA after r egressing out the batch and gender.

library(rafalib)
X2 <- as.fumeric(c("A","A","A","B","B","A","A","A","A","B","A","B","B","A","A"))-1 # batch as covariates
X3 <- as.fumeric(c("A","A","A","A","A","C","C","A","A","A","A","A","A","A","C"))-1 # batch as covariates
X4 <- as.fumeric(c("M","F","M","M","M","F","M","M","M","F","M","M","M","M","F"))-1 # sex as covariates
registerDoParallel(cores = 10)
m6A.cov.out <- foreach(i = 1:nrow(top_bins), .combine = rbind) %dopar% {
  Y = top_bins[i,]
  resi <- residuals( lm(log(Y+1) ~ X2 + X3 + X4 ) )
  resi
}
rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals)
rownames(m6A.cov.out) <- rownames(top_bins)

plotPCAfromMatrix(m6A.cov.out,X,loglink = FALSE)+scale_color_discrete(name = "Group")

After regressing out the covariates, the samples are separated by diseases status in the PCA plot.

Run PoissonGamma test

cov <- cbind(X2,X3,X4)
RADAR <-  diffIP_parallel(RADAR,Covariates = cov,thread = 20)
RADAR_pos <-  diffIP_parallel(RADAR_pos[1:14],thread = 25, Covariates = cov,plotPvalue = F)

Other method

In order to compare performance of other method on this data set, we run other three methods on default mode on this dataset.

deseq2.res <- DESeq2(countdata = RADAR$ip_adjExpr_filtered,pheno = matrix(c(rep(0,7),rep(1,8)),ncol = 1),covariates = cov)
edgeR.res <- edgeR(countdata = RADAR$ip_adjExpr_filtered,pheno = matrix(c(rep(0,7),rep(1,8)),ncol = 1),covariates = cov)

filteredBins <- rownames(RADAR$all.est)
QNB.res <- QNB::qnbtest(control_ip = RADAR$reads[filteredBins,16:22],
                        treated_ip = RADAR$reads[filteredBins,23:30],
                        control_input = RADAR$reads[filteredBins,1:7],
                        treated_input = RADAR$reads[filteredBins,8:15],plot.dispersion = FALSE)

deseq2.res_pos <- DESeq2(countdata = RADAR_pos$ip_adjExpr_filtered,pheno = matrix(X,ncol = 1),covariates = cov  )
edgeR.res_pos <- edgeR(countdata = RADAR_pos$ip_adjExpr_filtered,pheno = matrix(X,ncol = 1),covariates = cov )

Compare distribution of p value.

pvalues <- data.frame(pvalue = c(RADAR$all.est[,"p_value3"],deseq2.res$pvalue,edgeR.res$pvalue,QNB.res$pvalue),
                      method = factor(rep(c("RADAR","DESeq2","edgeR","QNB"),c(length(RADAR$all.est[,"p_value3"]),
                                                                       length(deseq2.res$pvalue),
                                                                       length(edgeR.res$pvalue),
                                                                       length(QNB.res$pvalue)
                                                                       )
                                          ),levels =c("RADAR","DESeq2","edgeR","QNB")
                                   )
                      )
ggplot(pvalues, aes(x = pvalue))+geom_histogram(breaks = seq(0,1,0.04),col=I("black"))+facet_grid(.~method)+theme_bw()+xlab("p-value")+theme( axis.title =  element_text(size = 22, face = "bold"),strip.text = element_text(size = 25, face = "bold"),axis.text.x = element_text(size = 18, face = "bold",colour = "black"),axis.text.y = element_text(size = 15, face = "bold",colour = "black",angle = 90),panel.grid = element_blank(), axis.line = element_line(size = 0.7 ,colour = "black"),axis.ticks = element_line(colour = "black"), panel.spacing = unit(0.4, "lines") )+ scale_x_continuous(breaks = seq(0,1,0.2),labels=function(x) sprintf("%.1f", x))

Compare the distibution of beta

par(mfrow = c(1,4))
hist(RADAR$all.est[,"beta1"],main = "RADAR",xlab = "beta",breaks = 50,col =rgb(0.2,0.2,0.2,0.5),cex.main = 2.5,cex.axis =2,cex.lab=2)
hist(deseq2.res$log2FC, main = "DESeq2",xlab = "beta",breaks = 100,col =rgb(0.2,0.2,0.2,0.5),cex.main = 2.5,cex.axis =2,cex.lab=2)
hist(edgeR.res$log2FC,main = "edgeR", xlab = "beta",breaks = 50,col =rgb(0.2,0.2,0.2,0.5),cex.main = 2.5,cex.axis =2,cex.lab=2)
hist(QNB.res$log2.OR, main = "QNB",xlab = "beta",breaks = 50,col =rgb(0.2,0.2,0.2,0.5),cex.main = 2.5,cex.axis =2,cex.lab=2)

Compare norm by geneSum with norm by position

par(mfrow = c(1,3))

tmp <- hist(c(RADAR_pos$all.est[,"p_value3"]),plot = F)
tmp$counts <- tmp$counts*(nrow(RADAR$ip_adjExpr_filtered)/nrow(RADAR_pos$ip_adjExpr_filtered))
hist(RADAR$all.est[,"p_value3"],main = "RADAR",xlab = "p value",col =rgb(1,0,0,0.4),cex.main = 2.5,cex.axis =2,cex.lab=2, ylim = c(0,max(hist(RADAR$all.est[,"p_value3"],plot = F)$counts,tmp$counts)) )
plot(tmp,col=rgb(0,0,1,0.5),add = T)
axis(side = 1, lwd = 1,cex.axis =2)
axis(side = 2, lwd = 1,cex.axis =2)
legend("topright", c("By geneSum", "By position"), col=c(rgb(1,0,0,0.4), rgb(0,0,1,0.5)), lwd = 10,bty="n",text.font = 2)

y.scale <- max(hist(RADAR$all.est[,"p_value3"],plot = F)$counts)

tmp <- hist(c(deseq2.res_pos$pvalue),plot = F)
tmp$counts <- tmp$counts*(nrow(RADAR$ip_adjExpr_filtered)/nrow(RADAR_pos$ip_adjExpr_filtered))
hist(deseq2.res$pvalue,main = "DESeq2",xlab = "p value",col =rgb(1,0,0,0.4),cex.main = 2.5,cex.axis =2,cex.lab=2, ylim = c(0,y.scale) )
plot(tmp,col=rgb(0,0,1,0.5),add = T)
axis(side = 1, lwd = 1,cex.axis =2)
axis(side = 2, lwd = 1,cex.axis =2)
legend("topright", c("By geneSum", "By position"), col=c(rgb(1,0,0,0.4), rgb(0,0,1,0.5)), lwd = 10,bty="n",text.font = 2)

tmp <- hist(c(edgeR.res_pos$pvalue),plot = F)
tmp$counts <- tmp$counts*(nrow(RADAR$ip_adjExpr_filtered)/nrow(RADAR_pos$ip_adjExpr_filtered))
hist(edgeR.res$pvalue,main = "edgeR",xlab = "p value",col =rgb(1,0,0,0.4),cex.main = 2.5,cex.axis =2,cex.lab=2, ylim = c(0,max(hist(edgeR.res$pvalue,plot = F)$counts,tmp$counts,y.scale)) )
plot(tmp,col=rgb(0,0,1,0.5),add = T)
axis(side = 1, lwd = 1,cex.axis =2)
axis(side = 2, lwd = 1,cex.axis =2)
legend("topright", c("By geneSum", "By position"), col=c(rgb(1,0,0,0.4), rgb(0,0,1,0.5)), lwd = 10,bty="n",text.font = 2)

Number of significant bins detected at qvalue < 0.1

sigBins <- apply(cbind("RADAR"=RADAR$all.est[,"p_value3"],"DESeq2"=deseq2.res$pvalue,"edgeR"=edgeR.res$pvalue,"QNB"=QNB.res$pvalue),2, function(x){
  length( which( qvalue::qvalue(x)$qvalue < 0.1 ) )
})
print(sigBins)

##  RADAR DESeq2  edgeR    QNB
##  28031     17   7531     27

Plot overlap of significant hits

plot.new()
MyTools::venn_diagram4( names(which( qvalue::qvalue(RADAR$all.est[,"p_value3"])$qvalue < 0.1 ) ),
                        rownames(RADAR$all.est)[which( qvalue::qvalue(deseq2.res$pvalue)$qvalue < 0.1 ) ],
                        rownames(RADAR$all.est)[which( qvalue::qvalue(edgeR.res$pvalue)$qvalue < 0.1 ) ],
                        rownames(RADAR$all.est)[which( qvalue::qvalue(QNB.res$pvalue)$qvalue < 0.1 ) ],
                        "RADAR","DESeq2","edgeR","QNB")

self-FDR test

load("~/PoissonGamma_Method/T2D_data/selfFDR.RData")
ggplot(data = selfFDR, aes(x=method, y= self_FDR, colour = qvalue_cutoff))+geom_boxplot()+theme_bw() +xlab("Method")+ylab("Self-FDR")+ theme(panel.border = element_blank(), panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(), axis.line = element_line(colour = "black",size = 1),
                   axis.title.x=element_text(size=20, face="bold", hjust=0.5),
                   axis.title.y=element_text(size=20, face="bold", vjust=0.4, angle=90),
                   legend.title=element_text(size = 15,face = "bold"),legend.text = element_text(size = 18, face = "bold",family = "arial"),
                   axis.text = element_text(size = 15,face = "bold",colour = "black") , axis.ticks = element_line(colour = "black",size = 1) )+scale_color_discrete(name = "qvalue\ncutoff")

selfFDR_pos <- NULL
samplenames <- RADAR$samplenames
set.seed(3)
for(i in 1:20){
  ## Sample subset
  sample_id <-  c(sample(x = 1:7, size = 5,replace = F),7+sample(x = 1:8, size = 6,replace = F) )
  Truth_id <- setdiff(1:12,sample_id)


  if(length(unique(X2[sample_id])) == 1 ){ ## samples all from one batch
    ## run radar
  radar_sample <- diffIP_parallel(RADAR_pos[1:14] ,Covariates = cov[,2:3],exclude = samplenames[setdiff(1:15,sample_id)],thread = 15,plotPvalue = F)$all.est
  radar_sample <- cbind(radar_sample, "qvalue"= qvalue::qvalue(radar_sample[,"p_value3"])$qvalue  )
  radar_truth <-  RADAR_pos$all.est
  radar_truth <- cbind(radar_truth, "qvalue"= qvalue::qvalue(radar_truth[,"p_value3"])$qvalue   )

  radar_selfFDR <- sapply(c(0.1,0.05,0.01),function(x){length( which(! which(radar_sample[,"qvalue"]<x) %in% which(radar_truth[,"qvalue"]<0.1) ) )/length(which(radar_sample[,"qvalue"]<x))})

  ## run DESeq2
  deseq_sample <- DESeq2(countdata = RADAR_pos$ip_adjExpr_filtered[,sample_id],pheno = matrix(c(rep(0,7),rep(1,8))[sample_id],ncol = 1),covariates = cov[sample_id,2:3] )
  deseq_sample$qvalue <- qvalue::qvalue( deseq_sample$pvalue )$qvalue
  deseq_truth <- deseq2.res_pos
  deseq_truth$qvalue <- qvalue::qvalue( deseq_truth$pvalue )$qvalue

  deseq_selfFDR <-  sapply(c(0.1,0.05,0.01),function(x){length( which(! which(deseq_sample[,"qvalue"]<x) %in% which(deseq_truth[,"qvalue"]<0.1) ) )/length(which(deseq_sample[,"qvalue"]<x))})

  ## run edgeR
  edgeR_sample <- edgeR(countdata = RADAR_pos$ip_adjExpr_filtered[,sample_id],pheno = matrix(c(rep(0,7),rep(1,8))[sample_id],ncol = 1),covariates = cov[sample_id,2:3] )
  edgeR_sample$qvalue <- qvalue::qvalue( edgeR_sample$pvalue )$qvalue
  edgeR_truth <- edgeR.res_pos
  edgeR_truth$qvalue <- qvalue::qvalue( edgeR_truth$pvalue )$qvalue

  edgeR_selfFDR <-  sapply(c(0.1,0.05,0.01),function(x){length( which(! which(edgeR_sample[,"qvalue"]<x) %in% which(edgeR_truth[,"qvalue"]<0.1) ) )/length(which(edgeR_sample[,"qvalue"]<x))})

   fdr <- rbind( data.frame(self_FDR = radar_selfFDR, qvalue_cutoff=c("0.1","0.05","0.01"), method = rep("RADAR",3) ),
                   data.frame(self_FDR = deseq_selfFDR, qvalue_cutoff=c("0.1","0.05","0.01"), method = rep("DESeq2",3) ),
                   data.frame(self_FDR = edgeR_selfFDR, qvalue_cutoff=c("0.1","0.05","0.01"), method = rep("edgeR",3) )
                   )

  selfFDR_pos <- rbind(selfFDR_pos,fdr)
  }else if(length(unique(X3[sample_id])) == 1 ){
     ## run radar
  radar_sample <- diffIP_parallel(RADAR_pos[1:14] ,Covariates = cov[,c(1,3)],exclude = samplenames[setdiff(1:15,sample_id)],thread = 15,plotPvalue = F)$all.est
  radar_sample <- cbind(radar_sample, "qvalue"= qvalue::qvalue(radar_sample[,"p_value3"])$qvalue  )
  radar_truth <-  RADAR_pos$all.est
  radar_truth <- cbind(radar_truth, "qvalue"= qvalue::qvalue(radar_truth[,"p_value3"])$qvalue   )

  radar_selfFDR <- sapply(c(0.1,0.05,0.01),function(x){length( which(! which(radar_sample[,"qvalue"]<x) %in% which(radar_truth[,"qvalue"]<0.1) ) )/length(which(radar_sample[,"qvalue"]<x))})

  ## run DESeq2
  deseq_sample <- DESeq2(countdata = RADAR_pos$ip_adjExpr_filtered[,sample_id],pheno = matrix(c(rep(0,7),rep(1,8))[sample_id],ncol = 1),covariates = cov[sample_id,c(1,3)] )
  deseq_sample$qvalue <- qvalue::qvalue( deseq_sample$pvalue )$qvalue
  deseq_truth <- deseq2.res_pos
  deseq_truth$qvalue <- qvalue::qvalue( deseq_truth$pvalue )$qvalue

  deseq_selfFDR <-  sapply(c(0.1,0.05,0.01),function(x){length( which(! which(deseq_sample[,"qvalue"]<x) %in% which(deseq_truth[,"qvalue"]<0.1) ) )/length(which(deseq_sample[,"qvalue"]<x))})

  ## run edgeR
  edgeR_sample <- edgeR(countdata = RADAR_pos$ip_adjExpr_filtered[,sample_id],pheno = matrix(c(rep(0,7),rep(1,8))[sample_id],ncol = 1),covariates = cov[sample_id,c(1,3)] )
  edgeR_sample$qvalue <- qvalue::qvalue( edgeR_sample$pvalue )$qvalue
  edgeR_truth <- edgeR.res_pos
  edgeR_truth$qvalue <- qvalue::qvalue( edgeR_truth$pvalue )$qvalue

   edgeR_selfFDR <-  sapply(c(0.1,0.05,0.01),function(x){length( which(! which(edgeR_sample[,"qvalue"]<x) %in% which(edgeR_truth[,"qvalue"]<0.1) ) )/length(which(edgeR_sample[,"qvalue"]<x))})

   fdr <- rbind( data.frame(self_FDR = radar_selfFDR, qvalue_cutoff=c("0.1","0.05","0.01"), method = rep("RADAR",3) ),
                   data.frame(self_FDR = deseq_selfFDR, qvalue_cutoff=c("0.1","0.05","0.01"), method = rep("DESeq2",3) ),
                   data.frame(self_FDR = edgeR_selfFDR, qvalue_cutoff=c("0.1","0.05","0.01"), method = rep("edgeR",3) )
                   )

  selfFDR_pos <- rbind(selfFDR_pos,fdr)
  }else if(length(unique(X4[sample_id])) == 1){
  ## run radar
  radar_sample <- diffIP_parallel(RADAR_pos[1:14] ,Covariates = cov[,c(1,2)],exclude = samplenames[setdiff(1:15,sample_id)],thread = 15,plotPvalue = F)$all.est
  radar_sample <- cbind(radar_sample, "qvalue"= qvalue::qvalue(radar_sample[,"p_value3"])$qvalue  )
  radar_truth <-  RADAR_pos$all.est
  radar_truth <- cbind(radar_truth, "qvalue"= qvalue::qvalue(radar_truth[,"p_value3"])$qvalue   )

  radar_selfFDR <- sapply(c(0.1,0.05,0.01),function(x){length( which(! which(radar_sample[,"qvalue"]<x) %in% which(radar_truth[,"qvalue"]<0.1) ) )/length(which(radar_sample[,"qvalue"]<x))})

  ## run DESeq2
  deseq_sample <- DESeq2(countdata = RADAR_pos$ip_adjExpr_filtered[,sample_id],pheno = matrix(c(rep(0,7),rep(1,8))[sample_id],ncol = 1),covariates = cov[sample_id,c(1,2)] )
  deseq_sample$qvalue <- qvalue::qvalue( deseq_sample$pvalue )$qvalue
  deseq_truth <- deseq2.res_pos
  deseq_truth$qvalue <- qvalue::qvalue( deseq_truth$pvalue )$qvalue

  deseq_selfFDR <-  sapply(c(0.1,0.05,0.01),function(x){length( which(! which(deseq_sample[,"qvalue"]<x) %in% which(deseq_truth[,"qvalue"]<0.1) ) )/length(which(deseq_sample[,"qvalue"]<x))})

  ## run edgeR
  edgeR_sample <- edgeR(countdata = RADAR_pos$ip_adjExpr_filtered[,sample_id],pheno = matrix(c(rep(0,7),rep(1,8))[sample_id],ncol = 1),covariates = cov[sample_id,c(1,2)] )
  edgeR_sample$qvalue <- qvalue::qvalue( edgeR_sample$pvalue )$qvalue
  edgeR_truth <- edgeR.res_pos
  edgeR_truth$qvalue <- qvalue::qvalue( edgeR_truth$pvalue )$qvalue

   edgeR_selfFDR <-  sapply(c(0.1,0.05,0.01),function(x){length( which(! which(edgeR_sample[,"qvalue"]<x) %in% which(edgeR_truth[,"qvalue"]<0.1) ) )/length(which(edgeR_sample[,"qvalue"]<x))})

   fdr <- rbind( data.frame(self_FDR = radar_selfFDR, qvalue_cuttoff=c("0.1","0.05","0.01"), method = rep("RADAR",3) ),
                   data.frame(self_FDR = deseq_selfFDR, qvalue_cuttoff=c("0.1","0.05","0.01"), method = rep("DESeq2",3) ),
                   data.frame(self_FDR = edgeR_selfFDR, qvalue_cuttoff=c("0.1","0.05","0.01"), method = rep("edgeR",3) )
                   )

  selfFDR_pos <- rbind(selfFDR_pos,fdr)
  }else{
  ## run radar
  radar_sample <- diffIP_parallel(RADAR_pos[1:14],Covariates = cov ,exclude = samplenames[setdiff(1:15,sample_id)],thread = 15,plotPvalue = F)$all.est
  radar_sample <- cbind(radar_sample, "qvalue"= qvalue::qvalue(radar_sample[,"p_value3"])$qvalue  )
  radar_truth <-  RADAR_pos$all.est
  radar_truth <- cbind(radar_truth, "qvalue"= qvalue::qvalue(radar_truth[,"p_value3"])$qvalue   )

  radar_selfFDR <- sapply(c(0.1,0.05,0.01),function(x){length( which(! which(radar_sample[,"qvalue"]<x) %in% which(radar_truth[,"qvalue"]<0.1) ) )/length(which(radar_sample[,"qvalue"]<x))})

  ## run DESeq2
  deseq_sample <- DESeq2(countdata = RADAR_pos$ip_adjExpr_filtered[,sample_id],pheno = matrix(c(rep(0,7),rep(1,8))[sample_id],ncol = 1),covariates =  cov[sample_id,] )
  deseq_sample$qvalue <- qvalue::qvalue( deseq_sample$pvalue )$qvalue
  deseq_truth <- deseq2.res_pos
  deseq_truth$qvalue <- qvalue::qvalue( deseq_truth$pvalue )$qvalue

  deseq_selfFDR <-  sapply(c(0.1,0.05,0.01),function(x){length( which(! which(deseq_sample[,"qvalue"]<x) %in% which(deseq_truth[,"qvalue"]<0.1) ) )/length(which(deseq_sample[,"qvalue"]<x))})

  ## run edgeR
  edgeR_sample <- edgeR(countdata = RADAR_pos$ip_adjExpr_filtered[,sample_id],pheno = matrix(c(rep(0,7),rep(1,8))[sample_id],ncol = 1),covariates =  cov[sample_id,] )
  edgeR_sample$qvalue <- qvalue::qvalue( edgeR_sample$pvalue )$qvalue
  edgeR_truth <- edgeR.res_pos
  edgeR_truth$qvalue <- qvalue::qvalue( edgeR_truth$pvalue )$qvalue

   edgeR_selfFDR <-  sapply(c(0.1,0.05,0.01),function(x){length( which(! which(edgeR_sample[,"qvalue"]<x) %in% which(edgeR_truth[,"qvalue"]<0.1) ) )/length(which(edgeR_sample[,"qvalue"]<x))})

   fdr <- rbind( data.frame(self_FDR = radar_selfFDR, qvalue_cutoff=c("0.1","0.05","0.01"), method = rep("RADAR",3) ),
                   data.frame(self_FDR = deseq_selfFDR, qvalue_cutoff=c("0.1","0.05","0.01"), method = rep("DESeq2",3) ),
                   data.frame(self_FDR = edgeR_selfFDR, qvalue_cutoff=c("0.1","0.05","0.01"), method = rep("edgeR",3) )
                   )

  selfFDR_pos <- rbind(selfFDR_pos,fdr)
  }
}
selfFDR_pos$qvalue_cutoff <- factor(selfFDR_pos$qvalue_cutoff,levels = c("0.1","0.05","0.01"))
save(selfFDR_pos, file = "~/PoissonGamma_Method/T2D_data/selfFDR_pos.RData")

Self-FDR norm by local bin read count

load("~/PoissonGamma_Method/T2D_data/selfFDR_pos.RData")
selfFDR_all <- cbind(rbind(selfFDR, selfFDR_pos), "NormBy"= c(rep("By gene-level count",nrow(selfFDR)),rep("By bin-level count",nrow(selfFDR_pos))) )
selfFDR_all$NormBy <- factor(selfFDR_all$NormBy,levels = c("By gene-level count","By bin-level count"))

ggplot(data = selfFDR_all, aes(x=method, y= self_FDR, colour = qvalue_cutoff))+geom_boxplot()+facet_grid(~NormBy)+theme_bw()  +xlab("Method")+ylab("Self-FDR")+ theme( panel.grid.major = element_blank(),
                   axis.line = element_line(colour = "black",size = 1),
                   axis.title.x=element_text(size=20, face="bold", hjust=0.5,family = "arial"),
                   axis.title.y=element_text(size=20, face="bold", vjust=0.4, angle=90,family = "arial"),
                   legend.title=element_text(size = 15,face = "bold"),legend.text = element_text(size = 18, face = "bold",family = "arial"),
                   axis.text = element_text(size = 18,face = "bold",family = "arial",colour = "black") ,strip.text.x = element_text(size = 20,face = "bold") )+scale_color_discrete(name = "qvalue\ncutoff")

Permution

radar_permuted_P <-NULL
deseq_permuted_P <- NULL
edgeR_permuted_P <- NULL

set.seed(51)
for(i in 1:10){

  permute_X <- unlist( tapply(RADAR$X,batch,function(x){sample(x,length(x))}) )[match(RADAR$samplenames,unlist( tapply(RADAR$samplenames,batch,function(x)return(x)) ))]
  if ( all( permute_X == RADAR$X) | all( permute_X == c(rep("T2D",8),rep("Ctl",7))) ){
    cat("Duplicated with original design...\n")
  }else{
    radar_permute <- diffIP_parallel(c(RADAR[1:11],list("X"=permute_X),RADAR[13:14]) ,thread = 15,Covariates = cov,plotPvalue = F)$all.est[,"p_value3"]
    deseq_permute <- DESeq2(countdata = RADAR$ip_adjExpr_filtered,pheno = matrix(permute_X,ncol = 1),covariates = cov )$pvalue
    edgeR_permute <- edgeR(countdata = RADAR$ip_adjExpr_filtered,pheno = matrix(permute_X,ncol = 1),covariates =  cov )$pvalue

    radar_permuted_P <- cbind(radar_permuted_P, radar_permute)
    deseq_permuted_P <- cbind(deseq_permuted_P,deseq_permute)
    edgeR_permuted_P <- cbind(edgeR_permuted_P,edgeR_permute)
  }
}
save(radar_permuted_P ,deseq_permuted_P ,edgeR_permuted_P, file= "~/PoissonGamma_Method/T2D_data/permutation.RData")

load( "~/PoissonGamma_Method/T2D_data/permutation.RData")
for( i in 1:ncol(radar_permuted_P)){
par(mfrow=c(1,3))
hist(RADAR$all.est[,"p_value3"], col =rgb(1,0,0,0.4),main = "RADAR", xlab = "p value",cex.main = 2.5,cex.axis =2,cex.lab=2)
hist(radar_permuted_P[,i],col=rgb(0,0,1,0.5),add = T)
legend("topright", c("Original", "Permuted"), col=c(rgb(1,0,0,0.4), rgb(0,0,1,0.5)), lwd = 12,bty="n",text.font = 2, cex = 1.5)

hist(deseq2.res$pvalue, col =rgb(1,0,0,0.4),main = "DESeq2",xlab = "p value",cex.main = 2.5,cex.axis =2,cex.lab=2, ylim =c(0,max(hist(deseq_permuted_P[,i],plot = F)$counts) ) )
hist(deseq_permuted_P[,i],col=rgb(0,0,1,0.5),add = T)
legend("topright", c("Original", "Permuted"), col=c(rgb(1,0,0,0.4), rgb(0,0,1,0.5)), lwd = 12,bty="n",text.font = 2, cex = 1.5)

hist(edgeR.res$pvalue, col =rgb(1,0,0,0.4),main = "edgeR",xlab = "p value",cex.main = 2.5,cex.axis =2,cex.lab=2)
hist(edgeR_permuted_P[,i],col=rgb(0,0,1,0.5),add = T)
legend("topright", c("Original", "Permuted"), col=c(rgb(1,0,0,0.4), rgb(0,0,1,0.5)), lwd = 12,bty="n",text.font = 2, cex = 1.5)

}

combine all sets of permuted p value in one

par(mfrow=c(1,2))
hist(RADAR$all.est[,"p_value3"], col =rgb(1,0,0,0.4),main = "RADAR", xlab = "p value",font=2, cex.lab=1.5,  font.lab=2 ,cex.main = 2,cex.axis =1, lwd = 2.5)
tmp <- hist(c(radar_permuted_P),plot = F)
tmp$counts <- tmp$counts/10
plot(tmp,col=rgb(0,0,1,0.5),add = T)
legend("topright", c("Original", "Permuted"), col=c(rgb(1,0,0,0.4), rgb(0,0,1,0.5)), lwd = 12,bty="n",text.font = 2, cex = 1)

y.scale <- max(hist(RADAR$all.est[,"p_value3"],plot = F)$counts)

#hist(deseq2.res$pvalue, col =rgb(1,0,0,0.4),main = "DESeq2",xlab = "p value")
#tmp <- hist(c(deseq_permuted_P),plot = F)
#tmp$counts <- tmp$counts/10
#plot(tmp,col=rgb(0,0,1,0.5),add = T)
#legend("topright", c("Original", "Permuted"), col=c(rgb(1,0,0,0.4), rgb(0,0,1,0.5)), lwd = 10,bty="n")
hist(edgeR.res$pvalue, col =rgb(1,0,0,0.4),main = "edgeR",xlab = "p value",font=2, cex.lab=1.5,  font.lab=2 ,cex.main = 2 , cex.axis =1, ylim = c(0,y.scale), lwd = 2.5)
tmp <- hist(c(edgeR_permuted_P),plot = F)
tmp$counts <- tmp$counts/10
plot(tmp,col=rgb(0,0,1,0.5),add = T)
legend("topright", c("Original", "Permuted"), col=c(rgb(1,0,0,0.4), rgb(0,0,1,0.5)), lwd = 12,bty="n",text.font = 2, cex = 1)

Plot Heat map of significant bins by RADAR

topBins <- RADAR$ip_adjExpr_filtered[which(qvalue(RADAR$all.est[,"p_value3"])$qvalue<0.1),]
log_topBins <- log(topBins+1)

    registerDoParallel(cores = 4)
    cov.out.RADAR <- foreach(i = 1:nrow(log_topBins), .combine = rbind) %dopar% {
      Y = log_topBins[i,]
      tmp_data <- as.data.frame(cbind(Y,cov))
      resi <- residuals( lm(Y~.,data=tmp_data  ) )
      resi
    }
    rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals)
    rownames(cov.out.RADAR) <- rownames(log_topBins)
    colnames(cov.out.RADAR) <- colnames(log_topBins)

    dist.pear <- function(x) as.dist(1-cor(t(x)))
    hclust.ave <- function(x) hclust(x, method="average")
    gplots::heatmap.2(cov.out.RADAR,scale="row",trace="none",labRow=NA,main = "Methylation level of significant bins\n(Covariates regressed out)",
                      distfun=dist.pear, hclustfun=hclust.ave,col=rev(RColorBrewer::brewer.pal(9,"RdBu")))

Plot heap map of significant bins by edgeR

topBins <- RADAR$ip_adjExpr_filtered[which(qvalue(edgeR.res$pvalue)$qvalue<0.1),]
log_topBins <- log(topBins+1)

    registerDoParallel(cores = 4)
    cov.out.edgeR <- foreach(i = 1:nrow(log_topBins), .combine = rbind) %dopar% {
      Y = log_topBins[i,]
      tmp_data <- as.data.frame(cbind(Y,cov))
      resi <- residuals( lm(Y~.,data=tmp_data  ) )
      resi
    }
    rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals)
    rownames(cov.out.edgeR) <- rownames(log_topBins)
    colnames(cov.out.edgeR) <- colnames(log_topBins)

    dist.pear <- function(x) as.dist(1-cor(t(x)))
    hclust.ave <- function(x) hclust(x, method="average")
    gplots::heatmap.2(cov.out.edgeR,scale="row",trace="none",labRow=NA,main = "Methylation level of significant bins\n(Covariates regressed out)",
                      distfun=dist.pear, hclustfun=hclust.ave,col=rev(RColorBrewer::brewer.pal(9,"RdBu")))

Session information

sessionInfo()

## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 17.10
##
## Matrix products: default
## BLAS: /usr/local/lib/R/lib/libRblas.so
## LAPACK: /usr/local/lib/R/lib/libRlapack.so
##
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
##  [1] grid      stats4    parallel  stats     graphics  grDevices utils
##  [8] datasets  methods   base
##
## other attached packages:
##  [1] VennDiagram_1.6.20        futile.logger_1.4.3
##  [3] qvalue_2.10.0             RADAR_0.1.5
##  [5] RcppArmadillo_0.9.100.5.0 Rcpp_0.12.19
##  [7] RColorBrewer_1.1-2        gplots_3.0.1
##  [9] doParallel_1.0.14         iterators_1.0.10
## [11] foreach_1.4.4             ggplot2_3.0.0
## [13] Rsamtools_1.30.0          Biostrings_2.46.0
## [15] XVector_0.18.0            GenomicFeatures_1.30.3
## [17] AnnotationDbi_1.40.0      Biobase_2.38.0
## [19] GenomicRanges_1.30.3      GenomeInfoDb_1.14.0
## [21] IRanges_2.12.0            S4Vectors_0.16.0
## [23] BiocGenerics_0.24.0
##
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-131.1             bitops_1.0-6
##  [3] matrixStats_0.54.0         bit64_0.9-7
##  [5] progress_1.2.0             httr_1.3.1
##  [7] rprojroot_1.3-2            tools_3.4.4
##  [9] backports_1.1.2            R6_2.2.2
## [11] KernSmooth_2.23-15         mgcv_1.8-23
## [13] DBI_1.0.0                  lazyeval_0.2.1
## [15] colorspace_1.3-2           withr_2.1.2
## [17] gridExtra_2.3              tidyselect_0.2.4
## [19] prettyunits_1.0.2          RMySQL_0.10.15
## [21] bit_1.1-14                 compiler_3.4.4
## [23] formatR_1.5                DelayedArray_0.4.1
## [25] labeling_0.3               rtracklayer_1.38.3
## [27] caTools_1.17.1             scales_0.5.0
## [29] stringr_1.3.1              digest_0.6.15
## [31] rmarkdown_1.10             DOSE_3.4.0
## [33] pkgconfig_2.0.1            htmltools_0.3.6
## [35] highr_0.7                  rlang_0.2.1
## [37] RSQLite_2.1.1              bindr_0.1.1
## [39] BiocParallel_1.12.0        gtools_3.8.1
## [41] GOSemSim_2.4.1             dplyr_0.7.6
## [43] RCurl_1.95-4.10            magrittr_1.5
## [45] GO.db_3.5.0                GenomeInfoDbData_1.0.0
## [47] Matrix_1.2-12              munsell_0.5.0
## [49] stringi_1.2.3              yaml_2.1.19
## [51] SummarizedExperiment_1.8.1 zlibbioc_1.24.0
## [53] plyr_1.8.4                 blob_1.1.1
## [55] gdata_2.18.0               DO.db_2.9
## [57] crayon_1.3.4               lattice_0.20-35
## [59] splines_3.4.4              hms_0.4.2
## [61] knitr_1.20                 pillar_1.2.3
## [63] igraph_1.2.1               fgsea_1.4.1
## [65] reshape2_1.4.3             codetools_0.2-15
## [67] biomaRt_2.34.2             futile.options_1.0.1
## [69] fastmatch_1.1-0            XML_3.98-1.11
## [71] glue_1.2.0                 evaluate_0.10.1
## [73] lambda.r_1.2.3             data.table_1.11.4
## [75] tidyr_0.8.1                gtable_0.2.0
## [77] purrr_0.2.5                assertthat_0.2.0
## [79] tibble_1.4.2               clusterProfiler_3.6.0
## [81] rvcheck_0.1.0              GenomicAlignments_1.14.2
## [83] memoise_1.1.0              MyTools_0.0.0
## [85] bindrcpp_0.2.2

This R Markdown site was created with workflowr

T2D

Zijie Zhang

2018-08-21