#dat
N_TCGA-A7-A0CE N_TCGA-A7-A13F N_TCGA-BH-A0BV N_TCGA-BH-A0BZ N_TCGA-BH-A0C0 N_TCGA-BH-A0DD N_TCGA-BH-A0DG N_TCGA-BH-A0E0 N_TCGA-BH-A0E1
ENSG00000242268 2235.104 8930.394 1669.3990 7220.17900 843.38370 17623.99000 1365.77 3989.945 0.00
ENSG00000270112 0.000 0.000 115.8172 71.55877 58.51108 90.56981 0.00 0.000 0.00
ENSG00000167578 87871.320 48200.280 39664.6500 56268.88000 41750.38000 60623.46000 34482.49 60082.270 38414.85
ENSG00000273842 0.000 0.000 0.0000 0.00000 0.00000 0.00000 0.00 0.000 0.00
ENSG00000078237 62259.070 47192.540 55185.9700 92661.98000 29613.28000 60556.09000 38171.99 57631.810 39990.15
ENSG00000146083 197103.800 192761.900 227316.9000 169238.70000 242694.80000 143619.00000 184802.10 254256.800 221540.70
#zero value count
n_ch = c()
for(i in 1:nrow(test_dat))
{
tmp = c(length(which(test_dat[i,1:17]==0)),length(which(test_dat[i,18:ncol(test_dat)]==0)))
n_ch = rbind(n_ch,tmp)
}
}
rownames(n_ch) = rownames(test_dat)
pie(c(21774,38709),c('#value ==0', '#value > 0'), main ="Number of values per gene in 17 noraml sample")
barplot(table(n_ch[,2]), ylim = c(0,30000), main = "Number of values per gene in 105 cancer sample")
library(limma)
vd_a = row.names(n_ch[which(as.numeric(n_ch[,1])==17 ),])
vd_b = row.names(n_ch[which(as.numeric(n_ch[,2]) > 1),])
universe <- sort(unique(c(vd_a, vd_b)))
Counts <- matrix(0, nrow=length(universe), ncol=2)
for (i in 1:length(universe)) {
Counts[i,1] <- universe[i] %in% vd_b
Counts[i,2] <- universe[i] %in% vd_a
}
colnames(Counts) <- c("C(#V<2)","N(#V=0)")
cols<-c("Red", "Green")
vennDiagram(vennCounts(Counts), circle.col=cols)
F_gene = unique(c(vd_a,vd_b))
F_dat = c()
t = c()
for (i in 1:nrow(test_dat))
{
if((rownames(test_dat)[i] %in% F_gene)==FALSE)
{
F_dat = rbind(F_dat,test_dat[i,])
t = c(t,rownames(test_dat)[i])
}
}
rownames(F_dat) = t
#glm testing
group <- factor(c(rep('N',17),c(rep('C',105))))
y <- DGEList(counts=F_dat,group=group, genes=rownames(F_dat))
#keep = rowSums(cpm(y)>1) > 100
#y <- y[keep, , keep.lib.sizes=FALSE]
#table(keep)
#model make
y <- calcNormFactors(y)
design <- model.matrix(~group)
y <- estimateDisp(y,design)
#test
fit <- glmFit(y,design)
lrt <- glmLRT(fit,coef=2)
#FDR
#FDR <- p.adjust(et$table$PValue, method="fdr")
FDR <- p.adjust(lrt$table$PValue, method="FDR")
sum(FDR < 0.001)
#15964
plotBCV(y)
#volcano plot
deg <- decideTestsDGE(lrt,adjust.method="fdr",p.value=0.05, lfc=1)
summary(is.de)
plotSmear(lrt, de.tags=rownames(lrt)[ as.logical( dt_significant )])
abline(h=c(-1, 1), col="blue")
#heatmap
library(gplots)
deg <- as.logical( decideTestsDGE( lrt, adjust.method="fdr", p.value=0.05, lfc=1) )
d1 <- rownames(F_dat)[ deg ]
d2 <- lrt$fitted.values[d1 , ]
vctr_colors = as.factor( c( "pink", "skyblue" ))
vctr_sample_colors <- as.character( vctr_colors[ as.numeric( lrt$samples$group ) ] )
heatmap.2( log2( d2 + 1 ), ColSideColors=vctr_sample_colors, key=TRUE, trace="none", col=heat.colors(200), scale="row" )
#DEG table
r_dat = cbind(lrt$fitted.values,lrt$table,FDR)[ deg,]
write.table(r_dat,'/Volumes/Data1/part1/20170511_TCGA_RNA/tmp/20170510_DEG.txt',sep='\t')
#annotation
library(biomaRt)
listMarts() # to see which database options are present
ensembl=useMart("ensembl") # using ensembl database data
listDatasets(ensembl) # function to see which datasets are present in ensembl
ensembl=useDataset("hsapiens_gene_ensembl",mart=ensembl) # from ensembl using homosapien gene data
listFilters(ensembl) # check which filters are available
listAttributes(ensembl) # check attributes are available to select.More information on ensembl data base
id=getBM(attributes=c("ensembl_gene_id","entrezgene","hgnc_symbol","go_id","name_1006","kegg_enzyme") ,filters = "ensembl_gene_id",values=rownames(r_dat), mart= ensembl) # fuction to get gene id's and gene name from data base
t_id = c()
g_id = unique(id[,1])
for(i in 1:length(g_id))
{
tmp = id[which(id[,1] == g_id[i]),]
t_id = rbind(t_id,apply(tmp,2,function(x) paste(unique(x),collapse=':') ))
}
t_dat2 = c()
for(i in 1:nrow(r_dat))
{
if ((length(which(rownames(r_dat)[i] == t_id[,1]))) > 0)
{
t_dat2 = rbind(t_dat2,c(r_dat[i,],t_id[which(rownames(r_dat)[i] == t_id[,1]),-1]))
}
else
{
t_dat2 = rbind(t_dat2,c(r_dat[i,],'NA','NA','NA','NA'))
}
}
rownames(t_dat2) = rownames(r_dat)
colnames(t_dat2) = c(colnames(r_dat),"entrezgene","hgnc_symbol","go_id","go_name")
write.table(t_dat2,'/Volumes/Data1/part1/20170511_TCGA_RNA/tmp/20170510_DEG.txt',sep='\t')
'Data analysis > RNA-seq' 카테고리의 다른 글
[RNA-seq]edgeR analysis code !!! group 지정버전 !!! 엄청 좋음 !! (0) | 2018.07.12 |
---|---|
edge R dispersion (0) | 2018.07.06 |