달력

52025  이전 다음

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31

'Data analysis/RNA-seq'에 해당되는 글 3건

  1. 2018.07.12 [RNA-seq]edgeR analysis code !!! group 지정버전 !!! 엄청 좋음 !!
  2. 2018.07.06 edge R dispersion
  3. 2017.05.11 [RNA-seq] edge R DEG

보호되어 있는 글입니다.
내용을 보시려면 비밀번호를 입력하세요.

edge R dispersion

2018. 7. 6. 11:06

보호되어 있는 글입니다.
내용을 보시려면 비밀번호를 입력하세요.

#dat

               N_TCGA-A7-A0CE N_TCGA-A7-A13F N_TCGA-BH-A0BV N_TCGA-BH-A0BZ N_TCGA-BH-A0C0 N_TCGA-BH-A0DD N_TCGA-BH-A0DG N_TCGA-BH-A0E0 N_TCGA-BH-A0E1

ENSG00000242268       2235.104       8930.394      1669.3990     7220.17900      843.38370    17623.99000        1365.77       3989.945           0.00

ENSG00000270112          0.000          0.000       115.8172       71.55877       58.51108       90.56981           0.00          0.000           0.00

ENSG00000167578      87871.320      48200.280     39664.6500    56268.88000    41750.38000    60623.46000       34482.49      60082.270       38414.85

ENSG00000273842          0.000          0.000         0.0000        0.00000        0.00000        0.00000           0.00          0.000           0.00

ENSG00000078237      62259.070      47192.540     55185.9700    92661.98000    29613.28000    60556.09000       38171.99      57631.810       39990.15

ENSG00000146083     197103.800     192761.900    227316.9000   169238.70000   242694.80000   143619.00000      184802.10     254256.800      221540.70





#zero value count

n_ch = c()

for(i in 1:nrow(test_dat))

{

tmp = c(length(which(test_dat[i,1:17]==0)),length(which(test_dat[i,18:ncol(test_dat)]==0)))

n_ch = rbind(n_ch,tmp)


}

}

rownames(n_ch) = rownames(test_dat)

pie(c(21774,38709),c('#value ==0', '#value > 0'), main ="Number of values per gene in 17 noraml sample")

barplot(table(n_ch[,2]), ylim = c(0,30000), main = "Number of values per gene in 105 cancer sample")


library(limma)

vd_a = row.names(n_ch[which(as.numeric(n_ch[,1])==17  ),])

vd_b = row.names(n_ch[which(as.numeric(n_ch[,2]) > 1),])


universe <- sort(unique(c(vd_a, vd_b)))

Counts <- matrix(0, nrow=length(universe), ncol=2)


for (i in 1:length(universe)) {

   Counts[i,1] <- universe[i] %in% vd_b

   Counts[i,2] <- universe[i] %in% vd_a

}


colnames(Counts) <- c("C(#V<2)","N(#V=0)")

cols<-c("Red", "Green")

vennDiagram(vennCounts(Counts), circle.col=cols) 


F_gene = unique(c(vd_a,vd_b))

F_dat = c()

t = c()

for (i in 1:nrow(test_dat))

{

if((rownames(test_dat)[i] %in% F_gene)==FALSE)

{

F_dat = rbind(F_dat,test_dat[i,])

t = c(t,rownames(test_dat)[i])



}


}

rownames(F_dat) = t



#glm testing

group <- factor(c(rep('N',17),c(rep('C',105))))

y <- DGEList(counts=F_dat,group=group, genes=rownames(F_dat))

#keep = rowSums(cpm(y)>1) > 100

#y <- y[keep, , keep.lib.sizes=FALSE]

#table(keep)


#model make

y <- calcNormFactors(y)

design <- model.matrix(~group)

y <- estimateDisp(y,design)



#test

fit <- glmFit(y,design)

lrt <- glmLRT(fit,coef=2)



 #FDR

#FDR <- p.adjust(et$table$PValue, method="fdr")

FDR <- p.adjust(lrt$table$PValue, method="FDR")

sum(FDR < 0.001)

#15964



plotBCV(y)


#volcano plot


deg <- decideTestsDGE(lrt,adjust.method="fdr",p.value=0.05, lfc=1)

summary(is.de)

plotSmear(lrt, de.tags=rownames(lrt)[ as.logical( dt_significant )])

abline(h=c(-1, 1), col="blue")


#heatmap

library(gplots) 

deg <- as.logical( decideTestsDGE( lrt, adjust.method="fdr", p.value=0.05, lfc=1) )

d1 <- rownames(F_dat)[ deg ]

d2 <- lrt$fitted.values[d1 , ]

vctr_colors = as.factor( c( "pink", "skyblue" ))

vctr_sample_colors <- as.character( vctr_colors[ as.numeric( lrt$samples$group ) ] )

heatmap.2( log2( d2 + 1 ), ColSideColors=vctr_sample_colors, key=TRUE, trace="none", col=heat.colors(200), scale="row" )



#DEG table


r_dat = cbind(lrt$fitted.values,lrt$table,FDR)[ deg,]

write.table(r_dat,'/Volumes/Data1/part1/20170511_TCGA_RNA/tmp/20170510_DEG.txt',sep='\t')


#annotation


library(biomaRt)

listMarts()    # to see which database options are present

ensembl=useMart("ensembl")  # using ensembl database data

listDatasets(ensembl)     # function to see which datasets are present in ensembl

ensembl=useDataset("hsapiens_gene_ensembl",mart=ensembl)   # from ensembl using homosapien gene data

listFilters(ensembl)  # check which filters are available

listAttributes(ensembl) # check attributes are available to select.More information on ensembl data base

id=getBM(attributes=c("ensembl_gene_id","entrezgene","hgnc_symbol","go_id","name_1006","kegg_enzyme") ,filters = "ensembl_gene_id",values=rownames(r_dat), mart= ensembl) # fuction to get  gene id's and gene name from data base





t_id = c()

g_id = unique(id[,1])

for(i in 1:length(g_id))

{

tmp = id[which(id[,1] == g_id[i]),]

t_id = rbind(t_id,apply(tmp,2,function(x) paste(unique(x),collapse=':') ))


}


t_dat2 = c()

for(i in 1:nrow(r_dat))

{

if ((length(which(rownames(r_dat)[i] == t_id[,1]))) > 0)

{

t_dat2 = rbind(t_dat2,c(r_dat[i,],t_id[which(rownames(r_dat)[i] == t_id[,1]),-1]))


}

else

{

t_dat2 = rbind(t_dat2,c(r_dat[i,],'NA','NA','NA','NA'))

}


}



rownames(t_dat2) = rownames(r_dat)

colnames(t_dat2) = c(colnames(r_dat),"entrezgene","hgnc_symbol","go_id","go_name")

write.table(t_dat2,'/Volumes/Data1/part1/20170511_TCGA_RNA/tmp/20170510_DEG.txt',sep='\t')


Posted by jjbang
|