![]() |
Python¥Ð¥¤¥ª/¥Ä¡¼¥ë/RNAseq¤-X EdgeR¤Þ¤ï¤êhttp://pepper.is.sci.toho-u.ac.jp/pepper/index.php?Python%A5%D0%A5%A4%A5%AA%2F%A5%C4%A1%BC%A5%EB%2FRNAseq%AD%A4-X%20EdgeR%A4%DE%A4%EF%A4%EA |
![]() |
Python¥Ð¥¤¥ª?¡¡Python¥Ð¥¤¥ª/¥Ä¡¼¥ë?
2825¡¡¡¡¡¡2019-07-12 (¶â) 12:24:39
»²¾È¡¡Æó·²´ÖÈæ³Ó¡ÊedgeR¡Ë | RNA-Seq ¤Ë¤è¤ëÈæ³Ó¥È¥é¥ó¥¹¥¯¥ê¥×¥È¡¼¥à²òÀÏ
¹¹¤Ë¸µ¤Ï¡¢edgeRUsersGuide.pdf
¤³¤Î¥Ú¡¼¥¸¤Ë¤è¤ë¤È¡¢
edgeR ¤Î¥Ð¡¼¥¸¥ç¥ó¤¬°Û¤Ê¤ë¤È¡¢²òÀÏ·ë²Ì¤â¾¯¤·°Û¤Ê¤Ã¤Æ¤¯¤ë¡£¤³¤Î¥Ú¡¼¥¸¤Ç¤Ï 3.10.0 ¤òÍøÍѤ·¤Æ¤ë¡£
¤È¤¤¤¦¤³¤È¤À¤¬¡¢¤É¤¦¤À¤í¤¦¤«¡©
¤³¤Î¥Ú¡¼¥¸¤Ë½¾¤Ã¤Æ¡¢¼«Á°¥Ç¡¼¥¿¤Ç»î¤¹¡£
> source("http://bioconductor.org/biocLite.R") > biocLite("edgeR") # °Í¸¥é¥¤¥Ö¥é¥ê¤¬Â¿¤¤¤Î¤Ç¡¢º£¤Þ¤Ç»È¤Ã¤Æ¤¤¤Ê¤¤¤È·ë¹½»þ´Ö¤¬¤«¤«¤ë > library(edgeR)
> count <- read.table("../all.counts.txt", sep="\t", header = T, row.names = 1) > head(count) Chr Start End Strand Length SRR453566.sorted.bam gene_0001 NC_001133.9 1807 2169 - 363 0 gene_0002 NC_001133.9 2480 2707 + 228 0 gene_0003 NC_001133.9 7235 9016 - 1782 0 > count <- count[,c('SRR453566.sorted.bam','SRR453567.sorted.bam','SRR453568.sorted.bam','SRR453569.sorted.bam','SRR453570.sorted.bam', 'SRR453571.sorted.bam')] > colnames(count) <- c("batch_1","batch_2","batch_3", "chemostat_1", "chemostat_2", "chemostat_3") > head(count) batch_1 batch_2 batch_3 chemostat_1 chemostat_2 chemostat_3 gene_0001 0 2 6 0 0 1 gene_0002 0 0 0 0 0 0 gene_0003 0 0 0 0 0 0
> count <- as.matrix(count) > dim(count) # 6420 6 > group <- factor(c("b", "b", "b", "c", "c", "c"))
¡ãexact test¡ä
# ¤Þ¤º¡¢count¥Ç¡¼¥¿¤Ègroup¤ò£±¤Ä¤ÎÊÑ¿ô¤ËÆþ¤ì¤ë¡Ê¥é¥Ù¥ëÉÕ¤±¡Ë > d <- DGEList(counts = count, group = group) # ȯ¸½¿ô¤ÎÈó¾ï¤Ë¾¯¤Ê¤¤gene¤ò½üµî¡¡¡¡[[edgeRUsersGuide:https://www.bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf]] p55 > keep <- rowSums(cpm(d)>1) >= 2 # CPM(count per million)¤¬£±°Ê¾å¤¬£²¤Ä°Ê¾å > d <- d[keep, , keep.lib.sizes=FALSE] > dim(d) # 5899 6
# Àµµ¬²½ > d <- calcNormFactors(d) # ʬ»¶¤Î¿ä¬ > d <- estimateCommonDisp(d) > d <- estimateTagwiseDisp(d) # ¤Ç¡¢exact test¤Î¸¡Äê > result <- exactTest(d) > dim(result) # 5899 3 # ·ë²Ìɽ¼¨¤Î¤¿¤á¤Ë¤Ï > topTags(result) Comparison of groups: c-b logFC logCPM PValue FDR gene_2989 11.273497 8.543906 0 0 gene_4667 9.683678 9.715407 0 0 gene_3497 8.659467 11.419660 0 0 gene_3421 8.501696 7.911698 0 0 gene_4882 7.545279 10.715972 0 0 gene_0615 7.399665 8.409708 0 0 gene_1031 7.233161 7.473834 0 0 gene_5986 7.169033 6.784983 0 0 gene_2035 6.893378 8.849378 0 0 gene_0017 6.841963 10.445186 0 0 # logFC¤È¤Ï¡§log fold-change¡¢wild type (WT) ¤È knock out (KO) ¤Î¥·¥°¥Ê¥ëÃͤòÈæ³Ó¤¹¤ë¤È¤¡¢log2(KO)/log2(WT) # locCPM¤È¤Ï¡§log counts per million # FDR: False Discovery Rate # ·ë²Ì¤ò¥Õ¥¡¥¤¥ë¤Ë³ÊǼ¤¹¤ë > table <- as.data.frame(topTags(result, n = nrow(count))) > write.table(table, file = "result.txt", col.names = T, row.names = T, sep = "\t") # ¥Õ¥¡¥¤¥ë¤Î¡¢ÅÓÃæ¤Î¥¨¥ó¥È¥ê¡¼¤ÎÎã "logFC" "logCPM" "PValue" "FDR" "gene_2901" 0.167058457314462 6.8615588901153 0.0493154259214582 0.0667381733220193 "gene_0340" 0.289892217524922 5.33859450515354 0.0494251531593589 0.0668713253410684 "gene_0519" 0.189496458586853 6.08722498147699 0.0494834466115673 0.0669348432840255 "gene_0954" 0.256044030124387 5.65989391337004 0.0496916298717389 0.0672010372795479 "gene_3371" -0.205143944867366 9.08724668881671 0.0498370683360759 0.0673822750663561 "gene_0287" -0.255406659792859 5.95521336812612 0.0501243894523602 0.0677552184645904 "gene_5879" 0.208805500563976 5.93528971411533 0.0503683786485931 0.0680694308472052 "gene_5595" -0.172878902469866 6.9102371649123 0.0503961888612053 0.0680914150463238 "gene_2980" -0.174306726898056 6.36273018338751 0.0505773479860026 0.0683201796940756 "gene_3461" 0.190387033748589 6.37468042292314 0.0505886667068524 0.0683201796940756
# ·ë²Ì¤ò¿Þ¼¨¤¹¤ë > is.DEG <- as.logical(table$FDR < 0.01) > DEG.names <- rownames(table)[is.DEG] > plotSmear(result, de.tags = DEG.names)
¡ãÌàÅÙÈæ¸¡Äê¡ä
> design <- model.matrix(~ group) > design (Intercept) groupc 1 1 0 2 1 0 3 1 0 4 1 1 5 1 1 6 1 1 attr(,"assign") [1] 0 1 attr(,"contrasts") attr(,"contrasts")$group [1] "contr.treatment" # ²òÀϤ¹¤ë # ¤Þ¤º¤â¤¦°ìÅ٥ǡ¼¥¿¤òcount¤«¤éºî¤êľ¤· > d <- DGEList(counts = count, group = group) # Á°¤ÈƱÍͤˡ¢È¯¸½¿ô¤ÎÈó¾ï¤Ë¾¯¤Ê¤¤gene¤ò½üµî > keep <- rowSums(cpm(d)>1) >= 2 # CPM(count per million)¤¬£±°Ê¾å¤¬£²¤Ä°Ê¾å > d <- d[keep, , keep.lib.sizes=FALSE] # ¤³¤³¤«¤é¡¢Ê¬ÉÛ¡Êʬ»¶¡Ë¤Î¿äÄê¡¡GLM¤ò»È¤¦ > d <- calcNormFactors(d) > d <- estimateGLMCommonDisp(d, design) > d <- estimateGLMTrendedDisp(d, design) > d <- estimateGLMTagwiseDisp(d, design) # ʬÉۤ˥ե£¥Ã¥È > fit <- glmFit(d, design) # ÌàÅÙÈæ¸¡Äê ¤³¤Î¤È¤¡¢full model ¤È reduced model ¤òÈæ¤Ù¤ë¤Î¤Ç¡¢¤³¤Î¾ðÊó¤ò´Ø¿ô¤ËÍ¿¤¨¤ë¡£ glmFit ¤òÍøÍѤ·¤ÆºîÀ®¤·¤¿ fit ¤Ë¤Ï full model ¤Î¥Ç¡¼¥¿´Þ¤Þ¤ì¤Æ¤¤¤ë¡£½¾¤Ã¤Æ¡¢¤¢¤È¤Ï reduced model ¤Î¾ðÊó¤À¤±¤òÍ¿¤¨¤ì¤Ð¤è¤¤¡£¤³¤³¤Ç¡¢full model ¤È reduced model ¤Î ¥Ç¥¶¥¤¥ó¹ÔÎó¤ò¸«Èæ¤Ù¤Æ¤ß¤ë¤È¡¢reduced model ¤Ï full model ¤Î¥Ç¥¶¥¤¥ó¹ÔÎó¤Î 2 Îó¤ò ¤¹¤Ù¤Æ 0 ¤Ë¤·¤¿¤â¤Î¤ËÅù¤·¤¤¤³¤È¤¬¤ï¤«¤ë¡£¤½¤³¤Ç¡¢coef = 2 ¤È»ØÄꤷ¤Æ¡¢reduced model ¤Î¾ðÊó¤òÍ¿¤¨¤ë¡£ > lrt <- glmLRT(fit, coef = 2) > dim(lrt) # 5899 4 # ·ë²Ì¤Ï > topTags(lrt) Coefficient: groupc logFC logCPM LR PValue FDR gene_2989 11.272397 8.543271 2260.090 0 0 gene_3497 8.658186 11.419157 1934.825 0 0 gene_3421 8.500384 7.911092 2271.734 0 03859 gene_4882 7.543818 10.715354 2673.692 0 0 gene_0615 7.398431 8.409080 1897.589 0 0 # ¥Õ¥¡¥¤¥ë¤Ë½ÐÎÏ > lrttable <- as.data.frame(topTags(lrt, n = nrow(count))) > write.table(lrttable, file="lrtresult.txt", col.names=T, row.names=T, sep="\t") logFC logCPM LR PValue FDR "gene_3295" 0.247645680731094 5.40510860572214 3.87945550718096 0.0488804073717157 0.065772245229414 "gene_4943" -0.194295943127542 7.18848127443337 3.8631344790163 0.0493580361534112 0.0663997845539276 "gene_3461" 0.190360832039291 6.37466110822439 3.85464691865404 0.0496083672460962 0.0667213311410673 "gene_6181" -0.204622554470669 5.77230370218388 3.85415389282258 0.0496229496429794 0.0667257305548064 "gene_1983" 0.856380812164874 1.74233409044916 3.8422391885909 0.0499767355704516 0.0671861356267307 "gene_5577" -0.263126831966537 5.60034347444897 3.84011138664067 0.050040196820857 0.0672561223618673 "gene_3658" -0.194852891673291 6.72394023500093 3.82819449899496 0.0503971930016531 0.0677205105960711 "gene_2310" -0.197601718589681 6.24609836497735 3.82407278015195 0.0505212936731495 0.0678718085579387 "gene_2980" -0.17430474127026 6.36273796901708 3.81688233743943 0.0507385643479778 0.0681481764773954 "gene_0447" 0.325651368079434 4.40033732555139 3.81349708835367 0.0508411968215156 0.0682704803209925
Áý¸º¤ÎÁ´ÂΤò¸«¤ë¤È¡¡¡ÊEdgeUsersGuide p58¡Ë
>summary(decideTests(lrt)) # 1% FDR groupc Down 2072 NotSig 1620 Up 2207
MD¿Þ¤òÉÁ¤¯¤³¤È¤¬¤Ç¤¤ë¡¡¡¡¡ÊEdgeUsersGuide p58¡Ë
> plotMD(lrt) > abline(h=c(-1,1), col="blue")
¡ãGO analysis¡ä
> biocLite("GO.db") > biocLite("org.Hs.eg.db") > go <- goana(lrt) goana.default(de = DEGenes, universe = universe, ...) ¤Ç¥¨¥é¡¼: No annotated genes found in universe