Python¥Ð¥¤¥ª?¡¡Python¥Ð¥¤¥ª/¥Ä¡¼¥ë?
1420¡¡¡¡¡¡2019-07-12 (¶â) 17:20:37
Python¥Ð¥¤¥ª/¥Ä¡¼¥ë/RNAseq¤-X EdgeR¤Þ¤ï¤ê¤È¤Û¤ÜƱ¤¸¤³¤È¤ò¡¢Python¤«¤éR¤ØÊü¤Ã¤Æ¹Ô¤¦¡£
»²¾È¡¡Æó·²´ÖÈæ³Ó¡ÊedgeR¡Ë | RNA-Seq ¤Ë¤è¤ëÈæ³Ó¥È¥é¥ó¥¹¥¯¥ê¥×¥È¡¼¥à²òÀÏ
¹¹¤Ë¸µ¤Ï¡¢edgeRUsersGuide.pdf
import pandas as pd import rpy2 from rpy2.robjects.packages import importr print(rpy2.__version__) from rpy2.robjects import pandas2ri pandas2ri.activate() from rpy2.robjects import r, StrVector, FactorVector base = importr('base') base.source("http://www.bioconductor.org/biocLite.R") biocinstaller = importr("BiocInstaller") # ºÇ½é¤À¤± biocinstaller.biocLite("edgeR") # ºÇ½é¤À¤± edgeR = importr("edgeR") count = pd.read_csv('../all.counts.txt', sep='\t', skiprows=1) # ½é¤á¤ÏGeneid¤òÉÕ¤±¤Æ¤ª¤¯ count = count[['Geneid', 'SRR453566.sorted.bam','SRR453567.sorted.bam','SRR453568.sorted.bam','SRR453569.sorted.bam','SRR453570.sorted.bam', 'SRR453571.sorted.bam']] count.columns = ['Geneid', "batch_1","batch_2","batch_3", "chemostat_1", "chemostat_2", "chemostat_3"] r.assign("count", pandas2ri.py2ri(count)) # count¤òR¤Ø¥¢¥Ã¥×¤¹¤ë r('genelist <- count[,c("Geneid")]') # R¾å¤ÇGeneid¤ò³Ð¤¨¤Æ¤ª¤¯ r('count <- count[, colnames(count) != "Geneid"]') # R¾å¤ÇGeneidÎó¤ò¾Ã¤¹ group = FactorVector(StrVector(['b', 'b', 'b', 'c', 'c', 'c'])) # group¤òºî¤Ã¤Æ¤ª¤¯ r.assign("group", group) r('d <- DGEList(counts = count, group = group, genes = genelist)') # count¤Ègroup¤«¤éDEGlist¤òºî¤ë¡¢genes¤ËGeneid(¡Ágenelist)¤òÆþ¤ì¤ë r('d <- calcNormFactors(d)') # ½êÄê¤Î¼ê³¤£± r('d <- estimateCommonDisp(d)') # £² r('d <- estimateTagwiseDisp(d)') # £³ r('result <- exactTest(d)') r('result0 <- result$table') # R¦¤Çtable¤À¤±¤Ë¤¹¤ë¡£Á´Éô»ý¤Ã¤Æ¤¯¤ë¤È¼«Æ°ÊÑ´¹»þ¤Ë pandas¥¯¥é¥¹¤Ç¤Ï¤Ê¤¯rpy2¥¯¥é¥¹¤Ë¤·¤«¤Ê¤é¤Ê¤¤ r('result0ix <- result$genes') # result$table ¤Ë¤ÏGeneid¤¬Æþ¤Ã¤Æ¤¤¤Ê¤¤ r0 = r['result0'] print('r0\n', r0.head()) r0ix = r['result0ix'] r0.index = r0ix['genes'] # r0¤Ëindex¤È¤·¤ÆGeneid¤òÉÕ¤±¤ë print('result0\n', r0.head()) print('pvalues\n', (r0.sort_values(['PValue', 'logFC'], ascending=[True, False])).head(20)) ## sort_values¤ÇÊ£¿ô¥¡¼¤ò»ØÄê¤Ç¤¤ë¡£¹¹¤Ëascending¾ò·ï¤¬°ã¤Ã¤Æ¤â¤è¤¤ r('r2 <- topTags(result, n=nrow(count))') # R¾å¤ÇtopTags¤òÄ̤¹¡£ r('r22 <- r2$table') # ·ë²Ì¤ÎtableÉôʬ¤À¤±¤òÃê½Ð¤¹¤ë r2 = r['r22'] # Python¦¤Ø»ý¤Ã¤Æ¤¯¤ë print('r2\n', r2.head())
################################ # ´°Á´¤ËºÇ½é¤«¤é count = pd.read_csv('../all.counts.txt', sep='\t', skiprows=1) count = count[['Geneid', 'SRR453566.sorted.bam','SRR453567.sorted.bam','SRR453568.sorted.bam','SRR453569.sorted.bam','SRR453570.sorted.bam', 'SRR453571.sorted.bam']] count.columns = ['Geneid', "batch_1","batch_2","batch_3", "chemostat_1", "chemostat_2", "chemostat_3"] r.assign("count", pandas2ri.py2ri(count)) r('genelist <- count[,c("Geneid")]') r('count <- count[, colnames(count) != "Geneid"]') group = FactorVector(StrVector(['b', 'b', 'b', 'c', 'c', 'c'])) r.assign("group", group) r('d <- DGEList(counts = count, group = group, genes=genelist)') r('row.names(d$count) <- genelist') r('row.names(d$counts) <- genelist') r('design <- model.matrix(~ group)') r('d <- estimateGLMCommonDisp(d, design)') r('d <- estimateGLMTrendedDisp(d, design)') r('d <- estimateGLMTagwiseDisp(d, design)') r('fit <- glmFit(d, design)') r('lrt <- glmLRT(fit, coef = 2)') # LRT¤ò½ÐÎÏ r('lrt0 <- topTags(lrt, n = nrow(count))') # topTag¤òÄ̤¹¡£ # print('lrt0\n', r['lrt0']) r('lrtgenes <- lrt0$table[, "genes"]') lrtgenes = r['lrtgenes'] r('lrttable <- lrt0$table') # tableÉôʬ¤À¤±¤òÃê½Ð¤¹¤ë lrttable = r['lrttable'] # Python¤Î¦¤Ø»ý¤Ã¤Æ¤¯¤ë print(type(lrttable)) print('lrttable\n', lrttable.head(20)) lrttable.to_csv("lrttable.csv", sep='\t') #decideTests¤ÇGeneid¤¬½ÐÎϤËÅϤµ¤ì¤Ê¤¤¡£ r('dt <- decideTests(lrt)') dt = pd.DataFrame(r['dt']) dt.index = lrtgenes print('dt\n', dt)