Python¥Ð¥¤¥ª?¡¡Python¥Ð¥¤¥ª/¥Ä¡¼¥ë?
1087¡¡¡¡¡¡2019-06-25 (²Ð) 11:15:26
ÆþÎϥǡ¼¥¿¤È¤·¤ÆÍѰդ¹¤ë¤â¤Î¡§¡¡
¬Äê¥Ç¡¼¥¿¤Ï¡¢study genes¤È¤·¤ÆÂоݤȤʤë°äÅÁ»Ò¤Î¥ê¥¹¥È¡£diff_tpm_ex.tsv¤ÏTPMȯ¸½Î̤κ¹¤Î¥ê¥¹¥È¤Ê¤Î¤À¤¬¡¢¸µ½èÍý¤Ç¤ÏNCBI GeneID¤¬ÉÕ¤¤¤Æ¤¤¤Ê¤¤¡£¾¡¼ê¤Ë¤Ä¤±¤¿geneid¤Î¤ß¡£
¤½¤ì¤Ç¡¢¤¢¤é¤«¤¸¤áɽgene_id_product_plus_GeneID.tsv¤òºî¤Ã¤Æ¤ª¤¡¢¤½¤ì¤ò»È¤Ã¤ÆÉ½¤ËÄɵ¤¹¤ë¡£É½gene_id_product_plus_GeneID.tsv¤ÎºîÀ®¤Ï¡¢
import pandas as pd import numpy as np import urllib.parse outputlist = [] fout = open('gene_id_product_plus_GeneID.tsv', 'w') with open('../s288c_e.gff', 'r') as fin: for line in fin: product = '' geneid = '' line=line.rstrip() if not line.startswith('#'): s=line.split('\t') items=s[8].split(';') for item in items: if item.startswith('product='): product = item.split('=')[1] product = urllib.parse.unquote(product) if item.startswith('gene_id='): geneid = item.split('=')[1] if item.startswith('gene='): gene = item.split('=')[1] if 'GeneID:' in item: pos = item.find('GeneID') end = item[pos:].find(',') NCBI_ID = item[pos+7:pos+end] if (geneid != '') and (product != '') and (gene != '') and (NCBI_ID != ''): outputlist.append(geneid+'\t'+product+'\t'+gene+'\t'+NCBI_ID) outputlist = sorted(list(set(outputlist))) print('Geneid\tproduct\tgene\tNCBI_ID', file=fout) for u in outputlist: print(u, file=fout) fout.close()
¸µ¤ØÌá¤Ã¤Æ¡¢Study¥Ç¡¼¥¿¤Î¼è¹þ¤ß¤ÈÊÑ´¹¡£
# Study¥Ç¡¼¥¿¤ò¼è¤ê¤³¤à import pandas as pd df_study = pd.read_csv('../diff_tpm_ex.tsv', sep='\t') df_study = df_study[:400] #print(df_study.head(), len(df_study)) # GeneID ¤òÉÕ¤±¤ë # ¤¢¤é¤«¤¸¤áɽgene_id_product_plus_GeneID.tsv¤òºî¤Ã¤Æ¤ª¤¯¡¡Saccha_gene_id-to-product-plus-GeneID-gene.ipynb df_GeneID = pd.read_csv('gene_id_product_plus_GeneID.tsv', sep='\t', index_col=0) print(df_GeneID.head()) df_study = df_study.merge(df_GeneID) print(df_study.head()['NCBI_ID'])
¤³¤Îdf_GeneID¤Ï¡¢
product gene NCBI_ID Geneid gene_0001 seripauperin PAU8 PAU8 851229 gene_0002 hypothetical protein PAU8 1466426 gene_0003 putative permease SEO1 SEO1 851230 gene_0004 hypothetical protein SEO1 851232 gene_0005 hypothetical protein SEO1 851233
df_study.head()['NCBI_ID']¤Ï
0 854753 1 855213 2 855134 3 1466426 4 851232
NBCI Taxonomy ID¤òÃΤë
from goatools.associations import read_ncbi_gene2go geneid2gos_Saccha = read_ncbi_gene2go("gene2go", taxids=[4932]) print("{N:,} annotated Saccha genes".format(N=len(geneid2gos_Saccha))) 0 items READ: gene2go 0 annotated mouse genes¤³¤ÎTaxID 4932 ¤Ï´Þ¤Þ¤ì¤Æ¤¤¤Ê¤¤¡£Âå¤ï¤ê¤Ë»È¤¨¤ë¤â¤Î¤¬¤¢¤ë¤«¡© ¸µ¥Ç¡¼¥¿¤ËÆþ¤Ã¤Æ¤¤¤ë 559292 ¤Þ¤µ¤Ë¤³¤ì¡£¤ä¤ê¤Ê¤ª¤¹¤È¡¢
from goatools.associations import read_ncbi_gene2go geneid2gos_Saccha = read_ncbi_gene2go("gene2go", taxids=[559292]) print("{N:,} annotated Saccha genes".format(N=len(geneid2gos_Saccha))) EXISTS: gene2go 5,964 items READ: gene2go 5,964 annotated genes
£±¡Ë¡¡NCBI¤Çgene list¤òºî¤ë¡¡
https://github.com/tanghaibao/goatools/tree/master/doc/md/README_NCBI_gene.md
browser¤Ç https://www.ncbi.nlm.nih.gov/gene/¡¡¢Í¡¡¸¡º÷¥¡¼¥ï¡¼¥É
genetype protein coding[Properties] AND "559292"[Taxonomy ID] AND alive[property]
¤³¤ì¤Ï¡¢genes_NCBI_****_ProteinCoding¤È¾Î¤·¤Æ¤¤¤ë¤â¤Î¤ËÅö¤¿¤ë¡£
²èÌ̤˸¡º÷·ë²Ì¤¬ÆÀ¤é¤ì¤¿¤é¡¢¡ÖSend to¡×¤ÇFile¢ÍTabularÁªÂò¡£·ë²Ì¤Ïgene_result.txt¤È¤·¤Æ¥À¥¦¥ó¥í¡¼¥É¤µ¤ì¤ë¡£ÀèÆ¬Éôʬ¤Ï
tax_id Org_name GeneID CurrentID Status Symbol Aliases description other_designations map_location chromosome genomic_nucleotide_accession.version start_position_on_the_genomic_accession end_position_on_the_genomic_accession orientation exon_count OMIM 559292 Saccharomyces cerevisiae S288C 854976 0 live RAD52 YML032C recombinase RAD52 recombinase RAD52 XIII NC_001145.3 212515 213930 minus 1 559292 Saccharomyces cerevisiae S288C 852457 0 live CDC28 YBR160W, CDK1, HSL5, SRM5 cyclin-dependent serine/threonine-protein kinase CDC28 cyclin-dependent serine/threonine-protein kinase CDC28 II NC_001134.8 560078 56097plus 1 559292 Saccharomyces cerevisiae S288C 856831 0 live RAD51 YER095W, MUT5 recombinase RAD51 recombinase RAD51 NC_001137.3 349980 351182 plus 1 559292 Saccharomyces cerevisiae S288C 851415 0 live RPO21 YDL140C, RPB1, RPB220, SUA8 DNA-directed RNA polymerase II core subunit RPO21 DNA-directed RNA polymerase II core subunit RPO21 IV NC_001136.10 205360 210561 minus1 559292 Saccharomyces cerevisiae S288C 851752 0 live SUP35 YDR172W, GST1, PNM2, SAL3, SUF12, SUP2, SUP36 translation termination factor GTPase eRF3 translation termination factor GTPase eRF3 IV NC_001136.10 808324 810381 plus 1
£²¡Ë¡¡¼¡¤Ë goatools ¤Încbi_gene_results_to_python.py¤ÇÊÑ´¹ goatools¤Î¥¹¥¯¥ê¥×¥È¡¡goatools/goatools/scripts/ncbi_gene_results_to_python.py¤Ç Python¥×¥í¥°¥é¥à¡Ê¤Î¥Ç¡¼¥¿¡Ë¤ËÊÑ´¹¤¹¤ë
scripts/ncbi_gene_results_to_python.py \ -i ~/src/RNAseq-Saccha/Saccha/GO/gene_result.txt \ -o ~/src/RNAseq-Saccha/Saccha/GO/gene_result.py 6002 lines READ: /home/yamanouc/src/RNAseq-Saccha/Saccha/GO/gene_result.txt 6002 geneids WROTE: /home/yamanouc/src/RNAseq-Saccha/Saccha/GO/gene_result.py
¤Ç¤¤¿gene_result.py¤Ï
"""Data downloaded from NCBI Gene converted into Python namedtuples.""" from collections import namedtuple WRITTEN = "2019_06_25" # 6002 items #pylint: disable=line-too-long,too-many-lines,invalid-name ntncbi = namedtuple('ntncbi', 'tax_id Org_name GeneID CurrentID Status Symbol Aliases description other_designations map_location chromosome genomic_nucleotide_accession_version start_position_on_the_genomic_accession end_position_on_the_genomic_accession orientation exon_count OMIM no_hdr0') GENEID2NT = { # 6,002 items 850289 : ntncbi(tax_id=559292, Org_name='Saccharomyces cerevisiae S288C', GeneID=850289, CurrentID=0, Status='live', Symbol='GEX1', Aliases=['YCL073C'], description='glutathione exchanger', other_designations='glutathione exchanger', map_location='', chromosome='III', genomic_nucleotide_accession_version='NC_001135.5', start_position_on_the_genomic_accession=6479, end_position_on_the_genomic_accession=8326, orientation='minus', exon_count=1, OMIM='', no_hdr0=''), 850290 : ntncbi(tax_id=559292, Org_name='Saccharomyces cerevisiae S288C', GeneID=850290, CurrentID=0, Status='live', Symbol='VBA3', Aliases=['YCL069W'], description='basic amino acid transporter', other_designations='basic amino acid transporter', map_location='', chromosome='III', genomic_nucleotide_accession_version='NC_001135.5', start_position_on_the_genomic_accession=9706, end_position_on_the_genomic_accession=11082, orientation='plus', exon_count=1, OMIM='', no_hdr0=''),
from gene_result import GENEID2NT as GeneID2nt_saccha from goatools.go_enrichment import GOEnrichmentStudy
goeaobj = GOEnrichmentStudy( GeneID2nt_saccha.keys(), # List of mouse protein-coding genes geneid2gos_saccha, # geneid/GO associations obodag, # Ontologies propagate_counts = False, alpha = 0.05, # default significance cut-off methods = ['fdr_bh']) # defult multipletest correction method 94% 5,627 of 6,002 population items found in association
df_study¤ÎGeneID¤ÎÎó¤À¤±¤ò¼è½Ð¤·¤Æ¥ê¥¹¥È²½¤·¡¢GOEA¤ËÍ¿¤¨¤ë¡£
# goea ¼Â¹Ô study_list = df_study['NCBI_ID'].values.tolist() #print(study_list) goea_results_all = goeaobj.run_study(study_list) goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05] goeaobj.wr_xlsx("SacchaGo.xlsx", goea_results_sig) goeaobj.wr_txt("SacchGo.txt", goea_results_sig)
²èÌ̽ÐÎϤÏ
study_in_pop {851972, 856068, 856071, 856072, 856074, 851980, 854031, ¡Ê¸åά¡Ë} 67% 705 of 1,050 study items found in association 2% 1,050 of 49,269 study items found in population(6002) Calculating 6,030 uncorrected p-values using fisher 6,030 GO terms are associated with 5,627 of 6,002 population items 1,220 GO terms are associated with 705 of 1,066 study items 133 GO terms found significant (< 0.05=alpha) after multitest correction: statsmodels fdr_bh 133 items WROTE: nbt3102.xlsx 133 GOEA results for 518 study items. WROTE: nbt3102.txt
·ë²Ì¤Ïp<0.05¤ÎÉôʬ¤À¤±¤ò¼è½Ð¤·¤Æ¡¢¥Õ¥¡¥¤¥ë²½¡£SacchaGo.xlsx¡¢SacchaGo.txt
³¤¤¤Æ¥°¥é¥Õ²½
from goatools.godag_plot import plot_gos, plot_results, plot_goid2goobj plot_results("Saccha_Go_{NS}.png", goea_results_sig) 57 usr 263 GOs WROTE: Saccha_Go_BP.png 43 usr 121 GOs WROTE: Saccha_Go_CC.png 33 usr 89 GOs WROTE: Saccha_Go_MF.png
¥°¥é¥Õ¤ÏÈó¾ï¤ËÂ礤¤¡£
¤½¤³¤Ç¡¢¥¹¥¿¡¼¥È¤ÎGO¥¿¡¼¥à¤ò¡¢study count¿ô¤Î¿¤¤¤â¤Î¤Ë¸ÂÄê¡£ Àè¤Û¤ÉÆÀ¤é¤ì¤¿SacchaGo.xlsx¤ò³«¤¤¤Æ¡¢study_count¤ÎÍó¤Ç¥½¡¼¥È¤·¡¢32ÄøÅÙ¤è¤ê¿¤¤¤â¤Î¤òÁªÂò¤·¤¿¡£
# Plot subset starting from these significant GO terms goid_subset = { 'GO:0005737', # CC p cytoplasm 'GO:0005634', # CC p nucleus 'GO:0005739', # CC p mitochondrion 'GO:0005829', # CC p cytosol 'GO:0005783', # CC p endoplasmic reticulum 'GO:0046872', # MF p metal ion binding 'GO:0000166', # MF p nucleotide binding 'GO:0005524', # MF p ATP binding 'GO:0016740', # MF p transferase activity 'GO:0016787', # MF p hydrolase activity } plot_gos("SacchaGo_Subset.png", goid_subset, # Source GO ids obodag, goea_results=goea_results_all) # Use pvals for coloring
¤³¤Î·ë²Ì¤Î¿Þ¤¬