Python¥Ð¥¤¥ª?¡¡Python¥Ð¥¤¥ª/¥Ä¡¼¥ë?
1921¡¡¡¡¡¡2019-07-10 (¿å) 16:05:41
Cufflink¡¦Cuffdiff¡¦¤¢¤¿¤ê¤ÇTPM (FPRM) ¤ò½Ð¤·¤Æ¤«¤é¤Î½èÍý¡£
Á´ÂΤ˥ǡ¼¥¿¤Ï¤½¤³¤½¤³¤Þ¤È¤â¤«¡©
ȯ¸½¥Ñ¥¿¡¼¥ó¤Î»÷¤¿°äÅÁ»Ò¤Ï¡©
R¤ÈBioconductor¤ÇNGS²òÀÏ://catway.jp/lecture/tohoku-u/NGS-R-Bioconductor-2nd.html ¡ÊÆó³¬Æ²°¦¡Ë¤òÄɤ¦·Á¤Ç
¡ÖCuffdiff¤Î·ë²Ì¤òcummeRbund¤ÇÆÉ¹þ¤à¡×¡¡¢Í¡¡FRPM¥Ç¡¼¥¿
¤³¤Á¤é¤Ï¡¢´û¤Ë¤³¤³¤Þ¤Ç¤Ç¤¤Æ¤¤¤ë¡£
# ÅìËÌÂç¡¡Æó³¬Æ²°¦¡¡http://catway.jp/lecture/tohoku-u/NGS-R-Bioconductor-2nd.html
%matplotlib inline import pandas as pd import numpy as np import matplotlib.pyplot as plt sns.set() sns.set_style('whitegrid')
# count_tpm.tsv¤ÎÆÉ¹þ¤ß
df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0) all_zero_index = df.index[df.sum(axis=1) == 0] df = df.drop(all_zero_index) dfl = np.log10(df) dfl = dfl.replace([np.inf, -np.inf], -1) # Ãͤ¬inf/-inf¤Î¥Ç¡¼¥¿¤Ï-1¤ËÃÖ´¹ print(dfl.head()) #dfl.plot.kde(style=['-', '--', ':', '-', '-.', ':']) # Àþ¤Î¥Ñ¥¿¡¼¥ó¤Ï£µ¼ïÎष¤«¤Ê¤¤ sns.kdeplot(dfl['batch_1']) # ÊÌ¡¹¤Ë½ñ¤¤¤Æ½Å¤Í¤ë¤³¤È¤Ë¤·¤¿ sns.kdeplot(dfl['batch_2']) sns.kdeplot(dfl['batch_3']) sns.kdeplot(dfl['chemostat_1']) sns.kdeplot(dfl['chemostat_2']) sns.kdeplot(dfl['chemostat_3']) plt.xlim(-1, 5) plt.xlabel('$log_{10}(TPM)$') plt.ylabel('density') plt.show()
%matplotlib inline import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns sns.set() sns.set(style='whitegrid', palette='Set3', font=['IPAexGothic']) # count_tpm.tsv¤ÎÆÉ¹þ¤ß df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0) all_zero_index = df.index[df.sum(axis=1) == 0] df = df.drop(all_zero_index) dfl = np.log10(df) dfl = dfl.replace([np.inf, -np.inf], -1) print(dfl.head()) #dfl.boxplot() #dfl.plot.box() sns.boxplot(data=dfl) #plt.xlim(-0.3, 5) plt.ylabel('$log_{10}(TPM)$') plt.show()
seaborn¤ÎÆÀ°Õµ»pairplot¤ò»È¤¦¤È²¿¤â¹Í¤¨¤Ê¤¯¤Æ¤è¤¤¡£
%matplotlib inline import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns sns.set() #sns.set(style='whitegrid', palette='Set3', font=['IPAexGothic'], markers='.') sns.set(style='whitegrid', font=['IPAexGothic']) # count_tpm.tsv¤ÎÆÉ¹þ¤ß df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0) all_zero_index = df.index[df.sum(axis=1) == 0] df = df.drop(all_zero_index) dfl = np.log10(df) dfl = dfl.replace([np.inf, -np.inf], -1) print(dfl.head()) #dfl.boxplot() #dfl.plot.box() sns.pairplot(dfl, kind='reg', height=3) # size=0.1¤È¤¹¤ë¤È¿Þ´Ö¤¬µÍ¤Þ¤ë #plt.xlim(-0.3, 5) #plt.xlabel('$log_{10}(TPM)$') plt.show() dfl['batch'] = dfl[['batch_1', 'batch_2', 'batch_3']].mean(axis=1) # ²£Êý¸þ¤ÎÊ¿¶Ñ dfl['chemostat'] = dfl[['chemostat_1', 'chemostat_2', 'chemostat_3']].mean(axis=1) print(dfl.head()) sns.relplot(kind='scatter', x='batch', y='chemostat', data=dfl) # size=0.1¤È¤¹¤ë¤ÈÅÀ¤¬¾®¤µ¤¯¤Ê¤ë plt.show()
¹àÌܤò¥¯¥é¥¹¥¿²½¤·¤¿·ë²Ì¤ò»È¤Ã¤Æ´ó¤»¤ë¡£dendrogram¤ÎÌá¤êÃͤò»È¤¦¡£
%matplotlib inline # heatmap¡¡¥¯¥é¥¹¥¿¡¼²½¤Ë¤è¤Ã¤Æ´ó¤»¤ë import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list from scipy.spatial.distance import pdist sns.set() #sns.set(style='whitegrid', palette='Set3', font=['IPAexGothic']) sns.set(style='whitegrid', font=['IPAexGothic'], palette='Spectral') # count_tpm.tsv¤ÎÆÉ¹þ¤ß df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0) #all_zero_index = df.index[df.sum(axis=1) == 0] #df = df.drop(all_zero_index) #df = df.replace(0, 0.00001) print(df.head()) print(df[-5:]) # ÀäÂÐÃͤ¬Â礤¤100genes¡Ê+¤ËÂ礤¤50¤È-¤ËÂ礤¤50¡Ë¤ò¼è¤ë #dfr = pd.concat([df[:200], df[-200:]]) dfr = pd.concat([df[:50], df[-50:]]) print(dfr.head()) print(dfr[-5:]) #df = df.applymap(lambda x: x+1) dfr = dfr + 1 # log¤ò¼è¤ëÁ°¤Ë£±¤ò¤¹ print('gene_0008\n', dfr.loc['gene_0008']) # log10¤ò¼è¤ë dfl = np.log10(dfr) #dfl = dfl.replace([np.inf, -np.inf], -5) dfl_t = dfl.T print('dfl\n', dfl.head()) #linkage_result_gene = linkage(dfl, method='average', metric='correlation') linkage_result_gene = linkage(dfl) plt.figure(figsize=(16,20)) res = dendrogram(linkage_result_gene, labels=dfl.index, leaf_rotation=90, leaf_font_size=10, get_leaves=True) leavesy = leaves_list(linkage_result_gene) print('leavesy', leavesy) lvs = pd.DataFrame([res['ivl'], res['leaves']], columns=res['ivl']).T lvs.columns = ['gene_id2', 'leaves'] print(lvs) dfl2 = pd.merge(lvs, dfl, left_on='gene_id2', right_index=True) print('dfl2\n', dfl2.head()) print('dfl2\n', dfl2[-5:]) #lvs¤Îʤӽç¤Î¤Þ¤Þ¤ÎÊý¤¬¤¤¤¤ print(dfl2.head()) plt.figure(figsize=(10, 50)) sns.set_palette('hot') #sns.heatmap(dfl2[['batch_1','batch_2','batch_3', 'chemostat_1','chemostat_2','chemostat_3']], vmax=1, vmin=-1, center=0) sns.heatmap(dfl2[['batch_1','batch_2','batch_3', 'chemostat_1','chemostat_2','chemostat_3']], vmax=3, vmin=0, center=1.0, cmap='YlOrBr') plt.show()
¤¿¤È¤¨¤Ð¡¢gene_0013¤Ë»÷¤¿¤Õ¤ë¤Þ¤¤¤Îgene¤òõ¤¹¡£ºÇ¤â¶á¤¤10¸Ä¤òɽ¼¨¡£
%matplotlib inline # JSD Jensen-Shannon divergence¤Ë¤è¤ë¡¢°ã¤¤¤Î»»Äê # ¤¢¤ë°äÅÁ»Ò¤È»÷¤¿È¯¸À¤ò¼¨¤¹°äÅÁ»Ò¤òõ¤¹ import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list from scipy.spatial import distance import itertools sns.set() sns.set(style='whitegrid', font=['IPAexGothic'], palette='Spectral') # count_tpm.tsv¤ÎÆÉ¹þ¤ß df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0) df = df + 1 # log10¤ò¼è¤ë dfl = np.log10(df) dfl_t = dfl.T # ¤¹¤Ù¤Æ¤Î¹Ô¤ÎÁȹ礻¤Ç¡¢µ÷Î¥¤ò·×»»¤¹¤ë # ¥ª¡¼¥ë0¤Î¹Ô¤Ï½ü¤«¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡Ê³ÎΨ¤Ê¤Î¤ÇÁ´¤ÆÂ¤·¤Æ£±¤ËÀµµ¬²½¤¹¤ë¡Ë dfl_copy = dfl.replace(0.0, np.nan).dropna(how='all', axis=0) dfl = dfl.loc[dfl_copy.index] # print('dfl\n', dfl.head()); print() # dfl = dfl[:10] # target¤È¾¤Îgene¤È¤ÎÂФε÷Î¥¤ò·×»»¤¹¤ë target = 'gene_0013' dfl['d'] = dfl.apply(lambda x: distance.jensenshannon(x, dfl.loc[target]), axis=1) #print(dfl) dfx = dfl.sort_values('d', ascending=True)[:10] print('dfx\n', dfx) dfg = dfx.drop('d', axis=1).T dfg.plot() plt.ylabel('$log_{10}(TPM-1)$') plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') plt.show()
·ë²Ì¤Ï