[[Python¥Ð¥¤¥ª]]¡¡[[Python¥Ð¥¤¥ª/¥Ä¡¼¥ë]]~ &counter();¡¡¡¡¡¡&lastmod();~ *TPM°Ê¹ß¤Î²òÀÏ [#h2dccce6] Cufflink¡¦Cuffdiff¡¦¤¢¤¿¤ê¤ÇTPM (FPRM) ¤ò½Ð¤·¤Æ¤«¤é¤Î½èÍý¡£ Á´ÂΤ˥ǡ¼¥¿¤Ï¤½¤³¤½¤³¤Þ¤È¤â¤«¡©~ ȯ¸½¥Ñ¥¿¡¼¥ó¤Î»÷¤¿°äÅÁ»Ò¤Ï¡©~ [[R¤ÈBioconductor¤ÇNGS²òÀÏ://catway.jp/lecture/tohoku-u/NGS-R-Bioconductor-2nd.html]] ¡ÊÆó³¬Æ²°¦¡Ë¤òÄɤ¦·Á¤Ç ¡ÖCuffdiff¤Î·ë²Ì¤òcummeRbund¤ÇÆÉ¹þ¤à¡×¡¡¢Í¡¡FRPM¥Ç¡¼¥¿ ¤³¤Á¤é¤Ï¡¢´û¤Ë¤³¤³¤Þ¤Ç¤Ç¤¤Æ¤¤¤ë¡£ **Á´ÂΤÎȯ¸½Î̥ǡ¼¥¿¤Î¥Á¥§¥Ã¥¯ [#v06dfb17] ***ȯ¸½Î̤ÎʬÉÛ¤Îɽ¼¨ [#lf15dc44] # ÅìËÌÂç¡¡Æó³¬Æ²°¦¡¡http://catway.jp/lecture/tohoku-u/NGS-R-Bioconductor-2nd.html %matplotlib inline import pandas as pd import numpy as np import matplotlib.pyplot as plt sns.set() sns.set_style('whitegrid') # count_tpm.tsv¤ÎÆÉ¹þ¤ß df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0) all_zero_index = df.index[df.sum(axis=1) == 0] df = df.drop(all_zero_index) dfl = np.log10(df) dfl = dfl.replace([np.inf, -np.inf], -1) # Ãͤ¬inf/-inf¤Î¥Ç¡¼¥¿¤Ï-1¤ËÃÖ´¹ print(dfl.head()) #dfl.plot.kde(style=['-', '--', ':', '-', '-.', ':']) # Àþ¤Î¥Ñ¥¿¡¼¥ó¤Ï£µ¼ïÎष¤«¤Ê¤¤ sns.kdeplot(dfl['batch_1']) # ÊÌ¡¹¤Ë½ñ¤¤¤Æ½Å¤Í¤ë¤³¤È¤Ë¤·¤¿ sns.kdeplot(dfl['batch_2']) sns.kdeplot(dfl['batch_3']) sns.kdeplot(dfl['chemostat_1']) sns.kdeplot(dfl['chemostat_2']) sns.kdeplot(dfl['chemostat_3']) plt.xlim(-1, 5) plt.xlabel('$log_{10}(TPM)$') plt.ylabel('density') plt.show() ·ë²Ì¤Ï &ref(density-distribution.png); ƱÍͤΥǡ¼¥¿¤ò¡¢È¢¤Ò¤²¿Þ¤ÇÉÁ¤¯¡£ ***ƱÍͤΥǡ¼¥¿¤ò¡¢È¢¤Ò¤²¿Þ¤ÇÉÁ¤¯¡£ [#r15dda03] %matplotlib inline import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns sns.set() sns.set(style='whitegrid', palette='Set3', font=['IPAexGothic']) # count_tpm.tsv¤ÎÆÉ¹þ¤ß df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0) all_zero_index = df.index[df.sum(axis=1) == 0] df = df.drop(all_zero_index) dfl = np.log10(df) dfl = dfl.replace([np.inf, -np.inf], -1) print(dfl.head()) #dfl.boxplot() #dfl.plot.box() sns.boxplot(data=dfl) #plt.xlim(-0.3, 5) plt.ylabel('$log_{10}(TPM)$') plt.show() ·ë²Ì¤Ï &ref(box-chart.png); ***¥µ¥ó¥×¥ë´Ö¤Î»¶ÉÛ¿Þ¤òÉÁ¤¯ [#zc9f847b] seaborn¤ÎÆÀ°Õµ»pairplot¤ò»È¤¦¤È²¿¤â¹Í¤¨¤Ê¤¯¤Æ¤è¤¤¡£ %matplotlib inline import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns sns.set() #sns.set(style='whitegrid', palette='Set3', font=['IPAexGothic'], markers='.') sns.set(style='whitegrid', font=['IPAexGothic']) # count_tpm.tsv¤ÎÆÉ¹þ¤ß df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0) all_zero_index = df.index[df.sum(axis=1) == 0] df = df.drop(all_zero_index) dfl = np.log10(df) dfl = dfl.replace([np.inf, -np.inf], -1) print(dfl.head()) #dfl.boxplot() #dfl.plot.box() sns.pairplot(dfl, kind='reg', height=3) # size=0.1¤È¤¹¤ë¤È¿Þ´Ö¤¬µÍ¤Þ¤ë #plt.xlim(-0.3, 5) #plt.xlabel('$log_{10}(TPM)$') plt.show() dfl['batch'] = dfl[['batch_1', 'batch_2', 'batch_3']].mean(axis=1) # ²£Êý¸þ¤ÎÊ¿¶Ñ dfl['chemostat'] = dfl[['chemostat_1', 'chemostat_2', 'chemostat_3']].mean(axis=1) print(dfl.head()) sns.relplot(kind='scatter', x='batch', y='chemostat', data=dfl) # size=0.1¤È¤¹¤ë¤ÈÅÀ¤¬¾®¤µ¤¯¤Ê¤ë plt.show() ·ë²Ì¤Ï¡¢ &ref(scatter-plot-large.png); &ref(scatter-plot-average.png); ***¥Ò¡¼¥È¥Þ¥Ã¥× [#u69ad453] ¹àÌܤò¥¯¥é¥¹¥¿²½¤·¤¿·ë²Ì¤ò»È¤Ã¤Æ´ó¤»¤ë¡£dendrogram¤ÎÌá¤êÃͤò»È¤¦¡£ %matplotlib inline # heatmap¡¡¥¯¥é¥¹¥¿¡¼²½¤Ë¤è¤Ã¤Æ´ó¤»¤ë import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list from scipy.spatial.distance import pdist sns.set() #sns.set(style='whitegrid', palette='Set3', font=['IPAexGothic']) sns.set(style='whitegrid', font=['IPAexGothic'], palette='Spectral') # count_tpm.tsv¤ÎÆÉ¹þ¤ß df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0) #all_zero_index = df.index[df.sum(axis=1) == 0] #df = df.drop(all_zero_index) #df = df.replace(0, 0.00001) print(df.head()) print(df[-5:]) # ÀäÂÐÃͤ¬Â礤¤100genes¡Ê+¤ËÂ礤¤50¤È-¤ËÂ礤¤50¡Ë¤ò¼è¤ë #dfr = pd.concat([df[:200], df[-200:]]) dfr = pd.concat([df[:50], df[-50:]]) print(dfr.head()) print(dfr[-5:]) #df = df.applymap(lambda x: x+1) dfr = dfr + 1 # log¤ò¼è¤ëÁ°¤Ë£±¤ò¤¹ print('gene_0008\n', dfr.loc['gene_0008']) # log10¤ò¼è¤ë dfl = np.log10(dfr) #dfl = dfl.replace([np.inf, -np.inf], -5) dfl_t = dfl.T print('dfl\n', dfl.head()) #linkage_result_gene = linkage(dfl, method='average', metric='correlation') linkage_result_gene = linkage(dfl) plt.figure(figsize=(16,20)) res = dendrogram(linkage_result_gene, labels=dfl.index, leaf_rotation=90, leaf_font_size=10, get_leaves=True) leavesy = leaves_list(linkage_result_gene) print('leavesy', leavesy) lvs = pd.DataFrame([res['ivl'], res['leaves']], columns=res['ivl']).T lvs.columns = ['gene_id2', 'leaves'] print(lvs) dfl2 = pd.merge(lvs, dfl, left_on='gene_id2', right_index=True) print('dfl2\n', dfl2.head()) print('dfl2\n', dfl2[-5:]) #lvs¤Îʤӽç¤Î¤Þ¤Þ¤ÎÊý¤¬¤¤¤¤ print(dfl2.head()) plt.figure(figsize=(10, 50)) sns.set_palette('hot') #sns.heatmap(dfl2[['batch_1','batch_2','batch_3', 'chemostat_1','chemostat_2','chemostat_3']], vmax=1, vmin=-1, center=0) sns.heatmap(dfl2[['batch_1','batch_2','batch_3', 'chemostat_1','chemostat_2','chemostat_3']], vmax=3, vmin=0, center=1.0, cmap='YlOrBr') plt.show() ·ë²Ì¤Ï &ref(heatmap-dendrogram.png); &ref(heatmap.png); ***JSD (Jense-Shannon divergence) µ÷Î¥¤Ë¤è¤ëgene´Ö¤Î¤Õ¤ë¤Þ¤¤¤Î°ã¤¤¤Î»»Äê [#j0b52a21] ¤¿¤È¤¨¤Ð¡¢gene_0013¤Ë»÷¤¿¤Õ¤ë¤Þ¤¤¤Îgene¤òõ¤¹¡£ºÇ¤â¶á¤¤10¸Ä¤òɽ¼¨¡£ %matplotlib inline # JSD Jensen-Shannon divergence¤Ë¤è¤ë¡¢°ã¤¤¤Î»»Äê # ¤¢¤ë°äÅÁ»Ò¤È»÷¤¿È¯¸À¤ò¼¨¤¹°äÅÁ»Ò¤òõ¤¹ import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list from scipy.spatial import distance import itertools sns.set() sns.set(style='whitegrid', font=['IPAexGothic'], palette='Spectral') # count_tpm.tsv¤ÎÆÉ¹þ¤ß df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0) df = df + 1 # log10¤ò¼è¤ë dfl = np.log10(df) dfl_t = dfl.T # ¤¹¤Ù¤Æ¤Î¹Ô¤ÎÁȹ礻¤Ç¡¢µ÷Î¥¤ò·×»»¤¹¤ë # ¥ª¡¼¥ë0¤Î¹Ô¤Ï½ü¤«¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡Ê³ÎΨ¤Ê¤Î¤ÇÁ´¤ÆÂ¤·¤Æ£±¤ËÀµµ¬²½¤¹¤ë¡Ë dfl_copy = dfl.replace(0.0, np.nan).dropna(how='all', axis=0) dfl = dfl.loc[dfl_copy.index] # print('dfl\n', dfl.head()); print() # dfl = dfl[:10] # target¤È¾¤Îgene¤È¤ÎÂФε÷Î¥¤ò·×»»¤¹¤ë target = 'gene_0013' dfl['d'] = dfl.apply(lambda x: distance.jensenshannon(x, dfl.loc[target]), axis=1) #print(dfl) dfx = dfl.sort_values('d', ascending=True)[:10] print('dfx\n', dfx) dfg = dfx.drop('d', axis=1).T dfg.plot() plt.ylabel('$log_{10}(TPM-1)$') plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') plt.show() ·ë²Ì¤Ï &ref(JSD-similar.png);