[[Python¥Ð¥¤¥ª]]¡¡[[Python¥Ð¥¤¥ª/¥Ä¡¼¥ë]]~
&counter();¡¡¡¡¡¡&lastmod();~

*TPM°Ê¹ß¤Î²òÀÏ [#h2dccce6]
Cufflink¡¦Cuffdiff¡¦¤¢¤¿¤ê¤ÇTPM (FPRM) ¤ò½Ð¤·¤Æ¤«¤é¤Î½èÍý¡£

Á´ÂΤ˥ǡ¼¥¿¤Ï¤½¤³¤½¤³¤Þ¤È¤â¤«¡©~
ȯ¸½¥Ñ¥¿¡¼¥ó¤Î»÷¤¿°äÅÁ»Ò¤Ï¡©~

[[R¤ÈBioconductor¤ÇNGS²òÀÏ://catway.jp/lecture/tohoku-u/NGS-R-Bioconductor-2nd.html]] ¡ÊÆó³¬Æ²°¦¡Ë¤òÄɤ¦·Á¤Ç

¡ÖCuffdiff¤Î·ë²Ì¤òcummeRbund¤ÇÆÉ¹þ¤à¡×¡¡¢Í¡¡FRPM¥Ç¡¼¥¿

¤³¤Á¤é¤Ï¡¢´û¤Ë¤³¤³¤Þ¤Ç¤Ç¤­¤Æ¤¤¤ë¡£

**Á´ÂΤÎȯ¸½Î̥ǡ¼¥¿¤Î¥Á¥§¥Ã¥¯ [#v06dfb17]
***ȯ¸½Î̤ÎʬÉÛ¤Îɽ¼¨ [#lf15dc44]
# ÅìËÌÂç¡¡Æó³¬Æ²°¦¡¡http://catway.jp/lecture/tohoku-u/NGS-R-Bioconductor-2nd.html
 %matplotlib inline
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 sns.set()
 sns.set_style('whitegrid')
# count_tpm.tsv¤ÎÆÉ¹þ¤ß
 df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0)
 all_zero_index = df.index[df.sum(axis=1) == 0]
 df = df.drop(all_zero_index)
 dfl = np.log10(df)
 dfl = dfl.replace([np.inf, -np.inf], -1)  # Ãͤ¬inf/-inf¤Î¥Ç¡¼¥¿¤Ï-1¤ËÃÖ´¹
 print(dfl.head())
 #dfl.plot.kde(style=['-', '--', ':', '-', '-.', ':'])  # Àþ¤Î¥Ñ¥¿¡¼¥ó¤Ï£µ¼ïÎष¤«¤Ê¤¤
 sns.kdeplot(dfl['batch_1'])  # ÊÌ¡¹¤Ë½ñ¤¤¤Æ½Å¤Í¤ë¤³¤È¤Ë¤·¤¿
 sns.kdeplot(dfl['batch_2'])
 sns.kdeplot(dfl['batch_3'])
 sns.kdeplot(dfl['chemostat_1'])
 sns.kdeplot(dfl['chemostat_2'])
 sns.kdeplot(dfl['chemostat_3'])
 plt.xlim(-1, 5)
 plt.xlabel('$log_{10}(TPM)$')
 plt.ylabel('density')
 plt.show()

·ë²Ì¤Ï
&ref(density-distribution.png);

ƱÍͤΥǡ¼¥¿¤ò¡¢È¢¤Ò¤²¿Þ¤ÇÉÁ¤¯¡£
***ƱÍͤΥǡ¼¥¿¤ò¡¢È¢¤Ò¤²¿Þ¤ÇÉÁ¤¯¡£ [#r15dda03]
 %matplotlib inline
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 sns.set()
 sns.set(style='whitegrid', palette='Set3', font=['IPAexGothic'])
 # count_tpm.tsv¤ÎÆÉ¹þ¤ß
 df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0)
 all_zero_index = df.index[df.sum(axis=1) == 0]
 df = df.drop(all_zero_index)
 dfl = np.log10(df)
 dfl = dfl.replace([np.inf, -np.inf], -1)
 print(dfl.head())
 #dfl.boxplot()
 #dfl.plot.box()
 sns.boxplot(data=dfl)
 #plt.xlim(-0.3, 5)
 plt.ylabel('$log_{10}(TPM)$')
 plt.show()

·ë²Ì¤Ï
&ref(box-chart.png);

***¥µ¥ó¥×¥ë´Ö¤Î»¶ÉÛ¿Þ¤òÉÁ¤¯ [#zc9f847b]
seaborn¤ÎÆÀ°Õµ»pairplot¤ò»È¤¦¤È²¿¤â¹Í¤¨¤Ê¤¯¤Æ¤è¤¤¡£

 %matplotlib inline
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 sns.set()
 #sns.set(style='whitegrid', palette='Set3', font=['IPAexGothic'], markers='.')
 sns.set(style='whitegrid', font=['IPAexGothic'])
 # count_tpm.tsv¤ÎÆÉ¹þ¤ß
 df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0)
 all_zero_index = df.index[df.sum(axis=1) == 0]
 df = df.drop(all_zero_index)
 dfl = np.log10(df)
 dfl = dfl.replace([np.inf, -np.inf], -1)
 print(dfl.head())
 #dfl.boxplot()
 #dfl.plot.box()
 sns.pairplot(dfl, kind='reg', height=3)  # size=0.1¤È¤¹¤ë¤È¿Þ´Ö¤¬µÍ¤Þ¤ë
 #plt.xlim(-0.3, 5)
 #plt.xlabel('$log_{10}(TPM)$')
 plt.show()
 dfl['batch'] = dfl[['batch_1', 'batch_2', 'batch_3']].mean(axis=1) # ²£Êý¸þ¤ÎÊ¿¶Ñ
 dfl['chemostat'] = dfl[['chemostat_1', 'chemostat_2', 'chemostat_3']].mean(axis=1)
 print(dfl.head())
 sns.relplot(kind='scatter', x='batch', y='chemostat', data=dfl) # size=0.1¤È¤¹¤ë¤ÈÅÀ¤¬¾®¤µ¤¯¤Ê¤ë
 plt.show()

·ë²Ì¤Ï¡¢
&ref(scatter-plot-large.png);

&ref(scatter-plot-average.png);

***¥Ò¡¼¥È¥Þ¥Ã¥× [#u69ad453]

¹àÌܤò¥¯¥é¥¹¥¿²½¤·¤¿·ë²Ì¤ò»È¤Ã¤Æ´ó¤»¤ë¡£dendrogram¤ÎÌá¤êÃͤò»È¤¦¡£

 %matplotlib inline
 # heatmap¡¡¥¯¥é¥¹¥¿¡¼²½¤Ë¤è¤Ã¤Æ´ó¤»¤ë
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list
 from scipy.spatial.distance import pdist
 sns.set()
 #sns.set(style='whitegrid', palette='Set3', font=['IPAexGothic'])
 sns.set(style='whitegrid', font=['IPAexGothic'], palette='Spectral')
 # count_tpm.tsv¤ÎÆÉ¹þ¤ß
 df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0)
 #all_zero_index = df.index[df.sum(axis=1) == 0]
 #df = df.drop(all_zero_index)
 #df = df.replace(0, 0.00001)
 print(df.head())
 print(df[-5:])
 # ÀäÂÐÃͤ¬Â礭¤¤100genes¡Ê+¤ËÂ礭¤¤50¤È-¤ËÂ礭¤¤50¡Ë¤ò¼è¤ë
 #dfr = pd.concat([df[:200], df[-200:]])
  dfr = pd.concat([df[:50], df[-50:]])
  print(dfr.head())
  print(dfr[-5:])
 #df = df.applymap(lambda x: x+1)
 dfr = dfr + 1  # log¤ò¼è¤ëÁ°¤Ë£±¤ò­¤¹
 print('gene_0008\n', dfr.loc['gene_0008'])
 # log10¤ò¼è¤ë
 dfl = np.log10(dfr)
 #dfl = dfl.replace([np.inf, -np.inf], -5)
 
 dfl_t = dfl.T
 print('dfl\n', dfl.head())
 #linkage_result_gene = linkage(dfl, method='average', metric='correlation')
 linkage_result_gene = linkage(dfl)
 plt.figure(figsize=(16,20))
 res = dendrogram(linkage_result_gene, labels=dfl.index, leaf_rotation=90, 
 leaf_font_size=10, get_leaves=True)
 leavesy = leaves_list(linkage_result_gene)
 print('leavesy', leavesy)
 lvs = pd.DataFrame([res['ivl'], res['leaves']], columns=res['ivl']).T
 lvs.columns = ['gene_id2', 'leaves']
 print(lvs)
 
 dfl2 = pd.merge(lvs, dfl, left_on='gene_id2', right_index=True)
 print('dfl2\n', dfl2.head())
 print('dfl2\n', dfl2[-5:])
 
 #lvs¤Îʤӽç¤Î¤Þ¤Þ¤ÎÊý¤¬¤¤¤¤
 print(dfl2.head())
 plt.figure(figsize=(10, 50))
 sns.set_palette('hot')
 #sns.heatmap(dfl2[['batch_1','batch_2','batch_3', 'chemostat_1','chemostat_2','chemostat_3']], vmax=1, vmin=-1, center=0)
 sns.heatmap(dfl2[['batch_1','batch_2','batch_3', 'chemostat_1','chemostat_2','chemostat_3']], vmax=3, vmin=0, center=1.0, cmap='YlOrBr')
 plt.show()

·ë²Ì¤Ï
&ref(heatmap-dendrogram.png);

&ref(heatmap.png);

***JSD (Jense-Shannon divergence) µ÷Î¥¤Ë¤è¤ëgene´Ö¤Î¤Õ¤ë¤Þ¤¤¤Î°ã¤¤¤Î»»Äê [#j0b52a21]
¤¿¤È¤¨¤Ð¡¢gene_0013¤Ë»÷¤¿¤Õ¤ë¤Þ¤¤¤Îgene¤òõ¤¹¡£ºÇ¤â¶á¤¤10¸Ä¤òɽ¼¨¡£

 %matplotlib inline
 # JSD Jensen-Shannon divergence¤Ë¤è¤ë¡¢°ã¤¤¤Î»»Äê
 # ¤¢¤ë°äÅÁ»Ò¤È»÷¤¿È¯¸À¤ò¼¨¤¹°äÅÁ»Ò¤òõ¤¹
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list
 from scipy.spatial import distance
 import itertools
 sns.set()
 sns.set(style='whitegrid', font=['IPAexGothic'], palette='Spectral')
 # count_tpm.tsv¤ÎÆÉ¹þ¤ß
 df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0)
 
 df = df + 1
 # log10¤ò¼è¤ë
 dfl = np.log10(df)
 dfl_t = dfl.T
 # ¤¹¤Ù¤Æ¤Î¹Ô¤ÎÁȹ礻¤Ç¡¢µ÷Î¥¤ò·×»»¤¹¤ë
 # ¥ª¡¼¥ë0¤Î¹Ô¤Ï½ü¤«¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡Ê³ÎΨ¤Ê¤Î¤ÇÁ´¤ÆÂ­¤·¤Æ£±¤ËÀµµ¬²½¤¹¤ë¡Ë
 dfl_copy = dfl.replace(0.0, np.nan).dropna(how='all', axis=0)
 dfl = dfl.loc[dfl_copy.index]
 # print('dfl\n', dfl.head()); print()
 
 # dfl = dfl[:10]
 # target¤È¾¤Îgene¤È¤ÎÂФε÷Î¥¤ò·×»»¤¹¤ë
 target = 'gene_0013'
 dfl['d'] = dfl.apply(lambda x: distance.jensenshannon(x, dfl.loc[target]), axis=1)
 #print(dfl)
 dfx = dfl.sort_values('d', ascending=True)[:10]
 print('dfx\n', dfx)
 
 dfg = dfx.drop('d', axis=1).T
 dfg.plot()
 plt.ylabel('$log_{10}(TPM-1)$')
 plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
 plt.show()

·ë²Ì¤Ï

&ref(JSD-similar.png);

¥È¥Ã¥×   ÊÔ½¸ º¹Ê¬ ¥Ð¥Ã¥¯¥¢¥Ã¥× źÉÕ Ê£À½ ̾Á°Êѹ¹ ¥ê¥í¡¼¥É   ¿·µ¬ °ìÍ÷ ñ¸ì¸¡º÷ ºÇ½ª¹¹¿·   ¥Ø¥ë¥×   ºÇ½ª¹¹¿·¤ÎRSS