Python¥Ð¥¤¥ª?¡¡Python¥Ð¥¤¥ª/¥Ä¡¼¥ë?
1921¡¡¡¡¡¡2019-07-10 (¿å) 16:05:41

TPM°Ê¹ß¤Î²òÀÏ

Cufflink¡¦Cuffdiff¡¦¤¢¤¿¤ê¤ÇTPM (FPRM) ¤ò½Ð¤·¤Æ¤«¤é¤Î½èÍý¡£

Á´ÂΤ˥ǡ¼¥¿¤Ï¤½¤³¤½¤³¤Þ¤È¤â¤«¡©
ȯ¸½¥Ñ¥¿¡¼¥ó¤Î»÷¤¿°äÅÁ»Ò¤Ï¡©

R¤ÈBioconductor¤ÇNGS²òÀÏ://catway.jp/lecture/tohoku-u/NGS-R-Bioconductor-2nd.html ¡ÊÆó³¬Æ²°¦¡Ë¤òÄɤ¦·Á¤Ç

¡ÖCuffdiff¤Î·ë²Ì¤òcummeRbund¤ÇÆÉ¹þ¤à¡×¡¡¢Í¡¡FRPM¥Ç¡¼¥¿

¤³¤Á¤é¤Ï¡¢´û¤Ë¤³¤³¤Þ¤Ç¤Ç¤­¤Æ¤¤¤ë¡£

Á´ÂΤÎȯ¸½Î̥ǡ¼¥¿¤Î¥Á¥§¥Ã¥¯

ȯ¸½Î̤ÎʬÉÛ¤Îɽ¼¨

# ÅìËÌÂç¡¡Æó³¬Æ²°¦¡¡http://catway.jp/lecture/tohoku-u/NGS-R-Bioconductor-2nd.html

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
sns.set()
sns.set_style('whitegrid')

# count_tpm.tsv¤ÎÆÉ¹þ¤ß

df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0)
all_zero_index = df.index[df.sum(axis=1) == 0]
df = df.drop(all_zero_index)
dfl = np.log10(df)
dfl = dfl.replace([np.inf, -np.inf], -1)  # Ãͤ¬inf/-inf¤Î¥Ç¡¼¥¿¤Ï-1¤ËÃÖ´¹
print(dfl.head())
#dfl.plot.kde(style=['-', '--', ':', '-', '-.', ':'])  # Àþ¤Î¥Ñ¥¿¡¼¥ó¤Ï£µ¼ïÎष¤«¤Ê¤¤
sns.kdeplot(dfl['batch_1'])  # ÊÌ¡¹¤Ë½ñ¤¤¤Æ½Å¤Í¤ë¤³¤È¤Ë¤·¤¿
sns.kdeplot(dfl['batch_2'])
sns.kdeplot(dfl['batch_3'])
sns.kdeplot(dfl['chemostat_1'])
sns.kdeplot(dfl['chemostat_2'])
sns.kdeplot(dfl['chemostat_3'])
plt.xlim(-1, 5)
plt.xlabel('$log_{10}(TPM)$')
plt.ylabel('density')
plt.show()

·ë²Ì¤Ï density-distribution.png

ƱÍͤΥǡ¼¥¿¤ò¡¢È¢¤Ò¤²¿Þ¤ÇÉÁ¤¯¡£

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
sns.set(style='whitegrid', palette='Set3', font=['IPAexGothic'])
# count_tpm.tsv¤ÎÆÉ¹þ¤ß
df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0)
all_zero_index = df.index[df.sum(axis=1) == 0]
df = df.drop(all_zero_index)
dfl = np.log10(df)
dfl = dfl.replace([np.inf, -np.inf], -1)
print(dfl.head())
#dfl.boxplot()
#dfl.plot.box()
sns.boxplot(data=dfl)
#plt.xlim(-0.3, 5)
plt.ylabel('$log_{10}(TPM)$')
plt.show()

·ë²Ì¤Ï box-chart.png

¥µ¥ó¥×¥ë´Ö¤Î»¶ÉÛ¿Þ¤òÉÁ¤¯

seaborn¤ÎÆÀ°Õµ»pairplot¤ò»È¤¦¤È²¿¤â¹Í¤¨¤Ê¤¯¤Æ¤è¤¤¡£

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
#sns.set(style='whitegrid', palette='Set3', font=['IPAexGothic'], markers='.')
sns.set(style='whitegrid', font=['IPAexGothic'])
# count_tpm.tsv¤ÎÆÉ¹þ¤ß
df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0)
all_zero_index = df.index[df.sum(axis=1) == 0]
df = df.drop(all_zero_index)
dfl = np.log10(df)
dfl = dfl.replace([np.inf, -np.inf], -1)
print(dfl.head())
#dfl.boxplot()
#dfl.plot.box()
sns.pairplot(dfl, kind='reg', height=3)  # size=0.1¤È¤¹¤ë¤È¿Þ´Ö¤¬µÍ¤Þ¤ë
#plt.xlim(-0.3, 5)
#plt.xlabel('$log_{10}(TPM)$')
plt.show()
dfl['batch'] = dfl[['batch_1', 'batch_2', 'batch_3']].mean(axis=1) # ²£Êý¸þ¤ÎÊ¿¶Ñ
dfl['chemostat'] = dfl[['chemostat_1', 'chemostat_2', 'chemostat_3']].mean(axis=1)
print(dfl.head())
sns.relplot(kind='scatter', x='batch', y='chemostat', data=dfl) # size=0.1¤È¤¹¤ë¤ÈÅÀ¤¬¾®¤µ¤¯¤Ê¤ë
plt.show()

·ë²Ì¤Ï¡¢ scatter-plot-large.png

scatter-plot-average.png

¥Ò¡¼¥È¥Þ¥Ã¥×

¹àÌܤò¥¯¥é¥¹¥¿²½¤·¤¿·ë²Ì¤ò»È¤Ã¤Æ´ó¤»¤ë¡£dendrogram¤ÎÌá¤êÃͤò»È¤¦¡£

%matplotlib inline
# heatmap¡¡¥¯¥é¥¹¥¿¡¼²½¤Ë¤è¤Ã¤Æ´ó¤»¤ë
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list
from scipy.spatial.distance import pdist
sns.set()
#sns.set(style='whitegrid', palette='Set3', font=['IPAexGothic'])
sns.set(style='whitegrid', font=['IPAexGothic'], palette='Spectral')
# count_tpm.tsv¤ÎÆÉ¹þ¤ß
df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0)
#all_zero_index = df.index[df.sum(axis=1) == 0]
#df = df.drop(all_zero_index)
#df = df.replace(0, 0.00001)
print(df.head())
print(df[-5:])
# ÀäÂÐÃͤ¬Â礭¤¤100genes¡Ê+¤ËÂ礭¤¤50¤È-¤ËÂ礭¤¤50¡Ë¤ò¼è¤ë
#dfr = pd.concat([df[:200], df[-200:]])
 dfr = pd.concat([df[:50], df[-50:]])
 print(dfr.head())
 print(dfr[-5:])
#df = df.applymap(lambda x: x+1)
dfr = dfr + 1  # log¤ò¼è¤ëÁ°¤Ë£±¤ò­¤¹
print('gene_0008\n', dfr.loc['gene_0008'])
# log10¤ò¼è¤ë
dfl = np.log10(dfr)
#dfl = dfl.replace([np.inf, -np.inf], -5)

dfl_t = dfl.T
print('dfl\n', dfl.head())
#linkage_result_gene = linkage(dfl, method='average', metric='correlation')
linkage_result_gene = linkage(dfl)
plt.figure(figsize=(16,20))
res = dendrogram(linkage_result_gene, labels=dfl.index, leaf_rotation=90, 
leaf_font_size=10, get_leaves=True)
leavesy = leaves_list(linkage_result_gene)
print('leavesy', leavesy)
lvs = pd.DataFrame([res['ivl'], res['leaves']], columns=res['ivl']).T
lvs.columns = ['gene_id2', 'leaves']
print(lvs)

dfl2 = pd.merge(lvs, dfl, left_on='gene_id2', right_index=True)
print('dfl2\n', dfl2.head())
print('dfl2\n', dfl2[-5:])

#lvs¤Îʤӽç¤Î¤Þ¤Þ¤ÎÊý¤¬¤¤¤¤
print(dfl2.head())
plt.figure(figsize=(10, 50))
sns.set_palette('hot')
#sns.heatmap(dfl2[['batch_1','batch_2','batch_3', 'chemostat_1','chemostat_2','chemostat_3']], vmax=1, vmin=-1, center=0)
sns.heatmap(dfl2[['batch_1','batch_2','batch_3', 'chemostat_1','chemostat_2','chemostat_3']], vmax=3, vmin=0, center=1.0, cmap='YlOrBr')
plt.show()

·ë²Ì¤Ï heatmap-dendrogram.png

heatmap.png

JSD (Jense-Shannon divergence) µ÷Î¥¤Ë¤è¤ëgene´Ö¤Î¤Õ¤ë¤Þ¤¤¤Î°ã¤¤¤Î»»Äê

¤¿¤È¤¨¤Ð¡¢gene_0013¤Ë»÷¤¿¤Õ¤ë¤Þ¤¤¤Îgene¤òõ¤¹¡£ºÇ¤â¶á¤¤10¸Ä¤òɽ¼¨¡£

%matplotlib inline
# JSD Jensen-Shannon divergence¤Ë¤è¤ë¡¢°ã¤¤¤Î»»Äê
# ¤¢¤ë°äÅÁ»Ò¤È»÷¤¿È¯¸À¤ò¼¨¤¹°äÅÁ»Ò¤òõ¤¹
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list
from scipy.spatial import distance
import itertools
sns.set()
sns.set(style='whitegrid', font=['IPAexGothic'], palette='Spectral')
# count_tpm.tsv¤ÎÆÉ¹þ¤ß
df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0)

df = df + 1
# log10¤ò¼è¤ë
dfl = np.log10(df)
dfl_t = dfl.T
# ¤¹¤Ù¤Æ¤Î¹Ô¤ÎÁȹ礻¤Ç¡¢µ÷Î¥¤ò·×»»¤¹¤ë
# ¥ª¡¼¥ë0¤Î¹Ô¤Ï½ü¤«¤Ê¤±¤ì¤Ð¤Ê¤é¤Ê¤¤¡Ê³ÎΨ¤Ê¤Î¤ÇÁ´¤ÆÂ­¤·¤Æ£±¤ËÀµµ¬²½¤¹¤ë¡Ë
dfl_copy = dfl.replace(0.0, np.nan).dropna(how='all', axis=0)
dfl = dfl.loc[dfl_copy.index]
# print('dfl\n', dfl.head()); print()

# dfl = dfl[:10]
# target¤È¾¤Îgene¤È¤ÎÂФε÷Î¥¤ò·×»»¤¹¤ë
target = 'gene_0013'
dfl['d'] = dfl.apply(lambda x: distance.jensenshannon(x, dfl.loc[target]), axis=1)
#print(dfl)
dfx = dfl.sort_values('d', ascending=True)[:10]
print('dfx\n', dfx)

dfg = dfx.drop('d', axis=1).T
dfg.plot()
plt.ylabel('$log_{10}(TPM-1)$')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

·ë²Ì¤Ï

JSD-similar.png


źÉÕ¥Õ¥¡¥¤¥ë: filescatter-plot-large.png 574·ï [¾ÜºÙ] filescatter-plot-average.png 542·ï [¾ÜºÙ] fileJSD-similar.png 534·ï [¾ÜºÙ] fileheatmap-dendrogram.png 559·ï [¾ÜºÙ] fileheatmap.png 567·ï [¾ÜºÙ] filedensity-distribution.png 575·ï [¾ÜºÙ] filebox-chart.png 496·ï [¾ÜºÙ]

¥È¥Ã¥×   ÊÔ½¸ Åà·ë º¹Ê¬ ¥Ð¥Ã¥¯¥¢¥Ã¥× źÉÕ Ê£À½ ̾Á°Êѹ¹ ¥ê¥í¡¼¥É   ¿·µ¬ °ìÍ÷ ñ¸ì¸¡º÷ ºÇ½ª¹¹¿·   ¥Ø¥ë¥×   ºÇ½ª¹¹¿·¤ÎRSS
Last-modified: 2019-07-10 (¿å) 16:05:41 (1355d)