Python¥Ð¥¤¥ª/¥á¥Ë¥å¡¼

ribosomal RNA ¤Î¥Õ¥£¥ë¥¿¥ê¥ó¥°

»²¾ÈÀâÌÀ¡§¥ê¥Ü¥½¡¼¥àRNA¤Î½üµî

¼ê½ç

cd /home/yamanouc/src/KishimotoRNA2

for f in 181012_10B_S7_R1_001 181012_1p2-1_S11_R1_001 181012_1p2-2_S13_R1_001 181012_2p5-1_S12_R1_001 181012_2p6-1_S14_R1_001 181012_45a_plus_S2_R1_001 181012_45b_plus_S3_R1_001 181012_45c_plus_S4_R1_001 181013_45A_minus_S18_R1_001 181019_43B_S1_R1_001 181019_45A_plus_S5_R1_001 181019_45L_S6_R1_001 181019_45a10D_plus_S9_R1_001 181019_45aIII6c_plus_S8_R1_001 181019_45d7B_plus_S10_R1_001 181026_10D_minus_S21_R1_001 181026_2-10B_minus_S20_R1_001 181026_45a_minus_S15_R1_001 181026_45b_minus_S16_R1_001 181026_45c_minus_S17_R1_001 181103_45alll6c_minus_S22_R1_001 181103_45d7B_minus_S23_R1_001 181103_Anc_S19_R1_001 PwOw_minus_S25_R1_001 PwOw_plus_S24_R1_001
do
nohup sortmerna --ref ./rRNA_databases/silva-bac-16s-id90.fasta,./index/silva-bac-16s-db:\
./rRNA_databases/silva-bac-23s-id98.fasta,./index/silva-bac-23s-db:\
./rRNA_databases/rfam-5s-database-id98.fasta,./index/rfam-5s-db:\
./rRNA_databases/rfam-5.8s-database-id98.fasta,./index/rfam-5.8s-db \
--reads 1st/1st_$f.fastq --sam --num_alignments 1 --fastx --aligned 1st/1st_$f.rRNA \
--other 1st/1st_$f.non_rRNA --log -v > 1st/1st_$f.out &
#
nohup sortmerna --ref ./rRNA_databases/silva-bac-16s-id90.fasta,./index/silva-bac-16s-db:\
./rRNA_databases/silva-bac-23s-id98.fasta,./index/silva-bac-23s-db:\
./rRNA_databases/rfam-5s-database-id98.fasta,./index/rfam-5s-db:\
./rRNA_databases/rfam-5.8s-database-id98.fasta,./index/rfam-5.8s-db \
--reads 2nd/2nd_$f.fastq --sam --num_alignments 1 --fastx --aligned 2nd/2nd_$f.rRNA \
--other 2nd/2nd_$f.non_rRNA --log -v > 2nd/2nd_$f.out &
done

¤Þ¤¿

cd /home/yamanouc/src/KishimotoRNA3
bash ./sortmerna.sh  3rd_181019_45a10D_plus_S9_R1_001.fastq     &
bash ./sortmerna.sh  3rd_181019_45d7B_plus_S10_R1_001.fastq     &
bash ./sortmerna.sh  3rd_181019_45aIII6c_plus_S8_R1_001.fastq   &
bash ./sortmerna.sh  3rd_181012_2p5-1_S12_R1_001.fastq          &
bash ./sortmerna.sh  3rd_181012_1p2-1_S11_R1_001.fastq          &
bash ./sortmerna.sh  3rd_181012_1p2-2_S13_R1_001.fastq          &
bash ./sortmerna.sh  3rd_181012_45b_plus_S3_R1_001.fastq        &
bash ./sortmerna.sh  3rd_181019_43B_S1_R1_001.fastq             &
bash ./sortmerna.sh  3rd_PwOw_plus_S16_R1_001.fastq             &
bash ./sortmerna.sh  3rd_181012_45c_plus_S4_R1_001.fastq        &
bash ./sortmerna.sh  3rd_181012_2p6-1_S14_R1_001.fastq          &
bash ./sortmerna.sh  3rd_181103_Anc_S15_R1_001.fastq            &
bash ./sortmerna.sh  3rd_181019_45L_S6_R1_001.fastq             &
bash ./sortmerna.sh  3rd_181019_45A_plus_S5_R1_001.fastq        &
bash ./sortmerna.sh  3rd_181012_45a_plus_S2_R1_001.fastq        &

¤¿¤À¤·sortmerna.sh¤Ï

sortmerna --ref ../KishimotoRNA2/rRNA_databases/silva-bac-16s-id90.fasta,../KishimotoRNA2/index/silva-bac-16s-db:../KishimotoRNA2/rRNA_databases/silva-bac-23s-id98.fasta,../KishimotoRNA2/index/silva-bac-23s-db:../KishimotoRNA2/rRNA_databases/rfam-5s-database-id98.fasta,../KishimotoRNA2/index/rfam-5s-db:../KishimotoRNA2/rRNA_databases/rfam-5.8s-database-id98.fasta,../KishimotoRNA2/index/rfam-5.8s-db --reads $1 --sam --num_alignments 1 --fastx --aligned $1.rRNA --other $1.non_rRNA --log -a 8 -v

·ë²Ì
readcount¤ÎALL¤Î¥Ú¡¼¥¸

FastQC¤Ë¤è¤ëQuality Check

¼ê½ç

fastqc --nogroup xxx/nonrRNA.fastq

·ë²Ì
FastQC results¤Î¥Ú¡¼¥¸

3¤Ä¤Îrun¤Îfastq¥Õ¥¡¥¤¥ë¤ò·ë¹ç

¤½¤ì¤¾¤ì¤Î¥µ¥ó¥×¥ë¤Î¥Ç¡¼¥¿¤Ï3run¤È¤âÆÈΩ¤Ê¤Î¤Ç¡¢¤½¤Î¤Þ¤Þ¥Þ¡¼¥¸¤·¤Æ¤è¤¤¤ÈȽÃÇ¡£

¶ñÂÎŪ¤Ë¤Ï¡¢run-1¤È2¤È3¤òcat¤Ç·ë¹ç¤·¤¿¡£
¡¡¡¡¢Í½ÐÎϤÏ<sample>.nonrRNA.fastq¤È¤·¤¿¡£

Trimmomatic¤Ë¤è¤ë¥È¥ê¥ß¥ó¥°

¼ê½ç

for f in 10B 1p2-1 1p2-2 2p5-1 2p6-1 45a_plus 45b_plus 45c_plus 43B 45A_plus 45L 45a10D_plus 45aIII6c_plus 45d7B_plus Anc PwOw_plus 45A_minus 10D_minus 2-10B_minus 45a_minus 45b_minus 45c_minus 45alll6c_minus 45d7B_minus PwOw_minus 
do
nohup java -jar -Xmx512m /usr/local/Trimmomatic/trimmomatic-0.38.jar SE \
 -threads 32         \
 -phred33               \
 -trimlog $f.trimlog \
 $f.nonrRNA.fastq \
 $f.nonrRNA.trim.fastq   \
 ILLUMINACLIP:TruSeq3-PE-2.fa:2:30:10 \
 LEADING:20 \
 TRAILING:20 \
 SLIDINGWINDOW:4:15 \
 MINLEN:36 > $f.trim.out &
done

·ë²Ì¤Ï¡¢
Trimming results¡¡¤Î¥Ú¡¼¥¸

HISAT-2¤Ë¤è¤ëAP012030¤Ø¤Î¥Þ¥Ã¥Ô¥ó¥°

¼ê½ç¤Ï¡¢¤¢¤é¤«¤¸¤áAP012030¤Î¥¤¥ó¥Ç¥Ã¥¯¥¹¤òºî¤Ã¤Æ¤ª¤­¡¢¤½¤ì¤¾¤ì¤Î¥µ¥ó¥×¥ë¤Îfastq¤ò¥Þ¥Ã¥Ô¥ó¥°½èÍý¤¹¤ë¡£
Python¥Ð¥¤¥ª/¥Ä¡¼¥ë/RNAseq­¢¥ê¡¼¥É¤Î»²¾ÈÇÛÎó¤Ø¤Î¥Þ¥Ã¥Ô¥ó¥°¡¡¤ÎHISAT2¤Î¹à»²¾È¡£

# hisat2.sh¤ÎÃæ¿È
for f in 10B 1p2-1 1p2-2 2p5-1 2p6-1 45a_plus 45b_plus 45c_plus 43B 45A_plus 45L 45a10D_plus 45aIII6c_plus 45d7B_plus Anc PwOw_plus 45A_minus 10D_minus 2-10B_minus 45a_minus 45b_minus 45c_minus 45alll6c_minus 45d7B_minus PwOw_minus 
do
  nohup hisat2 -p 32 --dta-cufflinks -x AP012030.fna -U $f.nonrRNA.trim.fastq  -S $f.sam > $f.hisat2.log &
done

·ë²Ì¤È¤·¤Æ¡¢¤½¤ì¤¾¤ì¤ÎSAM¥Õ¥¡¥¤¥ë¤¬¤Ç¤­¤ë¡£hisat2¤Î¥í¥°½ÐÎÏ<sample>hisat2.log¤ò ½¸¤á¤Æ¥ê¥¹¥È²½¤·¤¿¤â¤Î ¢Í¡¡hisat2 alignment

¥«¥¦¥ó¥È¥¢¥Ã¥×¤½¤Î1¡¡featureCounts¤Ë¤è¤ë¥«¥¦¥ó¥È¥¢¥Ã¥×

¤½¤ÎÁ°¤Ë¡¢SAM¥Õ¥¡¥¤¥ë¤ò¥½¡¼¥È¤·¤Æ¤«¤ÄBAM¤Ë¤·¤Æ¤ª¤¯¡£

# compresssam.sh
for f in 10B 1p2-1 1p2-2 2p5-1 2p6-1 45a_plus 45b_plus 45c_plus 43B 45A_plus 45L 45a10D_plus 45aIII6c_plus 45d7B_plus Anc PwOw_plus 45A_minus 10D_minus 2-10B_minus 45a_minus 45b_minus 45c_minus 45alll6c_minus 45d7B_minus PwOw_minus
do
  nohup samtools sort -o $f.bam -@ 32 $f.sam > $f.samtools_sort.log &
done

ÆÀ¤é¤ì¤¿ <sample>.bam ¤òfeatureCounts¤Ç¥«¥¦¥ó¥È¥¢¥Ã¥×¤¹¤ë¡£¤³¤Î»þ¡¢»²¾È¥²¥Î¥à¤Î ¥¢¥Î¥Æ¡¼¥·¥ç¥ó GFF ¥Õ¥¡¥¤¥ë¤¬É¬Íפǡ¢¤«¤Ä¡¢gene¥Õ¥£¡¼¥Á¥ã¡¼¤ËÂФ·¤Ægene_id¤òÆþ¤ì¤Æ¤ª¤¯É¬Íפ¬¤¢¤ë¡£¡Êgene¥Õ¥£¡¼¥Á¥ã¡¼¤ËÆþ¤ì¤¿¤³¤È¤¬¥­¡¼¤Ç¡¢¸å¤ÎfeatureCounts¤Îµ¯Æ°¥Ñ¥é¥á¡¼¥¿¤Ë -t gene ¤ò»ØÄꤹ¤ë¡£ -t CDS ¤È¤¹¤ë¤È¡¢CDS¥Õ¥£¡¼¥Á¥ã¡¼¤Ë¤Ïgene_id¤¬Æþ¤Ã¤Æ¤¤¤Ê¤¤¤Î¤Ç¡¢¥¨¥é¡¼¤òµ¯¤³¤¹¡£

¤½¤ì¤Çºî¤Ã¤¿¥¢¥Î¥Æ¡¼¥·¥ç¥ó¤òAP_e.gff¤È¤¹¤ë¡£

featureCounts¤Îµ¯Æ°¤Ï¡¢

# featureCounts.sh
nohup featureCounts -T 32 -t gene -g gene_id -a AP_e.gff -o Counts.txt *.sam > featureCounts.log &

nohup¤ÎɬÍ×À­¤Ï̵¤¤¤«¤âÃΤì¤Ê¤¤¡£¡Ê¤½¤ì¤Û¤É»þ´Ö¤¬¤«¤«¤é¤Ê¤¤¡Ë

featureCounts¤Î·ë²Ì¤ÎTPMÊäÀµ

# geneid_product.ipynb
import pandas as pd
import numpy as np
import urllib.parse

def normalize_per_million_reads(df):
    # RPM/FPM = reads per million  fragments per million
    sum_count = df.sum()
    return 10**6 * df / sum_count

def normalize_per_kilobase(df, gene_length):
    # FPKM = fragments per kilobase of exon per million reads mapped
    df_tmp = df.copy()
    print('N', df.T * 10**3)
    df_tmp = (df.T * 10**3 / gene_length).T 
    return df_tmp

def normalize_tpm(df, gene_length):
    # TPM = transcripts per kilobase million   https://bi.biopapyrus.jp/ 
    print('normalize_tpm')
    df_tmp = df.copy()
    df_tmp = normalize_per_kilobase(df_tmp, gene_length)
    df_tmp = normalize_per_million_reads(df_tmp)
    return df_tmp

with open('AP_e.gff', 'r') as fin:
    dict = {(0,0): ['','']}
    for line in fin:
        line=line.rstrip()
        if not line.startswith('#'):
            s=line.split('\t')
            startend = ( int(s[3]), int(s[4]) )
            items=s[8].split(';')
            for item in items:
                if item.startswith('product='):
                    product = item.split('=')[1]
                    product = urllib.parse.unquote(product)
                    if startend not in dict.keys():
                        dict[startend] = ['', product]
                        #print('added product -> ', startend, dict[startend])
                    elif dict[startend][1] != '':
                        print('dict', startend, 'product part is not empty', dict[startend])
                    else:
                        dict[startend] = [dict[startend][0], product]
                        #print('add product -> ', startend, dict[startend])
                if item.startswith('gene_id='):
                    geneid = item.split('=')[1]
                    if startend not in dict.keys():
                        dict[startend] = [geneid, '']
                        #print('added geneid -> ', startend, dict[startend])
                    elif dict[startend][0] != '':
                        print('dict', startend, 'geneid part is not empty', dict[startend])
                    else:
                        dict[startend] = [geneid, dict[startend][1]]
                        #print('add geneid -> ', startend, dict[startend])

fout = open('AP012030_geneid_product.tsv', 'w')
print('%s\t%s' % ('gene_id', 'product'), file=fout)
for k, v in sorted(dict.items(), key=lambda x: x[0][0], reverse=False):
    if (v[0]!='' and v[1]!=''):
        print('%s\t%s' % (v[0], v[1]), file=fout)
        #print(v)
fout.close()

### Counts.txt¤È¥Þ¡¼¥¸¤¹¤ë
dfc = pd.read_csv('Counts.txt', index_col=0, skiprows=1, sep='\t')
samples = ['10B.sam','10D_minus.sam','1p2-1.sam','1p2-2.sam','2-10B_minus.sam',\
       '2p5-1.sam','2p6-1.sam','43B.sam','45A_minus.sam','45A_plus.sam','45L.sam','45a10D_plus.sam',\
       '45aIII6c_plus.sam','45a_minus.sam','45a_plus.sam','45alll6c_minus.sam','45b_minus.sam',\
        '45b_plus.sam','45c_minus.sam','45c_plus.sam','45d7B_minus.sam','45d7B_plus.sam','Anc.sam',
        'PwOw_minus.sam','PwOw_plus.sam'] 
gene_products = pd.read_csv('AP012030_geneid_product.tsv', index_col=0, sep='\t')
dfg = dfc.join(gene_products, how='inner')
dfg.to_csv('count_prod.csv', sep='\t')
dfs = dfc[samples]  
dfs.to_csv('count_raw.csv', sep='\t')

# dfc_fpm = normalize_per_million_reads(dfc)
gene_length = dfc['Length']
#dfs_tpm = normalize_tpm(dfs, gene_length)
dfg_tpm = dfg.copy()
dfg_tpm[samples] = normalize_tpm(dfg[samples], gene_length)
dfg_tpm.to_csv("count_tpm.tsv", sep="\t")
print('complete')

·ë²Ì¤Îcount_tpm.tsv¤Ï¡¡¤³¤Î¤è¤¦ ¤Ë¤Ê¤Ã¤¿¡£

ȯ¸½ÃÍʬÉۤγÎǧ

TPMÊäÀµ¸å¤Îȯ¸½ÃͤÎʬÉÛ¤ò³Îǧ¡£

# Èæ³Ó¥Ñ¥¿¡¼¥ó¤ÏC:\Users\yamanouc\Dropbox\RNA½ÅÊ£\KishimotoRNA2\190605¡¡RNAseq ¥µ¥ó¥×¥ë̾.xlsx
# Anc - 43B - 45L - *1.2-1 - +2.5-1
# Anc - 43B - 45L - *1.2-2 - +2.6-2
# 45a 2-10B + - PwOw + - 45a III6c + - 45a 10D + - 45d 7B +
# 43B	45a + - 45b + - 45c + 45A + - 45L
# Python¥Ð¥¤¥ª/¥Ä¡¼¥ë/RNAseq­¤-2 PythonÈÇȯ¸½²òÀϡʳ¤­¤Î³¤­¡Ë»²¾È
#  https://pepper.is.sci.toho-u.ac.jp/pepper/index.php?Python%A5%D0%A5%A4%A5%AA%2F%A5%C4%A1%BC%A5%EB%2FRNAseq%AD%A4-2%20Python%C8%C7%C8%AF%B8%BD%B2%F2%C0%CF%A1%CA%C2%B3%A4%AD%A4%CE%C2%B3%A4%AD%A1%CB
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
sns.set_style('whitegrid')

df = pd.read_csv("../count_tpm.tsv", sep="\t", index_col=0)
df = df[['Anc.sam','43B.sam','45L.sam','1p2-1.sam','2p5-1.sam']]

all_zero_index = df.index[df.sum(axis=1) == 0]
df = df.drop(all_zero_index)
dfl = np.log10(df)
dfl = dfl.replace([np.inf, -np.inf], -1)  # Ãͤ¬inf/-inf¤Î¥Ç¡¼¥¿¤Ï-1¤ËÃÖ´¹
#dfl.plot.kde(style=['-', '--', ':', '-', '-.', ':'])  # Àþ¤Î¥Ñ¥¿¡¼¥ó¤Ï£µ¼ïÎष¤«¤Ê¤¤
sns.kdeplot(dfl['Anc.sam'])  # ÊÌ¡¹¤Ë½ñ¤¤¤Æ½Å¤Í¤ë¤³¤È¤Ë¤·¤¿
sns.kdeplot(dfl['43B.sam'])
sns.kdeplot(dfl['45L.sam'])
sns.kdeplot(dfl['1p2-1.sam'])
sns.kdeplot(dfl['2p5-1.sam'])
plt.xlim(-1, 5)
plt.xlabel('$log_{10}(TPM)$')
plt.ylabel('density')
plt.show()

kdplot.png

¥Ò¡¼¥È¥Þ¥Ã¥×

gene¤´¤È¤Î¥µ¥ó¥×¥ë´Ö¤ÎÊѲ½¤ò¥Ò¡¼¥È¥Þ¥Ã¥×¤Çɽ¼¨¡£Á´gene¤òɽ¼¨¤¹¤ë¤È¤´¤Á¤ã¤´¤Á¤ã¤¹¤ë¤Î¤Ç¡¢¥µ¥ó¥×¥ë´Ö¤Îº¹¤¬¾®¤µ¤¤gene¤Ï̵»ë¤¹¤ë¤³¤È¤Ë¤¹¤ë¡£¤½¤ÎÁªÂòÊýË¡¤Ï¡¢ ÊÑÆ°·¸¿ô¡¡¡Êɸ½àÊк¹/Ê¿¶Ñ¡Ë¤¬°ìÄêÃ͡ʤ¿¤È¤¨¤Ð0.8¡Ë¤è¤ê¾®¤µ¤¤gene¤Ï̵»ë¡£

¤³¤³¤Ç¤Ï¡¢0.8¤òïçÃͤȤ·¤¿¤È¤³¤í¡¢267 gene¤¬»Ä¤Ã¤¿¡£¤½¤Î»Ä¤Ã¤¿gene¤ò¥Ò¡¼¥È¥Þ¥Ã¥×¡£

%matplotlib inline

# heatmap¡¡¥¯¥é¥¹¥¿¡¼²½¤Ë¤è¤Ã¤Æ´ó¤»¤ë
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list
from scipy.spatial.distance import pdist
sns.set()
sns.set(style='whitegrid', font=['IPAexGothic'], palette='Spectral')
# count_tpm.tsv¤ÎÆÉ¹þ¤ß
df = pd.read_csv("../count_tpm.tsv", sep="\t", index_col=0)
df = df[['Anc.sam','43B.sam','45L.sam','1p2-1.sam','2p5-1.sam']]
# ¥µ¥ó¥×¥ëTPM¡ÊlogÁ°¡Ë´Ö¤Îº¹¤¬¾®¤µ¤¤¹Ô¤ò½ü¤¯¡£¶ñÂÎŪ¤Ë¤ÏÊÑÆ°·¸¿ô¡Êɸ½àÊк¹/Ê¿¶Ñ¡Ë¤¬0.8°Ê¾å¤Î¤â¤Î¤ò»Ä¤¹¡£
dfr = df.copy()
dfr['dev']=(df.std(axis=1)/df.mean(axis=1))
dfr = dfr[dfr['dev'] > 0.8]
print('num dfr', len(dfr))
dfr = dfr + 1  # log¤ò¼è¤ëÁ°¤Ë£±¤ò­¤¹
# log10¤ò¼è¤ë
dfl = np.log10(dfr)

dfl_t = dfl.T
#linkage_result_gene = linkage(dfl, method='average', metric='correlation')
linkage_result_gene = linkage(dfl)
plt.figure(figsize=(16,20))
res = dendrogram(linkage_result_gene, labels=dfl.index, leaf_rotation=90, 
leaf_font_size=10, get_leaves=True)
leavesy = leaves_list(linkage_result_gene)
#print('leavesy', leavesy)
lvs = pd.DataFrame([res['ivl'], res['leaves']], columns=res['ivl']).T
lvs.columns = ['gene_id2', 'leaves']
#print(lvs)

dfl2 = pd.merge(lvs, dfl, left_on='gene_id2', right_index=True)

#lvs¤Îʤӽç¤Î¤Þ¤Þ¤ÎÊý¤¬¤¤¤¤
print(dfl2.head())
plt.figure(figsize=(10, 50))
sns.set_palette('hot')
sns.heatmap(dfl2[['Anc.sam','43B.sam','45L.sam','1p2-1.sam','2p5-1.sam']], vmax=3, vmin=0, center=1.0, cmap='YlOrBr')
plt.savefig('comp1.png')
plt.show()

¿Þ


źÉÕ¥Õ¥¡¥¤¥ë: filekdplot.png 694·ï [¾ÜºÙ]

¥È¥Ã¥×   ÊÔ½¸ Åà·ë º¹Ê¬ ¥Ð¥Ã¥¯¥¢¥Ã¥× źÉÕ Ê£À½ ̾Á°Êѹ¹ ¥ê¥í¡¼¥É   ¿·µ¬ °ìÍ÷ ñ¸ì¸¡º÷ ºÇ½ª¹¹¿·   ¥Ø¥ë¥×   ºÇ½ª¹¹¿·¤ÎRSS
Last-modified: 2019-09-01 (Æü) 17:00:28 (1302d)