Pythonバイオ/ツール/KakenRNAseq
https://pepper.is.sci.toho-u.ac.jp:443/pepper/index.php?Python%A5%D0%A5%A4%A5%AA%2F%A5%C4%A1%BC%A5%EB%2FKakenRNAseq
Reference / description: removal of ribosomal RNA (rRNA)
Procedure
cd /home/yamanouc/src/KishimotoRNA2
for f in 181012_10B_S7_R1_001 181012_1p2-1_S11_R1_001 181012_1p2-2_S13_R1_001 \
         181012_2p5-1_S12_R1_001 181012_2p6-1_S14_R1_001 181012_45a_plus_S2_R1_001 \
         181012_45b_plus_S3_R1_001 181012_45c_plus_S4_R1_001 181013_45A_minus_S18_R1_001 \
         181019_43B_S1_R1_001 181019_45A_plus_S5_R1_001 181019_45L_S6_R1_001 \
         181019_45a10D_plus_S9_R1_001 181019_45aIII6c_plus_S8_R1_001 181019_45d7B_plus_S10_R1_001 \
         181026_10D_minus_S21_R1_001 181026_2-10B_minus_S20_R1_001 181026_45a_minus_S15_R1_001 \
         181026_45b_minus_S16_R1_001 181026_45c_minus_S17_R1_001 181103_45alll6c_minus_S22_R1_001 \
         181103_45d7B_minus_S23_R1_001 181103_Anc_S19_R1_001 PwOw_minus_S25_R1_001 PwOw_plus_S24_R1_001
do
    # 1st run
    nohup sortmerna --ref ./rRNA_databases/silva-bac-16s-id90.fasta,./index/silva-bac-16s-db:\
./rRNA_databases/silva-bac-23s-id98.fasta,./index/silva-bac-23s-db:\
./rRNA_databases/rfam-5s-database-id98.fasta,./index/rfam-5s-db:\
./rRNA_databases/rfam-5.8s-database-id98.fasta,./index/rfam-5.8s-db \
          --reads 1st/1st_$f.fastq --sam --num_alignments 1 --fastx --aligned 1st/1st_$f.rRNA \
          --other 1st/1st_$f.non_rRNA --log -v > 1st/1st_$f.out &
    # 2nd run
    nohup sortmerna --ref ./rRNA_databases/silva-bac-16s-id90.fasta,./index/silva-bac-16s-db:\
./rRNA_databases/silva-bac-23s-id98.fasta,./index/silva-bac-23s-db:\
./rRNA_databases/rfam-5s-database-id98.fasta,./index/rfam-5s-db:\
./rRNA_databases/rfam-5.8s-database-id98.fasta,./index/rfam-5.8s-db \
          --reads 2nd/2nd_$f.fastq --sam --num_alignments 1 --fastx --aligned 2nd/2nd_$f.rRNA \
          --other 2nd/2nd_$f.non_rRNA --log -v > 2nd/2nd_$f.out &
done
In addition, for the 3rd-run data:
cd /home/yamanouc/src/KishimotoRNA3
bash ./sortmerna.sh 3rd_181019_45a10D_plus_S9_R1_001.fastq &
bash ./sortmerna.sh 3rd_181019_45d7B_plus_S10_R1_001.fastq &
bash ./sortmerna.sh 3rd_181019_45aIII6c_plus_S8_R1_001.fastq &
bash ./sortmerna.sh 3rd_181012_2p5-1_S12_R1_001.fastq &
bash ./sortmerna.sh 3rd_181012_1p2-1_S11_R1_001.fastq &
bash ./sortmerna.sh 3rd_181012_1p2-2_S13_R1_001.fastq &
bash ./sortmerna.sh 3rd_181012_45b_plus_S3_R1_001.fastq &
bash ./sortmerna.sh 3rd_181019_43B_S1_R1_001.fastq &
bash ./sortmerna.sh 3rd_PwOw_plus_S16_R1_001.fastq &
bash ./sortmerna.sh 3rd_181012_45c_plus_S4_R1_001.fastq &
bash ./sortmerna.sh 3rd_181012_2p6-1_S14_R1_001.fastq &
bash ./sortmerna.sh 3rd_181103_Anc_S15_R1_001.fastq &
bash ./sortmerna.sh 3rd_181019_45L_S6_R1_001.fastq &
bash ./sortmerna.sh 3rd_181019_45A_plus_S5_R1_001.fastq &
bash ./sortmerna.sh 3rd_181012_45a_plus_S2_R1_001.fastq &
where sortmerna.sh is:
sortmerna --ref ../KishimotoRNA2/rRNA_databases/silva-bac-16s-id90.fasta,../KishimotoRNA2/index/silva-bac-16s-db:../KishimotoRNA2/rRNA_databases/silva-bac-23s-id98.fasta,../KishimotoRNA2/index/silva-bac-23s-db:../KishimotoRNA2/rRNA_databases/rfam-5s-database-id98.fasta,../KishimotoRNA2/index/rfam-5s-db:../KishimotoRNA2/rRNA_databases/rfam-5.8s-database-id98.fasta,../KishimotoRNA2/index/rfam-5.8s-db --reads $1 --sam --num_alignments 1 --fastx --aligned $1.rRNA --other $1.non_rRNA --log -a 8 -v
Results
See the readcount ALL page.
Procedure
fastqc --nogroup xxx/nonrRNA.fastq
Results
See the FastQC results page.
Since the data for each sample are independent across all three runs, we judged that they could simply be merged.
Specifically, run 1, run 2, and run 3 were concatenated with cat.
→ The merged output was named <sample>.nonrRNA.fastq.
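For example, merging one sample (43B) could look like the sketch below; the exact per-run file names and paths are assumptions based on the sortmerna commands above (assuming sortmerna appended a .fastq extension to the --other basenames), not verified output names.

# minimal sketch for one sample (43B); file names and paths are assumptions
cat 1st/1st_181019_43B_S1_R1_001.non_rRNA.fastq \
    2nd/2nd_181019_43B_S1_R1_001.non_rRNA.fastq \
    ../KishimotoRNA3/3rd_181019_43B_S1_R1_001.fastq.non_rRNA.fastq \
    > 43B.nonrRNA.fastq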
Procedure
for f in 10B 1p2-1 1p2-2 2p5-1 2p6-1 45a_plus 45b_plus 45c_plus 43B 45A_plus 45L \
         45a10D_plus 45aIII6c_plus 45d7B_plus Anc PwOw_plus 45A_minus 10D_minus \
         2-10B_minus 45a_minus 45b_minus 45c_minus 45alll6c_minus 45d7B_minus PwOw_minus
do
    nohup java -jar -Xmx512m /usr/local/Trimmomatic/trimmomatic-0.38.jar SE \
        -threads 32 \
        -phred33 \
        -trimlog $f.trimlog \
        $f.nonrRNA.fastq \
        $f.nonrRNA.trim.fastq \
        ILLUMINACLIP:TruSeq3-PE-2.fa:2:30:10 \
        LEADING:20 \
        TRAILING:20 \
        SLIDINGWINDOW:4:15 \
        MINLEN:36 > $f.trim.out &
done
Results:
See the Trimming results page.
Procedure: build the index for AP012030 in advance, then run the mapping on each sample's fastq.
See the HISAT2 entry on the page Pythonバイオ/ツール/RNAseq リードの参照配列へのマッピング (mapping reads to a reference sequence).
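The index build itself is not shown on this page; a minimal sketch, assuming the reference FASTA is named AP012030.fna (the same name used as the -x prefix in hisat2.sh below):

# build the HISAT2 index; the index prefix must match the -x argument in hisat2.sh
hisat2-build AP012030.fna AP012030.fna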
# Contents of hisat2.sh
for f in 10B 1p2-1 1p2-2 2p5-1 2p6-1 45a_plus 45b_plus 45c_plus 43B 45A_plus 45L \
         45a10D_plus 45aIII6c_plus 45d7B_plus Anc PwOw_plus 45A_minus 10D_minus \
         2-10B_minus 45a_minus 45b_minus 45c_minus 45alll6c_minus 45d7B_minus PwOw_minus
do
    nohup hisat2 -p 32 --dta-cufflinks -x AP012030.fna -U $f.nonrRNA.trim.fastq -S $f.sam > $f.hisat2.log &
done
As a result, a SAM file is produced for each sample. The hisat2 log outputs (<sample>.hisat2.log) were collected and compiled into a list → see the hisat2 alignment page.
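A quick way to pull those numbers together (a sketch, assuming each log ends with the usual hisat2 summary that contains an "overall alignment rate" line):

# print the overall alignment rate from every per-sample hisat2 log
grep -H 'overall alignment rate' *.hisat2.log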
Before counting, sort each SAM file and convert it to BAM.
# compresssam.sh
for f in 10B 1p2-1 1p2-2 2p5-1 2p6-1 45a_plus 45b_plus 45c_plus 43B 45A_plus 45L \
         45a10D_plus 45aIII6c_plus 45d7B_plus Anc PwOw_plus 45A_minus 10D_minus \
         2-10B_minus 45a_minus 45b_minus 45c_minus 45alll6c_minus 45d7B_minus PwOw_minus
do
    nohup samtools sort -o $f.bam -@ 32 $f.sam > $f.samtools_sort.log &
done
Count the resulting <sample>.bam files with featureCounts. This requires the reference genome annotation as a GFF file, and a gene_id attribute must be added to the gene features. (Having gene_id on the gene features is the key point: the featureCounts invocation below specifies -t gene. Specifying -t CDS raises an error, because the CDS features carry no gene_id.)
The annotation prepared this way is named AP_e.gff.
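A minimal sketch of one way to add that attribute, assuming the downloaded GFF (here called AP012030.gff, a hypothetical file name) carries a locus_tag attribute on each gene feature that can be reused as the gene_id:

# copy locus_tag into a gene_id attribute on gene features (locus_tag is an assumption)
awk 'BEGIN { FS = OFS = "\t" }
     /^#/ { print; next }
     $3 == "gene" && $9 !~ /gene_id=/ && match($9, /locus_tag=[^;]+/) {
         $9 = $9 ";gene_id=" substr($9, RSTART + 10, RLENGTH - 10)
     }
     { print }' AP012030.gff > AP_e.gff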
featureCounts is invoked as follows:
# featureCounts.sh
nohup featureCounts -T 32 -t gene -g gene_id -a AP_e.gff -o Counts.txt *.sam > featureCounts.log &
nohup may not be necessary here (it does not take very long).
# geneid_product.ipynb
import pandas as pd
import numpy as np
import urllib.parse

def normalize_per_million_reads(df):
    # RPM/FPM = reads/fragments per million mapped reads
    sum_count = df.sum()
    return 10**6 * df / sum_count

def normalize_per_kilobase(df, gene_length):
    # FPKM = fragments per kilobase of exon per million reads mapped
    df_tmp = df.copy()
    print('N', df.T * 10**3)
    df_tmp = (df.T * 10**3 / gene_length).T
    return df_tmp

def normalize_tpm(df, gene_length):
    # TPM = transcripts per kilobase million (https://bi.biopapyrus.jp/)
    print('normalize_tpm')
    df_tmp = df.copy()
    df_tmp = normalize_per_kilobase(df_tmp, gene_length)
    df_tmp = normalize_per_million_reads(df_tmp)
    return df_tmp

# Build a (start, end) -> [gene_id, product] table from AP_e.gff
with open('AP_e.gff', 'r') as fin:
    dict = {(0, 0): ['', '']}
    for line in fin:
        line = line.rstrip()
        if not line.startswith('#'):
            s = line.split('\t')
            startend = (int(s[3]), int(s[4]))
            items = s[8].split(';')
            for item in items:
                if item.startswith('product='):
                    product = item.split('=')[1]
                    product = urllib.parse.unquote(product)
                    if startend not in dict.keys():
                        dict[startend] = ['', product]
                        #print('added product -> ', startend, dict[startend])
                    elif dict[startend][1] != '':
                        print('dict', startend, 'product part is not empty', dict[startend])
                    else:
                        dict[startend] = [dict[startend][0], product]
                        #print('add product -> ', startend, dict[startend])
                if item.startswith('gene_id='):
                    geneid = item.split('=')[1]
                    if startend not in dict.keys():
                        dict[startend] = [geneid, '']
                        #print('added geneid -> ', startend, dict[startend])
                    elif dict[startend][0] != '':
                        print('dict', startend, 'geneid part is not empty', dict[startend])
                    else:
                        dict[startend] = [geneid, dict[startend][1]]
                        #print('add geneid -> ', startend, dict[startend])

fout = open('AP012030_geneid_product.tsv', 'w')
print('%s\t%s' % ('gene_id', 'product'), file=fout)
for k, v in sorted(dict.items(), key=lambda x: x[0][0], reverse=False):
    if (v[0] != '' and v[1] != ''):
        print('%s\t%s' % (v[0], v[1]), file=fout)
        #print(v)
fout.close()

### Merge with Counts.txt
dfc = pd.read_csv('Counts.txt', index_col=0, skiprows=1, sep='\t')
samples = ['10B.sam', '10D_minus.sam', '1p2-1.sam', '1p2-2.sam', '2-10B_minus.sam',
           '2p5-1.sam', '2p6-1.sam', '43B.sam', '45A_minus.sam', '45A_plus.sam', '45L.sam',
           '45a10D_plus.sam', '45aIII6c_plus.sam', '45a_minus.sam', '45a_plus.sam',
           '45alll6c_minus.sam', '45b_minus.sam', '45b_plus.sam', '45c_minus.sam', '45c_plus.sam',
           '45d7B_minus.sam', '45d7B_plus.sam', 'Anc.sam', 'PwOw_minus.sam', 'PwOw_plus.sam']
gene_products = pd.read_csv('AP012030_geneid_product.tsv', index_col=0, sep='\t')
dfg = dfc.join(gene_products, how='inner')
dfg.to_csv('count_prod.csv', sep='\t')
dfs = dfc[samples]
dfs.to_csv('count_raw.csv', sep='\t')
# dfc_fpm = normalize_per_million_reads(dfc)
gene_length = dfc['Length']
#dfs_tpm = normalize_tpm(dfs, gene_length)
dfg_tpm = dfg.copy()
dfg_tpm[samples] = normalize_tpm(dfg[samples], gene_length)
dfg_tpm.to_csv("count_tpm.tsv", sep="\t")
print('complete')
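For reference, normalize_tpm above computes, for gene $i$ with raw count $c_i$ and length $L_i$ (the featureCounts Length column):

$$\mathrm{TPM}_i = 10^6 \times \frac{c_i / L_i}{\sum_j c_j / L_j}$$

(the $10^3$ factor inside normalize_per_kilobase cancels between numerator and denominator, so it does not affect the result).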
The resulting count_tpm.tsv came out like this.
Check the distribution of expression values after TPM normalization.
# Comparison patterns: see C:\Users\yamanouc\Dropbox\RNA重複\KishimotoRNA2\190605 RNAseq サンプル名.xlsx
# Anc - 43B - 45L - *1.2-1 - +2.5-1
# Anc - 43B - 45L - *1.2-2 - +2.6-2
# 45a 2-10B + - PwOw + - 45a III6c + - 45a 10D + - 45d 7B +
# 43B 45a + - 45b + - 45c + 45A + - 45L
# See the page Pythonバイオ/ツール/RNAseq④-2 Python版発現解析（続きの続き） (Python-based expression analysis, continued):
# https://pepper.is.sci.toho-u.ac.jp/pepper/index.php?Python%A5%D0%A5%A4%A5%AA%2F%A5%C4%A1%BC%A5%EB%2FRNAseq%AD%A4-2%20Python%C8%C7%C8%AF%B8%BD%B2%F2%C0%CF%A1%CA%C2%B3%A4%AD%A4%CE%C2%B3%A4%AD%A1%CB
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
sns.set_style('whitegrid')

df = pd.read_csv("../count_tpm.tsv", sep="\t", index_col=0)
df = df[['Anc.sam', '43B.sam', '45L.sam', '1p2-1.sam', '2p5-1.sam']]
all_zero_index = df.index[df.sum(axis=1) == 0]
df = df.drop(all_zero_index)
dfl = np.log10(df)
dfl = dfl.replace([np.inf, -np.inf], -1)                 # replace inf/-inf values with -1
#dfl.plot.kde(style=['-', '--', ':', '-', '-.', ':'])    # only five line patterns are available
sns.kdeplot(dfl['Anc.sam'])                              # draw each curve separately and overlay them
sns.kdeplot(dfl['43B.sam'])
sns.kdeplot(dfl['45L.sam'])
sns.kdeplot(dfl['1p2-1.sam'])
sns.kdeplot(dfl['2p5-1.sam'])
plt.xlim(-1, 5)
plt.xlabel('$log_{10}(TPM)$')
plt.ylabel('density')
plt.show()
Per-gene changes across samples are displayed as a heatmap. Plotting every gene would be too cluttered, so genes that differ little between samples are ignored: genes whose coefficient of variation (standard deviation / mean) falls below a threshold (e.g. 0.8) are dropped.
Here, with a threshold of 0.8, 267 genes remained; those genes are plotted in the heatmap.
%matplotlib inline
# Heatmap: order the rows by hierarchical clustering
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list
from scipy.spatial.distance import pdist

sns.set()
sns.set(style='whitegrid', font=['IPAexGothic'], palette='Spectral')

# Read count_tpm.tsv
df = pd.read_csv("../count_tpm.tsv", sep="\t", index_col=0)
df = df[['Anc.sam', '43B.sam', '45L.sam', '1p2-1.sam', '2p5-1.sam']]

# Drop rows with little variation among the sample TPMs (before taking logs):
# keep rows whose coefficient of variation (standard deviation / mean) exceeds 0.8.
dfr = df.copy()
dfr['dev'] = (df.std(axis=1) / df.mean(axis=1))
dfr = dfr[dfr['dev'] > 0.8]
print('num dfr', len(dfr))
dfr = dfr + 1          # add 1 before taking the log
# take log10 (note: the 'dev' column is still present in dfr/dfl at this point)
dfl = np.log10(dfr)
dfl_t = dfl.T

#linkage_result_gene = linkage(dfl, method='average', metric='correlation')
linkage_result_gene = linkage(dfl)
plt.figure(figsize=(16, 20))
res = dendrogram(linkage_result_gene, labels=dfl.index, leaf_rotation=90,
                 leaf_font_size=10, get_leaves=True)
leavesy = leaves_list(linkage_result_gene)
#print('leavesy', leavesy)
lvs = pd.DataFrame([res['ivl'], res['leaves']], columns=res['ivl']).T
lvs.columns = ['gene_id2', 'leaves']
#print(lvs)
dfl2 = pd.merge(lvs, dfl, left_on='gene_id2', right_index=True)   # better to keep the row order of lvs as is
print(dfl2.head())

plt.figure(figsize=(10, 50))
sns.set_palette('hot')
sns.heatmap(dfl2[['Anc.sam', '43B.sam', '45L.sam', '1p2-1.sam', '2p5-1.sam']],
            vmax=3, vmin=0, center=1.0, cmap='YlOrBr')
plt.savefig('comp1.png')
plt.show()