Python¥Ð¥¤¥ª?¡¡Python¥Ð¥¤¥ª/¥Ä¡¼¥ë?
536¡¡¡¡¡¡2019-08-01 (ÌÚ) 16:15:22

AP012030 Â裲¿Ø¥Ç¡¼¥¿¥á¥â

¥Ç¡¼¥¿Æþ¼ê¤Èmd5¥Á¥§¥Ã¥¯

Äɲåǡ¼¥¿

3rd_181012_10B_S7_R1_001.fastq.gz             3172376284  8·î  1 10:35
3rd_181012_1p2-1_S11_R1_001.fastq.gz          1587657130  8·î  1 10:54
3rd_181012_1p2-2_S13_R1_001.fastq.gz           996285657  8·î  1 10:51
3rd_181012_2p5-1_S12_R1_001.fastq.gz          1958339192  8·î  1 10:56
3rd_181012_2p6-1_S14_R1_001.fastq.gz            24334765  8·î  1 10:37
3rd_181012_45a_plus_S2_R1_001.fastq.gz           4927625  8·î  1 10:37
3rd_181012_45b_plus_S3_R1_001.fastq.gz         949903430  8·î  1 10:51
3rd_181012_45c_plus_S4_R1_001.fastq.gz          26952144  8·î  1 10:38
3rd_181019_43B_S1_R1_001.fastq.gz              804581376  8·î  1 10:50
3rd_181019_45A_plus_S5_R1_001.fastq.gz          13188421  8·î  1 10:38
3rd_181019_45L_S6_R1_001.fastq.gz               13762510  8·î  1 10:38
3rd_181019_45a10D_plus_S9_R1_001.fastq.gz     2899586318  8·î  1 11:03
3rd_181019_45aIII6c_plus_S8_R1_001.fastq.gz   1991989658  8·î  1 10:59
3rd_181019_45d7B_plus_S10_R1_001.fastq.gz     2621909118  8·î  1 11:03
3rd_181103_Anc_S15_R1_001.fastq.gz              15931112  8·î  1 10:52
3rd_PwOw_plus_S16_R1_001.fastq.gz               27902010  8·î  1 10:52

Á÷¿®¸µ¤Îmd5

b9fcb3b2a1c0c86d546c85df9918e3ec  181012_10B_S7_R1_001.fastq.gz
9a32bafc8f17d8f4d3c997666c2d8124  181012_1p2-1_S11_R1_001.fastq.gz
14ea823364a457f3a2820cc592891746  181012_1p2-2_S13_R1_001.fastq.gz
87296a5ab0651f6ef33016426aa683d5  181012_2p5-1_S12_R1_001.fastq.gz
e387abd95d87113870382dedaf31bdd4  181012_2p6-1_S14_R1_001.fastq.gz
69d1f073e9d4eaafc855310a1a7e2d3b  181012_45a_plus_S2_R1_001.fastq.gz
a3f1d243c79da8377b8fe01afc7901d4  181012_45b_plus_S3_R1_001.fastq.gz
9988a7366a559f86628e36a28b98e41a  181012_45c_plus_S4_R1_001.fastq.gz
d7b837feb5477e2f7aa34a65a853b7e1  181019_43B_S1_R1_001.fastq.gz
852751d57375e358a77453d4086cc175  181019_45A_plus_S5_R1_001.fastq.gz
389740c32f8b0c9a772b588297f6def1  181019_45L_S6_R1_001.fastq.gz
fed07f9afa0e0fd02e134ae08d68a8fc  181019_45a10D_plus_S9_R1_001.fastq.gz
061b8267c3997c1078df872f3d1e4b77  181019_45aIII6c_plus_S8_R1_001.fastq.gz
c750892531f03795fdf6e978625c32da  181019_45d7B_plus_S10_R1_001.fastq.gz
a759192ef94b69b3581747e57b1cc11c  181103_Anc_S15_R1_001.fastq.gz
a0993d4bf86518efef1112a9e6391f28  PwOw_plus_S16_R1_001.fastq.gz

¼ê¸µ¤Ç¤Îmd5

b9fcb3b2a1c0c86d546c85df9918e3ec  3rd_181012_10B_S7_R1_001.fastq.gz
9a32bafc8f17d8f4d3c997666c2d8124  3rd_181012_1p2-1_S11_R1_001.fastq.gz
14ea823364a457f3a2820cc592891746  3rd_181012_1p2-2_S13_R1_001.fastq.gz
87296a5ab0651f6ef33016426aa683d5  3rd_181012_2p5-1_S12_R1_001.fastq.gz
e387abd95d87113870382dedaf31bdd4  3rd_181012_2p6-1_S14_R1_001.fastq.gz
69d1f073e9d4eaafc855310a1a7e2d3b  3rd_181012_45a_plus_S2_R1_001.fastq.gz
a3f1d243c79da8377b8fe01afc7901d4  3rd_181012_45b_plus_S3_R1_001.fastq.gz
9988a7366a559f86628e36a28b98e41a  3rd_181012_45c_plus_S4_R1_001.fastq.gz
d7b837feb5477e2f7aa34a65a853b7e1  3rd_181019_43B_S1_R1_001.fastq.gz
852751d57375e358a77453d4086cc175  3rd_181019_45A_plus_S5_R1_001.fastq.gz
389740c32f8b0c9a772b588297f6def1  3rd_181019_45L_S6_R1_001.fastq.gz
fed07f9afa0e0fd02e134ae08d68a8fc  3rd_181019_45a10D_plus_S9_R1_001.fastq.gz
061b8267c3997c1078df872f3d1e4b77  3rd_181019_45aIII6c_plus_S8_R1_001.fastq.gz
c750892531f03795fdf6e978625c32da  3rd_181019_45d7B_plus_S10_R1_001.fastq.gz
a759192ef94b69b3581747e57b1cc11c  3rd_181103_Anc_S15_R1_001.fastq.gz
a0993d4bf86518efef1112a9e6391f28  3rd_PwOw_plus_S16_R1_001.fastq.gz

Quality Check

fastqc.sh¤òÍѰÕ

for u in *.gz
  do fastqc --nogroup $u &
done

bash fastqc.sh¤Çµ¯Æ°

r-rna¥Á¥§¥Ã¥¯

sortmerna¤Ï¡¢gz¥Õ¥¡¥¤¥ë¤Ï¼õ¤±ÉÕ¤±¤Ê¤¤¡£¤·¤¿¤¬¤Ã¤Æ¡¢Á´¥Õ¥¡¥¤¥ë¤òunzip¤¹¤ëɬÍפ¬¤¢¤ë¡£

for u in *.gz
do
  v=${u/.gz/.ugz}
  gzip -dc $u > $v &
done

sortmerna.sh¤òÍѰա£bash sortmerna.sh <¥Õ¥¡¥¤¥ë̾> ¤Ç£±¥Õ¥¡¥¤¥ë¤ò½èÍý

sortmerna --ref ../KishimotoRNA2/rRNA_databases/silva-bac-16s-id90.fasta,../KishimotoRNA2/index/silva-bac-16s-db:../KishimotoRNA2/rRNA_databases/silva-bac-23s-id98.fasta,../KishimotoRNA2/index/silva-bac-23s-db:../KishimotoRNA2/rRNA_databases/rfam-5s-database-id98.fasta,../KishimotoRNA2/index/rfam-5s-db:../KishimotoRNA2/rRNA_databases/rfam-5.8s-database-id98.fasta,../KishimotoRNA2/index/rfam-5.8s-db --reads $1 --sam --num_alignments 1 --fastx --aligned $1.rRNA --other $1.non_rRNA --log -a 8 -v

sortmerna-all.sh¡¡¡Á¡¡¤¹¤Ù¤Æ¤Î¥Õ¥¡¥¤¥ë¤ò½èÍý

bash ./sortmerna.sh  3rd_181012_10B_S7_R1_001.fastq             &
bash ./sortmerna.sh  3rd_181019_45a10D_plus_S9_R1_001.fastq     &
bash ./sortmerna.sh  3rd_181019_45d7B_plus_S10_R1_001.fastq     &
bash ./sortmerna.sh  3rd_181019_45aIII6c_plus_S8_R1_001.fastq   &
bash ./sortmerna.sh  3rd_181012_2p5-1_S12_R1_001.fastq          &
bash ./sortmerna.sh  3rd_181012_1p2-1_S11_R1_001.fastq          &
bash ./sortmerna.sh  3rd_181012_1p2-2_S13_R1_001.fastq          &
bash ./sortmerna.sh  3rd_181012_45b_plus_S3_R1_001.fastq        &
bash ./sortmerna.sh  3rd_181019_43B_S1_R1_001.fastq             &
bash ./sortmerna.sh  3rd_PwOw_plus_S16_R1_001.fastq             &
bash ./sortmerna.sh  3rd_181012_45c_plus_S4_R1_001.fastq        &
bash ./sortmerna.sh  3rd_181012_2p6-1_S14_R1_001.fastq          &
bash ./sortmerna.sh  3rd_181103_Anc_S15_R1_001.fastq            &
bash ./sortmerna.sh  3rd_181019_45L_S6_R1_001.fastq             &
bash ./sortmerna.sh  3rd_181019_45A_plus_S5_R1_001.fastq        &
bash ./sortmerna.sh  3rd_181012_45a_plus_S2_R1_001.fastq        &

Trimming

¡Á¡Á¡Á¡Á¡Á¡Á¡Á

HISAT2-build

hisat2-build ~/1710JNHX-0008/rawdata/AP012030-new.fasta AP012030-new-build.fna
Settings:
 Output files: "AP012030-new-build.fna.*.ht2"
 Line rate: 6 (line is 64 bytes)
 Lines per side: 1 (side is 64 bytes)
 Offset rate: 4 (one in 16)
 FTable chars: 10
 Strings: unpacked
 Local offset rate: 3 (one in 8)
 Local fTable chars: 6
 Local sequence length: 57344
 Local sequence overlap between two consecutive indexes: 1024
 Endianness: little
 Actual local endianness: little
 Sanity checking: disabled
 Assertions: disabled
 Random seed: 0
 Sizeofs: void*:8, int:4, long:8, size_t:8
Input files DNA, FASTA:
  /home/yamanouc/1710JNHX-0008/rawdata/AP012030-new.fasta
Reading reference sizes
  Time reading reference sizes: 00:00:00
Calculating joined length
Writing header
Reserving space for joined string
Joining reference sequences
  Time to join reference sequences: 00:00:00
  Time to read SNPs and splice sites: 00:00:00
Using parameters --bmax 866518 --dcv 1024
  Doing ahead-of-time memory usage test
  Passed!  Constructing with these parameters: --bmax 866518 --dcv 1024
Constructing suffix-array element generator
Building DifferenceCoverSample
 Building sPrime
 Building sPrimeOrder
 V-Sorting samples
 V-Sorting samples time: 00:00:00
 Allocating rank array
 Ranking v-sort output
 Ranking v-sort output time: 00:00:00
 Invoking Larsson-Sadakane on ranks
 Invoking Larsson-Sadakane on ranks time: 00:00:01
 Sanity-checking and returning
Building samples
Reserving space for 12 sample suffixes
Generating random suffixes
QSorting 12 sample offsets, eliminating duplicates
QSorting sample offsets, eliminating duplicates time: 00:00:00
Multikey QSorting 12 samples
  (Using difference cover)
  Multikey QSorting samples time: 00:00:00
Calculating bucket sizes
Splitting and merging
  Splitting and merging time: 00:00:00
Avg bucket size: 577678 (target: 866517)
Converting suffix-array elements to index image
Allocating ftab, absorbFtab
Entering GFM loop
Getting block 1 of 8
  Reserving size (866518) for bucket 1
  Calculating Z arrays for bucket 1
  Entering block accumulator loop for bucket 1:
  bucket 1: 10%
  bucket 1: 20%
  bucket 1: 30%
  bucket 1: 40%
  bucket 1: 50%
  bucket 1: 60%
  bucket 1: 70%
  bucket 1: 80%
  bucket 1: 90%
  bucket 1: 100%
  Sorting block of length 632333 for bucket 1
  (Using difference cover)
  Sorting block time: 00:00:00
Returning block of 632334 for bucket 1
Getting block 2 of 8
  Reserving size (866518) for bucket 2
  Calculating Z arrays for bucket 2
  Entering block accumulator loop for bucket 2:
  bucket 2: 10%
  bucket 2: 20%
  bucket 2: 30%
  bucket 2: 40%
  bucket 2: 50%
  bucket 2: 60%
  bucket 2: 70%
  bucket 2: 80%
  bucket 2: 90%
  bucket 2: 100%
  Sorting block of length 695911 for bucket 2
  (Using difference cover)
  Sorting block time: 00:00:00
Returning block of 695912 for bucket 2
Getting block 3 of 8
  Reserving size (866518) for bucket 3
  Calculating Z arrays for bucket 3
  Entering block accumulator loop for bucket 3:
  bucket 3: 10%
  bucket 3: 20%
  bucket 3: 30%
  bucket 3: 40%
  bucket 3: 50%
  bucket 3: 60%
  bucket 3: 70%
  bucket 3: 80%
  bucket 3: 90%
  bucket 3: 100%
  Sorting block of length 741622 for bucket 3
  (Using difference cover)
  Sorting block time: 00:00:00
Returning block of 741623 for bucket 3
Getting block 4 of 8
  Reserving size (866518) for bucket 4
  Calculating Z arrays for bucket 4
  Entering block accumulator loop for bucket 4:
  bucket 4: 10%
  bucket 4: 20%
  bucket 4: 30%
  bucket 4: 40%
  bucket 4: 50%
  bucket 4: 60%
  bucket 4: 70%
  bucket 4: 80%
  bucket 4: 90%
  bucket 4: 100%
  Sorting block of length 585117 for bucket 4
  (Using difference cover)
  Sorting block time: 00:00:00
Returning block of 585118 for bucket 4
Getting block 5 of 8
  Reserving size (866518) for bucket 5
  Calculating Z arrays for bucket 5
  Entering block accumulator loop for bucket 5:
  bucket 5: 10%
  bucket 5: 20%
  bucket 5: 30%
  bucket 5: 40%
  bucket 5: 50%
  bucket 5: 60%
  bucket 5: 70%
  bucket 5: 80%
  bucket 5: 90%
  bucket 5: 100%
  Sorting block of length 318349 for bucket 5
  (Using difference cover)
  Sorting block time: 00:00:00
Returning block of 318350 for bucket 5
Getting block 6 of 8
  Reserving size (866518) for bucket 6
  Calculating Z arrays for bucket 6
  Entering block accumulator loop for bucket 6:
  bucket 6: 10%
  bucket 6: 20%
  bucket 6: 30%
  bucket 6: 40%
  bucket 6: 50%
  bucket 6: 60%
  bucket 6: 70%
  bucket 6: 80%
  bucket 6: 90%
  bucket 6: 100%
  Sorting block of length 719612 for bucket 6
  (Using difference cover)
  Sorting block time: 00:00:00
Returning block of 719613 for bucket 6
Getting block 7 of 8
  Reserving size (866518) for bucket 7
  Calculating Z arrays for bucket 7
  Entering block accumulator loop for bucket 7:
  bucket 7: 10%
  bucket 7: 20%
  bucket 7: 30%
  bucket 7: 40%
  bucket 7: 50%
  bucket 7: 60%
  bucket 7: 70%
  bucket 7: 80%
  bucket 7: 90%
  bucket 7: 100%
  Sorting block of length 283374 for bucket 7
  (Using difference cover)
  Sorting block time: 00:00:00
Returning block of 283375 for bucket 7
Getting block 8 of 8
  Reserving size (866518) for bucket 8
  Calculating Z arrays for bucket 8
  Entering block accumulator loop for bucket 8:
  bucket 8: 10%
  bucket 8: 20%
  bucket 8: 30%
  bucket 8: 40%
  bucket 8: 50%
  bucket 8: 60%
  bucket 8: 70%
  bucket 8: 80%
  bucket 8: 90%
  bucket 8: 100%
  Sorting block of length 645105 for bucket 8
  (Using difference cover)
  Sorting block time: 00:00:00
Returning block of 645106 for bucket 8
Exited GFM loop
fchr[A]: 0
fchr[C]: 1136926
fchr[G]: 2313079
fchr[T]: 3485960
fchr[$]: 4621430
Exiting GFM::buildToDisk()
Returning from initFromVector
Wrote 5735025 bytes to primary GFM file: AP012030-new-build.fna.1.ht2
Wrote 1155364 bytes to secondary GFM file: AP012030-new-build.fna.2.ht2
Re-opening _in1 and _in2 as input streams
Returning from GFM constructor
Returning from initFromVector
Wrote 2035637 bytes to primary GFM file: AP012030-new-build.fna.5.ht2
Wrote 1176518 bytes to secondary GFM file: AP012030-new-build.fna.6.ht2
Re-opening _in5 and _in5 as input streams
Returning from HierEbwt constructor
Headers:
    len: 4621430
    gbwtLen: 4621431
    nodes: 4621431
    sz: 1155358
    gbwtSz: 1155358
    lineRate: 6
    offRate: 4
    offMask: 0xfffffff0
    ftabChars: 10
    eftabLen: 0
    eftabSz: 0
    ftabLen: 1048577
    ftabSz: 4194308
    offsLen: 288840
    offsSz: 1155360
    lineSz: 64
    sideSz: 64
    sideGbwtSz: 48
    sideGbwtLen: 192
    numSides: 24070
    numLines: 24070
    gbwtTotLen: 1540480
    gbwtTotSz: 1540480
    reverse: 0
    linearFM: Yes
Total time for call to driver() for forward index: 00:00:03

HISAT2

hisat2 -p 32 -x AP012030-new-build.fna -1 ~/1710JNHX-0008/rawdata/Anc/Anc_R1.fastq.gz
    -2 ~/1710JNHX-0008/rawdata/Anc/Anc_R2.fastq.gz -S Anc.sam

47567149 reads; of these:
  47567149 (100.00%) were paired; of these:
    2046602 (4.30%) aligned concordantly 0 times
    44840697 (94.27%) aligned concordantly exactly 1 time
    679850 (1.43%) aligned concordantly >1 times
    ----
    2046602 pairs aligned concordantly 0 times; of these:
      30402 (1.49%) aligned discordantly 1 time
    ----
    2016200 pairs aligned 0 times concordantly or discordantly; of these:
      4032400 mates make up the pairs; of these:
        2317731 (57.48%) aligned 0 times
        1669267 (41.40%) aligned exactly 1 time
        45402 (1.13%) aligned >1 times
97.56% overall alignment rate

GeneBank¥Õ¥©¡¼¥Þ¥Ã¥È¤òGFF3¥Õ¥©¡¼¥Þ¥Ã¥È¤ËÊÑ´¹¤¹¤ë¥×¥í¥°¥é¥à¡¡gb2gff3.py¤òºîÀ®¤¹¤ë¡£

#Converting other formats to GFF3
from BCBio import GFF
from Bio import SeqIO

#in_file = "your_file.gb"
#out_file = "your_file.gff"
in_file = "/home/yamanouc/1710JNHX-0008/rawdata/AP012030.gb"
out_file = "AP012030.gff"
in_handle = open(in_file)
out_handle = open(out_file, "w")

GFF.write(SeqIO.parse(in_handle, "genbank"), out_handle)

in_handle.close()
out_handle.close()

¤³¤ì¤ò¼Â¹Ô¤¹¤ë¡£

python gb2gff3.py

¤Ç¤­¤¿¥Õ¥¡¥¤¥ëAP012030.gff¤ËÂФ·¤Æ add_gene_id¡¡¤ò¼Â¹Ô¤¹¤ë¡£

python /usr/local/RNAseq/add_gene_id AP012030.gff AP012030_e.gff

¥Ç¡¼¥¿Â¦¤Î½àÈ÷¤Ï¡¢samtools¤ò»È¤Ã¤Æsam¥Õ¥¡¥¤¥ëAnc.sam¤òbam¥Õ¥¡¥¤¥ëAnc.bam¤ËÊÑ´¹¤·¡¢¤«¤Ä¡¢position¤Çsort¤¹¤ë¡£ samtools view¤ò»È¤Ã¤Æsam¥Õ¥¡¥¤¥ë¤òbam¥Õ¥¡¥¤¥ë¤ËÊÑ´¹¡£@32¤Ï32¥¹¥ì¥Ã¥É»ÈÍÑ¡£

samtools view -@ 32 -b Anc.sam -o Anc.bam

samtools sort¤ò»È¤Ã¤Æbam¥Õ¥¡¥¤¥ëAnc.bam ¤ò¥½¡¼¥È¤·¤ÆAnc.sorted.bam¤Ë½ÐÎÏ

samtools sort -@ 32 Anc.bam > Anc.sorted.bam 

GFFÊÑ´¹¤ÇÆÀ¤é¤ì¤¿AP012030_e.gff¤ò»È¤Ã¤Æ¡¢Anc¥Ç¡¼¥¿Anc.sorted.bam¤òfeatureCount¤¹¤ë¡£

featureCounts -p -T 32 -t exon -g gene_id -a AP012030_e.gff -o Anc.counts.txt Anc.sorted.bam

¥È¥Ã¥×   ÊÔ½¸ Åà·ë º¹Ê¬ ¥Ð¥Ã¥¯¥¢¥Ã¥× źÉÕ Ê£À½ ̾Á°Êѹ¹ ¥ê¥í¡¼¥É   ¿·µ¬ °ìÍ÷ ñ¸ì¸¡º÷ ºÇ½ª¹¹¿·   ¥Ø¥ë¥×   ºÇ½ª¹¹¿·¤ÎRSS
Last-modified: 2019-08-01 (ÌÚ) 16:15:22 (1336d)