ノート
訪問者数　281　　　　　　最終更新　2020-03-02 (月) 11:56:09
TPM補正が終わった実験データをどう分析するか考える。
TPM補正済みデータ（発現値の補正済みデータ）は
Chr Start End Strand Length 10B.sam 10D_minus.sam 1p2-1.sam 1p2-2.sam 2-10B_minus.sam gene_0001_thrL AP012030.1 190 255 + 66 1119.230093 2070.357664 5212.719939 4193.551146 1955.183064 gene_0003_thrA AP012030.1 338 2800 + 2463 1872.865147 4391.896215 5773.99795 6352.945352 3971.157253 gene_0005_thrB AP012030.1 2802 3734 + 933 1925.903487 4443.130625 3779.116205 4639.666182 3926.988291 gene_0007_thrC AP012030.1 3735 5021 + 1287 2288.887037 3590.528484 3897.465673 4230.197519 3276.631145 gene_0008_yaaX AP012030.1 5235 5531 + 297 186.9824878 553.2073517 293.9131871 321.6221759 585.3916812 gene_0009_yaaA AP012030.1 5684 6460 - 777 42.27207978 45.02772533 73.69652755 49.63567721 49.63092301 gene_0011_yaaJ AP012030.1 6530 7960 - 1431 29.68188822 18.17449013 19.74707008 17.02170195 21.92416 gene_0014_talB AP012030.1 8239 9192 + 954 3197.624287 397.0260641 1580.352153 1586.773069 443.3769857 gene_0016_mog AP012030.1 9307 9894 + 588 218.27842 87.23291626 137.3536608 138.6934099 94.32622154 gene_0017_yaaH AP012030.1 9929 10495 - 567 146.5658456 50.05540317 89.97128851 78.75914474 61.42555542
のようになっている。
注意：　同じgene名で、複数の異なるCDSになっている場合がある。つまりgene名ではユニークではなく、CDS位置でユニークになる。これらはトランスポゾン（insA～Nなど）などゲノム内を飛び回る遺伝子らしい。
gene名で見て重複のあるCDSを取出してみると
gene | Start | Length | Anc | |
gene_3114 | arpB | 1786475 | 474 | 7.176203468 |
gene_3115 | arpB | 1786959 | 1416 | 1.441322222 |
gene_3782 | gatR | 2153495 | 447 | 6.087732338 |
gene_3786 | gatR | 2155197 | 339 | 163.2194667 |
gene_2015 | icd | 1172515 | 1251 | 1782.60336 |
gene_2058 | icd | 1190066 | 165 | 12.36916525 |
gene_6818 | ilvG | 3929665 | 984 | 1955.874255 |
gene_6819 | ilvG | 3930728 | 582 | 942.1393394 |
gene_0495 | insA | 290296 | 276 | 0 |
gene_1774 | insA | 1040161 | 276 | 6.572986365 |
gene_6204 | insA | 3561389 | 276 | 0.821623296 |
gene_7778 | insA | 4498297 | 276 | 37.7946716 |
gene_0035 | insB | 19812 | 504 | 3.149555967 |
gene_0475 | insB | 278403 | 504 | 0 |
gene_0494 | insB | 289874 | 504 | 2.6996194 |
gene_1775 | insB | 1040355 | 504 | 64.34092903 |
gene_7637 | insB | 4413646 | 504 | 3.599492533 |
gene_7779 | insB | 4498491 | 294 | 30.85279314 |
gene_7781 | insB | 4498785 | 210 | 78.82888648 |
gene_0647 | insC | 381730 | 411 | 0 |
gene_1727 | insC | 1016493 | 411 | 0 |
gene_2221 | insC | 1280354 | 411 | 0 |
gene_5485 | insC | 3164049 | 411 | 1.103494061 |
gene_7744 | insC | 4477997 | 411 | 0 |
gene_0648 | insD | 382080 | 924 | 0 |
gene_1728 | insD | 1016861 | 906 | 0 |
gene_2222 | insD | 1280722 | 906 | 0 |
gene_2529 | insD | 1450103 | 906 | 0 |
gene_3625 | insD | 2052755 | 906 | 0 |
gene_5486 | insD | 3164417 | 906 | 0 |
gene_7745 | insD | 4478347 | 924 | 2.6996194 |
gene_0535 | insE | 315706 | 309 | 0 |
gene_0536 | insE | 315715 | 300 | 0 |
gene_2021 | insE | 1177320 | 309 | 0 |
gene_3785 | insE | 2154038 | 300 | 0 |
gene_0467 | insH | 273326 | 981 | 0 |
gene_0505 | insH | 294457 | 981 | 0 |
gene_0973 | insH | 566399 | 981 | 0 |
gene_1163 | insH | 678257 | 981 | 0 |
gene_1843 | insH | 1081651 | 1017 | 0 |
gene_2218 | insH | 1278220 | 981 | 0 |
gene_2394 | insH | 1377059 | 1017 | 0.222977414 |
gene_2401 | insH | 1380609 | 1017 | 0.222977414 |
gene_2466 | insH | 1409928 | 981 | 16.18120497 |
gene_3011 | insH | 1725750 | 981 | 0 |
gene_3316 | insH | 1893378 | 981 | 0 |
gene_3623 | insH | 2050108 | 1017 | 28.31813152 |
gene_3690 | insH | 2085698 | 1017 | 0 |
gene_3792 | insH | 2158195 | 1017 | 5.351457925 |
gene_3983 | insH | 2274059 | 1017 | 0 |
gene_5382 | insH | 3108121 | 981 | 0 |
gene_5806 | insH | 3343608 | 981 | 0 |
gene_6318 | insH | 3630088 | 1017 | 0 |
gene_6713 | insH | 3868728 | 1017 | 0 |
gene_0463 | insI | 269828 | 1152 | 16.53516882 |
gene_2530 | insI | 1451540 | 1152 | 1.181083487 |
gene_7761 | insI | 4487236 | 1152 | 11.22029313 |
gene_0028 | insL | 15440 | 1119 | 0 |
gene_1028 | insL | 598319 | 1119 | 0 |
gene_4345 | insL | 2499206 | 1119 | 0 |
gene_0462 | insN | 269467 | 405 | 9.518658032 |
gene_7760 | insN | 4486967 | 267 | 40.76728622 |
gene_0464 | insO | 271055 | 426 | 117.11025 |
gene_7764 | insO | 4488728 | 597 | 20.89152701 |
gene_6038 | kefG | 3458512 | 555 | 0 |
gene_6039 | kefG | 3458512 | 552 | 0 |
gene_2164 | ldrB | 1247821 | 135 | 0 |
gene_2167 | ldrB | 1248356 | 135 | 68.87029047 |
gene_2169 | ldrB | 1248891 | 135 | 0 |
gene_2465 | lomR | 1409640 | 156 | 15.99005337 |
gene_2469 | lomR | 1410996 | 171 | 1.326128828 |
gene_3836 | molR | 2181468 | 825 | 3.573314406 |
gene_3838 | molR | 2182404 | 1938 | 8.658841171 |
gene_3840 | molR | 2184304 | 960 | 4.251900555 |
gene_7426 | phnE | 4301437 | 369 | 4.916380045 |
gene_7427 | phnE | 4301655 | 621 | 5.842654547 |
gene_1997 | potA | 1161851 | 1137 | 0 |
gene_1998 | potA | 1161851 | 1119 | 0 |
gene_6273 | rhsB | 3597098 | 4236 | 3.051411163 |
gene_6498 | rhsB | 3746869 | 294 | 13.11243709 |
gene_1713 | sulA | 1009402 | 516 | 0 |
gene_1714 | sulA | 1009402 | 510 | 0 |
gene_3689 | wbbL | 2085199 | 282 | 99.71360166 |
gene_3691 | wbbL | 2086719 | 474 | 189.4517716 |
gene_1830 | ycdN | 1071684 | 111 | 4.085910443 |
gene_1832 | ycdN | 1071794 | 720 | 11.65335708 |
gene_2077 | ycgH | 1198254 | 1521 | 32.05465244 |
gene_2078 | ycgH | 1199859 | 1017 | 105.0223618 |
gene_2212 | ychG | 1273079 | 591 | 44.50946097 |
gene_2213 | ychG | 1273621 | 231 | 54.97406778 |
gene_2284 | yciX | 1316143 | 129 | 54.49464277 |
gene_2285 | yciX | 1316253 | 189 | 278.3607559 |
gene_2528 | ydbA | 1447574 | 2559 | 13.20376569 |
gene_2532 | ydbA | 1452872 | 3324 | 120.0697148 |
gene_3511 | yedN | 1995151 | 192 | 1.181083487 |
gene_3512 | yedN | 1995352 | 321 | 0.70644246 |
gene_3570 | yedS | 2017854 | 486 | 5.132609723 |
gene_3572 | yedS | 2018349 | 210 | 16.1977164 |
gene_3574 | yedS | 2018642 | 405 | 19.03731606 |
gene_3597 | yeeL | 2036079 | 327 | 3.467401064 |
gene_3599 | yeeL | 2036427 | 705 | 0.964970339 |
gene_4039 | yfaS | 2314679 | 486 | 15.39782917 |
gene_4041 | yfaS | 2315180 | 4104 | 11.65888261 |
gene_4169 | yfcC | 2401962 | 1521 | 0.298182813 |
gene_4171 | yfcC | 2402004 | 1479 | 0 |
gene_7747 | yjgX | 4479813 | 432 | 58.26678538 |
gene_7748 | yjgX | 4480202 | 360 | 30.23573728 |
gene_0530 | ykgM | 312798 | 141 | 3697.444681 |
gene_0531 | ykgM | 312938 | 267 | 2342.420321 |
gene_0940 | ylbE | 548781 | 1260 | 0.179974627 |
gene_0941 | ylbE | 549039 | 1002 | 0 |
gene_2630 | yncI | 1512768 | 747 | 21.55358782 |
gene_2632 | yncI | 1513558 | 201 | 15.79478813 |
gene_2579 | yncK | 1485325 | 168 | 31.0456231 |
gene_2580 | yncK | 1485544 | 288 | 37.7946716 |
gene_2084 | ypjA | 1202348 | 213 | 5.323193183 |
gene_4790 | ypjA | 2756055 | 4500 | 23.98701824 |
これらについては、発現量解析の当初では放置しておいた（CDS位置で区別して処理していた）が、遺伝子の発現量としてみると何らかの判断をしなければならない。
内容を見ると（岸本先生による）、同じ配列がコピーされている場合と、１つの配列が途中に余分な夾雑物が入って切れているように見える場合がある。さらに移動の途中で短くなったり長くなったりしているケースがある。これらの判断は難しいが、
それぞれ、発現量について次のような対応をすることにする。
これらによって、gene名を（CDS位置に依らず）ユニークにした。
サンプル間の変動パターンをgeneごとに求めて、gene間で比較して似ているパターン（相補的なパターンを含めて）を探す。もしパターン間の距離を定義できれば、距離の近いものをグループ化する（クラスタ化）と同時に、距離を元にグラフ化して考えることでgene間の関連（反応等のつながり）を考えることができる。
変動の対象として、データから系列に従って、　Anc - ... - 1_2-1 - 2_5-1 と　Anc - ... - 2_2-1 - 2_6-2　を抽出する。
0 - ... - 0 - 0 - 0　つまり一切発現無し、があり得る。これはおそらく分析対象から除外してよいだろう。
0 - 途中にnon-0がある - ...　これらのケースは、例外視する必要はなく、１つのパターンと考えてよかろう。Ancが0であるケースは、「なかったものが出てくるようになった」という意味で少し考える必要があるかも知れないが、とりあえず１つのパターンとして考えることにする。
なお、後述のように、変動として系列上の前の値との比を考えると、0で割ることになるので、微小値に置き換えるなどの工夫が必要になるし、微小値にするとlog10を取ると負の大きな値になって多少始末が悪い。
同じgene名が複数のCDSに現れる件について。？？
①正規化：
発現量の絶対値は、今仮に意味が無いと考える。欲しいのはサンプル間で見たときの増減のパターン（増減の方向と多寡、それが一連のサンプル間でどういう連鎖か）なので、数値はgene間で正規化する必要があるが、１つの方法として同一gene内での変動比率を取ってしまう。
２つの比率が考えられる：１番目は基準値（たとえばAncか、または全体の平均値、最頻値のようなもの）に対する比率、２番目はサンプル間の比率。
②値のlog（log10）による圧縮
これは、発現量の絶対値がgeneによって桁が違う点を考慮するために導入を考えられるが、もし①で比率を考えるとそれほど意味は無い。また比率は、対数化した後では差になることにも留意する。
対数化の注意点は、発現値が0の場合にlogが取れないことで、この場合微小値に置き換えることが考えられる（但しlogを取った結果の値は負の大きな数になる）。系列上すべて0のケースは解析対象から外すことが考えられるが、一部の発現値のみ0のケースは意味があり得るので、外すことは考えない。
今はlog(tpm/Anc)を取った表。
gene | Start | Length | Anc | 43B | 45a_minus | 45A_minus | 45L | 1_2-1 | 2_5-1 | |
gene_0001 | thrL | 190 | 66 | 0 | 0.518801371 | 0.120822012 | 0.130553092 | 0.662278199 | 0.736981706 | 0.782980429 |
gene_0003 | thrA | 338 | 2463 | 0 | 0.363545812 | 0.217293869 | 0.207090028 | 0.462360214 | 0.457629592 | 0.265097062 |
gene_0005 | thrB | 2802 | 933 | 0 | 0.375643354 | 0.338447083 | 0.313037592 | 0.533665319 | 0.35669746 | 0.17628472 |
gene_0007 | thrC | 3735 | 1287 | 0 | 0.351946181 | 0.178209151 | 0.160556929 | 0.449396654 | 0.309469379 | 0.092155761 |
gene_0008 | yaaX | 5235 | 297 | 0 | 0.361804821 | 0.497406189 | 0.565856219 | 0.483361196 | 0.254979922 | 0.141397537 |
gene_0009 | yaaA | 5684 | 777 | 0 | 0.127790066 | -0.16041128 | -0.046404433 | 0.015105221 | 0.04435137 | -0.083863423 |
gene_0011 | yaaJ | 6530 | 1431 | 0 | -0.061913911 | 0.120802403 | 0.215477533 | 0.132366925 | 0.108788741 | -0.028453563 |
gene_0014 | talB | 8239 | 954 | 0 | 0.025617922 | -0.296533334 | -0.408331467 | 0.077236367 | 0.241795445 | 0.203333867 |
gene_0016 | mog | 9307 | 588 | 0 | -0.18900972 | -0.26035833 | -0.255735167 | -0.114590533 | -0.140329364 | -0.159416034 |
gene_0017 | yaaH | 9929 | 567 | 0 | -0.151212522 | -0.346290801 | -0.274411814 | -0.177810069 | -0.161112427 | -0.103483574 |
上記データに対してclusteringを行った結果（全部行うと距離計算に時間が非常にかかるので100個だけ）
import math
import os
import pickle

import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial import distance

# Cache files. Every stage below is skipped when its pickle already exists,
# so stale pickles must be deleted by hand after the code or the input changes.
picklefname = 'DistanceTest.pickle'
pickle2fname = 'CompareTPM2.pickle'
pickleLinkagefname = 'CompareTPMLinkage.pickle'

# Sample columns that form the expression pattern compared between genes.
slist = ['Anc', '43B', '45a_minus', '45A_minus', '45L', '1_2-1', '2_5-1']

# Values at or below this floor are replaced by it before taking a log ratio.
TPM_FLOOR = 0.000001
# Small positive offset added before log10 so that a TPM of 0 stays finite.
LOG_PSEUDOCOUNT = 0.00000001

# Raw column names of count_tpm.tsv -> sample names used in the analysis.
SAMPLE_RENAMES = {
    '10B.sam': '45a_2-10Bplus',
    '10D_minus.sam': '45a_10D_minus',
    '1p2-1.sam': '1_2-1',
    '1p2-2.sam': '1_2-2',
    '2-10B_minus.sam': '45a_2-10B_minus',
    '2p5-1.sam': '2_5-1',
    '2p6-1.sam': '2_6-1',
    '43B.sam': '43B',
    '45A_minus.sam': '45A_minus',
    '45A_plus.sam': '45A_plus',
    '45L.sam': '45L',
    '45a10D_plus.sam': '45a_10D_plus',
    '45aIII6c_plus.sam': '45a_III6c_plus',
    '45a_minus.sam': '45a_minus',
    '45a_plus.sam': '45a_plus',
    '45alll6c_minus.sam': '45a_III6c_minus',
    '45b_minus.sam': '45b_minus',
    '45b_plus.sam': '45b_plus',
    '45c_minus.sam': '45c_minus',
    '45c_plus.sam': '45c_plus',
    '45d7B_minus.sam': '45d_7B_minus',
    '45d7B_plus.sam': '45d_7B_plus',
    'Anc.sam': 'Anc',
    'PwOw_minus.sam': 'PwOw_minus',
    'PwOw_plus.sam': 'PwOw_plus',
}


def dfnormalize(row):
    """Convert one gene's values into log10 ratios against its ``Anc`` value.

    The first three entries of *row* are skipped by position (the caller
    passes ``gene``, ``Start`` and ``Length`` there). Every remaining value
    ``u`` becomes ``log10(u / Anc)``; values not above ``TPM_FLOOR`` are
    replaced by the floor first so the logarithm is defined. When ``Anc`` is
    0 every ratio is reported as 0.

    Args:
        row: a ``pandas.Series`` holding ``gene``, ``Start``, ``Length``,
            ``Anc`` and the sample columns.

    Returns:
        A ``pandas.Series`` indexed by the labels from position 3 onwards,
        plus ``gene``, ``Start`` and ``Length`` copied from *row*. ``Anc`` is
        always 0 (log10(Anc / Anc)).
    """
    anc = row['Anc']
    labels = row.index.to_list()[3:]
    values = row.to_list()[3:]
    if anc == 0:
        ratios = [0] * len(values)
    else:
        ratios = [
            math.log10(u / anc) if u > TPM_FLOOR else math.log10(TPM_FLOOR / anc)
            for u in values
        ]
    output = pd.Series(ratios, index=labels)
    output['gene'] = row['gene']
    output['Anc'] = 0
    output['Start'] = row['Start']
    output['Length'] = row['Length']
    return output


def log10_expression(df, columns=None, pseudocount=LOG_PSEUDOCOUNT):
    """Return ``log10(TPM + pseudocount)`` for the genes expressed somewhere.

    Rows whose value is 0 in every selected column are removed *before* the
    logarithm is taken. (Filtering afterwards cannot work: a TPM of 0 has
    already become ``log10(pseudocount)``, not 0.)

    Args:
        df: expression table, one row per gene.
        columns: sample columns to use; defaults to ``slist``.
        pseudocount: positive offset added to every value before the log.

    Returns:
        A DataFrame restricted to *columns* and to the rows that have at
        least one non-zero value.
    """
    cols = slist if columns is None else columns
    tpm = df[cols]
    expressed = (tpm != 0).any(axis=1)
    return np.log10(tpm.loc[expressed] + pseudocount)


def add_distance_columns(dfl, columns=None):
    """Append one ``D_<gene>`` column per row with pairwise Euclidean distances.

    All distances come from a single ``cdist`` call instead of a Python-level
    ``apply`` per target gene, so the full gene set can be processed.

    Args:
        dfl: table of patterns, one row per gene, indexed by gene id.
        columns: pattern columns; defaults to ``slist``.

    Returns:
        A new DataFrame: *dfl* followed by the square distance matrix, where
        column ``D_<g>`` holds the distance from every row to row ``g``.
    """
    cols = slist if columns is None else columns
    points = dfl[cols].to_numpy(dtype=float)
    square = distance.cdist(points, points, metric='euclidean')
    dist = pd.DataFrame(
        square,
        index=dfl.index,
        columns=['D_' + str(g) for g in dfl.index],
    )
    return pd.concat([dfl, dist], axis=1)


def load_log_expression():
    """Build (or load from cache) the log10 expression table.

    On a cache miss this also writes ``DuplicatedCDS.xlsx`` (CDS rows sharing
    a gene name) and ``CompareTPM.xlsx`` (log10 ratios against ``Anc``).
    """
    if os.path.exists(picklefname):
        return pd.read_pickle(picklefname)

    df = pd.read_csv('count_tpm.tsv', sep='\t', index_col=0)
    df = df.rename(columns=SAMPLE_RENAMES)

    # Row labels have the form '<9-char gene id>_<gene name>'.
    raw_labels = df.index.to_list()
    df['gene'] = [u[10:] for u in raw_labels]
    df.index = [u[:9] for u in raw_labels]

    dfdup = df[df.duplicated(subset='gene', keep=False)][
        ['gene', 'Start', 'Length', 'Anc', '43B', '45A_minus', '45L',
         '1_2-1', '2_5-1']
    ].sort_values(['gene', 'Start'])
    dfdup.to_excel('DuplicatedCDS.xlsx')
    print(dfdup)

    df1 = df[['gene', 'Start', 'Length',
              'Anc', '43B', '45a_minus', '45b_minus', '45c_minus',
              '45A_minus', '45L', '1_2-1', '2_5-1']]
    df1 = df1[df1['Anc'] != 0]  # Anc is the divisor, so drop genes with Anc == 0
    df1x = df1.apply(dfnormalize, axis=1)
    df1x = df1x[['gene', 'Start', 'Length',
                 'Anc', '43B', '45a_minus', '45A_minus', '45L',
                 '1_2-1', '2_5-1']]
    df1x.to_excel('CompareTPM.xlsx')

    dfl = log10_expression(df, slist)
    print('dfl\n', dfl.head())
    print()
    dfl.to_pickle(picklefname)
    return dfl


def load_distances(dfl):
    """Return *dfl* extended with ``D_<gene>`` columns (cached in a pickle)."""
    if os.path.exists(pickle2fname):
        return pd.read_pickle(pickle2fname)
    print(dfl.index.to_list())
    dfl = add_distance_columns(dfl, slist)
    print(dfl.drop(columns=slist).head(), '\n')
    dfl.to_pickle(pickle2fname)
    return dfl


def load_linkage(dfl):
    """Return ``(linkage_matrix, node_labels)`` for average-linkage clustering."""
    if os.path.exists(pickleLinkagefname):
        with open(pickleLinkagefname, 'rb') as pf:
            result, node_labels = pickle.load(pf)
        return result, node_labels

    dist = dfl.drop(columns=slist)
    # linkage() expects the condensed form of the square distance matrix.
    d_array = distance.squareform(dist.to_numpy())
    result = linkage(d_array, method='average')
    node_labels = [u[2:] for u in dist.columns.to_list()]  # strip the 'D_' prefix
    with open(pickleLinkagefname, 'wb') as pfw:
        pickle.dump([result, node_labels], pfw)
    return result, node_labels


def main():
    """Run the pipeline and save the dendrogram to ``CompareTPM.pdf``."""
    try:
        # Equivalent of the notebook magic '%matplotlib inline'.
        get_ipython().run_line_magic('matplotlib', 'inline')
    except NameError:
        pass  # plain Python interpreter: there is no IPython shell
    # Imported here so the helpers above can be used without matplotlib.
    import matplotlib.pyplot as plt

    dfl = load_log_expression()
    dfl = load_distances(dfl)
    result, node_labels = load_linkage(dfl)

    plt.figure(figsize=(100, 100), dpi=200, facecolor='w', edgecolor='k')
    dendrogram(result, labels=node_labels)
    plt.savefig('CompareTPM.pdf')
    plt.show()
    print('complete')
    return dfl, result, node_labels


if __name__ == '__main__':
    dfl, result, node_labels = main()
更に、出力のCompareTPMLinkage.pickleを読んで、fclusterでクラスタを出力
# Final stage only: everything after the linkage computation.
import os
import pickle

import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage
from scipy.spatial import distance

NUM_CLUSTERS = 10


def collect_clusters(labels, node_labels, gene_name_dict, num):
    """Group gene names by flat-cluster id.

    Args:
        labels: cluster id of every node, as returned by ``fcluster``.
        node_labels: gene ids in the same order as *labels*.
        gene_name_dict: mapping from gene id to gene name.
        num: highest cluster id to report; ids run from 1 to *num*.

    Returns:
        ``[[cluster_id, [gene_name, ...]], ...]`` for ids 1..num, in id order.
        A cluster with no members gets an empty list, and names keep the
        order of *node_labels*.
    """
    members = {cl_id: [] for cl_id in range(1, num + 1)}
    for node, cl_id in zip(node_labels, labels):
        if cl_id in members:
            members[cl_id].append(gene_name_dict[node])
    return [[cl_id, names] for cl_id, names in members.items()]


def main():
    """Cut the stored linkage into flat clusters and pickle each cut."""
    try:
        # Equivalent of the notebook magic '%matplotlib inline'.
        get_ipython().run_line_magic('matplotlib', 'inline')
    except NameError:
        pass  # plain Python interpreter: there is no IPython shell

    # Dictionary converting gene id ('gene_0001') to gene name ('thrL').
    fname = 'count_tpm.tsv'
    df = pd.read_csv(fname, sep='\t', index_col=0)
    gene_name_dict = {u[:9]: u[10:] for u in df.index.to_list()}

    pickleLinkagefname = 'CompareTPMLinkage.pickle'
    with open(pickleLinkagefname, 'rb') as pf:
        result, node_labels = pickle.load(pf)

    for num in range(10, NUM_CLUSTERS + 1):
        # fcluster returns, for every input node, the id of its cluster.
        labels = fcluster(result, t=num, criterion='maxclust')
        clusters = collect_clusters(labels, node_labels, gene_name_dict, num)
        with open('clusters_' + str(num) + '.pickle', 'wb') as pwf:
            pickle.dump(clusters, pwf)
    print('complete')


if __name__ == '__main__':
    main()
クラスタごとに、それぞれに属するgeneの発現変動をグラフ表示する。
# Draw line graphs of the genes that belong to each cluster.
import pickle

import numpy as np
import pandas as pd

NUM = 10
# Sample columns drawn along the x axis of every plot.
PLOT_COLUMNS = ['Anc', '43B', '45a_minus', '45A_minus', '45L', '1_2-1', '2_5-1']
# Each plot shows at most this many genes.
MAX_GENES_PER_PLOT = 10


def select_cluster_rows(dfl, gene_names, columns=None, limit=MAX_GENES_PER_PLOT):
    """Return the first *limit* rows of *dfl* whose ``gene`` is in *gene_names*.

    Args:
        dfl: expression table with a ``gene`` column.
        gene_names: gene names that make up one cluster.
        columns: columns to keep; defaults to ``PLOT_COLUMNS``.
        limit: maximum number of rows returned.

    Returns:
        A DataFrame with the selected rows in table order, restricted to
        *columns*. It is empty when no gene matches.
    """
    cols = PLOT_COLUMNS if columns is None else columns
    return dfl.loc[dfl['gene'].isin(gene_names), cols].iloc[:limit, :]


def main():
    """Plot the expression pattern of up to ten genes for every cluster."""
    try:
        # Equivalent of the notebook magic '%matplotlib inline'.
        get_ipython().run_line_magic('matplotlib', 'inline')
    except NameError:
        pass  # plain Python interpreter: there is no IPython shell
    # Imported here so select_cluster_rows can be used without matplotlib.
    import matplotlib.pyplot as plt

    fname = 'count_tpm.tsv'
    df = pd.read_csv(fname, sep='\t', index_col=0)
    gene_name_dict = {u[:9]: u[10:] for u in df.index.to_list()}

    with open('clusters_' + str(NUM) + '.pickle', 'rb') as pf:
        clusters = pickle.load(pf)

    picklefname = 'DistanceTest.pickle'
    dfl = pd.read_pickle(picklefname)
    # The gene-name column is the same for every cluster, so build it once.
    dfl['gene'] = [gene_name_dict[u] for u in dfl.index]

    # NOTE(review): clusters are matched by gene *name*; if names are shared by
    # several CDS rows, every such row is drawn wherever the name appears.
    # Confirm whether matching by gene id is intended.
    for _cl_id, names in clusters:
        print(names)
        dfg = select_cluster_rows(dfl, names)
        if dfg.empty:
            continue  # DataFrame.plot() raises on a frame without data
        dfg.T.plot()
        plt.show()


if __name__ == '__main__':
    main()
結論は、
必須遺伝子の発現パターンはどうなっているのか？