Python Bio / pandas Tips
http://pepper.is.sci.toho-u.ac.jp/pepper/index.php?Python%A5%D0%A5%A4%A5%AA%2Fpandas%A5%C6%A5%A3%A5%C3%A5%D7%A5%B9
Last modified: 2022-08-29 (Mon) 15:35:43
Remove or extract duplicate elements from a list (array) in Python | note.nkmk.me
newl = sorted(set(l), key=l.index)
import pandas as pd
df = pd.DataFrame([['A', 1, 3], ['B', 2, 4], ['C', 3, 5], ['D', 4, 6]],
                  columns=['a', 'b', 'c'])
print(df)
#   a  b  c
#0  A  1  3
#1  B  2  4
#2  C  3  5
#3  D  4  6
print(df[df.index.isin([2, 3])].sum())   # sum the rows given in a list
#a    CD
#b     7
#c    11
#dtype: object
print(df.loc[0:1].sum())                 # sum a range of consecutive rows
#a    AB
#b     3
#c     7
#dtype: object
print(df.loc[[0, 2]].sum())              # non-adjacent rows (given as a list) also work
#a    AC
#b     4
#c     8
#dtype: object
Bonus: the mean.
print(df[df.index.isin([2, 3])].mean())
#b    3.5
#c    5.5
#dtype: float64
print(df.loc[0:1].mean())
#b    1.5
#c    3.5
#dtype: float64
pd.options.display.precision = 2  # round all displayed values to 2 decimal places
df.round(2)                       # round every value to 2 decimal places
How to define a new column in pandas as an arbitrary function of the values of several columns | Shikoan's ML Blog
import pandas as pd
comike = pd.DataFrame({
    "block":  ["AX", "AY", "あZ", "あX", "イQ", "イR"],
    "number": [1, 1, 10, 11, 12, 13],
    "side":   ["aX", "bX", "aY", "bZ", "aQ", "bR"]
})
print(comike)
#comike["space"] = comike.apply(lambda x: f"{x['block']}-{x['number']:02d}{x['side']}", axis=1)
comike = comike[comike.apply(lambda x: x['block'][1] == x['side'][1], axis=1)]
print(comike)
Sort pandas.DataFrame, Series with sort_values, sort_index | note.nkmk.me
Sort by values:
df.sort_values()                  # for a DataFrame, pass by='column name'
df.sort_values(ascending=False)
Sort by row labels (index):
df.sort_index()
Sort by column labels:
df.sort_index(axis=1)
even_list = [i for i in range(10) if i % 2 == 0]
even_list = [i if i % 2 == 0 else "odd" for i in range(10)]
Write an if statement in one line with Python's ternary (conditional) operator | note.nkmk.me
6. Expressions — Python 3.8.0 documentation, 6.12. Conditional Expressions
result = a * 2 if a % 2 == 0 else a * 3
When you want an elif as well (not recommended):
result = 'negative' if a < 0 else 'positive' if a > 0 else 'zero'
Replace element values in pandas.DataFrame, Series with replace | note.nkmk.me
df = df.replace(oldvalue, newvalue)
The replacement can also be specified with a regular expression:
print(df.replace('(.*)li(.*)', r'\1LI\2', regex=True))
as above, by passing regex=True (the default is False).
For how to write the regular expressions themselves, see e.g. "How to use Python's regular expression module re (match, search, sub, etc.) | note.nkmk.me".
Nested comprehensions in Python - Qiita
[(x, y) for x in [1, 2] for y in ['a', 'b', 'c']]
Note that y is the inner loop and x is the outer loop.
Conditional element replacement can be written with where.
Assign values conditionally in pandas (where, mask) | note.nkmk.me
With where, you can rewrite values by passing the new value as the second argument (other).
Example: rewriting a single column 'A'
df['D'] = df['A'].where(df['C'] == 'a', 100)        # write a constant
df['D'] = df['A'].where(df['C'] == 'a', df['B'])    # take the values from df['B']
Note, however, that viewed as a "replace" operation the condition is inverted (True means the value is NOT replaced). The reason is where's original behavior: without the other argument it keeps the elements where the condition is True and returns NaN where it is False. In other words, True elements stay as they are; False elements become something else, namely the value of the other argument if one is given, and NaN otherwise.
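A minimal sketch of that behavior with a throwaway Series:

import pandas as pd

s = pd.Series([1, 2, 3, 4])
print(s.where(s > 2))       # True rows keep their value, False rows become NaN
# 0    NaN
# 1    NaN
# 2    3.0
# 3    4.0
print(s.where(s > 2, 0))    # False rows are filled with the second argument (other)
# 0    0
# 1    0
# 2    3
# 3    4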
Example: replacing over the whole DataFrame
df2 = df.where(df < 0, 100)
df2 = df.where(df < 0, df * 2)
Rename rows and columns of a pandas.DataFrame | note.nkmk.me
df = df.rename(columns={old_name: new_name, old_name2: new_name2, ...})
To give the index itself a name:
df.index.name = colname1
and similarly
df.columns.name = colname2
That is,
import pandas as pd
df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=['A', 'B', 'C'])
print(df)
df.index.name = 'インデックス'
print(df)
df.columns.name = 'カラム'
print(df)
With no names set, this prints
   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9
after naming the index 'インデックス':
         A  B  C
インデックス
0        1  2  3
1        4  5  6
2        7  8  9
and after also naming the columns 'カラム':
カラム    A  B  C
インデックス
0        1  2  3
1        4  5  6
2        7  8  9
[[Reset the index of pandas.DataFrame, Series with reset_index | note.nkmk.me https://note.nkmk.me/python-pandas-reset-index/]]
df = df.reset_index()             # the old index is added back as a new column; error if the column name collides
df = df.reset_index(drop=True)    # discard the index entirely (it is not kept as a column)
df = df.set_index('列名')          # use that column as the index; the column disappears from the columns
df = df.reset_index().set_index('列名')   # change the index while keeping the old index as a column
df = df.loc[:, ["col3", "col1", "col2", "col0"]]
Drop rows and columns of a pandas.DataFrame by label with drop | note.nkmk.me
df = df.drop('行名', axis=0)   # axis=0 is the default
df = df.drop(index='行名')
df = df.drop('列名', axis=1)
df = df.drop(columns='列名')
For several rows or columns:
df = df.drop(index=['行名1', '行名2'])
Specifying by row position:
df = df.drop(index=df.index[[1, 3, 5]])
Dropping or keeping rows based on a value condition can be done with boolean indexing:
df = df[df['参照'] >= 3]
df = df[(df['参照1'] >= 3) & (df['参照2'] != '')]   # combine conditions with & and |, not "and"
import pandas as pd
df = pd.DataFrame([['A', 1, 0.1], ['B', 2, 0.2], ['C', 3, 0.3], ['D', 4, 0.4]],
                  columns=['ラベル', '値', '値'])
print(df)
#  ラベル  値    値
#0   A    1  0.1
#1   B    2  0.2
#2   C    3  0.3
#3   D    4  0.4
dfx = df[['ラベル', '値']]                # both 値 columns remain
print(dfx)
#  ラベル  値    値
#0   A    1  0.1
#1   B    2  0.2
#2   C    3  0.3
#3   D    4  0.4
dfx = df.drop('値', axis=1)              # drops both of them
print(dfx)
#  ラベル
#0   A
#1   B
#2   C
#3   D
dfx = df.drop(df.columns[[2]], axis=1)   # specifying position 2 also drops both
print(dfx)
#  ラベル
#0   A
#1   B
#2   C
#3   D
dfx = df.copy()                          # beware: with a shallow copy the rename below would also hit df
dfx.columns = ['ラベル', '値', '値２']     # assigning all names rewrites every column name
dfx = dfx[['ラベル', '値']]               # then simply drop 値２
print(dfx)
#  ラベル  値
#0   A    1
#1   B    2
#2   C    3
#3   D    4
dfx = df.rename(columns={'値': '値３'})   # renames both
print(dfx)
#  ラベル  値３  値３   <-- both become '値３', so this is no good
#0   A    1  0.1
#1   B    2  0.2
#2   C    3  0.3
#3   D    4  0.4
In short, the only real fix is to rewrite all the column names at once.
To pick elements out of a MultiIndex DataFrame/Series, the xs() method is the most readable.
Select and extract arbitrary rows and columns from a pandas MultiIndex | note.nkmk.me
print(df.xs('index_value', level='level_name'))
For example, given
                         val_1  val_2
level_1 level_2 level_3
A0      B0      C0          98     90
                C1          44      9
        B1      C2          39     17
                C3          75     71
A1      B2      C0           1     89
                C1          54     60
        B3      C2          47      6
                C3          16      5
A2      B0      C0          75     22
                C1          19      4
        B1      C2          25     52
                C3          57     40
¤ËÂФ·¤Æ¡¢
print(df.xs('B1', level='level_2')) # level_2¤¬'B1'¤Ç¤¢¤ë¹Ô¤òÁªÂò
¤È¤¹¤ë¤È
#                  val_1  val_2
# level_1 level_3
# A0      C2          39     17
#         C3          75     71
# A2      C2          25     52
#         C3          57     40
Select and extract arbitrary rows and columns from a pandas MultiIndex | note.nkmk.me
# MultiIndex
import pandas as pd

# Create DF
columns = ['level_1', 'level_2', 'level_3', 'val_1', 'val_2']
l = \
[['A0', 'B0', 'C0',  98, 90],
 ['A0', 'B0', 'C1',  44,  9],
 ['A0', 'B1', 'C2',  39, 17],
 ['A0', 'B1', 'C3',  75, 71],
 ['A1', 'B2', 'C0',   1, 89],
 ['A1', 'B2', 'C1',  54, 60],
 ['A1', 'B3', 'C2',  47,  6],
 ['A1', 'B3', 'C3',  16,  5],
 ['A2', 'B0', 'C0',  75, 22],
 ['A2', 'B0', 'C1',  19,  4],
 ['A2', 'B1', 'C2',  25, 52],
 ['A2', 'B1', 'C3',  57, 40],
 ['A3', 'B2', 'C0',  64, 54],
 ['A3', 'B2', 'C1',  27, 96],
 ['A3', 'B3', 'C2', 100, 77],
 ['A3', 'B3', 'C3',  22, 50]]
df = pd.DataFrame(l, columns=columns)
print(df)
df_m = df.set_index(['level_1', 'level_2', 'level_3'])   # turn into a MultiIndex
print()
print(df_m)
print()
print(df_m.xs('B1', level='level_2'))   # access via xs
print()
df_T = df_m.T                           # transpose to get MultiIndex columns
print(df_T)
print()
print(df_T.xs('B1', level='level_2', axis=1))   # xs handles MultiIndex columns with axis=1
The result:
   level_1 level_2 level_3  val_1  val_2
0       A0      B0      C0     98     90
1       A0      B0      C1     44      9
2       A0      B1      C2     39     17
3       A0      B1      C3     75     71
4       A1      B2      C0      1     89
5       A1      B2      C1     54     60
6       A1      B3      C2     47      6
7       A1      B3      C3     16      5
8       A2      B0      C0     75     22
9       A2      B0      C1     19      4
10      A2      B1      C2     25     52
11      A2      B1      C3     57     40
12      A3      B2      C0     64     54
13      A3      B2      C1     27     96
14      A3      B3      C2    100     77
15      A3      B3      C3     22     50

                         val_1  val_2
level_1 level_2 level_3
A0      B0      C0          98     90
                C1          44      9
        B1      C2          39     17
                C3          75     71
A1      B2      C0           1     89
                C1          54     60
        B3      C2          47      6
                C3          16      5
A2      B0      C0          75     22
                C1          19      4
        B1      C2          25     52
                C3          57     40
A3      B2      C0          64     54
                C1          27     96
        B3      C2         100     77
                C3          22     50

                 val_1  val_2
level_1 level_3
A0      C2          39     17
        C3          75     71
A2      C2          25     52
        C3          57     40

level_1  A0              A1              A2              A3
level_2  B0      B1      B2      B3      B0      B1      B2      B3
level_3  C0  C1  C2  C3  C0  C1  C2  C3  C0  C1  C2  C3  C0  C1  C2  C3
val_1    98  44  39  75   1  54  47  16  75  19  25  57  64  27 100  22
val_2    90   9  17  71  89  60   6   5  22   4  52  40  54  96  77  50

level_1  A0      A2
level_3  C2  C3  C2  C3
val_1    39  75  25  57
val_2    17  71  52  40
Specify, add, remove, sort, and change the levels of a pandas MultiIndex | note.nkmk.me
Basically reset_index() is enough, but you can also give the flattened columns nicer names. See "Tips for handling pandas multi-level columns nicely - Qiita".
df.columns.values
returns the column names as an array of tuples:
array([('score', 'count'), ('score', 'max'), ('score', 'min'), ('score', 'mean'), ('score', 'std')], dtype=object)
so, using this,
def get_converted_multi_columns(df):
    return [col[0] + '_' + col[1] for col in df.columns.values]
    # or:
    # return [col[0] + col[1].capitalize() for col in df.columns.values]
and so on. It is probably best to apply this after reset_index.
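For instance, a minimal sketch (the 'group' and 'score' column names here are made up):

import pandas as pd

df = pd.DataFrame({'group': ['a', 'a', 'b'], 'score': [1, 2, 3]})
agg = df.groupby('group').agg({'score': ['count', 'max', 'min']})
print(agg.columns.values)   # array of tuples such as ('score', 'count'), ...
agg.columns = [c[0] + '_' + c[1] for c in agg.columns.values]
agg = agg.reset_index()
print(agg)                  # columns: group, score_count, score_max, score_min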
s = pd.Series(['A', 'B', 'C'])
df = pd.DataFrame([s])
print(df)
# 0 1 2 # 0 A B C
Transposing it:
df = pd.DataFrame([s]).T
print(df)
# 0 # 0 A # 1 B # 2 C
A slightly bigger example:
# MultiIndex
import pandas as pd

# Create DF
columns = ['level_1', 'level_2', 'level_3', 'val_1', 'val_2', 'val_3']
l = \
[['X0', 'Y0', 'Z0', 'U', 'U', 'V'],
 ['X1', 'Y1', 'Z1', 'P', 'Q', 'P'],
 ['A0', 'B0', 'C0',  98, 90,  5],
 ['A0', 'B0', 'C1',  44,  9,  4],
 ['A0', 'B1', 'C2',  39, 17,  3],
 ['A0', 'B1', 'C3',  75, 71,  2],
 ['A1', 'B2', 'C0',   1, 89,  1],
 ['A1', 'B2', 'C1',  54, 60,  9],
 ['A1', 'B3', 'C2',  47,  6,  8],
 ['A1', 'B3', 'C3',  16,  5,  7],
 ['A2', 'B0', 'C0',  75, 22,  6],
 ['A2', 'B0', 'C1',  19,  4, 51],
 ['A2', 'B1', 'C2',  25, 52, 52],
 ['A2', 'B1', 'C3',  57, 40, 53],
 ['A3', 'B2', 'C0',  64, 54, 54],
 ['A3', 'B2', 'C1',  27, 96, 55],
 ['A3', 'B3', 'C2', 100, 77, 56],
 ['A3', 'B3', 'C3',  22, 50, 57]]
df = pd.DataFrame(l, columns=columns)
print(df)
df_m = df.set_index(['level_1', 'level_2', 'level_3'])   # turn into a MultiIndex
print()
print(df_m)
print()
print(df_m.xs('B1', level='level_2'))   # access via xs
print()
df_T = df_m.T                           # transpose to get MultiIndex columns
print('df_T\n', df_T)
print()
#print(df_T.xs('B1', level='level_2', axis=1))   # xs handles MultiIndex columns with axis=1
df_X = df_T.set_index([('X0', 'Y0', 'Z0'), ('X1', 'Y1', 'Z1')])
print(df_X.index)
index = pd.MultiIndex.from_tuples(df_X.index, names=['First', 'Second'])
df_X.index = index
print('df_X\n', df_X)
print()
df_Y = df_X.xs('B1', level='level_2', axis=1)   # access via xs
print('df_Y\n', df_Y)
print()
print(df_Y.index)
df_R = df_Y.reset_index(level=('X0', 'Y0', 'Z0'))
print('df_R\n', df_R)
This approach seems good: use pd.MultiIndex.from_tuples().
import numpy as np
import pandas as pd

index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns = pd.MultiIndex.from_tuples([('speed', 'max'),
                                     ('species', 'type')])
df = pd.DataFrame([(389.0, 'fly'),
                   ( 24.0, 'fly'),
                   ( 80.5, 'run'),
                   (np.nan, 'jump')],
                  index=index, columns=columns)
print(df)
#print(df.reset_index())
#print(df.reset_index(level='class'))
# renaming the columns?
columns = pd.MultiIndex.from_tuples([('P', 'Q'), ('R', 'S')])
df_R = df.copy()
df_R.columns = columns
print('df_R\n', df_R)
df_RR = df.copy()
df_RR[('P', 'Q')] = df[('speed', 'max')]
df_RR = df_RR.drop(('speed', 'max'), axis=1)
print('df_RR\n', df_RR)
print()
print('reset\n', df_RR.reset_index(level='name'))
Swap the keys and values of a dict in Python | note.nkmk.me
d_swap = {v: k for k, v in d.items()}
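A quick check of the idiom (note that duplicate values collapse into a single key):

d = {'apple': 1, 'banana': 2}
d_swap = {v: k for k, v in d.items()}
print(d_swap)   # {1: 'apple', 2: 'banana'}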
Convert pandas.DataFrame, Series to a dict (to_dict) | note.nkmk.me
To turn the relationship between two DataFrame columns into a dict:
pair = df[['key_col', 'value_col']]
ser = pair['value_col']
ser.index = pair['key_col']
dic = ser.to_dict()
Convert between pandas.DataFrame, Series and plain Python lists | note.nkmk.me
For just the values of a Series:
s = pd.Series([0, 1, 2])
l_1d = s.values.tolist()
For just the values of a DataFrame:
df = pd.DataFrame([[0, 1, 2], [3, 4, 5]])
l_2d = df.values.tolist()
print(l_2d)
# [[0, 1, 2], [3, 4, 5]]
To convert the row labels (index) along with the values, in the end you use reset_index etc. to pull the index in as a column:
l_1d_index = s_index.reset_index().values.tolist()
print(l_1d_index)
# [['row1', 0], ['row2', 1], ['row3', 2]]
There is no reset_index equivalent for the column labels, so one way is to transpose and apply reset_index there as well:
l_2d_index_columns = df_index.reset_index().T.reset_index().T.values.tolist()
print(l_2d_index_columns)
# [['index', 'col1', 'col2', 'col3'], ['row1', 0, 1, 2], ['row2', 3, 4, 5]]
For the labels alone, use df.index and df.columns, calling tolist() to get plain lists:
df.index.tolist()
df.columns.tolist()
df = df.duplicated(keep='last', subset=['この列での重複'])   # one boolean column: True if the row is a duplicate
df = df[~df.duplicated(keep='...', subset=['..', '..'])]      # keep everything except the duplicates (= drop_duplicates)
df = df.drop_duplicates(keep='last')                          # drop duplicated rows, keeping only the last of each
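A small runnable illustration of those three patterns (the column names are made up):

import pandas as pd

df = pd.DataFrame({'a': [1, 1, 2], 'b': ['x', 'y', 'z']})
print(df.duplicated(subset=['a'], keep='last'))        # True for the first of the two a==1 rows
print(df[~df.duplicated(subset=['a'], keep='last')])   # keep only the last of each duplicate group
print(df.drop_duplicates(subset=['a'], keep='last'))   # same result via drop_duplicates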
Extract rows containing a specific string in pandas (exact match, partial match) | note.nkmk.me
An exact match can simply use ==:
dfx = df[df['state'] == 'CA']
For a partial match, in apparently does not work; use str.contains():
dfx = df[df['name'].str.contains('li')]
Note, however, that by default the first argument is interpreted as a regular expression (essentially like re.search), so patterns containing periods etc. may not do what you expect. To avoid this, pass regex=False:
dfx = df[df['name'].str.contains('li', regex=False)]
Also, when an element is NaN, str.contains returns NaN by default, which causes an error when the result is used to extract rows. Pass na=True or na=False to decide what NaN should become:
dfx = df[df['name'].str.contains('li', na=True)]    # rows with NaN are included in the result
dfx = df[df['name'].str.contains('li', na=False)]   # rows with NaN are excluded
Besides str.contains, there are also str.startswith, str.endswith, and str.match (which behaves like re.match).
[[Extract pandas.DataFrame rows by condition with query | note.nkmk.me: https://note.nkmk.me/python-pandas-query/]]
Basically, either use the isin method or use in inside query:
dfx = df[df['state'].isin(['NY', 'TX'])]
or
dfx = df.query('state in ["NY", "TX"]')
This also works:
dfx = df.query('state == ["NY", "TX"]')
Group a pandas.DataFrame with groupby and compute statistics | note.nkmk.me
grouped = df.groupby('column to group by')
Statistics can then be computed on the result:
df = grouped.sum()
The usual aggregations are available: sum(), mean(), min(), max(), std(), var(), etc.
You can also apply a function with agg(). The function must take a Series and return a single object.
df = grouped.agg(lambda x: type(x))['sl']
df = grouped.agg(min)
Different functions can be applied to different columns:
df = grouped.agg({'列1': min, '列2': max})
You can also group by several key columns:
gf = df.groupby(['列1', '列2']).mean()
With as_index=False (when True, the default, the keys become the index of the result), the keys stay as ordinary columns of the result and the index remains the default one.
df = pd.DataFrame({
    'city': ['osaka', 'osaka', 'osaka', 'osaka', 'tokyo', 'tokyo', 'tokyo'],
    'food': ['apple', 'orange', 'banana', 'banana', 'apple', 'apple', 'banana'],
    'price': [100, 200, 250, 300, 150, 200, 400],
    'quantity': [1, 2, 3, 4, 5, 6, 7]
})
gb = df.groupby(['city', 'food']).mean()   # no as_index given (the default is True)
print(gb)
¤ËÂФ·¤Æ
              price  quantity
city  food
osaka apple   100.0       1.0
      banana  275.0       3.5
      orange  200.0       2.0
tokyo apple   175.0       5.5
      banana  400.0       7.0
but with as_index=False,
gb_noindex = df.groupby(['city', 'food'], as_index=False).mean()
print(gb_noindex)
¤ËÂФ·¤Æ
    city    food  price  quantity
0  osaka   apple  100.0       1.0
1  osaka  banana  275.0       3.5
2  osaka  orange  200.0       2.0
3  tokyo   apple  175.0       5.5
4  tokyo  banana  400.0       7.0
so the city and food columns remain ordinary columns.
If you simply want to count items, you can use size() (shown below) or the count() function.
How to count data with the count function in pandas - DeepAge
dfcount = df.groupby('city').count()
print(dfcount)
#       food  price  quantity
#city
#osaka     4      4         4
#tokyo     3      3         3
dfcount = df.groupby('city')[['food']].count().rename(columns={'food': 'count'})
print(dfcount)
#       count
#city
#osaka      4
#tokyo      3
How to use pandas groupby - Qiita
df.groupby('city').groups
{'osaka': Int64Index([0, 1, 2, 3], dtype='int64'), 'tokyo': Int64Index([4, 5, 6], dtype='int64')}
df.groupby('city').get_group('osaka')
    city    food  price  quantity
0  osaka   apple    100         1
1  osaka  orange    200         2
2  osaka  banana    250         3
3  osaka  banana    300         4
df.groupby('city').size()
city
osaka    4
tokyo    3
dtype: int64
df.groupby('city').size()['osaka']
4
Apply a function to elements, rows, and columns in pandas: map, applymap, apply | note.nkmk.me
Prepare a list of DataFrames joinlist = [df1, df2, ..., dfn] and join them onto df:
dfout = df.join(joinlist)
In this form you cannot specify a key (the on argument is not supported for a list of DataFrames); the join is done on the index.
That means the indexes have to be set appropriately beforehand:
df1 = df1.set_index('column to use as key')
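A minimal sketch of the whole pattern (df1, df2, df3 and the 'key' column are made up):

import pandas as pd

df1 = pd.DataFrame({'key': ['a', 'b'], 'x': [1, 2]}).set_index('key')
df2 = pd.DataFrame({'key': ['a', 'b'], 'y': [3, 4]}).set_index('key')
df3 = pd.DataFrame({'key': ['a', 'b'], 'z': [5, 6]}).set_index('key')
out = df1.join([df2, df3])   # joins on the shared index
print(out)
#      x  y  z
# key
# a    1  3  5
# b    2  4  6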
NaN is numpy's nan.
Outside a DataFrame:
if np.isnan(np.nan):
    hogehoge
Checking for NaN inside a DataFrame (these are listed under "General functions" in the pandas API reference):
df = pd.DataFrame([[1, 3], [2, np.nan]])
print(df.isnull())
print(df.notnull())
Note that isnull() is an alias of isna(), and notnull() an alias of notna().
NoneType and NaN are apparently different things; the upshot is that is None is the reliable test for None:
x = None
print(x is None)
[python] Check whether a variable is NoneType | akamist blog https://akamist.com/blog/archives/3067
df = pd.DataFrame(columns=['A', 'B'])
for u in [0, 1, 2, 3]:
    temp_series = pd.Series([u, u * u], index=df.columns)
    df = df.append(temp_series, ignore_index=True)   # note: DataFrame.append was removed in pandas 2.0; use pd.concat there
Adding a column afterwards is easy when the new data has the same length; above there are 4 rows, so
df['NEW'] = [3, 4, 5, 6]
If the lengths differ, the assignment above raises an error. Building a Series first and assigning that works:
## Test ~ convert a list (of a different length) to a DF column
import pandas as pd
import numpy as np

list1 = [1, 2, 3]
list2 = [4, 5, 6, 7, 8]
list3 = [10, 11]
# If all the lists have the same length, a simple df['new column'] = list works;
# with differing lengths it raises an error.
df = pd.DataFrame(index=[0, 1, 2, 3, 4], columns=[])
#df['A'] = list1   # <-- error: the length differs
s = pd.Series(list1, index=[0, 1, 2])
df['A'] = s
print(df)   # the missing elements become NaN
#     A
#0  1.0
#1  2.0
#2  3.0
#3  NaN
#4  NaN
df.to_excel('mytest.xlsx')    # the NaN cells show up as blanks in Excel
df = df.replace(np.nan, '')   # replace nan with an empty string
print(df)
#   A
#0  1
#1  2
#2  3
#3
#4
df.to_excel('mytest2.xlsx')   # the empty-string cells are also blank in Excel
[Python] Count the elements of a list (collections.Counter) | Hibiki Programming Notes
c1 = collections.Counter('abbcabbbccca')
c1.update({'a': 1, 'b': 1, 'c': 1})   # note: this modifies c1 in place (unlike '+')
or
c1 = collections.Counter('abbcabbbccca')
c1.update('abc')
also works.
To subtract, use subtract:
>>> c = Counter(a=4, b=2, c=0, d=-2)
>>> d = Counter(a=1, b=2, c=3, d=4)
>>> c.subtract(d)     # note: this modifies c in place (unlike '-')
>>> c
Counter({'a': 3, 'b': 0, 'c': -3, 'd': -6})
Counts can become 0 or negative.
By contrast, "8.3. collections — Container datatypes — Python 3.6.10rc1 documentation" shows:
>>> c = Counter(a=3, b=1)
>>> d = Counter(a=1, b=2)
>>> c + d                       # add two counters together:  c[x] + d[x]
Counter({'a': 4, 'b': 3})
>>> c - d                       # subtract (keeping only positive counts)
Counter({'a': 2})
>>> c & d                       # intersection:  min(c[x], d[x])
Counter({'a': 1, 'b': 1})
>>> c | d                       # union:  max(c[x], d[x])
Counter({'a': 3, 'b': 2})
Note that with the - operator, elements whose count drops to 0 or below disappear (it is a Count, after all), so the behavior differs from subtract.
In general, items with a count of 0 tend to get dropped: if adding or subtracting Counters brings a value to 0, it no longer shows up when you print the result.
A Counter is essentially a dict, so the sorting techniques in the next section apply to it as well.
Summary of ways to sort a Python dict | HEADBOOST
d = {'b': 2, 'a': 3}
print(sorted(d))                               # sort the keys
print(sorted(d.keys()))                        # same thing
print(sorted(d.values()))
print(sorted(d.items()))                       # sort the items by key
print(sorted(d.items(), key=lambda x: x[1]))   # sort the items by value
Using this: count codons and sort the output by codon name.
import pandas as pd
import collections

# Write a dict of codon counts into a DataFrame.
# Given d = {'B': 1, 'C': 2} and df = pd.DataFrame(columns=['A', 'B', 'C']),
# we want the resulting row to read A=0, B=1, C=2.
df = pd.DataFrame(columns=['A', 'B', 'C'])
zerodic = collections.Counter([])   # no good: the zero-count entries never appear
zerodic = collections.Counter({'A': 0, 'B': 0, 'C': 0})
print(zerodic)
d = collections.Counter({'B': 1, 'C': 2})
print(zerodic + d)   # no good either: entries whose count is 0 are dropped
# zerodic.update({'B': 1, 'C': 2})   # with update, the original zerodic acts as the template
                                     # (but note that update modifies it in place)
print(zerodic)
# Then sort by key, keep only the values, convert to a pd.Series, and append it to the DataFrame.
szero = sorted(zerodic.items(), key=lambda x: x[0])   # sort the (key, value) pairs; result is a list of pairs
szerolist = [u[1] for u in szero]                     # keep only the values
print(szerolist)
ser = pd.Series(szerolist, index=df.columns)
df = df.append(ser, ignore_index=True)   # removed in pandas 2.0; use pd.concat there
print(df)
Read csv/tsv files with pandas (read_csv, read_table) | note.nkmk.me
pandas.read_csv — pandas 0.25.0 documentation
codecs — Codec registry and base classes — Python 3.7.4 documentation
import pandas as pd
pd.read_csv('filename', sep='\t', header=0, skiprows=[], encoding='cp932')
or
with open('filename', 'r', encoding='...') as fin:
    pd.read_csv(fin, ...)
If the separator is runs of whitespace,
pd.read_csv('filename', sep=r'\s+')
works. See: python - How to make separator in pandas read_csv more flexible wrt whitespace? - Stack Overflow
df = ...
df.to_csv('filename', ...)
pandas.DataFrame.to_excel — pandas 0.25.0 documentation
If you wish to write to more than one sheet in the workbook, it is necessary to specify an ExcelWriter object:
>>> df2 = df1.copy()
>>> with pd.ExcelWriter('output.xlsx') as writer:  # doctest: +SKIP
...     df1.to_excel(writer, sheet_name='Sheet_name_1')
...     df2.to_excel(writer, sheet_name='Sheet_name_2')
Or, without with:
writer = pd.ExcelWriter('output.xlsx')
...
df1.to_excel(writer, sheet_name='Sheet_name_1')
df2.to_excel(writer, sheet_name='Sheet_name_2')
...
writer.save()    # <-- required (in newer pandas versions save() is gone and close() alone is enough)
writer.close()   # <-- required
Getting password handling directly into pandas (or openpyxl, xlrd) seems to be hard.
pip install msoffcrypto-tool
import tempfile
import msoffcrypto
import pandas as pd
from pathlib import WindowsPath

file_dir = WindowsPath(r"path to the directory containing the survey files")

# process each response file in turn
for file in file_dir.glob("*.xlsm"):
    # create a temporary, decrypted copy of the file
    with file.open("rb") as f, tempfile.TemporaryFile() as tf:
        office_file = msoffcrypto.OfficeFile(f)
        office_file.load_key(password="the password")
        office_file.decrypt(tf)
        # load the responses from the temporary file
        df = pd.read_excel(tf, header=None)
Read an Excel file with a multi-row header in pandas and tidy up the columns - Qiita
df = pd.read_excel(filename, sheet_name='xxx', skiprows=y, header=[u, v, w])
This first skips the leading y rows, then reads the column names: rows u, v, and w (counted after the skip) become the first, second, and third levels of the column names.
Suppose you want to turn
   A     B
   a  b  a  b
0  0  1  2  3
1  4  5  6  7
into
   Aa  Ab  Ba  Bb
0   0   1   2   3
1   4   5   6   7
To rewrite the column names like that:
df.columns = df.columns.map(''.join)
In case some column names are not strings, convert them to strings first:
df.columns = df.columns.map(lambda x: ''.join([*map(str, x)]))
before joining.
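A self-contained sketch of the same flattening, building the two-level columns in code instead of reading them from Excel:

import pandas as pd

cols = pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
df = pd.DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]], columns=cols)
df.columns = df.columns.map(''.join)   # ('A', 'a') -> 'Aa', etc.
print(df.columns.tolist())             # ['Aa', 'Ab', 'Ba', 'Bb']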
By default, line breaks inside cells do not survive when writing to Excel (inserting \n into the data has no effect).
You have to set wrap_text=True in the cell's style (alignment) properties.
In the end, the easiest route seems to be: write the file with to_excel first, then reopen it with openpyxl and set that property on every cell of the sheet.
As for how to set it,
cell.alignment = cell.alignment.copy(wrapText=True)
produces a deprecation warning, so
cell.alignment = Alignment(wrap_text=True)
was used instead.
The whole thing:
# "to_Excel"¤Ç¥»¥ëÆâ²þ¹Ô¥¢¥ê¤¬²Äǽ¤«¡© import pandas as pd df = pd.DataFrame([['5\n4' ,3], [2,6], [8,5]]) # ¥Ç¡¼¥¿Ãæ¤Ë\n¤òÆþ¤ì¤ë print(df) df.to_excel("mytest2.xlsx") # ¤È¤Ë¤«¤¯DF¤Î¥Ç¡¼¥¿¤«¤éexcel¥Õ¥¡¥¤¥ë¤òºî¤Ã¤Æ¤·¤Þ¤¦ import openpyxl from openpyxl.styles import Alignment wb = openpyxl.load_workbook('mytest2.xlsx') ws = wb['Sheet1'] for row in ws.iter_rows(): for cell in row: #cell.alignment = cell.alignment.copy(wrapText=True) # Depreciated·Ù¹ð #cell.style.alignment.wrap_text=True # ¥¨¥é¡¼ cell.alignment = Alignment(wrap_text=True) # ¤³¤ì¤Ê¤éư¤¤¤¿ wb.save('mytest2.xlsx') # ¥Õ¥¡¥¤¥ë¤Ø½ñ¤½Ð¤·¡Ê½ñ¤Ìᤷ¡Ë
That achieves it.
A figure drawn with matplotlib can be turned into image data (not especially pretty, but it works):
from matplotlib import pyplot as plt
import numpy as np
from PIL import Image

x = np.linspace(0, 2*np.pi, 21)
y1 = np.sin(x)
y2 = np.cos(x)
y3 = y1 - y2
fig, ax = plt.subplots()
ax.plot(x, y1, 'b.-')
ax.plot(x, y2, 'g,-.')
ax.plot(x, y3, 'r,-.')
fig.canvas.draw()
#im = np.array(fig.canvas.renderer.buffer_rgba())
im = np.array(fig.canvas.renderer._renderer)   # for matplotlib older than 3.1
img = Image.fromarray(im)
img.show()   # this worked in a Jupyter notebook; PIL's show() output goes to an X window
Another example (turning the renderer output into a byte string): How to convert a matplotlib figure to a numpy array or a PIL image | ICARE Data and Services Center
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

# make an agg figure
fig, ax = plt.subplots()
ax.plot([1, 2, 3])
ax.set_title('a simple figure')
fig.canvas.draw()

# grab the pixel buffer and dump it into a numpy array
#X = np.array(fig.canvas.renderer.buffer_rgba())   # this did not work here
w, h = fig.canvas.get_width_height()
buf = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8)
buf.shape = (w, h, 4)
# canvas.tostring_argb gives a pixmap in ARGB mode; roll the alpha channel to get RGBA
buf = np.roll(buf, 3, axis=2)
w, h, d = buf.shape
im = Image.frombytes("RGBA", (w, h), buf.tostring())
im.show()   # this worked in a Jupyter notebook; PIL's show() output goes to an X window
The layout can be controlled with subplots:
fig, ax = plt.subplots(ncols=len(xdata), nrows=len(ydata),
                       figsize=(6*len(xdata), 6*len(ydata)))
...
for i, x in xdata:
    for j, y in ydata:
        sns.kdeplot(data[i], data[j], ax=ax[j, i], shade=True)
plt.show()
Use subplots_adjust:
plt.subplots_adjust(wspace=0.4, hspace=0.6)
The defaults are apparently 0.2 and 0.2; larger values widen the gaps. The right value depends on the number of panels.
Use matplotlib's annotate.
<Python, matplotlib> Add text to each point of a scatter plot - Nekoyuki's memo
Show a label on each point when drawing a scatter plot with pandas - Qiita
annotate needs an ax, so for example:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame([[1, 3, 5], [2, 4, 6]],
                  index=['助教', '教授'],
                  columns=['少ない', 'まあまあ', '多い'])
dfx = pd.DataFrame(columns=['職階', '評価', '発生数'])
for ix in df.index:
    for col in df.columns:
        line = [ix, col, df.loc[ix][col]]
        s = pd.Series(line, index=['職階', '評価', '発生数'])
        dfx = dfx.append(s, ignore_index=True)
print(df)
print('dfx\n', dfx)
ax = plt.subplot(1, 1, 1)   # <-- create an ax like this
plt.scatter(dfx['職階'].to_list(), dfx['評価'].to_list(),
            s=[u*20 for u in dfx['発生数'].to_list()])
for k, v in dfx.iterrows():
    print(v)
    ax.annotate(v[2], xy=(v[0], v[1]), size=14)
plt.show()
In "Show a label on each point when drawing a scatter plot with pandas - Qiita", instead of creating an ax explicitly:
import pandas as pd

# put the data into a DataFrame
dd = pd.DataFrame([
    [10, 50, 'hoge'],
    [50, 30, 'fpp'],
    [20, 30, 'baa']
], columns=['ax', 'ay', 'label'])

# draw the scatter plot
a = dd.plot.scatter(x='ax', y='ay')

# put a label on each point
for k, v in dd.iterrows():
    a.annotate(v[2], xy=(v[0], v[1]), size=15)
i.e. keep the return value as a = dd.plot.scatter() and then call a.annotate() on it.
With plain matplotlib:
plt.xticks(rotation=90)
and so on. 90 rotates the labels 90 degrees (vertical text, which is the default for bar plots); rotation=0 gives horizontal labels.
With pandas plot you can add the xticks call above, or pass the angle as a plot argument, e.g.
df.plot(kind='bar', rot=0)
df.plot.bar(rot=0)
In both cases the angle works as for xticks: 0 gives horizontal labels, but the default is 90 (vertical).
plt.legend(bbox_to_anchor=(0.005, 0.995), loc='upper left', borderaxespad=0)
or
plt.legend(bbox_to_anchor=(1.005, 0.995), loc='upper left', borderaxespad=0)
python - Modify the legend of pandas bar plot - Stack Overflow
When you call plt once per series (plain matplotlib rather than pandas), the legend text can be given as
plt.bar(..., label='your own text', ...)   # (plt.bar, not plt.plot.bar)
When pandas draws the whole DataFrame at once (one series per column), the first option is to pass a list to plt.legend() / ax.legend():
df = pd.DataFrame({'A': 26, 'B': 20}, index=['N'])
ax = df.plot(kind='bar')
ax.legend(["AAA", "BBB"])
This also works when one figure contains several axes:
fig, ax = plt.subplots(nrows=1, ncols=len(years), figsize=(8*len(years), 6))
for i, year in enumerate(years):
    dfhx.plot.line(ax=ax[i])
    ax[i].legend([...])   # a list for subplot i, one entry per line
Legend guide — Matplotlib 1.3.1 documentation
ax = plt.subplot(1, 1, 1)
p1, = ax.plot([1, 2, 3], label="line 1")
p2, = ax.plot([3, 2, 1], label="line 2")
p3, = ax.plot([2, 3, 1], label="line 3")
handles, labels = ax.get_legend_handles_labels()
# reverse the order
ax.legend(handles[::-1], labels[::-1])
In my own program:
# df2 is prepared beforehand
a = df2.plot.bar(stacked=True)
plt.title(lvl1[0] + '/' + lvl2)
handles, labels = a.get_legend_handles_labels()
plt.legend(handles[::-1], labels[::-1],
           bbox_to_anchor=(1.005, 0.995), loc='upper left', borderaxespad=0)
pdf.savefig(bbox_inches='tight')
plt.show()
いるかのボックス: Displaying text labels without overlap in Matplotlib
When you label the points of a scatter plot, a large number of points makes the labels overlap and become unreadable. The adjustText library helps avoid this (watch out: the name sometimes has an underscore between "adjust" and "text" and sometimes not).
pip install adjusttext
After installing it,
from adjustText import adjust_text
...
texts = []
a = dfout.plot.scatter(x='重要度', y='満足度')
for k, v in dfout.iterrows():
    u = a.annotate(v[2], xy=(v[0], v[1]), size=10)
    texts.append(u)
plt.title('...')
adjust_text(texts)
plt.savefig(...)
plt.show()
This positions the labels automatically in most cases (though it is said to be slow).
With multiple subplots, adjust_text has to be called with ax specified for each one; otherwise the positions end up wrong.
With multiple subplots, run adjust_text for one subplot at a time
fig, axes = plt.subplots(1, 2, figsize=(8, 3))
for k, ax in enumerate(axes):
    ...
    ax.plot(x, y, 'bo')
    ...
    texts = []
    for i in range(len(x)):
        t = ax.text(x[i], y[i], 'Text%s' % i, ha='center', va='center')
        texts.append(t)
    adjust_text(texts, ax=ax)
matplotlib - How to configure ticks, tick labels, and the grid - Pynote
Apparently you set these with ax.set_xticklabels(list) and ax.set_yticklabels(list). The number of labels has to match the number of ticks (i.e. the count set with set_xticks). The example there does:
# set the major x-axis ticks
ax.set_xticks(np.linspace(0, np.pi * 4, 5))
# set the major x-axis tick labels
ax.set_xticklabels(["0", "$1\pi$", "$2\pi$", "$3\pi$", "$4\pi$"])
# set the major y-axis ticks
ax.set_yticks(np.linspace(-1, 1, 3))
# set the major y-axis tick labels
ax.set_yticklabels(["A", "B", "C"])
In my own example:
a = dfhxzout.plot.line(ax=ax[i], marker='x', xticks=range(len(DHankelist[col])),
                       title=year + '年入学 入学時 ' + col, rot=90)
a.set_xticklabels([u[3:8] for u in DHankelist[col]])
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt

X = np.array([[1, 2], [2, 1], [3, 4], [4, 3]])
Z = linkage(X, 'single')   # to use Ward's method, pass 'ward' instead of 'single'
dendrogram(
    Z,
    leaf_font_size=8.,   # font size for the x axis labels
)
plt.show()
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram

df_count_tpm = pd.read_csv("count_tpm.tsv", sep="\t", index_col=0)
tpm_t = df_count_tpm.T
print(df_count_tpm.head())

from scipy.spatial.distance import pdist
linkage_result = linkage(tpm_t, method='average', metric='correlation')
plt.figure(num=None, figsize=(16, 9), dpi=200, facecolor='w', edgecolor='k')
dendrogram(linkage_result, labels=df_count_tpm.columns)
plt.show()
To extract the clusters themselves, use fcluster:
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt

X = np.array([[1, 2], [2, 1], [3, 4], [4, 3], [1, 3], [3, 1]])
Z = linkage(X, 'single')   # to use Ward's method, pass 'ward' instead of 'single'
plt.figure(figsize=(20, 20), dpi=200, facecolor='w', edgecolor='k')
dendrogram(
    Z,
    leaf_font_size=8.,   # font size for the x axis labels
    labels=['A', 'B', 'C', 'D', 'E', 'F']
)
plt.savefig('dendrogram.pdf')
plt.show()

NUM_CLUSTERS = 5
nodelabels = ['A', 'B', 'C', 'D', 'E', 'F']
for num in range(5, NUM_CLUSTERS + 1):
    labels = fcluster(Z, t=num, criterion='maxclust')
    # fcluster returns, for each input point, the cluster it belongs to (cluster numbers, labels)
    print(num, labels)
    # for each cluster, list the inputs that belong to it
    for cl_id in range(1, num + 1):
        l = [nodelabels[n] for n in range(0, len(labels)) if labels[n] == cl_id]
        print(' ', cl_id, l)
Output of fcluster:
5 [1 2 3 4 1 2]   # cluster ids (1-4) for each element: elements 1 and 5 fall in cluster 1, elements 2 and 6 in cluster 2
Rewritten per cluster:
1 ['A', 'E']   # the same information reorganized: cluster 1 contains elements 1 and 5, with the element ids replaced by their labels (A, E)
2 ['B', 'F']
3 ['C']
4 ['D']
5 []
import pandas as pd
from sqlalchemy import create_engine

def read_db(query):
    engine = create_engine('mysql+mysqldb://userid:password@localhost/dbname?charset=utf8',
                           echo=False)
    dfr = pd.io.sql.read_sql(query, engine)
    return dfr

query = 'select * from tablename'
df = read_db(query)
import os
path = "./sample"
os.path.exists(path)
With plain Python (pickle):
import pickle
with open(picklefname, 'rb') as pf:
    df = pickle.load(pf)
with open(picklefname, 'wb') as pwf:
    pickle.dump(df, pwf)
A pandas DataFrame has dedicated methods for this:
df.to_pickle(picklefname)
df = pd.read_pickle(picklefname)
x = "OK" if n == 10 else "NG"
s = ''
for u in list:
    s += u
'separator string'.join([list of strings to concatenate])
s = ''.join(list)
>>> "spam".find("pa") 1 >>> "spam".find("x") -1
index, by contrast, raises an error (ValueError) when the substring is not found.
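A quick illustration of the difference:

s = "spam"
print(s.find("x"))    # -1
try:
    s.index("x")
except ValueError as e:
    print("index raised:", e)   # substring not found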
df.x.apply(f1, args=(2,))
# or pass it to apply as a keyword argument:
df.x.apply(f1, b=2)
df[X].isin([...])   # the argument must be a list (or other iterable)
https://note.nkmk.me/python-pandas-at-iat-loc-iloc/
import sys
argvs = sys.argv
argc = len(argvs)
if argc != 2:
    print('Usage: python %s filename' % argvs[0])
    quit()
newls = sorted(list)
Note that the sort method sorts in place:
list.sort()   # the list itself is modified
# Various ways to flatten a list in Python 3
# https://qiita.com/hoto17296/items/e1f80fef8536a0e5e7db
sum([[1,2,3],[4,5,6],[7,8,9]], []) #=> [1, 2, 3, 4, 5, 6, 7, 8, 9]
The second argument [] is required: its default is 0, so without it sum assumes numeric addition.
l = [['A', 'B', 'C'], ['D', 'E'], ['F']]
print(sum(l, []))
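Another standard way to flatten one level (not specific to the article above) is itertools.chain.from_iterable, which avoids building many intermediate lists:

import itertools

l = [['A', 'B', 'C'], ['D', 'E'], ['F']]
flat = list(itertools.chain.from_iterable(l))
print(flat)   # ['A', 'B', 'C', 'D', 'E', 'F']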
var = 1

def add_ten(x):
    global var
    var = 10
    print(var + x)

add_ten(2)   # 12
print(var)   # 10
def subtract_list(lst1, lst2):
    lst = lst1.copy()
    for e2 in lst2:
        try:
            lst.remove(e2)
        except ValueError:
            continue
    return lst
This is faster than checking membership with if first.
In particular, when NaN is present the values are treated as float, which can trip up later processing.
If you want to escape that with a type check, the following is what you need (though in pandas it is usually better to use dropna etc. than to type-check individual elements).
Get and check types in Python: the type and isinstance functions | note.nkmk.me
isinstance() seems to be the better choice; comparisons with type can miss cases that isinstance catches.
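One concrete illustration (a sketch; numpy float scalars subclass Python's float, and bool subclasses int):

import numpy as np

x = np.float64(1.5)
print(type(x) is float)       # False
print(isinstance(x, float))   # True  (np.float64 subclasses float)

b = True
print(type(b) is int)         # False
print(isinstance(b, int))     # True  (bool subclasses int)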
df['innings'].astype(np.int64)
list.insert(position, value)
Note that this modifies the list in place.
Python's built-in abs: the absolute value of an int is an int, of a float is a float, and complex numbers are also accepted.
math.fabs always returns a float, for both ints and floats; complex numbers are not accepted.
numpy's abs (which can also be written absolute) works on ndarrays as well; if ints and floats are mixed, the result is float (the widest precision).
numpy.fabs, like math.fabs, always returns floats; complex numbers are not accepted.
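A quick comparison of the four:

import math
import numpy as np

print(abs(-3), abs(-3.5), abs(3 + 4j))    # 3  3.5  5.0
print(math.fabs(-3), math.fabs(-3.5))     # 3.0  3.5   (always float, no complex)
print(np.abs(np.array([-3, -3.5])))       # [3.  3.5]  (mixed ints/floats -> float)
print(np.fabs(np.array([-3, -4])))        # [3. 4.]    (always float)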
python - Pandas pie plot actual values for multiple graphs - Stack Overflow
df.hist()
(df['column']).hist()
(df['column']).value_counts()
(df['column']//10*10).value_counts()
How to compute value frequencies with value_counts in pandas - DeepAge
The result of the count is a Series:
s = pd.Series([3, 2, 7, 2, 3, 4])
u = s.value_counts(sort=False)
print(u)
# 2    2
# 3    2
# 4    1
# 7    1
print(type(u))
# <class 'pandas.core.series.Series'>
print(u.index)
# Int64Index([2, 3, 4, 7], dtype='int64')
# sorting the counts u by index (by default value_counts sorts by count, descending)
print(u.sort_index(ascending=False))
# 7    1
# 4    1
# 3    2
# 2    2
²ÊÌÜÊ̤ÎÀ®Àӥǡ¼¥¿¤òGroupBy¤Ç½¸·×À°Íý¤·¡¢ÅÀ¿ô¥Ò¥¹¥È¥°¥é¥à¤òÉÁ¤¯Îã
# df is the input table of per-course grades
df = df[['学籍番号', '学科', '入学年度', '取得素点']]
dfmean = df.groupby('学籍番号').mean()   # each student's average grade over their enrollment
dfgakka = dfmean.groupby(['学科', '入学年度'])['取得素点'].apply(
    lambda d: (d//5*5).value_counts(bins=list(range(0, 101, 5))).sort_index())
# dfgakka is a Series whose index has 3 levels: ('学科', '入学年度', 5-point grade bin)
for year in [2010, 2011, 2012, 2013, 2014, 2015]:   # turn it back into a DataFrame
    for gakka in [51, 52, 53, 54, 55, 56]:          # one column per department
        ser = dfgakka.xs(year, level='入学年度').xs(gakka, level='学科')
        if gakka == 51:
            dfout = pd.DataFrame(ser)
            dfout.columns = [gakka]
        else:
            dfout[gakka] = ser
    dfout = dfout.rename(columns=gakkaname)
    dfout.index = [u.left for u in dfout.index.to_list()]
    #dfoutx.plot.bar(figsize=(8,6), rot=0)
    dfout.plot.line(figsize=(8, 6), marker='x', rot=0,
                    xticks=[u for u in dfout.index.to_list()])
    plt.title('通算評点頻度分布 ' + str(year) + ' 年入学')
    plt.xlabel('平均通算評点')
    plt.ylabel('人数')
    plt.legend(bbox_to_anchor=(1.01, 0.995), loc='upper left', borderaxespad=0)
    pdf.savefig(bbox_inches='tight')
    plt.show()
To have the histogram show percentages instead of head counts:
dfm = dfout.sum()
dfm = dfout/dfm*100
dfm = dfm.rename(columns=gakkaname)
#dfm.plot.bar(figsize=(8,6), rot=0)
dfm.plot.line(figsize=(8, 6), marker='x', rot=0, xticks=[u for u in dfm.index.to_list()])
Count the elements satisfying a condition in pandas (overall, per row/column) | note.nkmk.me
For example,
df['age']<25
returns a Series that is True wherever the value is under 25, so
(df['age']<25).sum()
counts them (True counts as 1, False as 0).
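A small sketch (the 'age' column is the one from above; the values and the 'score' column are made up):

import pandas as pd

df = pd.DataFrame({'age': [20, 30, 24], 'score': [80, 90, 70]})
print((df['age'] < 25).sum())   # 2  (number of rows with age < 25)
print((df < 75).sum())          # per-column counts of elements < 75
print((df < 75).sum(axis=1))    # per-row counts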
If you simply want to count the data (the non-NaN entries),
df.count()          # count per column (down)
df.count(axis=1)    # count per row (across)
does the job.
bbox_inches="tight" and related settings - virsalus's diary
plt.savefig('sample.pdf', bbox_inches='tight')
or
plt.tight_layout()
either one deals with the cut-off labels.
Given a distance matrix dmat:
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt

ndmat = squareform(dmat)
lk = linkage(ndmat, method='average')
plt.figure(num=None, figsize=(22, 12), dpi=200, facecolor='w', edgecolor='k')
dendrogram(lk, labels=dmat.index, leaf_rotation=90)
plt.tight_layout()
# plt.savefig('corr_coeff_dendrogram.png', bbox_inches="tight")
plt.savefig('corr_coeff_dendrogram.png')
Python: Convert XML to a dict with xmltodict
4.3.2.4 Location testing
You can use the Python keyword in with a SeqFeature or location object to see if the base/residue for a parent coordinate is within the feature/location or not.
For example, suppose you have a SNP of interest and you want to know which features this SNP is within, and lets suppose this SNP is at index 4350 (Python counting!). Here is a simple brute force solution where we just check all the features one by one in a loop:
>>> from Bio import SeqIO >>> my_snp = 4350 >>> record = SeqIO.read("NC_005816.gb", "genbank") >>> for feature in record.features: ... if my_snp in feature: ... print("%s %s" % (feature.type, feature.qualifiers.get("db_xref"))) ... source ['taxon:229193'] gene ['GeneID:2767712'] CDS ['GI:45478716', 'GeneID:2767712']
Note that gene and CDS features from GenBank or EMBL files defined with joins are the union of the exons – they do not cover any introns.
Save several matplotlib figures into a single PDF - Qiita
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages   # <-- added

pdf = PdfPages('filename.pdf')   # <-- added (open the pdf)
for ... :                 # make several figures
    plt.scatter(....)     # plot a figure
    pdf.savefig()         # <-- savefig now goes into the pdf
    plt.show()            # <-- show on screen
pdf.close()               # <-- don't forget this at the end
When a DataFrame df is plotted with df.plot.bar(), the x-axis tick labels come out rotated on their side.
After some trial and error, what I found is that
df.plot.bar(y='¥«¥é¥à̾')
displays them correctly, but
df.plot.bar()
does not. (The style without y= is convenient in that every column gets plotted: if df has a single column that is all you need, and with several columns they all appear. If you don't care about the labels, that may be good enough.)
Also, the trick of rotating via xticks did not work here; for example
plt.xticks(rotation=90)
had no effect.
In seaborn's heatmap the y axis gets cut off and annot can't be used - Qiita
Solved by upgrading seaborn. On older versions there is a workaround: ax.set_ylim(len(flights), 0)
seaborn.heatmap — seaborn 0.9.0 documentation
Specifying fmt: e.g. fmt='.2f' or fmt='d'.
How to choose a seaborn color palette - Qiita
Don't give up on fine-tuning seaborn's appearance - Qiita
You can pass things like annot_kws={'fontsize': 9, 'color': 'green'}.
crosstab() — researchpy 0.1.1 documentation
ReportLab - Content to PDF Solutions
The following looks usable.
# Example 1 of using a reportlab platypus Table:
# controlling where a reportlab Table is drawn
# https://qiita.com/ekzemplaro/items/09bd10b02ecbb35c0efa
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import mm
from reportlab.lib import colors
from reportlab.lib.styles import ParagraphStyle
from reportlab.pdfgen import canvas
from reportlab.platypus import Image, Paragraph, Table
from reportlab.pdfbase import pdfmetrics
#from reportlab.pdfbase.cidfonts import UnicodeCIDFont
from reportlab.pdfbase.ttfonts import TTFont
# ------------------------------------------------------------------
#fontname_g = "HeiseiKakuGo-W5"
#pdfmetrics.registerFont(UnicodeCIDFont(fontname_g))
pdfmetrics.registerFont(TTFont('IPAexGothic', 'ipaexg.ttf'))

cc = canvas.Canvas('example.pdf', pagesize=A4)
width, height = A4
cc.setFont("IPAexGothic", 16)
str_out = "こんにちは"
cc.drawString(100, 730, str_out)

data = [["テスト", 2, 3], ["日本語", 1, 3], [3, 2, 10]]
table = Table(data, colWidths=20*mm)
table.setStyle([("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
                ("ALIGN", (0, 0), (-1, -1), "CENTER"),
                ('INNERGRID', (0, 0), (-1, -1), 0.25, colors.black),
                ('BOX', (0, 0), (-1, -1), 0.25, colors.black),
                # ('FONT', (0, 0), (-1, -1), "HeiseiKakuGo-W5", 16),
                ('FONT', (0, 0), (-1, -1), "IPAexGothic", 16),
                ])
#
table.wrapOn(cc, width, height)
# table.drawOn(cc, 140*mm, 250*mm)
table.drawOn(cc, 75*mm, 225*mm)
table.drawOn(cc, 10*mm, 200*mm)
#
styles = getSampleStyleSheet()
my_style = styles["Normal"]
my_style.name = "bonlife"
#my_style.fontName = "HeiseiKakuGo-W5"
my_style.fontName = "IPAexGothic"
my_style.fontSize = 16
ptext = "これはサンプルです。"
pp = Paragraph(ptext, style=my_style)
pp.wrapOn(cc, 70*mm, 50*mm)    # size of 'textbox' for linebreaks etc.
pp.drawOn(cc, 50*mm, 190*mm)   # position of text / where to draw

cc.showPage()
cc.save()
print('complete')
And a simpler version:
# Example 2 of using a reportlab platypus Table:
# a sample drawing a table with reportlab's platypus; the point is that wrapOn must be called
# https://gist.github.com/bgnori/4452571
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import mm
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib import colors
from reportlab.platypus import Table

#pdfmetrics.registerFont(TTFont('IPA Gothic',
#                               '/usr/share/fonts/ipa-gothic/ipag.ttf'))
pdfmetrics.registerFont(TTFont('IPAexGothic', 'ipaexg.ttf'))

xmargin = 8.4*mm
ymargin = 8.8*mm
swidth = 48.3*mm
sheight = 25.4*mm

c = canvas.Canvas('example2.pdf', pagesize=A4)
#c.drawString(xmargin, ymargin, u"どや、pdfやで。reportlab!")
t = Table([['a', 'b'], ['1', '2']])
#t.setStyle([('TEXTCOLOR', (0,0), (1,0), colors.red)])   # the original sample does this
t.setStyle([('INNERGRID', (0, 0), (-1, -1), 0.25, colors.black),   # grid lines, borrowed from the sample above
            ('BOX', (0, 0), (-1, -1), 0.25, colors.black),
            ('FONT', (0, 0), (-1, -1), "IPAexGothic", 16),
            ])
t.wrapOn(c, 100*mm, 100*mm)
t.drawOn(c, 100*mm, 100*mm)
c.showPage()
c.save()
print('complete')
!pip install japanize-matplotlib
and then just
import japanize_matplotlib
is all it takes to fix the garbled Japanese labels in matplotlib plots.
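A minimal sketch (assuming japanize-matplotlib has been installed as above):

import matplotlib.pyplot as plt
import japanize_matplotlib  # noqa: F401  (importing it registers a Japanese font)

plt.plot([1, 2, 3])
plt.title('日本語タイトル')   # renders without tofu squares
plt.show()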
~~~~~~~~~~~~
"I want to widen the Jupyter Notebook cells! - Qiita" looks good:
pip install jupyterthemes
jt -cellw 95%
Or:
IPython/Jupyter Notebook enlarge/change cell width · GitHub
in custom.css:
.container {
    width: 99% !important;
}
div.cell.selected {
    border-left-width: 1px !important;
}
div.output_scroll {
    resize: vertical !important;
}
and a restart of Jupyter Notebook is apparently required.
#!/usr/bin/bash
samples=("Anc" "1_2-1" "2_2-1" "2_5-1" "2_5-1-7A" "1_2-2" "2_2-2" "2_6-2" "2_6-2-10E")
for f in "${samples[@]}"
do
    #nohup python ProcessGD.py $f > $f.out &
    echo $f
done
unzip -O sjis foo.zip
This converts the names of the files inside the archive from SJIS to UTF-8 when unzipping.