![]() |
ノート/twitterで形容詞をリストしてみた https://pepper.is.sci.toho-u.ac.jp:443/pepper/index.php?%A5%CE%A1%BC%A5%C8%2Ftwitter%A4%C7%B7%C1%CD%C6%BB%EC%A4%F2%A5%EA%A5%B9%A5%C8%A4%B7%A4%C6%A4%DF%A4%BF |
![]() |
訪問者数 888 最終更新 2013-06-24 (月) 16:49:55
数えた結果 (過去120時間、MeCabで自立語としての形容詞に分類されたもの)
2013-06-24_twitter過去120時間のツイートの形容詞上位10000.txt
プログラム
#!/usr/bin/env python # -*- coding: utf-8 -*- import MySQLdb import codecs import sys import time import datetime import MeCab d = datetime.datetime.today() con = MySQLdb.connect(db="xxxx", host="localhost", port=3306, user="xxxx", passwd="xxxx") mt = MeCab.Tagger() c = 0; table = {} # tableは{word:count, ...}の辞書 for i in range(-120,1): t = (d + datetime.timedelta(hours=i)).strftime("%y%m%d%H") tablename = "tw" + t s = "SHOW TABLES FROM twitter like '" + tablename + "'" cur = con.cursor() cur.execute(s.encode('utf_8')) cnt = cur.rowcount if (cnt==0): cur.close() print "No table" break cur.close() s = "SELECT text FROM " + tablename cur = con.cursor() cur.execute(s.encode('utf_8')) r = cur.fetchone() while (r != None): # print "---" + r[0].encode('utf_8') try: m = mt.parseToNode(r[0].encode('utf_8')) except UnicodeDecodeError: print "e" continue; while m: #print "m.surface\t", m.surface, "\t", m.feature mm = m.feature.split(',') hinshi = mm[0] hinshi2 = mm[1] hinshi3 = mm[2] #print "hinshi: ", hinshi if hinshi==u'形容詞' and hinshi2==u'自立': #print hinshi, hinshi2, m.surface, m.feature if table.has_key(m.surface): table[m.surface] = table[m.surface]+1 else: table.update({m.surface:1}) m = m.next # print "EOS" r = cur.fetchone() c = c+1; cur.close() con.close() print u"======== サンプル数", c c = 0 for k, v in sorted(table.items(), key=lambda x:x[1], reverse=True): if c<=10000: print v, "\t", k c = c+1