[[ノート/ノート]]~
訪問者 &counter(); 最終更新 &lastmod();~
*(2016-12-12) Python3で、MeCabで単語共起リストを作る(Python3を使って再訪) [#sbfb9522]
昔に作った、文中の単語共起(名詞のみ)のリストを作るプログラムを、python-3に書き直してみた。
python-3による主な変更点は、
-print文の仕様が変わったこと(print文がprint()関数になった)。
-辞書型に対するhas_key関数 「<辞書>.has_key(key)」 が使えなくなったのでそれを「<key> in <辞書>」 に変更したこと。
-mecabをpython-3対応バージョンにした(pip install mecab-python3)ので、すべてunicodeでシームレスに使えること。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import re
import sys

import MeCab
def hasKanji(s):
    """Return True if string s contains at least one "Kanji" character.

    A character counts as Kanji here when ``ord(c) & 0xff00`` (bits 8-15 of
    its code point) equals 0x3000 or lies between 0x4e00 and 0x9f00
    inclusive -- the same per-character test that allKanji() applies.
    An empty string yields False.
    """
    for c in s:
        block = ord(c) & 0xff00
        # The range test used to be written as 0x4e00 <= (block <= 0x9f00),
        # which compares 0x4e00 with a bool and is therefore always False.
        if block == 0x3000 or 0x4e00 <= block <= 0x9f00:
            return True
    return False
def allKanji(s):
    """Return True if every character of s passes the "Kanji" test.

    A character passes when ``ord(c) & 0xff00`` equals 0x3000 or lies
    between 0x4e00 and 0x9f00 inclusive.  An empty string yields True
    because there is no character that fails.
    """
    return all(
        (ord(ch) & 0xff00) == 0x3000
        or 0x4e00 <= (ord(ch) & 0xff00) <= 0x9f00
        for ch in s
    )
def allKomoji(s):
    """Return True if every character of s is lowercase (komoji).

    Each character is tested on its own with str.islower(), so digits,
    spaces and other uncased characters make the result False.  An empty
    string yields True.
    """
    for ch in s:
        if not ch.islower():
            return False
    return True
def printlist(list):
    """Print every entry of a dict as '( key value )' on a single line.

    list: the dict to print.  The parameter name shadows the builtin and is
        kept only so that existing callers are unaffected.

    The entries are written back to back in the dict's iteration order,
    followed by one newline.
    """
    # dict.iteritems() was removed in Python 3; items() is its replacement.
    for key, value in list.items():
        print('(', key, value, ')', end='')
    # A bare `print` is a no-op expression in Python 3; it must be called to
    # terminate the line.
    print()
def printkyoukilistsort(list):
    """Print the co-occurrence table, one pair per line, highest count first.

    list: dict whose keys are indexable pairs of strings and whose values
        are the counts to sort by (the name shadows the builtin; kept for
        compatibility).

    Each line has the form ' <first> , <second> , <count>'.  Entries with
    equal counts keep the dict's iteration order.
    """
    ranked = sorted(list.items(), key=lambda entry: entry[1], reverse=True)
    for pair, count in ranked:
        print(' ' + pair[0], ',', pair[1], ',', count)
def printlistsort(list):
    """Print a dict's entries on one line, sorted by value in descending order.

    list: the dict to print (the name shadows the builtin; kept for
        compatibility).

    Every entry is written as '( key value )' with no separator, and the
    line is ended with one newline.  Entries with equal values keep the
    dict's iteration order.
    """
    for key, value in sorted(list.items(), key=lambda x: x[1], reverse=True):
        print('(', key, value, ')', end='')
    # A bare `print` does nothing in Python 3; call it so the line built with
    # end='' above is actually terminated.
    print()
def addtotable(t, g):
    """Count one more occurrence of g in the dict t and return t.

    A key that is not yet present starts at 1.  The dict is modified in
    place; the same object is returned for convenience.
    """
    t[g] = t.get(g, 0) + 1
    return t
def genIgnore():
    """Return a new set of the surface forms that are ignored as words."""
    # 'よう' used to be spelled 'よ う' (with a stray space inside), which can
    # never equal a single token's surface, so that entry had no effect.
    return {
        'それ', 'これ', 'www', 'wwww', 'ー', 'の', 'ん', 'ーー', 'こと', 'よう',
        'もの', 'ところ', 'いや', 'そう', 'とき', 'こ', 'さ', 'む', 'ら', 'ごと',
        'そこ', 'どこ', 'はず', 'ため', 'ア', 'ッ',
    }
def nounlist(t):
    """Count the noun runs that MeCab finds in the text t.

    Consecutive nodes whose part of speech (first feature field) is '名詞',
    whose surface is not in the module-level `ignore` set and whose surface
    passes allKanji(), are concatenated into one word.  Any other node ends
    the current run, which is then counted.

    Returns a dict mapping each word to its number of occurrences.  The
    dict is empty when hasKanji(t) is false; MeCab is not invoked then.
    """
    table = {}
    if not hasKanji(t):
        return table

    # Build the tagger once and reuse it on later calls instead of creating
    # a new one for every sentence.
    mt = getattr(nounlist, '_tagger', None)
    if mt is None:
        mt = MeCab.Tagger()
        # NOTE(review): presumably the usual mecab-python3 workaround of
        # parsing an empty string once before parseToNode() -- confirm.
        mt.parse('')
        nounlist._tagger = mt

    go = ''  # the run of nouns accumulated so far
    m = mt.parseToNode(t)
    while m:
        hinshi = m.feature.split(',')[0]
        usable = (m.surface not in ignore) and allKanji(m.surface)
        if usable and hinshi == u'名詞':
            # Extend the current run.
            go = go + m.surface
        elif go != '':
            # An ignored, non-Kanji or non-noun node ends the run: count it.
            table = addtotable(table, go)
            go = ''
        m = m.next
    return table
def kyoukilist(nlist):
    """Build the co-occurrence table for the words counted in nlist.

    nlist: dict mapping a word to its count.

    Returns a dict with one entry per unordered pair of distinct words,
    keyed by the tuple (word_a, word_b) in nlist's iteration order; the
    value is the smaller of the two words' counts.  Fewer than two words
    give an empty dict.
    """
    import itertools
    return {
        (word_a, word_b): min(count_a, count_b)
        for (word_a, count_a), (word_b, count_b)
        in itertools.combinations(nlist.items(), 2)
    }
def appendlist(list1, list2):
    """Merge the counts of dict list2 into dict list1 and return list1.

    Values of keys present in both are added together; keys found only in
    list2 are copied over.  list1 is modified in place, list2 is not.
    """
    for key, count in list2.items():
        list1[key] = list1[key] + count if key in list1 else count
    return list1
### MAIN ###
# To take the input file from the command line instead of the fixed name:
#argvs = sys.argv
#if len(argvs) != 2:
#    print('Usage: %s filename' % argvs[0])
#    quit()
#filename = argvs[1]
filename='test.txt'
# NOTE(review): r is not used anywhere in the visible script.
r = re.compile(r'([^-]+)-*')
ignore = genIgnore()  # read as a global by nounlist()
table = {}   # word -> total count over all sentences
kyouki = {}  # (word, word) -> total co-occurrence count
# NOTE(review): the file is read with the platform default encoding --
# confirm whether encoding='utf-8' should be passed explicitly.
with open(filename) as f:
    lines = f.readlines()
## Join the lines into one string, then split it into sentences.
pieces = []
for s in lines:
    # Strip only the line terminator.  Slicing with [:-1] also removed the
    # last character of a final line that has no trailing newline.
    s = s.rstrip('\n')
    # A blank line is kept as a newline; other lines are joined directly.
    pieces.append(s if s != '' else '\n')
line = ''.join(pieces)
sentenses = line.split(u'。')
for t in sentenses:
    nlist = nounlist(t)
    # printlist(nlist)
    klist = kyoukilist(nlist)
    table = appendlist(table, nlist)
    kyouki = appendlist(kyouki, klist)
#printlist(table)
#printlistsort(table)
printkyoukilistsort(kyouki)