![]() |
ノート/mecabで共起リスト作成再訪(2016-12-12) http://pepper.is.sci.toho-u.ac.jp/pepper/index.php?%A5%CE%A1%BC%A5%C8%2Fmecab%A4%C7%B6%A6%B5%AF%A5%EA%A5%B9%A5%C8%BA%EE%C0%AE%BA%C6%CB%AC%282016-12-12%29 |
![]() |
ノート/ノート
訪問者 279 最終更新 2016-12-12 (月) 13:16:26
昔に作った、文中の単語共起(名詞のみ)のリストを作るプログラムを、python-3に書き直してみた。
python-3による主な変更点は、print文をprint()関数に書き換えたこと、辞書のhas_key()を in 演算子に置き換えたこと、など。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Build a noun co-occurrence list from Japanese text using MeCab.

The input text is split into sentences at the full stop character.  For each
sentence the nouns (runs of adjacent noun tokens are joined into one compound)
are counted, every unordered pair of distinct nouns in the sentence is
recorded as a co-occurrence, and the accumulated pairs are printed sorted by
descending weight.
"""
import codecs
import itertools
import sys

try:
    import MeCab
except ImportError:
    # Lets the pure helpers be imported without MeCab; nounlist() raises a
    # clear error when it actually needs the tagger.
    MeCab = None

DEFAULT_FILENAME = 'test.txt'

_tagger = None


def _is_japanese_char(c):
    """Return True if *c* is in U+3000-U+30FF or U+4E00-U+9FFF.

    Only the high byte of the code point is examined, so the first range
    covers CJK punctuation, hiragana and katakana as well, and the second
    covers the CJK unified ideographs.
    """
    block = ord(c) & 0xff00
    return block == 0x3000 or 0x4e00 <= block <= 0x9f00


def hasKanji(s):
    """Return True if string *s* contains at least one Japanese character.

    "Japanese character" means kana, CJK punctuation or kanji, as defined by
    the code point ranges tested in _is_japanese_char().
    """
    return any(_is_japanese_char(c) for c in s)


def allKanji(s):
    """Return True if every character of *s* is a Japanese character.

    Uses the same character test as hasKanji().  An empty string yields True.
    """
    return all(_is_japanese_char(c) for c in s)


def allKomoji(s):
    """Return True if string *s* consists only of lower-case characters."""
    return all(i.islower() for i in s)


def printlist(list):  # parameter name kept for backward compatibility
    """Print the (key, value) pairs of a dict on a single line."""
    for key, value in list.items():
        print('(', key, value, ')', end='')
    print()  # terminate the line built with end=''


def printkyoukilistsort(list):  # parameter name kept for backward compatibility
    """Print co-occurrence pairs, one per line, by descending weight.

    *list* maps a (word, word) tuple to its weight.
    """
    for key, value in sorted(list.items(), key=lambda x: x[1], reverse=True):
        print(' ' + key[0], ',', key[1], ',', value)


def printlistsort(list):  # parameter name kept for backward compatibility
    """Print the (key, value) pairs of a dict on one line, by descending value."""
    for key, value in sorted(list.items(), key=lambda x: x[1], reverse=True):
        print('(', key, value, ')', end='')
    print()  # terminate the line built with end=''


def addtotable(t, g):
    """Increment the count of key *g* in dict *t* (in place) and return *t*."""
    t[g] = t.get(g, 0) + 1
    return t


def genIgnore():
    """Return the set of words that are never counted as nouns."""
    # NOTE(review): the original list contained 'よ う' with an embedded
    # space, which can never equal a single token surface; it is assumed to
    # be a typo for 'よう' -- confirm.
    return {
        'それ', 'これ', 'www', 'wwww', 'ー', 'の', 'ん', 'ーー', 'こと',
        'よう', 'もの', 'ところ', 'いや', 'そう', 'とき', 'こ', 'さ', 'む',
        'ら', 'ごと', 'そこ', 'どこ', 'はず', 'ため', 'ア', 'ッ',
    }


# Word set consulted by nounlist().
ignore = genIgnore()


def _get_tagger():
    """Return the shared MeCab tagger, creating it on first use."""
    global _tagger
    if _tagger is None:
        if MeCab is None:
            raise ImportError('the MeCab Python binding is required by nounlist()')
        _tagger = MeCab.Tagger()
        # Kept from the original, which parsed an empty string right after
        # creating the tagger -- presumably the usual workaround for broken
        # node surfaces in the Python 3 binding; verify before removing.
        _tagger.parse('')
    return _tagger


def nounlist(t):
    """Count the nouns found in sentence *t*.

    Consecutive noun tokens are concatenated into a single compound noun.  A
    token that is in the ignore set, is not made up entirely of Japanese
    characters, or is not a noun ends the current compound.

    Returns a dict mapping each (compound) noun to its number of occurrences.
    The dict is empty when *t* contains no Japanese character at all.
    """
    table = {}
    if not hasKanji(t):
        return table
    node = _get_tagger().parseToNode(t)
    compound = ''
    while node:
        surface = node.surface
        hinshi = node.feature.split(',')[0]  # first feature field: part of speech
        if surface not in ignore and allKanji(surface) and hinshi == '名詞':
            compound += surface
        elif compound:
            addtotable(table, compound)
            compound = ''
        node = node.next
    if compound:
        # Defensive: flush a compound still pending when the node list ends.
        addtotable(table, compound)
    return table


def kyoukilist(nlist):
    """Return the co-occurrence weights for the nouns of one sentence.

    *nlist* maps noun -> count.  The result maps every unordered pair of
    distinct nouns (in the dict's iteration order) to the smaller of the two
    counts.
    """
    return {
        (word_a, word_b): min(count_a, count_b)
        for (word_a, count_a), (word_b, count_b)
        in itertools.combinations(nlist.items(), 2)
    }


def appendlist(list1, list2):
    """Add the counts of dict *list2* into dict *list1* (in place); return *list1*."""
    for key, count in list2.items():
        list1[key] = list1.get(key, 0) + count
    return list1


def _split_sentences(lines):
    """Join *lines* into one text and split it into sentences at '。'.

    Line breaks inside a paragraph are dropped; an empty line is kept as a
    single newline.
    """
    # rstrip('\n') instead of slicing off the last character, so a final line
    # without a trailing newline keeps all of its text.
    text = ''.join(line.rstrip('\n') or '\n' for line in lines)
    return text.split('。')


def main(argv=None):
    """Read the input file and print its noun co-occurrence list.

    *argv* is the argument list without the program name (defaults to
    sys.argv[1:]).  Its first element, if present, names the input file;
    otherwise DEFAULT_FILENAME is read.
    """
    args = sys.argv[1:] if argv is None else argv
    filename = args[0] if args else DEFAULT_FILENAME
    # The file is opened with the platform default encoding, as before.
    with open(filename) as f:
        sentences = _split_sentences(f)
    kyouki = {}
    for sentence in sentences:
        appendlist(kyouki, kyoukilist(nounlist(sentence)))
    printkyoukilistsort(kyouki)


if __name__ == '__main__':
    main()