ノート/ノート
訪問者 131  最終更新 2016-12-12 (月) 13:16:26

(2016-12-12) Python3で、MeCabで単語共起リストを作る(Python3を使って再訪)

昔に作った、文中の単語共起(名詞のみ)のリストを作るプログラムを、python-3に書き直してみた。

python-3による主な変更点は、print文からprint()関数への書き換えと、dict.has_key()からin演算子への置き換えである。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import codecs
import MeCab

def hasKanji(s):
    """Return True if string s contains at least one accepted Japanese character.

    A character is accepted when the high byte of its code point is 0x30
    (U+3000-U+30FF: CJK punctuation, hiragana, katakana) or lies in
    0x4E-0x9F (U+4E00-U+9FFF, the CJK unified ideographs).  This is the
    same test that allKanji() applies to every character.

    Returns False for an empty string.
    """
    for c in s:
        page = ord(c) & 0xff00
        # Chained comparison; the earlier form compared 0x4e00 against a
        # boolean, so the ideograph range could never match.
        if page == 0x3000 or 0x4e00 <= page <= 0x9f00:
            return True
    return False

def allKanji(s):
    """Return True if every character of s is an accepted Japanese character.

    A character is accepted when the high byte of its code point is 0x30
    (U+3000-U+30FF) or lies in 0x4E-0x9F (U+4E00-U+9FFF).  An empty string
    yields True because there is no character that fails the test.
    """
    def _accepted(ch):
        page = ord(ch) & 0xff00
        return page == 0x3000 or (0x4e00 <= page and page <= 0x9f00)

    return all(_accepted(ch) for ch in s)

def allKomoji(s):
    """Return True if every character of s is lowercase ("komoji").

    Each character is checked on its own with str.islower(), so digits,
    punctuation and caseless characters make the result False.  An empty
    string yields True.
    """
    for ch in s:
        if not ch.islower():
            return False
    return True

def printlist(list):
    """Print every (key, value) entry of a dict on one line, then a newline.

    Entries are written as "( key value )" in the dict's iteration order
    with no separator between them.

    list -- a dict (the parameter name is kept for existing callers).
    """
    # dict.iteritems() was removed in Python 3; items() is the replacement.
    for key, value in list.items():
        print('(', key, value, ')', end='')
    # A bare `print` is a no-op expression in Python 3; call it to end the line.
    print()

def printkyoukilistsort(list):
    """Print a co-occurrence table sorted by descending value, one pair per line.

    Each line has the form " word1 , word2 , value".  A blank line is
    written after the last entry.

    list -- a dict mapping (word1, word2) tuples to a numeric value
            (the parameter name is kept for existing callers).
    """
    ranked = sorted(list.items(), key=lambda x: x[1], reverse=True)
    for key, value in ranked:
        print(' ' + key[0], ',', key[1], ',', value)
    # A bare `print` is a no-op expression in Python 3; call it to emit
    # the trailing blank line.
    print()

def printlistsort(list):
    """Print a dict's entries on one line, sorted by descending value.

    Entries are written as "( key value )" with no separator between
    them, followed by a single newline.

    list -- a dict with mutually comparable values (the parameter name is
            kept for existing callers).
    """
    ranked = sorted(list.items(), key=lambda x: x[1], reverse=True)
    for key, value in ranked:
        print('(', key, value, ')', end='')
    # A bare `print` is a no-op expression in Python 3; call it to end the line.
    print()

def addtotable(t, g):
    """Increment the count stored for key g in dict t and return t.

    A key that is not present yet starts at 1.  The dict is modified in
    place; the same object is returned.
    """
    t[g] = t.get(g, 0) + 1
    return t

def genIgnore():
    """Return the set of surface forms that must never become part of a noun.

    The entries are compared against single token surfaces, so none of
    them may contain whitespace.  A fresh set is built on every call.
    """
    return {
        'それ', 'これ', 'www', 'wwww', 'ー', 'の', 'ん', 'ーー', 'こと',
        # Previously written with an embedded space, which could never
        # equal a token surface.
        'よう',
        'もの', 'ところ', 'いや', 'そう', 'とき', 'こ', 'さ', 'む', 'ら',
        'ごと', 'そこ', 'どこ', 'はず', 'ため', 'ア', 'ッ',
    }

def nounlist(t):
    """Count the (compound) nouns found in text t.

    The text is tokenized with MeCab.  Consecutive tokens are joined into
    one compound when each of them is tagged as a noun ('名詞'), passes
    allKanji() and is not in the module-level `ignore` set.  Any other
    token ends the current compound, which is then counted.

    t -- the text to analyse (one sentence in the calling script).

    Returns a dict mapping each compound to the number of times it
    occurred.  The dict is empty when hasKanji(t) is false; MeCab is not
    invoked in that case.
    """
    table = {}
    if not hasKanji(t):
        return table

    mt = MeCab.Tagger()
    # NOTE(review): presumably the usual mecab-python3 workaround that keeps
    # node.surface valid during parseToNode() -- confirm before removing.
    mt.parse('')
    node = mt.parseToNode(t)

    go = ''  # compound noun being assembled
    while node:
        surface = node.surface
        # The first feature field is the part of speech.
        hinshi = node.feature.split(',')[0]

        if surface not in ignore and allKanji(surface) and hinshi == u'名詞':
            # Still inside a noun run: extend the compound.
            go = go + surface
        elif go != '':
            # Run ended (ignored word, non-Japanese token or non-noun).
            table[go] = table.get(go, 0) + 1
            go = ''
        node = node.next

    # Defensive flush so a trailing compound is counted even if the node
    # list does not end with a terminating non-noun node.
    if go != '':
        table[go] = table.get(go, 0) + 1
    return table

def kyoukilist(nlist):
    """Build the co-occurrence table for the nouns of one sentence.

    nlist -- dict mapping noun -> count.

    Returns a dict with one entry per unordered pair of distinct nouns,
    keyed by (noun_a, noun_b) in the dict's iteration order.  The value
    is the smaller of the two nouns' counts.
    """
    import itertools

    entries = nlist.items()
    return {
        (left_word, right_word): min(left_count, right_count)
        for (left_word, left_count), (right_word, right_count)
        in itertools.combinations(entries, 2)
    }

def appendlist(list1, list2):
    """Merge the counts of dict list2 into dict list1 and return list1.

    Values of keys present in both dicts are added; keys only present in
    list2 are copied over.  list1 is modified in place, list2 is left
    untouched.
    """
    for key, amount in list2.items():
        list1[key] = list1[key] + amount if key in list1 else amount
    return list1

### MAIN ###
#argvs = sys.argv
#argc = len(argvs)
#if (argc != 2):
#   print 'Usage: %s filename' % argvs[0]
#   quit()
#filename = argvs[1]
# The input file name is fixed; the file is looked up in the working directory.
filename = 'test.txt'

# Stop-word set.  nounlist() reads this module-level name.
ignore = genIgnore()

table = {}    # noun -> count accumulated over all sentences
kyouki = {}   # (noun, noun) -> co-occurrence weight accumulated over all sentences

# `with` closes the file even when reading fails.
with open(filename) as f:
    lines = f.readlines()

## Join the lines into one string, then split it into sentences.

pieces = []
for s in lines:
    # Strip only the line terminator: slicing off the last character would
    # eat real text when the final line has no trailing newline.
    body = s.rstrip('\n')
    # An empty line is kept as a newline character inside the joined text.
    pieces.append(body if body != '' else '\n')
line = ''.join(pieces)

# Sentences are delimited by the Japanese full stop.
sentenses = line.split(u'。')

for t in sentenses:
    nlist = nounlist(t)
    # printlist(nlist)
    klist = kyoukilist(nlist)
    table = appendlist(table, nlist)
    kyouki = appendlist(kyouki, klist)

#printlist(table)
#printlistsort(table)
printkyoukilistsort(kyouki)

トップ   編集 凍結 差分 バックアップ 添付 複製 名前変更 リロード   新規 一覧 単語検索 最終更新   ヘルプ   最終更新のRSS
Last-modified: 2016-12-12 (月) 13:16:26 (222d)