[[ノート/テキストマイニング]]~
&counter();   &lastmod();~

**Python/NLTKからWordNet 3.0を使ってみる - 2 [#s5d5d2fa]

 >>> from nltk.corpus import wordnet as wn
 >>> wn.synsets('馬', lang='jpn')
 [Synset('horse.n.01')]
 >>> wn.synsets('馬', lang='jpn')[0].hypernyms()
 [Synset('equine.n.01')]
 >>> wn.synsets('馬', lang='jpn')[0].hyponyms()
 [Synset('bay.n.07'), Synset('chestnut.n.06'), Synset('eohippus.n.01'), Synset('gee-gee.n.01'), 
 Synset('hack.n.06'), Synset('hack.n.07'), Synset('harness_horse.n.01'), Synset('liver_chestnut.n.01'), 
 Synset('male_horse.n.01'), Synset('mare.n.01'), Synset('mesohippus.n.01'), Synset('pacer.n.02'), 
 Synset('palomino.n.01'), Synset('pinto.n.01'), Synset('polo_pony.n.01'), Synset('pony.n.01'), 
 Synset('pony.n.05'), Synset('post_horse.n.01'), Synset('protohippus.n.01'), Synset('racehorse.n.01'), 
 Synset('roan.n.02'), Synset('saddle_horse.n.01'), Synset('sorrel.n.05'), Synset('stablemate.n.01'), 
 Synset('stalking-horse.n.04'), Synset('steeplechaser.n.01'), Synset('stepper.n.03'), Synset('wild_horse.n.01'), Synset('workhorse.n.02')]

 >>> wn.synsets('リンゴ', lang='jpn')[0].hyponyms()
 [Synset('cooking_apple.n.01'), Synset('crab_apple.n.03'), Synset('eating_apple.n.01')]
 >>> wn.synsets('リンゴ', lang='jpn')[0].hypernyms()
 [Synset('edible_fruit.n.01'), Synset('pome.n.01')]
 >>> wn.synsets('ブドウ', lang='jpn')[0].hypernyms()
 [Synset('edible_fruit.n.01')]

 >>> wn.synset('equine.n.01')
 Synset('equine.n.01')
 >>> wn.synset('horse.n.01').lemma_names()
 ['horse', 'Equus_caballus']
 >>> wn.synset('equine.n.01').lemma_names()
 ['equine', 'equid']
 >>> wn.synset('equine.n.01').definition()
 'hoofed mammals having slender legs and a flat coat with a narrow mane along the back of the neck'
 >>> wn.synset('equid.n.01').definition()
 'hoofed mammals having slender legs and a flat coat with a narrow mane along the back of the neck'
 >>> wn.synset('equid.n.01').lemmas('jpn')
 [Lemma('equine.n.01.ウマ')]
 >>> wn.synset('horse.n.01').lemmas('jpn')
 [Lemma('horse.n.01.ウマ'), Lemma('horse.n.01.牡馬'), Lemma('horse.n.01.雄馬'), Lemma('horse.n.01.馬'), Lemma('horse.n.01.馬匹')]
 >>> wn.synset('equine.n.01').hypernyms()
 [Synset('odd-toed_ungulate.n.01')]
 >>> wn.synset('odd-toed_ungulate.n.01').lemmas('jpn')
 [Lemma('odd-toed_ungulate.n.01.ウマ目')]
 >>> wn.synset('odd-toed_ungulate.n.01').hypernyms()
 [Synset('ungulate.n.01')]
 >>> wn.synset('odd-toed_ungulate.n.01').hypernyms()
 [Synset('ungulate.n.01')]
 >>> wn.synset('ungulate.n.01').lemmas('jpn')
 [Lemma('ungulate.n.01.有蹄類')]
 >>> wn.synset('ungulate.n.01').hypernyms()
 [Synset('placental.n.01')]
 >>> wn.synset('placental.n.01').lemmas('jpn')
 [Lemma('placental.n.01.有胎盤哺乳類'), Lemma('placental.n.01.有胎盤類')]


*** 女王 [#j24fb9ba]
「女王」には5つ意味がある。
 >>> wn.synsets('女王', lang='jpn')
 [Synset('queen.n.01'), Synset('queen.n.08'), Synset('queen.n.07'), Synset('queen.n.02'), Synset('queen.n.04')]
定義は
 >>> [u.definition() for u in wn.synsets('女王', lang='jpn')]
 ['the only fertile female in a colony of social insects such as bees and ants and termites; 
    its function is to lay eggs', 
 '(chess) the most powerful piece', 
 'one of four face cards in a deck bearing a picture of a queen', 
 'a female sovereign ruler', 
 'something personified as a woman who is considered the best or most important of her kind']
インデックス3(0始まりなので4番目、'a female sovereign ruler' = queen.n.02)のlemmaは
 >>> wn.synsets('女王', lang='jpn')[3].lemmas('jpn')
 [Lemma('queen.n.02.クィーン'), Lemma('queen.n.02.女君主'), Lemma('queen.n.02.女帝'), 
 Lemma('queen.n.02.女王'), Lemma('queen.n.02.女皇'), Lemma('queen.n.02.統治女王')]

上位概念は
 >>> wn.synsets('女王', lang='jpn')[3].hypernyms()
 [Synset('female_aristocrat.n.01')]
 >>> [u.definition() for u in wn.synsets('女王', lang='jpn')[3].hypernyms()]
 ['a woman who is an aristocrat']
 >>> [u.lemmas() for u in wn.synsets('女王', lang='jpn')[3].hypernyms()]
 [[Lemma('female_aristocrat.n.01.female_aristocrat')]]
 >>> [u.lemmas('jpn') for u in wn.synsets('女王', lang='jpn')[3].hypernyms()]
 [[]]   <--- 日本語では定義されていない

***日 [#i0a0f04a]
***「日」の多義性をどう取り出すか
 >>> wn.synsets('日', lang='jpn')
 [Synset('japan.n.02'), Synset('sun.n.01'), Synset('day.n.02'), Synset('day.n.05'),
 Synset('day.n.01'), Synset('day.n.03'), Synset('date.n.01'), Synset('date.n.07'),
 Synset('date.n.06'), Synset('sunday.n.01'), Synset('date.n.04'), Synset('day.n.07')]

 >>> [u.definition() for u in wn.synsets('日', lang='jpn')]
+ 'a constitutional monarchy occupying the Japanese Archipelago; a world leader in electronics and automobile manufacture and ship building', 
+ 'the star that is the source of light and heat for the planets in the solar system',
+ 'some point or period in time', 
+ 'the recurring hours when you are not sleeping (especially those when you are working)', 
+ 'time for Earth to make a complete rotation on its axis',
+  'a day assigned to a particular purpose or observance', 
+ 'the specified day of the month', 
+ 'a particular day specified as the time something happens', 
+ 'the particular day, month, or year (usually according to the Gregorian calendar) that an event occurred', 
+ 'first day of the week; observed as a day of rest and worship by most Christians', 
+ 'a particular but unspecified point in time', 
+ 'the period of time taken by a particular planet (e.g. Mars) to make a complete rotation on its axis']

 >>> [u.lemmas('jpn') for u in wn.synsets('日', lang='jpn')]
+ [Lemma('japan.n.02.日'), Lemma('japan.n.02.日本')],
+ [Lemma('sun.n.01.ソレイユ'), Lemma('sun.n.01.天道'), Lemma('sun.n.01.お天道様'), Lemma('sun.n.01.天道様'), 
Lemma('sun.n.01.太陽'), Lemma('sun.n.01.御天道様'), Lemma('sun.n.01.御日様'), 
Lemma('sun.n.01.お日さま'), Lemma('sun.n.01.日'), Lemma('sun.n.01.日天'), 
Lemma('sun.n.01.日天子'), Lemma('sun.n.01.お日様'), Lemma('sun.n.01.日輪'), 
Lemma('sun.n.01.火輪')],
+ [Lemma('day.n.02.日')], 
+ [Lemma('day.n.05.日')], 
+ [Lemma('day.n.01.一日'), Lemma('day.n.01.太陽日'), Lemma('day.n.01.平均太陽日'), 
Lemma('day.n.01.日')],
+ [Lemma('day.n.03.ディ'), Lemma('day.n.03.日')], 
+ [Lemma('date.n.01.日'), Lemma('date.n.01.日づけ'), Lemma('date.n.01.日付'), 
Lemma('date.n.01.日付け')],
+ [Lemma('date.n.07.日')],
+ [Lemma('date.n.06.デート'), Lemma('date.n.06.年代'), Lemma('date.n.06.年月'), Lemma('date.n.06.年月日'), Lemma('date.n.06.日'), Lemma('date.n.06.日づけ'), Lemma('date.n.06.日付'), Lemma('date.n.06.日付け'), Lemma('date.n.06.日取り'), Lemma('date.n.06.日日'), Lemma('date.n.06.日時'), Lemma('date.n.06.時日'), Lemma('date.n.06.月日'), Lemma('date.n.06.期日'), Lemma('date.n.06.開催日')], 
+ [Lemma('sunday.n.01.主日'), Lemma('sunday.n.01.日'), Lemma('sunday.n.01.日曜'), Lemma('sunday.n.01.日曜日')], 
+ [Lemma('date.n.04.日')], 
+ [Lemma('day.n.07.ディ'), Lemma('day.n.07.日'), 
Lemma('day.n.07.昼'), Lemma('day.n.07.昼間')]]

表層だけ取り出すなら
 >>> [[v.name() for v in u.lemmas('jpn')] for u in wn.synsets('日', lang='jpn')]
 [['日', '日本'], 
 ['ソレイユ', '天道', 'お天道様', '天道様', '太陽', '御天道様', '御日様', 'お日さま', '日', '日天', '日天子', 'お日様', '日輪', '火輪'], 
 ['日'], 
 ['日'], 
 ['一日', '太陽日', '平均太陽日', '日'], 
 ['ディ', '日'], 
 ['日', '日づけ', '日付', '日付け'], 
 ['日'], 
 ['デート', '年代', '年月', '年月日', '日', '日づけ', '日付', '日付け', '日取り', '日日', '日時', '時日', '月日', '期日', '開催日'], 
 ['主日', '日', '日曜', '日曜日'], 
 ['日'], 
 ['ディ', '日', '昼', '昼間']]
上のリストの各行(各synsetに属する表層語の集合)に対して、w2vの空間上での主成分を拾ってみる。もしくは代表ベクトルを作る。平均を取るとしたら、各語の出現頻度による加重平均が必要かもしれないが、出現頻度は分からないだろう。

但し、1つしか単語のない場合(「日」だけとか)は対応不可なので、今はあきらめる。

これで、w2vベクトル空間内でのそれぞれのポイントができるのか?

そもそも、synsetの「概念」とここのリストとはどう関係するか? 
 語→意味(synset)ブランチ→そのsynsetの語部分 
で作った。つまりブランチ1で言えば
 ブランチ1の 'japan.n.02.日' → 語の部分'日'
ブランチ1のlemmaは 'japan.n.02.日' と 'japan.n.02.日本' なので、両方とも synset 'japan.n.02' に属する(これが日本を表す概念であり、その表層が '日' と '日本' である)。

本当は、各概念(たとえば japan.n.02)ごとに、それを代表する表層語が書いてあるといいのだが。

この、「代表する語」のw2vベクトルを、このブランチのベクトルとする、というのも考えられるだろう。

今度は、japan.n.02の代表を「日本」としたとき、日本が表す概念はほかにないのか?
 >>> wn.synsets('日本',lang='jpn')
 [Synset('japan.n.02')]
 >>> wn.synset('japan.n.02').definition()
 'a constitutional monarchy occupying the Japanese Archipelago; a world leader in electronics 
 and automobile manufacture and ship building'
ちなみに
 >>> wn.synset('japan.n.01').definition()
 'a string of more than 3,000 islands to the east of Asia extending 1,300 miles between 
 the Sea of Japan and the western Pacific Ocean'
で、更に見ると、
 >>> wn.synset('japan.n.02').lemmas()
 [Lemma('japan.n.02.Japan'), Lemma('japan.n.02.Nippon'), 
 Lemma('japan.n.02.Nihon')]
 >>> wn.synset('japan.n.01').lemmas()
 [Lemma('japan.n.01.Japan'), Lemma('japan.n.01.Japanese_Islands'),
 Lemma('japan.n.01.Japanese_Archipelago')]
となっている。

トップ   編集 差分 バックアップ 添付 複製 名前変更 リロード   新規 一覧 単語検索 最終更新   ヘルプ   最終更新のRSS