[
トップ
] [
新規
|
一覧
|
単語検索
|
最終更新
|
ヘルプ
]
開始行:
[[ノート/テキストマイニング]]~
&counter(); &lastmod();~
**Python/NLTKからWordNet 3.0を使ってみる - 2 [#s5d5d2fa]
>>> from nltk.corpus import wordnet as wn
>>> wn.synsets('馬', lang='jpn')
[Synset('horse.n.01')]
>>> wn.synsets('馬', lang='jpn')[0].hypernyms()
[Synset('equine.n.01')]
>>> wn.synsets('馬', lang='jpn')[0].hyponyms()
[Synset('bay.n.07'), Synset('chestnut.n.06'), Synset('eohippus.n.01'), Synset('gee-gee.n.01'),
Synset('hack.n.06'), Synset('hack.n.07'), Synset('harness_horse.n.01'), Synset('liver_chestnut.n.01'),
Synset('male_horse.n.01'), Synset('mare.n.01'), Synset('mesohippus.n.01'), Synset('pacer.n.02'),
Synset('palomino.n.01'), Synset('pinto.n.01'), Synset('polo_pony.n.01'), Synset('pony.n.01'),
Synset('pony.n.05'), Synset('post_horse.n.01'), Synset('protohippus.n.01'), Synset('racehorse.n.01'),
Synset('roan.n.02'), Synset('saddle_horse.n.01'), Synset('sorrel.n.05'), Synset('stablemate.n.01'),
Synset('stalking-horse.n.04'), Synset('steeplechaser.n.01'), Synset('stepper.n.03'), Synset('wild_horse.n.01'), Synset('workhorse.n.02')]
>>> wn.synsets('リンゴ', lang='jpn')[0].hyponyms()
[Synset('cooking_apple.n.01'), Synset('crab_apple.n.03'), Synset('eating_apple.n.01')]
>>> wn.synsets('リンゴ', lang='jpn')[0].hypernyms()
[Synset('edible_fruit.n.01'), Synset('pome.n.01')]
>>> wn.synsets('ブドウ', lang='jpn')[0].hypernyms()
[Synset('edible_fruit.n.01')]
>>> wn.synset('equine.n.01')
Synset('equine.n.01')
>>> wn.synset('horse.n.01').lemma_names()
['horse', 'Equus_caballus']
>>> wn.synset('equine.n.01').lemma_names()
['equine', 'equid']
>>> wn.synset('equine.n.01').definition()
'hoofed mammals having slender legs and a flat coat with a narrow mane along the back of the neck'
>>> wn.synset('equid.n.01').definition()
'hoofed mammals having slender legs and a flat coat with a narrow mane along the back of the neck'
>>> wn.synset('equid.n.01').lemmas('jpn')
[Lemma('equine.n.01.ウマ')]
>>> wn.synset('horse.n.01').lemmas('jpn')
[Lemma('horse.n.01.ウマ'), Lemma('horse.n.01.牡馬'), Lemma('horse.n.01.雄馬'), Lemma('horse.n.01.馬'), Lemma('horse.n.01.馬匹')]
>>> wn.synset('equine.n.01').hypernyms()
[Synset('odd-toed_ungulate.n.01')]
>>> wn.synset('odd-toed_ungulate.n.01').lemmas('jpn')
[Lemma('odd-toed_ungulate.n.01.ウマ目')]
>>> wn.synset('odd-toed_ungulate.n.01').hypernyms()
[Synset('ungulate.n.01')]
>>> wn.synset('odd-toed_ungulate.n.01').hypernyms()
[Synset('ungulate.n.01')]
>>> wn.synset('ungulate.n.01').lemmas('jpn')
[Lemma('ungulate.n.01.有蹄類')]
>>> wn.synset('ungulate.n.01').hypernyms()
[Synset('placental.n.01')]
>>> wn.synset('placental.n.01').lemmas('jpn')
[Lemma('placental.n.01.有胎盤哺乳類'), Lemma('placental.n.01.有胎盤類')]
*** 女王 [#j24fb9ba]
「女王」には5つ意味がある。
>>> wn.synsets('女王', lang='jpn')
[Synset('queen.n.01'), Synset('queen.n.08'), Synset('queen.n.07'), Synset('queen.n.02'), Synset('queen.n.04')]
定義は
>>> [u.definition() for u in wn.synsets('女王', lang='jpn')]
['the only fertile female in a colony of social insects such as bees and ants and termites;
its function is to lay eggs',
'(chess) the most powerful piece',
'one of four face cards in a deck bearing a picture of a queen',
'a female sovereign ruler',
'something personified as a woman who is considered the best or most important of her kind']
3番(インデックス3、つまり4番目の queen.n.02)のlemmaは
>>> wn.synsets('女王', lang='jpn')[3].lemmas('jpn')
[Lemma('queen.n.02.クィーン'), Lemma('queen.n.02.女君主'), Lemma('queen.n.02.女帝'),
Lemma('queen.n.02.女王'), Lemma('queen.n.02.女皇'), Lemma('queen.n.02.統治女王')]
上位概念は
>>> wn.synsets('女王', lang='jpn')[3].hypernyms()
[Synset('female_aristocrat.n.01')]
>>> [u.definition() for u in wn.synsets('女王', lang='jpn')[3].hypernyms()]
['a woman who is an aristocrat']
>>> [u.lemmas() for u in wn.synsets('女王', lang='jpn')[3].hypernyms()]
[[Lemma('female_aristocrat.n.01.female_aristocrat')]]
>>> [u.lemmas('jpn') for u in wn.synsets('女王', lang='jpn')[3].hypernyms()]
[[]] <--- 日本語では定義されていない
***「日」の多義性をどう取り出すか [#i0a0f04a]
>>> wn.synsets('日', lang='jpn')
[Synset('japan.n.02'), Synset('sun.n.01'), Synset('day.n.02'), Synset('day.n.05'),
Synset('day.n.01'), Synset('day.n.03'), Synset('date.n.01'), Synset('date.n.07'),
Synset('date.n.06'), Synset('sunday.n.01'), Synset('date.n.04'), Synset('day.n.07')]
>>> [u.definition() for u in wn.synsets('日', lang='jpn')]
+ 'a constitutional monarchy occupying the Japanese Archipelago; a world leader in electronics and automobile manufacture and ship building',
+ 'the star that is the source of light and heat for the planets in the solar system',
+ 'some point or period in time',
+ 'the recurring hours when you are not sleeping (especially those when you are working)',
+ 'time for Earth to make a complete rotation on its axis',
+ 'a day assigned to a particular purpose or observance',
+ 'the specified day of the month',
+ 'a particular day specified as the time something happens',
+ 'the particular day, month, or year (usually according to the Gregorian calendar) that an event occurred',
+ 'first day of the week; observed as a day of rest and worship by most Christians',
+ 'a particular but unspecified point in time',
+ 'the period of time taken by a particular planet (e.g. Mars) to make a complete rotation on its axis']
>>> [u.lemmas('jpn') for u in wn.synsets('日', lang='jpn')]
+ [Lemma('japan.n.02.日'), Lemma('japan.n.02.日本')],
+ [Lemma('sun.n.01.ソレイユ'), Lemma('sun.n.01.天道'), Lemma('sun.n.01.お天道様'), Lemma('sun.n.01.天道様'),
Lemma('sun.n.01.太陽'), Lemma('sun.n.01.御天道様'), Lemma('sun.n.01.御日様'),
Lemma('sun.n.01.お日さま'), Lemma('sun.n.01.日'), Lemma('sun.n.01.日天'),
Lemma('sun.n.01.日天子'), Lemma('sun.n.01.お日様'), Lemma('sun.n.01.日輪'),
Lemma('sun.n.01.火輪')],
+ [Lemma('day.n.02.日')],
+ [Lemma('day.n.05.日')],
+ [Lemma('day.n.01.一日'), Lemma('day.n.01.太陽日'), Lemma('day.n.01.平均太陽日'),
Lemma('day.n.01.日')],
+ [Lemma('day.n.03.ディ'), Lemma('day.n.03.日')],
+ [Lemma('date.n.01.日'), Lemma('date.n.01.日づけ'), Lemma('date.n.01.日付'),
Lemma('date.n.01.日付け')],
+ [Lemma('date.n.07.日')],
+ [Lemma('date.n.06.デート'), Lemma('date.n.06.年代'), Lemma('date.n.06.年月'), Lemma('date.n.06.年月日'), Lemma('date.n.06.日'), Lemma('date.n.06.日づけ'), Lemma('date.n.06.日付'), Lemma('date.n.06.日付け'), Lemma('date.n.06.日取り'), Lemma('date.n.06.日日'), Lemma('date.n.06.日時'), Lemma('date.n.06.時日'), Lemma('date.n.06.月日'), Lemma('date.n.06.期日'), Lemma('date.n.06.開催日')],
+ [Lemma('sunday.n.01.主日'), Lemma('sunday.n.01.日'), Lemma('sunday.n.01.日曜'), Lemma('sunday.n.01.日曜日')],
+ [Lemma('date.n.04.日')],
+ [Lemma('day.n.07.ディ'), Lemma('day.n.07.日'),
Lemma('day.n.07.昼'), Lemma('day.n.07.昼間')]]
表層だけ取り出すなら
>>> [[v.name() for v in u.lemmas('jpn')] for u in wn.synsets('日', lang='jpn')]
[['日', '日本'],
['ソレイユ', '天道', 'お天道様', '天道様', '太陽', '御天道様', '御日様', 'お日さま', '日', '日天', '日天子', 'お日様', '日輪', '火輪'],
['日'],
['日'],
['一日', '太陽日', '平均太陽日', '日'],
['ディ', '日'],
['日', '日づけ', '日付', '日付け'],
['日'],
['デート', '年代', '年月', '年月日', '日', '日づけ', '日付', '日付け', '日取り', '日日', '日時', '時日', '月日', '期日', '開催日'],
['主日', '日', '日曜', '日曜日'],
['日'],
['ディ', '日', '昼', '昼間']]
これのそれぞれの行に対して、w2vの空間上での主成分を拾ってみる。もしくは代表ベクトルを作る。平均を取るとしたら、各語の出現頻度による加重平均が必要かも知れないが、出現頻度は分からないだろう。
但し、1つしか単語のない場合(「日」だけとか)は対応不可なので、今はあきらめる。
これで、w2vベクトル空間内でのそれぞれのポイントができるのか?
そもそも、synsetの「概念」とここのリストとはどう関係するか?
語→意味(synset)ブランチ→そのsynsetの語部分
で作った。つまりブランチ1で言えば
ブランチ1の 'japan.n.02.日' → 語の部分'日'
ブランチ1では、'japan.n.02.日', 'japan.n.02.日本' なので、両方とも概念は'japan.n.02'である(これが日本を表し、その表層が'日'と'日本')。
本当は、各概念(たとえば japan.n.02)で、代表的な(それを代表する)表層語が書いてあるといいのだが。
この、「代表する語」のw2vベクトルを、このブランチのベクトルとする、というのも考えられるだろう。
今度は、japan.n.02の代表を「日本」としたとき、日本が表す概念はほかにないのか?
>>> wn.synsets('日本',lang='jpn')
[Synset('japan.n.02')]
>>> wn.synset('japan.n.02').definition()
'a constitutional monarchy occupying the Japanese Archipelago; a world leader in electronics
and automobile manufacture and ship building'
ちなみに
>>> wn.synset('japan.n.01').definition()
'a string of more than 3,000 islands to the east of Asia extending 1,300 miles between
the Sea of Japan and the western Pacific Ocean'
で、更に見ると、
>>> wn.synset('japan.n.02').lemmas()
[Lemma('japan.n.02.Japan'), Lemma('japan.n.02.Nippon'),
Lemma('japan.n.02.Nihon')]
>>> wn.synset('japan.n.01').lemmas()
[Lemma('japan.n.01.Japan'), Lemma('japan.n.01.Japanese_Islands'),
Lemma('japan.n.01.Japanese_Archipelago')]
となっている。
終了行:
[[ノート/テキストマイニング]]~
&counter(); &lastmod();~
**Python/NLTKからWordNet 3.0を使ってみる - 2 [#s5d5d2fa]
>>> from nltk.corpus import wordnet as wn
>>> wn.synsets('馬', lang='jpn')
[Synset('horse.n.01')]
>>> wn.synsets('馬', lang='jpn')[0].hypernyms()
[Synset('equine.n.01')]
>>> wn.synsets('馬', lang='jpn')[0].hyponyms()
[Synset('bay.n.07'), Synset('chestnut.n.06'), Synset('eohippus.n.01'), Synset('gee-gee.n.01'),
Synset('hack.n.06'), Synset('hack.n.07'), Synset('harness_horse.n.01'), Synset('liver_chestnut.n.01'),
Synset('male_horse.n.01'), Synset('mare.n.01'), Synset('mesohippus.n.01'), Synset('pacer.n.02'),
Synset('palomino.n.01'), Synset('pinto.n.01'), Synset('polo_pony.n.01'), Synset('pony.n.01'),
Synset('pony.n.05'), Synset('post_horse.n.01'), Synset('protohippus.n.01'), Synset('racehorse.n.01'),
Synset('roan.n.02'), Synset('saddle_horse.n.01'), Synset('sorrel.n.05'), Synset('stablemate.n.01'),
Synset('stalking-horse.n.04'), Synset('steeplechaser.n.01'), Synset('stepper.n.03'), Synset('wild_horse.n.01'), Synset('workhorse.n.02')]
>>> wn.synsets('リンゴ', lang='jpn')[0].hyponyms()
[Synset('cooking_apple.n.01'), Synset('crab_apple.n.03'), Synset('eating_apple.n.01')]
>>> wn.synsets('リンゴ', lang='jpn')[0].hypernyms()
[Synset('edible_fruit.n.01'), Synset('pome.n.01')]
>>> wn.synsets('ブドウ', lang='jpn')[0].hypernyms()
[Synset('edible_fruit.n.01')]
>>> wn.synset('equine.n.01')
Synset('equine.n.01')
>>> wn.synset('horse.n.01').lemma_names()
['horse', 'Equus_caballus']
>>> wn.synset('equine.n.01').lemma_names()
['equine', 'equid']
>>> wn.synset('equine.n.01').definition()
'hoofed mammals having slender legs and a flat coat with a narrow mane along the back of the neck'
>>> wn.synset('equid.n.01').definition()
'hoofed mammals having slender legs and a flat coat with a narrow mane along the back of the neck'
>>> wn.synset('equid.n.01').lemmas('jpn')
[Lemma('equine.n.01.ウマ')]
>>> wn.synset('horse.n.01').lemmas('jpn')
[Lemma('horse.n.01.ウマ'), Lemma('horse.n.01.牡馬'), Lemma('horse.n.01.雄馬'), Lemma('horse.n.01.馬'), Lemma('horse.n.01.馬匹')]
>>> wn.synset('equine.n.01').hypernyms()
[Synset('odd-toed_ungulate.n.01')]
>>> wn.synset('odd-toed_ungulate.n.01').lemmas('jpn')
[Lemma('odd-toed_ungulate.n.01.ウマ目')]
>>> wn.synset('odd-toed_ungulate.n.01').hypernyms()
[Synset('ungulate.n.01')]
>>> wn.synset('odd-toed_ungulate.n.01').hypernyms()
[Synset('ungulate.n.01')]
>>> wn.synset('ungulate.n.01').lemmas('jpn')
[Lemma('ungulate.n.01.有蹄類')]
>>> wn.synset('ungulate.n.01').hypernyms()
[Synset('placental.n.01')]
>>> wn.synset('placental.n.01').lemmas('jpn')
[Lemma('placental.n.01.有胎盤哺乳類'), Lemma('placental.n.01.有胎盤類')]
*** 女王 [#j24fb9ba]
「女王」には5つ意味がある。
>>> wn.synsets('女王', lang='jpn')
[Synset('queen.n.01'), Synset('queen.n.08'), Synset('queen.n.07'), Synset('queen.n.02'), Synset('queen.n.04')]
定義は
>>> [u.definition() for u in wn.synsets('女王', lang='jpn')]
['the only fertile female in a colony of social insects such as bees and ants and termites;
its function is to lay eggs',
'(chess) the most powerful piece',
'one of four face cards in a deck bearing a picture of a queen',
'a female sovereign ruler',
'something personified as a woman who is considered the best or most important of her kind']
3番(インデックス3、つまり4番目の queen.n.02)のlemmaは
>>> wn.synsets('女王', lang='jpn')[3].lemmas('jpn')
[Lemma('queen.n.02.クィーン'), Lemma('queen.n.02.女君主'), Lemma('queen.n.02.女帝'),
Lemma('queen.n.02.女王'), Lemma('queen.n.02.女皇'), Lemma('queen.n.02.統治女王')]
上位概念は
>>> wn.synsets('女王', lang='jpn')[3].hypernyms()
[Synset('female_aristocrat.n.01')]
>>> [u.definition() for u in wn.synsets('女王', lang='jpn')[3].hypernyms()]
['a woman who is an aristocrat']
>>> [u.lemmas() for u in wn.synsets('女王', lang='jpn')[3].hypernyms()]
[[Lemma('female_aristocrat.n.01.female_aristocrat')]]
>>> [u.lemmas('jpn') for u in wn.synsets('女王', lang='jpn')[3].hypernyms()]
[[]] <--- 日本語では定義されていない
***「日」の多義性をどう取り出すか [#i0a0f04a]
>>> wn.synsets('日', lang='jpn')
[Synset('japan.n.02'), Synset('sun.n.01'), Synset('day.n.02'), Synset('day.n.05'),
Synset('day.n.01'), Synset('day.n.03'), Synset('date.n.01'), Synset('date.n.07'),
Synset('date.n.06'), Synset('sunday.n.01'), Synset('date.n.04'), Synset('day.n.07')]
>>> [u.definition() for u in wn.synsets('日', lang='jpn')]
+ 'a constitutional monarchy occupying the Japanese Archipelago; a world leader in electronics and automobile manufacture and ship building',
+ 'the star that is the source of light and heat for the planets in the solar system',
+ 'some point or period in time',
+ 'the recurring hours when you are not sleeping (especially those when you are working)',
+ 'time for Earth to make a complete rotation on its axis',
+ 'a day assigned to a particular purpose or observance',
+ 'the specified day of the month',
+ 'a particular day specified as the time something happens',
+ 'the particular day, month, or year (usually according to the Gregorian calendar) that an event occurred',
+ 'first day of the week; observed as a day of rest and worship by most Christians',
+ 'a particular but unspecified point in time',
+ 'the period of time taken by a particular planet (e.g. Mars) to make a complete rotation on its axis']
>>> [u.lemmas('jpn') for u in wn.synsets('日', lang='jpn')]
+ [Lemma('japan.n.02.日'), Lemma('japan.n.02.日本')],
+ [Lemma('sun.n.01.ソレイユ'), Lemma('sun.n.01.天道'), Lemma('sun.n.01.お天道様'), Lemma('sun.n.01.天道様'),
Lemma('sun.n.01.太陽'), Lemma('sun.n.01.御天道様'), Lemma('sun.n.01.御日様'),
Lemma('sun.n.01.お日さま'), Lemma('sun.n.01.日'), Lemma('sun.n.01.日天'),
Lemma('sun.n.01.日天子'), Lemma('sun.n.01.お日様'), Lemma('sun.n.01.日輪'),
Lemma('sun.n.01.火輪')],
+ [Lemma('day.n.02.日')],
+ [Lemma('day.n.05.日')],
+ [Lemma('day.n.01.一日'), Lemma('day.n.01.太陽日'), Lemma('day.n.01.平均太陽日'),
Lemma('day.n.01.日')],
+ [Lemma('day.n.03.ディ'), Lemma('day.n.03.日')],
+ [Lemma('date.n.01.日'), Lemma('date.n.01.日づけ'), Lemma('date.n.01.日付'),
Lemma('date.n.01.日付け')],
+ [Lemma('date.n.07.日')],
+ [Lemma('date.n.06.デート'), Lemma('date.n.06.年代'), Lemma('date.n.06.年月'), Lemma('date.n.06.年月日'), Lemma('date.n.06.日'), Lemma('date.n.06.日づけ'), Lemma('date.n.06.日付'), Lemma('date.n.06.日付け'), Lemma('date.n.06.日取り'), Lemma('date.n.06.日日'), Lemma('date.n.06.日時'), Lemma('date.n.06.時日'), Lemma('date.n.06.月日'), Lemma('date.n.06.期日'), Lemma('date.n.06.開催日')],
+ [Lemma('sunday.n.01.主日'), Lemma('sunday.n.01.日'), Lemma('sunday.n.01.日曜'), Lemma('sunday.n.01.日曜日')],
+ [Lemma('date.n.04.日')],
+ [Lemma('day.n.07.ディ'), Lemma('day.n.07.日'),
Lemma('day.n.07.昼'), Lemma('day.n.07.昼間')]]
表層だけ取り出すなら
>>> [[v.name() for v in u.lemmas('jpn')] for u in wn.synsets('日', lang='jpn')]
[['日', '日本'],
['ソレイユ', '天道', 'お天道様', '天道様', '太陽', '御天道様', '御日様', 'お日さま', '日', '日天', '日天子', 'お日様', '日輪', '火輪'],
['日'],
['日'],
['一日', '太陽日', '平均太陽日', '日'],
['ディ', '日'],
['日', '日づけ', '日付', '日付け'],
['日'],
['デート', '年代', '年月', '年月日', '日', '日づけ', '日付', '日付け', '日取り', '日日', '日時', '時日', '月日', '期日', '開催日'],
['主日', '日', '日曜', '日曜日'],
['日'],
['ディ', '日', '昼', '昼間']]
これのそれぞれの行に対して、w2vの空間上での主成分を拾ってみる。もしくは代表ベクトルを作る。平均を取るとしたら、各語の出現頻度による加重平均が必要かも知れないが、出現頻度は分からないだろう。
但し、1つしか単語のない場合(「日」だけとか)は対応不可なので、今はあきらめる。
これで、w2vベクトル空間内でのそれぞれのポイントができるのか?
そもそも、synsetの「概念」とここのリストとはどう関係するか?
語→意味(synset)ブランチ→そのsynsetの語部分
で作った。つまりブランチ1で言えば
ブランチ1の 'japan.n.02.日' → 語の部分'日'
ブランチ1では、'japan.n.02.日', 'japan.n.02.日本' なので、両方とも概念は'japan.n.02'である(これが日本を表し、その表層が'日'と'日本')。
本当は、各概念(たとえば japan.n.02)で、代表的な(それを代表する)表層語が書いてあるといいのだが。
この、「代表する語」のw2vベクトルを、このブランチのベクトルとする、というのも考えられるだろう。
今度は、japan.n.02の代表を「日本」としたとき、日本が表す概念はほかにないのか?
>>> wn.synsets('日本',lang='jpn')
[Synset('japan.n.02')]
>>> wn.synset('japan.n.02').definition()
'a constitutional monarchy occupying the Japanese Archipelago; a world leader in electronics
and automobile manufacture and ship building'
ちなみに
>>> wn.synset('japan.n.01').definition()
'a string of more than 3,000 islands to the east of Asia extending 1,300 miles between
the Sea of Japan and the western Pacific Ocean'
で、更に見ると、
>>> wn.synset('japan.n.02').lemmas()
[Lemma('japan.n.02.Japan'), Lemma('japan.n.02.Nippon'),
Lemma('japan.n.02.Nihon')]
>>> wn.synset('japan.n.01').lemmas()
[Lemma('japan.n.01.Japan'), Lemma('japan.n.01.Japanese_Islands'),
Lemma('japan.n.01.Japanese_Archipelago')]
となっている。
ページ名: