3 # Copyright (C) 2010-2013 Takuya Nishimoto (NVDA Japanese Team)
\r
5 from __future__ import unicode_literals
\r
10 from datetime import datetime
\r
13 import eng_dic_maker
\r
14 import tankan_dic_maker
\r
15 import custom_dic_maker
\r
16 import roma_dic_maker
\r
18 # MECAB_DICT_INDEX and OUTDIR use libopenjtalk/mecab-naist-jdic/_temp as their reference directory
\r
19 THISDIR = path.join(path.dirname(path.abspath(__file__)), "libopenjtalk", "mecab-naist-jdic")
\r
20 OUTDIR = path.normpath(path.join(THISDIR, "dic"))
\r
21 TEMPDIR = path.normpath(path.join(THISDIR, "_temp"))
\r
22 ENGDIC = path.normpath(path.join(path.dirname(path.abspath(__file__)), "bep-eng.dic"))
\r
23 MECAB_DICT_INDEX = path.normpath(path.join(THISDIR, "..", "mecab", "src", "mecab-dict-index.exe"))
\r
24 CS_FILE = path.join(path.dirname(path.abspath(__file__)), "characters-ja.dic")
\r
26 CODE = 'utf-8' # cp932
\r
31 except OSError as exc:
\r
32 if exc.errno == errno.EEXIST:
\r
39 eng_dic_maker.make_dic(ENGDIC, CODE, THISDIR)
\r
40 tankan_dic_maker.make_dic(CODE, CS_FILE, THISDIR)
\r
41 custom_dic_maker.make_dic(CODE, THISDIR)
\r
42 roma_dic_maker.make_dic(CODE, THISDIR)
\r
44 def convert_file(src_file, src_enc, dest_file, dest_enc):
\r
45 print "converting %s to %s" % (src_file, dest_file)
\r
46 with open(src_file) as sf:
\r
47 with open(dest_file, "w") as df:
\r
52 s = s.decode(src_enc)
\r
53 df.write(s.encode(dest_enc))
\r
55 # 0: surface form, 1: left context ID, 2: right context ID, 3: cost,
\r
56 # 4: part of speech, 5: POS subcategory 1, 6: POS subcategory 2, 7: POS subcategory 3,
\r
57 # 8: conjugation form, 9: conjugation type, 10: base form, 11: reading, 12: pronunciation,
\r
58 # 13: accent position/mora count, 14: accent attribute, (Open JTalk extension fields)
\r
62 if a[0] == '盲' and a[11] == 'メクラ':
\r
63 a[11], a[12], a[13] = "モウ", "モー", "1/2"
\r
65 elif a[0] == '聾' and a[11] == 'ツンボ':
\r
66 a[11], a[12], a[13] = "ロウ", "ロー", "1/2"
\r
68 elif a[0] == 'z' and a[11] == 'ズィー':
\r
70 elif a[0] == '規' and a[11] == 'ブンマワシ':
\r
72 elif a[0] == '全' and a[11] == 'チョン':
\r
74 elif a[0] == '007' and a[11] == 'ゼロゼロセブン':
\r
76 elif a[0] == '未曾有' and a[12] == 'ミゾー':
\r
79 elif a[0] == '言う' and a[12] == 'ユウ':
\r
82 elif a[0] == 'まごう' and a[12] == 'マゴウ':
\r
85 elif a[0] == 'ゆう' and a[12] == 'ユウ':
\r
88 elif a[0] == '思う' and a[12] == 'オモウ':
\r
91 elif a[0] == '吸う' and a[12] == 'スウ':
\r
94 elif a[0] == '繕う' and a[12] == 'ツクロウ':
\r
97 elif a[0] == '大きい' and a[12] == 'オーキイ':
\r
100 elif a[0] == '仰せ' and a[12] == 'オーセ':
\r
103 elif a[0] == 'おおせる' and a[12] == 'オーセル':
\r
106 elif a[0] == '車前草' and a[12] == 'オーバコ':
\r
109 elif a[0] == '概ね' and a[12] == 'オームネ':
\r
112 elif a[0] == '公' and a[12] == 'オーヤケ':
\r
115 elif a[0] == '氷' and a[12] == 'コーリ':
\r
118 elif a[0] == '凍る' and a[12] == 'コール':
\r
121 elif a[0] == '滞る' and a[12] == 'トドコール':
\r
124 elif a[0] == '憤る' and a[12] == 'イキドール':
\r
127 elif a[0] == '蟋蟀' and a[12] == 'コーロギ':
\r
130 elif a[0] == '遠い' and a[12] == 'トーイ':
\r
133 elif a[0] == '通る' and a[12] == 'トール':
\r
136 elif a[0] == '頬' and a[12] == 'ホー':
\r
139 elif a[0] == '酸漿' and a[12] == 'ホーズキ':
\r
142 elif a[0] == '大目' and a[12] == 'オーメ':
\r
145 elif a[0] == '大通り' and a[12] == 'オードーリ':
\r
148 elif a[0] == '凍り付く' and a[12] == 'コーリツク':
\r
151 elif a[0] == '遠ざかる' and a[12] == 'トーザカル':
\r
154 elif a[0] == '通す' and a[12] == 'トース':
\r
157 elif a[0] == '頬張る' and a[12] == 'ホーバル':
\r
160 elif a[0] == 'いとおしい' and a[12] == 'イトーシイ':
\r
163 elif a[0] == '凡そ' and a[12] == 'オヨソ':
\r
164 a[11] = a[12] = 'オオヨソ'
\r
167 elif a[0] == '無花果' and a[12] == 'イチジュク':
\r
168 a[11] = a[12] = 'イチジク'
\r
170 elif a[0] == '鼓' and a[12] == 'コ':
\r
171 a[11] = a[12] = 'ツヅミ'
\r
173 elif a[0] == '葛籠' and a[12] == 'ツズロ':
\r
175 elif a[0] == '葛籠' and a[12] == 'ツズラ':
\r
178 elif a[0] == '提灯' and a[12] == 'ヂョウチン':
\r
179 a[11] = a[12] = 'ヂョーチン'
\r
181 elif a[0] == '青梅' and a[12] == 'オウメ':
\r
182 a[11] = a[12] = 'オーメ'
\r
184 elif a[0] == 'クヮルテット' and a[12] == 'クヮルテット':
\r
185 a[11] = a[12] = 'クァルテット'
\r
187 elif a[0] == 'スェーター' and a[12] == 'スェーター':
\r
188 a[11] = a[12] = 'スエーター'
\r
190 elif a[0] == '憤る' and a[12] in ('ムズカル', 'ムツカル'):
\r
192 elif a[0] == 'いひ' and a[12] in ('ユイ', 'イイ'):
\r
194 elif a[0] == '八幡平' and a[12] == 'ヤワタダイラ':
\r
196 elif a[0] == '好かん' and a[12] == 'コーカン':
\r
198 elif a[0] == 'おおきに' and a[12] == 'オーキニ':
\r
199 s = s + "," + a[11]
\r
200 elif a[0] == 'かほる' and a[11] == 'カホル' and a[12] == 'カオル':
\r
201 s = s + "," + a[11]
\r
202 elif a[0] == 'かほる' and a[11] == 'カホル' and a[12] == 'カホル':
\r
204 elif a[0] == 'さをり' and a[12] == 'サオリ':
\r
205 s = s + "," + a[11]
\r
206 elif a[0] == '透' and a[12] == 'トール':
\r
207 s = s + "," + a[11]
\r
208 elif a[0] == '大阪' and a[12] == 'オーサカ':
\r
209 s = s + "," + a[11]
\r
210 elif a[0] == '遠野' and a[12] == 'トーノ':
\r
211 s = s + "," + a[11]
\r
212 elif a[0] == 'みさを' and a[12] == 'ミサオ':
\r
213 s = s + "," + a[11]
\r
214 elif a[0] == 'そういう' and a[12] == 'ソーユウ':
\r
216 elif a[0] == 'どうして' and a[12] == 'ドーシテ':
\r
218 elif a[0] == 'フィードバック' and len(a) == 15:
\r
219 a.append('フィード バック')
\r
220 a[11] = a[12] = a[15].replace(' ', '')
\r
222 elif a[0] == 'インターフェース' and len(a) == 15:
\r
223 a.append('インター フェース')
\r
224 a[11] = a[12] = a[15].replace(' ', '')
\r
226 elif a[0] == 'オペレーティングシステム' and len(a) == 15:
\r
227 a.append('オペレーティング システム')
\r
228 a[11] = a[12] = a[15].replace(' ', '')
\r
230 elif a[0] == 'アイスクリーム' and len(a) == 15:
\r
231 a.append('アイス クリーム')
\r
232 a[11] = a[12] = a[15].replace(' ', '')
\r
234 elif a[0] == '日本点字図書館' and len(a) == 15:
\r
235 a.append('ニッポン テンジ トショカン')
\r
236 a[11] = a[12] = a[15].replace(' ', '')
\r
238 elif a[0] == '通り' and a[11] == 'トオリ' and len(a) == 15:
\r
241 elif a[0] == '狼' and a[11] == 'オオカミ' and len(a) == 15:
\r
244 elif a[0] == '多い' and a[11] == 'オオイ' and len(a) == 15:
\r
247 elif a[0] == '多く' and a[11] == 'オオク' and len(a) == 15:
\r
250 elif a[0] == '大晦日' and a[11] == 'オオミソカ' and len(a) == 15:
\r
253 elif a[0] == '手作り' and a[11] == 'テヅクリ' and len(a) == 15:
\r
256 elif a[0] == '南半球' and len(a) == 15:
\r
257 a.append('ミナミ ハンキュー')
\r
259 elif a[0] == 'アメリカ合衆国' and len(a) == 15:
\r
260 a.append('アメリカ ガッシューコク')
\r
262 elif a[0] == '第一人者' and len(a) == 15:
\r
263 a.append('ダイ1ニンシャ')
\r
265 elif a[0] == '一流' and len(a) == 15:
\r
268 elif a[0] == '一月' and len(a) == 15:
\r
271 elif a[0] == '二月' and len(a) == 15:
\r
274 elif a[0] == '四方' and len(a) == 15:
\r
277 elif a[0] == '六法全書' and len(a) == 15:
\r
278 a.append('6ポー ゼンショ')
\r
280 elif a[0] == '百人一首' and len(a) == 15:
\r
281 a.append('100ニン 1シュ')
\r
283 elif a[0] == '日本コロムビア' and len(a) == 15:
\r
284 a.append('ニッポン コロムビア')
\r
285 a[11] = a[12] = a[15].replace(' ', '')
\r
287 elif a[0] == 'ビタミンE' and len(a) == 15:
\r
290 elif a[0] == '劇団四季' and len(a) == 15:
\r
291 a.append('ゲキダン 4キ')
\r
293 elif a[0] == '四季' and len(a) == 15:
\r
296 elif a[0] == '四半期' and len(a) == 15:
\r
299 elif a[0] == '四角形' and len(a) == 15:
\r
302 elif a[0] == '四条' and len(a) == 15:
\r
305 elif a[0] == '二男' and len(a) == 15:
\r
308 elif a[0] == '十数' and len(a) == 15:
\r
311 elif a[0] == '一輪車' and len(a) == 15:
\r
314 elif a[0] == '三塁打' and len(a) == 15:
\r
317 elif a[0] == '一汁一菜' and len(a) == 15:
\r
318 a.append('1ジュー 1サイ')
\r
320 elif a[0] == '五臓六腑' and len(a) == 15:
\r
323 elif a[0] == '一段' and len(a) == 15:
\r
326 elif a[0] == '七転び八起き' and len(a) == 15:
\r
327 a.append('ナナコロビ ヤオキ')
\r
329 elif a[0] == '十重二十重' and len(a) == 15:
\r
332 elif a[0] == '3ラン' and len(a) == 15:
\r
335 elif a[0] == 'さんりんしゃ' and len(a) == 15:
\r
338 elif a[0] == 'いちばん' and len(a) == 15:
\r
341 elif a[0] == 'X線' and len(a) == 15:
\r
344 elif a[0] == '二・二六事件' and len(a) == 15:
\r
345 a.append('2⠼26 ジケン')
\r
347 elif a[0] == 'B5判' and len(a) == 15:
\r
350 elif a[0] == 'この間' and a[12] == 'コノカン':
\r
352 elif a[0] == 'インターネット' and len(a) == 15:
\r
353 a.append('インター ネット')
\r
355 elif a[0] == '各党' and len(a) == 15:
\r
360 def convert_jdic_file(src_file, src_enc, dest_file, dest_enc):
\r
361 print "converting %s to %s" % (src_file, dest_file)
\r
362 with open(src_file) as sf:
\r
363 with open(dest_file, "w") as df:
\r
368 s = s.decode(src_enc).rstrip()
\r
371 s += "\n" # do not use os.linesep here
\r
372 df.write(s.encode(dest_enc))
\r
375 'nvdajp-eng-dic.csv','nvdajp-tankan-dic.csv',
\r
376 'nvdajp-custom-dic.csv','nvdajp-roma-dic.csv',
\r
379 euc_files = ['char.def','feature.def','left-id.def','matrix.def',
\r
380 'pos-id.def','rewrite.def','right-id.def', 'unk.def']
\r
382 jdic_file = 'naist-jdic.csv'
\r
385 print "copy %s to %s" % (path.join(THISDIR, f), TEMPDIR)
\r
386 shutil.copy(path.join(THISDIR, f), TEMPDIR)
\r
388 for f in euc_files:
\r
389 convert_file(path.join(THISDIR, f), 'euc-jp', path.join(TEMPDIR, f), CODE)
\r
391 convert_jdic_file(path.join(THISDIR, jdic_file), 'euc-jp', path.join(TEMPDIR, jdic_file), CODE)
\r
393 print TEMPDIR, [MECAB_DICT_INDEX, '-d','.', '-o',OUTDIR, '-f',CODE, '-c',CODE]
\r
394 subprocess.check_call([MECAB_DICT_INDEX, '-d','.', '-o',OUTDIR, '-f',CODE, '-c',CODE], cwd=TEMPDIR)
\r
396 print "copy %s to %s" % (path.join(THISDIR, 'dicrc'), OUTDIR)
\r
397 shutil.copy(path.join(THISDIR, 'dicrc'), OUTDIR)
\r
398 dic_version_file = path.join(OUTDIR, "DIC_VERSION")
\r
399 print "dic version file: " + dic_version_file
\r
400 version = "nvdajp-jtalk-dic " + '(' + CODE + ') ' + datetime.utcnow().strftime('%Y%m%d-%H%M%S')
\r
402 with open(dic_version_file, "wb") as f:
\r
403 f.write(version + os.linesep)
\r