Fix: Improve the dictionary data filtering logic
When a surface consists only of hiragana/katakana and is at least 3 characters long, the entry is intentionally kept to strengthen word segmentation of hiragana/katakana words, which pyopenjtalk handles poorly.
tsukumijima committed Nov 28, 2024
1 parent 3d1c9c2 commit 53cbe47
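For context, here is a minimal sketch of the retention rule described in the commit message, reusing the regex this commit adds to tools/remove_redundant_dictionary_entries.py; the sample surfaces are illustrative and not taken from the repository's dictionaries:

import re

# Hiragana (U+3040-U+309F) or katakana (U+30A0-U+30FF) only, and at least 3 characters long
KANA_ONLY_PATTERN = re.compile(r"^[\u3040-\u309F\u30A0-\u30FF]{3,}$")

def should_keep_for_segmentation(surface: str) -> bool:
    # Entries matching this pattern are kept to help pyopenjtalk segment kana-only words
    return bool(KANA_ONLY_PATTERN.match(surface))

print(should_keep_for_segmentation("ひらがな"))    # True: kana only, 4 characters
print(should_keep_for_segmentation("カタカナ語"))  # False: contains a kanji
print(should_keep_for_segmentation("あい"))        # False: kana only, but just 2 characters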
Showing 3 changed files with 22 additions and 9 deletions.
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -6,8 +6,9 @@ typos = "typos"
test = "pytest"
update-snapshots = "pytest --snapshot-update"
update-licenses = "bash tools/create_venv_and_generate_licenses.bash"
compress-dictionaries = "poetry run python tools/compress_dictionaries.py"
build = "poetry run task update-licenses && pyinstaller --noconfirm run.spec"
compress-dictionaries = "python tools/compress_dictionaries.py"
filter-dictionaries = "python tools/remove_dictionary_duplicates.py && python tools/remove_dictionary_duplicates_by_priority.py && python tools/remove_redundant_dictionary_entries.py"
build = "task update-licenses && pyinstaller --noconfirm run.spec"

[tool.pysen]
version = "0.11.0"
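The new filter-dictionaries task chains the three filtering scripts and, because of &&, stops at the first one that fails. A rough Python equivalent of that chain, assuming it is run from the repository root; the script paths come from the task definition above, everything else is illustrative:

import subprocess
import sys

# Same order as the filter-dictionaries task: plain duplicates, priority-based duplicates, then redundant entries
SCRIPTS = [
    "tools/remove_dictionary_duplicates.py",
    "tools/remove_dictionary_duplicates_by_priority.py",
    "tools/remove_redundant_dictionary_entries.py",
]

for script in SCRIPTS:
    # && semantics: abort the chain as soon as one script exits with a non-zero status
    result = subprocess.run([sys.executable, script])
    if result.returncode != 0:
        sys.exit(result.returncode)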
2 changes: 1 addition & 1 deletion tools/remove_dictionary_duplicates.py
@@ -77,7 +77,7 @@ def remove_duplicates() -> None:
total_removed = 0

for file_path in sorted(dict_dir.glob("*.csv")):
-# Skip 01_default.csv because it is a manually generated dictionary
+# Skip 01_default.csv because it is the explicitly hand-crafted default dictionary
if file_path.name == "01_default.csv":
continue
original_count = sum(1 for _ in open(file_path, "r", encoding="utf-8"))
24 changes: 18 additions & 6 deletions tools/remove_redundant_dictionary_entries.py
@@ -6,6 +6,7 @@
"""

import csv
+import re
import shutil
from pathlib import Path

@@ -46,7 +47,7 @@ def process_csv_file(file_path: str) -> tuple[int, list[list[str]]]:
for row in reader:
processed_rows += 1
if processed_rows % 100 == 0: # Show progress every 100 rows
-print(f"Processing... {processed_rows}/{total_rows} rows", end="\r")
+print(f"Processing... {processed_rows}/{total_rows} rows")

if not row: # Skip empty rows
continue
@@ -57,16 +58,27 @@ def process_csv_file(file_path: str) -> tuple[int, list[list[str]]]:
reading = row[11].replace(":", "")
pronunciation = row[12].replace(":", "")

+# If the surface consists only of hiragana/katakana and is at least 3 characters long, intentionally keep it
+# to reinforce word segmentation of hiragana/katakana words, which pyopenjtalk handles poorly
+if re.match(r"^[\u3040-\u309F\u30A0-\u30FF]{3,}$", surface):
+unique_rows.append(row)
+continue

# Get the reading and pronunciation from pyopenjtalk with only the default dictionary applied
default_reading, default_pronunciation = get_default_reading_pronunciation(surface) # fmt: skip
+default_reading_without_special_chars = default_reading.replace(
+"・", ""
+).replace(" ", "")
+default_pronunciation_without_special_chars = default_pronunciation.replace(
+"・", ""
+).replace(" ", "")

# Remove the entry if it exactly matches the pronunciation from pyopenjtalk with only the default dictionary applied
## Pronunciations obtained from pyopenjtalk may contain "・" or full-width spaces, which the MeCab dictionary data usually lacks,
## so also remove the entry if it still matches once those characters are stripped
if (
default_pronunciation == pronunciation
-or default_pronunciation.replace("・", "").replace(" ", "")
-== pronunciation
+or default_pronunciation_without_special_chars == pronunciation
):
removed_rows.append(row)
print(
@@ -75,7 +87,7 @@ def process_csv_file(file_path: str) -> tuple[int, list[list[str]]]:
# Otherwise, remove the entry if it exactly matches the reading from the default dictionary
elif (
default_reading == reading
-or default_reading.replace("・", "").replace(" ", "") == reading
+or default_reading_without_special_chars == reading
):
removed_rows.append(row)
print(
@@ -116,8 +128,8 @@ def remove_redundant_dictionary_entries() -> None:
total_removed = 0

for file_path in sorted(dict_dir.glob("*.csv")):
-# Skip 01_default.csv because it is a manually generated dictionary
-if file_path.name == "01_default.csv":
+# Skip dictionaries whose names start with 01_ because they are manually generated
+if file_path.name.startswith("01_"):
continue

print(f"\nProcessing {file_path.name}...")
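Overall, the redundancy check in this file compares each entry's reading and pronunciation against what pyopenjtalk already produces with only the default dictionary, both verbatim and with "・" and full-width spaces stripped. Below is a self-contained sketch of that comparison with get_default_reading_pronunciation stubbed out; in the repository the helper queries pyopenjtalk, and the stub and sample entry here are purely illustrative:

def get_default_reading_pronunciation(surface: str) -> tuple[str, str]:
    # Stub for illustration: the real helper asks pyopenjtalk, whose output may
    # contain "・" and full-width spaces between words
    return "トウキョウ・タワー", "トーキョー・タワー"

def strip_special_chars(s: str) -> str:
    # Drop "・" and full-width spaces (U+3000), which MeCab dictionary entries usually lack
    return s.replace("・", "").replace("\u3000", "")

def is_redundant(surface: str, reading: str, pronunciation: str) -> bool:
    default_reading, default_pronunciation = get_default_reading_pronunciation(surface)
    # Redundant if the pronunciation already matches pyopenjtalk's default output,
    # exactly or after stripping the special characters...
    if pronunciation in (default_pronunciation, strip_special_chars(default_pronunciation)):
        return True
    # ...or, failing that, if the reading does
    return reading in (default_reading, strip_special_chars(default_reading))

# This illustrative entry adds nothing beyond the default dictionary, so it would be removed
print(is_redundant("東京タワー", "トウキョウタワー", "トーキョータワー"))  # True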
