Fix: Improve the dictionary data filtering logic
When a surface consists only of hiragana/katakana and is at least 3 characters long, the entry is intentionally kept to strengthen word segmentation of hiragana/katakana words, which pyopenjtalk handles poorly.
tsukumijima committed Nov 28, 2024
1 parent 3d1c9c2 commit 53cbe47
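For context, here is a minimal sketch of the retention rule described in the commit message, reusing the regex this commit adds to tools/remove_redundant_dictionary_entries.py; the sample surfaces are illustrative and not taken from the repository's dictionaries:

import re

# Hiragana (U+3040-U+309F) or katakana (U+30A0-U+30FF) only, and at least 3 characters long
KANA_ONLY_PATTERN = re.compile(r"^[\u3040-\u309F\u30A0-\u30FF]{3,}$")

def should_keep_for_segmentation(surface: str) -> bool:
    # Entries matching this pattern are kept to help pyopenjtalk segment kana-only words
    return bool(KANA_ONLY_PATTERN.match(surface))

print(should_keep_for_segmentation("ひらがな"))    # True: kana only, 4 characters
print(should_keep_for_segmentation("カタカナ語"))  # False: contains a kanji
print(should_keep_for_segmentation("あい"))        # False: kana only, but just 2 characters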
Showing 3 changed files with 22 additions and 9 deletions.
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -6,8 +6,9 @@ typos = "typos"
test = "pytest"
update-snapshots = "pytest --snapshot-update"
update-licenses = "bash tools/create_venv_and_generate_licenses.bash"
compress-dictionaries = "poetry run python tools/compress_dictionaries.py"
build = "poetry run task update-licenses && pyinstaller --noconfirm run.spec"
compress-dictionaries = "python tools/compress_dictionaries.py"
filter-dictionaries = "python tools/remove_dictionary_duplicates.py && python tools/remove_dictionary_duplicates_by_priority.py && python tools/remove_redundant_dictionary_entries.py"
build = "task update-licenses && pyinstaller --noconfirm run.spec"

[tool.pysen]
version = "0.11.0"
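The new filter-dictionaries task chains the three filtering scripts and, because of &&, stops at the first one that fails. A rough Python equivalent of that chain, assuming it is run from the repository root; the script paths come from the task definition above, everything else is illustrative:

import subprocess
import sys

# Same order as the filter-dictionaries task: plain duplicates, priority-based duplicates, then redundant entries
SCRIPTS = [
    "tools/remove_dictionary_duplicates.py",
    "tools/remove_dictionary_duplicates_by_priority.py",
    "tools/remove_redundant_dictionary_entries.py",
]

for script in SCRIPTS:
    # && semantics: abort the chain as soon as one script exits with a non-zero status
    result = subprocess.run([sys.executable, script])
    if result.returncode != 0:
        sys.exit(result.returncode)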
2 changes: 1 addition & 1 deletion tools/remove_dictionary_duplicates.py
@@ -77,7 +77,7 @@ def remove_duplicates() -> None:
total_removed = 0

for file_path in sorted(dict_dir.glob("*.csv")):
-# Skip 01_default.csv because it is a manually generated dictionary
+# Skip 01_default.csv because it is the explicitly hand-crafted default dictionary
if file_path.name == "01_default.csv":
continue
original_count = sum(1 for _ in open(file_path, "r", encoding="utf-8"))
24 changes: 18 additions & 6 deletions tools/remove_redundant_dictionary_entries.py
@@ -6,6 +6,7 @@
"""

import csv
+import re
import shutil
from pathlib import Path

@@ -46,7 +47,7 @@ def process_csv_file(file_path: str) -> tuple[int, list[list[str]]]:
for row in reader:
processed_rows += 1
if processed_rows % 100 == 0: # Show progress every 100 rows
-print(f"Processing... {processed_rows}/{total_rows} rows", end="\r")
+print(f"Processing... {processed_rows}/{total_rows} rows")

if not row: # Skip empty rows
continue
@@ -57,16 +58,27 @@ def process_csv_file(file_path: str) -> tuple[int, list[list[str]]]:
reading = row[11].replace(":", "")
pronunciation = row[12].replace(":", "")

+# If the surface consists only of hiragana/katakana and is at least 3 characters long, intentionally keep it
+# to reinforce word segmentation of hiragana/katakana words, which pyopenjtalk handles poorly
+if re.match(r"^[\u3040-\u309F\u30A0-\u30FF]{3,}$", surface):
+unique_rows.append(row)
+continue

# Get the reading and pronunciation from pyopenjtalk with only the default dictionary applied
default_reading, default_pronunciation = get_default_reading_pronunciation(surface) # fmt: skip
+default_reading_without_special_chars = default_reading.replace(
+"・", ""
+).replace(" ", "")
+default_pronunciation_without_special_chars = default_pronunciation.replace(
+"・", ""
+).replace(" ", "")

# Remove the entry if it exactly matches the pronunciation from pyopenjtalk with only the default dictionary applied
## Pronunciations obtained from pyopenjtalk may contain "・" or full-width spaces, which the MeCab dictionary data usually lacks,
## so also remove the entry if it still matches once those characters are stripped
if (
default_pronunciation == pronunciation
-or default_pronunciation.replace("・", "").replace(" ", "")
-== pronunciation
+or default_pronunciation_without_special_chars == pronunciation
):
removed_rows.append(row)
print(
@@ -75,7 +87,7 @@ def process_csv_file(file_path: str) -> tuple[int, list[list[str]]]:
# Otherwise, remove the entry if it exactly matches the reading from the default dictionary
elif (
default_reading == reading
-or default_reading.replace("・", "").replace(" ", "") == reading
+or default_reading_without_special_chars == reading
):
removed_rows.append(row)
print(
@@ -116,8 +128,8 @@ def remove_redundant_dictionary_entries() -> None:
total_removed = 0

for file_path in sorted(dict_dir.glob("*.csv")):
-# Skip 01_default.csv because it is a manually generated dictionary
-if file_path.name == "01_default.csv":
+# Skip dictionaries whose names start with 01_ because they are manually generated
+if file_path.name.startswith("01_"):
continue

print(f"\nProcessing {file_path.name}...")
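Overall, the redundancy check in this file compares each entry's reading and pronunciation against what pyopenjtalk already produces with only the default dictionary, both verbatim and with "・" and full-width spaces stripped. Below is a self-contained sketch of that comparison with get_default_reading_pronunciation stubbed out; in the repository the helper queries pyopenjtalk, and the stub and sample entry here are purely illustrative:

def get_default_reading_pronunciation(surface: str) -> tuple[str, str]:
    # Stub for illustration: the real helper asks pyopenjtalk, whose output may
    # contain "・" and full-width spaces between words
    return "トウキョウ・タワー", "トーキョー・タワー"

def strip_special_chars(s: str) -> str:
    # Drop "・" and full-width spaces (U+3000), which MeCab dictionary entries usually lack
    return s.replace("・", "").replace("\u3000", "")

def is_redundant(surface: str, reading: str, pronunciation: str) -> bool:
    default_reading, default_pronunciation = get_default_reading_pronunciation(surface)
    # Redundant if the pronunciation already matches pyopenjtalk's default output,
    # exactly or after stripping the special characters...
    if pronunciation in (default_pronunciation, strip_special_chars(default_pronunciation)):
        return True
    # ...or, failing that, if the reading does
    return reading in (default_reading, strip_special_chars(default_reading))

# This illustrative entry adds nothing beyond the default dictionary, so it would be removed
print(is_redundant("東京タワー", "トウキョウタワー", "トーキョータワー"))  # True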
