From 2c8533894098141adea4ebb77d6022199becd98e Mon Sep 17 00:00:00 2001 From: "justin.hsu" Date: Sat, 9 Nov 2024 05:03:10 +0800 Subject: [PATCH] update tokenizer for jieba --- src/db_insert.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/db_insert.py b/src/db_insert.py index a32da19..081dea1 100644 --- a/src/db_insert.py +++ b/src/db_insert.py @@ -28,7 +28,7 @@ def check_class_exist(self): 'class': self.classnm, 'properties': [ {'name': 'pid', 'dataType': ['text']}, - {'name': 'content', 'dataType': ['text']}, + {'name': 'content', 'dataType': ['text'], "tokenization": "gse"}, ], 'vectorizer': 'text2vec-openai', 'moduleConfig': { @@ -80,7 +80,7 @@ def split_and_insert(self, pid, content, category): if __name__ == '__main__': - with open('data/test.json', encoding='utf-8') as file: + with open('data/aicup_noocr_sec.json', encoding='utf-8') as file: data = json.load(file) failed_records = [] # 用於存放匯入失敗的資料 @@ -91,13 +91,13 @@ def split_and_insert(self, pid, content, category): content = item['content'] if category == "faq": - classnm = "faq" + classnm = "faqdevprod" content_str = json.dumps(content, ensure_ascii=False, indent=4) elif category == "insurance": - classnm = "insurance" + classnm = "insurancedevprod" content_str = content elif category == "finance": - classnm = "finance" + classnm = "financedevprod" content_str = json.dumps(content, ensure_ascii=False, indent=4) if isinstance(content, dict) else content else: print("Unknown category, skipping item.")