From 2c8533894098141adea4ebb77d6022199becd98e Mon Sep 17 00:00:00 2001
From: "justin.hsu" <justin.hsu.1019@gmail.com>
Date: Sat, 9 Nov 2024 05:03:10 +0800
Subject: [PATCH] update tokenizer for jieba

---
 src/db_insert.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/db_insert.py b/src/db_insert.py
index a32da19..081dea1 100644
--- a/src/db_insert.py
+++ b/src/db_insert.py
@@ -28,7 +28,7 @@ def check_class_exist(self):
             'class': self.classnm,
             'properties': [
                 {'name': 'pid', 'dataType': ['text']},
-                {'name': 'content', 'dataType': ['text']},
+                {'name': 'content', 'dataType': ['text'], "tokenization": "gse"},
             ],
             'vectorizer': 'text2vec-openai',
             'moduleConfig': {
@@ -80,7 +80,7 @@ def split_and_insert(self, pid, content, category):
 
 
 if __name__ == '__main__':
-    with open('data/test.json', encoding='utf-8') as file:
+    with open('data/aicup_noocr_sec.json', encoding='utf-8') as file:
         data = json.load(file)
 
     failed_records = []  # 用於存放匯入失敗的資料
@@ -91,13 +91,13 @@ def split_and_insert(self, pid, content, category):
         content = item['content']
 
         if category == "faq":
-            classnm = "faq"
+            classnm = "faqdevprod"
             content_str = json.dumps(content, ensure_ascii=False, indent=4)
         elif category == "insurance":
-            classnm = "insurance"
+            classnm = "insurancedevprod"
             content_str = content
         elif category == "finance":
-            classnm = "finance"
+            classnm = "financedevprod"
             content_str = json.dumps(content, ensure_ascii=False, indent=4) if isinstance(content, dict) else content
         else:
             print("Unknown category, skipping item.")