Skip to content

Commit

Permalink
update tokenizer for jieba
Browse files: browse the repository at this point in the history
  • Loading branch information
JustinHsu1019 committed Nov 8, 2024
1 parent 0b04e8c commit 2c85338
Showing 1 changed file with 5 additions and 5 deletions.
10 changes: 5 additions & 5 deletions src/db_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def check_class_exist(self):
'class': self.classnm,
'properties': [
{'name': 'pid', 'dataType': ['text']},
{'name': 'content', 'dataType': ['text']},
{'name': 'content', 'dataType': ['text'], "tokenization": "gse"},
],
'vectorizer': 'text2vec-openai',
'moduleConfig': {
Expand Down Expand Up @@ -80,7 +80,7 @@ def split_and_insert(self, pid, content, category):


if __name__ == '__main__':
with open('data/test.json', encoding='utf-8') as file:
with open('data/aicup_noocr_sec.json', encoding='utf-8') as file:
data = json.load(file)

failed_records = [] # 用於存放匯入失敗的資料
Expand All @@ -91,13 +91,13 @@ def split_and_insert(self, pid, content, category):
content = item['content']

if category == "faq":
classnm = "faq"
classnm = "faqdevprod"
content_str = json.dumps(content, ensure_ascii=False, indent=4)
elif category == "insurance":
classnm = "insurance"
classnm = "insurancedevprod"
content_str = content
elif category == "finance":
classnm = "finance"
classnm = "financedevprod"
content_str = json.dumps(content, ensure_ascii=False, indent=4) if isinstance(content, dict) else content
else:
print("Unknown category, skipping item.")
Expand Down

0 comments on commit 2c85338

Please sign in to comment.