forked from damiangilgonzalez1995/Clustering-with-LLM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
embedding_creation.py
36 lines (21 loc) · 927 Bytes
/
embedding_creation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import pandas as pd # dataframe manipulation
import numpy as np # linear algebra
from sentence_transformers import SentenceTransformer
# Load the training table; the source file uses ';' as its field delimiter.
df = pd.read_csv(filepath_or_buffer="data/train.csv", sep=";")
def compile_text(x):
    """Render one customer record as a labelled, multi-line text snippet.

    Args:
        x: A record that can be indexed by the keys ``age``, ``housing``,
            ``job``, ``marital``, ``education``, ``default``, ``balance``
            and ``loan`` (for example a DataFrame row or a plain dict).

    Returns:
        A string with one ``Label: value`` entry per line. Every line
        except the last ends with a comma, and the text ends with a
        newline.
    """
    # The literal's continuation lines are kept flush-left on purpose: any
    # indentation inside a triple-quoted string becomes part of the text.
    text = f"""Age: {x['age']},
housing loan: {x['housing']},
Job: {x['job']},
Marital: {x['marital']},
Education: {x['education']},
Default: {x['default']},
Balance: {x['balance']},
Personal loan: {x['loan']}
"""
    return text
# Build one text description per row of the table by applying compile_text
# row-wise (axis=1), then collect the results into a plain list.
sentences = df.apply(compile_text, axis=1).tolist()

# Load the sentence-embedding model by its identifier.
model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")

# Encode every description, with a progress bar and normalized embeddings.
output = model.encode(
    sentences=sentences,
    show_progress_bar=True,
    normalize_embeddings=True,
)

# Wrap the encoder output in a DataFrame and save it as CSV, leaving out the
# index column.
df_embedding = pd.DataFrame(output)
df_embedding.to_csv("embedding_train.csv", index=False)