-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
102 lines (68 loc) · 3.1 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Importing necessary libraries
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from scipy.sparse import save_npz, csr_matrix
# ------------------------------------------------------------------------------------#
# NLTK setup: one shared Porter stemmer instance for the whole module
porterStemmer = PorterStemmer()
# English stopwords kept in a set so membership tests are cheap
stop_words = set(stopwords.words("english"))
# ------------------------------------------------------------------------------------#
# Function to preprocess the text data retrieved from text file
def preprocessingText(text):
    """Normalize raw document text into a string of stemmed tokens.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the
    result is lowercased and stripped, then split on whitespace. Words found
    in the module-level ``stop_words`` set are dropped and the remaining
    words are passed through ``porterStemmer.stem``.

    Args:
        text: Raw text of a single document.

    Returns:
        The stemmed, stopword-free tokens joined by single spaces.
    """
    # Digits, punctuation and any other non-ASCII-letter character become spaces
    letters_only = re.sub(r"[^a-zA-Z]", " ", text)
    normalized = letters_only.lower().strip()

    stemmed = []
    for word in normalized.split():
        # Stopwords are filtered out before stemming
        if word in stop_words:
            continue
        stemmed.append(porterStemmer.stem(word))
    return " ".join(stemmed)
# ------------------------------------------------------------------------------------#
# Function to read documents from a text file
def readDocuments(file_path):
    """Read a UTF-8 text file and split its contents into document chunks.

    The content is split wherever a run of one or more ``*`` or ``#``
    characters occurs. Chunks are returned exactly as split, so the list may
    contain empty or whitespace-only strings.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of strings, in the order they appear in the file.
    """
    # A separator is any run of asterisks and/or hash characters
    separator = re.compile(r'[*#]+')
    with open(file_path, "r", encoding="utf-8") as handle:
        contents = handle.read()
    return separator.split(contents)
# ------------------------------------------------------------------------------------#
# Function to create the TF-IDF matrix and the term-to-index mapping ("inverted index")
def createTDIFMatrixAndInvertedIndex(documents):
    """Fit a TF-IDF model on the documents and return it with a term lookup.

    Args:
        documents: The preprocessed document strings to vectorize.

    Returns:
        A ``(tfidf_matrix, inverted_index)`` tuple. ``tfidf_matrix`` is the
        result of ``TfidfVectorizer().fit_transform(documents)``.
        ``inverted_index`` is a plain dict copy of the fitted vectorizer's
        ``vocabulary_``; per scikit-learn's documentation that maps each term
        to its feature (column) index, so it is a term -> column lookup and
        not a term -> document-list posting table.
    """
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(documents)
    # Copy the vocabulary into a fresh dict, keeping its iteration order
    term_lookup = {term: column for term, column in vectorizer.vocabulary_.items()}
    return matrix, term_lookup
# ------------------------------------------------------------------------------------#
# Main function
def main():
    """Build and persist the TF-IDF matrix and inverted index for ``files``.

    Reads every ``.txt`` file in the ``files`` folder (relative to the current
    working directory), splits each into documents, discards blank ones,
    preprocesses the rest and vectorizes them. The matrix is printed and
    saved to ``tfidf_matrix.npz``; the inverted index is written as its
    ``str()`` representation to ``inverted_index.txt``.
    """
    folder_name = "files"
    documents = []
    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # of the TF-IDF matrix reproducible from run to run.
    for file_name in sorted(os.listdir(folder_name)):
        if file_name.endswith(".txt"):
            file_path = os.path.join(folder_name, file_name)
            documents.extend(readDocuments(file_path))

    # Splitting on separators can leave empty or whitespace-only chunks
    filtered_documents = [text for text in documents if text.strip()]
    preprocessed_documents = [preprocessingText(doc) for doc in filtered_documents]

    tfidf_matrix, inverted_index = createTDIFMatrixAndInvertedIndex(preprocessed_documents)

    print("Preprocessing step is completed\n############################################")
    print("\nTFIDF Matrix\n")
    print(tfidf_matrix)
    print("\n############################################")

    # Write both output files before reporting success, so the message below
    # is never printed when persisting fails.
    save_npz('tfidf_matrix.npz', csr_matrix(tfidf_matrix))
    with open("inverted_index.txt", "w", encoding="utf-8") as file:
        file.write(str(inverted_index))

    print("Inverted index is created\n")
    print("Click below to visit the inverted index file\n", '\\inverted_index.txt')


if __name__ == "__main__":
    main()