
Commit

first commit
ray-ruizhen-li committed Aug 2, 2024
0 parents commit aa362c0
Showing 12 changed files with 2,006 additions and 0 deletions.
28 changes: 28 additions & 0 deletions main.py
@@ -0,0 +1,28 @@
# Created by RayLi
from src.data.make_dataset import load_and_preprocess_data
from src.feature_engineering.build_features import create_dummy_vars
from src.models.train_models import train_logistic_regression
from src.models.train_models import random_forest
from src.models.predict_model import evaluate_model

if __name__ == "__main__":
    # Load and preprocess the data
    data_path = "src/data/raw/final.csv"
    df = load_and_preprocess_data(data_path)

    # Create dummy variables and separate features and target
    X, y = create_dummy_vars(df)

    # Train the logistic regression model
    model, X_test_scaled, y_test = train_logistic_regression(X, y)

    # Evaluate the model with 5-fold cross-validation on the held-out data
    scores = evaluate_model(model, X_test_scaled, y_test)
    print(f"Logistic Regression mean accuracy: {scores.mean():.3f}")

    # Train the random forest model
    model, X_test_scaled, y_test = random_forest(X, y)

    # Evaluate the model
    scores = evaluate_model(model, X_test_scaled, y_test)
    print(f"Random Forest mean accuracy: {scores.mean():.3f}")
1 change: 1 addition & 0 deletions readme.txt
@@ -0,0 +1 @@
# A simple comparison of logistic regression and random forest models, evaluated on accuracy.
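
Below is a minimal sketch of how the two models can be compared using the functions added in this commit; it mirrors main.py and assumes the script is run from the repository root so the relative data path resolves.

# Sketch mirroring main.py: train both models on the same features and
# compare their cross-validated accuracy on the held-out data.
from src.data.make_dataset import load_and_preprocess_data
from src.feature_engineering.build_features import create_dummy_vars
from src.models.train_models import train_logistic_regression, random_forest
from src.models.predict_model import evaluate_model

df = load_and_preprocess_data("src/data/raw/final.csv")
X, y = create_dummy_vars(df)
for name, trainer in [("Logistic Regression", train_logistic_regression),
                      ("Random Forest", random_forest)]:
    model, X_test_scaled, y_test = trainer(X, y)
    scores = evaluate_model(model, X_test_scaled, y_test)
    print(name, "mean accuracy:", scores.mean())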
Binary file added src/data/__pycache__/make_dataset.cpython-311.pyc
9 changes: 9 additions & 0 deletions src/data/make_dataset.py
@@ -0,0 +1,9 @@
import pandas as pd


def load_and_preprocess_data(data_path):
    # Load the raw dataset from the given CSV path
    df = pd.read_csv(data_path)
    # TODO: impute any missing values in the features before modelling
    return df
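
The TODO above leaves imputation unimplemented. A minimal sketch of one possible approach, using a hypothetical helper impute_missing_values that is not part of this commit (numeric columns filled with their median, other columns with their mode, assuming each column has at least one non-missing value):

import pandas as pd

def impute_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical helper, not in the original commit: fill numeric NaNs with
    # the column median and non-numeric NaNs with the column mode.
    df = df.copy()
    for col in df.columns:
        if df[col].isna().any():
            if pd.api.types.is_numeric_dtype(df[col]):
                df[col] = df[col].fillna(df[col].median())
            else:
                df[col] = df[col].fillna(df[col].mode().iloc[0])
    return df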
1,864 changes: 1,864 additions & 0 deletions src/data/raw/final.csv


15 changes: 15 additions & 0 deletions src/feature_engineering/build_features.py
@@ -0,0 +1,15 @@
import pandas as pd


# Create dummy features and split into input features and target
def create_dummy_vars(df):
    # Dummy-variable creation for categorical ('object') columns is left
    # commented out here; the raw dataset is used as-is.
    # df = pd.get_dummies(df, columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'])

    # Optionally store the processed dataset in src/data/processed
    # df.to_csv('./src/data/processed/Processed_Credit_Dataset.csv', index=None)

    # Separate the input features and the target variable
    X = df.drop('price', axis=1)
    y = df['price']

    return X, y
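
The commented-out get_dummies call above hard-codes column names from a credit dataset. A sketch of a more generic variant, using a hypothetical helper encode_object_columns (not in this commit) that one-hot encodes every 'object' column except the target:

import pandas as pd

def encode_object_columns(df: pd.DataFrame, target: str = 'price') -> pd.DataFrame:
    # Hypothetical variant of the commented-out get_dummies call:
    # one-hot encode every 'object' column except the target.
    object_cols = [c for c in df.select_dtypes(include='object').columns if c != target]
    return pd.get_dummies(df, columns=object_cols, drop_first=True)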
Binary file added src/models/logistic_regression.pkl
20 changes: 20 additions & 0 deletions src/models/predict_model.py
@@ -0,0 +1,20 @@
# Evaluate a fitted model with k-fold cross-validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


# Function to evaluate the model on the held-out data
def evaluate_model(model, X_test_scaled, y_test):
    # Set up a 5-fold cross-validation
    kfold = KFold(n_splits=5)
    # cross_val_score re-fits clones of the model on the held-out folds
    # and scores each one (accuracy for classifiers by default)
    scores = cross_val_score(model, X_test_scaled, y_test, cv=kfold)
    # Print the accuracy score for each fold
    print("Accuracy scores:", scores)
    # Print the mean accuracy and standard deviation across folds
    print("Mean accuracy:", scores.mean())
    print("Standard deviation:", scores.std())

    return scores
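
For comparison with the cross-validated evaluation above, a minimal single-split alternative (a sketch, not part of this commit, using a hypothetical evaluate_holdout helper) that scores the already-fitted model once on the held-out test set:

from sklearn.metrics import accuracy_score

def evaluate_holdout(model, X_test_scaled, y_test):
    # Hypothetical alternative to the cross-validated version above:
    # score the already-fitted model once on the held-out test set.
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    print("Hold-out accuracy:", acc)
    return acc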
69 changes: 69 additions & 0 deletions src/models/train_models.py
@@ -0,0 +1,69 @@
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier


import pickle


# Function to train the logistic regression model
def train_logistic_regression(X, y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

    # Scale the data using MinMaxScaler
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the logistic regression model
    model = LogisticRegression().fit(X_train_scaled, y_train)

    # Save the trained model
    with open('./src/models/logistic_regression.pkl', 'wb') as f:
        pickle.dump(model, f)

    return model, X_test_scaled, y_test


def random_forest(X, y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

    # Scale the data using MinMaxScaler
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the random forest model on the scaled data so that it matches
    # the scaled test set returned for evaluation
    rfmodel = RandomForestClassifier(n_estimators=100, min_samples_leaf=5, max_features=None)
    rfmodel.fit(X_train_scaled, y_train)
    return rfmodel, X_test_scaled, y_test


def decision_tree(X, y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

    # Scale the data using MinMaxScaler
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Create an instance of the regressor
    dt = DecisionTreeRegressor(max_depth=3, max_features=10, random_state=457)
    # Train the model on the scaled data
    dtmodel = dt.fit(X_train_scaled, y_train)
    # Make predictions on the test set
    ytest_pred = dtmodel.predict(X_test_scaled)
    # Evaluate the model
    test_mae = mean_absolute_error(y_test, ytest_pred)
    print("Decision Tree test error is:", test_mae)
    # Make predictions on the train set
    ytrain_pred = dtmodel.predict(X_train_scaled)
    # Evaluate the model
    train_mae = mean_absolute_error(y_train, ytrain_pred)
    print("Decision Tree train error is:", train_mae)
    return dtmodel, X_test_scaled, y_test
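
train_logistic_regression pickles the fitted model to ./src/models/logistic_regression.pkl. A minimal sketch for loading it back (the load_logistic_regression helper is illustrative, not part of this commit):

import pickle

def load_logistic_regression(path: str = './src/models/logistic_regression.pkl'):
    # Load the model saved by train_logistic_regression
    with open(path, 'rb') as f:
        return pickle.load(f)

Note that the MinMaxScaler fitted during training is not saved in this commit, so new inputs would need to be rescaled consistently before calling predict on the loaded model.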
