-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit aa362c0
Showing
12 changed files
with
2,006 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# Created by RayLi | ||
from src.data.make_dataset import load_and_preprocess_data | ||
from src.feature_engineering.build_features import create_dummy_vars | ||
from src.models.train_models import train_logistic_regression | ||
from src.models.train_models import random_forest | ||
from src.models.predict_model import evaluate_model | ||
|
||
if __name__ == "__main__":
    # Read the raw CSV and split it into feature columns and the target column.
    df = load_and_preprocess_data("src/data/raw/final.csv")
    features, target = create_dummy_vars(df)

    # Logistic regression: train, then cross-validate on the held-out split.
    logit_model, logit_X_test, logit_y_test = train_logistic_regression(features, target)
    accuracy = evaluate_model(logit_model, logit_X_test, logit_y_test)
    # evaluate_model returns the per-fold score array, so that is what is printed.
    print(f"Logistic Regression Accuracy: {accuracy}")

    # Random forest: same procedure on the same features and target.
    forest_model, forest_X_test, forest_y_test = random_forest(features, target)
    accuracy = evaluate_model(forest_model, forest_X_test, forest_y_test)
    print(f"Random Forest Accuracy: {accuracy}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# This is a simple comparison of logistic regression and random forest models based on their accuracy. |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
import pandas as pd | ||
|
||
def load_and_preprocess_data(data_path):
    """Read the CSV at ``data_path`` into a pandas DataFrame.

    Despite the name, no preprocessing or missing-value imputation is
    performed here; the frame is returned exactly as ``pd.read_csv`` parsed it.

    Args:
        data_path: Anything ``pd.read_csv`` accepts (path or file-like object).

    Returns:
        The loaded DataFrame.
    """
    return pd.read_csv(data_path)
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import pandas as pd | ||
|
||
# create dummy features | ||
def create_dummy_vars(df):
    """Split ``df`` into input features and the ``'price'`` target.

    No dummy/one-hot encoding is applied and nothing is written to disk;
    the function only separates the target column from the rest.

    Args:
        df: DataFrame that contains a ``'price'`` column.

    Returns:
        Tuple ``(X, y)``: ``X`` is a copy of ``df`` without ``'price'`` and
        ``y`` is the ``'price'`` column. ``df`` itself is not modified.
    """
    target_column = 'price'
    y = df[target_column]
    X = df.drop(columns=target_column)
    return X, y
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Import accuracy score | ||
from sklearn.metrics import accuracy_score | ||
from sklearn.model_selection import KFold | ||
from sklearn.model_selection import StratifiedKFold | ||
from sklearn.model_selection import cross_val_score | ||
|
||
|
||
# # Function to predict and evaluate | ||
def evaluate_model(model, X_test_scaled, y_test):
    """Cross-validate ``model`` on the given data with 5 unshuffled folds.

    The scores come from ``cross_val_score`` with its default scoring, i.e.
    the estimator's own ``score`` method.

    NOTE(review): ``cross_val_score`` fits fresh clones of ``model`` on folds
    of the data passed in, so the already-fitted state of ``model`` is not
    what gets scored -- confirm this is the intended evaluation.

    Args:
        model: scikit-learn estimator.
        X_test_scaled: Feature matrix to cross-validate on.
        y_test: Targets aligned with ``X_test_scaled``.

    Returns:
        Array with one score per fold.
    """
    fold_scores = cross_val_score(model, X_test_scaled, y_test, cv=KFold(n_splits=5))

    # Report the per-fold scores followed by their mean and spread.
    print("Accuracy scores:", fold_scores)
    print("Mean accuracy:", fold_scores.mean())
    print("Standard deviation:", fold_scores.std())

    return fold_scores
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
from sklearn.preprocessing import MinMaxScaler | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.linear_model import LogisticRegression | ||
from sklearn.tree import DecisionTreeRegressor | ||
from sklearn.metrics import mean_absolute_error | ||
from sklearn.ensemble import RandomForestClassifier | ||
|
||
|
||
import pickle | ||
|
||
|
||
# Function to train the model | ||
def train_logistic_regression(X, y):
    """Train a logistic regression on min-max scaled features and pickle it.

    The data is split 80/20 with ``random_state=1234``. The scaler is fitted
    on the training portion only and then applied to both portions.

    Args:
        X: Feature table accepted by ``train_test_split``.
        y: Target values aligned with ``X``.

    Returns:
        Tuple ``(model, X_test_scaled, y_test)``.

    Side effects:
        Writes the fitted model to ``./src/models/logistic_regression.pkl``
        (relative to the current working directory).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1234
    )

    # Learn the min/max from the training rows only, then reuse it for the test rows.
    scaler = MinMaxScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LogisticRegression()
    model.fit(X_train_scaled, y_train)

    # Persist the fitted estimator.
    with open('./src/models/logistic_regression.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    return model, X_test_scaled, y_test
|
||
def random_forest(X,y):
    """Train a random forest classifier on min-max scaled features.

    The data is split 80/20 with ``random_state=123``. A ``MinMaxScaler`` is
    fitted on the training portion only, and the forest is trained on the
    scaled training matrix so that it is consistent with the scaled test
    matrix this function returns.

    Args:
        X: Feature table accepted by ``train_test_split``.
        y: Target values aligned with ``X``.

    Returns:
        Tuple ``(rfmodel, X_test_scaled, y_test)``.
    """
    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

    # Scale the data using MinMaxScaler (statistics come from the training rows only)
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the random forest model on the *scaled* training data. It was
    # previously fitted on the raw X_train while callers received
    # X_test_scaled, so model and evaluation data did not share a feature space.
    rfmodel = RandomForestClassifier(n_estimators=100, min_samples_leaf=5, max_features=None)
    rfmodel.fit(X_train_scaled, y_train)
    return rfmodel, X_test_scaled, y_test
|
||
def decision_tree(X,y):
    """Train a depth-3 decision tree regressor and print its train/test MAE.

    The data is split 80/20 with ``random_state=1234``. A ``MinMaxScaler`` is
    fitted on the training portion only. The tree is trained and evaluated on
    the scaled matrices so that it is consistent with the scaled test matrix
    this function returns.

    Args:
        X: Feature table accepted by ``train_test_split``.
        y: Target values aligned with ``X``.

    Returns:
        Tuple ``(dtmodel, X_test_scaled, y_test)``.
    """
    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

    # Scale the data using MinMaxScaler (statistics come from the training rows only)
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit on the scaled training data; the tree was previously fitted on the
    # raw X_train while callers received X_test_scaled.
    dt = DecisionTreeRegressor(max_depth=3, max_features=10, random_state=457)
    dtmodel = dt.fit(X_train_scaled, y_train)

    # Mean absolute error on the held-out set (arguments are y_true, y_pred)
    ytest_pred = dtmodel.predict(X_test_scaled)
    test_mae = mean_absolute_error(y_test, ytest_pred)
    print("Decision Tree test error is: ",test_mae)

    # Mean absolute error on the training set, for comparison
    ytrain_pred = dtmodel.predict(X_train_scaled)
    train_mae = mean_absolute_error(y_train, ytrain_pred)
    print("Decision Tree Train error is: ",train_mae)
    return dtmodel, X_test_scaled, y_test