
Commit

first commit
ray-ruizhen-li committed Aug 2, 2024
0 parents commit aa362c0
Showing 12 changed files with 2,006 additions and 0 deletions.
28 changes: 28 additions & 0 deletions main.py
@@ -0,0 +1,28 @@
# Created by RayLi
from src.data.make_dataset import load_and_preprocess_data
from src.feature_engineering.build_features import create_dummy_vars
from src.models.train_models import train_logistic_regression
from src.models.train_models import random_forest
from src.models.predict_model import evaluate_model

if __name__ == "__main__":
    # Load and preprocess the data
    data_path = "src/data/raw/final.csv"
    df = load_and_preprocess_data(data_path)

    # Create dummy variables and separate features and target
    X, y = create_dummy_vars(df)

    # Train the logistic regression model
    model, X_test_scaled, y_test = train_logistic_regression(X, y)

    # Evaluate the model with 5-fold cross-validation on the held-out data
    scores = evaluate_model(model, X_test_scaled, y_test)
    print(f"Logistic Regression mean accuracy: {scores.mean():.3f}")

    # Train the random forest model
    model, X_test_scaled, y_test = random_forest(X, y)

    # Evaluate the model
    scores = evaluate_model(model, X_test_scaled, y_test)
    print(f"Random Forest mean accuracy: {scores.mean():.3f}")
1 change: 1 addition & 0 deletions readme.txt
@@ -0,0 +1 @@
# A simple comparison of logistic regression and random forest models, evaluated on accuracy.
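
Below is a minimal sketch of how the two models can be compared using the functions added in this commit; it mirrors main.py and assumes the script is run from the repository root so the relative data path resolves.

# Sketch mirroring main.py: train both models on the same features and
# compare their cross-validated accuracy on the held-out data.
from src.data.make_dataset import load_and_preprocess_data
from src.feature_engineering.build_features import create_dummy_vars
from src.models.train_models import train_logistic_regression, random_forest
from src.models.predict_model import evaluate_model

df = load_and_preprocess_data("src/data/raw/final.csv")
X, y = create_dummy_vars(df)
for name, trainer in [("Logistic Regression", train_logistic_regression),
                      ("Random Forest", random_forest)]:
    model, X_test_scaled, y_test = trainer(X, y)
    scores = evaluate_model(model, X_test_scaled, y_test)
    print(name, "mean accuracy:", scores.mean())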
Binary file added src/data/__pycache__/make_dataset.cpython-311.pyc
9 changes: 9 additions & 0 deletions src/data/make_dataset.py
@@ -0,0 +1,9 @@
import pandas as pd


def load_and_preprocess_data(data_path):
    # Load the raw dataset from the given CSV path
    df = pd.read_csv(data_path)
    # TODO: impute any missing values in the features before modelling
    return df
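
The TODO above leaves imputation unimplemented. A minimal sketch of one possible approach, using a hypothetical helper impute_missing_values that is not part of this commit (numeric columns filled with their median, other columns with their mode, assuming each column has at least one non-missing value):

import pandas as pd

def impute_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical helper, not in the original commit: fill numeric NaNs with
    # the column median and non-numeric NaNs with the column mode.
    df = df.copy()
    for col in df.columns:
        if df[col].isna().any():
            if pd.api.types.is_numeric_dtype(df[col]):
                df[col] = df[col].fillna(df[col].median())
            else:
                df[col] = df[col].fillna(df[col].mode().iloc[0])
    return df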
1,864 changes: 1,864 additions & 0 deletions src/data/raw/final.csv


15 changes: 15 additions & 0 deletions src/feature_engineering/build_features.py
@@ -0,0 +1,15 @@
import pandas as pd


# Create dummy features and split into input features and target
def create_dummy_vars(df):
    # Dummy-variable creation for categorical ('object') columns is left
    # commented out here; the raw dataset is used as-is.
    # df = pd.get_dummies(df, columns=['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area'])

    # Optionally store the processed dataset in src/data/processed
    # df.to_csv('./src/data/processed/Processed_Credit_Dataset.csv', index=None)

    # Separate the input features and the target variable
    X = df.drop('price', axis=1)
    y = df['price']

    return X, y
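
The commented-out get_dummies call above hard-codes column names from a credit dataset. A sketch of a more generic variant, using a hypothetical helper encode_object_columns (not in this commit) that one-hot encodes every 'object' column except the target:

import pandas as pd

def encode_object_columns(df: pd.DataFrame, target: str = 'price') -> pd.DataFrame:
    # Hypothetical variant of the commented-out get_dummies call:
    # one-hot encode every 'object' column except the target.
    object_cols = [c for c in df.select_dtypes(include='object').columns if c != target]
    return pd.get_dummies(df, columns=object_cols, drop_first=True)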
Binary file added src/models/logistic_regression.pkl
20 changes: 20 additions & 0 deletions src/models/predict_model.py
@@ -0,0 +1,20 @@
# Evaluate a fitted model with k-fold cross-validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


# Function to evaluate the model on the held-out data
def evaluate_model(model, X_test_scaled, y_test):
    # Set up a 5-fold cross-validation
    kfold = KFold(n_splits=5)
    # cross_val_score re-fits clones of the model on the held-out folds
    # and scores each one (accuracy for classifiers by default)
    scores = cross_val_score(model, X_test_scaled, y_test, cv=kfold)
    # Print the accuracy score for each fold
    print("Accuracy scores:", scores)
    # Print the mean accuracy and standard deviation across folds
    print("Mean accuracy:", scores.mean())
    print("Standard deviation:", scores.std())

    return scores
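
For comparison with the cross-validated evaluation above, a minimal single-split alternative (a sketch, not part of this commit, using a hypothetical evaluate_holdout helper) that scores the already-fitted model once on the held-out test set:

from sklearn.metrics import accuracy_score

def evaluate_holdout(model, X_test_scaled, y_test):
    # Hypothetical alternative to the cross-validated version above:
    # score the already-fitted model once on the held-out test set.
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    print("Hold-out accuracy:", acc)
    return acc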
69 changes: 69 additions & 0 deletions src/models/train_models.py
@@ -0,0 +1,69 @@
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier


import pickle


# Function to train the logistic regression model
def train_logistic_regression(X, y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

    # Scale the data using MinMaxScaler
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the logistic regression model
    model = LogisticRegression().fit(X_train_scaled, y_train)

    # Save the trained model
    with open('./src/models/logistic_regression.pkl', 'wb') as f:
        pickle.dump(model, f)

    return model, X_test_scaled, y_test


def random_forest(X, y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

    # Scale the data using MinMaxScaler
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the random forest model on the scaled data so that it matches
    # the scaled test set returned for evaluation
    rfmodel = RandomForestClassifier(n_estimators=100, min_samples_leaf=5, max_features=None)
    rfmodel.fit(X_train_scaled, y_train)
    return rfmodel, X_test_scaled, y_test


def decision_tree(X, y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

    # Scale the data using MinMaxScaler
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Create an instance of the regressor
    dt = DecisionTreeRegressor(max_depth=3, max_features=10, random_state=457)
    # Train the model on the scaled data
    dtmodel = dt.fit(X_train_scaled, y_train)
    # Make predictions on the test set
    ytest_pred = dtmodel.predict(X_test_scaled)
    # Evaluate the model
    test_mae = mean_absolute_error(y_test, ytest_pred)
    print("Decision Tree test error is:", test_mae)
    # Make predictions on the train set
    ytrain_pred = dtmodel.predict(X_train_scaled)
    # Evaluate the model
    train_mae = mean_absolute_error(y_train, ytrain_pred)
    print("Decision Tree train error is:", train_mae)
    return dtmodel, X_test_scaled, y_test
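
train_logistic_regression pickles the fitted model to ./src/models/logistic_regression.pkl. A minimal sketch for loading it back (the load_logistic_regression helper is illustrative, not part of this commit):

import pickle

def load_logistic_regression(path: str = './src/models/logistic_regression.pkl'):
    # Load the model saved by train_logistic_regression
    with open(path, 'rb') as f:
        return pickle.load(f)

Note that the MinMaxScaler fitted during training is not saved in this commit, so new inputs would need to be rescaled consistently before calling predict on the loaded model.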
