Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance Outlier Detection Test Coverage and Edge Case Handling #474

Merged
merged 3 commits into from
Jan 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DIRECTORY.md
Original file line number Diff line number Diff line change
@@ -1 +1 @@
/home/runner/work/_temp/fcbe3165-95cc-4878-8e84-ef18a16fbf3a.sh: line 1: scripts/build_directory_md.py: Permission denied
/home/runner/work/_temp/8260a9d1-eff9-42f4-b1a3-1bd90558f43e.sh: line 1: scripts/build_directory_md.py: Permission denied
56 changes: 52 additions & 4 deletions pysnippets/Data_preprocessing/data_cleaning.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,61 @@
import pandas as pd

def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return the DataFrame with fully duplicated rows collapsed.

    An empty input is still processed; a warning line is printed to stdout
    before the de-duplication call.

    Args:
    - df (pd.DataFrame): The input DataFrame. It is not modified.

    Returns:
    - pd.DataFrame: The result of ``df.drop_duplicates()``.
    """
    is_empty = df.empty
    if is_empty:
        print("Warning: The DataFrame is empty.")

    deduplicated = df.drop_duplicates()
    return deduplicated

def replace_missing_with_mean(df: pd.DataFrame, column: str, default_value: float = None) -> pd.DataFrame:
    """
    Replace missing values in a specified column with the column's mean or a provided default value.

    Args:
    - df (pd.DataFrame): The input DataFrame. The column is updated on this object.
    - column (str): The column name where missing values need to be replaced.
    - default_value (float, optional): If provided (not None), missing values are
      replaced with this value instead of the column mean.

    Returns:
    - pd.DataFrame: The same DataFrame object that was passed in, with the
      missing values in ``column`` replaced.

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    # The mean is only computed when no explicit default was supplied.
    fill_value = default_value if default_value is not None else df[column].mean()

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: the in-place call works on an intermediate object
    # and leaves `df` untouched when pandas Copy-on-Write is active.
    df[column] = df[column].fillna(fill_value)
    return df

def standardize_text(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Standardize the text in a specified column by converting it to lowercase and stripping whitespace.

    Args:
    - df (pd.DataFrame): The input DataFrame. The column is overwritten on this object.
    - column (str): The column to standardize.

    Returns:
    - pd.DataFrame: The same DataFrame object that was passed in, with the
      standardized text stored in ``column``.

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    # Lowercase first, then trim leading/trailing whitespace.
    df[column] = df[column].str.lower().str.strip()
    return df
58 changes: 57 additions & 1 deletion pysnippets/Data_preprocessing/data_transformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,69 @@
import numpy as np

def log_transform(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Apply a logarithmic transformation (log1p) to a specified column.
    The transformation is log(1 + x), so zero maps to zero and is accepted
    alongside positive values.

    Args:
    - df (pd.DataFrame): The input DataFrame. The column is overwritten on this object.
    - column (str): The column name to transform.

    Returns:
    - pd.DataFrame: The same DataFrame object that was passed in, with the
      transformed values stored in ``column``.

    Raises:
    - ValueError: If the column does not exist in the DataFrame or contains negative values.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    # log1p(0) == 0 is well defined, so only negative values are rejected.
    if (df[column] < 0).any():
        raise ValueError(f"Log transformation cannot be applied to negative values in column '{column}'.")

    df[column] = np.log1p(df[column])
    return df

def power_transform(df: pd.DataFrame, column: str, power: float = 2.0) -> pd.DataFrame:
    """
    Raise every value of one column to a given power.

    Args:
    - df (pd.DataFrame): The input DataFrame. The column is overwritten on this object.
    - column (str): The column name to transform.
    - power (float): The exponent applied to each value (default is 2.0).

    Returns:
    - pd.DataFrame: The DataFrame that was passed in, with ``column`` replaced
      by the powered values.

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    column_exists = column in df.columns
    if not column_exists:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    powered = np.power(df[column], power)
    df[column] = powered
    return df

def binarize(df: pd.DataFrame, column: str, threshold: float) -> pd.DataFrame:
    """
    Binarize a specified column based on a threshold. Values greater than the threshold
    are set to 1, and values less than or equal to the threshold are set to 0.

    Args:
    - df (pd.DataFrame): The input DataFrame. The column is overwritten on this object.
    - column (str): The column name to binarize.
    - threshold (float): The threshold for binarization.

    Returns:
    - pd.DataFrame: The same DataFrame object that was passed in, with ``column``
      replaced by integer 0/1 flags.

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    # Strict comparison: a value equal to the threshold maps to 0.
    df[column] = (df[column] > threshold).astype(int)
    return df
41 changes: 38 additions & 3 deletions pysnippets/Data_preprocessing/encoding.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,48 @@
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

def one_hot_encode(df: pd.DataFrame, column: str, drop_first: bool = True) -> pd.DataFrame:
    """
    Perform one-hot encoding on a specified column.

    Args:
    - df (pd.DataFrame): The input DataFrame.
    - column (str): The column name to encode.
    - drop_first (bool): Whether to drop the first category to avoid multicollinearity (default is True).

    Returns:
    - pd.DataFrame: A new DataFrame in which ``column`` is replaced by its
      one-hot encoded column(s), keeping the row index of ``df``.

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    drop = 'first' if drop_first else None
    try:
        # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
        encoder = OneHotEncoder(sparse_output=False, drop=drop)
    except TypeError:
        # Older scikit-learn releases only know the `sparse` keyword.
        encoder = OneHotEncoder(sparse=False, drop=drop)

    encoded = encoder.fit_transform(df[[column]])

    # Reuse the caller's index: pd.concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex here would misalign rows for any non-default index.
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out([column]),
        index=df.index,
    )

    # Concatenate the original dataframe without the encoded column and the encoded DataFrame
    return pd.concat([df.drop(column, axis=1), encoded_df], axis=1)

def label_encode(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Perform label encoding on a specified column (converting categories to integer labels).

    Args:
    - df (pd.DataFrame): The input DataFrame. The column is overwritten on this object.
    - column (str): The column name to encode.

    Returns:
    - pd.DataFrame: The same DataFrame object that was passed in, with ``column``
      replaced by the encoder's labels.

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    # A fresh encoder is fitted on every call; the fitted encoder is not returned.
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    return df
40 changes: 39 additions & 1 deletion pysnippets/Data_preprocessing/feature_scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,49 @@
import pandas as pd

def standardize_features(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Standardize the given columns with a freshly fitted ``StandardScaler``.

    Args:
    - df (pd.DataFrame): The input DataFrame. The listed columns are overwritten on this object.
    - columns (list): List of column names to standardize.

    Returns:
    - pd.DataFrame: The DataFrame that was passed in, with the listed columns
      replaced by their scaled values.

    Raises:
    - ValueError: If any of the columns do not exist in the DataFrame.
    """
    # Collect every requested name that the frame does not have.
    absent = [name for name in columns if name not in df.columns]
    if absent:
        raise ValueError(f"Columns {absent} do not exist in the DataFrame.")

    # Fit and transform in one step, then write the result back.
    scaled_values = StandardScaler().fit_transform(df[columns])
    df[columns] = scaled_values
    return df

def min_max_scale_features(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Apply Min-Max scaling (scale to a [0, 1] range) for specified columns.

    Args:
    - df (pd.DataFrame): The input DataFrame. The listed columns are overwritten on this object.
    - columns (list): List of column names to scale.

    Returns:
    - pd.DataFrame: The same DataFrame object that was passed in, with the
      listed columns replaced by their scaled values.

    Raises:
    - ValueError: If any of the columns do not exist in the DataFrame.
    """
    # Check if columns exist in the DataFrame
    missing_cols = [col for col in columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Columns {missing_cols} do not exist in the DataFrame.")

    # Apply MinMaxScaler
    scaler = MinMaxScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df
47 changes: 46 additions & 1 deletion pysnippets/Data_preprocessing/missing_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,59 @@
from sklearn.impute import SimpleImputer

def impute_numeric_with_mean(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Fill missing values in the given columns using ``SimpleImputer(strategy='mean')``.

    Args:
    - df (pd.DataFrame): The input DataFrame. The listed columns are overwritten on this object.
    - columns (list): List of column names to impute.

    Returns:
    - pd.DataFrame: The DataFrame that was passed in, with the listed columns
      replaced by the imputer's output.

    Raises:
    - ValueError: If any of the columns do not exist in the DataFrame.
    """
    # Validate all requested names up front so the error lists every missing one.
    absent = [name for name in columns if name not in df.columns]
    if absent:
        raise ValueError(f"Columns {absent} do not exist in the DataFrame.")

    mean_imputer = SimpleImputer(strategy='mean')
    filled = mean_imputer.fit_transform(df[columns])
    df[columns] = filled
    return df

def impute_categorical_with_mode(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Fill missing values in the given columns using ``SimpleImputer(strategy='most_frequent')``.

    Args:
    - df (pd.DataFrame): The input DataFrame. The listed columns are overwritten on this object.
    - columns (list): List of column names to impute.

    Returns:
    - pd.DataFrame: The DataFrame that was passed in, with the listed columns
      replaced by the imputer's output.

    Raises:
    - ValueError: If any of the columns do not exist in the DataFrame.
    """
    # Validate all requested names up front so the error lists every missing one.
    absent = [name for name in columns if name not in df.columns]
    if absent:
        raise ValueError(f"Columns {absent} do not exist in the DataFrame.")

    mode_imputer = SimpleImputer(strategy='most_frequent')
    filled = mode_imputer.fit_transform(df[columns])
    df[columns] = filled
    return df

def drop_missing(df: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
    """
    Drop rows that do not contain enough non-null values.

    The minimum number of non-null values a row needs is
    ``int(number_of_columns * threshold)``; the product is truncated toward
    zero, so a fractional requirement is rounded down.

    Args:
    - df (pd.DataFrame): The input DataFrame. It is not modified.
    - threshold (float): Proportion of non-null values required in a row to keep it. Default is 0.5.

    Returns:
    - pd.DataFrame: A new DataFrame with rows dropped based on the missing value threshold.
    """
    # Convert the proportion into the absolute count expected by dropna(thresh=...).
    min_non_null = int(df.shape[1] * threshold)
    return df.dropna(thresh=min_non_null)
39 changes: 38 additions & 1 deletion pysnippets/Data_preprocessing/outlier_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,52 @@
import numpy as np

def remove_outliers_iqr(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Keep only the rows whose value in ``column`` lies within the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where Q1 and Q3 are the 0.25 and 0.75 quantiles of the column.

    Args:
    - df (pd.DataFrame): The input DataFrame.
    - column (str): The column name to remove outliers from.

    Returns:
    - pd.DataFrame: The rows of ``df`` selected by the fence comparison.

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread

    # A row is kept only when both comparisons are true.
    at_or_above_low = values >= fence_low
    at_or_below_high = values <= fence_high
    return df[at_or_above_low & at_or_below_high]

def remove_outliers_zscore(df: pd.DataFrame, column: str, threshold: float = 3.0) -> pd.DataFrame:
    """
    Remove outliers from a specified column using the Z-Score method.

    Args:
    - df (pd.DataFrame): The input DataFrame. It is not modified.
    - column (str): The column name to remove outliers from.
    - threshold (float): The Z-score threshold to identify outliers (default is 3.0).

    Returns:
    - pd.DataFrame: A new DataFrame with outliers removed based on the Z-Score method.
      When the column's standard deviation is zero or undefined (constant
      column, fewer than two values), no row can be classified as an outlier
      and a copy of the input is returned.

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    mean = df[column].mean()
    std = df[column].std()

    # Dividing by a zero/NaN standard deviation yields NaN z-scores, and every
    # NaN comparison below is False -- that would silently drop all rows.
    if pd.isna(std) or std == 0:
        return df.copy()

    z_scores = (df[column] - mean) / std

    # Remove rows where the absolute Z-Score exceeds the threshold
    return df[np.abs(z_scores) <= threshold]
Loading
Loading