From 2d25d8ca5d449a1cf74a9119addd80310ceb419c Mon Sep 17 00:00:00 2001 From: akashlogics Date: Sun, 2 Jun 2024 14:54:23 +0530 Subject: [PATCH] feat: added k-mean Cluster --- k_mean_algorithum/intro_of_k_mean.md | 62 +++++++++++++++++++++++ k_mean_algorithum/k-mean-Clustering.py | 70 ++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100644 k_mean_algorithum/intro_of_k_mean.md create mode 100644 k_mean_algorithum/k-mean-Clustering.py diff --git a/k_mean_algorithum/intro_of_k_mean.md b/k_mean_algorithum/intro_of_k_mean.md new file mode 100644 index 000000000..dfa69e88a --- /dev/null +++ b/k_mean_algorithum/intro_of_k_mean.md @@ -0,0 +1,62 @@ +The K-Means algorithm is a popular unsupervised learning technique used for data clustering. In this example, we will use the Sckit-learn library to implement the K-Means algorithm on a dataset. + +Here's an introduction to using the K-Means algorithm with Sckit-learn: + +# Import necessary libraries +from sklearn.cluster import KMeans +import numpy as np + +# Generate a sample dataset +# Replace this with your own dataset +X = np.array([[1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6], [9, 11]]) + +# Initialize the KMeans algorithm with the desired number of clusters (k) +k = 2 +kmeans = KMeans(n_clusters=k) + +# Fit the algorithm to the dataset +kmeans.fit(X) + +# Get the cluster labels for each data point +labels = kmeans.labels_ + +# Get the coordinates of the cluster centers +centers = kmeans.cluster_centers_ + +# Print the results +print("Cluster Labels:", labels) +print("Cluster Centers:", centers) + +You can replace the sample dataset with your own dataset to perform clustering on your data. + +Certainly! The 'k' in the K-Means algorithm stands for the number of clusters you want to divide your data into. It is a hyperparameter that you need to specify before running the algorithm. + +The K-Means algorithm works by iteratively assigning each data point to one of the 'k' clusters based on the distance between the data point and the cluster centers. The cluster centers are initially chosen randomly. The algorithm then updates the cluster centers by taking the mean of all the data points assigned to each cluster. This process is repeated until the cluster assignments and cluster centers converge to stable values. + +The choice of 'k' can significantly impact the quality of the clustering results. If 'k' is too small, the algorithm may group data points together that are not truly similar, leading to overfitting. On the other hand, if 'k' is too large, the algorithm may create too many small clusters, resulting in underfitting. Therefore, it is essential to choose an appropriate value for 'k' based on the characteristics of your dataset and the desired level of granularity in the clustering results. + +In practice, there are several methods to determine the optimal value for 'k', such as the elbow method, silhouette analysis, or gap statistics. These methods help to evaluate the quality of the clustering results and guide the selection of an appropriate 'k' value. + +Absolutely! Choosing an appropriate value for 'k' in the K-Means algorithm can be done using various methods, and one popular approach is the elbow method. Here's a step-by-step guide on how to use the elbow method to determine the optimal 'k': + +1. +Normalize the data: If your dataset contains features with different scales, it is important to normalize the data before applying the K-Means algorithm. This ensures that all features are treated equally in the distance calculations. +2. +Compute the sum of squared distances (SSD) for different values of 'k': For each value of 'k' from 1 to a maximum value (e.g., the square root of the number of data points), perform the following steps: + +a. Initialize the K-Means algorithm with 'k' clusters. + +b. Fit the algorithm to the dataset. + +c. Compute the sum of squared distances between each data point and its assigned cluster center. + +d. Record the total SSD for this value of 'k'. +3. +Plot the SSD values: Create a plot with 'k' on the x-axis and the corresponding SSD values on the y-axis. This plot is called the elbow plot. +4. +Identify the elbow point: Look for a point in the plot where the SSD starts to decrease more slowly. This point is often referred to as the elbow point. The 'k' value corresponding to the elbow point is considered the optimal number of clusters. +5. +Evaluate the clustering results: After determining the optimal 'k', you can use the K-Means algorithm to cluster your data. Evaluate the quality of the clustering results using metrics such as silhouette score or within-cluster sum of squares (WCSS). If necessary, adjust the 'k' value based on the evaluation results. + + +Remember that the elbow method is just one of several methods to determine the optimal 'k' value. Other methods, such as silhouette analysis or gap statistics, can also be used. The choice of method depends on your specific dataset and the desired level of granularity in the clustering results. \ No newline at end of file diff --git a/k_mean_algorithum/k-mean-Clustering.py b/k_mean_algorithum/k-mean-Clustering.py new file mode 100644 index 000000000..2e22907c8 --- /dev/null +++ b/k_mean_algorithum/k-mean-Clustering.py @@ -0,0 +1,70 @@ +#pip install scikit-learn +# Import necessary libraries +from sklearn.cluster import KMeans +from sklearn.preprocessing import StandardScaler +from sklearn.datasets import load_iris +import matplotlib.pyplot as plt + +#Load the dataset +# Replace this with your own dataset +iris = load_iris() +X = iris.data +y = iris.target + +# Preprocess the dataset +# Standardize the features +scaler = StandardScaler() +X_scaled = scaler.fit_transform(X) + +# Choose the number of clusters (k) +# In this example, we use the elbow method to determine the optimal k value +wcss = [] +for i in range(1, 11): + kmeans = KMeans(n_clusters=i, random_state=0) + kmeans.fit(X_scaled) + wcss.append(kmeans.inertia_) + +plt.plot(range(1, 11), wcss) +plt.title('Elbow Method') +plt.xlabel('Number of clusters') +plt.ylabel('WCSS') +plt.show() + +# Based on the elbow method, choose the optimal k value +# In this example, we choose k = 3 +k = 3 + +# Initialize the KMeans algorithm +kmeans = KMeans(n_clusters=k, random_state=0) + +# Fit the algorithm to the dataset +kmeans.fit(X_scaled) + +# Get the cluster labels for each data point +labels = kmeans.labels_ + +# Visualize the clustering results +plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels, s=50, cmap='viridis') +plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='black', s=200, alpha=0.5) +plt.title('Clustered Data') +plt.xlabel('Feature 1') +plt.ylabel('Feature 2') +plt.show() + +# Print the cluster centers +print(kmeans.cluster_centers_) + +# Print the cluster labels for each data point +print(labels) + +# Print the number of data points in each cluster +for i in range(k): + print(f'Number of data points in cluster {i}: {sum(labels == i)}') + +# Print the total number of data points +print(f'Total number of data points: {len(labels)}') + +# Print the total number of clusters +print(f'Total number of clusters: {k}') + +