From 0361014cf07ef8372db8f6d3aadf61d38ce22358 Mon Sep 17 00:00:00 2001
From: Jyotika Jayani <158709375+JyotikaJayani-08@users.noreply.github.com>
Date: Thu, 6 Jun 2024 12:26:39 +0530
Subject: [PATCH] Air Quality Prediction using ML
---
AirQualityPrediction.ipynb | 1312 ++++++++++++++++++++++++++++++++++++
1 file changed, 1312 insertions(+)
create mode 100644 AirQualityPrediction.ipynb
diff --git a/AirQualityPrediction.ipynb b/AirQualityPrediction.ipynb
new file mode 100644
index 000000000..15a8558c9
--- /dev/null
+++ b/AirQualityPrediction.ipynb
@@ -0,0 +1,1312 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%pip install -q shap==0.45.1"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "gNIsfIay6Mvj",
+ "outputId": "f960c409-c439-42e8-8d3c-2dcd6918f2e3"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Collecting shap\n",
+ " Downloading shap-0.45.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (540 kB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m540.5/540.5 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from shap) (1.25.2)\n",
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from shap) (1.11.4)\n",
+ "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from shap) (1.2.2)\n",
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from shap) (2.0.3)\n",
+ "Requirement already satisfied: tqdm>=4.27.0 in /usr/local/lib/python3.10/dist-packages (from shap) (4.66.4)\n",
+ "Requirement already satisfied: packaging>20.9 in /usr/local/lib/python3.10/dist-packages (from shap) (24.0)\n",
+ "Collecting slicer==0.0.8 (from shap)\n",
+ " Downloading slicer-0.0.8-py3-none-any.whl (15 kB)\n",
+ "Requirement already satisfied: numba in /usr/local/lib/python3.10/dist-packages (from shap) (0.58.1)\n",
+ "Requirement already satisfied: cloudpickle in /usr/local/lib/python3.10/dist-packages (from shap) (2.2.1)\n",
+ "Requirement already satisfied: llvmlite<0.42,>=0.41.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba->shap) (0.41.1)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->shap) (2.8.2)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->shap) (2023.4)\n",
+ "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->shap) (2024.1)\n",
+ "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->shap) (1.4.2)\n",
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->shap) (3.5.0)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->shap) (1.16.0)\n",
+ "Installing collected packages: slicer, shap\n",
+ "Successfully installed shap-0.45.1 slicer-0.0.8\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Importing the necessary libraries"
+ ],
+ "metadata": {
+ "id": "TtBTWxJaCFR_"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "UBLqLmF_03nU"
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
+ "from sklearn.model_selection import train_test_split, GridSearchCV\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "from sklearn.metrics import mean_squared_error, r2_score\n",
+ "import shap\n",
+ "import joblib"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Loading the dataset"
+ ],
+ "metadata": {
+ "id": "7ScPGfvDCK44"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%pip install -q chardet==5.2.0\n",
+ "import chardet\n",
+ "\n",
+ "DATA_PATH = \"/content/global air pollution dataset.csv\"  # single place to change the CSV location\n",
+ "with open(DATA_PATH, \"rb\") as f:  # sniff the text encoding from the raw bytes before parsing\n",
+ "    encoding = chardet.detect(f.read())[\"encoding\"]\n",
+ "df = pd.read_csv(DATA_PATH, encoding=encoding)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "KAMIzTh06rL9",
+ "outputId": "23a12f8a-8a71-49a7-a010-7c0c27157769"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Requirement already satisfied: chardet in /usr/local/lib/python3.10/dist-packages (5.2.0)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Mean AQI calculation\n"
+ ],
+ "metadata": {
+ "id": "Al7HsKVAUC7d"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Mean of the overall AQI and of each per-pollutant AQI column.\n",
+ "aqi_columns = [\"AQI Value\", \"CO AQI Value\", \"Ozone AQI Value\", \"NO2 AQI Value\", \"PM2.5 AQI Value\"]\n",
+ "mean_aqi = df[aqi_columns].mean()\n",
+ "print(\"Average (Mean) AQI Values:\", mean_aqi, sep=\"\\n\")\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "_KNtvDgKT21E",
+ "outputId": "e30b4863-2b45-4ebf-ec08-1295b39a684d"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Average (Mean) AQI Values:\n",
+ "AQI Value 72.010868\n",
+ "CO AQI Value 1.368367\n",
+ "Ozone AQI Value 35.193709\n",
+ "NO2 AQI Value 3.063334\n",
+ "PM2.5 AQI Value 68.519755\n",
+ "dtype: float64\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Previewing the first rows of the dataset"
+ ],
+ "metadata": {
+ "id": "UNslO0C8CN92"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.head(n=5)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 258
+ },
+ "id": "IOaYHNRk1DvP",
+ "outputId": "968d7829-601a-4aff-9e6a-92c1f64a96b0"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Country City AQI Value AQI Category CO AQI Value \\\n",
+ "0 Russian Federation Praskoveya 51 Moderate 1 \n",
+ "1 Brazil Presidente Dutra 41 Good 1 \n",
+ "2 Italy Priolo Gargallo 66 Moderate 1 \n",
+ "3 Poland Przasnysz 34 Good 1 \n",
+ "4 France Punaauia 22 Good 0 \n",
+ "\n",
+ " CO AQI Category Ozone AQI Value Ozone AQI Category NO2 AQI Value \\\n",
+ "0 Good 36 Good 0 \n",
+ "1 Good 5 Good 1 \n",
+ "2 Good 39 Good 2 \n",
+ "3 Good 34 Good 0 \n",
+ "4 Good 22 Good 0 \n",
+ "\n",
+ " NO2 AQI Category PM2.5 AQI Value PM2.5 AQI Category \n",
+ "0 Good 51 Moderate \n",
+ "1 Good 41 Good \n",
+ "2 Good 66 Moderate \n",
+ "3 Good 20 Good \n",
+ "4 Good 6 Good "
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Country | \n",
+ " City | \n",
+ " AQI Value | \n",
+ " AQI Category | \n",
+ " CO AQI Value | \n",
+ " CO AQI Category | \n",
+ " Ozone AQI Value | \n",
+ " Ozone AQI Category | \n",
+ " NO2 AQI Value | \n",
+ " NO2 AQI Category | \n",
+ " PM2.5 AQI Value | \n",
+ " PM2.5 AQI Category | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Russian Federation | \n",
+ " Praskoveya | \n",
+ " 51 | \n",
+ " Moderate | \n",
+ " 1 | \n",
+ " Good | \n",
+ " 36 | \n",
+ " Good | \n",
+ " 0 | \n",
+ " Good | \n",
+ " 51 | \n",
+ " Moderate | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Brazil | \n",
+ " Presidente Dutra | \n",
+ " 41 | \n",
+ " Good | \n",
+ " 1 | \n",
+ " Good | \n",
+ " 5 | \n",
+ " Good | \n",
+ " 1 | \n",
+ " Good | \n",
+ " 41 | \n",
+ " Good | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Italy | \n",
+ " Priolo Gargallo | \n",
+ " 66 | \n",
+ " Moderate | \n",
+ " 1 | \n",
+ " Good | \n",
+ " 39 | \n",
+ " Good | \n",
+ " 2 | \n",
+ " Good | \n",
+ " 66 | \n",
+ " Moderate | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Poland | \n",
+ " Przasnysz | \n",
+ " 34 | \n",
+ " Good | \n",
+ " 1 | \n",
+ " Good | \n",
+ " 34 | \n",
+ " Good | \n",
+ " 0 | \n",
+ " Good | \n",
+ " 20 | \n",
+ " Good | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " France | \n",
+ " Punaauia | \n",
+ " 22 | \n",
+ " Good | \n",
+ " 0 | \n",
+ " Good | \n",
+ " 22 | \n",
+ " Good | \n",
+ " 0 | \n",
+ " Good | \n",
+ " 6 | \n",
+ " Good | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "df",
+ "summary": "{\n \"name\": \"df\",\n \"rows\": 23463,\n \"fields\": [\n {\n \"column\": \"Country\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 175,\n \"samples\": [\n \"Comoros\",\n \"Turkmenistan\",\n \"Honduras\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"City\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 23462,\n \"samples\": [\n \"Fonte Boa\",\n \"Vidin\",\n \"Kunda\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AQI Value\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 56,\n \"min\": 6,\n \"max\": 500,\n \"num_unique_values\": 347,\n \"samples\": [\n 250,\n 222,\n 348\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AQI Category\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Moderate\",\n \"Good\",\n \"Hazardous\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CO AQI Value\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n \"max\": 133,\n \"num_unique_values\": 34,\n \"samples\": [\n 27,\n 13,\n 67\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"CO AQI Category\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Good\",\n \"Unhealthy for Sensitive Groups\",\n \"Moderate\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Ozone AQI Value\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 28,\n \"min\": 0,\n \"max\": 235,\n \"num_unique_values\": 213,\n \"samples\": [\n 89,\n 189,\n 138\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Ozone AQI Category\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Moderate\",\n \"Very Unhealthy\",\n \"Unhealthy for 
Sensitive Groups\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"NO2 AQI Value\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5,\n \"min\": 0,\n \"max\": 91,\n \"num_unique_values\": 59,\n \"samples\": [\n 0,\n 7,\n 44\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"NO2 AQI Category\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Moderate\",\n \"Good\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PM2.5 AQI Value\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 54,\n \"min\": 0,\n \"max\": 500,\n \"num_unique_values\": 383,\n \"samples\": [\n 136,\n 239\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PM2.5 AQI Category\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Moderate\",\n \"Good\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 4
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Median AQI calculation\n"
+ ],
+ "metadata": {
+ "id": "8voSugX_Tuuk"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Median of the overall AQI and of each per-pollutant AQI column.\n",
+ "# The median is reported next to the mean because it is less affected by extreme values.\n",
+ "aqi_columns = [\n",
+ "    \"AQI Value\", \"CO AQI Value\", \"Ozone AQI Value\", \"NO2 AQI Value\", \"PM2.5 AQI Value\",\n",
+ "]\n",
+ "median_aqi = df[aqi_columns].median()\n",
+ "print(\"Median AQI Values:\", median_aqi, sep=\"\\n\")\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "W_QJ6lG_TecT",
+ "outputId": "04baeb08-b42f-4d89-afad-b411fa4ca248"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Median AQI Values:\n",
+ "AQI Value 55.0\n",
+ "CO AQI Value 1.0\n",
+ "Ozone AQI Value 31.0\n",
+ "NO2 AQI Value 1.0\n",
+ "PM2.5 AQI Value 54.0\n",
+ "dtype: float64\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Checking null values and dropping them"
+ ],
+ "metadata": {
+ "id": "-LYjZ-yFCSbG"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(df.isna().sum())  # missing-value count per column, checked before dropping\n",
+ "df = df.dropna()  # keep only rows that have no missing values"
+ ],
+ "metadata": {
+ "id": "GCVyxwIi1HVZ"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Box plot of AQI values per AQI category; showmeans also marks each category's mean.\n",
+ "fig, ax = plt.subplots()\n",
+ "sns.boxplot(data=df, x=\"AQI Category\", y=\"AQI Value\", showmeans=True, ax=ax)\n",
+ "\n",
+ "ax.set_xlabel(\"AQI Category\")\n",
+ "ax.set_ylabel(\"Air Quality Index (AQI)\")\n",
+ "ax.set_title(\"Distribution of AQI Across Categories\")\n",
+ "\n",
+ "# Slant the category names so the longer labels do not overlap.\n",
+ "plt.xticks(rotation=45, ha=\"right\")\n",
+ "fig.tight_layout()\n",
+ "plt.show()\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 487
+ },
+ "id": "NN7mL9U3R5jQ",
+ "outputId": "653ee3dd-f870-4486-d1a6-29deca0b151e"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "