Skip to content

Commit

Permalink
Examine the impact of ethnicity on participation rates
Browse files Browse the repository at this point in the history
  • Loading branch information
anushkasaxena07 committed Jun 30, 2024
1 parent f91845c commit ac0b8e4
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 8 deletions.
72 changes: 68 additions & 4 deletions .ipynb_checkpoints/Stackoverflow_Survey_Analysis-checkpoint.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -21729,33 +21729,97 @@
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Examine the impact of ethnicity on participation rates."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"import pandas as pd\n",
"\n",
"# Load the CSV files\n",
"file1 = pd.read_csv(r\"Data/survey_results_public_2018.csv\")\n",
"file2 = pd.read_csv(r\"Data/survey_results_public_2019.csv\")\n",
"file3 = pd.read_csv(r\"Data/survey_results_public_2020.csv\")\n",
"\n",
"# Merge the data\n",
"merged_data = pd.concat([file1, file2, file3], ignore_index=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"def preprocess_data(df):\n",
 # Convert compensation to numeric, coercing non-numeric values to NaN\n",
" df['ConvertedComp'] = pd.to_numeric(df['ConvertedComp'], errors='coerce')\n",
" \n",
" # Fill missing values in relevant columns\n",
" df['EdLevel'] = df['EdLevel'].fillna('Unknown')\n",
" df['YearsCodePro'] = pd.to_numeric(df['YearsCodePro'], errors='coerce').fillna(0)\n",
" df['Gender'] = df['Gender'].fillna('Unknown')\n",
" df['Ethnicity'] = df['Ethnicity'].fillna('Unknown')\n",
" \n",
" # Drop rows where ConvertedComp is NaN\n",
" df = df.dropna(subset=['ConvertedComp'])\n",
" \n",
" return df\n",
"\n",
"# Preprocess the data\n",
"cleaned_data = preprocess_data(merged_data)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# Count the number of respondents by ethnicity\n",
"ethnicity_counts = cleaned_data['Ethnicity'].value_counts()\n",
"\n",
"# Calculate average salary by ethnicity\n",
"avg_salary_by_ethnicity = cleaned_data.groupby('Ethnicity')['ConvertedComp'].mean()\n",
"\n",
"print(\"Number of Respondents by Ethnicity:\")\n",
"print(ethnicity_counts)\n",
"print(\"\\nAverage Salary by Ethnicity:\")\n",
"print(avg_salary_by_ethnicity)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Number of Respondents by Ethnicity\n",
"plt.figure(figsize=(10, 6))\n",
"ethnicity_counts.plot(kind='bar')\n",
"plt.title('Number of Respondents by Ethnicity')\n",
"plt.xlabel('Ethnicity')\n",
"plt.ylabel('Number of Respondents')\n",
"plt.show()\n",
"\n",
"# Average Salary by Ethnicity\n",
"plt.figure(figsize=(10, 6))\n",
"avg_salary_by_ethnicity.plot(kind='bar')\n",
"plt.title('Average Salary by Ethnicity')\n",
"plt.xlabel('Ethnicity')\n",
"plt.ylabel('Average Salary (ConvertedComp)')\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
Expand Down
72 changes: 68 additions & 4 deletions Stackoverflow_Survey_Analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -21729,33 +21729,97 @@
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Examine the impact of ethnicity on participation rates."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"import pandas as pd\n",
"\n",
"# Load the CSV files\n",
"file1 = pd.read_csv(r\"Data/survey_results_public_2018.csv\")\n",
"file2 = pd.read_csv(r\"Data/survey_results_public_2019.csv\")\n",
"file3 = pd.read_csv(r\"Data/survey_results_public_2020.csv\")\n",
"\n",
"# Merge the data\n",
"merged_data = pd.concat([file1, file2, file3], ignore_index=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"def preprocess_data(df):\n",
 # Convert compensation to numeric, coercing non-numeric values to NaN\n",
" df['ConvertedComp'] = pd.to_numeric(df['ConvertedComp'], errors='coerce')\n",
" \n",
" # Fill missing values in relevant columns\n",
" df['EdLevel'] = df['EdLevel'].fillna('Unknown')\n",
" df['YearsCodePro'] = pd.to_numeric(df['YearsCodePro'], errors='coerce').fillna(0)\n",
" df['Gender'] = df['Gender'].fillna('Unknown')\n",
" df['Ethnicity'] = df['Ethnicity'].fillna('Unknown')\n",
" \n",
" # Drop rows where ConvertedComp is NaN\n",
" df = df.dropna(subset=['ConvertedComp'])\n",
" \n",
" return df\n",
"\n",
"# Preprocess the data\n",
"cleaned_data = preprocess_data(merged_data)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# Count the number of respondents by ethnicity\n",
"ethnicity_counts = cleaned_data['Ethnicity'].value_counts()\n",
"\n",
"# Calculate average salary by ethnicity\n",
"avg_salary_by_ethnicity = cleaned_data.groupby('Ethnicity')['ConvertedComp'].mean()\n",
"\n",
"print(\"Number of Respondents by Ethnicity:\")\n",
"print(ethnicity_counts)\n",
"print(\"\\nAverage Salary by Ethnicity:\")\n",
"print(avg_salary_by_ethnicity)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Number of Respondents by Ethnicity\n",
"plt.figure(figsize=(10, 6))\n",
"ethnicity_counts.plot(kind='bar')\n",
"plt.title('Number of Respondents by Ethnicity')\n",
"plt.xlabel('Ethnicity')\n",
"plt.ylabel('Number of Respondents')\n",
"plt.show()\n",
"\n",
"# Average Salary by Ethnicity\n",
"plt.figure(figsize=(10, 6))\n",
"avg_salary_by_ethnicity.plot(kind='bar')\n",
"plt.title('Average Salary by Ethnicity')\n",
"plt.xlabel('Ethnicity')\n",
"plt.ylabel('Average Salary (ConvertedComp)')\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
Expand Down

0 comments on commit ac0b8e4

Please sign in to comment.