import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# NOTE(review): this silences every warning for the whole process, including
# deprecation notices raised by the plotting libraries; consider narrowing it.
warnings.filterwarnings('ignore')
sns.set(style="whitegrid")
# Applied after sns.set, so its rcParams override any overlapping seaborn settings.
plt.style.use('fivethirtyeight')

# Load the salary dataset; every chart below reads from this frame.
df = pd.read_csv('salaries.csv')

# Quick structural overview: sample rows, shape, dtypes, summary statistics,
# missing values per column and the number of fully duplicated rows.
print(df.head())
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print(df.describe())
print(df.describe(include='O'))
print(df.isnull().sum())
print(df.duplicated().sum())
# Histogram of USD salaries with reference lines at the median and the mean.
salary_usd = df['salary_in_usd']
# Compute each statistic once and reuse it for both the line and its label.
median_salary = salary_usd.median()
mean_salary = salary_usd.mean()

plt.figure(figsize=(12, 6))
sns.histplot(salary_usd, kde=True, bins=50)
plt.title('Distribution of Salaries (USD)', fontsize=16)
plt.xlabel('Salary (USD)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
# Both statistics are floats; without an explicit precision the legend would
# show every decimal digit of the mean, so round to whole dollars.
plt.axvline(median_salary, color='red', linestyle='--', label=f'Median: ${median_salary:,.0f}')
plt.axvline(mean_salary, color='green', linestyle='--', label=f'Mean: ${mean_salary:,.0f}')
plt.legend()
plt.show()
# Histogram of log1p-transformed USD salaries.
log_salary = np.log1p(df['salary_in_usd'])

plt.figure(figsize=(12, 6))
# histplot draws on the figure just created and hands back its Axes.
ax = sns.histplot(log_salary, kde=True, bins=50)
ax.set_title('Log-Transformed Salary Distribution', fontsize=16)
ax.set_xlabel('Log(Salary+1)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
plt.show()
# Per-year mean, median and standard deviation of USD salaries.
yearly_stats = (
    df.groupby('work_year')['salary_in_usd']
    .agg(['mean', 'median', 'std'])
    .reset_index()
)

fig = px.line(
    yearly_stats,
    x='work_year',
    y=['mean', 'median'],
    title='Data Science Salary Trends (2020-2025)',
    labels={'value': 'Salary (USD)', 'work_year': 'Year', 'variable': 'Metric'},
    template='plotly_white',
)
fig.update_layout(legend_title_text='', hovermode='x unified', width=900, height=500)

# Shaded mean +/- one standard deviation band: walk the upper edge forwards,
# then the lower edge backwards, so that fill='toself' closes the outline.
years = yearly_stats['work_year'].to_numpy()
upper_edge = (yearly_stats['mean'] + yearly_stats['std']).to_numpy()
lower_edge = (yearly_stats['mean'] - yearly_stats['std']).to_numpy()
std_band = go.Scatter(
    x=np.concatenate([years, years[::-1]]),
    y=np.concatenate([upper_edge, lower_edge[::-1]]),
    fill='toself',
    fillcolor='rgba(0,100,80,0.2)',
    line=dict(color='rgba(255,255,255,0)'),  # fully transparent outline
    name='Standard Deviation',
)
fig.add_trace(std_band)
fig.show()
# Experience-level codes, in the order they are plotted, with display labels.
level_labels = {'EN': 'Entry-level', 'MI': 'Mid-level', 'SE': 'Senior', 'EX': 'Executive'}

plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='experience_level', y='salary_in_usd', order=list(level_labels))
plt.title('Salary Distribution by Experience Level', fontsize=16)
plt.xlabel('Experience Level', fontsize=12)
plt.ylabel('Salary (USD)', fontsize=12)
# Swap the raw codes on the x axis for their readable names.
plt.xticks(ticks=list(range(len(level_labels))), labels=list(level_labels.values()))
plt.show()
# Mean USD salary for every (year, experience level) pair.
exp_time = df.groupby(['work_year', 'experience_level'], as_index=False)['salary_in_usd'].mean()

# Show readable names in the legend; codes missing from the mapping are left as-is.
level_display = {'EN': 'Entry-level', 'MI': 'Mid-level', 'SE': 'Senior', 'EX': 'Executive'}
exp_time['experience_level'] = exp_time['experience_level'].replace(level_display)

fig = px.line(
    exp_time,
    x='work_year',
    y='salary_in_usd',
    color='experience_level',
    title='Salary Trends by Experience Level (2020-2025)',
    labels={'salary_in_usd': 'Average Salary (USD)', 'work_year': 'Year'},
    template='plotly_white',
)
fig.update_layout(width=900, height=500, hovermode='x unified')
fig.show()
# The 15 most frequent job titles and how many rows each one has.
top_jobs = df['job_title'].value_counts().head(15)
print("Distribution of top job titles:")
print(top_jobs)

# Mean USD salary for those titles only, highest first.
in_top_jobs = df['job_title'].isin(top_jobs.index)
top_jobs_salary = (
    df.loc[in_top_jobs]
    .groupby('job_title')['salary_in_usd']
    .mean()
    .sort_values(ascending=False)
)

fig = px.bar(
    x=top_jobs_salary.index,
    y=top_jobs_salary.values,
    labels={'x': 'Job Title', 'y': 'Average Salary (USD)'},
    title='Average Salary by Top Job Titles',
    color=top_jobs_salary.values,
    color_continuous_scale='Viridis',
)
fig.update_layout(xaxis_tickangle=-45, width=1000, height=600)
fig.show()
# Salary distribution (violin plot) for the five most frequent job titles.
top5_jobs = top_jobs.index[:5]
top5_rows = df[df['job_title'].isin(top5_jobs)]

plt.figure(figsize=(14, 8))
ax = sns.violinplot(data=top5_rows, x='job_title', y='salary_in_usd')
ax.set_title('Salary Distribution for Top 5 Job Titles', fontsize=16)
ax.set_xlabel('Job Title', fontsize=12)
ax.set_ylabel('Salary (USD)', fontsize=12)
# Tilt the tick labels; they are drawn right-aligned at a 30 degree angle.
plt.xticks(rotation=30, ha='right')
plt.tight_layout()
plt.show()
# NOTE: 'adjusted_salary' is a plain copy of 'salary_in_usd'; no adjustment
# is applied in this script even though the chart labels say "Adjusted".
df['adjusted_salary'] = df['salary_in_usd']

# Two-letter residence codes mapped to the country names used on the map.
iso2_to_name = {
    'US': 'United States',
    'GB': 'United Kingdom',
    'DE': 'Germany',
    'FR': 'France',
    'CA': 'Canada',
    'IN': 'India',
    'AU': 'Australia',
    'ES': 'Spain',
    'BR': 'Brazil',
    'NL': 'Netherlands',
    'JP': 'Japan',
    'CH': 'Switzerland',
    'IT': 'Italy',
    'SG': 'Singapore',
    'SE': 'Sweden',
    'MX': 'Mexico',
    'FI': 'Finland',
    'DK': 'Denmark',
    'PL': 'Poland',
    'PT': 'Portugal',
    'NZ': 'New Zealand',
    'IE': 'Ireland',
    'HK': 'Hong Kong',
    'RU': 'Russia',
    'BE': 'Belgium',
    'IL': 'Israel',
    'UA': 'Ukraine',
    'TR': 'Turkey',
    'AE': 'United Arab Emirates',
    'ZA': 'South Africa',
    'CO': 'Colombia',
    'AR': 'Argentina',
    'CL': 'Chile',
    'AT': 'Austria',
    'MY': 'Malaysia',
    'NG': 'Nigeria',
    'VN': 'Vietnam',
    'KR': 'South Korea',
    'TH': 'Thailand',
}

# Mean salary per residence code; codes absent from the mapping get a NaN
# country name and are dropped, so they do not appear on the map.
avg_salary_by_residence = df.groupby('employee_residence', as_index=False)['adjusted_salary'].mean()
avg_salary_by_residence['country_name'] = avg_salary_by_residence['employee_residence'].map(iso2_to_name)
avg_salary_by_residence = avg_salary_by_residence.dropna(subset=['country_name'])

fig2 = px.choropleth(
    avg_salary_by_residence,
    locations='country_name',
    locationmode='country names',
    color='adjusted_salary',
    hover_name='country_name',
    hover_data={'employee_residence': True, 'adjusted_salary': ':,.0f'},
    color_continuous_scale=px.colors.sequential.Plasma,
    title='Average Salary by Employee Residence',
    labels={'adjusted_salary': 'Average Adjusted Salary'},
    projection='natural earth',
)
fig2.update_layout(width=1000, height=600)
fig2.show()
# Bar chart of the 20 residence countries with the highest average salary.
top_countries = avg_salary_by_residence.sort_values('adjusted_salary', ascending=False).head(20)
plt.figure(figsize=(14, 8))
# `order` pins the bars to the same descending sequence as the rows of top_countries.
chart = sns.barplot(x='country_name', y='adjusted_salary', data=top_countries, palette='viridis', order=top_countries['country_name'])
plt.title('Top 20 Countries by Average Data Science Salary', fontsize=16)
plt.xlabel('Employee Residence', fontsize=12)
plt.ylabel('Average Adjusted Salary (USD)', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Label each bar with its value, placed slightly above the bar top. Pairing
# bars and values with zip (instead of indexing rows by patch position) stops
# at the shorter sequence, so any extra patch on the Axes cannot trigger an
# out-of-range lookup.
for bar, avg_salary in zip(chart.patches, top_countries['adjusted_salary']):
    chart.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + 2000, f'${avg_salary:,.0f}', ha='center', fontsize=9)
plt.tight_layout()
plt.show()
# Mean, median and row count of USD salary for each remote_ratio value.
remote_salary = (
    df.groupby('remote_ratio')['salary_in_usd']
    .agg(['mean', 'median', 'count'])
    .reset_index()
)
# Turn the numeric ratio into a readable work-setting label.
setting_names = {0: 'On-site', 50: 'Hybrid', 100: 'Remote'}
remote_salary['remote_ratio'] = remote_salary['remote_ratio'].map(setting_names)

fig = px.bar(
    remote_salary,
    x='remote_ratio',
    y=['mean', 'median'],
    barmode='group',
    title='Salary by Remote Work Ratio',
    labels={'value': 'Salary (USD)', 'remote_ratio': 'Work Setting', 'variable': 'Metric'},
    color_discrete_sequence=['#2a9d8f', '#e76f51'],
)
fig.update_layout(width=800, height=500)
fig.show()
# Number of rows for every (year, remote_ratio) combination.
remote_time = (
    df.groupby(['work_year', 'remote_ratio'])
    .size()
    .reset_index(name='count')
)
# Yearly totals, joined back as 'count_total' to express each setting as a
# share of that year's rows.
total_per_year = remote_time.groupby('work_year', as_index=False)['count'].sum()
remote_time = remote_time.merge(total_per_year, on='work_year', suffixes=('', '_total'))
remote_time['percentage'] = remote_time['count'].div(remote_time['count_total']).mul(100)

setting_names = {0: 'On-site', 50: 'Hybrid', 100: 'Remote'}
remote_time['remote_ratio'] = remote_time['remote_ratio'].map(setting_names)

fig = px.line(
    remote_time,
    x='work_year',
    y='percentage',
    color='remote_ratio',
    title='Remote Work Trends (2020-2025)',
    labels={'percentage': 'Percentage of Jobs', 'work_year': 'Year'},
    template='plotly_white',
)
fig.update_layout(width=900, height=500, hovermode='x unified')
fig.show()
# Median USD salary for every (company size, experience level) pair.
company_salary = df.groupby(['company_size', 'experience_level'], as_index=False)['salary_in_usd'].median()

# Replace the short codes with display names; unmapped codes become NaN.
size_names = {'S': 'Small', 'M': 'Medium', 'L': 'Large'}
level_names = {'EN': 'Entry-level', 'MI': 'Mid-level', 'SE': 'Senior', 'EX': 'Executive'}
company_salary['company_size'] = company_salary['company_size'].map(size_names)
company_salary['experience_level'] = company_salary['experience_level'].map(level_names)

fig = px.bar(
    company_salary,
    x='company_size',
    y='salary_in_usd',
    color='experience_level',
    barmode='group',
    title='Median Salary by Company Size and Experience Level',
    labels={'salary_in_usd': 'Median Salary (USD)', 'company_size': 'Company Size'},
    template='plotly_white',
)
fig.update_layout(width=900, height=500)
fig.show()
# Row counts for the ten most common salary currencies, shown as a pie chart.
currency_counts = df['salary_currency'].value_counts().head(10)

fig = px.pie(
    values=currency_counts.values,
    names=currency_counts.index,
    title='Distribution of Salary Currencies',
    template='plotly_white',
)
# Put the percentage and the currency code inside each slice.
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(width=700, height=500)
fig.show()
# Ratio of the 'salary' column to its USD value, stored per row.
df['implied_exchange_rate'] = df['salary'].div(df['salary_in_usd'])

# Restrict to the ten most common currencies and take the yearly median ratio.
top_currencies = list(df['salary_currency'].value_counts().head(10).index)
in_top_currencies = df['salary_currency'].isin(top_currencies)
exchange_rates = (
    df.loc[in_top_currencies]
    .groupby(['work_year', 'salary_currency'])['implied_exchange_rate']
    .median()
    .reset_index()
)

fig = px.line(
    exchange_rates,
    x='work_year',
    y='implied_exchange_rate',
    color='salary_currency',
    title='Implied Exchange Rate Trends (2020-2025)',
    labels={'implied_exchange_rate': 'Rate vs USD', 'work_year': 'Year'},
    template='plotly_white',
)
fig.update_layout(width=900, height=500, hovermode='x unified')
fig.show()
# Correlation of one-hot encoded features with USD salary, restricted to the
# five most frequent job titles.
categorical_cols = ['experience_level', 'employment_type', 'job_title', 'salary_currency', 'employee_residence', 'company_location', 'company_size']
numerical_cols = ['work_year', 'salary', 'salary_in_usd', 'remote_ratio']
# Select the listed feature columns explicitly so helper columns added to df
# earlier in the script (e.g. a verbatim copy of the target) stay out of the matrix.
feature_cols = numerical_cols + categorical_cols
top_job_titles = df['job_title'].value_counts().head(5).index.tolist()
df_corr = df.loc[df['job_title'].isin(top_job_titles), feature_cols].copy()
# dtype=float makes the indicator columns numeric regardless of the pandas
# default; drop_first removes one level per feature.
df_dummies = pd.get_dummies(df_corr, columns=categorical_cols, drop_first=True, dtype=float)
corr_matrix = df_dummies.corr()
plt.figure(figsize=(20, 16))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, linewidths=0.5)
plt.title('Correlation Matrix of Features', fontsize=18)
plt.xticks(fontsize=10, rotation=90)
plt.yticks(fontsize=10)
plt.tight_layout()
plt.show()
# Rank features by correlation with the target. The target's own entry
# (always 1.0) is removed, and undefined (NaN) correlations are dropped
# because sort_values places them last, where they would otherwise show up
# in the "negatively correlated" tail.
salary_corr = (
    corr_matrix['salary_in_usd']
    .drop('salary_in_usd')
    .dropna()
    .sort_values(ascending=False)
)
print("Top 10 features positively correlated with salary:")
print(salary_corr.head(10))
print("\nTop 10 features negatively correlated with salary:")
print(salary_corr.tail(10))