import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Synthetic retail dataset (same structure as the earlier example).
np.random.seed(0)  # For reproducibility

# (column name, lower bound, upper bound) -- columns are drawn in this exact
# order so the seeded RNG stream matches the original construction.
_column_specs = [
    ('price', 10, 50),
    ('advertising', 100, 500),
    ('location_in_store', 1, 10),
    ('total_store_volume', 1000, 5000),
    ('sales', 100, 1000),  # Dependent variable
]
df = pd.DataFrame(
    {name: np.random.uniform(low, high, 100) for name, low, high in _column_specs}
)
def calc_vif(X):
    """Return a DataFrame listing each column of *X* and its variance
    inflation factor (VIF), computed via statsmodels."""
    n_features = X.shape[1]
    scores = [variance_inflation_factor(X.values, idx) for idx in range(n_features)]
    return pd.DataFrame({"variables": X.columns, "VIF": scores})
# Select independent variables.
# Take an explicit copy: df[[...]] returns a slice of `df`, and adding a
# column to it later (the PCA component) would otherwise raise pandas'
# SettingWithCopyWarning with ambiguous write-through behavior.
X = df[['price', 'advertising', 'location_in_store', 'total_store_volume']].copy()

# Multicollinearity diagnostics before any feature reduction.
vif_before = calc_vif(X)
print("VIF before removal:")
print(vif_before)

# Pairwise correlations among the predictors (visualized further below).
corr_before = X.corr()
# Collapse price, advertising, and total_store_volume into a single
# principal component.
# NOTE(review): the features are not standardized before PCA, so the
# component is dominated by total_store_volume's much larger scale
# (1000-5000 vs 10-50) -- confirm this is intended, or scale first.
pca = PCA(n_components=1)  # Combine them into 1 component
X_combined = X[['price', 'advertising', 'total_store_volume']]
X_pca = pca.fit_transform(X_combined)  # shape (n_samples, 1)

# Build the reduced feature set without assigning into X in place:
# X is a slice of df, and `X['combined_pca'] = ...` triggers pandas'
# SettingWithCopyWarning. Drop the combined originals and append the PCA
# component via .assign (ravel() flattens the (n, 1) PCA output to 1-D).
X_reduced = X.drop(columns=['price', 'advertising', 'total_store_volume']).assign(
    combined_pca=X_pca.ravel()
)
corr_after_pca = X_reduced.corr()
# Side-by-side heatmaps of the predictor correlations before and after the
# PCA-based feature reduction.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
heatmap_specs = [
    (corr_before, "Correlation Matrix Before PCA"),
    (corr_after_pca, "Correlation Matrix After PCA"),
]
for axis, (matrix, title) in zip(axes, heatmap_specs):
    sns.heatmap(matrix, annot=True, cmap="coolwarm", fmt=".2f", vmin=-1, vmax=1, ax=axis)
    axis.set_title(title)
plt.tight_layout()
plt.show()

# Multicollinearity diagnostics on the reduced feature set.
vif_after_pca = calc_vif(X_reduced)
print("\nVIF after PCA:")
print(vif_after_pca)