Python Cheat Sheet for Data Analysis
Data Loading Data Wrangling Exploratory Data Analysis
Complete data frame correlation
Read CSV dataset Replace missing data with frequency
df.corr()
# load without header MostFrequentEntry =
df = pd.read_csv(<CSV path>, header = None) df[‘attribute_name’].value_counts().idxmax() Specific attribute correlation
# load using first row as header df[[‘attribute1’,’attribute2’,...]].corr()
df = pd.read_csv(<CSV path>, header = 0) df[‘attribute_name’].replace(np.nan,MostFrequentEntry
, inplace=True) Scatter plot
Print first few entries from matplotlib import pyplot as plt
Replace missing data with mean plt.scatter(df[[‘attribute_1’]], df[[‘attribute_2’]])
#n=number of entries; default 5
df.head(n) AverageValue= Regression plot
df[‘attribute’].astype(<data_type>).mean(axis=0) import seaborn as sns
Print last few entries sns.regplot(x=‘attribute_1’,y=‘attribute_2’, data=df)
df[‘attribute’].replace(np.nan, AverageValue,
#n=number of entries; default 5 inplace=True) Box plot
df.tail(n) import seaborn as sns
Fix the data types sns.boxplot(x=‘attribute_1’,y=‘attribute_2’, data=df)
Assign header names
df[[‘attribute1’, ‘attribute2’, ...]] = Grouping by attributes
df.columns = headers df[[‘attribute1’, ‘attribute2’, df_group = df[[‘attribute_1’,’attribute_2’,...]]
...]].astype(‘data_type’)
Replace “?” with NaN #data_type can be int, float, char, etc. GroupBy statements
# Group by a single attribute
df = df.replace("?", np.nan) Data normalization df_group = df_group.groupby(['attribute_1'],
as_index=False).mean()
df[‘attribute_name’] =
Retrieve data types df[‘attribute_name’]/df[‘attribute_name’].max() # Group by multiple attributes
df.dtypes df_group = df_group.groupby(['attribute_1',
Binning 'attribute_2'],as_index=False).mean()
Retrieve statistical description bins = np.linspace(min(df[‘attribute_name’]), Pivot tables
max(df['attribute_name']), n) grouped_pivot =
# default use
# n is the number of bins needed df_group.pivot(index='attribute_1',columns='attribute
df.describe()
# include all attributes _2')
GroupNames = [‘Group1’,’Group2’,’Group3’,...]
df.describe(include="all")
Pseudocolor plot
df['binned_attribute_name'] =
Retrieve data set summary pd.cut(df['attribute_name'], bins, labels=GroupNames, from matplotlib import pyplot as plt
include_lowest=True) plt.pcolor(grouped_pivot, cmap='RdBu')
df.info()
Pearson Coefficient and p-value
Change column name
Save data frame to csv from scipy import stats
df.rename(columns={'old_name':'new_name'}, pearson_coef,p_value=stats.pearsonr(df['attribute_1']
df.to_csv(<output CSV path>) inplace=True) , df['attribute_2'])
Indicator variables
dummy_variable = pd.get_dummies(df[‘attribute_name’])
df = pd.concat([df, dummy_variable],axis = 1)
© Copyright IBM Corporation 2023. All rights reserved Page | 1
Python Cheat Sheet for Data Analysis
Model Development Pipeline
lre=LinearRegression()
from sklearn.pipeline import Pipeline
Linear regression from sklearn.preprocessing import StandardScaler Rcross =
from sklearn.linear_model import LinearRegression Input=[('scale',StandardScaler()), ('polynomial', cross_val_score(lre,x_data[['attribute_1']],y_data,cv
lr = LinearRegression() PolynomialFeatures(include_bias=False)), =n)
('model',LinearRegression())] # n indicates number of times, or folds, for which
Train linear regression model the cross validation is to be done
X = df[[‘attribute_1’, ‘attribute_2’, ...]] pipe=Pipeline(Input)
Y = df['target_attribute'] Mean = Rcross.mean()
lr.fit(X,Y) Z = Z.astype(float) Std_dev = Rcross.std()
pipe.fit(Z,y)
Generate output predictions ypipe=pipe.predict(Z) Cross-validation prediction
from sklearn.model_selection import cross_val_predict
Y_hat = lr.predict(X) R2 value
Identify the coefficient and intercept # For linear regression model from sklearn.linear_model import LinearRegression
X = df[[‘attribute_1’, ‘attribute_2’, ...]]
coeff = lr.coef_ Y = df['target_attribute'] lre=LinearRegression()
intercept = lr.intercept_
Residual plot lr.fit(X,Y) yhat = cross_val_predict(lre,x_data[[‘attribute_1’]],
R2_score = lr.score(X,Y) y_data,cv=4)
import seaborn as sns
sns.residplot(x=df[[‘attribute_1’]], # For polynomial regression model Ridge regression and prediction
y=df[[‘attribute_2’]]) from sklearn.metrics import r2_score from sklearn.linear_model import Ridge
Distribution plot pr=PolynomialFeatures(degree=2)
f = np.polyfit(x, y, n)
import seaborn as sns p = np.poly1d(f) x_train_pr=pr.fit_transform(x_train[[‘attribute_1’,
sns.distplot(df['attribute_name'], hist=False) R2_score = r2_score(y, p(x)) ‘attribute_2’, ...]])
# can include other parameters like color, label,
etc. MSE value x_test_pr=pr.fit_transform(x_test[[‘attribute_1’,
from sklearn.metrics import mean_squared_error ‘attribute_2’, ...]])
Polynomial regression mse = mean_squared_error(Y, Y_hat)
f = np.polyfit(x, y, n) RidgeModel=Ridge(alpha=1)
#creates the polynomial features of order n Model Evaluation and Refinement RidgeModel.fit(x_train_pr, y_train)
yhat = RidgeModel.predict(x_test_pr)
p = np.poly1d(f) Split data for training and testing
#p becomes the polynomial model used to generate the from sklearn.model_selection import train_test_split Grid search
predicted output
from sklearn.model_selection import GridSearchCV
y_data = df[‘target_attribute’]
Y_hat = p(x) from sklearn.linear_model import Ridge
x_data=df.drop('target_attribute',axis=1)
# Y_hat is the predicted output
parameters= [{'alpha': [0.001,0.1,1, 10, 100, 1000,
x_train, x_test, y_train, y_test =
Multi-variate polynomial regression 10000, ...]}]
train_test_split(x_data, y_data, test_size=0.10,
from sklearn.preprocessing import PolynomialFeatures random_state=1)
RR=Ridge()
Cross-validation score Grid1 = GridSearchCV(RR, parameters, cv=4)
Z = df[[‘attribute_1’,’attribute_2’,...]]
pr=PolynomialFeatures(degree=n) from sklearn.model_selection import cross_val_score Grid1.fit(x_data[[‘attribute_1’, ‘attribute_2’,
Z_pr=pr.fit_transform(Z) ...]], y_data)
from sklearn.linear_model import LinearRegression
BestRR=Grid1.best_estimator_
BestRR.score(x_test[[‘attribute_1’, ‘attribute_2’,
...]], y_test)
© Copyright IBM Corporation 2023. All rights reserved Page | 2