# Import seaborn with alias sns
import seaborn as sns
# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt
# Draw the scatter plot
sns.scatterplot(x = "n_convenience", y = "price_twd_msq", data = taiwan_real_estate)
# Show the plot
plt.show()

Regression in Python
- regression vs. prediction: statsmodels is optimized for inference, while scikit-learn is optimized for prediction (see the sketch below)
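Both libraries can fit the same simple linear regression; here is a minimal sketch of the contrast, assuming taiwan_real_estate is already loaded with the columns used throughout these notes:

```python
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression

# statsmodels: formula interface, rich inference output (p-values, confidence intervals)
mdl_sm = ols("price_twd_msq ~ n_convenience", data=taiwan_real_estate).fit()
print(mdl_sm.params)

# scikit-learn: array interface, geared toward prediction pipelines
X = taiwan_real_estate[["n_convenience"]]
y = taiwan_real_estate["price_twd_msq"]
mdl_sk = LinearRegression().fit(X, y)
print(mdl_sk.intercept_, mdl_sk.coef_)
```

The rest of these notes use statsmodels, since the focus is on interpreting coefficients rather than building prediction pipelines.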
taiwan_real_estate is available as a pandas DataFrame.
Draw a trend line calculated using linear regression. Omit the confidence interval ribbon. Note: The scatter_kws argument, pre-filled in the exercise, makes the data points 50% transparent.
# Draw a trend line on the scatter plot of price_twd_msq vs. n_convenience
sns.regplot(x="n_convenience",
y="price_twd_msq",
data=taiwan_real_estate,
ci=None,
scatter_kws={'alpha': 0.5})
# Show the plot
plt.show()

running a model
from statsmodels.formula.api import ols
ols() function takes two arguments: 1. a formula in quotes 2. data
model = ols("y ~ x", data = df) # creates a model object
model = model.fit() # fit the model
print(model.params) # print parameters

To actually fit the model, add the .fit() method; the .params attribute shows the OLS coefficients
# Import the ols function
from statsmodels.formula.api import ols
# Create the model object
mdl_price_vs_conv = ols("price_twd_msq ~ n_convenience", data=taiwan_real_estate)
# Fit the model
mdl_price_vs_conv = mdl_price_vs_conv.fit()
# Print the parameters of the fitted model
print(mdl_price_vs_conv.params)

Categorical explanatory variables
- visualize with a histogram for each category:
  sns.displot(data = df, x = "x", col = "cat_col", col_wrap = 2, bins = b)
  - x is the variable to plot (here, the response variable)
  - col is the column holding the categorical variable
  - col_wrap is the number of plots per row
  - by default, displot makes histograms
- grouped summary statistics:
  summary_stats = df.groupby('cat_col')['y'].mean()
- for regression, the categorical variable can simply be used as x, nothing special
  - but the output will show an intercept and n-1 coefficients for n groups; each coefficient is labeled cat_col[T.category]
- can estimate without an intercept for easier interpretation:
  ols("y ~ cat_x + 0", data = df).fit()
  - the "+ 0" specifies that all coefficients should be given relative to 0, so each group gets its own coefficient (equal to that group's mean response)
# Histograms of price_twd_msq with 10 bins, split by the age of each house
sns.displot(data=taiwan_real_estate,
x="price_twd_msq",
col="house_age_years",
col_wrap=3,
bins = 10)
# Show the plot
plt.show()

# Calculate the mean of price_twd_msq, grouped by house age
mean_price_by_age = taiwan_real_estate.groupby('house_age_years')['price_twd_msq'].mean()
# Print the result
print(mean_price_by_age)

house_age_years
0 to 15 12.637
15 to 30 9.877
30 to 45 11.393
Name: price_twd_msq, dtype: float64
# Create the model, fit it
mdl_price_vs_age = ols("price_twd_msq ~ house_age_years", data=taiwan_real_estate).fit()
# Print the parameters of the fitted model
print(mdl_price_vs_age.params)

Intercept                      12.637
house_age_years[T.15 to 30] -2.761
house_age_years[T.30 to 45] -1.244
dtype: float64
# Update the model formula to remove the intercept
mdl_price_vs_age0 = ols("price_twd_msq ~ house_age_years + 0", data=taiwan_real_estate).fit()
# Print the parameters of the fitted model
print(mdl_price_vs_age0.params)

house_age_years[0 to 15]    12.637
house_age_years[15 to 30] 9.877
house_age_years[30 to 45] 11.393
dtype: float64
Making predictions
- to create new x data to predict y on, store it in a pandas DataFrame
  - e.g. new_data = pd.DataFrame({'x': np.arange(20, 41)}), or another method like pd.DataFrame({'x': [10, 21.54]})
  - use a dictionary to specify the columns (e.g. x)
  - specify an interval of values with np.arange(start, end), e.g. from 20 to 40 (the end value is exclusive)
- then call the .predict() method on the fitted ols model, passing the new data into it:
  model.predict(new_data)
- alternatively, use .assign() to build a DataFrame of the new data with the predictions added as a column:
  prediction_data = new_data.assign(y = model.predict(new_data))
Can include the predictions on the scatter plot:
fig = plt.figure() # create a figure so the layers below are drawn on the same plot
sns.regplot(x = "x",
            y = "y",
            ci = None,
            data = df)
sns.scatterplot(x = "x",
                y = "y",
                data = prediction_data,
                color = "red",  # red points
                marker = "s")   # square markers
plt.show() # shows both layers on the same figure

# Import numpy with alias np
import numpy as np
# Create the explanatory_data
explanatory_data = pd.DataFrame({'n_convenience': np.arange(0,11)})
# Print it
print(explanatory_data)

    n_convenience
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
# Use mdl_price_vs_conv to predict with explanatory_data, call it price_twd_msq
price_twd_msq = mdl_price_vs_conv.predict(explanatory_data)
# Print it
print(price_twd_msq)

0      8.224
1 9.022
2 9.820
3 10.618
4 11.417
5 12.215
6 13.013
7 13.811
8 14.609
9 15.407
10 16.205
dtype: float64
# Create prediction_data
prediction_data = explanatory_data.assign(
price_twd_msq = mdl_price_vs_conv.predict(explanatory_data))
# Print the result
print(prediction_data)

    n_convenience  price_twd_msq
0 0 8.224
1 1 9.022
2 2 9.820
3 3 10.618
4 4 11.417
5 5 12.215
6 6 13.013
7 7 13.811
8 8 14.609
9 9 15.407
10 10 16.205
# Create a new figure, fig
fig = plt.figure()
sns.regplot(x="n_convenience",
y="price_twd_msq",
data=taiwan_real_estate,
ci=None)
# Add a scatter plot layer to the regplot
sns.scatterplot(x="n_convenience",
y="price_twd_msq",
data=prediction_data,
color = "red")
# Show the layered plot
plt.show()

# Define a DataFrame impossible
impossible = pd.DataFrame({"n_convenience": [-1, 2.5]})
# predict
mdl_price_vs_conv.predict(impossible)

0     7.426
1 10.219
dtype: float64
model objects
- .params attribute holds the coefficients, returned as a pandas Series
- .fittedvalues is a pandas Series of predicted values on the original data
- .resid holds the residuals; equivalently, df['y'] - model.fittedvalues (see the check below)
- .summary() method gives a full printout
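A quick sketch verifying the residual equivalence on the convenience-store model (assumes mdl_price_vs_conv and taiwan_real_estate from earlier are in scope):

```python
import numpy as np

# Residuals are the observed responses minus the fitted values
manual_resid = taiwan_real_estate["price_twd_msq"] - mdl_price_vs_conv.fittedvalues

# Should agree with the .resid attribute up to floating-point error
print(np.allclose(manual_resid, mdl_price_vs_conv.resid))  # True
```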
# Print the model parameters of mdl_price_vs_conv
print(mdl_price_vs_conv.params)

Intercept        8.224
n_convenience 0.798
dtype: float64
# Print the fitted values of mdl_price_vs_conv
print(mdl_price_vs_conv.fittedvalues)

0      16.205
1 15.407
2 12.215
3 12.215
4 12.215
...
409 8.224
410 15.407
411 13.811
412 12.215
413 15.407
Length: 414, dtype: float64
# Print the residuals of mdl_price_vs_conv
print(mdl_price_vs_conv.resid)

0     -4.738
1 -2.638
2 2.097
3 4.366
4 0.826
...
409 -3.565
410 -0.278
411 -1.526
412 3.670
413 3.927
Length: 414, dtype: float64
# Print a summary of mdl_price_vs_conv
print(mdl_price_vs_conv.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable: price_twd_msq R-squared: 0.326
Model: OLS Adj. R-squared: 0.324
Method: Least Squares F-statistic: 199.3
Date: Thu, 01 Feb 2024 Prob (F-statistic): 3.41e-37
Time: 18:49:51 Log-Likelihood: -1091.1
No. Observations: 414 AIC: 2186.
Df Residuals: 412 BIC: 2194.
Df Model: 1
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
Intercept 8.2242 0.285 28.857 0.000 7.664 8.784
n_convenience 0.7981 0.057 14.118 0.000 0.687 0.909
==============================================================================
Omnibus: 171.927 Durbin-Watson: 1.993
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1417.242
Skew: 1.553 Prob(JB): 1.78e-308
Kurtosis: 11.516 Cond. No. 8.87
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Get the coefficients of mdl_price_vs_conv
coeffs = mdl_price_vs_conv.params
# Get the intercept
intercept = mdl_price_vs_conv.params[0]
# Get the slope
slope = mdl_price_vs_conv.params[1]
# Manually calculate the predictions
price_twd_msq = intercept + slope * explanatory_data
print(price_twd_msq)
# Compare to the results from .predict()
print(price_twd_msq.assign(predictions_auto=mdl_price_vs_conv.predict(explanatory_data)))

   n_convenience
0 8.224
1 9.022
2 9.820
3 10.618
4 11.417
5 12.215
6 13.013
7 13.811
8 14.609
9 15.407
n_convenience predictions_auto
0 8.224 8.224
1 9.022 9.022
2 9.820 9.820
3 10.618 10.618
4 11.417 11.417
5 12.215 12.215
6 13.013 13.013
7 13.811 13.811
8 14.609 14.609
9 15.407 15.407
regression to the mean
- regression to the mean is a property of the data, not a type of model, but linear regression can quantify its effect
- response value = fitted value + residual (the stuff you explained + the stuff you couldn’t explain)
- residuals exist due to both shortcomings in the model and fundamental randomness
- extreme outliers are often due to randomness
- regression to the mean means extreme cases don’t persist over time
Regression to the mean is also an important concept in investing. Here you’ll look at the annual returns from investing in companies in the Standard and Poor 500 index (S&P 500), in 2018 and 2019.
The sp500_yearly_returns dataset contains three columns; the two used here are return_2018 and return_2019.
# Create a new figure, fig
fig = plt.figure()
# Plot the first layer: y = x
plt.axline(xy1=(0,0), slope=1, linewidth=2, color="green")
# Add scatter plot with linear regression trend line
sns.regplot(x = "return_2018",
y = "return_2019",
ci = None,
data = sp500_yearly_returns)
# Set the axes so that the distances along the x and y axes look the same
plt.axis("equal")
# Show the plot
plt.show()

# Run a linear regression on return_2019 vs. return_2018 using sp500_yearly_returns
mdl_returns = ols("return_2019 ~ return_2018", data = sp500_yearly_returns).fit()
# Print the parameters
print(mdl_returns.params)

Intercept      0.321
return_2018 0.020
dtype: float64
# Create a DataFrame with return_2018 at -1, 0, and 1
explanatory_data = pd.DataFrame({'return_2018': [-1,0,1]})
# Use mdl_returns to predict with explanatory_data
print(mdl_returns.predict(explanatory_data))

0    0.301
1 0.321
2 0.341
dtype: float64
transforming variables
Square
df['x_sq'] = df['x'] ** 2
- make the regplot using x_sq
- run the model using x_sq
Square root with df['sqrt_x'] = np.sqrt(df['x'])
# Create sqrt_dist_to_mrt_m
taiwan_real_estate["sqrt_dist_to_mrt_m"] = np.sqrt(taiwan_real_estate["dist_to_mrt_m"])
plt.figure()
# Plot using the transformed variable
sns.regplot(x = "sqrt_dist_to_mrt_m",
y = "price_twd_msq",
data = taiwan_real_estate,
ci = None)
plt.show()

# Run a linear regression of price_twd_msq vs. square root of dist_to_mrt_m using taiwan_real_estate
mdl_price_vs_dist = ols("price_twd_msq ~ sqrt_dist_to_mrt_m", data = taiwan_real_estate).fit()
# Print the parameters
print(mdl_price_vs_dist.params)

Intercept             16.710
sqrt_dist_to_mrt_m -0.183
dtype: float64
# Create sqrt_dist_to_mrt_m
taiwan_real_estate["sqrt_dist_to_mrt_m"] = np.sqrt(taiwan_real_estate["dist_to_mrt_m"])
# Run a linear regression of price_twd_msq vs. sqrt_dist_to_mrt_m
mdl_price_vs_dist = ols("price_twd_msq ~ sqrt_dist_to_mrt_m", data=taiwan_real_estate).fit()
explanatory_data = pd.DataFrame({"sqrt_dist_to_mrt_m": np.sqrt(np.arange(0, 81, 10) ** 2),
"dist_to_mrt_m": np.arange(0, 81, 10) ** 2})
# Create prediction_data by adding a column of predictions to explantory_data
prediction_data = explanatory_data.assign(
price_twd_msq = mdl_price_vs_dist.predict(explanatory_data)
)
# Print the result
print(prediction_data)

   sqrt_dist_to_mrt_m  dist_to_mrt_m  price_twd_msq
0 0.0 0 16.710
1 10.0 100 14.881
2 20.0 400 13.053
3 30.0 900 11.225
4 40.0 1600 9.396
5 50.0 2500 7.568
6 60.0 3600 5.739
7 70.0 4900 3.911
8 80.0 6400 2.082
fig = plt.figure()
sns.regplot(x="sqrt_dist_to_mrt_m", y="price_twd_msq", data=taiwan_real_estate, ci=None)
# Add a layer of your prediction points
sns.scatterplot(x="sqrt_dist_to_mrt_m", y="price_twd_msq", data=prediction_data, The response variable can be transformed too, but this means you need an extra step at the end to undo that transformation. That is, you “back transform” the predictions.
In the video, you saw the first step of the digital advertising workflow: spending money to buy ads, and counting how many people see them (the “impressions”). The next step is determining how many people click on the advert after seeing it.
ad_conversion is available as a pandas DataFrame.
# Create qdrt_n_impressions and qdrt_n_clicks
ad_conversion["qdrt_n_impressions"] = ad_conversion['n_impressions'] ** 0.25
ad_conversion["qdrt_n_clicks"] = ad_conversion['n_clicks'] ** 0.25
plt.figure()
# Plot using the transformed variables
sns.regplot(x = "qdrt_n_impressions",
y = "qdrt_n_clicks",
data = ad_conversion)
plt.show()

# Run a linear regression of your transformed variables
mdl_click_vs_impression = ols("qdrt_n_clicks ~ qdrt_n_impressions", data = ad_conversion).fit()

explanatory_data = pd.DataFrame({"qdrt_n_impressions": np.arange(0, 3e6+1, 5e5) ** .25,
"n_impressions": np.arange(0, 3e6+1, 5e5)})
# Complete prediction_data
prediction_data = explanatory_data.assign(
qdrt_n_clicks = mdl_click_vs_impression.predict(explanatory_data)
)
# Print the result
print(prediction_data)

   qdrt_n_impressions  n_impressions  qdrt_n_clicks
0 0.000 0.000e+00 0.072
1 26.591 5.000e+05 3.038
2 31.623 1.000e+06 3.599
3 34.996 1.500e+06 3.975
4 37.606 2.000e+06 4.266
5 39.764 2.500e+06 4.507
6 41.618 3.000e+06 4.714
In order to correctly interpret and visualize your predictions, you’ll need to do a back-transformation.
# Back transform qdrt_n_clicks
prediction_data["n_clicks"] = prediction_data['qdrt_n_clicks'] ** 4
print(prediction_data)

   qdrt_n_impressions  n_impressions  qdrt_n_clicks    n_clicks
0 0.000 0.000e+00 0.072 2.650e-05
1 26.591 5.000e+05 3.038 8.514e+01
2 31.623 1.000e+06 3.599 1.677e+02
3 34.996 1.500e+06 3.975 2.497e+02
4 37.606 2.000e+06 4.266 3.312e+02
5 39.764 2.500e+06 4.507 4.125e+02
6 41.618 3.000e+06 4.714 4.936e+02
# Plot the transformed variables
fig = plt.figure()
sns.regplot(x="qdrt_n_impressions", y="qdrt_n_clicks", data=ad_conversion, ci=None)
# Add a layer of your prediction points
sns.scatterplot(x = "qdrt_n_impressions", y = "qdrt_n_clicks", data = prediction_data, color="red")
plt.show()

Model Fit
- .summary() method shows several performance metrics in its output
- .rsquared attribute gives the coefficient of determination of a model as a float
  - e.g. print(mdl.rsquared)
  - can also be calculated as the square of the correlation between x and y
  - e.g. coeff_determination = df['x'].corr(df['y']) ** 2
- Residual standard error (RSE)
  - roughly a measure of the typical size of the residuals
  - the mean squared error (MSE) is the squared RSE, i.e. RSE = sqrt(MSE)
  - the .summary() output doesn’t contain the RSE, but the MSE can be retrieved from the .mse_resid attribute, so rse = np.sqrt(mdl.mse_resid)
to calculate RSE manually (a worked sketch follows below):
- square the residuals: residuals_sq = mdl.resid ** 2
- take the sum of the squared residuals: resid_sum_squares = sum(residuals_sq)
- calculate the degrees of freedom of the residuals: observations minus coefficients, e.g. deg_freedom = len(df.index) - 2
- rse = np.sqrt(resid_sum_squares / deg_freedom)

root mean square error (RMSE)
- same as RSE except without the degrees of freedom correction: np.sqrt(resid_sum_squares / n_obs)
- worse for comparisons between models; typically you should use RSE instead of RMSE
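A minimal sketch of the manual RSE/RMSE calculation on the convenience-store model, checked against .mse_resid (assumes mdl_price_vs_conv and taiwan_real_estate are in scope):

```python
import numpy as np

# Sum of squared residuals
resid_sum_squares = np.sum(mdl_price_vs_conv.resid ** 2)

# Degrees of freedom: observations minus estimated coefficients (intercept + slope)
n_obs = len(taiwan_real_estate.index)
deg_freedom = n_obs - 2

# Residual standard error vs. root mean square error
rse = np.sqrt(resid_sum_squares / deg_freedom)
rmse = np.sqrt(resid_sum_squares / n_obs)

# RSE should match np.sqrt(mdl_price_vs_conv.mse_resid)
print(rse, rmse, np.sqrt(mdl_price_vs_conv.mse_resid))
```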
# Print a summary of mdl_click_vs_impression_orig
print(mdl_click_vs_impression_orig.summary())
# Print a summary of mdl_click_vs_impression_trans
print(mdl_click_vs_impression_trans.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable: n_clicks R-squared: 0.892
Model: OLS Adj. R-squared: 0.891
Method: Least Squares F-statistic: 7683.
Date: Sat, 03 Feb 2024 Prob (F-statistic): 0.00
Time: 03:44:52 Log-Likelihood: -4126.7
No. Observations: 936 AIC: 8257.
Df Residuals: 934 BIC: 8267.
Df Model: 1
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
Intercept 1.6829 0.789 2.133 0.033 0.135 3.231
n_impressions 0.0002 1.96e-06 87.654 0.000 0.000 0.000
==============================================================================
Omnibus: 247.038 Durbin-Watson: 0.870
Prob(Omnibus): 0.000 Jarque-Bera (JB): 13215.277
Skew: -0.258 Prob(JB): 0.00
Kurtosis: 21.401 Cond. No. 4.88e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.88e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
OLS Regression Results
==============================================================================
Dep. Variable: qdrt_n_clicks R-squared: 0.945
Model: OLS Adj. R-squared: 0.944
Method: Least Squares F-statistic: 1.590e+04
Date: Sat, 03 Feb 2024 Prob (F-statistic): 0.00
Time: 03:44:52 Log-Likelihood: 193.90
No. Observations: 936 AIC: -383.8
Df Residuals: 934 BIC: -374.1
Df Model: 1
Covariance Type: nonrobust
======================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------
Intercept 0.0717 0.017 4.171 0.000 0.038 0.106
qdrt_n_impressions 0.1115 0.001 126.108 0.000 0.110 0.113
==============================================================================
Omnibus: 11.447 Durbin-Watson: 0.568
Prob(Omnibus): 0.003 Jarque-Bera (JB): 10.637
Skew: -0.216 Prob(JB): 0.00490
Kurtosis: 2.707 Cond. No. 52.1
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Print the coeff of determination for mdl_click_vs_impression_orig
print(mdl_click_vs_impression_orig.rsquared)
# Print the coeff of determination for mdl_click_vs_impression_trans
print(mdl_click_vs_impression_trans.rsquared)

0.8916134973508041
0.9445272817143905
# Calculate mse_orig for mdl_click_vs_impression_orig
mse_orig = mdl_click_vs_impression_orig.mse_resid
# Calculate rse_orig for mdl_click_vs_impression_orig and print it
rse_orig = np.sqrt(mse_orig)
print("RSE of original model: ", rse_orig)
# Calculate mse_trans for mdl_click_vs_impression_trans
mse_trans = mdl_click_vs_impression_trans.mse_resid
# Calculate rse_trans for mdl_click_vs_impression_trans and print it
rse_trans = np.sqrt(mse_trans)
print("RSE of transformed model: ", rse_trans)RSE of original model: 19.905838862478138
RSE of transformed model: 0.19690640896875722Visualizing Fit
- residual plot with residplot from seaborn
  - sns.residplot(x = 'x', y = 'y', data = df, lowess = True)  (lowess draws a smoothed curve)
  - plt.xlabel('fitted values'), plt.ylabel('residuals')
- qq-plot from statsmodels
  - from statsmodels.api import qqplot
  - qqplot(data = model.resid, fit = True, line = "45") will compare the data’s quantiles to a normal distribution
- scale-location plot
  - first extract the normalized residuals from the model using get_influence() and the resid_studentized_internal attribute:
    model_norm_residuals = model.get_influence().resid_studentized_internal
  - then take the square root of the absolute values of these normalized residuals to standardize them:
    model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
  - sns.regplot(x = model.fittedvalues, y = model_norm_residuals_abs_sqrt, ci = None, lowess = True)
  - plt.xlabel('fitted values'), plt.ylabel('sqrt of abs val of studentized residuals')
# Residual Plot
# Plot the residuals vs. fitted values
sns.residplot(x='n_convenience', y='price_twd_msq', data=taiwan_real_estate, lowess = True)
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
# Show the plot
plt.show()

# Import qqplot
from statsmodels.api import qqplot
# Create the Q-Q plot of the residuals
qqplot(data=mdl_price_vs_conv.resid, fit=True, line="45")
# Show the plot
plt.show()

# Preprocessing steps
model_norm_residuals = mdl_price_vs_conv.get_influence().resid_studentized_internal
model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
# Create the scale-location plot
sns.regplot(x=mdl_price_vs_conv.fittedvalues, y=model_norm_residuals_abs_sqrt, ci=None, lowess=True)
plt.xlabel("Fitted values")
plt.ylabel("Sqrt of abs val of stdized residuals")
# Show the plot
plt.show()

outliers, leverage, influence
extreme explanatory values (x)
- leverage: a measure of how extreme the explanatory variable values are
- influence: measures how much the model would change if the observation were omitted from the modeling
  - like ‘torque’: linear force times the length of the wrench (when turning)
  - the influence of each observation is based on the size of its residual and its leverage
- leverage and influence are retrieved from the .summary_frame() method on the result of a model’s get_influence() method, i.e. model.get_influence().summary_frame()
  - for historical reasons leverage is described by the “hat matrix”, so leverage values are stored in the hat_diag column of the summary frame:
    summary_df = model.get_influence().summary_frame()
    df['leverage'] = summary_df['hat_diag']
- Cook’s distance: measures influence
  - stored in the summary frame as cooks_d: df['cooks_dist'] = summary_df['cooks_d']
  - can sort to find the most influential observations: print(df.sort_values("cooks_dist", ascending = False))
# Create summary_info
summary_info = mdl_price_vs_dist.get_influence().summary_frame()

print(summary_info)
dfb_Intercept dfb_np.sqrt(dist_to_mrt_m) cooks_d standard_resid hat_diag dffits_internal student_resid dffits
0 -9.489e-02 7.354e-02 4.648e-03 -1.266 0.006 -0.096 -1.267 -0.096
1 -1.398e-02 8.690e-03 1.217e-04 -0.263 0.004 -0.016 -0.263 -0.016
2 2.551e-02 -9.963e-03 6.231e-04 0.688 0.003 0.035 0.688 0.035
3 5.553e-02 -2.169e-02 2.939e-03 1.495 0.003 0.077 1.497 0.077
4 -9.325e-04 5.182e-04 6.055e-07 -0.020 0.003 -0.001 -0.020 -0.001
.. ... ... ... ... ... ... ... ...
# Create summary_info
summary_info = mdl_price_vs_dist.get_influence().summary_frame()
print(summary_info)
# Add the hat_diag column to taiwan_real_estate, name it leverage
taiwan_real_estate["leverage"] = summary_info['hat_diag']
# Sort taiwan_real_estate by leverage in descending order and print the head
print(taiwan_real_estate.sort_values("leverage", ascending = False).head())[414 rows x 8 columns]
dist_to_mrt_m n_convenience house_age_years price_twd_msq leverage
347 6488.021 1 15 to 30 3.389 0.027
116 6396.283 1 30 to 45 3.691 0.026
249 6306.153 1 15 to 30 4.539 0.026
255 5512.038 1 30 to 45 5.265 0.021
8 5512.038 1 30 to 45 5.688 0.021
# Create summary_info
summary_info = mdl_price_vs_dist.get_influence().summary_frame()
# Add the hat_diag column to taiwan_real_estate, name it leverage
taiwan_real_estate["leverage"] = summary_info["hat_diag"]
# Add the cooks_d column to taiwan_real_estate, name it cooks_dist
taiwan_real_estate['cooks_dist'] = summary_info['cooks_d']
# Sort taiwan_real_estate by cooks_dist in descending order and print the head.
print(taiwan_real_estate.sort_values("cooks_dist", ascending = False).head())# Sort taiwan_real_estate by cooks_dist in descending order and print the head.
print(taiwan_real_estate.sort_values("cooks_dist", ascending = False).head())
dist_to_mrt_m n_convenience house_age_years price_twd_msq leverage cooks_dist
270 252.582 1 0 to 15 35.552 0.004 0.116
148 3780.590 0 15 to 30 13.646 0.012 0.052
228 3171.329 0 0 to 15 14.100 0.009 0.035
220 186.510 9 30 to 45 23.691 0.004 0.025
113 393.261 6 0 to 15 2.300 0.003 0.023
Logistic Regression
- use the logit function from statsmodels.formula.api:
  from statsmodels.formula.api import logit
  model = logit("y ~ x", data = df).fit()
  - otherwise the same as the ols() function
- visualize with sns.regplot()
  - adding the logistic = True argument will show a logistic trend line
In a sns.displot() call on the churn data, plot time_since_last_purchase as two histograms, split for each has_churned value.
# Create the histograms of time_since_last_purchase split by has_churned
sns.displot(x = "time_since_last_purchase", col = "has_churned", data = churn)
plt.show()

# Draw a linear regression trend line and a scatter plot of time_since_first_purchase vs. has_churned
sns.regplot(x = "time_since_first_purchase",
y = "has_churned",
data = churn,
line_kws={"color": "red"})
plt.show()

# Draw a linear regression trend line and a scatter plot of time_since_first_purchase vs. has_churned
sns.regplot(x="time_since_first_purchase",
y="has_churned",
data=churn,
ci=None,
line_kws={"color": "red"})
# Draw a logistic regression trend line and a scatter plot of time_since_first_purchase vs. has_churned
sns.regplot(x="time_since_first_purchase",
y="has_churned",
data=churn,
ci=None,
logistic = True,
line_kws={"color": "blue"})
plt.show()

# Import logit
from statsmodels.formula.api import logit
# Fit a logistic regression of churn vs. length of relationship using the churn dataset
mdl_churn_vs_relationship = logit("has_churned ~ time_since_first_purchase", data = churn).fit()
# Print the parameters of the fitted model
print(mdl_churn_vs_relationship.params)

Optimization terminated successfully.
Current function value: 0.679663
Iterations 4
Intercept -0.015
time_since_first_purchase -0.355
dtype: float64
Predictions & odds ratios
- predict the same way as with ols:
  - create a DataFrame of explanatory variable values
  - then add a response column calculated using the predict method
  explanatory_data = pd.DataFrame({'x': np.arange(1, 2, 8)})
  prediction_data = explanatory_data.assign(y = model.predict(explanatory_data))
- can also calculate the most likely response (whether the outcome happens or not), rather than a probability
  - calculate by rounding the predicted probability with np.round():
    prediction_data['most_likely_outcome'] = np.round(prediction_data['y'])
- another way to talk about binary responses is the odds ratio
  - odds ratio: the probability of something happening divided by the probability of it not happening
  - as an example, a probability of 0.25 gives odds of 0.25 / 0.75 = 1/3, i.e. “three to one against” (the probability of it not happening is 3 times the probability of it happening)
  prediction_data['odds_ratio'] = prediction_data['y'] / (1 - prediction_data['y'])
- can plot the odds ratio (y) vs. the x variable
  - the nice thing is that on a log (y) scale it increases linearly with x
- the logarithm of the odds ratio (the logit) is another way to describe the predictions:
  prediction_data['log_odds_ratio'] = np.log(prediction_data['odds_ratio'])

examples
Two variables are available:
mdl_churn_vs_relationship is the fitted logistic regression model of has_churned versus time_since_first_purchase.
explanatory_data is a DataFrame of explanatory values.
# Create prediction_data
prediction_data = explanatory_data.assign(
has_churned = mdl_churn_vs_relationship.predict(explanatory_data)
)
# Print the head
print(prediction_data.head())
# Create prediction_data
prediction_data = explanatory_data.assign(
has_churned = mdl_churn_vs_relationship.predict(explanatory_data)
)
fig = plt.figure()
# Create a scatter plot with logistic trend line
sns.regplot(x = "time_since_first_purchase",
y = "has_churned",
data = churn,
logistic = True)
# Overlay with prediction_data, colored red
sns.scatterplot(x = "time_since_first_purchase",
y = "has_churned",
data = prediction_data,
color = "red")
plt.show()

# Update prediction data by adding most_likely_outcome
prediction_data["most_likely_outcome"] = np.round(prediction_data['has_churned'])
# Print the head
print(prediction_data.head())

   time_since_first_purchase  has_churned  most_likely_outcome
0 -1.50 0.626 1.0
1 -1.25 0.605 1.0
2 -1.00 0.584 1.0
3 -0.75 0.562 1.0
4 -0.50 0.540 1.0
# Update prediction data by adding most_likely_outcome
prediction_data["most_likely_outcome"] = np.round(prediction_data["has_churned"])
fig = plt.figure()
# Create a scatter plot with logistic trend line (from previous exercise)
sns.regplot(x="time_since_first_purchase",
y="has_churned",
data=churn,
ci=None,
logistic=True)
# Overlay with prediction_data, colored red
sns.scatterplot(x = "time_since_first_purchase",
y = "most_likely_outcome",
data = prediction_data,
color = "red")
plt.show()

# Update prediction data with odds_ratio
prediction_data["odds_ratio"] = prediction_data['has_churned'] / (1 - prediction_data['has_churned'])
# Print the head
print(prediction_data.head())

   time_since_first_purchase  has_churned  most_likely_outcome  odds_ratio
0 -1.50 0.626 1.0 1.677
1 -1.25 0.605 1.0 1.535
2 -1.00 0.584 1.0 1.404
3 -0.75 0.562 1.0 1.285
4 -0.50 0.540 1.0 1.176
# Update prediction data with odds_ratio
prediction_data["odds_ratio"] = prediction_data["has_churned"] / (1 - prediction_data["has_churned"])
fig = plt.figure()
# Create a line plot of odds_ratio vs time_since_first_purchase
sns.lineplot(x = "time_since_first_purchase",
y = "odds_ratio",
data = prediction_data)
# Add a dotted horizontal line at odds_ratio = 1
plt.axhline(y=1, linestyle="dotted")
plt.show()

# Update prediction data with log_odds_ratio
prediction_data['log_odds_ratio'] = np.log(prediction_data['odds_ratio'])
# Print the head
print(prediction_data.head())

   time_since_first_purchase  has_churned  most_likely_outcome  odds_ratio  log_odds_ratio
0 -1.50 0.626 1.0 1.677 0.517
1 -1.25 0.605 1.0 1.535 0.428
2 -1.00 0.584 1.0 1.404 0.340
3 -0.75 0.562 1.0 1.285 0.251
4 -0.50 0.540 1.0 1.176 0.162
# Update prediction data with log_odds_ratio
prediction_data["log_odds_ratio"] = np.log(prediction_data["odds_ratio"])
fig = plt.figure()
# Update the line plot: log_odds_ratio vs. time_since_first_purchase
sns.lineplot(x="time_since_first_purchase",
y="log_odds_ratio",
data=prediction_data)
# Add a dotted horizontal line at log_odds_ratio = 0
plt.axhline(y=0, linestyle="dotted")
plt.show()

Logistic Fit
Confusion matrix: counts of each combination of actual and predicted responses

|              | Predicted False | Predicted True |
|---|---|---|
| Actual False | Correct         | False Positive |
| Actual True  | False Negative  | Correct        |
actual_response = df['y'] # from original dataframe
predicted_response = np.round(model.predict()) # round the probability from model
outcomes = pd.DataFrame({'actual_response': actual_response,
'predicted_response': predicted_response})
print(outcomes.value_counts(sort = False)) # use the .value_counts() method
- the confusion matrix can also be generated automatically with the .pred_table() method on the fitted model
  - the first row holds the true negative and false positive counts; the second row holds the false negative and true positive counts
- the mosaic function from statsmodels.graphics.mosaicplot easily visualizes confusion matrices
  - the width of each column is proportional to the fraction of observations in each category of actual values; each column then displays the fraction of predicted observations with each value
TN = conf_matrix[0,0]
TP = conf_matrix[1,1]
FN = conf_matrix[1,0]
FP = conf_matrix[0,1]
- Accuracy: the proportion of correct predictions
  acc = (TN + TP) / (TN + TP + FN + FP)
- Sensitivity: the proportion of actual positives that were correctly predicted, i.e. observations where the actual value was true and the model also predicted true
  sens = TP / (FN + TP)
- Specificity: the proportion of actual negatives that were correctly predicted, i.e. observations where the actual value was false and the model also predicted false
  spec = TN / (TN + FP)
- Tradeoff between sensitivity and specificity (increasing one will decrease the other)
examples
# Get the actual responses
actual_response = churn['has_churned']
# Get the predicted responses
predicted_response = np.round(mdl_churn_vs_relationship.predict())
# Create outcomes as a DataFrame of both Series
outcomes = pd.DataFrame({'actual_response': actual_response,
'predicted_response': predicted_response})
# Print the outcomes
print(outcomes.value_counts(sort = False))

actual_response  predicted_response
0 0.0 112
1.0 88
1 0.0 76
1.0 124
dtype: int64
# Import mosaic from statsmodels.graphics.mosaicplot
from statsmodels.graphics.mosaicplot import mosaic
# Calculate the confusion matrix conf_matrix
conf_matrix = mdl_churn_vs_relationship.pred_table()
# Print it
print(conf_matrix)
# Draw a mosaic plot of conf_matrix
mosaic(conf_matrix)
plt.show()

[[112.  88.]
[ 76. 124.]]
# Extract TN, TP, FN and FP from conf_matrix
TN = conf_matrix[0,0]
TP = conf_matrix[1,1]
FN = conf_matrix[1,0]
FP = conf_matrix[0,1]
# Calculate and print the accuracy
accuracy = (TN + TP) / (TN + TP + FN + FP)
print("accuracy: ", accuracy)
# Calculate and print the sensitivity
sensitivity = TP / (FN + TP)
print("sensitivity: ", sensitivity)
# Calculate and print the specificity
specificity = TN / (TN + FP)
print("specificity: ", specificity)accuracy: 0.59
sensitivity: 0.62
specificity: 0.56