#!/usr/bin/env python
# coding: utf-8

# ## Chapter 18 - Metric Predicted Variable with Multiple Metric Predictors
# - [18.1 - Multiple Linear Regression](#18.1---Multiple-Linear-Regression)
#     - [18.1.4 - Redundant predictors](#18.1.4---Redundant-predictors)
# - [18.2 - Multiplicative Interaction of Metric Predictors](#18.2---Multiplicative-Interaction-of-Metric-Predictors)
# - [18.3 - Shrinkage of Regression Coefficients](#18.3---Shrinkage-of-Regression-Coefficients)

# In[1]:

# %load ../../standard_import.txt
import pandas as pd
import numpy as np
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import gridspec
from IPython.display import Image

get_ipython().run_line_magic('matplotlib', 'inline')
# 'seaborn-white' was renamed in matplotlib 3.6; use the versioned style name
plt.style.use('seaborn-v0_8-white')
color = '#87ceeb'
f_dict = {'size': 16}

# In[2]:

get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', '-p matplotlib,numpy,pandas,pymc,seaborn')

# ### 18.1 - Multiple Linear Regression

# #### Data

# In[3]:

df = pd.read_csv('data/Guber1999data.csv')
df.info()

# In[4]:

df.head()

# In[5]:

X = df[['Spend', 'PrcntTake']]
y = df['SATT']

# Standardize the predictors and the predicted variable
meanx = X.mean().to_numpy()
scalex = X.std().to_numpy()
zX = ((X - meanx) / scalex).to_numpy()

meany = y.mean()
scaley = y.std()
zy = ((y - meany) / scaley).to_numpy()

# #### Model (Kruschke, 2015)

# In[6]:

Image('images/fig18_4.png', width=400)

# In[7]:

with pm.Model() as model:
    zbeta0 = pm.Normal('zbeta0', mu=0, sigma=2)
    # A single vector-valued prior; equivalent to declaring separate
    # zbeta1 and zbeta2 scalars and summing the two products.
    zbetaj = pm.Normal('zbetaj', mu=0, sigma=2, shape=2)
    zmu = zbeta0 + pm.math.dot(zbetaj, zX.T)

    nu = pm.Exponential('nu', 1/29.)
    zsigma = pm.Uniform('zsigma', 10**-5, 10)

    likelihood = pm.StudentT('likelihood', nu=nu, mu=zmu, lam=1/zsigma**2, observed=zy)

# In[8]:

with model:
    idata = pm.sample(10000, chains=4, target_accept=0.9)

# In[9]:

with model:
    az.plot_trace(idata);

# #### Figure 18.5

# In[10]:

# Transform parameters back to original scale
beta0 = (idata.posterior['zbeta0']*scaley + meany
         - (idata.posterior['zbetaj']*meanx/scalex).sum(axis=2)*scaley)
betaj = (idata.posterior['zbetaj']/scalex)*scaley
scale = idata.posterior['zsigma']*scaley

intercept = beta0
spend = betaj.sel(zbetaj_dim_0=0)
prcnttake = betaj.sel(zbetaj_dim_0=1)
normality = idata.posterior['nu']

fig, ([ax1, ax2, ax3], [ax4, ax5, ax6]) = plt.subplots(2, 3, figsize=(12, 6))

for ax, estimate, title, xlabel in zip(fig.axes,
                                       [intercept, spend, prcnttake, scale, normality],
                                       ['Intercept', 'Spend', 'PrcntTake', 'Scale', 'Normality'],
                                       [r'$\beta_0$', r'$\beta_1$', r'$\beta_2$', r'$\sigma$', r'$\nu$']):
    az.plot_posterior(estimate, point_estimate='mode', ax=ax, color=color)
    ax.set_title(title, fontdict=f_dict)
    ax.set_xlabel(xlabel, fontdict=f_dict)
fig.tight_layout()
ax6.set_visible(False)
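# The back-transformation above follows directly from the standardization: with
# $z_{x_j} = (x_j - \bar{x}_j)/s_{x_j}$ and $z_y = (y - \bar{y})/s_y$, the
# standardized coefficients $\zeta_0$ (`zbeta0`) and $\zeta_j$ (`zbetaj`)
# convert to the original scale as
#
# $$\beta_j = \zeta_j \frac{s_y}{s_{x_j}}, \qquad
# \beta_0 = \zeta_0\, s_y + \bar{y} - \sum_j \zeta_j \frac{\bar{x}_j\, s_y}{s_{x_j}}.$$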
# Below we create the scatterplots of figure 18.5 using `pairplot()` in seaborn
# and then tweak the lower triangle.

# In[11]:

# DataFrame with the columns in the correct order
pair_plt = pd.DataFrame({'Intercept': intercept.values.flatten(),
                         'Spend': spend.values.flatten(),
                         'PrcntTake': prcnttake.values.flatten(),
                         'Scale': scale.values.flatten(),
                         'Normality': normality.values.flatten()},
                        columns=['Intercept', 'Spend', 'PrcntTake', 'Scale', 'Normality'])

# Correlation coefficients
corr = np.round(np.corrcoef(pair_plt, rowvar=False), decimals=3)
# Indexes of the lower triangle, below the diagonal
lower_idx = np.tril(corr, -1).nonzero()

# The seaborn pairplot
pgrid = sns.pairplot(pair_plt, plot_kws={'edgecolor': color, 'facecolor': 'none'})

# Replace the plots on the diagonal with the parameter names
for i, ax in enumerate(pgrid.diag_axes):
    ax.clear()
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.text(.5, .5, pair_plt.columns[i], transform=ax.transAxes,
            fontdict={'size': 16, 'weight': 'bold'}, ha='center')

# Replace the lower triangle with the correlation coefficients
for i, ax in enumerate(pgrid.axes[lower_idx]):
    ax.collections[0].remove()
    ax.xaxis.set_visible(False)
    ax.text(.5, .5, corr[lower_idx][i], transform=ax.transAxes,
            fontdict=f_dict, ha='center')

# ### 18.1.4 - Redundant predictors

# In[12]:

# Add a predictor that is perfectly (negatively) correlated with PrcntTake
X2 = X.assign(PropNotTake=(100 - df['PrcntTake']) / 100)

meanx2 = X2.mean().to_numpy()
scalex2 = X2.std().to_numpy()
zX2 = ((X2 - meanx2) / scalex2).to_numpy()

# In[13]:

sns.pairplot(X2, plot_kws={'edgecolor': color, 'facecolor': 'none'})

# In[14]:

with pm.Model() as model2:
    zbeta0 = pm.Normal('zbeta0', mu=0, sigma=2)
    zbetaj = pm.Normal('zbetaj', mu=0, sigma=2, shape=3)
    zmu = zbeta0 + pm.math.dot(zbetaj, zX2.T)

    nu = pm.Exponential('nu', 1/29.)
    zsigma = pm.Uniform('zsigma', 10**-5, 10)

    likelihood = pm.StudentT('likelihood', nu=nu, mu=zmu, lam=1/zsigma**2, observed=zy)

# In[15]:

with model2:
    idata2 = pm.sample(5000, chains=4, target_accept=0.9)

# In[16]:

with model2:
    az.plot_trace(idata2);

# **Note**: The trace/histogram above for `zbetaj` may look as if one of the
# chains got stuck (i.e., all of its values piled up just above zero). Looking
# closely at the plot, however, you can tell that it is not a *chain* that is
# piled up above zero; it is one of the parameters in the *array* stored in
# `zbetaj`. Specifically, it is the first parameter, the coefficient of the
# *Spend* variable.
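# One way to verify this (a quick check, not in the original notebook) is to
# summarize each component of `zbetaj` separately: the first row, the *Spend*
# coefficient, is the one concentrated just above zero in all chains.

# In[ ]:

az.summary(idata2, var_names=['zbetaj'])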
# #### Figure 18.6

# In[17]:

# Transform parameters back to original scale
beta0 = (idata2.posterior['zbeta0']*scaley + meany
         - (idata2.posterior['zbetaj']*meanx2/scalex2).sum(axis=2)*scaley)
betaj = (idata2.posterior['zbetaj']/scalex2)*scaley
scale = idata2.posterior['zsigma']*scaley

intercept = beta0
spend = betaj.sel(zbetaj_dim_0=0)
prcnttake = betaj.sel(zbetaj_dim_0=1)
propnottake = betaj.sel(zbetaj_dim_0=2)
normality = idata2.posterior['nu']

fig, ([ax1, ax2, ax3], [ax4, ax5, ax6]) = plt.subplots(2, 3, figsize=(12, 6))

for ax, estimate, title, xlabel in zip(fig.axes,
                                       [intercept, spend, prcnttake, propnottake, scale, normality],
                                       ['Intercept', 'Spend', 'PrcntTake', 'PropNotTake', 'Scale', 'Normality'],
                                       [r'$\beta_0$', r'$\beta_1$', r'$\beta_2$', r'$\beta_3$', r'$\sigma$', r'$\nu$']):
    az.plot_posterior(estimate, point_estimate='mode', ax=ax, color=color)
    ax.set_title(title, fontdict=f_dict)
    ax.set_xlabel(xlabel, fontdict=f_dict)
fig.tight_layout()

# In[18]:

# DataFrame with the columns in the correct order
pair_plt = pd.DataFrame({'Intercept': intercept.values.flatten(),
                         'Spend': spend.values.flatten(),
                         'PrcntTake': prcnttake.values.flatten(),
                         'PropNotTake': propnottake.values.flatten(),
                         'Scale': scale.values.flatten(),
                         'Normality': normality.values.flatten()},
                        columns=['Intercept', 'Spend', 'PrcntTake', 'PropNotTake', 'Scale', 'Normality'])

# Correlation coefficients
corr = np.round(np.corrcoef(pair_plt, rowvar=False), decimals=3)
# Indexes of the lower triangle, below the diagonal
lower_idx = np.tril(corr, -1).nonzero()

# The seaborn pairplot
pgrid = sns.pairplot(pair_plt, plot_kws={'edgecolor': color, 'facecolor': 'none'})

# Replace the plots on the diagonal with the parameter names
for i, ax in enumerate(pgrid.diag_axes):
    ax.clear()
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.text(.5, .5, pair_plt.columns[i], transform=ax.transAxes,
            fontdict={'size': 16, 'weight': 'bold'}, ha='center')

# Replace the lower triangle with the correlation coefficients
for i, ax in enumerate(pgrid.axes[lower_idx]):
    ax.collections[0].remove()
    ax.xaxis.set_visible(False)
    ax.text(.5, .5, corr[lower_idx][i], transform=ax.transAxes,
            fontdict=f_dict, ha='center')

# ### 18.2 - Multiplicative Interaction of Metric Predictors

# In[19]:

# Add the multiplicative interaction of Spend and PrcntTake as a third predictor
X3 = X.assign(SpendXPrcnt=lambda x: x.Spend * x.PrcntTake)

meanx3 = X3.mean().to_numpy()
scalex3 = X3.std().to_numpy()
zX3 = ((X3 - meanx3) / scalex3).to_numpy()

# In[20]:

# Correlation matrix
X3.corr()
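# The product term is highly correlated with both of its component predictors.
# As an aside (a sketch, not in the original notebook): mean-centering the
# predictors before forming the product greatly reduces this correlation, which
# typically eases sampling. The analysis below follows the book and keeps the
# uncentered product.

# In[ ]:

Xc = X - X.mean()                                    # mean-center the predictors
Xc = Xc.assign(SpendXPrcnt=Xc.Spend * Xc.PrcntTake)  # product of centered columns
Xc.corr()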
# In[21]:

with pm.Model() as model3:
    zbeta0 = pm.Normal('zbeta0', mu=0, sigma=2)
    zbetaj = pm.Normal('zbetaj', mu=0, sigma=2, shape=3)
    zmu = zbeta0 + pm.math.dot(zbetaj, zX3.T)

    nu = pm.Exponential('nu', 1/29.)
    zsigma = pm.Uniform('zsigma', 10**-5, 10)

    likelihood = pm.StudentT('likelihood', nu=nu, mu=zmu, lam=1/zsigma**2, observed=zy)

# In[22]:

with model3:
    idata3 = pm.sample(20000, chains=4, target_accept=0.9)

# #### Figure 18.9

# In[23]:

# Transform parameters back to original scale
beta0 = (idata3.posterior['zbeta0']*scaley + meany
         - (idata3.posterior['zbetaj']*meanx3/scalex3).sum(axis=2)*scaley)
betaj = (idata3.posterior['zbetaj']/scalex3)*scaley
scale = idata3.posterior['zsigma']*scaley

intercept = beta0
spend = betaj.sel(zbetaj_dim_0=0)
prcnttake = betaj.sel(zbetaj_dim_0=1)
spendxprcnt = betaj.sel(zbetaj_dim_0=2)
normality = idata3.posterior['nu']

fig, ([ax1, ax2, ax3], [ax4, ax5, ax6]) = plt.subplots(2, 3, figsize=(12, 6))

for ax, estimate, title, xlabel in zip(fig.axes,
                                       [intercept, spend, prcnttake, spendxprcnt, scale, normality],
                                       ['Intercept', 'Spend', 'PrcntTake', 'SpendXPrcnt', 'Scale', 'Normality'],
                                       [r'$\beta_0$', r'$\beta_1$', r'$\beta_2$', r'$\beta_3$', r'$\sigma$', r'$\nu$']):
    az.plot_posterior(estimate, point_estimate='mode', ax=ax, color=color)
    ax.set_title(title, fontdict=f_dict)
    ax.set_xlabel(xlabel, fontdict=f_dict)
fig.tight_layout()

# In[24]:

# Posterior of the interaction coefficient on the original scale
spendxprcnt

# In[25]:

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))

# Slope on Spend as a function of PrcntTake: beta_1 + beta_3 * PrcntTake
prcnttake_values = np.linspace(4, 80, 20)
spend_slope = spend.values.flatten() + spendxprcnt.values.flatten()*prcnttake_values.reshape(-1, 1)
hpds = np.array([az.hdi(spend_slope[i, :]) for i in range(len(prcnttake_values))])
spend_slope_means = spend_slope.mean(axis=1)

ax1.errorbar(prcnttake_values, spend_slope_means,
             yerr=[spend_slope_means - hpds[:, 0], hpds[:, 1] - spend_slope_means],
             color=color, linestyle='None', marker='o', markersize=10)
ax1.axhline(linestyle='dotted', color='k')
ax1.set_xlim(0, 82)
ax1.set(xlabel='Value of PrcntTake', ylabel='Slope on Spend')

# Slope on PrcntTake as a function of Spend: beta_2 + beta_3 * Spend
spend_values = np.linspace(3.75, 9.75, 20)
prcnttake_slope = prcnttake.values.flatten() + spendxprcnt.values.flatten()*spend_values.reshape(-1, 1)
hpds2 = np.array([az.hdi(prcnttake_slope[i, :]) for i in range(len(spend_values))])
prcnttake_slope_means = prcnttake_slope.mean(axis=1)

ax2.errorbar(spend_values, prcnttake_slope_means,
             yerr=[prcnttake_slope_means - hpds2[:, 0], hpds2[:, 1] - prcnttake_slope_means],
             color=color, linestyle='None', marker='o', markersize=10)
ax2.set_xlim(3.5, 10)
ax2.set(xlabel='Value of Spend', ylabel='Slope on PrcntTake');

# ### 18.3 - Shrinkage of Regression Coefficients

# In *`R`* I ran the first 46 lines of code in the script
# `Jags-Ymet-XmetMulti-MrobustShrink-Example.R` to generate exactly the same
# data used in the book, and exported the resulting data frame `myData` to a
# csv file.
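# For reference, a rough Python equivalent of that generation step (a sketch:
# it assumes the 12 extra predictors are independent standard-normal draws;
# the seed and the `xRand*` column names here are hypothetical, so the values
# would not match the book's, which is why the csv exported from R is read below).

# In[ ]:

rng = np.random.default_rng(12345)  # hypothetical seed, not the book's
rand_cols = pd.DataFrame(rng.standard_normal((len(df), 12)),
                         columns=[f'xRand{i+1}' for i in range(12)])
df_shrink_alt = pd.concat([df, rand_cols], axis=1)  # not used further below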
# In[26]:

df_shrink = pd.read_csv('data/18_3shrinkage.csv')
df_shrink.info()

# In[27]:

# Check the bivariate associations between SATT and the other variables
df_shrink.corrwith(df_shrink['SATT'])

# In[28]:

# Select the predictor columns: Spend, PrcntTake and the 12 randomly generated predictors
X4 = df_shrink.drop(columns=['State', 'StuTeaRat', 'Salary', 'SATV', 'SATM', 'SATT'])
y4 = df_shrink.SATT

meanx4 = X4.mean().to_numpy()
scalex4 = X4.std().to_numpy()
zX4 = ((X4 - meanx4) / scalex4).to_numpy()

meany4 = y4.mean()
scaley4 = y4.std()
zy4 = ((y4 - meany4) / scaley4).to_numpy()

# ##### Hierarchical model with fixed, independent, vague normal priors

# #### Model (Kruschke, 2015)

# In[29]:

Image('images/fig18_4.png', width=400)

# In[30]:

with pm.Model() as model4:
    zbeta0 = pm.Normal('zbeta0', mu=0, sigma=2)
    zbetaj = pm.Normal('zbetaj', mu=0, sigma=2, shape=zX4.shape[1])
    zmu = zbeta0 + pm.math.dot(zbetaj, zX4.T)

    nu = pm.Exponential('nu', 1/29.)
    zsigma = pm.Uniform('zsigma', 10**-5, 10)

    likelihood = pm.StudentT('likelihood', nu=nu, mu=zmu, lam=1/zsigma**2, observed=zy4)

# In[31]:

with model4:
    idata4 = pm.sample(5000, chains=4, target_accept=0.9)

# In[32]:

with model4:
    az.plot_trace(idata4);

# In[33]:

# Probability that the Spend coefficient is greater than zero
(idata4.posterior['zbetaj'].sel(zbetaj_dim_0=0) > 0).mean()

# #### Figure 18.11

# In[34]:

# Transform parameters back to original scale
beta0 = (idata4.posterior['zbeta0']*scaley4 + meany4
         - (idata4.posterior['zbetaj']*meanx4/scalex4).sum(axis=2)*scaley4)
betaj = (idata4.posterior['zbetaj']/scalex4)*scaley4
scale = idata4.posterior['zsigma']*scaley4

intercept = beta0
spend = betaj.sel(zbetaj_dim_0=0)
prcnttake = betaj.sel(zbetaj_dim_0=1)
normality = idata4.posterior['nu']

fig, axes = plt.subplots(4, 3, figsize=(12, 10))

# Intercept
az.plot_posterior(intercept, point_estimate='mode', ax=axes.flatten()[0], color=color)
axes.flatten()[0].set_title('Intercept', fontdict=f_dict)
# Spend & PrcntTake
az.plot_posterior(spend, point_estimate='mode', ax=axes.flatten()[1], color=color)
axes.flatten()[1].set_title('Spend', fontdict=f_dict)
az.plot_posterior(prcnttake, point_estimate='mode', ax=axes.flatten()[2], color=color)
axes.flatten()[2].set_title('PrcntTake', fontdict=f_dict)
# A subset of the randomly generated predictors
for i, j in enumerate([2, 3, 4, 11, 12, 13]):
    az.plot_posterior(betaj.sel(zbetaj_dim_0=j), point_estimate='mode',
                      ax=axes.flatten()[i+3], color=color)
    axes.flatten()[i+3].set_title(X4.columns[j], fontdict=f_dict)
# Scale
az.plot_posterior(scale, point_estimate='mode', ax=axes.flatten()[9], color=color)
axes.flatten()[9].set_title(r'$\sigma$', fontdict=f_dict)
# Normality
az.plot_posterior(normality, point_estimate='mode', ax=axes.flatten()[10], color=color)
axes.flatten()[10].set_title(r'$\nu$', fontdict=f_dict)

axes.flatten()[11].set_visible(False)
fig.tight_layout()
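# ##### Hierarchical model with t-distributed priors on the regression coefficients

# In the model below the standardized coefficients share a common scale
# `zbetajscale`, which is given a gamma prior, and each coefficient gets a
# heavy-tailed Student-t prior ($\nu=1$, i.e. a Cauchy) with that shared scale.
# Small, noisy coefficients are thereby shrunk toward zero, while credibly
# large coefficients can still escape the shrinkage.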
# In[35]:

with pm.Model() as model5:
    zbeta0 = pm.Normal('zbeta0', mu=0, sigma=2)
    zbetajscale = pm.Gamma('zbetajscale', mu=1, sigma=1)
    zbetaj = pm.StudentT('zbetaj', mu=0, sigma=zbetajscale, nu=1, shape=zX4.shape[1])
    zmu = zbeta0 + pm.math.dot(zbetaj, zX4.T)

    nu = pm.Exponential('nu', 1/29.)
    zsigma = pm.Uniform('zsigma', 10**-5, 10)

    likelihood = pm.StudentT('likelihood', nu=nu, mu=zmu, lam=1/zsigma**2, observed=zy4)

# In[36]:

with model5:
    idata5 = pm.sample(10000, chains=4, target_accept=0.9)

# In[37]:

# Probability that the Spend coefficient is greater than zero
(idata5.posterior['zbetaj'].sel(zbetaj_dim_0=0) > 0).mean()

# #### Figure 18.12

# In[38]:

# Transform parameters back to original scale
beta0 = (idata5.posterior['zbeta0']*scaley4 + meany4
         - (idata5.posterior['zbetaj']*meanx4/scalex4).sum(axis=2)*scaley4)
betaj = (idata5.posterior['zbetaj']/scalex4)*scaley4
scale = idata5.posterior['zsigma']*scaley4

intercept = beta0
spend = betaj.sel(zbetaj_dim_0=0)
prcnttake = betaj.sel(zbetaj_dim_0=1)
normality = idata5.posterior['nu']

fig, axes = plt.subplots(4, 3, figsize=(12, 10))

# Intercept
az.plot_posterior(intercept, point_estimate='mode', ax=axes.flatten()[0], color=color)
axes.flatten()[0].set_title('Intercept', fontdict=f_dict)
# Spend & PrcntTake
az.plot_posterior(spend, point_estimate='mode', ax=axes.flatten()[1], color=color)
axes.flatten()[1].set_title('Spend', fontdict=f_dict)
az.plot_posterior(prcnttake, point_estimate='mode', ax=axes.flatten()[2], color=color)
axes.flatten()[2].set_title('PrcntTake', fontdict=f_dict)
# A subset of the randomly generated predictors
for i, j in enumerate([2, 3, 4, 11, 12, 13]):
    az.plot_posterior(betaj.sel(zbetaj_dim_0=j), point_estimate='mode',
                      ax=axes.flatten()[i+3], color=color)
    axes.flatten()[i+3].set_title(X4.columns[j], fontdict=f_dict)
# Scale
az.plot_posterior(scale, point_estimate='mode', ax=axes.flatten()[9], color=color)
axes.flatten()[9].set_title(r'$\sigma$', fontdict=f_dict)
# Normality
az.plot_posterior(normality, point_estimate='mode', ax=axes.flatten()[10], color=color)
axes.flatten()[10].set_title(r'$\nu$', fontdict=f_dict)

axes.flatten()[11].set_visible(False)
fig.tight_layout()
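# To make the effect of the shrinkage prior visible in one table, a quick
# comparison (not in the original notebook) of the standardized coefficients
# under the vague-normal prior (`model4`) and the hierarchical t prior
# (`model5`): the random predictors' coefficients should be pulled noticeably
# closer to zero, while Spend and PrcntTake are much less affected.

# In[ ]:

summ_vague = az.summary(idata4, var_names=['zbetaj'])[['mean', 'sd']]
summ_shrink = az.summary(idata5, var_names=['zbetaj'])[['mean', 'sd']]
summ_vague.join(summ_shrink, lsuffix='_vague', rsuffix='_shrink')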