Note:
Install all of the libraries below and check their versions; version conflicts are possible.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as clt
import geopandas as gpd
import seaborn as sns
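To spot the version conflicts mentioned in the note above, you can print the installed versions. A quick optional check (the extra matplotlib import is needed because pyplot itself does not expose __version__):

import matplotlib
# print the name and version of every library used in this notebook
for lib in (np, pd, matplotlib, gpd, sns):
    print(lib.__name__, lib.__version__)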
Download all materials from Dropbox, or create a directory called data with the dataset in the working directory. Make sure all data files are there, as checked below.
Also create a directory called vis for the visualisations; it will be helpful.
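A minimal sanity check (the file names are taken from the read_pickle calls used throughout this notebook):

from pathlib import Path
Path("vis").mkdir(exist_ok=True)  # create the visualisation directory if missing
# verify that every pickle this notebook reads is in place
for f in ["explanatory.p", "explained.p", "counties.p", "shapvalues.p", "clusters.p"]:
    print(f, (Path("data") / f).exists())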
explanatory = pd.read_pickle("data/explanatory.p")
explained = pd.read_pickle("data/explained.p")
counties = pd.read_pickle("data/counties.p")
The data are too large to review in full, but we can take a quick preview:
explanatory.head()
| GISJOIN | race_white | race_black | race_hispanic | sex_female | age_0_17 | age_over65 | ind_divorce | marriage_never_married | lang_second_eng | empl_civilian_unemployed | ... | transp_public | pov_under_1 | insured_under65 | house_vacant | housevalue_median | POP16dens | MEDIAN_INCOME08_16 | POP08_16 | edu_high | housevalue_IQR |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| G0100010 | 0.756835 | 0.183709 | 0.025723 | 0.511762 | 0.251649 | 0.139785 | 0.196471 | 0.248933 | 0.041779 | 0.033644 | ... | 0.000782 | 0.122660 | 0.896521 | 0.084265 | 141000.0 | 35.167013 | -0.012534 | 0.008408 | 0.245928 | 136300.0 |
| G0100030 | 0.831788 | 0.092256 | 0.043667 | 0.511949 | 0.221894 | 0.187149 | 0.231375 | 0.246486 | 0.069489 | 0.036725 | ... | 0.002143 | 0.129938 | 0.856273 | 0.301453 | 173400.0 | 47.226343 | 0.012996 | 0.088874 | 0.295471 | 157800.0 |
| G0100050 | 0.458856 | 0.478883 | 0.043098 | 0.464981 | 0.215488 | 0.165289 | 0.281972 | 0.345559 | 0.060732 | 0.061604 | ... | 0.003871 | 0.263737 | 0.842386 | 0.227080 | 90300.0 | 11.360474 | 0.064819 | -0.031126 | 0.128678 | 116300.0 |
| G0100070 | 0.747652 | 0.212121 | 0.022240 | 0.464646 | 0.210704 | 0.148857 | 0.231938 | 0.296369 | 0.023695 | 0.034764 | ... | 0.004831 | 0.164539 | 0.892983 | 0.214445 | 97200.0 | 13.918126 | 0.080165 | -0.008652 | 0.120000 | 112300.0 |
| G0100090 | 0.876577 | 0.015580 | 0.087273 | 0.504852 | 0.235703 | 0.171929 | 0.182036 | 0.198090 | 0.068917 | 0.029713 | ... | 0.001733 | 0.165344 | 0.867668 | 0.135472 | 124200.0 | 34.243279 | 0.022570 | 0.004142 | 0.130498 | 119800.0 |

5 rows × 25 columns
We will use geopandas to browse the data.
# this may take some time
ct = counties.join(explained.rename("exp"))
fig,ax = plt.subplots(figsize=(12,7))
ct.plot(column='exp',ax = ax, cmap="bwr",legend=True,norm=clt.CenteredNorm(halfrange=0.7))
ax.set_xlim(-2500000,2300000)
ax.set_ylim(-1500000,1800000)
The explained variable is the difference between the percentages of votes for Trump and for Clinton. Positive values mean Trump got more votes, negative values mean Clinton got more votes, and zero means a perfect balance.
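In symbols, for each county:

$$ y = p_{\mathrm{Trump}} - p_{\mathrm{Clinton}}, \qquad y \in [-1, 1] $$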
For the explanatory variables, it is much better to browse the file vis/vars.pdf.
Note:
vars.pdf was created with an external Python script (available as supplementary_visual_map_series.ipynb). In general, visualisation in Python is tedious, and thus it is not the topic of this course; supplementary_visual_map_series.ipynb shows how the map series was created. Finally, the list of *.png files is converted to a PDF using ImageMagick.
Now we will select three ensemble methods: random forest, extremely randomised trees, and gradient boosting (XGBoost).
We create the models in a slightly weird way (this will be discussed): the bias-variance trade-off is shifted towards variance, and each model will be fitted to the entire data set.
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import ExtraTreesRegressor as ETR
from xgboost import XGBRegressor as XGB
modelRFR = RFR(max_features=.2)  # each split considers 20% of the features
modelETR = ETR(max_features=.2)  # same setting for extremely randomised trees
modelXGB = XGB(objective='reg:squarederror')
modelRFR.fit(explanatory,explained)
pred_RFR = modelRFR.predict(explanatory)
modelETR.fit(explanatory,explained)
pred_ETR = modelETR.predict(explanatory)
modelXGB.fit(explanatory,explained)
pred_XGB = modelXGB.predict(explanatory)
Note:
When the dependent variable is categorical, we replace the .predict() method with .predict_proba() and interpret the resulting probabilities.
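For illustration, a minimal classification sketch (the binary labels derived from the sign of the explained variable are hypothetical, not part of the course workflow):

# hypothetical binary target: 1 = Trump lead, 0 = Clinton lead
from sklearn.ensemble import RandomForestClassifier
labels = (explained > 0).astype(int)
clf = RandomForestClassifier(max_features=.2).fit(explanatory, labels)
proba = clf.predict_proba(explanatory)[:, 1]  # probability of class 1 (Trump lead)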
Now we can compare the models. We can see that extremely randomised trees (middle panel) tend to give the most consistent results:
fig,axes = plt.subplots(ncols=3, figsize=(14,6))
axes[0].scatter(explained,pred_RFR,s=2)
axes[1].scatter(explained,pred_ETR,s=2)
axes[2].scatter(explained,pred_XGB,s=2)
Partial dependence shows the expected outcome when the feature of interest is fixed at given values while the other features are marginalized out; the mean outcome here is ~0.317.
Black-box models are difficult to interpret due to their complex structure. Partial dependence plots (a.k.a. PDP) can be used to visualize and analyze the interaction between the target response [1] and one or more input features of interest. PDP show the dependence between the target response and a feature of interest, while all other features (the 'complement' features) are marginalized out, i.e. averaged over. A PDP can thus be described as a function of the input feature of interest. We can also analyse interactions between features (up to 2 at a time, due to human limitations).
We can see whether relations are linear or non-linear, monotonic or not, and in which range of values a feature is important and where it is not.
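The marginalization can also be written out by hand; a minimal sketch of a one-feature partial-dependence curve (the feature name and grid size are arbitrary choices):

# hand-rolled PDP: fix one feature on a grid, average predictions over all counties
def manual_pdp(model, X, feature, n_grid=30):
    grid = np.linspace(X[feature].min(), X[feature].max(), n_grid)
    averaged = []
    for value in grid:
        X_mod = X.copy()
        X_mod[feature] = value                        # force the feature to one value
        averaged.append(model.predict(X_mod).mean())  # marginalize the remaining features
    return grid, np.array(averaged)

grid, pdp = manual_pdp(modelETR, explanatory, "edu_high")
plt.plot(grid, pdp)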
Note:
There are a lot of issues with PDP. PDP assumes that the features are independent, which almost never happens, so it can lead to absurd results. ALE plots (Accumulated Local Effects) solve that problem.
from sklearn.inspection import PartialDependenceDisplay
# list column names just to select the variables of interest
explanatory.columns
Index(['race_white', 'race_black', 'race_hispanic', 'sex_female', 'age_0_17', 'age_over65', 'ind_divorce', 'marriage_never_married', 'lang_second_eng', 'empl_civilian_unemployed', 'ind_agriculture', 'income_median', 'public_asist', 'gini_index', 'transp_car', 'transp_public', 'pov_under_1', 'insured_under65', 'house_vacant', 'housevalue_median', 'POP16dens', 'MEDIAN_INCOME08_16', 'POP08_16', 'edu_high', 'housevalue_IQR'], dtype='object')
# long calculation
fig, axes = plt.subplots(ncols=3,figsize=(16,6),sharey=True)
PartialDependenceDisplay.from_estimator(modelETR, explanatory, features=["edu_high","ind_agriculture","race_black"],ax=axes)
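The two-feature interaction mentioned above can be requested by passing a pair of features as a tuple (a sketch; the pair is chosen arbitrarily):

# two-way partial dependence: joint effect of a pair of features
fig, ax = plt.subplots(figsize=(7,6))
PartialDependenceDisplay.from_estimator(modelETR, explanatory, features=[("edu_high","race_black")], ax=ax)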
What we can say:
SHAP (an abbreviation of SHapley Additive exPlanations) values allow us to explain the output of any machine learning model. It is a model-agnostic method and does not require any assumptions about the model or the data used for its training. It uses a game-theoretic approach that measures each variable's contribution to the model outcome for a single case. The range of SHAP values for each feature determines its importance: the greater the range, the greater the significance of the variable. Features with positive SHAP values have a positive impact (they increase the outcome or probability), while those with negative values have a negative impact (they decrease the outcome or probability).
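A key property of SHAP values is local additivity: for every single case the contributions sum exactly to the model's prediction,

$$ \hat{y} = \phi_0 + \sum_{i=1}^{M} \phi_i $$

where $\phi_0$ is the expected model output over the background data and $\phi_i$ is the SHAP value of feature $i$.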
Explanation is the process of transforming the explanatory data into a new data set that contains the impact of each feature across all the cases:
$$ X \sim y \rightarrow X' \sim \hat{y} $$

import shap
model = modelETR
n=23
explainer = shap.explainers.Permutation(model.predict, explanatory) # X masker
shap_values = explainer(explanatory[n:n+1],max_evals=51,error_bounds=True)
There are a few types of explainers; we will briefly explore the permutation and partition explainers.
Waterfall plots display explanations for individual cases. The plot's starting point is the expected value of the model output over the background dataset. Each row displays the positive (red) or negative (blue) contribution of a feature, moving the value from the expected model output to the actual output for this prediction.
# test a few examples
shap.plots.waterfall(shap_values[0])
Only the SHAP values are shown above. We can compare the waterfall with the original values of the features.
explanatory.iloc[n]
race_white                      0.280693
race_black                      0.694974
race_hispanic                   0.009583
sex_female                      0.538261
age_0_17                        0.251871
age_over65                      0.156761
ind_divorce                     0.294962
marriage_never_married          0.414861
lang_second_eng                 0.026648
empl_civilian_unemployed        0.078192
ind_agriculture                 0.033730
income_median               28136.000000
public_asist                    0.327536
gini_index                      0.517000
transp_car                      0.945817
transp_public                   0.002810
pov_under_1                     0.343000
insured_under65                 0.852488
house_vacant                    0.204673
housevalue_median           81000.000000
POP16dens                      16.094373
MEDIAN_INCOME08_16              0.074796
POP08_16                       -0.049252
edu_high                        0.137843
housevalue_IQR             109200.000000
Name: G0100470, dtype: float64
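We can also verify the additivity property numerically; a quick check using the Explanation object's .base_values and .values attributes:

# base value plus the sum of SHAP values should reproduce the model prediction
reconstructed = shap_values.base_values[0] + shap_values.values[0].sum()
print(reconstructed, model.predict(explanatory[n:n+1])[0])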
# very long calculation, don't run...
model = modelETR
model.fit(explanatory,explained)
shap_values = explainer(explanatory,max_evals=51,error_bounds=True)
shapvalues = pd.DataFrame(shap_values.values,columns=explanatory.columns,index=explanatory.index)
#... simply load precalculated data
shapvalues = pd.read_pickle("data/shapvalues.p")
Dependence plots (not partial dependence) show the effect of a single feature across the entire dataset: each dot plots a feature's value against the SHAP value of that feature for one sample. Unlike PDP, SHAP dependence plots take into account the interaction effects present in the features, and thus are defined only in regions of the data domain covered by actual samples. The vertical dispersion of SHAP values at a single feature value is the result of interactions between variables.
Comments:
- cmap
- norm
Note:
The original SHAP scatter plot has bad defaults and is useless with a higher number of features.
scatter_kws = dict(cmap="bwr",norm=clt.CenteredNorm(halfrange=0.4),edgecolor="none",s=30)
fig,axes = plt.subplots(ncols=3,figsize=(14,6),sharey=True)
axes[0].scatter(explanatory.edu_high,shapvalues.edu_high,c=shapvalues.edu_high,**scatter_kws)
axes[1].scatter(explanatory.race_white,shapvalues.race_white,c=shapvalues.race_white,**scatter_kws)
axes[2].scatter(explanatory.race_black,shapvalues.race_black,c=shapvalues.race_black,**scatter_kws)
# do not run...
scatter_kws = dict(cmap="bwr",norm=clt.CenteredNorm(halfrange=0.4),edgecolor="none",s=20)
fig,axes = plt.subplots(ncols=5,nrows=5,figsize=(25,25),sharey=True)
for column,ax in zip(explanatory.columns,axes.flatten()):
    ax.scatter(explanatory[column],shapvalues[column],c=shapvalues[column],**scatter_kws)
    ax.set_title(column)
    ax.set_facecolor("gray")
#fig.savefig("dependency_plots.pdf")
... see also vis/dependency_plot.pdf
For quick browsing of variable importance, the built-in summary_plot function can be used.
shap.summary_plot(shap_values, explanatory,cmap="bwr")
Dependence plots provide insight into the relationships between variables, but with spatial data the spatial distribution of a feature's impact is equally important. In practice, the spatial pattern of a variable's impact should be analyzed alongside its dependence plot.
fig,ax = plt.subplots(figsize=(10,7))
ct = counties.join(shapvalues["race_hispanic"])
ct.plot(column='race_hispanic',ax = ax, cmap="bwr",legend=True,norm=clt.CenteredNorm(halfrange=0.2))
ax.set_xlim(-2500000,2300000)
ax.set_ylim(-1500000,1800000)
...see vis/impact.pdf for all impacts.
Up to this point, we have not assumed any relationships between features. Game theory assumes that the variables (players) are completely independent, but in practice there are relationships between the variables, causing the importance of some features to be obscured by others. In order to minimize these interactions, we use maskers, which define groups of features within which importance is exchanged.
Comments:
- what is a masker
- max_evals
SHAP provides utilities for fast clustering of variables.
Note:
Each variable is a "case" represented by thousands of "observations" (one per county). This is one reason why we use the cosine metric.
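To make the note concrete, the cosine distance between two scaled variable columns can be computed directly; a small sketch (the pair of columns is arbitrary):

# each column is one "case": a vector with one entry per county
from scipy.spatial.distance import cosine
from sklearn.preprocessing import scale
X = scale(explanatory.values)
print(cosine(X[:,0], X[:,1]))  # distance between the first two variables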
from sklearn.preprocessing import scale
import scipy.cluster.hierarchy as sch
clustering = shap.utils.hclust(scale(explanatory.values), metric="cosine",linkage="average")
fig, ax = plt.subplots(1,1,figsize=(12,4))
dendr = sch.dendrogram(clustering,labels=explanatory.columns,ax=ax)
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
masker = shap.maskers.Partition(explanatory, clustering=clustering)
explainer_partition = shap.explainers.Partition(model.predict, masker)
explainer_permutation = shap.explainers.Permutation(model.predict, explanatory)
explainer_correlation = shap.explainers.Permutation(model.predict,shap.maskers.Partition(explanatory)) #correlation is default
n=23
shap_values_partition = explainer_partition(explanatory[n:n+1],max_evals=51,error_bounds=True)
shap_values_permutation = explainer_permutation(explanatory[n:n+1],max_evals=51,error_bounds=True)
shap_values_correlation = explainer_correlation(explanatory[n:n+1],max_evals=51,error_bounds=True)
shap.plots.waterfall(shap_values_permutation[0])
shap.plots.waterfall(shap_values_partition[0])
shap.plots.waterfall(shap_values_correlation[0])
Using a partition masker with hierarchical clustering reveals the effect of gini_index, while the default partition masker (which uses correlation-based clustering) does not. (More in the classroom.)
In order to interpret a large geospatial dataset, it is necessary to reduce its dimensionality, for example through clustering or PCA. With clustering, we obtain a group of clusters that can be analyzed in terms of:
We will use Affinity Propagation as the clustering method. The PCA code is available in the supplementary materials (supplementary_pca.ipynb).
[From Scikit-learn]: AffinityPropagation creates clusters by sending messages between pairs of samples until convergence. A dataset is then described using a small number of exemplars, which are identified as those most representative of other samples. The messages sent between pairs represent the suitability for one sample to be the exemplar of the other, which is updated in response to the values from other pairs. This updating happens iteratively until convergence, at which point the final exemplars are chosen, and hence the final clustering is given.
Affinity Propagation can be interesting as it chooses the number of clusters based on the data provided. For this purpose, the two important parameters are the preference, which controls how many exemplars are used, and the damping factor which damps the responsibility and availability messages to avoid numerical oscillations when updating these messages.
Affinity Propagation returns exemplars and labels for all samples passed to .fit(). An exemplar is a "natural leader" rather than an "everyman", similar to a medoid or centroid.
Note:
The code for the presentation-level plots described below can be found in supplementary_pca.ipynb.
from sklearn.cluster import AffinityPropagation
ap = AffinityPropagation(preference=-1.5).fit(shapvalues.values)
ap.cluster_centers_indices_
ap.labels_ # cluster labels, starting from 0
clusters = pd.Series(ap.labels_,index=explanatory.index,name="clusters")
clusters = pd.read_pickle("data/clusters.p")  # ...or simply load the precalculated labels
The optimal number of clusters is always a challenge. Here we found that the "pretty" preference of -1.5 yields 10 clusters. As we will notice later, there are 3 pro-Clinton clusters and 7 pro-Trump ones. A small increase of the preference results in 11 clusters with 8 pro-Trump; a small decrease results in 9 clusters with only 2 pro-Clinton. It looks like 10 is a local optimum.
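The sensitivity described above can be checked with a small loop (a sketch; the exact cluster counts depend on the data and may not reproduce exactly):

# how does the number of clusters react to the preference parameter?
for pref in (-2.0, -1.5, -1.0):
    labels = AffinityPropagation(preference=pref).fit(shapvalues.values).labels_
    print(pref, len(np.unique(labels)))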
fig,ax = plt.subplots(figsize=(10,7))
ct = counties.join(clusters)
ct.plot(column='clusters',ax = ax, cmap="rainbow",legend=True)
ax.set_xlim(-2500000,2300000)
ax.set_ylim(-1500000,1800000)
Most important note: we see a strong spatial pattern.
The distribution of the dependent variable in each cluster can be represented using a typical boxplot. To make the boxplot we will use the Seaborn package; before that, we need to combine the two series into a DataFrame.
An analysis of the distributions shows that clusters 4, 5 and 6 contain counties where Clinton significantly won, cluster 3 marks a rough balance between the candidates, and the remaining clusters are counties where Trump won.
explained_clusters = pd.DataFrame({"explained":explained,"clusters":clusters})
fig,ax = plt.subplots(figsize=(10,7))
sns.boxplot(data=explained_clusters,x="clusters",y="explained",palette="rainbow",ax=ax)
To analyze the effect of individual variables on the outcome, we will use a heatmap from the Seaborn package. Unlike the shap library's built-in tool, it allows you to add a column of colors, making it possible to relate the map and the boxplots.
The third element shows how the election result was shaped in each cluster. Leaving aside the detailed interpretation, which is beyond the scope of the course, it is possible to show how the method works in practice:
Cluster 4: Clinton victory with a 15 to 50% lead over Trump. Both race_white and race_black show a positive impact in favor of Trump. Clinton's victory is due to the following characteristics: marriage_never_married, housevalue_median, edu_high (big-city Democrats).
Cluster 6: Clinton victory of 20 to 55%. The main traits that gave the victory were race_white and race_black (black farmers).
from matplotlib import cm
colors = cm.rainbow(np.arange(0,1,0.1))
expl = shapvalues.iloc[ap.cluster_centers_indices_,]
g = sns.clustermap(expl,row_cluster=False,col_cluster=False,cmap="bwr",vmin=-.4,vmax=.2,center=0,row_colors=colors,figsize=(15,15))
Note:
Waterfall diagrams from the presentation can be found in the file supplementary_waterfalls.ipynb.