In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as clt
import seaborn as sns
from sklearn.decomposition import PCA
In [2]:
# Load the four pickled inputs from the local data/ directory, in this order.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
shapvalues, clusters, explained, election = (
    pd.read_pickle(path)
    for path in (
        "data/shapvalues.p",
        "data/clusters.p",
        "data/explained.p",
        "data/election.p",
    )
)
In [4]:
# The `sum_votes_gop_dem` column of `election`; used further down to size the biplot markers.
sum_votes = election["sum_votes_gop_dem"]
In [5]:
# Ten hex colors; passed as the boxplot palette and used to build the colormap below.
colors = [
    "#660000",
    "#aa0000",
    "#dd6600",
    "#cc99dd",
    "#0000aa",
    "#3366dd",
    "#004466",
    "#bb3366",
    "#bb3300",
    "#995533",
]  # interpreted colors

# Continuous colormap that interpolates linearly between the palette entries.
colormap = clt.LinearSegmentedColormap.from_list(name="colors", colors=colors)
In [6]:
# Fit a PCA with default settings on the SHAP values
# (original author's note: shap values are scaled).
pca = PCA()
pca.fit(shapvalues)

# Project the same rows onto the fitted components.
transformed = pca.transform(shapvalues)

# Component matrix of the fitted PCA.
coeff = pca.components_
Rescale the PCA scores and loadings for the biplot.
In [48]:
# Prepare the inputs of the biplot from the fitted PCA.
scalecoef = 0.5  # not read by any cell shown here; kept in case another cell uses it

# Scores on the first two principal components.
xs = transformed[:, 0]
ys = transformed[:, 1]

# Marker sizes: vote totals selected in the row order of `shapvalues`, divided by 5000.
sizes = sum_votes.loc[shapvalues.index] / 5000

# One color per row, sampled from the colormap at clusters / 9.
# NOTE(review): presumably cluster ids run 0..9 and `clusters` is ordered like
# `shapvalues` -- confirm, since no explicit alignment is done here.
colorlist = colormap(clusters / 9)

# Reciprocal of each score range; the plotting cell multiplies the scores by these.
scalex = 1.0 / (xs.max() - xs.min())
scaley = 1.0 / (ys.max() - ys.min())

# Feature names used to label the arrows.
labels = shapvalues.columns

# Explained variance per component, in percent, rounded to one decimal.
# NOTE: this rebinds `explained`, replacing the object loaded from data/explained.p.
explained = np.round(pca.explained_variance_ratio_ * 100, 1)

# Arrow endpoints: first/second component rows multiplied by the largest score on
# that axis. Uses the vectorised ndarray.max() instead of the builtin max(), which
# iterates the array element by element and yields the same value.
coeffx = pca.components_[0] * xs.max()
coeffy = pca.components_[1] * ys.max()
Biplot¶
The biplot combines a scatterplot of the individual cases, where color denotes the cluster and marker size the number of votes in a given county, with arrows for the loadings. The arrows are added in the six-step loop.
In [51]:
# Biplot: scaled PC scores as a scatter, plus one labelled arrow per feature.
N_ARROWS = 6  # number of feature arrows to draw

fig, ax = plt.subplots(figsize=(10, 10), dpi=150)
ax.scatter(xs * scalex, ys * scaley, s=sizes, c=colorlist, alpha=0.8, edgecolors='none')

# Draw at most N_ARROWS arrows; the bound avoids an IndexError when there are
# fewer features than arrows requested.
for i in range(min(N_ARROWS, len(labels))):
    ax.arrow(0, 0, coeffx[i], coeffy[i], color='#009900', alpha=1, ls=":", lw=1)
    # Place the label slightly beyond the arrow tip.
    ax.text(coeffx[i] * 1.05, coeffy[i] * 1.15, labels[i], color='#009900', ha='left', va='center', fontsize=16)

# `explained` holds percentages, so the unit is stated in the axis labels.
ax.set_xlabel("PC 0 ({}% of variance)".format(explained[0]), fontsize=16)
ax.set_ylabel("PC 1 ({}% of variance)".format(explained[1]), fontsize=16)
ax.set_xlim(-0.6, 0.8)
ax.set_ylim(-0.4, 0.6)
ax.set_title("Principal components of most infuencing variables", fontsize=16)
Out[51]:
Text(0.5, 1.0, 'Principal components of most infuencing variables')
The loadings in the biplot differ from those in the presentation because we use a different ML model.
Boxplots¶
The figure is made up of two panels: the top one shows boxplots of the dependent variable, and the bottom one shows the number of votes by candidate.
The figure has two axes, an upper and a lower one. The upper one uses Seaborn to draw an array of boxplots; the lower one uses a classic Matplotlib bar plot to draw stacked bars.
In [32]:
# Attach the cluster labels to `election` as a `clusters` column.
# Guarded so the cell can be re-run: a second join would raise because the
# `clusters` column already exists in `election`.
if "clusters" not in election.columns:
    election = election.join(clusters)
In [39]:
# Sum the dem and gop vote columns within each cluster.
by_cluster = election.groupby("clusters")
size_dem = by_cluster["election_votes_dem"].sum()
size_gop = by_cluster["election_votes_gop"].sum()
In [45]:
# Two stacked panels (height ratio 3:2): boxplots on top, stacked vote bars below.
fig, (ax1, ax2) = plt.subplots(
    nrows=2,
    tight_layout=True,
    figsize=(8, 12),
    dpi=150,
    gridspec_kw={'height_ratios': [3, 2]},
)

# Upper panel: distribution of `diff_gop_dem` per cluster, colored with the palette.
sns.boxplot(data=election, x="clusters", y="diff_gop_dem", palette=colors, ax=ax1)
ax1.set_title("Vote advantage (%)")
ax1.set_xlabel("")  # drop the x label seaborn sets; the lower panel carries it
ax1.set_ylabel("Votes (%)")
ax1.grid(ls=":", color=".85", which="major", zorder=-1000)
ax1.axhline(0, zorder=-100, c="0.45", ls=":")  # reference line at zero

# Lower panel: gop bars with dem bars stacked on top.
# Bar positions follow the number of clusters in the data instead of a fixed 10.
bar_positions = range(len(size_gop))
ax2.bar(bar_positions, height=size_gop, color="#EE0000")
ax2.bar(bar_positions, height=size_dem, bottom=size_gop, color="#0000EE")
ax2.margins(0.01)
ax2.set_title("Number of votes")
ax2.set_xlabel("Clusters")
ax2.set_ylabel("Votes ($10^7$)")
ax2.set_xticks(bar_positions);  # trailing ';' hides the returned tick list