import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as clt
import seaborn as sns
from sklearn.decomposition import PCA

# Load the four pickled inputs, one after another in the order listed.
# NOTE: `explained` is rebound further down (from the PCA explained-variance
# ratio) before any visible use of the object loaded here.
shapvalues, clusters, explained, election = map(
    pd.read_pickle,
    (
        "data/shapvalues.p",
        "data/clusters.p",
        "data/explained.p",
        "data/election.p",
    ),
)

# The `sum_votes_gop_dem` column of the election table; it drives the marker
# sizes of the scatter plot below.
sum_votes = election["sum_votes_gop_dem"]

# Ten hex colours ("interpreted colors" per the original author). The list is
# used directly as the boxplot palette and, through the colormap built from it,
# to colour the scatter points.
# NOTE(review): presumably one colour per cluster label 0..9 — confirm.
colors = [
    "#660000",
    "#aa0000",
    "#dd6600",
    "#cc99dd",
    "#0000aa",
    "#3366dd",
    "#004466",
    "#bb3366",
    "#bb3300",
    "#995533",
]
# Colormap that interpolates linearly through the colours above.
colormap = clt.LinearSegmentedColormap.from_list(name="colors", colors=colors)

# Fit a PCA (all components kept) on the SHAP values and project every row
# onto the components. Original author's note: the SHAP values are scaled.
pca = PCA().fit(shapvalues)
transformed = pca.transform(shapvalues)
coeff = pca.components_  # row k holds the feature loadings of component k

scalecoef = 0.5  # not referenced in the code visible here; kept in case later cells use it

# Scatter coordinates: scores on the first two principal components.
xs = transformed[:, 0]
ys = transformed[:, 1]

# Marker sizes: vote totals looked up in SHAP-row order, divided by 5000.
sizes = sum_votes.loc[shapvalues.index] / 5000

# One RGBA colour per row, looked up in the colormap at clusters / 9.
# NOTE(review): presumes cluster labels run 0..9 and that `clusters` is ordered
# like `shapvalues` (it is not re-indexed the way `sum_votes` is) — confirm.
colorlist = colormap(clusters / 9)

# Factors that rescale each score axis to a total span of 1.
scalex = 1.0 / (xs.max() - xs.min())
scaley = 1.0 / (ys.max() - ys.min())

labels = shapvalues.columns

# Percentage of variance explained by each component, rounded to one decimal.
# This rebinds `explained`, replacing the object loaded from data/explained.p.
explained = np.round(pca.explained_variance_ratio_ * 100, 1)

# Arrow end points: loadings of the first two components, stretched by the
# largest score on the matching axis. ndarray.max() is used instead of the
# builtin max(), which would walk the array element by element in Python.
# NOTE(review): the scatter points are drawn multiplied by scalex/scaley while
# these arrow coordinates are not — confirm the differing scales are intended.
coeffx = coeff[0] * xs.max()
coeffy = coeff[1] * ys.max()

# Biplot: every row as a point on the first two principal components (marker
# size from `sizes`, colour from `colorlist`), with feature loadings overlaid
# as labelled arrows starting at the origin.
fig, ax = plt.subplots(figsize=(10, 10), dpi=150)
ax.scatter(xs * scalex, ys * scaley, s=sizes, c=colorlist, alpha=0.8, edgecolors='none')

# Draw arrows for the first six features only. zip() stops early when there
# are fewer than six, where indexing with range(6) would raise IndexError.
n_arrows = 6
arrow_color = '#009900'
for label, dx, dy in zip(labels[:n_arrows], coeffx, coeffy):
    ax.arrow(0, 0, dx, dy, color=arrow_color, alpha=1, ls=":", lw=1)
    # Place the label slightly beyond the arrow tip so the two do not overlap.
    ax.text(dx * 1.05, dy * 1.15, label, color=arrow_color, ha='left', va='center', fontsize=16)

# `explained` holds percentages (ratio * 100), so show the unit in the label.
ax.set_xlabel("PC 0 ({}% of variance)".format(explained[0]), fontsize=16)
ax.set_ylabel("PC 1 ({}% of variance)".format(explained[1]), fontsize=16)

# Fixed viewing window (hard-coded limits).
ax.set_xlim(-0.6, 0.8)
ax.set_ylim(-0.4, 0.6)
ax.set_title("Principal components of most influencing variables", fontsize=16)

# Attach the cluster labels to the election table (index-aligned join) so the
# plots below can group rows by cluster.
election = election.join(clusters)

# Total votes per cluster for each party.
size_dem = election.election_votes_dem.groupby(by=election.clusters).sum()
size_gop = election.election_votes_gop.groupby(by=election.clusters).sum()

# One bar slot per cluster actually present. The count used to be hard-coded
# as 10, which fails with a shape mismatch for any other number of clusters.
positions = range(len(size_gop))

fig, (ax1, ax2) = plt.subplots(
    nrows=2,
    tight_layout=True,
    figsize=(8, 12),
    dpi=150,
    gridspec_kw={'height_ratios': [3, 2]},
)

# Top panel: distribution of `diff_gop_dem` within each cluster.
sns.boxplot(data=election, x="clusters", y="diff_gop_dem", palette=colors, ax=ax1)
ax1.set_title("Vote advantage (%)")
ax1.set_xlabel("")  # the x label is shown on the bottom panel only
ax1.set_ylabel("Votes (%)")
ax1.grid(ls=":", color=".85", which="major", zorder=-1000)
ax1.axhline(0, zorder=-100, c="0.45", ls=":")  # reference line at zero

# Bottom panel: stacked vote totals per cluster, GOP at the bottom and Dem
# stacked on top of it.
# NOTE(review): bar heights are raw sums; the "$10^7$" in the y label presumably
# relies on the axis offset text matplotlib adds — confirm.
ax2.bar(positions, height=size_gop, color="#EE0000")
ax2.bar(positions, height=size_dem, bottom=size_gop, color="#0000EE")
ax2.margins(0.01)
ax2.set_title("Number of votes")
ax2.set_xlabel("Clusters")
ax2.set_ylabel("Votes ($10^7$)")
_ = ax2.set_xticks(positions)  # `_ =` keeps a notebook from echoing the return value

# Biplot

# Boxplots