import json
import os
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import statsmodels.api as sm
from fuzzywuzzy import fuzz, process


# URLs for the data
finance_url = "https://www.fec.gov/files/bulk-downloads/2022/weball22.zip"
house_url = "https://bigbuilder.s3.us-west-1.amazonaws.com/projects/elections-data/2022-11-08/latest-house.json"
senate_url = "https://bigbuilder.s3.us-west-1.amazonaws.com/projects/elections-data/2022-11-08/latest-senate.json"

# Every file below is written into data/, so make sure the directory exists
# before the first open(); otherwise a fresh checkout fails immediately.
os.makedirs("data", exist_ok=True)

# Download campaign finance data from the FEC.
# The timeout keeps a stalled connection from hanging the script, and
# raise_for_status() stops an HTTP error page from being saved as if it were
# the requested file.
response = requests.get(finance_url, timeout=60)
response.raise_for_status()
with open("data/candidates.zip", "wb") as f:
    f.write(response.content)

# Unzip into data directory
with ZipFile("data/candidates.zip", "r") as f:
    f.extractall("data")

# Download house election results
response = requests.get(house_url, timeout=60)
response.raise_for_status()
with open("data/house.json", "wb") as f:
    f.write(response.content)

# Download senate election results
response = requests.get(senate_url, timeout=60)
response.raise_for_status()
with open("data/senate.json", "wb") as f:
    f.write(response.content)


# Read the pipe-delimited, header-less finance file and keep only the five
# columns used below (selected by position, then given readable names).
finance_df = pd.read_csv("data/weball22.txt", sep="|", header=None)[[1, 4, 7, 18, 19]]
finance_df.columns = ["name", "party", "total_spent", "state", "district"]
finance_df

# Split the name column into last and first name on the first ", " only, so
# anything after the first comma stays together in first_name.
df = finance_df["name"].str.split(", ", n=1, expand=True)
df.columns = ["last_name", "first_name"]
# Re-attach every column except the original combined "name" column.
finance_df = df.join(finance_df.iloc[:, 1:])
finance_df


# Find candidates with duplicate entries. The state is part of the key so
# that two different people who share a name in different states are not
# reported (or later dropped) as duplicates of each other.
finance_df[
    finance_df.duplicated(subset=["last_name", "first_name", "state"], keep="last")
]


# Drop entries with no spending, then sort by total spent and keep the
# highest-spending entry for each (name, state) duplicate group. Candidates
# are later matched within their own state, so de-duplicating across states
# would throw away records that are still needed.
finance_df = (
    finance_df[finance_df["total_spent"] > 0]
    .sort_values("total_spent", ascending=False)
    .drop_duplicates(subset=["first_name", "last_name", "state"], keep="first")
)
finance_df


# Retrieve JSON data. The files are opened with context managers so the
# handles are closed deterministically, and decoded explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open("data/house.json", encoding="utf-8") as f:
    house_elections = json.load(f)
with open("data/senate.json", encoding="utf-8") as f:
    senate_elections = json.load(f)
data = house_elections + senate_elections
candidates = []

# Elections skipped entirely: duplicate candidate data and duplicate runoff
# elections.
skipped_election_ids = ("8516", "15766", "21404", "3156", "21405", "3153")
# Runoff elections in which the first listed candidate is marked as the winner.
runoff_election_ids = ("2015", "20645", "2933")

# Iterate over each election
for row in data:
    if row["id"] in skipped_election_ids:
        continue

    # District is 0 for senate; this is the same for every candidate in the
    # election, so it is computed once per election.
    if row["office_id"] == "S":
        district = 0
    else:
        district = row["seat_number"]

    # Iterate over each candidate in the election
    for i, candidate in enumerate(row["candidates"]):
        # Remove candidates with no votes and write-in candidates
        if candidate["votes"] == 0 or candidate["family_name"] == "Total Write-Ins":
            continue

        # For the runoff elections, the first candidate won each time.
        # Local variables are used so the loaded JSON is left unmodified.
        is_winner = candidate["is_winner"]
        if row["id"] in runoff_election_ids and i == 0:
            is_winner = True

        # Change gop to REP to match with the campaign finance dataset
        party = candidate["party"]
        if party == "gop":
            party = "REP"

        # Add candidate information
        candidates.append(
            {
                "last_name": candidate["family_name"].upper(),
                "first_name": candidate["given_name"].upper(),
                "state": row["state_postal"],
                "district": district,
                "party": party.upper(),
                "is_winner": is_winner,
                "is_incumbent": candidate["is_incumbent"],
                "votes": candidate["votes"],
            }
        )

# Create Pandas DataFrame
votes_df = pd.DataFrame(candidates)
votes_df


# Show the COCHRAN rows of the results data and then of the finance data
for frame in (votes_df, finance_df):
    display(frame[frame["last_name"] == "COCHRAN"])


data = []

# Iterate over candidates from the election results data
for _, row in votes_df.iterrows():
    # Only finance records from the candidate's own state are eligible; the
    # filter is computed once per candidate and reused by both match steps.
    state_records = finance_df[finance_df["state"] == row["state"]]

    # Find very close name matches (subsets). regex=False makes this a literal
    # substring test, so characters such as "." or "(" in a name are not
    # interpreted as regular-expression syntax. na=False treats a missing
    # name part as "no match" instead of propagating NaN into the mask.
    df = state_records[
        state_records["last_name"].str.contains(
            row["last_name"], regex=False, na=False
        )
        & state_records["first_name"].str.contains(
            row["first_name"], regex=False, na=False
        )
    ]

    # If this fails, try to find a close name match
    if len(df) == 0:
        # Limit to candidates in the same state and with matching first or last names
        state_df = state_records[
            (state_records["first_name"] == row["first_name"])
            | (state_records["last_name"] == row["last_name"])
        ]

        # Search all of these candidates for a close name match
        if len(state_df) > 0:
            state_names = state_df["first_name"] + " " + state_df["last_name"]
            closest = process.extract(
                row["first_name"] + " " + row["last_name"],
                state_names,
                limit=1,
            )[0]

            # Isolate the entry for the candidate with the closest name match.
            # The selection is made from state_df (not all of finance_df) so a
            # same-named candidate from another state can never be picked up.
            if closest[1] >= 80:
                df = state_df[state_names == closest[0]]

    # Add data if there is a match
    if len(df) >= 1:
        # If there are multiple entries, choose the one with the most money spent
        total_spent = df["total_spent"].max()
        data.append(
            {
                "last_name": row["last_name"],
                "first_name": row["first_name"],
                "state": row["state"],
                "district": row["district"],
                "party": row["party"],
                "is_winner": row["is_winner"],
                "is_incumbent": row["is_incumbent"],
                "total_spent": total_spent,
                "votes": row["votes"],
            }
        )

data_df = pd.DataFrame(data)
data_df


# Summary statistics of the combined dataset
data_df.describe()


# One violin plot per column: total spent first, then votes
for column, plot_title, axis_label in (
    ("total_spent", "Violin Plots of Total Spent", "Money"),
    ("votes", "Violin Plots of Votes", "Votes"),
):
    violin_ax = sns.violinplot(data=data_df[column])
    violin_ax.set(title=plot_title, ylabel=axis_label)
    plt.show()


# Scatter plot of votes against total spent, with a fitted regression line
scatter_ax = sns.regplot(data=data_df, x="total_spent", y="votes")
scatter_ax.set(
    title="Total Spent vs. Votes",
    xlabel="Total Spent",
    ylabel="Votes",
)
plt.show()


# Ordinary least squares fit of votes on total spent.
# add_constant supplies the constant column used for the intercept term.
x = sm.add_constant(data_df["total_spent"])
y = data_df["votes"]
model = sm.OLS(y, x).fit()
print(model.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  votes   R-squared:                       0.289
Model:                            OLS   Adj. R-squared:                  0.289
Method:                 Least Squares   F-statistic:                     370.8
Date:                Fri, 16 Dec 2022   Prob (F-statistic):           1.37e-69
Time:                        20:42:37   Log-Likelihood:                -12965.
No. Observations:                 913   AIC:                         2.593e+04
Df Residuals:                     911   BIC:                         2.594e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
===============================================================================
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const        1.184e+05   1.24e+04      9.533      0.000     9.4e+04    1.43e+05
total_spent     0.0248      0.001     19.256      0.000       0.022       0.027
==============================================================================
Omnibus:                     1445.851   Durbin-Watson:                   1.172
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           775890.424
Skew:                           9.423   Prob(JB):                         0.00
Kurtosis:                     144.565   Cond. No.                     1.02e+07
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.02e+07. This might indicate that there are
strong multicollinearity or other numerical problems.


# Create a new column for percentage of votes won: each candidate's votes
# divided by the total votes of all rows sharing the same state and district.
# A grouped transform computes every race total in one pass, instead of
# re-filtering the whole frame once per row.
# NOTE(review): the totals only cover candidates present in data_df, so a
# race's percentages are shares among the candidates kept in this frame.
race_totals = data_df.groupby(["state", "district"], sort=False)["votes"].transform(
    "sum"
)
data_df["vote_percent"] = data_df["votes"] / race_totals

# Remove elections with only 1 candidate
data_df = data_df[data_df["vote_percent"] != 1]
data_df


# Scatter plot of total spent (y axis) against vote percent (x axis), with an
# x tick at every 0.1 step from 0.0 through 1.0.
scatter_options = {
    "kind": "scatter",
    "x": "vote_percent",
    "y": "total_spent",
    "title": "Vote Percent vs. Total Spent",
    "xlabel": "Vote Percent",
    "ylabel": "Total Spent",
    "xticks": np.arange(0.0, 1.1, 0.1),
}
data_df.plot(**scatter_options)

<AxesSubplot:title={'center':'Vote Percent vs. Total Spent'}, xlabel='Vote Percent', ylabel='Total Spent'>


# Keep only close results: candidates whose vote share is within 0.15 of 0.5
close_df = data_df[abs(data_df["vote_percent"] - 0.5) < 0.15]

# Scatter plot and regression line. The y axis shows vote_percent, so the
# title and y label name that quantity rather than raw vote counts.
sns.regplot(data=close_df, x="total_spent", y="vote_percent").set(
    title="Total Spent vs. Vote Percent",
    xlabel="Total Spent",
    ylabel="Vote Percent",
    yticks=np.arange(0.3, 0.9, 0.05)
)
plt.show()


# Ordinary least squares fit of vote percent on total spent, restricted to
# the close-election subset. add_constant supplies the intercept column.
x = sm.add_constant(close_df["total_spent"])
y = close_df["vote_percent"]
model = sm.OLS(y, x).fit()
print(model.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:           vote_percent   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     3.429
Date:                Fri, 16 Dec 2022   Prob (F-statistic):             0.0647
Time:                        20:42:38   Log-Likelihood:                 486.37
No. Observations:                 477   AIC:                            -968.7
Df Residuals:                     475   BIC:                            -960.4
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
===============================================================================
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           0.4977      0.004    116.227      0.000       0.489       0.506
total_spent  6.105e-10    3.3e-10      1.852      0.065   -3.73e-11    1.26e-09
==============================================================================
Omnibus:                      317.193   Durbin-Watson:                   3.621
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               29.084
Skew:                           0.024   Prob(JB):                     4.84e-07
Kurtosis:                       1.791   Cond. No.                     1.39e+07
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.39e+07. This might indicate that there are
strong multicollinearity or other numerical problems.

	last_name	first_name	party	total_spent	state	district
0	CONSTANT	CHRISTOPHER	DEM	164037.51	AK	0.0
1	PELTOLA	MARY	DEM	6506330.37	AK	0.0
2	WOOL	ADAM L	DEM	16217.07	AK	0.0
3	REVAK	JOSHUA CARL	REP	121841.00	AK	0.0
4	PALIN	SARAH	REP	1908104.64	AK	0.0
...	...	...	...	...	...	...
4125	BEN DAVID	MERAV	DEM	14278.00	WY	0.0
4126	LUMMIS	CYNTHIA MARIE MRS.	REP	352180.98	WY	0.0
4127	MILLER	BRYAN	REP	0.00	WY	0.0
4128	BARRASSO	JOHN A	REP	1323311.92	WY	0.0
4129	ENZI	MICHAEL B	REP	253421.57	WY	0.0

	last_name	first_name	party	total_spent	state	district
22	COLEMAN	JEFF	REP	0.00	AL	2.0
36	BROOKS	MO	REP	4732229.28	AL	5.0
68	COTTON	THOMAS	REP	3069187.01	AR	4.0
88	MARTIN	BRANDON RAY	REP	606.19	AZ	2.0
114	OLSON	JUSTIN	REP	0.00	AZ	5.0
...	...	...	...	...	...	...
3723	BIERY	DAVID	IND	150.75	KY	0.0
3738	KENNEDY	JOSEPH P III	DEM	349348.51	MA	0.0
3868	SIVALINGAM	TEJASINHA	REP	794.78	NH	0.0
3973	TAHER	IBRAHIM	REP	8153.12	OR	0.0
4118	SWEARENGIN	PAULA JEAN	DEM	55950.64	WV	0.0

	last_name	first_name	party	total_spent	state	district
3376	MERCER JR	LEE	DEM	3.840000e+08	00	0.0
3616	WARNOCK	RAPHAEL	DEM	1.687255e+08	GA	0.0
3507	KELLY	MARK	DEM	9.165264e+07	AZ	0.0
3589	DEMINGS	VAL	DEM	8.106547e+07	FL	0.0
681	DEMINGS	VALDEZ 'VAL'	DEM	8.106547e+07	FL	10.0
...	...	...	...	...	...	...
2925	HULINGS	JAY	DEM	3.460000e+00	TX	23.0
1566	BENTIVOLIO	KERRY	REP	2.800000e+00	MI	11.0
3770	OVERBY	PAULA MIRARE	DFL	2.060000e+00	MN	0.0
408	CORTES BARRAGAN	RODOLFO	GRE	1.680000e+00	CA	40.0
2592	CAVARETTA	DEAN	REP	4.000000e-02	PA	17.0

	last_name	first_name	state	district	party	is_winner	is_incumbent	votes
0	PELTOLA	MARY	AK	1	DEM	True	True	128329
1	PALIN	SARAH	AK	1	REP	False	False	67732
2	BEGICH	NICK	AK	1	REP	False	False	61431
3	BYE	CHRIS	AK	1	LIB	False	False	4560
4	CARL	JERRY	AL	1	REP	True	True	139854
...	...	...	...	...	...	...	...	...
1165	ERICSON	MS. CRIS	VT	0	IND	False	False	1102
1166	MURRAY	PATTY	WA	0	DEM	True	True	1741827
1167	SMILEY	TIFFANY	WA	0	REP	False	False	1299322
1168	JOHNSON	RON	WI	0	REP	True	True	1336928
1169	BARNES	MANDELA	WI	0	DEM	False	False	1310673

	last_name	first_name	state	district	party	is_winner	is_incumbent	total_spent	votes
0	PELTOLA	MARY	AK	1	DEM	True	True	6506330.37	128329
1	PALIN	SARAH	AK	1	REP	False	False	1908104.64	67732
2	BEGICH	NICK	AK	1	REP	False	False	1552558.33	61431
3	BYE	CHRIS	AK	1	LIB	False	False	3940.25	4560
4	CARL	JERRY	AL	1	REP	True	True	931386.18	139854
...	...	...	...	...	...	...	...	...	...
908	COESTER	MARK	VT	0	IND	False	False	9394.38	1270
909	MURRAY	PATTY	WA	0	DEM	True	True	18778781.04	1741827
910	SMILEY	TIFFANY	WA	0	REP	False	False	20150770.46	1299322
911	JOHNSON	RON	WI	0	REP	True	True	33830890.07	1336928
912	BARNES	MANDELA	WI	0	DEM	False	False	41437531.87	1310673

Analyzing Campaign Finance Data to Predict Election Results

Introduction

Hypothesis

Packages

Data Collection

Data Processing

Finance Data

Results Data

Combining Datasets

Exploration

Analysis

Effect of Money on Votes

Normalizing Data

Close Elections

Conclusion

Insights

Final Thoughts

	total_spent	votes
count	9.130000e+02	9.130000e+02
mean	3.047781e+06	1.939655e+05
std	9.155355e+06	4.220941e+05
min	2.060000e+00	2.460000e+02
25%	1.311495e+05	7.976600e+04
50%	1.075978e+06	1.282610e+05
75%	2.721977e+06	1.655830e+05
max	1.687255e+08	6.559303e+06