%pip install pygam


import pandas as pd
import numpy as np

import plotly.express as px
import matplotlib.pyplot as plt

from pygam import PoissonGAM, te

import pymc as pm
import arviz as az
from scipy.stats import norm

import pickle

import warnings
warnings.filterwarnings('ignore')

Requirement already satisfied: pygam in /usr/local/lib/python3.10/dist-packages (0.9.1)
Requirement already satisfied: numpy>=1.25 in /usr/local/lib/python3.10/dist-packages (from pygam) (1.26.4)
Requirement already satisfied: progressbar2<5.0.0,>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from pygam) (4.5.0)
Requirement already satisfied: scipy<1.12,>=1.11.1 in /usr/local/lib/python3.10/dist-packages (from pygam) (1.11.4)
Requirement already satisfied: python-utils>=3.8.1 in /usr/local/lib/python3.10/dist-packages (from progressbar2<5.0.0,>=4.2.0->pygam) (3.9.0)
Requirement already satisfied: typing-extensions>3.10.0.2 in /usr/local/lib/python3.10/dist-packages (from python-utils>=3.8.1->progressbar2<5.0.0,>=4.2.0->pygam) (4.12.2)


drive_df = pd.read_csv("/content/drive/MyDrive/Golf/tee.csv")
drive_df


drive_df = drive_df[drive_df["par"]==5]


def transform_coordinate_system(drive_df: pd.DataFrame) -> pd.DataFrame:
    """ This function transforms the coordinate system for each shot
    so that the origin is at the tee box, and the y-axis intersects
    the center of the fairway

    @param drive_df (pd.DataFrame): DataFrame containing shot and
        hole locations in the original coordinate system
    @return drive_df (pd.DataFrame): DataFrame containing shot and
        hole locations in the new coordinate system
    """

    # Translation
    ## Initial rotation booleans
    initial_rotation_x = [x > fairway_x for x, fairway_x in zip(drive_df["from_x"], drive_df["fairway_center_x"])]
    initial_rotation_y = [y > fairway_y for y, fairway_y in zip(drive_df["from_y"], drive_df["fairway_center_y"])]
    ## Ball location after drive
    drive_df["x"] = drive_df["x"] - drive_df["from_x"]
    drive_df["y"] = drive_df["y"] - drive_df["from_y"]
    drive_df["z"] = drive_df["z"] - drive_df["from_z"]

    ## Fairway center
    drive_df["fairway_center_x"] = drive_df["fairway_center_x"] - drive_df["from_x"]
    drive_df["fairway_center_y"] = drive_df["fairway_center_y"] - drive_df["from_y"]
    drive_df["fairway_center_z"] = drive_df["fairway_center_z"] - drive_df["from_z"]

    ## This negates the coordinates based on the location of the
    #tee box relative to the fairway center
    drive_df["x"] = [-x if rotate else x for x, rotate in zip(drive_df["x"], initial_rotation_x,)]
    drive_df["y"] = [-y if rotate else y for y, rotate in zip(drive_df["y"], initial_rotation_y,)]
    drive_df["fairway_center_x"] = [-x if rotate else x for x, rotate in zip(drive_df["fairway_center_x"], initial_rotation_x,)]
    drive_df["fairway_center_y"] = [-y if rotate else y for y, rotate in zip(drive_df["fairway_center_y"], initial_rotation_y,)]
    drive_df["rotate_x"] = initial_rotation_x
    drive_df["rotate_y"] = initial_rotation_y

    # Rotation
    ## Fairway center distance - this sets the location of the fairway center
    # of x = 0 and y as the two-dimensional distance from the tee box in the
    # x-y plane
    drive_df["fairway_center_new_y"] = np.sqrt(
        drive_df["fairway_center_x"]**2 + drive_df["fairway_center_y"]**2
    )
    drive_df["fairway_center_new_x"] = 0

    # Angle numerator
    drive_df["angle_numerator"] = [
        np.dot(np.array([x, y]), np.array([new_x, new_y]).T) for x, y, new_x, new_y in zip(
            drive_df["fairway_center_x"],
            drive_df["fairway_center_y"],
            drive_df["fairway_center_new_x"],
            drive_df["fairway_center_new_y"]
        )
    ]
    # Angle denominator
    drive_df["angle_denominator"] = [
        np.linalg.norm(np.array([x, y])) * np.linalg.norm(np.array([new_x, new_y])) for x, y, new_x, new_y in zip(
            drive_df["fairway_center_x"],
            drive_df["fairway_center_y"],
            drive_df["fairway_center_new_x"],
            drive_df["fairway_center_new_y"]
        )
    ]

    # Angle of rotation
    drive_df["fairway_angle"] = np.arccos(
        drive_df["angle_numerator"]
    / drive_df["angle_denominator"]
    )

    # Rotate coordinate system
    drive_df["x_new"] = (drive_df["x"]*np.cos(drive_df["fairway_angle"]) - drive_df["y"]*np.sin(drive_df["fairway_angle"]))
    drive_df["y_new"] = (drive_df["x"]*np.sin(drive_df["fairway_angle"]) + drive_df["y"]*np.cos(drive_df["fairway_angle"]))

    drive_df["x_new"] = [
        -x if (rotate and not rotate_y) else
        -x if (rotate_y and not rotate) else
        x for x, rotate, rotate_y in zip(drive_df["x_new"], initial_rotation_x, initial_rotation_y)
    ]

    return drive_df


drive_df = transform_coordinate_system(drive_df)


drive_df["x_new"] = drive_df["x_new"]/3
drive_df["y_new"] = drive_df["y_new"]/3


# Location coordinates
fig, ax = plt.subplots()
ax.scatter(drive_df["x_new"], drive_df["y_new"], alpha=0.1)

# Add fairway edge
plt.vlines(30, 0, 500, 'k', '--')
plt.vlines(-30, 0, 500, 'k', '--')
plt.annotate("Fairway Edge", (32, 450))
plt.ylim(0, 500)
plt.xlim(-250, 250)

# Update title and axes
plt.title("PGA Tour Drive Locations")
plt.ylabel("y-coordinates (yds)")
plt.xlabel("x-coordinates (yds)")

plt.show()


# Quick-and-dirty data quality / lack-of-club information filter
drive_df = drive_df[
    (abs(drive_df["x_new"])<=150) &
    (drive_df["y_new"]<=500) &
    (drive_df["y_new"]>=200)
]

# Standardizing the location coordinates to mean 0 and variance 1
drive_df["x_scaled"] = (drive_df["x_new"] - drive_df["x_new"].mean())/drive_df["x_new"].std()
drive_df["y_scaled"] = (drive_df["y_new"] - drive_df["y_new"].mean())/drive_df["y_new"].std()


# Create golfer factors and store the coordinate dimensions
player_idxs, players = pd.factorize(drive_df["player_id"])
coords = {
    "players": players,
    "obs_id": np.arange(len(player_idxs)),
    "coordinates": ["x", "y"]
}

with pm.Model(coords=coords) as mdl:

    # Set player indices
    player_idx = pm.Data("player_idx", player_idxs, dims="obs_id")

    # define priors, weakly informative
    ## mean
    mu = pm.Normal("mu", mu=0, sigma=1, dims=("players", "coordinates"))
    ## std
    sigma = pm.Exponential("sigma", lam=1, dims=("players", "coordinates"))

    ## Define Multivariate likelihood
    d = pm.Normal("d", mu=mu[player_idx], sigma=sigma[player_idx], observed=np.array(drive_df[["x_scaled", "y_scaled"]]))


pm.model_to_graphviz(mdl)


with mdl:
    advi = pm.ADVI()


tracker = pm.callbacks.Tracker(
    mean=advi.approx.mean.eval,  # callable that returns mean
    std=advi.approx.std.eval,  # callable that returns std
)
approx = advi.fit(10000, callbacks=[tracker])

Output()


trace = approx.sample(2000)
summary_df = az.summary(trace.posterior, kind="stats")
summary_df


summary_df.loc[
    ["mu[28237, x]",
     "mu[28237, y]",
     "sigma[28237, x]",
     "sigma[28237, y]",
    ]
]


az.plot_trace(trace)

array([[<Axes: title={'center': 'mu'}>, <Axes: title={'center': 'mu'}>],
       [<Axes: title={'center': 'sigma'}>,
        <Axes: title={'center': 'sigma'}>]], dtype=object)


posterior_predictive = pm.sample_posterior_predictive(trace, model=mdl)
az.plot_ppc(posterior_predictive)

Output()

<Axes: xlabel='d'>


# Grab the posterior data
posterior_data = trace.posterior.stack(chain_draw=("chain", "draw"))

# Obtain parameters indexed by golfer
hier_mu_est = posterior_data["mu"].transpose("players", ...)
hier_sigma_est = posterior_data["sigma"].transpose("players", ...)

# Indices for Furyk (id == 10809) and DeChambeau (id == 47959)
furyk_idx = np.where(players==10809)
dechambeau_idx = np.where(players==47959)


# Sampling Furyk coordinates
furyk_x = norm.rvs(hier_mu_est[furyk_idx][0][0],
                   hier_sigma_est[furyk_idx][0][0])
furyk_y = norm.rvs(hier_mu_est[furyk_idx][0][1],
                   hier_sigma_est[furyk_idx][0][1])

# Sampling DeChambeau coordinates
dechambeau_x = norm.rvs(hier_mu_est[dechambeau_idx][0][0],
                   hier_sigma_est[dechambeau_idx][0][0])
dechambeau_y = norm.rvs(hier_mu_est[dechambeau_idx][0][1],
                   hier_sigma_est[dechambeau_idx][0][1])


# Store posterior coordinates in a DataFrame
viz_df = pd.DataFrame({
    "x": np.concatenate((dechambeau_x, furyk_x)),
    "y": np.concatenate((dechambeau_y, furyk_y)),
    "name": ["DeChambeau"]*len(dechambeau_x) + ["Furyk"]*len(furyk_x)
})

# Inverse transform the standardized coordinates
viz_df["x"] = (viz_df["x"] * drive_df["x_new"].std()) + drive_df["x_new"].mean()
viz_df["y"] = (viz_df["y"] * drive_df["y_new"].std()) + drive_df["y_new"].mean()


# Observed data
fig, ax = plt.subplots()
ax.scatter(drive_df[drive_df["last_name"]=="DeChambeau"]["x_new"],
           drive_df[drive_df["last_name"]=="DeChambeau"]["y_new"], alpha=0.25, color="red",
           label="DeChambeau")
ax.scatter(drive_df[drive_df["last_name"]=="Furyk"]["x_new"],
           drive_df[drive_df["last_name"]=="Furyk"]["y_new"], alpha=0.25, color="blue",
           label="Furyk")

# Fairway edge
plt.vlines(30, 0, 500, 'k', '--')
plt.vlines(-30, 0, 500, 'k', '--')
plt.annotate("Fairway Edge", (32, 450))
plt.ylim(0, 500)
plt.xlim(-250, 250)

# Update axes and title
plt.title("PGA Tour Drive Locations - Observed")
plt.ylabel("y-coordinates (yds)")
plt.xlabel("x-coordinates (yds)")

ax.legend(loc="upper left")
plt.show()


# Posterior data
fig, ax = plt.subplots()
ax.scatter(viz_df[viz_df["name"]=="DeChambeau"]["x"],
           viz_df[viz_df["name"]=="DeChambeau"]["y"], alpha=0.1, color="red",
           label="DeChambeau")
ax.scatter(viz_df[viz_df["name"]=="Furyk"]["x"],
           viz_df[viz_df["name"]=="Furyk"]["y"], alpha=0.1, color="blue",
           label="Furyk")

# Fairway edge
plt.vlines(30, 0, 500, 'k', '--')
plt.vlines(-30, 0, 500, 'k', '--')
plt.annotate("Fairway Edge", (32, 450))
plt.ylim(0, 500)
plt.xlim(-250, 250)

# Update axes and title
plt.title("PGA Tour Drive Locations - Posterior")
plt.ylabel("y-coordinates (yds)")
plt.xlabel("x-coordinates (yds)")

ax.legend(loc="upper left")
plt.show()


with open('/content/drive/MyDrive/Golf/drive_models.pkl', 'rb') as handle:
    models = pickle.load(handle)


off_the_tee_x_strokes = models["par_5"].predict(np.array(550).reshape(1, -1))[0]
off_the_tee_x_strokes

4.587147369591772


with open('/content/drive/MyDrive/Golf/approach_objs.pkl', 'rb') as handle:
    objs = pickle.load(handle)


# One-hot booleans for the lies
viz_df["fairway"] = [1 if abs(val)<30 else 0 for val in viz_df["x"]]
viz_df["rough"] = [1 if abs(val)>=30 else 0 for val in viz_df["x"]]
viz_df["native_area"] = [0 for val in viz_df["x"]]
viz_df["bunker"] = [0 for val in viz_df["x"]]
viz_df["other"] = [0 for val in viz_df["x"]]

# Distance from the hole
viz_df["distance"] = np.sqrt(viz_df["x"]**2 + (550 - viz_df["y"])**2)
viz_df["distance_norm"] = objs["distance_scaler"].transform(viz_df["distance"].values.reshape(-1, 1))


# Predict and store expected strokes
preds = objs["model"].predict(viz_df[["distance_norm", "fairway", "rough", "bunker", "native_area", "other"]])
viz_df["preds"] = preds
viz_df


# Strokes gained off-the-tee (minus one accounts for the actual stroke)
viz_df["strokes_gained"] = off_the_tee_x_strokes - viz_df["preds"] - 1
viz_df.groupby(["name"])["strokes_gained"].agg(["mean", "std"])


# Posterior data
fig, ax = plt.subplots()
s = ax.scatter(viz_df["x"],
           viz_df["y"], alpha=0.25, c=viz_df["strokes_gained"],
           cmap="coolwarm")
fig.colorbar(s)

# Fairway edge
plt.vlines(30, 0, 500, 'k', '--')
plt.vlines(-30, 0, 500, 'k', '--')
plt.annotate("Fairway Edge", (32, 450))
plt.ylim(0, 500)
plt.xlim(-250, 250)

# Update axes and title
plt.title("PGA Tour Drive Locations - Posterior")
plt.ylabel("y-coordinates (yds)")
plt.xlabel("x-coordinates (yds)")

plt.show()

	strokeId	type	shotNumber	distance	left	from_x	from_y	from_z	groupId	x	...	pin_x	pin_y	pin_z	fairway_center_x	fairway_center_y	fairway_center_z	hole_number	event_id_hole	course_id	event
0	216935.0	SST	1.0	176 yds	0 in.	7122.6	8352.09	2704.507	23.0	7632.6	...	7632.63	8490.98	2682.66	0.00	0.00	0.00	17	R2022047	538	47
1	308578.0	SST	1.0	179 yds	46 ft 9 in.	7122.6	8352.09	2704.507	39.0	7650.1	...	7632.63	8490.98	2682.66	0.00	0.00	0.00	17	R2022047	538	47
2	250191.0	SST	1.0	180 yds	43 ft 3 in.	7122.6	8352.09	2704.507	6.0	7651.9	...	7632.63	8490.98	2682.66	0.00	0.00	0.00	17	R2022047	538	47
3	406300.0	SST	1.0	186 yds	48 ft 5 in.	7122.6	8352.09	2704.507	34.0	7649.1	...	7632.63	8490.98	2682.66	0.00	0.00	0.00	17	R2022047	538	47
4	341760.0	SST	1.0	166 yds	42 ft 3 in.	7122.6	8352.09	2704.507	44.0	7608.2	...	7632.63	8490.98	2682.66	0.00	0.00	0.00	17	R2022047	538	47
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
564261	386972.0	SST	1.0	330 yds	153 yds	7434.9	10215.59	356.743	19.0	8395.0	...	8848.13	10049.31	352.54	8319.55	10122.43	350.02	7	R2021476	513	476
564262	410315.0	SST	1.0	267 yds	134 yds	11681.6	10946.42	337.065	19.0	11927.2	...	11678.57	9870.78	331.33	11922.45	10170.86	331.15	12	R2021476	513	476
564263	368518.0	SST	1.0	328 yds	234 yds	8001.3	10331.28	363.057	19.0	7061.5	...	6359.34	10587.46	342.66	7166.72	10586.21	346.11	3	R2021476	513	476
564264	374131.0	SST	1.0	193 yds	25 ft 3 in.	6451.8	10313.55	353.922	19.0	7028.8	...	7017.51	10264.89	346.46	0.00	0.00	0.00	4	R2021476	513	476
564265	406911.0	SST	1.0	146 yds	6 ft 3 in.	12040.3	11400.37	344.513	19.0	11753.5	...	11758.22	11072.34	336.30	0.00	0.00	0.00	11	R2021476	513	476

	mean	sd	hdi_3%	hdi_97%
mu[35506, x]	-0.062	0.143	-0.328	0.203
mu[35506, y]	-0.058	0.144	-0.331	0.208
mu[49771, x]	-0.017	0.140	-0.277	0.243
mu[49771, y]	0.010	0.140	-0.248	0.282
mu[27936, x]	0.127	0.143	-0.140	0.403
...	...	...	...	...
sigma[48640, y]	1.989	0.613	0.991	3.120
sigma[33968, x]	0.704	0.205	0.342	1.061
sigma[33968, y]	1.938	0.494	1.024	2.804
sigma[30909, x]	1.152	0.540	0.346	2.079
sigma[30909, y]	2.172	0.806	0.907	3.673

	mean	sd	hdi_3%	hdi_97%
mu[28237, x]	0.014	0.147	-0.250	0.305
mu[28237, y]	0.872	0.164	0.585	1.201
sigma[28237, x]	1.286	0.181	0.945	1.613
sigma[28237, y]	1.074	0.161	0.794	1.378

	x	y	name	fairway	rough	native_area	bunker	other	distance	distance_norm	preds
0	32.578225	306.117742	DeChambeau	0	1	0	0	0	246.048565	1.817405	3.576645
1	-5.320748	255.663664	DeChambeau	1	0	0	0	0	294.384424	2.775468	3.637455
2	29.477522	337.546132	DeChambeau	1	0	0	0	0	214.489091	1.191865	3.274142
3	-71.668675	347.617722	DeChambeau	0	1	0	0	0	214.697427	1.195995	3.473314
4	-10.033188	305.613631	DeChambeau	1	0	0	0	0	244.592236	1.788539	3.438736
...	...	...	...	...	...	...	...	...	...	...	...
3995	-18.943792	273.310043	Furyk	1	0	0	0	0	277.337699	2.437586	3.584840
3996	5.452830	266.912537	Furyk	1	0	0	0	0	283.139974	2.552592	3.604653
3997	17.601837	252.417896	Furyk	1	0	0	0	0	298.102220	2.849159	3.646764
3998	13.856000	278.288602	Furyk	1	0	0	0	0	272.064464	2.333065	3.565043
3999	-22.803212	268.886047	Furyk	1	0	0	0	0	282.037304	2.530736	3.601043

PGA Tour Driving Distributions¶

Transformation¶

Model¶

Validation¶

Case Study¶

Visualization¶

Expected Strokes Gained¶

Conclusion¶