%pip install pygam

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import pickle

from pygam import PoissonGAM, te

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler

import warnings
warnings.filterwarnings('ignore')

Requirement already satisfied: pygam in /usr/local/lib/python3.10/dist-packages (0.9.1)
Requirement already satisfied: numpy>=1.25 in /usr/local/lib/python3.10/dist-packages (from pygam) (1.26.4)
Requirement already satisfied: progressbar2<5.0.0,>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from pygam) (4.5.0)
Requirement already satisfied: scipy<1.12,>=1.11.1 in /usr/local/lib/python3.10/dist-packages (from pygam) (1.11.4)
Requirement already satisfied: python-utils>=3.8.1 in /usr/local/lib/python3.10/dist-packages (from progressbar2<5.0.0,>=4.2.0->pygam) (3.9.0)
Requirement already satisfied: typing-extensions>3.10.0.2 in /usr/local/lib/python3.10/dist-packages (from python-utils>=3.8.1->progressbar2<5.0.0,>=4.2.0->pygam) (4.12.2)


approach_df = pd.read_csv("/content/drive/MyDrive/Golf/approach.csv")
approach_df


list(set(approach_df["from"]))

['OST',
 'OGS',
 nan,
 'OTO',
 'OTH',
 'OWA',
 'OCO',
 'OFW',
 'OWD',
 'OUK',
 'OIR',
 'ORO',
 'ONA']


approach_df = approach_df[approach_df["type"]=="SST"]


stroke_counts = list(approach_df.groupby("strokes")["hole"].count() / len(approach_df))
strokes_remaining = np.arange(1, len(stroke_counts) + 1)


fig, ax = plt.subplots()

# Plot
ax.bar(strokes_remaining, stroke_counts, color="gray")

# Labels
ax.set_xlabel('Strokes Remaining')
ax.set_ylabel('Proportion of Total Approach Shots')
ax.set_title('Distribution of Strokes Remaining on Approach')
ax.set_xticks(strokes_remaining)

plt.show()


lie_dict = {
    "OTH": "Other",
    "OWD": "Water drop area",
    "OST": "Fairway bunker",
    "OGS": "Greenside bunker",
    "OFW": "Fairway",
    "OUK": "Unknown",
    "ORO": "Primary rough",
    "OWA": "Water",
    "OIR": "Intermediate rough",
    "ONA": "Native area",
    "OTO": "Tree outline",
    "OCO": "Fringe"
}

# Find number of shots by lie
lie_df = pd.DataFrame(approach_df.groupby("from")["hole"].count()).reset_index()
lie_df.columns = ["lie", "shots"]

# Human-friendly lie types
lie_df = lie_df.sort_values("shots", ascending=True)
lie_df["lie"] = [lie_dict[x] for x in lie_df["lie"]]

# Proportion of shots
denom = np.sum(lie_df["shots"])
lie_df["shots"] /= denom

lie_df


fig, ax = plt.subplots()

# Plot
ax.barh(np.arange(len(lie_df)), lie_df["shots"], color="gray")

# Labels
ax.set_ylabel('Lie')
ax.set_xlabel('Proportion of Total Approach Shots')
ax.set_title('Distribution of Approach Shots by Lie')
ax.set_yticks(np.arange(len(lie_df)))
ax.set_yticklabels(list(lie_df["lie"]))

plt.show()


# Subset DataFrame to just consider fairway, rough, and bunker shots
type_df = approach_df[approach_df["from"].isin(["OFW", "ORO", "OIR", "OST", "OGS"])].copy()
type_df["lie"] = ["Fairway" if x == "OFW" else
                  "Rough" if x in ["ORO", "OIR"] else
                  "Bunker" for x in type_df["from"]]

# Initialize discretizer
fairway_est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile', subsample=None)
rough_est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile', subsample=None)
bunker_est = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile', subsample=None)

# Subset again
fairway_df = type_df[type_df["lie"]=="Fairway"].copy()
rough_df = type_df[type_df["lie"]=="Rough"].copy()
bunker_df = type_df[type_df["lie"]=="Bunker"].copy()

# Fit and transform to bins
fairway_bin = fairway_est.fit_transform(fairway_df[["distance_val"]])
rough_bin = rough_est.fit_transform(rough_df[["distance_val"]])
bunker_bin = bunker_est.fit_transform(bunker_df[["distance_val"]])

# Store
fairway_df["bin"] = [x[0] for x in fairway_bin]
rough_df["bin"] = [x[0] for x in rough_bin]
bunker_df["bin"] = [x[0] for x in bunker_bin]


# Mean distance and stroke values by bin
fw_viz = pd.DataFrame(fairway_df.groupby("bin")[["distance_val", "strokes"]].mean()).reset_index()
ro_viz = pd.DataFrame(rough_df.groupby("bin")[["distance_val", "strokes"]].mean()).reset_index()
bu_viz = pd.DataFrame(bunker_df.groupby("bin")[["distance_val", "strokes"]].mean()).reset_index()


fig, ax = plt.subplots()

# Plot
## Fairway
ax.plot(
        fw_viz["distance_val"], fw_viz["strokes"],
        linestyle='-', marker='o', markersize=3,
        color='gray', label="Fairway"
    )
## Fairway
ax.plot(
        ro_viz["distance_val"], ro_viz["strokes"],
        linestyle='--', marker='o', markersize=3,
        color='gray', label="Rough"
    )
## Bunker
ax.plot(
        bu_viz["distance_val"], bu_viz["strokes"],
        linestyle='-.', marker='o', markersize=3,
        color='gray', label="Bunker"
    )

# Labels
ax.set_xlabel('Distance (yards)')
ax.set_ylabel('Average Strokes Remaining')
ax.set_title('Average Strokes Remaining by Distance and Lie')
ax.legend()
ax.set_ylim(1, 4)

plt.show()


# Condensing lies
approach_df["lie"] = ["Fairway" if x == "OFW" else
                      "Rough" if x in ["ORO", "OIR"] else
                      "Bunker" if x in ["OST", "OGS"] else
                      "Native Area" if x == "ONA" else
                      "Other" for x in approach_df["from"]]


# Lie index
lies, lie_idx = pd.factorize(approach_df["lie"])
approach_df["lie_idx"] = lies


# Input / output
features = ["lie_idx", "distance_norm"]
target = "strokes"

# Split dataset
train_df, test_df = train_test_split(approach_df, test_size=0.2, random_state=42)

# Scale distance
distance_scaler = StandardScaler()
train_df["distance_norm"] = distance_scaler.fit_transform(train_df[["distance_val"]])
test_df["distance_norm"] = distance_scaler.transform(test_df[["distance_val"]])

# Model fitting
gam = PoissonGAM(te(0, 1, n_splines=15), lam=0.6).fit(train_df[features], train_df[target])
gam.summary()

PoissonGAM                                                                                                
=============================================== ==========================================================
Distribution:                       PoissonDist Effective DoF:                                     27.8937
Link Function:                          LogLink Log Likelihood:                               -592067.6922
Number of Samples:                       377612 AIC:                                          1184191.1718
                                                AICc:                                         1184191.1763
                                                UBRE:                                               2.1293
                                                Scale:                                                 1.0
                                                Pseudo R-Squared:                                   0.1416
==========================================================================================================
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
================================= ==================== ============ ============ ============ ============
te(0, 1)                          [0.6 0.6]            225          27.9         0.00e+00     ***         
intercept                                              1            0.0          1.66e-05     ***         
==========================================================================================================
Significance codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

WARNING: Fitting splines and a linear function to a feature introduces a model identifiability problem
         which can cause p-values to appear significant when they are not.

WARNING: p-values calculated in this manner behave correctly for un-penalized models or models with
         known smoothing parameters, but when smoothing parameters have been estimated, the p-values
         are typically lower than they should be, meaning that the tests reject the null too readily.


# lams = [0.3, 0.6, 0.9]
# n_splines = [10, 15, 20]

# gam.gridsearch(train_df[features], train_df[target], lam=lams, n_splines=n_splines)


train_preds = gam.predict(train_df[features])
test_preds = gam.predict(test_df[features])

print("Training Set MAE: " + str(round(mean_absolute_error(train_df["strokes"],
                                                           train_preds), 3)))
print("Test Set MAE: " + str(round(mean_absolute_error(test_df["strokes"],
                                                       test_preds), 3)))

Training Set MAE: 0.486
Test Set MAE: 0.482


# Range between two standard deviations of shot distance
distance_norm = np.linspace(-2, 2, num=100)
distance_vals = distance_scaler.inverse_transform(distance_norm.reshape(-1, 1))
distance_vals = distance_vals.reshape(1, -1)[0]
# Loop through each type of lie and generate predictions
all_df = pd.DataFrame()
for val in range(len(lie_idx)):
    temp_df = pd.DataFrame(
        {
            "distance_norm": distance_norm,
            "distance_vals": distance_vals,
            "lie_idx": val,
            "lie": lie_idx[val]
        }
    )
    preds = gam.predict(temp_df[features])
    temp_df["preds"] = preds
    all_df = pd.concat([all_df, temp_df])
all_df


fig, ax = plt.subplots()

# Plot
## Fairway
ax.plot(
        all_df[all_df["lie"]=="Fairway"]["distance_vals"],
        all_df[all_df["lie"]=="Fairway"]["preds"],
        linestyle='-', markersize=1,
        color='blue', label="Fairway (Predicted)"
    )
ax.plot(
        fw_viz["distance_val"], fw_viz["strokes"],
        linestyle='-', markersize=3,
        color='gray', label="Fairway"
    )
## Rough
ax.plot(
        all_df[all_df["lie"]=="Rough"]["distance_vals"],
        all_df[all_df["lie"]=="Rough"]["preds"],
        linestyle='--', markersize=1,
        color='blue', label="Rough (Predicted)"
    )
ax.plot(
        ro_viz["distance_val"], ro_viz["strokes"],
        linestyle='--', markersize=3,
        color='gray', label="Rough"
    )
## Bunker
ax.plot(
        all_df[all_df["lie"]=="Bunker"]["distance_vals"],
        all_df[all_df["lie"]=="Bunker"]["preds"],
        linestyle='-.', markersize=1,
        color='blue', label="Bunker (Predicted)"
    )
ax.plot(
        bu_viz["distance_val"], bu_viz["strokes"],
        linestyle='-.', markersize=3,
        color='gray', label="Bunker"
    )

# Labels
ax.set_xlabel('Distance (yards)')
ax.set_ylabel('Predicted Strokes Remaining')
ax.set_title('Predicted Strokes Remaining by Distance and Lie')
ax.legend()
ax.set_ylim(1, 4)

plt.show()


# Initialize discretizer
test_df["preds"] = test_preds
overall_est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile', subsample=None)

# Fit and transform to bins
strokes_bin = overall_est.fit_transform(test_df[["preds"]])

# Store
test_df["bin"] = [x[0] for x in strokes_bin]
overall_viz = pd.DataFrame(test_df.groupby("bin")[["strokes", "preds"]].mean()).reset_index()


fig, ax = plt.subplots()

# Plot
ax.plot(overall_viz["preds"], overall_viz["strokes"], linestyle='-', marker='o', markersize=3, color='gray')

# Add a diagonal line for perfect calibration
min_val = min(overall_viz["preds"].min(), overall_viz["strokes"].min())
max_val = max(overall_viz["preds"].max(), overall_viz["strokes"].max())
ax.plot([min_val, max_val], [min_val, max_val], linestyle='--', color='red', label="Perfect Calibration")

# Labels
ax.set_xlabel('Predicted Strokes Remaining')
ax.set_ylabel('Actual Strokes Remaining')
ax.set_title('Calibration Plot')
ax.legend()

plt.show()


objs = {
    "distance_scaler": distance_scaler,
    "model": gam
}

with open('/content/drive/MyDrive/Golf/approach_objs.pkl', 'wb') as handle:
    pickle.dump(objs, handle)

	event_id	year	player_id	round_num	hole	distance_val	distance	lie	from	to	type	cup	shotNumber	max_strokes	strokes
0	R2022541	2022	27095	2	18	189.0	189 yds	Primary Rough	ORO	ORO	SST	False	2.0	5.0	4.0
1	R2022540	2022	49117	2	1	158.0	158 yds	Rear Greenside Bunker	OFW	EG8	SST	False	2.0	5.0	4.0
2	R202230	2022	36699	4	4	135.0	135 yds	Left Greenside Bunker	OFW	EG6	SST	False	2.0	4.0	3.0
3	R2022525	2022	54628	3	12	285.0	285 yds	Front Left Greenside Bunker	OFW	EG5	SST	False	2.0	5.0	4.0
4	R202241	2022	51890	2	14	231.0	231 yds	Right Greenside Bunker	OFW	EG2	SST	False	2.0	5.0	4.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
476160	R202247	2022	27129	2	10	87.0	87 yds	Green	OFW	OGR	SST	False	2.0	3.0	2.0
476161	R2021527	2021	22405	1	10	71.0	71 yds	Green	OFW	OGR	SST	False	2.0	3.0	2.0
476162	R202232	2022	36699	2	1	90.0	90 yds	Green	OFW	OGR	SST	False	2.0	3.0	2.0
476163	R202220	2022	46340	4	6	127.0	127 yds	Green	OFW	OGR	SST	False	2.0	3.0	2.0
476164	R202254	2022	29908	2	17	124.0	124 yds	Green	OFW	OGR	SST	False	2.0	4.0	3.0

	distance_norm	distance_vals	lie_idx	lie	preds
0	-2.000000	53.454328	0	Rough	3.106412
1	-1.959596	55.492777	0	Rough	3.106412
2	-1.919192	57.531226	0	Rough	3.106511
3	-1.878788	59.569675	0	Rough	3.106710
4	-1.838384	61.608124	0	Rough	3.107011
...	...	...	...	...	...
95	1.838384	247.106996	4	Bunker	3.687768
96	1.878788	249.145445	4	Bunker	3.694513
97	1.919192	251.183894	4	Bunker	3.700713
98	1.959596	253.222343	4	Bunker	3.706335
99	2.000000	255.260792	4	Bunker	3.711347

PGA Tour Expected Strokes - Approach¶

Visualization¶

Strokes Remaining on Approach¶

Approach Shots by Lie¶

Expected Strokes by Lie and Distance¶

Predictive Model¶

Validation¶

Conclusion¶

	lie	shots
0	Fringe	0.000004
9	Unknown	0.000004
8	Tree outline	0.000013
10	Water	0.000083
2	Greenside bunker	0.000752
11	Water drop area	0.002775
7	Other	0.016576
4	Native area	0.019173
6	Fairway bunker	0.052494
3	Intermediate rough	0.056937
5	Primary rough	0.228946
1	Fairway	0.622243