import pandas as pdgold_data = pd.read_parquet("../../files/datasets/too_messy_to_melt/gold_relegation_attendance.parquet")
print(gold_data.head()) Observation_ID Relegation_Event_ID Team Season Tier \
0 Aston Villa 2015-16_-1 Aston Villa 2015-16 Aston Villa 2014-15 1
1 Aston Villa 2015-16_0 Aston Villa 2015-16 Aston Villa 2015-16 1
2 Aston Villa 2015-16_1 Aston Villa 2015-16 Aston Villa 2016-17 2
3 Aston Villa 2015-16_2 Aston Villa 2015-16 Aston Villa 2017-18 2
4 Barnsley 1997-98_-1 Barnsley 1997-98 Barnsley 1996-97 2
Position Attendance Year_vs_Relegation Year_End_Outcome \
0 17 34133 -1 Survived
1 20 33690 0 Relegated
2 13 32107 1 Survived
3 4 32097 2 Survived
4 2 11356 -1 Promoted
Games_Played_History ... Covid_Baseline_Review_Flag \
0 38 ... False
1 38 ... False
2 46 ... False
3 46 ... False
4 46 ... False
Pct_Change_vs_Baseline Pct_Change_vs_Baseline_Display \
0 0.013149 1.31493
1 0.0 0.0
2 -0.046987 -4.698724
3 -0.047284 -4.728406
4 -0.384465 -38.446528
Pct_Change_Analysis Outcome_Group Goals_For_Per_Game Points_Per_Game \
0 <NA> Survived 0.815789 1.0
1 <NA> Survived 0.710526 0.447368
2 -4.698724 Survived 1.021739 1.347826
3 -4.728406 Survived 1.565217 1.804348
4 <NA> Survived 1.652174 1.73913
Is_Structural_Anomaly Plot_Group_Label Is_Baseline_Substituted
0 False Survived (Still in Champ) False
1 False Survived (Still in Champ) False
2 False Survived (Still in Champ) False
3 False Survived (Still in Champ) False
4 False Survived (Still in Champ) False
[5 rows x 35 columns]
gold_data["Relegation_Season_Index"] = (
gold_data["Relegation_Event_ID"].astype(str)
+ " | Y"
+ gold_data["Year_vs_Relegation"].astype(str)
)assert gold_data["Relegation_Season_Index"].is_uniquedupes = (
gold_data
.groupby(["Relegation_Event_ID", "Year_vs_Relegation"])
.size()
.reset_index(name="n")
.query("n > 1")
.sort_values("n", ascending=False)
)
print(dupes.head(100))Empty DataFrame
Columns: [Relegation_Event_ID, Year_vs_Relegation, n]
Index: []
silver_data = pd.read_parquet("../../files/datasets/too_messy_to_melt/silver_relegation_attendance.parquet")
print(silver_data.head())
dupes_silver = (
silver_data
.groupby(["Relegation_Event_ID", "Year_vs_Relegation"])
.size()
.reset_index(name="n")
.query("n > 1")
.sort_values("n", ascending=False)
)
print(dupes_silver.head(100)) Observation_ID Relegation_Event_ID Team Season Tier \
0 Aston Villa 2015-16_-1 Aston Villa 2015-16 Aston Villa 2014-15 1
1 Aston Villa 2015-16_0 Aston Villa 2015-16 Aston Villa 2015-16 1
2 Aston Villa 2015-16_1 Aston Villa 2015-16 Aston Villa 2016-17 2
3 Aston Villa 2015-16_2 Aston Villa 2015-16 Aston Villa 2017-18 2
4 Barnsley 1997-98_-1 Barnsley 1997-98 Barnsley 1996-97 2
Position Attendance Year_vs_Relegation Year_End_Outcome \
0 17 34133 -1 Survived
1 20 33690 0 Relegated
2 13 32107 1 Survived
3 4 32097 2 Survived
4 2 11356 -1 Promoted
Games_Played_History Wins_History Goals_For_History \
0 38 10 31
1 38 3 27
2 46 16 47
3 46 24 72
4 46 22 76
Goals_Against_History Points_History
0 57 38
1 76 17
2 48 62
3 42 83
4 55 80
Empty DataFrame
Columns: [Relegation_Event_ID, Year_vs_Relegation, n]
Index: []