Our goal is to use find data regarding the international matches played with all the countries in the world cup.
Data Source:
www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017
import os
import pandas as pd
df_fifa_rank = pd.read_csv("/Users/ethan/Downloads/Data/fifa_ranking-2022-10-06.csv", sep=",")
df_fifa_results = pd.read_csv("/Users/ethan/Downloads/Data/results.csv", sep=",")
df_fifa_rank.head()
rank | country_full | country_abrv | total_points | previous_points | rank_change | confederation | rank_date | |
---|---|---|---|---|---|---|---|---|
0 | 1 | Germany | GER | 57.0 | 0.0 | 0 | UEFA | 1992-12-31 |
1 | 96 | Syria | SYR | 11.0 | 0.0 | 0 | AFC | 1992-12-31 |
2 | 97 | Burkina Faso | BFA | 11.0 | 0.0 | 0 | CAF | 1992-12-31 |
3 | 99 | Latvia | LVA | 10.0 | 0.0 | 0 | UEFA | 1992-12-31 |
4 | 100 | Burundi | BDI | 10.0 | 0.0 | 0 | CAF | 1992-12-31 |
df_fifa_results.head()
date | home_team | away_team | home_score | away_score | tournament | city | country | neutral | |
---|---|---|---|---|---|---|---|---|---|
0 | 1872-11-30 | Scotland | England | 0 | 0 | Friendly | Glasgow | Scotland | False |
1 | 1873-03-08 | England | Scotland | 4 | 2 | Friendly | London | England | False |
2 | 1874-03-07 | Scotland | England | 2 | 1 | Friendly | Glasgow | Scotland | False |
3 | 1875-03-06 | England | Scotland | 2 | 2 | Friendly | London | England | False |
4 | 1876-03-04 | Scotland | England | 3 | 0 | Friendly | Glasgow | Scotland | False |
#Convert the values in the "date" and "rank_data" of the df_fifa_results and df_fifa_rank, to represent date times.
df_fifa_results["date"] = pd.to_datetime(df_fifa_results["date"])
df_fifa_rank["rank_date"] = pd.to_datetime(df_fifa_rank["rank_date"])
df_fifa_rank = df_fifa_rank.set_index(['rank_date']).groupby(['country_full'], group_keys=False).resample('D').first().fillna(method='ffill').reset_index()
# Merging the df_fifa_rank and df_fifa_results together
df = df_fifa_results.merge(df_fifa_rank[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]], left_on=["date", "home_team"], right_on=["rank_date", "country_full"])
df = df.merge(df_fifa_rank[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]], left_on=["date", "away_team"], right_on=["rank_date", "country_full"], suffixes=("_home", "_away"))
# Dropping the repeated columns from merge
df= df.drop(["rank_date_home","rank_date_away", "country_full_away","country_full_home",
"rank_change_away","rank_change_home","country_full_away","country_full_home","previous_points_away","previous_points_home","country","neutral"], axis=1)
#renaming column headers and fixing any contry names
df = df.rename(columns = {'home_score': "home_team_score"})
df = df.rename(columns = {'away_score': "away_team_score"})
df = df.rename(columns = {'rank_home': "home_team_fifa_rank"})
df = df.rename(columns = {'rank_away': "away_team_fifa_rank"})
df = df.rename(columns = {'total_points_home': "home_team_total_fifa_points"})
df = df.rename(columns = {'total_points_away': "away_team_total_fifa_points"})
df = df.rename(columns = {'city': "location"})
df["home_team"] = df["home_team"].str.replace("USA", "United States").str.replace("IR Iran", "Iran").str.replace("Israel", "Palestine")
df["date"] = pd.to_datetime(df["date"])
df = df[(df["date"] >= "2008-8-1")].reset_index(drop=True)
df["goal_difference"] = (
abs(df["home_team_score"] - df["away_team_score"])
)
df["goal_total"] = (
abs(df["home_team_score"] + df["away_team_score"])
)
#df.head()
Dataframe "df_Fifa_world_cup" Column Description¶
date: date of the match¶
home_team: the name of the home team¶
away_team: the name of the away team¶
home_team_score: full-time home team score¶
away_team_score: full-time away team score¶
tournament: the name of the tournament¶
location: the name of the city where the match was played¶
home_team_total_fifa_points: current total points of home team as of the time of the match¶
home_team_fifa_rank: current country rank of home team as of the time of the match¶
away_team_total_fifa_points: current total points of away team as of the time of the match¶
away_team_fifa_rank: country rank of away team as of the time of the match¶
goal_difference : The absolute value of the difference in goals scored between two countries¶
goal_total: The totle of goals scored in a match¶
df_Fifa_world_cup = df
#df_Fifa_world_cup.head()
df_Fifa_world_cup.to_csv(r'/Users/ethan/Downloads/Data/Fifa_world_cup.csv')