サッカー勝敗予測 – 過去試合得点

目的

ドイツサッカーリーグ ブンデスリーガの勝敗予測をするために、線形サポートベクタマシン(LinearSVM)、k近傍法(KNN)、非線形サポートベクタマシン(SVM)を試した。
どの手法も予測精度がいまいちであったため、今回は特徴量に工夫を入れてみる。

試み

データは1993-2018年のブンデスリーガのデータを使う。

今回は競馬の予測で利用したスピード指数と似た感じに過去試合の得点を指数化してみる。
といってもスピード指数の計算式そのままは当てはめられないので、偏差値の式を使ってみる。

Wikipediaから計算式を引用する。

$$T_{i} = \frac{10(x_{i} - \mu_{x})}{\sigma_{x}} + 50$$
データの値 $x_{i}$ に対する偏差値 $T_{i}$

$$\begin{aligned}
\mu_{x} &= \frac{1}{N}\sum_{i=1}^{N} x_{i} \\
\sigma_{x} &= \sqrt{\frac{1}{N}\sum_{i=1}^{N}(x_{i}-\mu_{x})^{2}} = \sqrt{\frac{1}{N}\sum_{i=1}^{N}x_{i}^{2}-\mu_{x}^{2}}
\end{aligned}$$
$N$:データの大きさ、$x_{i}$:データの各値、$\mu_{x}$:平均値、$\sigma_{x}$:標準偏差

試合結果を予測したいシーズンより前のシーズンのスコアから平均値・標準偏差を計算し、過去5試合のスコアの偏差値を計算してみた。
例えば2022年シーズンを予測するなら、1993-2021シーズンのスコアから平均値・標準偏差を計算する。

下記のコードで実験した。

import pandas as pd
from kedro.extras.datasets.pandas import CSVDataSet
from kedro.extras.datasets.pickle import PickleDataSet
from kedro.io import DataCatalog
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import node, Pipeline
from kedro.runner import SequentialRunner

# Register the datasets used by the pipeline: the raw results CSV is the
# input; the cleaned match table and the engineered features are pickled.
data_catalog = DataCatalog(dict(
    Bundesliga_Results=CSVDataSet(filepath='/kaggle/input/bundesliga-results-19932018/Bundesliga_Results.csv'),
    match_result=PickleDataSet(filepath='./model_comparison_after.pkl'),
    features=PickleDataSet(filepath='./features.pkl'),
))

def read_data(bundesliga_results):
    """Return the match table reduced to the columns used downstream.

    The input frame is left untouched. ``Date`` is parsed with
    ``pd.to_datetime`` and the ``FTHG`` / ``FTAG`` / ``FTR`` columns are
    exposed under the names ``home_score`` / ``away_score`` /
    ``game_result``.

    Args:
        bundesliga_results: DataFrame with at least the columns ``Date``,
            ``HomeTeam``, ``AwayTeam``, ``Season``, ``FTHG``, ``FTAG`` and
            ``FTR``.

    Returns:
        A new DataFrame with the columns ``Date``, ``HomeTeam``,
        ``AwayTeam``, ``Season``, ``home_score``, ``away_score`` and
        ``game_result``, in that order.
    """
    source_to_target = {
        'FTHG': 'home_score',
        'FTAG': 'away_score',
        'FTR': 'game_result',
    }
    matches = bundesliga_results.assign(
        Date=pd.to_datetime(bundesliga_results['Date']),
        **{target: bundesliga_results[source]
           for source, target in source_to_target.items()},
    )
    wanted = ['Date', 'HomeTeam', 'AwayTeam', 'Season',
              'home_score', 'away_score', 'game_result']
    return matches[wanted]

def calc_std_score(bundesliga_results):
    """Compute, per season, score statistics from all earlier seasons.

    For each season (in sorted order) the mean and standard deviation of
    ``home_score`` and ``away_score`` are taken over the matches of every
    season that sorts before it, so a season's own matches never
    contribute to its statistics.

    Args:
        bundesliga_results: DataFrame with the columns ``Season``,
            ``home_score`` and ``away_score``.

    Returns:
        DataFrame with one row per season and the columns ``Season``,
        ``home_mean``, ``away_mean``, ``home_std`` and ``away_std``.
        Statistics that cannot be computed (e.g. the first season, which
        has no earlier data) are replaced by the column mean over the
        other seasons.
    """
    stat_cols = ['home_mean', 'away_mean', 'home_std', 'away_std']
    seasons = bundesliga_results['Season'].sort_values().unique().tolist()
    if not seasons:
        # Nothing to aggregate: return an empty table with the expected schema.
        return pd.DataFrame(columns=['Season'] + stat_cols)

    rows = []
    for i, season in enumerate(seasons):
        # Only seasons strictly before the current one are used.
        earlier = bundesliga_results[bundesliga_results['Season'].isin(seasons[:i])]
        # NOTE(review): Series.std() is the sample standard deviation
        # (ddof=1), whereas the deviation-value formula quoted in the
        # article divides by N. Kept as-is so published numbers reproduce.
        rows.append([
            season,
            earlier['home_score'].mean(),
            earlier['away_score'].mean(),
            earlier['home_score'].std(),
            earlier['away_score'].std(),
        ])
    # Build the frame once instead of concatenating inside the loop.
    std_score = pd.DataFrame(rows, columns=['Season'] + stat_cols)

    # Fill NaN with the column mean. Restrict this to the statistic columns:
    # averaging the whole frame would also try to average ``Season``, which
    # raises on recent pandas when seasons are strings such as "1993-94".
    std_score[stat_cols] = std_score[stat_cols].fillna(std_score[stat_cols].mean())
    return std_score

def engineer_feature(bundesliga_results, std_score, n_past=5):
    """Attach each team's recent score indices to every match.

    A score index is the deviation value ``(score - mean) * 10 / std + 50``
    using the per-season statistics in ``std_score``. Home and away scores
    are pooled into one per-team timeline (ordered by ``Date``), and for
    each match the indices of the home team's and the away team's previous
    ``n_past`` matches are added as features.

    Args:
        bundesliga_results: DataFrame with the columns ``Date``,
            ``HomeTeam``, ``AwayTeam``, ``Season``, ``home_score``,
            ``away_score`` and ``game_result``.
        std_score: DataFrame with the columns ``Season``, ``home_mean``,
            ``away_mean``, ``home_std`` and ``away_std``.
        n_past: Number of previous matches per team to expose as
            features. Defaults to 5.

    Returns:
        DataFrame with the match columns listed above followed by
        ``home_Season_score_index``,
        ``home_Season_score_index_last1`` .. ``_last{n_past}``,
        ``away_Season_score_index`` and
        ``away_Season_score_index_last1`` .. ``_last{n_past}``.
        Missing values (e.g. a team with fewer than ``n_past`` earlier
        matches) are replaced by 0.
    """
    scored = pd.merge(bundesliga_results, std_score, on='Season', how='inner')
    scored['Season_home_score_index'] = (
        (scored['home_score'] - scored['home_mean']) * 10 / scored['home_std'] + 50
    )
    scored['Season_away_score_index'] = (
        (scored['away_score'] - scored['away_mean']) * 10 / scored['away_std'] + 50
    )

    # Long table of (team, score index), regardless of home/away side.
    long_cols = ['Date', 'Season', 'Team', 'Season_score_index']
    home_rows = scored[['Date', 'Season', 'HomeTeam', 'Season_home_score_index']].copy()
    home_rows.columns = long_cols
    away_rows = scored[['Date', 'Season', 'AwayTeam', 'Season_away_score_index']].copy()
    away_rows.columns = long_cols
    per_team = (
        pd.concat([home_rows, away_rows], axis=0)
        .sort_values(['Team', 'Date'])
        .reset_index(drop=True)
    )

    # Index of the same team's k-th previous match; NaN when the team has
    # fewer than k earlier matches. groupby keeps the shift inside each team.
    lag_cols = [f'Season_score_index_last{lag}' for lag in range(1, n_past + 1)]
    for lag, lag_col in enumerate(lag_cols, start=1):
        per_team[lag_col] = per_team.groupby('Team')['Season_score_index'].shift(lag)

    def _with_prefix(prefix):
        # Copy of the per-team table with every column prefixed by the side.
        table = per_team.copy()
        table.columns = [f'{prefix}_{name}' for name in table.columns]
        return table

    merged = pd.merge(bundesliga_results, _with_prefix('home'),
                      left_on=['Date', 'Season', 'HomeTeam'],
                      right_on=['home_Date', 'home_Season', 'home_Team'],
                      how='inner')
    merged = pd.merge(merged, _with_prefix('away'),
                      left_on=['Date', 'Season', 'AwayTeam'],
                      right_on=['away_Date', 'away_Season', 'away_Team'],
                      how='inner')
    # NOTE(review): 0 lies far outside the index scale (an average score
    # maps to 50); kept for backward compatibility with saved results, but
    # 50 may be a more neutral fill value.
    merged = merged.fillna(0)

    index_cols = ['Season_score_index'] + lag_cols
    return merged[
        ['Date', 'HomeTeam', 'AwayTeam', 'Season', 'home_score', 'away_score',
         'game_result']
        + [f'home_{name}' for name in index_cols]
        + [f'away_{name}' for name in index_cols]
    ]

# Wrap each processing function in a node; inputs/outputs are dataset names
read_data_node = node(
    func=read_data,
    inputs='Bundesliga_Results',
    outputs='match_result',
)
calc_std_score_node = node(
    func=calc_std_score,
    inputs='match_result',
    outputs='std_score',
)
engineer_feature_node = node(
    func=engineer_feature,
    inputs=['match_result', 'std_score'],
    outputs='features',
)

# Chain the three nodes into the feature-building pipeline
make_index_pipeline = Pipeline([
    read_data_node,
    calc_std_score_node,
    engineer_feature_node,
])

# Execute the nodes one after another
runner = SequentialRunner()

# Run against the catalog and show whatever the runner returns
print(runner.run(make_index_pipeline, data_catalog))

結果

線形SVMのスコアは前とあまり変わらなかった。

# Everything this snippet uses from scikit-learn is imported here so it runs
# on its own (train_test_split, LinearSVC and accuracy_score were previously
# used without being imported at this point).
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import LinearSVC

# Load the engineered Bundesliga features written by the pipeline
df = data_catalog.load('features')

# Convert team names to dummy variables
df_cat = None
for col in ['HomeTeam', 'AwayTeam']:
    tmp = pd.get_dummies(df[col], prefix=col)
    df_cat = pd.concat([df_cat, tmp], axis=1)
cat_col = df_cat.columns.tolist()
df = pd.concat([df, df_cat], axis=1)

# Encode the match result categories as integer codes
df['game_result'] = df['game_result'].astype('category')
df['game_result_cd'] = df['game_result'].cat.codes
display(dict(enumerate(df['game_result'].cat.categories)))

# Hold out 10% of the matches for testing
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)

# Features: past-match score indices of both teams plus the team dummies
feature = ['home_Season_score_index_last1',
       'home_Season_score_index_last2', 'home_Season_score_index_last3',
       'home_Season_score_index_last4', 'home_Season_score_index_last5',
       'away_Season_score_index_last1', 'away_Season_score_index_last2',
       'away_Season_score_index_last3', 'away_Season_score_index_last4',
       'away_Season_score_index_last5'] + cat_col
X = df_train[feature]
y = df_train['game_result_cd']
X_test = df_test[feature]
y_test = df_test['game_result_cd']

# Grid over the regularisation strength C
tuned_parameters = [{'C': [1e-2, 1e-1, 1, 10, 100, 1000]}]

clf = GridSearchCV(LinearSVC(random_state=0, tol=1e-5, max_iter=10000),
                   tuned_parameters,
                   cv=5,
                   scoring='accuracy')
clf.fit(X, y)

# The bare strings below record the output observed by the author
print(clf.best_params_)
"""{'C': 0.01}"""
print(clf.best_score_)
"""0.4980392156862745"""

print(f"accuracy: {accuracy_score(y_test, clf.predict(X_test))}")
"""accuracy: 0.5124183006535947"""

k近傍法の方もあまりスコアは変わらなかった。

# Everything this snippet uses from scikit-learn is imported here so it runs
# on its own (train_test_split and accuracy_score were previously used
# without being imported at this point).
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the engineered Bundesliga features written by the pipeline
df = data_catalog.load('features')

# Convert team names to dummy variables
df_cat = None
for col in ['HomeTeam', 'AwayTeam']:
    tmp = pd.get_dummies(df[col], prefix=col)
    df_cat = pd.concat([df_cat, tmp], axis=1)
cat_col = df_cat.columns.tolist()
df = pd.concat([df, df_cat], axis=1)

# Encode the match result categories as integer codes
df['game_result'] = df['game_result'].astype('category')
df['game_result_cd'] = df['game_result'].cat.codes
display(dict(enumerate(df['game_result'].cat.categories)))

# Hold out 10% of the matches for testing
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)

# Features: past-match score indices of both teams plus the team dummies
feature = ['home_Season_score_index_last1',
       'home_Season_score_index_last2', 'home_Season_score_index_last3',
       'home_Season_score_index_last4', 'home_Season_score_index_last5',
       'away_Season_score_index_last1', 'away_Season_score_index_last2',
       'away_Season_score_index_last3', 'away_Season_score_index_last4',
       'away_Season_score_index_last5'] + cat_col
X = df_train[feature]
y = df_train['game_result_cd']
X_test = df_test[feature]
y_test = df_test['game_result_cd']

# Grid over the number of neighbours, 1 through 19
tuned_parameters = [{'n_neighbors': list(range(1, 20))}]

neigh = GridSearchCV(KNeighborsClassifier(),
                   tuned_parameters,
                   cv=5,
                   scoring='accuracy')
neigh.fit(X, y)

# The bare strings below record the output observed by the author
print(neigh.best_params_)
"""{'n_neighbors': 19}"""
print(neigh.best_score_)
"""0.4322440087145969"""

print(f"train accuracy: {accuracy_score(y, neigh.predict(X))}")
"""train accuracy: 0.5138707334785766"""
print(f"test accuracy: {accuracy_score(y_test, neigh.predict(X_test))}")
"""test accuracy: 0.43137254901960786"""

非線形SVMは特徴量が増えたせいか、ハイパーパラメータ探索に時間がかかりすぎてしまったので、デフォルト設定で実行した結果を載せておく。
訓練データに対してはかなり高いスコアだが、テストデータに対してはスコアが悪く、過学習になっているようだ。

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Load the engineered Bundesliga features written by the pipeline
df = data_catalog.load('features')

# One-hot encode the home and away team names and append them to the table
df_cat = pd.concat(
    [pd.get_dummies(df[team_col], prefix=team_col)
     for team_col in ('HomeTeam', 'AwayTeam')],
    axis=1,
)
cat_col = list(df_cat.columns)
df = pd.concat([df, df_cat], axis=1)

# Encode the match result categories as integer codes and show the mapping
df['game_result'] = df['game_result'].astype('category')
df['game_result_cd'] = df['game_result'].cat.codes
display(dict(enumerate(df['game_result'].cat.categories)))

# Hold out 10% of the matches for testing
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)

# Features: last-1..last-5 score indices for home then away, plus team dummies
feature = [
    f'{side}_Season_score_index_last{lag}'
    for side in ('home', 'away')
    for lag in range(1, 6)
] + cat_col
X, y = df_train[feature], df_train['game_result_cd']
X_test, y_test = df_test[feature], df_test['game_result_cd']

# RBF-kernel SVM with default settings apart from gamma
clf = SVC(gamma='auto')
clf.fit(X, y)

# The bare strings below record the output observed by the author
print(f"train accuracy: {accuracy_score(y, clf.predict(X))}")
"""train accuracy: 0.9812636165577342"""
print(f"test accuracy: {accuracy_score(y_test, clf.predict(X_test))}")
"""test accuracy: 0.47581699346405226"""

考察

過去5試合のスコアを偏差値としてモデルに組み込んでみたが劇的な効果は得られなかった。
SVMは計算量の弱点があるようなので、次は決定木系のアプローチを試してみる。