Inteligenta artificiala pariaza pe Brazilia

In [1]:

```
# in primul rand, importam modulele necesare
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
```

In [2]:

```
# Read the two Kaggle datasets: the World Cup 2018 team sheet and the match results
world_cup = pd.read_csv('datasets/World Cup 2018 Dataset.csv')
results = pd.read_csv('datasets/results.csv')
```

In [3]:

```
# ... and see with our own eyes what they contain
world_cup.head()
```

Out[3]:

In [4]:

```
# Preview the first rows of the match results
results.head()
```

Out[4]:

In [5]:

```
# Add the match winner and the goal difference as new columns of `results`.
# Vectorised comparisons replace the per-row Python loop: they are much faster
# and do not depend on `results` having a default 0..n-1 index.
home_win = results['home_score'] > results['away_score']
away_win = results['home_score'] < results['away_score']
# Home side where it scored more, away side where it scored less, 'Draw' otherwise
results['winning_team'] = results['home_team'].where(
    home_win,
    results['away_team'].where(away_win, 'Draw'),
)
# The absolute goal difference becomes a separate column of `results`
results['goal_difference'] = (results['home_score'] - results['away_score']).abs()
results.head()
```

Out[5]:

In [6]:

```
# Let's play with a subset: every match Romania played, at home or away
df = results[(results['home_team'] == 'Romania') | (results['away_team'] == 'Romania')]
# Take an explicit, independent copy: a new column is assigned to `romania`
# later on, and writing into a slice of `results` is what triggers pandas'
# SettingWithCopyWarning.
romania = df.copy()
romania.head()
```

Out[6]:

In [7]:

```
# The first four characters of the 'date' string are the year; extract them for
# all rows at once. `assign` returns a new frame, so no column is written into
# a filtered view of `results`.
romania = romania.assign(match_year=romania['date'].str[:4].astype(int))
# Keep only the matches played from 1930 onwards
romania_1930 = romania[romania['match_year'] >= 1930]
romania_1930.count()
```

Out[7]:

In [8]:

```
# Let's also see graphically how Romania has done since 1930.
# Any winner other than 'Romania' or 'Draw' means Romania lost that match.
wins = ['Loss' if team not in ('Romania', 'Draw') else team
        for team in romania_1930['winning_team']]
winsdf = pd.DataFrame(wins, columns=['Romania_Results'])
# ... plot
# The seaborn style must be set BEFORE the figure is created: the style only
# affects axes created afterwards, so on a fresh kernel the original order left
# the first figure unstyled.
sns.set(style='darkgrid')
fig, ax = plt.subplots(1)
fig.set_size_inches(10.7, 6.27)
# Draw on the axes created above instead of relying on pyplot's current axes
sns.countplot(x='Romania_Results', data=winsdf, ax=ax)
```

Out[8]:

In [9]:

```
# Enough about Romania before I start crying; these are the teams qualified for World Cup 2018.
# NOTE(review): the names must match the spelling used in results.csv exactly
# (e.g. 'Korea Republic' vs. another spelling) - verify against the dataset.
worldcup_teams = ['Australia', 'Iran', 'Japan', 'Korea Republic',
                  'Saudi Arabia', 'Egypt', 'Morocco', 'Nigeria',
                  'Senegal', 'Tunisia', 'Costa Rica', 'Mexico',
                  'Panama', 'Argentina', 'Brazil', 'Colombia',
                  'Peru', 'Uruguay', 'Belgium', 'Croatia',
                  'Denmark', 'England', 'France', 'Germany',
                  'Iceland', 'Poland', 'Portugal', 'Russia',
                  'Serbia', 'Spain', 'Sweden', 'Switzerland']  # was ' Iran' (leading space): never matched
df_teams_home = results[results['home_team'].isin(worldcup_teams)]
df_teams_away = results[results['away_team'].isin(worldcup_teams)]
# A match between two qualified teams is present in both frames; drop_duplicates
# returns a NEW frame, so its result has to be kept (the original discarded it
# and such matches were counted twice).
df_teams = pd.concat((df_teams_home, df_teams_away)).drop_duplicates()
df_teams.count()
```

Out[9]:

In [10]:

```
# Preview the matches that involve at least one qualified team
df_teams.head()
```

Out[10]:

In [11]:

```
# Now do roughly what was done for Romania above, so that the predictions can
# use wins as the metric: derive the match year from the 'date' string (its
# first four characters) for all rows at once instead of a Python loop.
df_teams['match_year'] = df_teams['date'].str[:4].astype(int)
# Keep only the matches played from 1930 onwards
df_teams_1930 = df_teams[df_teams['match_year'] >= 1930]
df_teams_1930.head()
```

Out[11]:

In [12]:

```
# Get rid of the columns that evidently will not influence the result.
# The original dropped them from `df_teams`, which silently threw away the
# "1930 onwards" filter. The filter is re-applied here (rather than reusing
# `df_teams_1930`) so the cell stays re-runnable after the columns are gone.
unused_columns = ['date', 'home_score', 'away_score', 'tournament',
                  'city', 'country', 'goal_difference', 'match_year']
df_teams_1930 = df_teams[df_teams['match_year'] >= 1930].drop(columns=unused_columns)
df_teams_1930.head()
```

Out[12]:

In [13]:

```
# Build the model target.
# Labels: 2 = the home team won, 1 = draw, 0 = the away team won.
df_teams_1930 = df_teams_1930.reset_index(drop=True)
# Rules are applied one after another, in this order; each mask is evaluated
# against the frame as it is at that moment, as separate .loc assignments would do.
label_rules = (
    (2, lambda frame: frame.winning_team == frame.home_team),
    (1, lambda frame: frame.winning_team == 'Draw'),
    (0, lambda frame: frame.winning_team == frame.away_team),
)
for label, rule in label_rules:
    df_teams_1930.loc[rule(df_teams_1930), 'winning_team'] = label
df_teams_1930.head()
```

Out[13]:

In [14]:

```
# Turn the categorical home/away team names into indicator ("dummy") columns
# so the model can consume them as numeric inputs.
final = pd.get_dummies(
    df_teams_1930,
    columns=['home_team', 'away_team'],
    prefix=['home_team', 'away_team'],
)
# Split into the target (y) and the feature matrix (X)
y = final["winning_team"].astype('int')
X = final.drop(columns=['winning_team'])
# Split into train and test sets: 30% held out, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)
```

In [15]:

```
# Preview the one-hot encoded frame
final.head()
```

Out[15]:

In [16]:

```
# Logistic regression is the prediction algorithm; fit() returns the estimator itself
logreg = LogisticRegression().fit(X_train, y_train)
# Mean accuracy on the data the model was fitted on and on the held-out data
score = logreg.score(X_train, y_train)
score2 = logreg.score(X_test, y_test)
for caption, accuracy in (("Training set accuracy: ", score),
                          ("Test set accuracy: ", score2)):
    print(caption, format(accuracy, '.3f'))
```

Asta e un rezultat cam „la ghici”, așa că luăm în seamă și clasamentul FIFA: echipa mai bine clasată (cea cu poziția numeric mai mică în clasament) este considerată favorită și o trecem în coloana „home_team”, deși la Cupa Mondială 2018 din Rusia nu există propriu-zis echipe gazdă și oaspete.

In [17]:

```
# Read the FIFA rankings and the World Cup fixtures
ranking = pd.read_csv('datasets/fifa_rankings.csv')
fixtures = pd.read_csv('datasets/fixtures.csv')
# List used to store the matches to predict
pred_set = []
```

In [18]:

```
# Create new columns holding the FIFA ranking position of each team.
# Build the Team -> Position lookup once instead of once per column.
position_by_team = ranking.set_index('Team')['Position']
# DataFrame.insert raises if the column already exists, so remove any columns
# left over from a previous run of this cell to keep it re-runnable.
fixtures = fixtures.drop(columns=['first_position', 'second_position'], errors='ignore')
fixtures.insert(1, 'first_position', fixtures['Home Team'].map(position_by_team))
fixtures.insert(2, 'second_position', fixtures['Away Team'].map(position_by_team))
# Keep only the first 48 fixtures (presumably the group-stage matches - TODO confirm against fixtures.csv)
fixtures = fixtures.iloc[:48, :]
fixtures.tail()
```

Out[18]:

In [19]:

```
# Build the prediction set: for every fixture, the team with the numerically
# lower ranking position goes into 'home_team', the other into 'away_team'.
# The records are collected in a list created here, so the cell no longer
# depends on `pred_set` still being an empty list and can be re-run safely.
fixture_records = []
for _, row in fixtures.iterrows():
    if row['first_position'] < row['second_position']:
        home, away = row['Home Team'], row['Away Team']
    else:
        home, away = row['Away Team'], row['Home Team']
    fixture_records.append({'home_team': home, 'away_team': away, 'winning_team': None})
pred_set = pd.DataFrame(fixture_records)
# Independent copy with the readable team names, kept for printing the predictions
backup_pred_set = pred_set.copy()
pred_set.head()
```

Out[19]:

In [20]:

```
# One-hot encode the fixtures and give them exactly the feature columns the
# model was trained on. Encoding starts from `backup_pred_set` (the frame that
# still has the team names), so re-running this cell does not fail on an
# already encoded `pred_set`.
pred_set = pd.get_dummies(backup_pred_set, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])
# Training columns minus the target; columns absent from the fixtures are
# created filled with 0, columns unknown to the model are discarded. A single
# reindex replaces inserting the missing columns one at a time.
feature_columns = final.columns.drop('winning_team')
pred_set = pred_set.reindex(columns=feature_columns, fill_value=0)
pred_set.head()
```

Out[20]:

In [21]:

```
# Looks good. Let's go on: predict every group-stage fixture and print the result.
predictions = logreg.predict(pred_set)
# Compute the class probabilities once instead of three times per fixture
probabilities = logreg.predict_proba(pred_set)
# Teams are read by column NAME. Positional access (iloc[i, 1] / iloc[i, 0])
# only worked while DataFrame columns built from dicts were sorted
# alphabetically; with insertion-ordered columns it swaps home and away, so
# the wrong team was reported as winner.
for i in range(len(predictions)):
    home = backup_pred_set['home_team'].iloc[i]
    away = backup_pred_set['away_team'].iloc[i]
    print(home + " and " + away)
    # Labels: 2 = home team wins, 1 = draw, 0 = away team wins
    if predictions[i] == 2:
        print("Winner: " + home)
    elif predictions[i] == 1:
        print("Draw")
    elif predictions[i] == 0:
        print("Winner: " + away)
    # Probability columns are read positionally in label order 0, 1, 2
    print('Probability of ' + home + ' winning: ', '%.3f'%(probabilities[i][2]))
    print('Probability of Draw: ', '%.3f'%(probabilities[i][1]))
    print('Probability of ' + away + ' winning: ', '%.3f'%(probabilities[i][0]))
    print("")
```

In [22]:

```
# Round of 16 pairings
group_16 = [('Uruguay', 'Portugal'),
('France', 'Croatia'),
('Brazil', 'Mexico'),
('England', 'Colombia'),
('Spain', 'Russia'),
('Argentina', 'Peru'),
('Germany', 'Switzerland'),
('Poland', 'Belgium')]
```

In [23]:

```
def clean_and_predict(matches, ranking, final, logreg):
    """Predict and print the outcome of each match in ``matches``.

    For every pair, the team with the numerically lower ``Position`` in
    ``ranking`` is placed in the ``home_team`` slot and the other one in
    ``away_team``. The pairs are then one-hot encoded with the same feature
    columns as ``final`` and passed to ``logreg``.

    Parameters
    ----------
    matches : sequence of (str, str)
        Team-name pairs to predict.
    ranking : pandas.DataFrame
        Must have the columns 'Team' and 'Position'.
    final : pandas.DataFrame
        Its columns, minus 'winning_team', define the feature layout.
    logreg : classifier
        Must provide ``predict`` and ``predict_proba``. Labels are
        2 = home team wins, 1 = draw, 0 = away team wins, and the probability
        columns are read positionally in label order 0, 1, 2.

    Returns
    -------
    None
        Results are printed to standard output.

    Raises
    ------
    IndexError
        If a team of ``matches`` is not present in ``ranking``.
    """
    # Nothing to predict: return instead of failing on an empty frame
    if len(matches) == 0:
        return

    def position_of(team):
        # First ranking row for this team; IndexError if the team is unknown
        return ranking.loc[ranking['Team'] == team, 'Position'].iloc[0]

    # Look up all positions before doing anything else, so an unknown team
    # fails before any output is printed.
    positions = [(position_of(match[0]), position_of(match[1])) for match in matches]

    records = []
    for match, (first_position, second_position) in zip(matches, positions):
        if first_position < second_position:
            records.append({'home_team': match[0], 'away_team': match[1]})
        else:
            records.append({'home_team': match[1], 'away_team': match[0]})
    # Readable team names, kept for printing
    backup_pred_set = pd.DataFrame(records, columns=['home_team', 'away_team'])

    pred_set = pd.get_dummies(backup_pred_set, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])
    # Same columns, in the same order, as the training features: columns absent
    # from these matches are created filled with 0. One reindex replaces adding
    # the missing columns one by one.
    feature_columns = final.columns.drop('winning_team')
    pred_set = pred_set.reindex(columns=feature_columns, fill_value=0)

    predictions = logreg.predict(pred_set)
    # Computed once instead of three times per match
    probabilities = logreg.predict_proba(pred_set)

    # Teams are read by column NAME. Positional access (iloc[i, 1] / iloc[i, 0])
    # only worked while DataFrame columns built from dicts were sorted
    # alphabetically; with insertion-ordered columns it swaps home and away.
    team_pairs = zip(backup_pred_set['home_team'], backup_pred_set['away_team'])
    for i, (home, away) in enumerate(team_pairs):
        print(home + " and " + away)
        if predictions[i] == 2:
            print("Winner: " + home)
        elif predictions[i] == 1:
            print("Draw")
        elif predictions[i] == 0:
            print("Winner: " + away)
        print('Probability of ' + home + ' winning: ', '%.3f'%(probabilities[i][2]))
        print('Probability of Draw: ', '%.3f'%(probabilities[i][1]))
        print('Probability of ' + away + ' winning: ', '%.3f'%(probabilities[i][0]))
        print("")
```

In [24]:

```
# Predict the round-of-16 matches
clean_and_predict(group_16, ranking, final, logreg)
```

In [25]:

```
# Let's see the quarter-finals
quarters = [('Portugal', 'France'),
('Spain', 'Argentina'),
('Brazil', 'England'),
('Germany', 'Belgium')]
```

In [26]:

```
# Predict the quarter-finals
clean_and_predict(quarters, ranking, final, logreg)
```

In [27]:

```
# The semi-finals
semi = [('Portugal', 'Brazil'),
('Argentina', 'Germany')]
```

In [28]:

```
# Predict the semi-finals
clean_and_predict(semi, ranking, final, logreg)
```

In [29]:

```
# ... THE FINAL ...
finals = [('Brazil', 'Germany')]
```

In [30]:

```
# Predict the final
clean_and_predict(finals, ranking, final, logreg)
```

### qed, #haisapariempeBrazilia

http://musetoiu.ro/world-cup-2018-rusia-inteligenta-artificiala-pariaza-pe-brazilia/ · http://musetoiu.ro/wp-content/uploads/2018/06/brasil-1024x576.jpg · http://musetoiu.ro/wp-content/uploads/2018/06/brasil-150x150.jpg · ML · Revista presei · brazilia, python, rusia world cup 2018

Inteligenta artificiala pariaza pe Brazilia
/*!** Twitter Bootstrap**//*! * Bootstrap v3.3.7 (http://getbootstrap.com) * Copyright 2011-2016 Twitter, Inc. * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) *//*! normalize.css v3.0.3 | MIT License | github.com/necolas/normalize.css */html { font-family: sans-serif; -ms-text-size-adjust: 100%; -webkit-text-size-adjust: 100%;}body { ...

dan musetoiu
dan.musetoiu@gmail.comAdministratorDan Musetoiu