import csv, os, json
import numpy as np
import pandas as pd
print(pd.__version__)
from datetime import datetime
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline
0.18.0
import round stats and scoring (wide table)
# CompuBox stats (note: newer pandas versions spell this keyword sheet_name)
kov_ward_bout = pd.read_excel('data/compubox_stats.xlsx', sheetname='Sheet3')
kov_ward_bout
 | round | kov_pun_land | kov_pun_thrw | kov_pun_perc | kov_jab_land | kov_jab_thrw | kov_jab_perc | kov_pow_land | kov_pow_thrw | kov_pow_perc | ... | ward_jab_land | ward_jab_thrw | ward_jab_perc | ward_pow_land | ward_pow_thrw | ward_pow_perc | kov_kdwns | white | blue | pink
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | 7 | 34 | 0.205882 | 4 | 22 | 0.181818 | 3 | 12 | 0.250000 | ... | 4 | 13 | 0.307692 | 1 | 7 | 0.142857 | 0 | kov | kov | kov |
1 | 2 | 16 | 49 | 0.326531 | 5 | 23 | 0.217391 | 11 | 26 | 0.423077 | ... | 3 | 12 | 0.250000 | 0 | 4 | 0.000000 | 1 | kov | kov | kov |
2 | 3 | 4 | 27 | 0.148148 | 2 | 15 | 0.133333 | 2 | 12 | 0.166667 | ... | 2 | 12 | 0.166667 | 3 | 10 | 0.300000 | 0 | kov | kov | ward |
3 | 4 | 9 | 37 | 0.243243 | 3 | 22 | 0.136364 | 6 | 15 | 0.400000 | ... | 4 | 14 | 0.285714 | 3 | 11 | 0.272727 | 0 | kov | kov | kov |
4 | 5 | 7 | 33 | 0.212121 | 3 | 17 | 0.176471 | 4 | 16 | 0.250000 | ... | 3 | 10 | 0.300000 | 5 | 14 | 0.357143 | 0 | ward | ward | kov |
5 | 6 | 9 | 36 | 0.250000 | 3 | 17 | 0.176471 | 6 | 19 | 0.315789 | ... | 5 | 13 | 0.384615 | 3 | 12 | 0.250000 | 0 | kov | ward | kov |
6 | 7 | 7 | 29 | 0.241379 | 1 | 10 | 0.100000 | 6 | 19 | 0.315789 | ... | 6 | 12 | 0.500000 | 5 | 20 | 0.250000 | 0 | ward | ward | ward |
7 | 8 | 10 | 38 | 0.263158 | 4 | 20 | 0.200000 | 6 | 18 | 0.333333 | ... | 1 | 12 | 0.083333 | 10 | 20 | 0.500000 | 0 | ward | ward | ward |
8 | 9 | 11 | 46 | 0.239130 | 4 | 22 | 0.181818 | 7 | 24 | 0.291667 | ... | 8 | 20 | 0.400000 | 9 | 18 | 0.500000 | 0 | ward | ward | ward |
9 | 10 | 21 | 58 | 0.362069 | 11 | 33 | 0.333333 | 10 | 25 | 0.400000 | ... | 9 | 23 | 0.391304 | 7 | 12 | 0.583333 | 0 | ward | ward | ward |
10 | 11 | 12 | 40 | 0.300000 | 5 | 22 | 0.227273 | 7 | 18 | 0.388889 | ... | 9 | 15 | 0.600000 | 4 | 11 | 0.363636 | 0 | ward | ward | ward |
11 | 12 | 13 | 47 | 0.276596 | 3 | 19 | 0.157895 | 10 | 28 | 0.357143 | ... | 1 | 12 | 0.083333 | 11 | 30 | 0.366667 | 0 | ward | kov | ward |
12 rows × 23 columns
kov_ward_bout['winner'] = 'split'
kov_ward_bout.loc[((kov_ward_bout.white=='kov') &
(kov_ward_bout.blue=='kov') &
(kov_ward_bout.pink=='kov')), 'winner' ] = 'kov'
kov_ward_bout.loc[((kov_ward_bout.white=='ward') &
(kov_ward_bout.blue=='ward') &
(kov_ward_bout.pink=='ward')), 'winner' ] = 'ward'
kov_ward_bout[['white', 'blue', 'pink', 'winner']]
 | white | blue | pink | winner
---|---|---|---|---
0 | kov | kov | kov | kov |
1 | kov | kov | kov | kov |
2 | kov | kov | ward | split |
3 | kov | kov | kov | kov |
4 | ward | ward | kov | split |
5 | kov | ward | kov | split |
6 | ward | ward | ward | ward |
7 | ward | ward | ward | ward |
8 | ward | ward | ward | ward |
9 | ward | ward | ward | ward |
10 | ward | ward | ward | ward |
11 | ward | kov | ward | split |
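As a sanity check on the winner column, the same label can be derived more compactly by asking whether all three judges' cards agree for a round. A rough sketch, just one of several ways to do it (judge_cards and alt_winner are scratch names); the comparison should come back True:
# judge_cards / alt_winner are scratch names for this sketch
judge_cards = kov_ward_bout[['white', 'blue', 'pink']]
alt_winner = judge_cards.apply(lambda r: r.iloc[0] if (r == r.iloc[0]).all() else 'split', axis=1)
(alt_winner == kov_ward_bout['winner']).all()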
exploratory vis analysis
nrows=2; ncols=3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16,10))
x = [-5, 50]; y = [-5, 50]
kov_ward_bout.plot.scatter('ward_jab_land', 'kov_jab_land', ax=axes[0,0])
axes[0,0].set_title('jabs')
axes[0,0].set_xlim([-1,12])
axes[0,0].set_ylim([-1,12])
kov_ward_bout.plot.scatter('ward_pow_land', 'kov_pow_land', ax=axes[0,1])
axes[0,1].set_title('power')
axes[0,1].set_xlim([-1,12])
axes[0,1].set_ylim([-1,12])
kov_ward_bout.plot.scatter('ward_pun_land', 'kov_pun_land', ax=axes[0,2])
axes[0,2].set_title('total')
axes[0,2].set_xlim([-1,24])
axes[0,2].set_ylim([-1,24])
kov_ward_bout.plot.scatter('ward_jab_perc', 'kov_jab_perc', ax=axes[1,0])
axes[1,0].set_xlim([-0.05,.65])
axes[1,0].set_ylim([-0.05,.65])
kov_ward_bout.plot.scatter('ward_pow_perc', 'kov_pow_perc', ax=axes[1,1])
axes[1,1].set_xlim([-0.05,.65])
axes[1,1].set_ylim([-0.05,.65])
kov_ward_bout.plot.scatter('ward_pun_perc', 'kov_pun_perc', ax=axes[1,2])
axes[1,2].set_xlim([-0.05,.65])
axes[1,2].set_ylim([-0.05,.65])
axes[1,2].text(0.6, 0., 'endlesspint.com',
fontsize=12, color='gray',
ha='right', va='bottom', alpha=0.3)
for row in range(nrows):
    for col in range(ncols):
        axes[row, col].set_aspect('equal')
        axes[row, col].plot(x, y, '--')
        axes[row, col].grid(False, which='both')
### save file locally with high resolution
# plt.savefig('img/fight_hour_tweets.PNG', dpi=1200)
same as above but with dots colored by round winner/split
nrows=2; ncols=3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16,10))
x_line = [-5, 50]; y_line = [-5, 50]
kov_ward_bout[kov_ward_bout.winner=='kov'].plot.scatter('ward_jab_land', 'kov_jab_land', ax=axes[0,0], c='r', s=50)
kov_ward_bout[kov_ward_bout.winner=='ward'].plot.scatter('ward_jab_land', 'kov_jab_land', ax=axes[0,0], c='b', s=50)
kov_ward_bout[kov_ward_bout.winner=='split'].plot.scatter('ward_jab_land', 'kov_jab_land', ax=axes[0,0], c='g', s=50)
axes[0,0].set_title('jabs')
axes[0,0].set_xlim([-1,12]); axes[0,0].set_ylim([-1,12])
axes[0,0].set_xlabel(''); axes[0,0].set_ylabel('Kovalev')
kov_ward_bout[kov_ward_bout.winner=='kov'].plot.scatter('ward_pow_land', 'kov_pow_land', ax=axes[0,1], c='r', s=50)
kov_ward_bout[kov_ward_bout.winner=='ward'].plot.scatter('ward_pow_land', 'kov_pow_land', ax=axes[0,1], c='b', s=50)
kov_ward_bout[kov_ward_bout.winner=='split'].plot.scatter('ward_pow_land', 'kov_pow_land', ax=axes[0,1], c='g', s=50)
axes[0,1].set_title('power punches')
axes[0,1].set_xlim([-1,12]); axes[0,1].set_ylim([-1,12])
axes[0,1].set_xlabel(''); axes[0,1].set_ylabel('')
kov_ward_bout[kov_ward_bout.winner=='kov'].plot.scatter('ward_pun_land', 'kov_pun_land', ax=axes[0,2], c='r', s=50)
kov_ward_bout[kov_ward_bout.winner=='ward'].plot.scatter('ward_pun_land', 'kov_pun_land', ax=axes[0,2], c='b', s=50)
kov_ward_bout[kov_ward_bout.winner=='split'].plot.scatter('ward_pun_land', 'kov_pun_land', ax=axes[0,2], c='g', s=50)
axes[0,2].set_title('total')
axes[0,2].set_xlim([-1,24]); axes[0,2].set_ylim([-1,24])
axes[0,2].set_xlabel(''); axes[0,2].set_ylabel('')
kov_ward_bout[kov_ward_bout.winner=='kov'].plot.scatter('ward_jab_perc', 'kov_jab_perc', ax=axes[1,0], c='r', s=50)
kov_ward_bout[kov_ward_bout.winner=='ward'].plot.scatter('ward_jab_perc', 'kov_jab_perc', ax=axes[1,0], c='b', s=50)
kov_ward_bout[kov_ward_bout.winner=='split'].plot.scatter('ward_jab_perc', 'kov_jab_perc', ax=axes[1,0], c='g', s=50)
axes[1,0].set_xlim([-0.05,.65]);axes[1,0].set_ylim([-0.05,.65])
axes[1,0].set_xlabel('Ward'); axes[1,0].set_ylabel('Kovalev')
kov_ward_bout[kov_ward_bout.winner=='kov'].plot.scatter('ward_pow_perc', 'kov_pow_perc', ax=axes[1,1], c='r', s=50)
kov_ward_bout[kov_ward_bout.winner=='ward'].plot.scatter('ward_pow_perc', 'kov_pow_perc', ax=axes[1,1], c='b', s=50)
kov_ward_bout[kov_ward_bout.winner=='split'].plot.scatter('ward_pow_perc', 'kov_pow_perc', ax=axes[1,1], c='g', s=50)
axes[1,1].set_xlim([-0.05,.65]); axes[1,1].set_ylim([-0.05,.65])
axes[1,1].set_xlabel('Ward'); axes[1,1].set_ylabel('')
kov_ward_bout[kov_ward_bout.winner=='kov'].plot.scatter('ward_pun_perc', 'kov_pun_perc', ax=axes[1,2], c='r', s=50)
kov_ward_bout[kov_ward_bout.winner=='ward'].plot.scatter('ward_pun_perc', 'kov_pun_perc', ax=axes[1,2], c='b', s=50)
kov_ward_bout[kov_ward_bout.winner=='split'].plot.scatter('ward_pun_perc', 'kov_pun_perc', ax=axes[1,2], c='g', s=50)
axes[1,2].set_xlim([-0.05,.65]); axes[1,2].set_ylim([-0.05,.65])
axes[1,2].set_xlabel('Ward'); axes[1,2].set_ylabel('')
axes[1,2].text(0.6, 0., 'endlesspint.com',
fontsize=12, color='gray',
ha='right', va='bottom', alpha=0.3)
for row in range(nrows):
    for col in range(ncols):
        axes[row, col].set_aspect('equal')
        axes[row, col].plot(x_line, y_line, 'c--')
plt.tight_layout(h_pad=1.0)
### save file locally with high resolution
# plt.savefig('img/fight_hour_tweets.PNG', dpi=1200)
notes
- calculate the perpendicular distance to the 45-degree line for each fighter to get an idea of “distance from parity” (see the sketch after these notes)
- Kov out-landed Ward in power shots in 7 rounds; two of those dots/rounds (4 & 6) fall in the same location, (3, 6)
- Ward was the better marksman in every round except the one in which he was floored
- Round 10 is a curious one:
  - Twitter saw it for Kov, the judges for Ward
  - Kov landed his most jabs and total punches of the fight in that round, while also throwing his most
  - his 10 power shots landed tied round 12 for his second-best output of the fight
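A minimal sketch of the “distance from parity” idea from the first note: the signed perpendicular distance from each round's (Ward, Kov) point to the 45-degree line, shown here for total punches landed (positive values favor Kovalev; pun_land_parity is just a scratch column name):
# pun_land_parity is a scratch column, not part of the original sheet;
# distance from (x0, y0) to the line y = x is (y0 - x0) / sqrt(2), signed here so Kov > 0
kov_ward_bout['pun_land_parity'] = (kov_ward_bout.kov_pun_land - kov_ward_bout.ward_pun_land) / np.sqrt(2)
kov_ward_bout[['round', 'pun_land_parity', 'winner']]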
kov_ward_bout.kov_pun_perc > kov_ward_bout.ward_pun_perc
0 False
1 True
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
dtype: bool
ax = kov_ward_bout.plot.scatter(x='kov_jab_land', y='kov_pow_land', color='DarkBlue', label='Kov');
kov_ward_bout.plot.scatter(x='ward_jab_land', y='ward_pow_land', color='DarkGreen', label='Ward', title='Nom', ax=ax);
plt.figure(figsize=(8,8))
plt.scatter(x=kov_ward_bout.kov_jab_land, y=kov_ward_bout.kov_pow_land, c='r')
plt.scatter(x=kov_ward_bout.ward_jab_land, y=kov_ward_bout.ward_pow_land)
plt.plot(x_line,y_line,'--')
plt.xlim((-1,12))
plt.ylim((-1,12))
plt.axes().set_aspect('equal')
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2, figsize=(15,15))
Kov1 = ax1.scatter(x=kov_ward_bout[kov_ward_bout['round']<=6]['kov_jab_land'],
y=kov_ward_bout[kov_ward_bout['round']<=6]['kov_pow_land'],
c='r', s=50, alpha=.7)
Kov2 = ax1.scatter(x=kov_ward_bout[kov_ward_bout['round']>6]['kov_jab_land'],
y=kov_ward_bout[kov_ward_bout['round']>6]['kov_pow_land'],
c='r', marker='s', s=50, alpha=.7)
Ward1 = ax1.scatter(x=kov_ward_bout[kov_ward_bout['round']<=6]['ward_jab_land'],
y=kov_ward_bout[kov_ward_bout['round']<=6]['ward_pow_land'],
c='b', s=50, alpha=.7)
Ward2 = ax1.scatter(x=kov_ward_bout[kov_ward_bout['round']>6]['ward_jab_land'],
y=kov_ward_bout[kov_ward_bout['round']>6]['ward_pow_land'],
c='b', marker='s', s=50, alpha=.7)
ax1.set_xlim((-1,12))
ax1.set_ylim((-1,12))
ax1.set_xlabel('jab count')
ax1.set_ylabel('power count')
ax2.scatter(x=kov_ward_bout[kov_ward_bout['round']<=6]['kov_jab_perc'],
y=kov_ward_bout[kov_ward_bout['round']<=6]['kov_pow_perc'],
c='r', s=50, alpha=.7)
ax2.scatter(x=kov_ward_bout[kov_ward_bout['round']>6]['kov_jab_perc'],
y=kov_ward_bout[kov_ward_bout['round']>6]['kov_pow_perc'],
c='r', marker='s', s=50, alpha=.7)
ax2.scatter(x=kov_ward_bout[kov_ward_bout['round']<=6]['ward_jab_perc'],
y=kov_ward_bout[kov_ward_bout['round']<=6]['ward_pow_perc'],
c='b', s=50, alpha=.7)
ax2.scatter(x=kov_ward_bout[kov_ward_bout['round']>6]['ward_jab_perc'],
y=kov_ward_bout[kov_ward_bout['round']>6]['ward_pow_perc'],
c='b', marker='s', s=50, alpha=.7)
ax2.set_xlim((-0.05,.65))
ax2.set_ylim((-0.05,.65))
ax2.set_xlabel('jab percent')
ax2.set_ylabel('power percent')
ax3.scatter(x=kov_ward_bout[kov_ward_bout.winner=='kov']['kov_jab_land'],
y=kov_ward_bout[kov_ward_bout.winner=='kov']['kov_pow_land'], c='r', marker='o', s=80, alpha=.7)
ax3.scatter(x=kov_ward_bout[kov_ward_bout.winner=='split']['kov_jab_land'],
y=kov_ward_bout[kov_ward_bout.winner=='split']['kov_pow_land'], c='g', marker='o', s=50, alpha=.7)
ax3.scatter(x=kov_ward_bout[kov_ward_bout.winner=='ward']['kov_jab_land'],
y=kov_ward_bout[kov_ward_bout.winner=='ward']['kov_pow_land'], c='r', marker='x', s=50, alpha=.7)
ax3.scatter(x=kov_ward_bout[kov_ward_bout.winner=='ward']['ward_jab_land'],
y=kov_ward_bout[kov_ward_bout.winner=='ward']['ward_pow_land'], c='b', marker='s', s=80, alpha=.7)
ax3.scatter(x=kov_ward_bout[kov_ward_bout.winner=='split']['ward_jab_land'],
y=kov_ward_bout[kov_ward_bout.winner=='split']['ward_pow_land'], c='g', marker='s', s=50, alpha=.7)
ax3.scatter(x=kov_ward_bout[kov_ward_bout.winner=='kov']['ward_jab_land'],
y=kov_ward_bout[kov_ward_bout.winner=='kov']['ward_pow_land'], c='b', marker='x', s=50, alpha=.7)
ax3.set_xlim((-1,12))
ax3.set_ylim((-1,12))
ax3.set_xlabel('jab count')
ax3.set_ylabel('power count')
# ax4.scatter(x=kov_ward_bout.kov_jab_perc, y=kov_ward_bout.kov_pow_perc, c='r', marker='D', s=50, alpha=.7)
# ax4.scatter(x=kov_ward_bout.ward_jab_perc, y=kov_ward_bout.ward_pow_perc, c='b', marker='s', s=50, alpha=.7)
ax4.scatter(x=kov_ward_bout[kov_ward_bout.winner=='kov']['kov_jab_perc'],
y=kov_ward_bout[kov_ward_bout.winner=='kov']['kov_pow_perc'], c='r', marker='o', s=80, alpha=.7)
ax4.scatter(x=kov_ward_bout[kov_ward_bout.winner=='split']['kov_jab_perc'],
y=kov_ward_bout[kov_ward_bout.winner=='split']['kov_pow_perc'], c='g', marker='o', s=50, alpha=.7)
ax4.scatter(x=kov_ward_bout[kov_ward_bout.winner=='ward']['kov_jab_perc'],
y=kov_ward_bout[kov_ward_bout.winner=='ward']['kov_pow_perc'], c='r', marker='x', s=50, alpha=.7)
ax4.scatter(x=kov_ward_bout[kov_ward_bout.winner=='ward']['ward_jab_perc'],
y=kov_ward_bout[kov_ward_bout.winner=='ward']['ward_pow_perc'], c='b', marker='s', s=80, alpha=.7)
ax4.scatter(x=kov_ward_bout[kov_ward_bout.winner=='split']['ward_jab_perc'],
y=kov_ward_bout[kov_ward_bout.winner=='split']['ward_pow_perc'], c='g', marker='s', s=50, alpha=.7)
ax4.scatter(x=kov_ward_bout[kov_ward_bout.winner=='kov']['ward_jab_perc'],
y=kov_ward_bout[kov_ward_bout.winner=='kov']['ward_pow_perc'], c='b', marker='x', s=50, alpha=.7)
ax4.set_xlim((-0.05,.65))
ax4.set_ylim((-0.05,.65))
ax4.set_xlabel('jab percent')
ax4.set_ylabel('power percent')
ax4.text(0.6, 0., 'endlesspint.com',
fontsize=12, color='gray',
ha='right', va='bottom', alpha=0.3)
for ax in [ax1, ax2, ax3, ax4]:
    ax.plot(x_line, y_line, 'c--')
    ax.set_aspect('equal')
f.legend((Kov1, Ward1, Kov2, Ward2),
('Kov Rd 1-6', 'Ward Rd 1-6', 'Kov Rd 7-12', 'Ward Rd 7-12'), 'right')
### save file locally with high resolution
# plt.savefig('img/fight_hour_tweets.PNG', dpi=1200)
<matplotlib.legend.Legend at 0xa634160>
notes
- there are two Kov rounds (4 & 6) with (nearly) the same stats but different results (1 unanimous win, 1 split)
- every unanimous Ward round had him connecting at 50% or better in at least one punch category (quick check below)
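A quick check of that second note against the wide table (a sketch; ward_pun_perc is one of the columns hidden behind the '...' in the display above):
# ward_percs is a scratch frame for this check
ward_percs = kov_ward_bout[['ward_jab_perc', 'ward_pow_perc', 'ward_pun_perc']]
(ward_percs[kov_ward_bout.winner == 'ward'] >= 0.5).any(axis=1)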
ax = kov_ward_bout.plot.scatter(x='kov_jab_perc', y='kov_pow_perc', color='Red', label='Kov');
kov_ward_bout.plot.scatter(x='ward_jab_perc', y='ward_pow_perc', color='Blue', label='Ward', title='Per', ax=ax);
round stats and scoring (long table)
kov_ward_bout.columns
Index([ u'round', u'kov_pun_land', u'kov_pun_thrw', u'kov_pun_perc',
u'kov_jab_land', u'kov_jab_thrw', u'kov_jab_perc', u'kov_pow_land',
u'kov_pow_thrw', u'kov_pow_perc', u'ward_pun_land', u'ward_pun_thrw',
u'ward_pun_perc', u'ward_jab_land', u'ward_jab_thrw', u'ward_jab_perc',
u'ward_pow_land', u'ward_pow_thrw', u'ward_pow_perc', u'kov_kdwns',
u'white', u'blue', u'pink', u'winner'],
dtype='object')
cols_non_perc = ['kov_pun_land', 'kov_pun_thrw', 'kov_jab_land', 'kov_jab_thrw', 'kov_pow_land', 'kov_pow_thrw',
'ward_pun_land', 'ward_pun_thrw', 'ward_jab_land', 'ward_jab_thrw', 'ward_pow_land', 'ward_pow_thrw',
'kov_kdwns']
kov_ward_bout[cols_non_perc].head()
 | kov_pun_land | kov_pun_thrw | kov_jab_land | kov_jab_thrw | kov_pow_land | kov_pow_thrw | ward_pun_land | ward_pun_thrw | ward_jab_land | ward_jab_thrw | ward_pow_land | ward_pow_thrw | kov_kdwns
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 7 | 34 | 4 | 22 | 3 | 12 | 5 | 20 | 4 | 13 | 1 | 7 | 0 |
1 | 16 | 49 | 5 | 23 | 11 | 26 | 3 | 16 | 3 | 12 | 0 | 4 | 1 |
2 | 4 | 27 | 2 | 15 | 2 | 12 | 5 | 22 | 2 | 12 | 3 | 10 | 0 |
3 | 9 | 37 | 3 | 22 | 6 | 15 | 7 | 25 | 4 | 14 | 3 | 11 | 0 |
4 | 7 | 33 | 3 | 17 | 4 | 16 | 8 | 24 | 3 | 10 | 5 | 14 | 0 |
tup_non_perc = ('kov_pun_land', 'kov_pun_thrw', 'kov_jab_land', 'kov_jab_thrw', 'kov_pow_land', 'kov_pow_thrw',
'ward_pun_land', 'ward_pun_thrw', 'ward_jab_land', 'ward_jab_thrw', 'ward_pow_land', 'ward_pow_thrw',
'kov_kdwns')
judges = ['white', 'blue', 'pink']
kov_ward_long = pd.DataFrame()
for judge in judges:
    temp_df = pd.melt(kov_ward_bout, id_vars=tup_non_perc, value_vars=[judge])
    kov_ward_long = kov_ward_long.append(temp_df)
kov_ward_long.reset_index(drop=True, inplace=True)
kov_ward_long.drop('variable', axis=1, inplace=True)
print(kov_ward_long.shape)
kov_ward_long.head(12)
(36, 14)
 | kov_pun_land | kov_pun_thrw | kov_jab_land | kov_jab_thrw | kov_pow_land | kov_pow_thrw | ward_pun_land | ward_pun_thrw | ward_jab_land | ward_jab_thrw | ward_pow_land | ward_pow_thrw | kov_kdwns | value
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 7 | 34 | 4 | 22 | 3 | 12 | 5 | 20 | 4 | 13 | 1 | 7 | 0 | kov |
1 | 16 | 49 | 5 | 23 | 11 | 26 | 3 | 16 | 3 | 12 | 0 | 4 | 1 | kov |
2 | 4 | 27 | 2 | 15 | 2 | 12 | 5 | 22 | 2 | 12 | 3 | 10 | 0 | kov |
3 | 9 | 37 | 3 | 22 | 6 | 15 | 7 | 25 | 4 | 14 | 3 | 11 | 0 | kov |
4 | 7 | 33 | 3 | 17 | 4 | 16 | 8 | 24 | 3 | 10 | 5 | 14 | 0 | ward |
5 | 9 | 36 | 3 | 17 | 6 | 19 | 8 | 25 | 5 | 13 | 3 | 12 | 0 | kov |
6 | 7 | 29 | 1 | 10 | 6 | 19 | 11 | 32 | 6 | 12 | 5 | 20 | 0 | ward |
7 | 10 | 38 | 4 | 20 | 6 | 18 | 11 | 32 | 1 | 12 | 10 | 20 | 0 | ward |
8 | 11 | 46 | 4 | 22 | 7 | 24 | 17 | 38 | 8 | 20 | 9 | 18 | 0 | ward |
9 | 21 | 58 | 11 | 33 | 10 | 25 | 16 | 35 | 9 | 23 | 7 | 12 | 0 | ward |
10 | 12 | 40 | 5 | 22 | 7 | 18 | 13 | 26 | 9 | 15 | 4 | 11 | 0 | ward |
11 | 13 | 47 | 3 | 19 | 10 | 28 | 12 | 42 | 1 | 12 | 11 | 30 | 0 | ward |
sklearn ensemble techniques
source: http://scikit-learn.org/stable/modules/ensemble.html
source: http://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/
Bagging
from sklearn import cross_validation  # note: newer scikit-learn moved this functionality to sklearn.model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
array = kov_ward_long.values
X = array[:,0:13]
y = array[:,13]
num_folds = 10
num_instances = len(X)
seed = 7
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
cart = DecisionTreeClassifier()
num_trees = 100
clf_bgg = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)
results = cross_validation.cross_val_score(clf_bgg, X, y, cv=kfold)
print(results.mean())
0.858333333333
clf_bgg.fit(X, y)
BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best'),
bootstrap=True, bootstrap_features=False, max_features=1.0,
max_samples=1.0, n_estimators=100, n_jobs=1, oob_score=False,
random_state=7, verbose=0, warm_start=False)
print(clf_bgg.predict(X[:12]))
print(clf_bgg.predict_proba(X[:12]))
[u'kov' u'kov' u'kov' u'kov' u'ward' u'kov' u'ward' u'ward' u'ward' u'ward'
u'ward' u'ward']
[[ 0.998 0.002 ]
[ 0.99404762 0.00595238]
[ 0.74559524 0.25440476]
[ 0.98216667 0.01783333]
[ 0.38943651 0.61056349]
[ 0.66216667 0.33783333]
[ 0.0075 0.9925 ]
[ 0.015 0.985 ]
[ 0. 1. ]
[ 0.00833333 0.99166667]
[ 0.03 0.97 ]
[ 0.28569048 0.71430952]]
pd.DataFrame(np.hstack((np.reshape(clf_bgg.predict(X[:12]), (12,1)), clf_bgg.predict_proba(X[:12]))),
columns=['winner', 'prob_kov', 'prob_ward'])
 | winner | prob_kov | prob_ward
---|---|---|---
0 | kov | 0.998 | 0.002 |
1 | kov | 0.994048 | 0.00595238 |
2 | kov | 0.745595 | 0.254405 |
3 | kov | 0.982167 | 0.0178333 |
4 | ward | 0.389437 | 0.610563 |
5 | kov | 0.662167 | 0.337833 |
6 | ward | 0.0075 | 0.9925 |
7 | ward | 0.015 | 0.985 |
8 | ward | 0 | 1 |
9 | ward | 0.00833333 | 0.991667 |
10 | ward | 0.03 | 0.97 |
11 | ward | 0.28569 | 0.71431 |
Random Forest
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
array = kov_ward_long.values
X = array[:,0:13]
y = array[:,13]
num_folds = 10
num_instances = len(X)
seed = 7
num_trees = 100
max_features = 3
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
clf_RF = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
results = cross_validation.cross_val_score(clf_RF, X, y, cv=kfold)
print(results.mean())
0.8
clf_RF.fit(X,y)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features=3, max_leaf_nodes=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
print(clf_RF.predict(X[:12]))
print(clf_RF.predict_proba(X[:12]))
[u'kov' u'kov' u'kov' u'kov' u'ward' u'kov' u'ward' u'ward' u'ward' u'ward'
u'ward' u'ward']
[[ 0.98933333 0.01066667]
[ 0.97083333 0.02916667]
[ 0.6694881 0.3305119 ]
[ 0.99333333 0.00666667]
[ 0.35454762 0.64545238]
[ 0.70514286 0.29485714]
[ 0. 1. ]
[ 0.00991667 0.99008333]
[ 0. 1. ]
[ 0.00333333 0.99666667]
[ 0.035 0.965 ]
[ 0.2872619 0.7127381 ]]
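For a quick side-by-side of the two ensembles, a sketch collecting each model's Kovalev probability per round (in-sample, since both were fit on the full long table; the column names below are just for this sketch):
# bagging_prob_kov / rf_prob_kov are scratch column names for this sketch
pd.DataFrame({'round': kov_ward_bout['round'].values,
              'bagging_prob_kov': clf_bgg.predict_proba(X[:12])[:, 0],
              'rf_prob_kov': clf_RF.predict_proba(X[:12])[:, 0]})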
print(kov_ward_long.columns[:13])
print(np.sum(clf_RF.feature_importances_))
clf_RF.feature_importances_
Index([u'kov_pun_land', u'kov_pun_thrw', u'kov_jab_land', u'kov_jab_thrw',
u'kov_pow_land', u'kov_pow_thrw', u'ward_pun_land', u'ward_pun_thrw',
u'ward_jab_land', u'ward_jab_thrw', u'ward_pow_land', u'ward_pow_thrw',
u'kov_kdwns'],
dtype='object')
1.0
array([ 0.03661623, 0.03374295, 0.00706595, 0.02572995, 0.03433428,
0.06633053, 0.19602896, 0.14064 , 0.07438218, 0.03191946,
0.27581351, 0.07112938, 0.00626662])
rf_feat_imp = pd.DataFrame(zip(kov_ward_long.columns[:13], clf_RF.feature_importances_), columns=['feature', 'importance'])
rf_feat_imp.sort_values('importance', ascending=False, inplace=True)
rf_feat_imp
 | feature | importance
---|---|---
10 | ward_pow_land | 0.275814 |
6 | ward_pun_land | 0.196029 |
7 | ward_pun_thrw | 0.140640 |
8 | ward_jab_land | 0.074382 |
11 | ward_pow_thrw | 0.071129 |
5 | kov_pow_thrw | 0.066331 |
0 | kov_pun_land | 0.036616 |
4 | kov_pow_land | 0.034334 |
1 | kov_pun_thrw | 0.033743 |
9 | ward_jab_thrw | 0.031919 |
3 | kov_jab_thrw | 0.025730 |
2 | kov_jab_land | 0.007066 |
12 | kov_kdwns | 0.006267 |
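The same importances as a quick horizontal bar chart (a sketch off the sorted frame above):
rf_feat_imp.plot.barh(x='feature', y='importance', legend=False, figsize=(8, 6))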
how about that round 10? maybe one more go
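Before the next model, a sketch pulling round 10 back out of the two fitted ensembles above; both lean heavily toward Ward:
# rd10 is a scratch variable (round 10 = row index 9 of X)
rd10 = X[9].reshape(1, -1)
print(clf_bgg.predict_proba(rd10))
print(clf_RF.predict_proba(rd10))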
kNN
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
array = kov_ward_long.values
X = array[:,0:13]
y = array[:,13]
num_folds = 10
num_instances = len(X)
seed = 7
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
results = cross_validation.cross_val_score(bagging, X, y, cv=kfold)
print(results.mean())
0.666666666667
bagging.fit(X, y)
print(bagging.predict(X[:12]))
print(bagging.predict_proba(X[:12]))
[u'kov' u'kov' u'kov' u'kov' u'kov' u'kov' u'ward' u'ward' u'ward' u'ward'
u'ward' u'ward']
[[ 0.86 0.14]
[ 0.56 0.44]
[ 0.62 0.38]
[ 0.7 0.3 ]
[ 0.68 0.32]
[ 0.64 0.36]
[ 0.3 0.7 ]
[ 0.26 0.74]
[ 0.14 0.86]
[ 0.18 0.82]
[ 0.36 0.64]
[ 0.2 0.8 ]]
zip(bagging.predict(X[:12]), bagging.predict_proba(X[:12]))
[(u'kov', array([ 0.86, 0.14])),
(u'kov', array([ 0.56, 0.44])),
(u'kov', array([ 0.62, 0.38])),
(u'kov', array([ 0.7, 0.3])),
(u'kov', array([ 0.68, 0.32])),
(u'kov', array([ 0.64, 0.36])),
(u'ward', array([ 0.3, 0.7])),
(u'ward', array([ 0.26, 0.74])),
(u'ward', array([ 0.14, 0.86])),
(u'ward', array([ 0.18, 0.82])),
(u'ward', array([ 0.36, 0.64])),
(u'ward', array([ 0.2, 0.8]))]
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
array = kov_ward_long.values
X = array[:,0:13]
y = array[:,13]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)
# check classification accuracy of KNN with K=5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
0.777777777778
from sklearn.cross_validation import cross_val_score
# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(scores)
[ 0.6 0.75 0.75 0.25 0.75 1.
0.33333333 1. 0.66666667 0.66666667]
# use average accuracy as an estimate of out-of-sample accuracy
print(scores.mean())
0.676666666667
# search for an optimal value of K for KNN
k_range = range(1, 31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())
# print k_scores
# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
<matplotlib.text.Text at 0xbeb65c0>
# 10-fold cross-validation with the best KNN model
knn = KNeighborsClassifier(n_neighbors=13)
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())
0.871666666667
knn.fit(X, y)
zip(knn.predict(X[:12]), knn.predict_proba(X[:12]))
[(u'kov', array([ 0.69230769, 0.30769231])),
(u'kov', array([ 0.69230769, 0.30769231])),
(u'kov', array([ 0.69230769, 0.30769231])),
(u'kov', array([ 0.69230769, 0.30769231])),
(u'kov', array([ 0.69230769, 0.30769231])),
(u'kov', array([ 0.53846154, 0.46153846])),
(u'ward', array([ 0.30769231, 0.69230769])),
(u'ward', array([ 0.46153846, 0.53846154])),
(u'ward', array([ 0.07692308, 0.92307692])),
(u'ward', array([ 0.15384615, 0.84615385])),
(u'ward', array([ 0.46153846, 0.53846154])),
(u'ward', array([ 0.07692308, 0.92307692]))]
# avg predicted Kov probability over rounds 1-6
np.mean(knn.predict_proba(X[:12])[:6,0])
0.66666666666666663
# avg predicted Ward probability over rounds 7-12
np.mean(knn.predict_proba(X[:12])[6:,1])
0.74358974358974361
… and KMeans (why not?)
from sklearn.cluster import KMeans
# identify the twelve rounds as 'X'
X_12rds = X[:12,]
X_12rds
array([[7L, 34L, 4L, 22L, 3L, 12L, 5L, 20L, 4L, 13L, 1L, 7L, 0L],
[16L, 49L, 5L, 23L, 11L, 26L, 3L, 16L, 3L, 12L, 0L, 4L, 1L],
[4L, 27L, 2L, 15L, 2L, 12L, 5L, 22L, 2L, 12L, 3L, 10L, 0L],
[9L, 37L, 3L, 22L, 6L, 15L, 7L, 25L, 4L, 14L, 3L, 11L, 0L],
[7L, 33L, 3L, 17L, 4L, 16L, 8L, 24L, 3L, 10L, 5L, 14L, 0L],
[9L, 36L, 3L, 17L, 6L, 19L, 8L, 25L, 5L, 13L, 3L, 12L, 0L],
[7L, 29L, 1L, 10L, 6L, 19L, 11L, 32L, 6L, 12L, 5L, 20L, 0L],
[10L, 38L, 4L, 20L, 6L, 18L, 11L, 32L, 1L, 12L, 10L, 20L, 0L],
[11L, 46L, 4L, 22L, 7L, 24L, 17L, 38L, 8L, 20L, 9L, 18L, 0L],
[21L, 58L, 11L, 33L, 10L, 25L, 16L, 35L, 9L, 23L, 7L, 12L, 0L],
[12L, 40L, 5L, 22L, 7L, 18L, 13L, 26L, 9L, 15L, 4L, 11L, 0L],
[13L, 47L, 3L, 19L, 10L, 28L, 12L, 42L, 1L, 12L, 11L, 30L, 0L]], dtype=object)
# identify unanimous rounds for training
X_unanm = X_12rds[np.array(kov_ward_bout['winner']!='split')]
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_unanm)
kmeans.labels_
array([0, 0, 0, 0, 0, 1, 1, 0])
# predict on split rounds
X_split = X_12rds[np.array(kov_ward_bout['winner']=='split')]
kmeans.predict(X_split)
array([0, 0, 0, 1])
kmeans.cluster_centers_
array([[ 10.16666667, 37.83333333, 3.66666667, 19.83333333,
6.5 , 18. , 8.33333333, 25.16666667,
4.5 , 13. , 3.83333333, 12.16666667,
0.16666667],
[ 16. , 52. , 7.5 , 27.5 ,
8.5 , 24.5 , 16.5 , 36.5 ,
8.5 , 21.5 , 8. , 15. , 0. ]])
maybe too many features, let’s go with the three most important as identified by RF
X_impFeat = X[:12,[6,7,10]]
X_impFeat
array([[5L, 20L, 1L],
[3L, 16L, 0L],
[5L, 22L, 3L],
[7L, 25L, 3L],
[8L, 24L, 5L],
[8L, 25L, 3L],
[11L, 32L, 5L],
[11L, 32L, 10L],
[17L, 38L, 9L],
[16L, 35L, 7L],
[13L, 26L, 4L],
[12L, 42L, 11L]], dtype=object)
# identify unanimous rounds for training
X_unanm = X_impFeat[np.array(kov_ward_bout['winner']!='split')]
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_unanm)
kmeans.labels_
array([1, 1, 1, 0, 0, 0, 0, 0])
# predict on split rounds
X_split = X_impFeat[np.array(kov_ward_bout['winner']=='split')]
kmeans.predict(X_split)
array([1, 1, 1, 0])
# just run on all 12 at once
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_impFeat)
kmeans.labels_
array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1])
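A quick side-by-side of those all-twelve-round labels against the judges (a sketch; kmeans_label is just a scratch column name):
# kmeans_label is a scratch column name for this sketch
pd.DataFrame({'round': kov_ward_bout['round'].values,
              'kmeans_label': kmeans.labels_,
              'winner': kov_ward_bout['winner'].values})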
source: http://matplotlib.org/mpl_toolkits/mplot3d/tutorial.html; http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html#sphx-glr-auto-examples-cluster-plot-cluster-iris-py
print(plt.style.available)
[u'seaborn-darkgrid', u'seaborn-notebook', u'classic', u'seaborn-ticks', u'grayscale', u'bmh', u'seaborn-talk', u'dark_background', u'ggplot', u'fivethirtyeight', u'seaborn-colorblind', u'seaborn-deep', u'seaborn-whitegrid', u'seaborn-bright', u'seaborn-poster', u'seaborn-muted', u'seaborn-paper', u'seaborn-white', u'seaborn-pastel', u'seaborn-dark', u'seaborn-dark-palette']
X_impFeat[np.array(kov_ward_bout['winner']=='split'),0]
array([5L, 8L, 8L, 12L], dtype=object)
for i in zip(['kov', 'ward', 'split'], ['red', 'blue', 'green']):
    print(i[0])
kov
ward
split
from mpl_toolkits.mplot3d import Axes3D
plt.style.use(u'seaborn-colorblind')
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d')
# ADD COLOR by round win/split
# ax.scatter(X_impFeat[:,0], X_impFeat[:,1], X_impFeat[:,2])
for i in zip(['kov', 'ward', 'split'], ['red', 'blue', 'green']):
    ax.scatter(X_impFeat[np.array(kov_ward_bout['winner']==i[0]), 0],
               X_impFeat[np.array(kov_ward_bout['winner']==i[0]), 1],
               X_impFeat[np.array(kov_ward_bout['winner']==i[0]), 2],
               c=i[1], s=50)
ax.set_xlabel('ward_pun_land')
ax.set_ylabel('ward_pun_thrw')
ax.set_zlabel('ward_pow_land')
ax.text(27, 8, 0, 'endlesspint.com',
fontsize=12, color='gray',
ha='right', va='bottom', rotation=180, alpha=0.3)
plt.show()
### save file locally with high resolution
# plt.savefig('img/fight_hour_tweets.PNG', dpi=1200)