import csv, os, json
import numpy as np
import pandas as pd
print(pd.__version__)
from datetime import datetime
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline
0.18.0
import round stats and scoring (wide table)
# CompuBox stats (note: newer pandas versions spell this keyword sheet_name)
kov_ward_bout = pd.read_excel('data/compubox_stats.xlsx', sheetname='Sheet3')
kov_ward_bout
 | round | kov_pun_land | kov_pun_thrw | kov_pun_perc | kov_jab_land | kov_jab_thrw | kov_jab_perc | kov_pow_land | kov_pow_thrw | kov_pow_perc | ... | ward_jab_land | ward_jab_thrw | ward_jab_perc | ward_pow_land | ward_pow_thrw | ward_pow_perc | kov_kdwns | white | blue | pink
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | 7 | 34 | 0.205882 | 4 | 22 | 0.181818 | 3 | 12 | 0.250000 | ... | 4 | 13 | 0.307692 | 1 | 7 | 0.142857 | 0 | kov | kov | kov |
1 | 2 | 16 | 49 | 0.326531 | 5 | 23 | 0.217391 | 11 | 26 | 0.423077 | ... | 3 | 12 | 0.250000 | 0 | 4 | 0.000000 | 1 | kov | kov | kov |
2 | 3 | 4 | 27 | 0.148148 | 2 | 15 | 0.133333 | 2 | 12 | 0.166667 | ... | 2 | 12 | 0.166667 | 3 | 10 | 0.300000 | 0 | kov | kov | ward |
3 | 4 | 9 | 37 | 0.243243 | 3 | 22 | 0.136364 | 6 | 15 | 0.400000 | ... | 4 | 14 | 0.285714 | 3 | 11 | 0.272727 | 0 | kov | kov | kov |
4 | 5 | 7 | 33 | 0.212121 | 3 | 17 | 0.176471 | 4 | 16 | 0.250000 | ... | 3 | 10 | 0.300000 | 5 | 14 | 0.357143 | 0 | ward | ward | kov |
5 | 6 | 9 | 36 | 0.250000 | 3 | 17 | 0.176471 | 6 | 19 | 0.315789 | ... | 5 | 13 | 0.384615 | 3 | 12 | 0.250000 | 0 | kov | ward | kov |
6 | 7 | 7 | 29 | 0.241379 | 1 | 10 | 0.100000 | 6 | 19 | 0.315789 | ... | 6 | 12 | 0.500000 | 5 | 20 | 0.250000 | 0 | ward | ward | ward |
7 | 8 | 10 | 38 | 0.263158 | 4 | 20 | 0.200000 | 6 | 18 | 0.333333 | ... | 1 | 12 | 0.083333 | 10 | 20 | 0.500000 | 0 | ward | ward | ward |
8 | 9 | 11 | 46 | 0.239130 | 4 | 22 | 0.181818 | 7 | 24 | 0.291667 | ... | 8 | 20 | 0.400000 | 9 | 18 | 0.500000 | 0 | ward | ward | ward |
9 | 10 | 21 | 58 | 0.362069 | 11 | 33 | 0.333333 | 10 | 25 | 0.400000 | ... | 9 | 23 | 0.391304 | 7 | 12 | 0.583333 | 0 | ward | ward | ward |
10 | 11 | 12 | 40 | 0.300000 | 5 | 22 | 0.227273 | 7 | 18 | 0.388889 | ... | 9 | 15 | 0.600000 | 4 | 11 | 0.363636 | 0 | ward | ward | ward |
11 | 12 | 13 | 47 | 0.276596 | 3 | 19 | 0.157895 | 10 | 28 | 0.357143 | ... | 1 | 12 | 0.083333 | 11 | 30 | 0.366667 | 0 | ward | kov | ward |
12 rows × 23 columns
kov_ward_bout['winner'] = 'split'
kov_ward_bout.loc[((kov_ward_bout.white=='kov') &
(kov_ward_bout.blue=='kov') &
(kov_ward_bout.pink=='kov')), 'winner' ] = 'kov'
kov_ward_bout.loc[((kov_ward_bout.white=='ward') &
(kov_ward_bout.blue=='ward') &
(kov_ward_bout.pink=='ward')), 'winner' ] = 'ward'
kov_ward_bout[['white', 'blue', 'pink', 'winner']]
 | white | blue | pink | winner
---|---|---|---|---
0 | kov | kov | kov | kov |
1 | kov | kov | kov | kov |
2 | kov | kov | ward | split |
3 | kov | kov | kov | kov |
4 | ward | ward | kov | split |
5 | kov | ward | kov | split |
6 | ward | ward | ward | ward |
7 | ward | ward | ward | ward |
8 | ward | ward | ward | ward |
9 | ward | ward | ward | ward |
10 | ward | ward | ward | ward |
11 | ward | kov | ward | split |
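As a sanity check on the winner column, the same label can be derived more compactly by asking whether all three judges' cards agree for a round. A rough sketch, just one of several ways to do it (judge_cards and alt_winner are scratch names); the comparison should come back True:
# judge_cards / alt_winner are scratch names for this sketch
judge_cards = kov_ward_bout[['white', 'blue', 'pink']]
alt_winner = judge_cards.apply(lambda r: r.iloc[0] if (r == r.iloc[0]).all() else 'split', axis=1)
(alt_winner == kov_ward_bout['winner']).all()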
exploratory vis analysis
nrows=2; ncols=3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16,10))
x = [-5, 50]; y = [-5, 50]
kov_ward_bout.plot.scatter('ward_jab_land', 'kov_jab_land', ax=axes[0,0])
axes[0,0].set_title('jabs')
axes[0,0].set_xlim([-1,12])
axes[0,0].set_ylim([-1,12])
kov_ward_bout.plot.scatter('ward_pow_land', 'kov_pow_land', ax=axes[0,1])
axes[0,1].set_title('power')
axes[0,1].set_xlim([-1,12])
axes[0,1].set_ylim([-1,12])
kov_ward_bout.plot.scatter('ward_pun_land', 'kov_pun_land', ax=axes[0,2])
axes[0,2].set_title('total')
axes[0,2].set_xlim([-1,24])
axes[0,2].set_ylim([-1,24])
kov_ward_bout.plot.scatter('ward_jab_perc', 'kov_jab_perc', ax=axes[1,0])
axes[1,0].set_xlim([-0.05,.65])
axes[1,0].set_ylim([-0.05,.65])
kov_ward_bout.plot.scatter('ward_pow_perc', 'kov_pow_perc', ax=axes[1,1])
axes[1,1].set_xlim([-0.05,.65])
axes[1,1].set_ylim([-0.05,.65])
kov_ward_bout.plot.scatter('ward_pun_perc', 'kov_pun_perc', ax=axes[1,2])
axes[1,2].set_xlim([-0.05,.65])
axes[1,2].set_ylim([-0.05,.65])
axes[1,2].text(0.6, 0., 'endlesspint.com',
fontsize=12, color='gray',
ha='right', va='bottom', alpha=0.3)
for row in range(nrows):
    for col in range(ncols):
        axes[row, col].set_aspect('equal')
        axes[row, col].plot(x, y, '--')
        axes[row, col].grid(False, which='both')
### save file locally with high resolution
# plt.savefig('img/fight_hour_tweets.PNG', dpi=1200)
same as above but with dots colored by round winner/split
nrows=2; ncols=3
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16,10))
x_line = [-5, 50]; y_line = [-5, 50]
kov_ward_bout[kov_ward_bout.winner=='kov'].plot.scatter('ward_jab_land', 'kov_jab_land', ax=axes[0,0], c='r', s=50)
kov_ward_bout[kov_ward_bout.winner=='ward'].plot.scatter('ward_jab_land', 'kov_jab_land', ax=axes[0,0], c='b', s=50)
kov_ward_bout[kov_ward_bout.winner=='split'].plot.scatter('ward_jab_land', 'kov_jab_land', ax=axes[0,0], c='g', s=50)
axes[0,0].set_title('jabs')
axes[0,0].set_xlim([-1,12]); axes[0,0].set_ylim([-1,12])
axes[0,0].set_xlabel(''); axes[0,0].set_ylabel('Kovalev')
kov_ward_bout[kov_ward_bout.winner=='kov'].plot.scatter('ward_pow_land', 'kov_pow_land', ax=axes[0,1], c='r', s=50)
kov_ward_bout[kov_ward_bout.winner=='ward'].plot.scatter('ward_pow_land', 'kov_pow_land', ax=axes[0,1], c='b', s=50)
kov_ward_bout[kov_ward_bout.winner=='split'].plot.scatter('ward_pow_land', 'kov_pow_land', ax=axes[0,1], c='g', s=50)
axes[0,1].set_title('power punches')
axes[0,1].set_xlim([-1,12]); axes[0,1].set_ylim([-1,12])
axes[0,1].set_xlabel(''); axes[0,1].set_ylabel('')
kov_ward_bout[kov_ward_bout.winner=='kov'].plot.scatter('ward_pun_land', 'kov_pun_land', ax=axes[0,2], c='r', s=50)
kov_ward_bout[kov_ward_bout.winner=='ward'].plot.scatter('ward_pun_land', 'kov_pun_land', ax=axes[0,2], c='b', s=50)
kov_ward_bout[kov_ward_bout.winner=='split'].plot.scatter('ward_pun_land', 'kov_pun_land', ax=axes[0,2], c='g', s=50)
axes[0,2].set_title('total')
axes[0,2].set_xlim([-1,24]); axes[0,2].set_ylim([-1,24])
axes[0,2].set_xlabel(''); axes[0,2].set_ylabel('')
kov_ward_bout[kov_ward_bout.winner=='kov'].plot.scatter('ward_jab_perc', 'kov_jab_perc', ax=axes[1,0], c='r', s=50)
kov_ward_bout[kov_ward_bout.winner=='ward'].plot.scatter('ward_jab_perc', 'kov_jab_perc', ax=axes[1,0], c='b', s=50)
kov_ward_bout[kov_ward_bout.winner=='split'].plot.scatter('ward_jab_perc', 'kov_jab_perc', ax=axes[1,0], c='g', s=50)
axes[1,0].set_xlim([-0.05,.65]);axes[1,0].set_ylim([-0.05,.65])
axes[1,0].set_xlabel('Ward'); axes[1,0].set_ylabel('Kovalev')
kov_ward_bout[kov_ward_bout.winner=='kov'].plot.scatter('ward_pow_perc', 'kov_pow_perc', ax=axes[1,1], c='r', s=50)
kov_ward_bout[kov_ward_bout.winner=='ward'].plot.scatter('ward_pow_perc', 'kov_pow_perc', ax=axes[1,1], c='b', s=50)
kov_ward_bout[kov_ward_bout.winner=='split'].plot.scatter('ward_pow_perc', 'kov_pow_perc', ax=axes[1,1], c='g', s=50)
axes[1,1].set_xlim([-0.05,.65]); axes[1,1].set_ylim([-0.05,.65])
axes[1,1].set_xlabel('Ward'); axes[1,1].set_ylabel('')
kov_ward_bout[kov_ward_bout.winner=='kov'].plot.scatter('ward_pun_perc', 'kov_pun_perc', ax=axes[1,2], c='r', s=50)
kov_ward_bout[kov_ward_bout.winner=='ward'].plot.scatter('ward_pun_perc', 'kov_pun_perc', ax=axes[1,2], c='b', s=50)
kov_ward_bout[kov_ward_bout.winner=='split'].plot.scatter('ward_pun_perc', 'kov_pun_perc', ax=axes[1,2], c='g', s=50)
axes[1,2].set_xlim([-0.05,.65]); axes[1,2].set_ylim([-0.05,.65])
axes[1,2].set_xlabel('Ward'); axes[1,2].set_ylabel('')
axes[1,2].text(0.6, 0., 'endlesspint.com',
fontsize=12, color='gray',
ha='right', va='bottom', alpha=0.3)
for row in range(nrows):
    for col in range(ncols):
        axes[row, col].set_aspect('equal')
        axes[row, col].plot(x_line, y_line, 'c--')
plt.tight_layout(h_pad=1.0)
### save file locally with high resolution
# plt.savefig('img/fight_hour_tweets.PNG', dpi=1200)
notes
- calculate the perpendicular distance to the 45-degree line for each fighter to get an idea of “distance from parity” (see the sketch after these notes)
- Kov out-landed Ward in power shots in 7 rounds; two of those dots/rounds (4 & 6) fall in the same location, (3, 6)
- Ward was the better marksman in every round except the one in which he was floored
- Round 10 is a curious one:
  - Twitter saw it for Kov, the judges for Ward
  - Kov landed his most jabs and total punches of the fight in that round, while also throwing his most
  - his 10 power shots landed tied round 12 for his second-best output of the fight
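A minimal sketch of the “distance from parity” idea from the first note: the signed perpendicular distance from each round's (Ward, Kov) point to the 45-degree line, shown here for total punches landed (positive values favor Kovalev; pun_land_parity is just a scratch column name):
# pun_land_parity is a scratch column, not part of the original sheet;
# distance from (x0, y0) to the line y = x is (y0 - x0) / sqrt(2), signed here so Kov > 0
kov_ward_bout['pun_land_parity'] = (kov_ward_bout.kov_pun_land - kov_ward_bout.ward_pun_land) / np.sqrt(2)
kov_ward_bout[['round', 'pun_land_parity', 'winner']]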
kov_ward_bout.kov_pun_perc > kov_ward_bout.ward_pun_perc
0 False
1 True
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
dtype: bool
ax = kov_ward_bout.plot.scatter(x='kov_jab_land', y='kov_pow_land', color='DarkBlue', label='Kov');
kov_ward_bout.plot.scatter(x='ward_jab_land', y='ward_pow_land', color='DarkGreen', label='Ward', title='Nom', ax=ax);
plt.figure(figsize=(8,8))
plt.scatter(x=kov_ward_bout.kov_jab_land, y=kov_ward_bout.kov_pow_land, c='r')
plt.scatter(x=kov_ward_bout.ward_jab_land, y=kov_ward_bout.ward_pow_land)
plt.plot(x_line,y_line,'--')
plt.xlim((-1,12))
plt.ylim((-1,12))
plt.axes().set_aspect('equal')
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2, figsize=(15,15))
Kov1 = ax1.scatter(x=kov_ward_bout[kov_ward_bout['round']<=6]['kov_jab_land'],
y=kov_ward_bout[kov_ward_bout['round']<=6]['kov_pow_land'],
c='r', s=50, alpha=.7)
Kov2 = ax1.scatter(x=kov_ward_bout[kov_ward_bout['round']>6]['kov_jab_land'],
y=kov_ward_bout[kov_ward_bout['round']>6]['kov_pow_land'],
c='r', marker='s', s=50, alpha=.7)
Ward1 = ax1.scatter(x=kov_ward_bout[kov_ward_bout['round']<=6]['ward_jab_land'],
y=kov_ward_bout[kov_ward_bout['round']<=6]['ward_pow_land'],
c='b', s=50, alpha=.7)
Ward2 = ax1.scatter(x=kov_ward_bout[kov_ward_bout['round']>6]['ward_jab_land'],
y=kov_ward_bout[kov_ward_bout['round']>6]['ward_pow_land'],
c='b', marker='s', s=50, alpha=.7)
ax1.set_xlim((-1,12))
ax1.set_ylim((-1,12))
ax1.set_xlabel('jab count')
ax1.set_ylabel('power count')
ax2.scatter(x=kov_ward_bout[kov_ward_bout['round']<=6]['kov_jab_perc'],
y=kov_ward_bout[kov_ward_bout['round']<=6]['kov_pow_perc'],
c='r', s=50, alpha=.7)
ax2.scatter(x=kov_ward_bout[kov_ward_bout['round']>6]['kov_jab_perc'],
y=kov_ward_bout[kov_ward_bout['round']>6]['kov_pow_perc'],
c='r', marker='s', s=50, alpha=.7)
ax2.scatter(x=kov_ward_bout[kov_ward_bout['round']<=6]['ward_jab_perc'],
y=kov_ward_bout[kov_ward_bout['round']<=6]['ward_pow_perc'],
c='b', s=50, alpha=.7)
ax2.scatter(x=kov_ward_bout[kov_ward_bout['round']>6]['ward_jab_perc'],
y=kov_ward_bout[kov_ward_bout['round']>6]['ward_pow_perc'],
c='b', marker='s', s=50, alpha=.7)
ax2.set_xlim((-0.05,.65))
ax2.set_ylim((-0.05,.65))
ax2.set_xlabel('jab percent')
ax2.set_ylabel('power percent')
ax3.scatter(x=kov_ward_bout[kov_ward_bout.winner=='kov']['kov_jab_land'],
y=kov_ward_bout[kov_ward_bout.winner=='kov']['kov_pow_land'], c='r', marker='o', s=80, alpha=.7)
ax3.scatter(x=kov_ward_bout[kov_ward_bout.winner=='split']['kov_jab_land'],
y=kov_ward_bout[kov_ward_bout.winner=='split']['kov_pow_land'], c='g', marker='o', s=50, alpha=.7)
ax3.scatter(x=kov_ward_bout[kov_ward_bout.winner=='ward']['kov_jab_land'],
y=kov_ward_bout[kov_ward_bout.winner=='ward']['kov_pow_land'], c='r', marker='x', s=50, alpha=.7)
ax3.scatter(x=kov_ward_bout[kov_ward_bout.winner=='ward']['ward_jab_land'],
y=kov_ward_bout[kov_ward_bout.winner=='ward']['ward_pow_land'], c='b', marker='s', s=80, alpha=.7)
ax3.scatter(x=kov_ward_bout[kov_ward_bout.winner=='split']['ward_jab_land'],
y=kov_ward_bout[kov_ward_bout.winner=='split']['ward_pow_land'], c='g', marker='s', s=50, alpha=.7)
ax3.scatter(x=kov_ward_bout[kov_ward_bout.winner=='kov']['ward_jab_land'],
y=kov_ward_bout[kov_ward_bout.winner=='kov']['ward_pow_land'], c='b', marker='x', s=50, alpha=.7)
ax3.set_xlim((-1,12))
ax3.set_ylim((-1,12))
ax3.set_xlabel('jab count')
ax3.set_ylabel('power count')
# ax4.scatter(x=kov_ward_bout.kov_jab_perc, y=kov_ward_bout.kov_pow_perc, c='r', marker='D', s=50, alpha=.7)
# ax4.scatter(x=kov_ward_bout.ward_jab_perc, y=kov_ward_bout.ward_pow_perc, c='b', marker='s', s=50, alpha=.7)
ax4.scatter(x=kov_ward_bout[kov_ward_bout.winner=='kov']['kov_jab_perc'],
y=kov_ward_bout[kov_ward_bout.winner=='kov']['kov_pow_perc'], c='r', marker='o', s=80, alpha=.7)
ax4.scatter(x=kov_ward_bout[kov_ward_bout.winner=='split']['kov_jab_perc'],
y=kov_ward_bout[kov_ward_bout.winner=='split']['kov_pow_perc'], c='g', marker='o', s=50, alpha=.7)
ax4.scatter(x=kov_ward_bout[kov_ward_bout.winner=='ward']['kov_jab_perc'],
y=kov_ward_bout[kov_ward_bout.winner=='ward']['kov_pow_perc'], c='r', marker='x', s=50, alpha=.7)
ax4.scatter(x=kov_ward_bout[kov_ward_bout.winner=='ward']['ward_jab_perc'],
y=kov_ward_bout[kov_ward_bout.winner=='ward']['ward_pow_perc'], c='b', marker='s', s=80, alpha=.7)
ax4.scatter(x=kov_ward_bout[kov_ward_bout.winner=='split']['ward_jab_perc'],
y=kov_ward_bout[kov_ward_bout.winner=='split']['ward_pow_perc'], c='g', marker='s', s=50, alpha=.7)
ax4.scatter(x=kov_ward_bout[kov_ward_bout.winner=='kov']['ward_jab_perc'],
y=kov_ward_bout[kov_ward_bout.winner=='kov']['ward_pow_perc'], c='b', marker='x', s=50, alpha=.7)
ax4.set_xlim((-0.05,.65))
ax4.set_ylim((-0.05,.65))
ax4.set_xlabel('jab percent')
ax4.set_ylabel('power percent')
ax4.text(0.6, 0., 'endlesspint.com',
fontsize=12, color='gray',
ha='right', va='bottom', alpha=0.3)
for ax in [ax1, ax2, ax3, ax4]:
    ax.plot(x_line, y_line, 'c--')
    ax.set_aspect('equal')
f.legend((Kov1, Ward1, Kov2, Ward2),
('Kov Rd 1-6', 'Ward Rd 1-6', 'Kov Rd 7-12', 'Ward Rd 7-12'), 'right')
### save file locally with high resolution
# plt.savefig('img/fight_hour_tweets.PNG', dpi=1200)
<matplotlib.legend.Legend at 0xa634160>
notes
- there are two Kov rounds (4 & 6) with (nearly) the same stats but different results (1 unanimous win, 1 split)
- every unanimous Ward round had him connecting at 50% or better in at least one punch category (quick check below)
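A quick check of that second note against the wide table (a sketch; ward_pun_perc is one of the columns hidden behind the '...' in the display above):
# ward_percs is a scratch frame for this check
ward_percs = kov_ward_bout[['ward_jab_perc', 'ward_pow_perc', 'ward_pun_perc']]
(ward_percs[kov_ward_bout.winner == 'ward'] >= 0.5).any(axis=1)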
ax = kov_ward_bout.plot.scatter(x='kov_jab_perc', y='kov_pow_perc', color='Red', label='Kov');
kov_ward_bout.plot.scatter(x='ward_jab_perc', y='ward_pow_perc', color='Blue', label='Ward', title='Per', ax=ax);
round stats and scoring (long table)
kov_ward_bout.columns
Index([ u'round', u'kov_pun_land', u'kov_pun_thrw', u'kov_pun_perc',
u'kov_jab_land', u'kov_jab_thrw', u'kov_jab_perc', u'kov_pow_land',
u'kov_pow_thrw', u'kov_pow_perc', u'ward_pun_land', u'ward_pun_thrw',
u'ward_pun_perc', u'ward_jab_land', u'ward_jab_thrw', u'ward_jab_perc',
u'ward_pow_land', u'ward_pow_thrw', u'ward_pow_perc', u'kov_kdwns',
u'white', u'blue', u'pink', u'winner'],
dtype='object')
cols_non_perc = ['kov_pun_land', 'kov_pun_thrw', 'kov_jab_land', 'kov_jab_thrw', 'kov_pow_land', 'kov_pow_thrw',
'ward_pun_land', 'ward_pun_thrw', 'ward_jab_land', 'ward_jab_thrw', 'ward_pow_land', 'ward_pow_thrw',
'kov_kdwns']
kov_ward_bout[cols_non_perc].head()
 | kov_pun_land | kov_pun_thrw | kov_jab_land | kov_jab_thrw | kov_pow_land | kov_pow_thrw | ward_pun_land | ward_pun_thrw | ward_jab_land | ward_jab_thrw | ward_pow_land | ward_pow_thrw | kov_kdwns
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 7 | 34 | 4 | 22 | 3 | 12 | 5 | 20 | 4 | 13 | 1 | 7 | 0 |
1 | 16 | 49 | 5 | 23 | 11 | 26 | 3 | 16 | 3 | 12 | 0 | 4 | 1 |
2 | 4 | 27 | 2 | 15 | 2 | 12 | 5 | 22 | 2 | 12 | 3 | 10 | 0 |
3 | 9 | 37 | 3 | 22 | 6 | 15 | 7 | 25 | 4 | 14 | 3 | 11 | 0 |
4 | 7 | 33 | 3 | 17 | 4 | 16 | 8 | 24 | 3 | 10 | 5 | 14 | 0 |
tup_non_perc = ('kov_pun_land', 'kov_pun_thrw', 'kov_jab_land', 'kov_jab_thrw', 'kov_pow_land', 'kov_pow_thrw',
'ward_pun_land', 'ward_pun_thrw', 'ward_jab_land', 'ward_jab_thrw', 'ward_pow_land', 'ward_pow_thrw',
'kov_kdwns')
judges = ['white', 'blue', 'pink']
kov_ward_long = pd.DataFrame()
for judge in judges:
    temp_df = pd.melt(kov_ward_bout, id_vars=tup_non_perc, value_vars=[judge])
    kov_ward_long = kov_ward_long.append(temp_df)
kov_ward_long.reset_index(drop=True, inplace=True)
kov_ward_long.drop('variable', axis=1, inplace=True)
print(kov_ward_long.shape)
kov_ward_long.head(12)
(36, 14)
 | kov_pun_land | kov_pun_thrw | kov_jab_land | kov_jab_thrw | kov_pow_land | kov_pow_thrw | ward_pun_land | ward_pun_thrw | ward_jab_land | ward_jab_thrw | ward_pow_land | ward_pow_thrw | kov_kdwns | value
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 7 | 34 | 4 | 22 | 3 | 12 | 5 | 20 | 4 | 13 | 1 | 7 | 0 | kov |
1 | 16 | 49 | 5 | 23 | 11 | 26 | 3 | 16 | 3 | 12 | 0 | 4 | 1 | kov |
2 | 4 | 27 | 2 | 15 | 2 | 12 | 5 | 22 | 2 | 12 | 3 | 10 | 0 | kov |
3 | 9 | 37 | 3 | 22 | 6 | 15 | 7 | 25 | 4 | 14 | 3 | 11 | 0 | kov |
4 | 7 | 33 | 3 | 17 | 4 | 16 | 8 | 24 | 3 | 10 | 5 | 14 | 0 | ward |
5 | 9 | 36 | 3 | 17 | 6 | 19 | 8 | 25 | 5 | 13 | 3 | 12 | 0 | kov |
6 | 7 | 29 | 1 | 10 | 6 | 19 | 11 | 32 | 6 | 12 | 5 | 20 | 0 | ward |
7 | 10 | 38 | 4 | 20 | 6 | 18 | 11 | 32 | 1 | 12 | 10 | 20 | 0 | ward |
8 | 11 | 46 | 4 | 22 | 7 | 24 | 17 | 38 | 8 | 20 | 9 | 18 | 0 | ward |
9 | 21 | 58 | 11 | 33 | 10 | 25 | 16 | 35 | 9 | 23 | 7 | 12 | 0 | ward |
10 | 12 | 40 | 5 | 22 | 7 | 18 | 13 | 26 | 9 | 15 | 4 | 11 | 0 | ward |
11 | 13 | 47 | 3 | 19 | 10 | 28 | 12 | 42 | 1 | 12 | 11 | 30 | 0 | ward |
sklearn ensemble techniques
source: http://scikit-learn.org/stable/modules/ensemble.html
source: http://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/
Bagging
from sklearn import cross_validation  # note: newer scikit-learn moved this functionality to sklearn.model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
array = kov_ward_long.values
X = array[:,0:13]
y = array[:,13]
num_folds = 10
num_instances = len(X)
seed = 7
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
cart = DecisionTreeClassifier()
num_trees = 100
clf_bgg = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)
results = cross_validation.cross_val_score(clf_bgg, X, y, cv=kfold)
print(results.mean())
0.858333333333
clf_bgg.fit(X, y)
BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best'),
bootstrap=True, bootstrap_features=False, max_features=1.0,
max_samples=1.0, n_estimators=100, n_jobs=1, oob_score=False,
random_state=7, verbose=0, warm_start=False)
print(clf_bgg.predict(X[:12]))
print(clf_bgg.predict_proba(X[:12]))
[u'kov' u'kov' u'kov' u'kov' u'ward' u'kov' u'ward' u'ward' u'ward' u'ward'
u'ward' u'ward']
[[ 0.998 0.002 ]
[ 0.99404762 0.00595238]
[ 0.74559524 0.25440476]
[ 0.98216667 0.01783333]
[ 0.38943651 0.61056349]
[ 0.66216667 0.33783333]
[ 0.0075 0.9925 ]
[ 0.015 0.985 ]
[ 0. 1. ]
[ 0.00833333 0.99166667]
[ 0.03 0.97 ]
[ 0.28569048 0.71430952]]
pd.DataFrame(np.hstack((np.reshape(clf_bgg.predict(X[:12]), (12,1)), clf_bgg.predict_proba(X[:12]))),
columns=['winner', 'prob_kov', 'prob_ward'])
 | winner | prob_kov | prob_ward
---|---|---|---
0 | kov | 0.998 | 0.002 |
1 | kov | 0.994048 | 0.00595238 |
2 | kov | 0.745595 | 0.254405 |
3 | kov | 0.982167 | 0.0178333 |
4 | ward | 0.389437 | 0.610563 |
5 | kov | 0.662167 | 0.337833 |
6 | ward | 0.0075 | 0.9925 |
7 | ward | 0.015 | 0.985 |
8 | ward | 0 | 1 |
9 | ward | 0.00833333 | 0.991667 |
10 | ward | 0.03 | 0.97 |
11 | ward | 0.28569 | 0.71431 |
Random Forest
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
array = kov_ward_long.values
X = array[:,0:13]
y = array[:,13]
num_folds = 10
num_instances = len(X)
seed = 7
num_trees = 100
max_features = 3
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
clf_RF = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
results = cross_validation.cross_val_score(clf_RF, X, y, cv=kfold)
print(results.mean())
0.8
clf_RF.fit(X,y)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features=3, max_leaf_nodes=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
print(clf_RF.predict(X[:12]))
print(clf_RF.predict_proba(X[:12]))
[u'kov' u'kov' u'kov' u'kov' u'ward' u'kov' u'ward' u'ward' u'ward' u'ward'
u'ward' u'ward']
[[ 0.98933333 0.01066667]
[ 0.97083333 0.02916667]
[ 0.6694881 0.3305119 ]
[ 0.99333333 0.00666667]
[ 0.35454762 0.64545238]
[ 0.70514286 0.29485714]
[ 0. 1. ]
[ 0.00991667 0.99008333]
[ 0. 1. ]
[ 0.00333333 0.99666667]
[ 0.035 0.965 ]
[ 0.2872619 0.7127381 ]]
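For a quick side-by-side of the two ensembles, a sketch collecting each model's Kovalev probability per round (in-sample, since both were fit on the full long table; the column names below are just for this sketch):
# bagging_prob_kov / rf_prob_kov are scratch column names for this sketch
pd.DataFrame({'round': kov_ward_bout['round'].values,
              'bagging_prob_kov': clf_bgg.predict_proba(X[:12])[:, 0],
              'rf_prob_kov': clf_RF.predict_proba(X[:12])[:, 0]})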
print(kov_ward_long.columns[:13])
print(np.sum(clf_RF.feature_importances_))
clf_RF.feature_importances_
Index([u'kov_pun_land', u'kov_pun_thrw', u'kov_jab_land', u'kov_jab_thrw',
u'kov_pow_land', u'kov_pow_thrw', u'ward_pun_land', u'ward_pun_thrw',
u'ward_jab_land', u'ward_jab_thrw', u'ward_pow_land', u'ward_pow_thrw',
u'kov_kdwns'],
dtype='object')
1.0
array([ 0.03661623, 0.03374295, 0.00706595, 0.02572995, 0.03433428,
0.06633053, 0.19602896, 0.14064 , 0.07438218, 0.03191946,
0.27581351, 0.07112938, 0.00626662])
rf_feat_imp = pd.DataFrame(zip(kov_ward_long.columns[:13], clf_RF.feature_importances_), columns=['feature', 'importance'])
rf_feat_imp.sort_values('importance', ascending=False, inplace=True)
rf_feat_imp
 | feature | importance
---|---|---
10 | ward_pow_land | 0.275814 |
6 | ward_pun_land | 0.196029 |
7 | ward_pun_thrw | 0.140640 |
8 | ward_jab_land | 0.074382 |
11 | ward_pow_thrw | 0.071129 |
5 | kov_pow_thrw | 0.066331 |
0 | kov_pun_land | 0.036616 |
4 | kov_pow_land | 0.034334 |
1 | kov_pun_thrw | 0.033743 |
9 | ward_jab_thrw | 0.031919 |
3 | kov_jab_thrw | 0.025730 |
2 | kov_jab_land | 0.007066 |
12 | kov_kdwns | 0.006267 |
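The same importances as a quick horizontal bar chart (a sketch off the sorted frame above):
rf_feat_imp.plot.barh(x='feature', y='importance', legend=False, figsize=(8, 6))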
how about that round 10? maybe one more go
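Before the next model, a sketch pulling round 10 back out of the two fitted ensembles above; both lean heavily toward Ward:
# rd10 is a scratch variable (round 10 = row index 9 of X)
rd10 = X[9].reshape(1, -1)
print(clf_bgg.predict_proba(rd10))
print(clf_RF.predict_proba(rd10))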
kNN
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
array = kov_ward_long.values
X = array[:,0:13]
y = array[:,13]
num_folds = 10
num_instances = len(X)
seed = 7
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
results = cross_validation.cross_val_score(bagging, X, y, cv=kfold)
print(results.mean())
0.666666666667
bagging.fit(X, y)
print(bagging.predict(X[:12]))
print(bagging.predict_proba(X[:12]))
[u'kov' u'kov' u'kov' u'kov' u'kov' u'kov' u'ward' u'ward' u'ward' u'ward'
u'ward' u'ward']
[[ 0.86 0.14]
[ 0.56 0.44]
[ 0.62 0.38]
[ 0.7 0.3 ]
[ 0.68 0.32]
[ 0.64 0.36]
[ 0.3 0.7 ]
[ 0.26 0.74]
[ 0.14 0.86]
[ 0.18 0.82]
[ 0.36 0.64]
[ 0.2 0.8 ]]
zip(bagging.predict(X[:12]), bagging.predict_proba(X[:12]))
[(u'kov', array([ 0.86, 0.14])),
(u'kov', array([ 0.56, 0.44])),
(u'kov', array([ 0.62, 0.38])),
(u'kov', array([ 0.7, 0.3])),
(u'kov', array([ 0.68, 0.32])),
(u'kov', array([ 0.64, 0.36])),
(u'ward', array([ 0.3, 0.7])),
(u'ward', array([ 0.26, 0.74])),
(u'ward', array([ 0.14, 0.86])),
(u'ward', array([ 0.18, 0.82])),
(u'ward', array([ 0.36, 0.64])),
(u'ward', array([ 0.2, 0.8]))]
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
array = kov_ward_long.values
X = array[:,0:13]
y = array[:,13]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)
# check classification accuracy of KNN with K=5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
0.777777777778
from sklearn.cross_validation import cross_val_score
# 10-fold cross-validation with K=5 for KNN (the n_neighbors parameter)
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(scores)
[ 0.6 0.75 0.75 0.25 0.75 1.
0.33333333 1. 0.66666667 0.66666667]
# use average accuracy as an estimate of out-of-sample accuracy
print(scores.mean())
0.676666666667
# search for an optimal value of K for KNN
k_range = range(1, 31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())
# print k_scores
# plot the value of K for KNN (x-axis) versus the cross-validated accuracy (y-axis)
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
<matplotlib.text.Text at 0xbeb65c0>
# 10-fold cross-validation with the best KNN model
knn = KNeighborsClassifier(n_neighbors=13)
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())
0.871666666667
knn.fit(X, y)
zip(knn.predict(X[:12]), knn.predict_proba(X[:12]))
[(u'kov', array([ 0.69230769, 0.30769231])),
(u'kov', array([ 0.69230769, 0.30769231])),
(u'kov', array([ 0.69230769, 0.30769231])),
(u'kov', array([ 0.69230769, 0.30769231])),
(u'kov', array([ 0.69230769, 0.30769231])),
(u'kov', array([ 0.53846154, 0.46153846])),
(u'ward', array([ 0.30769231, 0.69230769])),
(u'ward', array([ 0.46153846, 0.53846154])),
(u'ward', array([ 0.07692308, 0.92307692])),
(u'ward', array([ 0.15384615, 0.84615385])),
(u'ward', array([ 0.46153846, 0.53846154])),
(u'ward', array([ 0.07692308, 0.92307692]))]
# avg predicted Kov probability over rounds 1-6
np.mean(knn.predict_proba(X[:12])[:6,0])
0.66666666666666663
# avg predicted Ward probability over rounds 7-12
np.mean(knn.predict_proba(X[:12])[6:,1])
0.74358974358974361
… and KMeans (why not?)
from sklearn.cluster import KMeans
# identify the twelve rounds as 'X'
X_12rds = X[:12,]
X_12rds
array([[7L, 34L, 4L, 22L, 3L, 12L, 5L, 20L, 4L, 13L, 1L, 7L, 0L],
[16L, 49L, 5L, 23L, 11L, 26L, 3L, 16L, 3L, 12L, 0L, 4L, 1L],
[4L, 27L, 2L, 15L, 2L, 12L, 5L, 22L, 2L, 12L, 3L, 10L, 0L],
[9L, 37L, 3L, 22L, 6L, 15L, 7L, 25L, 4L, 14L, 3L, 11L, 0L],
[7L, 33L, 3L, 17L, 4L, 16L, 8L, 24L, 3L, 10L, 5L, 14L, 0L],
[9L, 36L, 3L, 17L, 6L, 19L, 8L, 25L, 5L, 13L, 3L, 12L, 0L],
[7L, 29L, 1L, 10L, 6L, 19L, 11L, 32L, 6L, 12L, 5L, 20L, 0L],
[10L, 38L, 4L, 20L, 6L, 18L, 11L, 32L, 1L, 12L, 10L, 20L, 0L],
[11L, 46L, 4L, 22L, 7L, 24L, 17L, 38L, 8L, 20L, 9L, 18L, 0L],
[21L, 58L, 11L, 33L, 10L, 25L, 16L, 35L, 9L, 23L, 7L, 12L, 0L],
[12L, 40L, 5L, 22L, 7L, 18L, 13L, 26L, 9L, 15L, 4L, 11L, 0L],
[13L, 47L, 3L, 19L, 10L, 28L, 12L, 42L, 1L, 12L, 11L, 30L, 0L]], dtype=object)
# identify unanimous rounds for training
X_unanm = X_12rds[np.array(kov_ward_bout['winner']!='split')]
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_unanm)
kmeans.labels_
array([0, 0, 0, 0, 0, 1, 1, 0])
# predict on split rounds
X_split = X_12rds[np.array(kov_ward_bout['winner']=='split')]
kmeans.predict(X_split)
array([0, 0, 0, 1])
kmeans.cluster_centers_
array([[ 10.16666667, 37.83333333, 3.66666667, 19.83333333,
6.5 , 18. , 8.33333333, 25.16666667,
4.5 , 13. , 3.83333333, 12.16666667,
0.16666667],
[ 16. , 52. , 7.5 , 27.5 ,
8.5 , 24.5 , 16.5 , 36.5 ,
8.5 , 21.5 , 8. , 15. , 0. ]])
maybe too many features, let’s go with the three most important as identified by RF
X_impFeat = X[:12,[6,7,10]]
X_impFeat
array([[5L, 20L, 1L],
[3L, 16L, 0L],
[5L, 22L, 3L],
[7L, 25L, 3L],
[8L, 24L, 5L],
[8L, 25L, 3L],
[11L, 32L, 5L],
[11L, 32L, 10L],
[17L, 38L, 9L],
[16L, 35L, 7L],
[13L, 26L, 4L],
[12L, 42L, 11L]], dtype=object)
# identify unanimous rounds for training
X_unanm = X_impFeat[np.array(kov_ward_bout['winner']!='split')]
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_unanm)
kmeans.labels_
array([1, 1, 1, 0, 0, 0, 0, 0])
# predict on split rounds
X_split = X_impFeat[np.array(kov_ward_bout['winner']=='split')]
kmeans.predict(X_split)
array([1, 1, 1, 0])
# just run on all 12 at once
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_impFeat)
kmeans.labels_
array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1])
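A quick side-by-side of those all-twelve-round labels against the judges (a sketch; kmeans_label is just a scratch column name):
# kmeans_label is a scratch column name for this sketch
pd.DataFrame({'round': kov_ward_bout['round'].values,
              'kmeans_label': kmeans.labels_,
              'winner': kov_ward_bout['winner'].values})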
source: http://matplotlib.org/mpl_toolkits/mplot3d/tutorial.html; http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html#sphx-glr-auto-examples-cluster-plot-cluster-iris-py
print(plt.style.available)
[u'seaborn-darkgrid', u'seaborn-notebook', u'classic', u'seaborn-ticks', u'grayscale', u'bmh', u'seaborn-talk', u'dark_background', u'ggplot', u'fivethirtyeight', u'seaborn-colorblind', u'seaborn-deep', u'seaborn-whitegrid', u'seaborn-bright', u'seaborn-poster', u'seaborn-muted', u'seaborn-paper', u'seaborn-white', u'seaborn-pastel', u'seaborn-dark', u'seaborn-dark-palette']
X_impFeat[np.array(kov_ward_bout['winner']=='split'),0]
array([5L, 8L, 8L, 12L], dtype=object)
for i in zip(['kov', 'ward', 'split'], ['red', 'blue', 'green']):
    print(i[0])
kov
ward
split
from mpl_toolkits.mplot3d import Axes3D
plt.style.use(u'seaborn-colorblind')
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d')
# ADD COLOR by round win/split
# ax.scatter(X_impFeat[:,0], X_impFeat[:,1], X_impFeat[:,2])
for i in zip(['kov', 'ward', 'split'], ['red', 'blue', 'green']):
    ax.scatter(X_impFeat[np.array(kov_ward_bout['winner']==i[0]), 0],
               X_impFeat[np.array(kov_ward_bout['winner']==i[0]), 1],
               X_impFeat[np.array(kov_ward_bout['winner']==i[0]), 2],
               c=i[1], s=50)
ax.set_xlabel('ward_pun_land')
ax.set_ylabel('ward_pun_thrw')
ax.set_zlabel('ward_pow_land')
ax.text(27, 8, 0, 'endlesspint.com',
fontsize=12, color='gray',
ha='right', va='bottom', rotation=180, alpha=0.3)
plt.show()
### save file locally with high resolution
# plt.savefig('img/fight_hour_tweets.PNG', dpi=1200)