import csv, os, json
import numpy as np
import pandas as pd
# Record the pandas version the notebook was run with; the single-argument
# call form behaves the same on Python 2 and Python 3.
print(pd.__version__)
from datetime import datetime
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline
0.18.0
pull tweets [ggg1]
ingest tweets [ggg2]
# Load the raw dump: the file is read line by line and each line is parsed
# as one JSON document.
tweets_data = []
with open('data/kov_ward_tweets.txt', 'r') as f:
    for line in f:
        try:
            tweet = json.loads(line)
        except ValueError:
            # json.loads reports malformed input with ValueError; skip lines
            # that are not valid JSON instead of swallowing every exception.
            continue
        tweets_data.append(tweet)
print(len(tweets_data))
74902
convert to DataFrame
# Build the tweet table, ordered by creation time, without undated rows or
# repeated tweet ids.
tweets_df = pd.DataFrame(tweets_data)
tweets_df['created_at'] = pd.to_datetime(tweets_df['created_at'])
# DataFrame.sort is deprecated in favour of sort_values (and removed in later
# pandas releases).
tweets_df.sort_values('created_at', inplace=True)
# .copy() so the in-place calls below operate on an independent frame rather
# than on a filtered view of the original.
tweets_df = tweets_df[tweets_df['created_at'].notnull()].copy()
# Sorted first, so the earliest row of each id_str is the one that is kept.
tweets_df.drop_duplicates(subset='id_str', inplace=True)
tweets_df.reset_index(drop=True, inplace=True)
print(tweets_df.dtypes)
print(tweets_df.shape)
tweets_df.tail()
contributors object
coordinates object
created_at datetime64[ns]
entities object
extended_entities object
favorite_count int64
favorited bool
geo object
id int64
id_str object
in_reply_to_screen_name object
in_reply_to_status_id float64
in_reply_to_status_id_str object
in_reply_to_user_id float64
in_reply_to_user_id_str object
is_quote_status bool
lang object
metadata object
place object
possibly_sensitive object
quoted_status object
quoted_status_id float64
quoted_status_id_str object
retweet_count int64
retweeted bool
retweeted_status object
source object
text object
truncated bool
user object
dtype: object
(74702, 30)
contributors | coordinates | created_at | entities | extended_entities | favorite_count | favorited | geo | id | id_str | ... | quoted_status | quoted_status_id | quoted_status_id_str | retweet_count | retweeted | retweeted_status | source | text | truncated | user | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
74697 | None | None | 2016-11-20 23:58:49 | {u'symbols': [], u'user_mentions': [{u'indices... | NaN | 3 | False | None | 800488602656444417 | 800488602656444417 | ... | NaN | NaN | NaN | 1 | False | NaN | <a href="http://twitter.com/download/iphone" r... | Say what you want about @andreward but make su... | False | {u'follow_request_sent': False, u'has_extended... |
74698 | None | None | 2016-11-20 23:59:14 | {u'symbols': [], u'user_mentions': [{u'indices... | NaN | 0 | False | None | 800488704703860736 | 800488704703860736 | ... | NaN | NaN | NaN | 1 | False | {u'contributors': None, u'truncated': False, u... | <a href="http://twitter.com/download/iphone" r... | RT @GarethADaviesDT: Jerry Izenberg tells me s... | False | {u'follow_request_sent': False, u'has_extended... |
74699 | None | None | 2016-11-20 23:59:31 | {u'symbols': [], u'user_mentions': [{u'indices... | {u'media': [{u'source_user_id': 239159456, u's... | 0 | False | None | 800488776401240064 | 800488776401240064 | ... | NaN | NaN | NaN | 5 | False | {u'contributors': None, u'truncated': False, u... | <a href="http://twitter.com/download/android" ... | RT @Shady__007: Was good to bump into the one ... | False | {u'follow_request_sent': False, u'has_extended... |
74700 | None | None | 2016-11-20 23:59:35 | {u'symbols': [], u'user_mentions': [{u'indices... | NaN | 0 | False | None | 800488794453639168 | 800488794453639168 | ... | NaN | NaN | NaN | 10 | False | {u'contributors': None, u'truncated': True, u'... | <a href="http://twitter.com" rel="nofollow">Tw... | RT @JCGBoxing: Basta de #KovalevWard por un ra... | False | {u'follow_request_sent': False, u'has_extended... |
74701 | None | None | 2016-11-20 23:59:43 | {u'symbols': [], u'user_mentions': [{u'indices... | NaN | 0 | False | None | 800488827278258177 | 800488827278258177 | ... | NaN | NaN | NaN | 3 | False | {u'contributors': None, u'truncated': False, u... | <a href="http://twitter.com/download/android" ... | RT @IzquierdazoBox: "Ward tiene un gran futuro... | False | {u'follow_request_sent': False, u'has_extended... |
5 rows × 30 columns
- plot tweets on a time series [ggg2]
- identify rounds & rest periods [ggg2]
- export rest-period tweets with regex-matched scores [ggg2]
# Time span covered by the collected tweets (earliest, then latest).
print(tweets_df['created_at'].min())
print(tweets_df['created_at'].max())
2016-11-16 00:41:50
2016-11-20 23:59:43
create new user_* DataFrame fields
# Keys to pull out of each tweet's nested `user` object.
user_fields = ['created_at', 'description', 'followers_count', 'friends_count', 'geo_enabled',
               'location', 'name', 'screen_name', 'statuses_count', 'time_zone', 'utc_offset']
# Matching DataFrame column names: the same keys with a 'user_' prefix.
user_fields_df = ['user_' + field for field in user_fields]
print(user_fields_df)
['user_created_at', 'user_description', 'user_followers_count', 'user_friends_count', 'user_geo_enabled', 'user_location', 'user_name', 'user_screen_name', 'user_statuses_count', 'user_time_zone', 'user_utc_offset']
# Add the (still empty) user_* columns.
# reindex is used instead of concatenating with an empty frame: the concat
# upcast existing columns (the dtypes listing afterwards shows `id` and the
# count columns as float64 and the boolean flags as object). float64 cannot
# represent 18-digit tweet ids exactly. reindex leaves existing columns and
# their dtypes untouched and only appends the missing ones, filled with NaN.
new_user_columns = [col for col in user_fields_df if col not in tweets_df.columns]
tweets_df = tweets_df.reindex(columns=list(tweets_df.columns) + new_user_columns)
tweets_df.columns
Index([ u'contributors', u'coordinates',
u'created_at', u'entities',
u'extended_entities', u'favorite_count',
u'favorited', u'geo',
u'id', u'id_str',
u'in_reply_to_screen_name', u'in_reply_to_status_id',
u'in_reply_to_status_id_str', u'in_reply_to_user_id',
u'in_reply_to_user_id_str', u'is_quote_status',
u'lang', u'metadata',
u'place', u'possibly_sensitive',
u'quoted_status', u'quoted_status_id',
u'quoted_status_id_str', u'rest_period',
u'retweet_count', u'retweeted',
u'retweeted_status', u'source',
u'text', u'truncated',
u'user', u'user_created_at',
u'user_description', u'user_followers_count',
u'user_friends_count', u'user_geo_enabled',
u'user_location', u'user_name',
u'user_screen_name', u'user_statuses_count',
u'user_time_zone', u'user_utc_offset'],
dtype='object')
def user_info(row, field):
    """Return ``row[field]``, or None when the lookup is not possible.

    Args:
        row: the object to index; in this notebook it is one value of the
            ``user`` column (a dict of user attributes, or a missing value).
        field: the key to look up in ``row``.

    Returns:
        The value stored under ``field``, or None if the key is absent or
        ``row`` does not support indexing (e.g. NaN / None).
    """
    try:
        return row[field]
    except (KeyError, IndexError, TypeError):
        # KeyError/IndexError: key not present; TypeError: row is not
        # subscriptable (NaN, None). Anything else is a real bug and should
        # surface instead of being silently swallowed.
        return None
# Fill each user_* column from the matching key of the nested `user` object.
# Descriptive loop names avoid clobbering `f` and `df`, which are used for
# other objects elsewhere in the notebook.
for source_key, target_column in zip(user_fields, user_fields_df):
    tweets_df[target_column] = tweets_df['user'].apply(user_info, field=source_key)
# The account creation time is extracted as raw values; convert to datetimes.
tweets_df['user_created_at'] = pd.to_datetime(tweets_df['user_created_at'])
print(tweets_df.dtypes)
tweets_df[user_fields_df].head()
contributors object
coordinates object
created_at datetime64[ns]
entities object
extended_entities object
favorite_count float64
favorited object
geo object
id float64
id_str object
in_reply_to_screen_name object
in_reply_to_status_id float64
in_reply_to_status_id_str object
in_reply_to_user_id float64
in_reply_to_user_id_str object
is_quote_status object
lang object
metadata object
place object
possibly_sensitive object
quoted_status object
quoted_status_id float64
quoted_status_id_str object
rest_period float64
retweet_count float64
retweeted object
retweeted_status object
source object
text object
truncated object
user object
user_created_at datetime64[ns]
user_description object
user_followers_count int64
user_friends_count int64
user_geo_enabled bool
user_location object
user_name object
user_screen_name object
user_statuses_count int64
user_time_zone object
user_utc_offset float64
dtype: object
user_created_at | user_description | user_followers_count | user_friends_count | user_geo_enabled | user_location | user_name | user_screen_name | user_statuses_count | user_time_zone | user_utc_offset | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-05-30 18:13:42 | 123 | 102 | True | Rico | Deef_djam | 9005 | London | 0.0 | ||
1 | 2011-05-30 18:13:42 | 123 | 102 | True | Rico | Deef_djam | 9005 | London | 0.0 | ||
2 | 2015-11-15 21:59:32 | amosc: derreberrygirl 🎓SEN17RS🤘🏽 | 150 | 130 | True | New Orleans, LA | Queen Derreberry | _derreberry | 2177 | None | NaN |
3 | 2012-09-12 23:56:32 | #IRONADDICT #MARTIALARTIST\n#BOXING \n@TOPCLAS... | 1960 | 959 | False | KINGSTON UPON HULL : ENGLAND | STEVE HAIGH | _SH70_ | 13022 | None | NaN |
4 | 2014-09-16 15:05:26 | I'm so mean I make medicine sick | 316 | 421 | True | George | georgehughes77 | 2487 | None | NaN |
create user
UTC offset in hours
- for another time, ignore for now (out of scope)
# Scale the raw offset down by 3600 to express it in hours
# (presumably the source value is in seconds -- confirm against the API docs),
# then show how many tweets fall under each offset.
tweets_df['user_utc_offset_hr'] = tweets_df['user_utc_offset'].div(3600.0)
tweets_df['user_utc_offset_hr'].value_counts()
-5.00 11602
-8.00 11190
-6.00 6058
0.00 6021
-7.00 2269
-4.00 2147
1.00 2067
-3.00 872
-10.00 668
-9.00 560
2.00 441
3.00 416
8.00 327
11.00 236
7.00 180
-2.00 165
-11.00 151
9.00 140
6.00 123
4.00 120
10.00 111
5.50 108
5.00 106
13.00 61
10.50 55
5.75 12
3.50 6
4.50 5
-3.50 4
9.50 2
6.50 1
-1.00 1
Name: user_utc_offset_hr, dtype: int64
time index
# Collect the raw created_at value of every parsed tweet that has one.
# The explicit guard replaces a bare `except: pass`: entries that are not
# dicts, or that lack the key, are skipped and nothing else is hidden.
dates_allTweets = [
    tweet['created_at']
    for tweet in tweets_data
    if isinstance(tweet, dict) and 'created_at' in tweet
]
len(dates_allTweets)
74902
# Use the cleaned, de-duplicated timestamps from the DataFrame.
dates_allTweets = tweets_df['created_at'].tolist()
# a list of 1s, one per tweet, so that summing a bucket counts its tweets
ones = [1] * len(dates_allTweets)
# the index of the series
idx = pd.DatetimeIndex(dates_allTweets)
# the actual series (a series of 1s for the moment)
tweets_KovWard = pd.Series(ones, index=idx)
# Resampling / bucketing into one-minute bins.
# resample(..., how='sum') is deprecated as of pandas 0.18 in favour of the
# method-chaining form; fillna(0) zeroes any bucket reported as NaN.
per_minute = tweets_KovWard.resample('1Min').sum().fillna(0)
# Plot the per-minute counts strictly between 04:30 and 06:00 on 2016-11-20.
after_window_start = per_minute.index > '2016-11-20 04:30:00'
before_window_end = per_minute.index < '2016-11-20 06:00:00'
per_minute[after_window_start & before_window_end].plot(figsize=(12,6), label='Tweets')
<matplotlib.axes._subplots.AxesSubplot at 0x8d25e48>
# Per-minute counts strictly between 04:44 and 05:34 on 2016-11-20.
print(per_minute[(per_minute.index > '2016-11-20 04:44:00') &
                 (per_minute.index < '2016-11-20 05:34:00')])
2016-11-20 04:45:00 296.0
2016-11-20 04:46:00 392.0
2016-11-20 04:47:00 195.0
2016-11-20 04:48:00 219.0
2016-11-20 04:49:00 607.0
2016-11-20 04:50:00 825.0
2016-11-20 04:51:00 472.0
2016-11-20 04:52:00 273.0
2016-11-20 04:53:00 358.0
2016-11-20 04:54:00 548.0
2016-11-20 04:55:00 283.0
2016-11-20 04:56:00 168.0
2016-11-20 04:57:00 224.0
2016-11-20 04:58:00 428.0
2016-11-20 04:59:00 237.0
2016-11-20 05:00:00 188.0
2016-11-20 05:01:00 241.0
2016-11-20 05:02:00 463.0
2016-11-20 05:03:00 247.0
2016-11-20 05:04:00 154.0
2016-11-20 05:05:00 280.0
2016-11-20 05:06:00 421.0
2016-11-20 05:07:00 257.0
2016-11-20 05:08:00 189.0
2016-11-20 05:09:00 277.0
2016-11-20 05:10:00 481.0
2016-11-20 05:11:00 223.0
2016-11-20 05:12:00 167.0
2016-11-20 05:13:00 246.0
2016-11-20 05:14:00 446.0
2016-11-20 05:15:00 240.0
2016-11-20 05:16:00 165.0
2016-11-20 05:17:00 368.0
2016-11-20 05:18:00 741.0
2016-11-20 05:19:00 443.0
2016-11-20 05:20:00 212.0
2016-11-20 05:21:00 333.0
2016-11-20 05:22:00 704.0
2016-11-20 05:23:00 379.0
2016-11-20 05:24:00 232.0
2016-11-20 05:25:00 350.0
2016-11-20 05:26:00 836.0
2016-11-20 05:27:00 363.0
2016-11-20 05:28:00 255.0
2016-11-20 05:29:00 448.0
2016-11-20 05:30:00 1183.0
2016-11-20 05:31:00 1003.0
2016-11-20 05:32:00 1375.0
2016-11-20 05:33:00 2277.0
Freq: T, dtype: float64
Taking a closer look (30-second buckets) to determine which tweets fall in the rest periods
# NOTE: despite its name, per_10sec holds 30-second buckets ('30S'); the name
# is kept so any other cell referring to it keeps working.
# resample(..., how='sum') is deprecated as of pandas 0.18; use the
# method-chaining form instead.
per_10sec = tweets_KovWard.resample('30S').sum().fillna(0)
# Show the buckets strictly between 04:45 and 05:00 on 2016-11-20.
per_10sec[(per_10sec.index > '2016-11-20 04:45:00') &
          (per_10sec.index < '2016-11-20 05:00:00')]
2016-11-20 04:45:30 203.0
2016-11-20 04:46:00 242.0
2016-11-20 04:46:30 150.0
2016-11-20 04:47:00 122.0
2016-11-20 04:47:30 73.0
2016-11-20 04:48:00 85.0
2016-11-20 04:48:30 134.0
2016-11-20 04:49:00 206.0
2016-11-20 04:49:30 401.0
2016-11-20 04:50:00 478.0
2016-11-20 04:50:30 347.0
2016-11-20 04:51:00 264.0
2016-11-20 04:51:30 208.0
2016-11-20 04:52:00 148.0
2016-11-20 04:52:30 125.0
2016-11-20 04:53:00 144.0
2016-11-20 04:53:30 214.0
2016-11-20 04:54:00 317.0
2016-11-20 04:54:30 231.0
2016-11-20 04:55:00 156.0
2016-11-20 04:55:30 127.0
2016-11-20 04:56:00 98.0
2016-11-20 04:56:30 70.0
2016-11-20 04:57:00 80.0
2016-11-20 04:57:30 144.0
2016-11-20 04:58:00 250.0
2016-11-20 04:58:30 178.0
2016-11-20 04:59:00 134.0
2016-11-20 04:59:30 103.0
Freq: 30S, dtype: float64
# Boundaries noted from the 30-second counts:
#   round 1:      2016-11-20 04:42:30 - 04:45:30
#   round 1 rest: 2016-11-20 04:45:30 - 04:46:30
#   round 2 rest: 2016-11-20 04:49:30 - 04:50:30
# Count the tweets with text in the round-2 rest window (start inclusive,
# end exclusive).
in_rest_two = (tweets_df['created_at'] >= '2016-11-20 04:49:30') & \
              (tweets_df['created_at'] < '2016-11-20 04:50:30')
tweets_df.loc[in_rest_two, 'text'].count()
879
from datetime import timedelta

# Build the start/stop timestamps of 12 rounds and the rest period that
# follows each one, as 'YYYY-MM-DD HH:MM:SS' strings.
fight_start_text = '2016-11-20 04:42:30'
time_format = '%Y-%m-%d %H:%M:%S'
# Parse the single source-of-truth string instead of repeating the literal.
fight_start_time = datetime.strptime(fight_start_text, time_format)

rnd = timedelta(minutes=3)  # length of one round
rst = timedelta(minutes=1)  # length of one rest period

rnd_start_stop = []
rst_start_stop = []

for i in range(12):
    # Each round begins one full round+rest cycle after the previous one.
    rnd_start = fight_start_time + i * (rnd + rst)
    rnd_stop = rnd_start + rnd
    # The rest period starts the moment the round stops.
    rst_start = rnd_stop
    rst_stop = rst_start + rst
    print('round %d start: %s' % (i + 1, rnd_start.strftime(time_format)))
    rnd_start_stop.append((rnd_start.strftime(time_format), rnd_stop.strftime(time_format)))
    rst_start_stop.append((rst_start.strftime(time_format), rst_stop.strftime(time_format)))

print('')
print(rnd_start_stop)
print(rst_start_stop)
round 1 start: 2016-11-20 04:42:30
round 2 start: 2016-11-20 04:46:30
round 3 start: 2016-11-20 04:50:30
round 4 start: 2016-11-20 04:54:30
round 5 start: 2016-11-20 04:58:30
round 6 start: 2016-11-20 05:02:30
round 7 start: 2016-11-20 05:06:30
round 8 start: 2016-11-20 05:10:30
round 9 start: 2016-11-20 05:14:30
round 10 start: 2016-11-20 05:18:30
round 11 start: 2016-11-20 05:22:30
round 12 start: 2016-11-20 05:26:30
[('2016-11-20 04:42:30', '2016-11-20 04:45:30'), ('2016-11-20 04:46:30', '2016-11-20 04:49:30'), ('2016-11-20 04:50:30', '2016-11-20 04:53:30'), ('2016-11-20 04:54:30', '2016-11-20 04:57:30'), ('2016-11-20 04:58:30', '2016-11-20 05:01:30'), ('2016-11-20 05:02:30', '2016-11-20 05:05:30'), ('2016-11-20 05:06:30', '2016-11-20 05:09:30'), ('2016-11-20 05:10:30', '2016-11-20 05:13:30'), ('2016-11-20 05:14:30', '2016-11-20 05:17:30'), ('2016-11-20 05:18:30', '2016-11-20 05:21:30'), ('2016-11-20 05:22:30', '2016-11-20 05:25:30'), ('2016-11-20 05:26:30', '2016-11-20 05:29:30')]
[('2016-11-20 04:45:30', '2016-11-20 04:46:30'), ('2016-11-20 04:49:30', '2016-11-20 04:50:30'), ('2016-11-20 04:53:30', '2016-11-20 04:54:30'), ('2016-11-20 04:57:30', '2016-11-20 04:58:30'), ('2016-11-20 05:01:30', '2016-11-20 05:02:30'), ('2016-11-20 05:05:30', '2016-11-20 05:06:30'), ('2016-11-20 05:09:30', '2016-11-20 05:10:30'), ('2016-11-20 05:13:30', '2016-11-20 05:14:30'), ('2016-11-20 05:17:30', '2016-11-20 05:18:30'), ('2016-11-20 05:21:30', '2016-11-20 05:22:30'), ('2016-11-20 05:25:30', '2016-11-20 05:26:30'), ('2016-11-20 05:29:30', '2016-11-20 05:30:30')]
# Label each tweet with the number (1-based) of the rest period it was
# posted in; 0 means the tweet falls outside every rest period.
tweets_df['rest_period'] = 0
rst_prd = 1
for rest_start, rest_stop in rst_start_stop:
    # Window is start-inclusive, stop-exclusive.
    posted_in_rest = (tweets_df['created_at'] >= rest_start) & (tweets_df['created_at'] < rest_stop)
    tweets_df.loc[posted_in_rest, 'rest_period'] = rst_prd
    rst_prd += 1
tweets_df.rest_period.value_counts()
0 67814
2 879
12 861
9 726
11 706
10 632
3 531
1 445
7 440
6 430
5 429
8 415
4 394
Name: rest_period, dtype: int64
# Text of the tweets labelled with rest period 1.
tweets_df.loc[tweets_df['rest_period'] == 1, 'text']
11121 Kovalev won round one. Good jab. Ward not very...
11122 RT @RBRBoxing: Retweet if you think Kovalev wi...
11123 UH OH WARD GOT CAUGHT #KovalevWard
11124 RT @amirkingkhan: Let's go @andreward. #Kovale...
11125 #kovalevward @ T-Mobile Arena https://t.co/9vI...
11126 Ward staggering after a jab. Going to change m...
11127 RT @PromoDelPueblo: Vaya combate que nos esper...
11128 RT @TheDKano: Here we go for boxing's pound fo...
11129 RT @amirkingkhan: Let's go @andreward. #Kovale...
11130 RT @AndreasHale: Kovalev’s jab is dominant in ...
11131 Kovalev's armpit is about to be the real winne...
11132 !!!!!!!!!!!!!! Kov starting out strong. Ward n...
11133 RT @SugarRayLeonard: Who will be winning this ...
11134 RT @danrafaelespn: Rd 1 for Kovalev, no questi...
11135 RT @amirkingkhan: Let's go @andreward. #Kovale...
11136 RT @RBRBoxing: Good start, Ward runs into a ha...
11137 Round 1 #KovalevWard : 10-9 Kovalev
11138 Kovalev takes R1 10-9. #KovalevWard
11139 RT @Jeskeliin22: Se viene que peleón, Kovalev ...
11140 Finally people are seeing why I was shocked by...
11141 RT @Main_Events: This is the moment we have al...
11142 Rd. 1: Tough round to score. Largely a feel-ou...
11143 RT @rosieperezbklyn: Oh! Ward stumbled! @HBObo...
11144 Le jab de Kovalev fait le travail. 10-9 Kovale...
11145 RT @danrafaelespn: Rd 1 for Kovalev, no questi...
11146 Fucking Russians!!!! #KovalevWard
11147 Ward was winning 1st round with jab to body, b...
11148 #KovalevWard round 1 kovalev.
11149 That jab rocking Ward. Wow! #KovalevWard
11150 Round 1 to kovalev #KovalevWard
...
11536 RT @boxingcorner247: Here we go #KovalevWard ���...
11537 1st round Kovalev 10-9 #KovalevWard
11538 R1 Ward showing more fight than flight, but ea...
11539 1st round i think goes to Kovalev he landed th...
11540 Ward just needs to be super clever in the figh...
11541 #KovalevWard jordanaharkness @ T-Mobile Arena ...
11542 Kovalev showing his power early with the jab. ...
11543 RT @LennoxLewis: Let's get ready to ruuuuumble...
11544 ウォードさん、コバレフさんのジャブに手こずる。 #KovalevWard
11545 Round 1: Ward appears stunned by Kovalev's pow...
11546 RT @chaka_210: Es para kovalev este round!!! #...
11547 RT @rosieperezbklyn: Good 1st round! Gave it t...
11548 RT @Patrick_Wyman: 10-9 Kovalev in the first. ...
11549 Kovalev wins round 1 for me #KovalevWard
11550 WHOAH CUT THE TENSION WITH A KNIFEEEE. That ja...
11551 Wow. Ward buzzed by a jab already. \n\n#Kovale...
11552 Kovalev 1-0. Ward felt the power and was a lit...
11553 #letsgetreadytorumble #kovalevward https://t.c...
11554 Felt that power already 😏 #KovalevWard
11555 A cautious start, not surprising. First round ...
11556 1-0 Ward #KovalevWard
11557 RT @sand_trevino: This round goes to Kovalev. ...
11558 Ward's face is already showing the results of ...
11559 10-9 Kovalev. #KovalevWard
11560 Kovalev got Ward looking like Sullinger..\n#Ko...
11561 Slow first round. If kovalev keeps leaning on ...
11562 Good start for Sergey Kovalev.\nGood start for...
11563 RT @ChavaESPN: Round 1. Kovalev se ha llevado ...
11564 Round 1de estudio pero se lo doy a Kovalev #Ko...
11565 Both guys understandably cagey in round one bu...
Name: text, dtype: object
calculate tweet probability per round
- any round 2 surprises?
# Export rest-period tweets that look like scorecards: a score such as
# "10-9", the word "round", or the abbreviation "rd".
# The bare alternative `rd` also matched inside "Ward" / "#KovalevWard",
# which let nearly every tweet through; it is now anchored so it only matches
# at the start of a word and not when followed by another letter
# ("Rd 1", "rd.", "rd1" match; "Ward", "3rd", "hard" do not).
scorecard_pattern = r'\d+\-\d+|round|\brd(?![a-z])'
# na=False: rows without text count as non-matching instead of producing NaN
# entries that cannot be used in a boolean mask.
looks_like_scorecard = tweets_df['text'].str.contains(scorecard_pattern, case=False, na=False)
in_rest_period = tweets_df['rest_period'] != 0
# `encoding` is omitted: per the pandas docs it only matters for the xlwt
# writer, and the parameter no longer exists in current pandas.
tweets_df.loc[looks_like_scorecard & in_rest_period,
              ['rest_period', 'user_screen_name', 'text']].to_excel('data/scorecard_tweets.xlsx')
# Per-minute tweet counts for the fight "hour", with rounds shaded light blue
# and rest periods painted white.
in_fight_hour = (per_minute.index > '2016-11-20 04:30:00') & (per_minute.index < '2016-11-20 06:00:00')
ax = per_minute[in_fight_hour].plot(figsize=(12,6), label='Tweets')

# Only the first round / rest span carries a label, so each kind shows up
# exactly once in the legend.
first_rnd_from, first_rnd_to = rnd_start_stop[0]
first_rst_from, first_rst_to = rst_start_stop[0]
ax.axvspan(first_rnd_from, first_rnd_to, alpha=0.7, color='lightblue', label="Rounds 1-12")
ax.axvspan(first_rst_from, first_rst_to, color='white', label="Rest Period")
for (rnd_from, rnd_to), (rst_from, rst_to) in zip(rnd_start_stop[1:], rst_start_stop[1:]):
    ax.axvspan(rnd_from, rnd_to, alpha=0.7, color='lightblue')
    ax.axvspan(rst_from, rst_to, color='white')

plt.title('Fight "Hour" Tweets Kovalev/Ward')
plt.xlabel("Time (UTC)")
plt.ylabel("Tweet Count")
plt.legend(loc='upper right')

# Faint watermark in the lower-right area of the plot.
watermark_style = dict(fontsize=16, color='gray', ha='right', va='bottom', alpha=0.3)
ax.text('2016-11-20 05:55:00', 20, 'endlesspint.com', **watermark_style)
ax.grid(False, which='both')

### save file locally with high resolution
plt.savefig('img/fight_hour_tweets.PNG', dpi=1200)
plot histogram of rounds won over 1,000 simulations [nfl/wk8]
- what % of simulations agree with the judges?
# Per-round probabilities (12 rounds) used as the threshold for a Kovalev
# round win in the simulation.
kov_rd_prob = np.array([0.962, 1.000, 0.627, 0.727, 0.293, 0.862, 0.082, 0.075, 0.028, 0.689, 0.082, 0.167])
# Fixed seed so the simulated bouts are reproducible.
np.random.seed(506) # "SOG" ~ 506
bouts = 1000
# One uniform [0, 1) draw per (bout, round).
samp_bouts = np.random.random((bouts, len(kov_rd_prob)))
print(kov_rd_prob)
print(samp_bouts[:5])
[ 0.962 1. 0.627 0.727 0.293 0.862 0.082 0.075 0.028 0.689
0.082 0.167]
[[ 0.847888 0.0062996 0.11547329 0.06227527 0.70417491 0.45142321
0.02663976 0.68392883 0.8048215 0.01091125 0.99085692 0.98667746]
[ 0.51372109 0.05352931 0.55695119 0.56343986 0.16066945 0.24419693
0.00722941 0.2815361 0.48279483 0.50589732 0.53835301 0.28329453]
[ 0.27038547 0.49515731 0.86365123 0.46734458 0.73658639 0.62529576
0.44125324 0.47066594 0.27224368 0.37605197 0.34024841 0.95194275]
[ 0.81634129 0.58905575 0.97129374 0.46887292 0.83701569 0.17920073
0.0192533 0.1664162 0.58023702 0.93020315 0.6370348 0.42960779]
[ 0.14706038 0.5908011 0.49457534 0.04813603 0.10570926 0.05762396
0.86765057 0.63971255 0.70860045 0.95180043 0.43166628 0.31270852]]
# A round counts as won when its random draw falls below that round's
# probability; the probability vector broadcasts across all bouts.
kov_rd_wins = np.less(samp_bouts, kov_rd_prob)
kov_rd_wins[:5]
array([[ True, True, True, True, False, True, True, False, False,
True, False, False],
[ True, True, True, True, True, True, True, False, False,
True, False, False],
[ True, True, False, True, False, True, False, False, False,
True, False, False],
[ True, True, False, True, False, True, True, False, False,
False, False, False],
[ True, True, True, True, True, True, False, False, False,
False, False, False]], dtype=bool)
# Rounds won per simulated bout: count the True entries along each row.
kov_scores = kov_rd_wins.sum(axis=1)
print(kov_scores[:25])
[7 8 5 5 6 4 6 5 7 5 5 3 4 7 8 3 4 6 5 5 3 6 5 4 9]
# Scores are whole numbers of rounds, so use one bin per possible score
# (0..number of rounds), centred on the integer. The default of 10
# equal-width bins over the observed range gives 0.8-wide bins, which leaves
# a spurious empty bin in the middle of the distribution.
plt.hist(kov_scores, bins=np.arange(len(kov_rd_prob) + 2) - 0.5)
(array([ 4., 38., 129., 303., 0., 323., 159., 38., 4., 2.]),
array([ 2. , 2.8, 3.6, 4.4, 5.2, 6. , 6.8, 7.6, 8.4,
9.2, 10. ]),
<a list of 10 Patch objects>)
# Density plot of the simulated number of rounds won per bout.
df = pd.DataFrame(kov_scores)
ax = df.plot(kind='density', figsize=(12,6), legend=False)

plt.title('Kovalev Rounds Won (1,000 bouts simulated)')
plt.xlim((0,12))
plt.xlabel('Rounds')

# Faint watermark near the lower-right corner.
watermark_style = dict(fontsize=16, color='gray', ha='right', va='bottom', alpha=0.3)
ax.text(11.8, 0.01, 'endlesspint.com', **watermark_style)
ax.grid(False, which='both')

### save file locally with high resolution
plt.savefig('img/kov_rnds_won_density.PNG', dpi=1200)
# Median and mean number of rounds won across the simulated bouts.
print(np.median(kov_scores))
print(np.mean(kov_scores))
6.0
5.564
# Share of simulated bouts with at most 5, exactly 6, and at most 6 rounds
# won (the mean of a boolean array is the fraction of True entries).
print(np.mean(kov_scores <= 5.0))
print(np.mean(kov_scores == 6.0))
print(np.mean(kov_scores <= 6.0))
0.474
0.323
0.797
# Sanity check: number of simulated scores (one per bout).
len(kov_scores)
1000
# Summary statistics of the simulated scores held in `df`.
df.describe()
0 | |
---|---|
count | 1000.000000 |
mean | 5.564000 |
std | 1.189672 |
min | 2.000000 |
25% | 5.000000 |
50% | 6.000000 |
75% | 6.000000 |
max | 10.000000 |
# Box plot of the simulated scores held in `df`.
df.plot(kind='box')
<matplotlib.axes._subplots.AxesSubplot at 0xb6494748>