import os, csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline
data loading
ipa_ratings = [file for file in os.listdir('tIJ_IPA/') if file[-3:] == 'txt']
print ipa_ratings
['rb_reviews_alesmith_ipa.txt', 'rb_reviews_ballastpoint_sculpin.txt', 'rb_reviews_bells_twohearted.txt', 'rb_reviews_lagunitas_ipa.txt', 'rb_reviews_russianriver_blindpig.txt', 'rb_reviews_stone_ipa.txt', 'rb_reviews_surly_furious.txt', 'rb_reviews_tij_ipa.txt', 'rb_reviews_troegs_nuggetnectar.txt']
d = {}
for ipa in ipa_ratings:
ipa_file = 'tIJ_IPA/' + ipa
ipa_name = (ipa.split('_')[2])
with open(ipa_file, 'r') as f:
next(f)
temp_dict = {}
for line in f:
line = line.strip()
key, val = line.split('\t')
temp_dict[key] = float(val)
d[ipa_name] = temp_dict
df_ipa = pd.DataFrame.from_dict(d)
print df_ipa.shape
df_ipa.head()
(7503, 9)
alesmith | ballastpoint | bells | lagunitas | russianriver | stone | surly | tij | troegs | |
---|---|---|---|---|---|---|---|---|---|
00cobraR | 4.0 | NaN | 4.4 | 3.6 | 4.0 | 4.0 | NaN | NaN | NaN |
09maso3 | NaN | NaN | NaN | NaN | NaN | 3.9 | NaN | NaN | NaN |
0o9i | 4.3 | 3.5 | NaN | 3.7 | 4.1 | 4.4 | NaN | NaN | NaN |
1001 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.8 |
1010 | NaN | NaN | 4.2 | NaN | NaN | 3.7 | NaN | NaN | NaN |
export
# df_ipa.to_csv('tIJ_IPA/ipa_listing.csv')
dummy plots
df_ipa.plot.box()
<matplotlib.axes._subplots.AxesSubplot at 0x7fec9e8>
df_ipa.plot.hist(alpha=0.5)
<matplotlib.axes._subplots.AxesSubplot at 0x8b31c50>
some investigation
df_ipa['ratings_cnt'] = df_ipa.notnull().sum(axis=1)
print df_ipa[df_ipa.ratings_cnt > 1].shape
df_ipa[df_ipa.ratings_cnt > 1].head()
(3950, 10)
alesmith | ballastpoint | bells | lagunitas | russianriver | stone | surly | tij | troegs | ratings_cnt | |
---|---|---|---|---|---|---|---|---|---|---|
00cobraR | 4.0 | NaN | 4.4 | 3.6 | 4.0 | 4.0 | NaN | NaN | NaN | 5 |
0o9i | 4.3 | 3.5 | NaN | 3.7 | 4.1 | 4.4 | NaN | NaN | NaN | 5 |
1010 | NaN | NaN | 4.2 | NaN | NaN | 3.7 | NaN | NaN | NaN | 2 |
11026 | 4.6 | NaN | 4.3 | 3.1 | NaN | 3.7 | NaN | NaN | 4.2 | 5 |
123456green | 4.4 | NaN | 4.0 | NaN | 2.5 | 4.2 | NaN | NaN | NaN | 4 |
df_ipa[(df_ipa.ratings_cnt > 1) & (df_ipa.tij >= .1)].head()
alesmith | ballastpoint | bells | lagunitas | russianriver | stone | surly | tij | troegs | ratings_cnt | |
---|---|---|---|---|---|---|---|---|---|---|
20107589 | 3.7 | NaN | NaN | 3.6 | NaN | NaN | NaN | 3.9 | NaN | 3 |
77ships | 4.1 | 4.4 | 4.6 | NaN | NaN | 3.8 | NaN | 3.6 | NaN | 5 |
Abijen | NaN | 4.0 | NaN | NaN | NaN | NaN | NaN | 4.2 | NaN | 2 |
Abrakarl | NaN | 4.1 | NaN | 3.6 | NaN | 4.4 | NaN | 2.5 | NaN | 4 |
Alengrin | 4.3 | 4.0 | 4.3 | 3.8 | NaN | 4.4 | NaN | 3.4 | 4.1 | 7 |
# mean ratings of beers based on number of IPA's from collection rated
# i.e., some reviewers only rated 1 beer, others all 9
# i myself have had 8 of 9; missing 'surly'
df_ipa.groupby('ratings_cnt').mean()
alesmith | ballastpoint | bells | lagunitas | russianriver | stone | surly | tij | troegs | |
---|---|---|---|---|---|---|---|---|---|
ratings_cnt | |||||||||
1 | 4.210861 | 4.151282 | 4.177386 | 3.605392 | 4.115741 | 4.062695 | 4.153600 | 3.661538 | 4.047393 |
2 | 4.096018 | 4.030769 | 4.074603 | 3.563295 | 4.036275 | 3.976417 | 4.004082 | 3.567308 | 4.040268 |
3 | 4.101384 | 4.023220 | 4.068182 | 3.601401 | 4.020472 | 3.976623 | 3.993893 | 3.652381 | 3.974719 |
4 | 4.064113 | 4.003957 | 4.035000 | 3.600968 | 3.984071 | 3.983088 | 3.970000 | 3.530769 | 3.935542 |
5 | 4.121935 | 4.032107 | 4.039316 | 3.608136 | 3.986061 | 3.943421 | 3.957051 | 3.510000 | 3.972283 |
6 | 4.048239 | 3.994094 | 3.977670 | 3.569796 | 3.967045 | 3.956902 | 3.985629 | 3.543333 | 3.925824 |
7 | 4.054967 | 4.032759 | 4.046497 | 3.574786 | 3.997665 | 3.907742 | 3.926860 | 3.434483 | 3.893361 |
8 | 4.051667 | 4.031333 | 4.038796 | 3.529866 | 3.966779 | 3.933110 | 3.963605 | 3.475000 | 3.903082 |
9 | 3.947826 | 3.847826 | 3.917391 | 3.530435 | 4.013043 | 3.791304 | 3.782609 | 3.395652 | 3.730435 |
prep for MLE grid by first tallying and getting %’s of votes
ipa_comp = np.zeros(81).reshape(9,9)
for i in range(len(df_ipa.columns[:-1])):
for j in range(len(df_ipa.columns[:-1])):
# +1 for outright better rating
ipa_comp[i][j] += np.sum(df_ipa.iloc[:, i] > df_ipa.iloc[:, j])
# +0.5 for even score
# makes more sense for "winning percentage"
ipa_comp[i][j] += np.sum(df_ipa.iloc[:, i] == df_ipa.iloc[:, j])/2.0
ipa_comp[i][i] = 0
print 'Post comparison tally:\n'
print ipa_comp
Post comparison tally:
[[ 0. 680.5 803. 1040.5 561. 1004. 539.5 153. 605. ]
[ 594.5 0. 703. 1034. 497.5 930.5 479.5 149. 551.5]
[ 634. 724. 0. 1268. 530. 1191.5 620.5 121. 722. ]
[ 125.5 192. 202. 0. 106.5 294.5 108. 98. 165.5]
[ 362. 412.5 444. 673.5 0. 571. 378.5 73.5 429.5]
[ 578. 610.5 833.5 1399.5 424. 0. 442. 171.5 571.5]
[ 310.5 361.5 431.5 622. 295.5 504. 0. 58. 375. ]
[ 15. 28. 21. 69. 9.5 29.5 15. 0. 9. ]
[ 298. 333.5 429. 670.5 278.5 541.5 302. 41. 0. ]]
# number of head-to-head match ups
np.sum(ipa_comp)
31852.0
# number of reviewers
# obviously only counting reviewers with two or more IPA's rated
# info already available above but highlighting it here
np.sum(df_ipa.ratings_cnt > 1)
3950
ipa_perc = np.zeros(81).reshape(9,9)
for i in range(len(df_ipa.columns[:-1])):
for j in range(len(df_ipa.columns[:-1])):
try:
ipa_perc[i][j] = ipa_comp[i][j]/float(ipa_comp[i][j] + ipa_comp[j][i])
except:
ipa_perc[i][j] = 0
print 'Post percentage calculation:\n'
print np.round_(ipa_perc, 4)
Post percentage calculation:
[[ nan 0.5337 0.5588 0.8924 0.6078 0.6346 0.6347 0.9107 0.67 ]
[ 0.4663 nan 0.4926 0.8434 0.5467 0.6038 0.5702 0.8418 0.6232]
[ 0.4412 0.5074 nan 0.8626 0.5441 0.5884 0.5898 0.8521 0.6273]
[ 0.1076 0.1566 0.1374 nan 0.1365 0.1738 0.1479 0.5868 0.198 ]
[ 0.3922 0.4533 0.4559 0.8635 nan 0.5739 0.5616 0.8855 0.6066]
[ 0.3654 0.3962 0.4116 0.8262 0.4261 nan 0.4672 0.8532 0.5135]
[ 0.3653 0.4298 0.4102 0.8521 0.4384 0.5328 nan 0.7945 0.5539]
[ 0.0893 0.1582 0.1479 0.4132 0.1145 0.1468 0.2055 nan 0.18 ]
[ 0.33 0.3768 0.3727 0.802 0.3934 0.4865 0.4461 0.82 nan]]
print df_ipa.columns[:-1].tolist()
print np.round_(ipa_perc, 3)
['alesmith', 'ballastpoint', 'bells', 'lagunitas', 'russianriver', 'stone', 'surly', 'tij', 'troegs']
[[ nan 0.534 0.559 0.892 0.608 0.635 0.635 0.911 0.67 ]
[ 0.466 nan 0.493 0.843 0.547 0.604 0.57 0.842 0.623]
[ 0.441 0.507 nan 0.863 0.544 0.588 0.59 0.852 0.627]
[ 0.108 0.157 0.137 nan 0.137 0.174 0.148 0.587 0.198]
[ 0.392 0.453 0.456 0.863 nan 0.574 0.562 0.886 0.607]
[ 0.365 0.396 0.412 0.826 0.426 nan 0.467 0.853 0.513]
[ 0.365 0.43 0.41 0.852 0.438 0.533 nan 0.795 0.554]
[ 0.089 0.158 0.148 0.413 0.114 0.147 0.205 nan 0.18 ]
[ 0.33 0.377 0.373 0.802 0.393 0.487 0.446 0.82 nan]]
test MLE iteration on toy set
original motivation: http://fivethirtyeight.com/features/captain-america-civil-war-who-would-win/
more step-by-step: http://www.pro-football-reference.com/blog/?p=171; screen shots below
toy_wins = np.array([[0, 1, 1], [0, 0, 1], [1, 0, 0]])
toy_wins
array([[0, 1, 1],
[0, 0, 1],
[1, 0, 0]])
rating_iter = []
rating_iter.append(np.ones(3).reshape((3,1)))
rating_iter
[array([[ 1.],
[ 1.],
[ 1.]])]
idx_a = 0
idx_b = 1
idx_c = 2
# newA
np.sum(toy_wins[idx_a]) /\
float( np.sum(toy_wins[idx_a][idx_b]+toy_wins[idx_b][idx_a])/float(rating_iter[0][idx_a]+rating_iter[0][idx_b]) +\
np.sum(toy_wins[idx_a][idx_c]+toy_wins[idx_c][idx_a])/float(rating_iter[0][idx_a]+rating_iter[0][idx_c])
)
1.3333333333333333
# newB
np.sum(toy_wins[idx_b]) /\
float( np.sum(toy_wins[idx_a][idx_b]+toy_wins[idx_b][idx_a])/float(rating_iter[0][idx_a]+rating_iter[0][idx_b]) +\
np.sum(toy_wins[idx_b][idx_c]+toy_wins[idx_c][idx_b])/float(rating_iter[0][idx_b]+rating_iter[0][idx_c])
)
1.0
# newC
np.sum(toy_wins[idx_c]) /\
float( np.sum(toy_wins[idx_a][idx_c]+toy_wins[idx_c][idx_a])/float(rating_iter[0][idx_a]+rating_iter[0][idx_c]) +\
np.sum(toy_wins[idx_b][idx_c]+toy_wins[idx_c][idx_b])/float(rating_iter[0][idx_b]+rating_iter[0][idx_c])
)
0.66666666666666663
teams = {'a': 0, 'b': 1, 'c': 2}
def new_rating(team, rating=0):
team_list = [team]
wins = float(np.sum(toy_wins[teams[team]]))
denom = 0
for t, idx in teams.items():
if t not in team_list:
games_against = float(np.sum(toy_wins[teams[team]][idx]+toy_wins[idx][teams[team]]))
denom += games_against/float(rating_iter[rating][teams[team]]+rating_iter[rating][idx])
return wins/denom
print new_rating('a', 0)
print new_rating('b')
print new_rating('c')
1.33333333333
1.0
0.666666666667
for team in teams:
print 'team %s: %f\n' % (team, new_rating(team))
team a: 1.333333
team c: 0.666667
team b: 1.000000
rating_iter = []
rating_iter.append(np.ones(3).reshape((3,1)))
for i in range(2000):
temp_ratings = []
# specifiy ratings order via list
# notice in test function in cell above 'b' and 'c' swap prefered order
for j in ['a', 'b', 'c']:
temp_ratings.append(new_rating(j, i))
rating_iter.append(temp_ratings)
if np.array_equal(rating_iter[-2], rating_iter[-1]):
print "number or iterations to maximize:", i
break
print "ratings:", rating_iter[-1]
print "Team A beats B: %f" % (rating_iter[-1][0]/(rating_iter[-1][0]+rating_iter[-1][1]))
number or iterations to maximize: 41
ratings: [1.431212899687144, 0.9407335284451731, 0.6183423666278844]
Team A beats B: 0.603392
the ratings are different, not important just ratios, but the winning probability matches example: 60.3%
ready to apply to IPA’s
rating_iter = []
rating_iter.append(np.ones(9).reshape((9,1)))
rating_iter
[array([[ 1.],
[ 1.],
[ 1.],
[ 1.],
[ 1.],
[ 1.],
[ 1.],
[ 1.],
[ 1.]])]
ipas = {k:v for k, v in zip(df_ipa.columns[:-1], range(9))}
print ipas
{'stone': 5, 'tij': 7, 'lagunitas': 3, 'surly': 6, 'alesmith': 0, 'bells': 2, 'ballastpoint': 1, 'troegs': 8, 'russianriver': 4}
def new_beer_rating(beer, grid=ipa_comp, rating=0):
beer_list = [beer]
wins = float(np.sum(grid[ipas[beer]]))
denom = 0
for b, idx in ipas.items():
if b not in beer_list:
games_against = float(np.sum(grid[ipas[beer]][idx]+grid[idx][ipas[beer]]))
denom += games_against/float(rating_iter[rating][ipas[beer]]+rating_iter[rating][idx])
return wins/denom
print new_beer_rating('tij')
0.369462770971
# notice again the IPA's get random sorting
for ipa in ipas:
print '1st rating update of %s: %f' % (ipa, new_beer_rating(ipa))
1st rating update of stone: 0.996435
1st rating update of tij: 0.369463
1st rating update of lagunitas: 0.320238
1st rating update of surly: 1.012494
1st rating update of alesmith: 1.297327
1st rating update of bells: 1.200868
1st rating update of ballastpoint: 1.192828
1st rating update of troegs: 0.915388
1st rating update of russianriver: 1.106168
# just to have the IPA order present for reference
print df_ipa.columns[:-1]
Index([u'alesmith', u'ballastpoint', u'bells', u'lagunitas', u'russianriver',
u'stone', u'surly', u'tij', u'troegs'],
dtype='object')
rating_iter = []
rating_iter.append(np.ones(9).reshape((9,1)))
for i in range(2000):
temp_ratings = []
# specifiy ratings order via list
# notice in test function in cell above 'b' and 'c' swap prefered order
for j in df_ipa.columns[:-1]:
temp_ratings.append(new_beer_rating(j, ipa_comp, i))
rating_iter.append(temp_ratings)
if np.array_equal(rating_iter[-2], rating_iter[-1]):
print "number or iterations to maximize:", i
break
print "ratings:", rating_iter[-1]
print "'t IJ beats Stone: %f" % (rating_iter[-1][7]/(rating_iter[-1][7]+rating_iter[-1][5]))
number or iterations to maximize: 70
ratings: [1.6074466822012488, 1.300273173040017, 1.304919081977399, 0.19824502126552032, 1.1405649634612778, 0.883810335360305, 0.9585607140089442, 0.17355391800160963, 0.788754690387197]
't IJ beats Stone: 0.164138
# generate ratings dict and sort in desc order for creation of grids
ipa_rating_score = {k:v for k,v in zip(df_ipa.columns[:-1], rating_iter[-1])}
ipa_rating_score = sorted(ipa_rating_score.items(), key=lambda x: (-x[1], x[0]))
ipa_rating_score
[('alesmith', 1.6074466822012488),
('bells', 1.304919081977399),
('ballastpoint', 1.300273173040017),
('russianriver', 1.1405649634612778),
('surly', 0.9585607140089442),
('stone', 0.883810335360305),
('troegs', 0.788754690387197),
('lagunitas', 0.19824502126552032),
('tij', 0.17355391800160963)]
new ipa_perc grid in sorted order by ML ratings
ipa_perc_sorted = np.zeros(81).reshape(9,9)
new_beer_row = 0
for i in ipa_rating_score:
new_beer_col = 0
for j in ipa_rating_score:
old_beer_row = ipas[i[0]]
old_beer_col = ipas[j[0]]
# print i, j, old_beer_row, old_beer_col
ipa_perc_sorted[new_beer_row][new_beer_col] = ipa_perc[old_beer_row][old_beer_col]
new_beer_col += 1
new_beer_row += 1
np.round_(ipa_perc_sorted, 2)
array([[ nan, 0.56, 0.53, 0.61, 0.63, 0.63, 0.67, 0.89, 0.91],
[ 0.44, nan, 0.51, 0.54, 0.59, 0.59, 0.63, 0.86, 0.85],
[ 0.47, 0.49, nan, 0.55, 0.57, 0.6 , 0.62, 0.84, 0.84],
[ 0.39, 0.46, 0.45, nan, 0.56, 0.57, 0.61, 0.86, 0.89],
[ 0.37, 0.41, 0.43, 0.44, nan, 0.53, 0.55, 0.85, 0.79],
[ 0.37, 0.41, 0.4 , 0.43, 0.47, nan, 0.51, 0.83, 0.85],
[ 0.33, 0.37, 0.38, 0.39, 0.45, 0.49, nan, 0.8 , 0.82],
[ 0.11, 0.14, 0.16, 0.14, 0.15, 0.17, 0.2 , nan, 0.59],
[ 0.09, 0.15, 0.16, 0.11, 0.21, 0.15, 0.18, 0.41, nan]])
estimated probability of IPA preference
ipa_est_pref = np.zeros(81).reshape(9,9)
est_row = 0
for i in ipa_rating_score:
est_col = 0
for j in ipa_rating_score:
if i[0] == j[0]:
est_col += 1
pass
else:
ipa_est_pref[est_row][est_col] = i[1]/(i[1]+j[1])
est_col += 1
est_row += 1
print [beer[0] for beer in ipa_rating_score]
np.round_(ipa_est_pref, 2)
['alesmith', 'bells', 'ballastpoint', 'russianriver', 'surly', 'stone', 'troegs', 'lagunitas', 'tij']
array([[ 0. , 0.55, 0.55, 0.58, 0.63, 0.65, 0.67, 0.89, 0.9 ],
[ 0.45, 0. , 0.5 , 0.53, 0.58, 0.6 , 0.62, 0.87, 0.88],
[ 0.45, 0.5 , 0. , 0.53, 0.58, 0.6 , 0.62, 0.87, 0.88],
[ 0.42, 0.47, 0.47, 0. , 0.54, 0.56, 0.59, 0.85, 0.87],
[ 0.37, 0.42, 0.42, 0.46, 0. , 0.52, 0.55, 0.83, 0.85],
[ 0.35, 0.4 , 0.4 , 0.44, 0.48, 0. , 0.53, 0.82, 0.84],
[ 0.33, 0.38, 0.38, 0.41, 0.45, 0.47, 0. , 0.8 , 0.82],
[ 0.11, 0.13, 0.13, 0.15, 0.17, 0.18, 0.2 , 0. , 0.53],
[ 0.1 , 0.12, 0.12, 0.13, 0.15, 0.16, 0.18, 0.47, 0. ]])
# import plotly.plotly as py
# import plotly.graph_objs as go
# - OR -
# import seaborn as sns
# {k:v for k,v in zip(range(9), [beer[0] for beer in ipa_rating_score])}
ipa_perc_sorted_df = pd.DataFrame(ipa_perc_sorted,
columns=[beer[0] for beer in ipa_rating_score],
index=[beer[0] for beer in ipa_rating_score])
ipa_est_pref_df = pd.DataFrame(ipa_est_pref,
columns=[beer[0] for beer in ipa_rating_score],
index=[beer[0] for beer in ipa_rating_score])
import seaborn as sns
C:\Anaconda2\lib\site-packages\matplotlib\__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
warnings.warn(self.msg_depr % (key, alt_key))
# User Rating Preferences
plt.figure(figsize=(12, 12))
plt.title('User Rating Preferences')
sns.heatmap(ipa_perc_sorted_df, annot=True, cbar=False, cmap="YlGnBu")
<matplotlib.axes._subplots.AxesSubplot at 0x8e8bd30>
# Maximum Liklihood Estimates
plt.figure(figsize=(12, 12))
plt.title('Maximum Liklihood Estimates')
sns.heatmap(ipa_est_pref_df.replace(0, np.nan), annot=True, cbar=False)
<matplotlib.axes._subplots.AxesSubplot at 0xa561b00>