import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# --- 2014 entrants per country ----------------------------------------------
# header=None: the first line of the file is read as data, so the column
# names are supplied here.
entrants2014 = pd.read_csv(
    'data/2014_participating_countries.txt',
    header=None,
    names=['country', 'entries']
)
print(entrants2014.dtypes)
print(entrants2014.shape)
entrants2014.head()  # notebook preview

entry_count = np.sum(entrants2014.entries)
entry_count  # notebook preview

# Each country's share of all entries.
entrants2014['perOfTotal'] = entrants2014.entries / entry_count

# Ten countries with the most entries. reset_index returns a new frame, so
# its result must be kept for the 0..9 index to actually take effect.
topTen2014 = (
    entrants2014.sort_values('entries', axis=0, ascending=False)
    .head(10)
    .reset_index(drop=True)
)
topTen2014  # notebook preview
np.sum(topTen2014.perOfTotal)  # combined share of the top ten

# Bar chart of the top ten. pyplot is used through the file-level `plt`
# import (as plt.figure already was) instead of the `sns.plt` alias, and
# x/y are passed by keyword so the call does not depend on seaborn's
# positional-argument order.
plt.figure(figsize=(12, 8))
plt.title("2014 World Beer Cup Top 10 Country Entrants")
sns.barplot(x=topTen2014.country, y=topTen2014.entries, palette="BuGn_d")
plt.xlabel("Country")
plt.ylabel("Entries")
plt.savefig('img/wbc_2014_top10entrants.png')
# --- 2014 awards per country ------------------------------------------------
# header=None: no header line is expected, column names are supplied here.
winners2014 = pd.read_csv(
    'data/2014_award_counts_country.txt',
    header=None,
    names=['country', 'awards']
)
print(winners2014.dtypes)  # printed before the share column is added

# Each country's share of all awards.
total_awards = np.sum(winners2014.awards)
winners2014['perOfTot'] = winners2014.awards / total_awards
print(winners2014.shape)

# Rank countries by award count once and reuse the ranking for the previews.
winners_ranked = winners2014.sort_values('awards', axis=0, ascending=False)
winners_ranked  # notebook preview
winners_top10 = winners_ranked.head(10)
winners_top10  # notebook preview
np.sum(winners_top10['perOfTot'])  # combined share of the top ten
# --- Award records ----------------------------------------------------------
WBC = pd.read_csv('data/wbc_PDF98_14c.csv')
WBC.tail()  # notebook preview

# Rows for the USA in 2014.
is_usa = WBC.country == "USA"
is_2014 = WBC.year == 2014
usa2014 = WBC[is_usa & is_2014]
print(usa2014.shape)
usa2014.head()  # notebook preview

# Award rows per state for the 2014 USA subset, largest first.
states2014 = usa2014.groupby('country_state')
awards_per_state = states2014['award'].count()
awards_per_state.sort_values(ascending=False).head()

# How often each value of the award column occurs across all records.
WBC['award'].value_counts()
# --- Awards per country, split by award type --------------------------------
# Count rows for every (country, award) pair ...
award_by_country = WBC.groupby(['country', 'award']).size()
award_by_country.head(12)  # notebook preview

# ... and pivot the award level into columns (one row per country).
award_by_country_df = award_by_country.unstack()
award_by_country_df.head()  # notebook preview

# Pairs that never occur come out of unstack() as NaN; treat them as zero.
award_by_country_df.fillna(0, inplace=True)
award_by_country_df.idxmax()  # country with the most of each award type

# Order countries by gold count. sort_values is the same API the earlier
# entrant/winner rankings use; the legacy DataFrame.sort no longer exists
# in current pandas.
award_by_country_df.sort_values('Gold', ascending=False, inplace=True)

# Fix the column order so the bars read Gold, Silver, Bronze.
award_by_country_df = award_by_country_df[['Gold', 'Silver', 'Bronze']]
award_by_country_df.head(10)  # notebook preview

# Top ten countries, first with a colormap, then with medal-like colours.
award_by_country_df[:10].plot(kind='bar', figsize=(12, 10), colormap='winter')
plot_colors = ['gold', 'silver', 'saddlebrown']
award_by_country_df[:10].plot(kind='bar', figsize=(12, 10), color=plot_colors)
# --- USA awards per year ----------------------------------------------------
# Number of award rows for the USA in each year.
usa_award_year = WBC[WBC.country == "USA"].groupby('year').size()
usa_award_year  # notebook preview

# Series.plot draws on the current axes when a figure is already open, so
# the chart gets its own figure instead of landing on the previous plot.
usa_fig, usa_ax = plt.subplots(figsize=(12, 6))
usa_award_year.plot(kind='bar', ax=usa_ax)
from IPython.core.pylabtools import figsize
# --- Illustration: Poisson probability mass functions -----------------------
plt.rcParams['axes.facecolor'] = '#F5F5F5'
figsize(11, 6)
import scipy.stats as stats

# Start a fresh figure (sized by the figsize() call above) so the bars are
# not added to whatever figure an earlier plot left open.
plt.figure()

a = np.arange(30)
poi = stats.poisson
lambdas = [2.5, 14.5]
colors = ["#6B6A96", "#943A45"]

# plot a Poisson distribution for each lambda
for lambda_, color in zip(lambdas, colors):
    # align="center" is stated explicitly because the default bar alignment
    # differs between matplotlib versions; the ticks below rely on it.
    # lw is given as a number rather than the string "1.5".
    plt.bar(a, poi.pmf(a, lambda_), color=color,
            label=r"$\lambda = %.1f$" % lambda_, alpha=0.50,
            edgecolor=color, lw=1.5, align="center")

plt.title("Probability mass function of a Poisson random variable")
plt.legend()
plt.grid()
plt.ylabel("probability of $n$")
# Bars are centred on the integers in `a`, so the ticks go there as well.
plt.xticks(a, a)
plt.xlabel("$n$")
plt.show()
# The code below uses the PyMC 2 API (pm.MCMC, @pm.deterministic); install with e.g.:
#   conda install -c https://conda.binstar.org/pymc pymc
import pymc as pm
import numpy as np
def make_model(data):
    '''
    Build the switch-point model used to estimate lambda_1, lambda_2 and tau.

    The observed counts are modelled as Poisson draws whose rate is
    lambda_1 for positions before tau and lambda_2 from tau onwards.

    data -- series of counts; must support len(), .mean() and .values.

    Returns a pm.Model holding the observed node, both rates and tau.
    '''
    n_obs = len(data)

    # Priors for the two rates: exponential, parameterised by 1 / mean(data).
    prior_rate = 1.0 / data.mean()
    lambda_1 = pm.Exponential("lambda_1", prior_rate)
    lambda_2 = pm.Exponential("lambda_2", prior_rate)

    # Prior for the switch point: discrete uniform between 0 and len(data).
    tau = pm.DiscreteUniform("tau", lower=0, upper=n_obs)

    @pm.deterministic
    def lambda_(tau=tau, lambda_1=lambda_1, lambda_2=lambda_2):
        '''Per-observation rate implied by tau and the two lambdas.'''
        rates = np.zeros(n_obs)
        rates[tau:] = lambda_2  # positions from tau onwards
        rates[:tau] = lambda_1  # positions before tau
        return rates

    # The observed counts are Poisson distributed with the rate vector above.
    observed_counts = pm.Poisson(
        "obs", lambda_, value=data.values, observed=True
    )

    return pm.Model([observed_counts, lambda_1, lambda_2, tau])
# --- Fit the switch-point model to the yearly USA award counts --------------
model = make_model(usa_award_year)
mcmc = pm.MCMC(model)
# 50000 iterations, a burn-in of 10000, thinning interval 1.
mcmc.sample(iter=50000, burn=10000, thin=1)

# Pull the complete trace of each variable out of the sampler.
lambda_1_samples, lambda_2_samples, tau_samples = (
    mcmc.trace(node_name)[:] for node_name in ('lambda_1', 'lambda_2', 'tau')
)
# --- Posterior plots for lambda_1, lambda_2 and tau -------------------------
# our data
data = usa_award_year
n_data = len(data)
years = data.index.values

figsize(10, 12)
# Start a fresh figure so the three panels are not added to whatever figure
# an earlier plot left open.
plt.figure()
xticks = np.arange(80, 200, 10)
yticks = np.arange(0.0, 1.1, 0.1)

# lambda_1
ax = plt.subplot(311)
ax.set_autoscaley_on(False)  # keep the y-limits set explicitly below
# density=True replaces the removed `normed=True` keyword of plt.hist.
plt.hist(lambda_1_samples, histtype='stepfilled', bins=30, alpha=0.65,
         label=r"posterior of $\lambda_1$", color="#6B6A96", density=True)
plt.legend(loc="upper left")
plt.xticks(xticks)
plt.ylim([0., .06])
plt.xlabel(r"$\lambda_1$ value")
plt.ylabel("probability")
plt.title(r"Posterior distributions of $\lambda_1,\;\lambda_2,\;\tau$")
plt.grid()

# lambda_2
ax = plt.subplot(312)
ax.set_autoscaley_on(False)  # keep the y-limits set explicitly below
plt.hist(lambda_2_samples, histtype='stepfilled', bins=30, alpha=0.65,
         label=r"posterior of $\lambda_2$", color="#943A45", density=True)
plt.legend(loc="upper left")
# NOTE(review): xticks stop at 190 while the x-range below is 150-250, so the
# upper part of this panel has no tick marks - confirm that this is intended.
plt.xticks(xticks)
plt.xlim([150, 250])
plt.ylim([0., .06])
plt.xlabel(r"$\lambda_2$ value")
plt.ylabel("probability")
plt.grid()

# tau
ax = plt.subplot(313)
# Equal weights that sum to one, so bar heights are fractions of the samples.
w = 1.0 / tau_samples.shape[0] * np.ones_like(tau_samples)
plt.hist(tau_samples, bins=n_data, alpha=1,
         label=r"posterior of $\tau$",
         color="#799631", weights=w, rwidth=2.)
plt.legend(loc="upper left")
# Label positions 0..n_data-1 with the year values from the data index.
plt.xticks(np.arange(n_data), years, rotation='vertical')
plt.yticks(yticks)
plt.xlabel(r"$\tau$ (year of Cup)")
plt.ylabel("probability")
plt.grid()
# --- Medals and events per year on a shared x-axis --------------------------
# Series.plot draws on the current axes when a figure is already open, so
# create dedicated axes instead of drawing onto the previous figure.
fig, ax = plt.subplots(figsize=(10, 5))

# plot medals awarded per games: number of award rows in each year
medals_year = WBC.groupby('year').size()
medals_year.plot(ax=ax, legend=True, label='Medals')
ax.set_ylabel('award')

# plot events per games: keep one row per (year, category_name) pair, so the
# per-year row count is the number of distinct categories in that year.
# `subset` is the current name of the keyword formerly spelled `cols`.
events_year = (
    WBC.drop_duplicates(subset=['year', 'category_name'])
    .groupby('year')
    .size()
)
# secondary_y=True puts the events line on a right-hand axis (ax.right_ax).
events_year.plot(ax=ax, secondary_y=True, legend=True, label='Events',
                 mark_right=False)
ax.right_ax.set_ylabel('Events')