import numpy as np
import pandas as pd
import csv
import codecs
from bs4 import BeautifulSoup
import re
import os
import socket
import urllib
import time
import requests
gtrend1 = pd.read_csv('timelines_brownTruckJoin/multiTimeline (1).csv', skiprows=2, index_col=0)
gtrend1.columns = [col.split(': ')[0].lower().replace(' ', '_') for col in gtrend1.columns]
gtrend1.head()
Day | millersburg_brewing | bells_brewery | rubicon_brewing_company_pub | the_bruery | brown_truck_brewery
---|---|---|---|---|---|
2016-07-01 | 12 | 12 | 0 | 37 | 0 |
2016-07-02 | 0 | 29 | 0 | 14 | 14 |
2016-07-03 | 0 | 15 | 0 | 25 | 0 |
2016-07-04 | 0 | 14 | 0 | 33 | 0 |
2016-07-05 | 0 | 13 | 0 | 38 | 0 |
gtrend2 = pd.read_csv('timelines_brownTruckJoin/multiTimeline (2).csv', skiprows=2, index_col=0)
gtrend2.columns = [col.split(': ')[0].lower().replace(' ', '_') for col in gtrend2.columns]
gtrend2.head()
Day | brown_truck_brewery | crank_arm_brewing | el_segundo_brewing | iowa_brewing | figueroa_mountain_brewing
---|---|---|---|---|---|
2016-07-01 | 0 | 0 | 31 | 31 | 0 |
2016-07-02 | 36 | 0 | 0 | 0 | 36 |
2016-07-03 | 0 | 0 | 74 | 37 | 0 |
2016-07-04 | 0 | 0 | 35 | 71 | 0 |
2016-07-05 | 0 | 0 | 0 | 0 | 0 |
def import_gtrend(file_path):
    gtrend = pd.read_csv(file_path, skiprows=2, index_col=0)
    gtrend.columns = [col.split(': ')[0].lower().replace(' ', '_') for col in gtrend.columns]
    return gtrend
import_gtrend('timelines_brownTruckJoin/multiTimeline (3).csv').head()
Day | brown_truck_brewery | bns_brewing_&_distilling | vintage_brewing | smog_city_brewing | lynnwood_brewing_concern
---|---|---|---|---|---|
2016-07-01 | 0 | 0 | 0 | 0 | 30 |
2016-07-02 | 34 | 0 | 34 | 0 | 0 |
2016-07-03 | 0 | 0 | 0 | 0 | 35 |
2016-07-04 | 34 | 0 | 0 | 34 | 0 |
2016-07-05 | 0 | 0 | 60 | 0 | 0 |
pd.concat([gtrend1.brown_truck_brewery, gtrend2.brown_truck_brewery,
np.isfinite(gtrend1.brown_truck_brewery/gtrend2.brown_truck_brewery)
], axis=1).head()
Day | brown_truck_brewery | brown_truck_brewery | brown_truck_brewery
---|---|---|---|
2016-07-01 | 0 | 0 | False |
2016-07-02 | 14 | 36 | True |
2016-07-03 | 0 | 0 | False |
2016-07-04 | 0 | 0 | False |
2016-07-05 | 0 | 0 | False |
np.mean((gtrend1.brown_truck_brewery/gtrend2.brown_truck_brewery)[np.isfinite(gtrend1.brown_truck_brewery/gtrend2.brown_truck_brewery)])
0.4035917946776314
def gtrend_norm(df_1, df_2, col='brown_truck_brewery'):
    # mean ratio of the shared column, ignoring days where it is 0/0 or x/0
    ratio = df_1[col] / df_2[col]
    norm_mult = np.mean(ratio[np.isfinite(ratio)])
    return norm_mult
gtrend_norm(gtrend1, gtrend2)
0.4035917946776314
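quick sanity check (just a sketch, using the frames already loaded above): after rescaling by this multiplier, gtrend2's brown_truck_brewery series should sit on roughly the same scale as gtrend1's on the days where both are non-zero
pd.concat([gtrend1.brown_truck_brewery,
           gtrend2.brown_truck_brewery * gtrend_norm(gtrend1, gtrend2)],
          axis=1, keys=['node', 'rescaled']).head()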
def df_norm(df_2, norm_mult):
    # scale all but the first column, assumed to be the shared search term
    return df_2.iloc[:, 1:] * norm_mult
df_norm(gtrend2, 0.4036).head()
Day | crank_arm_brewing | el_segundo_brewing | iowa_brewing | figueroa_mountain_brewing
---|---|---|---|---|
2016-07-01 | 0.0 | 12.5116 | 12.5116 | 0.0000 |
2016-07-02 | 0.0 | 0.0000 | 0.0000 | 14.5296 |
2016-07-03 | 0.0 | 29.8664 | 14.9332 | 0.0000 |
2016-07-04 | 0.0 | 14.1260 | 28.6556 | 0.0000 |
2016-07-05 | 0.0 | 0.0000 | 0.0000 | 0.0000 |
gtrend1.join(gtrend2.iloc[:,1:] * gtrend_norm(gtrend1, gtrend2)).head()
Day | millersburg_brewing | bells_brewery | rubicon_brewing_company_pub | the_bruery | brown_truck_brewery | crank_arm_brewing | el_segundo_brewing | iowa_brewing | figueroa_mountain_brewing
---|---|---|---|---|---|---|---|---|---|
2016-07-01 | 12 | 12 | 0 | 37 | 0 | 0.0 | 12.511346 | 12.511346 | 0.000000 |
2016-07-02 | 0 | 29 | 0 | 14 | 14 | 0.0 | 0.000000 | 0.000000 | 14.529305 |
2016-07-03 | 0 | 15 | 0 | 25 | 0 | 0.0 | 29.865793 | 14.932896 | 0.000000 |
2016-07-04 | 0 | 14 | 0 | 33 | 0 | 0.0 | 14.125713 | 28.655017 | 0.000000 |
2016-07-05 | 0 | 13 | 0 | 38 | 0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
os.listdir('timelines_brownTruckJoin/')
['multiTimeline (1).csv',
'multiTimeline (10).csv',
'multiTimeline (11).csv',
'multiTimeline (12).csv',
'multiTimeline (13).csv',
'multiTimeline (14).csv',
'multiTimeline (15).csv',
'multiTimeline (16).csv',
'multiTimeline (17).csv',
'multiTimeline (18).csv',
'multiTimeline (19).csv',
'multiTimeline (2).csv',
'multiTimeline (20).csv',
'multiTimeline (21).csv',
'multiTimeline (22).csv',
'multiTimeline (23).csv',
'multiTimeline (24).csv',
'multiTimeline (25).csv',
'multiTimeline (26).csv',
'multiTimeline (27).csv',
'multiTimeline (28).csv',
'multiTimeline (29).csv',
'multiTimeline (3).csv',
'multiTimeline (30).csv',
'multiTimeline (31).csv',
'multiTimeline (32).csv',
'multiTimeline (33).csv',
'multiTimeline (34).csv',
'multiTimeline (35).csv',
'multiTimeline (36).csv',
'multiTimeline (37).csv',
'multiTimeline (38).csv',
'multiTimeline (39).csv',
'multiTimeline (4).csv',
'multiTimeline (40).csv',
'multiTimeline (41).csv',
'multiTimeline (42).csv',
'multiTimeline (43).csv',
'multiTimeline (44).csv',
'multiTimeline (45).csv',
'multiTimeline (46).csv',
'multiTimeline (47).csv',
'multiTimeline (48).csv',
'multiTimeline (49).csv',
'multiTimeline (5).csv',
'multiTimeline (50).csv',
'multiTimeline (51).csv',
'multiTimeline (52).csv',
'multiTimeline (53).csv',
'multiTimeline (54).csv',
'multiTimeline (55).csv',
'multiTimeline (56).csv',
'multiTimeline (57).csv',
'multiTimeline (58).csv',
'multiTimeline (59).csv',
'multiTimeline (6).csv',
'multiTimeline (60).csv',
'multiTimeline (61).csv',
'multiTimeline (62).csv',
'multiTimeline (63).csv',
'multiTimeline (7).csv',
'multiTimeline (8).csv',
'multiTimeline (9).csv']
let’s put it together
- load files in directory
- hold first file as ‘node’
- cycle through other files
- match on common term/brewery
- normalize the cycled file
- join to ‘node’
- suffix in case of dupes; not all bad, since duplicate columns allow cross-file comparisons (they should match or be close in value)
- repeat for remaining files
- return wide df
dir_trends = 'timelines_brownTruckJoin/'
node_file = os.listdir(dir_trends)[0]
def gtrend_join(file_dir=dir_trends, node_file=node_file):
    # hold the first file as the node, then normalize every other file against
    # it (via the shared brown_truck_brewery column) and join it onto the node
    node_df = import_gtrend(file_dir + node_file)
    for i in os.listdir(file_dir):
        if i == node_file:
            continue
        temp_file = import_gtrend(file_dir + i)
        node_df = node_df.join(df_norm(temp_file, gtrend_norm(node_df, temp_file)),
                               lsuffix='', rsuffix='_y')
        # equivalent with merge:
        # node_df = pd.merge(node_df, df_norm(temp_file, gtrend_norm(node_df, temp_file)),
        #                    left_index=True, right_index=True, suffixes=('', '_y'))
    return node_df
combined_trends = gtrend_join()
print(combined_trends.shape)
combined_trends.head()
(123, 252)
Day | millersburg_brewing | bells_brewery | rubicon_brewing_company_pub | the_bruery | brown_truck_brewery | great_notion_brewing | max_lagers_wood | 10_barrel_brewing | boise_brewing | societe_brewing | ... | coors_brewing | craft_brew_alliance_y | carver_brewing | aeronaut_brewing | pelican_brewing | ska_brewing | bootstrap_brewing | central_coast_brewing | maplewood_brewery | karl_strauss_brewing_y
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2016-07-01 | 12 | 12 | 0 | 37 | 0 | 20.007564 | 0.0 | 9.733410 | 9.733410 | 9.842289 | ... | 0.000000 | 9.26641 | 9.774888 | 14.662332 | 9.774888 | 24.881533 | 0.0 | 0.000000 | 0.000000 | 9.839457 |
2016-07-02 | 0 | 29 | 0 | 14 | 14 | 0.000000 | 0.0 | 30.281719 | 11.355645 | 11.482671 | ... | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 11.107827 | 11.552140 | 0.0 | 0.000000 | 0.000000 | 11.181201 |
2016-07-03 | 0 | 15 | 0 | 25 | 0 | 11.896390 | 0.0 | 35.689169 | 11.896390 | 0.000000 | ... | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 11.552140 | 11.552140 | 0.0 | 11.628449 | 0.000000 | 11.628449 |
2016-07-04 | 0 | 14 | 0 | 33 | 0 | 0.000000 | 0.0 | 22.711289 | 11.355645 | 0.000000 | ... | 0.000000 | 0.00000 | 0.000000 | 22.659967 | 11.107827 | 11.107827 | 0.0 | 0.000000 | 11.181201 | 11.181201 |
2016-07-05 | 0 | 13 | 0 | 38 | 0 | 10.274155 | 0.0 | 25.415014 | 10.274155 | 10.252385 | ... | 18.918921 | 0.00000 | 0.000000 | 0.000000 | 10.219201 | 15.106645 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
5 rows × 252 columns
from collections import Counter
cnt = Counter()
for i in os.listdir(dir_trends):
    temp_file = import_gtrend(dir_trends + i)
    for col in temp_file.columns:
        cnt[col] += 1
[brw for brw in cnt if cnt[brw] > 1]
['craft_brew_alliance',
'figueroa_mountain_brewing',
'ram/big_horn_brewery',
'lynnwood_brewing_concern',
'echo_brewing',
'bjs_restaurant_&_brewery',
'devils_backbone_brewing',
'brown_truck_brewery',
'karl_strauss_brewing']
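quick check (a sketch) that the duplicated columns really do land close together after normalization, using two pairs the combined column list above shows with a '_y' twin
for brw in ['craft_brew_alliance', 'karl_strauss_brewing']:
    if brw in combined_trends.columns and brw + '_y' in combined_trends.columns:
        print(brw, combined_trends[brw].corr(combined_trends[brw + '_y']))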
non_dupes = [brw for brw in combined_trends.columns.tolist() if not brw.endswith('_y')]
combined_trends = combined_trends[non_dupes]
print(combined_trends.shape)
combined_trends.head()
(123, 242)
Day | millersburg_brewing | bells_brewery | rubicon_brewing_company_pub | the_bruery | brown_truck_brewery | great_notion_brewing | max_lagers_wood | 10_barrel_brewing | boise_brewing | societe_brewing | ... | riip_beer | knee_deep_brewing | coors_brewing | carver_brewing | aeronaut_brewing | pelican_brewing | ska_brewing | bootstrap_brewing | central_coast_brewing | maplewood_brewery
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2016-07-01 | 12 | 12 | 0 | 37 | 0 | 20.007564 | 0.0 | 9.733410 | 9.733410 | 9.842289 | ... | 0.0 | 13.899615 | 0.000000 | 9.774888 | 14.662332 | 9.774888 | 24.881533 | 0.0 | 0.000000 | 0.000000 |
2016-07-02 | 0 | 29 | 0 | 14 | 14 | 0.000000 | 0.0 | 30.281719 | 11.355645 | 11.482671 | ... | 0.0 | 21.621624 | 0.000000 | 0.000000 | 0.000000 | 11.107827 | 11.552140 | 0.0 | 0.000000 | 0.000000 |
2016-07-03 | 0 | 15 | 0 | 25 | 0 | 11.896390 | 0.0 | 35.689169 | 11.896390 | 0.000000 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 11.552140 | 11.552140 | 0.0 | 11.628449 | 0.000000 |
2016-07-04 | 0 | 14 | 0 | 33 | 0 | 0.000000 | 0.0 | 22.711289 | 11.355645 | 0.000000 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 22.659967 | 11.107827 | 11.107827 | 0.0 | 0.000000 | 11.181201 |
2016-07-05 | 0 | 13 | 0 | 38 | 0 | 10.274155 | 0.0 | 25.415014 | 10.274155 | 10.252385 | ... | 0.0 | 0.000000 | 18.918921 | 0.000000 | 0.000000 | 10.219201 | 15.106645 | 0.0 | 0.000000 | 0.000000 |
5 rows × 242 columns
# combined_trends.to_csv('timelines_combined/combined_trends.csv', sep=';')
look at trend diff for date split
- pre.period <- as.Date(c("2016-07-01", "2016-10-05"))
- post.period <- as.Date(c("2016-10-09", "2016-10-31"))
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline
pre_gabf16 = combined_trends.index <= '2016-10-05'
post_gabf16 = combined_trends.index >= '2016-10-09'
pre_post_avgs = combined_trends[pre_gabf16].mean().to_frame(name='pre_gabf16').join(combined_trends[post_gabf16].mean().to_frame(name='post_gabf16'))
pre_post_avgs.head()
brewery | pre_gabf16 | post_gabf16
---|---|---|
millersburg_brewing | 2.525773 | 2.086957 |
bells_brewery | 18.896907 | 15.521739 |
rubicon_brewing_company_pub | 0.000000 | 0.000000 |
the_bruery | 28.855670 | 33.478261 |
brown_truck_brewery | 3.278351 | 9.478261 |
pre_post_avgs['dif_gabf16_norm'] = pre_post_avgs.post_gabf16 / (pre_post_avgs.pre_gabf16 + 0.000001)  # small epsilon avoids division by zero
pre_post_avgs.head()
brewery | pre_gabf16 | post_gabf16 | dif_gabf16_norm
---|---|---|---|
millersburg_brewing | 2.525773 | 2.086957 | 0.826264 |
bells_brewery | 18.896907 | 15.521739 | 0.821390 |
rubicon_brewing_company_pub | 0.000000 | 0.000000 | 0.000000 |
the_bruery | 28.855670 | 33.478261 | 1.160197 |
brown_truck_brewery | 3.278351 | 9.478261 | 2.891167 |
pre_post_avgs.dif_gabf16_norm.describe()
count 242.000000
mean 0.893139
std 0.837155
min 0.000000
25% 0.432029
50% 0.843814
75% 1.115964
max 5.504798
Name: dif_gabf16_norm, dtype: float64
plt.figure(figsize=(12, 6))
plt.hist(pre_post_avgs.dif_gabf16_norm)
plt.title('Search Engagement Ratio - Post GABF Medal (2016)')
plt.ylabel('Brewery Count')
plt.boxplot(pre_post_avgs.dif_gabf16_norm)
time series of non-zero averages
# 1/6 of breweries have NO Google search data
np.sum(combined_trends.mean(axis=0) == 0.0)
40
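for reference, a quick way to list those silent breweries (just a sketch); dividing these all-zero columns by their zero mean is also what produces the NaN columns in the indexed frame below
col_means = combined_trends.mean(axis=0)
zero_search = col_means[col_means == 0.0].index.tolist()
zero_search[:5]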
mean_idx_trends = combined_trends.divide(combined_trends.mean(axis=0))
mean_idx_trends.head()
Day | millersburg_brewing | bells_brewery | rubicon_brewing_company_pub | the_bruery | brown_truck_brewery | great_notion_brewing | max_lagers_wood | 10_barrel_brewing | boise_brewing | societe_brewing | ... | riip_beer | knee_deep_brewing | coors_brewing | carver_brewing | aeronaut_brewing | pelican_brewing | ska_brewing | bootstrap_brewing | central_coast_brewing | maplewood_brewery
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2016-07-01 | 4.641509 | 0.656876 | NaN | 1.239042 | 0.000000 | 2.913572 | NaN | 0.447997 | 1.112004 | 1.101082 | ... | 0.0 | 1.673469 | 0.000000 | 4.609881 | 1.706179 | 1.210197 | 2.085377 | 0.0 | 0.000000 | 0.0000 |
2016-07-02 | 0.000000 | 1.587450 | NaN | 0.468827 | 3.015762 | 0.000000 | NaN | 1.393768 | 1.297338 | 1.284595 | ... | 0.0 | 2.603175 | 0.000000 | 0.000000 | 0.000000 | 1.375224 | 0.968211 | 0.0 | 0.000000 | 0.0000 |
2016-07-03 | 0.000000 | 0.821095 | NaN | 0.837190 | 0.000000 | 1.732394 | NaN | 1.642655 | 1.359116 | 0.000000 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.430233 | 0.968211 | 0.0 | 6.580247 | 0.0000 |
2016-07-04 | 0.000000 | 0.766355 | NaN | 1.105091 | 0.000000 | 0.000000 | NaN | 1.045326 | 1.297338 | 0.000000 | ... | 0.0 | 0.000000 | 0.000000 | 0.000000 | 2.636822 | 1.375224 | 0.930972 | 0.0 | 0.000000 | 7.6875 |
2016-07-05 | 0.000000 | 0.711615 | NaN | 1.272529 | 0.000000 | 1.496159 | NaN | 1.169769 | 1.173782 | 1.146960 | ... | 0.0 | 0.000000 | 1.806655 | 0.000000 | 0.000000 | 1.265206 | 1.266122 | 0.0 | 0.000000 | 0.0000 |
5 rows × 242 columns
mean_idx_trends.plot(legend=False, figsize=(16, 8))
plt.title('Brewery Search Hits (indexed)')
plt.ylabel('relative to own avg')
mean_idx_trends[pre_gabf16].max().sort_values(ascending=False)[:10]
two_kilts_brewing 123.000000
12degree_brewing 61.500000
ghostfish_brewing_company 60.701299
high_heel_brewing 49.785714
zwanzigz_brewing 27.954545
auburn_alehouse 25.408451
black_tooth_brewing 25.408451
logsdon_farmhouse_ales 23.697248
grimm_brothers_brewhouse 23.455814
taps_fish_house_and_brewery 22.932203
dtype: float64
mean_idx_trends[post_gabf16].max().sort_values(ascending=False)[:10]
12degree_brewing 61.500000
zwanzigz_brewing 31.448864
logsdon_farmhouse_ales 22.004587
taps_fish_house_and_brewery 21.889831
pizza_port_san_clemente 20.856522
auburn_alehouse 19.056338
morgan_territory_brewing 17.571429
solid_rock_brewing 17.253394
gibbs_hundred_brewing 16.400000
ardent_craft_ales 15.216495
dtype: float64
IQR
(mean_idx_trends[pre_gabf16].sum(axis=1)).describe()
count 97.000000
mean 201.333917
std 48.543656
min 127.849023
25% 160.680484
50% 191.979998
75% 231.980401
max 350.348231
dtype: float64
(mean_idx_trends[post_gabf16].sum(axis=1)).describe()
count 23.000000
mean 196.012536
std 39.039190
min 116.992323
25% 160.378756
50% 201.091106
75% 231.406028
max 252.729727
dtype: float64
import brewery type, state, and medal count
gabf16_type = pd.read_excel('comp_breweries/gabf16_type.xlsx', index_col=0)
print(gabf16_type.shape)
gabf16_type.tail()
(303, 16)
index | medal | beer_name | brewery | city | state | category | year | medal_cnt | brewery_name2 | brewery_type | name_brewassoc | name_Gtrends | FAM | PARENT | DUPE | RETAIN
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
302 | Silver | Mélange A Trois | Nebraska Brewing Co. - Papillion | Papillion | NE | Wood- and Barrel-Aged Strong Beer | 2016 | 2 | nebraska brew | Brewpub | Nebraska Brewing Co | nebraska_brewing | False | False | False | True |
303 | Bronze | 15th Anniversary Ale | Island Brewing Co. | Carpinteria | CA | Wood- and Barrel-Aged Strong Beer | 2016 | 1 | island brew | Micro | Island Brewing Co | island_brewing | False | False | False | True |
304 | Gold | Barrel-Aged Darkness | Surly Brewing Co. | Brooklyn Center | MN | Wood- and Barrel-Aged Strong Stout | 2016 | 1 | surly brew | Regional | Surly Brewing Company | surly_brewing | False | False | False | True |
305 | Silver | The Event Horizon | Olde Hickory Brewery | Hickory | NC | Wood- and Barrel-Aged Strong Stout | 2016 | 2 | olde hickory brew | Micro | Olde Hickory Brewery - Production | olde_hickory_brewery | False | False | False | True |
306 | Bronze | Little Nonsense | Verboten Brewing | Loveland | CO | Wood- and Barrel-Aged Strong Stout | 2016 | 1 | verboten brew | Micro | Verboten Brewing | verboten_brewing | False | False | False | True |
gabf16_new = gabf16_type[gabf16_type.RETAIN == True]
print(gabf16_new.shape)
gabf16_new.tail()
(286, 16)
index | medal | beer_name | brewery | city | state | category | year | medal_cnt | brewery_name2 | brewery_type | name_brewassoc | name_Gtrends | FAM | PARENT | DUPE | RETAIN
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
302 | Silver | Mélange A Trois | Nebraska Brewing Co. - Papillion | Papillion | NE | Wood- and Barrel-Aged Strong Beer | 2016 | 2 | nebraska brew | Brewpub | Nebraska Brewing Co | nebraska_brewing | False | False | False | True |
303 | Bronze | 15th Anniversary Ale | Island Brewing Co. | Carpinteria | CA | Wood- and Barrel-Aged Strong Beer | 2016 | 1 | island brew | Micro | Island Brewing Co | island_brewing | False | False | False | True |
304 | Gold | Barrel-Aged Darkness | Surly Brewing Co. | Brooklyn Center | MN | Wood- and Barrel-Aged Strong Stout | 2016 | 1 | surly brew | Regional | Surly Brewing Company | surly_brewing | False | False | False | True |
305 | Silver | The Event Horizon | Olde Hickory Brewery | Hickory | NC | Wood- and Barrel-Aged Strong Stout | 2016 | 2 | olde hickory brew | Micro | Olde Hickory Brewery - Production | olde_hickory_brewery | False | False | False | True |
306 | Bronze | Little Nonsense | Verboten Brewing | Loveland | CO | Wood- and Barrel-Aged Strong Stout | 2016 | 1 | verboten brew | Micro | Verboten Brewing | verboten_brewing | False | False | False | True |
gabf16_new[['name_Gtrends', 'state', 'brewery_type', 'medal_cnt']].head()
index | name_Gtrends | state | brewery_type | medal_cnt
---|---|---|---|---|
0 | millersburg_brewing | OH | Brewpub | 1 |
1 | bells_brewery | MI | Regional | 1 |
2 | rubicon_brewing_company_pub | CA | n/a | 1 |
3 | the_bruery | CA | Micro | 2 |
4 | brown_truck_brewery | NC | Micro | 3 |
combine craft brewery data
gabf16_eda = pd.merge(gabf16_new[['brewery', 'name_Gtrends', 'state', 'brewery_type', 'medal_cnt']], pre_post_avgs,
left_on='name_Gtrends', right_index=True).drop_duplicates()
print(gabf16_eda.shape)
gabf16_eda.head()
(249, 8)
index | brewery | name_Gtrends | state | brewery_type | medal_cnt | pre_gabf16 | post_gabf16 | dif_gabf16_norm
---|---|---|---|---|---|---|---|---|
0 | Millersburg Brewing | millersburg_brewing | OH | Brewpub | 1 | 2.525773 | 2.086957 | 0.826264 |
1 | Bell's Brewery, Inc | bells_brewery | MI | Regional | 1 | 18.896907 | 15.521739 | 0.821390 |
2 | Rubicon Brewing Company Pub | rubicon_brewing_company_pub | CA | n/a | 1 | 0.000000 | 0.000000 | 0.000000 |
3 | The Bruery | the_bruery | CA | Micro | 2 | 28.855670 | 33.478261 | 1.160197 |
4 | Brown Truck Brewery | brown_truck_brewery | NC | Micro | 3 | 3.278351 | 9.478261 | 2.891167 |
gabf16_eda['multiple_wins'] = 0
gabf16_eda.loc[gabf16_eda.medal_cnt>1, 'multiple_wins'] = 1
gabf16_eda.multiple_wins.value_counts()
0 220
1 29
Name: multiple_wins, dtype: int64
gabf16_eda.pivot_table(index='brewery_type', columns='multiple_wins', values='brewery', aggfunc='count')
brewery_type | multiple_wins=0 | multiple_wins=1
---|---|---|
Brewpub | 59.0 | 10.0 |
Contract | 2.0 | NaN |
Large | 6.0 | 2.0 |
Micro | 127.0 | 13.0 |
Proprietor | 1.0 | NaN |
Regional | 24.0 | 4.0 |
n/a | 1.0 | NaN |
gabf16_eda.pivot_table(index='brewery_type', columns='multiple_wins', values='dif_gabf16_norm', aggfunc='mean')
brewery_type | multiple_wins=0 | multiple_wins=1
---|---|---|
Brewpub | 0.730805 | 0.978195 |
Contract | 0.519592 | NaN |
Large | 0.912032 | 1.145409 |
Micro | 0.940714 | 1.083965 |
Proprietor | 2.194376 | NaN |
Regional | 0.774173 | 1.130548 |
n/a | 0.000000 | NaN |
brewpub, micro, & regional (single and multi wins)
gabf16_eda[(gabf16_eda.multiple_wins==0) & (gabf16_eda.brewery_type=='Brewpub')].nlargest(3,'dif_gabf16_norm')
index | brewery | name_Gtrends | state | brewery_type | medal_cnt | pre_gabf16 | post_gabf16 | dif_gabf16_norm | multiple_wins
---|---|---|---|---|---|---|---|---|---|
86 | 12Degree Brewing | 12degree_brewing | CO | Brewpub | 1 | 0.102369 | 0.431731 | 4.217350 | 0 |
301 | Taps Fish House and Brewery - Corona | taps_fish_house_and_brewery | CA | Brewpub | 1 | 0.412732 | 0.859309 | 2.081998 | 0 |
122 | Pizza Port San Clemente | pizza_port_san_clemente | CA | Brewpub | 1 | 0.472705 | 0.926952 | 1.960948 | 0 |
gabf16_eda[(gabf16_eda.multiple_wins==0) & (gabf16_eda.brewery_type=='Micro')].nlargest(3,'dif_gabf16_norm')
index | brewery | name_Gtrends | state | brewery_type | medal_cnt | pre_gabf16 | post_gabf16 | dif_gabf16_norm | multiple_wins
---|---|---|---|---|---|---|---|---|---|
202 | Hardywood Park Craft Brewery | hardywood_park_craft_brewery | VA | Micro | 1 | 0.785911 | 4.326285 | 5.504798 | 0 |
101 | Ardent Craft Ales | ardent_craft_ales | VA | Micro | 1 | 0.456800 | 2.288627 | 5.010123 | 0 |
80 | Logsdon Farmhouse Ales | logsdon_farmhouse_ales | OR | Micro | 1 | 0.337216 | 1.345986 | 3.991448 | 0 |
gabf16_eda[(gabf16_eda.multiple_wins==0) & (gabf16_eda.brewery_type=='Regional')].nlargest(3,'dif_gabf16_norm')
index | brewery | name_Gtrends | state | brewery_type | medal_cnt | pre_gabf16 | post_gabf16 | dif_gabf16_norm | multiple_wins
---|---|---|---|---|---|---|---|---|---|
81 | Two Roads Brewing Co. | two_roads_brewing | CT | Regional | 1 | 9.442060 | 12.075778 | 1.278935 | 0 |
304 | Surly Brewing Co. | surly_brewing | MN | Regional | 1 | 15.677050 | 17.159652 | 1.094571 | 0 |
79 | Sun King Brewing Co. | sun_king_brewing | IN | Regional | 1 | 6.952078 | 7.491808 | 1.077636 | 0 |
gabf16_eda[(gabf16_eda.multiple_wins==1) & (gabf16_eda.brewery_type=='Brewpub')].nlargest(3,'dif_gabf16_norm')
index | brewery | name_Gtrends | state | brewery_type | medal_cnt | pre_gabf16 | post_gabf16 | dif_gabf16_norm | multiple_wins
---|---|---|---|---|---|---|---|---|---|
109 | ZwanzigZ Brewing | zwanzigz_brewing | IN | Brewpub | 2 | 0.117996 | 0.559839 | 4.744525 | 1 |
15 | Uberbrew | uberbrew | MT | Brewpub | 4 | 1.152701 | 1.283407 | 1.113390 | 1 |
76 | Nebraska Brewing Co. - Papillion | nebraska_brewing | NE | Brewpub | 2 | 12.617107 | 12.913848 | 1.023519 | 1 |
gabf16_eda[(gabf16_eda.multiple_wins==1) & (gabf16_eda.brewery_type=='Micro')].nlargest(3,'dif_gabf16_norm')
index | brewery | name_Gtrends | state | brewery_type | medal_cnt | pre_gabf16 | post_gabf16 | dif_gabf16_norm | multiple_wins
---|---|---|---|---|---|---|---|---|---|
71 | 2SP Brewing Co. | 2sp_brewing | PA | Micro | 2 | 1.755364 | 6.395837 | 3.643594 | 1 |
4 | Brown Truck Brewery | brown_truck_brewery | NC | Micro | 3 | 3.278351 | 9.478261 | 2.891167 | 1 |
275 | Neshaminy Creek Brewing Co. | neshaminy_creek_brewing | PA | Micro | 2 | 3.785876 | 4.727902 | 1.248826 | 1 |
gabf16_eda[(gabf16_eda.multiple_wins==1) & (gabf16_eda.brewery_type=='Regional')].nlargest(3,'dif_gabf16_norm')
index | brewery | name_Gtrends | state | brewery_type | medal_cnt | pre_gabf16 | post_gabf16 | dif_gabf16_norm | multiple_wins
---|---|---|---|---|---|---|---|---|---|
26 | Georgetown Brewing Co. | georgetown_brewing | WA | Regional | 2 | 2.236994 | 4.012087 | 1.793516 | 1 |
14 | Fat Head's Brewery | fat_heads_brewery | OH | Regional | 2 | 3.218342 | 3.636321 | 1.129874 | 1 |
41 | Karl Strauss Brewing Co. - San Diego | karl_strauss_brewing | CA | Regional | 3 | 6.549709 | 5.653187 | 0.863120 | 1 |
select a few from above (collected into a list below)
- definitely want a CA, CO, OR, & WA brewery… so:
- CA: taps_fish_house_and_brewery (brewpub, 1)
- CO: 12degree_brewing (brewpub, 1)
- OR: logsdon_farmhouse_ales (micro, 1)
- WA: georgetown_brewing (regional, 2)
- some complements:
- uberbrew (MT; brewpub, 4 wins!)
- hardywood_park_craft_brewery (VA; micro, 1; largest post-GABF uptick!)
- brown_truck_brewery (NC; micro, 3; the joining brewery, why not?)
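and the picks as a plain python list for later filtering (a sketch; the strings are the name_Gtrends identifiers used throughout)
selected_winners = ['taps_fish_house_and_brewery', '12degree_brewing',
                    'logsdon_farmhouse_ales', 'georgetown_brewing',
                    'uberbrew', 'hardywood_park_craft_brewery',
                    'brown_truck_brewery']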
find comparable (type & state) non-GABF16 winners
brw_assoc = pd.read_csv('comp_breweries/brw_assoc_usa_gabf16.csv', sep=';', index_col=0)
print(brw_assoc.shape)
brw_assoc.head()
(7429, 12)
index | name | brewery_type | address | city | state | z_code | ownership | telephone | url | brewery_name2 | name_short | gabf16_win
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Bradley Farm / RB Brew, LLC | Micro | 317 Springtown Rd | New Paltz | NY | 12561-3020 | NaN | (845) 255-8769 | www.raybradleyfarm.com | bradley farm / rb brew | Bradley Farm | 0 |
1 | Kona Brewing Co | Regional | 74-5612 Pawai Pl | Kailua Kona | HI | 96740- | Greater than 25% ownership by Anheuser-Busch I... | (808) 334-1133 | www.konabrewingco.com | kona brew | Kona Brewing | 1 |
2 | (405) Brewing Co | Micro | 1716 Topeka St | Norman | OK | 73069-8224 | NaN | (405) 816-0490 | www.405brewing.com | (405) brew | (405) Brewing | 0 |
3 | (512) Brewing Co | Micro | 407 Radam Ln Ste F200 | Austin | TX | 78745-1197 | NaN | (512) 921-1545 | www.512brewing.com | (512) brew | (512) Brewing | 0 |
4 | 10 Barrel Brewing Co | Large | 62970 18th St | Bend | OR | 97701-9847 | Greater than 25% ownership by Anheuser-Busch I... | (541) 585-1007 | www.10barrel.com | 10 barrel brew | 10 Barrel | 1 |
brewery_combos = [('CA', 'Brewpub'), ('CO', 'Brewpub'), ('OR', 'Micro'), ('WA', 'Regional'),
('MT', 'Brewpub'), ('VA', 'Micro'), ('NC', 'Micro')
]
np.random.seed(8)
for combo in brewery_combos:
    candidates = brw_assoc[(brw_assoc.state == combo[0]) &
                           (brw_assoc.brewery_type == combo[1]) &
                           (brw_assoc.gabf16_win == 0)]['name']
    print(combo)
    print(np.random.choice(candidates, 4, replace=False), '\n')
('CA', 'Brewpub')
['Downtown Joes Brewery and Restaurant' 'Taplands Brewery'
 'Woods Bar & Brewery' "Miner's Alley Brewing Company"]
('CO', 'Brewpub')
['Oskar Blues Brewery - Lyons' 'Whistle Pig Brewing Company'
'Brix Taphouse and Brewery' 'Moonlight Pizza']
('OR', 'Micro')
['Mazama Brewing Co' 'Siuslaw Brewing'
"Krauski's Brewskis / The Hoppy Brewer" 'Red Ox Brewing']
('WA', 'Regional')
['Fremont Brewing Co' 'Redhook Brewery' 'Mac and Jacks Brewery Inc'
'Iron Horse Brewery']
('MT', 'Brewpub')
['Bridger Brewing Company' 'Cabinet Mountain Brewing Co'
'The Front Brewing Company' 'Backslope Brewing ']
('VA', 'Micro')
['Lickinghole Creek Craft Brewery' 'Barrel Oak Farm Taphouse'
'Sunken City Brewing Co' 'New District Brewing Company']
('NC', 'Micro')
['Good Hops Brewing LLC' 'Preyer Brewing Company'
'Fortnight Brewing Company' 'Burial Beer Co Forestry Camp']
some last minute eda by segment
single_medals = gabf16_new[gabf16_new.medal_cnt==1][['brewery', 'name_Gtrends', 'state', 'brewery_type', 'medal', 'category']]
single_medals.head()
index | brewery | name_Gtrends | state | brewery_type | medal | category
---|---|---|---|---|---|---|
0 | Millersburg Brewing | millersburg_brewing | OH | Brewpub | Gold | Aged Beer |
1 | Bell's Brewery, Inc | bells_brewery | MI | Regional | Silver | Aged Beer |
2 | Rubicon Brewing Company Pub | rubicon_brewing_company_pub | CA | n/a | Bronze | Aged Beer |
5 | Crank Arm Brewing Co. | crank_arm_brewing | NC | Micro | Bronze | American-Belgo-Style Ale |
6 | El Segundo Brewing Co. | el_segundo_brewing | CA | Micro | Gold | American-Style Amber Lager or Dark Lager |
gold = [s for s in single_medals[single_medals.medal=='Gold']['name_Gtrends'].tolist() if s != u'n/a']
mean_idx_trends[gold].plot(legend=False, figsize=(16, 8))
plt.title('Gold Medal Brewery Search Hits (indexed)')
plt.ylabel('relative to own avg')
silver = [s for s in single_medals[single_medals.medal=='Silver']['name_Gtrends'].tolist() if s != u'n/a']
mean_idx_trends[silver].plot(legend=False, figsize=(16, 8))
plt.title('Silver Medal Brewery Search Hits (indexed)')
plt.ylabel('relative to own avg')
bronze = [s for s in single_medals[single_medals.medal=='Bronze']['name_Gtrends'].tolist() if s != u'n/a']
mean_idx_trends[bronze].plot(legend=False, figsize=(16, 8))
plt.title('Bronze Medal Brewery Search Hits (indexed)')
plt.ylabel('relative to own avg')
IQR by medal (single medal winners)
mean_idx_trends[gold].sum(axis=1)[pre_gabf16].describe(), mean_idx_trends[gold].sum(axis=1)[post_gabf16].describe()
(count 97.000000
mean 57.907986
std 17.754784
min 32.077572
25% 45.378124
50% 52.536740
75% 66.628534
max 112.251168
dtype: float64, count 23.000000
mean 57.466237
std 18.771940
min 24.325076
25% 45.764013
50% 58.324900
75% 72.582052
max 98.950253
dtype: float64)
mean_idx_trends[silver].sum(axis=1)[pre_gabf16].describe(), mean_idx_trends[silver].sum(axis=1)[post_gabf16].describe()
(count 97.000000
mean 60.439844
std 22.552839
min 28.589938
25% 45.640806
50% 56.970620
75% 70.268552
max 195.875898
dtype: float64, count 23.000000
mean 57.492863
std 14.953135
min 30.036341
25% 47.732232
50% 61.927325
75% 69.956782
max 80.223873
dtype: float64)
mean_idx_trends[bronze].sum(axis=1)[pre_gabf16].describe(), mean_idx_trends[bronze].sum(axis=1)[post_gabf16].describe()
(count 97.000000
mean 62.787726
std 22.994400
min 27.376774
25% 46.681124
50% 57.716062
75% 70.865554
max 145.357013
dtype: float64, count 23.000000
mean 60.411436
std 17.978898
min 29.892762
25% 47.813260
50% 60.287267
75% 68.125172
max 97.718445
dtype: float64)
single_medals_idx = pd.DataFrame({'gold': mean_idx_trends[gold].sum(axis=1),
                                  'silver': mean_idx_trends[silver].sum(axis=1),
                                  'bronze': mean_idx_trends[bronze].sum(axis=1),
                                  'gabf': 'GABF'})
# tag the pre/post windows; days between them keep the 'GABF' placeholder
# and are filtered out before plotting
single_medals_idx.loc[pre_gabf16, 'gabf'] = 'pre'
single_medals_idx.loc[post_gabf16, 'gabf'] = 'post'
single_medals_idx = single_medals_idx[['gold', 'silver', 'bronze', 'gabf']]
single_medals_idx.tail()
Day | gold | silver | bronze | gabf
---|---|---|---|---|
2016-10-27 | 36.031479 | 51.258344 | 60.137878 | post |
2016-10-28 | 52.193937 | 79.114118 | 60.287267 | post |
2016-10-29 | 83.409933 | 64.636172 | 67.456476 | post |
2016-10-30 | 46.589409 | 63.336685 | 33.481906 | post |
2016-10-31 | 46.590061 | 59.287918 | 54.354502 | post |
import seaborn as sns
sns.set_style("whitegrid")
df = single_medals_idx[single_medals_idx.gabf!='GABF'].set_index('gabf', append=True).stack().to_frame().reset_index()\
.rename(columns={'level_2': 'medal', 0: 'value'})
df.head()
index | Day | gabf | medal | value
---|---|---|---|---|
0 | 2016-07-01 | pre | gold | 87.037179 |
1 | 2016-07-01 | pre | silver | 88.758016 |
2 | 2016-07-01 | pre | bronze | 65.273149 |
3 | 2016-07-02 | pre | gold | 103.857065 |
4 | 2016-07-02 | pre | silver | 59.777300 |
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
sns.boxplot(x='medal', y='value', hue='gabf', data=df, palette="Set3")
sns.despine(trim=True)
misc
pre_post_avgs.sort_values('dif_gabf16_norm', ascending=False)
brewery | pre_gabf16 | post_gabf16 | dif_gabf16_norm
---|---|---|---|
hardywood_park_craft_brewery | 0.785911 | 4.326285 | 5.504798 |
ardent_craft_ales | 0.456800 | 2.288627 | 5.010123 |
zwanzigz_brewing | 0.117996 | 0.559839 | 4.744525 |
12degree_brewing | 0.102369 | 0.431731 | 4.217350 |
logsdon_farmhouse_ales | 0.337216 | 1.345986 | 3.991448 |
2sp_brewing | 1.755364 | 6.395837 | 3.643594 |
solid_rock_brewing | 0.443389 | 1.376959 | 3.105527 |
brown_truck_brewery | 3.278351 | 9.478261 | 2.891167 |
gibbs_hundred_brewing | 0.398042 | 1.091153 | 2.741297 |
hi-wire_brewing | 0.676520 | 1.830562 | 2.705844 |
14er_brewing | 1.291659 | 3.212589 | 2.487178 |
blackberry_farm_brewery | 1.422047 | 3.246674 | 2.283097 |
high_water_brewing | 3.014830 | 6.615672 | 2.194376 |
taps_fish_house_and_brewery | 0.412732 | 0.859309 | 2.081998 |
second_chance_beer | 1.447585 | 2.986046 | 2.062776 |
riip_beer | 1.229949 | 2.450898 | 1.992682 |
pizza_port_san_clemente | 0.472705 | 0.926952 | 1.960948 |
verboten_brewing | 1.848623 | 3.439574 | 1.860613 |
slo_brew | 2.113737 | 3.836893 | 1.815217 |
spencer_devon_brewing | 1.122478 | 2.014437 | 1.794633 |
georgetown_brewing | 2.236994 | 4.012087 | 1.793516 |
la_cumbre_brewing | 2.591634 | 4.635861 | 1.788779 |
daredevil_brewing | 1.829192 | 3.206119 | 1.752751 |
propolis_brewing | 1.460230 | 2.514240 | 1.721810 |
alvarado_street_brewery | 5.433718 | 9.260751 | 1.704312 |
pollyanna_brewing | 1.975679 | 3.269762 | 1.655006 |
4_noses_brewing | 0.783885 | 1.272879 | 1.623806 |
morgan_territory_brewing | 0.494651 | 0.784530 | 1.586024 |
echo_brewing | 4.451411 | 6.996020 | 1.571641 |
revolver_brewing | 9.335360 | 14.266856 | 1.528260 |
... | ... | ... | ... |
altitude_chophouse_and_brewery | 0.000000 | 0.000000 | 0.000000 |
missoula_brewing | 0.000000 | 0.000000 | 0.000000 |
two_kilts_brewing | 0.111016 | 0.000000 | 0.000000 |
high_heel_brewing | 0.501533 | 0.000000 | 0.000000 |
ram/big_horn_brewery | 0.000000 | 0.000000 | 0.000000 |
the_packinghouse_brewing | 0.000000 | 0.000000 | 0.000000 |
coopersmiths_pub_&_brewing | 0.000000 | 0.000000 | 0.000000 |
bns_brewing_&_distilling | 0.000000 | 0.000000 | 0.000000 |
no_clue_craft_brewery | 0.000000 | 0.000000 | 0.000000 |
emmetts_tavern_&_brewing | 0.000000 | 0.000000 | 0.000000 |
black_tooth_brewing | 1.152725 | 0.000000 | 0.000000 |
iron_springs_pub_&_brewery | 0.000000 | 0.000000 | 0.000000 |
colorado_boy_pub | 0.000000 | 0.000000 | 0.000000 |
brewers_alley_restaurant_&_brewery | 0.000000 | 0.000000 | 0.000000 |
redwood_curtain_brewing | 0.000000 | 0.000000 | 0.000000 |
horse_thief_hollow_brewery | 0.569610 | 0.000000 | 0.000000 |
ghostfish_brewing_company | 0.097172 | 0.000000 | 0.000000 |
fiction_beer | 0.000000 | 0.000000 | 0.000000 |
philipsburg_brewing | 0.000000 | 0.000000 | 0.000000 |
three_creeks_production | 0.000000 | 0.000000 | 0.000000 |
alesong_brewing_&_blending | 0.000000 | 0.000000 | 0.000000 |
hop_dogma_brewing | 0.000000 | 0.000000 | 0.000000 |
swamp_rabbit_brewery_&_taproom | 0.000000 | 0.000000 | 0.000000 |
glenwood_canyon_brewing | 0.000000 | 0.000000 | 0.000000 |
kootenai_river_brewing | 0.000000 | 0.000000 | 0.000000 |
greenview_brewing | 0.000000 | 0.000000 | 0.000000 |
big_dogs_brewing | 0.000000 | 0.000000 | 0.000000 |
rivers_edge_brewing | 0.000000 | 0.000000 | 0.000000 |
maize_valley_craft_brewery | 0.000000 | 0.000000 | 0.000000 |
grimm_brothers_brewhouse | 0.518925 | 0.000000 | 0.000000 |
242 rows × 3 columns