milesplit, part 2 - Exploring the Data
Posted on Thu 29 December 2016 in Projects
As a High School Cross Country coach, I visit co.milesplit.com quite often. This website is a nationwide database of High School Running (Cross Country and Track & Field) performances. For a week
endlong project, I thought I'd try to visualize some runner data.
- Part 1 - Webscraping
- Part 2 - Exploring the Data (you are here)
- Part 3 - Making Predictions
I gave a Tuesday Nerd Talk at a local brewery about this project. Check it out here
Exploring the Data¶
Let's start by reading in the data we scraped during Part 1:
dfL = pd.read_csv('milesplit data/5K.csv')
For reasons that will become clear later on, I'm also going to read in elevation data per State, and merge the two dfs:
dfR = pd.read_csv('milesplit data/elevation.csv', names = ['State', 'Elevation'])
df = pd.merge(dfL, dfR, on='State')
df.head()
Now its time to really dive in. Let's do some grouping:
boys = df.groupby('Gender').get_group('Boys')
girls = df.groupby('Gender').get_group('Girls')
print str(len(boys)) + ' boy records since 2000'
print str(len(girls)) + ' girl records since 2000'
df.dtypes
'Object' here, in python parlance, should be thought of as 'a collection of strings'. We'll see more on that a bit later.
Let's plot some stuff¶
athlete_school = df['Athlete/School'].str.split(expand = True)
athlete_school.columns = ['First_Name' , 'Next_Name' , 'C' , 'D' , 'E' , 'F' , 'G' , 'H', 'I', 'J' , 'K']
athlete_school.head()
athlete_school[['First_Name','Next_Name']].head()
athlete_school.First_Name.describe()
plt.subplot(1, 2, 1)
athlete_school.First_Name.value_counts().head(25).plot.barh(figsize=(10,5), color='b')
ax1 = plt.gca()
ax1.invert_yaxis()
plt.title('First Names')
plt.subplot(1, 2, 2)
athlete_school.Next_Name.value_counts().head(25).plot.barh(figsize=(10,5), color='b')
ax2 = plt.gca()
ax2.invert_yaxis()
plt.title('Next Names')
plt.tight_layout()
plt.show()
plt.figure(figsize=(9,4))
athlete_school.First_Name.value_counts().head(500).plot.barh()
ax = plt.gca()
ax.invert_yaxis()
plt.yticks([])
plt.xlabel('Frequency')
plt.ylabel('First Names')
plt.title('Occurrence of First Names')
plt.show()
devins = sorted(athlete_school.loc[athlete_school['First_Name'] == 'Devin','Next_Name'].unique())
len(devins)
## Fastest boys/girls times run since 2000
boys.sort_values('Time').head()
# girls.sort_values('Time').head()
## lookup a particular record (in this case, the one with the quickest time)
df.iloc[boys.Time.idxmin()]
response1 = requests.get('http://archive.dyestat.com/rivals/pics/2000100701880790.jpg')
response2 = requests.get('http://archive.dyestat.com/image/4tr/April/30StanfordCardinalInv/040430DathanRitzenheinStanfordInvMGallagher.jpg')
response3 = requests.get('http://1.bp.blogspot.com/-fJEQUnFpGCk/UBnndSVzPBI/AAAAAAAAE1w/lJbObv9k0hQ/s1600/Dathan+Ritzenhein-1.jpg')
ritzhs = Image.open(BytesIO(response1.content))
ritzcu = Image.open(BytesIO(response2.content))
ritzpro = Image.open(BytesIO(response3.content))
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10,5))
ax1.imshow(ritzhs)
ax1.set_title('HS 2000 14:10')
ax1.axis('off')
ax2.imshow(ritzcu)
ax2.set_title('Colorado 2002 13:27')
ax2.axis('off')
ax3.imshow(ritzpro)
ax3.set_title('American Record 2009 12:56')
ax3.axis('off')
## Calculate nationwide averages
avg_5k = datetime.timedelta(seconds = df.Time.mean())
boys_avg_5k = datetime.timedelta(seconds = boys.Time.mean())
girls_avg_5k = datetime.timedelta(seconds = girls.Time.mean())
print 'Average 5K: ' + str(avg_5k)[2:7]
print 'Boys Average 5K: ' + str(boys_avg_5k)[2:7]
print 'Girls Average 5K: ' + str(girls_avg_5k)[2:7]
# Were YOU a fast runner in HS??
df[df['Athlete/School'].str.contains('Devin Rourke')] # Nope
df[df['Athlete/School'].str.contains('Elise Cranny')] # YUP.
Plotting¶
## Function to label y-axis of plots with (mm:ss) instead of (sec)
def timeTicks(x, pos):
d = dt.timedelta(seconds = x)
return str(d)[2:7]
formatter = ticker.FuncFormatter(timeTicks)
f, (ax1, ax2) = plt.subplots(1, 2, sharey = True, figsize=(10, 5))
ax1.scatter(boys.Year, boys.Time, s = 2, marker = ".", color = 'b')
ax1.set_title('Boys')
ax1.set_xlim([1999, 2017])
ax1.set_ylim([10*60, 60*60])
ax1.set_ylabel('Time')
ax2.scatter(girls.Year, girls.Time, s = 2, marker = ".", color = 'r')
ax2.set_title('Girls')
ax2.set_xlim([1999, 2017])
ax1.yaxis.set_major_formatter(formatter)
plt.tight_layout()
plt.show()
import seaborn.apionly as sns
ax = plt.figure(figsize=(10, 5))
ax = sns.violinplot(x=df.Year, y=df.Time, hue = df.Gender, cut = 0,
scale = "count", scale_hue = False, split = True,
palette = {"Boys": "blue", "Girls": "red"},
linewidth = 0, saturation = 0.9)
ax.set_ylim([15*60, 30*60])
ax.yaxis.set_major_formatter(formatter)
plt.title('Nationwide 5K times')
plt.xlabel('Year')
plt.ylabel('Time')
plt.show()
Completeness of the database¶
int_years = map(int, year)
ind = 0
for key in sorted(df.Time.groupby(df.State).groups.keys()):
ind = ind + 1
ax = plt.subplot(6, 10, ind)
plt.axis([2000, 2016, 0, 1000])
plt.xticks([])
frame1 = plt.gca()
frame1.axes.get_xaxis().set_visible(False)
df.Time.groupby([df.State, df.Year]).count().get(key).reindex(index=int_years).plot.bar(
title = key, figsize = (15, 8), sharey = True, sharex = False, color = 'b')
plt.tight_layout()
plt.xlabel('')
plt.show()
## State Record holding Boys/Girls (only online records, only dating back to 2000)
boys.sort_values('Time').groupby('State').head(1)
# girls.sort_values('Time').groupby('State').head(1).sort_index() #.sort_index() sorts by the groupby index, ie. 'State' in this case.
boys.Time.groupby(df.Year).mean().plot(color = 'b')
girls.Time.groupby(df.Year).mean().plot(color = 'r')
plt.axis([1998, 2018, 15*60, 30*60])
plt.ylabel('Time')
plt.title('Nationwide average 5k time')
ax = plt.gca()
ax.yaxis.set_major_formatter(formatter)
plt.legend(['boys','girls'])
plt.show()
dfboys = boys.Time.groupby([df.State, df.Year]).mean()
dfgirls = girls.Time.groupby([df.State, df.Year]).mean()
years = ['2000','2001','2002','2003','2004','2005','2006','2007',
'2008','2009','2010','2011','2012','2013','2014','2015','2016']
years = map(int, years)
ind = 0
for key in sorted(df.Time.groupby(df.State).groups.keys()):
ind = ind + 1
plt.subplot(6, 10, ind)
plt.xticks([])
frame1 = plt.gca()
frame1.axes.get_xaxis().set_visible(False)
dfboys[key].fillna(0).reindex(index = years).plot(title = key, figsize = (16, 9), yticks = [15*60, 20*60, 25*60],
color = 'b', sharey = True, sharex = False)
dfgirls[key].fillna(0).reindex(index = years).plot(title = key, figsize = (16, 9), yticks = [15*60, 20*60, 25*60],
color = 'r', sharey = True, sharex = False)
plt.axis([1998, 2018, 14*60, 30*60])
ax = plt.gca()
ax.yaxis.set_major_formatter(formatter)
plt.tight_layout()
plt.xlabel('')
plt.show()
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
year = ['2000','2001','2002','2003','2004','2005','2006','2007',
'2008','2009','2010','2011','2012','2013','2014','2015','2016']
dfplotly = pd.read_csv('milesplit data/d3usa5kboys.csv')
scl = [[0.0, '#a80000'],[0.25, '#ff3f3f'],[0.5, '#ff7f7f'],
[0.75, '#ffbfbf'],[1.0, '#ffffff']]
for yr in year:
# df['text'] = df['state'] + '<br>' +\
# '2000 '+df['2000']+' 2001 '+df['2001']+'<br>'+\
# '2002 '+df['2002']+' 2003 ' + df['2003']+'<br>'+\
# '2004 '+df['2004']+' 2005 '+df['2005']
data = [ dict(
type='choropleth',
colorscale = scl,
autocolorscale = False,
locations = dfplotly['code'],
z = dfplotly[yr].astype(float),
zmax = 20.0,
zmin = -20.0,
zauto = False,
locationmode = 'USA-states',
marker = dict(
line = dict (
color = 'rgb(255,255,255)',
width = 1
) ),
colorbar = dict(
title = "% Difference"
))]
layout = dict(
title = 'Percent difference from yearly national average<br>Boys 5K<br><br>'+yr,
geo = dict(
scope='usa',
projection=dict( type='albers usa'),
showlakes = False))
fig = dict(data=data, layout=layout)
py.plot(fig, filename='choropleth-map_' + yr, image_height = 600, image_width = 900, image = 'png')
dfCHS = df[df['Athlete/School'].str.contains("Centaurus High School")].sort_values(by=['Time', 'Athlete/School'])
dfCHS.head()
# Centaurus School Records
dfCHS['Duration'] = pd.to_datetime(df['Time'], unit='s')
dfCHS[df.Year == df.Grade - 1].head(10) #senior class records
# dfCHS[df.Year == df.Grade - 2] #junior class records
# dfCHS[df.Year == df.Grade - 3] #sophomore class records
# dfCHS[df.Year == df.Grade - 4] #freshman class records
dfCHS = df[df['Athlete/School'].str.contains("Centaurus High School")]
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(9, 4))
ax1.scatter(dfCHS.Year, dfCHS.Rank)
ax1.set_title('Centaurus "Top 1000" Performances')
ax1.set_xlabel('Year')
ax1.set_ylabel('Rank')
ax1.set_ylim([-100, 1100])
ax2.plot(dfCHS.Year.unique(), dfCHS.groupby('Year').Time.nunique())
ax2.set_title('Number of Centaurus Performances in the Top 1000')
ax2.set_xlim([1999, 2017])
ax2.set_ylim([0, 25])
plt.tight_layout()
plt.show()
To Do¶
- get "Year = All" records
- Integrate "course difficulty" ratings
- Include Track events, and field events
- Determine seconds / ft conversion
- Machine Learning, feature selection.. predict times?