Read citibike data and write to SQL database from python
# define function to read in a citibike file (already downloaded and unzipped), do some
# cleaning, and add some fields
import pandas as pd
#dat = pd.read_csv('data/2013-09 - Citi Bike trip data.csv',parse_dates=True)
def load_citibike_monthly(fname):
dat = pd.read_csv(fname,parse_dates=True)
dat.starttime=pd.to_datetime(dat.starttime)
dat.stoptime=pd.to_datetime(dat.stoptime)
dat['gender']=dat['gender'].astype('category')
dat.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)
dat['day']=dat.starttime.dt.day
dat['month']=dat.starttime.dt.month
dat['year']=dat.starttime.dt.year
dat['yday']=dat.starttime.dt.dayofyear
dat['wkday']=dat.starttime.dt.dayofweek
return dat
# test it out
fname='data/2013-09 - Citi Bike trip data.csv'
dat = load_citibike_monthly(fname)
dat.head()
tripduration | starttime | stoptime | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bikeid | usertype | birth_year | gender | day | month | year | yday | wkday | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1010 | 2013-09-01 00:00:02 | 2013-09-01 00:16:52 | 254 | W 11 St & 6 Ave | 40.735324 | -73.998004 | 147 | Greenwich St & Warren St | 40.715422 | -74.011220 | 15014 | Subscriber | 1974 | 1 | 1 | 9 | 2013 | 244 | 6 |
1 | 1443 | 2013-09-01 00:00:09 | 2013-09-01 00:24:12 | 151 | Cleveland Pl & Spring St | 40.721816 | -73.997203 | 497 | E 17 St & Broadway | 40.737050 | -73.990093 | 19393 | Customer | \N | 0 | 1 | 9 | 2013 | 244 | 6 |
2 | 1387 | 2013-09-01 00:00:16 | 2013-09-01 00:23:23 | 352 | W 56 St & 6 Ave | 40.763406 | -73.977225 | 405 | Washington St & Gansevoort St | 40.739323 | -74.008119 | 16160 | Subscriber | 1992 | 1 | 1 | 9 | 2013 | 244 | 6 |
3 | 405 | 2013-09-01 00:00:18 | 2013-09-01 00:07:03 | 490 | 8 Ave & W 33 St | 40.751551 | -73.993934 | 459 | W 20 St & 11 Ave | 40.746745 | -74.007756 | 14997 | Subscriber | 1973 | 1 | 1 | 9 | 2013 | 244 | 6 |
4 | 270 | 2013-09-01 00:00:20 | 2013-09-01 00:04:50 | 236 | St Marks Pl & 2 Ave | 40.728419 | -73.987140 | 393 | E 5 St & Avenue C | 40.722992 | -73.979955 | 19609 | Subscriber | 1984 | 1 | 1 | 9 | 2013 | 244 | 6 |
Now write a script to read all monthly files and put in database
# Get list of the citibke csv files we have
import glob
flist = glob.glob('data/*data.csv')
flist
['data/2013-07 - Citi Bike trip data.csv',
'data/2013-08 - Citi Bike trip data.csv',
'data/2013-09 - Citi Bike trip data.csv',
'data/2013-10 - Citi Bike trip data.csv',
'data/2013-11 - Citi Bike trip data.csv',
'data/2013-12 - Citi Bike trip data.csv',
'data/2014-01 - Citi Bike trip data.csv',
'data/2014-02 - Citi Bike trip data.csv',
'data/2014-03 - Citi Bike trip data.csv',
'data/2014-04 - Citi Bike trip data.csv',
'data/2014-05 - Citi Bike trip data.csv',
'data/2014-06 - Citi Bike trip data.csv',
'data/2014-07 - Citi Bike trip data.csv',
'data/2014-08 - Citi Bike trip data.csv',
'data/201409-citibike-tripdata.csv',
'data/201410-citibike-tripdata.csv',
'data/201411-citibike-tripdata.csv',
'data/201412-citibike-tripdata.csv']
# Then loop and load each one
# if this is too much (some files are pretty large), could also read each file in in chunks
import sqlite3
con = sqlite3.connect("/Users/Andy/Desktop/test2.db3")
for fname in flist:
print('loading ' + fname)
dat = load_citibike_monthly(fname)
print('adding ' + fname + ' to sql database')
dat.to_sql("rides",con,if_exists='append',chunksize=10000)
del dat
print('done')
loading data/2013-07 - Citi Bike trip data.csv
adding data/2013-07 - Citi Bike trip data.csv to sql database
loading data/2013-08 - Citi Bike trip data.csv
adding data/2013-08 - Citi Bike trip data.csv to sql database
loading data/2013-09 - Citi Bike trip data.csv
adding data/2013-09 - Citi Bike trip data.csv to sql database
loading data/2013-10 - Citi Bike trip data.csv
adding data/2013-10 - Citi Bike trip data.csv to sql database
loading data/2013-11 - Citi Bike trip data.csv
adding data/2013-11 - Citi Bike trip data.csv to sql database
loading data/2013-12 - Citi Bike trip data.csv
adding data/2013-12 - Citi Bike trip data.csv to sql database
loading data/2014-01 - Citi Bike trip data.csv
adding data/2014-01 - Citi Bike trip data.csv to sql database
loading data/2014-02 - Citi Bike trip data.csv
adding data/2014-02 - Citi Bike trip data.csv to sql database
loading data/2014-03 - Citi Bike trip data.csv
adding data/2014-03 - Citi Bike trip data.csv to sql database
loading data/2014-04 - Citi Bike trip data.csv
adding data/2014-04 - Citi Bike trip data.csv to sql database
loading data/2014-05 - Citi Bike trip data.csv
adding data/2014-05 - Citi Bike trip data.csv to sql database
loading data/2014-06 - Citi Bike trip data.csv
adding data/2014-06 - Citi Bike trip data.csv to sql database
loading data/2014-07 - Citi Bike trip data.csv
adding data/2014-07 - Citi Bike trip data.csv to sql database
loading data/2014-08 - Citi Bike trip data.csv
adding data/2014-08 - Citi Bike trip data.csv to sql database
loading data/201409-citibike-tripdata.csv
adding data/201409-citibike-tripdata.csv to sql database
loading data/201410-citibike-tripdata.csv
adding data/201410-citibike-tripdata.csv to sql database
loading data/201411-citibike-tripdata.csv
adding data/201411-citibike-tripdata.csv to sql database
loading data/201412-citibike-tripdata.csv
adding data/201412-citibike-tripdata.csv to sql database
done
# count the total number of rows: there are over 13 million rows, no way we could load that all in Pandas
pd.read_sql_query("SELECT COUNT(*) FROM rides", con)
COUNT(*) | |
---|---|
0 | 13118401 |
First i'll examine the number of rides per month, for 2014. The plot below shows that there is a strong seasonal cycle, with more rides in the summer months. My hypothesis is that this is mainly driven by the temperature; I will get weather data later and test this.
# count rides per month
df = pd.read_sql_query("SELECT year,month,count(*) as num_rides FROM rides WHERE year=2014 GROUP BY month", con)
df
year | month | num_rides | |
---|---|---|---|
0 | 2014 | 1 | 300400 |
1 | 2014 | 2 | 224736 |
2 | 2014 | 3 | 439117 |
3 | 2014 | 4 | 670780 |
4 | 2014 | 5 | 866117 |
5 | 2014 | 6 | 936880 |
6 | 2014 | 7 | 968842 |
7 | 2014 | 8 | 963489 |
8 | 2014 | 9 | 953887 |
9 | 2014 | 10 | 828711 |
10 | 2014 | 11 | 529188 |
11 | 2014 | 12 | 399069 |
# Plot rides vs month
# There is a very strong seasonal cycle
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(df.month,df.num_rides)
plt.scatter(df.month,df.num_rides)
plt.grid()
plt.xlabel('month')
plt.ylabel('# rides')
<matplotlib.text.Text at 0x11e7ffeb8>
Next i'll look at the number of rides each day in 2014. This shows that there is a strong seasonal pattern, but also some big residuals from that pattern. These could be days that were unseasonably cold/warm, or due to other factors.
# plot rides/day for 2014
df = pd.read_sql_query("select yday, count(*) as num_rides from rides where year=2014 group by yday",con)
plt.plot(df.yday,df.num_rides)
plt.grid()
plt.xlabel('day of year')
plt.ylabel('# rides')
<matplotlib.text.Text at 0x120348b00>
Which stations were the most used?
df = pd.read_sql_query("select start_station_id, start_station_name, count(*) as num_rides from rides where year=2014 group by start_station_id order by num_rides desc",con)
df.head(10)
start_station_id | start_station_name | num_rides | |
---|---|---|---|
0 | 521 | 8 Ave & W 31 St | 100498 |
1 | 519 | Pershing Square North | 92137 |
2 | 293 | Lafayette St & E 8 St | 86692 |
3 | 497 | E 17 St & Broadway | 80166 |
4 | 435 | W 21 St & 6 Ave | 73448 |
5 | 285 | Broadway & E 14 St | 65852 |
6 | 426 | West St & Chambers St | 63221 |
7 | 402 | Broadway & E 22 St | 60932 |
8 | 151 | Cleveland Pl & Spring St | 60092 |
9 | 382 | University Pl & E 14 St | 60072 |
plt.boxplot(df.num_rides)
{'boxes': [<matplotlib.lines.Line2D at 0x1223a4588>],
'caps': [<matplotlib.lines.Line2D at 0x1223ba518>,
<matplotlib.lines.Line2D at 0x1223cf208>],
'fliers': [<matplotlib.lines.Line2D at 0x120348cf8>],
'means': [],
'medians': [<matplotlib.lines.Line2D at 0x11f8f9c18>],
'whiskers': [<matplotlib.lines.Line2D at 0x1223a44a8>,
<matplotlib.lines.Line2D at 0x12142ec18>]}
# There are a small number of stations with many more rides; where are they? Plot on map?
To-do:
- fit seasonal cycle and remove to examine anomalies
# How many rides were taken by men vs women?
df = pd.read_sql_query("SELECT gender,count(*) as num_rides FROM rides GROUP BY gender ORDER BY num_rides DESC", con)
df.head()
# write a separate function to read the data and write to sql in chunks, see if it is faster.