Playing with GTFS - Lag BaOmer (ל"ג בעומר)
Work-in-Progress!¶
Looking into schedule changes for Lag BaOmer (ל"ג בעומר) 2018.
The changes are (supposedly?) due to the hillula of Rabbi Shimon bar Yochai (הילולת רבי שמעון בר יוחאי), which takes place at Mount Meron.
In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ftplib import FTP
import datetime
import re
import zipfile
import os
%matplotlib inline
plt.rcParams['figure.figsize'] = (15, 10) # set default size of plots
sns.set_style("white")
sns.set_context("talk")
sns.set_palette('Set1', 10)
Get the data¶
I get all the files from our GTFS repository at HaSadna (הסדנא, The Public Knowledge Workshop). It is not yet openly accessible, but you can ask me for files or access if you need them.
In [2]:
from gtfs_utils import *
TARIFF_FILE_NAME = 'Tariff.zip'
TARIFF_TXT_NAME = 'Tariff.txt'
TARIFF_TO_REFORM_ZONE = 'StationToReformZone.txt'
local_tariff_path = 'data/sample/tariff.zip'
conn = ftp_connect()
get_ftp_file(conn, file_name=TARIFF_FILE_NAME, local_zip_path=local_tariff_path)
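gtfs_utils is our own helper module, so for reference here is a minimal sketch of what ftp_connect and get_ftp_file might look like on top of ftplib (the host name below is an assumption, and the real helpers may well differ):

from ftplib import FTP

MOT_FTP = 'gtfs.mot.gov.il'  # assumed host, adjust to your source

def ftp_connect(host=MOT_FTP):
    # anonymous FTP login
    conn = FTP(host)
    conn.login()
    return conn

def get_ftp_file(conn, file_name, local_zip_path):
    # stream the remote file into a local zip
    with open(local_zip_path, 'wb') as fh:
        conn.retrbinary('RETR ' + file_name, fh.write)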
Tariff stuff for zone names¶
In [3]:
# not a true csv, so we need to jiggle it a bit
cols = ['ShareCode','ShareCodeDesc','ZoneCodes','Daily','Weekly','Monthly','FromDate','ToDate', 'EXTRA']
reform_cols = ['StationId', 'ReformZoneCode','FromDate','ToDate', 'EXTRA']
with zipfile.ZipFile(local_tariff_path) as zf:
    tariff_df = (pd.read_csv(zf.open(TARIFF_TXT_NAME), header=None, skiprows=[0], names=cols)
                 .drop(columns=['EXTRA']))
    reform_df = (pd.read_csv(zf.open(TARIFF_TO_REFORM_ZONE), header=None, skiprows=[0], names=reform_cols)
                 .drop(columns=['EXTRA']))
# remove ShareCodes which contain multiple zones, e.g. extended Gush Dan (גוש דן מורחב)
tariff_df = (tariff_df[~ tariff_df.ZoneCodes.str.contains(';')]
             .rename(columns={'ShareCodeDesc': 'zone_name',
                              'ZoneCodes': 'zone_id'}))
# chain two Series maps: stop_code -> reform zone code -> zone name
rs = (reform_df[['StationId', 'ReformZoneCode']].drop_duplicates()
      .applymap(str).set_index('StationId').iloc[:, 0])
ts = (tariff_df[['zone_id', 'zone_name']].drop_duplicates()
      .set_index('zone_id').iloc[:, 0])
zones = (rs.map(ts).reset_index()
         .rename(columns={'StationId': 'stop_code', 'ReformZoneCode': 'zone_name'}))
In [4]:
import partridge as ptg
def get_partridge_feed(zip_path, date):
    # keep only the services that are active on the given date
    service_ids_by_date = ptg.read_service_ids_by_date(zip_path)
    service_ids = service_ids_by_date[date]
    feed = ptg.feed(zip_path, view={
        'trips.txt': {
            'service_id': service_ids,
        },
    })
    return feed
def to_timedelta(df):
    '''
    Turn time columns into timedelta dtype.
    GTFS allows times past midnight (e.g. 25:10:00 for a trip that
    crosses into the next day), so timedelta is safer than datetime here.
    '''
    cols = ['arrival_time', 'departure_time']
    numeric = df[cols].apply(pd.to_timedelta, unit='s')
    df = df.copy()
    df[cols] = numeric
    return df
def get_tidy_feed_df(feed, zones):
    s = feed.stops
    r = feed.routes
    t = (feed.trips
         .assign(route_id=lambda x: pd.Categorical(x['route_id'])))
    f = (feed.stop_times[['trip_id', 'departure_time', 'arrival_time', 'stop_id', 'stop_sequence']]
         .merge(s[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'stop_code']], on='stop_id')
         .merge(zones, how='left')
         # much faster joins and slices with Categorical dtypes
         .assign(zone_name=lambda x: pd.Categorical(x['zone_name']))
         .merge(t[['trip_id', 'route_id', 'direction_id']], on='trip_id')
         .merge(r[['route_id', 'route_short_name', 'route_long_name']], on='route_id')
         .assign(route_id=lambda x: pd.Categorical(x['route_id']))
         .pipe(to_timedelta))
    return f
In [5]:
LOCAL_ZIP_PATH1 = 'data/gtfs_feeds/2018-05-01.zip'
LOCAL_ZIP_PATH2 = 'data/gtfs_feeds/2018-05-02.zip'
LOCAL_ZIP_PATH3 = 'data/gtfs_feeds/2018-04-25.zip'
Lag BaOmer feeds¶
In [6]:
feed2 = get_partridge_feed(LOCAL_ZIP_PATH2, datetime.date(2018, 5, 2))
feed3 = get_partridge_feed(LOCAL_ZIP_PATH2, datetime.date(2018, 5, 3))
Last week's feeds¶
In [15]:
feed25 = get_partridge_feed(LOCAL_ZIP_PATH3, datetime.date(2018, 4, 25))
feed26 = get_partridge_feed(LOCAL_ZIP_PATH3, datetime.date(2018, 4, 26))
In [16]:
print('(Last Wed - This Wed): %.3f' % ((feed25.trips.shape[0] - feed2.trips.shape[0]) / feed25.trips.shape[0]))
print('(Last Thu - This Thu): %.3f' % ((feed26.trips.shape[0] - feed3.trips.shape[0]) / feed26.trips.shape[0]))
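The same comparison wrapped in a small helper, to keep the arithmetic in one place (a sketch; the function name is mine):

def trip_count_drop(baseline_feed, feed):
    # fraction of baseline trips that are gone in the other feed
    base = baseline_feed.trips.shape[0]
    return (base - feed.trips.shape[0]) / base

print('(Last Wed - This Wed): %.3f' % trip_count_drop(feed25, feed2))
print('(Last Thu - This Thu): %.3f' % trip_count_drop(feed26, feed3))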
In [17]:
f25 = get_tidy_feed_df(feed25, zones)
f26 = get_tidy_feed_df(feed26, zones)
In [18]:
f2 = get_tidy_feed_df(feed2, zones)
f3 = get_tidy_feed_df(feed3, zones)
In [19]:
zones.zone_name.value_counts()
Out[19]:
Zone-specific difference ratio¶
In [20]:
danwed = f25[f25.zone_name=="גוש דן"].trip_id.nunique() - f2[f2.zone_name=="גוש דן"].trip_id.nunique()
danthu = f26[f26.zone_name=="גוש דן"].trip_id.nunique() - f3[f3.zone_name=="גוש דן"].trip_id.nunique()
haiwed = f25[f25.zone_name=="סובב חיפה"].trip_id.nunique() - f2[f2.zone_name=="סובב חיפה"].trip_id.nunique()
haithu = f26[f26.zone_name=="סובב חיפה"].trip_id.nunique() - f3[f3.zone_name=="סובב חיפה"].trip_id.nunique()
akowed = f25[f25.zone_name=="עכו"].trip_id.nunique() - f2[f2.zone_name=="עכו"].trip_id.nunique()
akothu = f26[f26.zone_name=="עכו"].trip_id.nunique() - f3[f3.zone_name=="עכו"].trip_id.nunique()
nahwed = f25[f25.zone_name=="נהריה"].trip_id.nunique() - f2[f2.zone_name=="נהריה"].trip_id.nunique()
nahthu = f26[f26.zone_name=="נהריה"].trip_id.nunique() - f3[f3.zone_name=="נהריה"].trip_id.nunique()
In [21]:
netwed = f25[f25.zone_name=="נתניה"].trip_id.nunique() - f2[f2.zone_name=="נתניה"].trip_id.nunique()
netthu = f26[f26.zone_name=="נתניה"].trip_id.nunique() - f3[f3.zone_name=="נתניה"].trip_id.nunique()
In [22]:
print('Gush Dan Wed: %.2f' % (danwed / f25[f25.zone_name=="גוש דן"].trip_id.nunique()))
print('Gush Dan Thu: %.2f' % (danthu / f26[f26.zone_name=="גוש דן"].trip_id.nunique()))
print('Haifa Wed: %.2f' % (haiwed / f25[f25.zone_name=="סובב חיפה"].trip_id.nunique()))
print('Haifa Thu: %.2f' % (haithu / f26[f26.zone_name=="סובב חיפה"].trip_id.nunique()))
print('Akko Wed: %.2f' % (akowed / f25[f25.zone_name=="עכו"].trip_id.nunique()))
print('Akko Thu: %.2f' % (akothu / f26[f26.zone_name=="עכו"].trip_id.nunique()))
print('Nahariya Wed: %.2f' % (nahwed / f25[f25.zone_name=="נהריה"].trip_id.nunique()))
print('Nahariya Thu: %.2f' % (nahthu / f26[f26.zone_name=="נהריה"].trip_id.nunique()))
print('Netanya Wed: %.2f' % (netwed / f25[f25.zone_name=="נתניה"].trip_id.nunique()))
print('Netanya Thu: %.2f' % (netthu / f26[f26.zone_name=="נתניה"].trip_id.nunique()))
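All the per-zone boilerplate above can be collapsed into a loop; a sketch that computes the same ratios (the helper name and zone list are mine):

zones_of_interest = ['גוש דן', 'סובב חיפה', 'עכו', 'נהריה', 'נתניה']

def zone_trips(f, zone):
    # unique trips that touch the given zone
    return f[f.zone_name == zone].trip_id.nunique()

for zone in zones_of_interest:
    wed = 1 - zone_trips(f2, zone) / zone_trips(f25, zone)
    thu = 1 - zone_trips(f3, zone) / zone_trips(f26, zone)
    print('%s Wed: %.2f, Thu: %.2f' % (zone, wed, thu))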
Per route trip count difference¶
In [23]:
rcmp = pd.concat([feed26.trips.route_id.value_counts(),
                  feed3.trips.route_id.value_counts()],
                 axis=1, keys=['feed26', 'feed3'])
rcmp['dif'] = rcmp.feed26.subtract(rcmp.feed3)
(pd.concat((rcmp, feed26.routes.set_index('route_id')[['route_short_name', 'route_long_name']]), axis=1)
 .sort_values(by='dif', ascending=False)
 .head(20))
Out[23]:
Per minute departure difference¶
In [24]:
# trip start = first departure, trip end = last arrival
se = (feed3.stop_times.groupby(['trip_id'])
      .agg({'departure_time': 'min',
            'arrival_time': 'max'})
      .pipe(to_timedelta)
      .sort_values(['arrival_time', 'departure_time']))
se.head()
Out[24]:
In [25]:
# same, for last week's Thursday
se2 = (feed26.stop_times.groupby(['trip_id'])
       .agg({'departure_time': 'min',
             'arrival_time': 'max'})
       .pipe(to_timedelta)
       .sort_values(['arrival_time', 'departure_time']))
se2.head()
Out[25]:
In [77]:
departures = pd.Series(1, se.departure_time).resample('1Min').sum()
arrivals = pd.Series(1, se.arrival_time).resample('1Min').sum()
onroad = pd.concat([pd.Series(1, se.departure_time),   # a departure adds 1
                    pd.Series(-1, se.arrival_time)     # an arrival subtracts 1
                    ]).resample('1Min').sum().cumsum().ffill()
df = (pd.concat([departures, arrivals, onroad], axis=1).reset_index()
      .rename(columns={'index': 'time', 0: 'departures', 1: 'arrivals', 2: 'onroad'}))
# anchor the timedeltas to an arbitrary date so the plot gets a proper datetime axis
idx = pd.DatetimeIndex(df.time + datetime.datetime(2018, 5, 1))
(df.assign(time=idx)
 .set_index('time')
 .resample('1min').sum()
 .ewm(span=60)
 .mean()
 .plot())
Out[77]:
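The onroad column is a sweep-line count: +1 at every departure, -1 at every arrival, so the cumulative sum at any minute is the number of trips in progress. A toy example with made-up times:

toy = pd.concat([pd.Series(1, pd.to_timedelta(['00:00:30', '00:01:10'])),    # two departures
                 pd.Series(-1, pd.to_timedelta(['00:05:00', '00:06:00']))])  # two arrivals
toy.resample('1Min').sum().cumsum().ffill()
# climbs to 2 while both trips are on the road, drops back to 0 once both arrive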
In [78]:
departures2 = pd.Series(1, se2.departure_time).resample('1Min').sum()
arrivals2 = pd.Series(1, se2.arrival_time).resample('1Min').sum()
onroad2 = pd.concat([pd.Series(1, se2.departure_time),   # a departure adds 1
                     pd.Series(-1, se2.arrival_time)     # an arrival subtracts 1
                     ]).resample('1Min').sum().cumsum().ffill()
df2 = (pd.concat([departures2, arrivals2, onroad2], axis=1).reset_index()
       .rename(columns={'index': 'time', 0: 'departures', 1: 'arrivals', 2: 'onroad'}))
# reuse idx from the previous cell (assumes both days span the same minutes)
(df2.assign(time=idx)
 .set_index('time')
 .resample('1min').sum()
 .ewm(span=60)
 .mean()
 .plot())
Out[78]:
This looks almost the same; let's line the departures up.
In [79]:
idx = pd.DatetimeIndex(departures.index + datetime.datetime(2018, 5, 1))
df_m = (pd.concat([departures, departures2], axis=1).reset_index()
        .rename(columns={'index': 'time',
                         0: 'ל"ג בעומר',   # Lag BaOmer
                         1: 'שבוע שעבר'})) # last week
fig, ax = plt.subplots()
(df_m.assign(time=idx)
 .set_index('time')
 .resample('1min').sum()
 .ewm(span=120)
 .mean()
 .plot(ax=ax))
ax.set_xlabel('Time')
ax.set_ylabel('Departures / Minute')
ax.set_title('Departures Per Minute (smoothed)')
Out[79]:
In [35]:
import gtfstk as gt
import shapely.geometry as sg
In [36]:
# List feed
gt.list_gtfs(LOCAL_ZIP_PATH2)
Out[36]:
In [37]:
# Read and print feed
feed = gt.read_gtfs(LOCAL_ZIP_PATH2, dist_units='m')
In [38]:
feed.describe()
Out[38]:
In [39]:
# Validate
feed.validate()
Out[39]:
In [40]:
# Compute trip stats
trip_stats = feed.compute_trip_stats()
trip_stats.head().T
Out[40]:
In [41]:
# Add shape_dist_traveled column to stop times
feed = feed.append_dist_to_stop_times(trip_stats)
feed.stop_times.head().T
Out[41]:
In [42]:
# Choose study dates
#week = feed.get_first_week()
#dates = [week[4], week[6]] # First Friday and Sunday
dates = ['20180502', '20180503']
In [43]:
route_stats = feed.compute_route_stats(trip_stats, dates)
route_stats.head().T
Out[43]:
In [44]:
feed_lw = gt.read_gtfs(LOCAL_ZIP_PATH3, dist_units='m')
trip_stats_lw = feed_lw.compute_trip_stats()
dates_lw = ['20180425', '20180426']
route_stats_lw = feed_lw.compute_route_stats(trip_stats_lw, dates_lw)
route_stats_lw.head()
Out[44]:
In [76]:
feed_lw.describe().merge(feed.describe(), on='indicator')
Out[76]:
Biggest Headway Diffs¶
In [97]:
conc = (pd.concat((route_stats_lw[(route_stats_lw.date == '20180426') & (route_stats_lw.num_trips > 30)]
                   .set_index('route_id')[['route_short_name', 'num_trips', 'mean_headway']],
                   route_stats[route_stats.date == '20180503']
                   .set_index('route_id')[['num_trips', 'mean_headway']]),
                  axis=1, keys=['last_week', 'this_week'])
        .dropna())
conc[('dif', 'headway')] = 1 - conc[('last_week', 'mean_headway')] / conc[('this_week', 'mean_headway')]
conc[('dif', 'num_trips')] = conc[('this_week', 'num_trips')] / conc[('last_week', 'num_trips')]
route_names = pd.concat([feed26.routes[['route_id', 'route_long_name']].set_index('route_id')],
                        axis=1, keys=['a'])
with pd.option_context('display.max_rows', 10000, 'display.max_columns', 10):
    display(conc.sort_values(by=[('dif', 'headway')], ascending=False).head(100)
            .merge(route_names, how='left', left_index=True, right_index=True))
Comments welcome!