Playing with GTFS - New Zones
We have noticed a big change in how MOT uses the zone_id field. There are thousands of zones now, compared to the few dozen that were there about three months ago.
I took a look at the Tariff.zip file in the FTP folder, and noticed that it now includes two files: Tariff.txt, which we knew from before, and a new file, StationToReformZone.txt, which holds a new mapping. The change is documented in MOT's developers pdf, with examples and explanations. The rest of this notebook shows how to work with these changes by merging the partridge pandas DataFrames.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ftplib import FTP
import datetime
import re
import zipfile
import os
%matplotlib inline
plt.rcParams['figure.figsize'] = (10, 5) # set default size of plots
sns.set_style("white")
sns.set_context("talk")
sns.set_palette('Set2', 10)
Get the data
Start by making an FTP connection and retrieving the GTFS zip file's modification date. We're restricted from using most commands, but we can LIST...
MOT_FTP = 'gtfs.mot.gov.il'
FILE_NAME = 'israel-public-transportation.zip'
UPTODATE = 150 #days
local_zip_path = 'data/sample/gtfs.zip'
conn = FTP(MOT_FTP)
conn.login()
ftp_dir = []
conn.retrlines('LIST', lambda x: ftp_dir.append(x))
ftp_dir
Now that we have a list of file records, let's extract the dates and see whether our file is up-to-date.
RE_FTP_LINE = re.compile(
r'(?P<date>\d+-\d+-\d+\s+\d+:\d+[APM]{2})\s+(?P<size><DIR>|[0-9]+)\s+(?P<file_name>.*)')
f = [re.findall(RE_FTP_LINE, line) for line in ftp_dir]
f_dates = {t[0][2]: datetime.datetime.strptime(t[0][0], "%m-%d-%y %I:%M%p") for t in f}  # %I, not %H: the listing uses 12-hour times with AM/PM
ftp_date = f_dates[FILE_NAME]
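To make the regex above a bit less opaque, here's what it pulls out of a single record. The sample line below is made up, but follows the same IIS-style layout as the listing we got back:
# A made-up record in the same layout as the LIST output above
sample = '12-21-17  10:01AM             42176426 israel-public-transportation.zip'
m = RE_FTP_LINE.match(sample)
print(m.group('date'), '|', m.group('size'), '|', m.group('file_name'))
# 12-21-17  10:01AM | 42176426 | israel-public-transportation.zip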
have_zip = os.path.exists(local_zip_path)
if have_zip:
    our_date = datetime.datetime.fromtimestamp(os.path.getmtime(local_zip_path))
    our_uptodateness = (ftp_date - our_date).days
    print(f'Our zip is {our_uptodateness} days old')
    if our_uptodateness > UPTODATE:
        print('Oh snap we need to download a new zip...')
    else:
        print('Oh goody! we\'re up-to-date!')
And now we shall actually download the zip... if we really want to...
if not have_zip or our_uptodateness > UPTODATE:
    temp_local_path = 'data/sample/gtfs_temp.zip'
    with open(temp_local_path, 'wb') as fh:
        conn.retrbinary('RETR %s' % (FILE_NAME), fh.write)
    # os.replace overwrites an existing zip and, unlike remove + rename,
    # doesn't blow up when there was no previous zip to begin with
    os.replace(temp_local_path, local_zip_path)
conn.quit()
Let's also load MOT's Tariff.zip file, which will give us zone names and other local info.
TARIFF_FILE_NAME = 'Tariff.zip'
TARIFF_TXT_NAME = 'Tariff.txt'
TARIFF_TO_REFORM_ZONE = 'StationToReformZone.txt'
local_tariff_path = 'data/sample/tariff.zip'
conn = FTP(MOT_FTP)
conn.login()
with open(local_tariff_path, 'wb') as fh:
    conn.retrbinary('RETR %s' % (TARIFF_FILE_NAME), fh.write)
conn.quit()
import partridge as ptg
service_ids_by_date = ptg.read_service_ids_by_date(local_zip_path)
next_thursday = None  # TODO: compute this instead of hardcoding the date - see the sketch below
service_ids = service_ids_by_date[datetime.date(2018, 3, 1)]
feed = ptg.feed(local_zip_path, view={
'trips.txt': {
'service_id': service_ids,
},
})
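Filling in that TODO would look something like the helper below. This is my own sketch, not from the original notebook; note that the hardcoded 2018-03-01 is itself a Thursday.
# Sketch for the TODO above: find the next Thursday instead of hardcoding a date
def get_next_thursday(from_date=None):
    '''Return the next Thursday on or after from_date (defaults to today).'''
    from_date = from_date or datetime.date.today()
    days_ahead = (3 - from_date.weekday()) % 7  # Monday=0, so Thursday=3
    return from_date + datetime.timedelta(days=days_ahead)

# next_thursday = get_next_thursday()
# service_ids = service_ids_by_date[next_thursday]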
The partridge feed now references all the GTFS files in the zip, but only loads each one into memory when it's first used.
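To make that concrete, here's roughly how the laziness behaves (as I understand partridge, the view's filters also propagate through the feed's dependency graph, so each table comes back pre-filtered):
# Nothing is parsed at construction time; each table is read on first access
trips = feed.trips            # parses trips.txt, filtered to our service_ids
stop_times = feed.stop_times  # parses stop_times.txt for the remaining trips
print(len(trips), len(stop_times))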
New zone mapping
Before we build our first tidy stop times DataFrame, we need zone name data. We need to combine Tariff.txt with StationToReformZone.txt into a mapping, as described in MOT's developers pdf.
Note: I'm disregarding FromDate and ToDate and just dropping duplicates, so we will give wrong zone names for zones that change within the GTFS 60-day period (a date-aware alternative is sketched below).
# not a true csv, so we need to jiggle it a bit
cols = ['ShareCode','ShareCodeDesc','ZoneCodes','Daily','Weekly','Monthly','FromDate','ToDate', 'EXTRA']
reform_cols = ['StationId', 'ReformZoneCode','FromDate','ToDate', 'EXTRA']
with zipfile.ZipFile(local_tariff_path) as zf:
    tariff_df = (pd.read_csv(zf.open(TARIFF_TXT_NAME), header=None, skiprows=[0], names=cols)
                 .drop(columns=['EXTRA']))
    reform_df = (pd.read_csv(zf.open(TARIFF_TO_REFORM_ZONE), header=None, skiprows=[0], names=reform_cols)
                 .drop(columns=['EXTRA']))
# remove ShareCodes which contain multiple zones, e.g. גוש דן מורחב (Extended Gush Dan)
tariff_df = (tariff_df[~tariff_df.ZoneCodes.str.contains(';')]
             .rename(columns={'ShareCodeDesc': 'zone_name',
                              'ZoneCodes': 'zone_id'}))
tariff_df.head()
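As an aside, here's what the date-aware alternative mentioned above could look like. This is an untested sketch: it assumes FromDate and ToDate parse cleanly with pd.to_datetime, which I haven't verified against the actual file format.
# Untested sketch: keep only the reform mappings valid on the analysis date,
# assuming FromDate/ToDate are parseable date strings (unverified)
target_date = pd.Timestamp(2018, 3, 1)
valid_reform = reform_df[
    (pd.to_datetime(reform_df['FromDate'], errors='coerce') <= target_date) &
    (pd.to_datetime(reform_df['ToDate'], errors='coerce') >= target_date)
]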
rs = reform_df[['StationId', 'ReformZoneCode']].drop_duplicates().applymap(str).set_index('StationId').iloc[:,0]
rs.head()
ts = (tariff_df[['zone_id', 'zone_name']].drop_duplicates().set_index('zone_id').iloc[:,0])
ts.head()
zones = rs.map(ts).reset_index().rename(columns={'StationId': 'stop_code', 'ReformZoneCode':'zone_name'})
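Before merging this into the stop times, a quick coverage check doesn't hurt: any station missing from the mapping (or whose zone code has no name in Tariff.txt) ends up with a NaN zone_name.
# How many stations ended up with a usable zone name
print(f'{zones.zone_name.notna().sum():,} of {len(zones):,} stations have a zone name')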
And we're ready to build our final stop_times DataFrame. There isn't a lot of work to do, since it's already in a good tidy format: we just select the columns we want, and do some transformations and merges.
def to_timedelta(df):
    '''
    Turn time columns into timedelta dtype
    '''
    cols = ['arrival_time']
    numeric = df[cols].apply(pd.to_timedelta, unit='s')
    df = df.copy()
    df[cols] = numeric
    return df
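Why timedelta? partridge hands arrival_time back as seconds past midnight (GTFS times can run past 24:00:00 for after-midnight trips), and timedeltas let us resample with pd.Grouper later on. A quick check on made-up values:
# Made-up values: 08:00:00 and an after-midnight 25:00:00
to_timedelta(pd.DataFrame({'arrival_time': [8 * 3600, 25 * 3600]}))
# 0   0 days 08:00:00
# 1   1 days 01:00:00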
f = (feed.stop_times[['trip_id', 'arrival_time', 'stop_id']]
     .assign(date=datetime.date(2018, 3, 1))  # the service date we filtered the feed by
     .merge(feed.stops[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'stop_code']], on='stop_id')
     .merge(zones, how='left')
     # Much faster joins and slices with Categorical dtypes
     .assign(zone_name=lambda x: pd.Categorical(x['zone_name']))
     .pipe(to_timedelta))
f.head()
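It's worth glancing at the dtypes to confirm both transformations took: arrival_time should now be timedelta64[ns], and zone_name a category.
f.dtypes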
Top 10 zones
top_zones = f.groupby('zone_name').size().sort_values(ascending=False).head(10)
top_zones.reset_index()
Zone timeline
zone10 = (f[f.zone_name.isin(top_zones.index)]
          .set_index('arrival_time')
          .groupby([pd.Grouper(freq='1T'), 'zone_name'])
          .size())
zones10 = zone10.reset_index().rename(columns={0: 'size'})
# reverse names so Hebrew shows correctly in the plot (matplotlib doesn't do RTL)
zones10 = zones10.assign(zone_name=lambda x: x['zone_name'].str[::-1])
zones10 = zones10.pivot(index='arrival_time', columns='zone_name', values='size')
fig, ax = plt.subplots()
zones10.plot(figsize=(15,10), ax=ax)
handles, labels = ax.get_legend_handles_labels()
# sort both labels and handles by their rank (and do so in a very weird way :))
top_zones_rank = top_zones.reset_index()
labels, handles = zip(*sorted(
    zip(labels, handles),
    key=lambda t: top_zones_rank[top_zones_rank.zone_name == t[0][::-1]].index.tolist()[0]))
ax.legend(handles, labels, title="zone_name")
sns.despine()
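By the way, the [::-1] reversal is a cheap workaround for matplotlib's lack of right-to-left text support, and it breaks on labels that mix Hebrew with Latin characters or digits. If an extra dependency is acceptable, the python-bidi package (not used in this notebook) handles this properly:
# Alternative to manual reversal; requires `pip install python-bidi`
from bidi.algorithm import get_display
display_names = [get_display(name) for name in top_zones.index]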
Looking good...