Playing with GTFS III - Geo & Graphs
Third post of the Playing with GTFS series (1st, 2nd).
Again, we're playing with Israel's Ministry of Transportation GTFS data, this time with a focus on preliminary work on geo visualizations and graph stuff. Using pandas, partridge, bokeh, peartree and networkx.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
%matplotlib inline
So, yes, again, I'm trying to show a play-by-play analysis of the Israeli GTFS for SIRI archiving purposes we have at hasadna/open-bus.
I'm drifting a bit from the main purpose of these analyses, because I need to do some prep work on graphs and geo visualizations if I want to ask more questions. In order to do so I start with mapping stuff with colors, and also do some basic work with networkx graphs. This is all preliminary work, and I'll do more meaniningful stuff in the next posts.
The agenda this time is: (again, you can skip to item 3 if you're just interested in the stats, or if you already read the previous post):
Get the data¶
We get the data straight from MOT's ftp. Just so I don't repeat the last post, this went out to a separate module.
from gtfs_utils import *
conn = ftp_connect()
ftp_dir = get_ftp_dir(conn)
UPTODATE = 90 #days
our_uptodateness = get_uptodateness(ftp_dir)
if our_uptodateness > UPTODATE:
get_ftp_file(conn)
get_ftp_file(conn, file_name = 'Tariff.zip', local_zip_path = 'data/sample/tariff.zip' )
conn.quit()
We load MOT's tarrif.txt file, which will give us zone names and other local info. Found out it's the file is even more fd-up than I realized, so I add the southern zones manualy
tariff_df = extract_tariff_df(local_zip_path = 'data/sample/tariff.zip')
south = [
{
'zone_name': 'מצפה רמון',
'zone_id': '903'
},
{
'zone_name': 'ערבה',
'zone_id': '902'
},
{
'zone_name': 'אילת',
'zone_id': '901'
},]
south = pd.DataFrame(south)
tariff_df = tariff_df.append(south)
tariff_df.tail()
LOCAL_ZIP_PATH = 'data/sample/gtfs.zip'
import partridge as ptg
service_ids_by_date = ptg.read_service_ids_by_date(LOCAL_ZIP_PATH)
service_ids = service_ids_by_date[datetime.date(2017, 12, 21)]
feed = ptg.feed(LOCAL_ZIP_PATH, view={
'trips.txt': {
'service_id': service_ids,
},
})
def to_timedelta(df):
'''
Turn time columns into timedelta dtype
'''
cols = ['arrival_time', 'departure_time']
numeric = df[cols].apply(pd.to_timedelta, unit='s')
df = df.copy()
df[cols] = numeric
return df
s = feed.stops
r = feed.routes
t = (feed.trips
.assign(route_id=lambda x: pd.Categorical(x['route_id'])))
f = (feed.stop_times[['trip_id', 'departure_time', 'arrival_time', 'stop_id', 'stop_sequence']]
.assign(date = datetime.date(2017, 12, 21))
.merge(s[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'zone_id']], on='stop_id')
# Much faster joins and slices with Categorical dtypes
.merge(tariff_df.groupby(['zone_id', 'zone_name']).size().reset_index()[['zone_id', 'zone_name']], on='zone_id')
.assign(zone_id=lambda x: pd.Categorical(x['zone_id']))
.assign(zone_name=lambda x: pd.Categorical(x['zone_name']))
.merge(t[['trip_id', 'route_id', 'direction_id']], on='trip_id')
.merge(r[['route_id', 'route_short_name', 'route_long_name']], on='route_id')
.assign(route_id=lambda x: pd.Categorical(x['route_id']))
.pipe(to_timedelta)
)
f.head()
Interactive bokeh maps¶
I'm starting by repeating the same map from last time.
from bokeh.io import output_notebook, show
output_notebook()
from bokeh.plotting import figure
from bokeh.tile_providers import CARTODBPOSITRON
from bokeh.palettes import inferno, magma
# web mercator coordinates (got them here https://epsg.io/map)
Center_Israel = x_range, y_range = ((3852120,3852120+10e4), (3729820,3729820+10e4/1.3))
plot_width = int(540)
plot_height = int(plot_width//1.3)
def base_plot(tools='pan,wheel_zoom,box_zoom,reset', active_drag='pan',
active_scroll='wheel_zoom', toolbar_location='left',
plot_width=plot_width, plot_height=plot_height, **plot_args):
p = figure(tools=tools, active_drag=active_drag, active_scroll=active_scroll,
toolbar_location=toolbar_location,
plot_width=plot_width, plot_height=plot_height,
x_range=x_range, y_range=y_range, outline_line_color=None,
min_border=0, min_border_left=0, min_border_right=0,
min_border_top=0, min_border_bottom=0, **plot_args)
p.axis.visible = False
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
return p
def wgs84_to_web_mercator(df, lon="stop_lon", lat="stop_lat"):
"""Converts decimal longitude/latitude to Web Mercator format"""
k = 6378137
df["x"] = df[lon] * (k * np.pi/180.0)
df["y"] = np.log(np.tan((90 + df[lat]) * np.pi/360.0)) * k
return df
stops = (s[['stop_lon', 'stop_lat', 'stop_id', 'zone_id']].set_index('stop_id')
.pipe(wgs84_to_web_mercator)
.assign(counts = f.stop_id.value_counts())
.sort_values(by='counts', ascending=False))
pal = inferno(256)
c256 = 255 - pd.cut(stops.counts.fillna(0), 256).cat.codes
colors = [pal[c] for _, c in c256.iteritems()]
options = dict(line_color=None, fill_color=colors, size=5)
p = base_plot()
p.add_tile(CARTODBPOSITRON)
p.circle(x=stops['x'], y=stops['y'], **options)
show(p)