GTFS Stats

Using a mix of partridge and gtfstk with some of my own additions to create daily statistical DataFrames for trips, routes and stops. This will later become a module which we will run on our historical MoT GTFS archive and schedule for nightly runs.

Imports and config

In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline
In [92]:
import pandas as pd
import numpy as np
import partridge as ptg
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import datetime

from gtfs_utils import *

alt.renderers.enable('notebook')
alt.data_transformers.enable('json')

sns.set_style("white")
sns.set_context("talk")
sns.set_palette('Set2', 10)
In [5]:
LOCAL_GTFS_ZIP_PATH = 'data/gtfs_feeds/2018-03-05.zip' 
LOCAL_TARIFF_PATH = 'data/sample/latest_tariff.zip' 

Creating a partridge feed

We have a util function for getting a partridge feed object by date.

In [6]:
feed = get_partridge_feed_by_date(LOCAL_GTFS_ZIP_PATH, datetime.date(2018,3,5))
type(feed)
Out[6]:
partridge.gtfs.feed
In [7]:
zones = get_zones_df(LOCAL_TARIFF_PATH)
zones.head()
Out[7]:
stop_code zone_name
0 2716 סובב ירושלים
1 2718 סובב ירושלים
2 2720 סובב ירושלים
3 2721 סובב ירושלים
4 2747 סובב ירושלים

Stats

trip stats

  1. calculate using partridge feed
  2. add:
    1. stop_code, stop_name
    2. zone
In [8]:
import gtfstk
In [10]:
feed.stop_times.dtypes
Out[10]:
trip_id                 object
arrival_time           float64
departure_time         float64
stop_id                 object
stop_sequence            int64
pickup_type            float64
drop_off_type           object
shape_dist_traveled      int64
dtype: object

computing trip_stats corresponding to gtfstk.compute_trip_stats()

In [132]:
feed.trips.direction_id.value_counts()
Out[132]:
0    49026
1    41228
Name: direction_id, dtype: int64

check whether we have bidirectionals

In [133]:
feed.trips.groupby('route_id').direction_id.nunique().value_counts()
Out[133]:
1    6673
Name: direction_id, dtype: int64

Nope

In [55]:
f = feed.trips
f = (
    f[['route_id', 'trip_id', 'direction_id', 'shape_id']]
    .merge(feed.routes[['route_id', 'route_short_name', 'route_type']])
    .merge(feed.stop_times)
    .merge(feed.stops[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'stop_code']])
    .merge(zones)
    .sort_values(['trip_id', 'stop_sequence'])
    #.assign(departure_time=lambda x: x['departure_time'].map(
    #    hp.timestr_to_seconds)
    #       )
    )
In [58]:
f.head().T
Out[58]:
2235521 2322805 2241223 2241263 2322737
route_id 9735 9735 9735 9735 9735
trip_id 10096398_040318 10096398_040318 10096398_040318 10096398_040318 10096398_040318
direction_id 1 1 1 1 1
shape_id 88862 88862 88862 88862 88862
route_short_name 70 70 70 70 70
route_type 3 3 3 3 3
arrival_time 75600 75640 75716 75858 76483
departure_time 75600 75640 75716 75858 76483
stop_id 34657 35317 34436 34444 35506
stop_sequence 1 2 3 4 5
pickup_type 0 0 0 0 0
drop_off_type 1 0 0 0 0
shape_dist_traveled 0 276 857 1955 14838
stop_name קניון קרני שומרון שדרות רחבעם/קרני שומרון קוממיות/שדרות רחבעם נווה מנחם יקיר בסיס צבאי
stop_lat 32.1744 32.1752 32.1755 32.1796 32.1456
stop_lon 35.0912 35.0939 35.0993 35.1077 35.117
stop_code 63436 65268 63200 63213 65290
zone_name השומרון השומרון השומרון השומרון השומרון
In [68]:
geometry_by_stop = gtfstk.build_geometry_by_stop(feed, use_utm=True)
In [69]:
g = f.groupby('trip_id')
In [70]:
from collections import OrderedDict
def my_agg(group):
    d = OrderedDict()
    d['route_id'] = group['route_id'].iat[0]
    d['route_short_name'] = group['route_short_name'].iat[0]
    d['route_type'] = group['route_type'].iat[0]
    d['direction_id'] = group['direction_id'].iat[0]
    d['shape_id'] = group['shape_id'].iat[0]
    d['num_stops'] = group.shape[0]
    d['start_time'] = group['departure_time'].iat[0]
    d['end_time'] = group['departure_time'].iat[-1]
    d['start_stop_id'] = group['stop_id'].iat[0]
    d['end_stop_id'] = group['stop_id'].iat[-1]
    d['start_stop_code'] = group['stop_code'].iat[0]
    d['end_stop_code'] = group['stop_code'].iat[-1]
    d['start_stop_name'] = group['stop_name'].iat[0]
    d['end_stop_name'] = group['stop_name'].iat[-1]
    d['start_zone'] = group['zone_name'].iat[0]
    d['end_zone'] = group['zone_name'].iat[-1]
    dist = geometry_by_stop[d['start_stop_id']].distance(
      geometry_by_stop[d['end_stop_id']])
    d['is_loop'] = int(dist < 400)
    d['duration'] = (d['end_time'] - d['start_time'])/3600
    return pd.Series(d)
In [71]:
h = g.apply(my_agg)
In [72]:
h['distance'] = g.shape_dist_traveled.max()
In [73]:
# Reset index and compute final stats
h = h.reset_index()
h['speed'] = h['distance'] / h['duration'] / 1000
h[['start_time', 'end_time']] = (
  h[['start_time', 'end_time']].applymap(
  lambda x: gtfstk.helpers.timestr_to_seconds(x, inverse=True))
)
In [74]:
h.sort_values(by='speed', ascending=False).head().T
Out[74]:
36646 36647 36648 43945 43944
trip_id 29575729_040318 29575734_040318 29575739_040318 29899643_040318 29899638_040318
route_id 12518 12518 12518 12517 12517
route_short_name NaN NaN NaN NaN NaN
route_type 2 2 2 2 2
direction_id 1 1 1 0 0
shape_id 51387 51387 51387 51386 51386
num_stops 2 2 2 2 2
start_time 06:10:00 08:10:00 16:25:00 17:52:00 10:37:00
end_time 06:34:00 08:34:00 16:49:00 18:16:00 11:01:00
start_stop_id 37310 37310 37310 37314 37314
end_stop_id 37314 37314 37314 37310 37310
start_stop_code 17086 17086 17086 17082 17082
end_stop_code 17082 17082 17082 17086 17086
start_stop_name דימונה דימונה דימונה באר שבע-צפון באר שבע-צפון
end_stop_name באר שבע-צפון באר שבע-צפון באר שבע-צפון דימונה דימונה
start_zone ערד דימונה ערד דימונה ערד דימונה באר שבע באר שבע
end_zone באר שבע באר שבע באר שבע ערד דימונה ערד דימונה
is_loop 0 0 0 0 0
duration 0.4 0.4 0.4 0.4 0.4
distance 37400 37400 37400 37400 37400
speed 93.5 93.5 93.5 93.5 93.5
In [76]:
h[h.is_loop==1].sort_values(by='duration', ascending=False).head().T
Out[76]:
29454 29452 29451 29455 29453
trip_id 26534192_040318 26534183_040318 26534179_040318 26534197_040318 26534187_040318
route_id 8067 8067 8067 8067 8067
route_short_name 58 58 58 58 58
route_type 3 3 3 3 3
direction_id 0 0 0 0 0
shape_id 91607 91607 91607 91607 91607
num_stops 56 56 56 56 56
start_time 07:30:00 10:15:00 08:45:00 21:00:00 14:50:00
end_time 09:30:01 12:15:01 10:45:01 23:00:01 16:50:01
start_stop_id 37124 37124 37124 37124 37124
end_stop_id 27918 27918 27918 27918 27918
start_stop_code 13907 13907 13907 13907 13907
end_stop_code 15657 15657 15657 15657 15657
start_stop_name תחנה מרכזית באר שבע/רציפים תחנה מרכזית באר שבע/רציפים תחנה מרכזית באר שבע/רציפים תחנה מרכזית באר שבע/רציפים תחנה מרכזית באר שבע/רציפים
end_stop_name תחנה מרכזית באר שבע תחנה מרכזית באר שבע תחנה מרכזית באר שבע תחנה מרכזית באר שבע תחנה מרכזית באר שבע
start_zone באר שבע באר שבע באר שבע באר שבע באר שבע
end_zone באר שבע באר שבע באר שבע באר שבע באר שבע
is_loop 1 1 1 1 1
duration 2.00028 2.00028 2.00028 2.00028 2.00028
distance 85513 85513 85513 85513 85513
speed 42.7506 42.7506 42.7506 42.7506 42.7506

Route stats

This is mostly taken from gtfstk.compute_route_stats_base(), with some additions:

  1. first start_stop_id and end_stop_id
  2. first start_zone and end_zone
  3. first number of stops
In [ ]:
    """
    Compute stats for the given subset of trips stats.

    Parameters
    ----------
    trip_stats_subset : DataFrame
        Subset of the output of :func:`.trips.compute_trip_stats`
    split_directions : boolean
        If ``True``, then separate the stats by trip direction (0 or 1);
        otherwise aggregate trips visiting from both directions. 
        Default: ``False``
    headway_start_time : string
        HH:MM:SS time string indicating the start time for computing
        headway stats
        Default: ``'07:00:00'``
    headway_end_time : string
        HH:MM:SS time string indicating the end time for computing
        headway stats.
        Default: ``'19:00:00'``

    Returns
    -------
    DataFrame
        Columns are

        - ``'route_id'``
        - ``'route_short_name'``
        - ``'agency_id'``
        - ``'agency_name'``
        - ``'route_long_name'``
        - ``'route_type'``
        - ``'direction_id'``: 1/0
        - ``'num_trips'``: number of trips on the route in the subset
        - ``'num_trip_starts'``: number of trips on the route with
          nonnull start times
        - ``'num_trip_ends'``: number of trips on the route with nonnull
          end times that end before 23:59:59
        - ``'is_loop'``: 1 if at least one of the trips on the route has
          its ``is_loop`` field equal to 1; 0 otherwise
        - ``'is_bidirectional'``: 1 if the route has trips in both
          directions; 0 otherwise
        - ``'start_time'``: start time of the earliest trip on the route
        - ``'end_time'``: end time of latest trip on the route
        - ``'max_headway'``: maximum of the durations (in minutes)
          between trip starts on the route between
          ``headway_start_time`` and ``headway_end_time`` on the given
          dates
        - ``'min_headway'``: minimum of the durations (in minutes)
          mentioned above
        - ``'mean_headway'``: mean of the durations (in minutes)
          mentioned above
        - ``'peak_num_trips'``: maximum number of simultaneous trips in
          service (for the given direction, or for both directions when
          ``split_directions==False``)
        - ``'peak_start_time'``: start time of first longest period
          during which the peak number of trips occurs
        - ``'peak_end_time'``: end time of first longest period during
          which the peak number of trips occurs
        - ``'service_duration'``: total of the duration of each trip on
          the route in the given subset of trips; measured in hours
        - ``'service_distance'``: total of the distance traveled by each
          trip on the route in the given subset of trips; measured in
          whatever distance units are present in ``trip_stats_subset``;
          contains all ``np.nan`` entries if ``feed.shapes is None``
        - ``'service_speed'``: service_distance/service_duration;
          measured in distance units per hour
        - ``'mean_trip_distance'``: service_distance/num_trips
        - ``'mean_trip_duration'``: service_duration/num_trips
        - ``'start_stop_id'``: ``start_stop_id`` of the first trip for the route
        - ``'end_stop_id'``: ``end_stop_id`` of the first trip for the route
        - ``'num_stops'``: ``num_stops`` of the first trip for the route
        - ``'start_zone'``: ``start_zone`` of the first trip for the route
        - ``'end_zone'``: ``end_zone`` of the first trip for the route
        
        If not ``split_directions``, then remove the
        direction_id column and compute each route's stats,
        except for headways, using
        its trips running in both directions.
        In this case, (1) compute max headway by taking the max of the
        max headways in both directions; (2) compute mean headway by
        taking the weighted mean of the mean headways in both
        directions.
        
        If ``trip_stats_subset`` is empty, return an empty DataFrame.

    """
In [103]:
headway_start_time='07:00:00'
headway_end_time='19:00:00'
# Convert trip start and end times to seconds to ease calculations below
f = h.copy()
f[['start_time', 'end_time']] = f[['start_time', 'end_time']
  ].applymap(gtfstk.helpers.timestr_to_seconds)

headway_start = gtfstk.helpers.timestr_to_seconds(headway_start_time)
headway_end = gtfstk.helpers.timestr_to_seconds(headway_end_time)
In [114]:
def compute_route_stats(group):
    d = OrderedDict()
    d['route_short_name'] = group['route_short_name'].iat[0]
    d['route_type'] = group['route_type'].iat[0]
    d['num_trips'] = group.shape[0]
    d['num_trip_starts'] = group['start_time'].count()
    d['num_trip_ends'] = group.loc[
      group['end_time'] < 24*3600, 'end_time'].count()
    d['is_loop'] = int(group['is_loop'].any())
    d['is_bidirectional'] = int(group['direction_id'].unique().size > 1)
    d['start_time'] = group['start_time'].min()
    d['end_time'] = group['end_time'].max()

    # Compute headway stats
    headways = np.array([])
    for direction in [0, 1]:
        stimes = group[group['direction_id'] == direction][
          'start_time'].values
        stimes = sorted([stime for stime in stimes
          if headway_start <= stime <= headway_end])
        headways = np.concatenate([headways, np.diff(stimes)])
    if headways.size:
        d['max_headway'] = np.max(headways)/60  # minutes
        d['min_headway'] = np.min(headways)/60  # minutes
        d['mean_headway'] = np.mean(headways)/60  # minutes
    else:
        d['max_headway'] = np.nan
        d['min_headway'] = np.nan
        d['mean_headway'] = np.nan

    # Compute peak num trips
    times = np.unique(group[['start_time', 'end_time']].values)
    counts = [gtfstk.helpers.count_active_trips(group, t) for t in times]
    start, end = gtfstk.helpers.get_peak_indices(times, counts)
    d['peak_num_trips'] = counts[start]
    d['peak_start_time'] = times[start]
    d['peak_end_time'] = times[end]

    d['service_distance'] = group['distance'].sum()
    d['service_duration'] = group['duration'].sum()
    
    # Added by cjer
    d['start_stop_id'] = group['start_stop_id'].iat[0]
    d['end_stop_id'] = group['end_stop_id'].iat[0]
    
    d['num_stops'] = group['num_stops'].iat[0]
    
    d['start_zone'] = group['start_zone'].iat[0]
    d['end_zone'] = group['end_zone'].iat[0]
    
    
    return pd.Series(d)
In [115]:
g = f.groupby('route_id').apply(
compute_route_stats).reset_index()

# Compute a few more stats
g['service_speed'] = g['service_distance']/g['service_duration']
g['mean_trip_distance'] = g['service_distance']/g['num_trips']
g['mean_trip_duration'] = g['service_duration']/g['num_trips']
In [116]:
# Convert route times to time strings
g[['start_time', 'end_time', 'peak_start_time', 'peak_end_time']] =\
    g[['start_time', 'end_time', 'peak_start_time', 'peak_end_time']].\
        applymap(lambda x: gtfstk.helpers.timestr_to_seconds(x, inverse=True))
In [117]:
g['service_speed'] = g.service_speed/1000
In [118]:
g.sort_values(by='num_trips', ascending=False).head(10).T
Out[118]:
814 816 1394 1395 4351 4352 338 339 6549 4481
route_id 11685 11689 13428 13429 2259 2261 10746 10747 9817 2533
route_short_name 1 1 1 1 5 5 1 1 204 66
route_type 3 3 3 3 3 3 0 0 3 3
num_trips 195 186 177 177 156 155 148 148 143 132
num_trip_starts 195 186 177 177 156 155 148 148 143 132
num_trip_ends 192 184 173 173 155 154 147 147 142 129
is_loop 0 0 0 0 0 0 0 0 0 0
is_bidirectional 0 0 0 0 0 0 0 0 0 0
start_time 05:00:00 04:40:00 00:00:00 00:05:00 05:15:00 00:00:00 05:30:00 05:30:00 00:05:00 00:10:00
end_time 24:45:30 24:22:09 24:49:57 24:55:49 24:19:58 24:12:07 24:09:47 24:04:47 24:08:24 24:43:53
max_headway 5 5 7 6 7 7 10 10 12 12
min_headway 4 4 5 5 5 5 6 6 4 6
mean_headway 4.4472 4.49315 5.752 5.744 6.01681 6.06838 6.18261 6.13793 6.78095 7.4375
peak_num_trips 17 16 13 14 6 7 3 3 7 9
peak_start_time 07:04:00 07:00:00 07:40:00 14:15:00 12:55:00 14:20:00 07:04:00 07:12:00 07:40:00 14:48:00
peak_end_time 07:05:30 07:02:09 07:44:57 14:15:49 12:59:58 14:22:07 07:09:47 07:13:47 07:40:24 14:53:53
service_distance 4980105 4731840 3944622 3967986 1367184 1364775 1972248 1972692 1411553 2363064
service_duration 212.875 192.665 191.602 194.159 77.9133 82.9681 43.8656 33.9989 67.6867 118.543
start_stop_id 36217 35756 37504 14613 37649 12889 35161 35119 13112 14323
end_stop_id 35501 35827 23921 37504 13730 14318 35120 35162 13700 37504
num_stops 42 41 51 55 26 26 23 23 33 40
start_zone סובב חיפה סובב חיפה גוש דן גוש דן גוש דן גוש דן סובב ירושלים סובב ירושלים גוש דן גוש דן
end_zone סובב חיפה סובב חיפה גוש דן גוש דן גוש דן גוש דן סובב ירושלים סובב ירושלים גוש דן גוש דן
service_speed 23.3945 24.5599 20.5875 20.4368 17.5475 16.4494 44.9612 58.0222 20.8542 19.9342
mean_trip_distance 25539 25440 22286 22418 8764 8805 13326 13329 9871 17902
mean_trip_duration 1.09167 1.03583 1.0825 1.09694 0.499444 0.535278 0.296389 0.229722 0.473333 0.898056

Add stuff

  1. agency_id, agency_name
  2. route_long_name
In [119]:
g = (g
     .merge(feed.routes[['route_id', 'route_long_name', 'agency_id']], how='left', on='route_id')
     .merge(feed.agency[['agency_id', 'agency_name']], how='left', on='agency_id')
    )
In [120]:
g = g[['route_id', 'route_short_name', 'agency_id', 'agency_name', 'route_long_name', 'route_type', 
           'num_trips', 'num_trip_starts', 'num_trip_ends', 'is_loop', 
           'is_bidirectional', 'start_time', 'end_time', 'max_headway', 'min_headway', 
           'mean_headway', 'peak_num_trips', 'peak_start_time', 'peak_end_time',
           'service_distance', 'service_duration', 'service_speed',
           'mean_trip_distance', 'mean_trip_duration', 'start_stop_id',
           'end_stop_id', 'num_stops', 'start_zone', 'end_zone', 
           ]]
In [135]:
g.sort_values(by='peak_num_trips', ascending=False).head(10).T
Out[135]:
814 1534 816 810 812 1395 4516 1394 6544 5504
route_id 11685 14050 11689 11681 11683 13429 2709 13428 9809 6660
route_short_name 1 1 1 3 3 1 129 1 172 417
agency_id 30 30 30 30 30 5 5 5 5 3
agency_name דן צפון דן צפון דן צפון דן צפון דן צפון דן דן דן דן אגד
route_long_name תחנה מרכזית חוף כרמל/רציפים עירוני-חיפה<->מרכז... מרכזית הקריות-קרית מוצקין<->תחנה מרכזית חוף כר... מרכזית הקריות-קרית מוצקין<->תחנה מרכזית חוף כר... הנביאים-חיפה<->מרכזית הקריות-קרית מוצקין-10 מרכזית הקריות-קרית מוצקין<->הנביאים-חיפה-20 מרכז הספורט/דרך בגין-בת ים<->תחנה מרכזית פתח ת... אבי האסירים-ראשון לציון<->רדינג-תל אביב יפו-20 תחנה מרכזית פתח תקווה/רציפים עירוני-פתח תקווה<... האורגים-חולון<->רדינג-תל אביב יפו-20 שד.נחל צאלים/נחל חבר-בית שמש<->מסוף אגד/הר חוצ...
route_type 3 3 3 3 3 3 3 3 3 3
num_trips 195 27 186 123 123 177 55 177 111 89
num_trip_starts 195 27 186 123 123 177 55 177 111 89
num_trip_ends 192 27 184 121 121 173 55 173 110 86
is_loop 0 0 0 0 0 0 0 0 0 0
is_bidirectional 0 0 0 0 0 0 0 0 0 0
start_time 05:00:00 18:00:00 04:40:00 05:00:00 05:00:00 00:05:00 05:30:00 00:00:00 00:00:00 05:30:00
end_time 24:45:30 21:16:37 24:22:09 24:30:46 24:32:21 24:55:49 20:22:46 24:49:57 24:24:30 24:44:52
max_headway 5 4 5 10 10 6 25 7 12 20
min_headway 4 4 4 5 5 5 5 5 5 5
mean_headway 4.4472 4 4.49315 8.18391 8.18391 5.744 17.575 5.752 9 11.25
peak_num_trips 17 17 16 14 14 14 13 13 12 12
peak_start_time 07:04:00 19:06:00 07:00:00 08:00:00 08:00:00 14:15:00 07:38:00 07:40:00 07:15:00 08:00:00
peak_end_time 07:05:30 19:06:37 07:02:09 08:03:46 08:05:21 14:15:49 07:42:46 07:44:57 07:19:30 08:04:52
service_distance 4980105 741420 4731840 1949919 1948074 3967986 1267200 3944622 1879341 3785793
service_duration 212.875 29.9775 192.665 145.072 148.318 194.159 66.7028 191.602 100.825 96.2189
service_speed 23.3945 24.7325 24.5599 13.4411 13.1345 20.4368 18.9977 20.5875 18.6396 39.3456
mean_trip_distance 25539 27460 25440 15853 15838 22418 23040 22286 16931 42537
mean_trip_duration 1.09167 1.11028 1.03583 1.17944 1.20583 1.09694 1.21278 1.0825 0.908333 1.08111
start_stop_id 36217 35756 35756 3734 35756 14613 23881 37504 24174 10840
end_stop_id 35501 35827 35827 35501 3734 37504 13700 23921 13700 10684
num_stops 42 42 41 28 28 55 56 51 51 43
start_zone סובב חיפה סובב חיפה סובב חיפה סובב חיפה סובב חיפה גוש דן גוש דן גוש דן גוש דן אזור בית שמש
end_zone סובב חיפה סובב חיפה סובב חיפה סובב חיפה סובב חיפה גוש דן גוש דן גוש דן גוש דן סובב ירושלים
In [138]:
g.sort_values(by='peak_num_trips', ascending=False).head(10).T.to_csv('180305_route_stats_top10_peak_num_trips.csv')
In [125]:
g.is_bidirectional.value_counts()
Out[125]:
0    6673
Name: is_bidirectional, dtype: int64

What's next

TODO

  1. add split_directions
  2. time between stops - max, min, mean (using delta)
  3. integrate with custom day cutoff
  4. add day and night headways and num_trips (maybe noon also)
  5. put this all back into proper documented functions
  6. write tests

Comments !