The Sacramento Police Department daily activity logs website was always the first site I visited when I worked the early morning crime shifts at the Sacramento Bee.
A public relations officer updated it every morning with a fairly well-standardized list of notable incidents the department responded to the day before, and I would pick a few incidents from this list, make some calls and write up a post for the Bee’s popular crime blog.
Although the daily activity logs were great fodder for the crime blotter, I wondered how accurately the list of incidents represented crime in Sacramento. I wrote a scraper to grab all the daily activity pages from 2013 and downloaded the 2013 Uniform Crime Reporting (UCR) data, but never got around to using the sources for the paper.
This notebook contains my efforts to compare the Sacramento police activity logs and UCR crime data.
What are the most common crimes in the UCR data and the daily activity logs?
under-reporting certain crimes? crimes that don't appear in logs
hard to link! but if you had these activities you would expect these crimes?
pick a sample of UCRs, find some comparable activity reports, expand?
When do most UCR and daily activity incidents happen?
season, time of day
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load both sources in one pass: first the daily-activity incident CSV,
# then the 2012 RMS export that the rest of the notebook refers to as "ucr".
activity, ucr = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/refine-crimes.csv',
        'data/RMSData_2012-01-01_to_2012-12-31.csv',
    )
)
The number of incidents in the 2012 daily activity logs is 8.5 percent of the number of reported crimes in 2012.
# Ratio of logged daily-activity incidents to reported UCR crimes
# (non-null 'Crime' entries divided by non-null 'InternalID' entries).
print(float(activity['Crime'].count()) / float(ucr['InternalID'].count()))

# Tally each distinct crime label once and reuse the result below rather
# than recomputing value_counts() for every statistic.
activity_crime_counts = activity['Crime'].value_counts()

# The "\n" escapes had lost their backslashes, so a literal "n" / "nn" was
# printed instead of the intended blank lines.
print("Daily activity frequencies\n")
print(activity_crime_counts[0:10])
print("\n\n")
print("Distinct crimes: %s" % len(activity_crime_counts))
print("Average incidents per crime: %s" % activity_crime_counts.mean())
print("Median incidents per crime: %s" % activity_crime_counts.median())

# Histogram of how many incidents each distinct crime label accounts for.
activity_crime_counts.hist()
plt.title('Distribution of activity frequencies')
plt.show()
# Tally each distinct UCR description once and reuse the result below
# rather than recomputing value_counts() for every statistic.
ucr_description_counts = ucr['Description'].value_counts()

# The "\n" escapes had lost their backslashes, so a literal "n" / "nn" was
# printed instead of the intended blank lines.
print("UCR frequencies\n")
print(ucr_description_counts[0:10])
print("\n\n")
print("Distinct crimes: %s" % len(ucr_description_counts))
print("Average incidents per crime: %s" % ucr_description_counts.mean())
print("Median incidents per crime: %s" % ucr_description_counts.median())

# Histogram of how many records each distinct description accounts for.
ucr_description_counts.hist()
plt.title('Distribution of UCR frequencies')
plt.show()
# 25 most frequent labels in each data set, most common first.
#
# The previous groupby().count().sort_index(by=...) form depended on the
# grouping column also appearing as a counted column and on the `by=`
# argument of sort_index, which newer pandas releases no longer accept.
# value_counts() yields the same per-label counts already sorted in
# descending order. Each result is wrapped in a one-column DataFrame so that
# top_ucrs['Description'] and top_activities['Crime'] still hold the counts.
top_ucrs = pd.DataFrame(
    {'Description': ucr['Description'].value_counts()[0:25]})
top_activities = pd.DataFrame(
    {'Crime': activity['Crime'].value_counts()[0:25]})
# Horizontal bar chart of the 25 most common UCR descriptions.
fig = plt.figure()
fig.subplots_adjust(left=0.2, wspace=0.6)
# Create the axes on this figure explicitly and draw onto them, instead of
# letting pandas pick the current axes and re-adding them afterwards.
ax1 = fig.add_subplot(111)
ucr_plot = top_ucrs['Description'].plot(kind="barh", ax=ax1)
ax1.set_title('Top 25 UCR descriptions')
ax1.set_ylabel('')
plt.show()

# Horizontal bar chart of the 25 most common daily-activity crime labels.
fig = plt.figure()
fig.subplots_adjust(left=0.2, wspace=0.6)
ax2 = fig.add_subplot(111)
activity_plot = top_activities['Crime'].plot(kind="barh", ax=ax2)
# Title previously read 'Top 25 acitivity incididents' (typos).
ax2.set_title('Top 25 activity incidents')
ax2.set_xlim(0, 600)
ax2.set_ylabel('')
plt.show()
# Tick labels for the by-month and by-weekday charts (AP-style abbreviations).
months = "Jan. Feb. March April May June July Aug. Sept. Oct. Nov. Dec.".split()
days = "Mon Tues Wed Thurs Fri Sat Sun".split()
def get_activity_date(activity):
    """Parse the date carried in the last eight characters of a log URL.

    ``activity`` is a single URL string whose final eight characters are
    read as ``YYYYMMDD``. The parsed value is whatever ``pd.to_datetime``
    returns for that slice.
    """
    # grabs the posted date from the URL, likely making days off by one
    # TODO scrape dates of the incidents from the pages
    date_digits = activity[-8:]
    return pd.to_datetime(date_digits, format='%Y%m%d')
# Derive calendar fields for every logged incident from its page URL.
# get_activity_date is passed directly; the lambda wrapper added nothing.
activity['date'] = activity.url.apply(get_activity_date)
activity['month'] = activity.date.apply(lambda stamp: stamp.month)
activity['weekday'] = activity.date.apply(lambda stamp: stamp.weekday())

# Line chart of logged incidents per calendar month.
activity_by_month = activity.groupby('month').size().plot()
activity_by_month.set_title('Activity by month')
# Pin one tick on each month number before labelling. set_xticklabels only
# renames whatever ticks exist, so without this the month names landed on
# matplotlib's automatically chosen tick positions.
activity_by_month.set_xticks(list(range(1, 13)))
activity_by_month.set_xticklabels(months)
activity_by_month.set_xlim(1, 12)
plt.show()
# Parse the occurrence column into datetimes so calendar fields can be read.
ucr.OccDate = pd.to_datetime(ucr.OccDate)

# Line chart of UCR records per weekday (0 = Monday ... 6 = Sunday).
ucr['weekday'] = ucr.OccDate.apply(lambda stamp: stamp.weekday())
ucr_by_day = ucr.groupby('weekday').size().plot()
# Fix one tick per weekday number so each label sits on its own day;
# set_xticklabels alone only renames the automatically chosen ticks.
ucr_by_day.set_xticks(list(range(7)))
ucr_by_day.set_xticklabels(days)
ucr_by_day.set_title('UCR by weekday')
plt.show()

# Line chart of UCR records per calendar month.
ucr['month'] = ucr.OccDate.apply(lambda stamp: stamp.month)
ucr_by_month = ucr.groupby('month').size().plot()
ucr_by_month.set_title('UCR by month')
# Same tick pinning as above, for month numbers 1-12.
ucr_by_month.set_xticks(list(range(1, 13)))
ucr_by_month.set_xticklabels(months)
ucr_by_month.set_xlim(1, 12)
plt.show()
# Time of day as fractional hours (e.g. 13:30 -> 13.5).
# The previous expression, x.minute / 100, is integer division under
# Python 2 and therefore always 0, so minutes were silently dropped. Even
# with true division, 100 is the wrong divisor for minutes. Dividing by 60.0
# forces float arithmetic and gives a value that lines up with the 0-24 axis.
ucr['time'] = ucr.OccDate.apply(
    lambda stamp: stamp.hour + stamp.minute / 60.0)

# Line chart of UCR records by time of day.
ucr_by_time = ucr.groupby('time').size().plot()
ucr_by_time.set_title('UCR by time of day')
ucr_by_time.set_xlim(0, 24)
plt.show()
# Running total of logged incidents, accumulated over the 'date' groups.
activity_running_total = activity.groupby('date').size().cumsum()
activity_cumsum = activity_running_total.plot()
activity_cumsum.set_title('Total 2012 activities by date')
plt.show()

# Running total of UCR records, accumulated over the distinct OccDate values.
ucr_running_total = ucr.groupby('OccDate').size().cumsum()
ucr_cumsum = ucr_running_total.plot()
ucr_cumsum.set_title('Total 2012 UCR by date')
plt.show()

# Summary statistics and histogram of logged incidents per date.
activity_per_day = activity.groupby('date').size()
print("Daily activity incidents per day")
print("Mean: %s" % activity_per_day.mean())
print("Standard dev: %s" % activity_per_day.std())
apd_hist = activity_per_day.hist(bins=25)
apd_hist.set_title('Distribution of daily activity incidents per day')
plt.show()
def get_md(ucr_date):
    """Return ``ucr_date`` as a 'month/day' string with no zero padding.

    Only the ``month`` and ``day`` attributes of ``ucr_date`` are read.
    """
    parts = (ucr_date.month, ucr_date.day)
    return '/'.join(str(part) for part in parts)
# Bucket every UCR record by its 'month/day' key so that records sharing a
# calendar day are counted together regardless of their time of day.
ucr['md'] = ucr.OccDate.apply(get_md)

# The heading previously read "Daily activity incidents per day", copied from
# the activity-log section, which mislabelled these UCR statistics.
print("UCR crimes per day")
ucr_per_day = ucr.groupby('md').size()
print("Mean: %s" % ucr_per_day.mean())
print("Standard dev: %s" % ucr_per_day.std())

# Histogram of the per-day UCR record counts.
upd_hist = ucr_per_day.hist(bins=25)
upd_hist.set_title('Distribution of UCR crimes per day')
plt.show()
# TODO nltk?
# TODO combine charts
# TODO Monte Carlo
The Sacramento Police Department daily activity logs website was always the first site I visited when I worked the early morning crime shifts at the Sacramento Bee.
A public relations officer updated it every morning with a fairly well-standardized list of notable incidents the department responded to the day before, and I would pick a few incidents from this list, make some calls and write up a post for the Bee's popular crime blog.
Although the daily activity logs were great fodder for the crime blotter, I wondered how accurately the list of incidents represented crime in Sacramento. I wrote a scraper to grab all the daily activity pages from 2013 and downloaded the 2013 Uniform Crime Reporting (UCR) data, but never got around to using the sources for the paper.
This notebook contains my efforts to compare the Sacramento police activity logs and UCR crime data.
What are the most common crimes in the UCR data and the daily activity logs?
under-reporting certain crimes? crimes that don't appear in logs
hard to link! but if you had these activities you would expect these crimes?
pick a sample of UCRs, find some comparable activity reports, expand?
When do most UCR and daily activity incidents happen?
season, time of day
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load both sources in one pass: first the daily-activity incident CSV,
# then the 2012 RMS export that the rest of the notebook refers to as "ucr".
activity, ucr = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/refine-crimes.csv',
        'data/RMSData_2012-01-01_to_2012-12-31.csv',
    )
)
The number of incidents in the 2012 daily activity logs is 8.5 percent of the number of reported crimes in 2012.
# Ratio of logged daily-activity incidents to reported UCR crimes
# (non-null 'Crime' entries divided by non-null 'InternalID' entries).
print(float(activity['Crime'].count()) / float(ucr['InternalID'].count()))

# Tally each distinct crime label once and reuse the result below rather
# than recomputing value_counts() for every statistic.
activity_crime_counts = activity['Crime'].value_counts()

# The "\n" escapes had lost their backslashes, so a literal "n" / "nn" was
# printed instead of the intended blank lines.
print("Daily activity frequencies\n")
print(activity_crime_counts[0:10])
print("\n\n")
print("Distinct crimes: %s" % len(activity_crime_counts))
print("Average incidents per crime: %s" % activity_crime_counts.mean())
print("Median incidents per crime: %s" % activity_crime_counts.median())

# Histogram of how many incidents each distinct crime label accounts for.
activity_crime_counts.hist()
plt.title('Distribution of activity frequencies')
plt.show()
# Tally each distinct UCR description once and reuse the result below
# rather than recomputing value_counts() for every statistic.
ucr_description_counts = ucr['Description'].value_counts()

# The "\n" escapes had lost their backslashes, so a literal "n" / "nn" was
# printed instead of the intended blank lines.
print("UCR frequencies\n")
print(ucr_description_counts[0:10])
print("\n\n")
print("Distinct crimes: %s" % len(ucr_description_counts))
print("Average incidents per crime: %s" % ucr_description_counts.mean())
print("Median incidents per crime: %s" % ucr_description_counts.median())

# Histogram of how many records each distinct description accounts for.
ucr_description_counts.hist()
plt.title('Distribution of UCR frequencies')
plt.show()
# 25 most frequent labels in each data set, most common first.
#
# The previous groupby().count().sort_index(by=...) form depended on the
# grouping column also appearing as a counted column and on the `by=`
# argument of sort_index, which newer pandas releases no longer accept.
# value_counts() yields the same per-label counts already sorted in
# descending order. Each result is wrapped in a one-column DataFrame so that
# top_ucrs['Description'] and top_activities['Crime'] still hold the counts.
top_ucrs = pd.DataFrame(
    {'Description': ucr['Description'].value_counts()[0:25]})
top_activities = pd.DataFrame(
    {'Crime': activity['Crime'].value_counts()[0:25]})
# Horizontal bar chart of the 25 most common UCR descriptions.
fig = plt.figure()
fig.subplots_adjust(left=0.2, wspace=0.6)
# Create the axes on this figure explicitly and draw onto them, instead of
# letting pandas pick the current axes and re-adding them afterwards.
ax1 = fig.add_subplot(111)
ucr_plot = top_ucrs['Description'].plot(kind="barh", ax=ax1)
ax1.set_title('Top 25 UCR descriptions')
ax1.set_ylabel('')
plt.show()

# Horizontal bar chart of the 25 most common daily-activity crime labels.
fig = plt.figure()
fig.subplots_adjust(left=0.2, wspace=0.6)
ax2 = fig.add_subplot(111)
activity_plot = top_activities['Crime'].plot(kind="barh", ax=ax2)
# Title previously read 'Top 25 acitivity incididents' (typos).
ax2.set_title('Top 25 activity incidents')
ax2.set_xlim(0, 600)
ax2.set_ylabel('')
plt.show()
# Tick labels for the by-month and by-weekday charts (AP-style abbreviations).
months = "Jan. Feb. March April May June July Aug. Sept. Oct. Nov. Dec.".split()
days = "Mon Tues Wed Thurs Fri Sat Sun".split()
def get_activity_date(activity):
    """Parse the date carried in the last eight characters of a log URL.

    ``activity`` is a single URL string whose final eight characters are
    read as ``YYYYMMDD``. The parsed value is whatever ``pd.to_datetime``
    returns for that slice.
    """
    # grabs the posted date from the URL, likely making days off by one
    # TODO scrape dates of the incidents from the pages
    date_digits = activity[-8:]
    return pd.to_datetime(date_digits, format='%Y%m%d')
# Derive calendar fields for every logged incident from its page URL.
# get_activity_date is passed directly; the lambda wrapper added nothing.
activity['date'] = activity.url.apply(get_activity_date)
activity['month'] = activity.date.apply(lambda stamp: stamp.month)
activity['weekday'] = activity.date.apply(lambda stamp: stamp.weekday())

# Line chart of logged incidents per calendar month.
activity_by_month = activity.groupby('month').size().plot()
activity_by_month.set_title('Activity by month')
# Pin one tick on each month number before labelling. set_xticklabels only
# renames whatever ticks exist, so without this the month names landed on
# matplotlib's automatically chosen tick positions.
activity_by_month.set_xticks(list(range(1, 13)))
activity_by_month.set_xticklabels(months)
activity_by_month.set_xlim(1, 12)
plt.show()
# Parse the occurrence column into datetimes so calendar fields can be read.
ucr.OccDate = pd.to_datetime(ucr.OccDate)

# Line chart of UCR records per weekday (0 = Monday ... 6 = Sunday).
ucr['weekday'] = ucr.OccDate.apply(lambda stamp: stamp.weekday())
ucr_by_day = ucr.groupby('weekday').size().plot()
# Fix one tick per weekday number so each label sits on its own day;
# set_xticklabels alone only renames the automatically chosen ticks.
ucr_by_day.set_xticks(list(range(7)))
ucr_by_day.set_xticklabels(days)
ucr_by_day.set_title('UCR by weekday')
plt.show()

# Line chart of UCR records per calendar month.
ucr['month'] = ucr.OccDate.apply(lambda stamp: stamp.month)
ucr_by_month = ucr.groupby('month').size().plot()
ucr_by_month.set_title('UCR by month')
# Same tick pinning as above, for month numbers 1-12.
ucr_by_month.set_xticks(list(range(1, 13)))
ucr_by_month.set_xticklabels(months)
ucr_by_month.set_xlim(1, 12)
plt.show()
# Time of day as fractional hours (e.g. 13:30 -> 13.5).
# The previous expression, x.minute / 100, is integer division under
# Python 2 and therefore always 0, so minutes were silently dropped. Even
# with true division, 100 is the wrong divisor for minutes. Dividing by 60.0
# forces float arithmetic and gives a value that lines up with the 0-24 axis.
ucr['time'] = ucr.OccDate.apply(
    lambda stamp: stamp.hour + stamp.minute / 60.0)

# Line chart of UCR records by time of day.
ucr_by_time = ucr.groupby('time').size().plot()
ucr_by_time.set_title('UCR by time of day')
ucr_by_time.set_xlim(0, 24)
plt.show()
# Running total of logged incidents, accumulated over the 'date' groups.
activity_running_total = activity.groupby('date').size().cumsum()
activity_cumsum = activity_running_total.plot()
activity_cumsum.set_title('Total 2012 activities by date')
plt.show()

# Running total of UCR records, accumulated over the distinct OccDate values.
ucr_running_total = ucr.groupby('OccDate').size().cumsum()
ucr_cumsum = ucr_running_total.plot()
ucr_cumsum.set_title('Total 2012 UCR by date')
plt.show()

# Summary statistics and histogram of logged incidents per date.
activity_per_day = activity.groupby('date').size()
print("Daily activity incidents per day")
print("Mean: %s" % activity_per_day.mean())
print("Standard dev: %s" % activity_per_day.std())
apd_hist = activity_per_day.hist(bins=25)
apd_hist.set_title('Distribution of daily activity incidents per day')
plt.show()
def get_md(ucr_date):
    """Return ``ucr_date`` as a 'month/day' string with no zero padding.

    Only the ``month`` and ``day`` attributes of ``ucr_date`` are read.
    """
    parts = (ucr_date.month, ucr_date.day)
    return '/'.join(str(part) for part in parts)
# Bucket every UCR record by its 'month/day' key so that records sharing a
# calendar day are counted together regardless of their time of day.
ucr['md'] = ucr.OccDate.apply(get_md)

# The heading previously read "Daily activity incidents per day", copied from
# the activity-log section, which mislabelled these UCR statistics.
print("UCR crimes per day")
ucr_per_day = ucr.groupby('md').size()
print("Mean: %s" % ucr_per_day.mean())
print("Standard dev: %s" % ucr_per_day.std())

# Histogram of the per-day UCR record counts.
upd_hist = ucr_per_day.hist(bins=25)
upd_hist.set_title('Distribution of UCR crimes per day')
plt.show()
# TODO nltk?
# TODO combine charts
# TODO Monte Carlo