# Report-wide knitr chunk defaults: show code and results, hide messages and
# warnings, centre figures, and cache chunk output between renders.
knitr::opts_chunk$set(
  echo = TRUE,
  results = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.align = "center",
  cache = TRUE
)

# Strongly favour fixed over scientific notation when numbers are printed.
options(scipen = 99)

1 Setup

First, load the requisite packages and create a mapTheme() function that is used to standardize the map outputs created below.

library(tidyverse)
library(sf)
library(QuantPsyc)
library(RSocrata)
library(viridis)
library(caret)
library(spatstat)
library(spdep)
library(FNN)
library(grid)
library(gridExtra)
library(knitr)
library(kableExtra)
library(tidycensus)

# Shared ggplot2 theme for every map in the report: hides axis text, titles
# and ticks, removes the panel background and minor grid lines, and draws a
# black border around the panel.
#
# base_size: accepted for call compatibility; the body does not read it.
# Returns a ggplot2 theme object to add to a plot with `+`.
mapTheme <- function(base_size = 12) {
  blank <- element_blank()

  theme(
    # Text styling
    text = element_text(color = "black"),
    plot.title = element_text(size = 14, colour = "black"),
    plot.subtitle = element_text(face = "italic"),
    plot.caption = element_text(hjust = 0),
    # Axes are meaningless on these maps, so everything axis-related is blank
    axis.ticks = blank,
    axis.text = blank,
    axis.title = blank,
    axis.title.x = blank,
    axis.title.y = blank,
    # Panel
    panel.background = blank,
    panel.grid.minor = blank,
    panel.border = element_rect(colour = "black", fill = NA, size = 2)
  )
}

2 Data

Download data and create base maps, including police districts, police beats, city boundary, neighborhoods and fishnet.

# Police districts from the Chicago open data portal, reprojected to CRS
# 102271 and reduced to a single `District` attribute (taken from `dist_num`).
# NOTE(review): 102271 looks like an ESRI (not EPSG) code; newer sf/PROJ
# builds may need it written as "ESRI:102271" - confirm.
policeDistricts <- 
  st_read("https://data.cityofchicago.org/api/geospatial/fthy-xz3r?method=export&format=GeoJSON") %>%
  st_transform(crs=102271) %>%
  dplyr::select(District = dist_num)
  
# Police beats, same treatment.
# NOTE(review): the beat number is stored in a column named `District`;
# confirm that this name is intended before relying on it.
policeBeats <- 
  st_read("https://data.cityofchicago.org/api/geospatial/aerh-rz74?method=export&format=GeoJSON") %>%
  st_transform(crs=102271) %>%
  dplyr::select(District = beat_num)

# City boundary read from a local shapefile.
# NOTE(review): this is a machine-specific absolute path; the script will not
# run elsewhere unless the file is available at the same location.
chicagoBoundary <- 
  st_read("D:/！courses/CPLN590/HW/HW7/riskPrediction_data/chicagoBoundary.shp") %>%
  st_transform(crs=102271) 

# Regular grid with cell size 500 (in CRS units, presumably metres - TODO
# confirm) laid over the city boundary.
fishnet <- 
  st_make_grid(chicagoBoundary, cellsize = 500) %>%
  st_sf()

# Keep only the grid cells selected by spatial subsetting with the boundary,
# and give each remaining cell a `uniqueID` taken from its row name.
fishnet <- 
  fishnet[chicagoBoundary,] %>%
  mutate(uniqueID = rownames(.)) %>%
  dplyr::select(uniqueID)

# Neighborhood polygons, reprojected to match the fishnet's CRS.
neighborhoods <- 
  st_read("https://raw.githubusercontent.com/blackmad/neighborhoods/master/chicago.geojson") %>%
  st_transform(st_crs(fishnet))

Next, 2017 crime data is downloaded from the Chicago Open Data site using the RSocrata package. We choose criminal damage as the dependent variable because, like the burglary example, it is likely to suffer from considerable selection bias. We further restrict the sample to criminal damage “to property”. People living in affluent neighborhoods may have a stronger incentive to report criminal damage incidents, while those in poor neighborhoods may turn a blind eye to them. A spatial risk model trained on such a biased sample may therefore show different predictive power across neighborhoods, and social problems will arise if the model is less accurate in poor areas.

# 2017 crime records restricted to criminal damage "to property", converted
# to an sf point layer in the project CRS (102271).
#  - `Location` has its parentheses stripped and is split on the comma; the
#    first part becomes Y and the second X, both parsed as numbers.
#  - `Date` and `Updated.On` are dropped, then any row containing an NA is
#    removed before the points are built from X/Y as WGS84 (EPSG:4326).
#  - distinct() removes exact duplicate rows at the end.
damage <- 
  read.socrata("https://data.cityofchicago.org/Public-Safety/Crimes-2017/d62x-nvdr")%>%
  filter(Primary.Type == "CRIMINAL DAMAGE" & 
           Description == "TO PROPERTY") %>%
  mutate(x = gsub("[()]", "", Location)) %>%
  separate(x,into= c("Y","X"), sep=",") %>%
  mutate(X = as.numeric(X),
         Y = as.numeric(Y)) %>% 
  dplyr::select(.,-Date,-Updated.On) %>%
  na.omit %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant")%>%
  st_transform(102271) %>% 
  distinct()

# Point map of every incident drawn over the city boundary.
ggplot() + 
  geom_sf(data = chicagoBoundary) +
  geom_sf(data = damage, colour="red", size=0.1, show.legend = "point") +
  labs(title= "CRIMINAL DAMAGE TO PROPERTY, Chicago - 2017") +
  mapTheme()

We then join the criminal damage points to the fishnet.

# Count of criminal damage incidents per fishnet cell.
#  - Each point is given countdamage = 1 and the values are summed per cell
#    with aggregate(); cells that received no points come back as NA and are
#    recoded to 0.
#  - uniqueID is taken from the row names of the aggregated layer.
#  - cvID assigns every cell a random fold number between 1 and
#    round(nrow(fishnet) / 24), drawn with replacement, for k-fold CV.
#    NOTE(review): no seed is set, so fold membership changes between
#    (uncached) runs.
crime_net <- 
  damage %>% 
  dplyr::select() %>% 
  mutate(countdamage = 1) %>% 
  aggregate(., fishnet, sum) %>%
  mutate(countdamage = ifelse(is.na(countdamage), 0, countdamage),
         uniqueID = rownames(.),
         cvID = sample(round(nrow(fishnet) / 24), size=nrow(fishnet), replace = TRUE))

# Choropleth of the per-cell incident counts.
ggplot() +
  geom_sf(data = crime_net, aes(fill = countdamage)) +
  scale_fill_viridis() +
  labs(title = "Count of criminal damages for the fishnet") +
  mapTheme()

Next, we download and wrangle the risk factor data.

# 311 service-request risk factors. Every layer below follows the same
# recipe: download with read.socrata(), keep requests whose date string
# starts with "2017", keep latitude/longitude (renamed Y/X), drop rows with
# missing coordinates, build WGS84 points, reproject to the fishnet's CRS,
# and tag the layer with a `Legend` label used later as the variable name.

# Abandoned vehicle reports.
abandonCars <- 
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Abandoned-Vehicles/3c9v-pnva") %>%
  mutate(year = substr(creation_date,1,4)) %>%
  filter(year == "2017") %>%
  dplyr::select(Y = latitude, X = longitude) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "Abandoned_Cars")

# Layer labelled "rodent" (dataset uqhs-j723).
rodent <- 
  read.socrata("https://data.cityofchicago.org/resource/uqhs-j723.json") %>%
  mutate(year = substr(creation_date,1,4)) %>%
  filter(year == "2017") %>%
  dplyr::select(Y = latitude, X = longitude) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "rodent")

# Vacant and abandoned building reports; this dataset stores the request
# date in `date_service_request_was_received` rather than `creation_date`.
abandonBuildings <- 
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Vacant-and-Abandoned-Building/7nii-7srd") %>%
  mutate(year = substr(date_service_request_was_received,1,4)) %>%
  filter(year == "2017") %>%
  dplyr::select(Y = latitude, X = longitude) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "Abandoned_Buildings")

# Graffiti removal requests, limited to graffiti located on the "Front",
# "Rear" or "Side".
graffiti <- 
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Graffiti-Removal-Historical/hec5-y4x5") %>%
  mutate(year = substr(creation_date,1,4)) %>%
  filter(year == "2017") %>%
  filter(where_is_the_graffiti_located_ == "Front" |
           where_is_the_graffiti_located_ == "Rear" | where_is_the_graffiti_located_ == "Side") %>%
  dplyr::select(Y = latitude, X = longitude) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "Graffiti")

# "Street lights - all out" reports.
streetLightsOut <- 
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Street-Lights-All-Out/zuxi-7xem") %>%
  mutate(year = substr(creation_date,1,4)) %>%
  filter(year == "2017") %>%
  dplyr::select(Y = latitude, X = longitude) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "Street_Lights_Out")

# Sanitation code complaints.
sanitation <-
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Sanitation-Code-Complaints-Hi/me59-5fac") %>%
  mutate(year = substr(creation_date,1,4)) %>%
  filter(year == "2017") %>%
  dplyr::select(Y = latitude, X = longitude) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "Sanitation")

# Business licenses for liquor and public places. The table is downloaded
# once and reused for both liquor-related risk factors below (the original
# fetched the same dataset twice, from a URL containing stray whitespace).
liquorLicenses <-
  read.socrata("https://data.cityofchicago.org/Community-Economic-Development/Business-Licenses-Current-Liquor-and-Public-Places/nrmj-3kcf")

# Licenses whose business activity is retail sale of packaged liquor, as
# points in the fishnet's CRS. Unlike the 311 layers, no year filter is
# applied to this table.
liquorRetail <-
  liquorLicenses %>%
  filter(BUSINESS.ACTIVITY == "Retail Sales of Packaged Liquor") %>%
  dplyr::select(Y = LATITUDE, X = LONGITUDE) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "Liquor_Retail")

# Licenses whose license description is "Package Goods", prepared the same
# way.
packagegood <-
  liquorLicenses %>%
  filter(LICENSE.DESCRIPTION == "Package Goods") %>%
  dplyr::select(Y = LATITUDE, X = LONGITUDE) %>%
  na.omit() %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform(st_crs(fishnet)) %>%
  mutate(Legend = "packagegood")

3 Feature engineering

We use two feature engineering approaches with the risk factor data. The first is to sum the number of points of a given risk factor occurring in each grid cell.

# Per-cell counts of every risk factor, one column per `Legend` value.
#  - All risk-factor point layers are stacked, and each point is given the
#    uniqueID of the fishnet cell it falls within.
#  - Points are counted per (uniqueID, Legend); full_join(fishnet) brings
#    back the cell geometries, including cells that received no points.
#  - spread() turns the Legend values into columns, filling gaps with 0. The
#    `<NA>` column it creates (presumably from cells with no points, whose
#    Legend is NA) is dropped, and na.omit() removes rows that still contain
#    an NA.
# NOTE(review): no ungroup() follows group_by()/summarize(), so vars_net may
# still be grouped by uniqueID. Later select() calls appear to rely on
# uniqueID being carried along as a grouping column - confirm before adding
# an ungroup() here.
vars_net <- 
  rbind(abandonCars,streetLightsOut,abandonBuildings,
        liquorRetail, graffiti, sanitation, rodent, packagegood) %>%
  st_join(., fishnet, join=st_within) %>%
  st_set_geometry(NULL) %>%
  group_by(uniqueID, Legend) %>%
  summarize(count = n()) %>%
  full_join(fishnet) %>%
  spread(Legend, count, fill=0) %>%
  st_sf() %>%
  dplyr::select(-`<NA>`) %>%
  na.omit()

# Long format (one row per cell and risk factor) used for the small-multiple
# maps.
vars_net.long <- 
  vars_net %>%
  gather(Variable, value, -geometry, -uniqueID)

# One choropleth per risk factor, collected in a list keyed by the factor's
# name and then arranged in a two-column grid.
vars <- unique(vars_net.long$Variable)

mapList <- lapply(
  setNames(vars, vars),
  function(riskFactor) {
    ggplot() +
      geom_sf(
        data = filter(vars_net.long, Variable == riskFactor),
        aes(fill = value),
        colour = NA
      ) +
      scale_fill_viridis(name = "") +
      labs(title = riskFactor) +
      mapTheme()
  }
)

do.call(grid.arrange, c(mapList, ncol = 2, top = "Risk Factors by Fishnet"))

The second is to calculate, for each grid cell centroid, the average distance to its 3 nearest neighbors of each risk factor.

# Average distance from each "from" point to its k nearest "to" points.
#
# measureFrom: coordinates of the points to measure from (matrix or data
#              frame with one row per point, e.g. st_coordinates() output).
# measureTo:   coordinates of the candidate neighbour points, same layout.
# k:           number of nearest neighbours to average over.
#
# Returns an unnamed numeric vector with one element per row of measureFrom,
# in the same row order.
nn_function <- function(measureFrom, measureTo, k) {
  # Coerce both inputs up front so the neighbour search always receives
  # plain matrices (the original built these but then ignored them).
  measureFrom_Matrix <- as.matrix(measureFrom)
  measureTo_Matrix <- as.matrix(measureTo)

  # nn.dist has one row per "from" point and one column per neighbour rank.
  nn <- get.knnx(measureTo_Matrix, measureFrom_Matrix, k)$nn.dist

  # The row mean is the average of the k neighbour distances for that point;
  # this replaces the reshape/group/summarize round trip of the original.
  unname(rowMeans(nn))
}

# Average distance from every fishnet cell centroid to its 3 nearest points
# of each risk factor, stored as `<factor>.nn` columns on vars_net.
# The centroid coordinates do not depend on the attribute columns being
# added, so they are computed once instead of once per risk factor.
fishnetCentroidXY <- st_coordinates(st_centroid(vars_net))

vars_net$Abandoned_Buildings.nn <-
  nn_function(fishnetCentroidXY, st_coordinates(abandonBuildings), 3)

vars_net$Abandoned_Cars.nn <-
  nn_function(fishnetCentroidXY, st_coordinates(abandonCars), 3)

vars_net$Graffiti.nn <-
  nn_function(fishnetCentroidXY, st_coordinates(graffiti), 3)

vars_net$Liquor_Retail.nn <-
  nn_function(fishnetCentroidXY, st_coordinates(liquorRetail), 3)

vars_net$Street_Lights_Out.nn <-
  nn_function(fishnetCentroidXY, st_coordinates(streetLightsOut), 3)

vars_net$Sanitation.nn <-
  nn_function(fishnetCentroidXY, st_coordinates(sanitation), 3)

vars_net$rodent.nn <-
  nn_function(fishnetCentroidXY, st_coordinates(rodent), 3)

vars_net$packagegood.nn <-
  nn_function(fishnetCentroidXY, st_coordinates(packagegood), 3)

# Long format of the nearest-neighbour features: one row per cell and
# `.nn` variable.
vars_net.long.nn <-
  gather(
    dplyr::select(vars_net, ends_with(".nn")),
    Variable, value, -geometry, -uniqueID
  )

# One choropleth per nearest-neighbour feature, keyed by variable name and
# arranged in a two-column grid.
vars <- unique(vars_net.long.nn$Variable)

mapList <- lapply(
  setNames(vars, vars),
  function(nnVariable) {
    ggplot() +
      geom_sf(
        data = filter(vars_net.long.nn, Variable == nnVariable),
        aes(fill = value),
        colour = NA
      ) +
      scale_fill_viridis(name = "") +
      labs(title = nnVariable) +
      mapTheme()
  }
)

do.call(
  grid.arrange,
  c(mapList, ncol = 2, top = "Nearest Neighbor risk Factors by Fishnet")
)

We also develop a distance-to-a-single-point feature by measuring the distance from each grid cell to Chicago’s Central Business District (the Loop).

# Centroid of the neighborhood named "Loop", used as the reference point for
# the central business district.
loopPoint <- st_centroid(filter(neighborhoods, name == "Loop"))

# Distance from every fishnet cell centroid to that point, stored as a plain
# numeric column.
vars_net$loopDistance <-
  as.numeric(st_distance(st_centroid(vars_net), loopPoint))

# Choropleth of the distance feature.
ggplot() +
  geom_sf(data = vars_net, aes(fill = loopDistance)) +
  scale_fill_viridis() +
  labs(title = "Euclidean distance to The Loop") +
  mapTheme()

A “final net” is created by joining the crime net and variable net layers.

# Attach the risk-factor attributes (geometry dropped) to the crime counts,
# matching cells on uniqueID.
final_net <-
  left_join(crime_net, st_set_geometry(vars_net, NULL), by="uniqueID") 

# Add the neighborhood name and police district of each cell:
#  - cells are reduced to their centroids so each one is matched through a
#    single point,
#  - the point geometry is dropped and the cell polygons are joined back from
#    the previous final_net (no `by` is given, so the join uses the columns
#    the two tables share - presumably uniqueID),
#  - na.omit() then drops rows with any missing value, e.g. cells whose
#    centroid matched no neighborhood or district.
final_net <-
  st_centroid(final_net) %>%
  st_join(., dplyr::select(neighborhoods, name)) %>%
  st_join(., dplyr::select(policeDistricts, District)) %>%
  st_set_geometry(NULL) %>%
  left_join(dplyr::select(final_net, geometry, uniqueID)) %>%
  st_sf() %>%
  na.omit()

4 Spatial structure

To examine the spatial autocorrelation of criminal damage at a local scale, we apply the local Moran’s I test to our outcome.

# Neighbour list built from the cell polygons with queen = TRUE, and the
# matching spatial weights list (style "W"); zero.policy = TRUE lets cells
# without neighbours through.
final_net.nb <- poly2nb(final_net, queen=TRUE)
final_net.weights <- nb2listw(final_net.nb, style="W", zero.policy=TRUE)

# Local Moran's I results bound column-wise to the fishnet, reduced to the
# count, the statistic (Ii) and its p-value, plus a 0/1 flag for
# p <= 0.05, and reshaped to long format for mapping.
# NOTE(review): the p-value column is selected by the name `Pr(z > 0)`;
# that name may differ between spdep versions - confirm.
final_net.localMorans <- 
  cbind(
    as.data.frame(localmoran(final_net$countdamage, final_net.weights)),
    as.data.frame(final_net, NULL)) %>% 
  st_sf() %>%
  dplyr::select(damage_Count = countdamage, 
                Local_Morans_I = Ii, 
                P_Value = `Pr(z > 0)`) %>%
  mutate(Significant_Hotspots = ifelse(P_Value <= 0.05, 1, 0)) %>%
  gather(Variable, Value, -geometry)

# One choropleth per local Moran's I variable, keyed by variable name and
# arranged side by side in four columns.
vars <- unique(final_net.localMorans$Variable)

varList <- lapply(
  setNames(vars, vars),
  function(moranVariable) {
    ggplot() +
      geom_sf(
        data = filter(final_net.localMorans, Variable == moranVariable),
        aes(fill = Value),
        colour = NA
      ) +
      scale_fill_viridis(name = "") +
      labs(title = moranVariable) +
      mapTheme()
  }
)

do.call(
  grid.arrange,
  c(varList, ncol = 4, top = "Local Morans I statistics, criminal damages")
)

# Spatial-structure features derived from local Moran's I.
# The steps are run one after another so that the hotspot filter reads
# `damage.isSig` from the updated final_net directly, instead of relying on
# the column leaking out of an enclosing mutate() call.

# Fifth column of the localmoran() result, used as the p-value exactly as in
# the original code.
localMoransP <- localmoran(final_net$countdamage, final_net.weights)[, 5]

# 1 for highly significant cells (p <= 0.0000001), 0 otherwise.
final_net <-
  mutate(final_net, damage.isSig = ifelse(localMoransP <= 0.0000001, 1, 0))

hotspotCells <- filter(final_net, damage.isSig == 1)
if (nrow(hotspotCells) == 0) {
  stop("No cells pass the hotspot significance threshold; ",
       "damage.isSig.dist cannot be computed.", call. = FALSE)
}

# Distance from every cell centroid to the nearest hotspot cell centroid.
final_net <-
  mutate(final_net,
         damage.isSig.dist = nn_function(
           st_coordinates(st_centroid(final_net)),
           st_coordinates(st_centroid(hotspotCells)),
           1))

ggplot() +
  geom_sf(data = final_net, aes(fill = damage.isSig.dist)) +
  scale_fill_viridis() +
  labs(title = "Distance to highly significant local criminal damage hotspots") +
  mapTheme()

The plots above show that criminal damage is strongly clustered in space, and they reveal the hotspots for this crime.

5 Correlation tests

Before regression, we produce a series of scatterplots to see the correlations between criminal damages and different variables.

# Long table of (Variable, Value, countdamage) for every candidate feature;
# identifiers, the fold ID, loopDistance, neighborhood name and district are
# excluded first.
correlation.long <-
  st_set_geometry(final_net, NULL) %>%
  dplyr::select(-uniqueID, -cvID, -loopDistance, -name, -District) %>%
  gather(Variable, Value, -countdamage)

# Pearson correlation between each feature and the damage count, computed on
# complete observations only.
correlation.cor <-
  correlation.long %>%
  group_by(Variable) %>%
  summarize(correlation = cor(Value, countdamage, use = "complete.obs"))

# One scatterplot per feature with a fitted linear trend; the correlation is
# printed in the top-left corner of each panel (x = -Inf, y = Inf).
ggplot(correlation.long, aes(Value, countdamage)) +
  geom_point(size = 0.1) +
  geom_text(data = correlation.cor, aes(label = paste("r =", round(correlation, 2))),
            x=-Inf, y=Inf, vjust = 1, hjust = -.1) +
  geom_smooth(method = "lm", se = FALSE, colour = "#a2d7d8") +
  facet_wrap(~Variable, ncol = 2, scales = "free") +
  labs(title = "criminal damage count as a function of risk factors")

We also plot a histogram of our dependent variable.

# Histogram of the per-cell damage counts, one bin per integer count.
ggplot(data = final_net, mapping = aes(x = countdamage)) +
  geom_histogram(binwidth = 1) +
  labs(title = "Distribution of criminal damages by grid cell")

6 Cross-validated poisson Regression

We examine the goodness of fit metrics for both random k-fold and LOGO cross-validation for two different specifications (4 regressions in total). The first includes only the Risk Factors features, which are listed in reg.vars below. The second includes the risk factors plus the Spatial Structure features, which are listed in reg.ss.vars below.

# Predictors for the "just risk factors" specification: the eight
# nearest-neighbour distance features plus the distance to the Loop.
reg.vars <- c(
  "Abandoned_Buildings.nn",
  "Abandoned_Cars.nn",
  "Graffiti.nn",
  "Liquor_Retail.nn",
  "Street_Lights_Out.nn",
  "Sanitation.nn",
  "rodent.nn",
  "packagegood.nn",
  "loopDistance"
)

# Predictors for the "spatial structure" specification: the same risk
# factors followed by the two local Moran's I hotspot features.
reg.ss.vars <- c(reg.vars, "damage.isSig", "damage.isSig.dist")

# Cross-validated Poisson regression: each distinct value of the `id` column
# is held out in turn, a model is fitted on the remaining rows and used to
# predict the held-out rows.
#
# dataset:           sf object with a geometry column named "geometry".
# id:                name (string) of the column defining the hold-out folds.
# dependentVariable: name (string) of the response column.
# indVariables:      character vector of predictor column names.
#
# Returns an sf object holding the id, predictor and response columns of
# every row plus a `Prediction` column (predictions on the response scale),
# with folds stacked in the order they first appear in `dataset[[id]]`.
crossValidate <- function(dataset, id, dependentVariable, indVariables) {

  # Columns are addressed by name through base subsetting, so a dataset
  # column that happens to be called "id" or "indVariables" cannot shadow
  # the arguments.
  keepColumns <- unique(c(id, "geometry", indVariables, dependentVariable))
  modelColumns <- unique(c(indVariables, dependentVariable))

  # Build the formula from the argument instead of hard-coding the response.
  modelFormula <- reformulate(".", response = dependentVariable)

  cvID_list <- unique(dataset[[id]])

  # Collect per-fold results in a preallocated list and bind them once,
  # rather than growing a data frame inside the loop.
  foldPredictions <- vector("list", length(cvID_list))
  foldIndex <- 0L

  for (thisFold in cvID_list) {
    foldIndex <- foldIndex + 1L
    cat("This hold out fold is", thisFold, "\n")

    fold.train <-
      as.data.frame(filter(dataset, dataset[[id]] != thisFold))[, keepColumns, drop = FALSE]
    fold.test <-
      as.data.frame(filter(dataset, dataset[[id]] == thisFold))[, keepColumns, drop = FALSE]

    # Only the predictors and the response go into the model; the fold id
    # and the geometry are excluded.
    regression <-
      glm(modelFormula, family = "poisson",
          data = fold.train[, modelColumns, drop = FALSE])

    fold.test$Prediction <- predict(regression, fold.test, type = "response")
    foldPredictions[[foldIndex]] <- fold.test
  }

  allPredictions <-
    if (length(foldPredictions) > 0) {
      do.call(rbind, foldPredictions)
    } else {
      data.frame()
    }

  return(st_sf(allPredictions))
}

# Four cross-validation runs: two fold definitions (random `cvID` folds and
# leave-one-neighborhood-out via `name`) crossed with two specifications
# (reg.vars and reg.ss.vars). Each result is reduced to the same four
# columns, with the fold identifier renamed to `cvID`, so the runs can be
# stacked later.

# Random k-fold, risk factors only.
reg.cv <- crossValidate(
  dataset = final_net,
  id = "cvID",
  dependentVariable = "countdamage",
  indVariables = reg.vars) %>%
  dplyr::select(cvID = cvID, countdamage, Prediction, geometry)

# Random k-fold, risk factors plus spatial structure.
reg.ss.cv <- crossValidate(
  dataset = final_net,
  id = "cvID",
  dependentVariable = "countdamage",
  indVariables = reg.ss.vars) %>%
  dplyr::select(cvID = cvID, countdamage, Prediction, geometry)

# Leave-one-neighborhood-out, risk factors only.
reg.spatialCV <- crossValidate(
  dataset = final_net,
  id = "name",
  dependentVariable = "countdamage",
  indVariables = reg.vars) %>%
  dplyr::select(cvID = name, countdamage, Prediction, geometry)

# Leave-one-neighborhood-out, risk factors plus spatial structure.
reg.ss.spatialCV <- crossValidate(
  dataset = final_net,
  id = "name",
  dependentVariable = "countdamage",
  indVariables = reg.ss.vars) %>%
  dplyr::select(cvID = name, countdamage, Prediction, geometry)

# All four runs stacked into one sf table. Each run gets a `Regression`
# label and an `Error` column defined as observed minus predicted, so a
# positive error means the model under-predicted that cell.
reg.summary <- 
  rbind(
    mutate(reg.cv,           Error = countdamage - Prediction,
           Regression = "Random k-fold CV: Just Risk Factors"),
    
    mutate(reg.ss.cv,        Error = countdamage - Prediction,
           Regression = "Random k-fold CV: Spatial Structure"),
    
    mutate(reg.spatialCV,    Error = countdamage - Prediction,
           Regression = "Spatial LOGO-CV: Just Risk Factors"),
    
    mutate(reg.ss.spatialCV, Error = countdamage - Prediction,
           Regression = "Spatial LOGO-CV: Spatial Structure")) %>%
  st_sf() 

# Left: predicted counts for all four regressions (one facet each).
# Right: observed counts, taken from a single regression's rows so each cell
# is drawn once.
grid.arrange(
  ggplot(data = reg.summary) +
    geom_sf(aes(fill = Prediction)) +
    facet_wrap(~Regression) +
    scale_fill_viridis() +
    labs(title = "Predicted damages by Regression") +
    mapTheme() +
    theme(legend.position = "bottom"),

  ggplot(data = filter(reg.summary,
                       Regression == "Random k-fold CV: Just Risk Factors")) +
    geom_sf(aes(fill = countdamage)) +
    scale_fill_viridis() +
    labs(title = "Observed criminal damages\n") +
    mapTheme() +
    theme(legend.position = "bottom"),

  ncol = 2
)

# Raw errors (observed minus predicted) for the two spatial LOGO-CV
# regressions.
ggplot(data = filter(reg.summary,
                     Regression %in% c("Spatial LOGO-CV: Just Risk Factors",
                                       "Spatial LOGO-CV: Spatial Structure"))) +
  geom_sf(aes(fill = Error)) +
  facet_wrap(~Regression) +
  scale_fill_viridis() +
  labs(title = "criminal damages errors by Regression") +
  mapTheme()

# Mean absolute error, and the standard deviation of the absolute errors,
# per regression; rows 2 and 4 of the rendered table are highlighted.
reg.summary %>%
  st_set_geometry(NULL) %>%
  group_by(Regression) %>%
  summarize(
    MAE = round(mean(abs(Prediction - countdamage), na.rm = TRUE), 2),
    SD_MAE = round(sd(abs(Prediction - countdamage), na.rm = TRUE), 2)
  ) %>%
  kable(caption = "MAE by regression") %>%
  kable_styling("striped", full_width = FALSE) %>%
  row_spec(2, color = "black", background = "#FDE725FF") %>%
  row_spec(4, color = "black", background = "#FDE725FF")

MAE by regression
Regression	MAE	SD_MAE
Random k-fold CV: Just Risk Factors	2.84	3.14
Random k-fold CV: Spatial Structure	2.33	2.44
Spatial LOGO-CV: Just Risk Factors	2.87	3.17
Spatial LOGO-CV: Spatial Structure	2.35	2.48

From the plots and table above we can see that the models generalize about equally well under both cross-validation types. It is also clear that the models with the spatial structure features have stronger predictive power.

7 Generalizability by neighborhood context

We also examine the raw errors by race context for the two spatial (LOGO) cross-validation regressions.

# 2017 ACS tract estimates for state 17, county 031, with geometry,
# reprojected to the project CRS.
#  - Two variables are requested and spread into columns, then renamed to
#    TotalPop and NumberWhites.
#  - percentWhite is their ratio; tracts above 0.5 are labelled
#    "Majority_White", the rest "Majority_Non_White".
#  - The final step keeps only the tracts selected by spatial subsetting
#    with the neighborhoods layer.
tracts17 <- 
  get_acs(geography = "tract", variables = c("B01001_001E","B01001A_001E"), 
          year = 2017, state=17, county=031, geometry=T) %>%
  st_transform(102271)  %>% 
  dplyr::select(variable, estimate, GEOID) %>%
  spread(variable, estimate) %>%
  rename(TotalPop = B01001_001,
         NumberWhites = B01001A_001) %>%
  mutate(percentWhite = NumberWhites / TotalPop,
         raceContext = ifelse(percentWhite > .5, "Majority_White", "Majority_Non_White")) %>%
  .[neighborhoods,]

# Map of the tract-level race context.
# The legend title is set through the `fill` aesthetic; the original passed
# `name = "Race Context"` to labs(), which matches no aesthetic used in the
# plot and therefore left the legend titled with the raw column name.
ggplot() +
  geom_sf(data = tracts17, aes(fill = raceContext)) +
  scale_fill_viridis(discrete = TRUE) +
  labs(title = "Race Context", fill = "Race Context") +
  mapTheme()

# Rows of the two spatial LOGO-CV regressions, each given a `uniqueID` taken
# from its row name. Here the ID only serves as a join key for the step
# below; it is not the fishnet cell ID.
final_reg <- 
  filter(reg.summary, Regression == "Spatial LOGO-CV: Spatial Structure" |
           Regression == "Spatial LOGO-CV: Just Risk Factors") %>%
  mutate(uniqueID = rownames(.))

# Attach tract attributes (including raceContext) to each cell through its
# centroid, then restore the cell polygons by joining back to final_reg
# (no `by` is given; presumably the join runs on uniqueID) and drop rows
# with any missing value.
final_reg.tracts <- 
  st_join(st_centroid(final_reg), tracts17) %>%
  st_set_geometry(NULL) %>%
  left_join(dplyr::select(final_reg, uniqueID)) %>%
  st_sf() %>%
  na.omit()

# Mean raw error (observed minus predicted) per regression and race context,
# laid out with one column per race context.
final_reg.tracts %>%
  st_set_geometry(NULL) %>%
  group_by(Regression, raceContext) %>%
  summarize(mean.Error = mean(Error, na.rm = TRUE)) %>%
  spread(raceContext, mean.Error) %>%
  kable(caption = "Mean Error by neighborhood racial context") %>%
  kable_styling("striped", full_width = FALSE)

Mean Error by neighborhood racial context
Regression	Majority_Non_White	Majority_White
Spatial LOGO-CV: Just Risk Factors	0.7132942	-0.7342960
Spatial LOGO-CV: Spatial Structure	0.2610246	-0.2507776

Unfortunately, the model on average under-predicts in Majority_Non_White neighborhoods and over-predicts in Majority_White neighborhoods (Error is observed minus predicted, so a positive mean error indicates under-prediction). However, including the spatial structure features substantially reduces these errors.

8 Comparing with traditional kernel-density method

We use maps and a bar plot to compare the traditional kernel-density method with the risk-prediction method.

library(raster)

# Compute the kernel density of the criminal damage points inside the
# bounding box of final_net. 1000 is passed as the second positional
# argument of density.ppp (presumably the smoothing bandwidth, in CRS
# units - confirm).
# NOTE(review): the `burg_` prefix and `burgCount` name do not match the
# criminal damage data used here; they look like leftovers from a burglary
# example.
burg_ppp <- as.ppp(st_coordinates(damage), W = st_bbox(final_net))
burg_KD <- spatstat::density.ppp(burg_ppp, 1000)

# Convert the kernel density surface to points and average it within each
# final_net grid cell.
burg_KDE_sf <- as.data.frame(burg_KD) %>%
  st_as_sf(coords = c("x", "y"), crs = st_crs(final_net)) %>%
  aggregate(., final_net, mean) %>%
  
  # Rank cells into 100 quantile groups by mean density and collapse those
  # into five labelled risk categories.
  mutate(label = "Kernel Density",
         Risk_Category = ntile(value, 100),
         Risk_Category = case_when(
           Risk_Category >= 90 ~ "90% to 100%",
           Risk_Category >= 70 & Risk_Category <= 89 ~ "70% to 89%",
           Risk_Category >= 50 & Risk_Category <= 69 ~ "50% to 69%",
           Risk_Category >= 30 & Risk_Category <= 49 ~ "30% to 49%",
           Risk_Category >= 1 & Risk_Category <= 29 ~ "1% to 29%")) %>%
  
  # Bind to a layer where crime counts are spatially joined to the fishnet:
  # the damage points are counted per cell and empty cells are set to 0.
  # NOTE(review): `damage` is the same 2017 data the models were fitted on,
  # not a separate test set.
  bind_cols(
    aggregate(
      dplyr::select(damage) %>% mutate(burgCount = 1), ., length) %>%
      mutate(burgCount = replace_na(burgCount, 0))) %>%
  
  # Select the fields we need.
  dplyr::select(label, Risk_Category, burgCount)

# Same risk-category layer built from the model predictions of the
# "Spatial LOGO-CV: Spatial Structure" regression: cells are ranked into 100
# quantile groups by Prediction, collapsed into the same five categories,
# and bound to the per-cell count of damage points (empty cells set to 0).
burg_risk_sf <-
  filter(final_reg, Regression == "Spatial LOGO-CV: Spatial Structure") %>%
  mutate(label = "Risk Predictions",
         Risk_Category = ntile(Prediction, 100),
         Risk_Category = case_when(
           Risk_Category >= 90 ~ "90% to 100%",
           Risk_Category >= 70 & Risk_Category <= 89 ~ "70% to 89%",
           Risk_Category >= 50 & Risk_Category <= 69 ~ "50% to 69%",
           Risk_Category >= 30 & Risk_Category <= 49 ~ "30% to 49%",
           Risk_Category >= 1 & Risk_Category <= 29 ~ "1% to 29%")) %>%
  bind_cols(
    aggregate(
      dplyr::select(damage) %>% mutate(burgCount = 1), ., length) %>%
      mutate(burgCount = replace_na(burgCount, 0))) %>%
  dplyr::select(label,Risk_Category, burgCount)

# Side-by-side maps of the risk categories from the two methods, overlaid
# with a random sample of 1500 damage points.
rbind(burg_KDE_sf, burg_risk_sf) %>%
  gather(Variable, Value, -label, -Risk_Category, -geometry) %>%
  ggplot() +
  geom_sf(aes(fill = Risk_Category), colour = NA) +
  geom_sf(data = sample_n(damage, 1500), size = .1, colour = "black") +
  facet_wrap(~label) +
  scale_fill_viridis(discrete = TRUE) +
  labs(
    title = "Comparison of Kernel Density and Risk Predictions",
    subtitle = "Relative to test set points (in black)"
  ) +
  mapTheme()

# Share of all counted damage points that falls in each risk category, per
# method, drawn as dodged bars.
rbind(burg_KDE_sf, burg_risk_sf) %>%
  st_set_geometry(NULL) %>%
  gather(Variable, Value, -label, -Risk_Category) %>%
  group_by(label, Risk_Category) %>%
  summarize(countdamage = sum(Value)) %>%
  ungroup() %>%
  group_by(label) %>%
  mutate(Rate_of_test_set_crimes = countdamage / sum(countdamage)) %>%
  ggplot(aes(x = Risk_Category, y = Rate_of_test_set_crimes)) +
  geom_bar(aes(fill = label), position = "dodge", stat = "identity") +
  scale_fill_viridis(discrete = TRUE)

To sum up, I recommend our algorithm be put into production.

The first and most important reason is that, compared to the kernel-density method, our risk-prediction method captures a greater share of observed criminal damage events in the highest risk category (as shown in the above plots).

Another reason is that our method can generalize what it learns from observed crime hotspots to places that are at risk for crime even though none has been reported. Since the reporting of criminal damage often suffers from serious selection bias, it is of great importance to identify such latent risk.

Spatial risk prediction: the case of crime

Huidi Hu

November 7, 2019