diff --git a/01_read_data_DE.R b/01_read_data_DE.R new file mode 100644 index 0000000..f88df2c --- /dev/null +++ b/01_read_data_DE.R @@ -0,0 +1,119 @@ +###### Prepare German pop. data for sdcSpatial ##### + +# ----- load libraries ----- + +library(sf) # handling shape data +library(data.table) # fast import and data manipulation +library(dplyr) # convenience +library(ggplot2) # (optional) visualization + + +# ----- get required data ----- + +data_folder <- file.path(getwd(), "sdcSpatialExperiment_DE_ext") # customize if needed + +web_pop <- paste0("https://www.zensus2011.de/SharedDocs/Downloads/DE/Pressemitteilung/", + "DemografischeGrunddaten/csv_Bevoelkerung_100m_Gitter.zip", + "?__blob=publicationFile&v=2") +web_shp <- paste0("https://daten.gdz.bkg.bund.de/produkte/vg/vg250_ebenen_0101/aktuell/", + "vg250_01-01.utm32s.shape.ebenen.zip") +web_100km <- paste0("https://daten.gdz.bkg.bund.de/produkte/sonstige/geogitter/aktuell/", + "DE_Grid_ETRS89-LAEA_100km.gpkg.zip") +web_100m <- paste0("https://daten.gdz.bkg.bund.de/produkte/sonstige/geogitter/aktuell/", + "DE_Grid_ETRS89-LAEA_100m.csv.zip") + +# download files (may need to raise timeout in R options for the larger files) +download.file(web_pop, paste0(data_folder, "/csv_Bevoelkerung_100m_Gitter.zip"), + mode = "wb") # ~ 105 MB +download.file(web_shp, paste0(data_folder, "/vg250_01-01.utm32s.shape.ebenen.zip"), + mode = "wb") # ~ 66 MB +download.file(web_100km, paste0(data_folder, "/DE_Grid_ETRS89-LAEA_100km.gpkg.zip"), + mode = "wb") # ~ 1 MB +download.file(web_100m, paste0(data_folder, "/DE_Grid_ETRS89-LAEA_100m.zip"), + mode = "wb") # ~ 291 MB + +# unzip files +unzip(paste0(data_folder, "/csv_Bevoelkerung_100m_Gitter.zip"), exdir = data_folder) +unzip(paste0(data_folder, "/vg250_01-01.utm32s.shape.ebenen.zip"), exdir = data_folder) +unzip(paste0(data_folder, "/DE_Grid_ETRS89-LAEA_100km.gpkg.zip"), exdir = data_folder) +unzip(paste0(data_folder, "/DE_Grid_ETRS89-LAEA_100m.zip"), exdir = data_folder) + +path_grid_100m <- "DE_Grid_ETRS89-LAEA_100m/geogitter" +shp_grid_100km <- "DE_Grid_ETRS89-LAEA_100km.gpkg/geogitter/DE_Grid_ETRS89-LAEA_100km.gpkg" +shp_admi <- "vg250_01-01.utm32s.shape.ebenen/vg250_ebenen_0101/VG250_LAN.shp" +data_pop <- "Zensus_Bevoelkerung_100m-Gitter.csv" + + +# ----- select data ----- + +# The 100m x 100m grid is delivered in 100km x 100km chunks for the sake of file size. +# We start with the large grid in order to decide which chunks to load. + +# read large grid +grid_100km <- st_read(file.path(data_folder, shp_grid_100km)) + +# In order to do the analysis for a single federal state, load state borders +admi_data <- st_read(file.path(data_folder, shp_admi)) %>% + filter(GF == 4) %>% # use only land area + st_transform(st_crs(grid_100km)) # UTM to LAEA + +# select a federal state to work with +admi_data$GEN +(fedstate <- admi_data$GEN[1]) +fedstate_ags <- admi_data[admi_data$GEN == fedstate, ]$AGS # state identifier + +# which 100km x 100km must be loaded for that state? +grid_load <- grid_100km[st_intersects(admi_data[admi_data$GEN == fedstate, ], + grid_100km)[[1]], ]$id + +# (optional) visualize the selected chunks +ggplot() + + geom_sf(data = grid_100km) + + geom_sf(data = admi_data, fill = NA) + + geom_sf(data = admi_data[admi_data$AGS == fedstate_ags, ], fill = "cyan") + + geom_sf(data = grid_100km[grid_100km$id %in% grid_load, ], + fill = NA, color = "red") + + +# ----- read data ----- + +# read in selected chunks +grd <- vector("list", length = length(grid_load)) +for(i in seq(grid_load)) { + grd[[i]] <- fread(file.path(data_folder, path_grid_100m, + paste0(grid_load[i], "_DE_Grid_ETRS89-LAEA_100m.csv")), + colClasses = c("character", rep("numeric", 10), "character")) +} + +# combining selected chunks +grid_100m <- dplyr::bind_rows(grd)[, c(1:5, 12)] +names(grid_100m) <- c("cell_id", "x_sw", "y_sw", "x_mp", "y_mp", "ags") + +# get rid of overlaps with other fed. states +grid_100m <- filter(grid_100m, substr(ags, 1, 2) == fedstate_ags) + + +# ----- add pop. numbers ----- + +# read in population data +pop <- fread(file.path(data_folder, data_pop)) %>% + dplyr::select(Gitter_ID_100m, Einwohner) %>% + dplyr::filter(Einwohner > 0) + +# merge population to grid cells +pop_100m <- merge(grid_100m, pop, by.x = "cell_id", by.y = "Gitter_ID_100m") + +sum(pop_100m$Einwohner) # inhabitants of selected state (w/o suppressed cells) + +# expand to record-level data +pop_100m <- pop_100m[, .(1:Einwohner, x_mp, y_mp), by = .(cell_id)] %>% + rename(person_id_cell = V1) +pop_100m$person_id <- 1:nrow(pop_100m) + + +# ----- save data ----- + +save(pop_100m, grid_100m, file = paste0(data_folder, "/pop_data_DE.RData"), compress = TRUE) + +rm(list = ls()) + diff --git a/02_sdcSpatial_DE.R b/02_sdcSpatial_DE.R new file mode 100644 index 0000000..8204e5c --- /dev/null +++ b/02_sdcSpatial_DE.R @@ -0,0 +1,470 @@ +##### Apply sdcSpatial on 100m grid counts tables [DE] ##### +# +# Script does the following: +# (1) load required tools + pre-computed pop. and grid data [see: 01_read_data_DE.R] +# (2) create + inspect sdc_raster object from pop. data +# (3) apply protection algorithms (removal, quadtree, kernel smoothing) +# (4) inspect protected raster maps, check & compare various utility measures +# (5) illustrate two aspects of KWD as info loss measure + + +# ----- (1a) load libraries ----- + +library(raster) # grid data handling +library(sdcSpatial) # map protection algorithms +library(spatstat) # geospatial functions +library(SpatialKWD) # fast KWD approximation +library(ggplot2) # plotting +library(viridis) # plotting +library(dplyr) # convenience +library(tidyr) # convenience + + +# ----- (1b) load data ----- + +# customize to local setting +path_data <- "sdcSpatialExperiment_DE_ext" + +load(file.path(path_data, "pop_data_DE.RData")) + +head(pop_100m) +## data.table is one row per person ############################## ### +# cell_id: INSPIRE name of grid cell +# person_id_cell: unique ID for person within a cell +# x_mp: x-coordinate of person (cell centroid) +# y_mp: y-coordinate of person (cell centroid) +# person_id: unique person ID +# [all coordinates in ETRS89-LAEA] + +head(grid_100m) +## data.table is one row per grid cell (incl. uninhabited) ####### ### +# cell_id: INSPIRE name of grid cell +# x_sw: x-coordinate of southwest-corner +# y_sw: y-coordinate of southwest-corner +# x_mp: x-coordinate of centroid +# y_mp: y-coordinate of centroid +# ags: municipality identifier +# [all coordinates in ETRS89-LAEA] + + +# ----- (2a) set up sdc_raster ----- + +# prepare input raster (incl. uninhabited) +r <- raster(xmn = min(grid_100m$x_sw), xmx = max(grid_100m$x_sw + 100), + ymn = min(grid_100m$y_sw), ymx = max(grid_100m$y_sw + 100), + res = 100, + crs = CRS("+init=epsg:3035")) + +# to sdc_raster +pop100m_raster <- sdc_raster(as.matrix(pop_100m[, c("x_mp", "y_mp")]), + variable = 1, + r = r, + min_count = 10, # require 10-anonymity + max_risk = 1.0) # no magnitude / freq. criterion + +pal <- rev(viridis(10)) # color palette for plotting +plot(pop100m_raster, value = "count", col = pal) + + +# ----- (2b) inspect sdc_raster ----- + +# subset sensitive cells +raster_sens <- is_sensitive(pop100m_raster) + +# no. of sensitive cells +(ncells_sens <- sum(raster_sens@data@values, na.rm = TRUE)) +# share of sensitive cells (in all inhabited) +sensitivity_score(pop100m_raster) +# verify: +ncells_sens / length(unique(pop_100m$cell_id)) + +# How many individuals are at risk? +sens_vals <- pop100m_raster$value$count[raster_sens] +sum(sens_vals) # no. of indiv. at risk +sum(sens_vals) / nrow(pop_100m) # share of indiv. at risk + +# How are cell counts < k distributed? +ggplot(data = data.frame(sens_vals), aes(sens_vals)) + + geom_histogram(binwidth = 1, color = "black", fill = "lightgrey") + + scale_x_continuous(n.breaks = 7) + + xlab("no. inhabitants in sensitive cells") + + +# ----- (3) apply map protection algorithms ----- + +# simply remove sensitive cells +pop100m_raster_rm <- remove_sensitive(pop100m_raster) + +# quadtree method +pop100m_raster_qt1 <- protect_quadtree(pop100m_raster, max_zoom = 2) +pop100m_raster_qt2 <- protect_quadtree(pop100m_raster, max_zoom = 3) + +# kernel smoothing (~ 12min) +pop100m_raster_sm <- protect_smooth(pop100m_raster, bw = 100) + + +sensitivity_score(pop100m_raster_rm) +sensitivity_score(pop100m_raster_qt1) +sensitivity_score(pop100m_raster_qt2) +sensitivity_score(pop100m_raster_sm) + +# remove remaining sensitive cells (if any) +pop100m_raster_qt1 <- remove_sensitive(pop100m_raster_qt1) +pop100m_raster_qt2 <- remove_sensitive(pop100m_raster_qt2) +pop100m_raster_sm <- remove_sensitive(pop100m_raster_sm) + + +# ----- (4a) inspect protected rasters ----- + +# visual comparison +plot(pop100m_raster_rm, "count", col = pal, sensitive = FALSE) +plot(pop100m_raster_qt1, "count", col = pal, sensitive = FALSE) +plot(pop100m_raster_qt2, "count", col = pal, sensitive = FALSE) +plot(pop100m_raster_sm, "count", col = pal, sensitive = FALSE) + + +# What's the cell-wise abs. difference between protected vs. unprotected? + +difference_rasters <- function(x, y) { + + # function is only needed when rasters contain NAs + na <- is.na(x) & is.na(y) + x[is.na(x)] <- 0 + y[is.na(y)] <- 0 + z <- x - y + z[na] <- NA + z +} + +# calculate differences +pop100m_raster_rm$value$diff <- difference_rasters(pop100m_raster_rm$value$count, + pop100m_raster$value$count) +pop100m_raster_qt1$value$diff <- difference_rasters(pop100m_raster_qt1$value$count, + pop100m_raster$value$count) +pop100m_raster_qt2$value$diff <- difference_rasters(pop100m_raster_qt2$value$count, + pop100m_raster$value$count) +pop100m_raster_sm$value$diff <- difference_rasters(pop100m_raster_sm$value$count, + pop100m_raster$value$count) + +# plot differences +plot(pop100m_raster_rm$value$diff, col = pal, main = "removal") +plot(pop100m_raster_qt1$value$diff, col = pal, main = "quadtree zoom 2") +plot(pop100m_raster_qt2$value$diff, col = pal, main = "quadtree zoom 3") +plot(pop100m_raster_sm$value$diff, col = pal, main = "kernel smoothing") + +# How much have the values of sensitive cells been changed? +hist(pop100m_raster_rm$value$diff[raster_sens], main = "removal") +hist(pop100m_raster_qt1$value$diff[raster_sens], main = "quadtree zoom 2") +hist(pop100m_raster_qt2$value$diff[raster_sens], main = "quadtree zoom 3") +hist(pop100m_raster_sm$value$diff[raster_sens], main = "kernel smoothing") + +# How many sensitive cells are unchanged? +sum(pop100m_raster_rm$value$diff[raster_sens] == 0) +sum(pop100m_raster_qt1$value$diff[raster_sens] == 0) +sum(pop100m_raster_qt2$value$diff[raster_sens] == 0) +sum(pop100m_raster_sm$value$diff[raster_sens] == 0) +# What share of sensitive cells is unchanged? +sum(pop100m_raster_rm$value$diff[raster_sens] == 0) / ncells_sens +sum(pop100m_raster_qt1$value$diff[raster_sens] == 0) / ncells_sens +sum(pop100m_raster_qt2$value$diff[raster_sens] == 0) / ncells_sens +sum(pop100m_raster_sm$value$diff[raster_sens] == 0) / ncells_sens + + +# Which cells were NA in orig. and now aren't? + +pop100m_raster_rm$value$outside <- + is.na(pop100m_raster$value$count) & !is.na(pop100m_raster_rm$value$count) +pop100m_raster_qt1$value$outside <- + is.na(pop100m_raster$value$count) & !is.na(pop100m_raster_qt1$value$count) +pop100m_raster_qt2$value$outside <- + is.na(pop100m_raster$value$count) & !is.na(pop100m_raster_qt2$value$count) +pop100m_raster_sm$value$outside <- + is.na(pop100m_raster$value$count) & !is.na(pop100m_raster_sm$value$count) + +sum(getValues(pop100m_raster_rm$value$outside)) # removal +sum(getValues(pop100m_raster_qt1$value$outside)) # quadtree zoom 2 +sum(getValues(pop100m_raster_qt2$value$outside)) # quadtree zoom 3 +sum(getValues(pop100m_raster_sm$value$outside)) # kernel smoothing + + +# ----- (4b) utility measures - general ----- + +###### fun: calculate utility values for protected maps ##### ### +# x: sdc_raster object - after map protection +# orig: sdc_raster object - original before protection +# value: value to compare ("count", "sum", ...) +# measure: one or several of the following +# - bin-by-bin: - +# RMSE: Root Mean Squared Error +# HD: Hellinger's Distance +# - cross-bin: - +# Moran: change in (global) Moran's I +# KWD: Kantorovic-Wasserstein Distance +# ... : additional parameters for SpatialKWD::compareOneToOne() + +get_utility <- function(x, orig, + value = "count", + measure = c("RMSE", "HD", "Moran", "KWD"), + ...) { + + u <- vector("numeric", length = length(measure)) + names(u) <- measure + + # extract value raster of interest + r_x <- x$value[[value]] + r_o <- orig$value[[value]] + + # extract cell values + v_x <- raster::getValues(r_x) + v_o <- raster::getValues(r_o) + + # set NAs to 0 + v_x[is.na(v_x)] <- 0 + v_o[is.na(v_o)] <- 0 + + for (i in seq(u)) { + + if (measure[i] == "RMSE") { + + # compute root mean squared error + u[i] <- sqrt(mean((v_x - v_o)^2)) + } + + if (measure[i] == "HD") { + + # compute Hellinger's distance + u[i] <- sqrt(sum((sqrt(v_o) - sqrt(v_x))^2)) * (1/(sqrt(2))) + } + + if (measure[i] == "Moran") { + + # change in (global) Moran's I + u[i] <- Moran(r_x) - Moran(r_o) + } + + if (measure[i] == "KWD") { + + # rescale to balance mass + v_x_sc <- v_x * (sum(v_o) / sum(v_x)) + + # approximate Kantorovic-Wasserstein distance + xy <- raster::xyFromCell(r_x, 1:raster::ncell(r_x)) + u[i] <- SpatialKWD::compareOneToOne(Coordinates = xy, + Weights = cbind(v_o, v_x_sc), + ...)$distance + } + } + + u +} + + +# assess protected maps with utility measures + +u_measures <- c("RMSE", "HD") + +u_rm <- get_utility(pop100m_raster_rm, + orig = pop100m_raster, value = "count", + measure = u_measures) +u_qt1 <- get_utility(pop100m_raster_qt1, + orig = pop100m_raster, value = "count", + measure = u_measures) +u_qt2 <- get_utility(pop100m_raster_qt2, + orig = pop100m_raster, value = "count", + measure = u_measures) +u_sm <- get_utility(pop100m_raster_sm, + orig = pop100m_raster, value = "count", + measure = u_measures) + +results <- as.data.frame(rbind(u_rm, u_qt1, u_qt2, u_sm)) +rownames(results) <- c("removal", "qt_zoom2", "qt_zoom3", "smoothing") +results + + +# ----- (4c) utility measures - sub-area (incl. KWD) ----- + +# Even with the fast approximation algorithm, computing KWD for a large map with +# small cell size can take a long time! +# Therefore we only demonstrate on a smaller sub-area + +# select a rectangular sub-area as bounding box of regional ID +ags_sub <- "01002000" # city of Kiel +grid_sub <- grid_100m[grid_100m$ags == ags_sub, ] +ext <- extent(min(grid_sub$x_sw), max(grid_sub$x_sw) + 100, + min(grid_sub$y_sw), max(grid_sub$y_sw) + 100) + +# crop rasters to sub-area +sub_raster <- pop100m_raster +sub_raster_rm <- pop100m_raster_rm +sub_raster_qt1 <- pop100m_raster_qt1 +sub_raster_qt2 <- pop100m_raster_qt2 +sub_raster_sm <- pop100m_raster_sm +sub_raster$value <- crop(sub_raster$value, ext) +sub_raster_rm$value <- crop(sub_raster_rm$value, ext) +sub_raster_qt1$value <- crop(sub_raster_qt1$value, ext) +sub_raster_qt2$value <- crop(sub_raster_qt2$value, ext) +sub_raster_sm$value <- crop(sub_raster_sm$value, ext) + +# visually inspect sub-area +par(mfrow = c(2, 3)) +plot(sub_raster, "count", col = pal, sensitive = FALSE, main = "unprotected") +plot(sub_raster_rm, "count", col = pal, sensitive = FALSE, main = "removal") +plot(sub_raster_qt1, "count", col = pal, sensitive = FALSE, main = "quadtree zoom 2") +plot(sub_raster_qt2, "count", col = pal, sensitive = FALSE, main = "quadtree zoom 3") +plot(sub_raster_sm, "count", col = pal, sensitive = FALSE, main = "kernel smoothing (100m bw)") +par(mfrow = c(1, 1)) + +# assess sub-area with full set of utility measures + +u_sub_rm <- get_utility(sub_raster_rm, orig = sub_raster, value = "count") +u_sub_qt1 <- get_utility(sub_raster_qt1, orig = sub_raster, value = "count") +u_sub_qt2 <- get_utility(sub_raster_qt2, orig = sub_raster, value = "count") +u_sub_sm <- get_utility(sub_raster_sm, orig = sub_raster, value = "count") + +results_sub <- as.data.frame(rbind(u_sub_rm, u_sub_qt1, u_sub_qt2, u_sub_sm)) +rownames(results_sub) <- c("removal", "qt_zoom2", "qt_zoom3", "smoothing") +results_sub + +# inspect results +ggplot(results_sub %>% mutate(method = rownames(results_sub))) + + geom_point(aes(HD, KWD, color = method)) + + +# ----- (5a) KWD illustration 1 ----- + +# KWD can yield a different rank-order of protection mechanisms than bin-by-bin +# utility metrics, if the mechanism shifts distribution mass without concern for +# geographic neighborhood (e.g. removal, random swapping). + +# If, however, the protection mechanism respects geographic neighborhood by design +# (quadtree, smoothing, small random perturbation), then bin-by-bin metrics and +# KWD give the same answer (same rank order w.r.t. utility). + +data("dwellings") # pseudo household locations (from sdcSpatial) +dw_or <- sdc_raster(dwellings[, c("x", "y")], variable = 1) + +# do both methods with increasingly strong parameters +dw_qt1 <- protect_quadtree(dw_or, max_zoom = 2) +dw_qt2 <- protect_quadtree(dw_or, max_zoom = 3) +dw_qt3 <- protect_quadtree(dw_or, max_zoom = 4) +dw_sm1 <- protect_smooth(dw_or, bw = 200) +dw_sm2 <- protect_smooth(dw_or, bw = 300) +dw_sm3 <- protect_smooth(dw_or, bw = 400) + +# get utility +u_qt1 <- round(get_utility(dw_qt1, dw_or, value = "count"), 3) +u_qt2 <- round(get_utility(dw_qt2, dw_or, value = "count"), 3) +u_qt3 <- round(get_utility(dw_qt3, dw_or, value = "count"), 3) +u_sm1 <- round(get_utility(dw_sm1, dw_or, value = "count"), 3) +u_sm2 <- round(get_utility(dw_sm2, dw_or, value = "count"), 3) +u_sm3 <- round(get_utility(dw_sm3, dw_or, value = "count"), 3) + +(results_dw <- as.data.frame(rbind(u_qt1, u_qt2, u_qt3, u_sm1, u_sm2, u_sm3))) + +# plot results +ggplot(results_dw, aes(RMSE, KWD)) + + geom_point() + + geom_text(aes(label = rownames(results_dw), x = RMSE + 0.5, y = KWD + 0.03)) + + geom_abline(intercept = coef(lm(KWD ~ RMSE, data = results_dw))[1], + slope = coef(lm(KWD ~ RMSE, data = results_dw))[2], + color = "blue", lty = "dashed") + +# inspect visual impressions +par(mfrow = c(2, 3)) +plot(dw_qt1, "count", sensitive = FALSE, col = pal, + main = paste("KWD", u_qt1[4], " RMSE", u_qt1[1])) +plot(dw_qt2, "count", sensitive = FALSE, col = pal, + main = paste("KWD", u_qt2[4], " RMSE", u_qt2[1])) +plot(dw_qt3, "count", sensitive = FALSE, col = pal, + main = paste("KWD", u_qt3[4], " RMSE", u_qt3[1])) +plot(dw_sm1, "count", sensitive = FALSE, col = pal, + main = paste("KWD", u_sm1[4], " RMSE", u_sm1[1])) +plot(dw_sm2, "count", sensitive = FALSE, col = pal, + main = paste("KWD", u_sm2[4], " RMSE", u_sm2[1])) +plot(dw_sm3, "count", sensitive = FALSE, col = pal, + main = paste("KWD", u_sm3[4], " RMSE", u_sm3[1])) +par(mfrow = c(1, 1)) + + +# ----- (5b) KWD illustration 2 ----- + +# KWD seems to give preference to the quadtree method over smoothing; +# this does not necessarily correspond to human visual impression + +# make small toy data set +set.seed(3000) +pts_exa <- as.data.frame(spatstat.random::rMatClust(20, 0.1, 100)) +pts_exa <- pts_exa * 1000 + +test_raster <- sdc_raster(pts_exa, variable = 1, r = 50, + min_count = 5, max_risk = 1.0) + +## compare: quadtree vs. smoothing + +test_raster_qt <- protect_quadtree(test_raster) +test_raster_sm <- protect_smooth(test_raster, bw = 50) + +u_test_qt <- round(get_utility(test_raster_qt, orig = test_raster, + value = "count", measure = c("RMSE", "KWD")), 4) +u_test_sm <- round(get_utility(test_raster_sm, orig = test_raster, + value = "count", measure = c("RMSE", "KWD")), 4) + +par(mfrow = c(1, 3)) +plot(test_raster, "count", sensitive = FALSE, + main = "original") +plot(test_raster_qt, "count", sensitive = FALSE, + main = "quadtree", + sub = paste("KWD:", u_test_qt[2], " RMSE:", u_test_qt[1])) +plot(test_raster_sm, "count", sensitive = FALSE, + main = "smoothing", + sub = paste("KWD:", u_test_sm[2], " RMSE:", u_test_sm[1])) +par(mfrow = c(1, 1)) + +# The KWD is, in principle, well suited to compare 2 raster maps, but does it +# capture 'similarity of visual impression'? Some doubt is in order. +# KWD seems to penalize small distribution mass, transported over medium distances, +# more than the eyeball. + +# -> Idea: remove small distribution mass; compare only 'hot spots' + + +# remove the lower [tsh]% of distribution mass +tsh <- 0.2 +hs <- tsh * max(getValues(test_raster$value$count), na.rm = TRUE) +hs_qt <- tsh * max(getValues(test_raster_qt$value$count), na.rm = TRUE) +hs_sm <- tsh * max(getValues(test_raster_sm$value$count), na.rm = TRUE) + +# raster layer of 'hot spots' - original +hsr <- test_raster$value$count +hsr@data@values <- ifelse(hsr@data@values > hs, hsr@data@values, NA) +test_raster$value$hs <- hsr +# raster layer of 'hot spots' - quadtree +hsr <- test_raster_qt$value$count +hsr@data@values <- ifelse(hsr@data@values > hs_qt, hsr@data@values, NA) +test_raster_qt$value$hs <- hsr +# raster layer of 'hot spots' - smoothing +hsr <- test_raster_sm$value$count +hsr@data@values <- ifelse(hsr@data@values > hs_sm, hsr@data@values, NA) +test_raster_sm$value$hs <- hsr + +# re-compute KWD & RMSE for 'hot spots' only +u_test_qt_hs <- round(get_utility(test_raster_qt, orig = test_raster, + value = "hs", measure = c("RMSE", "KWD")), 4) +u_test_sm_hs <- round(get_utility(test_raster_sm, orig = test_raster, + value = "hs", measure = c("RMSE", "KWD")), 4) + +par(mfrow = c(1, 3)) +plot(test_raster, "hs", sensitive = FALSE, + main = "original") +plot(test_raster_qt, "hs", sensitive = FALSE, + main = "quadtree", + sub = paste("KWD:", u_test_qt_hs[2], " RMSE:", u_test_qt[1])) +plot(test_raster_sm, "hs", sensitive = FALSE, + main = "smoothing", + sub = paste("KWD:", u_test_sm_hs[2], " RMSE:", u_test_sm[1])) +par(mfrow = c(1, 1)) + +# KWD for 'hot spot'-maps, rather than full maps, seems to be more in line with +# 'visual experience'. How to finally assess the map? +# Maybe some (weighted) average of KWD for full map vs. for 'hot spot' map? + diff --git a/Example_spatialKWD_focusAreas.Rmd b/Example_spatialKWD_focusAreas.Rmd new file mode 100644 index 0000000..444a9cc --- /dev/null +++ b/Example_spatialKWD_focusAreas.Rmd @@ -0,0 +1,237 @@ +--- +title: "Reproducible example of current open issues with the SpatialKWD package" +output: + html_document: + df_print: paged + toc: true +--- + +2023-08-03 + +## 1. Preparing example data + +```{r} +library(SpatialKWD) +library(sp) +library(raster) +library(sdcSpatial) +``` + +The issues are illustrated by means of an example data set from the sdcSpatial package called +'dwellings'. The first step is to create the unprotected raster map, which forms the +ground truth. + +The example uses a map of counts (no. of units per grid cell), with 200m $\times$ +200m grid cells in a 62 $\times$ 61 cells raster map. + +```{r} +data("dwellings") + +dw <- sdcSpatial::sdc_raster(dwellings[, c("x", "y")], + variable = 1, r = 200, + max_risk = .8, min_count = 10) + +ras_orig <- dw$value$count # original raster + +raster::plot(ras_orig, main = "no. of dwellings") # map +ras_orig # raster info +``` + +Next, we create the altered raster maps using three protection methods: +1) suppression of sensitive cells (remove_sensitive), +2) quadtree method (protect_quadtree), +3) kernel smoothing (protect_smooth). + +```{r} +# apply protection methods +dw_rm <- remove_sensitive(dw) +dw_qt <- protect_quadtree(dw, max_zoom = 2) +dw_sm <- protect_smooth(dw, bw = 200) + +# extract protected rasters from sdcSpatial object +ras_rm <- dw_rm$value$count +ras_qt <- dw_qt$value$count +ras_sm <- dw_sm$value$count +``` + +Following up, we select a square focus area. It is identified by its centroid and +radius. + +```{r} +# Focus area (square) + +fa_ctrd <- c(155100, 464100) # center of focus area +fa_rads <- 3000 # radius of focus area + + +# maps + +par(mfrow = c(2, 2)) + +plot(ras_orig, main = "original") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") +plot(ras_rm, main = "cell suppression") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") +plot(ras_qt, main = "quad tree method") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") +plot(ras_sm, main = "smoothing method") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") + +par(mfrow = c(1, 1)) +``` + + +## 2. FocusArea sometimes returns zero for clear differences + +We continue by evaluating the difference between altered maps and ground truth with +respect to the chosen focus area. + +This involves the following steps: +1) extract the center coordinates of tiles (raster::xyFromCell), +2) extract the tile-values (weights) (raster::getValues) [where NA-values are +treated as zeros]. + +```{r} +# extract coordinates from raster +xy <- raster::xyFromCell(ras_orig, 1:ncell(ras_orig)) + +# extract cell values from rasters +w_orig <- raster::getValues(ras_orig) +w_rm <- raster::getValues(ras_rm) +w_qt <- raster::getValues(ras_qt) +w_sm <- raster::getValues(ras_sm) + +# treat NA-nodes as having zero weight +w_orig[is.na(w_orig)] <- 0 +w_rm[is.na(w_rm)] <- 0 +w_qt[is.na(w_qt)] <- 0 +w_sm[is.na(w_sm)] <- 0 +``` + +Using these inputs we calculate the first set of KWDs. + +```{r} +kwd_rm <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_rm), + L = 3, recode = TRUE, area = "linf", + x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads) +kwd_qt <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_qt), + L = 3, recode = TRUE, area = "linf", + x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads) +kwd_sm <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_sm), + L = 3, recode = TRUE, area = "linf", + x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads) +``` + +Even though the suppression map is clearly different (all cells with < 10 units are +removed, see maps above), we get a KWD of zero. + +```{r} +c(kwd_rm$distance, kwd_qt$distance, kwd_sm$distance) +``` + +Second try: The description of the focusArea function tells us that "All the weights outside +the focus area should be equal to zero." We create new rasters, where all tiles outside +the focus area are assigned zero values. + +```{r} +ex_fa <- extent(fa_ctrd[1] - (fa_rads - 100), fa_ctrd[1] + (fa_rads - 100), + fa_ctrd[2] - (fa_rads - 100), fa_ctrd[2] + (fa_rads - 100)) +clls_fa <- cellsFromExtent(ras_orig, ex_fa) + +ras_rm_fa <- ras_rm +ras_qt_fa <- ras_qt +ras_sm_fa <- ras_sm +ras_rm_fa[-clls_fa] <- 0 +ras_qt_fa[-clls_fa] <- 0 +ras_sm_fa[-clls_fa] <- 0 + + +par(mfrow = c(2, 2)) + +plot(ras_orig, main = "original") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") +plot(ras_rm_fa, main = "cell suppression (focus area)") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") +plot(ras_qt_fa, main = "quad tree method (focus area)") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") +plot(ras_sm_fa, main = "smoothing method (focus area)") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") + +par(mfrow = c(1, 1)) +``` + +With these we re-calculate the KWDs. + +```{r} +# updated weights (zero outside focus area) +w_rm_fa <- getValues(ras_rm_fa) +w_qt_fa <- getValues(ras_qt_fa) +w_sm_fa <- getValues(ras_sm_fa) +w_rm_fa[is.na(w_rm_fa)] <- 0 +w_qt_fa[is.na(w_qt_fa)] <- 0 +w_sm_fa[is.na(w_sm_fa)] <- 0 + +kwd_rm_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_rm_fa), + L = 3, recode = TRUE, area = "linf", + x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads) +kwd_qt_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_qt_fa), + L = 3, recode = TRUE, area = "linf", + x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads) +kwd_sm_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_sm_fa), + L = 3, recode = TRUE, area = "linf", + x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads) +``` + +We still get a KWD of zero for the 'removal' method. Furthermore, we now also get zero +for smoothing. + +```{r} +c(kwd_rm_fa$distance, kwd_qt_fa$distance, kwd_sm_fa$distance) +``` + + +## 3. How to apply a fixed cost to missing units of mass? + +Suppose now we look at the whole map rather than focus areas. Then, we might get +a mass mismatch, especially with removal: + +```{r} +c(sum(w_orig), sum(w_rm)) +``` + +There are ca. 2000 units missing after removal. We want to try solving the problem +by assigning each a cost of half the map diagonal. + +```{r} +# compute length of the diagonal in cells +diag_cells <- sqrt(ncol(ras_orig)^2 + nrow(ras_orig)^2) +# ... in meters +diag_metrs <- diag_cells * 200 + +# solve with extra artificial bin (?) +kwd_rm_bal <- compareOneToOne(Coordinates = xy, Weights = cbind(w_orig, w_rm), + L = 3, recode = TRUE, + unbalanced = TRUE, unbal_cost = diag_metrs/2) +``` + +Again, this yields a distance of zero. + +```{r} +kwd_rm_bal$distance +``` + +Do we get a distance of zero, because all non-zero tiles after removal match exactly? +(I.e. for any given tile the mass is missing completely or not at all.) + +Given that this is the case, we could, in principle, assign the costs manually +(2000 $\times c$ ). +Still, intuitively the function should not return zero (?) + diff --git a/Example_spatialKWD_focusAreas.html b/Example_spatialKWD_focusAreas.html new file mode 100644 index 0000000..9442cfd --- /dev/null +++ b/Example_spatialKWD_focusAreas.html @@ -0,0 +1,440 @@ + + + + +
+ + + + + + + + +2023-08-03
+library(SpatialKWD)
+library(sp)
+library(raster)
+library(sdcSpatial)
+The issues are illustrated by means of an example data set from the sdcSpatial package called ‘dwellings’. The first step is to create the unprotected raster map, which forms the ground truth.
+The example uses a map of counts (no. of units per grid cell), with 200m \(\times\) 200m grid cells in a 62 \(\times\) 61 cells raster map.
+data("dwellings")
+
+dw <- sdcSpatial::sdc_raster(dwellings[, c("x", "y")],
+ variable = 1, r = 200,
+ max_risk = .8, min_count = 10)
+
+ras_orig <- dw$value$count # original raster
+
+raster::plot(ras_orig, main = "no. of dwellings") # map
+ras_orig # raster info
+## class : RasterLayer
+## dimensions : 62, 61, 3782 (nrow, ncol, ncell)
+## resolution : 200, 200 (x, y)
+## extent : 149400, 161600, 457800, 470200 (xmin, xmax, ymin, ymax)
+## crs : NA
+## source : memory
+## names : count
+## values : 1, 329 (min, max)
+Next, we create the altered raster maps using three protection methods: 1) suppression of sensitive cells (remove_sensitive), 2) quadtree method (protect_quadtree), 3) kernel smoothing (protect_smooth).
+# apply protection methods
+dw_rm <- remove_sensitive(dw)
+dw_qt <- protect_quadtree(dw, max_zoom = 2)
+dw_sm <- protect_smooth(dw, bw = 200)
+
+# extract protected rasters from sdcSpatial object
+ras_rm <- dw_rm$value$count
+ras_qt <- dw_qt$value$count
+ras_sm <- dw_sm$value$count
+Following up, we select a square focus area. It is identified by its centroid and radius.
+# Focus area (square)
+
+fa_ctrd <- c(155100, 464100) # center of focus area
+fa_rads <- 3000 # radius of focus area
+
+
+# maps
+
+par(mfrow = c(2, 2))
+
+plot(ras_orig, main = "original")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_rm, main = "cell suppression")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_qt, main = "quad tree method")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_sm, main = "smoothing method")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+par(mfrow = c(1, 1))
+We continue by evaluating the difference between altered maps and ground truth with respect to the chosen focus area.
+This involves the following steps: 1) extract the center coordinates of tiles (raster::xyFromCell), 2) extract the tile-values (weights) (raster::getValues) [where NA-values are treated as zeros].
+# extract coordinates from raster
+xy <- raster::xyFromCell(ras_orig, 1:ncell(ras_orig))
+
+# extract cell values from rasters
+w_orig <- raster::getValues(ras_orig)
+w_rm <- raster::getValues(ras_rm)
+w_qt <- raster::getValues(ras_qt)
+w_sm <- raster::getValues(ras_sm)
+
+# treat NA-nodes as having zero weight
+w_orig[is.na(w_orig)] <- 0
+w_rm[is.na(w_rm)] <- 0
+w_qt[is.na(w_qt)] <- 0
+w_sm[is.na(w_sm)] <- 0
+Using these inputs we calculate the first set of KWDs.
+kwd_rm <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_rm),
+ L = 3, recode = TRUE, area = "linf",
+ x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+## FocusArea, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+kwd_qt <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_qt),
+ L = 3, recode = TRUE, area = "linf",
+ x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+## FocusArea, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+kwd_sm <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_sm),
+ L = 3, recode = TRUE, area = "linf",
+ x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+## FocusArea, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+Even though the suppression map is clearly different (all cells with < 10 units are removed, see maps above), we get a KWD of zero.
+c(kwd_rm$distance, kwd_qt$distance, kwd_sm$distance)
+## [1] 0.000 6514.312 2332.205
+Second try: The description of the focusArea function tells us that “All the weights outside the focus area should be equal to zero.” We create new rasters, where all tiles outside the focus area are assigned zero values.
+ex_fa <- extent(fa_ctrd[1] - (fa_rads - 100), fa_ctrd[1] + (fa_rads - 100),
+ fa_ctrd[2] - (fa_rads - 100), fa_ctrd[2] + (fa_rads - 100))
+clls_fa <- cellsFromExtent(ras_orig, ex_fa)
+
+ras_rm_fa <- ras_rm
+ras_qt_fa <- ras_qt
+ras_sm_fa <- ras_sm
+ras_rm_fa[-clls_fa] <- 0
+ras_qt_fa[-clls_fa] <- 0
+ras_sm_fa[-clls_fa] <- 0
+
+
+par(mfrow = c(2, 2))
+
+plot(ras_orig, main = "original")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_rm_fa, main = "cell suppression (focus area)")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_qt_fa, main = "quad tree method (focus area)")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_sm_fa, main = "smoothing method (focus area)")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+par(mfrow = c(1, 1))
+With these we re-calculate the KWDs.
+# updated weights (zero outside focus area)
+w_rm_fa <- getValues(ras_rm_fa)
+w_qt_fa <- getValues(ras_qt_fa)
+w_sm_fa <- getValues(ras_sm_fa)
+w_rm_fa[is.na(w_rm_fa)] <- 0
+w_qt_fa[is.na(w_qt_fa)] <- 0
+w_sm_fa[is.na(w_sm_fa)] <- 0
+
+kwd_rm_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_rm_fa),
+ L = 3, recode = TRUE, area = "linf",
+ x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+## FocusArea, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+kwd_qt_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_qt_fa),
+ L = 3, recode = TRUE, area = "linf",
+ x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+## FocusArea, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+kwd_sm_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_sm_fa),
+ L = 3, recode = TRUE, area = "linf",
+ x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+## FocusArea, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+We still get a KWD of zero for the ‘removal’ method. Furthermore, we now also get zero for smoothing.
+c(kwd_rm_fa$distance, kwd_qt_fa$distance, kwd_sm_fa$distance)
+## [1] 0.000 2748.876 0.000
+Suppose now we look at the whole map rather than focus areas. Then, we might get a mass mismatch, especially with removal:
+c(sum(w_orig), sum(w_rm))
+## [1] 90603 88530
+There are ca. 2000 units missing after removal. We want to try solving the problem by assigning each a cost of half the map diagonal.
+# compute length of the diagonal in cells
+diag_cells <- sqrt(ncol(ras_orig)^2 + nrow(ras_orig)^2)
+# ... in meters
+diag_metrs <- diag_cells * 200
+
+# solve with extra artificial bin (?)
+kwd_rm_bal <- compareOneToOne(Coordinates = xy, Weights = cbind(w_orig, w_rm),
+ L = 3, recode = TRUE,
+ unbalanced = TRUE, unbal_cost = diag_metrs/2)
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+Again, this yields a distance of zero.
+kwd_rm_bal$distance
+## [1] 0
+Do we get a distance of zero, because all non-zero tiles after removal match exactly? (I.e. for any given tile the mass is missing completely or not at all.)
+Given that this is the case, we could, in principle, assign the costs manually (2000 \(\times c\) ). Still, intuitively the function should not return zero (?)
+library(SpatialKWD)
+library(sp)
+library(raster)
+library(sdcSpatial)
+
+
+
+The issues are illustrated by means of an example data set from the sdcSpatial package called ‘dwellings’. The first step is to create the unprotected raster map, which forms the ground truth.
+The example uses a map of counts (no. of units per grid cell), with 200m \(\times\) 200m grid cells in a 62 \(\times\) 61 cells raster map.
+ + + +data("dwellings")
+
+dw <- sdcSpatial::sdc_raster(dwellings[, c("x", "y")],
+ variable = 1, r = 200,
+ max_risk = .8, min_count = 5)
+
+ras_orig <- dw$value$count # original raster
+
+raster::plot(ras_orig) # map
+ras_orig # raster info
+
+
+
+Next, we create the altered raster maps using three protection methods: 1) suppression of sensitive cells (remove_sensitive), 2) quadtree method (protect_quadtree), 3) kernel smoothing (protect_smooth).
+ + + +# apply protection methods
+dw_rm <- remove_sensitive(dw)
+dw_qt <- protect_quadtree(dw, max_zoom = 2)
+dw_sm <- protect_smooth(dw, bw = 200)
+
+# extract protected rasters from sdcSpatial object
+ras_rm <- dw_rm$value$count
+ras_qt <- dw_qt$value$count
+ras_sm <- dw_sm$value$count
+
+
+
+Following up, we select a square focus area. It is identified by its centroid and radius.
+ + + +# Focus area (square)
+
+fa_ctrd <- c(155100, 464100) # center of focus area
+fa_rads <- 3000 # radius of focus area
+
+
+par(mfrow = c(2, 2))
+
+plot(ras_orig, main = "original")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_rm, main = "cell suppression")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_qt, main = "quad tree method")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_sm, main = "smoothing method")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+
+par(mfrow = c(1, 1))
+
+
+
+
+
+
+# ----- KWDs -----
+
+# extract coordinates from raster
+xy <- raster::xyFromCell(ras_orig, 1:ncell(ras_orig))
+
+# extract cell values from rasters
+w_orig <- raster::getValues(ras_orig)
+w_rm <- raster::getValues(ras_rm)
+w_qt <- raster::getValues(ras_qt)
+w_sm <- raster::getValues(ras_sm)
+
+# treat NA-nodes as having zero weight
+w_orig[is.na(w_orig)] <- 0
+w_rm[is.na(w_rm)] <- 0
+w_qt[is.na(w_qt)] <- 0
+w_sm[is.na(w_sm)] <- 0
+
+kwd_rm <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_rm),
+ L = 3, recode = TRUE, area = "linf",
+ x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+kwd_qt <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_qt),
+ L = 3, recode = TRUE, area = "linf",
+ x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+kwd_sm <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_sm),
+ L = 3, recode = TRUE, area = "linf",
+ x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+
+# removal gives dist. of 0, although clearly different
+c(kwd_rm$distance, kwd_qt$distance, kwd_sm$distance)
+
+
+# try with weights outside FA set to zero
+ex_fa <- extent(fa_ctrd[1] - (fa_rads - 100), fa_ctrd[1] + (fa_rads - 100),
+ fa_ctrd[2] - (fa_rads - 100), fa_ctrd[2] + (fa_rads - 100))
+clls_fa <- cellsFromExtent(ras_orig, ex_fa)
+
+ras_rm_fa <- ras_rm
+ras_qt_fa <- ras_qt
+ras_sm_fa <- ras_sm
+ras_rm_fa[-clls_fa] <- 0
+ras_qt_fa[-clls_fa] <- 0
+ras_sm_fa[-clls_fa] <- 0
+
+
+par(mfrow = c(2, 2))
+
+plot(ras_orig, main = "original")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_rm_fa, main = "cell suppression (focus area)")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_qt_fa, main = "quad tree method (focus area)")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_sm_fa, main = "smoothing method (focus area)")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+ ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+
+par(mfrow = c(1, 1))
+
+# updated weights (zero outside focus area)
+w_rm_fa <- getValues(ras_rm_fa)
+w_qt_fa <- getValues(ras_qt_fa)
+w_sm_fa <- getValues(ras_sm_fa)
+w_rm_fa[is.na(w_rm_fa)] <- 0
+w_qt_fa[is.na(w_qt_fa)] <- 0
+w_sm_fa[is.na(w_sm_fa)] <- 0
+
+kwd_rm_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_rm_fa),
+ L = 3, recode = TRUE, area = "linf",
+ x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+kwd_qt_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_qt_fa),
+ L = 3, recode = TRUE, area = "linf",
+ x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+kwd_sm_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_sm_fa),
+ L = 3, recode = TRUE, area = "linf",
+ x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+
+c(kwd_rm_fa$distance, kwd_qt_fa$distance, kwd_sm_fa$distance)
+
+
+
+
+We use pre-computed raster maps from 02_sdcSpatial_DE.R, see GitHub repo.
library(sdcSpatial)
+library(SpatialKWD)
+library(sp)
+library(raster)
+library(viridisLite)
+library(viridis)
+
+load("sdcRasters_SHARE.RData")
+
+pal <- rev(viridis(10))
+Selecting four focus areas by: centroid (center point in ETRS89-LAEA) and size (measured in cell width).
+fa1_ctrd <- c(4122250, 3162750)
+fa2_ctrd <- c(4195750, 2993750)
+fa3_ctrd <- c(4535250, 3474750)
+fa4_ctrd <- c(4328750, 2751250)
+
+cellsize <- 500
+
+# focus area sizes
+fa_wcells <- c(55, 41, 75, 55)
+
+# find cell index of centroid in larger raster
+fa1_cntr <- c(raster::colFromX(pop_ras$value, x = fa1_ctrd[1]),
+ raster::rowFromY(pop_ras$value, y = fa1_ctrd[2]))
+fa2_cntr <- c(raster::colFromX(pop_ras$value, x = fa2_ctrd[1]),
+ raster::rowFromY(pop_ras$value, y = fa2_ctrd[2]))
+fa3_cntr <- c(raster::colFromX(pop_ras$value, x = fa3_ctrd[1]),
+ raster::rowFromY(pop_ras$value, y = fa3_ctrd[2]))
+fa4_cntr <- c(raster::colFromX(pop_ras$value, x = fa4_ctrd[1]),
+ raster::rowFromY(pop_ras$value, y = fa4_ctrd[2]))
+
+# create extent objects for chosen positions & sizes
+fa1_extt <- extent(pop_ras$value, fa1_cntr[2] - (fa_wcells[1] - 1)/2,
+ fa1_cntr[2] + (fa_wcells[1] - 1)/2,
+ fa1_cntr[1] - (fa_wcells[1] - 1)/2,
+ fa1_cntr[1] + (fa_wcells[1] - 1)/2)
+
+fa2_extt <- extent(pop_ras$value, fa2_cntr[2] - (fa_wcells[2] - 1)/2,
+ fa2_cntr[2] + (fa_wcells[2] - 1)/2,
+ fa2_cntr[1] - (fa_wcells[2] - 1)/2,
+ fa2_cntr[1] + (fa_wcells[2] - 1)/2)
+
+fa3_extt <- extent(pop_ras$value, fa3_cntr[2] - (fa_wcells[3] - 1)/2,
+ fa3_cntr[2] + (fa_wcells[3] - 1)/2,
+ fa3_cntr[1] - (fa_wcells[3] - 1)/2,
+ fa3_cntr[1] + (fa_wcells[3] - 1)/2)
+
+fa4_extt <- extent(pop_ras$value, fa4_cntr[2] - (fa_wcells[4] - 1)/2,
+ fa4_cntr[2] + (fa_wcells[4] - 1)/2,
+ fa4_cntr[1] - (fa_wcells[4] - 1)/2,
+ fa4_cntr[1] + (fa_wcells[4] - 1)/2)
+
+# crop overall raster to focus area for plotting & pop. counts
+fa1 <- fa2 <- fa3 <- fa4 <- pop_ras
+fa1$value <- crop(fa1$value, fa1_extt)
+fa2$value <- crop(fa2$value, fa2_extt)
+fa3$value <- crop(fa3$value, fa3_extt)
+fa4$value <- crop(fa4$value, fa4_extt)
+
+par(mfrow = c(2, 2))
+ plot(fa1, "count", sensitive = FALSE, col = pal, main = "Ruhr valley")
+ plot(fa2, "count", sensitive = FALSE, col = pal, main = "MZ + WI")
+ plot(fa3, "count", sensitive = FALSE, col = pal, main = "Strelasund")
+ plot(fa4, "count", sensitive = FALSE, col = pal, main = "Allgäu")
+par(mfrow = c(1, 1))
+
+# calculate populations for subregions
+p_fa <- c(sum(getValues(fa1$value$count), na.rm = TRUE),
+ sum(getValues(fa2$value$count), na.rm = TRUE),
+ sum(getValues(fa3$value$count), na.rm = TRUE),
+ sum(getValues(fa4$value$count), na.rm = TRUE))
+
+fa_sizes <- cbind(fa_wcells,
+ fa_wcells^2,
+ fa_wcells^2 * cellsize^2 / 1e+6,
+ p_fa)
+colnames(fa_sizes) <- c("width (ncells)", "size (ncells)", "size (km^2)", "pop.")
+rownames(fa_sizes) <- c("fa1", "fa2", "fa3", "fa4")
+fa_sizes
+## width (ncells) size (ncells) size (km^2) pop.
+## fa1 55 3025 756.25 1745062
+## fa2 41 1681 420.25 558055
+## fa3 75 5625 1406.25 107863
+## fa4 55 3025 756.25 106283
+We start by protecting focus areas, using the sdcSpatial algorithms. For this, wrapper functions are used:
+wrap_qt <- function(x, max_zoom = Inf, enlarge = TRUE) {
+
+ if(enlarge) {
+ pad <- min(2^max_zoom - 1, 10) # maximum padding is 10 (arbitrary)
+ x$value <- extend(x$value, c(pad, pad))
+ x <- sdcSpatial::protect_quadtree(x, max_zoom)
+ } else {
+ x <- sdcSpatial::protect_quadtree(x, max_zoom)
+ }
+ x
+}
+
+wrap_sm <- function(x, bw = res(x$value)[1], enlarge = TRUE) {
+
+ if(enlarge) {
+ pad <- 3 * (bw / res(x$value)[1])
+ x$value <- extend(x$value, c(pad, pad))
+ x <- sdcSpatial::protect_smooth(x, bw)
+ } else {
+ x <- sdcSpatial::protect_smooth(bw)
+ }
+ x
+}
+The idea behind these wrappers is to add some empty cells around the borders of the focus areas in order to capture distribution mass that ‘spills out’. This mass can then be taken into account later-on (esp. for KWD computation).
+The necessary enlargement depends on the protection mechanism and can be inferred by looking into the functions sdcSpatial::protect_quadtree() and sdcSpatial::protect_smooth().
+For the quadtree method we need to enlarge the grid by \[2^{\text{max_zoom} - 1} - 1\] rows / columns on each side. For Gaussian smoothing it is \[3 \cdot \frac{\text{bandwidth}}{\text{cellwidth}}\] rows / columns on each side.
+We call the (wrapped) protection algorithms separately for each focus area (4 methods times 4 areas = 16 outcomes).
+par(mfrow = c(2, 2))
+
+# removal
+fa1_rm <- remove_sensitive(fa1)
+fa2_rm <- remove_sensitive(fa2)
+fa3_rm <- remove_sensitive(fa3)
+fa4_rm <- remove_sensitive(fa4)
+
+# quad tree I
+fa1_qt1 <- wrap_qt(fa1, max_zoom = 2)
+fa2_qt1 <- wrap_qt(fa2, max_zoom = 2)
+fa3_qt1 <- wrap_qt(fa3, max_zoom = 2)
+fa4_qt1 <- wrap_qt(fa4, max_zoom = 2)
+
+# quad tree II
+fa1_qt2 <- wrap_qt(fa1, max_zoom = 3)
+fa2_qt2 <- wrap_qt(fa2, max_zoom = 3)
+fa3_qt2 <- wrap_qt(fa3, max_zoom = 3)
+fa4_qt2 <- wrap_qt(fa4, max_zoom = 3)
+
+# smoothing
+fa1_sm <- wrap_sm(fa1, bw = cellsize)
+fa2_sm <- wrap_sm(fa2, bw = cellsize)
+fa3_sm <- wrap_sm(fa3, bw = cellsize)
+fa4_sm <- wrap_sm(fa4, bw = cellsize)
+
+
+# show results
+
+plot(fa1_rm, "count", sensitive = FALSE, col = pal, main = "Ruhr valley (rm)")
+plot(fa2_rm, "count", sensitive = FALSE, col = pal, main = "MZ + WI (rm)")
+plot(fa3_rm, "count", sensitive = FALSE, col = pal, main = "Strelasund (rm)")
+plot(fa4_rm, "count", sensitive = FALSE, col = pal, main = "Allgäu (rm)")
+plot(fa1_qt1, "count", sensitive = FALSE, col = pal, main = "Ruhr valley (qt1)")
+plot(fa2_qt1, "count", sensitive = FALSE, col = pal, main = "MZ + WI (qt1)")
+plot(fa3_qt1, "count", sensitive = FALSE, col = pal, main = "Strelasund (qt1)")
+plot(fa4_qt1, "count", sensitive = FALSE, col = pal, main = "Allgäu (qt1)")
+plot(fa1_qt2, "count", sensitive = FALSE, col = pal, main = "Ruhr valley (qt2)")
+plot(fa2_qt2, "count", sensitive = FALSE, col = pal, main = "MZ + WI (qt2)")
+plot(fa3_qt2, "count", sensitive = FALSE, col = pal, main = "Strelasund (qt2)")
+plot(fa4_qt2, "count", sensitive = FALSE, col = pal, main = "Allgäu (qt2)")
+plot(fa1_sm, "count", sensitive = FALSE, col = pal, main = "Ruhr valley (sm)")
+plot(fa2_sm, "count", sensitive = FALSE, col = pal, main = "MZ + WI (sm)")
+plot(fa3_sm, "count", sensitive = FALSE, col = pal, main = "Strelasund (sm)")
+plot(fa4_sm, "count", sensitive = FALSE, col = pal, main = "Allgäu (sm)")
+par(mfrow = c(1, 1))
+Note that for qt1, qt2 and sm the mass spills slightly beyond the original borders. We can show that with the exception of the removal method, all mass has been preserved:
+mass_rm <- c(sum(getValues(fa1_rm$value$count), na.rm = TRUE),
+ sum(getValues(fa2_rm$value$count), na.rm = TRUE),
+ sum(getValues(fa3_rm$value$count), na.rm = TRUE),
+ sum(getValues(fa4_rm$value$count), na.rm = TRUE))
+
+mass_qt1 <- c(sum(getValues(fa1_qt1$value$count), na.rm = TRUE),
+ sum(getValues(fa2_qt1$value$count), na.rm = TRUE),
+ sum(getValues(fa3_qt1$value$count), na.rm = TRUE),
+ sum(getValues(fa4_qt1$value$count), na.rm = TRUE))
+
+mass_qt2 <- c(sum(getValues(fa1_qt2$value$count), na.rm = TRUE),
+ sum(getValues(fa2_qt2$value$count), na.rm = TRUE),
+ sum(getValues(fa3_qt2$value$count), na.rm = TRUE),
+ sum(getValues(fa4_qt2$value$count), na.rm = TRUE))
+
+mass_sm <- c(sum(getValues(fa1_sm$value$count), na.rm = TRUE),
+ sum(getValues(fa2_sm$value$count), na.rm = TRUE),
+ sum(getValues(fa3_sm$value$count), na.rm = TRUE),
+ sum(getValues(fa4_sm$value$count), na.rm = TRUE))
+
+masses <- cbind(p_fa, mass_rm, mass_qt1, mass_qt2, mass_sm)
+masses <- cbind(masses, round(masses[, 2:5] - p_fa))
+colnames(masses) <- c("original", "removal", "quad tree I", "quad tree II", "smoothing",
+ "diff_removal", "diff_qt1", "diff_qt2", "diff_sm")
+rownames(masses) <- c("fa1", "fa2", "fa3", "fa4")
+masses
+## original removal quad tree I quad tree II smoothing diff_removal diff_qt1
+## fa1 1745062 1744773 1745062 1745062 1745062 -289 0
+## fa2 558055 557830 558055 558055 558055 -225 0
+## fa3 107863 107175 107863 107863 107863 -688 0
+## fa4 106283 105116 106283 106283 106283 -1167 0
+## diff_qt2 diff_sm
+## fa1 0 0
+## fa2 0 0
+## fa3 0 0
+## fa4 0 0
+We can now calculate risk measures for the focus areas.
+# --- share of cells at risk ---
+# initial
+sens <- c(sensitivity_score(fa1), sensitivity_score(fa2), sensitivity_score(fa3), sensitivity_score(fa4))
+# after SDC
+sens_rm <- c(sensitivity_score(fa1_rm), sensitivity_score(fa2_rm), sensitivity_score(fa3_rm), sensitivity_score(fa4_rm))
+sens_qt1 <- c(sensitivity_score(fa1_qt1), sensitivity_score(fa2_qt1), sensitivity_score(fa3_qt1), sensitivity_score(fa4_qt1))
+sens_qt2 <- c(sensitivity_score(fa1_qt2), sensitivity_score(fa2_qt2), sensitivity_score(fa3_qt2), sensitivity_score(fa4_qt2))
+sens_sm <- c(sensitivity_score(fa1_sm), sensitivity_score(fa2_sm), sensitivity_score(fa3_sm), sensitivity_score(fa4_sm))
+
+risk_cell <- cbind(sens, sens_rm, sens_qt1, sens_qt2, sens_sm)
+colnames(risk_cell) <- c("cell_initial", "cell_removal", "cell_quad tree I", "cell_quad tree II", "cell_smoothing")
+rownames(risk_cell) <- c("fa1", "fa2", "fa3", "fa4")
+round(risk_cell, 3)
+## cell_initial cell_removal cell_quad tree I cell_quad tree II cell_smoothing
+## fa1 0.033 0 0.007 0 0
+## fa2 0.077 0 0.089 0 0
+## fa3 0.186 0 0.142 0 0
+## fa4 0.213 0 0.077 0 0
+# --- share of pop. at risk ---
+# initial
+sensp <- c(sum(fa1$value$count[is_sensitive(fa1)]), sum(fa2$value$count[is_sensitive(fa2)]),
+ sum(fa3$value$count[is_sensitive(fa3)]), sum(fa4$value$count[is_sensitive(fa4)]))
+# after SDC
+sensp_rm <- c(sum(fa1_rm$value$count[is_sensitive(fa1_rm)]), sum(fa2_rm$value$count[is_sensitive(fa2_rm)]),
+ sum(fa3_rm$value$count[is_sensitive(fa3_rm)]), sum(fa4_rm$value$count[is_sensitive(fa4_rm)]))
+sensp_qt1 <- c(sum(fa1_qt1$value$count[is_sensitive(fa1_qt1)]), sum(fa2_qt1$value$count[is_sensitive(fa2_qt1)]),
+ sum(fa3_qt1$value$count[is_sensitive(fa3_qt1)]), sum(fa4_qt1$value$count[is_sensitive(fa4_qt1)]))
+sensp_qt2 <- c(sum(fa1_qt2$value$count[is_sensitive(fa1_qt2)]), sum(fa2_qt2$value$count[is_sensitive(fa2_qt2)]),
+ sum(fa3_qt2$value$count[is_sensitive(fa3_qt2)]), sum(fa4_qt2$value$count[is_sensitive(fa4_qt2)]))
+sensp_sm <- c(sum(fa1_sm$value$count[is_sensitive(fa1_sm)]), sum(fa2_sm$value$count[is_sensitive(fa2_sm)]),
+ sum(fa3_sm$value$count[is_sensitive(fa3_sm)]), sum(fa4_sm$value$count[is_sensitive(fa4_sm)]))
+risk_pop <- cbind(sensp/p_fa, sensp_rm/p_fa, sensp_qt1/p_fa, sensp_qt2/p_fa, sensp_sm/p_fa)
+colnames(risk_pop) <- c("pop_initial", "pop_removal", "pop_quad tree I", "pop_quad tree II", "pop_smoothing")
+rownames(risk_pop) <- c("fa1", "fa2", "fa3", "fa4")
+risk_pop
+## pop_initial pop_removal pop_quad tree I pop_quad tree II pop_smoothing
+## fa1 0.0001656102 0 8.309160e-06 0 0
+## fa2 0.0004031861 0 1.334994e-04 0 0
+## fa3 0.0063784616 0 1.678055e-03 0 0
+## fa4 0.0109801191 0 1.232558e-03 0 0
+Next, we calculate the normalized Hellinger distances.
+In accordance with the definition, mass mismatch does not bother the metric, so we consider only the original focus areas without enlargement around the borders. We therefore include a step where if the focus area was enlarged by a wrapper function, it gets cropped to its original size first.
+getHD <- function(x, orig, value = "count") {
+
+ # resize protected raster if it was enlarged
+ if(extent(x$value) > extent(orig$value)) {
+ x$value <- crop(x$value, extent(orig$value))
+ }
+
+ # extract value raster of interest
+ r_x <- x$value[[value]]
+ r_o <- orig$value[[value]]
+
+ # extract cell values
+ v_x <- raster::getValues(r_x)
+ v_o <- raster::getValues(r_o)
+
+ # set NAs to 0
+ v_x[is.na(v_x)] <- 0
+ v_o[is.na(v_o)] <- 0
+
+ # rescale
+ v_x <- v_x / sum(v_x)
+ v_o <- v_o / sum(v_o)
+
+ HD <- sqrt(sum((sqrt(v_x) - sqrt(v_o))^2)) * (1/sqrt(2))
+ HD
+}
+
+hd_rm <- c(getHD(fa1_rm, fa1), getHD(fa2_rm, fa2), getHD(fa3_rm, fa3), getHD(fa4_rm, fa4))
+hd_qt1 <- c(getHD(fa1_qt1, fa1), getHD(fa2_qt1, fa2), getHD(fa3_qt1, fa3), getHD(fa4_qt1, fa4))
+hd_qt2 <- c(getHD(fa1_qt2, fa1), getHD(fa2_qt2, fa2), getHD(fa3_qt2, fa3), getHD(fa4_qt2, fa4))
+hd_sm <- c(getHD(fa1_sm, fa1), getHD(fa2_sm, fa2), getHD(fa3_sm, fa3), getHD(fa4_sm, fa4))
+
+hd <- cbind(hd_rm, hd_qt1, hd_qt2, hd_sm)
+colnames(hd) <- c("removal", "quad tree I", "quad tree II", "smoothing")
+rownames(hd) <- c("fa1", "fa2", "fa3", "fa4")
+round(hd, 3)
+## removal quad tree I quad tree II smoothing
+## fa1 0.009 0.070 0.083 0.280
+## fa2 0.014 0.114 0.213 0.366
+## fa3 0.057 0.151 0.205 0.451
+## fa4 0.074 0.167 0.226 0.416
+KWD requires that both maps have the same mass and extent. By enlarging the focus areas slightly the mass was kept for all methods except removal (see above).
+In order to account for the slightly larger extent, the ground truth is also slightly enlarged before the comparison. Since all the padding is made up of zero values, it does not distort the measure. This takes care of quadtree and smoothing.
+getKWD <- function(x, orig, value = "count", balance = "none", plot_map = FALSE, ...) {
+
+ # resize original raster to balance mass for qt/sm
+ if(extent(x$value) > extent(orig$value)) {
+ orig$value <- extend(orig$value, extent(x$value))
+ }
+
+ if(balance == "frame") {
+ x$value <- extend(x$value, c(1, 1), value = 0)
+ orig$value <- extend(orig$value, c(1, 1), value = 0)
+ }
+
+ # extract value raster of interest
+ r_x <- x$value[[value]]
+ r_o <- orig$value[[value]]
+
+ # extract cell values
+ v_x <- raster::getValues(r_x)
+ v_o <- raster::getValues(r_o)
+
+ # set NAs to 0
+ v_x[is.na(v_x)] <- 0
+ v_o[is.na(v_o)] <- 0
+
+ xy <- raster::xyFromCell(r_x, 1:ncell(r_x))
+
+ # balance mass for removal
+ if(balance %in% c("pop_ctrd", "frame")) {
+
+ mdiff <- sum(v_o) - sum(v_x) # mass difference must be positive
+
+ if(balance == "frame") {
+ frameclls <- unique(c(raster::cellFromCol(r_x, c(1, ncol(r_x))),
+ raster::cellFromRow(r_x, c(1, nrow(r_x)))))
+ r_x[frameclls] <- 1/(length(frameclls)) * mdiff
+ }
+
+ if(balance == "pop_ctrd") {
+
+ # get population weighted centroid
+ pop_ctrd <- c(stats::weighted.mean(xy[, 1], v_x),
+ stats::weighted.mean(xy[, 2], v_x))
+ mcell <- cellFromXY(r_x, pop_ctrd)
+ r_x[mcell] <- ifelse(is.na(r_x[mcell]), mdiff, r_x[mcell] + mdiff)
+ }
+
+ # extract new re-distributed mass
+ v_x <- raster::getValues(r_x)
+ v_x[is.na(v_x)] <- 0
+ }
+
+ if(plot_map) {
+ plot(r_x, col = pal)
+ if(balance == "frame") {
+ fr <- extent(r_x, 2, ncol(r_x) - 1, 2, nrow(r_x) - 1)
+ } else if(balance == "pop_ctrd") {
+ fr <- c(xFromCell(r_x, mcell) - res(r_x)[1]/2,
+ xFromCell(r_x, mcell) + res(r_x)[1]/2,
+ yFromCell(r_x, mcell) - res(r_x)[2]/2,
+ yFromCell(r_x, mcell) + res(r_x)[2]/2)
+ }
+ if(balance %in% c("frame", "pop_ctrd")) {
+ rect(xleft = fr[1], xright = fr[2], ybottom = fr[3], ytop = fr[4],
+ border = "red")
+ }
+ }
+
+ KWD <- compareOneToOne(Coordinates = xy, Weights = cbind(v_x, v_o), ...)
+
+ KWD$distance
+}
+For removal, we need to make assumptions on how to balance mass. Two versions of manually balancing are considered:
+The missing mass gets added in a frame around the focus area. Therefore, missing mass in the visual center of the map gets punished more.
The missing mass gets added in a single cell at the population-weighted centroid of the map. The intuition behind this is that this point should be a kind of ‘best guess’ as to where it might be located.
# Visualizing the two approaches:
+
+par(mfrow = c(2, 2))
+
+getKWD(fa1_rm, fa1, "count", balance = "frame", plot_map = TRUE)
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## [1] 0.002546098
+getKWD(fa2_rm, fa2, "count", balance = "frame", plot_map = TRUE)
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## [1] 0.00379919
+getKWD(fa3_rm, fa3, "count", balance = "frame", plot_map = TRUE)
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## [1] 0.1285745
+getKWD(fa4_rm, fa4, "count", balance = "frame", plot_map = TRUE)
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## [1] 0.1314477
+getKWD(fa1_rm, fa1, "count", balance = "pop_ctrd", plot_map = TRUE)
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## [1] 0.004185462
+getKWD(fa2_rm, fa2, "count", balance = "pop_ctrd", plot_map = TRUE)
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## [1] 0.006253455
+getKWD(fa3_rm, fa3, "count", balance = "pop_ctrd", plot_map = TRUE)
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## [1] 0.1740762
+getKWD(fa4_rm, fa4, "count", balance = "pop_ctrd", plot_map = TRUE)
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## [1] 0.2377131
+par(mfrow = c(1, 1))
+We compute now the KWD for all methods and focus areas. For removal, both ideas about mass balancing are compared.
+kwd_rm_f <- c(getKWD(fa1_rm, fa1, "count", balance = "frame"),
+ getKWD(fa2_rm, fa2, "count", balance = "frame"),
+ getKWD(fa3_rm, fa3, "count", balance = "frame"),
+ getKWD(fa4_rm, fa4, "count", balance = "frame"))
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+kwd_rm_c <- c(getKWD(fa1_rm, fa1, "count", balance = "pop_ctrd"),
+ getKWD(fa2_rm, fa2, "count", balance = "pop_ctrd"),
+ getKWD(fa3_rm, fa3, "count", balance = "pop_ctrd"),
+ getKWD(fa4_rm, fa4, "count", balance = "pop_ctrd"))
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+kwd_qt1 <- c(getKWD(fa1_qt1, fa1, "count"), getKWD(fa2_qt1, fa2, "count"),
+ getKWD(fa3_qt1, fa3, "count"), getKWD(fa4_qt1, fa4, "count"))
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+kwd_qt2 <- c(getKWD(fa1_qt2, fa1, "count"), getKWD(fa2_qt2, fa2, "count"),
+ getKWD(fa3_qt2, fa3, "count"), getKWD(fa4_qt2, fa4, "count"))
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+kwd_sm <- c(getKWD(fa1_sm, fa1, "count"), getKWD(fa2_sm, fa2, "count"),
+ getKWD(fa3_sm, fa3, "count"), getKWD(fa4_sm, fa4, "count"))
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+kwd <- cbind(kwd_rm_f, kwd_rm_c, kwd_qt1, kwd_qt2, kwd_sm)
+colnames(kwd) <- c("removal (frame)", "removal (centroid)", "quad tree I", "quad tree II", "smoothing")
+rownames(kwd) <- c("fa1", "fa2", "fa3", "fa4")
+round(kwd, 3)
+## removal (frame) removal (centroid) quad tree I quad tree II smoothing
+## fa1 0.003 0.004 0.012 0.018 0.381
+## fa2 0.004 0.006 0.030 0.144 0.493
+## fa3 0.129 0.174 0.053 0.119 0.626
+## fa4 0.131 0.238 0.067 0.150 0.624
+The results we get are now a lot more subtle.
+Removal is not universally worse than other methods (according to KWD), but actually very good in the more densely populated areas, but beaten by quad tree in rural focus areas.
+Some considerations on how to normalize KWD from the focusArea() function. Suppose we have a total count of 400. We move 10 units of mass by exactly one cell.
+wdth <- 11
+m <- matrix(data = 1, nrow = wdth, ncol = wdth)
+
+set.seed(100)
+m[sample(1:(11^2), 50)] <- 6
+m[6, 6] <- m[6, 6] + 29
+
+r_exa <- raster(m, xmn = 0, xmx = wdth, ymn = 0, ymx = wdth)
+
+r_exa2 <- r_exa
+r_exa2[6, 6] <- r_exa2[6, 6] - 10
+r_exa2[6, 7] <- r_exa2[6, 7] + 10
+
+par(mfrow = c(1, 2))
+plot(r_exa, breaks = seq(5, 35, 5), col = terrain.colors(7),
+ main = paste("count total:", sum(getValues(r_exa))))
+plot(r_exa2, breaks = seq(5, 35, 5), col = terrain.colors(7),
+ main = paste("count total:", sum(getValues(r_exa2))))
+par(mfrow = c(1, 1))
+We know the KWD before normalization must be 10, afterwards 0.025.
+xy <- ceiling(raster::xyFromCell(r_exa, 1:ncell(r_exa)))
+
+kwd_test <- compareOneToOne(Coordinates = xy,
+ Weights = cbind(getValues(r_exa), getValues(r_exa2)),
+ method = "exact")$distance
+## CompareOneToOne, Solution method: EXACT
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+kwd_fa_test <- focusArea(Coordinates = xy,
+ Weights = cbind(getValues(r_exa), getValues(r_exa2)),
+ x = 6, y = 6, radius = 3, area = "linf",
+ method = "exact")$distance
+## FocusArea, Solution method: APPROX
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+tst_results <- c(kwd_test, kwd_fa_test, kwd_fa_test / sum(getValues(r_exa2)))
+names(tst_results) <- c("full map", "focus area", "focus area / sum(full map)")
+tst_results
+## full map focus area
+## 0.025 10.000
+## focus area / sum(full map)
+## 0.025
+Suppose now we select our focus area from a smaller super area (with fewer total counts). In order to get to the same results we need to calibrate.
+r_exa_new <- crop(r_exa, extent(r_exa, 2, wdth - 1, 2, wdth - 1))
+r_exa2_new <- crop(r_exa2, extent(r_exa2, 2, wdth - 1, 2, wdth - 1))
+
+fa_factor <- sum(getValues(r_exa)) / sum(getValues(r_exa_new))
+
+xy <- ceiling(raster::xyFromCell(r_exa_new, 1:ncell(r_exa_new)))
+
+kwd_test <- compareOneToOne(Coordinates = xy,
+ Weights = cbind(getValues(r_exa_new), getValues(r_exa2_new)),
+ method = "exact")$distance
+## CompareOneToOne, Solution method: EXACT
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+kwd_fa_test <- focusArea(Coordinates = xy,
+ Weights = cbind(getValues(r_exa_new), getValues(r_exa2_new)),
+ x = 5, y = 5, radius = 3, area = "linf",
+ method = "exact")$distance
+## FocusArea, Solution method: APPROX
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+tst_results <- c(kwd_test, kwd_test / fa_factor,
+ kwd_fa_test / sum(r_exa2_new[(5-3):(5+3), (5-3):(5+3)]),
+ (kwd_fa_test / sum(getValues(r_exa2_new)) / fa_factor))
+names(tst_results) <- c("cropped area", "cropped area (calib.)", "focus area", "focus area (calib.)")
+tst_results
+## cropped area cropped area (calib.) focus area
+## 0.03571429 0.02500000 0.05617978
+## focus area (calib.)
+## 0.02500000
+