diff --git a/01_read_data_DE.R b/01_read_data_DE.R new file mode 100644 index 0000000..f88df2c --- /dev/null +++ b/01_read_data_DE.R @@ -0,0 +1,119 @@ +###### Prepare German pop. data for sdcSpatial ##### + +# ----- load libraries ----- + +library(sf) # handling shape data +library(data.table) # fast import and data manipulation +library(dplyr) # convenience +library(ggplot2) # (optional) visualization + + +# ----- get required data ----- + +data_folder <- file.path(getwd(), "sdcSpatialExperiment_DE_ext") # customize if needed + +web_pop <- paste0("https://www.zensus2011.de/SharedDocs/Downloads/DE/Pressemitteilung/", + "DemografischeGrunddaten/csv_Bevoelkerung_100m_Gitter.zip", + "?__blob=publicationFile&v=2") +web_shp <- paste0("https://daten.gdz.bkg.bund.de/produkte/vg/vg250_ebenen_0101/aktuell/", + "vg250_01-01.utm32s.shape.ebenen.zip") +web_100km <- paste0("https://daten.gdz.bkg.bund.de/produkte/sonstige/geogitter/aktuell/", + "DE_Grid_ETRS89-LAEA_100km.gpkg.zip") +web_100m <- paste0("https://daten.gdz.bkg.bund.de/produkte/sonstige/geogitter/aktuell/", + "DE_Grid_ETRS89-LAEA_100m.csv.zip") + +# download files (may need to raise timeout in R options for the larger files) +download.file(web_pop, paste0(data_folder, "/csv_Bevoelkerung_100m_Gitter.zip"), + mode = "wb") # ~ 105 MB +download.file(web_shp, paste0(data_folder, "/vg250_01-01.utm32s.shape.ebenen.zip"), + mode = "wb") # ~ 66 MB +download.file(web_100km, paste0(data_folder, "/DE_Grid_ETRS89-LAEA_100km.gpkg.zip"), + mode = "wb") # ~ 1 MB +download.file(web_100m, paste0(data_folder, "/DE_Grid_ETRS89-LAEA_100m.zip"), + mode = "wb") # ~ 291 MB + +# unzip files +unzip(paste0(data_folder, "/csv_Bevoelkerung_100m_Gitter.zip"), exdir = data_folder) +unzip(paste0(data_folder, "/vg250_01-01.utm32s.shape.ebenen.zip"), exdir = data_folder) +unzip(paste0(data_folder, "/DE_Grid_ETRS89-LAEA_100km.gpkg.zip"), exdir = data_folder) +unzip(paste0(data_folder, "/DE_Grid_ETRS89-LAEA_100m.zip"), exdir = data_folder) + +path_grid_100m <- "DE_Grid_ETRS89-LAEA_100m/geogitter" +shp_grid_100km <- "DE_Grid_ETRS89-LAEA_100km.gpkg/geogitter/DE_Grid_ETRS89-LAEA_100km.gpkg" +shp_admi <- "vg250_01-01.utm32s.shape.ebenen/vg250_ebenen_0101/VG250_LAN.shp" +data_pop <- "Zensus_Bevoelkerung_100m-Gitter.csv" + + +# ----- select data ----- + +# The 100m x 100m grid is delivered in 100km x 100km chunks for the sake of file size. +# We start with the large grid in order to decide which chunks to load. + +# read large grid +grid_100km <- st_read(file.path(data_folder, shp_grid_100km)) + +# In order to do the analysis for a single federal state, load state borders +admi_data <- st_read(file.path(data_folder, shp_admi)) %>% + filter(GF == 4) %>% # use only land area + st_transform(st_crs(grid_100km)) # UTM to LAEA + +# select a federal state to work with +admi_data$GEN +(fedstate <- admi_data$GEN[1]) +fedstate_ags <- admi_data[admi_data$GEN == fedstate, ]$AGS # state identifier + +# which 100km x 100km must be loaded for that state? +grid_load <- grid_100km[st_intersects(admi_data[admi_data$GEN == fedstate, ], + grid_100km)[[1]], ]$id + +# (optional) visualize the selected chunks +ggplot() + + geom_sf(data = grid_100km) + + geom_sf(data = admi_data, fill = NA) + + geom_sf(data = admi_data[admi_data$AGS == fedstate_ags, ], fill = "cyan") + + geom_sf(data = grid_100km[grid_100km$id %in% grid_load, ], + fill = NA, color = "red") + + +# ----- read data ----- + +# read in selected chunks +grd <- vector("list", length = length(grid_load)) +for(i in seq(grid_load)) { + grd[[i]] <- fread(file.path(data_folder, path_grid_100m, + paste0(grid_load[i], "_DE_Grid_ETRS89-LAEA_100m.csv")), + colClasses = c("character", rep("numeric", 10), "character")) +} + +# combining selected chunks +grid_100m <- dplyr::bind_rows(grd)[, c(1:5, 12)] +names(grid_100m) <- c("cell_id", "x_sw", "y_sw", "x_mp", "y_mp", "ags") + +# get rid of overlaps with other fed. states +grid_100m <- filter(grid_100m, substr(ags, 1, 2) == fedstate_ags) + + +# ----- add pop. numbers ----- + +# read in population data +pop <- fread(file.path(data_folder, data_pop)) %>% + dplyr::select(Gitter_ID_100m, Einwohner) %>% + dplyr::filter(Einwohner > 0) + +# merge population to grid cells +pop_100m <- merge(grid_100m, pop, by.x = "cell_id", by.y = "Gitter_ID_100m") + +sum(pop_100m$Einwohner) # inhabitants of selected state (w/o suppressed cells) + +# expand to record-level data +pop_100m <- pop_100m[, .(1:Einwohner, x_mp, y_mp), by = .(cell_id)] %>% + rename(person_id_cell = V1) +pop_100m$person_id <- 1:nrow(pop_100m) + + +# ----- save data ----- + +save(pop_100m, grid_100m, file = paste0(data_folder, "/pop_data_DE.RData"), compress = TRUE) + +rm(list = ls()) + diff --git a/02_sdcSpatial_DE.R b/02_sdcSpatial_DE.R new file mode 100644 index 0000000..8204e5c --- /dev/null +++ b/02_sdcSpatial_DE.R @@ -0,0 +1,470 @@ +##### Apply sdcSpatial on 100m grid counts tables [DE] ##### +# +# Script does the following: +# (1) load required tools + pre-computed pop. and grid data [see: 01_read_data_DE.R] +# (2) create + inspect sdc_raster object from pop. data +# (3) apply protection algorithms (removal, quadtree, kernel smoothing) +# (4) inspect protected raster maps, check & compare various utility measures +# (5) illustrate two aspects of KWD as info loss measure + + +# ----- (1a) load libraries ----- + +library(raster) # grid data handling +library(sdcSpatial) # map protection algorithms +library(spatstat) # geospatial functions +library(SpatialKWD) # fast KWD approximation +library(ggplot2) # plotting +library(viridis) # plotting +library(dplyr) # convenience +library(tidyr) # convenience + + +# ----- (1b) load data ----- + +# customize to local setting +path_data <- "sdcSpatialExperiment_DE_ext" + +load(file.path(path_data, "pop_data_DE.RData")) + +head(pop_100m) +## data.table is one row per person ############################## ### +# cell_id: INSPIRE name of grid cell +# person_id_cell: unique ID for person within a cell +# x_mp: x-coordinate of person (cell centroid) +# y_mp: y-coordinate of person (cell centroid) +# person_id: unique person ID +# [all coordinates in ETRS89-LAEA] + +head(grid_100m) +## data.table is one row per grid cell (incl. uninhabited) ####### ### +# cell_id: INSPIRE name of grid cell +# x_sw: x-coordinate of southwest-corner +# y_sw: y-coordinate of southwest-corner +# x_mp: x-coordinate of centroid +# y_mp: y-coordinate of centroid +# ags: municipality identifier +# [all coordinates in ETRS89-LAEA] + + +# ----- (2a) set up sdc_raster ----- + +# prepare input raster (incl. uninhabited) +r <- raster(xmn = min(grid_100m$x_sw), xmx = max(grid_100m$x_sw + 100), + ymn = min(grid_100m$y_sw), ymx = max(grid_100m$y_sw + 100), + res = 100, + crs = CRS("+init=epsg:3035")) + +# to sdc_raster +pop100m_raster <- sdc_raster(as.matrix(pop_100m[, c("x_mp", "y_mp")]), + variable = 1, + r = r, + min_count = 10, # require 10-anonymity + max_risk = 1.0) # no magnitude / freq. criterion + +pal <- rev(viridis(10)) # color palette for plotting +plot(pop100m_raster, value = "count", col = pal) + + +# ----- (2b) inspect sdc_raster ----- + +# subset sensitive cells +raster_sens <- is_sensitive(pop100m_raster) + +# no. of sensitive cells +(ncells_sens <- sum(raster_sens@data@values, na.rm = TRUE)) +# share of sensitive cells (in all inhabited) +sensitivity_score(pop100m_raster) +# verify: +ncells_sens / length(unique(pop_100m$cell_id)) + +# How many individuals are at risk? +sens_vals <- pop100m_raster$value$count[raster_sens] +sum(sens_vals) # no. of indiv. at risk +sum(sens_vals) / nrow(pop_100m) # share of indiv. at risk + +# How are cell counts < k distributed? +ggplot(data = data.frame(sens_vals), aes(sens_vals)) + + geom_histogram(binwidth = 1, color = "black", fill = "lightgrey") + + scale_x_continuous(n.breaks = 7) + + xlab("no. inhabitants in sensitive cells") + + +# ----- (3) apply map protection algorithms ----- + +# simply remove sensitive cells +pop100m_raster_rm <- remove_sensitive(pop100m_raster) + +# quadtree method +pop100m_raster_qt1 <- protect_quadtree(pop100m_raster, max_zoom = 2) +pop100m_raster_qt2 <- protect_quadtree(pop100m_raster, max_zoom = 3) + +# kernel smoothing (~ 12min) +pop100m_raster_sm <- protect_smooth(pop100m_raster, bw = 100) + + +sensitivity_score(pop100m_raster_rm) +sensitivity_score(pop100m_raster_qt1) +sensitivity_score(pop100m_raster_qt2) +sensitivity_score(pop100m_raster_sm) + +# remove remaining sensitive cells (if any) +pop100m_raster_qt1 <- remove_sensitive(pop100m_raster_qt1) +pop100m_raster_qt2 <- remove_sensitive(pop100m_raster_qt2) +pop100m_raster_sm <- remove_sensitive(pop100m_raster_sm) + + +# ----- (4a) inspect protected rasters ----- + +# visual comparison +plot(pop100m_raster_rm, "count", col = pal, sensitive = FALSE) +plot(pop100m_raster_qt1, "count", col = pal, sensitive = FALSE) +plot(pop100m_raster_qt2, "count", col = pal, sensitive = FALSE) +plot(pop100m_raster_sm, "count", col = pal, sensitive = FALSE) + + +# What's the cell-wise abs. difference between protected vs. unprotected? + +difference_rasters <- function(x, y) { + + # function is only needed when rasters contain NAs + na <- is.na(x) & is.na(y) + x[is.na(x)] <- 0 + y[is.na(y)] <- 0 + z <- x - y + z[na] <- NA + z +} + +# calculate differences +pop100m_raster_rm$value$diff <- difference_rasters(pop100m_raster_rm$value$count, + pop100m_raster$value$count) +pop100m_raster_qt1$value$diff <- difference_rasters(pop100m_raster_qt1$value$count, + pop100m_raster$value$count) +pop100m_raster_qt2$value$diff <- difference_rasters(pop100m_raster_qt2$value$count, + pop100m_raster$value$count) +pop100m_raster_sm$value$diff <- difference_rasters(pop100m_raster_sm$value$count, + pop100m_raster$value$count) + +# plot differences +plot(pop100m_raster_rm$value$diff, col = pal, main = "removal") +plot(pop100m_raster_qt1$value$diff, col = pal, main = "quadtree zoom 2") +plot(pop100m_raster_qt2$value$diff, col = pal, main = "quadtree zoom 3") +plot(pop100m_raster_sm$value$diff, col = pal, main = "kernel smoothing") + +# How much have the values of sensitive cells been changed? +hist(pop100m_raster_rm$value$diff[raster_sens], main = "removal") +hist(pop100m_raster_qt1$value$diff[raster_sens], main = "quadtree zoom 2") +hist(pop100m_raster_qt2$value$diff[raster_sens], main = "quadtree zoom 3") +hist(pop100m_raster_sm$value$diff[raster_sens], main = "kernel smoothing") + +# How many sensitive cells are unchanged? +sum(pop100m_raster_rm$value$diff[raster_sens] == 0) +sum(pop100m_raster_qt1$value$diff[raster_sens] == 0) +sum(pop100m_raster_qt2$value$diff[raster_sens] == 0) +sum(pop100m_raster_sm$value$diff[raster_sens] == 0) +# What share of sensitive cells is unchanged? +sum(pop100m_raster_rm$value$diff[raster_sens] == 0) / ncells_sens +sum(pop100m_raster_qt1$value$diff[raster_sens] == 0) / ncells_sens +sum(pop100m_raster_qt2$value$diff[raster_sens] == 0) / ncells_sens +sum(pop100m_raster_sm$value$diff[raster_sens] == 0) / ncells_sens + + +# Which cells were NA in orig. and now aren't? + +pop100m_raster_rm$value$outside <- + is.na(pop100m_raster$value$count) & !is.na(pop100m_raster_rm$value$count) +pop100m_raster_qt1$value$outside <- + is.na(pop100m_raster$value$count) & !is.na(pop100m_raster_qt1$value$count) +pop100m_raster_qt2$value$outside <- + is.na(pop100m_raster$value$count) & !is.na(pop100m_raster_qt2$value$count) +pop100m_raster_sm$value$outside <- + is.na(pop100m_raster$value$count) & !is.na(pop100m_raster_sm$value$count) + +sum(getValues(pop100m_raster_rm$value$outside)) # removal +sum(getValues(pop100m_raster_qt1$value$outside)) # quadtree zoom 2 +sum(getValues(pop100m_raster_qt2$value$outside)) # quadtree zoom 3 +sum(getValues(pop100m_raster_sm$value$outside)) # kernel smoothing + + +# ----- (4b) utility measures - general ----- + +###### fun: calculate utility values for protected maps ##### ### +# x: sdc_raster object - after map protection +# orig: sdc_raster object - original before protection +# value: value to compare ("count", "sum", ...) +# measure: one or several of the following +# - bin-by-bin: - +# RMSE: Root Mean Squared Error +# HD: Hellinger's Distance +# - cross-bin: - +# Moran: change in (global) Moran's I +# KWD: Kantorovic-Wasserstein Distance +# ... : additional parameters for SpatialKWD::compareOneToOne() + +get_utility <- function(x, orig, + value = "count", + measure = c("RMSE", "HD", "Moran", "KWD"), + ...) { + + u <- vector("numeric", length = length(measure)) + names(u) <- measure + + # extract value raster of interest + r_x <- x$value[[value]] + r_o <- orig$value[[value]] + + # extract cell values + v_x <- raster::getValues(r_x) + v_o <- raster::getValues(r_o) + + # set NAs to 0 + v_x[is.na(v_x)] <- 0 + v_o[is.na(v_o)] <- 0 + + for (i in seq(u)) { + + if (measure[i] == "RMSE") { + + # compute root mean squared error + u[i] <- sqrt(mean((v_x - v_o)^2)) + } + + if (measure[i] == "HD") { + + # compute Hellinger's distance + u[i] <- sqrt(sum((sqrt(v_o) - sqrt(v_x))^2)) * (1/(sqrt(2))) + } + + if (measure[i] == "Moran") { + + # change in (global) Moran's I + u[i] <- Moran(r_x) - Moran(r_o) + } + + if (measure[i] == "KWD") { + + # rescale to balance mass + v_x_sc <- v_x * (sum(v_o) / sum(v_x)) + + # approximate Kantorovic-Wasserstein distance + xy <- raster::xyFromCell(r_x, 1:raster::ncell(r_x)) + u[i] <- SpatialKWD::compareOneToOne(Coordinates = xy, + Weights = cbind(v_o, v_x_sc), + ...)$distance + } + } + + u +} + + +# assess protected maps with utility measures + +u_measures <- c("RMSE", "HD") + +u_rm <- get_utility(pop100m_raster_rm, + orig = pop100m_raster, value = "count", + measure = u_measures) +u_qt1 <- get_utility(pop100m_raster_qt1, + orig = pop100m_raster, value = "count", + measure = u_measures) +u_qt2 <- get_utility(pop100m_raster_qt2, + orig = pop100m_raster, value = "count", + measure = u_measures) +u_sm <- get_utility(pop100m_raster_sm, + orig = pop100m_raster, value = "count", + measure = u_measures) + +results <- as.data.frame(rbind(u_rm, u_qt1, u_qt2, u_sm)) +rownames(results) <- c("removal", "qt_zoom2", "qt_zoom3", "smoothing") +results + + +# ----- (4c) utility measures - sub-area (incl. KWD) ----- + +# Even with the fast approximation algorithm, computing KWD for a large map with +# small cell size can take a long time! +# Therefore we only demonstrate on a smaller sub-area + +# select a rectangular sub-area as bounding box of regional ID +ags_sub <- "01002000" # city of Kiel +grid_sub <- grid_100m[grid_100m$ags == ags_sub, ] +ext <- extent(min(grid_sub$x_sw), max(grid_sub$x_sw) + 100, + min(grid_sub$y_sw), max(grid_sub$y_sw) + 100) + +# crop rasters to sub-area +sub_raster <- pop100m_raster +sub_raster_rm <- pop100m_raster_rm +sub_raster_qt1 <- pop100m_raster_qt1 +sub_raster_qt2 <- pop100m_raster_qt2 +sub_raster_sm <- pop100m_raster_sm +sub_raster$value <- crop(sub_raster$value, ext) +sub_raster_rm$value <- crop(sub_raster_rm$value, ext) +sub_raster_qt1$value <- crop(sub_raster_qt1$value, ext) +sub_raster_qt2$value <- crop(sub_raster_qt2$value, ext) +sub_raster_sm$value <- crop(sub_raster_sm$value, ext) + +# visually inspect sub-area +par(mfrow = c(2, 3)) +plot(sub_raster, "count", col = pal, sensitive = FALSE, main = "unprotected") +plot(sub_raster_rm, "count", col = pal, sensitive = FALSE, main = "removal") +plot(sub_raster_qt1, "count", col = pal, sensitive = FALSE, main = "quadtree zoom 2") +plot(sub_raster_qt2, "count", col = pal, sensitive = FALSE, main = "quadtree zoom 3") +plot(sub_raster_sm, "count", col = pal, sensitive = FALSE, main = "kernel smoothing (100m bw)") +par(mfrow = c(1, 1)) + +# assess sub-area with full set of utility measures + +u_sub_rm <- get_utility(sub_raster_rm, orig = sub_raster, value = "count") +u_sub_qt1 <- get_utility(sub_raster_qt1, orig = sub_raster, value = "count") +u_sub_qt2 <- get_utility(sub_raster_qt2, orig = sub_raster, value = "count") +u_sub_sm <- get_utility(sub_raster_sm, orig = sub_raster, value = "count") + +results_sub <- as.data.frame(rbind(u_sub_rm, u_sub_qt1, u_sub_qt2, u_sub_sm)) +rownames(results_sub) <- c("removal", "qt_zoom2", "qt_zoom3", "smoothing") +results_sub + +# inspect results +ggplot(results_sub %>% mutate(method = rownames(results_sub))) + + geom_point(aes(HD, KWD, color = method)) + + +# ----- (5a) KWD illustration 1 ----- + +# KWD can yield a different rank-order of protection mechanisms than bin-by-bin +# utility metrics, if the mechanism shifts distribution mass without concern for +# geographic neighborhood (e.g. removal, random swapping). + +# If, however, the protection mechanism respects geographic neighborhood by design +# (quadtree, smoothing, small random perturbation), then bin-by-bin metrics and +# KWD give the same answer (same rank order w.r.t. utility). + +data("dwellings") # pseudo household locations (from sdcSpatial) +dw_or <- sdc_raster(dwellings[, c("x", "y")], variable = 1) + +# do both methods with increasingly strong parameters +dw_qt1 <- protect_quadtree(dw_or, max_zoom = 2) +dw_qt2 <- protect_quadtree(dw_or, max_zoom = 3) +dw_qt3 <- protect_quadtree(dw_or, max_zoom = 4) +dw_sm1 <- protect_smooth(dw_or, bw = 200) +dw_sm2 <- protect_smooth(dw_or, bw = 300) +dw_sm3 <- protect_smooth(dw_or, bw = 400) + +# get utility +u_qt1 <- round(get_utility(dw_qt1, dw_or, value = "count"), 3) +u_qt2 <- round(get_utility(dw_qt2, dw_or, value = "count"), 3) +u_qt3 <- round(get_utility(dw_qt3, dw_or, value = "count"), 3) +u_sm1 <- round(get_utility(dw_sm1, dw_or, value = "count"), 3) +u_sm2 <- round(get_utility(dw_sm2, dw_or, value = "count"), 3) +u_sm3 <- round(get_utility(dw_sm3, dw_or, value = "count"), 3) + +(results_dw <- as.data.frame(rbind(u_qt1, u_qt2, u_qt3, u_sm1, u_sm2, u_sm3))) + +# plot results +ggplot(results_dw, aes(RMSE, KWD)) + + geom_point() + + geom_text(aes(label = rownames(results_dw), x = RMSE + 0.5, y = KWD + 0.03)) + + geom_abline(intercept = coef(lm(KWD ~ RMSE, data = results_dw))[1], + slope = coef(lm(KWD ~ RMSE, data = results_dw))[2], + color = "blue", lty = "dashed") + +# inspect visual impressions +par(mfrow = c(2, 3)) +plot(dw_qt1, "count", sensitive = FALSE, col = pal, + main = paste("KWD", u_qt1[4], " RMSE", u_qt1[1])) +plot(dw_qt2, "count", sensitive = FALSE, col = pal, + main = paste("KWD", u_qt2[4], " RMSE", u_qt2[1])) +plot(dw_qt3, "count", sensitive = FALSE, col = pal, + main = paste("KWD", u_qt3[4], " RMSE", u_qt3[1])) +plot(dw_sm1, "count", sensitive = FALSE, col = pal, + main = paste("KWD", u_sm1[4], " RMSE", u_sm1[1])) +plot(dw_sm2, "count", sensitive = FALSE, col = pal, + main = paste("KWD", u_sm2[4], " RMSE", u_sm2[1])) +plot(dw_sm3, "count", sensitive = FALSE, col = pal, + main = paste("KWD", u_sm3[4], " RMSE", u_sm3[1])) +par(mfrow = c(1, 1)) + + +# ----- (5b) KWD illustration 2 ----- + +# KWD seems to give preference to the quadtree method over smoothing; +# this does not necessarily correspond to human visual impression + +# make small toy data set +set.seed(3000) +pts_exa <- as.data.frame(spatstat.random::rMatClust(20, 0.1, 100)) +pts_exa <- pts_exa * 1000 + +test_raster <- sdc_raster(pts_exa, variable = 1, r = 50, + min_count = 5, max_risk = 1.0) + +## compare: quadtree vs. smoothing + +test_raster_qt <- protect_quadtree(test_raster) +test_raster_sm <- protect_smooth(test_raster, bw = 50) + +u_test_qt <- round(get_utility(test_raster_qt, orig = test_raster, + value = "count", measure = c("RMSE", "KWD")), 4) +u_test_sm <- round(get_utility(test_raster_sm, orig = test_raster, + value = "count", measure = c("RMSE", "KWD")), 4) + +par(mfrow = c(1, 3)) +plot(test_raster, "count", sensitive = FALSE, + main = "original") +plot(test_raster_qt, "count", sensitive = FALSE, + main = "quadtree", + sub = paste("KWD:", u_test_qt[2], " RMSE:", u_test_qt[1])) +plot(test_raster_sm, "count", sensitive = FALSE, + main = "smoothing", + sub = paste("KWD:", u_test_sm[2], " RMSE:", u_test_sm[1])) +par(mfrow = c(1, 1)) + +# The KWD is, in principle, well suited to compare 2 raster maps, but does it +# capture 'similarity of visual impression'? Some doubt is in order. +# KWD seems to penalize small distribution mass, transported over medium distances, +# more than the eyeball. + +# -> Idea: remove small distribution mass; compare only 'hot spots' + + +# remove the lower [tsh]% of distribution mass +tsh <- 0.2 +hs <- tsh * max(getValues(test_raster$value$count), na.rm = TRUE) +hs_qt <- tsh * max(getValues(test_raster_qt$value$count), na.rm = TRUE) +hs_sm <- tsh * max(getValues(test_raster_sm$value$count), na.rm = TRUE) + +# raster layer of 'hot spots' - original +hsr <- test_raster$value$count +hsr@data@values <- ifelse(hsr@data@values > hs, hsr@data@values, NA) +test_raster$value$hs <- hsr +# raster layer of 'hot spots' - quadtree +hsr <- test_raster_qt$value$count +hsr@data@values <- ifelse(hsr@data@values > hs_qt, hsr@data@values, NA) +test_raster_qt$value$hs <- hsr +# raster layer of 'hot spots' - smoothing +hsr <- test_raster_sm$value$count +hsr@data@values <- ifelse(hsr@data@values > hs_sm, hsr@data@values, NA) +test_raster_sm$value$hs <- hsr + +# re-compute KWD & RMSE for 'hot spots' only +u_test_qt_hs <- round(get_utility(test_raster_qt, orig = test_raster, + value = "hs", measure = c("RMSE", "KWD")), 4) +u_test_sm_hs <- round(get_utility(test_raster_sm, orig = test_raster, + value = "hs", measure = c("RMSE", "KWD")), 4) + +par(mfrow = c(1, 3)) +plot(test_raster, "hs", sensitive = FALSE, + main = "original") +plot(test_raster_qt, "hs", sensitive = FALSE, + main = "quadtree", + sub = paste("KWD:", u_test_qt_hs[2], " RMSE:", u_test_qt[1])) +plot(test_raster_sm, "hs", sensitive = FALSE, + main = "smoothing", + sub = paste("KWD:", u_test_sm_hs[2], " RMSE:", u_test_sm[1])) +par(mfrow = c(1, 1)) + +# KWD for 'hot spot'-maps, rather than full maps, seems to be more in line with +# 'visual experience'. How to finally assess the map? +# Maybe some (weighted) average of KWD for full map vs. for 'hot spot' map? + diff --git a/Example_spatialKWD_focusAreas.Rmd b/Example_spatialKWD_focusAreas.Rmd new file mode 100644 index 0000000..444a9cc --- /dev/null +++ b/Example_spatialKWD_focusAreas.Rmd @@ -0,0 +1,237 @@ +--- +title: "Reproducible example of current open issues with the SpatialKWD package" +output: + html_document: + df_print: paged + toc: true +--- + +2023-08-03 + +## 1. Preparing example data + +```{r} +library(SpatialKWD) +library(sp) +library(raster) +library(sdcSpatial) +``` + +The issues are illustrated by means of an example data set from the sdcSpatial package called +'dwellings'. The first step is to create the unprotected raster map, which forms the +ground truth. + +The example uses a map of counts (no. of units per grid cell), with 200m $\times$ +200m grid cells in a 62 $\times$ 61 cells raster map. + +```{r} +data("dwellings") + +dw <- sdcSpatial::sdc_raster(dwellings[, c("x", "y")], + variable = 1, r = 200, + max_risk = .8, min_count = 10) + +ras_orig <- dw$value$count # original raster + +raster::plot(ras_orig, main = "no. of dwellings") # map +ras_orig # raster info +``` + +Next, we create the altered raster maps using three protection methods: +1) suppression of sensitive cells (remove_sensitive), +2) quadtree method (protect_quadtree), +3) kernel smoothing (protect_smooth). + +```{r} +# apply protection methods +dw_rm <- remove_sensitive(dw) +dw_qt <- protect_quadtree(dw, max_zoom = 2) +dw_sm <- protect_smooth(dw, bw = 200) + +# extract protected rasters from sdcSpatial object +ras_rm <- dw_rm$value$count +ras_qt <- dw_qt$value$count +ras_sm <- dw_sm$value$count +``` + +Following up, we select a square focus area. It is identified by its centroid and +radius. + +```{r} +# Focus area (square) + +fa_ctrd <- c(155100, 464100) # center of focus area +fa_rads <- 3000 # radius of focus area + + +# maps + +par(mfrow = c(2, 2)) + +plot(ras_orig, main = "original") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") +plot(ras_rm, main = "cell suppression") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") +plot(ras_qt, main = "quad tree method") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") +plot(ras_sm, main = "smoothing method") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") + +par(mfrow = c(1, 1)) +``` + + +## 2. FocusArea sometimes returns zero for clear differences + +We continue by evaluating the difference between altered maps and ground truth with +respect to the chosen focus area. + +This involves the following steps: +1) extract the center coordinates of tiles (raster::xyFromCell), +2) extract the tile-values (weights) (raster::getValues) [where NA-values are +treated as zeros]. + +```{r} +# extract coordinates from raster +xy <- raster::xyFromCell(ras_orig, 1:ncell(ras_orig)) + +# extract cell values from rasters +w_orig <- raster::getValues(ras_orig) +w_rm <- raster::getValues(ras_rm) +w_qt <- raster::getValues(ras_qt) +w_sm <- raster::getValues(ras_sm) + +# treat NA-nodes as having zero weight +w_orig[is.na(w_orig)] <- 0 +w_rm[is.na(w_rm)] <- 0 +w_qt[is.na(w_qt)] <- 0 +w_sm[is.na(w_sm)] <- 0 +``` + +Using these inputs we calculate the first set of KWDs. + +```{r} +kwd_rm <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_rm), + L = 3, recode = TRUE, area = "linf", + x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads) +kwd_qt <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_qt), + L = 3, recode = TRUE, area = "linf", + x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads) +kwd_sm <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_sm), + L = 3, recode = TRUE, area = "linf", + x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads) +``` + +Even though the suppression map is clearly different (all cells with < 10 units are +removed, see maps above), we get a KWD of zero. + +```{r} +c(kwd_rm$distance, kwd_qt$distance, kwd_sm$distance) +``` + +Second try: The description of the focusArea function tells us that "All the weights outside +the focus area should be equal to zero." We create new rasters, where all tiles outside +the focus area are assigned zero values. + +```{r} +ex_fa <- extent(fa_ctrd[1] - (fa_rads - 100), fa_ctrd[1] + (fa_rads - 100), + fa_ctrd[2] - (fa_rads - 100), fa_ctrd[2] + (fa_rads - 100)) +clls_fa <- cellsFromExtent(ras_orig, ex_fa) + +ras_rm_fa <- ras_rm +ras_qt_fa <- ras_qt +ras_sm_fa <- ras_sm +ras_rm_fa[-clls_fa] <- 0 +ras_qt_fa[-clls_fa] <- 0 +ras_sm_fa[-clls_fa] <- 0 + + +par(mfrow = c(2, 2)) + +plot(ras_orig, main = "original") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") +plot(ras_rm_fa, main = "cell suppression (focus area)") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") +plot(ras_qt_fa, main = "quad tree method (focus area)") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") +plot(ras_sm_fa, main = "smoothing method (focus area)") +rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads, + ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red") + +par(mfrow = c(1, 1)) +``` + +With these we re-calculate the KWDs. + +```{r} +# updated weights (zero outside focus area) +w_rm_fa <- getValues(ras_rm_fa) +w_qt_fa <- getValues(ras_qt_fa) +w_sm_fa <- getValues(ras_sm_fa) +w_rm_fa[is.na(w_rm_fa)] <- 0 +w_qt_fa[is.na(w_qt_fa)] <- 0 +w_sm_fa[is.na(w_sm_fa)] <- 0 + +kwd_rm_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_rm_fa), + L = 3, recode = TRUE, area = "linf", + x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads) +kwd_qt_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_qt_fa), + L = 3, recode = TRUE, area = "linf", + x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads) +kwd_sm_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_sm_fa), + L = 3, recode = TRUE, area = "linf", + x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads) +``` + +We still get a KWD of zero for the 'removal' method. Furthermore, we now also get zero +for smoothing. + +```{r} +c(kwd_rm_fa$distance, kwd_qt_fa$distance, kwd_sm_fa$distance) +``` + + +## 3. How to apply a fixed cost to missing units of mass? + +Suppose now we look at the whole map rather than focus areas. Then, we might get +a mass mismatch, especially with removal: + +```{r} +c(sum(w_orig), sum(w_rm)) +``` + +There are ca. 2000 units missing after removal. We want to try solving the problem +by assigning each a cost of half the map diagonal. + +```{r} +# compute length of the diagonal in cells +diag_cells <- sqrt(ncol(ras_orig)^2 + nrow(ras_orig)^2) +# ... in meters +diag_metrs <- diag_cells * 200 + +# solve with extra artificial bin (?) +kwd_rm_bal <- compareOneToOne(Coordinates = xy, Weights = cbind(w_orig, w_rm), + L = 3, recode = TRUE, + unbalanced = TRUE, unbal_cost = diag_metrs/2) +``` + +Again, this yields a distance of zero. + +```{r} +kwd_rm_bal$distance +``` + +Do we get a distance of zero, because all non-zero tiles after removal match exactly? +(I.e. for any given tile the mass is missing completely or not at all.) + +Given that this is the case, we could, in principle, assign the costs manually +(2000 $\times c$ ). +Still, intuitively the function should not return zero (?) + diff --git a/Example_spatialKWD_focusAreas.html b/Example_spatialKWD_focusAreas.html new file mode 100644 index 0000000..9442cfd --- /dev/null +++ b/Example_spatialKWD_focusAreas.html @@ -0,0 +1,440 @@ + + + + + + + + + + + + + +Reproducible example of current open issues with the SpatialKWD package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + +
+ +
+ +

2023-08-03

+
+

1. Preparing example data

+
library(SpatialKWD)
+library(sp)
+library(raster)
+library(sdcSpatial)
+

The issues are illustrated by means of an example data set from the sdcSpatial package called ‘dwellings’. The first step is to create the unprotected raster map, which forms the ground truth.

+

The example uses a map of counts (no. of units per grid cell), with 200m \(\times\) 200m grid cells in a 62 \(\times\) 61 cells raster map.

+
data("dwellings")
+
+dw <- sdcSpatial::sdc_raster(dwellings[, c("x", "y")], 
+                             variable = 1, r = 200,
+                             max_risk = .8, min_count = 10)
+
+ras_orig <- dw$value$count # original raster
+
+raster::plot(ras_orig, main = "no. of dwellings") # map
+

+
ras_orig # raster info
+
## class      : RasterLayer 
+## dimensions : 62, 61, 3782  (nrow, ncol, ncell)
+## resolution : 200, 200  (x, y)
+## extent     : 149400, 161600, 457800, 470200  (xmin, xmax, ymin, ymax)
+## crs        : NA 
+## source     : memory
+## names      : count 
+## values     : 1, 329  (min, max)
+

Next, we create the altered raster maps using three protection methods: 1) suppression of sensitive cells (remove_sensitive), 2) quadtree method (protect_quadtree), 3) kernel smoothing (protect_smooth).

+
# apply protection methods
+dw_rm <- remove_sensitive(dw)
+dw_qt <- protect_quadtree(dw, max_zoom = 2)
+dw_sm <- protect_smooth(dw, bw = 200)
+
+# extract protected rasters from sdcSpatial object
+ras_rm <- dw_rm$value$count
+ras_qt <- dw_qt$value$count
+ras_sm <- dw_sm$value$count
+

Following up, we select a square focus area. It is identified by its centroid and radius.

+
# Focus area (square)
+
+fa_ctrd <- c(155100, 464100) # center of focus area
+fa_rads <- 3000              # radius of focus area
+
+
+# maps
+
+par(mfrow = c(2, 2))
+
+plot(ras_orig, main = "original")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_rm, main = "cell suppression")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_qt, main = "quad tree method")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_sm, main = "smoothing method")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+

+
par(mfrow = c(1, 1))
+
+
+

2. FocusArea sometimes returns zero for clear differences

+

We continue by evaluating the difference between altered maps and ground truth with respect to the chosen focus area.

+

This involves the following steps: 1) extract the center coordinates of tiles (raster::xyFromCell), 2) extract the tile-values (weights) (raster::getValues) [where NA-values are treated as zeros].

+
# extract coordinates from raster
+xy <- raster::xyFromCell(ras_orig, 1:ncell(ras_orig))
+
+# extract cell values from rasters
+w_orig <- raster::getValues(ras_orig)
+w_rm   <- raster::getValues(ras_rm)
+w_qt   <- raster::getValues(ras_qt)
+w_sm   <- raster::getValues(ras_sm)
+
+# treat NA-nodes as having zero weight
+w_orig[is.na(w_orig)] <- 0
+w_rm[is.na(w_rm)] <- 0
+w_qt[is.na(w_qt)] <- 0
+w_sm[is.na(w_sm)] <- 0
+

Using these inputs we calculate the first set of KWDs.

+
kwd_rm <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_rm),
+                                L = 3, recode = TRUE, area = "linf",
+                                x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+
## FocusArea, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
kwd_qt <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_qt),
+                                L = 3, recode = TRUE, area = "linf",
+                                x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+
## FocusArea, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
kwd_sm <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_sm),
+                                L = 3, recode = TRUE, area = "linf",
+                                x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+
## FocusArea, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+

Even though the suppression map is clearly different (all cells with < 10 units are removed, see maps above), we get a KWD of zero.

+
c(kwd_rm$distance, kwd_qt$distance, kwd_sm$distance)
+
## [1]    0.000 6514.312 2332.205
+

Second try: The description of the focusArea function tells us that “All the weights outside the focus area should be equal to zero.” We create new rasters, where all tiles outside the focus area are assigned zero values.

+
ex_fa <- extent(fa_ctrd[1] - (fa_rads - 100), fa_ctrd[1] + (fa_rads - 100),
+                fa_ctrd[2] - (fa_rads - 100), fa_ctrd[2] + (fa_rads - 100))
+clls_fa <- cellsFromExtent(ras_orig, ex_fa)
+
+ras_rm_fa <- ras_rm
+ras_qt_fa <- ras_qt
+ras_sm_fa <- ras_sm
+ras_rm_fa[-clls_fa] <- 0
+ras_qt_fa[-clls_fa] <- 0
+ras_sm_fa[-clls_fa] <- 0
+
+
+par(mfrow = c(2, 2))
+
+plot(ras_orig, main = "original")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_rm_fa, main = "cell suppression (focus area)")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_qt_fa, main = "quad tree method (focus area)")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_sm_fa, main = "smoothing method (focus area)")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+

+
par(mfrow = c(1, 1))
+

With these we re-calculate the KWDs.

+
# updated weights (zero outside focus area)
+w_rm_fa <- getValues(ras_rm_fa)
+w_qt_fa <- getValues(ras_qt_fa)
+w_sm_fa <- getValues(ras_sm_fa)
+w_rm_fa[is.na(w_rm_fa)] <- 0
+w_qt_fa[is.na(w_qt_fa)] <- 0
+w_sm_fa[is.na(w_sm_fa)] <- 0
+
+kwd_rm_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_rm_fa),
+                                   L = 3, recode = TRUE, area = "linf",
+                                   x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+
## FocusArea, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
kwd_qt_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_qt_fa),
+                                   L = 3, recode = TRUE, area = "linf",
+                                   x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+
## FocusArea, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
kwd_sm_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_sm_fa),
+                                   L = 3, recode = TRUE, area = "linf",
+                                   x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+
## FocusArea, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+

We still get a KWD of zero for the ‘removal’ method. Furthermore, we now also get zero for smoothing.

+
c(kwd_rm_fa$distance, kwd_qt_fa$distance, kwd_sm_fa$distance)
+
## [1]    0.000 2748.876    0.000
+
+
+

3. How to apply a fixed cost to missing units of mass?

+

Suppose now we look at the whole map rather than focus areas. Then, we might get a mass mismatch, especially with removal:

+
c(sum(w_orig), sum(w_rm))
+
## [1] 90603 88530
+

There are ca. 2000 units missing after removal. We want to try solving the problem by assigning each a cost of half the map diagonal.

+
# compute length of the diagonal in cells
+diag_cells <- sqrt(ncol(ras_orig)^2 + nrow(ras_orig)^2)
+# ... in meters
+diag_metrs <- diag_cells * 200
+
+# solve with extra artificial bin (?)
+kwd_rm_bal <- compareOneToOne(Coordinates = xy, Weights = cbind(w_orig, w_rm),
+                              L = 3, recode = TRUE,
+                              unbalanced = TRUE, unbal_cost = diag_metrs/2)
+
## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+

Again, this yields a distance of zero.

+
kwd_rm_bal$distance
+
## [1] 0
+

Do we get a distance of zero, because all non-zero tiles after removal match exactly? (I.e. for any given tile the mass is missing completely or not at all.)

+

Given that this is the case, we could, in principle, assign the costs manually (2000 \(\times c\) ). Still, intuitively the function should not return zero (?)

+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/Example_spatialKWD_focusAreas.nb.html b/Example_spatialKWD_focusAreas.nb.html new file mode 100644 index 0000000..5e4a946 --- /dev/null +++ b/Example_spatialKWD_focusAreas.nb.html @@ -0,0 +1,443 @@ + + + + + + + + + + + + + +Reproducible example of current open issues with the SpatialKWD package + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + +
+

Preparing example data

+ + + +
library(SpatialKWD)
+library(sp)
+library(raster)
+library(sdcSpatial)
+ + + +

The issues are illustrated by means of an example data set from the sdcSpatial package called ‘dwellings’. The first step is to create the unprotected raster map, which forms the ground truth.

+

The example uses a map of counts (no. of units per grid cell), with 200m \(\times\) 200m grid cells in a 62 \(\times\) 61 cells raster map.

+ + + +
data("dwellings")
+
+dw <- sdcSpatial::sdc_raster(dwellings[, c("x", "y")], 
+                             variable = 1, r = 200,
+                             max_risk = .8, min_count = 5)
+
+ras_orig <- dw$value$count # original raster
+
+raster::plot(ras_orig) # map
+ras_orig               # raster info
+ + + +

Next, we create the altered raster maps using three protection methods: 1) suppression of sensitive cells (remove_sensitive), 2) quadtree method (protect_quadtree), 3) kernel smoothing (protect_smooth).

+ + + +
# apply protection methods
+dw_rm <- remove_sensitive(dw)
+dw_qt <- protect_quadtree(dw, max_zoom = 2)
+dw_sm <- protect_smooth(dw, bw = 200)
+
+# extract protected rasters from sdcSpatial object
+ras_rm <- dw_rm$value$count
+ras_qt <- dw_qt$value$count
+ras_sm <- dw_sm$value$count
+ + + +

Following up, we select a square focus area. It is identified by its centroid and radius.

+ + + +
# Focus area (square)
+
+fa_ctrd <- c(155100, 464100) # center of focus area
+fa_rads <- 3000              # radius of focus area
+
+
+par(mfrow = c(2, 2))
+
+plot(ras_orig, main = "original")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_rm, main = "cell suppression")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_qt, main = "quad tree method")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_sm, main = "smoothing method")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+
+par(mfrow = c(1, 1))
+ + + + + + +
# ----- KWDs -----
+
+# extract coordinates from raster
+xy <- raster::xyFromCell(ras_orig, 1:ncell(ras_orig))
+
+# extract cell values from rasters
+w_orig <- raster::getValues(ras_orig)
+w_rm   <- raster::getValues(ras_rm)
+w_qt   <- raster::getValues(ras_qt)
+w_sm   <- raster::getValues(ras_sm)
+
+# treat NA-nodes as having zero weight
+w_orig[is.na(w_orig)] <- 0
+w_rm[is.na(w_rm)] <- 0
+w_qt[is.na(w_qt)] <- 0
+w_sm[is.na(w_sm)] <- 0
+
+kwd_rm <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_rm),
+                                L = 3, recode = TRUE, area = "linf",
+                                x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+kwd_qt <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_qt),
+                                L = 3, recode = TRUE, area = "linf",
+                                x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+kwd_sm <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_sm),
+                                L = 3, recode = TRUE, area = "linf",
+                                x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+
+# removal gives dist. of 0, although clearly different
+c(kwd_rm$distance, kwd_qt$distance, kwd_sm$distance)
+
+
+# try with weights outside FA set to zero
+ex_fa <- extent(fa_ctrd[1] - (fa_rads - 100), fa_ctrd[1] + (fa_rads - 100),
+                fa_ctrd[2] - (fa_rads - 100), fa_ctrd[2] + (fa_rads - 100))
+clls_fa <- cellsFromExtent(ras_orig, ex_fa)
+
+ras_rm_fa <- ras_rm
+ras_qt_fa <- ras_qt
+ras_sm_fa <- ras_sm
+ras_rm_fa[-clls_fa] <- 0
+ras_qt_fa[-clls_fa] <- 0
+ras_sm_fa[-clls_fa] <- 0
+
+
+par(mfrow = c(2, 2))
+
+plot(ras_orig, main = "original")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_rm_fa, main = "cell suppression (focus area)")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_qt_fa, main = "quad tree method (focus area)")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+plot(ras_sm_fa, main = "smoothing method (focus area)")
+rect(xleft = fa_ctrd[1] - fa_rads, xright = fa_ctrd[1] + fa_rads,
+     ybottom = fa_ctrd[2] - fa_rads, ytop = fa_ctrd[2] + fa_rads, border = "red")
+
+par(mfrow = c(1, 1))
+
+# updated weights (zero outside focus area)
+w_rm_fa <- getValues(ras_rm_fa)
+w_qt_fa <- getValues(ras_qt_fa)
+w_sm_fa <- getValues(ras_sm_fa)
+w_rm_fa[is.na(w_rm_fa)] <- 0
+w_qt_fa[is.na(w_qt_fa)] <- 0
+w_sm_fa[is.na(w_sm_fa)] <- 0
+
+kwd_rm_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_rm_fa),
+                                   L = 3, recode = TRUE, area = "linf",
+                                   x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+kwd_qt_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_qt_fa),
+                                   L = 3, recode = TRUE, area = "linf",
+                                   x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+kwd_sm_fa <- SpatialKWD::focusArea(Coordinates = xy, Weights = cbind(w_orig, w_sm_fa),
+                                   L = 3, recode = TRUE, area = "linf",
+                                   x = fa_ctrd[1], y = fa_ctrd[2], radius = fa_rads)
+
+c(kwd_rm_fa$distance, kwd_qt_fa$distance, kwd_sm_fa$distance)
+ + + + +
+ +
LS0tCnRpdGxlOiAiUmVwcm9kdWNpYmxlIGV4YW1wbGUgb2YgY3VycmVudCBvcGVuIGlzc3VlcyB3aXRoIHRoZSBTcGF0aWFsS1dEIHBhY2thZ2UiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiMjIFByZXBhcmluZyBleGFtcGxlIGRhdGEKCmBgYHtyfQpsaWJyYXJ5KFNwYXRpYWxLV0QpCmxpYnJhcnkoc3ApCmxpYnJhcnkocmFzdGVyKQpsaWJyYXJ5KHNkY1NwYXRpYWwpCmBgYAoKVGhlIGlzc3VlcyBhcmUgaWxsdXN0cmF0ZWQgYnkgbWVhbnMgb2YgYW4gZXhhbXBsZSBkYXRhIHNldCBmcm9tIHRoZSBzZGNTcGF0aWFsIHBhY2thZ2UgY2FsbGVkCidkd2VsbGluZ3MnLiBUaGUgZmlyc3Qgc3RlcCBpcyB0byBjcmVhdGUgdGhlIHVucHJvdGVjdGVkIHJhc3RlciBtYXAsIHdoaWNoIGZvcm1zIHRoZQpncm91bmQgdHJ1dGguIAoKVGhlIGV4YW1wbGUgdXNlcyBhIG1hcCBvZiBjb3VudHMgKG5vLiBvZiB1bml0cyBwZXIgZ3JpZCBjZWxsKSwgd2l0aCAyMDBtICRcdGltZXMkIAoyMDBtIGdyaWQgY2VsbHMgaW4gYSA2MiAkXHRpbWVzJCA2MSBjZWxscyByYXN0ZXIgbWFwLgoKYGBge3J9CmRhdGEoImR3ZWxsaW5ncyIpCgpkdyA8LSBzZGNTcGF0aWFsOjpzZGNfcmFzdGVyKGR3ZWxsaW5nc1ssIGMoIngiLCAieSIpXSwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgdmFyaWFibGUgPSAxLCByID0gMjAwLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgIG1heF9yaXNrID0gLjgsIG1pbl9jb3VudCA9IDUpCgpyYXNfb3JpZyA8LSBkdyR2YWx1ZSRjb3VudCAjIG9yaWdpbmFsIHJhc3RlcgoKcmFzdGVyOjpwbG90KHJhc19vcmlnKSAjIG1hcApyYXNfb3JpZyAgICAgICAgICAgICAgICMgcmFzdGVyIGluZm8KYGBgCgpOZXh0LCB3ZSBjcmVhdGUgdGhlIGFsdGVyZWQgcmFzdGVyIG1hcHMgdXNpbmcgdGhyZWUgcHJvdGVjdGlvbiBtZXRob2RzOgoxKSBzdXBwcmVzc2lvbiBvZiBzZW5zaXRpdmUgY2VsbHMgKHJlbW92ZV9zZW5zaXRpdmUpLAoyKSBxdWFkdHJlZSBtZXRob2QgKHByb3RlY3RfcXVhZHRyZWUpLAozKSBrZXJuZWwgc21vb3RoaW5nIChwcm90ZWN0X3Ntb290aCkuCgpgYGB7cn0KIyBhcHBseSBwcm90ZWN0aW9uIG1ldGhvZHMKZHdfcm0gPC0gcmVtb3ZlX3NlbnNpdGl2ZShkdykKZHdfcXQgPC0gcHJvdGVjdF9xdWFkdHJlZShkdywgbWF4X3pvb20gPSAyKQpkd19zbSA8LSBwcm90ZWN0X3Ntb290aChkdywgYncgPSAyMDApCgojIGV4dHJhY3QgcHJvdGVjdGVkIHJhc3RlcnMgZnJvbSBzZGNTcGF0aWFsIG9iamVjdApyYXNfcm0gPC0gZHdfcm0kdmFsdWUkY291bnQKcmFzX3F0IDwtIGR3X3F0JHZhbHVlJGNvdW50CnJhc19zbSA8LSBkd19zbSR2YWx1ZSRjb3VudApgYGAKCkZvbGxvd2luZyB1cCwgd2Ugc2VsZWN0IGEgc3F1YXJlIGZvY3VzIGFyZWEuIEl0IGlzIGlkZW50aWZpZWQgYnkgaXRzIGNlbnRyb2lkIGFuZCAKcmFkaXVzLgoKYGBge3J9CiMgRm9jdXMgYXJlYSAoc3F1YXJlKQoKZmFfY3RyZCA8LSBjKDE1NTEwMCwgNDY0MTAwKSAjIGNlbnRlciBvZiBmb2N1cyBhcmVhCmZhX3JhZHMgPC0gMzAwMCAgICAgICAgICAgICAgIyByYWRpdXMgb2YgZm9jdXMgYXJlYQoKCnBhcihtZnJvdyA9IGMoMiwgMikpCgpwbG90KHJhc19vcmlnLCBtYWluID0gIm9yaWdpbmFsIikKcmVjdCh4bGVmdCA9IGZhX2N0cmRbMV0gLSBmYV9yYWRzLCB4cmlnaHQgPSBmYV9jdHJkWzFdICsgZmFfcmFkcywKICAgICB5Ym90dG9tID0gZmFfY3RyZFsyXSAtIGZhX3JhZHMsIHl0b3AgPSBmYV9jdHJkWzJdICsgZmFfcmFkcywgYm9yZGVyID0gInJlZCIpCnBsb3QocmFzX3JtLCBtYWluID0gImNlbGwgc3VwcHJlc3Npb24iKQpyZWN0KHhsZWZ0ID0gZmFfY3RyZFsxXSAtIGZhX3JhZHMsIHhyaWdodCA9IGZhX2N0cmRbMV0gKyBmYV9yYWRzLAogICAgIHlib3R0b20gPSBmYV9jdHJkWzJdIC0gZmFfcmFkcywgeXRvcCA9IGZhX2N0cmRbMl0gKyBmYV9yYWRzLCBib3JkZXIgPSAicmVkIikKcGxvdChyYXNfcXQsIG1haW4gPSAicXVhZCB0cmVlIG1ldGhvZCIpCnJlY3QoeGxlZnQgPSBmYV9jdHJkWzFdIC0gZmFfcmFkcywgeHJpZ2h0ID0gZmFfY3RyZFsxXSArIGZhX3JhZHMsCiAgICAgeWJvdHRvbSA9IGZhX2N0cmRbMl0gLSBmYV9yYWRzLCB5dG9wID0gZmFfY3RyZFsyXSArIGZhX3JhZHMsIGJvcmRlciA9ICJyZWQiKQpwbG90KHJhc19zbSwgbWFpbiA9ICJzbW9vdGhpbmcgbWV0aG9kIikKcmVjdCh4bGVmdCA9IGZhX2N0cmRbMV0gLSBmYV9yYWRzLCB4cmlnaHQgPSBmYV9jdHJkWzFdICsgZmFfcmFkcywKICAgICB5Ym90dG9tID0gZmFfY3RyZFsyXSAtIGZhX3JhZHMsIHl0b3AgPSBmYV9jdHJkWzJdICsgZmFfcmFkcywgYm9yZGVyID0gInJlZCIpCgpwYXIobWZyb3cgPSBjKDEsIDEpKQpgYGAKCgoKYGBge3J9CiMgLS0tLS0gS1dEcyAtLS0tLQoKIyBleHRyYWN0IGNvb3JkaW5hdGVzIGZyb20gcmFzdGVyCnh5IDwtIHJhc3Rlcjo6eHlGcm9tQ2VsbChyYXNfb3JpZywgMTpuY2VsbChyYXNfb3JpZykpCgojIGV4dHJhY3QgY2VsbCB2YWx1ZXMgZnJvbSByYXN0ZXJzCndfb3JpZyA8LSByYXN0ZXI6OmdldFZhbHVlcyhyYXNfb3JpZykKd19ybSAgIDwtIHJhc3Rlcjo6Z2V0VmFsdWVzKHJhc19ybSkKd19xdCAgIDwtIHJhc3Rlcjo6Z2V0VmFsdWVzKHJhc19xdCkKd19zbSAgIDwtIHJhc3Rlcjo6Z2V0VmFsdWVzKHJhc19zbSkKCiMgdHJlYXQgTkEtbm9kZXMgYXMgaGF2aW5nIHplcm8gd2VpZ2h0Cndfb3JpZ1tpcy5uYSh3X29yaWcpXSA8LSAwCndfcm1baXMubmEod19ybSldIDwtIDAKd19xdFtpcy5uYSh3X3F0KV0gPC0gMAp3X3NtW2lzLm5hKHdfc20pXSA8LSAwCgprd2Rfcm0gPC0gU3BhdGlhbEtXRDo6Zm9jdXNBcmVhKENvb3JkaW5hdGVzID0geHksIFdlaWdodHMgPSBjYmluZCh3X29yaWcsIHdfcm0pLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEwgPSAzLCByZWNvZGUgPSBUUlVFLCBhcmVhID0gImxpbmYiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHggPSBmYV9jdHJkWzFdLCB5ID0gZmFfY3RyZFsyXSwgcmFkaXVzID0gZmFfcmFkcykKa3dkX3F0IDwtIFNwYXRpYWxLV0Q6OmZvY3VzQXJlYShDb29yZGluYXRlcyA9IHh5LCBXZWlnaHRzID0gY2JpbmQod19vcmlnLCB3X3F0KSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBMID0gMywgcmVjb2RlID0gVFJVRSwgYXJlYSA9ICJsaW5mIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB4ID0gZmFfY3RyZFsxXSwgeSA9IGZhX2N0cmRbMl0sIHJhZGl1cyA9IGZhX3JhZHMpCmt3ZF9zbSA8LSBTcGF0aWFsS1dEOjpmb2N1c0FyZWEoQ29vcmRpbmF0ZXMgPSB4eSwgV2VpZ2h0cyA9IGNiaW5kKHdfb3JpZywgd19zbSksCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgTCA9IDMsIHJlY29kZSA9IFRSVUUsIGFyZWEgPSAibGluZiIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgeCA9IGZhX2N0cmRbMV0sIHkgPSBmYV9jdHJkWzJdLCByYWRpdXMgPSBmYV9yYWRzKQoKIyByZW1vdmFsIGdpdmVzIGRpc3QuIG9mIDAsIGFsdGhvdWdoIGNsZWFybHkgZGlmZmVyZW50CmMoa3dkX3JtJGRpc3RhbmNlLCBrd2RfcXQkZGlzdGFuY2UsIGt3ZF9zbSRkaXN0YW5jZSkKCgojIHRyeSB3aXRoIHdlaWdodHMgb3V0c2lkZSBGQSBzZXQgdG8gemVybwpleF9mYSA8LSBleHRlbnQoZmFfY3RyZFsxXSAtIChmYV9yYWRzIC0gMTAwKSwgZmFfY3RyZFsxXSArIChmYV9yYWRzIC0gMTAwKSwKICAgICAgICAgICAgICAgIGZhX2N0cmRbMl0gLSAoZmFfcmFkcyAtIDEwMCksIGZhX2N0cmRbMl0gKyAoZmFfcmFkcyAtIDEwMCkpCmNsbHNfZmEgPC0gY2VsbHNGcm9tRXh0ZW50KHJhc19vcmlnLCBleF9mYSkKCnJhc19ybV9mYSA8LSByYXNfcm0KcmFzX3F0X2ZhIDwtIHJhc19xdApyYXNfc21fZmEgPC0gcmFzX3NtCnJhc19ybV9mYVstY2xsc19mYV0gPC0gMApyYXNfcXRfZmFbLWNsbHNfZmFdIDwtIDAKcmFzX3NtX2ZhWy1jbGxzX2ZhXSA8LSAwCgoKcGFyKG1mcm93ID0gYygyLCAyKSkKCnBsb3QocmFzX29yaWcsIG1haW4gPSAib3JpZ2luYWwiKQpyZWN0KHhsZWZ0ID0gZmFfY3RyZFsxXSAtIGZhX3JhZHMsIHhyaWdodCA9IGZhX2N0cmRbMV0gKyBmYV9yYWRzLAogICAgIHlib3R0b20gPSBmYV9jdHJkWzJdIC0gZmFfcmFkcywgeXRvcCA9IGZhX2N0cmRbMl0gKyBmYV9yYWRzLCBib3JkZXIgPSAicmVkIikKcGxvdChyYXNfcm1fZmEsIG1haW4gPSAiY2VsbCBzdXBwcmVzc2lvbiAoZm9jdXMgYXJlYSkiKQpyZWN0KHhsZWZ0ID0gZmFfY3RyZFsxXSAtIGZhX3JhZHMsIHhyaWdodCA9IGZhX2N0cmRbMV0gKyBmYV9yYWRzLAogICAgIHlib3R0b20gPSBmYV9jdHJkWzJdIC0gZmFfcmFkcywgeXRvcCA9IGZhX2N0cmRbMl0gKyBmYV9yYWRzLCBib3JkZXIgPSAicmVkIikKcGxvdChyYXNfcXRfZmEsIG1haW4gPSAicXVhZCB0cmVlIG1ldGhvZCAoZm9jdXMgYXJlYSkiKQpyZWN0KHhsZWZ0ID0gZmFfY3RyZFsxXSAtIGZhX3JhZHMsIHhyaWdodCA9IGZhX2N0cmRbMV0gKyBmYV9yYWRzLAogICAgIHlib3R0b20gPSBmYV9jdHJkWzJdIC0gZmFfcmFkcywgeXRvcCA9IGZhX2N0cmRbMl0gKyBmYV9yYWRzLCBib3JkZXIgPSAicmVkIikKcGxvdChyYXNfc21fZmEsIG1haW4gPSAic21vb3RoaW5nIG1ldGhvZCAoZm9jdXMgYXJlYSkiKQpyZWN0KHhsZWZ0ID0gZmFfY3RyZFsxXSAtIGZhX3JhZHMsIHhyaWdodCA9IGZhX2N0cmRbMV0gKyBmYV9yYWRzLAogICAgIHlib3R0b20gPSBmYV9jdHJkWzJdIC0gZmFfcmFkcywgeXRvcCA9IGZhX2N0cmRbMl0gKyBmYV9yYWRzLCBib3JkZXIgPSAicmVkIikKCnBhcihtZnJvdyA9IGMoMSwgMSkpCgojIHVwZGF0ZWQgd2VpZ2h0cyAoemVybyBvdXRzaWRlIGZvY3VzIGFyZWEpCndfcm1fZmEgPC0gZ2V0VmFsdWVzKHJhc19ybV9mYSkKd19xdF9mYSA8LSBnZXRWYWx1ZXMocmFzX3F0X2ZhKQp3X3NtX2ZhIDwtIGdldFZhbHVlcyhyYXNfc21fZmEpCndfcm1fZmFbaXMubmEod19ybV9mYSldIDwtIDAKd19xdF9mYVtpcy5uYSh3X3F0X2ZhKV0gPC0gMAp3X3NtX2ZhW2lzLm5hKHdfc21fZmEpXSA8LSAwCgprd2Rfcm1fZmEgPC0gU3BhdGlhbEtXRDo6Zm9jdXNBcmVhKENvb3JkaW5hdGVzID0geHksIFdlaWdodHMgPSBjYmluZCh3X29yaWcsIHdfcm1fZmEpLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEwgPSAzLCByZWNvZGUgPSBUUlVFLCBhcmVhID0gImxpbmYiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHggPSBmYV9jdHJkWzFdLCB5ID0gZmFfY3RyZFsyXSwgcmFkaXVzID0gZmFfcmFkcykKa3dkX3F0X2ZhIDwtIFNwYXRpYWxLV0Q6OmZvY3VzQXJlYShDb29yZGluYXRlcyA9IHh5LCBXZWlnaHRzID0gY2JpbmQod19vcmlnLCB3X3F0X2ZhKSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBMID0gMywgcmVjb2RlID0gVFJVRSwgYXJlYSA9ICJsaW5mIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB4ID0gZmFfY3RyZFsxXSwgeSA9IGZhX2N0cmRbMl0sIHJhZGl1cyA9IGZhX3JhZHMpCmt3ZF9zbV9mYSA8LSBTcGF0aWFsS1dEOjpmb2N1c0FyZWEoQ29vcmRpbmF0ZXMgPSB4eSwgV2VpZ2h0cyA9IGNiaW5kKHdfb3JpZywgd19zbV9mYSksCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgTCA9IDMsIHJlY29kZSA9IFRSVUUsIGFyZWEgPSAibGluZiIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgeCA9IGZhX2N0cmRbMV0sIHkgPSBmYV9jdHJkWzJdLCByYWRpdXMgPSBmYV9yYWRzKQoKYyhrd2Rfcm1fZmEkZGlzdGFuY2UsIGt3ZF9xdF9mYSRkaXN0YW5jZSwga3dkX3NtX2ZhJGRpc3RhbmNlKQpgYGAKCgoKCg==
+ + + +
+ + + + + + + + + + + + + + + + diff --git a/README.md b/README.md deleted file mode 100644 index 8597161..0000000 --- a/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# Scripts for testing R-package [sdcSpatial](https://github.com/edwindj/sdcSpatial) on population grid cells - -Scripts to apply R-Package [sdcSpatial](https://github.com/edwindj/sdcSpatial) on population grids. First install need packages: - -``` -install.packages(c("data.table","sf","raster","sdcSpatial")) -``` - -Then run script `generate_data.R` which generates a random population based on the cell grids from Austria (see https://data.statistik.gv.at/web/meta.jsp?dataset=OGDEXT_RASTER_1). -This data set is the input for the script `sdcSpatial_template.R` which goes through the following steps: - -- Load needed libraries -- Load dummy population data -- Define "raster"-object for grid cells to be used with sdcSpatial -- Define sdc_raster object using population data and raster-object from before -- Check sensitive cells -- Apply protection method; check sensitive cells again and suppress if necessary -- Get protected table and apply information loss function -- Check if the original value of protected cells can be re-estimated - - diff --git a/Readme.md b/Readme.md new file mode 100644 index 0000000..c5b78ef --- /dev/null +++ b/Readme.md @@ -0,0 +1,42 @@ +# Scripts for testing R-package [sdcSpatial](https://github.com/edwindj/sdcSpatial) on German grid data + +Contact: +Martin Möhler +martin.moehler@destatis.de + + +2 scripts are supplied: + +`01_read_data_DE.R` can be run to recreate pop_data_DE.RData; it can also +be skipped and the pre-computed file used to run `02_sdcSpatial_DE.R`. + +Packages needed: + +``` +install.packages(c("data.table","sf","raster","sdcSpatial", "dplyr", "tidyr", "ggplot2", "viridis", "spatstat", "SpatialKWD")) +``` + +`01_read_data_DE.R` + +- creates geocoded microdata from published 2011 census results for test purposes +- test area is one of the German federal states (can be chosen in the script) +- outputs: pop_data_DE.RData (stored default: Schleswig-Holstein) +- needs the following data: + a) German population aggregates on 100m grid +source: +https://www.zensus2011.de/SharedDocs/Downloads/DE/Pressemitteilung/DemografischeGrunddaten/csv_Bevoelkerung_100m_Gitter.zip + b) 100m grid definitions for Germany +source: +https://gdz.bkg.bund.de/index.php/default/open-data/geographische-gitter-fur-deutschland-in-lambert-projektion-geogitter-inspire.html + c) 100km grid definitions for Germany (to select chunks of the 100m grid) +source: +https://gdz.bkg.bund.de/index.php/default/open-data/geographische-gitter-fur-deutschland-in-lambert-projektion-geogitter-inspire.html + d) shapefile for German federal states (in UTM32) +https://daten.gdz.bkg.bund.de/produkte/vg/vg250_ebenen_0101/aktuell/ + + +`02_sdcSpatial_DE.R` + +- run the actual sdcSpatial experiments and utility analyses +- inputs: pop_data_DE.RData + diff --git a/SDCspatial_focusAreas.html b/SDCspatial_focusAreas.html new file mode 100644 index 0000000..1f29393 --- /dev/null +++ b/SDCspatial_focusAreas.html @@ -0,0 +1,927 @@ + + + + + + + + + + + + + +SDC spatial focus areas + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + +
+ +
+ +

We use pre-computed raster maps from 02_sdcSpatial_DE.R, see GitHub repo.

+
library(sdcSpatial)
+library(SpatialKWD)
+library(sp)
+library(raster)
+library(viridisLite)
+library(viridis)
+
+load("sdcRasters_SHARE.RData")
+
+pal <- rev(viridis(10)) 
+
+

Making focus areas

+

Selecting four focus areas by: centroid (center point in ETRS89-LAEA) and size (measured in cell width).

+
fa1_ctrd <- c(4122250, 3162750)
+fa2_ctrd <- c(4195750, 2993750)
+fa3_ctrd <- c(4535250, 3474750)
+fa4_ctrd <- c(4328750, 2751250)
+
+cellsize <- 500
+
+# focus area sizes
+fa_wcells <- c(55, 41, 75, 55)
+
+# find cell index of centroid in larger raster
+fa1_cntr <- c(raster::colFromX(pop_ras$value, x = fa1_ctrd[1]),
+              raster::rowFromY(pop_ras$value, y = fa1_ctrd[2]))
+fa2_cntr <- c(raster::colFromX(pop_ras$value, x = fa2_ctrd[1]),
+              raster::rowFromY(pop_ras$value, y = fa2_ctrd[2]))
+fa3_cntr <- c(raster::colFromX(pop_ras$value, x = fa3_ctrd[1]),
+              raster::rowFromY(pop_ras$value, y = fa3_ctrd[2]))
+fa4_cntr <- c(raster::colFromX(pop_ras$value, x = fa4_ctrd[1]),
+              raster::rowFromY(pop_ras$value, y = fa4_ctrd[2]))
+
+# create extent objects for chosen positions & sizes
+fa1_extt <- extent(pop_ras$value, fa1_cntr[2] - (fa_wcells[1] - 1)/2, 
+                                  fa1_cntr[2] + (fa_wcells[1] - 1)/2,
+                                  fa1_cntr[1] - (fa_wcells[1] - 1)/2,
+                                  fa1_cntr[1] + (fa_wcells[1] - 1)/2)
+
+fa2_extt <- extent(pop_ras$value, fa2_cntr[2] - (fa_wcells[2] - 1)/2, 
+                                  fa2_cntr[2] + (fa_wcells[2] - 1)/2,
+                                  fa2_cntr[1] - (fa_wcells[2] - 1)/2,
+                                  fa2_cntr[1] + (fa_wcells[2] - 1)/2)
+
+fa3_extt <- extent(pop_ras$value, fa3_cntr[2] - (fa_wcells[3] - 1)/2, 
+                                  fa3_cntr[2] + (fa_wcells[3] - 1)/2,
+                                  fa3_cntr[1] - (fa_wcells[3] - 1)/2,
+                                  fa3_cntr[1] + (fa_wcells[3] - 1)/2)
+
+fa4_extt <- extent(pop_ras$value, fa4_cntr[2] - (fa_wcells[4] - 1)/2, 
+                                  fa4_cntr[2] + (fa_wcells[4] - 1)/2,
+                                  fa4_cntr[1] - (fa_wcells[4] - 1)/2,
+                                  fa4_cntr[1] + (fa_wcells[4] - 1)/2)
+
+# crop overall raster to focus area for plotting & pop. counts
+fa1 <- fa2 <- fa3 <- fa4 <- pop_ras
+fa1$value <- crop(fa1$value, fa1_extt)
+fa2$value <- crop(fa2$value, fa2_extt)
+fa3$value <- crop(fa3$value, fa3_extt)
+fa4$value <- crop(fa4$value, fa4_extt)
+
+par(mfrow = c(2, 2))
+  plot(fa1, "count", sensitive = FALSE, col = pal, main = "Ruhr valley")
+  plot(fa2, "count", sensitive = FALSE, col = pal, main = "MZ + WI")
+  plot(fa3, "count", sensitive = FALSE, col = pal, main = "Strelasund")
+  plot(fa4, "count", sensitive = FALSE, col = pal, main = "Allgäu")
+

+
par(mfrow = c(1, 1))
+
+# calculate populations for subregions
+p_fa <- c(sum(getValues(fa1$value$count), na.rm = TRUE), 
+          sum(getValues(fa2$value$count), na.rm = TRUE), 
+          sum(getValues(fa3$value$count), na.rm = TRUE), 
+          sum(getValues(fa4$value$count), na.rm = TRUE))
+
+fa_sizes  <- cbind(fa_wcells,
+                   fa_wcells^2,
+                   fa_wcells^2 * cellsize^2 / 1e+6,
+                   p_fa) 
+colnames(fa_sizes) <- c("width (ncells)", "size (ncells)", "size (km^2)", "pop.")
+rownames(fa_sizes) <- c("fa1", "fa2", "fa3", "fa4")
+fa_sizes
+
##     width (ncells) size (ncells) size (km^2)    pop.
+## fa1             55          3025      756.25 1745062
+## fa2             41          1681      420.25  558055
+## fa3             75          5625     1406.25  107863
+## fa4             55          3025      756.25  106283
+
+
+

Risk and bin-by-bin measure for focus areas

+

We start by protecting focus areas, using the sdcSpatial algorithms. For this, wrapper functions are used:

+
wrap_qt <- function(x, max_zoom = Inf, enlarge = TRUE) {
+  
+  if(enlarge) {
+    pad <- min(2^max_zoom - 1, 10) # maximum padding is 10 (arbitrary)
+    x$value <- extend(x$value, c(pad, pad))
+    x <- sdcSpatial::protect_quadtree(x, max_zoom)
+  } else {
+    x <- sdcSpatial::protect_quadtree(x, max_zoom)
+  }
+  x
+}
+
+wrap_sm <- function(x, bw = res(x$value)[1], enlarge = TRUE) {
+  
+  if(enlarge) {
+    pad <- 3 * (bw / res(x$value)[1])
+    x$value <- extend(x$value, c(pad, pad))
+    x <- sdcSpatial::protect_smooth(x, bw)
+  } else {
+    x <- sdcSpatial::protect_smooth(bw)
+  }
+  x
+}
+

The idea behind these wrappers is to add some empty cells around the borders of the focus areas in order to capture distribution mass that ‘spills out’. This mass can then be taken into account later-on (esp. for KWD computation).

+

The necessary enlargement depends on the protection mechanism and can be inferred by looking into the functions sdcSpatial::protect_quadtree() and sdcSpatial::protect_smooth().

+

For the quadtree method we need to enlarge the grid by \[2^{\text{max_zoom} - 1} - 1\] rows / columns on each side. For Gaussian smoothing it is \[3 \cdot \frac{\text{bandwidth}}{\text{cellwidth}}\] rows / columns on each side.

+

We call the (wrapped) protection algorithms separately for each focus area (4 methods times 4 areas = 16 outcomes).

+
par(mfrow = c(2, 2))
+
+# removal
+fa1_rm <- remove_sensitive(fa1)
+fa2_rm <- remove_sensitive(fa2)
+fa3_rm <- remove_sensitive(fa3)
+fa4_rm <- remove_sensitive(fa4)
+
+# quad tree I
+fa1_qt1 <- wrap_qt(fa1, max_zoom = 2)
+fa2_qt1 <- wrap_qt(fa2, max_zoom = 2)
+fa3_qt1 <- wrap_qt(fa3, max_zoom = 2)
+fa4_qt1 <- wrap_qt(fa4, max_zoom = 2)
+
+# quad tree II
+fa1_qt2 <- wrap_qt(fa1, max_zoom = 3)
+fa2_qt2 <- wrap_qt(fa2, max_zoom = 3)
+fa3_qt2 <- wrap_qt(fa3, max_zoom = 3)
+fa4_qt2 <- wrap_qt(fa4, max_zoom = 3)
+
+# smoothing
+fa1_sm <- wrap_sm(fa1, bw = cellsize)
+fa2_sm <- wrap_sm(fa2, bw = cellsize)
+fa3_sm <- wrap_sm(fa3, bw = cellsize)
+fa4_sm <- wrap_sm(fa4, bw = cellsize)
+
+
+# show results
+
+plot(fa1_rm, "count", sensitive = FALSE, col = pal, main = "Ruhr valley (rm)")
+plot(fa2_rm, "count", sensitive = FALSE, col = pal, main = "MZ + WI (rm)")
+plot(fa3_rm, "count", sensitive = FALSE, col = pal, main = "Strelasund (rm)")
+plot(fa4_rm, "count", sensitive = FALSE, col = pal, main = "Allgäu (rm)")
+

+
plot(fa1_qt1, "count", sensitive = FALSE, col = pal, main = "Ruhr valley (qt1)")
+plot(fa2_qt1, "count", sensitive = FALSE, col = pal, main = "MZ + WI (qt1)")
+plot(fa3_qt1, "count", sensitive = FALSE, col = pal, main = "Strelasund (qt1)")
+plot(fa4_qt1, "count", sensitive = FALSE, col = pal, main = "Allgäu (qt1)")
+

+
plot(fa1_qt2, "count", sensitive = FALSE, col = pal, main = "Ruhr valley (qt2)")
+plot(fa2_qt2, "count", sensitive = FALSE, col = pal, main = "MZ + WI (qt2)")
+plot(fa3_qt2, "count", sensitive = FALSE, col = pal, main = "Strelasund (qt2)")
+plot(fa4_qt2, "count", sensitive = FALSE, col = pal, main = "Allgäu (qt2)")
+

+
plot(fa1_sm, "count", sensitive = FALSE, col = pal, main = "Ruhr valley (sm)")
+plot(fa2_sm, "count", sensitive = FALSE, col = pal, main = "MZ + WI (sm)")
+plot(fa3_sm, "count", sensitive = FALSE, col = pal, main = "Strelasund (sm)")
+plot(fa4_sm, "count", sensitive = FALSE, col = pal, main = "Allgäu (sm)")
+

+
par(mfrow = c(1, 1))
+

Note that for qt1, qt2 and sm the mass spills slightly beyond the original borders. We can show that with the exception of the removal method, all mass has been preserved:

+
mass_rm  <- c(sum(getValues(fa1_rm$value$count),  na.rm = TRUE),
+              sum(getValues(fa2_rm$value$count),  na.rm = TRUE),
+              sum(getValues(fa3_rm$value$count),  na.rm = TRUE),
+              sum(getValues(fa4_rm$value$count),  na.rm = TRUE))
+
+mass_qt1 <- c(sum(getValues(fa1_qt1$value$count), na.rm = TRUE),
+              sum(getValues(fa2_qt1$value$count), na.rm = TRUE),
+              sum(getValues(fa3_qt1$value$count), na.rm = TRUE),
+              sum(getValues(fa4_qt1$value$count), na.rm = TRUE))
+
+mass_qt2 <- c(sum(getValues(fa1_qt2$value$count), na.rm = TRUE),
+              sum(getValues(fa2_qt2$value$count), na.rm = TRUE),
+              sum(getValues(fa3_qt2$value$count), na.rm = TRUE),
+              sum(getValues(fa4_qt2$value$count), na.rm = TRUE))
+
+mass_sm  <- c(sum(getValues(fa1_sm$value$count),  na.rm = TRUE),
+              sum(getValues(fa2_sm$value$count),  na.rm = TRUE),
+              sum(getValues(fa3_sm$value$count),  na.rm = TRUE),
+              sum(getValues(fa4_sm$value$count),  na.rm = TRUE))
+
+masses <- cbind(p_fa, mass_rm, mass_qt1, mass_qt2, mass_sm)
+masses <- cbind(masses, round(masses[, 2:5] - p_fa))
+colnames(masses) <- c("original", "removal", "quad tree I", "quad tree II", "smoothing",
+                      "diff_removal", "diff_qt1", "diff_qt2", "diff_sm")
+rownames(masses) <- c("fa1", "fa2", "fa3", "fa4")
+masses
+
##     original removal quad tree I quad tree II smoothing diff_removal diff_qt1
+## fa1  1745062 1744773     1745062      1745062   1745062         -289        0
+## fa2   558055  557830      558055       558055    558055         -225        0
+## fa3   107863  107175      107863       107863    107863         -688        0
+## fa4   106283  105116      106283       106283    106283        -1167        0
+##     diff_qt2 diff_sm
+## fa1        0       0
+## fa2        0       0
+## fa3        0       0
+## fa4        0       0
+

We can now calculate risk measures for the focus areas.

+
# --- share of cells at risk ---
+# initial
+sens     <- c(sensitivity_score(fa1),     sensitivity_score(fa2),     sensitivity_score(fa3),     sensitivity_score(fa4)) 
+# after SDC
+sens_rm  <- c(sensitivity_score(fa1_rm),  sensitivity_score(fa2_rm),  sensitivity_score(fa3_rm),  sensitivity_score(fa4_rm))
+sens_qt1 <- c(sensitivity_score(fa1_qt1), sensitivity_score(fa2_qt1), sensitivity_score(fa3_qt1), sensitivity_score(fa4_qt1))
+sens_qt2 <- c(sensitivity_score(fa1_qt2), sensitivity_score(fa2_qt2), sensitivity_score(fa3_qt2), sensitivity_score(fa4_qt2))
+sens_sm  <- c(sensitivity_score(fa1_sm),  sensitivity_score(fa2_sm),  sensitivity_score(fa3_sm),  sensitivity_score(fa4_sm))
+
+risk_cell <- cbind(sens, sens_rm, sens_qt1, sens_qt2, sens_sm)
+colnames(risk_cell) <- c("cell_initial", "cell_removal", "cell_quad tree I", "cell_quad tree II", "cell_smoothing")
+rownames(risk_cell) <- c("fa1", "fa2", "fa3", "fa4")
+round(risk_cell, 3)
+
##     cell_initial cell_removal cell_quad tree I cell_quad tree II cell_smoothing
+## fa1        0.033            0            0.007                 0              0
+## fa2        0.077            0            0.089                 0              0
+## fa3        0.186            0            0.142                 0              0
+## fa4        0.213            0            0.077                 0              0
+
# --- share of pop. at risk ---
+# initial
+sensp <- c(sum(fa1$value$count[is_sensitive(fa1)]), sum(fa2$value$count[is_sensitive(fa2)]),
+           sum(fa3$value$count[is_sensitive(fa3)]), sum(fa4$value$count[is_sensitive(fa4)]))
+# after SDC
+sensp_rm  <- c(sum(fa1_rm$value$count[is_sensitive(fa1_rm)]),   sum(fa2_rm$value$count[is_sensitive(fa2_rm)]),
+               sum(fa3_rm$value$count[is_sensitive(fa3_rm)]),   sum(fa4_rm$value$count[is_sensitive(fa4_rm)]))
+sensp_qt1 <- c(sum(fa1_qt1$value$count[is_sensitive(fa1_qt1)]), sum(fa2_qt1$value$count[is_sensitive(fa2_qt1)]),
+               sum(fa3_qt1$value$count[is_sensitive(fa3_qt1)]), sum(fa4_qt1$value$count[is_sensitive(fa4_qt1)]))
+sensp_qt2 <- c(sum(fa1_qt2$value$count[is_sensitive(fa1_qt2)]), sum(fa2_qt2$value$count[is_sensitive(fa2_qt2)]),
+               sum(fa3_qt2$value$count[is_sensitive(fa3_qt2)]), sum(fa4_qt2$value$count[is_sensitive(fa4_qt2)]))
+sensp_sm  <- c(sum(fa1_sm$value$count[is_sensitive(fa1_sm)]),   sum(fa2_sm$value$count[is_sensitive(fa2_sm)]),
+               sum(fa3_sm$value$count[is_sensitive(fa3_sm)]),   sum(fa4_sm$value$count[is_sensitive(fa4_sm)]))
+risk_pop <- cbind(sensp/p_fa, sensp_rm/p_fa, sensp_qt1/p_fa, sensp_qt2/p_fa, sensp_sm/p_fa)
+colnames(risk_pop) <- c("pop_initial", "pop_removal", "pop_quad tree I", "pop_quad tree II", "pop_smoothing")
+rownames(risk_pop) <- c("fa1", "fa2", "fa3", "fa4")
+risk_pop
+
##      pop_initial pop_removal pop_quad tree I pop_quad tree II pop_smoothing
+## fa1 0.0001656102           0    8.309160e-06                0             0
+## fa2 0.0004031861           0    1.334994e-04                0             0
+## fa3 0.0063784616           0    1.678055e-03                0             0
+## fa4 0.0109801191           0    1.232558e-03                0             0
+

Next, we calculate the normalized Hellinger distances.

+

In accordance with the definition, mass mismatch does not bother the metric, so we consider only the original focus areas without enlargement around the borders. We therefore include a step where if the focus area was enlarged by a wrapper function, it gets cropped to its original size first.

+
getHD <- function(x, orig, value = "count") {
+  
+  # resize protected raster if it was enlarged
+  if(extent(x$value) > extent(orig$value)) {
+    x$value <- crop(x$value, extent(orig$value))
+  }
+  
+  # extract value raster of interest
+  r_x <- x$value[[value]]
+  r_o <- orig$value[[value]]
+  
+  # extract cell values
+  v_x <- raster::getValues(r_x)
+  v_o <- raster::getValues(r_o)
+  
+  # set NAs to 0
+  v_x[is.na(v_x)] <- 0
+  v_o[is.na(v_o)] <- 0
+  
+  # rescale
+  v_x <- v_x / sum(v_x)
+  v_o <- v_o / sum(v_o)
+  
+  HD <- sqrt(sum((sqrt(v_x) - sqrt(v_o))^2)) * (1/sqrt(2))
+  HD
+}
+
+hd_rm  <- c(getHD(fa1_rm, fa1),  getHD(fa2_rm, fa2),  getHD(fa3_rm, fa3),  getHD(fa4_rm, fa4))
+hd_qt1 <- c(getHD(fa1_qt1, fa1), getHD(fa2_qt1, fa2), getHD(fa3_qt1, fa3), getHD(fa4_qt1, fa4))
+hd_qt2 <- c(getHD(fa1_qt2, fa1), getHD(fa2_qt2, fa2), getHD(fa3_qt2, fa3), getHD(fa4_qt2, fa4))
+hd_sm  <- c(getHD(fa1_sm, fa1),  getHD(fa2_sm, fa2),  getHD(fa3_sm, fa3),  getHD(fa4_sm, fa4))
+
+hd <- cbind(hd_rm, hd_qt1, hd_qt2, hd_sm)
+colnames(hd) <- c("removal", "quad tree I", "quad tree II", "smoothing")
+rownames(hd) <- c("fa1", "fa2", "fa3", "fa4")
+round(hd, 3)
+
##     removal quad tree I quad tree II smoothing
+## fa1   0.009       0.070        0.083     0.280
+## fa2   0.014       0.114        0.213     0.366
+## fa3   0.057       0.151        0.205     0.451
+## fa4   0.074       0.167        0.226     0.416
+
+
+

KWD for focus areas

+

KWD requires that both maps have the same mass and extent. By enlarging the focus areas slightly the mass was kept for all methods except removal (see above).

+

In order to account for the slightly larger extent, the ground truth is also slightly enlarged before the comparison. Since all the padding is made up of zero values, it does not distort the measure. This takes care of quadtree and smoothing.

+
getKWD <- function(x, orig, value = "count", balance = "none", plot_map = FALSE, ...) {
+  
+  # resize original raster to balance mass for qt/sm
+  if(extent(x$value) > extent(orig$value)) {
+    orig$value <- extend(orig$value, extent(x$value))
+  }
+  
+  if(balance == "frame") {
+    x$value <- extend(x$value, c(1, 1), value = 0)
+    orig$value <- extend(orig$value, c(1, 1), value = 0)
+  }
+  
+  # extract value raster of interest
+  r_x <- x$value[[value]]
+  r_o <- orig$value[[value]]
+  
+  # extract cell values
+  v_x <- raster::getValues(r_x)
+  v_o <- raster::getValues(r_o)
+  
+  # set NAs to 0
+  v_x[is.na(v_x)] <- 0
+  v_o[is.na(v_o)] <- 0
+  
+  xy <- raster::xyFromCell(r_x, 1:ncell(r_x))
+  
+  # balance mass for removal
+  if(balance %in% c("pop_ctrd", "frame")) {
+    
+    mdiff <- sum(v_o) - sum(v_x) # mass difference must be positive
+    
+    if(balance == "frame") {
+      frameclls <- unique(c(raster::cellFromCol(r_x, c(1, ncol(r_x))),
+                            raster::cellFromRow(r_x, c(1, nrow(r_x)))))
+      r_x[frameclls] <- 1/(length(frameclls)) * mdiff
+    }
+    
+    if(balance == "pop_ctrd") {
+      
+      # get population weighted centroid
+      pop_ctrd <- c(stats::weighted.mean(xy[, 1], v_x),
+                    stats::weighted.mean(xy[, 2], v_x))
+      mcell <- cellFromXY(r_x, pop_ctrd)
+      r_x[mcell] <- ifelse(is.na(r_x[mcell]), mdiff, r_x[mcell] + mdiff)
+    }
+    
+    # extract new re-distributed mass
+    v_x <- raster::getValues(r_x)
+    v_x[is.na(v_x)] <- 0
+  }
+  
+  if(plot_map) {
+    plot(r_x, col = pal)
+    if(balance == "frame") {
+      fr <- extent(r_x, 2, ncol(r_x) - 1, 2, nrow(r_x) - 1)
+    } else if(balance == "pop_ctrd") {
+      fr <- c(xFromCell(r_x, mcell) - res(r_x)[1]/2, 
+              xFromCell(r_x, mcell) + res(r_x)[1]/2,
+              yFromCell(r_x, mcell) - res(r_x)[2]/2, 
+              yFromCell(r_x, mcell) + res(r_x)[2]/2)
+    }
+    if(balance %in% c("frame", "pop_ctrd")) {
+      rect(xleft = fr[1], xright = fr[2], ybottom = fr[3], ytop = fr[4], 
+           border = "red")
+    }
+  }
+  
+  KWD <- compareOneToOne(Coordinates = xy, Weights = cbind(v_x, v_o), ...)
+                        
+  KWD$distance
+}
+

For removal, we need to make assumptions on how to balance mass. Two versions of manually balancing are considered:

+
    +
  1. The missing mass gets added in a frame around the focus area. Therefore, missing mass in the visual center of the map gets punished more.

  2. +
  3. The missing mass gets added in a single cell at the population-weighted centroid of the map. The intuition behind this is that this point should be a kind of ‘best guess’ as to where it might be located.

  4. +
+
# Visualizing the two approaches:
+
+par(mfrow = c(2, 2))
+
+getKWD(fa1_rm, fa1, "count", balance = "frame", plot_map = TRUE)
+
## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
## [1] 0.002546098
+
getKWD(fa2_rm, fa2, "count", balance = "frame", plot_map = TRUE)
+
## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
## [1] 0.00379919
+
getKWD(fa3_rm, fa3, "count", balance = "frame", plot_map = TRUE)
+
## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
## [1] 0.1285745
+
getKWD(fa4_rm, fa4, "count", balance = "frame", plot_map = TRUE)
+

+
## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
## [1] 0.1314477
+
getKWD(fa1_rm, fa1, "count", balance = "pop_ctrd", plot_map = TRUE)
+
## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
## [1] 0.004185462
+
getKWD(fa2_rm, fa2, "count", balance = "pop_ctrd", plot_map = TRUE)
+
## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
## [1] 0.006253455
+
getKWD(fa3_rm, fa3, "count", balance = "pop_ctrd", plot_map = TRUE)
+
## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
## [1] 0.1740762
+
getKWD(fa4_rm, fa4, "count", balance = "pop_ctrd", plot_map = TRUE)
+

+
## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
## [1] 0.2377131
+
par(mfrow = c(1, 1))
+

We compute now the KWD for all methods and focus areas. For removal, both ideas about mass balancing are compared.

+
kwd_rm_f <- c(getKWD(fa1_rm, fa1, "count", balance = "frame"),
+              getKWD(fa2_rm, fa2, "count", balance = "frame"),
+              getKWD(fa3_rm, fa3, "count", balance = "frame"),
+              getKWD(fa4_rm, fa4, "count", balance = "frame"))
+
## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
kwd_rm_c <- c(getKWD(fa1_rm, fa1, "count", balance = "pop_ctrd"),
+              getKWD(fa2_rm, fa2, "count", balance = "pop_ctrd"),
+              getKWD(fa3_rm, fa3, "count", balance = "pop_ctrd"),
+              getKWD(fa4_rm, fa4, "count", balance = "pop_ctrd"))
+
## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
kwd_qt1 <- c(getKWD(fa1_qt1, fa1, "count"), getKWD(fa2_qt1, fa2, "count"),
+             getKWD(fa3_qt1, fa3, "count"), getKWD(fa4_qt1, fa4, "count"))
+
## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
kwd_qt2 <- c(getKWD(fa1_qt2, fa1, "count"), getKWD(fa2_qt2, fa2, "count"),
+             getKWD(fa3_qt2, fa3, "count"), getKWD(fa4_qt2, fa4, "count"))
+
## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
kwd_sm  <- c(getKWD(fa1_sm,  fa1, "count"), getKWD(fa2_sm,  fa2, "count"),
+             getKWD(fa3_sm,  fa3, "count"), getKWD(fa4_sm,  fa4, "count"))
+
## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+## CompareOneToOne, Solution method: APPROX
+## WARNING: the Xs input coordinates are not consecutives integers.
+## WARNING: the Ys input coordinates are not consecutives integers.
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
kwd <- cbind(kwd_rm_f, kwd_rm_c, kwd_qt1, kwd_qt2, kwd_sm)
+colnames(kwd) <- c("removal (frame)", "removal (centroid)", "quad tree I", "quad tree II", "smoothing")
+rownames(kwd) <- c("fa1", "fa2", "fa3", "fa4")
+round(kwd, 3)
+
##     removal (frame) removal (centroid) quad tree I quad tree II smoothing
+## fa1           0.003              0.004       0.012        0.018     0.381
+## fa2           0.004              0.006       0.030        0.144     0.493
+## fa3           0.129              0.174       0.053        0.119     0.626
+## fa4           0.131              0.238       0.067        0.150     0.624
+

The results we get are now a lot more subtle.

+

Removal is not universally worse than other methods (according to KWD), but actually very good in the more densely populated areas, but beaten by quad tree in rural focus areas.

+
+
+

Appendix:How to normalize KWD?

+

Some considerations on how to normalize KWD from the focusArea() function. Suppose we have a total count of 400. We move 10 units of mass by exactly one cell.

+
wdth <- 11
+m <- matrix(data = 1, nrow = wdth, ncol = wdth)
+
+set.seed(100)
+m[sample(1:(11^2), 50)] <- 6
+m[6, 6] <- m[6, 6] + 29
+
+r_exa <- raster(m, xmn = 0, xmx = wdth, ymn = 0, ymx = wdth)
+
+r_exa2 <- r_exa
+r_exa2[6, 6] <- r_exa2[6, 6] - 10
+r_exa2[6, 7] <- r_exa2[6, 7] + 10
+
+par(mfrow = c(1, 2))
+plot(r_exa,  breaks = seq(5, 35, 5), col = terrain.colors(7),
+     main = paste("count total:", sum(getValues(r_exa))))
+plot(r_exa2, breaks = seq(5, 35, 5), col = terrain.colors(7),
+     main = paste("count total:", sum(getValues(r_exa2))))
+

+
par(mfrow = c(1, 1))
+

We know the KWD before normalization must be 10, afterwards 0.025.

+
xy <- ceiling(raster::xyFromCell(r_exa, 1:ncell(r_exa)))
+
+kwd_test <- compareOneToOne(Coordinates = xy, 
+                            Weights = cbind(getValues(r_exa), getValues(r_exa2)),
+                            method = "exact")$distance
+
## CompareOneToOne, Solution method: EXACT
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
kwd_fa_test <- focusArea(Coordinates = xy,
+                         Weights = cbind(getValues(r_exa), getValues(r_exa2)),
+                         x = 6, y = 6, radius = 3, area = "linf",
+                         method = "exact")$distance
+
## FocusArea, Solution method: APPROX
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
tst_results <- c(kwd_test, kwd_fa_test, kwd_fa_test / sum(getValues(r_exa2)))
+names(tst_results) <- c("full map", "focus area", "focus area / sum(full map)")
+tst_results
+
##                   full map                 focus area 
+##                      0.025                     10.000 
+## focus area / sum(full map) 
+##                      0.025
+

Suppose now we select our focus area from a smaller super area (with fewer total counts). In order to get to the same results we need to calibrate.

+
r_exa_new  <- crop(r_exa, extent(r_exa,   2, wdth - 1, 2, wdth - 1))
+r_exa2_new <- crop(r_exa2, extent(r_exa2, 2, wdth - 1, 2, wdth - 1))
+
+fa_factor <- sum(getValues(r_exa)) / sum(getValues(r_exa_new))
+
+xy <- ceiling(raster::xyFromCell(r_exa_new, 1:ncell(r_exa_new)))
+
+kwd_test <- compareOneToOne(Coordinates = xy, 
+                            Weights = cbind(getValues(r_exa_new), getValues(r_exa2_new)),
+                            method = "exact")$distance
+
## CompareOneToOne, Solution method: EXACT
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
kwd_fa_test <- focusArea(Coordinates = xy,
+                         Weights = cbind(getValues(r_exa_new), getValues(r_exa2_new)),
+                         x = 5, y = 5, radius = 3, area = "linf",
+                         method = "exact")$distance
+
## FocusArea, Solution method: APPROX
+## INFO: Recoding the input coordinates to consecutive integers.
+## INFO: change <timelimit> to 14400.000000
+## INFO: change <verbosity> to silent
+## INFO: change <opt_tolerance> to 0.000001
+
tst_results <- c(kwd_test, kwd_test / fa_factor,
+                 kwd_fa_test / sum(r_exa2_new[(5-3):(5+3), (5-3):(5+3)]),
+                 (kwd_fa_test / sum(getValues(r_exa2_new)) / fa_factor))
+names(tst_results) <- c("cropped area", "cropped area (calib.)", "focus area", "focus area (calib.)")
+tst_results
+
##          cropped area cropped area (calib.)            focus area 
+##            0.03571429            0.02500000            0.05617978 
+##   focus area (calib.) 
+##            0.02500000
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/generate_data.R b/generate_data.R deleted file mode 100644 index 29b2fef..0000000 --- a/generate_data.R +++ /dev/null @@ -1,67 +0,0 @@ -################################################# -# script to generate dummy data for austrian population -# - - -######################### -# load libraries -library(data.table) -library(sf) - -######################### -# download data from open portal -url <- "https://data.statistik.gv.at/data/OGDEXT_RASTER_1_STATISTIK_AUSTRIA_LAEA_GPKG.zip" - -# download .zip -zipfile <- file.path(getwd(),basename(url)) -download.file(url,zipfile) - -# unzip file -unzipfolder <- paste0(sub(".zip","",zipfile)) -unzip(zipfile,exdir=unzipfolder) -# delete zip file -file.remove(zipfile) - -# read geopackage file -st_layers(file.path(unzipfolder,"STATISTIK_AUSTRIA_RASTER_LAEA_3035.gpkg")) - -l500 <- st_read(file.path(unzipfolder,"STATISTIK_AUSTRIA_RASTER_LAEA_3035.gpkg"), layer = "l000500") - -# delete folder -unlink(unzipfolder, recursive = TRUE) - -######################### -# define lookup for 500m, 1km and 2km gridcells -raster_cells <- data.table(L000500=l500$cellcode) -raster_cells[,y:=as.numeric(paste0(substr(L000500,6,10),"00"))] -raster_cells[,x:=as.numeric(paste0(substr(L000500,12,16),"00"))] -raster_cells[,c("y_1k","x_1k"):=.(floor(y/1000),floor(x/1000))] -raster_cells[,L001000:=paste0("1kmN",y_1k,"E",x_1k)] -raster_cells[,c("y_2k","x_2k"):=.(y_1k-y_1k%%2,x_1k-x_1k%%2)] -raster_cells[,L002000:=paste0("2kmN",y_2k,"E",x_2k)] -raster_cells[,c("x_1k","y_1k","x_2k","y_2k","x","y"):=NULL] -setorder(raster_cells,L002000,L001000,L000500) - -######################### -# randomly populate raster_cells to build dummy population -set.seed(202305) -share_unpop_cells <- runif(1,min=0.65,0.75) - -pop_data <- copy(raster_cells) -# populate some grid cells with 0 -pop_data[sample(.N,round(share_unpop_cells*.N)),pop_count:=0] -# remaining gridcells populate with some values between 1 and 10000 -pop_data[is.na(pop_count),pop_count:=sample(1:10000,.N,replace=TRUE,prob=1/(1:10000)^(1.455))] -pop_data[,sum(pop_count)] - -pop_data <- pop_data[pop_count>0,.(1:pop_count),by=.(L000500,L001000,L002000)] -pop_data[,V1:=NULL] -pop_data[,PID:=1:.N] # dummy PID -# coordinates of each person = 500m grid cell coordinates -pop_data[,y:=as.numeric(paste0(substr(L000500,6,10),"00"))] -pop_data[,x:=as.numeric(paste0(substr(L000500,12,16),"00"))] - -######################### -# save data -save(pop_data,raster_cells,file="test_data.RData",compress=TRUE) - diff --git a/pop_data_DE.RData b/pop_data_DE.RData new file mode 100644 index 0000000..6b71573 Binary files /dev/null and b/pop_data_DE.RData differ diff --git a/sdcSpatial_template.R b/sdcSpatial_template.R deleted file mode 100644 index 2bb3492..0000000 --- a/sdcSpatial_template.R +++ /dev/null @@ -1,243 +0,0 @@ -####################################################################### -# Script to apply sdcSpatial on 500/1000/2000 m grid count tables -# -# Script applies the following steps (in that order) -# -# - load needed libraries -# - load dummy population data -# - define "raster"-object for grid cells to be used with sdcSpatial -# - define sdc_raster object using population data and raster-object from before -# - check sensitive cells -# - apply protection method; check sensitive cells again and suppress if necessary -# - get protected table and apply information loss function -# - check if the original value of protected cells can be re-estimated -# - -################################## -# load libraries -library(data.table) -library(raster) -library(sf) -library(sdcSpatial) - - -################################## -# load population data -load("test_data.RData") -pop_data -# data.table should be one row per person and contain variables -# coordinates for each person (x, y) -# IDs for gridcells -# - L000500 (500m) -# - L001000 (1000m) -# - L002000 (2000m) -# PID (optional) - -raster_cells -# data.table containing all raster cells for a country even unpopulated ones -# should contain all grid cell IDs -# - L000500 (500m) -# - L001000 (1000m) -# - L002000 (2000m) - -# define cell size -m <- 500 # dimension of rasters in meters -geo <- paste(c("L",paste(rep(0,6-nchar(m))),m),collapse="") - -# make table original cell counts -grid_data <- pop_data[,.(count_original=.N),by=c(geo)] -grid_data <- merge(grid_data, raster_cells, by=c(geo),all=TRUE) - -# get center coordinates of grid cells - needed later -grid_data[,y:=as.numeric(paste0(substr(get(geo),6,10),"00"))] -grid_data[,x:=as.numeric(paste0(substr(get(geo),12,16),"00"))] -grid_data[is.na(count_original),count_original:=0] - -################################## -# define "raster"-object for grid cells to be used with sdcSpatial - -# download and read in shapefiles -path <- paste0("https://data.statistik.gv.at/data/OGDEXT_RASTER_1_STATISTIK_AUSTRIA_", - geo,"_LAEA.zip") -layer <- paste0("STATISTIK_AUSTRIA_", - geo,"_LAEA") - -zipname <- basename(path) -# download .zip -my_shapefile <- paste0(getwd(),"/",zipname) -download.file(path,my_shapefile) -# unzip file -unzipfolder <- paste0(sub(".zip","",my_shapefile)) -unzip(my_shapefile,exdir=unzipfolder) -# read shape file -area_shape <- sf::st_read(dsn = unzipfolder, layer = layer) -# delete zip file -file.remove(my_shapefile) -# delete folder optional -# unlink(unzipfolder, recursive = TRUE) - -# create raster object to be used by sdcSpatial -bounds <- sf::st_bbox(area_shape) -ncol <- (bounds$xmax-bounds$xmin)/m+1 -nrow <- (bounds$ymax-bounds$ymin)/m+1 -projection <- sf::st_crs(area_shape)$proj4string -r <- raster::raster(nrows=nrow,ncols=ncol, - xmn = bounds$xmin-m/2, - xmx = bounds$xmax+m/2, - ymn = bounds$ymin-m/2, - ymx = bounds$ymax+m/2, - crs = sp::CRS(projection)) - -# for overview on projections see: -# https://www.nceas.ucsb.edu/sites/default/files/2020-04/OverviewCoordinateReferenceSystems.pdf - -################################## -# define sdc_raster object using population data and raster-object from before - -# convert populatin data to SpatialPointsDataFrame -pop_data[,value:=1] # set value needed for sdcSpatial -sp_data <- sp::SpatialPointsDataFrame(coords = as.data.frame(pop_data[,.(x,y)]), - proj4string = sp::CRS(projection), - data = as.data.frame(pop_data)) - -# define sdc_raster object -pop_raster <- sdc_raster(x = sp_data, # SpatialPointsDataFrame holding the data - variable = "value", # variable cannot be missing - r = r, # set specific raster - min_count = 5 # ~ cells should have at least min_count persons - ) - -# object summary -pop_raster -# mean sensitivity score: share of sensitive cells - -# plot object -plot(pop_raster, - value="count", # what value should be plotted - col=RColorBrewer::brewer.pal(8,"Blues")[-c(1:2)]) - - - -################################## -# check sensitive cells -sensitive_cells <- is_sensitive(pop_raster) # get sensitive cells -data_sens <- as.data.frame(sensitive_cells,xy=TRUE) # convert to data frame -setDT(data_sens) - -# compare with input data (sanity check) -grid_data[,sensitive_direct:=count_original<5 & count_original>0] -grid_data <- merge(grid_data,data_sens,by=c("x","y"),all=TRUE) -# filter by all grid cells which are part of country -# use shape file for this -grid_data[,part_of_country:=get(geo) %in% area_shape$ID] -grid_data[,.N,by=.(sensitive,sensitive_direct,part_of_country)] - - - -################################## -# apply protection method; check sensitive cells again and suppress if necessary - -# 3 different methods implemented in sdcSpatial -# protect_quadtree() -> specifically for count data -# protect_neighborhood() -> sum over neighborhood in circle? -# protect_smooth() -> seems to be more applicable for numerical data - -### -# apply quadtree -> example from ?protect_quadtree() -fined <- sdc_raster(enterprises, enterprises$fined, r=50) -plot(fined) -fined_qt <- protect_quadtree(fined) -plot(fined_qt) -### - -# apply on own data -protected_raster <- protect_quadtree(pop_raster, - max_zoom = 2 # number of zoom-steps - 1 - ) -protected_raster # mean sensitivity for protected cells -plot(protected_raster, - value="count", # what value should be plotted - col=RColorBrewer::brewer.pal(8,"Blues")[-c(1:2)]) - -# get protected data in data.frame format for further analysis -protected_data <- as.data.frame(protected_raster$value,xy=TRUE) -setDT(protected_data) -grid_data[protected_data,count_quadtree:=count,on=.(x,y)] -grid_data - -# apply new sensitivity flag -# cells which are still sensitive will be supressed afterwards -sensitive_cells <- is_sensitive(protected_raster) -data_sens <- as.data.frame(sensitive_cells,xy=TRUE) -setDT(data_sens) -grid_data[data_sens,sensitive:=i.sensitive,on=.(x,y)] -grid_data[,.N,by=.(sensitive,part_of_country)] -grid_data[!is.na(sensitive) & part_of_country==FALSE] -# some cells have values which are not party of country - -################################## -# get protected table and apply information loss function -grid_data[,suppress_value:=FALSE] -grid_data[sensitive==TRUE,suppress_value:=TRUE] # supress all cells which are still sensitive - -grid_data[is.na(count_original),count_original:=0] -grid_data[is.na(count_quadtree),count_quadtree:=0] -infoLoss <- cellKey::ck_cnt_measures(grid_data[suppress_value==FALSE]$count_original, - grid_data[suppress_value==FALSE]$count_quadtree) -infoLoss -# infoloss/noise is quite high in some cases -# appears if neighbouring cells have very different cell count and are averaged -# infoloss should probably be scaled with some distance - -# share of supressed values -grid_data[part_of_country==TRUE & count_original>0,mean(suppress_value)] -# this is less than mean sensitivity score [0,1]: 0.1247862 -# additional 500m raster which were not populated before -# some are not part of country - -# get share of cells which have been aggregated -cells_aggregated <- grid_data[count_original!=count_quadtree,uniqueN(L000500)] -cells_total <- grid_data[,uniqueN(L000500)] -cells_aggregated/cells_total - - -################################## -# use higher level grid data and try to get an estimate on grid cell values - -# higher grid level -geo2 <- paste0("L00",m*2) -count_geo <- paste0("count",geo2) -grid_data[part_of_country==TRUE,c(count_geo):=sum(count_original),by=c(geo2)] - -# aggregated grid is the same as predefined higher level grid used by protect_quadtree -grid_data[part_of_country==TRUE,.(sum(count_original),sum(count_quadtree)),by=c(geo2)][V1==V2] - -# example of raster plot -r <- raster(nrows=4, ncols=4) -r <- setValues(r, sample(1:ncell(r))) -plot(r,col=RColorBrewer::brewer.pal(9,"Blues")) - -# check how close protected values are to original ones -# filter on cells which where used during protection and are now no longer sensitive -# all cells with values after the decimal point were surely used during protection -cells_protected <- grid_data[count_quadtree%%1!=0 & part_of_country==TRUE & sensitive == FALSE,unique(get(geo2))] -grid_data[get(geo2) %in% cells_protected,quantile(count_quadtree-count_original,probs=seq(0,1,.1))] - -# check distance between original and protected value of previously sensitive cells -tab1 <- grid_data[get(geo2) %in% cells_protected & sensitive_direct==TRUE & sensitive == FALSE] -tab1 <- tab1[,.N,by=.(diff=abs(count_quadtree-count_original))] # group by abs difference -tab1 <- tab1[order(diff)] -tab1[,cumulative_p:=round(cumsum(N/sum(N))*100,2)] -tab1 -tab1[diff<1] # % of protected cells which are less than 1 away from original -tab1[1:10] - -# are there sensitive cells which are exactly the same after applying quadtree -cells_protected <- grid_data[sensitive_direct == TRUE,unique(get(geo2))] -tab1 <- grid_data[get(geo2) %in% cells_protected & sensitive_direct==TRUE & sensitive == FALSE] -tab1 <- tab1[,.N,by=.(diff=abs(count_quadtree-count_original))] # group by abs difference -tab1 <- tab1[order(diff)] -tab1[,cumulative_p:=round(cumsum(N/sum(N))*100,2)] -tab1 -tab1[diff==0] # % of sensitive cells which stay the same -tab1[1:10]