From b5f8d2700984dfe9b22bb3d38e1c0d3e346ed0de Mon Sep 17 00:00:00 2001 From: avisionh Date: Fri, 27 Nov 2020 12:51:44 +0000 Subject: [PATCH 1/9] Import Excel and csv files This is so we can link up the data. --- src/make_data/link_data.R | 55 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 src/make_data/link_data.R diff --git a/src/make_data/link_data.R b/src/make_data/link_data.R new file mode 100644 index 0000000..d1574c0 --- /dev/null +++ b/src/make_data/link_data.R @@ -0,0 +1,55 @@ +library(googledrive) +library(readr) +library(readxl) +library(httr) +library(dplyr) + +# get list of all files +file_main <- drive_get(path = as_id(x = "https://drive.google.com/file/d/1PS9xQIP_O048rGb-uvwPonyrV_3CCYc4/view?usp=sharing")) +files <- drive_ls( + path = as_id(x = "https://drive.google.com/drive/u/0/folders/1sfavbXr3UAqfd_zWAuDChvC7Hnb69gi5"), + type = "csv" +) + +# store Excel file temporarily and import +GET( + url = file_main$drive_resource[[1]]$webContentLink, + write_disk(tf <- tempfile()) +) + +# import Excel sheet +sheet_questions <- read_excel( + path = tf, + sheet = "List of Questions", + col_names = FALSE +) +sheet_questionnaires <- read_excel( + path = tf, + sheet = "Questionnaires", + skip = 1, + col_names = TRUE +) + +# rename columns +sheet_questions <- rename( + .data = sheet_questions, + measure = `...1`, + questionnaire = `...2`, + question = `...3` +) +sheet_questionnaires <- select(.data = sheet_questionnaires, pupil_id:dSEND) + +# import each csv from gdrive +list_df <- list() +j <- 1 + +for (i in files$drive_resource) { + link <- i$webContentLink + df <- read_csv(file = link) + # store in list + list_df[[j]] <- df + j <- j + 1 +} + +# name list for easy access of each df +names(list_df) <- files$name From 46ed15420f4ed823580918046216dcca445ca634 Mon Sep 17 00:00:00 2001 From: avisionh Date: Fri, 27 Nov 2020 15:39:14 +0000 Subject: [PATCH 2/9] Join datasets together This is so we can get in a format 
ImpactEd are expecting. --- src/make_data/01_link_data.R | 59 ++++++++++++++++++++++++++++++++++++ src/make_data/02_link_data.R | 21 +++++++++++++ src/make_data/link_data.R | 6 +++- 3 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 src/make_data/01_link_data.R create mode 100644 src/make_data/02_link_data.R diff --git a/src/make_data/01_link_data.R b/src/make_data/01_link_data.R new file mode 100644 index 0000000..e267e4c --- /dev/null +++ b/src/make_data/01_link_data.R @@ -0,0 +1,59 @@ +library(googledrive) +library(readr) +library(readxl) +library(httr) +library(dplyr) + +# get list of all files +file_main <- drive_get(path = as_id(x = "https://drive.google.com/file/d/1PS9xQIP_O048rGb-uvwPonyrV_3CCYc4/view?usp=sharing")) +files <- drive_ls( + path = as_id(x = "https://drive.google.com/drive/u/0/folders/1sfavbXr3UAqfd_zWAuDChvC7Hnb69gi5"), + type = "csv" +) + +# store Excel file temporarily +GET( + url = file_main$drive_resource[[1]]$webContentLink, + write_disk(tf <- tempfile()) +) + +# import Excel sheet +sheet_questions <- read_excel( + path = tf, + sheet = "List of Questions", + col_names = FALSE +) +sheet_questionnaires <- read_excel( + path = tf, + sheet = "Questionnaires", + skip = 1, + col_names = TRUE +) + +# rename columns +sheet_questions <- rename( + .data = sheet_questions, + measure = `...1`, + questionnaire = `...2`, + question = `...3` +) +sheet_questionnaires <- select(.data = sheet_questionnaires, pupil_id:dSEND) + +# import each csv from gdrive +list_df <- list() +j <- 1 + +for (i in files$drive_resource) { + link <- i$webContentLink + df <- read_csv(file = link) + # store in list + list_df[[j]] <- df + j <- j + 1 +} + +# name list for easy access of each df +names(list_df) <- files$name + +# clear environment +unlink(tf) +rm(df, file_main, files, i, j, link, tf) diff --git a/src/make_data/02_link_data.R b/src/make_data/02_link_data.R new file mode 100644 index 0000000..2e2195e --- /dev/null +++ b/src/make_data/02_link_data.R 
source("src/make_data/01_link_data.R")

library(dplyr)
library(tidyr)
library(purrr)


# Join every questionnaire data frame into one wide table, matching rows on
# the pupil/measurement identifiers. reduce() handles any number of files,
# including a single one — the previous `for (i in 1:(length(list_df) - 1))`
# loop iterated over 1:0 and failed in that case.
df_step <- reduce(
  .x = list_df,
  .f = full_join,
  by = c("pupil_id", "pupil_impacted_id", "measurement_date")
)

# We have repeated columns that exist in the source csv files; however,
# there are three valid responses in the repeated columns that are not in
# the non-repeated columns.
# NOTE(review): dropping the "_1" columns discards those three responses —
# confirm this is acceptable before relying on the output.
df <- df_step %>%
  select(!ends_with(match = "_1")) %>%
  # Rename the question columns to their questionnaire labels. rename_at()
  # expects a function for .funs (the old `list(<character vector>)` form
  # errored), so return the replacement names from a lambda; this assumes
  # sheet_questions$questionnaire lines up one-to-one with the selected
  # question columns — TODO confirm against the Excel sheet.
  rename_at(
    .vars = vars(sheet_questions$question),
    .funs = ~ sheet_questions$questionnaire
  )
# from Duncan - function for importing
#
# Import one questionnaire-responses CSV and reshape it to long format.
# Every column is read as character so mixed response types are never
# coerced, then all question columns are pivoted into (question, response)
# pairs.
#
# Deal with duplicated questions: three pupils in questionnaire 186 have a
# second set of columns to themselves, so we merge them back in by:
#
# 1. filtering out everyone's blank responses. That means that most pupils
#    only have responses to the first instance of each question, and the
#    three affected pupils only have responses to the second instance of
#    each question.
# 2. dropping the "_<n>" suffix that readr automatically adds to the second
#    instance of each question. Now all pupils are the same.
#
# @param file Path or URL of a CSV containing pupil_id, pupil_impacted_id,
#   measurement_date plus one column per question.
# @return A long tibble with columns pupil_id, pupil_impacted_id,
#   measurement_date, question, response (blank responses removed).
read_responses <- function(file) {
  read_csv(file, col_types = cols(.default = col_character())) %>%
    pivot_longer(-c(pupil_id, pupil_impacted_id, measurement_date),
      names_to = "question", values_to = "response"
    ) %>%
    filter(!is.na(response)) %>%
    mutate(question = str_remove(question, "_\\d+$"))
}
That means that most pupils + # only have responses to the first instance of each question, and the + # three affected pupils only have responses to the second instance of + # each question. + # 2. dropping the suffix that readr automatically adds to the second + # instance of each question. Now all pupils are the same. + # + # instance of each column. + filter(!is.na(response)) %>% + mutate(question = str_remove(question, "_\\d+$")) +} + + # get list of all files file_main <- drive_get(path = as_id(x = "https://drive.google.com/file/d/1PS9xQIP_O048rGb-uvwPonyrV_3CCYc4/view?usp=sharing")) files <- drive_ls( @@ -31,12 +54,14 @@ sheet_questionnaires <- read_excel( ) # rename columns -sheet_questions <- rename( - .data = sheet_questions, - measure = `...1`, - questionnaire = `...2`, - question = `...3` -) +sheet_questions <- sheet_questions %>% + rename( + .data = sheet_questions, + measure = `...1`, + questionnaire = `...2`, + question = `...3` + ) %>% + fill(measure, .direction = "down") sheet_questionnaires <- select(.data = sheet_questionnaires, pupil_id:dSEND) # import each csv from gdrive @@ -45,15 +70,15 @@ j <- 1 for (i in files$drive_resource) { link <- i$webContentLink - df <- read_csv(file = link) + df <- read_responses(file = link) # store in list list_df[[j]] <- df j <- j + 1 } -# name list for easy access of each df -names(list_df) <- files$name +responses <- map_dfr(.x = list_df, .f = rbind) %>% + mutate(measurement_date = parse_date(x = measurement_date, format = "%d/%m/%Y")) # clear environment unlink(tf) -rm(df, file_main, files, i, j, link, tf) +rm(df, file_main, files, i, j, link, list_df, tf) diff --git a/src/make_data/02_link_data.R b/src/make_data/02_link_data.R index 2e2195e..8bfc1fc 100644 --- a/src/make_data/02_link_data.R +++ b/src/make_data/02_link_data.R @@ -2,20 +2,43 @@ source("src/make_data/01_link_data.R") library(dplyr) library(tidyr) -library(purrr) - - -# join so in format requested -df_step <- list_df[[1]] -for (i in 
1:(length(list_df) - 1)) { - df_step <- df_step %>% - full_join(y = list_df[[i + 1]], by = c("pupil_id", "pupil_impacted_id", "measurement_date")) -} - -# we have repeated columns that exist in the source csv files -# however, there are three valid responses in the repeated columns that are not in the -# non-repeated columns -df <- df_step %>% - select(!ends_with(match = "_1")) %>% - rename_at(.vars = vars(sheet_questions$question), .funs = list(sheet_questions$questionnaire)) -df <- rename_at(.data) +library(stringr) + + +# get questionnaire info in to use as column names later +responses <- responses %>% + left_join(y = sheet_questions, by = "question") %>% + mutate(qq = paste0(questionnaire, " - ", question)) %>% + # remove duplicates + distinct() + + +# see if have unique combo of rows so can pivot_wider safely +responses %>% + group_by(pupil_id, pupil_impacted_id, measurement_date, qq) %>% + summarise(count = n()) %>% + filter(count > 1) +# have duplicates, here's an example +responses %>% + filter(pupil_id == "100165", measurement_date == "2020-09-09", qq == "207_10 - I have felt like I have missed important school work") + +# partition these duplicate responses with a row number; allocation of this is random +# this seems the best we can do +responses <- responses %>% + select(pupil_id:measurement_date, qq, response) %>% + group_by(pupil_id, pupil_impacted_id, measurement_date, qq) %>% + mutate(rank = row_number()) %>% + arrange(pupil_id, pupil_impacted_id, measurement_date, qq) + +# isolate unique records +responses_dedupe <- filter(.data = responses, rank == 1) + +# pivot wider for ImpactEd's purposes +df_output <- responses_dedupe %>% + pivot_wider( + id_cols = c(pupil_id, pupil_impacted_id, measurement_date), + names_from = "qq", + values_from = "response" + ) %>% + arrange(pupil_id, measurement_date) +write_csv(x = df_output, file = "data/processed/questionnaires_linked.csv") diff --git a/src/make_data/link_data.R b/src/make_data/link_data.R deleted 
file mode 100644 index ec44c45..0000000 --- a/src/make_data/link_data.R +++ /dev/null @@ -1,82 +0,0 @@ -library(googledrive) -library(readr) -library(readxl) -library(httr) -library(dplyr) - -# from Duncan - function for importing -read_responses <- function(file) { - read_csv(file, col_types = cols(.default = col_character())) %>% - pivot_longer(-c(pupil_id, pupil_impacted_id, measurement_date), - names_to = "question", values_to = "response" - ) %>% - # Deal with duplicated questions. - # Three pupils in questionnaire 186 have a set of columns to themselves, so - # we merge them back in by: - # - # 1. filtering out everyone's blank responses. That means that most pupils - # only have responses to the first instance of each question, and the - # three affected pupils only have responses to the second instance of - # each question. - # 2. dropping the suffix that readr automatically adds to the second - # instance of each question. Now all pupils are the same. - # - # instance of each column. 
- filter(!is.na(response)) %>% - mutate(question = str_remove(question, "_\\d+$")) -} - - -# get list of all files -file_main <- drive_get(path = as_id(x = "https://drive.google.com/file/d/1PS9xQIP_O048rGb-uvwPonyrV_3CCYc4/view?usp=sharing")) -files <- drive_ls( - path = as_id(x = "https://drive.google.com/drive/u/0/folders/1sfavbXr3UAqfd_zWAuDChvC7Hnb69gi5"), - type = "csv" -) - -# store Excel file temporarily -GET( - url = file_main$drive_resource[[1]]$webContentLink, - write_disk(tf <- tempfile()) -) - -# import Excel sheet -sheet_questions <- read_excel( - path = tf, - sheet = "List of Questions", - col_names = FALSE -) -sheet_questionnaires <- read_excel( - path = tf, - sheet = "Questionnaires", - skip = 1, - col_names = TRUE -) - -# rename columns -sheet_questions <- rename( - .data = sheet_questions, - measure = `...1`, - questionnaire = `...2`, - question = `...3` -) -sheet_questionnaires <- select(.data = sheet_questionnaires, pupil_id:dSEND) - -# import each csv from gdrive -list_df <- list() -j <- 1 - -for (i in files$drive_resource) { - link <- i$webContentLink - df <- read_responses(file = link) - # store in list - list_df[[j]] <- df - j <- j + 1 -} - -responses <- map_dfr(.x = list_df, .f = rbind) %>% - mutate(measurement_date = parse_date(x = measurement_date, format = "%d/%m/%Y")) - -# clear environment -unlink(tf) -rm(df, file_main, files, i, j, link, tf) From 508614c7517a1e558a1ca219c14d69927dab236e Mon Sep 17 00:00:00 2001 From: avisionh Date: Fri, 27 Nov 2020 21:50:38 +0000 Subject: [PATCH 5/9] Bump This is to update packages used in scripts. 
--- renv.lock | 187 +++++++---------------------------- src/make_data/02_link_data.R | 2 +- 2 files changed, 35 insertions(+), 154 deletions(-) diff --git a/renv.lock b/renv.lock index eab7438..06d956c 100644 --- a/renv.lock +++ b/renv.lock @@ -1,10 +1,10 @@ { "R": { - "Version": "4.0.3", + "Version": "4.0.2", "Repositories": [ { "Name": "CRAN", - "URL": "https://cran.ma.imperial.ac.uk" + "URL": "https://cran.rstudio.com" } ] }, @@ -25,10 +25,10 @@ }, "MASS": { "Package": "MASS", - "Version": "7.3-53", + "Version": "7.3-51.6", "Source": "Repository", "Repository": "CRAN", - "Hash": "d1bc1c8e9c0ace57ec9ffea01021d45f" + "Hash": "1dad32ac9dbd8057167b2979fb932ff7" }, "Matrix": { "Package": "Matrix", @@ -37,40 +37,12 @@ "Repository": "CRAN", "Hash": "08588806cba69f04797dab50627428ed" }, - "R.cache": { - "Package": "R.cache", - "Version": "0.14.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "1ca02d43e1a4d49e616bd23bb39b17e6" - }, - "R.methodsS3": { - "Package": "R.methodsS3", - "Version": "1.8.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "4bf6453323755202d5909697b6f7c109" - }, - "R.oo": { - "Package": "R.oo", - "Version": "1.24.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "5709328352717e2f0a9c012be8a97554" - }, - "R.utils": { - "Package": "R.utils", - "Version": "2.10.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "a9e316277ff12a43997266f2f6567780" - }, "R6": { "Package": "R6", - "Version": "2.4.1", + "Version": "2.5.0", "Source": "Repository", "Repository": "CRAN", - "Hash": "292b54f8f4b94669b08f94e5acce6be2" + "Hash": "b203113193e70978a696b2809525649d" }, "RColorBrewer": { "Package": "RColorBrewer", @@ -170,13 +142,6 @@ "Repository": "CRAN", "Hash": "6b436e95723d1f0e861224dd9b094dfb" }, - "commonmark": { - "Package": "commonmark", - "Version": "1.7", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "0f22be39ec1d141fd03683c06f3a6e67" - }, "cpp11": { "Package": "cpp11", "Version": "0.2.3", 
@@ -198,13 +163,6 @@ "Repository": "CRAN", "Hash": "2b7d10581cc730804e9ed178c8374bd6" }, - "cyclocomp": { - "Package": "cyclocomp", - "Version": "1.1.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "53cbed70a2f7472d48fb6aef08442f25" - }, "dbplyr": { "Package": "dbplyr", "Version": "1.4.4", @@ -221,17 +179,10 @@ }, "digest": { "Package": "digest", - "Version": "0.6.25", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "f697db7d92b7028c4b3436e9603fb636" - }, - "docopt": { - "Package": "docopt", - "Version": "0.7.1", + "Version": "0.6.27", "Source": "Repository", "Repository": "CRAN", - "Hash": "e9eeef7931ee99ca0093f3f20b88e09b" + "Hash": "a0cbe758a531d054b537d16dff4d58a1" }, "dplyr": { "Package": "dplyr", @@ -282,6 +233,13 @@ "Repository": "CRAN", "Hash": "44594a07a42e5f91fac9f93fda6d0109" }, + "gargle": { + "Package": "gargle", + "Version": "0.5.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "aaacaf8b0ec3dfe45df9eb6bc040db44" + }, "generics": { "Package": "generics", "Version": "0.0.2", @@ -296,20 +254,6 @@ "Repository": "CRAN", "Hash": "4ded8b439797f7b1693bd3d238d0106b" }, - "gh": { - "Package": "gh", - "Version": "1.1.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "89ea5998938d1ad55f035c8a86f96b74" - }, - "git2r": { - "Package": "git2r", - "Version": "0.27.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "531a82d1beed1f545beb25f4f5945bf7" - }, "glue": { "Package": "glue", "Version": "1.4.2", @@ -317,6 +261,13 @@ "Repository": "CRAN", "Hash": "6efd734b14c6471cfe443345f3e35e29" }, + "googledrive": { + "Package": "googledrive", + "Version": "1.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "79ba5d18133290a69b7c135dc3dfef1a" + }, "gtable": { "Package": "gtable", "Version": "0.3.0", @@ -366,20 +317,6 @@ "Repository": "CRAN", "Hash": "a525aba14184fec243f9eaec62fbed43" }, - "hunspell": { - "Package": "hunspell", - "Version": "3.0", - "Source": "Repository", - "Repository": 
"CRAN", - "Hash": "71e7853d60b6b4ba891d62ede21752e9" - }, - "ini": { - "Package": "ini", - "Version": "0.3.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "6154ec2223172bce8162d4153cda21f7" - }, "isoband": { "Package": "isoband", "Version": "0.2.2", @@ -415,13 +352,6 @@ "Repository": "CRAN", "Hash": "fbd9285028b0263d76d18c95ae51a53d" }, - "lazyeval": { - "Package": "lazyeval", - "Version": "0.2.2", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "d908914ae53b04d4c0c0fd72ecc35370" - }, "lifecycle": { "Package": "lifecycle", "Version": "0.2.0", @@ -429,13 +359,6 @@ "Repository": "CRAN", "Hash": "361811f31f71f8a617a9a68bf63f1f42" }, - "lintr": { - "Package": "lintr", - "Version": "2.0.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "023cecbdc0a32f86ad3cb1734c018d2e" - }, "lubridate": { "Package": "lubridate", "Version": "1.7.9", @@ -459,10 +382,10 @@ }, "mgcv": { "Package": "mgcv", - "Version": "1.8-33", + "Version": "1.8-31", "Source": "Repository", "Repository": "CRAN", - "Hash": "eb7b6439bc6d812eed2cddba5edc6be3" + "Hash": "4bb7e0c4f3557583e1e8d3c9ffb8ba5c" }, "mime": { "Package": "mime", @@ -487,10 +410,10 @@ }, "nlme": { "Package": "nlme", - "Version": "3.1-149", + "Version": "3.1-148", "Source": "Repository", "Repository": "CRAN", - "Hash": "7c24ab3a1e3afe50388eb2d893aab255" + "Hash": "662f52871983ff3e3ef042c62de126df" }, "openssl": { "Package": "openssl", @@ -510,7 +433,7 @@ "Package": "piton", "Version": "0.1.1", "Source": "Repository", - "Repository": null, + "Repository": "CRAN", "Hash": "c4e3ffb3a754a77c5fdf89145a2f7d1d" }, "pkgbuild": { @@ -597,20 +520,6 @@ "Repository": "CRAN", "Hash": "c66b930d20bb6d858cd18e1cebcfae5c" }, - "rematch2": { - "Package": "rematch2", - "Version": "2.1.2", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "76c9e04c712a05848ae7a23d2f170a40" - }, - "remotes": { - "Package": "remotes", - "Version": "2.2.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": 
"430a0908aee75b1fcba0e62857cab0ce" - }, "renv": { "Package": "renv", "Version": "0.12.0", @@ -625,13 +534,6 @@ "Repository": "CRAN", "Hash": "b06bfb3504cc8a4579fd5567646f745b" }, - "rex": { - "Package": "rex", - "Version": "1.2.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "093584b944440c5cd07a696b3c8e0e4c" - }, "rlang": { "Package": "rlang", "Version": "0.4.8", @@ -681,13 +583,6 @@ "Repository": "CRAN", "Hash": "3838071b66e0c566d55cc26bd6e27bf4" }, - "spelling": { - "Package": "spelling", - "Version": "2.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "b3a5ecc3351f41eb30ef87f65cbff390" - }, "stringi": { "Package": "stringi", "Version": "1.5.3", @@ -702,13 +597,6 @@ "Repository": "CRAN", "Hash": "0759e6b6c0957edb1311028a49a35e76" }, - "styler": { - "Package": "styler", - "Version": "1.3.2", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "60b23effde8d08a56a64ebeb92a32749" - }, "sys": { "Package": "sys", "Version": "3.4", @@ -772,13 +660,6 @@ "Repository": "CRAN", "Hash": "9926b1bcf0b8f907b5c1b1dd922875bd" }, - "usethis": { - "Package": "usethis", - "Version": "1.6.3", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "c541a7aed5f7fb3b487406bf92842e34" - }, "utf8": { "Package": "utf8", "Version": "1.1.4", @@ -786,6 +667,13 @@ "Repository": "CRAN", "Hash": "4a5081acfb7b81a572e4384a7aaf2af1" }, + "uuid": { + "Package": "uuid", + "Version": "0.1-4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "e4169eb989a5d03ccb6b628cad1b1b50" + }, "vctrs": { "Package": "vctrs", "Version": "0.3.4", @@ -828,13 +716,6 @@ "Repository": "CRAN", "Hash": "d4d71a75dd3ea9eb5fa28cc21f9585e2" }, - "xmlparsedata": { - "Package": "xmlparsedata", - "Version": "1.0.4", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "373bcee7aad3980799936749cfca6f24" - }, "yaml": { "Package": "yaml", "Version": "2.2.1", diff --git a/src/make_data/02_link_data.R b/src/make_data/02_link_data.R index 8bfc1fc..ccfa1a6 100644 --- 
a/src/make_data/02_link_data.R +++ b/src/make_data/02_link_data.R @@ -2,7 +2,7 @@ source("src/make_data/01_link_data.R") library(dplyr) library(tidyr) -library(stringr) + # get questionnaire info in to use as column names later From f60657b4ad0295ccd3b7a586f95bcd63859754fd Mon Sep 17 00:00:00 2001 From: avisionh Date: Sat, 28 Nov 2020 23:33:02 +0000 Subject: [PATCH 6/9] Move scripts to one This is because the second script cannot call the first script (causes RStudio to abort). One suspects it is to do with Google Authorisation not being able to apply across scripts. Move function to separate script so it can be tested more easily and to modularise code. --- src/make_data/01_link_data.R | 84 --------------------------- src/make_data/02_link_data.R | 44 --------------- src/make_data/link_data.R | 106 +++++++++++++++++++++++++++++++++++ src/utils/read_responses.R | 27 +++++++++ 4 files changed, 133 insertions(+), 128 deletions(-) delete mode 100644 src/make_data/01_link_data.R delete mode 100644 src/make_data/02_link_data.R create mode 100644 src/make_data/link_data.R create mode 100644 src/utils/read_responses.R diff --git a/src/make_data/01_link_data.R b/src/make_data/01_link_data.R deleted file mode 100644 index 699256e..0000000 --- a/src/make_data/01_link_data.R +++ /dev/null @@ -1,84 +0,0 @@ -library(googledrive) -library(readr) -library(readxl) -library(httr) -library(dplyr) - -# from Duncan - function for importing -read_responses <- function(file) { - read_csv(file, col_types = cols(.default = col_character())) %>% - pivot_longer(-c(pupil_id, pupil_impacted_id, measurement_date), - names_to = "question", values_to = "response" - ) %>% - # Deal with duplicated questions. - # Three pupils in questionnaire 186 have a set of columns to themselves, so - # we merge them back in by: - # - # 1. filtering out everyone's blank responses. 
That means that most pupils - # only have responses to the first instance of each question, and the - # three affected pupils only have responses to the second instance of - # each question. - # 2. dropping the suffix that readr automatically adds to the second - # instance of each question. Now all pupils are the same. - # - # instance of each column. - filter(!is.na(response)) %>% - mutate(question = str_remove(question, "_\\d+$")) -} - - -# get list of all files -file_main <- drive_get(path = as_id(x = "https://drive.google.com/file/d/1PS9xQIP_O048rGb-uvwPonyrV_3CCYc4/view?usp=sharing")) -files <- drive_ls( - path = as_id(x = "https://drive.google.com/drive/u/0/folders/1sfavbXr3UAqfd_zWAuDChvC7Hnb69gi5"), - type = "csv" -) - -# store Excel file temporarily -GET( - url = file_main$drive_resource[[1]]$webContentLink, - write_disk(tf <- tempfile()) -) - -# import Excel sheet -sheet_questions <- read_excel( - path = tf, - sheet = "List of Questions", - col_names = FALSE -) -sheet_questionnaires <- read_excel( - path = tf, - sheet = "Questionnaires", - skip = 1, - col_names = TRUE -) - -# rename columns -sheet_questions <- sheet_questions %>% - rename( - .data = sheet_questions, - measure = `...1`, - questionnaire = `...2`, - question = `...3` - ) %>% - fill(measure, .direction = "down") -sheet_questionnaires <- select(.data = sheet_questionnaires, pupil_id:dSEND) - -# import each csv from gdrive -list_df <- list() -j <- 1 - -for (i in files$drive_resource) { - link <- i$webContentLink - df <- read_responses(file = link) - # store in list - list_df[[j]] <- df - j <- j + 1 -} - -responses <- map_dfr(.x = list_df, .f = rbind) %>% - mutate(measurement_date = parse_date(x = measurement_date, format = "%d/%m/%Y")) - -# clear environment -unlink(tf) -rm(df, file_main, files, i, j, link, list_df, tf) diff --git a/src/make_data/02_link_data.R b/src/make_data/02_link_data.R deleted file mode 100644 index ccfa1a6..0000000 --- a/src/make_data/02_link_data.R +++ /dev/null @@ 
library(googledrive)
library(readr) # parse_date(), write_csv() — attach explicitly rather than
               # relying on the sourced helper script to pull it in
library(readxl)
library(httr)
library(dplyr)
library(purrr)
library(tidyr)

source("src/utils/read_responses.R")


# authorise access to gdrive
drive_auth()

# get list of all files
file_main <- drive_get(path = as_id(x = "https://drive.google.com/file/d/1PS9xQIP_O048rGb-uvwPonyrV_3CCYc4/view?usp=sharing"))
files <- drive_ls(
  path = as_id(x = "https://drive.google.com/drive/u/0/folders/1sfavbXr3UAqfd_zWAuDChvC7Hnb69gi5"),
  type = "csv"
)

# store Excel file temporarily
GET(
  url = file_main$drive_resource[[1]]$webContentLink,
  write_disk(tf <- tempfile())
)

# import Excel sheets
sheet_questions <- read_excel(
  path = tf,
  sheet = "List of Questions",
  col_names = FALSE
)
sheet_questionnaires <- read_excel(
  path = tf,
  sheet = "Questionnaires",
  skip = 1,
  col_names = TRUE
)

# rename columns and carry the measure label down over merged cells
sheet_questions <- sheet_questions %>%
  rename(
    measure = `...1`,
    questionnaire = `...2`,
    question = `...3`
  ) %>%
  fill(measure, .direction = "down")
sheet_questionnaires <- select(.data = sheet_questionnaires, pupil_id:dSEND)

# import each csv from gdrive (map() replaces the manual j-counter loop)
list_df <- map(
  .x = files$drive_resource,
  .f = ~ read_responses(file = .x$webContentLink)
)

# stack into one long table; bind_rows() is the direct form of the old
# map_dfr(.f = rbind)
responses <- bind_rows(list_df) %>%
  mutate(measurement_date = parse_date(x = measurement_date, format = "%d/%m/%Y"))

# clear environment
unlink(tf)
rm(file_main, files, list_df, tf)


# get questionnaire info in to use as column names later
responses <- responses %>%
  left_join(y = sheet_questions, by = "question") %>%
  mutate(qq = paste0(questionnaire, " - ", question)) %>%
  # remove exact duplicate rows
  distinct()


# see if we have a unique combo of rows so we can pivot_wider safely
# (.groups = "drop" so the check result carries no grouping)
responses %>%
  group_by(pupil_id, pupil_impacted_id, measurement_date, qq) %>%
  summarise(count = n(), .groups = "drop") %>%
  filter(count > 1)
# we have duplicates; here's an example
responses %>%
  filter(pupil_id == "100165", measurement_date == "2020-09-09", qq == "207_10 - I have felt like I have missed important school work")

# partition these duplicate responses with a row number; which duplicate
# gets rank 1 is arbitrary — this seems the best we can do
responses <- responses %>%
  select(pupil_id:measurement_date, qq, response) %>%
  group_by(pupil_id, pupil_impacted_id, measurement_date, qq) %>%
  mutate(rank = row_number()) %>%
  # drop the grouping so downstream verbs operate on the whole table instead
  # of silently working group-by-group
  ungroup() %>%
  arrange(pupil_id, pupil_impacted_id, measurement_date, qq)

# isolate unique records
responses_dedupe <- filter(.data = responses, rank == 1)

# pivot wider for ImpactEd's purposes and export for sharing
df_output <- responses_dedupe %>%
  pivot_wider(
    id_cols = c(pupil_id, pupil_impacted_id, measurement_date),
    names_from = "qq",
    values_from = "response"
  ) %>%
  arrange(pupil_id, measurement_date)
write_csv(x = df_output, file = "data/processed/questionnaires_linked.csv")
+ filter(!is.na(response)) %>% + mutate(question = str_remove(question, "_\\d+$")) +} From cf6193c1c7886b54794c30a8ae6ae9ff28b56728 Mon Sep 17 00:00:00 2001 From: avisionh Date: Wed, 30 Dec 2020 23:31:47 +0000 Subject: [PATCH 7/9] Wrangle data for long plotting This is so we can see how respondents answered over time. --- notebooks/longitudinal-tracking/analysis.R | 48 ++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 notebooks/longitudinal-tracking/analysis.R diff --git a/notebooks/longitudinal-tracking/analysis.R b/notebooks/longitudinal-tracking/analysis.R new file mode 100644 index 0000000..ba52461 --- /dev/null +++ b/notebooks/longitudinal-tracking/analysis.R @@ -0,0 +1,48 @@ +library(readr) +library(lubridate) +library(dplyr) +library(tidyr) + +df <- read_csv(file = "data/processed/questionnaires_linked.csv") + +# focus on: +# - Wellbeing: Q185 +# - Anxiety: Q206 +# - Remote-learning: Q188 + +# "Is it possible to track a micro-cohort of pupils across the whole period?" 
+# "Focusing on those respondents who have multiple responses in the period +# – there should be a good number of pupils with at least 4 or more responses over the time period" +pupils_long <- df %>% + count(pupil_id) %>% + filter(n > 4) %>% + distinct(pupil_id) %>% + pull() + +# are pupils who have returned multiple surveys within the same month +# as we're informed that ideally, each student should be filling out a survey every month +# then there's something possibly dodgy about multiple returns in the same month +# so let's get these students and isolate them +pupils_return_several_in_one_month <- df %>% + filter(pupil_id %in% pupils_long) %>% + # check they have completed multiple returns across several months + # as we want to avoid duplicates + mutate(measurement_month = month(measurement_date)) %>% + group_by(pupil_id, measurement_month) %>% + tally() %>% + filter(n > 1) %>% + distinct(pupil_id) %>% + pull() + +df_wellbeing <- df %>% + # filter for students with >= 4 returns + # and students who did not return more than one response in same month + filter((pupil_id %in% pupils_long) & !(pupil_id %in% pupils_return_several_in_one_month)) %>% + # select Q to focus on + select(pupil_id, measurement_date, starts_with(match = "185")) %>% + # unpivot + pivot_longer( + cols = -c("pupil_id", "measurement_date"), + names_to = "question", + values_to = "response" + ) From 2f65d3834bd0280942560c038266b2e40e61e141 Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 3 Jan 2021 18:04:05 +0000 Subject: [PATCH 8/9] Add individual level plot of wellbeing against time This is so we can see how people have responded over time. 
# patch 8: individual-level plot of wellbeing against time, so we can see how
# each pupil responded month by month
library(ggplot2)

# extract month for simplification (factor so ggplot treats it as discrete)
df_wellbeing <- df_wellbeing %>%
  mutate(measurement_month = as.factor(month(measurement_date)))


# suggestion i:
# I guess my go-to would be % of respondents rating 'highly likely' per data point, with time as x-axis?
# So it could be a stacked area / bar chart / line chart if you decide to include the breaks as well


# suggestion ii:
# geom point + jitter them + colour them by ordinal scale (1 - 7 or whatever) + use borders
# if you need to highlight the micro-cohorts + x-axis is time
ggplot(data = df_wellbeing) +
  geom_point(
    mapping = aes(
      x = measurement_month,
      y = as.factor(pupil_id),
      colour = as.factor(response)
    )
  ) +
  facet_grid(. ~ question)
# patch 9 (part 1): the commit also adds "^.*\.Rproj$" and "^\.Rproj\.user$"
# to .Rbuildignore so RStudio project files stay out of the package build.

df_wellbeing <- df_wellbeing %>%
  # extract month for simplification
  mutate(
    measurement_month = factor(x = month(measurement_date)),
    # keep missing responses as an explicit first level. factor() silently
    # DROPS NA from an explicit `levels` vector unless exclude = NULL, so the
    # original levels = c(NA, 1:5) produced a plain 1-5 factor with NA values
    # rather than an NA level.
    response = factor(
      x = response,
      levels = c(NA, seq(from = 1, to = 5, by = 1)),
      exclude = NULL,
      ordered = TRUE
    )
  )
# So it could be a stacked area / bar chart / line chart if you decide to include the breaks as well
# count responses per month and rating, and label each segment with its share
# of that month's responses
df_stack <- df_wellbeing %>%
  count(measurement_month, response, name = "counts") %>%
  group_by(measurement_month) %>%
  # percent() comes from the scales package loaded above (previously unused);
  # it replaces the manual paste0(round(...), '%') formatting
  mutate(label = percent(x = counts / sum(counts), accuracy = 0.01)) %>%
  ungroup()

ggplot(data = df_stack, mapping = aes(x = measurement_month,
                                      y = counts,
                                      fill = response)) +
  # geom_col() is the idiomatic spelling of geom_bar(stat = 'identity')
  geom_col() +
  geom_text(mapping = aes(label = label),
            position = position_stack(vjust = 0.5))


# suggestion ii:
# geom point + jitter them + colour them by ordinal scale (1 - 7 or whatever) + use borders
# if you need to highlight the micro-cohorts + x-axis is time
ggplot(data = df_wellbeing,
       mapping = aes(x = measurement_month,
                     y = as.factor(pupil_id),
                     colour = response)) +
  geom_point() +
  facet_grid(. ~ question)


# suggestion iii:
# graph visualisation where nodes are responses and edges are % of responses following this path

# no. of surveys each pupil completes
n_surveys <- 4

# no. of levels (likert)
lvls <- list(1:5)

# enumerate every possible sequence of responses across the n surveys:
# expand.grid() gives the cartesian product (5^4 = 625 rows here), i.e. all
# combinations with repetition - not permutations of a single set as the
# original comment suggested
combos <- expand.grid(rep(x = lvls, times = n_surveys))