diff --git a/.Rbuildignore b/.Rbuildignore
index caf548c..2c82ca8 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -1,3 +1,5 @@
 ^renv$
 ^renv\.lock$
 ^requirements\.txt$
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/notebooks/longitudinal-tracking/analysis.R b/notebooks/longitudinal-tracking/analysis.R
new file mode 100644
index 0000000..5826c85
--- /dev/null
+++ b/notebooks/longitudinal-tracking/analysis.R
@@ -0,0 +1,97 @@
+library(readr)
+library(lubridate)
+library(dplyr)
+library(tidyr)
+library(ggplot2)
+library(scales)
+
+df <- read_csv(file = "data/processed/questionnaires_linked.csv")
+
+# focus on:
+# - Wellbeing: Q185
+# - Anxiety: Q206
+# - Remote-learning: Q188
+
+# "Is it possible to track a micro-cohort of pupils across the whole period?"
+# "Focusing on those respondents who have multiple responses in the period
+# – there should be a good number of pupils with at least 4 or more responses over the time period"
+pupils_long <- df %>%
+  count(pupil_id) %>%
+  filter(n > 4) %>%
+  distinct(pupil_id) %>%
+  pull()
+
+# there are pupils who have returned multiple surveys within the same month
+# as we're informed that ideally, each student should be filling out a survey every month
+# then there's something possibly dodgy about multiple returns in the same month
+# so let's get these students and isolate them
+pupils_return_several_in_one_month <- df %>%
+  filter(pupil_id %in% pupils_long) %>%
+  # check they have completed multiple returns across several months
+  # as we want to avoid duplicates
+  mutate(measurement_month = month(measurement_date)) %>%
+  group_by(pupil_id, measurement_month) %>%
+  tally() %>%
+  filter(n > 1) %>%
+  distinct(pupil_id) %>%
+  pull()
+
+df_wellbeing <- df %>%
+  # filter for students with more than 4 returns (n > 4; NOTE(review): the brief asked for "at least 4" — confirm threshold)
+  # and students who did not return more than one response in same month
+  filter((pupil_id %in% pupils_long) & !(pupil_id %in% pupils_return_several_in_one_month)) %>%
+  # select Q to focus on
+  select(pupil_id, measurement_date, 
starts_with(match = "185")) %>% + # unpivot + pivot_longer( + cols = -c("pupil_id", "measurement_date"), + names_to = "question", + values_to = "response" + ) %>% + # extract month for simplification + mutate(measurement_month = factor(x = month(measurement_date)), + response = factor(x = response, + levels = c(NA, seq(from = 1, to = 5, by = 1)), + ordered = TRUE)) + + +# suggestion i: +# I guess my go-to would be % of respondents rating 'highly likely' per data point, with time as x-axis? +# So it could be a stacked area / bar chart / line chart if you decide to include the breaks as well +df_stack <- df_wellbeing %>% + group_by(measurement_month, response) %>% + tally() %>% + rename('counts' = 'n') %>% + mutate(label = paste0(round(x = 100 * counts / sum(counts), digits = 2), '%')) + +ggplot(data = df_stack, mapping = aes(x = measurement_month, + y = counts, + fill = response)) + + geom_bar(stat = 'identity') + + geom_text(mapping = aes(label = label), + position = position_stack(vjust = 0.5)) + + +# suggestion ii: +# geom point + jitter them + colour them by ordinal scale (1 - 7 or whatever) + use borders +# if you need to highlight the micro-cohorts + x-axis is time +ggplot(data = df_wellbeing, + mapping = aes(x = measurement_month, + y = as.factor(pupil_id), + colour = response)) + + geom_point() + + facet_grid(. ~ question) + +# suggestion iii: +# graph visualisation where nodes are responses and edges are % of responses following this path + +# no. of surveys they complete +n <- 4 + +# no. of levels (likert) +lvls <- list(1:5) + +l <- rep(x = lvls, n) + +# get no. 
of unique permutations +combos <- expand.grid(l) diff --git a/renv.lock b/renv.lock index eab7438..06d956c 100644 --- a/renv.lock +++ b/renv.lock @@ -1,10 +1,10 @@ { "R": { - "Version": "4.0.3", + "Version": "4.0.2", "Repositories": [ { "Name": "CRAN", - "URL": "https://cran.ma.imperial.ac.uk" + "URL": "https://cran.rstudio.com" } ] }, @@ -25,10 +25,10 @@ }, "MASS": { "Package": "MASS", - "Version": "7.3-53", + "Version": "7.3-51.6", "Source": "Repository", "Repository": "CRAN", - "Hash": "d1bc1c8e9c0ace57ec9ffea01021d45f" + "Hash": "1dad32ac9dbd8057167b2979fb932ff7" }, "Matrix": { "Package": "Matrix", @@ -37,40 +37,12 @@ "Repository": "CRAN", "Hash": "08588806cba69f04797dab50627428ed" }, - "R.cache": { - "Package": "R.cache", - "Version": "0.14.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "1ca02d43e1a4d49e616bd23bb39b17e6" - }, - "R.methodsS3": { - "Package": "R.methodsS3", - "Version": "1.8.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "4bf6453323755202d5909697b6f7c109" - }, - "R.oo": { - "Package": "R.oo", - "Version": "1.24.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "5709328352717e2f0a9c012be8a97554" - }, - "R.utils": { - "Package": "R.utils", - "Version": "2.10.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "a9e316277ff12a43997266f2f6567780" - }, "R6": { "Package": "R6", - "Version": "2.4.1", + "Version": "2.5.0", "Source": "Repository", "Repository": "CRAN", - "Hash": "292b54f8f4b94669b08f94e5acce6be2" + "Hash": "b203113193e70978a696b2809525649d" }, "RColorBrewer": { "Package": "RColorBrewer", @@ -170,13 +142,6 @@ "Repository": "CRAN", "Hash": "6b436e95723d1f0e861224dd9b094dfb" }, - "commonmark": { - "Package": "commonmark", - "Version": "1.7", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "0f22be39ec1d141fd03683c06f3a6e67" - }, "cpp11": { "Package": "cpp11", "Version": "0.2.3", @@ -198,13 +163,6 @@ "Repository": "CRAN", "Hash": "2b7d10581cc730804e9ed178c8374bd6" }, - 
"cyclocomp": { - "Package": "cyclocomp", - "Version": "1.1.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "53cbed70a2f7472d48fb6aef08442f25" - }, "dbplyr": { "Package": "dbplyr", "Version": "1.4.4", @@ -221,17 +179,10 @@ }, "digest": { "Package": "digest", - "Version": "0.6.25", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "f697db7d92b7028c4b3436e9603fb636" - }, - "docopt": { - "Package": "docopt", - "Version": "0.7.1", + "Version": "0.6.27", "Source": "Repository", "Repository": "CRAN", - "Hash": "e9eeef7931ee99ca0093f3f20b88e09b" + "Hash": "a0cbe758a531d054b537d16dff4d58a1" }, "dplyr": { "Package": "dplyr", @@ -282,6 +233,13 @@ "Repository": "CRAN", "Hash": "44594a07a42e5f91fac9f93fda6d0109" }, + "gargle": { + "Package": "gargle", + "Version": "0.5.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "aaacaf8b0ec3dfe45df9eb6bc040db44" + }, "generics": { "Package": "generics", "Version": "0.0.2", @@ -296,20 +254,6 @@ "Repository": "CRAN", "Hash": "4ded8b439797f7b1693bd3d238d0106b" }, - "gh": { - "Package": "gh", - "Version": "1.1.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "89ea5998938d1ad55f035c8a86f96b74" - }, - "git2r": { - "Package": "git2r", - "Version": "0.27.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "531a82d1beed1f545beb25f4f5945bf7" - }, "glue": { "Package": "glue", "Version": "1.4.2", @@ -317,6 +261,13 @@ "Repository": "CRAN", "Hash": "6efd734b14c6471cfe443345f3e35e29" }, + "googledrive": { + "Package": "googledrive", + "Version": "1.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "79ba5d18133290a69b7c135dc3dfef1a" + }, "gtable": { "Package": "gtable", "Version": "0.3.0", @@ -366,20 +317,6 @@ "Repository": "CRAN", "Hash": "a525aba14184fec243f9eaec62fbed43" }, - "hunspell": { - "Package": "hunspell", - "Version": "3.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "71e7853d60b6b4ba891d62ede21752e9" - }, - "ini": { - "Package": "ini", - 
"Version": "0.3.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "6154ec2223172bce8162d4153cda21f7" - }, "isoband": { "Package": "isoband", "Version": "0.2.2", @@ -415,13 +352,6 @@ "Repository": "CRAN", "Hash": "fbd9285028b0263d76d18c95ae51a53d" }, - "lazyeval": { - "Package": "lazyeval", - "Version": "0.2.2", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "d908914ae53b04d4c0c0fd72ecc35370" - }, "lifecycle": { "Package": "lifecycle", "Version": "0.2.0", @@ -429,13 +359,6 @@ "Repository": "CRAN", "Hash": "361811f31f71f8a617a9a68bf63f1f42" }, - "lintr": { - "Package": "lintr", - "Version": "2.0.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "023cecbdc0a32f86ad3cb1734c018d2e" - }, "lubridate": { "Package": "lubridate", "Version": "1.7.9", @@ -459,10 +382,10 @@ }, "mgcv": { "Package": "mgcv", - "Version": "1.8-33", + "Version": "1.8-31", "Source": "Repository", "Repository": "CRAN", - "Hash": "eb7b6439bc6d812eed2cddba5edc6be3" + "Hash": "4bb7e0c4f3557583e1e8d3c9ffb8ba5c" }, "mime": { "Package": "mime", @@ -487,10 +410,10 @@ }, "nlme": { "Package": "nlme", - "Version": "3.1-149", + "Version": "3.1-148", "Source": "Repository", "Repository": "CRAN", - "Hash": "7c24ab3a1e3afe50388eb2d893aab255" + "Hash": "662f52871983ff3e3ef042c62de126df" }, "openssl": { "Package": "openssl", @@ -510,7 +433,7 @@ "Package": "piton", "Version": "0.1.1", "Source": "Repository", - "Repository": null, + "Repository": "CRAN", "Hash": "c4e3ffb3a754a77c5fdf89145a2f7d1d" }, "pkgbuild": { @@ -597,20 +520,6 @@ "Repository": "CRAN", "Hash": "c66b930d20bb6d858cd18e1cebcfae5c" }, - "rematch2": { - "Package": "rematch2", - "Version": "2.1.2", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "76c9e04c712a05848ae7a23d2f170a40" - }, - "remotes": { - "Package": "remotes", - "Version": "2.2.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "430a0908aee75b1fcba0e62857cab0ce" - }, "renv": { "Package": "renv", "Version": "0.12.0", @@ -625,13 
+534,6 @@ "Repository": "CRAN", "Hash": "b06bfb3504cc8a4579fd5567646f745b" }, - "rex": { - "Package": "rex", - "Version": "1.2.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "093584b944440c5cd07a696b3c8e0e4c" - }, "rlang": { "Package": "rlang", "Version": "0.4.8", @@ -681,13 +583,6 @@ "Repository": "CRAN", "Hash": "3838071b66e0c566d55cc26bd6e27bf4" }, - "spelling": { - "Package": "spelling", - "Version": "2.1", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "b3a5ecc3351f41eb30ef87f65cbff390" - }, "stringi": { "Package": "stringi", "Version": "1.5.3", @@ -702,13 +597,6 @@ "Repository": "CRAN", "Hash": "0759e6b6c0957edb1311028a49a35e76" }, - "styler": { - "Package": "styler", - "Version": "1.3.2", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "60b23effde8d08a56a64ebeb92a32749" - }, "sys": { "Package": "sys", "Version": "3.4", @@ -772,13 +660,6 @@ "Repository": "CRAN", "Hash": "9926b1bcf0b8f907b5c1b1dd922875bd" }, - "usethis": { - "Package": "usethis", - "Version": "1.6.3", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "c541a7aed5f7fb3b487406bf92842e34" - }, "utf8": { "Package": "utf8", "Version": "1.1.4", @@ -786,6 +667,13 @@ "Repository": "CRAN", "Hash": "4a5081acfb7b81a572e4384a7aaf2af1" }, + "uuid": { + "Package": "uuid", + "Version": "0.1-4", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "e4169eb989a5d03ccb6b628cad1b1b50" + }, "vctrs": { "Package": "vctrs", "Version": "0.3.4", @@ -828,13 +716,6 @@ "Repository": "CRAN", "Hash": "d4d71a75dd3ea9eb5fa28cc21f9585e2" }, - "xmlparsedata": { - "Package": "xmlparsedata", - "Version": "1.0.4", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "373bcee7aad3980799936749cfca6f24" - }, "yaml": { "Package": "yaml", "Version": "2.2.1", diff --git a/src/make_data/link_data.R b/src/make_data/link_data.R new file mode 100644 index 0000000..b44dda1 --- /dev/null +++ b/src/make_data/link_data.R @@ -0,0 +1,106 @@ +library(googledrive) +library(readxl) 
+library(httr) +library(dplyr) +library(purrr) +library(tidyr) + +source("src/utils/read_responses.R") + + +# authorise access to gdrive +drive_auth() + +# get list of all files +file_main <- drive_get(path = as_id(x = "https://drive.google.com/file/d/1PS9xQIP_O048rGb-uvwPonyrV_3CCYc4/view?usp=sharing")) +files <- drive_ls( + path = as_id(x = "https://drive.google.com/drive/u/0/folders/1sfavbXr3UAqfd_zWAuDChvC7Hnb69gi5"), + type = "csv" +) + +# store Excel file temporarily +GET( + url = file_main$drive_resource[[1]]$webContentLink, + write_disk(tf <- tempfile()) +) + +# import Excel sheet +sheet_questions <- read_excel( + path = tf, + sheet = "List of Questions", + col_names = FALSE +) +sheet_questionnaires <- read_excel( + path = tf, + sheet = "Questionnaires", + skip = 1, + col_names = TRUE +) + +# rename columns +sheet_questions <- sheet_questions %>% + rename( + measure = `...1`, + questionnaire = `...2`, + question = `...3` + ) %>% + fill(measure, .direction = "down") +sheet_questionnaires <- select(.data = sheet_questionnaires, pupil_id:dSEND) + +# import each csv from gdrive +list_df <- list() +j <- 1 + +for (i in files$drive_resource) { + link <- i$webContentLink + df <- read_responses(file = link) + # store in list + list_df[[j]] <- df + j <- j + 1 +} + +responses <- map_dfr(.x = list_df, .f = rbind) %>% + mutate(measurement_date = parse_date(x = measurement_date, format = "%d/%m/%Y")) + +# clear environment +unlink(tf) +rm(df, file_main, files, i, j, link, list_df, tf) + + +# get questionnaire info in to use as column names later +responses <- responses %>% + left_join(y = sheet_questions, by = "question") %>% + mutate(qq = paste0(questionnaire, " - ", question)) %>% + # remove duplicates + distinct() + + +# see if have unique combo of rows so can pivot_wider safely +responses %>% + group_by(pupil_id, pupil_impacted_id, measurement_date, qq) %>% + summarise(count = n()) %>% + filter(count > 1) +# have duplicates, here's an example +responses %>% + 
filter(pupil_id == "100165", measurement_date == "2020-09-09", qq == "207_10 - I have felt like I have missed important school work")
+
+# partition these duplicate responses with a row number; allocation of this is random
+# this seems the best we can do
+responses <- responses %>%
+  select(pupil_id:measurement_date, qq, response) %>%
+  group_by(pupil_id, pupil_impacted_id, measurement_date, qq) %>%
+  mutate(rank = row_number()) %>%
+  arrange(pupil_id, pupil_impacted_id, measurement_date, qq)
+
+# isolate unique records
+responses_dedupe <- filter(.data = responses, rank == 1)
+
+# pivot wider for ImpactEd's purposes
+df_output <- responses_dedupe %>%
+  pivot_wider(
+    id_cols = c(pupil_id, pupil_impacted_id, measurement_date),
+    names_from = "qq",
+    values_from = "response"
+  ) %>%
+  arrange(pupil_id, measurement_date)
+write_csv(x = df_output, file = "data/processed/questionnaires_linked.csv")
diff --git a/src/utils/read_responses.R b/src/utils/read_responses.R
new file mode 100644
index 0000000..87fc285
--- /dev/null
+++ b/src/utils/read_responses.R
@@ -0,0 +1,27 @@
+library(readr)
+library(dplyr)
+library(tidyr)
+library(stringr)
+
+
+# from Duncan - function for importing
+read_responses <- function(file) {
+  read_csv(file, col_types = cols(.default = col_character())) %>%
+    pivot_longer(-c(pupil_id, pupil_impacted_id, measurement_date),
+      names_to = "question", values_to = "response"
+    ) %>%
+    # Deal with duplicated questions.
+    # Three pupils in questionnaire 186 have a set of columns to themselves, so
+    # we merge them back in by:
+    #
+    # 1. filtering out everyone's blank responses. That means that most pupils
+    #    only have responses to the first instance of each question, and the
+    #    three affected pupils only have responses to the second instance of
+    #    each question.
+    # 2. dropping the suffix that readr automatically adds to the second
+    #    instance of each question. Now all pupils are the same.
+    #
+    # The result is a single, consistently named instance of each column.
+ filter(!is.na(response)) %>% + mutate(question = str_remove(question, "_\\d+$")) +} diff --git a/surveyanalysis.Rproj b/surveyanalysis.Rproj new file mode 100644 index 0000000..21a4da0 --- /dev/null +++ b/surveyanalysis.Rproj @@ -0,0 +1,17 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source