From 77844cabe657b8b1fc5c1857a0b56d030fac1303 Mon Sep 17 00:00:00 2001
From: Derrick Stolee
Date: Mon, 21 Oct 2019 13:56:10 +0000
Subject: [PATCH 01/36] sparse-checkout: create builtin with 'list' subcommand

The sparse-checkout feature is mostly hidden from users, as its only
documentation is supplementary information in the docs for
'git read-tree'. In addition, users need to know how to edit the
.git/info/sparse-checkout file with the right patterns, then run the
appropriate 'git read-tree -mu HEAD' command. Keeping the working
directory in sync with the sparse-checkout file requires care.

Begin an effort to make the sparse-checkout feature a porcelain
feature by creating a new 'git sparse-checkout' builtin. This builtin
will be the preferred mechanism for manipulating the sparse-checkout
file and syncing the working directory.

The documentation provided is adapted from the "git read-tree"
documentation with a few edits for clarity in the new context.
Extra sections are added to hint toward a future change to a more
restricted pattern set.

Helped-by: Elijah Newren
Signed-off-by: Derrick Stolee
Signed-off-by: Junio C Hamano
---
 .gitignore                            |  1 +
 Documentation/git-read-tree.txt       |  2 +-
 Documentation/git-sparse-checkout.txt | 89 +++++++++++++++++++++++++++
 Makefile                              |  1 +
 builtin.h                             |  1 +
 builtin/sparse-checkout.c             | 86 ++++++++++++++++++++++++++
 command-list.txt                      |  1 +
 git.c                                 |  1 +
 t/t1091-sparse-checkout-builtin.sh    | 45 ++++++++++++++
 9 files changed, 226 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/git-sparse-checkout.txt
 create mode 100644 builtin/sparse-checkout.c
 create mode 100755 t/t1091-sparse-checkout-builtin.sh

diff --git a/.gitignore b/.gitignore
index 89b3b79c1a2278..aebe7c0908f168 100644
--- a/.gitignore
+++ b/.gitignore
@@ -158,6 +158,7 @@
 /git-show-branch
 /git-show-index
 /git-show-ref
+/git-sparse-checkout
 /git-stage
 /git-stash
 /git-status
diff --git a/Documentation/git-read-tree.txt b/Documentation/git-read-tree.txt
index d2718426085613..da33f84f33d2c5 100644
--- a/Documentation/git-read-tree.txt
+++ b/Documentation/git-read-tree.txt
@@ -436,7 +436,7 @@ support.
 SEE ALSO
 --------
 linkgit:git-write-tree[1]; linkgit:git-ls-files[1];
-linkgit:gitignore[5]
+linkgit:gitignore[5]; linkgit:git-sparse-checkout[1];
 
 GIT
 ---
diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
new file mode 100644
index 00000000000000..9d6ca2291701f5
--- /dev/null
+++ b/Documentation/git-sparse-checkout.txt
@@ -0,0 +1,89 @@
+git-sparse-checkout(1)
+======================
+
+NAME
+----
+git-sparse-checkout - Initialize and modify the sparse-checkout
+configuration, which reduces the checkout to a set of directories
+given by a list of prefixes.
+
+
+SYNOPSIS
+--------
+[verse]
+'git sparse-checkout [options]'
+
+
+DESCRIPTION
+-----------
+
+Initialize and modify the sparse-checkout configuration, which reduces
+the checkout to a set of directories given by a list of prefixes.
+
+THIS COMMAND IS EXPERIMENTAL. ITS BEHAVIOR, AND THE BEHAVIOR OF OTHER
+COMMANDS IN THE PRESENCE OF SPARSE-CHECKOUTS, WILL LIKELY CHANGE IN
+THE FUTURE.
+
+
+COMMANDS
+--------
+'list'::
+	Provide a list of the contents in the sparse-checkout file.
+
+
+SPARSE CHECKOUT
+---------------
+
+"Sparse checkout" allows populating the working directory sparsely.
+It uses the skip-worktree bit (see linkgit:git-update-index[1]) to tell
+Git whether a file in the working directory is worth looking at. If
+the skip-worktree bit is set, then the file is ignored in the working
+directory. 
Git will not populate the contents of those files, which +makes a sparse checkout helpful when working in a repository with many +files, but only a few are important to the current user. + +The `$GIT_DIR/info/sparse-checkout` file is used to define the +skip-worktree reference bitmap. When Git updates the working +directory, it updates the skip-worktree bits in the index based +on this file. The files matching the patterns in the file will +appear in the working directory, and the rest will not. + +## FULL PATTERN SET + +By default, the sparse-checkout file uses the same syntax as `.gitignore` +files. + +While `$GIT_DIR/info/sparse-checkout` is usually used to specify what +files are included, you can also specify what files are _not_ included, +using negative patterns. For example, to remove the file `unwanted`: + +---------------- +/* +!unwanted +---------------- + +Another tricky thing is fully repopulating the working directory when you +no longer want sparse checkout. You cannot just disable "sparse +checkout" because skip-worktree bits are still in the index and your working +directory is still sparsely populated. You should re-populate the working +directory with the `$GIT_DIR/info/sparse-checkout` file content as +follows: + +---------------- +/* +---------------- + +Then you can disable sparse checkout. Sparse checkout support in 'git +checkout' and similar commands is disabled by default. You need to +set `core.sparseCheckout` to `true` in order to have sparse checkout +support. + +SEE ALSO +-------- + +linkgit:git-read-tree[1] +linkgit:gitignore[5] + +GIT +--- +Part of the linkgit:git[1] suite diff --git a/Makefile b/Makefile index de60c8e7aada06..adefc229fe9382 100644 --- a/Makefile +++ b/Makefile @@ -1125,6 +1125,7 @@ BUILTIN_OBJS += builtin/shortlog.o BUILTIN_OBJS += builtin/show-branch.o BUILTIN_OBJS += builtin/show-index.o BUILTIN_OBJS += builtin/show-ref.o +BUILTIN_OBJS += builtin/sparse-checkout.o BUILTIN_OBJS += builtin/stash.o BUILTIN_OBJS += builtin/stripspace.o BUILTIN_OBJS += builtin/submodule--helper.o diff --git a/builtin.h b/builtin.h index 5cf5df69f72fd5..2b25a80cde37b4 100644 --- a/builtin.h +++ b/builtin.h @@ -225,6 +225,7 @@ int cmd_shortlog(int argc, const char **argv, const char *prefix); int cmd_show(int argc, const char **argv, const char *prefix); int cmd_show_branch(int argc, const char **argv, const char *prefix); int cmd_show_index(int argc, const char **argv, const char *prefix); +int cmd_sparse_checkout(int argc, const char **argv, const char *prefix); int cmd_status(int argc, const char **argv, const char *prefix); int cmd_stash(int argc, const char **argv, const char *prefix); int cmd_stripspace(int argc, const char **argv, const char *prefix); diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c new file mode 100644 index 00000000000000..5717c9b2cb30c0 --- /dev/null +++ b/builtin/sparse-checkout.c @@ -0,0 +1,86 @@ +#include "builtin.h" +#include "config.h" +#include "dir.h" +#include "parse-options.h" +#include "pathspec.h" +#include "repository.h" +#include "run-command.h" +#include "strbuf.h" + +static char const * const builtin_sparse_checkout_usage[] = { + N_("git sparse-checkout list"), + NULL +}; + +static char *get_sparse_checkout_filename(void) +{ + return git_pathdup("info/sparse-checkout"); +} + +static void write_patterns_to_file(FILE *fp, struct pattern_list *pl) +{ + int i; + + for (i = 0; i < pl->nr; i++) { + struct path_pattern *p = pl->patterns[i]; + + if (p->flags & PATTERN_FLAG_NEGATIVE) + fprintf(fp, "!"); + + 
fprintf(fp, "%s", p->pattern); + + if (p->flags & PATTERN_FLAG_MUSTBEDIR) + fprintf(fp, "/"); + + fprintf(fp, "\n"); + } +} + +static int sparse_checkout_list(int argc, const char **argv) +{ + struct pattern_list pl; + char *sparse_filename; + int res; + + memset(&pl, 0, sizeof(pl)); + + sparse_filename = get_sparse_checkout_filename(); + res = add_patterns_from_file_to_list(sparse_filename, "", 0, &pl, NULL); + free(sparse_filename); + + if (res < 0) { + warning(_("this worktree is not sparse (sparse-checkout file may not exist)")); + return 0; + } + + write_patterns_to_file(stdout, &pl); + clear_pattern_list(&pl); + + return 0; +} + +int cmd_sparse_checkout(int argc, const char **argv, const char *prefix) +{ + static struct option builtin_sparse_checkout_options[] = { + OPT_END(), + }; + + if (argc == 2 && !strcmp(argv[1], "-h")) + usage_with_options(builtin_sparse_checkout_usage, + builtin_sparse_checkout_options); + + argc = parse_options(argc, argv, prefix, + builtin_sparse_checkout_options, + builtin_sparse_checkout_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + + git_config(git_default_config, NULL); + + if (argc > 0) { + if (!strcmp(argv[0], "list")) + return sparse_checkout_list(argc, argv); + } + + usage_with_options(builtin_sparse_checkout_usage, + builtin_sparse_checkout_options); +} diff --git a/command-list.txt b/command-list.txt index a9ac72bef487ef..d3d28252b38254 100644 --- a/command-list.txt +++ b/command-list.txt @@ -166,6 +166,7 @@ git-show-index plumbinginterrogators git-show-ref plumbinginterrogators git-sh-i18n purehelpers git-sh-setup purehelpers +git-sparse-checkout mainporcelain worktree git-stash mainporcelain git-stage complete git-status mainporcelain info diff --git a/git.c b/git.c index ce6ab0ece2cc6d..7be7ad34bd0538 100644 --- a/git.c +++ b/git.c @@ -572,6 +572,7 @@ static struct cmd_struct commands[] = { { "show-branch", cmd_show_branch, RUN_SETUP }, { "show-index", cmd_show_index }, { "show-ref", cmd_show_ref, RUN_SETUP }, + { "sparse-checkout", cmd_sparse_checkout, RUN_SETUP | NEED_WORK_TREE }, { "stage", cmd_add, RUN_SETUP | NEED_WORK_TREE }, /* * NEEDSWORK: Until the builtin stash is thoroughly robust and no diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh new file mode 100755 index 00000000000000..9b73d449071dae --- /dev/null +++ b/t/t1091-sparse-checkout-builtin.sh @@ -0,0 +1,45 @@ +#!/bin/sh + +test_description='sparse checkout builtin tests' + +. ./test-lib.sh + +test_expect_success 'setup' ' + git init repo && + ( + cd repo && + echo "initial" >a && + mkdir folder1 folder2 deep && + mkdir deep/deeper1 deep/deeper2 && + mkdir deep/deeper1/deepest && + cp a folder1 && + cp a folder2 && + cp a deep && + cp a deep/deeper1 && + cp a deep/deeper2 && + cp a deep/deeper1/deepest && + git add . 
&&
+		git commit -m "initial commit"
+	)
+'
+
+test_expect_success 'git sparse-checkout list (empty)' '
+	git -C repo sparse-checkout list >list 2>err &&
+	test_must_be_empty list &&
+	test_i18ngrep "this worktree is not sparse (sparse-checkout file may not exist)" err
+'
+
+test_expect_success 'git sparse-checkout list (populated)' '
+	test_when_finished rm -f repo/.git/info/sparse-checkout &&
+	cat >repo/.git/info/sparse-checkout <<-EOF &&
+	/folder1/*
+	/deep/
+	**/a
+	!*bin*
+	EOF
+	cp repo/.git/info/sparse-checkout expect &&
+	git -C repo sparse-checkout list >list &&
+	test_cmp expect list
+'
+
+test_done

From 7e12ce4c40afea6614e0afb6f8c281e2a259123b Mon Sep 17 00:00:00 2001
From: Derrick Stolee
Date: Mon, 21 Oct 2019 13:56:11 +0000
Subject: [PATCH 02/36] sparse-checkout: create 'init' subcommand

Getting started with a sparse-checkout file can be daunting. Help
users start their sparse enlistment using 'git sparse-checkout init'.
This will set 'core.sparseCheckout=true' in their config, write an
initial set of patterns to the sparse-checkout file, and update their
working directory.

Make sure to use the `extensions.worktreeConfig` setting and write the
sparse checkout config to the worktree-specific config file. This
avoids confusing interactions with other worktrees.

The use of running another process for 'git read-tree' is
sub-optimal. This will be removed in a later change.

Signed-off-by: Derrick Stolee
Signed-off-by: Junio C Hamano
---
 Documentation/git-sparse-checkout.txt | 11 ++++
 builtin/sparse-checkout.c             | 79 ++++++++++++++++++++++++++-
 t/t1091-sparse-checkout-builtin.sh    | 41 ++++++++++++++
 3 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt
index 9d6ca2291701f5..930a361567694a 100644
--- a/Documentation/git-sparse-checkout.txt
+++ b/Documentation/git-sparse-checkout.txt
@@ -30,6 +30,17 @@ COMMANDS
 'list'::
 	Provide a list of the contents in the sparse-checkout file.
 
+'init'::
+	Enable the `core.sparseCheckout` setting. If the
+	sparse-checkout file does not exist, then populate it with
+	patterns that match every file in the root directory and
+	no other directories, and then remove all directories tracked
+	by Git. Add patterns to the sparse-checkout file to
+	repopulate the working directory.
++
+To avoid interfering with other worktrees, it first enables the
+`extensions.worktreeConfig` setting and makes sure to set the
+`core.sparseCheckout` setting in the worktree-specific config file.
 
SPARSE CHECKOUT --------------- diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c index 5717c9b2cb30c0..77aa52ca017ab3 100644 --- a/builtin/sparse-checkout.c +++ b/builtin/sparse-checkout.c @@ -8,7 +8,7 @@ #include "strbuf.h" static char const * const builtin_sparse_checkout_usage[] = { - N_("git sparse-checkout list"), + N_("git sparse-checkout (init|list)"), NULL }; @@ -59,6 +59,81 @@ static int sparse_checkout_list(int argc, const char **argv) return 0; } +static int update_working_directory(void) +{ + struct argv_array argv = ARGV_ARRAY_INIT; + int result = 0; + argv_array_pushl(&argv, "read-tree", "-m", "-u", "HEAD", NULL); + + if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) { + error(_("failed to update index with new sparse-checkout paths")); + result = 1; + } + + argv_array_clear(&argv); + return result; +} + +enum sparse_checkout_mode { + MODE_NO_PATTERNS = 0, + MODE_ALL_PATTERNS = 1, +}; + +static int sc_set_config(enum sparse_checkout_mode mode) +{ + struct argv_array argv = ARGV_ARRAY_INIT; + + if (git_config_set_gently("extensions.worktreeConfig", "true")) { + error(_("failed to set extensions.worktreeConfig setting")); + return 1; + } + + argv_array_pushl(&argv, "config", "--worktree", "core.sparseCheckout", NULL); + + if (mode) + argv_array_pushl(&argv, "true", NULL); + else + argv_array_pushl(&argv, "false", NULL); + + if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) { + error(_("failed to enable core.sparseCheckout")); + return 1; + } + + return 0; +} + +static int sparse_checkout_init(int argc, const char **argv) +{ + struct pattern_list pl; + char *sparse_filename; + FILE *fp; + int res; + + if (sc_set_config(MODE_ALL_PATTERNS)) + return 1; + + memset(&pl, 0, sizeof(pl)); + + sparse_filename = get_sparse_checkout_filename(); + res = add_patterns_from_file_to_list(sparse_filename, "", 0, &pl, NULL); + + /* If we already have a sparse-checkout file, use it. 
*/ + if (res >= 0) { + free(sparse_filename); + goto reset_dir; + } + + /* initial mode: all blobs at root */ + fp = xfopen(sparse_filename, "w"); + free(sparse_filename); + fprintf(fp, "/*\n!/*/\n"); + fclose(fp); + +reset_dir: + return update_working_directory(); +} + int cmd_sparse_checkout(int argc, const char **argv, const char *prefix) { static struct option builtin_sparse_checkout_options[] = { @@ -79,6 +154,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix) if (argc > 0) { if (!strcmp(argv[0], "list")) return sparse_checkout_list(argc, argv); + if (!strcmp(argv[0], "init")) + return sparse_checkout_init(argc, argv); } usage_with_options(builtin_sparse_checkout_usage, diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh index 9b73d449071dae..cd56cc384b4569 100755 --- a/t/t1091-sparse-checkout-builtin.sh +++ b/t/t1091-sparse-checkout-builtin.sh @@ -42,4 +42,45 @@ test_expect_success 'git sparse-checkout list (populated)' ' test_cmp expect list ' +test_expect_success 'git sparse-checkout init' ' + git -C repo sparse-checkout init && + cat >expect <<-EOF && + /* + !/*/ + EOF + test_cmp expect repo/.git/info/sparse-checkout && + git -C repo config --list >config && + test_i18ngrep "core.sparsecheckout=true" config && + ls repo >dir && + echo a >expect && + test_cmp expect dir +' + +test_expect_success 'git sparse-checkout list after init' ' + git -C repo sparse-checkout list >actual && + cat >expect <<-EOF && + /* + !/*/ + EOF + test_cmp expect actual +' + +test_expect_success 'init with existing sparse-checkout' ' + echo "*folder*" >> repo/.git/info/sparse-checkout && + git -C repo sparse-checkout init && + cat >expect <<-EOF && + /* + !/*/ + *folder* + EOF + test_cmp expect repo/.git/info/sparse-checkout && + ls repo >dir && + cat >expect <<-EOF && + a + folder1 + folder2 + EOF + test_cmp expect dir +' + test_done From 2e885711e4b70de1664244202abdc762e7c16b80 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 21 Oct 2019 13:56:12 +0000 Subject: [PATCH 03/36] clone: add --sparse mode When someone wants to clone a large repository, but plans to work using a sparse-checkout file, they either need to do a full checkout first and then reduce the patterns they included, or clone with --no-checkout, set up their patterns, and then run a checkout manually. This requires knowing a lot about the repo shape and how sparse-checkout works. Add a new '--sparse' option to 'git clone' that initializes the sparse-checkout file to include the following patterns: /* !/*/ These patterns include every file in the root directory, but no directories. This allows a repo to include files like a README or a bootstrapping script to grow enlistments from that point. During the 'git sparse-checkout init' call, we must first look to see if HEAD is valid, since 'git clone' does not have a valid HEAD at the point where it initializes the sparse-checkout. The following checkout within the clone command will create the HEAD ref and update the working directory correctly. 
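An illustrative session (the repository URL here is a placeholder):

    $ git clone --sparse https://example.com/repo.git repo
    $ git -C repo sparse-checkout list
    /*
    !/*/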
Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- Documentation/git-clone.txt | 8 +++++++- builtin/clone.c | 27 +++++++++++++++++++++++++++ builtin/sparse-checkout.c | 6 ++++++ t/t1091-sparse-checkout-builtin.sh | 13 +++++++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/Documentation/git-clone.txt b/Documentation/git-clone.txt index 34011c2940ad4b..0fe91d2f04766b 100644 --- a/Documentation/git-clone.txt +++ b/Documentation/git-clone.txt @@ -15,7 +15,7 @@ SYNOPSIS [--dissociate] [--separate-git-dir ] [--depth ] [--[no-]single-branch] [--no-tags] [--recurse-submodules[=]] [--[no-]shallow-submodules] - [--[no-]remote-submodules] [--jobs ] [--] + [--[no-]remote-submodules] [--jobs ] [--sparse] [--] [] DESCRIPTION @@ -156,6 +156,12 @@ objects from the source repository into a pack in the cloned repository. used, neither remote-tracking branches nor the related configuration variables are created. +--sparse:: + Initialize the sparse-checkout file so the working + directory starts with only the files in the root + of the repository. The sparse-checkout file can be + modified to grow the working directory as needed. + --mirror:: Set up a mirror of the source repository. This implies `--bare`. Compared to `--bare`, `--mirror` not only maps local branches of the diff --git a/builtin/clone.c b/builtin/clone.c index c46ee29f0ad579..4348d962c9c81e 100644 --- a/builtin/clone.c +++ b/builtin/clone.c @@ -59,6 +59,7 @@ static const char *real_git_dir; static char *option_upload_pack = "git-upload-pack"; static int option_verbosity; static int option_progress = -1; +static int option_sparse_checkout; static enum transport_family family; static struct string_list option_config = STRING_LIST_INIT_NODUP; static struct string_list option_required_reference = STRING_LIST_INIT_NODUP; @@ -146,6 +147,8 @@ static struct option builtin_clone_options[] = { OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options), OPT_BOOL(0, "remote-submodules", &option_remote_submodules, N_("any cloned submodules will use their remote-tracking branch")), + OPT_BOOL(0, "sparse", &option_sparse_checkout, + N_("initialize sparse-checkout file to include only files at root")), OPT_END() }; @@ -733,6 +736,27 @@ static void update_head(const struct ref *our, const struct ref *remote, } } +static int git_sparse_checkout_init(const char *repo) +{ + struct argv_array argv = ARGV_ARRAY_INIT; + int result = 0; + argv_array_pushl(&argv, "-C", repo, "sparse-checkout", "init", NULL); + + /* + * We must apply the setting in the current process + * for the later checkout to use the sparse-checkout file. 
+ */ + core_apply_sparse_checkout = 1; + + if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) { + error(_("failed to initialize sparse-checkout")); + result = 1; + } + + argv_array_clear(&argv); + return result; +} + static int checkout(int submodule_progress) { struct object_id oid; @@ -1106,6 +1130,9 @@ int cmd_clone(int argc, const char **argv, const char *prefix) if (option_required_reference.nr || option_optional_reference.nr) setup_reference(); + if (option_sparse_checkout && git_sparse_checkout_init(repo)) + return 1; + remote = remote_get(option_origin); strbuf_addf(&default_refspec, "+%s*:%s*", src_ref_prefix, diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c index 77aa52ca017ab3..6c336b7ab3f429 100644 --- a/builtin/sparse-checkout.c +++ b/builtin/sparse-checkout.c @@ -109,6 +109,7 @@ static int sparse_checkout_init(int argc, const char **argv) char *sparse_filename; FILE *fp; int res; + struct object_id oid; if (sc_set_config(MODE_ALL_PATTERNS)) return 1; @@ -130,6 +131,11 @@ static int sparse_checkout_init(int argc, const char **argv) fprintf(fp, "/*\n!/*/\n"); fclose(fp); + if (get_oid("HEAD", &oid)) { + /* assume we are in a fresh repo */ + return 0; + } + reset_dir: return update_working_directory(); } diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh index cd56cc384b4569..cb74715ca63200 100755 --- a/t/t1091-sparse-checkout-builtin.sh +++ b/t/t1091-sparse-checkout-builtin.sh @@ -83,4 +83,17 @@ test_expect_success 'init with existing sparse-checkout' ' test_cmp expect dir ' +test_expect_success 'clone --sparse' ' + git clone --sparse repo clone && + git -C clone sparse-checkout list >actual && + cat >expect <<-EOF && + /* + !/*/ + EOF + test_cmp expect actual && + ls clone >dir && + echo a >expect && + test_cmp expect dir +' + test_done From c6b249a6e1d2e9a52b9acf9dcf0f8d4cf1807789 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 21 Oct 2019 13:56:13 +0000 Subject: [PATCH 04/36] sparse-checkout: 'set' subcommand The 'git sparse-checkout set' subcommand takes a list of patterns as arguments and writes them to the sparse-checkout file. Then, it updates the working directory using 'git read-tree -mu HEAD'. The 'set' subcommand will replace the entire contents of the sparse-checkout file. The write_patterns_and_update() method is extracted from cmd_sparse_checkout() to make it easier to implement 'add' and/or 'remove' subcommands in the future. If the core.sparseCheckout config setting is disabled, then enable the config setting in the worktree config. If we set the config this way and the sparse-checkout fails, then re-disable the config setting. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- Documentation/git-sparse-checkout.txt | 6 ++++ builtin/sparse-checkout.c | 45 ++++++++++++++++++++++++++- t/t1091-sparse-checkout-builtin.sh | 32 +++++++++++++++++++ 3 files changed, 82 insertions(+), 1 deletion(-) diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt index 930a361567694a..b933043b3d6a8f 100644 --- a/Documentation/git-sparse-checkout.txt +++ b/Documentation/git-sparse-checkout.txt @@ -42,6 +42,12 @@ To avoid interfering with other worktrees, it first enables the `extensions.worktreeConfig` setting and makes sure to set the `core.sparseCheckout` setting in the worktree-specific config file. +'set':: + Write a set of patterns to the sparse-checkout file, as given as + a list of arguments following the 'set' subcommand. 
Update the + working directory to match the new patterns. Enable the + core.sparseCheckout config setting if it is not already enabled. + SPARSE CHECKOUT --------------- diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c index 6c336b7ab3f429..834ee421f03c57 100644 --- a/builtin/sparse-checkout.c +++ b/builtin/sparse-checkout.c @@ -8,7 +8,7 @@ #include "strbuf.h" static char const * const builtin_sparse_checkout_usage[] = { - N_("git sparse-checkout (init|list)"), + N_("git sparse-checkout (init|list|set) "), NULL }; @@ -140,6 +140,47 @@ static int sparse_checkout_init(int argc, const char **argv) return update_working_directory(); } +static int write_patterns_and_update(struct pattern_list *pl) +{ + char *sparse_filename; + FILE *fp; + + sparse_filename = get_sparse_checkout_filename(); + fp = fopen(sparse_filename, "w"); + write_patterns_to_file(fp, pl); + fclose(fp); + free(sparse_filename); + + return update_working_directory(); +} + +static int sparse_checkout_set(int argc, const char **argv, const char *prefix) +{ + static const char *empty_base = ""; + int i; + struct pattern_list pl; + int result; + int set_config = 0; + memset(&pl, 0, sizeof(pl)); + + for (i = 1; i < argc; i++) + add_pattern(argv[i], empty_base, 0, &pl, 0); + + if (!core_apply_sparse_checkout) { + sc_set_config(MODE_ALL_PATTERNS); + core_apply_sparse_checkout = 1; + set_config = 1; + } + + result = write_patterns_and_update(&pl); + + if (result && set_config) + sc_set_config(MODE_NO_PATTERNS); + + clear_pattern_list(&pl); + return result; +} + int cmd_sparse_checkout(int argc, const char **argv, const char *prefix) { static struct option builtin_sparse_checkout_options[] = { @@ -162,6 +203,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix) return sparse_checkout_list(argc, argv); if (!strcmp(argv[0], "init")) return sparse_checkout_init(argc, argv); + if (!strcmp(argv[0], "set")) + return sparse_checkout_set(argc, argv, prefix); } usage_with_options(builtin_sparse_checkout_usage, diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh index cb74715ca63200..bf2dc55bb131e5 100755 --- a/t/t1091-sparse-checkout-builtin.sh +++ b/t/t1091-sparse-checkout-builtin.sh @@ -96,4 +96,36 @@ test_expect_success 'clone --sparse' ' test_cmp expect dir ' +test_expect_success 'set enables config' ' + git init empty-config && + ( + cd empty-config && + test_commit test file && + test_path_is_missing .git/config.worktree && + test_must_fail git sparse-checkout set nothing && + test_i18ngrep "sparseCheckout = false" .git/config.worktree && + git sparse-checkout set "/*" && + test_i18ngrep "sparseCheckout = true" .git/config.worktree + ) +' + +test_expect_success 'set sparse-checkout using builtin' ' + git -C repo sparse-checkout set "/*" "!/*/" "*folder*" && + cat >expect <<-EOF && + /* + !/*/ + *folder* + EOF + git -C repo sparse-checkout list >actual && + test_cmp expect actual && + test_cmp expect repo/.git/info/sparse-checkout && + ls repo >dir && + cat >expect <<-EOF && + a + folder1 + folder2 + EOF + test_cmp expect dir +' + test_done From ead918f1269c9de4058b0d82f8226f559a53a6c5 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 21 Oct 2019 13:56:14 +0000 Subject: [PATCH 05/36] sparse-checkout: add '--stdin' option to set subcommand The 'git sparse-checkout set' subcommand takes a list of patterns and places them in the sparse-checkout file. Then, it updates the working directory to match those patterns. 
For a large list of patterns, the command-line call can get very cumbersome. Add a '--stdin' option to instead read patterns over standard in. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- builtin/sparse-checkout.c | 35 ++++++++++++++++++++++++++++-- t/t1091-sparse-checkout-builtin.sh | 20 +++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c index 834ee421f03c57..f2e2bd772d7035 100644 --- a/builtin/sparse-checkout.c +++ b/builtin/sparse-checkout.c @@ -154,6 +154,15 @@ static int write_patterns_and_update(struct pattern_list *pl) return update_working_directory(); } +static char const * const builtin_sparse_checkout_set_usage[] = { + N_("git sparse-checkout set [--stdin|]"), + NULL +}; + +static struct sparse_checkout_set_opts { + int use_stdin; +} set_opts; + static int sparse_checkout_set(int argc, const char **argv, const char *prefix) { static const char *empty_base = ""; @@ -161,10 +170,32 @@ static int sparse_checkout_set(int argc, const char **argv, const char *prefix) struct pattern_list pl; int result; int set_config = 0; + + static struct option builtin_sparse_checkout_set_options[] = { + OPT_BOOL(0, "stdin", &set_opts.use_stdin, + N_("read patterns from standard in")), + OPT_END(), + }; + memset(&pl, 0, sizeof(pl)); - for (i = 1; i < argc; i++) - add_pattern(argv[i], empty_base, 0, &pl, 0); + argc = parse_options(argc, argv, prefix, + builtin_sparse_checkout_set_options, + builtin_sparse_checkout_set_usage, + PARSE_OPT_KEEP_UNKNOWN); + + if (set_opts.use_stdin) { + struct strbuf line = STRBUF_INIT; + + while (!strbuf_getline(&line, stdin)) { + size_t len; + char *buf = strbuf_detach(&line, &len); + add_pattern(buf, empty_base, 0, &pl, 0); + } + } else { + for (i = 0; i < argc; i++) + add_pattern(argv[i], empty_base, 0, &pl, 0); + } if (!core_apply_sparse_checkout) { sc_set_config(MODE_ALL_PATTERNS); diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh index bf2dc55bb131e5..a9ff5eb9ec7e6d 100755 --- a/t/t1091-sparse-checkout-builtin.sh +++ b/t/t1091-sparse-checkout-builtin.sh @@ -128,4 +128,24 @@ test_expect_success 'set sparse-checkout using builtin' ' test_cmp expect dir ' +test_expect_success 'set sparse-checkout using --stdin' ' + cat >expect <<-EOF && + /* + !/*/ + /folder1/ + /folder2/ + EOF + git -C repo sparse-checkout set --stdin actual && + test_cmp expect actual && + test_cmp expect repo/.git/info/sparse-checkout && + ls repo >dir && + cat >expect <<-EOF && + a + folder1 + folder2 + EOF + test_cmp expect dir +' + test_done From 043e1a3d9acd03697bae74ba8cd90f7e0d52aa29 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 21 Oct 2019 13:56:15 +0000 Subject: [PATCH 06/36] sparse-checkout: create 'disable' subcommand The instructions for disabling a sparse-checkout to a full working directory are complicated and non-intuitive. Add a subcommand, 'git sparse-checkout disable', to perform those steps for the user. 
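The manual steps being wrapped are roughly the following, taken from
the documentation text that this patch removes:

    $ echo "/*" >.git/info/sparse-checkout
    $ git read-tree -mu HEAD
    $ git config core.sparseCheckout false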
Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- Documentation/git-sparse-checkout.txt | 27 ++++++++++++--------------- builtin/sparse-checkout.c | 26 +++++++++++++++++++++++++- t/t1091-sparse-checkout-builtin.sh | 15 +++++++++++++++ 3 files changed, 52 insertions(+), 16 deletions(-) diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt index b933043b3d6a8f..f794d4797a5589 100644 --- a/Documentation/git-sparse-checkout.txt +++ b/Documentation/git-sparse-checkout.txt @@ -48,6 +48,10 @@ To avoid interfering with other worktrees, it first enables the working directory to match the new patterns. Enable the core.sparseCheckout config setting if it is not already enabled. +'disable':: + Remove the sparse-checkout file, set `core.sparseCheckout` to + `false`, and restore the working directory to include all files. + SPARSE CHECKOUT --------------- @@ -65,6 +69,14 @@ directory, it updates the skip-worktree bits in the index based on this file. The files matching the patterns in the file will appear in the working directory, and the rest will not. +To enable the sparse-checkout feature, run `git sparse-checkout init` to +initialize a simple sparse-checkout file and enable the `core.sparseCheckout` +config setting. Then, run `git sparse-checkout set` to modify the patterns in +the sparse-checkout file. + +To repopulate the working directory with all files, use the +`git sparse-checkout disable` command. + ## FULL PATTERN SET By default, the sparse-checkout file uses the same syntax as `.gitignore` @@ -79,21 +91,6 @@ using negative patterns. For example, to remove the file `unwanted`: !unwanted ---------------- -Another tricky thing is fully repopulating the working directory when you -no longer want sparse checkout. You cannot just disable "sparse -checkout" because skip-worktree bits are still in the index and your working -directory is still sparsely populated. You should re-populate the working -directory with the `$GIT_DIR/info/sparse-checkout` file content as -follows: - ----------------- -/* ----------------- - -Then you can disable sparse checkout. Sparse checkout support in 'git -checkout' and similar commands is disabled by default. You need to -set `core.sparseCheckout` to `true` in order to have sparse checkout -support. 
SEE ALSO
 --------

diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c
index f2e2bd772d7035..9fdcc6c4efc5ae 100644
--- a/builtin/sparse-checkout.c
+++ b/builtin/sparse-checkout.c
@@ -8,7 +8,7 @@
 #include "strbuf.h"
 
 static char const * const builtin_sparse_checkout_usage[] = {
-	N_("git sparse-checkout (init|list|set) "),
+	N_("git sparse-checkout (init|list|set|disable) "),
 	NULL
 };
 
@@ -212,6 +212,28 @@ static int sparse_checkout_set(int argc, const char **argv, const char *prefix)
 	return result;
 }
 
+static int sparse_checkout_disable(int argc, const char **argv)
+{
+	char *sparse_filename;
+	FILE *fp;
+
+	if (sc_set_config(MODE_ALL_PATTERNS))
+		die(_("failed to change config"));
+
+	sparse_filename = get_sparse_checkout_filename();
+	fp = xfopen(sparse_filename, "w");
+	fprintf(fp, "/*\n");
+	fclose(fp);
+
+	if (update_working_directory())
+		die(_("error while refreshing working directory"));
+
+	unlink(sparse_filename);
+	free(sparse_filename);
+
+	return sc_set_config(MODE_NO_PATTERNS);
+}
+
 int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 {
 	static struct option builtin_sparse_checkout_options[] = {
@@ -236,6 +258,8 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)
 			return sparse_checkout_init(argc, argv);
 		if (!strcmp(argv[0], "set"))
 			return sparse_checkout_set(argc, argv, prefix);
+		if (!strcmp(argv[0], "disable"))
+			return sparse_checkout_disable(argc, argv);
 	}
 
 	usage_with_options(builtin_sparse_checkout_usage,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index a9ff5eb9ec7e6d..583c232e9e7ea6 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -148,4 +148,19 @@ test_expect_success 'set sparse-checkout using --stdin' '
 	test_cmp expect dir
 '
 
+test_expect_success 'sparse-checkout disable' '
+	git -C repo sparse-checkout disable &&
+	test_path_is_missing repo/.git/info/sparse-checkout &&
+	git -C repo config --list >config &&
+	test_i18ngrep "core.sparsecheckout=false" config &&
+	ls repo >dir &&
+	cat >expect <<-EOF &&
+	a
+	deep
+	folder1
+	folder2
+	EOF
+	test_cmp expect dir
+'
+
 test_done

From d97f70b862f422dd1e765010a4d4a19456d2347b Mon Sep 17 00:00:00 2001
From: Jeff Hostetler
Date: Mon, 21 Oct 2019 13:56:16 +0000
Subject: [PATCH 07/36] trace2: add region in clear_ce_flags

When Git updates the working directory with the sparse-checkout
feature enabled, the unpack_trees() method calls clear_ce_flags()
to update the skip-worktree bits on the cache entries. This check
can be expensive, depending on the patterns used.

Add trace2 regions around the method, including some flag
information, so we can get granular performance data during
experiments. This data will be used to measure improvements to the
pattern-matching algorithms for sparse-checkout.
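The regions can be inspected with the trace2 perf target, for example
(an illustrative invocation):

    $ GIT_TRACE2_PERF=/tmp/read-tree.perf git read-tree -mu HEAD

The resulting region_enter/region_leave events use the 'unpack_trees'
category, with labels of the form clear_ce_flags(0x...,0x...) built
from the select and clear masks.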
Signed-off-by: Jeff Hostetler Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- unpack-trees.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/unpack-trees.c b/unpack-trees.c index 33ea7810d8cfbb..01a05ff66dda8b 100644 --- a/unpack-trees.c +++ b/unpack-trees.c @@ -1407,15 +1407,23 @@ static int clear_ce_flags(struct index_state *istate, struct pattern_list *pl) { static struct strbuf prefix = STRBUF_INIT; + char label[100]; + int rval; strbuf_reset(&prefix); - return clear_ce_flags_1(istate, + xsnprintf(label, sizeof(label), "clear_ce_flags(0x%08lx,0x%08lx)", + (unsigned long)select_mask, (unsigned long)clear_mask); + trace2_region_enter("unpack_trees", label, the_repository); + rval = clear_ce_flags_1(istate, istate->cache, istate->cache_nr, &prefix, select_mask, clear_mask, pl, 0); + trace2_region_leave("unpack_trees", label, the_repository); + + return rval; } /* From 95a6be78f301b14c6baa8e27922138a959ffafe1 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 21 Oct 2019 13:56:17 +0000 Subject: [PATCH 08/36] sparse-checkout: add 'cone' mode The sparse-checkout feature can have quadratic performance as the number of patterns and number of entries in the index grow. If there are 1,000 patterns and 1,000,000 entries, this time can be very significant. Create a new Boolean config option, core.sparseCheckoutCone, to indicate that we expect the sparse-checkout file to contain a more limited set of patterns. This is a separate config setting from core.sparseCheckout to avoid breaking older clients by introducing a tri-state option. The config option does nothing right now, but will be expanded upon in a later commit. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- Documentation/config/core.txt | 10 ++++-- Documentation/git-sparse-checkout.txt | 50 +++++++++++++++++++++++++++ cache.h | 4 ++- config.c | 5 +++ environment.c | 1 + t/t1091-sparse-checkout-builtin.sh | 14 ++++++++ 6 files changed, 81 insertions(+), 3 deletions(-) diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt index 852d2ba37a1204..bdbbee58b92525 100644 --- a/Documentation/config/core.txt +++ b/Documentation/config/core.txt @@ -593,8 +593,14 @@ core.multiPackIndex:: multi-pack-index design document]. core.sparseCheckout:: - Enable "sparse checkout" feature. See section "Sparse checkout" in - linkgit:git-read-tree[1] for more information. + Enable "sparse checkout" feature. See linkgit:git-sparse-checkout[1] + for more information. + +core.sparseCheckoutCone:: + Enables the "cone mode" of the sparse checkout feature. When the + sparse-checkout file contains a limited set of patterns, then this + mode provides significant performance advantages. See + linkgit:git-sparse-checkout[1] for more information. core.abbrev:: Set the length object names are abbreviated to. If diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt index f794d4797a5589..3931e4f2ad8baf 100644 --- a/Documentation/git-sparse-checkout.txt +++ b/Documentation/git-sparse-checkout.txt @@ -92,6 +92,56 @@ using negative patterns. For example, to remove the file `unwanted`: ---------------- +## CONE PATTERN SET + +The full pattern set allows for arbitrary pattern matches and complicated +inclusion/exclusion rules. These can result in O(N*M) pattern matches when +updating the index, where N is the number of patterns and M is the number +of paths in the index. 
To combat this performance issue, a more restricted
+pattern set is allowed when `core.sparseCheckoutCone` is enabled.
+
+The accepted patterns in the cone pattern set are:
+
+1. *Recursive:* All paths inside a directory are included.
+
+2. *Parent:* All files immediately inside a directory are included.
+
+In addition to the above two patterns, we also expect that all files in the
+root directory are included. If a recursive pattern is added, then all
+leading directories are added as parent patterns.
+
+By default, when running `git sparse-checkout init`, the root directory is
+added as a parent pattern. At this point, the sparse-checkout file contains
+the following patterns:
+
+```
+/*
+!/*/
+```
+
+This says "include everything in root, but nothing two levels below root."
+If we then add the folder `A/B/C` as a recursive pattern, the folders `A` and
+`A/B` are added as parent patterns. The resulting sparse-checkout file is
+now
+
+```
+/*
+!/*/
+/A/
+!/A/*/
+/A/B/
+!/A/B/*/
+/A/B/C/
+```
+
+Here, order matters, so the negative patterns are overridden by the positive
+patterns that appear lower in the file.
+
+If `core.sparseCheckoutCone=true`, then Git will parse the sparse-checkout file
+expecting patterns of these types. Git will warn if the patterns do not match.
+If the patterns do match the expected format, then Git will use faster hash-
+based algorithms to compute inclusion in the sparse-checkout.
+
 SEE ALSO
 --------
 
diff --git a/cache.h b/cache.h
index 04cabaac119540..4980ee198e0173 100644
--- a/cache.h
+++ b/cache.h
@@ -918,12 +918,14 @@ extern char *git_replace_ref_base;
 
 extern int fsync_object_files;
 extern int core_preload_index;
-extern int core_apply_sparse_checkout;
 extern int precomposed_unicode;
 extern int protect_hfs;
 extern int protect_ntfs;
 extern const char *core_fsmonitor;
 
+int core_apply_sparse_checkout;
+int core_sparse_checkout_cone;
+
 /*
  * Include broken refs in all ref iterations, which will
  * generally choke dangerous operations rather than letting
diff --git a/config.c b/config.c
index e7052b39773e4c..d75f88ca0ce31f 100644
--- a/config.c
+++ b/config.c
@@ -1364,6 +1364,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.sparsecheckoutcone")) {
+		core_sparse_checkout_cone = git_config_bool(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "core.precomposeunicode")) {
 		precomposed_unicode = git_config_bool(var, value);
 		return 0;
diff --git a/environment.c b/environment.c
index efa072680a2bca..2a1a866659c39c 100644
--- a/environment.c
+++ b/environment.c
@@ -67,6 +67,7 @@
 char *notes_ref_name;
 int grafts_replace_parents = 1;
 int core_apply_sparse_checkout;
+int core_sparse_checkout_cone;
 int merge_log_config = -1;
 int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */
 unsigned long pack_size_limit_cfg;
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 583c232e9e7ea6..b3058d0e9b2821 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -148,6 +148,20 @@ test_expect_success 'set sparse-checkout using --stdin' '
 	test_cmp expect dir
 '
 
+test_expect_success 'cone mode: match patterns' '
+	git -C repo config --worktree core.sparseCheckoutCone true &&
+	rm -rf repo/a repo/folder1 repo/folder2 &&
+	git -C repo read-tree -mu HEAD &&
+	git -C repo reset --hard &&
+	ls repo >dir &&
+	cat >expect <<-EOF &&
+	a
+	folder1
+	folder2
+	EOF
+	
test_cmp expect dir
+'
+
 test_expect_success 'sparse-checkout disable' '
 	git -C repo sparse-checkout disable &&
 	test_path_is_missing repo/.git/info/sparse-checkout &&

From fa1e8e3adb2be735d9d032f82fbc84a3d445730d Mon Sep 17 00:00:00 2001
From: Derrick Stolee
Date: Mon, 21 Oct 2019 13:56:18 +0000
Subject: [PATCH 09/36] sparse-checkout: use hashmaps for cone patterns

The parent and recursive patterns allowed by the "cone mode" option
in sparse-checkout are restrictive enough that we can avoid using
the regex parsing. Everything is based on prefix matches, so we can
use hashsets to store the prefixes from the sparse-checkout file.
When checking a path, we can strip path entries from the path and
check the hashset for an exact match.

As a test, I created a cone-mode sparse-checkout file for the Linux
repository that actually includes every file. This was constructed
by taking every folder in the Linux repo and creating the pattern
pairs here:

	/$folder/
	!/$folder/*/

This resulted in a sparse-checkout file with 8,296 patterns.
Running 'git read-tree -mu HEAD' on this file had the following
performance:

	core.sparseCheckout=false:    0.21 s (0.00 s)
	core.sparseCheckout=true:     3.75 s (3.50 s)
	core.sparseCheckoutCone=true: 0.23 s (0.01 s)

The times in parentheses above correspond to the time spent in the
first clear_ce_flags() call, according to the trace2 performance
traces.

While this example is contrived, it demonstrates how these patterns
can slow the sparse-checkout feature.

Helped-by: Eric Wong
Helped-by: Johannes Schindelin
Signed-off-by: Derrick Stolee
Signed-off-by: Junio C Hamano
---
 dir.c                              | 207 +++++++++++++++++++++++++++--
 dir.h                              |  31 +++++
 t/t1091-sparse-checkout-builtin.sh |  11 +-
 unpack-trees.c                     |   1 +
 4 files changed, 241 insertions(+), 9 deletions(-)

diff --git a/dir.c b/dir.c
index 61f559f98008af..dfabf9982fe59b 100644
--- a/dir.c
+++ b/dir.c
@@ -611,6 +611,150 @@ void parse_path_pattern(const char **pattern,
 	*patternlen = len;
 }
 
+static int pl_hashmap_cmp(const void *unused_cmp_data,
+			  const struct hashmap_entry *a,
+			  const struct hashmap_entry *b,
+			  const void *key)
+{
+	const struct pattern_entry *ee1 =
+			container_of(a, struct pattern_entry, ent);
+	const struct pattern_entry *ee2 =
+			container_of(b, struct pattern_entry, ent);
+
+	size_t min_len = ee1->patternlen <= ee2->patternlen
+			 ? ee1->patternlen
+			 : ee2->patternlen;
+
+	return strncmp(ee1->pattern, ee2->pattern, min_len);
+}
+
+static void add_pattern_to_hashsets(struct pattern_list *pl, struct path_pattern *given)
+{
+	struct pattern_entry *translated;
+	char *truncated;
+	char *data = NULL;
+
+	if (!pl->use_cone_patterns)
+		return;
+
+	if (given->flags & PATTERN_FLAG_NEGATIVE &&
+	    given->flags & PATTERN_FLAG_MUSTBEDIR &&
+	    !strcmp(given->pattern, "/*")) {
+		pl->full_cone = 0;
+		return;
+	}
+
+	if (!given->flags && !strcmp(given->pattern, "/*")) {
+		pl->full_cone = 1;
+		return;
+	}
+
+	if (given->patternlen > 2 &&
+	    !strcmp(given->pattern + given->patternlen - 2, "/*")) {
+		if (!(given->flags & PATTERN_FLAG_NEGATIVE)) {
+			/* Not a cone pattern. 
*/ + pl->use_cone_patterns = 0; + warning(_("unrecognized pattern: '%s'"), given->pattern); + goto clear_hashmaps; + } + + truncated = xstrdup(given->pattern); + truncated[given->patternlen - 2] = 0; + + translated = xmalloc(sizeof(struct pattern_entry)); + translated->pattern = truncated; + translated->patternlen = given->patternlen - 2; + hashmap_entry_init(&translated->ent, + memhash(translated->pattern, translated->patternlen)); + + if (!hashmap_get_entry(&pl->recursive_hashmap, + translated, ent, NULL)) { + /* We did not see the "parent" included */ + warning(_("unrecognized negative pattern: '%s'"), + given->pattern); + free(truncated); + free(translated); + goto clear_hashmaps; + } + + hashmap_add(&pl->parent_hashmap, &translated->ent); + hashmap_remove(&pl->recursive_hashmap, &translated->ent, &data); + free(data); + return; + } + + if (given->flags & PATTERN_FLAG_NEGATIVE) { + warning(_("unrecognized negative pattern: '%s'"), + given->pattern); + goto clear_hashmaps; + } + + translated = xmalloc(sizeof(struct pattern_entry)); + + translated->pattern = xstrdup(given->pattern); + translated->patternlen = given->patternlen; + hashmap_entry_init(&translated->ent, + memhash(translated->pattern, translated->patternlen)); + + hashmap_add(&pl->recursive_hashmap, &translated->ent); + + if (hashmap_get_entry(&pl->parent_hashmap, translated, ent, NULL)) { + /* we already included this at the parent level */ + warning(_("your sparse-checkout file may have issues: pattern '%s' is repeated"), + given->pattern); + hashmap_remove(&pl->parent_hashmap, &translated->ent, &data); + free(data); + free(translated); + } + + return; + +clear_hashmaps: + warning(_("disabling cone pattern matching")); + hashmap_free_entries(&pl->parent_hashmap, struct pattern_entry, ent); + hashmap_free_entries(&pl->recursive_hashmap, struct pattern_entry, ent); + pl->use_cone_patterns = 0; +} + +static int hashmap_contains_path(struct hashmap *map, + struct strbuf *pattern) +{ + struct pattern_entry p; + + /* Check straight mapping */ + p.pattern = pattern->buf; + p.patternlen = pattern->len; + hashmap_entry_init(&p.ent, memhash(p.pattern, p.patternlen)); + return !!hashmap_get_entry(map, &p, ent, NULL); +} + +int hashmap_contains_parent(struct hashmap *map, + const char *path, + struct strbuf *buffer) +{ + char *slash_pos; + + strbuf_setlen(buffer, 0); + + if (path[0] != '/') + strbuf_addch(buffer, '/'); + + strbuf_addstr(buffer, path); + + slash_pos = strrchr(buffer->buf, '/'); + + while (slash_pos > buffer->buf) { + strbuf_setlen(buffer, slash_pos - buffer->buf); + + if (hashmap_contains_path(map, buffer)) + return 1; + + slash_pos = strrchr(buffer->buf, '/'); + } + + return 0; +} + void add_pattern(const char *string, const char *base, int baselen, struct pattern_list *pl, int srcpos) { @@ -635,6 +779,8 @@ void add_pattern(const char *string, const char *base, ALLOC_GROW(pl->patterns, pl->nr + 1, pl->alloc); pl->patterns[pl->nr++] = pattern; pattern->pl = pl; + + add_pattern_to_hashsets(pl, pattern); } static int read_skip_worktree_file_from_index(const struct index_state *istate, @@ -860,6 +1006,9 @@ static int add_patterns_from_buffer(char *buf, size_t size, int i, lineno = 1; char *entry; + hashmap_init(&pl->recursive_hashmap, pl_hashmap_cmp, NULL, 0); + hashmap_init(&pl->parent_hashmap, pl_hashmap_cmp, NULL, 0); + pl->filebuf = buf; if (skip_utf8_bom(&buf, size)) @@ -1096,16 +1245,58 @@ enum pattern_match_result path_matches_pattern_list( struct index_state *istate) { struct path_pattern *pattern; - pattern = 
last_matching_pattern_from_list(pathname, pathlen, basename, - dtype, pl, istate); - if (pattern) { - if (pattern->flags & PATTERN_FLAG_NEGATIVE) - return NOT_MATCHED; - else - return MATCHED; + struct strbuf parent_pathname = STRBUF_INIT; + int result = NOT_MATCHED; + const char *slash_pos; + + if (!pl->use_cone_patterns) { + pattern = last_matching_pattern_from_list(pathname, pathlen, basename, + dtype, pl, istate); + if (pattern) { + if (pattern->flags & PATTERN_FLAG_NEGATIVE) + return NOT_MATCHED; + else + return MATCHED; + } + + return UNDECIDED; + } + + if (pl->full_cone) + return MATCHED; + + strbuf_addch(&parent_pathname, '/'); + strbuf_add(&parent_pathname, pathname, pathlen); + + if (hashmap_contains_path(&pl->recursive_hashmap, + &parent_pathname)) { + result = MATCHED; + goto done; + } + + slash_pos = strrchr(parent_pathname.buf, '/'); + + if (slash_pos == parent_pathname.buf) { + /* include every file in root */ + result = MATCHED; + goto done; } - return UNDECIDED; + strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf); + + if (hashmap_contains_path(&pl->parent_hashmap, &parent_pathname)) { + result = MATCHED; + goto done; + } + + if (hashmap_contains_parent(&pl->recursive_hashmap, + pathname, + &parent_pathname)) + result = MATCHED; + +done: + strbuf_release(&parent_pathname); + return result; } static struct path_pattern *last_matching_pattern_from_lists( diff --git a/dir.h b/dir.h index 2fbdef014f57c6..f8edbca72b3acc 100644 --- a/dir.h +++ b/dir.h @@ -4,6 +4,7 @@ /* See Documentation/technical/api-directory-listing.txt */ #include "cache.h" +#include "hashmap.h" #include "strbuf.h" struct dir_entry { @@ -37,6 +38,13 @@ struct path_pattern { int srcpos; }; +/* used for hashmaps for cone patterns */ +struct pattern_entry { + struct hashmap_entry ent; + char *pattern; + size_t patternlen; +}; + /* * Each excludes file will be parsed into a fresh exclude_list which * is appended to the relevant exclude_list_group (either EXC_DIRS or @@ -55,6 +63,26 @@ struct pattern_list { const char *src; struct path_pattern **patterns; + + /* + * While scanning the excludes, we attempt to match the patterns + * with a more restricted set that allows us to use hashsets for + * matching logic, which is faster than the linear lookup in the + * excludes array above. If non-zero, that check succeeded. + */ + unsigned use_cone_patterns; + unsigned full_cone; + + /* + * Stores paths where everything starting with those paths + * is included. + */ + struct hashmap recursive_hashmap; + + /* + * Used to check single-level parents of blobs. 
+ */ + struct hashmap parent_hashmap; }; /* @@ -271,6 +299,9 @@ int is_excluded(struct dir_struct *dir, struct index_state *istate, const char *name, int *dtype); +int hashmap_contains_parent(struct hashmap *map, + const char *path, + struct strbuf *buffer); struct pattern_list *add_pattern_list(struct dir_struct *dir, int group_type, const char *src); int add_patterns_from_file_to_list(const char *fname, const char *base, int baselen, diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh index b3058d0e9b2821..9907278fc10324 100755 --- a/t/t1091-sparse-checkout-builtin.sh +++ b/t/t1091-sparse-checkout-builtin.sh @@ -151,7 +151,8 @@ test_expect_success 'set sparse-checkout using --stdin' ' test_expect_success 'cone mode: match patterns' ' git -C repo config --worktree core.sparseCheckoutCone true && rm -rf repo/a repo/folder1 repo/folder2 && - git -C repo read-tree -mu HEAD && + git -C repo read-tree -mu HEAD 2>err && + test_i18ngrep ! "disabling cone patterns" err && git -C repo reset --hard && ls repo >dir && cat >expect <<-EOF && @@ -162,6 +163,14 @@ test_expect_success 'cone mode: match patterns' ' test_cmp expect dir ' +test_expect_success 'cone mode: warn on bad pattern' ' + test_when_finished mv sparse-checkout repo/.git/info/ && + cp repo/.git/info/sparse-checkout . && + echo "!/deep/deeper/*" >>repo/.git/info/sparse-checkout && + git -C repo read-tree -mu HEAD 2>err && + test_i18ngrep "unrecognized negative pattern" err +' + test_expect_success 'sparse-checkout disable' ' git -C repo sparse-checkout disable && test_path_is_missing repo/.git/info/sparse-checkout && diff --git a/unpack-trees.c b/unpack-trees.c index 01a05ff66dda8b..a90d71845dbeda 100644 --- a/unpack-trees.c +++ b/unpack-trees.c @@ -1482,6 +1482,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options o->skip_sparse_checkout = 1; if (!o->skip_sparse_checkout) { char *sparse = git_pathdup("info/sparse-checkout"); + pl.use_cone_patterns = core_sparse_checkout_cone; if (add_patterns_from_file_to_list(sparse, "", 0, &pl, NULL) < 0) o->skip_sparse_checkout = 1; else From 93f2e4e9df12920aa19f2b71241c6fa88b832b75 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 21 Oct 2019 13:56:19 +0000 Subject: [PATCH 10/36] sparse-checkout: init and set in cone mode To make the cone pattern set easy to use, update the behavior of 'git sparse-checkout [init|set]'. Add '--cone' flag to 'git sparse-checkout init' to set the config option 'core.sparseCheckoutCone=true'. When running 'git sparse-checkout set' in cone mode, a user only needs to supply a list of recursive folder matches. Git will automatically add the necessary parent matches for the leading directories. When testing 'git sparse-checkout set' in cone mode, check the error stream to ensure we do not see any errors. Specifically, we want to avoid the warning that the patterns do not match the cone-mode patterns. 
Helped-by: Eric Wong Helped-by: Johannes Schindelin Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- builtin/sparse-checkout.c | 159 +++++++++++++++++++++++++++-- dir.c | 8 +- dir.h | 4 + t/t1091-sparse-checkout-builtin.sh | 51 +++++++++ 4 files changed, 208 insertions(+), 14 deletions(-) diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c index 9fdcc6c4efc5ae..6c3bf68c894e74 100644 --- a/builtin/sparse-checkout.c +++ b/builtin/sparse-checkout.c @@ -6,6 +6,7 @@ #include "repository.h" #include "run-command.h" #include "strbuf.h" +#include "string-list.h" static char const * const builtin_sparse_checkout_usage[] = { N_("git sparse-checkout (init|list|set|disable) "), @@ -77,11 +78,13 @@ static int update_working_directory(void) enum sparse_checkout_mode { MODE_NO_PATTERNS = 0, MODE_ALL_PATTERNS = 1, + MODE_CONE_PATTERNS = 2, }; static int sc_set_config(enum sparse_checkout_mode mode) { struct argv_array argv = ARGV_ARRAY_INIT; + struct argv_array cone_argv = ARGV_ARRAY_INIT; if (git_config_set_gently("extensions.worktreeConfig", "true")) { error(_("failed to set extensions.worktreeConfig setting")); @@ -100,9 +103,31 @@ static int sc_set_config(enum sparse_checkout_mode mode) return 1; } + argv_array_pushl(&cone_argv, "config", "--worktree", + "core.sparseCheckoutCone", NULL); + + if (mode == MODE_CONE_PATTERNS) + argv_array_push(&cone_argv, "true"); + else + argv_array_push(&cone_argv, "false"); + + if (run_command_v_opt(cone_argv.argv, RUN_GIT_CMD)) { + error(_("failed to enable core.sparseCheckoutCone")); + return 1; + } + return 0; } +static char const * const builtin_sparse_checkout_init_usage[] = { + N_("git sparse-checkout init [--cone]"), + NULL +}; + +static struct sparse_checkout_init_opts { + int cone_mode; +} init_opts; + static int sparse_checkout_init(int argc, const char **argv) { struct pattern_list pl; @@ -110,8 +135,21 @@ static int sparse_checkout_init(int argc, const char **argv) FILE *fp; int res; struct object_id oid; + int mode; - if (sc_set_config(MODE_ALL_PATTERNS)) + static struct option builtin_sparse_checkout_init_options[] = { + OPT_BOOL(0, "cone", &init_opts.cone_mode, + N_("initialize the sparse-checkout in cone mode")), + OPT_END(), + }; + + argc = parse_options(argc, argv, NULL, + builtin_sparse_checkout_init_options, + builtin_sparse_checkout_init_usage, 0); + + mode = init_opts.cone_mode ? 
MODE_CONE_PATTERNS : MODE_ALL_PATTERNS; + + if (sc_set_config(mode)) return 1; memset(&pl, 0, sizeof(pl)); @@ -140,6 +178,70 @@ static int sparse_checkout_init(int argc, const char **argv) return update_working_directory(); } +static void insert_recursive_pattern(struct pattern_list *pl, struct strbuf *path) +{ + struct pattern_entry *e = xmalloc(sizeof(*e)); + e->patternlen = path->len; + e->pattern = strbuf_detach(path, NULL); + hashmap_entry_init(&e->ent, memhash(e->pattern, e->patternlen)); + + hashmap_add(&pl->recursive_hashmap, &e->ent); + + while (e->patternlen) { + char *slash = strrchr(e->pattern, '/'); + char *oldpattern = e->pattern; + size_t newlen; + + if (slash == e->pattern) + break; + + newlen = slash - e->pattern; + e = xmalloc(sizeof(struct pattern_entry)); + e->patternlen = newlen; + e->pattern = xstrndup(oldpattern, newlen); + hashmap_entry_init(&e->ent, memhash(e->pattern, e->patternlen)); + + if (!hashmap_get_entry(&pl->parent_hashmap, e, ent, NULL)) + hashmap_add(&pl->parent_hashmap, &e->ent); + } +} + +static void write_cone_to_file(FILE *fp, struct pattern_list *pl) +{ + int i; + struct pattern_entry *pe; + struct hashmap_iter iter; + struct string_list sl = STRING_LIST_INIT_DUP; + + hashmap_for_each_entry(&pl->parent_hashmap, &iter, pe, ent) + string_list_insert(&sl, pe->pattern); + + string_list_sort(&sl); + string_list_remove_duplicates(&sl, 0); + + fprintf(fp, "/*\n!/*/\n"); + + for (i = 0; i < sl.nr; i++) { + char *pattern = sl.items[i].string; + + if (strlen(pattern)) + fprintf(fp, "%s/\n!%s/*/\n", pattern, pattern); + } + + string_list_clear(&sl, 0); + + hashmap_for_each_entry(&pl->recursive_hashmap, &iter, pe, ent) + string_list_insert(&sl, pe->pattern); + + string_list_sort(&sl); + string_list_remove_duplicates(&sl, 0); + + for (i = 0; i < sl.nr; i++) { + char *pattern = sl.items[i].string; + fprintf(fp, "%s/\n", pattern); + } +} + static int write_patterns_and_update(struct pattern_list *pl) { char *sparse_filename; @@ -147,13 +249,33 @@ static int write_patterns_and_update(struct pattern_list *pl) sparse_filename = get_sparse_checkout_filename(); fp = fopen(sparse_filename, "w"); - write_patterns_to_file(fp, pl); + + if (core_sparse_checkout_cone) + write_cone_to_file(fp, pl); + else + write_patterns_to_file(fp, pl); + fclose(fp); free(sparse_filename); return update_working_directory(); } +static void strbuf_to_cone_pattern(struct strbuf *line, struct pattern_list *pl) +{ + strbuf_trim(line); + + strbuf_trim_trailing_dir_sep(line); + + if (!line->len) + return; + + if (line->buf[0] != '/') + strbuf_insert(line, 0, "/", 1); + + insert_recursive_pattern(pl, line); +} + static char const * const builtin_sparse_checkout_set_usage[] = { N_("git sparse-checkout set [--stdin|]"), NULL @@ -184,17 +306,34 @@ static int sparse_checkout_set(int argc, const char **argv, const char *prefix) builtin_sparse_checkout_set_usage, PARSE_OPT_KEEP_UNKNOWN); - if (set_opts.use_stdin) { + if (core_sparse_checkout_cone) { struct strbuf line = STRBUF_INIT; - - while (!strbuf_getline(&line, stdin)) { - size_t len; - char *buf = strbuf_detach(&line, &len); - add_pattern(buf, empty_base, 0, &pl, 0); + hashmap_init(&pl.recursive_hashmap, pl_hashmap_cmp, NULL, 0); + hashmap_init(&pl.parent_hashmap, pl_hashmap_cmp, NULL, 0); + + if (set_opts.use_stdin) { + while (!strbuf_getline(&line, stdin)) + strbuf_to_cone_pattern(&line, &pl); + } else { + for (i = 0; i < argc; i++) { + strbuf_setlen(&line, 0); + strbuf_addstr(&line, argv[i]); + strbuf_to_cone_pattern(&line, &pl); + } } } else { 
- for (i = 0; i < argc; i++) - add_pattern(argv[i], empty_base, 0, &pl, 0); + if (set_opts.use_stdin) { + struct strbuf line = STRBUF_INIT; + + while (!strbuf_getline(&line, stdin)) { + size_t len; + char *buf = strbuf_detach(&line, &len); + add_pattern(buf, empty_base, 0, &pl, 0); + } + } else { + for (i = 0; i < argc; i++) + add_pattern(argv[i], empty_base, 0, &pl, 0); + } } if (!core_apply_sparse_checkout) { diff --git a/dir.c b/dir.c index dfabf9982fe59b..35c1ca9e241505 100644 --- a/dir.c +++ b/dir.c @@ -611,10 +611,10 @@ void parse_path_pattern(const char **pattern, *patternlen = len; } -static int pl_hashmap_cmp(const void *unused_cmp_data, - const struct hashmap_entry *a, - const struct hashmap_entry *b, - const void *key) +int pl_hashmap_cmp(const void *unused_cmp_data, + const struct hashmap_entry *a, + const struct hashmap_entry *b, + const void *key) { const struct pattern_entry *ee1 = container_of(a, struct pattern_entry, ent); diff --git a/dir.h b/dir.h index f8edbca72b3acc..8e232085cd5514 100644 --- a/dir.h +++ b/dir.h @@ -299,6 +299,10 @@ int is_excluded(struct dir_struct *dir, struct index_state *istate, const char *name, int *dtype); +int pl_hashmap_cmp(const void *unused_cmp_data, + const struct hashmap_entry *a, + const struct hashmap_entry *b, + const void *key); int hashmap_contains_parent(struct hashmap *map, const char *path, struct strbuf *buffer); diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh index 9907278fc10324..ae99803d40c46a 100755 --- a/t/t1091-sparse-checkout-builtin.sh +++ b/t/t1091-sparse-checkout-builtin.sh @@ -186,4 +186,55 @@ test_expect_success 'sparse-checkout disable' ' test_cmp expect dir ' +test_expect_success 'cone mode: init and set' ' + git -C repo sparse-checkout init --cone && + git -C repo config --list >config && + test_i18ngrep "core.sparsecheckoutcone=true" config && + ls repo >dir && + echo a >expect && + test_cmp expect dir && + git -C repo sparse-checkout set deep/deeper1/deepest/ 2>err && + test_line_count = 0 err && + ls repo >dir && + cat >expect <<-EOF && + a + deep + EOF + test_cmp expect dir && + ls repo/deep >dir && + cat >expect <<-EOF && + a + deeper1 + EOF + test_cmp expect dir && + ls repo/deep/deeper1 >dir && + cat >expect <<-EOF && + a + deepest + EOF + test_cmp expect dir && + cat >expect <<-EOF && + /* + !/*/ + /deep/ + !/deep/*/ + /deep/deeper1/ + !/deep/deeper1/*/ + /deep/deeper1/deepest/ + EOF + test_cmp expect repo/.git/info/sparse-checkout && + git -C repo sparse-checkout set --stdin 2>err <<-EOF && + folder1 + folder2 + EOF + test_line_count = 0 err && + cat >expect <<-EOF && + a + folder1 + folder2 + EOF + ls repo >dir && + test_cmp expect dir +' + test_done From b2522a9757c717cac3813baf198feba6c02377e0 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 21 Oct 2019 13:56:20 +0000 Subject: [PATCH 11/36] unpack-trees: hash less in cone mode The sparse-checkout feature in "cone mode" can use the fact that the recursive patterns are "connected" to the root via parent patterns to decide if a directory is entirely contained in the sparse-checkout or entirely removed. In these cases, we can skip hashing the paths within those directories and simply set the skipworktree bit to the correct value. 
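To illustrate the shortcut, here is a minimal sketch (not the code in
this patch; set_whole_dir() is a hypothetical helper, and the types
come from Git's cache.h and this series' dir.h): once a directory
prefix is known to match a recursive cone pattern, every index entry
under it shares the same decision, so the loop can flip the flag
without matching each path:

    #include "cache.h"
    #include "dir.h"

    /*
     * Sketch: 'cache'..'cache_end' span the index entries under one
     * directory. If the directory matched a recursive cone pattern,
     * everything inside it is wanted; clear the skip-worktree flag
     * on each entry without hashing or pattern-matching its path.
     */
    static void set_whole_dir(struct cache_entry **cache,
                              struct cache_entry **cache_end,
                              int clear_mask,
                              enum pattern_match_result dir_match)
    {
            struct cache_entry **ce;

            if (dir_match != MATCHED_RECURSIVE)
                    return; /* NOT_MATCHED: leave skip-worktree set */

            for (ce = cache; ce < cache_end; ce++)
                    (*ce)->ce_flags &= ~clear_mask;
    }
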
Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- dir.c | 4 ++-- dir.h | 1 + unpack-trees.c | 38 +++++++++++++++++++++++--------------- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/dir.c b/dir.c index 35c1ca9e241505..2ef92a50a04606 100644 --- a/dir.c +++ b/dir.c @@ -1270,7 +1270,7 @@ enum pattern_match_result path_matches_pattern_list( if (hashmap_contains_path(&pl->recursive_hashmap, &parent_pathname)) { - result = MATCHED; + result = MATCHED_RECURSIVE; goto done; } @@ -1292,7 +1292,7 @@ enum pattern_match_result path_matches_pattern_list( if (hashmap_contains_parent(&pl->recursive_hashmap, pathname, &parent_pathname)) - result = MATCHED; + result = MATCHED_RECURSIVE; done: strbuf_release(&parent_pathname); diff --git a/dir.h b/dir.h index 8e232085cd5514..77a43dbf89712f 100644 --- a/dir.h +++ b/dir.h @@ -264,6 +264,7 @@ enum pattern_match_result { UNDECIDED = -1, NOT_MATCHED = 0, MATCHED = 1, + MATCHED_RECURSIVE = 2, }; /* diff --git a/unpack-trees.c b/unpack-trees.c index a90d71845dbeda..c0dca208656728 100644 --- a/unpack-trees.c +++ b/unpack-trees.c @@ -1283,15 +1283,17 @@ static int clear_ce_flags_dir(struct index_state *istate, struct cache_entry **cache_end; int dtype = DT_DIR; int rc; - enum pattern_match_result ret; - ret = path_matches_pattern_list(prefix->buf, prefix->len, - basename, &dtype, pl, istate); + enum pattern_match_result ret, orig_ret; + orig_ret = path_matches_pattern_list(prefix->buf, prefix->len, + basename, &dtype, pl, istate); strbuf_addch(prefix, '/'); /* If undecided, use matching result of parent dir in defval */ - if (ret == UNDECIDED) + if (orig_ret == UNDECIDED) ret = default_match; + else + ret = orig_ret; for (cache_end = cache; cache_end != cache + nr; cache_end++) { struct cache_entry *ce = *cache_end; @@ -1299,17 +1301,23 @@ static int clear_ce_flags_dir(struct index_state *istate, break; } - /* - * TODO: check pl, if there are no patterns that may conflict - * with ret (iow, we know in advance the incl/excl - * decision for the entire directory), clear flag here without - * calling clear_ce_flags_1(). That function will call - * the expensive path_matches_pattern_list() on every entry. - */ - rc = clear_ce_flags_1(istate, cache, cache_end - cache, - prefix, - select_mask, clear_mask, - pl, ret); + if (pl->use_cone_patterns && orig_ret == MATCHED_RECURSIVE) { + struct cache_entry **ce = cache; + rc = (cache_end - cache) / sizeof(struct cache_entry *); + + while (ce < cache_end) { + (*ce)->ce_flags &= ~clear_mask; + ce++; + } + } else if (pl->use_cone_patterns && orig_ret == NOT_MATCHED) { + rc = (cache_end - cache) / sizeof(struct cache_entry *); + } else { + rc = clear_ce_flags_1(istate, cache, cache_end - cache, + prefix, + select_mask, clear_mask, + pl, ret); + } + strbuf_setlen(prefix, prefix->len - 1); return rc; } From 705f31cb7b01c538f12fa2a58b6aecf8cff69e51 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 21 Oct 2019 13:56:21 +0000 Subject: [PATCH 12/36] unpack-trees: add progress to clear_ce_flags() When a large repository has many sparse-checkout patterns, the process for updating the skip-worktree bits can take long enough that a user gets confused why nothing is happening. Update the clear_ce_flags() method to write progress. 
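The meter follows the usual pattern from progress.h; here is a minimal
sketch of that pattern, with the counter threaded through so each
visited cache entry advances the meter exactly once (example_loop() is
a hypothetical stand-in for the clear_ce_flags() recursion):

    #include "cache.h"
    #include "progress.h"

    static void example_loop(struct index_state *istate)
    {
            unsigned int i;

            /* "delayed" avoids flicker when the work finishes quickly */
            istate->progress = start_delayed_progress(
                    _("Updating index flags"), istate->cache_nr);

            for (i = 0; i < istate->cache_nr; i++)
                    display_progress(istate->progress, i + 1);

            stop_progress(&istate->progress);
    }
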
Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- cache.h | 2 ++ unpack-trees.c | 56 ++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 43 insertions(+), 15 deletions(-) diff --git a/cache.h b/cache.h index 4980ee198e0173..d3c89e7a53e85d 100644 --- a/cache.h +++ b/cache.h @@ -304,6 +304,7 @@ static inline unsigned int canon_mode(unsigned int mode) struct split_index; struct untracked_cache; +struct progress; struct index_state { struct cache_entry **cache; @@ -326,6 +327,7 @@ struct index_state { uint64_t fsmonitor_last_update; struct ewah_bitmap *fsmonitor_dirty; struct mem_pool *ce_mem_pool; + struct progress *progress; }; /* Name hashing */ diff --git a/unpack-trees.c b/unpack-trees.c index c0dca208656728..8bb684ad62aba4 100644 --- a/unpack-trees.c +++ b/unpack-trees.c @@ -1269,7 +1269,8 @@ static int clear_ce_flags_1(struct index_state *istate, struct strbuf *prefix, int select_mask, int clear_mask, struct pattern_list *pl, - enum pattern_match_result default_match); + enum pattern_match_result default_match, + int progress_nr); /* Whole directory matching */ static int clear_ce_flags_dir(struct index_state *istate, @@ -1278,7 +1279,8 @@ static int clear_ce_flags_dir(struct index_state *istate, char *basename, int select_mask, int clear_mask, struct pattern_list *pl, - enum pattern_match_result default_match) + enum pattern_match_result default_match, + int progress_nr) { struct cache_entry **cache_end; int dtype = DT_DIR; @@ -1315,7 +1317,8 @@ static int clear_ce_flags_dir(struct index_state *istate, rc = clear_ce_flags_1(istate, cache, cache_end - cache, prefix, select_mask, clear_mask, - pl, ret); + pl, ret, + progress_nr); } strbuf_setlen(prefix, prefix->len - 1); @@ -1342,7 +1345,8 @@ static int clear_ce_flags_1(struct index_state *istate, struct strbuf *prefix, int select_mask, int clear_mask, struct pattern_list *pl, - enum pattern_match_result default_match) + enum pattern_match_result default_match, + int progress_nr) { struct cache_entry **cache_end = cache + nr; @@ -1356,8 +1360,11 @@ static int clear_ce_flags_1(struct index_state *istate, int len, dtype; enum pattern_match_result ret; + display_progress(istate->progress, progress_nr); + if (select_mask && !(ce->ce_flags & select_mask)) { cache++; + progress_nr++; continue; } @@ -1378,20 +1385,26 @@ static int clear_ce_flags_1(struct index_state *istate, prefix, prefix->buf + prefix->len - len, select_mask, clear_mask, - pl, default_match); + pl, default_match, + progress_nr); /* clear_c_f_dir eats a whole dir already? 
*/ if (processed) { cache += processed; + progress_nr += processed; strbuf_setlen(prefix, prefix->len - len); continue; } strbuf_addch(prefix, '/'); - cache += clear_ce_flags_1(istate, cache, cache_end - cache, - prefix, - select_mask, clear_mask, pl, - default_match); + processed = clear_ce_flags_1(istate, cache, cache_end - cache, + prefix, + select_mask, clear_mask, pl, + default_match, progress_nr); + + cache += processed; + progress_nr += processed; + strbuf_setlen(prefix, prefix->len - len - 1); continue; } @@ -1406,19 +1419,27 @@ static int clear_ce_flags_1(struct index_state *istate, if (ret == MATCHED) ce->ce_flags &= ~clear_mask; cache++; + progress_nr++; } + + display_progress(istate->progress, progress_nr); return nr - (cache_end - cache); } static int clear_ce_flags(struct index_state *istate, int select_mask, int clear_mask, - struct pattern_list *pl) + struct pattern_list *pl, + int show_progress) { static struct strbuf prefix = STRBUF_INIT; char label[100]; int rval; strbuf_reset(&prefix); + if (show_progress) + istate->progress = start_delayed_progress( + _("Updating index flags"), + istate->cache_nr); xsnprintf(label, sizeof(label), "clear_ce_flags(0x%08lx,0x%08lx)", (unsigned long)select_mask, (unsigned long)clear_mask); @@ -1428,9 +1449,10 @@ static int clear_ce_flags(struct index_state *istate, istate->cache_nr, &prefix, select_mask, clear_mask, - pl, 0); + pl, 0, 0); trace2_region_leave("unpack_trees", label, the_repository); + stop_progress(&istate->progress); return rval; } @@ -1439,7 +1461,8 @@ static int clear_ce_flags(struct index_state *istate, */ static void mark_new_skip_worktree(struct pattern_list *pl, struct index_state *istate, - int select_flag, int skip_wt_flag) + int select_flag, int skip_wt_flag, + int show_progress) { int i; @@ -1463,7 +1486,7 @@ static void mark_new_skip_worktree(struct pattern_list *pl, * 2. Widen worktree according to sparse-checkout file. * Matched entries will have skip_wt_flag cleared (i.e. "in") */ - clear_ce_flags(istate, select_flag, skip_wt_flag, pl); + clear_ce_flags(istate, select_flag, skip_wt_flag, pl, show_progress); } static int verify_absent(const struct cache_entry *, @@ -1525,7 +1548,8 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options * Sparse checkout loop #1: set NEW_SKIP_WORKTREE on existing entries */ if (!o->skip_sparse_checkout) - mark_new_skip_worktree(o->pl, o->src_index, 0, CE_NEW_SKIP_WORKTREE); + mark_new_skip_worktree(o->pl, o->src_index, 0, + CE_NEW_SKIP_WORKTREE, o->verbose_update); if (!dfc) dfc = xcalloc(1, cache_entry_size(0)); @@ -1590,7 +1614,9 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options * If the will have NEW_SKIP_WORKTREE, also set CE_SKIP_WORKTREE * so apply_sparse_checkout() won't attempt to remove it from worktree */ - mark_new_skip_worktree(o->pl, &o->result, CE_ADDED, CE_SKIP_WORKTREE | CE_NEW_SKIP_WORKTREE); + mark_new_skip_worktree(o->pl, &o->result, + CE_ADDED, CE_SKIP_WORKTREE | CE_NEW_SKIP_WORKTREE, + o->verbose_update); ret = 0; for (i = 0; i < o->result.cache_nr; i++) { From 4326bc1811fb65d096614e722b255b1166d47bde Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 21 Oct 2019 13:56:22 +0000 Subject: [PATCH 13/36] read-tree: show progress by default The read-tree builtin has a --verbose option that signals to show progress and other data while updating the index. Update this to be on by default when stderr is a terminal window. 
This will help tools like 'git sparse-checkout' to automatically benefit from progress indicators when a user runs these commands. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- builtin/read-tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/builtin/read-tree.c b/builtin/read-tree.c index ca5e655d2f8b42..69963d83dc62dd 100644 --- a/builtin/read-tree.c +++ b/builtin/read-tree.c @@ -162,6 +162,7 @@ int cmd_read_tree(int argc, const char **argv, const char *cmd_prefix) opts.head_idx = -1; opts.src_index = &the_index; opts.dst_index = &the_index; + opts.verbose_update = isatty(2); git_config(git_read_tree_config, NULL); From f9e96bf90e2d6aa97ef64a1740eb00e31e2f6814 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 21 Oct 2019 13:56:23 +0000 Subject: [PATCH 14/36] sparse-checkout: sanitize for nested folders If a user provides folders A/ and A/B/ for inclusion in a cone-mode sparse-checkout file, the parsing logic will notice that A/ appears both as a "parent" type pattern and as a "recursive" type pattern. This is unexpected and hence will complain via a warning and revert to the old logic for checking sparse-checkout patterns. Prevent this from happening accidentally by sanitizing the folders for this type of inclusion in the 'git sparse-checkout' builtin. This happens in two ways: 1. Do not include any parent patterns that also appear as recursive patterns. 2. Do not include any recursive patterns deeper than other recursive patterns. In order to minimize duplicate code for scanning parents, create hashmap_contains_parent() method. It takes a strbuf buffer to avoid reallocating a buffer when calling in a tight loop. Helped-by: Eric Wong Helped-by: Johannes Schindelin Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- builtin/sparse-checkout.c | 22 ++++++++++++++++++---- t/t1091-sparse-checkout-builtin.sh | 11 +++++++++++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c index 6c3bf68c894e74..b02e085495e7e3 100644 --- a/builtin/sparse-checkout.c +++ b/builtin/sparse-checkout.c @@ -212,9 +212,17 @@ static void write_cone_to_file(FILE *fp, struct pattern_list *pl) struct pattern_entry *pe; struct hashmap_iter iter; struct string_list sl = STRING_LIST_INIT_DUP; + struct strbuf parent_pattern = STRBUF_INIT; - hashmap_for_each_entry(&pl->parent_hashmap, &iter, pe, ent) - string_list_insert(&sl, pe->pattern); + hashmap_for_each_entry(&pl->parent_hashmap, &iter, pe, ent) { + if (hashmap_get_entry(&pl->recursive_hashmap, pe, ent, NULL)) + continue; + + if (!hashmap_contains_parent(&pl->recursive_hashmap, + pe->pattern, + &parent_pattern)) + string_list_insert(&sl, pe->pattern); + } string_list_sort(&sl); string_list_remove_duplicates(&sl, 0); @@ -230,8 +238,14 @@ static void write_cone_to_file(FILE *fp, struct pattern_list *pl) string_list_clear(&sl, 0); - hashmap_for_each_entry(&pl->recursive_hashmap, &iter, pe, ent) - string_list_insert(&sl, pe->pattern); + hashmap_for_each_entry(&pl->recursive_hashmap, &iter, pe, ent) { + if (!hashmap_contains_parent(&pl->recursive_hashmap, + pe->pattern, + &parent_pattern)) + string_list_insert(&sl, pe->pattern); + } + + strbuf_release(&parent_pattern); string_list_sort(&sl); string_list_remove_duplicates(&sl, 0); diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh index ae99803d40c46a..0792caeb7e3195 100755 --- a/t/t1091-sparse-checkout-builtin.sh +++ b/t/t1091-sparse-checkout-builtin.sh @@ -237,4 +237,15 @@ 
test_expect_success 'cone mode: init and set' ' test_cmp expect dir ' +test_expect_success 'cone mode: set with nested folders' ' + git -C repo sparse-checkout set deep deep/deeper1/deepest 2>err && + test_line_count = 0 err && + cat >expect <<-EOF && + /* + !/*/ + /deep/ + EOF + test_cmp repo/.git/info/sparse-checkout expect +' + test_done From efd9c53b6d4a79029f5fb67be36c6a7930eaed8e Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 21 Oct 2019 13:56:24 +0000 Subject: [PATCH 15/36] sparse-checkout: update working directory in-process The sparse-checkout builtin used 'git read-tree -mu HEAD' to update the skip-worktree bits in the index and to update the working directory. This extra process is overly complex, and prone to failure. It also requires that we write our changes to the sparse-checkout file before trying to update the index. Remove this extra process call by creating a direct call to unpack_trees() in the same way 'git read-tree -mu HEAD' does. In addition, provide an in-memory list of patterns so we can avoid reading from the sparse-checkout file. This allows us to test a proposed change to the file before writing to it. An earlier version of this patch included a bug when the 'set' command failed due to the "Sparse checkout leaves no entry on working directory" error. It would not rollback the index.lock file, so the replay of the old sparse-checkout specification would fail. A test in t1091 now covers that scenario. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- builtin/read-tree.c | 2 +- builtin/sparse-checkout.c | 83 +++++++++++++++++++++++++----- t/t1091-sparse-checkout-builtin.sh | 28 ++++++++++ unpack-trees.c | 5 +- unpack-trees.h | 3 +- 5 files changed, 105 insertions(+), 16 deletions(-) diff --git a/builtin/read-tree.c b/builtin/read-tree.c index 69963d83dc62dd..d7eeaa26ec1790 100644 --- a/builtin/read-tree.c +++ b/builtin/read-tree.c @@ -186,7 +186,7 @@ int cmd_read_tree(int argc, const char **argv, const char *cmd_prefix) if (opts.reset || opts.merge || opts.prefix) { if (read_cache_unmerged() && (opts.prefix || opts.merge)) - die("You need to resolve your current index first"); + die(_("You need to resolve your current index first")); stage = opts.merge = 1; } resolve_undo_clear(); diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c index b02e085495e7e3..ea2e7ee51bdd8a 100644 --- a/builtin/sparse-checkout.c +++ b/builtin/sparse-checkout.c @@ -7,6 +7,11 @@ #include "run-command.h" #include "strbuf.h" #include "string-list.h" +#include "cache.h" +#include "cache-tree.h" +#include "lockfile.h" +#include "resolve-undo.h" +#include "unpack-trees.h" static char const * const builtin_sparse_checkout_usage[] = { N_("git sparse-checkout (init|list|set|disable) "), @@ -60,18 +65,54 @@ static int sparse_checkout_list(int argc, const char **argv) return 0; } -static int update_working_directory(void) +static int update_working_directory(struct pattern_list *pl) { - struct argv_array argv = ARGV_ARRAY_INIT; int result = 0; - argv_array_pushl(&argv, "read-tree", "-m", "-u", "HEAD", NULL); + struct unpack_trees_options o; + struct lock_file lock_file = LOCK_INIT; + struct object_id oid; + struct tree *tree; + struct tree_desc t; + struct repository *r = the_repository; - if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) { - error(_("failed to update index with new sparse-checkout paths")); - result = 1; - } + if (repo_read_index_unmerged(r)) + die(_("You need to resolve your current index first")); + + if (get_oid("HEAD", &oid)) + return 0; + + 
tree = parse_tree_indirect(&oid); + parse_tree(tree); + init_tree_desc(&t, tree->buffer, tree->size); + + memset(&o, 0, sizeof(o)); + o.verbose_update = isatty(2); + o.merge = 1; + o.update = 1; + o.fn = oneway_merge; + o.head_idx = -1; + o.src_index = r->index; + o.dst_index = r->index; + o.skip_sparse_checkout = 0; + o.pl = pl; + o.keep_pattern_list = !!pl; + + resolve_undo_clear_index(r->index); + setup_work_tree(); + + cache_tree_free(&r->index->cache_tree); + + repo_hold_locked_index(r, &lock_file, LOCK_DIE_ON_ERROR); + + core_apply_sparse_checkout = 1; + result = unpack_trees(1, &t, &o); + + if (!result) { + prime_cache_tree(r, r->index, tree); + write_locked_index(r->index, &lock_file, COMMIT_LOCK); + } else + rollback_lock_file(&lock_file); - argv_array_clear(&argv); return result; } @@ -147,7 +188,11 @@ static int sparse_checkout_init(int argc, const char **argv) builtin_sparse_checkout_init_options, builtin_sparse_checkout_init_usage, 0); - mode = init_opts.cone_mode ? MODE_CONE_PATTERNS : MODE_ALL_PATTERNS; + if (init_opts.cone_mode) { + mode = MODE_CONE_PATTERNS; + core_sparse_checkout_cone = 1; + } else + mode = MODE_ALL_PATTERNS; if (sc_set_config(mode)) return 1; @@ -175,7 +220,8 @@ static int sparse_checkout_init(int argc, const char **argv) } reset_dir: - return update_working_directory(); + core_apply_sparse_checkout = 1; + return update_working_directory(NULL); } static void insert_recursive_pattern(struct pattern_list *pl, struct strbuf *path) @@ -260,6 +306,15 @@ static int write_patterns_and_update(struct pattern_list *pl) { char *sparse_filename; FILE *fp; + int result; + + result = update_working_directory(pl); + + if (result) { + clear_pattern_list(pl); + update_working_directory(NULL); + return result; + } sparse_filename = get_sparse_checkout_filename(); fp = fopen(sparse_filename, "w"); @@ -270,9 +325,11 @@ static int write_patterns_and_update(struct pattern_list *pl) write_patterns_to_file(fp, pl); fclose(fp); + free(sparse_filename); + clear_pattern_list(pl); - return update_working_directory(); + return 0; } static void strbuf_to_cone_pattern(struct strbuf *line, struct pattern_list *pl) @@ -324,6 +381,7 @@ static int sparse_checkout_set(int argc, const char **argv, const char *prefix) struct strbuf line = STRBUF_INIT; hashmap_init(&pl.recursive_hashmap, pl_hashmap_cmp, NULL, 0); hashmap_init(&pl.parent_hashmap, pl_hashmap_cmp, NULL, 0); + pl.use_cone_patterns = 1; if (set_opts.use_stdin) { while (!strbuf_getline(&line, stdin)) @@ -378,7 +436,8 @@ static int sparse_checkout_disable(int argc, const char **argv) fprintf(fp, "/*\n"); fclose(fp); - if (update_working_directory()) + core_apply_sparse_checkout = 1; + if (update_working_directory(NULL)) die(_("error while refreshing working directory")); unlink(sparse_filename); diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh index 0792caeb7e3195..a580b795952294 100755 --- a/t/t1091-sparse-checkout-builtin.sh +++ b/t/t1091-sparse-checkout-builtin.sh @@ -248,4 +248,32 @@ test_expect_success 'cone mode: set with nested folders' ' test_cmp repo/.git/info/sparse-checkout expect ' +test_expect_success 'revert to old sparse-checkout on bad update' ' + echo update >repo/deep/deeper2/a && + cp repo/.git/info/sparse-checkout expect && + test_must_fail git -C repo sparse-checkout set deep/deeper1 2>err && + test_i18ngrep "Cannot update sparse checkout" err && + test_cmp repo/.git/info/sparse-checkout expect && + ls repo/deep >dir && + cat >expect <<-EOF && + a + deeper1 + deeper2 + EOF + 
test_cmp dir expect +' + +test_expect_success 'revert to old sparse-checkout on empty update' ' + git init empty-test && + ( + echo >file && + git add file && + git commit -m "test" && + test_must_fail git sparse-checkout set nothing 2>err && + test_i18ngrep "Sparse checkout leaves no entry on working directory" err && + test_i18ngrep ! ".git/index.lock" err && + git sparse-checkout set file + ) +' + test_done diff --git a/unpack-trees.c b/unpack-trees.c index 8bb684ad62aba4..3789a22cf0a519 100644 --- a/unpack-trees.c +++ b/unpack-trees.c @@ -1511,7 +1511,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options memset(&pl, 0, sizeof(pl)); if (!core_apply_sparse_checkout || !o->update) o->skip_sparse_checkout = 1; - if (!o->skip_sparse_checkout) { + if (!o->skip_sparse_checkout && !o->pl) { char *sparse = git_pathdup("info/sparse-checkout"); pl.use_cone_patterns = core_sparse_checkout_cone; if (add_patterns_from_file_to_list(sparse, "", 0, &pl, NULL) < 0) @@ -1684,7 +1684,8 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options done: trace_performance_leave("unpack_trees"); - clear_pattern_list(&pl); + if (!o->keep_pattern_list) + clear_pattern_list(&pl); return ret; return_failed: diff --git a/unpack-trees.h b/unpack-trees.h index f2eee0c7c54cf3..ca94a421a5dd84 100644 --- a/unpack-trees.h +++ b/unpack-trees.h @@ -59,7 +59,8 @@ struct unpack_trees_options { quiet, exiting_early, show_all_errors, - dry_run; + dry_run, + keep_pattern_list; const char *prefix; int cache_bottom; struct dir_struct *dir; From a879c5bfb5738d7f8468af1f9b02f12e6bb73c38 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 21 Oct 2019 13:56:25 +0000 Subject: [PATCH 16/36] sparse-checkout: write using lockfile If two 'git sparse-checkout set' subcommands are launched at the same time, the behavior can be unexpected as they compete to write the sparse-checkout file and update the working directory. Take a lockfile around the writes to the sparse-checkout file. In addition, acquire this lock around the working directory update to avoid two commands updating the working directory in different ways. 
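For reference, the shape of the lockfile.h pattern this adopts is as
follows (a sketch; write_file_locked() is a hypothetical helper). The
new content is written to "<path>.lock" and atomically renamed over
"<path>" on commit, and a competing writer already holding the lock
makes hold_lock_file_for_update() die with "File exists", which the
new test relies on:

    #include "cache.h"
    #include "lockfile.h"

    static int write_file_locked(const char *path, const char *content)
    {
            struct lock_file lk = LOCK_INIT;
            int fd = hold_lock_file_for_update(&lk, path,
                                               LOCK_DIE_ON_ERROR);
            FILE *fp = xfdopen(fd, "w");

            fprintf(fp, "%s", content);
            fflush(fp);                   /* flush before the rename */
            return commit_lock_file(&lk); /* path.lock -> path */
    }
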
Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- builtin/sparse-checkout.c | 15 ++++++++++++--- t/t1091-sparse-checkout-builtin.sh | 7 +++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/builtin/sparse-checkout.c b/builtin/sparse-checkout.c index ea2e7ee51bdd8a..76f65d8eddd513 100644 --- a/builtin/sparse-checkout.c +++ b/builtin/sparse-checkout.c @@ -306,25 +306,34 @@ static int write_patterns_and_update(struct pattern_list *pl) { char *sparse_filename; FILE *fp; + int fd; + struct lock_file lk = LOCK_INIT; int result; result = update_working_directory(pl); + sparse_filename = get_sparse_checkout_filename(); + fd = hold_lock_file_for_update(&lk, sparse_filename, + LOCK_DIE_ON_ERROR); + + result = update_working_directory(pl); if (result) { + rollback_lock_file(&lk); + free(sparse_filename); clear_pattern_list(pl); update_working_directory(NULL); return result; } - sparse_filename = get_sparse_checkout_filename(); - fp = fopen(sparse_filename, "w"); + fp = xfdopen(fd, "w"); if (core_sparse_checkout_cone) write_cone_to_file(fp, pl); else write_patterns_to_file(fp, pl); - fclose(fp); + fflush(fp); + commit_lock_file(&lk); free(sparse_filename); clear_pattern_list(pl); diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh index a580b795952294..2db706a9e43465 100755 --- a/t/t1091-sparse-checkout-builtin.sh +++ b/t/t1091-sparse-checkout-builtin.sh @@ -276,4 +276,11 @@ test_expect_success 'revert to old sparse-checkout on empty update' ' ) ' +test_expect_success 'fail when lock is taken' ' + test_when_finished rm -rf repo/.git/info/sparse-checkout.lock && + touch repo/.git/info/sparse-checkout.lock && + test_must_fail git -C repo sparse-checkout set deep 2>err && + test_i18ngrep "File exists" err +' + test_done From 20e7bd3da4f602ca898bd887a479bb95dd96a670 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 21 Oct 2019 13:56:26 +0000 Subject: [PATCH 17/36] sparse-checkout: cone mode should not interact with .gitignore During the development of the sparse-checkout "cone mode" feature, an incorrect placement of the initializer for "use_cone_patterns = 1" caused warnings to show up when a .gitignore file was present with non-cone-mode patterns. This was fixed in the original commit introducing the cone mode, but now we should add a test to avoid hitting this problem again in the future. Signed-off-by: Derrick Stolee Signed-off-by: Junio C Hamano --- t/t1091-sparse-checkout-builtin.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh index 2db706a9e43465..4205e03449bc07 100755 --- a/t/t1091-sparse-checkout-builtin.sh +++ b/t/t1091-sparse-checkout-builtin.sh @@ -283,4 +283,11 @@ test_expect_success 'fail when lock is taken' ' test_i18ngrep "File exists" err ' +test_expect_success '.gitignore should not warn about cone mode' ' + git -C repo config --worktree core.sparseCheckoutCone true && + echo "**/bin/*" >repo/.gitignore && + git -C repo reset --hard 2>err && + test_i18ngrep ! "disabling cone patterns" err +' + test_done From 3641e269732bd78abc532ff02ce6a351fd25d1e0 Mon Sep 17 00:00:00 2001 From: Kevin Willford Date: Fri, 1 Nov 2019 13:46:19 -0600 Subject: [PATCH 18/36] fsmonitor: fix watchman integration When running Git commands quickly -- such as in a shell script or the test suite -- the Git commands frequently complete and start again during the same second. 
The example fsmonitor hooks to integrate with Watchman truncate the nanosecond times to seconds. In principle, this is fine, as Watchman claims to use inclusive comparisons [1]. The result should only be an over-representation of the changed paths since the last Git command. However, Watchman's own documentation claims "Using a timestamp is prone to race conditions in understanding the complete state of the file tree" [2]. All of their documented examples use a "clockspec" that looks like 'c:123:234'. Git should eventually learn how to store this type of string to provide a stronger integration, but that will be a more invasive change. When using GIT_TEST_FSMONITOR="$(pwd)/t7519/fsmonitor-watchman", scripts such as t7519-wtstatus.sh fail due to these race conditions. In fact, running any test script with GIT_TEST_FSMONITOR pointing at t/t7519/fsmonitor-wathcman will cause failures in the test_commit function. The 'git add "$indir$file"' command fails due to not enough time between the creation of '$file' and the 'git add' command. For now, subtract one second from the timestamp we pass to Watchman. This will make our window large enough to avoid these race conditions. Increasing the window causes tests like t7519-wtstatus.sh to pass. When the integration was introduced in def437671 (fsmonitor: add a sample integration script for Watchman, 2018-09-22), the query included an expression that would ignore files created and deleted in that window. The performance reason for this change was to ignore temporary files created by a build between Git commands. However, this causes failures in script scenarios where Git is creating or deleting files quickly. When using GIT_TEST_FSMONITOR as before, t2203-add-intent.sh fails due to this add-and-delete race condition. By removing the "expression" from the Watchman query, we remove this race condition. It will lead to some performance degradation in the case of users creating and deleting temporary files inside their working directory between Git commands. However, that is a cost we need to pay to be correct. [1] https://github.com/facebook/watchman/blob/master/query/since.cpp#L35-L39 [2] https://facebook.github.io/watchman/docs/clockspec.html Helped-by: Derrick Stolee Signed-off-by: Kevin Willford --- t/t7519/fsmonitor-watchman | 13 ++++--------- templates/hooks--fsmonitor-watchman.sample | 13 ++++--------- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/t/t7519/fsmonitor-watchman b/t/t7519/fsmonitor-watchman index 5514edcf68be80..d8e7a1e5ba85c0 100755 --- a/t/t7519/fsmonitor-watchman +++ b/t/t7519/fsmonitor-watchman @@ -23,7 +23,8 @@ my ($version, $time) = @ARGV; if ($version == 1) { # convert nanoseconds to seconds - $time = int $time / 1000000000; + # subtract one second to make sure watchman will return all changes + $time = int ($time / 1000000000) - 1; } else { die "Unsupported query-fsmonitor hook version '$version'.\n" . "Falling back to scanning...\n"; @@ -54,18 +55,12 @@ sub launch_watchman { # # To accomplish this, we're using the "since" generator to use the # recency index to select candidate nodes and "fields" to limit the - # output to file names only. Then we're using the "expression" term to - # further constrain the results. - # - # The category of transient files that we want to ignore will have a - # creation clock (cclock) newer than $time_t value and will also not - # currently exist. + # output to file names only. 
my $query = <<" END"; ["query", "$git_work_tree", { "since": $time, - "fields": ["name"], - "expression": ["not", ["allof", ["since", $time, "cclock"], ["not", "exists"]]] + "fields": ["name"] }] END diff --git a/templates/hooks--fsmonitor-watchman.sample b/templates/hooks--fsmonitor-watchman.sample index e673bb3980f3c2..ef94fa293800b3 100755 --- a/templates/hooks--fsmonitor-watchman.sample +++ b/templates/hooks--fsmonitor-watchman.sample @@ -22,7 +22,8 @@ my ($version, $time) = @ARGV; if ($version == 1) { # convert nanoseconds to seconds - $time = int $time / 1000000000; + # subtract one second to make sure watchman will return all changes + $time = int ($time / 1000000000) - 1; } else { die "Unsupported query-fsmonitor hook version '$version'.\n" . "Falling back to scanning...\n"; @@ -53,18 +54,12 @@ sub launch_watchman { # # To accomplish this, we're using the "since" generator to use the # recency index to select candidate nodes and "fields" to limit the - # output to file names only. Then we're using the "expression" term to - # further constrain the results. - # - # The category of transient files that we want to ignore will have a - # creation clock (cclock) newer than $time_t value and will also not - # currently exist. + # output to file names only. my $query = <<" END"; ["query", "$git_work_tree", { "since": $time, - "fields": ["name"], - "expression": ["not", ["allof", ["since", $time, "cclock"], ["not", "exists"]]] + "fields": ["name"] }] END From 5a5aefda1d99b8bb067d2abba9e7a685833143cc Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Thu, 3 Oct 2019 13:21:26 -0400 Subject: [PATCH 19/36] credential: set trace2_child_class for credential manager children Signed-off-by: Jeff Hostetler --- credential.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/credential.c b/credential.c index 62be651b03b55e..d89bcf969fb65c 100644 --- a/credential.c +++ b/credential.c @@ -224,6 +224,8 @@ static int run_credential_helper(struct credential *c, else helper.no_stdout = 1; + helper.trace2_child_class = "cred"; + if (start_command(&helper) < 0) return -1; From fc9c77fe77a3c6757715921c5655e5afb234df57 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 18 Sep 2019 10:35:45 -0400 Subject: [PATCH 20/36] sub-process: do not borrow cmd pointer from caller Teach subprocess_start() to use a copy of the passed `cmd` string rather than borrowing the buffer from the caller. Some callers of subprocess_start() pass the value returned from find_hook() which points to a static buffer and therefore is only good until the next call to find_hook(). This could cause problems for the long-running background processes managed by sub-process.c where later calls to subprocess_find_entry() to get an existing process will fail. This could cause more than 1 long-running process to be created. TODO Need to confirm, but if only read_object_hook() uses TODO subprocess_start() in this manner, we could drop this TODO commit when we drop support for read_object_hook(). 
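A minimal sketch of the hazard described above (demonstrate_stale_pointer()
is a hypothetical function, assuming both hooks exist in the repository):

    #include "cache.h"
    #include "run-command.h"

    static void demonstrate_stale_pointer(void)
    {
            const char *first = find_hook("read-object");

            (void)find_hook("post-checkout");

            /*
             * 'first' pointed into find_hook()'s static buffer; the
             * second call rewrote (or reallocated) that buffer, so
             * 'first' is now stale and must not be used.
             */
            (void)first;
    }
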
Signed-off-by: Jeff Hostetler --- sub-process.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sub-process.c b/sub-process.c index 1b1af9dcbd9599..a49052900e89a6 100644 --- a/sub-process.c +++ b/sub-process.c @@ -80,7 +80,12 @@ int subprocess_start(struct hashmap *hashmap, struct subprocess_entry *entry, co int err; struct child_process *process; - entry->cmd = cmd; + // BUGBUG most callers to subprocess_start() pass in "cmd" the value + // BUGBUG of find_hook() which returns a static buffer (that's only + // BUGBUG good until the next call to find_hook()). + // BUGFIX Defer assignment until we copy the string in our argv. + // entry->cmd = cmd; + process = &entry->process; child_process_init(process); @@ -92,6 +97,8 @@ int subprocess_start(struct hashmap *hashmap, struct subprocess_entry *entry, co process->clean_on_exit_handler = subprocess_exit_handler; process->trace2_child_class = "subprocess"; + entry->cmd = process->args.argv[0]; + err = start_command(process); if (err) { error("cannot fork to run subprocess '%s'", cmd); From 3019adb0672f521e4a49d3c47ee0d021fc05ee46 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 18 Sep 2019 10:45:58 -0400 Subject: [PATCH 21/36] sub-process: add subprocess_start_argv() Add function to start a subprocess with an argv. Signed-off-by: Jeff Hostetler --- sub-process.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ sub-process.h | 6 ++++++ 2 files changed, 53 insertions(+) diff --git a/sub-process.c b/sub-process.c index a49052900e89a6..cd05ef28521f3a 100644 --- a/sub-process.c +++ b/sub-process.c @@ -4,6 +4,7 @@ #include "sub-process.h" #include "sigchain.h" #include "pkt-line.h" +#include "quote.h" int cmd2process_cmp(const void *unused_cmp_data, const struct hashmap_entry *eptr, @@ -118,6 +119,52 @@ int subprocess_start(struct hashmap *hashmap, struct subprocess_entry *entry, co return 0; } +int subprocess_start_argv(struct hashmap *hashmap, + struct subprocess_entry *entry, + int is_git_cmd, + const struct argv_array *argv, + subprocess_start_fn startfn) +{ + int err; + int k; + struct child_process *process; + struct strbuf quoted = STRBUF_INIT; + + process = &entry->process; + + child_process_init(process); + for (k = 0; k < argv->argc; k++) + argv_array_push(&process->args, argv->argv[k]); + process->use_shell = 1; + process->in = -1; + process->out = -1; + process->git_cmd = is_git_cmd; + process->clean_on_exit = 1; + process->clean_on_exit_handler = subprocess_exit_handler; + process->trace2_child_class = "subprocess"; + + sq_quote_argv_pretty("ed, argv->argv); + entry->cmd = strbuf_detach("ed, 0); + + err = start_command(process); + if (err) { + error("cannot fork to run subprocess '%s'", entry->cmd); + return err; + } + + hashmap_entry_init(&entry->ent, strhash(entry->cmd)); + + err = startfn(entry); + if (err) { + error("initialization for subprocess '%s' failed", entry->cmd); + subprocess_stop(hashmap, entry); + return err; + } + + hashmap_add(hashmap, &entry->ent); + return 0; +} + static int handshake_version(struct child_process *process, const char *welcome_prefix, int *versions, int *chosen_version) diff --git a/sub-process.h b/sub-process.h index e85f21fa1a7c2b..df91febac6179a 100644 --- a/sub-process.h +++ b/sub-process.h @@ -57,6 +57,12 @@ typedef int(*subprocess_start_fn)(struct subprocess_entry *entry); int subprocess_start(struct hashmap *hashmap, struct subprocess_entry *entry, const char *cmd, subprocess_start_fn startfn); +int subprocess_start_argv(struct hashmap *hashmap, + struct 
subprocess_entry *entry, + int is_git_cmd, + const struct argv_array *argv, + subprocess_start_fn startfn); + /* Kill a subprocess and remove it from the subprocess hashmap. */ void subprocess_stop(struct hashmap *hashmap, struct subprocess_entry *entry); From 15d8415ded2b6870448a195a68e68ed39381ac49 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Tue, 24 Sep 2019 14:31:10 -0400 Subject: [PATCH 22/36] sha1-file: add function to update existing loose object cache Create a function to add a new object to the loose object cache after the existing odb/xx/ directory was scanned. This will be used in a later commit to keep the loose object cache fresh after dynamically fetching an individual object and without requiring the odb/xx/ directory to be rescanned. Signed-off-by: Jeff Hostetler --- object-store.h | 8 ++++++++ sha1-file.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/object-store.h b/object-store.h index 7f7b3cdd806b75..203678186ea625 100644 --- a/object-store.h +++ b/object-store.h @@ -56,6 +56,14 @@ void add_to_alternates_memory(const char *dir); struct oid_array *odb_loose_cache(struct object_directory *odb, const struct object_id *oid); +/* + * Add a new object to the loose object cache (possibly after the + * cache was populated). This might be used after dynamically + * fetching a missing object. + */ +void odb_loose_cache_add_new_oid(struct object_directory *odb, + const struct object_id *oid); + /* Empty the loose object cache for the specified object directory. */ void odb_clear_loose_cache(struct object_directory *odb); diff --git a/sha1-file.c b/sha1-file.c index 384fd7b1d7f7eb..c960dfa348b8d5 100644 --- a/sha1-file.c +++ b/sha1-file.c @@ -2482,6 +2482,39 @@ struct oid_array *odb_loose_cache(struct object_directory *odb, return &odb->loose_objects_cache[subdir_nr]; } +void odb_loose_cache_add_new_oid(struct object_directory *odb, + const struct object_id *oid) +{ + int subdir_nr = oid->hash[0]; + + if (subdir_nr < 0 || + subdir_nr >= ARRAY_SIZE(odb->loose_objects_subdir_seen)) + BUG("subdir_nr out of range"); + + /* + * If the looose object cache already has an oid_array covering + * cell [xx], we assume that the cache was loaded *before* the + * new object was created, so we just need to append our new one + * to the existing array. + * + * Otherwise, cause the [xx] cell to be created by scanning the + * directory. And since this happens *after* our caller created + * the loose object, we don't need to explicitly add it to the + * array. + * + * TODO If the subdir has not been seen, we don't technically + * TODO need to force load it now. We could wait and let our + * TODO caller (or whoever requested the missing object) cause + * TODO try to read the xx/ object and fill the cache. + * TODO Not sure it matters either way. + */ + if (odb->loose_objects_subdir_seen[subdir_nr]) + append_loose_object(oid, NULL, + &odb->loose_objects_cache[subdir_nr]); + else + odb_loose_cache(odb, oid); +} + void odb_clear_loose_cache(struct object_directory *odb) { int i; From 82592603462e3d59bf876944da94143924b5cbaa Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 25 Sep 2019 13:36:54 -0400 Subject: [PATCH 23/36] packfile: add install_packed_git_and_mru() Create a function to install a new packfile into the packed-git list and add it to the head of the MRU list. This function will be used in a later commit to install packfiles created by dynamic object fetching. 
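The anticipated call site looks like this (a sketch matching the
gvfs-helper patch later in this series; register_new_pack() is a
hypothetical wrapper): after a new pack and its .idx have been
written, register the pack and put it at the head of the MRU list so
the very next object lookup tries it first:

    #include "cache.h"
    #include "repository.h"
    #include "packfile.h"

    static void register_new_pack(const char *idx_path, int is_local)
    {
            struct packed_git *p;

            p = add_packed_git(idx_path, strlen(idx_path), is_local);
            if (p)
                    install_packed_git_and_mru(the_repository, p);
    }
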
Signed-off-by: Jeff Hostetler --- packfile.c | 6 ++++++ packfile.h | 1 + 2 files changed, 7 insertions(+) diff --git a/packfile.c b/packfile.c index 9feaa1e836ddf4..127960f92b7db5 100644 --- a/packfile.c +++ b/packfile.c @@ -759,6 +759,12 @@ void install_packed_git(struct repository *r, struct packed_git *pack) r->objects->packed_git = pack; } +void install_packed_git_and_mru(struct repository *r, struct packed_git *pack) +{ + install_packed_git(r, pack); + list_add(&pack->mru, &r->objects->packed_git_mru); +} + void (*report_garbage)(unsigned seen_bits, const char *path); static void report_helper(const struct string_list *list, diff --git a/packfile.h b/packfile.h index 13a5368c9f06e9..f9ea6c733b416b 100644 --- a/packfile.h +++ b/packfile.h @@ -53,6 +53,7 @@ extern void (*report_garbage)(unsigned seen_bits, const char *path); void reprepare_packed_git(struct repository *r); void install_packed_git(struct repository *r, struct packed_git *pack); +void install_packed_git_and_mru(struct repository *r, struct packed_git *pack); struct packed_git *get_packed_git(struct repository *r); struct list_head *get_packed_git_mru(struct repository *r); From abfc0a402237eac83f00a32d354fb8cbae2f6d6f Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Tue, 24 Sep 2019 15:51:16 -0400 Subject: [PATCH 24/36] index-pack: avoid immediate object fetch while parsing packfile Prevent packfile parsing from accidentally dynamically fetching each individual object found in the packfile. When index-pack parses the input packfile, it does a lookup in the ODB to test for conflicts/collisions. This can accidentally cause the object to be individually fetched when gvfs-helper (or read-object-hook or partial-clone) is enabled. Signed-off-by: Jeff Hostetler --- builtin/index-pack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index 60a5591039abbd..81f228e2739afc 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -781,7 +781,7 @@ static void sha1_object(const void *data, struct object_entry *obj_entry, if (startup_info->have_repository) { read_lock(); collision_test_needed = - has_object_file_with_flags(oid, OBJECT_INFO_QUICK); + has_object_file_with_flags(oid, OBJECT_INFO_FOR_PREFETCH); read_unlock(); } From 706d2aec7a3048c49ea0c1267b5012db4d3fddc7 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Tue, 13 Aug 2019 12:12:08 -0400 Subject: [PATCH 25/36] gvfs-helper: create tool to fetch objects using the GVFS Protocol Create gvfs-helper. This is a helper tool to use the GVFS Protocol REST API to fetch objects and configuration data from a GVFS cache-server or Git server. This tool uses libcurl to send object requests to either server. This tool creates loose objects and/or packfiles. Create gvfs-helper-client. This code resides within git proper and uses the sub-process API to manage gvfs-helper as a long-running background process. 
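From the caller's side, the intended pattern is to batch object
requests and pay for at most one round-trip to the background process.
A sketch using the client API added by this patch (fetch_missing() is
a hypothetical caller):

    #include "cache.h"
    #include "gvfs-helper-client.h"

    static void fetch_missing(const struct object_id *oids, int nr)
    {
            enum ghc__created created;
            int i;

            for (i = 0; i < nr; i++)
                    ghc__queue_oid(&oids[i]); /* records the OID only */

            /* one bulk "get" request to the long-running gvfs-helper */
            if (ghc__drain_queue(&created))
                    warning("gvfs-helper could not fetch all objects");
    }
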
Signed-off-by: Jeff Hostetler --- .gitignore | 1 + Documentation/config.txt | 2 + Documentation/config/core.txt | 3 + Documentation/config/gvfs.txt | 5 + Makefile | 7 + cache.h | 3 + config.c | 40 + environment.c | 3 + gvfs-helper-client.c | 403 ++++++ gvfs-helper-client.h | 61 + gvfs-helper.c | 2235 +++++++++++++++++++++++++++++++++ promisor-remote.c | 12 +- sha1-file.c | 30 + 13 files changed, 2804 insertions(+), 1 deletion(-) create mode 100644 Documentation/config/gvfs.txt create mode 100644 gvfs-helper-client.c create mode 100644 gvfs-helper-client.h create mode 100644 gvfs-helper.c diff --git a/.gitignore b/.gitignore index aebe7c0908f168..f6fe9647302f0b 100644 --- a/.gitignore +++ b/.gitignore @@ -72,6 +72,7 @@ /git-gc /git-get-tar-commit-id /git-grep +/git-gvfs-helper /git-hash-object /git-help /git-http-backend diff --git a/Documentation/config.txt b/Documentation/config.txt index 5bd5d994dfbbe5..577b0259978831 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -371,6 +371,8 @@ include::config/gui.txt[] include::config/guitool.txt[] +include::config/gvfs.txt[] + include::config/help.txt[] include::config/http.txt[] diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt index 8fccebd7d7faca..4d1fd4f2daf4ee 100644 --- a/Documentation/config/core.txt +++ b/Documentation/config/core.txt @@ -661,6 +661,9 @@ core.gvfs:: flag just blocks them from occurring at all. -- +core.useGvfsHelper:: + TODO + core.sparseCheckout:: Enable "sparse checkout" feature. See linkgit:git-sparse-checkout[1] for more information. diff --git a/Documentation/config/gvfs.txt b/Documentation/config/gvfs.txt new file mode 100644 index 00000000000000..6ab221ded36c91 --- /dev/null +++ b/Documentation/config/gvfs.txt @@ -0,0 +1,5 @@ +gvfs.cache-server:: + TODO + +gvfs.sharedcache:: + TODO diff --git a/Makefile b/Makefile index eb4220170ad479..2fae073ede08c1 100644 --- a/Makefile +++ b/Makefile @@ -893,6 +893,7 @@ LIB_OBJS += gpg-interface.o LIB_OBJS += graph.o LIB_OBJS += grep.o LIB_OBJS += gvfs.o +LIB_OBJS += gvfs-helper-client.o LIB_OBJS += hashmap.o LIB_OBJS += linear-assignment.o LIB_OBJS += help.o @@ -1370,6 +1371,8 @@ else CURL_LIBCURL += $(shell $(CURL_CONFIG) --libs) endif + PROGRAM_OBJS += gvfs-helper.o + REMOTE_CURL_PRIMARY = git-remote-http$X REMOTE_CURL_ALIASES = git-remote-https$X git-remote-ftp$X git-remote-ftps$X REMOTE_CURL_NAMES = $(REMOTE_CURL_PRIMARY) $(REMOTE_CURL_ALIASES) @@ -2480,6 +2483,10 @@ $(REMOTE_CURL_PRIMARY): remote-curl.o http.o http-walker.o GIT-LDFLAGS $(GITLIBS $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \ $(CURL_LIBCURL) $(EXPAT_LIBEXPAT) $(LIBS) +git-gvfs-helper$X: gvfs-helper.o http.o GIT-LDFLAGS $(GITLIBS) + $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \ + $(CURL_LIBCURL) $(EXPAT_LIBEXPAT) $(LIBS) + $(LIB_FILE): $(LIB_OBJS) $(QUIET_AR)$(RM) $@ && $(AR) $(ARFLAGS) $@ $^ diff --git a/cache.h b/cache.h index e6f8f916e7bfa5..5e055a4a5e59fa 100644 --- a/cache.h +++ b/cache.h @@ -926,6 +926,9 @@ extern int precomposed_unicode; extern int protect_hfs; extern int protect_ntfs; extern const char *core_fsmonitor; +extern int core_use_gvfs_helper; +extern const char *gvfs_cache_server_url; +extern const char *gvfs_shared_cache_pathname; extern int core_apply_sparse_checkout; extern int core_sparse_checkout_cone; diff --git a/config.c b/config.c index 801184d0a36856..02ed8947af103f 100644 --- a/config.c +++ b/config.c @@ -21,6 +21,7 @@ #include "color.h" #include "refs.h" #include "gvfs.h" 
+#include "transport.h" struct config_source { struct config_source *prev; @@ -1365,6 +1366,11 @@ static int git_default_core_config(const char *var, const char *value, void *cb) return 0; } + if (!strcmp(var, "core.usegvfshelper")) { + core_use_gvfs_helper = git_config_bool(var, value); + return 0; + } + if (!strcmp(var, "core.sparsecheckout")) { /* virtual file system relies on the sparse checkout logic so force it on */ if (core_virtualfilesystem) @@ -1490,6 +1496,37 @@ static int git_default_mailmap_config(const char *var, const char *value) return 0; } +static int git_default_gvfs_config(const char *var, const char *value) +{ + if (!strcmp(var, "gvfs.cache-server")) { + const char *v2 = NULL; + + if (!git_config_string(&v2, var, value) && v2 && *v2) + gvfs_cache_server_url = transport_anonymize_url(v2); + free((char*)v2); + return 0; + } + + if (!strcmp(var, "gvfs.sharedcache") && value && *value) { + struct strbuf buf = STRBUF_INIT; + strbuf_addstr(&buf, value); + if (strbuf_normalize_path(&buf) < 0) { + /* + * Pretend it wasn't set. This will cause us to + * fallback to ".git/objects" effectively. + */ + strbuf_release(&buf); + return 0; + } + strbuf_trim_trailing_dir_sep(&buf); + + gvfs_shared_cache_pathname = strbuf_detach(&buf, NULL); + return 0; + } + + return 0; +} + int git_default_config(const char *var, const char *value, void *cb) { if (starts_with(var, "core.")) @@ -1536,6 +1573,9 @@ int git_default_config(const char *var, const char *value, void *cb) return 0; } + if (starts_with(var, "gvfs.")) + return git_default_gvfs_config(var, value); + /* Add other config variables here and to Documentation/config.txt. */ return 0; } diff --git a/environment.c b/environment.c index 617f38fc63d13e..62543cc16bc366 100644 --- a/environment.c +++ b/environment.c @@ -86,6 +86,9 @@ int protect_hfs = PROTECT_HFS_DEFAULT; #endif int protect_ntfs = PROTECT_NTFS_DEFAULT; const char *core_fsmonitor; +int core_use_gvfs_helper; +const char *gvfs_cache_server_url; +const char *gvfs_shared_cache_pathname; /* * The character that begins a commented line in user-editable file diff --git a/gvfs-helper-client.c b/gvfs-helper-client.c new file mode 100644 index 00000000000000..4c67a2cb2131f6 --- /dev/null +++ b/gvfs-helper-client.c @@ -0,0 +1,403 @@ +#include "cache.h" +#include "argv-array.h" +#include "trace2.h" +#include "oidset.h" +#include "object.h" +#include "object-store.h" +#include "gvfs-helper-client.h" +#include "sub-process.h" +#include "sigchain.h" +#include "pkt-line.h" +#include "quote.h" +#include "packfile.h" + +static struct oidset ghc__oidset_queued = OIDSET_INIT; +static unsigned long ghc__oidset_count; +static int ghc__includes_immediate; + +struct ghs__process { + struct subprocess_entry subprocess; /* must be first */ + unsigned int supported_capabilities; +}; + +static int ghs__subprocess_map_initialized; +static struct hashmap ghs__subprocess_map; +static struct object_directory *ghs__chosen_odb; + +#define CAP_GET (1u<<1) + +static int ghc__start_fn(struct subprocess_entry *subprocess) +{ + static int versions[] = {1, 0}; + static struct subprocess_capability capabilities[] = { + { "get", CAP_GET }, + { NULL, 0 } + }; + + struct ghs__process *entry = (struct ghs__process *)subprocess; + + return subprocess_handshake(subprocess, "gvfs-helper", versions, + NULL, capabilities, + &entry->supported_capabilities); +} + +/* + * Send: + * + * get LF + * ( LF)* + * + * + */ +static int ghc__get__send_command(struct child_process *process) +{ + struct oidset_iter iter; + struct 
object_id *oid; + int err; + + /* + * We assume that all of the packet_ routines call error() + * so that we don't have to. + */ + + err = packet_write_fmt_gently(process->in, "get\n"); + if (err) + return err; + + oidset_iter_init(&ghc__oidset_queued, &iter); + while ((oid = oidset_iter_next(&iter))) { + err = packet_write_fmt_gently(process->in, "%s\n", + oid_to_hex(oid)); + if (err) + return err; + } + + err = packet_flush_gently(process->in); + if (err) + return err; + + return 0; +} + +/* + * Verify that the pathname found in the "odb" response line matches + * what we requested. + * + * Since we ALWAYS send a "--shared-cache=" arg to "gvfs-helper", + * we should be able to verify that the value is what we requested. + * In particular, I don't see a need to try to search for the response + * value in from our list of alternates. + */ +static void ghc__verify_odb_line(const char *line) +{ + const char *v1_odb_path; + + if (!skip_prefix(line, "odb ", &v1_odb_path)) + BUG("verify_odb_line: invalid line '%s'", line); + + if (!ghs__chosen_odb || strcmp(v1_odb_path, ghs__chosen_odb->path)) + BUG("verify_odb_line: unexpeced odb path '%s' vs '%s'", + v1_odb_path, ghs__chosen_odb->path); +} + +/* + * Update the loose object cache to include the newly created + * object. + */ +static void ghc__update_loose_cache(const char *line) +{ + const char *v1_oid; + struct object_id oid; + + if (!skip_prefix(line, "loose ", &v1_oid)) + BUG("update_loose_cache: invalid line '%s'", line); + + odb_loose_cache_add_new_oid(ghs__chosen_odb, &oid); +} + +/* + * Update the packed-git list to include the newly created packfile. + */ +static void ghc__update_packed_git(const char *line) +{ + struct strbuf path = STRBUF_INIT; + const char *v1_filename; + struct packed_git *p; + int is_local; + + if (!skip_prefix(line, "packfile ", &v1_filename)) + BUG("update_packed_git: invalid line '%s'", line); + + /* + * ODB[0] is the local .git/objects. All others are alternates. + */ + is_local = (ghs__chosen_odb == the_repository->objects->odb); + + strbuf_addf(&path, "%s/pack/%s", ghs__chosen_odb->path, v1_filename); + strbuf_strip_suffix(&path, ".pack"); + strbuf_addstr(&path, ".idx"); + + p = add_packed_git(path.buf, path.len, is_local); + if (p) + install_packed_git_and_mru(the_repository, p); +} + +/* + * We expect: + * + * + * * + * + * + * + * Where: + * + * ::= odb SP LF + * + * ::= / + * + * ::= packfile SP LF + * + * ::= loose SP LF + * + * ::= ok LF + * / partial LF + * / error SP LF + * + * Note that `gvfs-helper` controls how/if it chunks the request when + * it talks to the cache-server and/or main Git server. So it is + * possible for us to receive many packfiles and/or loose objects *AND + * THEN* get a hard network error or a 404 on an individual object. + * + * If we get a partial result, we can let the caller try to continue + * -- for example, maybe an immediate request for a tree object was + * grouped with a queued request for a blob. The tree-walk *might* be + * able to continue and let the 404 blob be handled later. + */ +static int ghc__get__receive_response(struct child_process *process, + enum ghc__created *p_ghc, + int *p_nr_loose, int *p_nr_packfile) +{ + enum ghc__created ghc = GHC__CREATED__NOTHING; + const char *v1; + char *line; + int len; + int err = 0; + + while (1) { + /* + * Warning: packet_read_line_gently() calls die() + * despite the _gently moniker. 
+ */ + len = packet_read_line_gently(process->out, NULL, &line); + if ((len < 0) || !line) + break; + + if (starts_with(line, "odb")) { + ghc__verify_odb_line(line); + } + + else if (starts_with(line, "packfile")) { + ghc__update_packed_git(line); + ghc |= GHC__CREATED__PACKFILE; + *p_nr_packfile += 1; + } + + else if (starts_with(line, "loose")) { + ghc__update_loose_cache(line); + ghc |= GHC__CREATED__LOOSE; + *p_nr_loose += 1; + } + + else if (starts_with(line, "ok")) + ; + else if (starts_with(line, "partial")) + ; + else if (skip_prefix(line, "error ", &v1)) { + error("gvfs-helper error: '%s'", v1); + err = -1; + } + } + + *p_ghc = ghc; + + return err; +} + +static void ghc__choose_odb(void) +{ + struct object_directory *odb; + + if (ghs__chosen_odb) + return; + + prepare_alt_odb(the_repository); + + if (gvfs_shared_cache_pathname && *gvfs_shared_cache_pathname) { + for (odb = the_repository->objects->odb; odb; odb = odb->next) { + if (!strcmp(odb->path, gvfs_shared_cache_pathname)) { + ghs__chosen_odb = odb; + return; + } + } + } + + /* + * Use .git/objects if "gvfs.sharedcache" not set or set to an + * unknown pathname. + */ + ghs__chosen_odb = the_repository->objects->odb; +} + +static int ghc__get(enum ghc__created *p_ghc) +{ + struct ghs__process *entry; + struct child_process *process; + struct argv_array argv = ARGV_ARRAY_INIT; + struct strbuf quoted = STRBUF_INIT; + int nr_loose = 0; + int nr_packfile = 0; + int err = 0; + + trace2_region_enter("gh-client", "get", the_repository); + + ghc__choose_odb(); + + /* + * TODO decide what defaults we want. + */ + argv_array_push(&argv, "gvfs-helper"); + argv_array_push(&argv, "--fallback"); + argv_array_push(&argv, "--cache-server=trust"); + argv_array_pushf(&argv, "--shared-cache=%s", ghs__chosen_odb->path); + argv_array_push(&argv, "server"); + + sq_quote_argv_pretty("ed, argv.argv); + + if (!ghs__subprocess_map_initialized) { + ghs__subprocess_map_initialized = 1; + hashmap_init(&ghs__subprocess_map, + (hashmap_cmp_fn)cmd2process_cmp, NULL, 0); + entry = NULL; + } else + entry = (struct ghs__process *)subprocess_find_entry( + &ghs__subprocess_map, quoted.buf); + + if (!entry) { + entry = xmalloc(sizeof(*entry)); + entry->supported_capabilities = 0; + + err = subprocess_start_argv( + &ghs__subprocess_map, &entry->subprocess, 1, + &argv, ghc__start_fn); + if (err) { + free(entry); + goto leave_region; + } + } + + process = &entry->subprocess.process; + + if (!(CAP_GET & entry->supported_capabilities)) { + error("gvfs-helper: does not support GET"); + subprocess_stop(&ghs__subprocess_map, + (struct subprocess_entry *)entry); + free(entry); + err = -1; + goto leave_region; + } + + sigchain_push(SIGPIPE, SIG_IGN); + + err = ghc__get__send_command(process); + if (!err) + err = ghc__get__receive_response(process, p_ghc, + &nr_loose, &nr_packfile); + + sigchain_pop(SIGPIPE); + + if (err) { + subprocess_stop(&ghs__subprocess_map, + (struct subprocess_entry *)entry); + free(entry); + } + +leave_region: + argv_array_clear(&argv); + strbuf_release("ed); + + trace2_data_intmax("gh-client", the_repository, + "get/immediate", ghc__includes_immediate); + + trace2_data_intmax("gh-client", the_repository, + "get/nr_objects", ghc__oidset_count); + + if (nr_loose) + trace2_data_intmax("gh-client", the_repository, + "get/nr_loose", nr_loose); + + if (nr_packfile) + trace2_data_intmax("gh-client", the_repository, + "get/nr_packfile", nr_packfile); + + if (err) + trace2_data_intmax("gh-client", the_repository, + "get/error", err); + + 
trace2_region_leave("gh-client", "get", the_repository);
+
+ oidset_clear(&ghc__oidset_queued);
+ ghc__oidset_count = 0;
+ ghc__includes_immediate = 0;
+
+ return err;
+}
+
+void ghc__queue_oid(const struct object_id *oid)
+{
+ // TODO consider removing this trace2. it is useful for interactive
+ // TODO debugging, but may generate way too much noise for a data
+ // TODO event.
+ trace2_printf("ghc__queue_oid: %s", oid_to_hex(oid));
+
+ if (!oidset_insert(&ghc__oidset_queued, oid))
+ ghc__oidset_count++;
+}
+
+/*
+ * This routine should actually take a "const struct oid_array *"
+ * rather than the component parts, but fetch_objects() uses
+ * this model (because of the call in sha1-file.c).
+ */
+void ghc__queue_oid_array(const struct object_id *oids, int oid_nr)
+{
+ int k;
+
+ for (k = 0; k < oid_nr; k++)
+ ghc__queue_oid(&oids[k]);
+}
+
+int ghc__drain_queue(enum ghc__created *p_ghc)
+{
+ *p_ghc = GHC__CREATED__NOTHING;
+
+ if (!ghc__oidset_count)
+ return 0;
+
+ return ghc__get(p_ghc);
+}
+
+int ghc__get_immediate(const struct object_id *oid, enum ghc__created *p_ghc)
+{
+ ghc__includes_immediate = 1;
+
+ // TODO consider removing this trace2. it is useful for interactive
+ // TODO debugging, but may generate way too much noise for a data
+ // TODO event.
+ trace2_printf("ghc__get_immediate: %s", oid_to_hex(oid));
+
+ if (!oidset_insert(&ghc__oidset_queued, oid))
+ ghc__oidset_count++;
+
+ return ghc__drain_queue(p_ghc);
+}
diff --git a/gvfs-helper-client.h b/gvfs-helper-client.h
new file mode 100644
index 00000000000000..61f026d1faa233
--- /dev/null
+++ b/gvfs-helper-client.h
@@ -0,0 +1,61 @@
+#ifndef GVFS_HELPER_CLIENT_H
+#define GVFS_HELPER_CLIENT_H
+
+struct repository;
+struct commit;
+
+enum ghc__created {
+ /*
+ * The _get_ operation did not create anything. It doesn't
+ * matter if `gvfs-helper` had errors or not -- just that
+ * nothing was created.
+ */
+ GHC__CREATED__NOTHING = 0,
+
+ /*
+ * The _get_ operation created one or more packfiles.
+ */
+ GHC__CREATED__PACKFILE = 1<<1,
+
+ /*
+ * The _get_ operation created one or more loose objects.
+ * (Not necessarily for the individual OID you requested.)
+ */
+ GHC__CREATED__LOOSE = 1<<2,
+
+ /*
+ * The _get_ operation created one or more packfiles *and*
+ * one or more loose objects.
+ */
+ GHC__CREATED__PACKFILE_AND_LOOSE = (GHC__CREATED__PACKFILE |
+ GHC__CREATED__LOOSE),
+};
+
+/*
+ * Ask `gvfs-helper server` to immediately fetch an object.
+ * Wait for the response.
+ *
+ * This may also fetch any queued (non-immediate) objects and
+ * so may create one or more loose objects and/or packfiles.
+ * It is undefined whether the requested OID will be loose or
+ * in a packfile.
+ */
+int ghc__get_immediate(const struct object_id *oid, enum ghc__created *p_ghc);
+
+/*
+ * Queue this OID for a future fetch using `gvfs-helper server`.
+ * It does not wait.
+ *
+ * The GHC layer is free to process this queue in any way it wants,
+ * including individual fetches, bulk fetches, and batching. And
+ * it may add queued objects to immediate requests.
+ *
+ * Callers should not rely on the queued object being on disk until
+ * the queue has been drained.
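+ *
+ * A minimal usage sketch (hypothetical caller code):
+ *
+ * enum ghc__created ghc;
+ *
+ * ghc__queue_oid_array(oids, oid_nr);
+ * if (ghc__drain_queue(&ghc))
+ * die("fetch failed");
+ * // on success the queued objects are in the ODB (loose or packed)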
+ */
+void ghc__queue_oid(const struct object_id *oid);
+void ghc__queue_oid_array(const struct object_id *oids, int oid_nr);
+
+int ghc__drain_queue(enum ghc__created *p_ghc);
+
+#endif /* GVFS_HELPER_CLIENT_H */
diff --git a/gvfs-helper.c b/gvfs-helper.c
new file mode 100644
index 00000000000000..6e9c0dfb707183
--- /dev/null
+++ b/gvfs-helper.c
@@ -0,0 +1,2235 @@
+// TODO Write a man page. Here are some notes for dogfooding.
+// TODO
+//
+// Usage: git gvfs-helper [<main_options>] <sub-command> [<sub-command-options>]
+//
+// <main_options>:
+//
+// --remote=<remote-name> // defaults to "origin"
+//
+// --fallback // boolean. defaults to off
+//
+// When a fetch from the cache-server fails, automatically
+// fallback to the main Git server. This option has no effect
+// if no cache-server is defined.
+//
+// --cache-server=<use> // defaults to "verify"
+//
+// verify := lookup the set of defined cache-servers using
+// "gvfs/config" and confirm that the selected
+// cache-server is well-known. Silently disable the
+// cache-server if not. (See security notes later.)
+//
+// error := verify cache-server and abort if not well-known.
+//
+// trust := do not verify cache-server. just use it.
+//
+// disable := disable the cache-server and always use the main
+// Git server.
+//
+// --shared-cache=<odb-directory-pathname>
+//
+// A relative or absolute pathname to the ODB directory to store
+// fetched objects.
+//
+// If this option is not specified, we default to the value
+// in the "gvfs.sharedcache" config setting and then to the
+// local ".git/objects" directory.
+//
+// <sub-command>:
+//
+// config
+//
+// Fetch the "gvfs/config" string from the main Git server.
+// (The cache-server setting is ignored because cache-servers
+// do not support this REST API.)
+//
+// get
+//
+// Fetch 1 or more objects. If a cache-server is configured,
+// try it first. Optionally fallback to the main Git server.
+//
+// The set of objects is given on stdin and is assumed to be
+// a list of <oid>, one per line.
+//
+// <options>:
+//
+// --block-size=<n> // defaults to "4000"
+//
+// Request objects from server in batches of at
+// most n objects (not bytes).
+//
+// --depth=<depth> // defaults to "1"
+//
+// server
+//
+// Interactive/sub-process mode. Listen for a series of commands
+// and data on stdin and return results on stdout. This command
+// uses pkt-line format [1] and implements the long-running process
+// protocol [2] to communicate with the foreground/parent process.
+//
+// <options>:
+//
+// --block-size=<n> // defaults to "4000"
+//
+// Request objects from server in batches of at
+// most n objects (not bytes).
+//
+// --depth=<depth> // defaults to "1"
+//
+// Interactive verb: get
+//
+// Fetch 1 or more objects. If a cache-server is configured,
+// try it first. Optionally fallback to the main Git server.
+// Create 1 or more loose objects and/or packfiles in the
+// requested shared-cache directory (given on the command
+// line and which is reported at the beginning of the
+// response).
+//
+// git> get
+// git> <oid>
+// git> <oid>
+// git> ...
+// git> <oid>
+// git> 0000
+//
+// git< odb <directory>
+// git< loose <oid> | packfile <filename>
+// git< loose <oid> | packfile <filename>
+// git< ...
+// git< loose <oid> | packfile <filename>
+// git< ok | partial | error <message>
+// git< 0000
+//
+// [1] Documentation/technical/protocol-common.txt
+// [2] Documentation/technical/long-running-process-protocol.txt
+// [3] See GIT_TRACE_PACKET
+//
+// Example:
+//
+// $ git -c core.virtualizeobjects=false -c core.usegvfshelper=false
+// rev-list --objects --no-walk --missing=print HEAD
+// | grep "^?"
+// | sed 's/^?//'
+// | git gvfs-helper get
+//
+// Note: In this example, we need to turn off "core.virtualizeobjects" and
+// "core.usegvfshelper" when building the list of objects. This prevents
+// rev-list (in oid_object_info_extended()) from automatically fetching
+// them with read-object-hook or "gvfs-helper server" sub-process (and
+// defeating the whole purpose of this example).
+//
+//////////////////////////////////////////////////////////////////
+
+#include "cache.h"
+#include "config.h"
+#include "remote.h"
+#include "connect.h"
+#include "strbuf.h"
+#include "walker.h"
+#include "http.h"
+#include "exec-cmd.h"
+#include "run-command.h"
+#include "pkt-line.h"
+#include "string-list.h"
+#include "sideband.h"
+#include "argv-array.h"
+#include "credential.h"
+#include "sha1-array.h"
+#include "send-pack.h"
+#include "protocol.h"
+#include "quote.h"
+#include "transport.h"
+#include "parse-options.h"
+#include "object-store.h"
+#include "json-writer.h"
+#include "tempfile.h"
+#include "oidset.h"
+#include "dir.h"
+#include "progress.h"
+
+static const char * const main_usage[] = {
+ N_("git gvfs-helper [<main_options>] config [<options>]"),
+ N_("git gvfs-helper [<main_options>] get [<options>]"),
+ N_("git gvfs-helper [<main_options>] server [<options>]"),
+ NULL
+};
+
+static const char *const get_usage[] = {
+ N_("git gvfs-helper [<main_options>] get [<options>]"),
+ NULL
+};
+
+static const char *const server_usage[] = {
+ N_("git gvfs-helper [<main_options>] server [<options>]"),
+ NULL
+};
+
+#define GH__DEFAULT_BLOCK_SIZE 4000
+
+/*
+ * Our exit-codes.
+ */
+enum gh__error_code {
+ GH__ERROR_CODE__USAGE = -1, /* will be mapped to usage() */
+ GH__ERROR_CODE__OK = 0,
+ GH__ERROR_CODE__ERROR = 1, /* unspecified */
+// GH__ERROR_CODE__CACHE_SERVER_NOT_FOUND = 2,
+ GH__ERROR_CODE__CURL_ERROR = 3,
+ GH__ERROR_CODE__HTTP_401 = 4,
+ GH__ERROR_CODE__HTTP_404 = 5,
+ GH__ERROR_CODE__HTTP_UNEXPECTED_CODE = 6,
+ GH__ERROR_CODE__UNEXPECTED_CONTENT_TYPE = 7,
+ GH__ERROR_CODE__COULD_NOT_CREATE_TEMPFILE = 8,
+ GH__ERROR_CODE__COULD_NOT_INSTALL_LOOSE = 9,
+ GH__ERROR_CODE__COULD_NOT_INSTALL_PACKFILE = 10,
+ GH__ERROR_CODE__SUBPROCESS_SYNTAX = 11,
+};
+
+enum gh__cache_server_mode {
+ /* verify URL. disable if unknown. */
+ GH__CACHE_SERVER_MODE__VERIFY_DISABLE = 0,
+ /* verify URL. error if unknown. */
+ GH__CACHE_SERVER_MODE__VERIFY_ERROR,
+ /* disable the cache-server, if defined */
+ GH__CACHE_SERVER_MODE__DISABLE,
+ /* trust any cache-server */
+ GH__CACHE_SERVER_MODE__TRUST_WITHOUT_VERIFY,
+};
+
+/*
+ * The set of command line, config, and environment variables
+ * that we use as input to decide how we should operate.
+ */
+static struct gh__cmd_opts {
+ const char *remote_name;
+
+ int try_fallback; /* to git server if cache-server fails */
+ int show_progress;
+
+ int depth;
+ int block_size;
+
+ enum gh__cache_server_mode cache_server_mode;
+} gh__cmd_opts;
+
+/*
+ * The chosen global state derived from the inputs in gh__cmd_opts.
+ */
+static struct gh__global {
+ struct remote *remote;
+
+ struct credential main_creds;
+ struct credential cache_creds;
+
+ const char *main_url;
+ const char *cache_server_url;
+
+ struct strbuf buf_odb_path;
+
+ int http_is_initialized;
+ int cache_server_is_initialized; /* did sub-command look for one */
+ int main_creds_need_approval; /* try to only approve them once */
+
+} gh__global;
+
+/*
+ * Stolen from http.c
+ */
+static CURLcode gh__curlinfo_strbuf(CURL *curl, CURLINFO info, struct strbuf *buf)
+{
+ char *ptr;
+ CURLcode ret;
+
+ strbuf_reset(buf);
+ ret = curl_easy_getinfo(curl, info, &ptr);
+ if (!ret && ptr)
+ strbuf_addstr(buf, ptr);
+ return ret;
+}
+
+enum gh__progress_state {
+ GH__PROGRESS_STATE__START = 0,
+ GH__PROGRESS_STATE__PHASE1,
+ GH__PROGRESS_STATE__PHASE2,
+ GH__PROGRESS_STATE__PHASE3,
+};
+
+/*
+ * Parameters to drive an HTTP request (with any necessary retries).
+ */
+struct gh__request_params {
+ int b_is_post; /* POST=1 or GET=0 */
+ int b_write_to_file; /* write to file=1 or strbuf=0 */
+ int b_no_cache_server; /* force main server only */
+
+ unsigned long object_count; /* number of objects being fetched */
+
+ const struct strbuf *post_payload; /* POST body to send */
+
+ struct curl_slist *headers; /* additional http headers to send */
+ struct tempfile *tempfile; /* for response content when file */
+ struct strbuf *buffer; /* for response content when strbuf */
+ struct strbuf label; /* for trace2 regions */
+
+ struct strbuf loose_path;
+
+ /*
+ * Note that I am putting all of the progress-related instance data
+ * inside the request-params in the hope that we can eventually
+ * do multi-threaded/concurrent HTTP requests when chunking
+ * large requests. However, the underlying "struct progress" API
+ * is not thread safe (that is, it doesn't allow concurrent progress
+ * reports (since that might require multiple lines on the screen
+ * or something)).
+ */
+ enum gh__progress_state progress_state;
+ struct strbuf progress_base_phase2_msg;
+ struct strbuf progress_base_phase3_msg;
+
+ /*
+ * The buffer for the formatted progress message is shared by the
+ * "struct progress" API and must remain valid for the duration of
+ * the start_progress..stop_progress lifespan.
+ */
+ struct strbuf progress_msg;
+ struct progress *progress;
+};
+
+#define GH__REQUEST_PARAMS_INIT { \
+ .b_is_post = 0, \
+ .b_write_to_file = 0, \
+ .b_no_cache_server = 0, \
+ .object_count = 0, \
+ .post_payload = NULL, \
+ .headers = NULL, \
+ .tempfile = NULL, \
+ .buffer = NULL, \
+ .label = STRBUF_INIT, \
+ .loose_path = STRBUF_INIT, \
+ .progress_state = GH__PROGRESS_STATE__START, \
+ .progress_base_phase2_msg = STRBUF_INIT, \
+ .progress_base_phase3_msg = STRBUF_INIT, \
+ .progress_msg = STRBUF_INIT, \
+ .progress = NULL, \
+ }
+
+static void gh__request_params__release(struct gh__request_params *params)
+{
+ if (!params)
+ return;
+
+ params->post_payload = NULL; /* we do not own this */
+
+ curl_slist_free_all(params->headers);
+ params->headers = NULL;
+
+ delete_tempfile(&params->tempfile);
+
+ params->buffer = NULL; /* we do not own this */
+
+ strbuf_release(&params->label);
+ strbuf_release(&params->loose_path);
+
+ strbuf_release(&params->progress_base_phase2_msg);
+ strbuf_release(&params->progress_base_phase3_msg);
+ strbuf_release(&params->progress_msg);
+
+ stop_progress(&params->progress);
+ params->progress = NULL;
+}
+
+/*
+ * Bucket to describe the results of an HTTP request (may be
+ * overwritten during retries so that it describes the final attempt).
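+ *
+ * For example (illustrative values), a successful download would
+ * leave: response_code=200, curl_code=CURLE_OK,
+ * ec=GH__ERROR_CODE__OK, and bytes_received equal to the size of
+ * the response body.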
+ */
+struct gh__response_status {
+ struct strbuf error_message;
+ struct strbuf content_type;
+ long response_code; /* http response code */
+ CURLcode curl_code;
+ enum gh__error_code ec;
+ intmax_t bytes_received;
+};
+
+#define GH__RESPONSE_STATUS_INIT { \
+ .error_message = STRBUF_INIT, \
+ .content_type = STRBUF_INIT, \
+ .response_code = 0, \
+ .curl_code = CURLE_OK, \
+ .ec = GH__ERROR_CODE__OK, \
+ .bytes_received = 0, \
+ }
+
+static void gh__response_status__zero(struct gh__response_status *s)
+{
+ strbuf_setlen(&s->error_message, 0);
+ strbuf_setlen(&s->content_type, 0);
+ s->response_code = 0;
+ s->curl_code = CURLE_OK;
+ s->ec = GH__ERROR_CODE__OK;
+ s->bytes_received = 0;
+}
+
+/*
+ * Create a single normalized 'ec' error-code from the status we
+ * received from the HTTP request. Map a few of the expected HTTP
+ * status codes to 'ec', but don't get too crazy here.
+ */
+static void gh__response_status__set_from_slot(
+ struct gh__request_params *params,
+ struct gh__response_status *status,
+ const struct active_request_slot *slot)
+{
+ status->curl_code = slot->results->curl_result;
+ gh__curlinfo_strbuf(slot->curl, CURLINFO_CONTENT_TYPE,
+ &status->content_type);
+ curl_easy_getinfo(slot->curl, CURLINFO_RESPONSE_CODE,
+ &status->response_code);
+
+ strbuf_setlen(&status->error_message, 0);
+
+ if (status->response_code == 200)
+ status->ec = GH__ERROR_CODE__OK;
+
+ else if (status->response_code == 401) {
+ strbuf_addstr(&status->error_message, "401 Not Authorized");
+ status->ec = GH__ERROR_CODE__HTTP_401;
+
+ } else if (status->response_code == 404) {
+ strbuf_addstr(&status->error_message, "404 Not Found");
+ status->ec = GH__ERROR_CODE__HTTP_404;
+
+ } else if (status->curl_code != CURLE_OK) {
+ strbuf_addf(&status->error_message, "%s (curl)",
+ curl_easy_strerror(status->curl_code));
+ status->ec = GH__ERROR_CODE__CURL_ERROR;
+ } else {
+ strbuf_addf(&status->error_message, "HTTP %ld Unexpected",
+ status->response_code);
+ status->ec = GH__ERROR_CODE__HTTP_UNEXPECTED_CODE;
+ }
+
+ if (status->ec != GH__ERROR_CODE__OK)
+ status->bytes_received = 0;
+ else if (params->b_write_to_file)
+ status->bytes_received = (intmax_t)ftell(params->tempfile->fp);
+ else
+ status->bytes_received = (intmax_t)params->buffer->len;
+}
+
+static void gh__response_status__release(struct gh__response_status *status)
+{
+ if (!status)
+ return;
+ strbuf_release(&status->error_message);
+ strbuf_release(&status->content_type);
+}
+
+/*
+ * The cache-server sends a somewhat bogus 400 instead of
+ * the normal 401 when AUTH is required. Fixup the status
+ * to hide that.
+ */
+static void fixup_cache_server_400_to_401(struct gh__response_status *status)
+{
+ if (status->response_code != 400)
+ return;
+
+ /*
+ * TODO Technically, the cache-server could send a 400
+ * TODO for many reasons, not just for their bogus
+ * TODO pseudo-401, but we're going to assume it is a
+ * TODO 401 for now. We should confirm the expected
+ * TODO error message in the response-body.
+ */
+ status->response_code = 401;
+}
+
+static int gh__curl_progress_cb(void *clientp,
+ curl_off_t dltotal, curl_off_t dlnow,
+ curl_off_t ultotal, curl_off_t ulnow)
+{
+ struct gh__request_params *params = clientp;
+
+ /*
+ * From what I can tell, CURL progress arrives in 3 phases.
+ *
+ * [1] An initial connection setup phase where we get [0,0] [0,0].
+ * [2] An upload phase where we start sending the request headers
+ * and body. ulnow will be > 0. ultotal may or may not be 0.
+ * [3] A download phase where we start receiving the response
+ * headers and payload body. dlnow will be > 0. dltotal may
+ * or may not be 0.
+ *
+ * If we pass zero for the total to the "struct progress" API, we
+ * get simple numbers rather than percentages. So our progress
+ * output format may vary depending on what CURL reports.
+ *
+ * It is unclear if CURL will give us a final callback after
+ * everything is finished, so we leave the progress handle open
+ * and let the caller issue the final stop_progress().
+ *
+ * There is a bit of a mismatch between the CURL API and the
+ * "struct progress" API. The latter requires us to set the
+ * progress message when we call one of the start_progress
+ * methods. We cannot change the progress message while we are
+ * showing progress state. And we cannot change the denominator
+ * (total) after we start. CURL may or may not give us the total
+ * sizes for each phase.
+ *
+ * Also be advised that the "struct progress" API eats messages
+ * so that the screen is only updated every second or so. And it
+ * may not print anything if the start..stop happens in less than
+ * 2 seconds. Whereas CURL calls this callback very frequently.
+ * The net-net is that we may not actually see this progress
+ * message for small/fast HTTP requests.
+ */
+
+ switch (params->progress_state) {
+ case GH__PROGRESS_STATE__START: /* first callback */
+ if (dlnow == 0 && ulnow == 0)
+ goto enter_phase_1;
+
+ if (ulnow)
+ goto enter_phase_2;
+ else
+ goto enter_phase_3;
+
+ case GH__PROGRESS_STATE__PHASE1:
+ if (dlnow == 0 && ulnow == 0)
+ return 0;
+
+ if (ulnow)
+ goto enter_phase_2;
+ else
+ goto enter_phase_3;
+
+ case GH__PROGRESS_STATE__PHASE2:
+ display_progress(params->progress, ulnow);
+ if (dlnow == 0)
+ return 0;
+
+ stop_progress(&params->progress);
+ goto enter_phase_3;
+
+ case GH__PROGRESS_STATE__PHASE3:
+ display_progress(params->progress, dlnow);
+ return 0;
+
+ default:
+ return 0;
+ }
+
+enter_phase_1:
+ /*
+ * Don't bother to create a progress handle during phase [1].
+ * Because we get [0,0,0,0], we don't have any data to report
+ * and would just have to synthesize some type of progress.
+ * From my testing, phase [1] is fairly quick (probably just
+ * the SSL handshake), so the "struct progress" API will most
+ * likely completely eat any messages that we did produce.
+ */
+ params->progress_state = GH__PROGRESS_STATE__PHASE1;
+ return 0;
+
+enter_phase_2:
+ strbuf_setlen(&params->progress_msg, 0);
+ if (params->progress_base_phase2_msg.len) {
+ strbuf_addf(&params->progress_msg, "%s (bytes sent)",
+ params->progress_base_phase2_msg.buf);
+ params->progress = start_progress(params->progress_msg.buf, ultotal);
+ display_progress(params->progress, ulnow);
+ }
+ params->progress_state = GH__PROGRESS_STATE__PHASE2;
+ return 0;
+
+enter_phase_3:
+ strbuf_setlen(&params->progress_msg, 0);
+ if (params->progress_base_phase3_msg.len) {
+ strbuf_addf(&params->progress_msg, "%s (bytes received)",
+ params->progress_base_phase3_msg.buf);
+ params->progress = start_progress(params->progress_msg.buf, dltotal);
+ display_progress(params->progress, dlnow);
+ }
+ params->progress_state = GH__PROGRESS_STATE__PHASE3;
+ return 0;
+}
+
+/*
+ * Run the request without using "run_one_slot()" because we
+ * don't want the post-request normalization, error handling,
+ * and auto-reauth handling in http.c.
+ */
+static void gh__run_one_slot(struct active_request_slot *slot,
+ struct gh__request_params *params,
+ struct gh__response_status *status)
+{
+ trace2_region_enter("gvfs-helper", params->label.buf, NULL);
+
+ if (!start_active_slot(slot)) {
+ status->curl_code = CURLE_FAILED_INIT; /* a bit of a lie */
+ strbuf_addstr(&status->error_message,
+ "failed to start HTTP request");
+ } else {
+ run_active_slot(slot);
+ if (params->b_write_to_file)
+ fflush(params->tempfile->fp);
+
+ gh__response_status__set_from_slot(params, status, slot);
+
+ if (status->ec == GH__ERROR_CODE__OK) {
+ int old_len = params->label.len;
+
+ strbuf_addstr(&params->label, "/nr_objects");
+ trace2_data_intmax("gvfs-helper", NULL,
+ params->label.buf,
+ params->object_count);
+ strbuf_setlen(&params->label, old_len);
+
+ strbuf_addstr(&params->label, "/nr_bytes");
+ trace2_data_intmax("gvfs-helper", NULL,
+ params->label.buf,
+ status->bytes_received);
+ strbuf_setlen(&params->label, old_len);
+ }
+ }
+
+ if (params->progress)
+ stop_progress(&params->progress);
+
+ trace2_region_leave("gvfs-helper", params->label.buf, NULL);
+}
+
+static int option_parse_cache_server_mode(const struct option *opt,
+ const char *arg, int unset)
+{
+ if (unset) /* should not happen */
+ return error(_("missing value for switch '%s'"),
+ opt->long_name);
+
+ else if (!strcmp(arg, "verify"))
+ gh__cmd_opts.cache_server_mode =
+ GH__CACHE_SERVER_MODE__VERIFY_DISABLE;
+
+ else if (!strcmp(arg, "error"))
+ gh__cmd_opts.cache_server_mode =
+ GH__CACHE_SERVER_MODE__VERIFY_ERROR;
+
+ else if (!strcmp(arg, "disable"))
+ gh__cmd_opts.cache_server_mode =
+ GH__CACHE_SERVER_MODE__DISABLE;
+
+ else if (!strcmp(arg, "trust"))
+ gh__cmd_opts.cache_server_mode =
+ GH__CACHE_SERVER_MODE__TRUST_WITHOUT_VERIFY;
+
+ else
+ return error(_("invalid value for switch '%s'"),
+ opt->long_name);
+
+ return 0;
+}
+
+/*
+ * Let command line args override "gvfs.sharedcache" config setting.
+ *
+ * It would be nice to move this to parse-options.c as an
+ * OPTION_PATHNAME handler. And maybe have flags for exists()
+ * and is_directory().
+ */
+static int option_parse_shared_cache_directory(const struct option *opt,
+ const char *arg, int unset)
+{
+ if (unset) /* should not happen */
+ return error(_("missing value for switch '%s'"),
+ opt->long_name);
+
+ if (!is_directory(arg))
+ return error(_("value for switch '%s' is not a directory: '%s'"),
+ opt->long_name, arg);
+
+ gvfs_shared_cache_pathname = arg;
+
+ return 0;
+}
+
+/*
+ * Lookup the URL for this remote (defaults to 'origin').
+ */
+static void lookup_main_url(void)
+{
+ /*
+ * Both VFS and Scalar only work with 'origin', so we expect this.
+ * The command line arg is mainly for debugging.
+ */
+ if (!gh__cmd_opts.remote_name || !*gh__cmd_opts.remote_name)
+ gh__cmd_opts.remote_name = "origin";
+
+ gh__global.remote = remote_get(gh__cmd_opts.remote_name);
+ if (!gh__global.remote->url[0] || !*gh__global.remote->url[0])
+ die("unknown remote '%s'", gh__cmd_opts.remote_name);
+
+ /*
+ * Strip out any in-line auth in the origin server URL so that
+ * we can control which creds we fetch.
+ *
+ * Azure DevOps has been known to suggest https URLs of the
+ * form "https://<account>@dev.azure.com/<account>/<path>".
+ *
+ * Break that so that we can force the use of a PAT.
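+ *
+ * For example (hypothetical account/path), a configured URL like
+ *
+ * https://myaccount@dev.azure.com/myaccount/myrepo
+ *
+ * is reduced to
+ *
+ * https://dev.azure.com/myaccount/myrepo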
+ */
+ gh__global.main_url = transport_anonymize_url(gh__global.remote->url[0]);
+
+ trace2_data_string("gvfs-helper", NULL, "remote/url", gh__global.main_url);
+}
+
+static void do__gvfs_config(struct gh__response_status *status,
+ struct strbuf *config_data);
+
+/*
+ * Find the URL of the cache-server, if we have one.
+ *
+ * This routine is called by the initialization code and is allowed
+ * to call die() rather than returning an 'ec'.
+ */
+static void select_cache_server(void)
+{
+ struct gh__response_status status = GH__RESPONSE_STATUS_INIT;
+ struct strbuf config_data = STRBUF_INIT;
+ const char *match = NULL;
+
+ /*
+ * This only indicates that the sub-command actually called
+ * this routine. We rely on gh__global.cache_server_url to tell
+ * us if we actually have a cache-server configured.
+ */
+ gh__global.cache_server_is_initialized = 1;
+ gh__global.cache_server_url = NULL;
+
+ if (gh__cmd_opts.cache_server_mode == GH__CACHE_SERVER_MODE__DISABLE) {
+ trace2_data_string("gvfs-helper", NULL, "cache/url", "disabled");
+ return;
+ }
+
+ /*
+ * If the cache-server and main Git server have the same URL, we
+ * can silently disable the cache-server (by NOT setting the field
+ * in gh__global and explicitly disabling the fallback logic).
+ */
+ if (!strcmp(gvfs_cache_server_url, gh__global.main_url)) {
+ gh__cmd_opts.try_fallback = 0;
+ trace2_data_string("gvfs-helper", NULL, "cache/url", "same");
+ return;
+ }
+
+ if (gh__cmd_opts.cache_server_mode ==
+ GH__CACHE_SERVER_MODE__TRUST_WITHOUT_VERIFY) {
+ gh__global.cache_server_url = gvfs_cache_server_url;
+ trace2_data_string("gvfs-helper", NULL, "cache/url",
+ gvfs_cache_server_url);
+ return;
+ }
+
+ /*
+ * GVFS cache-servers use the main Git server's creds rather
+ * than having their own creds. This feels like a security
+ * hole. For example, if the cache-server URL is pointed to a
+ * bad site, we'll happily send them our creds to the main Git
+ * server with each request to the cache-server. This would
+ * allow an attacker to later use our creds to impersonate us
+ * on the main Git server.
+ *
+ * So we optionally verify that the URL to the cache-server is
+ * well-known by the main Git server.
+ */
+
+ do__gvfs_config(&status, &config_data);
+
+ if (status.ec == GH__ERROR_CODE__OK) {
+ /*
+ * The gvfs/config response is in JSON, but I don't think
+ * we need to parse it and all that. Let's just do a simple
+ * strstr() and assume it is sufficient.
+ *
+ * We do add some context to the pattern to guard against
+ * some attacks.
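+ *
+ * That is, a (hypothetical) gvfs/config response containing
+ *
+ * {"CacheServers":[{"Url":"https://cache.example.com/vsts"}], ...}
+ *
+ * only matches when we search for the quoted "Url":"..." pair,
+ * not when the bare URL merely appears somewhere in the body.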
+ */
+ struct strbuf pattern = STRBUF_INIT;
+
+ strbuf_addf(&pattern, "\"Url\":\"%s\"", gvfs_cache_server_url);
+ match = strstr(config_data.buf, pattern.buf);
+
+ strbuf_release(&pattern);
+ }
+
+ strbuf_release(&config_data);
+
+ if (match) {
+ gh__global.cache_server_url = gvfs_cache_server_url;
+ trace2_data_string("gvfs-helper", NULL, "cache/url",
+ gvfs_cache_server_url);
+ }
+
+ else if (gh__cmd_opts.cache_server_mode ==
+ GH__CACHE_SERVER_MODE__VERIFY_ERROR) {
+ if (status.ec != GH__ERROR_CODE__OK)
+ die("could not verify cache-server '%s': %s",
+ gvfs_cache_server_url,
+ status.error_message.buf);
+ else
+ die("could not verify cache-server '%s'",
+ gvfs_cache_server_url);
+ }
+
+ else if (gh__cmd_opts.cache_server_mode ==
+ GH__CACHE_SERVER_MODE__VERIFY_DISABLE) {
+ if (status.ec != GH__ERROR_CODE__OK)
+ warning("could not verify cache-server '%s': %s",
+ gvfs_cache_server_url,
+ status.error_message.buf);
+ else
+ warning("could not verify cache-server '%s'",
+ gvfs_cache_server_url);
+ trace2_data_string("gvfs-helper", NULL, "cache/url",
+ "disabled");
+ }
+
+ gh__response_status__release(&status);
+}
+
+/*
+ * Read stdin until EOF (or a blank line) and add the desired OIDs
+ * to the oidset.
+ *
+ * Stdin should contain a list of OIDs. It may have additional
+ * decoration that we need to strip out.
+ *
+ * We expect:
+ * <hex_oid> [<path>] // present OIDs
+ */
+static unsigned long read_stdin_from_rev_list(struct oidset *oids)
+{
+ struct object_id oid;
+ struct strbuf buf_stdin = STRBUF_INIT;
+ unsigned long count = 0;
+
+ do {
+ if (strbuf_getline(&buf_stdin, stdin) == EOF || !buf_stdin.len)
+ break;
+
+ if (get_oid_hex(buf_stdin.buf, &oid))
+ continue; /* just silently eat it */
+
+ if (!oidset_insert(oids, &oid))
+ count++;
+ } while (1);
+
+ return count;
+}
+
+/*
+ * Build a complete JSON payload for a gvfs/objects POST request
+ * containing the first n OIDs in an OIDSET indexed by the iterator.
+ *
+ * https://github.com/microsoft/VFSForGit/blob/master/Protocol.md
+ */
+static unsigned long build_json_payload__gvfs_objects(
+ struct json_writer *jw_req,
+ struct oidset_iter *iter,
+ unsigned long nr_in_block)
+{
+ unsigned long k;
+ const struct object_id *oid;
+
+ k = 0;
+
+ jw_init(jw_req);
+ jw_object_begin(jw_req, 0);
+ jw_object_intmax(jw_req, "commitDepth", gh__cmd_opts.depth);
+ jw_object_inline_begin_array(jw_req, "objectIds");
+ while (k < nr_in_block && (oid = oidset_iter_next(iter))) {
+ jw_array_string(jw_req, oid_to_hex(oid));
+ k++;
+ }
+ jw_end(jw_req);
+ jw_end(jw_req);
+
+ return k;
+}
+
+/*
+ * Lookup the creds for the main/origin Git server.
+ */
+static void lookup_main_creds(void)
+{
+ if (gh__global.main_creds.username && *gh__global.main_creds.username)
+ return;
+
+ credential_from_url(&gh__global.main_creds, gh__global.main_url);
+ credential_fill(&gh__global.main_creds);
+ gh__global.main_creds_need_approval = 1;
+}
+
+/*
+ * If we have a set of creds for the main Git server, tell the credential
+ * manager to throw them away and ask it to reacquire them.
+ */
+static void refresh_main_creds(void)
+{
+ if (gh__global.main_creds.username && *gh__global.main_creds.username)
+ credential_reject(&gh__global.main_creds);
+
+ lookup_main_creds();
+
+ // TODO should we compare before and after values of u/p and
+ // TODO shortcut reauth if we already know it will fail?
+ // TODO if so, return a bool if same/different.
+}
+
+static void approve_main_creds(void)
+{
+ if (!gh__global.main_creds_need_approval)
+ return;
+
+ credential_approve(&gh__global.main_creds);
+ gh__global.main_creds_need_approval = 0;
+}
+
+/*
+ * Build a set of creds for the cache-server based upon the main Git
+ * server (assuming we have a cache-server configured).
+ *
+ * That is, we NEVER fill them directly for the cache-server -- we
+ * only synthesize them from the filled main creds.
+ */
+static void synthesize_cache_server_creds(void)
+{
+ if (!gh__global.cache_server_is_initialized)
+ BUG("sub-command did not initialize cache-server vars");
+
+ if (!gh__global.cache_server_url)
+ return;
+
+ if (gh__global.cache_creds.username && *gh__global.cache_creds.username)
+ return;
+
+ /*
+ * Get the main Git server creds so we can borrow the username
+ * and password when we talk to the cache-server.
+ */
+ lookup_main_creds();
+ gh__global.cache_creds.username = xstrdup(gh__global.main_creds.username);
+ gh__global.cache_creds.password = xstrdup(gh__global.main_creds.password);
+}
+
+/*
+ * Flush and refresh the cache-server creds. Because the cache-server
+ * does not do 401s (or manage creds), we have to reload the main Git
+ * server creds first.
+ *
+ * That is, we NEVER reject them directly because we never filled them.
+ */
+static void refresh_cache_server_creds(void)
+{
+ credential_clear(&gh__global.cache_creds);
+
+ refresh_main_creds();
+ synthesize_cache_server_creds();
+}
+
+/*
+ * We NEVER approve cache-server creds directly because we never directly
+ * filled them. However, we should be able to infer that the main ones
+ * are valid and can approve them if necessary.
+ */
+static void approve_cache_server_creds(void)
+{
+ approve_main_creds();
+}
+
+/*
+ * Select the ODB directory where we will write objects that we
+ * download. If a shared-cache pathname was given on the command line
+ * or defined in the config, use it; otherwise use the local ODB (in
+ * ".git/objects").
+ */
+static void select_odb(void)
+{
+ const char *odb_path = NULL;
+
+ strbuf_init(&gh__global.buf_odb_path, 0);
+
+ if (gvfs_shared_cache_pathname && *gvfs_shared_cache_pathname)
+ odb_path = gvfs_shared_cache_pathname;
+ else {
+ prepare_alt_odb(the_repository);
+ odb_path = the_repository->objects->odb->path;
+ }
+
+ strbuf_addstr(&gh__global.buf_odb_path, odb_path);
+}
+
+/*
+ * Create a tempfile to stream the packfile into.
+ *
+ * We create a tempfile in the chosen ODB directory and let CURL
+ * automatically stream data to the file. If successful, we can
+ * later rename it to a proper .pack and run "git index-pack" on
+ * it to create the corresponding .idx file.
+ *
+ * TODO I would rather just stream the packfile directly into
+ * TODO "git index-pack --stdin" (and save some I/O) because it
+ * TODO will automatically take care of the rename of both files
+ * TODO and any other cleanup. BUT INDEX-PACK WILL ONLY WRITE
+ * TODO TO THE PRIMARY ODB -- it will not write into the alternates
+ * TODO (this is considered bad form). So we would need to add
+ * TODO an option to index-pack to handle this. I don't want to
+ * TODO deal with this issue right now.
+ *
+ * TODO Consider using lockfile for this rather than naked tempfile.
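+ *
+ * The pathname produced below looks something like this
+ * (illustrative date/usec/sequence values):
+ *
+ * <odb>/pack/vfs-20191021-135610-000123-0000.temp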
+ */
+static struct tempfile *create_tempfile_for_packfile(void)
+{
+ static unsigned int nth = 0;
+ static struct timeval tv = {0};
+ static struct tm tm = {0};
+ static time_t secs = 0;
+ static char tbuf[32] = {0};
+
+ struct tempfile *tempfile = NULL;
+ struct strbuf buf_path = STRBUF_INIT;
+
+ if (!nth) {
+ gettimeofday(&tv, NULL);
+ secs = tv.tv_sec;
+ gmtime_r(&secs, &tm);
+
+ xsnprintf(tbuf, sizeof(tbuf), "%4d%02d%02d-%02d%02d%02d-%06ld",
+ tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec,
+ (long)tv.tv_usec);
+ }
+
+ // TODO should this be in the "<odb>/pack/tempPacks/"
+ // TODO directory instead? YES
+
+ strbuf_addbuf(&buf_path, &gh__global.buf_odb_path);
+ strbuf_complete(&buf_path, '/');
+ strbuf_addf(&buf_path, "pack/vfs-%s-%04d.temp", tbuf, nth++);
+
+ tempfile = create_tempfile(buf_path.buf);
+ fdopen_tempfile(tempfile, "w");
+
+ strbuf_release(&buf_path);
+
+ return tempfile;
+}
+
+/*
+ * Create a tempfile to stream a loose object into.
+ *
+ * We create a tempfile in the chosen ODB directory and let CURL
+ * automatically stream data to the file.
+ *
+ * We put it directly in the "<odb>/xx/" directory.
+ */
+static void create_tempfile_for_loose(
+ struct gh__request_params *params,
+ struct gh__response_status *status,
+ const struct object_id *oid)
+{
+ struct strbuf buf_path = STRBUF_INIT;
+ const char *hex;
+
+ gh__response_status__zero(status);
+
+ hex = oid_to_hex(oid);
+
+ strbuf_addbuf(&buf_path, &gh__global.buf_odb_path);
+ strbuf_complete(&buf_path, '/');
+ strbuf_add(&buf_path, hex, 2);
+
+ if (!file_exists(buf_path.buf) && mkdir(buf_path.buf, 0777) == -1) {
+ strbuf_addf(&status->error_message,
+ "cannot create directory for loose object '%s'",
+ buf_path.buf);
+ status->ec = GH__ERROR_CODE__COULD_NOT_CREATE_TEMPFILE;
+ goto cleanup;
+ }
+
+ strbuf_addch(&buf_path, '/');
+ strbuf_addstr(&buf_path, hex+2);
+
+ /* Remember the full path of the final destination. */
+ strbuf_setlen(&params->loose_path, 0);
+ strbuf_addbuf(&params->loose_path, &buf_path);
+
+ /*
+ * Build a unique tempfile pathname based upon it. We avoid
+ * using lockfiles to avoid issues with stale locks after
+ * crashes.
+ */
+ strbuf_addf(&buf_path, ".%08u.temp", getpid());
+
+ params->tempfile = create_tempfile(buf_path.buf);
+ if (!params->tempfile) {
+ strbuf_addstr(&status->error_message,
+ "could not create tempfile for loose object");
+ status->ec = GH__ERROR_CODE__COULD_NOT_CREATE_TEMPFILE;
+ goto cleanup;
+ }
+
+ fdopen_tempfile(params->tempfile, "w");
+
+cleanup:
+ strbuf_release(&buf_path);
+}
+
+/*
+ * Extract the filename portion of the given pathname.
+ *
+ * TODO Wish I could find a strbuf_filename() function for this.
+ */
+static void extract_filename(struct strbuf *filename,
+ const struct strbuf *pathname)
+{
+ size_t len = pathname->len;
+
+ strbuf_setlen(filename, 0);
+
+ while (len > 0 && !is_dir_sep(pathname->buf[len - 1]))
+ len--;
+
+ strbuf_addstr(filename, &pathname->buf[len]);
+}
+
+/*
+ * Convert the tempfile into a permanent .pack packfile in the ODB.
+ * Create the corresponding .idx file.
+ *
+ * Return the filename (not pathname) of the resulting packfile.
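+ *
+ * The rename/install sequence is roughly (illustrative names):
+ *
+ * vfs-<date>-0000.temp // owned by the tempfile class
+ * vfs-<date>-0000.pack.temp // provisional packfile
+ * vfs-<date>-0000.idx.temp // output of index-pack
+ * vfs-<date>-0000.pack, .idx // final installed names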
+ */
+static void install_packfile(struct gh__response_status *status,
+ struct tempfile **pp_tempfile,
+ struct strbuf *packfile_filename)
+{
+ struct child_process ip = CHILD_PROCESS_INIT;
+ struct strbuf pack_name_tmp = STRBUF_INIT;
+ struct strbuf pack_name_dst = STRBUF_INIT;
+ struct strbuf idx_name_tmp = STRBUF_INIT;
+ struct strbuf idx_name_dst = STRBUF_INIT;
+ size_t len_base;
+
+ gh__response_status__zero(status);
+
+ strbuf_setlen(packfile_filename, 0);
+
+ /*
+ * start with "<base>.temp" (that is owned by tempfile class).
+ * rename to "<base>.pack.temp" to break ownership.
+ *
+ * create "<base>.idx.temp" on provisional packfile.
+ *
+ * officially install both "<base>.{pack,idx}.temp" as
+ * "<base>.{pack,idx}".
+ */
+
+ strbuf_addstr(&pack_name_tmp, get_tempfile_path(*pp_tempfile));
+ if (!strip_suffix(pack_name_tmp.buf, ".temp", &len_base)) {
+ /*
+ * This is more of a BUG(), but I want the error
+ * code propagated.
+ */
+ strbuf_addf(&status->error_message,
+ "packfile tempfile does not end in '.temp': '%s'",
+ pack_name_tmp.buf);
+ status->ec = GH__ERROR_CODE__COULD_NOT_INSTALL_PACKFILE;
+ goto cleanup;
+ }
+
+ strbuf_setlen(&pack_name_tmp, (int)len_base);
+ strbuf_addbuf(&pack_name_dst, &pack_name_tmp);
+ strbuf_addbuf(&idx_name_tmp, &pack_name_tmp);
+ strbuf_addbuf(&idx_name_dst, &pack_name_tmp);
+
+ strbuf_addstr(&pack_name_tmp, ".pack.temp");
+ strbuf_addstr(&pack_name_dst, ".pack");
+ strbuf_addstr(&idx_name_tmp, ".idx.temp");
+ strbuf_addstr(&idx_name_dst, ".idx");
+
+ // TODO if either pack_name_dst or idx_name_dst already
+ // TODO exists in the ODB, create alternate names so that
+ // TODO we don't step on them.
+
+ if (rename_tempfile(pp_tempfile, pack_name_tmp.buf) == -1) {
+ strbuf_addf(&status->error_message,
+ "could not rename packfile to '%s'",
+ pack_name_tmp.buf);
+ status->ec = GH__ERROR_CODE__COULD_NOT_INSTALL_PACKFILE;
+ goto cleanup;
+ }
+
+ argv_array_push(&ip.args, "index-pack");
+ if (gh__cmd_opts.show_progress)
+ argv_array_push(&ip.args, "-v");
+ argv_array_pushl(&ip.args, "-o", idx_name_tmp.buf, NULL);
+ argv_array_push(&ip.args, pack_name_tmp.buf);
+ ip.git_cmd = 1;
+ ip.no_stdin = 1;
+ ip.no_stdout = 1;
+
+ // TODO consider capturing stdout from index-pack because
+ // TODO it will contain the SHA of the packfile and we can
+ // TODO (should?) add it to the .pack and .idx pathnames
+ // TODO when we install them.
+ // TODO
+ // TODO See pipe_command() rather than run_command().
+ // TODO
+ // TODO Or should we SHA it ourselves (or read the last 20 bytes)?
+
+ /*
+ * Note that I DO NOT have a region around the index-pack process.
+ * The region in gh__run_one_slot() currently only covers the
+ * download time. This index-pack is a separate step not covered
+ * in the above region. Later, if/when we have CURL directly stream
+ * to index-pack, that region will be the combined download+index
+ * time. So, I'm not going to introduce it here.
+ */
+ if (run_command(&ip)) {
+ unlink(pack_name_tmp.buf);
+ unlink(idx_name_tmp.buf);
+ strbuf_addf(&status->error_message,
+ "index-pack failed on '%s'", pack_name_tmp.buf);
+ status->ec = GH__ERROR_CODE__COULD_NOT_INSTALL_PACKFILE;
+ goto cleanup;
+ }
+
+ if (finalize_object_file(pack_name_tmp.buf, pack_name_dst.buf) ||
+ finalize_object_file(idx_name_tmp.buf, idx_name_dst.buf)) {
+ unlink(pack_name_tmp.buf);
+ unlink(pack_name_dst.buf);
+ unlink(idx_name_tmp.buf);
+ unlink(idx_name_dst.buf);
+ strbuf_addf(&status->error_message,
+ "could not install packfile '%s'",
+ pack_name_dst.buf);
+ status->ec = GH__ERROR_CODE__COULD_NOT_INSTALL_PACKFILE;
+ goto cleanup;
+ }
+
+ extract_filename(packfile_filename, &pack_name_dst);
+
+cleanup:
+ child_process_clear(&ip);
+ strbuf_release(&pack_name_tmp);
+ strbuf_release(&pack_name_dst);
+ strbuf_release(&idx_name_tmp);
+ strbuf_release(&idx_name_dst);
+}
+
+/*
+ * Convert the tempfile into a permanent loose object in the ODB.
+ */
+static void install_loose(struct gh__request_params *params,
+ struct gh__response_status *status)
+{
+ struct strbuf tmp_path = STRBUF_INIT;
+
+ gh__response_status__zero(status);
+
+ /*
+ * close tempfile to steal ownership away from tempfile class.
+ */
+ strbuf_addstr(&tmp_path, get_tempfile_path(params->tempfile));
+ close_tempfile_gently(params->tempfile);
+
+ /*
+ * Try to install the tempfile as the actual loose object.
+ *
+ * If the loose object already exists, finalize_object_file()
+ * will NOT overwrite/replace it. It will silently eat the
+ * EEXIST error and unlink the tempfile as if it was
+ * successful. We just let it lie to us.
+ *
+ * Since our job is to back-fill missing objects needed by a
+ * foreground git process -- git should have called
+ * oid_object_info_extended() and loose_object_info() BEFORE
+ * asking us to download the missing object. So if we get a
+ * collision we have to assume something else is happening in
+ * parallel and we lost the race. And that's OK.
+ */
+ if (finalize_object_file(tmp_path.buf, params->loose_path.buf)) {
+ unlink(tmp_path.buf);
+ strbuf_addf(&status->error_message,
+ "could not install loose object '%s'",
+ params->loose_path.buf);
+ status->ec = GH__ERROR_CODE__COULD_NOT_INSTALL_LOOSE;
+ }
+
+ strbuf_release(&tmp_path);
+}
+
+/*
+ * Our wrapper to initialize the HTTP layer.
+ *
+ * We always use the real origin server, not the cache-server, when
+ * initializing the http/curl layer.
+ */
+static void gh_http_init(void)
+{
+ if (gh__global.http_is_initialized)
+ return;
+
+ http_init(gh__global.remote, gh__global.main_url, 0);
+ gh__global.http_is_initialized = 1;
+}
+
+static void gh_http_cleanup(void)
+{
+ if (!gh__global.http_is_initialized)
+ return;
+
+ http_cleanup();
+ gh__global.http_is_initialized = 0;
+}
+
+/*
+ * Do a single HTTP request without auth-retry or fallback.
+ */
+static void do_req(const char *url_base,
+ const char *url_component,
+ const struct credential *creds,
+ struct gh__request_params *params,
+ struct gh__response_status *status)
+{
+ struct active_request_slot *slot;
+ struct slot_results results;
+ struct strbuf rest_url = STRBUF_INIT;
+
+ gh__response_status__zero(status);
+
+ if (params->b_write_to_file) {
+ // TODO ftruncate tempfile ??
+ } else {
+ strbuf_setlen(params->buffer, 0);
+ }
+
+ end_url_with_slash(&rest_url, url_base);
+ strbuf_addstr(&rest_url, url_component);
+
+ slot = get_active_slot();
+ slot->results = &results;
+
+ curl_easy_setopt(slot->curl, CURLOPT_NOBODY, 0); /* not a HEAD request */
+ curl_easy_setopt(slot->curl, CURLOPT_URL, rest_url.buf);
+ curl_easy_setopt(slot->curl, CURLOPT_HTTPHEADER, params->headers);
+
+ if (params->b_is_post) {
+ curl_easy_setopt(slot->curl, CURLOPT_POST, 1);
+ curl_easy_setopt(slot->curl, CURLOPT_ENCODING, NULL);
+ curl_easy_setopt(slot->curl, CURLOPT_POSTFIELDS,
+ params->post_payload->buf);
+ curl_easy_setopt(slot->curl, CURLOPT_POSTFIELDSIZE,
+ params->post_payload->len);
+ } else {
+ curl_easy_setopt(slot->curl, CURLOPT_POST, 0);
+ }
+
+ if (params->b_write_to_file) {
+ curl_easy_setopt(slot->curl, CURLOPT_WRITEFUNCTION, fwrite);
+ curl_easy_setopt(slot->curl, CURLOPT_WRITEDATA,
+ (void*)params->tempfile->fp);
+ } else {
+ curl_easy_setopt(slot->curl, CURLOPT_WRITEFUNCTION,
+ fwrite_buffer);
+ curl_easy_setopt(slot->curl, CURLOPT_FILE, params->buffer);
+ }
+
+ if (creds && creds->username) {
+ /*
+ * Force CURL to respect the username/password we provide by
+ * turning off the AUTH-ANY negotiation stuff.
+ *
+ * That is, CURLAUTH_ANY causes CURL to NOT send the creds
+ * on an initial request in order to force a 401 and let it
+ * negotiate the best auth scheme and then retry.
+ *
+ * This is problematic when talking to the cache-servers
+ * because they send a 400 (with a "A valid Basic Auth..."
+ * message body) rather than a 401. This means that the
+ * automatic retry will never happen. And even if we
+ * do force a retry, CURL still won't send the creds.
+ *
+ * So we turn it off and force it to use our creds.
+ */
+ curl_easy_setopt(slot->curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
+ curl_easy_setopt(slot->curl, CURLOPT_USERNAME, creds->username);
+ curl_easy_setopt(slot->curl, CURLOPT_PASSWORD, creds->password);
+ } else {
+ /*
+ * Turn on the AUTH-ANY negotiation. This only works
+ * with the main Git server (because the cache-server
+ * doesn't handle 401s).
+ *
+ * TODO Think about if we really need to handle this case
+ * TODO and if so, add an .is_main to params.
+ */
+ curl_easy_setopt(slot->curl, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
+ }
+
+ if (params->progress_base_phase2_msg.len ||
+ params->progress_base_phase3_msg.len) {
+ curl_easy_setopt(slot->curl, CURLOPT_XFERINFOFUNCTION,
+ gh__curl_progress_cb);
+ curl_easy_setopt(slot->curl, CURLOPT_XFERINFODATA, params);
+ curl_easy_setopt(slot->curl, CURLOPT_NOPROGRESS, 0);
+ } else {
+ curl_easy_setopt(slot->curl, CURLOPT_NOPROGRESS, 1);
+ }
+
+ gh__run_one_slot(slot, params, status);
+}
+
+static void do_req__to_main(const char *url_component,
+ struct gh__request_params *params,
+ struct gh__response_status *status)
+{
+// lookup_main_creds();
+
+ do_req(gh__global.main_url, url_component, &gh__global.main_creds,
+ params, status);
+
+ if (status->response_code == 401) {
+ refresh_main_creds();
+
+ do_req(gh__global.main_url, url_component, &gh__global.main_creds,
+ params, status);
+ }
+
+ if (status->response_code == 200)
+ approve_main_creds();
+}
+
+static void do_req__to_cache_server(const char *url_component,
+ struct gh__request_params *params,
+ struct gh__response_status *status)
+{
+ synthesize_cache_server_creds();
+
+ do_req(gh__global.cache_server_url, url_component, &gh__global.cache_creds,
+ params, status);
+ fixup_cache_server_400_to_401(status);
+
+ if (status->response_code == 401) {
+ refresh_cache_server_creds();
+
+ do_req(gh__global.cache_server_url, url_component,
+ &gh__global.cache_creds, params, status);
+ fixup_cache_server_400_to_401(status);
+ }
+
+ if (status->response_code == 200)
+ approve_cache_server_creds();
+}
+
+static void do_req__with_fallback(const char *url_component,
+ struct gh__request_params *params,
+ struct gh__response_status *status)
+{
+ if (gh__global.cache_server_url && !params->b_no_cache_server) {
+ do_req__to_cache_server(url_component, params, status);
+
+ if (status->response_code == 200)
+ return;
+
+ if (!gh__cmd_opts.try_fallback)
+ return;
+
+ /*
+ * The cache-server shares creds with the main Git server,
+ * so if our creds failed against the cache-server, they
+ * will also fail against the main Git server. We just let
+ * this fail.
+ *
+ * Falling-back would likely just cause the 3rd (or maybe
+ * 4th) cred prompt.
+ */
+ if (status->response_code == 401)
+ return;
+ }
+
+ do_req__to_main(url_component, params, status);
+}
+
+/*
+ * Call "gvfs/config" REST API.
+ *
+ * Return server's response buffer. This is probably a raw JSON string.
+ */
+static void do__gvfs_config(struct gh__response_status *status,
+ struct strbuf *config_data)
+{
+ struct gh__request_params params = GH__REQUEST_PARAMS_INIT;
+
+ strbuf_addstr(&params.label, "GET/config");
+
+ params.b_is_post = 0;
+ params.b_write_to_file = 0;
+ params.b_no_cache_server = 1; /* they don't handle gvfs/config API */
+ params.buffer = config_data;
+
+ params.object_count = 1; /* a bit of a lie */
+
+ /*
+ * "X-TFS-FedAuthRedirect: Suppress" disables the 302 + 203 redirect
+ * sequence to a login page and forces the main Git server to send a
+ * normal 401.
+ */
+ params.headers = http_copy_default_headers();
+ params.headers = curl_slist_append(params.headers,
+ "X-TFS-FedAuthRedirect: Suppress");
+ params.headers = curl_slist_append(params.headers,
+ "Pragma: no-cache");
+
+ if (gh__cmd_opts.show_progress) {
+ /*
+ * gvfs/config has a very small request payload, so I don't
+ * see any need to report progress on the upload side of
+ * the GET. So just report progress on the download side.
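+ *
+ * The resulting phase-3 progress line would read something
+ * like this (illustrative byte count):
+ *
+ * Receiving gvfs/config (bytes received): 8192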
+ */
+ strbuf_addstr(&params.progress_base_phase3_msg,
+ "Receiving gvfs/config");
+ }
+
+ do_req__with_fallback("gvfs/config", &params, status);
+
+ gh__request_params__release(&params);
+}
+
+/*
+ * Call "gvfs/objects/<oid>" REST API to fetch a loose object
+ * and write it to the ODB.
+ */
+static void do__loose__gvfs_object(struct gh__response_status *status,
+ const struct object_id *oid)
+{
+ struct gh__request_params params = GH__REQUEST_PARAMS_INIT;
+ struct strbuf component_url = STRBUF_INIT;
+
+ gh__response_status__zero(status);
+
+ strbuf_addf(&component_url, "gvfs/objects/%s", oid_to_hex(oid));
+
+ strbuf_addstr(&params.label, "GET/objects");
+
+ params.b_is_post = 0;
+ params.b_write_to_file = 1;
+ params.b_no_cache_server = 0;
+
+ params.object_count = 1;
+
+ params.headers = http_copy_default_headers();
+ params.headers = curl_slist_append(params.headers,
+ "X-TFS-FedAuthRedirect: Suppress");
+ params.headers = curl_slist_append(params.headers,
+ "Pragma: no-cache");
+
+ create_tempfile_for_loose(&params, status, oid);
+ if (!params.tempfile)
+ goto cleanup;
+
+ if (gh__cmd_opts.show_progress) {
+ /*
+ * Likewise, a gvfs/objects/{oid} has a very small request
+ * payload, so I don't see any need to report progress on
+ * the upload side of the GET. So just report progress
+ * on the download side.
+ */
+ strbuf_addstr(&params.progress_base_phase3_msg,
+ "Receiving 1 loose object");
+ }
+
+ do_req__with_fallback(component_url.buf, &params, status);
+
+ if (status->ec == GH__ERROR_CODE__OK)
+ install_loose(&params, status);
+
+cleanup:
+ gh__request_params__release(&params);
+ strbuf_release(&component_url);
+}
+
+/*
+ * Call "gvfs/objects" POST REST API to fetch a packfile containing
+ * the objects in the requested OIDSET. Returns the filename (not
+ * pathname) to the new packfile.
+ */
+static void do__packfile__gvfs_objects(struct gh__response_status *status,
+ struct oidset_iter *iter,
+ unsigned long nr_wanted_in_block,
+ struct strbuf *output_filename,
+ unsigned long *nr_taken)
+{
+ struct json_writer jw_req = JSON_WRITER_INIT;
+ struct gh__request_params params = GH__REQUEST_PARAMS_INIT;
+
+ gh__response_status__zero(status);
+
+ params.object_count = build_json_payload__gvfs_objects(
+ &jw_req, iter, nr_wanted_in_block);
+ *nr_taken = params.object_count;
+
+ strbuf_addstr(&params.label, "POST/objects");
+
+ params.b_is_post = 1;
+ params.b_write_to_file = 1;
+ params.b_no_cache_server = 0;
+
+ params.post_payload = &jw_req.json;
+
+ params.headers = http_copy_default_headers();
+ params.headers = curl_slist_append(params.headers,
+ "X-TFS-FedAuthRedirect: Suppress");
+ params.headers = curl_slist_append(params.headers,
+ "Pragma: no-cache");
+ params.headers = curl_slist_append(params.headers,
+ "Content-Type: application/json");
+ /*
+ * We really always want a packfile. But if the payload only
+ * requests 1 OID, the server will/may send us a single loose
+ * object instead. (Apparently the server ignores us when we
+ * only send application/x-git-packfile and does it anyway.)
+ *
+ * So to make it clear to my future self, go ahead and add
+ * an accept header for loose objects and own it.
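+ *
+ * For reference, the POST body built above by
+ * build_json_payload__gvfs_objects() looks like this
+ * (illustrative OIDs):
+ *
+ * {"commitDepth":1,"objectIds":["<oid-1>","<oid-2>"]}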
+ */
+ params.headers = curl_slist_append(params.headers,
+ "Accept: application/x-git-packfile");
+ params.headers = curl_slist_append(params.headers,
+ "Accept: application/x-git-loose-object");
+
+ params.tempfile = create_tempfile_for_packfile();
+ if (!params.tempfile) {
+ strbuf_addstr(&status->error_message,
+ "could not create tempfile for packfile");
+ status->ec = GH__ERROR_CODE__COULD_NOT_CREATE_TEMPFILE;
+ goto cleanup;
+ }
+
+ if (gh__cmd_opts.show_progress) {
+ strbuf_addf(&params.progress_base_phase2_msg,
+ "Requesting packfile with %ld objects",
+ params.object_count);
+ strbuf_addf(&params.progress_base_phase3_msg,
+ "Receiving packfile with %ld objects",
+ params.object_count);
+ }
+
+ do_req__with_fallback("gvfs/objects", &params, status);
+
+ if (status->ec == GH__ERROR_CODE__OK) {
+ if (!strcmp(status->content_type.buf,
+ "application/x-git-packfile")) {
+
+ // TODO Consider having a worker thread to manage
+ // TODO running index-pack and then install the
+ // TODO resulting .idx and .pack files. This would
+ // TODO let us interleave those steps with our thread
+ // TODO fetching the next block of objects from the
+ // TODO server. (Need to think about how progress
+ // TODO messages from our thread and index-pack
+ // TODO would mesh.)
+ // TODO
+ // TODO But then again, if we hack index-pack to write
+ // TODO to our alternate and stream the data thru it,
+ // TODO it won't matter.
+
+ install_packfile(status, &params.tempfile,
+ output_filename);
+ goto cleanup;
+ }
+
+ if (!strcmp(status->content_type.buf,
+ "application/x-git-loose-object"))
+ {
+ /*
+ * This should not happen (when we request
+ * more than one object). The server can send
+ * us a loose object (even when we use the
+ * POST form) if there is only one object in
+ * the payload (and despite the set of accept
+ * headers we send), so I'm going to leave
+ * this here.
+ */
+ strbuf_addstr(&status->error_message,
+ "received loose object when packfile expected");
+ status->ec = GH__ERROR_CODE__UNEXPECTED_CONTENT_TYPE;
+ goto cleanup;
+ }
+
+ strbuf_addf(&status->error_message,
+ "received unknown content-type '%s'",
+ status->content_type.buf);
+ status->ec = GH__ERROR_CODE__UNEXPECTED_CONTENT_TYPE;
+ goto cleanup;
+ }
+
+cleanup:
+ gh__request_params__release(&params);
+ jw_release(&jw_req);
+}
+
+/*
+ * Bulk or individually fetch a list of objects in one or more http requests.
+ * Create one or more packfiles and/or loose objects.
+ *
+ * We accumulate results for each request in `result_list` until we get a
+ * hard error and have to stop.
+ */
+static void do_fetch_oidset(struct gh__response_status *status,
+ struct oidset *oids,
+ unsigned long nr_total,
+ struct string_list *result_list)
+{
+ struct oidset_iter iter;
+ struct strbuf output_filename = STRBUF_INIT;
+ struct strbuf msg = STRBUF_INIT;
+ struct strbuf err404 = STRBUF_INIT;
+ const struct object_id *oid;
+ unsigned long k;
+ unsigned long nr_taken;
+ int had_404 = 0;
+
+ gh__response_status__zero(status);
+ if (!nr_total)
+ return;
+
+ oidset_iter_init(oids, &iter);
+
+ for (k = 0; k < nr_total; k += nr_taken) {
+ if (nr_total - k == 1 || gh__cmd_opts.block_size == 1) {
+ oid = oidset_iter_next(&iter);
+ nr_taken = 1;
+
+ do__loose__gvfs_object(status, oid);
+
+ /*
+ * If we get a 404 for an individual object, ignore
+ * it and get the rest. We'll fixup the 'ec' later.
+ */ + if (status->ec == GH__ERROR_CODE__HTTP_404) { + if (!err404.len) + strbuf_addf(&err404, "%s: loose object %s", + status->error_message.buf, + oid_to_hex(oid)); + /* + * Mark the fetch as "incomplete", but don't + * stop trying to get other chunks. + */ + had_404 = 1; + continue; + } + + if (status->ec != GH__ERROR_CODE__OK) { + /* Stop at the first hard error. */ + strbuf_addf(&status->error_message, ": loose %s", + oid_to_hex(oid)); + goto cleanup; + } + + strbuf_setlen(&msg, 0); + strbuf_addf(&msg, "loose %s", oid_to_hex(oid)); + string_list_append(result_list, msg.buf); + + } else { + strbuf_setlen(&output_filename, 0); + + do__packfile__gvfs_objects(status, &iter, + gh__cmd_opts.block_size, + &output_filename, + &nr_taken); + + /* + * Because the oidset iterator has random + * order, it does no good to say the k-th or + * n-th chunk was incomplete; the client + * cannot use that index for anything. + * + * We get a 404 when at least one object in + * the chunk was not found. + * + * TODO Consider various retry strategies (such as + * TODO loose or bisect) on the members within this + * TODO chunk to reduce the impact of the miss. + * + * For now, ignore the 404 and go on to the + * next chunk and then fixup the 'ec' later. + */ + if (status->ec == GH__ERROR_CODE__HTTP_404) { + if (!err404.len) + strbuf_addf(&err404, + "%s: packfile object", + status->error_message.buf); + /* + * Mark the fetch as "incomplete", but don't + * stop trying to get other chunks. + */ + had_404 = 1; + continue; + } + + if (status->ec != GH__ERROR_CODE__OK) { + /* Stop at the first hard error. */ + strbuf_addstr(&status->error_message, + ": in packfile"); + goto cleanup; + } + + strbuf_setlen(&msg, 0); + strbuf_addf(&msg, "packfile %s", output_filename.buf); + string_list_append(result_list, msg.buf); + } + } + +cleanup: + strbuf_release(&msg); + strbuf_release(&err404); + strbuf_release(&output_filename); + + if (had_404 && status->ec == GH__ERROR_CODE__OK) { + strbuf_setlen(&status->error_message, 0); + strbuf_addstr(&status->error_message, "404 Not Found"); + status->ec = GH__ERROR_CODE__HTTP_404; + } +} + +/* + * Finish with initialization. This happens after the main option + * parsing, dispatch to sub-command, and sub-command option parsing + * and before actually doing anything. + * + * Optionally configure the cache-server if the sub-command will + * use it. + */ +static void finish_init(int setup_cache_server) +{ + select_odb(); + + lookup_main_url(); + gh_http_init(); + + if (setup_cache_server) + select_cache_server(); +} + +/* + * Request gvfs/config from main Git server. (Config data is not + * available from a GVFS cache-server.) + * + * Print the received server configuration (as the raw JSON string). + */ +static enum gh__error_code do_sub_cmd__config(int argc, const char **argv) +{ + struct gh__response_status status = GH__RESPONSE_STATUS_INIT; + struct strbuf config_data = STRBUF_INIT; + enum gh__error_code ec = GH__ERROR_CODE__OK; + + trace2_cmd_mode("config"); + + finish_init(0); + + do__gvfs_config(&status, &config_data); + ec = status.ec; + + if (ec == GH__ERROR_CODE__OK) + printf("%s\n", config_data.buf); + else + error("config: %s", status.error_message.buf); + + gh__response_status__release(&status); + strbuf_release(&config_data); + + return ec; +} + +/* + * Read a list of objects from stdin and fetch them in a single request (or + * multiple block-size requests). 
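+ *
+ * For example (illustrative; see the fuller dogfooding example
+ * at the top of this file):
+ *
+ * $ git rev-list --objects --no-walk --missing=print HEAD |
+ * grep "^?" | sed 's/^?//' | git gvfs-helper get --block-size=100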
+ */ +static enum gh__error_code do_sub_cmd__get(int argc, const char **argv) +{ + static struct option get_options[] = { + OPT_MAGNITUDE('b', "block-size", &gh__cmd_opts.block_size, + N_("number of objects to request at a time")), + OPT_INTEGER('d', "depth", &gh__cmd_opts.depth, + N_("Commit depth")), + OPT_END(), + }; + + struct gh__response_status status = GH__RESPONSE_STATUS_INIT; + struct oidset oids = OIDSET_INIT; + struct string_list result_list = STRING_LIST_INIT_DUP; + enum gh__error_code ec = GH__ERROR_CODE__OK; + unsigned long nr_total; + int k; + + trace2_cmd_mode("get"); + + if (argc > 1 && !strcmp(argv[1], "-h")) + usage_with_options(get_usage, get_options); + + argc = parse_options(argc, argv, NULL, get_options, get_usage, 0); + if (gh__cmd_opts.depth < 1) + gh__cmd_opts.depth = 1; + + finish_init(1); + + nr_total = read_stdin_from_rev_list(&oids); + + trace2_region_enter("gvfs-helper", "get", NULL); + trace2_data_intmax("gvfs-helper", NULL, "get/nr_objects", nr_total); + do_fetch_oidset(&status, &oids, nr_total, &result_list); + trace2_region_leave("gvfs-helper", "get", NULL); + + ec = status.ec; + + for (k = 0; k < result_list.nr; k++) + printf("%s\n", result_list.items[k].string); + + if (ec != GH__ERROR_CODE__OK) + error("get: %s", status.error_message.buf); + + gh__response_status__release(&status); + oidset_clear(&oids); + string_list_clear(&result_list, 0); + + return ec; +} + +/* + * Handle the 'get' command when in "server mode". Only call error() + * for hard errors where we cannot communicate correctly with the foreground + * client process. Pass any actual data errors (such as 404's or 401's from + * the fetch back to the client process. + */ +static enum gh__error_code do_server_subprocess_get(void) +{ + struct gh__response_status status = GH__RESPONSE_STATUS_INIT; + struct oidset oids = OIDSET_INIT; + struct object_id oid; + struct string_list result_list = STRING_LIST_INIT_DUP; + enum gh__error_code ec = GH__ERROR_CODE__OK; + char *line; + int len; + int err; + int k; + unsigned long nr_total = 0; + + /* + * Inside the "get" command, we expect a list of OIDs + * and a flush. + */ + while (1) { + len = packet_read_line_gently(0, NULL, &line); + if (len < 0 || !line) + break; + + if (get_oid_hex(line, &oid)) { + error("server: invalid oid syntax '%s'", line); + ec = GH__ERROR_CODE__SUBPROCESS_SYNTAX; + goto cleanup; + } + + if (!oidset_insert(&oids, &oid)) + nr_total++; + } + + if (!nr_total) { + if (packet_write_fmt_gently(1, "ok\n")) { + error("server: cannot write 'get' result to client"); + ec = GH__ERROR_CODE__SUBPROCESS_SYNTAX; + } else + ec = GH__ERROR_CODE__OK; + goto cleanup; + } + + trace2_region_enter("gvfs-helper", "server/get", NULL); + trace2_data_intmax("gvfs-helper", NULL, "server/get/nr_objects", nr_total); + do_fetch_oidset(&status, &oids, nr_total, &result_list); + trace2_region_leave("gvfs-helper", "server/get", NULL); + + /* + * Write pathname of the ODB where we wrote all of the objects + * we fetched. 
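 *
 * The complete response to the client is therefore (values
 * illustrative):
 *
 *     odb <directory>
 *     loose <oid> | packfile <name>     (one line per fetched item)
 *     ok | partial | error <message>
 *     <flush>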
+ */ + if (packet_write_fmt_gently(1, "odb %s\n", + gh__global.buf_odb_path.buf)) { + error("server: cannot write 'odb' to client"); + ec = GH__ERROR_CODE__SUBPROCESS_SYNTAX; + goto cleanup; + } + + for (k = 0; k < result_list.nr; k++) + if (packet_write_fmt_gently(1, "%s\n", + result_list.items[k].string)) + { + error("server: cannot write result to client: '%s'", + result_list.items[k].string); + ec = GH__ERROR_CODE__SUBPROCESS_SYNTAX; + goto cleanup; + } + + err = 0; + if (ec == GH__ERROR_CODE__OK) + err = packet_write_fmt_gently(1, "ok\n"); + else if (ec == GH__ERROR_CODE__HTTP_404) + err = packet_write_fmt_gently(1, "partial\n"); + else + err = packet_write_fmt_gently(1, "error %s\n", + status.error_message.buf); + if (err) { + error("server: cannot write result to client"); + ec = GH__ERROR_CODE__SUBPROCESS_SYNTAX; + goto cleanup; + } + + if (packet_flush_gently(1)) { + error("server: cannot flush result to client"); + ec = GH__ERROR_CODE__SUBPROCESS_SYNTAX; + goto cleanup; + } + +cleanup: + oidset_clear(&oids); + string_list_clear(&result_list, 0); + + return ec; +} + +typedef enum gh__error_code (fn_subprocess_cmd)(void); + +struct subprocess_capability { + const char *name; + int client_has; + fn_subprocess_cmd *pfn; +}; + +static struct subprocess_capability caps[] = { + { "get", 0, do_server_subprocess_get }, + { NULL, 0, NULL }, +}; + +/* + * Handle the subprocess protocol handshake as described in: + * [] Documentation/technical/protocol-common.txt + * [] Documentation/technical/long-running-process-protocol.txt + */ +static int do_protocol_handshake(void) +{ +#define OUR_SUBPROCESS_VERSION "1" + + char *line; + int len; + int k; + int b_support_our_version = 0; + + len = packet_read_line_gently(0, NULL, &line); + if (len < 0 || !line || strcmp(line, "gvfs-helper-client")) { + error("server: subprocess welcome handshake failed: %s", line); + return -1; + } + + while (1) { + const char *v; + len = packet_read_line_gently(0, NULL, &line); + if (len < 0 || !line) + break; + if (!skip_prefix(line, "version=", &v)) { + error("server: subprocess version handshake failed: %s", + line); + return -1; + } + b_support_our_version |= (!strcmp(v, OUR_SUBPROCESS_VERSION)); + } + if (!b_support_our_version) { + error("server: client does not support our version: %s", + OUR_SUBPROCESS_VERSION); + return -1; + } + + if (packet_write_fmt_gently(1, "gvfs-helper-server\n") || + packet_write_fmt_gently(1, "version=%s\n", + OUR_SUBPROCESS_VERSION) || + packet_flush_gently(1)) { + error("server: cannot write version handshake"); + return -1; + } + + while (1) { + const char *v; + int k; + + len = packet_read_line_gently(0, NULL, &line); + if (len < 0 || !line) + break; + if (!skip_prefix(line, "capability=", &v)) { + error("server: subprocess capability handshake failed: %s", + line); + return -1; + } + for (k = 0; caps[k].name; k++) + if (!strcmp(v, caps[k].name)) + caps[k].client_has = 1; + } + + for (k = 0; caps[k].name; k++) + if (caps[k].client_has) + if (packet_write_fmt_gently(1, "capability=%s\n", + caps[k].name)) { + error("server: cannot write capabilities handshake: %s", + caps[k].name); + return -1; + } + if (packet_flush_gently(1)) { + error("server: cannot write capabilities handshake"); + return -1; + } + + return 0; +} + +/* + * Interactively listen to stdin for a series of commands and execute them. 
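 *
 * Each command must name a capability negotiated during the
 * handshake (currently only "get"); an extra flush packet from the
 * client is treated as a QUIT.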
+ */ +static enum gh__error_code do_sub_cmd__server(int argc, const char **argv) +{ + static struct option server_options[] = { + OPT_MAGNITUDE('b', "block-size", &gh__cmd_opts.block_size, + N_("number of objects to request at a time")), + OPT_INTEGER('d', "depth", &gh__cmd_opts.depth, + N_("Commit depth")), + OPT_END(), + }; + + enum gh__error_code ec = GH__ERROR_CODE__OK; + char *line; + int len; + int k; + + trace2_cmd_mode("server"); + + if (argc > 1 && !strcmp(argv[1], "-h")) + usage_with_options(server_usage, server_options); + + argc = parse_options(argc, argv, NULL, server_options, server_usage, 0); + if (gh__cmd_opts.depth < 1) + gh__cmd_opts.depth = 1; + + finish_init(1); + + if (do_protocol_handshake()) { + ec = GH__ERROR_CODE__SUBPROCESS_SYNTAX; + goto cleanup; + } + +top_of_loop: + while (1) { + len = packet_read_line_gently(0, NULL, &line); + if (len < 0 || !line) { + /* use extra FLUSH as a QUIT */ + ec = GH__ERROR_CODE__OK; + goto cleanup; + } + + for (k = 0; caps[k].name; k++) { + if (caps[k].client_has && !strcmp(line, caps[k].name)) { + ec = (caps[k].pfn)(); + if (ec != GH__ERROR_CODE__OK) + goto cleanup; + goto top_of_loop; + } + } + + error("server: unknown command '%s'", line); + ec = GH__ERROR_CODE__SUBPROCESS_SYNTAX; + goto cleanup; + } + +cleanup: + return ec; +} + +static enum gh__error_code do_sub_cmd(int argc, const char **argv) +{ + if (!strcmp(argv[0], "get")) + return do_sub_cmd__get(argc, argv); + + if (!strcmp(argv[0], "config")) + return do_sub_cmd__config(argc, argv); + + if (!strcmp(argv[0], "server")) + return do_sub_cmd__server(argc, argv); + + // TODO have "test" mode that could be used to drive + // TODO unit testing. + + return GH__ERROR_CODE__USAGE; +} + +/* + * Communicate with the primary Git server or a GVFS cache-server using the + * GVFS Protocol. + * + * https://github.com/microsoft/VFSForGit/blob/master/Protocol.md + */ +int cmd_main(int argc, const char **argv) +{ + static struct option main_options[] = { + OPT_STRING('r', "remote", &gh__cmd_opts.remote_name, + N_("remote"), + N_("Remote name")), + OPT_BOOL('f', "fallback", &gh__cmd_opts.try_fallback, + N_("Fallback to Git server if cache-server fails")), + OPT_CALLBACK(0, "cache-server", NULL, + N_("cache-server"), + N_("cache-server=disable|trust|verify|error"), + option_parse_cache_server_mode), + OPT_CALLBACK(0, "shared-cache", NULL, + N_("pathname"), + N_("Pathname to shared objects directory"), + option_parse_shared_cache_directory), + OPT_BOOL('p', "progress", &gh__cmd_opts.show_progress, + N_("Show progress")), + OPT_END(), + }; + + enum gh__error_code ec = GH__ERROR_CODE__OK; + + if (argc > 1 && !strcmp(argv[1], "-h")) + usage_with_options(main_usage, main_options); + + trace2_cmd_name("gvfs-helper"); + + setup_git_directory_gently(NULL); + + git_config(git_default_config, NULL); + + /* Set any non-zero initial values in gh__cmd_opts. 
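	 * (parse_options() below may override these defaults.)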
*/ + gh__cmd_opts.depth = 1; + gh__cmd_opts.block_size = GH__DEFAULT_BLOCK_SIZE; + gh__cmd_opts.show_progress = !!isatty(2); + + argc = parse_options(argc, argv, NULL, main_options, main_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (argc == 0) + usage_with_options(main_usage, main_options); + + ec = do_sub_cmd(argc, argv); + + gh_http_cleanup(); + + if (ec == GH__ERROR_CODE__USAGE) + usage_with_options(main_usage, main_options); + + return ec; +} diff --git a/promisor-remote.c b/promisor-remote.c index 9bd5b79d59446d..3902745f66c11b 100644 --- a/promisor-remote.c +++ b/promisor-remote.c @@ -3,6 +3,7 @@ #include "promisor-remote.h" #include "config.h" #include "transport.h" +#include "gvfs-helper-client.h" static char *repository_format_partial_clone; static const char *core_partial_clone_filter_default; @@ -40,6 +41,7 @@ static int fetch_objects(const char *remote_name, struct ref *ref = NULL; int i; + for (i = 0; i < oid_nr; i++) { struct ref *new_ref = alloc_ref(oid_to_hex(&oids[i])); oidcpy(&new_ref->old_oid, &oids[i]); @@ -199,7 +201,7 @@ struct promisor_remote *promisor_remote_find(const char *remote_name) int has_promisor_remote(void) { - return !!promisor_remote_find(NULL); + return core_use_gvfs_helper || !!promisor_remote_find(NULL); } static int remove_fetched_oids(struct repository *repo, @@ -244,6 +246,14 @@ int promisor_remote_get_direct(struct repository *repo, int to_free = 0; int res = -1; + if (core_use_gvfs_helper) { + enum gh_client__created ghc = GHC__CREATED__NOTHING; + + trace2_data_intmax("bug", the_repository, "fetch_objects/gvfs-helper", oid_nr); + gh_client__queue_oid_array(oids, oid_nr); + return gh_client__drain_queue(&ghc); + } + promisor_remote_init(); for (r = promisors; r; r = r->next) { diff --git a/sha1-file.c b/sha1-file.c index c960dfa348b8d5..756b095cc26a90 100644 --- a/sha1-file.c +++ b/sha1-file.c @@ -35,6 +35,7 @@ #include "sigchain.h" #include "sub-process.h" #include "pkt-line.h" +#include "gvfs-helper-client.h" /* The maximum size for an object header. */ #define MAX_HEADER_LEN 32 @@ -1556,6 +1557,7 @@ int oid_object_info_extended(struct repository *r, const struct object_id *oid, const struct object_id *real = oid; int already_retried = 0; int tried_hook = 0; + int tried_gvfs_helper = 0; if (flags & OBJECT_INFO_LOOKUP_REPLACE) real = lookup_replace_object(r, oid); @@ -1598,13 +1600,41 @@ int oid_object_info_extended(struct repository *r, const struct object_id *oid, if (!loose_object_info(r, real, oi, flags)) return 0; + if (core_use_gvfs_helper && !tried_gvfs_helper) { + enum ghc__created ghc; + + if (flags & OBJECT_INFO_SKIP_FETCH_OBJECT) + return -1; + + ghc__get_immediate(real, &ghc); + tried_gvfs_helper = 1; + + /* + * Retry the lookup IIF `gvfs-helper` created one + * or more new packfiles or loose objects. + */ + if (ghc != GHC__CREATED__NOTHING) + continue; + + /* + * If `gvfs-helper` fails, we just want to return -1. + * But allow the other providers to have a shot at it. + * (At least until we have a chance to consolidate + * them.) + */ + } + /* Not a loose object; someone else may have just packed it. */ if (!(flags & OBJECT_INFO_QUICK)) { reprepare_packed_git(r); if (find_pack_entry(r, real, &e)) break; if (core_virtualize_objects && !tried_hook) { + // TODO Assert or at least trace2 if gvfs-helper + // TODO was tried and failed and then read-object-hook + // TODO is successful at getting this object. tried_hook = 1; + // TODO BUG? Should 'oid' be 'real' ? 
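				// TODO ('real' is 'oid' after resolution
				// TODO through lookup_replace_object() above,
				// TODO so the two can differ when replace
				// TODO refs are in use.)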
if (!read_object_process(oid))
				goto retry;
		}

From 73d2d384fd7c7a562579d09dd0535618e2580f94 Mon Sep 17 00:00:00 2001
From: Derrick Stolee
Date: Fri, 4 Oct 2019 15:30:34 -0400
Subject: [PATCH 26/36] gvfs-helper: fix race condition when creating loose
 object dirs

When two gvfs-helper processes are the first to create a loose object
directory, the processes (A and B in the timeline below) could have the
following race:

1. A sees that the directory does not exist.
2. B sees that the directory does not exist.
3. A creates the directory with success.
4. B tries to create the directory and fails.

Instead of having B fail here, just check for the directory's existence
before reporting an error. That solves the race and allows tests to
pass.

Signed-off-by: Derrick Stolee
---
 gvfs-helper.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gvfs-helper.c b/gvfs-helper.c
index 6e9c0dfb707183..37726bfef99c21 100644
--- a/gvfs-helper.c
+++ b/gvfs-helper.c
@@ -1049,7 +1049,9 @@ static void create_tempfile_for_loose(
 	strbuf_complete(&buf_path, '/');
 	strbuf_add(&buf_path, hex, 2);
 
-	if (!file_exists(buf_path.buf) && mkdir(buf_path.buf, 0777) == -1) {
+	if (!file_exists(buf_path.buf) &&
+	    mkdir(buf_path.buf, 0777) == -1 &&
+	    !file_exists(buf_path.buf)) {
 		strbuf_addf(&status->error_message,
 			    "cannot create directory for loose object '%s'",
 			    buf_path.buf);

From 7f68d1da5f1f5fb3e37a1030147796bd00336759 Mon Sep 17 00:00:00 2001
From: Jeff Hostetler
Date: Mon, 7 Oct 2019 13:55:42 -0400
Subject: [PATCH 27/36] sha1-file: create shared-cache directory if it doesn't
 exist

The config variable `gvfs.sharedCache` contains the pathname to an
alternate that will be used by `gvfs-helper` to store
dynamically-fetched missing objects. If this directory does not exist
on disk, `prepare_alt_odb()` omits this directory from the in-memory
list of alternates. This causes `git` commands (and `gvfs-helper` in
particular) to fall back to `.git/objects` for storage of these
objects. This disables the shared-cache and leads to poorer
performance.

Teach `alt_obj_usable()` and `prepare_alt_odb()` to match up the
directory named in `gvfs.sharedCache` with an entry in
`.git/objects/info/alternates` and force-create the `<odb>` root
directory (and the associated `<odb>/pack` directory) if necessary.

If the value of `gvfs.sharedCache` refers to a directory that is NOT
listed as an alternate, create an in-memory alternate entry in the
odb-list. (This is similar to how GIT_ALTERNATE_OBJECT_DIRECTORIES
works.)

This work happens the first time that `prepare_alt_odb()` is called.

Furthermore, teach the `--shared-cache=<dir>` command line option in
`gvfs-helper` (which runs after the first call to `prepare_alt_odb()`)
to override the inherited shared-cache (and again, create the ODB
directory if necessary).
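For example, with a hypothetical shared cache at /shared/odb, both

    $ git config gvfs.sharedCache /shared/odb
    $ git gvfs-helper --shared-cache=/shared/odb get

now force-create /shared/odb and /shared/odb/pack if they are missing,
instead of silently falling back to .git/objects.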
Signed-off-by: Jeff Hostetler --- cache.h | 2 +- config.c | 12 ++--- environment.c | 2 +- gvfs-helper-client.c | 28 ++++++----- gvfs-helper.c | 107 +++++++++++++++++++++++++++++++++---------- sha1-file.c | 75 ++++++++++++++++++++++++++++++ 6 files changed, 181 insertions(+), 45 deletions(-) diff --git a/cache.h b/cache.h index 5e055a4a5e59fa..374e57cb81d081 100644 --- a/cache.h +++ b/cache.h @@ -928,7 +928,7 @@ extern int protect_ntfs; extern const char *core_fsmonitor; extern int core_use_gvfs_helper; extern const char *gvfs_cache_server_url; -extern const char *gvfs_shared_cache_pathname; +extern struct strbuf gvfs_shared_cache_pathname; extern int core_apply_sparse_checkout; extern int core_sparse_checkout_cone; diff --git a/config.c b/config.c index 02ed8947af103f..82ec061a6eaffd 100644 --- a/config.c +++ b/config.c @@ -1508,19 +1508,17 @@ static int git_default_gvfs_config(const char *var, const char *value) } if (!strcmp(var, "gvfs.sharedcache") && value && *value) { - struct strbuf buf = STRBUF_INIT; - strbuf_addstr(&buf, value); - if (strbuf_normalize_path(&buf) < 0) { + strbuf_setlen(&gvfs_shared_cache_pathname, 0); + strbuf_addstr(&gvfs_shared_cache_pathname, value); + if (strbuf_normalize_path(&gvfs_shared_cache_pathname) < 0) { /* * Pretend it wasn't set. This will cause us to * fallback to ".git/objects" effectively. */ - strbuf_release(&buf); + strbuf_release(&gvfs_shared_cache_pathname); return 0; } - strbuf_trim_trailing_dir_sep(&buf); - - gvfs_shared_cache_pathname = strbuf_detach(&buf, NULL); + strbuf_trim_trailing_dir_sep(&gvfs_shared_cache_pathname); return 0; } diff --git a/environment.c b/environment.c index 62543cc16bc366..032841d787c303 100644 --- a/environment.c +++ b/environment.c @@ -88,7 +88,7 @@ int protect_ntfs = PROTECT_NTFS_DEFAULT; const char *core_fsmonitor; int core_use_gvfs_helper; const char *gvfs_cache_server_url; -const char *gvfs_shared_cache_pathname; +struct strbuf gvfs_shared_cache_pathname = STRBUF_INIT; /* * The character that begins a commented line in user-editable file diff --git a/gvfs-helper-client.c b/gvfs-helper-client.c index 4c67a2cb2131f6..c03747b01f09ca 100644 --- a/gvfs-helper-client.c +++ b/gvfs-helper-client.c @@ -224,6 +224,13 @@ static int ghc__get__receive_response(struct child_process *process, return err; } +/* + * Select the preferred ODB for fetching missing objects. + * This should be the alternate with the same directory + * name as set in `gvfs.sharedCache`. + * + * Fallback to .git/objects if necessary. + */ static void ghc__choose_odb(void) { struct object_directory *odb; @@ -231,22 +238,19 @@ static void ghc__choose_odb(void) if (ghs__chosen_odb) return; + ghs__chosen_odb = the_repository->objects->odb; + prepare_alt_odb(the_repository); - if (gvfs_shared_cache_pathname && *gvfs_shared_cache_pathname) { - for (odb = the_repository->objects->odb; odb; odb = odb->next) { - if (!strcmp(odb->path, gvfs_shared_cache_pathname)) { - ghs__chosen_odb = odb; - return; - } + if (!gvfs_shared_cache_pathname.len) + return; + + for (odb = the_repository->objects->odb->next; odb; odb = odb->next) { + if (!strcmp(odb->path, gvfs_shared_cache_pathname.buf)) { + ghs__chosen_odb = odb; + return; } } - - /* - * Use .git/objects if "gvfs.sharedcache" not set or set to an - * unknown pathname. 
- */
-	ghs__chosen_odb = the_repository->objects->odb;
 }
 
 static int ghc__get(enum ghc__created *p_ghc)
diff --git a/gvfs-helper.c b/gvfs-helper.c
index 37726bfef99c21..727e4f235cb4d0 100644
--- a/gvfs-helper.c
+++ b/gvfs-helper.c
@@ -81,10 +81,11 @@
 //
 //     Fetch 1 or more objects.  If a cache-server is configured,
 //     try it first.  Optionally fallback to the main Git server.
+//
 //     Create 1 or more loose objects and/or packfiles in the
-//     requested shared-cache directory (given on the command
-//     line and which is reported at the beginning of the
-//     response).
+//     shared-cache ODB. (The pathname of the selected ODB is
+//     reported at the beginning of the response; this should
+//     match the pathname given on the command line).
 //
 //     git> get
 //     git> <oid>
@@ -632,26 +633,88 @@ static int option_parse_cache_server_mode(const struct option *opt,
 }
 
 /*
- * Let command line args override "gvfs.sharedcache" config setting.
+ * Let command line args override "gvfs.sharedcache" config setting
+ * and override the value set by git_default_config().
+ *
+ * The command line is parsed *AFTER* the config is loaded, so
+ * prepare_alt_odb() has already been called and any default or
+ * inherited shared-cache has already been set.
  *
- * It would be nice to move this to parse-options.c as an
- * OPTION_PATHNAME handler.  And maybe have flags for exists()
- * and is_directory().
+ * We have a chance to override it here.
  */
 static int option_parse_shared_cache_directory(const struct option *opt,
 					       const char *arg, int unset)
 {
+	struct strbuf buf_arg = STRBUF_INIT;
+
 	if (unset) /* should not happen */
 		return error(_("missing value for switch '%s'"),
 			     opt->long_name);
 
-	if (!is_directory(arg))
-		return error(_("value for switch '%s' is not a directory: '%s'"),
-			     opt->long_name, arg);
+	strbuf_addstr(&buf_arg, arg);
+	if (strbuf_normalize_path(&buf_arg) < 0) {
+		/*
+		 * Pretend command line wasn't given.  Use whatever
+		 * settings we already have from the config.
+		 */
+		strbuf_release(&buf_arg);
+		return 0;
+	}
+	strbuf_trim_trailing_dir_sep(&buf_arg);
+
+	if (!strbuf_cmp(&buf_arg, &gvfs_shared_cache_pathname)) {
+		/*
+		 * The command line argument matches what we got from
+		 * the config, so we're already set up correctly. (And
+		 * we have already verified that the directory exists
+		 * on disk.)
+		 */
+		strbuf_release(&buf_arg);
+		return 0;
+	}
+
+	else if (!gvfs_shared_cache_pathname.len) {
+		/*
+		 * A shared-cache was requested and we did not inherit one.
+		 * Try it, but let alt_odb_usable() secretly disable it if
+		 * it cannot create the directory on disk.
+		 */
+		strbuf_addbuf(&gvfs_shared_cache_pathname, &buf_arg);
 
-	gvfs_shared_cache_pathname = arg;
+		add_to_alternates_memory(buf_arg.buf);
 
-	return 0;
+		strbuf_release(&buf_arg);
+		return 0;
+	}
+
+	else {
+		/*
+		 * The requested shared-cache is different from the one
+		 * we inherited.  Replace the inherited value with this
+		 * one, but smartly fall back if necessary.
+		 */
+		struct strbuf buf_prev = STRBUF_INIT;
+
+		strbuf_addbuf(&buf_prev, &gvfs_shared_cache_pathname);
+
+		strbuf_setlen(&gvfs_shared_cache_pathname, 0);
+		strbuf_addbuf(&gvfs_shared_cache_pathname, &buf_arg);
+
+		add_to_alternates_memory(buf_arg.buf);
+
+		/*
+		 * alt_odb_usable() releases gvfs_shared_cache_pathname
+		 * if it cannot create the directory on disk, so fall back
+		 * to the previous choice when it fails.
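+		 * (So an empty gvfs_shared_cache_pathname here means
+		 * the requested directory could not be created.)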
+ */ + if (!gvfs_shared_cache_pathname.len) + strbuf_addbuf(&gvfs_shared_cache_pathname, + &buf_prev); + + strbuf_release(&buf_arg); + strbuf_release(&buf_prev); + return 0; + } } /* @@ -949,24 +1012,20 @@ static void approve_cache_server_creds(void) } /* - * Select the ODB directory where we will write objects that we - * download. If was given on the command line or define in the - * config, use the local ODB (in ".git/objects"). + * Get the pathname to the ODB where we write objects that we download. */ static void select_odb(void) { - const char *odb_path = NULL; + prepare_alt_odb(the_repository); strbuf_init(&gh__global.buf_odb_path, 0); - if (gvfs_shared_cache_pathname && *gvfs_shared_cache_pathname) - odb_path = gvfs_shared_cache_pathname; - else { - prepare_alt_odb(the_repository); - odb_path = the_repository->objects->odb->path; - } - - strbuf_addstr(&gh__global.buf_odb_path, odb_path); + if (gvfs_shared_cache_pathname.len) + strbuf_addbuf(&gh__global.buf_odb_path, + &gvfs_shared_cache_pathname); + else + strbuf_addstr(&gh__global.buf_odb_path, + the_repository->objects->odb->path); } /* diff --git a/sha1-file.c b/sha1-file.c index 756b095cc26a90..a1477b8431d38f 100644 --- a/sha1-file.c +++ b/sha1-file.c @@ -442,6 +442,8 @@ const char *loose_object_path(struct repository *r, struct strbuf *buf, return odb_loose_path(r->objects->odb, buf, oid); } +static int gvfs_matched_shared_cache_to_alternate; + /* * Return non-zero iff the path is usable as an alternate object database. */ @@ -451,6 +453,52 @@ static int alt_odb_usable(struct raw_object_store *o, { struct object_directory *odb; + if (!strbuf_cmp(path, &gvfs_shared_cache_pathname)) { + /* + * `gvfs.sharedCache` is the preferred alternate that we + * will use with `gvfs-helper.exe` to dynamically fetch + * missing objects. It is set during git_default_config(). + * + * Make sure the directory exists on disk before we let the + * stock code discredit it. + */ + struct strbuf buf_pack_foo = STRBUF_INIT; + enum scld_error scld; + + /* + * Force create the "" and "/pack" directories, if + * not present on disk. Append an extra bogus directory to + * get safe_create_leading_directories() to see "/pack" + * as a leading directory of something deeper (which it + * won't create). + */ + strbuf_addf(&buf_pack_foo, "%s/pack/foo", path->buf); + + scld = safe_create_leading_directories(buf_pack_foo.buf); + if (scld != SCLD_OK && scld != SCLD_EXISTS) { + error_errno(_("could not create shared-cache ODB '%s'"), + gvfs_shared_cache_pathname.buf); + + strbuf_release(&buf_pack_foo); + + /* + * Pretend no shared-cache was requested and + * effectively fallback to ".git/objects" for + * fetching missing objects. + */ + strbuf_release(&gvfs_shared_cache_pathname); + return 0; + } + + /* + * We know that there is an alternate (either from + * .git/objects/info/alternates or from a memory-only + * entry) associated with the shared-cache directory. + */ + gvfs_matched_shared_cache_to_alternate++; + strbuf_release(&buf_pack_foo); + } + /* Detect cases where alternate disappeared */ if (!is_directory(path->buf)) { error(_("object directory %s does not exist; " @@ -866,6 +914,33 @@ void prepare_alt_odb(struct repository *r) link_alt_odb_entries(r, r->objects->alternate_db, PATH_SEP, NULL, 0); read_info_alternates(r, r->objects->odb->path, 0); + + if (gvfs_shared_cache_pathname.len && + !gvfs_matched_shared_cache_to_alternate) { + /* + * There is no entry in .git/objects/info/alternates for + * the requested shared-cache directory. 
Therefore, the
+	 * odb-list does not contain this directory.
+	 *
+	 * Force this directory into the odb-list as an in-memory
+	 * alternate. Implicitly create the directory on disk, if
+	 * necessary.
+	 *
+	 * See GIT_ALTERNATE_OBJECT_DIRECTORIES for another example
+	 * of this kind of usage.
+	 *
+	 * Note: This has the net-effect of allowing Git to treat
+	 * `gvfs.sharedCache` as an unofficial alternate. This
+	 * usage should be discouraged for compatibility reasons
+	 * with other tools in the overall Git ecosystem (that
+	 * won't know about this trick). It would be much better
+	 * for us to update .git/objects/info/alternates instead.
+	 * The code here is considered a backstop.
+	 */
+	link_alt_odb_entries(r, gvfs_shared_cache_pathname.buf,
+			     '\n', NULL, 0);
+	}
+
 	r->objects->loaded_alternates = 1;
 }

From c478dfbc2c1ab3773c9461513c0f5642b74b1c00 Mon Sep 17 00:00:00 2001
From: Jeff Hostetler
Date: Tue, 8 Oct 2019 12:58:55 -0400
Subject: [PATCH 28/36] gvfs-helper-client: rename ghs__ symbols to gh_server__

Signed-off-by: Jeff Hostetler
---
 gvfs-helper-client.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/gvfs-helper-client.c b/gvfs-helper-client.c
index c03747b01f09ca..061a37af4eb2cc 100644
--- a/gvfs-helper-client.c
+++ b/gvfs-helper-client.c
@@ -15,13 +15,13 @@ static struct oidset ghc__oidset_queued = OIDSET_INIT;
 static unsigned long ghc__oidset_count;
 static int ghc__includes_immediate;
 
-struct ghs__process {
+struct gh_server__process {
 	struct subprocess_entry subprocess; /* must be first */
 	unsigned int supported_capabilities;
 };
 
-static int ghs__subprocess_map_initialized;
-static struct hashmap ghs__subprocess_map;
+static int gh_server__subprocess_map_initialized;
+static struct hashmap gh_server__subprocess_map;
 static struct object_directory *ghs__chosen_odb;
 
 #define CAP_GET (1u<<1)
@@ -34,7 +34,7 @@ static int ghc__start_fn(struct subprocess_entry *subprocess)
 	{ NULL, 0 }
 	};
 
-	struct ghs__process *entry = (struct ghs__process *)subprocess;
+	struct gh_server__process *entry = (struct gh_server__process *)subprocess;
 
 	return subprocess_handshake(subprocess, "gvfs-helper", versions,
 				    NULL, capabilities,
@@ -255,7 +255,7 @@
 
 static int ghc__get(enum ghc__created *p_ghc)
 {
-	struct ghs__process *entry;
+	struct gh_server__process *entry;
 	struct child_process *process;
 	struct argv_array argv = ARGV_ARRAY_INIT;
 	struct strbuf quoted = STRBUF_INIT;
@@ -278,21 +278,21 @@ static int ghc__get(enum ghc__created *p_ghc)
 
 	sq_quote_argv_pretty(&quoted, argv.argv);
 
-	if (!ghs__subprocess_map_initialized) {
-		ghs__subprocess_map_initialized = 1;
-		hashmap_init(&ghs__subprocess_map,
+	if (!gh_server__subprocess_map_initialized) {
+		gh_server__subprocess_map_initialized = 1;
+		hashmap_init(&gh_server__subprocess_map,
 			     (hashmap_cmp_fn)cmd2process_cmp, NULL, 0);
 		entry = NULL;
 	} else
-		entry = (struct ghs__process *)subprocess_find_entry(
-			&ghs__subprocess_map, quoted.buf);
+		entry = (struct gh_server__process *)subprocess_find_entry(
+			&gh_server__subprocess_map, quoted.buf);
 
 	if (!entry) {
 		entry = xmalloc(sizeof(*entry));
 		entry->supported_capabilities = 0;
 
 		err = subprocess_start_argv(
-			&ghs__subprocess_map, &entry->subprocess, 1,
+			&gh_server__subprocess_map, &entry->subprocess, 1,
 			&argv, ghc__start_fn);
 		if (err) {
 			free(entry);
@@ -304,7 +304,7 @@
 
 	if (!(CAP_GET & entry->supported_capabilities)) {
 		error("gvfs-helper: does not support GET");
-		subprocess_stop(&ghs__subprocess_map,
+
subprocess_stop(&gh_server__subprocess_map, (struct subprocess_entry *)entry); free(entry); err = -1; @@ -321,7 +321,7 @@ static int ghc__get(enum ghc__created *p_ghc) sigchain_pop(SIGPIPE); if (err) { - subprocess_stop(&ghs__subprocess_map, + subprocess_stop(&gh_server__subprocess_map, (struct subprocess_entry *)entry); free(entry); } From 1c31a8c12ee58c549678c679f2d8488604db2745 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Tue, 8 Oct 2019 13:03:24 -0400 Subject: [PATCH 29/36] gvfs-helper-client: rename ghs__chosen_odb to gh_client__chosen_odb for clarity Signed-off-by: Jeff Hostetler --- gvfs-helper-client.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/gvfs-helper-client.c b/gvfs-helper-client.c index 061a37af4eb2cc..ef807349008704 100644 --- a/gvfs-helper-client.c +++ b/gvfs-helper-client.c @@ -22,7 +22,7 @@ struct gh_server__process { static int gh_server__subprocess_map_initialized; static struct hashmap gh_server__subprocess_map; -static struct object_directory *ghs__chosen_odb; +static struct object_directory *gh_client__chosen_odb; #define CAP_GET (1u<<1) @@ -95,9 +95,10 @@ static void ghc__verify_odb_line(const char *line) if (!skip_prefix(line, "odb ", &v1_odb_path)) BUG("verify_odb_line: invalid line '%s'", line); - if (!ghs__chosen_odb || strcmp(v1_odb_path, ghs__chosen_odb->path)) + if (!gh_client__chosen_odb || + strcmp(v1_odb_path, gh_client__chosen_odb->path)) BUG("verify_odb_line: unexpeced odb path '%s' vs '%s'", - v1_odb_path, ghs__chosen_odb->path); + v1_odb_path, gh_client__chosen_odb->path); } /* @@ -112,7 +113,7 @@ static void ghc__update_loose_cache(const char *line) if (!skip_prefix(line, "loose ", &v1_oid)) BUG("update_loose_cache: invalid line '%s'", line); - odb_loose_cache_add_new_oid(ghs__chosen_odb, &oid); + odb_loose_cache_add_new_oid(gh_client__chosen_odb, &oid); } /* @@ -131,9 +132,10 @@ static void ghc__update_packed_git(const char *line) /* * ODB[0] is the local .git/objects. All others are alternates. 
 */
-	is_local = (ghs__chosen_odb == the_repository->objects->odb);
+	is_local = (gh_client__chosen_odb == the_repository->objects->odb);
 
-	strbuf_addf(&path, "%s/pack/%s", ghs__chosen_odb->path, v1_filename);
+	strbuf_addf(&path, "%s/pack/%s",
+		    gh_client__chosen_odb->path, v1_filename);
 	strbuf_strip_suffix(&path, ".pack");
 	strbuf_addstr(&path, ".idx");
 
@@ -235,10 +237,10 @@ static void ghc__choose_odb(void)
 {
 	struct object_directory *odb;
 
-	if (ghs__chosen_odb)
+	if (gh_client__chosen_odb)
 		return;
 
-	ghs__chosen_odb = the_repository->objects->odb;
+	gh_client__chosen_odb = the_repository->objects->odb;
 
 	prepare_alt_odb(the_repository);
 
@@ -247,7 +249,7 @@
 	for (odb = the_repository->objects->odb->next; odb; odb = odb->next) {
 		if (!strcmp(odb->path, gvfs_shared_cache_pathname.buf)) {
-			ghs__chosen_odb = odb;
+			gh_client__chosen_odb = odb;
 			return;
 		}
 	}
@@ -273,7 +275,8 @@
 	argv_array_push(&argv, "gvfs-helper");
 	argv_array_push(&argv, "--fallback");
 	argv_array_push(&argv, "--cache-server=trust");
-	argv_array_pushf(&argv, "--shared-cache=%s", ghs__chosen_odb->path);
+	argv_array_pushf(&argv, "--shared-cache=%s",
+			 gh_client__chosen_odb->path);
 	argv_array_push(&argv, "server");
 
 	sq_quote_argv_pretty(&quoted, argv.argv);

From 16e40b32af2a62bfb208226d160ca2f9ab561aef Mon Sep 17 00:00:00 2001
From: Jeff Hostetler
Date: Tue, 8 Oct 2019 13:13:02 -0400
Subject: [PATCH 30/36] gvfs-helper-client: rename ghc__ symbols to gh_client__
 for clarity

Signed-off-by: Jeff Hostetler
---
 gvfs-helper-client.c | 86 ++++++++++++++++++++++----------------------
 gvfs-helper-client.h | 13 +++----
 sha1-file.c          |  4 +--
 3 files changed, 53 insertions(+), 50 deletions(-)

diff --git a/gvfs-helper-client.c b/gvfs-helper-client.c
index ef807349008704..0d5b4582f2f600 100644
--- a/gvfs-helper-client.c
+++ b/gvfs-helper-client.c
@@ -11,9 +11,9 @@
 #include "quote.h"
 #include "packfile.h"
 
-static struct oidset ghc__oidset_queued = OIDSET_INIT;
-static unsigned long ghc__oidset_count;
-static int ghc__includes_immediate;
+static struct oidset gh_client__oidset_queued = OIDSET_INIT;
+static unsigned long gh_client__oidset_count;
+static int gh_client__includes_immediate;
 
 struct gh_server__process {
 	struct subprocess_entry subprocess; /* must be first */
@@ -26,7 +26,7 @@ static struct object_directory *gh_client__chosen_odb;
 
 #define CAP_GET (1u<<1)
 
-static int ghc__start_fn(struct subprocess_entry *subprocess)
+static int gh_client__start_fn(struct subprocess_entry *subprocess)
 {
 	static int versions[] = {1, 0};
 	static struct subprocess_capability capabilities[] = {
@@ -49,7 +49,7 @@
 *
 *
 */
-static int ghc__get__send_command(struct child_process *process)
+static int gh_client__get__send_command(struct child_process *process)
 {
 	struct oidset_iter iter;
 	struct object_id *oid;
@@ -64,7 +64,7 @@
 	if (err)
 		return err;
 
-	oidset_iter_init(&ghc__oidset_queued, &iter);
+	oidset_iter_init(&gh_client__oidset_queued, &iter);
 	while ((oid = oidset_iter_next(&iter))) {
 		err = packet_write_fmt_gently(process->in, "%s\n",
 					      oid_to_hex(oid));
@@ -88,7 +88,7 @@
 * In particular, I don't see a need to try to search for the response
 * value in our list of alternates.
*/ -static void ghc__verify_odb_line(const char *line) +static void gh_client__verify_odb_line(const char *line) { const char *v1_odb_path; @@ -105,7 +105,7 @@ static void ghc__verify_odb_line(const char *line) * Update the loose object cache to include the newly created * object. */ -static void ghc__update_loose_cache(const char *line) +static void gh_client__update_loose_cache(const char *line) { const char *v1_oid; struct object_id oid; @@ -119,7 +119,7 @@ static void ghc__update_loose_cache(const char *line) /* * Update the packed-git list to include the newly created packfile. */ -static void ghc__update_packed_git(const char *line) +static void gh_client__update_packed_git(const char *line) { struct strbuf path = STRBUF_INIT; const char *v1_filename; @@ -176,11 +176,12 @@ static void ghc__update_packed_git(const char *line) * grouped with a queued request for a blob. The tree-walk *might* be * able to continue and let the 404 blob be handled later. */ -static int ghc__get__receive_response(struct child_process *process, - enum ghc__created *p_ghc, - int *p_nr_loose, int *p_nr_packfile) +static int gh_client__get__receive_response( + struct child_process *process, + enum gh_client__created *p_ghc, + int *p_nr_loose, int *p_nr_packfile) { - enum ghc__created ghc = GHC__CREATED__NOTHING; + enum gh_client__created ghc = GHC__CREATED__NOTHING; const char *v1; char *line; int len; @@ -196,17 +197,17 @@ static int ghc__get__receive_response(struct child_process *process, break; if (starts_with(line, "odb")) { - ghc__verify_odb_line(line); + gh_client__verify_odb_line(line); } else if (starts_with(line, "packfile")) { - ghc__update_packed_git(line); + gh_client__update_packed_git(line); ghc |= GHC__CREATED__PACKFILE; *p_nr_packfile += 1; } else if (starts_with(line, "loose")) { - ghc__update_loose_cache(line); + gh_client__update_loose_cache(line); ghc |= GHC__CREATED__LOOSE; *p_nr_loose += 1; } @@ -233,7 +234,7 @@ static int ghc__get__receive_response(struct child_process *process, * * Fallback to .git/objects if necessary. */ -static void ghc__choose_odb(void) +static void gh_client__choose_odb(void) { struct object_directory *odb; @@ -255,7 +256,7 @@ static void ghc__choose_odb(void) } } -static int ghc__get(enum ghc__created *p_ghc) +static int gh_client__get(enum gh_client__created *p_ghc) { struct gh_server__process *entry; struct child_process *process; @@ -267,7 +268,7 @@ static int ghc__get(enum ghc__created *p_ghc) trace2_region_enter("gh-client", "get", the_repository); - ghc__choose_odb(); + gh_client__choose_odb(); /* * TODO decide what defaults we want. 
@@ -296,7 +297,7 @@
 
 		err = subprocess_start_argv(
 			&gh_server__subprocess_map, &entry->subprocess, 1,
-			&argv, ghc__start_fn);
+			&argv, gh_client__start_fn);
 		if (err) {
 			free(entry);
 			goto leave_region;
@@ -316,9 +317,9 @@
 
 	sigchain_push(SIGPIPE, SIG_IGN);
 
-	err = ghc__get__send_command(process);
+	err = gh_client__get__send_command(process);
 	if (!err)
-		err = ghc__get__receive_response(process, p_ghc,
+		err = gh_client__get__receive_response(process, p_ghc,
 						 &nr_loose, &nr_packfile);
 
 	sigchain_pop(SIGPIPE);
@@ -334,10 +335,10 @@
 	strbuf_release(&quoted);
 
 	trace2_data_intmax("gh-client", the_repository,
-			   "get/immediate", ghc__includes_immediate);
+			   "get/immediate", gh_client__includes_immediate);
 
 	trace2_data_intmax("gh-client", the_repository,
-			   "get/nr_objects", ghc__oidset_count);
+			   "get/nr_objects", gh_client__oidset_count);
 
 	if (nr_loose)
 		trace2_data_intmax("gh-client", the_repository,
@@ -353,22 +354,22 @@
 
 	trace2_region_leave("gh-client", "get", the_repository);
 
-	oidset_clear(&ghc__oidset_queued);
-	ghc__oidset_count = 0;
-	ghc__includes_immediate = 0;
+	oidset_clear(&gh_client__oidset_queued);
+	gh_client__oidset_count = 0;
+	gh_client__includes_immediate = 0;
 
 	return err;
 }
 
-void ghc__queue_oid(const struct object_id *oid)
+void gh_client__queue_oid(const struct object_id *oid)
 {
 	// TODO consider removing this trace2. it is useful for interactive
 	// TODO debugging, but may generate way too much noise for a data
 	// TODO event.
-	trace2_printf("ghc__queue_oid: %s", oid_to_hex(oid));
+	trace2_printf("gh_client__queue_oid: %s", oid_to_hex(oid));
 
-	if (!oidset_insert(&ghc__oidset_queued, oid))
-		ghc__oidset_count++;
+	if (!oidset_insert(&gh_client__oidset_queued, oid))
+		gh_client__oidset_count++;
 }
 
 /*
@@ -376,35 +377,36 @@
 * rather than the component parts, but fetch_objects() uses
 * this model (because of the call in sha1-file.c).
 */
-void ghc__queue_oid_array(const struct object_id *oids, int oid_nr)
+void gh_client__queue_oid_array(const struct object_id *oids, int oid_nr)
 {
 	int k;
 
 	for (k = 0; k < oid_nr; k++)
-		ghc__queue_oid(&oids[k]);
+		gh_client__queue_oid(&oids[k]);
 }
 
-int ghc__drain_queue(enum ghc__created *p_ghc)
+int gh_client__drain_queue(enum gh_client__created *p_ghc)
 {
 	*p_ghc = GHC__CREATED__NOTHING;
 
-	if (!ghc__oidset_count)
+	if (!gh_client__oidset_count)
 		return 0;
 
-	return ghc__get(p_ghc);
+	return gh_client__get(p_ghc);
}
 
-int ghc__get_immediate(const struct object_id *oid, enum ghc__created *p_ghc)
+int gh_client__get_immediate(const struct object_id *oid,
+			     enum gh_client__created *p_ghc)
 {
-	ghc__includes_immediate = 1;
+	gh_client__includes_immediate = 1;
 
	// TODO consider removing this trace2. it is useful for interactive
	// TODO debugging, but may generate way too much noise for a data
	// TODO event.
- trace2_printf("ghc__get_immediate: %s", oid_to_hex(oid)); + trace2_printf("gh_client__get_immediate: %s", oid_to_hex(oid)); - if (!oidset_insert(&ghc__oidset_queued, oid)) - ghc__oidset_count++; + if (!oidset_insert(&gh_client__oidset_queued, oid)) + gh_client__oidset_count++; - return ghc__drain_queue(p_ghc); + return gh_client__drain_queue(p_ghc); } diff --git a/gvfs-helper-client.h b/gvfs-helper-client.h index 61f026d1faa233..5f5e824bb0771a 100644 --- a/gvfs-helper-client.h +++ b/gvfs-helper-client.h @@ -4,7 +4,7 @@ struct repository; struct commit; -enum ghc__created { +enum gh_client__created { /* * The _get_ operation did not create anything. If doesn't * matter if `gvfs-helper` had errors or not -- just that @@ -13,7 +13,7 @@ enum ghc__created { GHC__CREATED__NOTHING = 0, /* - * The _get_operation created one or more packfiles. + * The _get_ operation created one or more packfiles. */ GHC__CREATED__PACKFILE = 1<<1, @@ -40,7 +40,8 @@ enum ghc__created { * It is undefined whether the requested OID will be loose or * in a packfile. */ -int ghc__get_immediate(const struct object_id *oid, enum ghc__created *p_ghc); +int gh_client__get_immediate(const struct object_id *oid, + enum gh_client__created *p_ghc); /* * Queue this OID for a future fetch using `gvfs-helper service`. @@ -53,9 +54,9 @@ int ghc__get_immediate(const struct object_id *oid, enum ghc__created *p_ghc); * Callers should not rely on the queued object being on disk until * the queue has been drained. */ -void ghc__queue_oid(const struct object_id *oid); -void ghc__queue_oid_array(const struct object_id *oids, int oid_nr); +void gh_client__queue_oid(const struct object_id *oid); +void gh_client__queue_oid_array(const struct object_id *oids, int oid_nr); -int ghc__drain_queue(enum ghc__created *p_ghc); +int gh_client__drain_queue(enum gh_client__created *p_ghc); #endif /* GVFS_HELPER_CLIENT_H */ diff --git a/sha1-file.c b/sha1-file.c index a1477b8431d38f..974a69f5be6e64 100644 --- a/sha1-file.c +++ b/sha1-file.c @@ -1676,12 +1676,12 @@ int oid_object_info_extended(struct repository *r, const struct object_id *oid, return 0; if (core_use_gvfs_helper && !tried_gvfs_helper) { - enum ghc__created ghc; + enum gh_client__created ghc; if (flags & OBJECT_INFO_SKIP_FETCH_OBJECT) return -1; - ghc__get_immediate(real, &ghc); + gh_client__get_immediate(real, &ghc); tried_gvfs_helper = 1; /* From 617fd0555e0a26c2aaa1f90d200f991de35dfe62 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Tue, 8 Oct 2019 14:01:26 -0400 Subject: [PATCH 31/36] gvfs-helper: better handling of network errors Add trace2 message for CURL and HTTP errors. Fix typo reporting network error code back to gvfs-helper-client. 
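With this change, such failures also show up in the trace2 stream as
data events in the "gvfs-helper" category under the keys "error/curl"
and "error/http", with the formatted error message as the value.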
Signed-off-by: Jeff Hostetler --- gvfs-helper.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/gvfs-helper.c b/gvfs-helper.c index 727e4f235cb4d0..8561a3fcf900df 100644 --- a/gvfs-helper.c +++ b/gvfs-helper.c @@ -405,10 +405,16 @@ static void gh__response_status__set_from_slot( strbuf_addf(&status->error_message, "%s (curl)", curl_easy_strerror(status->curl_code)); status->ec = GH__ERROR_CODE__CURL_ERROR; + + trace2_data_string("gvfs-helper", NULL, + "error/curl", status->error_message.buf); } else { strbuf_addf(&status->error_message, "HTTP %ld Unexpected", status->response_code); status->ec = GH__ERROR_CODE__HTTP_UNEXPECTED_CODE; + + trace2_data_string("gvfs-helper", NULL, + "error/http", status->error_message.buf); } if (status->ec != GH__ERROR_CODE__OK) @@ -1968,7 +1974,7 @@ static enum gh__error_code do_sub_cmd__get(int argc, const char **argv) } /* - * Handle the 'get' command when in "server mode". Only call error() + * Handle the 'get' command when in "server mode". Only call error() and set ec * for hard errors where we cannot communicate correctly with the foreground * client process. Pass any actual data errors (such as 404's or 401's from * the fetch back to the client process. @@ -2040,10 +2046,15 @@ static enum gh__error_code do_server_subprocess_get(void) goto cleanup; } + /* + * We only use status.ec to tell the client whether the request + * was complete, incomplete, or had IO errors. We DO NOT return + * this value to our caller. + */ err = 0; - if (ec == GH__ERROR_CODE__OK) + if (status.ec == GH__ERROR_CODE__OK) err = packet_write_fmt_gently(1, "ok\n"); - else if (ec == GH__ERROR_CODE__HTTP_404) + else if (status.ec == GH__ERROR_CODE__HTTP_404) err = packet_write_fmt_gently(1, "partial\n"); else err = packet_write_fmt_gently(1, "error %s\n", @@ -2270,6 +2281,7 @@ int cmd_main(int argc, const char **argv) usage_with_options(main_usage, main_options); trace2_cmd_name("gvfs-helper"); + packet_trace_identity("gvfs-helper"); setup_git_directory_gently(NULL); From 891a51b40cbc6f5f0734bdae298ca22854c89431 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Tue, 8 Oct 2019 14:30:25 -0400 Subject: [PATCH 32/36] gvfs-helper-client: properly update loose cache with fetched OID Fix parsing of the "loose " response from `gvfs-helper` and use the actually parsed OID when updating the loose oid cache. Previously, an uninitialized "struct oid" was used to update the cache. This did not cause any corruption, but could cause extra fetches for objects visited multiple times. Signed-off-by: Jeff Hostetler --- gvfs-helper-client.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gvfs-helper-client.c b/gvfs-helper-client.c index 0d5b4582f2f600..1564610aae594b 100644 --- a/gvfs-helper-client.c +++ b/gvfs-helper-client.c @@ -113,6 +113,9 @@ static void gh_client__update_loose_cache(const char *line) if (!skip_prefix(line, "loose ", &v1_oid)) BUG("update_loose_cache: invalid line '%s'", line); + if (get_oid_hex(v1_oid, &oid)) + BUG("update_loose_cache: invalid line '%s'", line); + odb_loose_cache_add_new_oid(gh_client__chosen_odb, &oid); } From df4a8db1555bdd40385ebe4b11dc77f7bc57ce82 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Thu, 10 Oct 2019 10:58:07 -0400 Subject: [PATCH 33/36] gvfs-helper: V2 robust retry and throttling Add robust-retry mechanism to automatically retry a request after network errors. This includes retry after: [] transient network problems reported by CURL. 
[] http 429 throttling (with associated Retry-After)
[] http 503 server unavailable (with associated Retry-After)

Add voluntary throttling using Azure X-RateLimit-* hints to avoid
being soft-throttled (tarpitted) or hard-throttled (429) on later
requests.

Add global (outside of a single request) azure-throttle data to track
the rate limit hints from the cache-server and main Git server
independently.

Add exponential retry backoff.  This is used for transient network
problems when we don't have a Retry-After hint.

Move the call to index-pack earlier in the response/error handling
sequence so that if we receive a 200 but the packfile is
truncated/corrupted, we can use the regular retry logic to get it
again.

Refactor the way we create tempfiles for packfiles to use
<odb>/pack/tempPacks/ rather than working directly in the <odb>/pack/
directory.

Move the code to create a new tempfile to the start of a single
request attempt (initial and retry attempts), rather than at the
overall start of a request.  This gives us a fresh tempfile for each
network request attempt.  This simplifies the retry mechanism and
isolates us from the file ownership issues hidden within the tempfile
class.  It also avoids the need to truncate previous incomplete
results.  This was necessary because index-pack was pulled into the
retry loop.

Minor: Add support for logging X-VSS-E2EID to telemetry on network
errors.

Minor: rename variable:
    params.b_no_cache_server --> params.b_permit_cache_server_if_defined
This variable is used to indicate whether we should try to use the
cache-server when it is defined.  Got rid of double-negative logic.

Minor: rename variable:
    params.label --> params.tr2_label
Clarify that this variable is only used with trace2 logging.

Minor: Move the code that automatically maps cache-server 400
responses to normal 401 responses earlier in the response/error
handling sequence to simplify later retry logic.

Minor: Decorate trace2 messages with "(cs)" or "(main)" to identify
the server in log messages.  Add params->server_type to simplify this.

Signed-off-by: Jeff Hostetler
---
 gvfs-helper.c | 1429 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 1126 insertions(+), 303 deletions(-)

diff --git a/gvfs-helper.c b/gvfs-helper.c
index 8561a3fcf900df..f13eace2c3cca0 100644
--- a/gvfs-helper.c
+++ b/gvfs-helper.c
@@ -61,6 +61,11 @@
 //
 //     --depth=<depth>       // defaults to "1"
 //
+//     --max-retries=<n>     // defaults to "6"
+//
+//            Number of retries after transient network errors.
+//            Set to zero to disable such retries.
+//
 // server
 //
 //     Interactive/sub-process mode.  Listen for a series of commands
@@ -77,6 +82,11 @@
 //
 //     --depth=<depth>       // defaults to "1"
 //
+//     --max-retries=<n>     // defaults to "6"
+//
+//            Number of retries after transient network errors.
+//            Set to zero to disable such retries.
+//
 //     Interactive verb: get
 //
@@ -166,8 +176,26 @@ static const char *const server_usage[] = {
 	NULL
 };
 
+/*
+ * "commitDepth" field in gvfs protocol
+ */
+#define GH__DEFAULT_COMMIT_DEPTH 1
+
+/*
+ * Chunk/block size in number of objects we request in each packfile
+ */
 #define GH__DEFAULT_BLOCK_SIZE 4000
 
+/*
+ * Retry attempts (after the initial request) for transient errors and 429s.
+ */
+#define GH__DEFAULT_MAX_RETRIES 6
+
+/*
+ * Maximum delay in seconds for transient (network) error retries.
+ */
+#define GH__DEFAULT_MAX_TRANSIENT_BACKOFF_SEC 300
+
 /*
  * Our exit-codes.
*/ @@ -175,16 +203,18 @@ enum gh__error_code { GH__ERROR_CODE__USAGE = -1, /* will be mapped to usage() */ GH__ERROR_CODE__OK = 0, GH__ERROR_CODE__ERROR = 1, /* unspecified */ -// GH__ERROR_CODE__CACHE_SERVER_NOT_FOUND = 2, - GH__ERROR_CODE__CURL_ERROR = 3, - GH__ERROR_CODE__HTTP_401 = 4, - GH__ERROR_CODE__HTTP_404 = 5, - GH__ERROR_CODE__HTTP_UNEXPECTED_CODE = 6, - GH__ERROR_CODE__UNEXPECTED_CONTENT_TYPE = 7, + GH__ERROR_CODE__CURL_ERROR = 2, + GH__ERROR_CODE__HTTP_401 = 3, + GH__ERROR_CODE__HTTP_404 = 4, + GH__ERROR_CODE__HTTP_429 = 5, + GH__ERROR_CODE__HTTP_503 = 6, + GH__ERROR_CODE__HTTP_OTHER = 7, + GH__ERROR_CODE__UNEXPECTED_CONTENT_TYPE = 8, GH__ERROR_CODE__COULD_NOT_CREATE_TEMPFILE = 8, - GH__ERROR_CODE__COULD_NOT_INSTALL_LOOSE = 9, - GH__ERROR_CODE__COULD_NOT_INSTALL_PACKFILE = 10, - GH__ERROR_CODE__SUBPROCESS_SYNTAX = 11, + GH__ERROR_CODE__COULD_NOT_INSTALL_LOOSE = 10, + GH__ERROR_CODE__COULD_NOT_INSTALL_PACKFILE = 11, + GH__ERROR_CODE__SUBPROCESS_SYNTAX = 12, + GH__ERROR_CODE__INDEX_PACK_FAILED = 13, }; enum gh__cache_server_mode { @@ -210,6 +240,8 @@ static struct gh__cmd_opts { int depth; int block_size; + int max_retries; + int max_transient_backoff_sec; enum gh__cache_server_mode cache_server_mode; } gh__cmd_opts; @@ -234,6 +266,47 @@ static struct gh__global { } gh__global; +enum gh__server_type { + GH__SERVER_TYPE__MAIN = 0, + GH__SERVER_TYPE__CACHE = 1, + + GH__SERVER_TYPE__NR, +}; + +static const char *gh__server_type_label[GH__SERVER_TYPE__NR] = { + "(main)", + "(cs)" +}; + +struct gh__azure_throttle +{ + unsigned long tstu_limit; + unsigned long tstu_remaining; + + unsigned long reset_sec; + unsigned long retry_after_sec; +}; + +static void gh__azure_throttle__zero(struct gh__azure_throttle *azure) +{ + azure->tstu_limit = 0; + azure->tstu_remaining = 0; + azure->reset_sec = 0; + azure->retry_after_sec = 0; +} + +#define GH__AZURE_THROTTLE_INIT { \ + .tstu_limit = 0, \ + .tstu_remaining = 0, \ + .reset_sec = 0, \ + .retry_after_sec = 0, \ + } + +static struct gh__azure_throttle gh__global_throttle[GH__SERVER_TYPE__NR] = { + GH__AZURE_THROTTLE_INIT, + GH__AZURE_THROTTLE_INIT, +}; + /* * Stolen from http.c */ @@ -262,7 +335,12 @@ enum gh__progress_state { struct gh__request_params { int b_is_post; /* POST=1 or GET=0 */ int b_write_to_file; /* write to file=1 or strbuf=0 */ - int b_no_cache_server; /* force main server only */ + int b_permit_cache_server_if_defined; + + enum gh__server_type server_type; + + int k_attempt; /* robust retry attempt */ + int k_transient_delay_sec; /* delay before transient error retries */ unsigned long object_count; /* number of objects being fetched */ @@ -271,9 +349,16 @@ struct gh__request_params { struct curl_slist *headers; /* additional http headers to send */ struct tempfile *tempfile; /* for response content when file */ struct strbuf *buffer; /* for response content when strbuf */ - struct strbuf label; /* for trace2 regions */ + struct strbuf tr2_label; /* for trace2 regions */ struct strbuf loose_path; + struct object_id loose_oid; + + struct strbuf temp_path_pack; + struct strbuf temp_path_idx; + struct strbuf final_path_pack; + struct strbuf final_path_idx; + struct strbuf final_packfile_filename; /* * Note that I am putting all of the progress-related instance data @@ -295,24 +380,36 @@ struct gh__request_params { */ struct strbuf progress_msg; struct progress *progress; + + struct strbuf e2eid; }; #define GH__REQUEST_PARAMS_INIT { \ .b_is_post = 0, \ .b_write_to_file = 0, \ - .b_no_cache_server = 0, \ + 
.b_permit_cache_server_if_defined = 1, \ + .server_type = GH__SERVER_TYPE__MAIN, \ + .k_attempt = 0, \ + .k_transient_delay_sec = 0, \ .object_count = 0, \ .post_payload = NULL, \ .headers = NULL, \ .tempfile = NULL, \ .buffer = NULL, \ - .label = STRBUF_INIT, \ + .tr2_label = STRBUF_INIT, \ .loose_path = STRBUF_INIT, \ + .loose_oid = {{0}}, \ + .temp_path_pack = STRBUF_INIT, \ + .temp_path_idx = STRBUF_INIT, \ + .final_path_pack = STRBUF_INIT, \ + .final_path_idx = STRBUF_INIT, \ + .final_packfile_filename = STRBUF_INIT, \ .progress_state = GH__PROGRESS_STATE__START, \ .progress_base_phase2_msg = STRBUF_INIT, \ .progress_base_phase3_msg = STRBUF_INIT, \ .progress_msg = STRBUF_INIT, \ .progress = NULL, \ + .e2eid = STRBUF_INIT, \ } static void gh__request_params__release(struct gh__request_params *params) @@ -329,8 +426,13 @@ static void gh__request_params__release(struct gh__request_params *params) params->buffer = NULL; /* we do not own this */ - strbuf_release(¶ms->label); + strbuf_release(¶ms->tr2_label); strbuf_release(¶ms->loose_path); + strbuf_release(¶ms->temp_path_pack); + strbuf_release(¶ms->temp_path_idx); + strbuf_release(¶ms->final_path_pack); + strbuf_release(¶ms->final_path_idx); + strbuf_release(¶ms->final_packfile_filename); strbuf_release(¶ms->progress_base_phase2_msg); strbuf_release(¶ms->progress_base_phase3_msg); @@ -338,8 +440,55 @@ static void gh__request_params__release(struct gh__request_params *params) stop_progress(¶ms->progress); params->progress = NULL; + + strbuf_release(¶ms->e2eid); } +/* + * How we handle retries for various unexpected network errors. + */ +enum gh__retry_mode { + /* + * The operation was successful, so no retry is needed. + * Use this for HTTP 200, for example. + */ + GH__RETRY_MODE__SUCCESS = 0, + + /* + * Retry using the normal 401 Auth mechanism. + */ + GH__RETRY_MODE__HTTP_401, + + /* + * Fail because at least one of the requested OIDs does not exist. + */ + GH__RETRY_MODE__FAIL_404, + + /* + * A transient network error, such as dropped connection + * or network IO error. Our belief is that a retry MAY + * succeed. (See Gremlins and Cosmic Rays....) + */ + GH__RETRY_MODE__TRANSIENT, + + /* + * Request was blocked completely because of a 429. + */ + GH__RETRY_MODE__HTTP_429, + + /* + * Request failed because the server was (temporarily?) offline. + */ + GH__RETRY_MODE__HTTP_503, + + /* + * The operation had a hard failure and we have no + * expectation that a second attempt will give a different + * answer, such as a bad hostname or a mal-formed URL. + */ + GH__RETRY_MODE__HARD_FAIL, +}; + /* * Bucket to describe the results of an HTTP requests (may be * overwritten during retries so that it describes the final attempt). 
@@ -350,7 +499,9 @@ struct gh__response_status { long response_code; /* http response code */ CURLcode curl_code; enum gh__error_code ec; + enum gh__retry_mode retry; intmax_t bytes_received; + struct gh__azure_throttle *azure; }; #define GH__RESPONSE_STATUS_INIT { \ @@ -359,7 +510,9 @@ struct gh__response_status { .response_code = 0, \ .curl_code = CURLE_OK, \ .ec = GH__ERROR_CODE__OK, \ + .retry = GH__RETRY_MODE__SUCCESS, \ .bytes_received = 0, \ + .azure = NULL, \ } static void gh__response_status__zero(struct gh__response_status *s) @@ -369,7 +522,323 @@ static void gh__response_status__zero(struct gh__response_status *s) s->response_code = 0; s->curl_code = CURLE_OK; s->ec = GH__ERROR_CODE__OK; + s->retry = GH__RETRY_MODE__SUCCESS; s->bytes_received = 0; + s->azure = NULL; +} + +static void install_packfile(struct gh__request_params *params, + struct gh__response_status *status); +static void install_loose(struct gh__request_params *params, + struct gh__response_status *status); + +/* + * Log the E2EID for the current request. + * + * Since every HTTP request to the cache-server and to the main Git server + * will send back a unique E2EID (probably a GUID), we don't want to overload + * telemetry with each ID -- rather, only the ones for which there was a + * problem and that may be helpful in a post mortem. + */ +static void log_e2eid(struct gh__request_params *params, + struct gh__response_status *status) +{ + if (!params->e2eid.len) + return; + + switch (status->retry) { + default: + case GH__RETRY_MODE__SUCCESS: + case GH__RETRY_MODE__HTTP_401: + case GH__RETRY_MODE__FAIL_404: + return; + + case GH__RETRY_MODE__HARD_FAIL: + case GH__RETRY_MODE__TRANSIENT: + case GH__RETRY_MODE__HTTP_429: + case GH__RETRY_MODE__HTTP_503: + break; + } + + if (trace2_is_enabled()) { + struct strbuf key = STRBUF_INIT; + + strbuf_addstr(&key, "e2eid"); + strbuf_addstr(&key, gh__server_type_label[params->server_type]); + + trace2_data_string("gvfs-helper", NULL, key.buf, + params->e2eid.buf); + + strbuf_release(&key); + } +} + +/* + * Normalize a few error codes before we try to decide + * how to dispatch on them. + */ +static void gh__response_status__normalize_odd_codes( + struct gh__request_params *params, + struct gh__response_status *status) +{ + if (params->server_type == GH__SERVER_TYPE__CACHE && + status->response_code == 400) { + /* + * The cache-server sends a somewhat bogus 400 instead of + * the normal 401 when AUTH is required. Fixup the status + * to hide that. + * + * TODO Technically, the cache-server could send a 400 + * TODO for many reasons, not just for their bogus + * TODO pseudo-401, but we're going to assume it is a + * TODO 401 for now. We should confirm the expected + * TODO error message in the response-body. + */ + status->response_code = 401; + } + + if (status->response_code == 203) { + /* + * A proxy server transformed a 200 from the origin server + * into a 203. We don't care about the subtle distinction. + */ + status->response_code = 200; + } +} + +/* + * Map HTTP response codes into a retry strategy. 
+ * See https://en.wikipedia.org/wiki/List_of_HTTP_status_codes
+ *
+ * https://docs.microsoft.com/en-us/azure/devops/integrate/concepts/rate-limits?view=azure-devops
+ */
+static void compute_retry_mode_from_http_response(
+	struct gh__response_status *status)
+{
+	switch (status->response_code) {
+
+	case 200:
+		status->retry = GH__RETRY_MODE__SUCCESS;
+		status->ec = GH__ERROR_CODE__OK;
+		return;
+
+	case 301: /* all the various flavors of HTTP Redirect */
+	case 302:
+	case 303:
+	case 304:
+	case 305:
+	case 306:
+	case 307:
+	case 308:
+		/*
+		 * TODO Consider a redirected-retry (with or without
+		 * TODO a Retry-After header).
+		 */
+		goto hard_fail;
+
+	case 401:
+		strbuf_addstr(&status->error_message,
+			      "(http:401) Not Authorized");
+		status->retry = GH__RETRY_MODE__HTTP_401;
+		status->ec = GH__ERROR_CODE__HTTP_401;
+		return;
+
+	case 404:
+		/*
+		 * TODO if params->object_count > 1, consider
+		 * TODO splitting the request into 2 halves
+		 * TODO and retrying each half in series.
+		 */
+		strbuf_addstr(&status->error_message,
+			      "(http:404) Not Found");
+		status->retry = GH__RETRY_MODE__FAIL_404;
+		status->ec = GH__ERROR_CODE__HTTP_404;
+		return;
+
+	case 429:
+		/*
+		 * This is a hard block because we've been bad.
+		 */
+		strbuf_addstr(&status->error_message,
+			      "(http:429) Too Many Requests [throttled]");
+		status->retry = GH__RETRY_MODE__HTTP_429;
+		status->ec = GH__ERROR_CODE__HTTP_429;
+
+		trace2_data_string("gvfs-helper", NULL, "error/http",
+				   status->error_message.buf);
+		return;
+
+	case 503:
+		/*
+		 * We assume that this comes with a "Retry-After" header like 429s.
+		 */
+		strbuf_addstr(&status->error_message,
+			      "(http:503) Server Unavailable [throttled]");
+		status->retry = GH__RETRY_MODE__HTTP_503;
+		status->ec = GH__ERROR_CODE__HTTP_503;
+
+		trace2_data_string("gvfs-helper", NULL, "error/http",
+				   status->error_message.buf);
+		return;
+
+	default:
+		goto hard_fail;
+	}
+
+hard_fail:
+	strbuf_addf(&status->error_message, "(http:%d) Other [hard_fail]",
+		    (int)status->response_code);
+	status->retry = GH__RETRY_MODE__HARD_FAIL;
+	status->ec = GH__ERROR_CODE__HTTP_OTHER;
+
+	trace2_data_string("gvfs-helper", NULL, "error/http",
+			   status->error_message.buf);
+	return;
+}
+
+/*
+ * Map CURLE error codes to a retry strategy.
+ * See <curl/curl.h> and
+ * https://curl.haxx.se/libcurl/c/libcurl-errors.html
+ *
+ * This could be a static table rather than a switch, but
+ * that is harder to debug and we may want to selectively
+ * log errors.
+ *
+ * I've commented out all of the hard-fail cases for now
+ * and let the default handle them. This is to indicate
+ * that I considered them and found them to be not actionable.
+ * Also, the spelling of some of the CURLE_ symbols seems
+ * to change between curl releases on different platforms,
+ * so I'm not going to fight that.
+ */
+static void compute_retry_mode_from_curl_error(
+	struct gh__response_status *status)
+{
+	switch (status->curl_code) {
+	case CURLE_OK:
+		status->retry = GH__RETRY_MODE__SUCCESS;
+		status->ec = GH__ERROR_CODE__OK;
+		return;
+
+	//se CURLE_UNSUPPORTED_PROTOCOL: goto hard_fail;
+	//se CURLE_FAILED_INIT: goto hard_fail;
+	//se CURLE_URL_MALFORMAT: goto hard_fail;
+	//se CURLE_NOT_BUILT_IN: goto hard_fail;
+	//se CURLE_COULDNT_RESOLVE_PROXY: goto hard_fail;
+	//se CURLE_COULDNT_RESOLVE_HOST: goto hard_fail;
+	case CURLE_COULDNT_CONNECT: goto transient;
+	//se CURLE_WEIRD_SERVER_REPLY: goto hard_fail;
+	//se CURLE_REMOTE_ACCESS_DENIED: goto hard_fail;
+	//se CURLE_FTP_ACCEPT_FAILED: goto hard_fail;
+	//se CURLE_FTP_WEIRD_PASS_REPLY: goto hard_fail;
+	//se CURLE_FTP_ACCEPT_TIMEOUT: goto hard_fail;
+	//se CURLE_FTP_WEIRD_PASV_REPLY: goto hard_fail;
+	//se CURLE_FTP_WEIRD_227_FORMAT: goto hard_fail;
+	//se CURLE_FTP_CANT_GET_HOST: goto hard_fail;
+	case CURLE_HTTP2: goto transient;
+	//se CURLE_FTP_COULDNT_SET_TYPE: goto hard_fail;
+	case CURLE_PARTIAL_FILE: goto transient;
+	//se CURLE_FTP_COULDNT_RETR_FILE: goto hard_fail;
+	//se CURLE_OBSOLETE20: goto hard_fail;
+	//se CURLE_QUOTE_ERROR: goto hard_fail;
+	//se CURLE_HTTP_RETURNED_ERROR: goto hard_fail;
+	case CURLE_WRITE_ERROR: goto transient;
+	//se CURLE_OBSOLETE24: goto hard_fail;
+	case CURLE_UPLOAD_FAILED: goto transient;
+	//se CURLE_READ_ERROR: goto hard_fail;
+	//se CURLE_OUT_OF_MEMORY: goto hard_fail;
+	case CURLE_OPERATION_TIMEDOUT: goto transient;
+	//se CURLE_OBSOLETE29: goto hard_fail;
+	//se CURLE_FTP_PORT_FAILED: goto hard_fail;
+	//se CURLE_FTP_COULDNT_USE_REST: goto hard_fail;
+	//se CURLE_OBSOLETE32: goto hard_fail;
+	//se CURLE_RANGE_ERROR: goto hard_fail;
+	case CURLE_HTTP_POST_ERROR: goto transient;
+	//se CURLE_SSL_CONNECT_ERROR: goto hard_fail;
+	//se CURLE_BAD_DOWNLOAD_RESUME: goto hard_fail;
+	//se CURLE_FILE_COULDNT_READ_FILE: goto hard_fail;
+	//se CURLE_LDAP_CANNOT_BIND: goto hard_fail;
+	//se CURLE_LDAP_SEARCH_FAILED: goto hard_fail;
+	//se CURLE_OBSOLETE40: goto hard_fail;
+	//se CURLE_FUNCTION_NOT_FOUND: goto hard_fail;
+	//se CURLE_ABORTED_BY_CALLBACK: goto hard_fail;
+	//se CURLE_BAD_FUNCTION_ARGUMENT: goto hard_fail;
+	//se CURLE_OBSOLETE44: goto hard_fail;
+	//se CURLE_INTERFACE_FAILED: goto hard_fail;
+	//se CURLE_OBSOLETE46: goto hard_fail;
+	//se CURLE_TOO_MANY_REDIRECTS: goto hard_fail;
+	//se CURLE_UNKNOWN_OPTION: goto hard_fail;
+	//se CURLE_TELNET_OPTION_SYNTAX: goto hard_fail;
+	//se CURLE_OBSOLETE50: goto hard_fail;
+	//se CURLE_PEER_FAILED_VERIFICATION: goto hard_fail;
+	//se CURLE_GOT_NOTHING: goto hard_fail;
+	//se CURLE_SSL_ENGINE_NOTFOUND: goto hard_fail;
+	//se CURLE_SSL_ENGINE_SETFAILED: goto hard_fail;
+	case CURLE_SEND_ERROR: goto transient;
+	case CURLE_RECV_ERROR: goto transient;
+	//se CURLE_OBSOLETE57: goto hard_fail;
+	//se CURLE_SSL_CERTPROBLEM: goto hard_fail;
+	//se CURLE_SSL_CIPHER: goto hard_fail;
+	//se CURLE_SSL_CACERT: goto hard_fail;
+	//se CURLE_BAD_CONTENT_ENCODING: goto hard_fail;
+	//se CURLE_LDAP_INVALID_URL: goto hard_fail;
+	//se CURLE_FILESIZE_EXCEEDED: goto hard_fail;
+	//se CURLE_USE_SSL_FAILED: goto hard_fail;
+	//se CURLE_SEND_FAIL_REWIND: goto hard_fail;
+	//se CURLE_SSL_ENGINE_INITFAILED: goto hard_fail;
+	//se CURLE_LOGIN_DENIED: goto hard_fail;
+	//se CURLE_TFTP_NOTFOUND: goto hard_fail;
+	//se CURLE_TFTP_PERM: goto hard_fail;
+	//se CURLE_REMOTE_DISK_FULL: goto hard_fail;
+	//se CURLE_TFTP_ILLEGAL: goto hard_fail;
+	//se CURLE_TFTP_UNKNOWNID: goto hard_fail;
+	//se CURLE_REMOTE_FILE_EXISTS: goto hard_fail;
+	//se CURLE_TFTP_NOSUCHUSER: goto hard_fail;
+	//se CURLE_CONV_FAILED: goto hard_fail;
+	//se CURLE_CONV_REQD: goto hard_fail;
+	//se CURLE_SSL_CACERT_BADFILE: goto hard_fail;
+	//se CURLE_REMOTE_FILE_NOT_FOUND: goto hard_fail;
+	//se CURLE_SSH: goto hard_fail;
+	//se CURLE_SSL_SHUTDOWN_FAILED: goto hard_fail;
+	case CURLE_AGAIN: goto transient;
+	//se CURLE_SSL_CRL_BADFILE: goto hard_fail;
+	//se CURLE_SSL_ISSUER_ERROR: goto hard_fail;
+	//se CURLE_FTP_PRET_FAILED: goto hard_fail;
+	//se CURLE_RTSP_CSEQ_ERROR: goto hard_fail;
+	//se CURLE_RTSP_SESSION_ERROR: goto hard_fail;
+	//se CURLE_FTP_BAD_FILE_LIST: goto hard_fail;
+	//se CURLE_CHUNK_FAILED: goto hard_fail;
+	//se CURLE_NO_CONNECTION_AVAILABLE: goto hard_fail;
+	//se CURLE_SSL_PINNEDPUBKEYNOTMATCH: goto hard_fail;
+	//se CURLE_SSL_INVALIDCERTSTATUS: goto hard_fail;
+#ifdef CURLE_HTTP2_STREAM
+	case CURLE_HTTP2_STREAM: goto transient;
+#endif
+	default: goto hard_fail;
+	}
+
+hard_fail:
+	strbuf_addf(&status->error_message, "(curl:%d) %s [hard_fail]",
+		    status->curl_code,
+		    curl_easy_strerror(status->curl_code));
+	status->retry = GH__RETRY_MODE__HARD_FAIL;
+	status->ec = GH__ERROR_CODE__CURL_ERROR;
+
+	trace2_data_string("gvfs-helper", NULL, "error/curl",
+			   status->error_message.buf);
+	return;
+
+transient:
+	strbuf_addf(&status->error_message, "(curl:%d) %s [transient]",
+		    status->curl_code,
+		    curl_easy_strerror(status->curl_code));
+	status->retry = GH__RETRY_MODE__TRANSIENT;
+	status->ec = GH__ERROR_CODE__CURL_ERROR;
+
+	trace2_data_string("gvfs-helper", NULL, "error/curl",
+			   status->error_message.buf);
+	return;
+}

 /*
@@ -390,32 +859,18 @@ static void gh__response_status__set_from_slot(

 	strbuf_setlen(&status->error_message, 0);

-	if (status->response_code == 200)
-		status->ec = GH__ERROR_CODE__OK;
-
-	else if (status->response_code == 401) {
-		strbuf_addstr(&status->error_message, "401 Not Authorized");
-		status->ec = GH__ERROR_CODE__HTTP_401;
-
-	} else if (status->response_code == 404) {
-		strbuf_addstr(&status->error_message, "404 Not Found");
-		status->ec = GH__ERROR_CODE__HTTP_404;
-
-	} else if (status->curl_code != CURLE_OK) {
-		strbuf_addf(&status->error_message, "%s (curl)",
-			    curl_easy_strerror(status->curl_code));
-		status->ec = GH__ERROR_CODE__CURL_ERROR;
-
-		trace2_data_string("gvfs-helper", NULL,
-				   "error/curl", status->error_message.buf);
-	} else {
-		strbuf_addf(&status->error_message, "HTTP %ld Unexpected",
-			    status->response_code);
-		status->ec = GH__ERROR_CODE__HTTP_UNEXPECTED_CODE;
+	gh__response_status__normalize_odd_codes(params, status);

-		trace2_data_string("gvfs-helper", NULL,
-				   "error/http", status->error_message.buf);
-	}
+	/*
+	 * Use normalized response/status codes from curl/http to decide
+	 * how to set the error-code we propagate *AND* to decide if we
+	 * should retry because of transient network problems.
+	 */
+	if (status->curl_code == CURLE_OK ||
+	    status->curl_code == CURLE_HTTP_RETURNED_ERROR)
+		compute_retry_mode_from_http_response(status);
+	else
+		compute_retry_mode_from_curl_error(status);

 	if (status->ec != GH__ERROR_CODE__OK)
 		status->bytes_received = 0;
@@ -433,26 +888,6 @@ static void gh__response_status__release(struct gh__response_status *status)

 	strbuf_release(&status->content_type);
 }

-/*
- * The cache-server sends a somewhat bogus 400 instead of
- * the normal 401 when AUTH is required. Fixup the status
- * to hide that.
- */
-static void fixup_cache_server_400_to_401(struct gh__response_status *status)
-{
-	if (status->response_code != 400)
-		return;
-
-	/*
-	 * TODO Technically, the cache-server could send a 400
-	 * TODO for many reasons, not just for their bogus
-	 * TODO pseudo-401, but we're going to assume it is a
-	 * TODO 401 for now. We should confirm the expected
-	 * TODO error message in the response-body.
-	 */
-	status->response_code = 401;
-}
-
 static int gh__curl_progress_cb(void *clientp,
 				curl_off_t dltotal, curl_off_t dlnow,
 				curl_off_t ultotal, curl_off_t ulnow)
@@ -543,8 +978,13 @@ static int gh__curl_progress_cb(void *clientp,
 enter_phase_2:
 	strbuf_setlen(&params->progress_msg, 0);
 	if (params->progress_base_phase2_msg.len) {
-		strbuf_addf(&params->progress_msg, "%s (bytes sent)",
-			    params->progress_base_phase2_msg.buf);
+		if (params->k_attempt > 0)
+			strbuf_addf(&params->progress_msg, "%s [retry %d/%d] (bytes sent)",
+				    params->progress_base_phase2_msg.buf,
+				    params->k_attempt, gh__cmd_opts.max_retries);
+		else
+			strbuf_addf(&params->progress_msg, "%s (bytes sent)",
+				    params->progress_base_phase2_msg.buf);
 		params->progress = start_progress(params->progress_msg.buf, ultotal);
 		display_progress(params->progress, ulnow);
 	}
@@ -554,8 +994,13 @@ static int gh__curl_progress_cb(void *clientp,
 enter_phase_3:
 	strbuf_setlen(&params->progress_msg, 0);
 	if (params->progress_base_phase3_msg.len) {
-		strbuf_addf(&params->progress_msg, "%s (bytes received)",
-			    params->progress_base_phase3_msg.buf);
+		if (params->k_attempt > 0)
+			strbuf_addf(&params->progress_msg, "%s [retry %d/%d] (bytes received)",
+				    params->progress_base_phase3_msg.buf,
+				    params->k_attempt, gh__cmd_opts.max_retries);
+		else
+			strbuf_addf(&params->progress_msg, "%s (bytes received)",
+				    params->progress_base_phase3_msg.buf);
 		params->progress = start_progress(params->progress_msg.buf, dltotal);
 		display_progress(params->progress, dlnow);
 	}
@@ -572,12 +1017,19 @@ static void gh__run_one_slot(struct active_request_slot *slot,
 			     struct gh__request_params *params,
 			     struct gh__response_status *status)
 {
-	trace2_region_enter("gvfs-helper", params->label.buf, NULL);
+	struct strbuf key = STRBUF_INIT;
+
+	strbuf_addbuf(&key, &params->tr2_label);
+	strbuf_addstr(&key, gh__server_type_label[params->server_type]);
+
+	params->progress_state = GH__PROGRESS_STATE__START;
+	strbuf_setlen(&params->e2eid, 0);
+
+	trace2_region_enter("gvfs-helper", key.buf, NULL);

 	if (!start_active_slot(slot)) {
 		status->curl_code = CURLE_FAILED_INIT; /* a bit of a lie */
-		strbuf_addstr(&status->error_message,
-			      "failed to start HTTP request");
+		compute_retry_mode_from_curl_error(status);
 	} else {
 		run_active_slot(slot);
 		if (params->b_write_to_file)
@@ -585,27 +1037,38 @@ static void gh__run_one_slot(struct active_request_slot *slot,

 		gh__response_status__set_from_slot(params, status, slot);

-		if (status->ec == GH__ERROR_CODE__OK) {
-			int old_len = params->label.len;
+		log_e2eid(params, status);

-			strbuf_addstr(&params->label, "/nr_objects");
-			trace2_data_intmax("gvfs-helper", NULL,
-					   params->label.buf,
-					   params->object_count);
-			strbuf_setlen(&params->label, old_len);
+		if (status->ec == GH__ERROR_CODE__OK) {
+			int old_len = key.len;

-			strbuf_addstr(&params->label, "/nr_bytes");
+			/*
+			 * We only log the number of bytes received.
+			 * We do not log the number of objects requested
+			 * because the server may give us more than that
+			 * (such as when we request a commit).
+ */ + strbuf_addstr(&key, "/nr_bytes"); trace2_data_intmax("gvfs-helper", NULL, - params->label.buf, + key.buf, status->bytes_received); - strbuf_setlen(¶ms->label, old_len); + strbuf_setlen(&key, old_len); } } if (params->progress) stop_progress(¶ms->progress); - trace2_region_leave("gvfs-helper", params->label.buf, NULL); + if (status->ec == GH__ERROR_CODE__OK && params->b_write_to_file) { + if (params->b_is_post) + install_packfile(params, status); + else + install_loose(params, status); + } + + trace2_region_leave("gvfs-helper", key.buf, NULL); + + strbuf_release(&key); } static int option_parse_cache_server_mode(const struct option *opt, @@ -1053,41 +1516,115 @@ static void select_odb(void) * * TODO Consider using lockfile for this rather than naked tempfile. */ -static struct tempfile *create_tempfile_for_packfile(void) +static void create_tempfile_for_packfile( + struct gh__request_params *params, + struct gh__response_status *status) { static unsigned int nth = 0; static struct timeval tv = {0}; static struct tm tm = {0}; static time_t secs = 0; - static char tbuf[32] = {0}; + static char date[32] = {0}; - struct tempfile *tempfile = NULL; - struct strbuf buf_path = STRBUF_INIT; + struct strbuf basename = STRBUF_INIT; + struct strbuf buf = STRBUF_INIT; + int len_p; + enum scld_error scld; + + gh__response_status__zero(status); if (!nth) { + /* + * Create a string to use in the name of all packfiles + * created by this process. + */ gettimeofday(&tv, NULL); secs = tv.tv_sec; gmtime_r(&secs, &tm); - xsnprintf(tbuf, sizeof(tbuf), "%4d%02d%02d-%02d%02d%02d-%06ld", + xsnprintf(date, sizeof(date), "%4d%02d%02d-%02d%02d%02d-%06ld", tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, (long)tv.tv_usec); } - // TODO should this be in the "/pack/tempPacks/" - // TODO directory instead? YES + /* + * Create a for this packfile using a series number , + * so that all of the chunks we download will group together. + */ + strbuf_addf(&basename, "vfs-%s-%04d", date, nth++); - strbuf_addbuf(&buf_path, &gh__global.buf_odb_path); - strbuf_complete(&buf_path, '/'); - strbuf_addf(&buf_path, "pack/vfs-%s-%04d.temp", tbuf, nth++); + /* + * We will stream the data into a managed tempfile() in: + * + * "/pack/tempPacks/vfs--.temp" + */ + strbuf_setlen(&buf, 0); + strbuf_addbuf(&buf, &gh__global.buf_odb_path); + strbuf_complete(&buf, '/'); + strbuf_addstr(&buf, "pack/"); + len_p = buf.len; + strbuf_addstr(&buf, "tempPacks/"); + strbuf_addbuf(&buf, &basename); + strbuf_addstr(&buf, ".temp"); + + scld = safe_create_leading_directories(buf.buf); + if (scld != SCLD_OK && scld != SCLD_EXISTS) { + strbuf_addf(&status->error_message, + "could not create directory for packfile: '%s'", + buf.buf); + status->ec = GH__ERROR_CODE__COULD_NOT_CREATE_TEMPFILE; + goto cleanup; + } + + params->tempfile = create_tempfile(buf.buf); + if (!params->tempfile) { + strbuf_addf(&status->error_message, + "could not create tempfile for packfile: '%s'", + buf.buf); + status->ec = GH__ERROR_CODE__COULD_NOT_CREATE_TEMPFILE; + goto cleanup; + } - tempfile = create_tempfile(buf_path.buf); - fdopen_tempfile(tempfile, "w"); + fdopen_tempfile(params->tempfile, "w"); - strbuf_release(&buf_path); + /* + * After the download is complete, we will need to steal the file + * from the tempfile() class (so that it doesn't magically delete + * it when we close the file handle) and then index it. 
+	 *
+	 * We do this into the tempPacks directory to avoid contaminating
+	 * the real pack directory until we know there is no corruption.
+	 *
+	 *     "<odb>/pack/tempPacks/vfs-<date>-<nth>.temp.pack"
+	 *     "<odb>/pack/tempPacks/vfs-<date>-<nth>.temp.idx"
+	 */
+	strbuf_setlen(&params->temp_path_pack, 0);
+	strbuf_addf(&params->temp_path_pack, "%s.pack", buf.buf);
+
+	strbuf_setlen(&params->temp_path_idx, 0);
+	strbuf_addf(&params->temp_path_idx, "%s.idx", buf.buf);
+
+	/*
+	 * Later, if all goes well, we will install them as:
+	 *
+	 *     "<odb>/pack/vfs-<date>-<nth>.pack"
+	 *     "<odb>/pack/vfs-<date>-<nth>.idx"
+	 */
+	strbuf_setlen(&buf, len_p);
+	strbuf_setlen(&params->final_path_pack, 0);
+	strbuf_addf(&params->final_path_pack, "%s%s.pack",
+		    buf.buf, basename.buf);
+	strbuf_setlen(&params->final_path_idx, 0);
+	strbuf_addf(&params->final_path_idx, "%s%s.idx",
+		    buf.buf, basename.buf);
+	strbuf_setlen(&params->final_packfile_filename, 0);
+	strbuf_addf(&params->final_packfile_filename, "%s.pack",
+		    basename.buf);

-	return tempfile;
+cleanup:
+	strbuf_release(&buf);
+	strbuf_release(&basename);
 }

 /*
@@ -1100,15 +1637,15 @@ static struct tempfile *create_tempfile_for_packfile(void)
 */
 static void create_tempfile_for_loose(
 	struct gh__request_params *params,
-	struct gh__response_status *status,
-	const struct object_id *oid)
+	struct gh__response_status *status)
 {
+	static int nth = 0;
 	struct strbuf buf_path = STRBUF_INIT;
 	const char *hex;

 	gh__response_status__zero(status);

-	hex = oid_to_hex(oid);
+	hex = oid_to_hex(&params->loose_oid);

 	strbuf_addbuf(&buf_path, &gh__global.buf_odb_path);
 	strbuf_complete(&buf_path, '/');
@@ -1136,7 +1673,7 @@ static void create_tempfile_for_loose(
 	 * using lockfiles to avoid issues with stale locks after
 	 * crashes.
 	 */
-	strbuf_addf(&buf_path, ".%08u.temp", getpid());
+	strbuf_addf(&buf_path, ".%08u.%.06u.temp", getpid(), nth++);

 	params->tempfile = create_tempfile(buf_path.buf);
 	if (!params->tempfile) {
@@ -1153,85 +1690,34 @@ static void create_tempfile_for_loose(
 }

 /*
- * Extract the filename portion of the given pathname.
- *
- * TODO Wish I could find a strbuf_filename() function for this.
+ * Convert the tempfile into a temporary .pack, index it into a temporary .idx
+ * file, and then install the pair into the ODB.
 */
-static void extract_filename(struct strbuf *filename,
-			     const struct strbuf *pathname)
-{
-	size_t len = pathname->len;
-
-	strbuf_setlen(filename, 0);
-
-	while (len > 0 && !is_dir_sep(pathname->buf[len - 1]))
-		len--;
-
-	strbuf_addstr(filename, &pathname->buf[len]);
-}
-
-/*
- * Convert the tempfile into a permanent .pack packfile in the ODB.
- * Create the corresponding .idx file.
- *
- * Return the filename (not pathname) of the resulting packfile.
- */
-static void install_packfile(struct gh__response_status *status,
-			     struct tempfile **pp_tempfile,
-			     struct strbuf *packfile_filename)
+static void install_packfile(struct gh__request_params *params,
+			     struct gh__response_status *status)
 {
 	struct child_process ip = CHILD_PROCESS_INIT;
-	struct strbuf pack_name_tmp = STRBUF_INIT;
-	struct strbuf pack_name_dst = STRBUF_INIT;
-	struct strbuf idx_name_tmp = STRBUF_INIT;
-	struct strbuf idx_name_dst = STRBUF_INIT;
-	size_t len_base;
-
-	gh__response_status__zero(status);
-
-	strbuf_setlen(packfile_filename, 0);

 	/*
-	 * start with "<base>.temp" (that is owned by tempfile class).
-	 * rename to "<base>.pack.temp" to break ownership.
-	 *
-	 * create "<base>.idx.temp" on provisional packfile.
-	 *
-	 * officially install both "<base>.{pack,idx}.temp" as
-	 * "<base>.{pack,idx}".
+	 * When we request more than 1 object, the server should always
+	 * send us a packfile.
	 */
-
-	strbuf_addstr(&pack_name_tmp, get_tempfile_path(*pp_tempfile));
-	if (!strip_suffix(pack_name_tmp.buf, ".temp", &len_base)) {
-		/*
-		 * This is more of a BUG(), but I want the error
-		 * code propagated.
-		 */
+	if (strcmp(status->content_type.buf,
+		   "application/x-git-packfile")) {
 		strbuf_addf(&status->error_message,
-			    "packfile tempfile does not end in '.temp': '%s'",
-			    pack_name_tmp.buf);
-		status->ec = GH__ERROR_CODE__COULD_NOT_INSTALL_PACKFILE;
+			    "received unknown content-type '%s'",
+			    status->content_type.buf);
+		status->ec = GH__ERROR_CODE__UNEXPECTED_CONTENT_TYPE;
 		goto cleanup;
 	}

-	strbuf_setlen(&pack_name_tmp, (int)len_base);
-	strbuf_addbuf(&pack_name_dst, &pack_name_tmp);
-	strbuf_addbuf(&idx_name_tmp, &pack_name_tmp);
-	strbuf_addbuf(&idx_name_dst, &pack_name_tmp);
-
-	strbuf_addstr(&pack_name_tmp, ".pack.temp");
-	strbuf_addstr(&pack_name_dst, ".pack");
-	strbuf_addstr(&idx_name_tmp, ".idx.temp");
-	strbuf_addstr(&idx_name_dst, ".idx");
-
-	// TODO if either pack_name_dst or idx_name_dst already
-	// TODO exists in the ODB, create alternate names so that
-	// TODO we don't step on them.
+	gh__response_status__zero(status);

-	if (rename_tempfile(pp_tempfile, pack_name_tmp.buf) == -1) {
+	if (rename_tempfile(&params->tempfile,
+			    params->temp_path_pack.buf) == -1) {
 		strbuf_addf(&status->error_message,
 			    "could not rename packfile to '%s'",
-			    pack_name_tmp.buf);
+			    params->temp_path_pack.buf);
 		status->ec = GH__ERROR_CODE__COULD_NOT_INSTALL_PACKFILE;
 		goto cleanup;
 	}
@@ -1239,59 +1725,54 @@ static void install_packfile(struct gh__response_status *status,
 	argv_array_push(&ip.args, "index-pack");
 	if (gh__cmd_opts.show_progress)
 		argv_array_push(&ip.args, "-v");
-	argv_array_pushl(&ip.args, "-o", idx_name_tmp.buf, NULL);
-	argv_array_push(&ip.args, pack_name_tmp.buf);
+	argv_array_pushl(&ip.args, "-o", params->temp_path_idx.buf, NULL);
+	argv_array_push(&ip.args, params->temp_path_pack.buf);
 	ip.git_cmd = 1;
 	ip.no_stdin = 1;
 	ip.no_stdout = 1;

-	// TODO consider capturing stdout from index-pack because
-	// TODO it will contain the SHA of the packfile and we can
-	// TODO (should?) add it to the .pack and .idx pathnames
-	// TODO when we install them.
-	// TODO
-	// TODO See pipe_command() rather than run_command().
-	// TODO
-	// TODO Or should be SHA-it ourselves (or read the last 20 bytes)?
-
 	/*
-	 * Note that I DO NOT have a region around the index-pack process.
-	 * The region in gh__run_one_slot() currently only covers the
-	 * download time. This index-pack is a separate step not covered
-	 * in the above region. Later, if/when we have CURL directly stream
-	 * to index-pack, that region will be the combined download+index
-	 * time. So, I'm not going to introduce it here.
+	 * Note that I DO NOT have a trace2 region around the
+	 * index-pack process by itself. Currently, we are inside the
+	 * trace2 region for running the request and that's fine.
+	 * Later, if/when we stream the download directly to
+	 * index-pack, it will be inside under the same region anyway.
+	 * So, I'm not going to introduce it here.
 	 */
 	if (run_command(&ip)) {
-		unlink(pack_name_tmp.buf);
-		unlink(idx_name_tmp.buf);
+		unlink(params->temp_path_pack.buf);
+		unlink(params->temp_path_idx.buf);
 		strbuf_addf(&status->error_message,
-			    "index-pack failed on '%s'", pack_name_tmp.buf);
-		status->ec = GH__ERROR_CODE__COULD_NOT_INSTALL_PACKFILE;
+			    "index-pack failed on '%s'",
+			    params->temp_path_pack.buf);
+		/*
+		 * Let's assume that index-pack failed because the
+		 * downloaded file is corrupt (truncated).
+		 *
+		 * Retry it as if the network had dropped.
+		 */
+		status->retry = GH__RETRY_MODE__TRANSIENT;
+		status->ec = GH__ERROR_CODE__INDEX_PACK_FAILED;
 		goto cleanup;
 	}

-	if (finalize_object_file(pack_name_tmp.buf, pack_name_dst.buf) ||
-	    finalize_object_file(idx_name_tmp.buf, idx_name_dst.buf)) {
-		unlink(pack_name_tmp.buf);
-		unlink(pack_name_dst.buf);
-		unlink(idx_name_tmp.buf);
-		unlink(idx_name_dst.buf);
+	if (finalize_object_file(params->temp_path_pack.buf,
+				 params->final_path_pack.buf) ||
+	    finalize_object_file(params->temp_path_idx.buf,
+				 params->final_path_idx.buf)) {
+		unlink(params->temp_path_pack.buf);
+		unlink(params->temp_path_idx.buf);
+		unlink(params->final_path_pack.buf);
+		unlink(params->final_path_idx.buf);
 		strbuf_addf(&status->error_message,
 			    "could not install packfile '%s'",
-			    pack_name_dst.buf);
+			    params->final_path_pack.buf);
 		status->ec = GH__ERROR_CODE__COULD_NOT_INSTALL_PACKFILE;
 		goto cleanup;
 	}

-	extract_filename(packfile_filename, &pack_name_dst);
-
 cleanup:
 	child_process_clear(&ip);
-	strbuf_release(&pack_name_tmp);
-	strbuf_release(&pack_name_dst);
-	strbuf_release(&idx_name_tmp);
-	strbuf_release(&idx_name_dst);
 }

 /*
@@ -1361,7 +1842,250 @@ static void gh_http_cleanup(void)
 }

 /*
- * Do a single HTTP request without auth-retry or fallback.
+ * buffer has "<key>: <value>[\r]\n"
+ */
+static void parse_resp_hdr_1(const char *buffer, size_t size, size_t nitems,
+			     struct strbuf *key, struct strbuf *value)
+{
+	const char *end = buffer + (size * nitems);
+	const char *p;
+
+	p = strchr(buffer, ':');
+
+	strbuf_setlen(key, 0);
+	strbuf_add(key, buffer, (p - buffer));
+
+	p++; /* skip ':' */
+	p++; /* skip ' ' */
+
+	strbuf_setlen(value, 0);
+	strbuf_add(value, p, (end - p));
+	strbuf_trim_trailing_newline(value);
+}
+
+static size_t parse_resp_hdr(char *buffer, size_t size, size_t nitems,
+			     void *void_params)
+{
+	struct gh__request_params *params = void_params;
+	struct gh__azure_throttle *azure = &gh__global_throttle[params->server_type];
+
+	if (starts_with(buffer, "X-RateLimit-")) {
+		struct strbuf key = STRBUF_INIT;
+		struct strbuf val = STRBUF_INIT;
+
+		parse_resp_hdr_1(buffer, size, nitems, &key, &val);
+
+		/*
+		 * The following X- headers are specific to AzureDevOps.
+		 * Other servers have similar sets of values, but I haven't
+		 * compared them in depth.
+		 *
+		 * TODO Remove this.
+		 */
+		trace2_printf("Throttle: %s %s", key.buf, val.buf);
+
+		if (!strcmp(key.buf, "X-RateLimit-Resource")) {
+			/*
+			 * The name of the resource that is complaining.
+			 * Just log it because we can't do anything with it.
+			 */
+			strbuf_setlen(&key, 0);
+			strbuf_addstr(&key, "ratelimit/resource");
+			strbuf_addstr(&key, gh__server_type_label[params->server_type]);
+
+			trace2_data_string("gvfs-helper", NULL, key.buf, val.buf);
+		}
+
+		else if (!strcmp(key.buf, "X-RateLimit-Delay")) {
+			/*
+			 * The amount of delay added to our response.
+			 * Just log it because we can't do anything with it.
+			 */
+			unsigned long tarpit_delay_ms;
+
+			strbuf_setlen(&key, 0);
+			strbuf_addstr(&key, "ratelimit/delay_ms");
+			strbuf_addstr(&key, gh__server_type_label[params->server_type]);
+
+			git_parse_ulong(val.buf, &tarpit_delay_ms);
+
+			trace2_data_intmax("gvfs-helper", NULL, key.buf, tarpit_delay_ms);
+		}
+
+		else if (!strcmp(key.buf, "X-RateLimit-Limit")) {
+			/*
+			 * The resource limit/quota before we get a 429.
+			 */
+			git_parse_ulong(val.buf, &azure->tstu_limit);
+		}
+
+		else if (!strcmp(key.buf, "X-RateLimit-Remaining")) {
+			/*
+			 * The amount of our quota remaining. When zero, we
+			 * should get 429s on further requests until the reset
+			 * time.
+			 */
+			git_parse_ulong(val.buf, &azure->tstu_remaining);
+		}
+
+		else if (!strcmp(key.buf, "X-RateLimit-Reset")) {
+			/*
+			 * The server gave us a time-in-seconds-since-the-epoch
+			 * for when our quota will be reset (if we stop all
+			 * activity right now).
+			 *
+			 * Checkpoint the local system clock so we can do some
+			 * sanity checks on any clock skew. Also, since we get
+			 * the headers before we get the content, we can adjust
+			 * our delay to compensate for the full download time.
+			 */
+			unsigned long now = time(NULL);
+			unsigned long reset_time;
+
+			git_parse_ulong(val.buf, &reset_time);
+			if (reset_time > now)
+				azure->reset_sec = reset_time - now;
+		}
+
+		strbuf_release(&key);
+		strbuf_release(&val);
+	}
+
+	else if (starts_with(buffer, "Retry-After")) {
+		struct strbuf key = STRBUF_INIT;
+		struct strbuf val = STRBUF_INIT;
+
+		parse_resp_hdr_1(buffer, size, nitems, &key, &val);
+
+		/*
+		 * We get this header with a 429 and 503 and possibly a 30x.
+		 *
+		 * Curl does have CURLINFO_RETRY_AFTER that nicely parses and
+		 * normalizes the value (and supports HTTP/1.1 usage), but it
+		 * is not present yet in the version shipped with the Mac, so
+		 * we do it directly here.
+		 */
+		git_parse_ulong(val.buf, &azure->retry_after_sec);
+
+		strbuf_release(&key);
+		strbuf_release(&val);
+	}
+
+	else if (starts_with(buffer, "X-VSS-E2EID")) {
+		struct strbuf key = STRBUF_INIT;
+
+		/*
+		 * Capture the E2EID as it goes by, but don't log it until we
+		 * know the request result.
+		 */
+		parse_resp_hdr_1(buffer, size, nitems, &key, &params->e2eid);
+
+		strbuf_release(&key);
+	}
+
+	return nitems * size;
+}
+
+/*
+ * Wait "duration" seconds and drive the progress mechanism.
+ *
+ * We spin slightly faster than we need to, to keep the progress bar
+ * drawn (especially if the user presses return while waiting) and to
+ * compensate for delay factors built into the progress class (which
+ * might wait for 2 seconds before drawing the first message).
+ */
+static void do_throttle_spin(struct gh__request_params *params,
+			     const char *tr2_label,
+			     const char *progress_msg,
+			     int duration)
+{
+	struct strbuf region = STRBUF_INIT;
+	struct progress *progress = NULL;
+	unsigned long begin = time(NULL);
+	unsigned long now = begin;
+	unsigned long end = begin + duration;
+
+	strbuf_addstr(&region, tr2_label);
+	strbuf_addstr(&region, gh__server_type_label[params->server_type]);
+	trace2_region_enter("gvfs-helper", region.buf, NULL);
+
+	progress = start_progress(progress_msg, duration);
+	while (now < end) {
+		display_progress(progress, (now - begin));
+
+		sleep_millisec(100);
+
+		now = time(NULL);
+	}
+	display_progress(progress, duration);
+	stop_progress(&progress);
+
+	trace2_region_leave("gvfs-helper", region.buf, NULL);
+	strbuf_release(&region);
+}
+
+/*
+ * Delay the outbound request if necessary in response to previous throttle
+ * blockages or hints. Throttle data is somewhat orthogonal to the status
+ * results from any previous request and/or the request params of the next
+ * request.
+ *
+ * Note that the throttle info is also cross-process information: for
+ * example, 2 concurrent fetches in 2 different terminal windows to the
+ * same server will be sharing the same server quota. These could be
+ * coordinated too, so that a blockage received in one process would
+ * prevent the other process from starting another request (and also
+ * block or extend the delay interval). We're NOT going to do that level
+ * of integration.
+ * We will let both processes independently attempt the next request.
+ * This may cause us to miss the end-of-quota boundary if the server
+ * extends it because of the second request.
+ *
+ * TODO Should we have a max-wait option and then return a hard-error
+ * TODO of some type?
+ */
+static void do_throttle_wait(struct gh__request_params *params,
+			     struct gh__response_status *status)
+{
+	struct gh__azure_throttle *azure =
+		&gh__global_throttle[params->server_type];
+
+	if (azure->retry_after_sec) {
+		/*
+		 * We were given a hard delay (such as after a 429).
+		 * Spin until the requested time.
+		 */
+		do_throttle_spin(params, "throttle/hard",
+				 "Waiting on hard throttle (sec)",
+				 azure->retry_after_sec);
+		return;
+	}
+
+	if (azure->reset_sec > 0) {
+		/*
+		 * We were given a hint that we are overloading
+		 * the server. Voluntarily back off (before we
+		 * get tarpitted or blocked).
+		 */
+		do_throttle_spin(params, "throttle/soft",
+				 "Waiting on soft throttle (sec)",
+				 azure->reset_sec);
+		return;
+	}
+
+	if (params->k_transient_delay_sec) {
+		/*
+		 * Insert an arbitrary delay before retrying after a
+		 * transient (network) failure.
+		 */
+		do_throttle_spin(params, "throttle/transient",
				 "Waiting to retry after network error (sec)",
+				 params->k_transient_delay_sec);
+		return;
+	}
+}
+
+/*
+ * Do a single HTTP request WITHOUT robust-retry, auth-retry or fallback.
+ */
 static void do_req(const char *url_base,
 		   const char *url_component,
@@ -1376,14 +2100,27 @@ static void do_req(const char *url_base,

 	gh__response_status__zero(status);

 	if (params->b_write_to_file) {
-		// TODO ftruncate tempfile ??
+		/* Delete dirty tempfile from a previous attempt. */
+		if (params->tempfile)
+			delete_tempfile(&params->tempfile);
+
+		if (params->b_is_post)
+			create_tempfile_for_packfile(params, status);
+		else
+			create_tempfile_for_loose(params, status);
+
+		if (!params->tempfile || status->ec != GH__ERROR_CODE__OK)
+			return;
 	} else {
+		/* Guard against caller using dirty buffer */
 		strbuf_setlen(params->buffer, 0);
 	}

 	end_url_with_slash(&rest_url, url_base);
 	strbuf_addstr(&rest_url, url_component);

+	do_throttle_wait(params, status);
+	gh__azure_throttle__zero(&gh__global_throttle[params->server_type]);
+
 	slot = get_active_slot();
 	slot->results = &results;

@@ -1412,6 +2149,9 @@ static void do_req(const char *url_base,
 		curl_easy_setopt(slot->curl, CURLOPT_FILE, params->buffer);
 	}

+	curl_easy_setopt(slot->curl, CURLOPT_HEADERFUNCTION, parse_resp_hdr);
+	curl_easy_setopt(slot->curl, CURLOPT_HEADERDATA, params);
+
 	if (creds && creds->username) {
 		/*
 		 * Force CURL to respect the username/password we provide by
@@ -1438,8 +2178,8 @@ static void do_req(const char *url_base,
 		 * with the main Git server (because the cache-server
 		 * doesn't handle 401s).
 		 *
-		 * TODO Think about if we really need to handle this case
-		 * TODO and if so, add an .is_main to params.
+		 * TODO Think about if we really need to handle this case.
+		 * TODO Guard with "if (params->server_type == __MAIN)"
 		 */
 		curl_easy_setopt(slot->curl, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
 	}
@@ -1457,20 +2197,123 @@ static void do_req(const char *url_base,

 	gh__run_one_slot(slot, params, status);
 }

+/*
+ * Compute the delay for the nth attempt.
+ *
+ * No delay for the first attempt. Then use a normal exponential backoff
+ * starting from 8.
+ */
+static int compute_transient_delay(int attempt)
+{
+	int v;
+
+	if (attempt < 1)
+		return 0;
+
+	/*
+	 * Let 8K be our hard limit (for integer overflow protection).
+	 * That's over 2 hours. This is 8<<10.
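+	 *
+	 * (To make the backoff concrete: attempt 1 waits 8 sec,
+	 * attempt 2 waits 16, attempt 3 waits 32, and so on, doubling
+	 * each time until it is clamped by the attempt cap and by
+	 * gh__cmd_opts.max_transient_backoff_sec below.)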
+	 */
+	if (attempt > 10)
+		attempt = 10;
+
+	v = 8 << (attempt - 1);
+
+	if (v > gh__cmd_opts.max_transient_backoff_sec)
+		v = gh__cmd_opts.max_transient_backoff_sec;
+
+	return v;
+}
+
+/*
+ * Robustly make an HTTP request. Retry if necessary to hide common
+ * transient network errors and/or 429 blockages.
+ *
+ * For a transient (network) failure (where we do not have a throttle
+ * delay factor), we should insert a small delay to let the network
+ * recover. The outage might be because the VPN dropped, or the
+ * machine went to sleep or something and we want to give the network
+ * time to come back up. Insert AI here :-)
+ */
+static void do_req__with_robust_retry(const char *url_base,
+				      const char *url_component,
+				      const struct credential *creds,
+				      struct gh__request_params *params,
+				      struct gh__response_status *status)
+{
+	for (params->k_attempt = 0;
+	     params->k_attempt < gh__cmd_opts.max_retries + 1;
+	     params->k_attempt++) {
+
+		do_req(url_base, url_component, creds, params, status);
+
+		switch (status->retry) {
+		default:
+		case GH__RETRY_MODE__SUCCESS:
+		case GH__RETRY_MODE__HTTP_401: /* caller does auth-retry */
+		case GH__RETRY_MODE__HARD_FAIL:
+		case GH__RETRY_MODE__FAIL_404:
+			return;
+
+		case GH__RETRY_MODE__HTTP_429:
+		case GH__RETRY_MODE__HTTP_503:
+			/*
+			 * We should have gotten a "Retry-After" header with
+			 * these and that gives us the wait time. If not,
+			 * fallthru and use the backoff delay.
+			 */
+			if (gh__global_throttle[params->server_type].retry_after_sec)
+				continue;
+			/*fallthru*/
+
+		case GH__RETRY_MODE__TRANSIENT:
+			params->k_transient_delay_sec =
+				compute_transient_delay(params->k_attempt);
+			continue;
+		}
+	}
+}
+
 static void do_req__to_main(const char *url_component,
 			    struct gh__request_params *params,
 			    struct gh__response_status *status)
 {
-//	lookup_main_creds();
+	params->server_type = GH__SERVER_TYPE__MAIN;

-	do_req(gh__global.main_url, url_component, &gh__global.main_creds,
-	       params, status);
+	/*
+	 * When talking to the main Git server, we do allow non-auth'd
+	 * initial requests (mainly for testing, since production servers
+	 * will usually always want creds), so we DO NOT force load the
+	 * user's creds here and let the normal 401 mechanism handle things.
+	 * This has a slight perf penalty -- if we're bulk fetching 4000
+	 * objects from a production server, we're going to do a 100K+ byte
+	 * POST just to get a 401 and then repeat the 100K+ byte POST
+	 * with the creds.
+	 *
+	 * TODO Consider making a test or runtime flag to alter this.
+	 * TODO If set/not-set, add a call here to pre-populate the creds.
+	 * TODO
+	 * TODO     lookup_main_creds();
+	 * TODO
+	 *
+	 * TODO Testing with GIT_TRACE_CURL shows that curl will *SOMETIMES*
+	 * TODO send an "Expect: 100-continue" and silently handle/eat the
+	 * TODO "HTTP/1.1 100 Continue" before the POST-body is sent,
+	 * TODO so maybe this isn't an issue. (Other than the extra
+	 * TODO roundtrip for the first set of headers.) More testing here
+	 * TODO should be performed.
+	 */
+
+	do_req__with_robust_retry(gh__global.main_url, url_component,
+				  &gh__global.main_creds,
+				  params, status);

 	if (status->response_code == 401) {
 		refresh_main_creds();

-		do_req(gh__global.main_url, url_component, &gh__global.main_creds,
-		       params, status);
+		do_req__with_robust_retry(gh__global.main_url, url_component,
+					  &gh__global.main_creds,
+					  params, status);
 	}

 	if (status->response_code == 200)
@@ -1481,29 +2324,36 @@ static void do_req__to_cache_server(const char *url_component,
 				    struct gh__request_params *params,
 				    struct gh__response_status *status)
 {
+	params->server_type = GH__SERVER_TYPE__CACHE;
+
 	synthesize_cache_server_creds();

-	do_req(gh__global.cache_server_url, url_component, &gh__global.cache_creds,
-	       params, status);
-	fixup_cache_server_400_to_401(status);
+	do_req__with_robust_retry(gh__global.cache_server_url, url_component,
+				  &gh__global.cache_creds,
+				  params, status);

 	if (status->response_code == 401) {
 		refresh_cache_server_creds();

-		do_req(gh__global.cache_server_url, url_component,
-		       &gh__global.cache_creds, params, status);
-		fixup_cache_server_400_to_401(status);
+		do_req__with_robust_retry(gh__global.cache_server_url,
+					  url_component,
+					  &gh__global.cache_creds,
+					  params, status);
 	}

 	if (status->response_code == 200)
 		approve_cache_server_creds();
 }

+/*
+ * Try the cache-server (if configured) then fall back to the main Git server.
+ */
 static void do_req__with_fallback(const char *url_component,
 				  struct gh__request_params *params,
 				  struct gh__response_status *status)
 {
-	if (gh__global.cache_server_url && !params->b_no_cache_server) {
+	if (gh__global.cache_server_url &&
+	    params->b_permit_cache_server_if_defined) {
 		do_req__to_cache_server(url_component, params, status);

 		if (status->response_code == 200)
@@ -1538,11 +2388,12 @@ static void do__gvfs_config(struct gh__response_status *status,
 {
 	struct gh__request_params params = GH__REQUEST_PARAMS_INIT;

-	strbuf_addstr(&params.label, "GET/config");
+	strbuf_addstr(&params.tr2_label, "GET/config");

 	params.b_is_post = 0;
 	params.b_write_to_file = 0;
-	params.b_no_cache_server = 1; /* they don't handle gvfs/config API */
+	/* cache-servers do not handle gvfs/config REST calls */
+	params.b_permit_cache_server_if_defined = 0;

 	params.buffer = config_data;
 	params.object_count = 1; /* a bit of a lie */
@@ -1587,11 +2438,11 @@ static void do__loose__gvfs_object(struct gh__response_status *status,

 	strbuf_addf(&component_url, "gvfs/objects/%s", oid_to_hex(oid));

-	strbuf_addstr(&params.label, "GET/objects");
+	strbuf_addstr(&params.tr2_label, "GET/objects");

 	params.b_is_post = 0;
 	params.b_write_to_file = 1;
-	params.b_no_cache_server = 0;
+	params.b_permit_cache_server_if_defined = 1;

 	params.object_count = 1;

@@ -1601,9 +2452,7 @@ static void do__loose__gvfs_object(struct gh__response_status *status,

 	params.headers = curl_slist_append(params.headers, "Pragma: no-cache");

-	create_tempfile_for_loose(&params, status, oid);
-	if (!params.tempfile)
-		goto cleanup;
+	oidcpy(&params.loose_oid, oid);

 	if (gh__cmd_opts.show_progress) {
 		/*
@@ -1618,10 +2467,6 @@ static void do__loose__gvfs_object(struct gh__response_status *status,

 	do_req__with_fallback(component_url.buf, &params, status);

-	if (status->ec == GH__ERROR_CODE__OK)
-		install_loose(&params, status);
-
-cleanup:
 	gh__request_params__release(&params);
 	strbuf_release(&component_url);
 }
@@ -1634,23 +2479,26 @@ static void do__loose__gvfs_object(struct gh__response_status *status,
 static void do__packfile__gvfs_objects(struct gh__response_status *status,
 				       struct oidset_iter *iter,
 				       unsigned long nr_wanted_in_block,
+				       int j_pack_num, int j_pack_den,
 				       struct strbuf *output_filename,
-				       unsigned long *nr_taken)
+				       unsigned long *nr_oid_taken)
 {
 	struct json_writer jw_req = JSON_WRITER_INIT;
 	struct gh__request_params params = GH__REQUEST_PARAMS_INIT;

+	strbuf_setlen(output_filename, 0);
+
 	gh__response_status__zero(status);

 	params.object_count = build_json_payload__gvfs_objects(
 		&jw_req, iter, nr_wanted_in_block);
-	*nr_taken = params.object_count;
+	*nr_oid_taken = params.object_count;

-	strbuf_addstr(&params.label, "POST/objects");
+	strbuf_addstr(&params.tr2_label, "POST/objects");

 	params.b_is_post = 1;
 	params.b_write_to_file = 1;
-	params.b_no_cache_server = 0;
+	params.b_permit_cache_server_if_defined = 1;

 	params.post_payload = &jw_req.json;

@@ -1675,73 +2523,21 @@ static void do__packfile__gvfs_objects(struct gh__response_status *status,
 	params.headers = curl_slist_append(params.headers,
 					   "Accept: application/x-git-loose-object");

-	params.tempfile = create_tempfile_for_packfile();
-	if (!params.tempfile) {
-		strbuf_addstr(&status->error_message,
-			      "could not create tempfile for packfile");
-		status->ec = GH__ERROR_CODE__COULD_NOT_CREATE_TEMPFILE;
-		goto cleanup;
-	}
-
 	if (gh__cmd_opts.show_progress) {
 		strbuf_addf(&params.progress_base_phase2_msg,
-			    "Requesting packfile with %ld objects",
+			    "Requesting packfile %d/%d with %ld objects",
+			    j_pack_num, j_pack_den,
 			    params.object_count);
 		strbuf_addf(&params.progress_base_phase3_msg,
-			    "Receiving packfile with %ld objects",
+			    "Receiving packfile %d/%d with %ld objects",
+			    j_pack_num, j_pack_den,
 			    params.object_count);
 	}

 	do_req__with_fallback("gvfs/objects", &params, status);
+	if (status->ec == GH__ERROR_CODE__OK)
+		strbuf_addbuf(output_filename, &params.final_packfile_filename);

-	if (status->ec == GH__ERROR_CODE__OK) {
-		if (!strcmp(status->content_type.buf,
-			    "application/x-git-packfile")) {
-
-			// TODO Consider having a worker thread to manage
-			// TODO running index-pack and then install the
-			// TODO resulting .idx and .pack files. This would
-			// TODO let us interleave those steps with our thread
-			// TODO fetching the next block of objects from the
-			// TODO server. (Need to think about how progress
-			// TODO messages from our thread and index-pack
-			// TODO would mesh.)
-			// TODO
-			// TODO But then again, if we hack index-pack to write
-			// TODO to our alternate and stream the data thru it,
-			// TODO it won't matter.
-
-			install_packfile(status, &params.tempfile,
-					 output_filename);
-			goto cleanup;
-		}
-
-		if (!strcmp(status->content_type.buf,
-			    "application/x-git-loose-object"))
-		{
-			/*
-			 * This should not happen (when we request
-			 * more than one object). The server can send
-			 * us a loose object (even when we use the
-			 * POST form) if there is only one object in
-			 * the payload (and despite the set of accept
-			 * headers we send), so I'm going to leave
-			 * this here.
-			 */
-			strbuf_addstr(&status->error_message,
-				      "received loose object when packfile expected");
-			status->ec = GH__ERROR_CODE__UNEXPECTED_CONTENT_TYPE;
-			goto cleanup;
-		}
-
-		strbuf_addf(&status->error_message,
-			    "received unknown content-type '%s'",
-			    status->content_type.buf);
-		status->ec = GH__ERROR_CODE__UNEXPECTED_CONTENT_TYPE;
-		goto cleanup;
-	}
-
-cleanup:
 	gh__request_params__release(&params);
 	jw_release(&jw_req);
 }
@@ -1755,7 +2551,7 @@ static void do__packfile__gvfs_objects(struct gh__response_status *status,
 */
 static void do_fetch_oidset(struct gh__response_status *status,
 			    struct oidset *oids,
-			    unsigned long nr_total,
+			    unsigned long nr_oid_total,
 			    struct string_list *result_list)
 {
 	struct oidset_iter iter;
@@ -1764,19 +2560,25 @@ static void do_fetch_oidset(struct gh__response_status *status,
 	struct strbuf err404 = STRBUF_INIT;
 	const struct object_id *oid;
 	unsigned long k;
-	unsigned long nr_taken;
+	unsigned long nr_oid_taken;
 	int had_404 = 0;
+	int j_pack_den = 0;
+	int j_pack_num = 0;

 	gh__response_status__zero(status);

-	if (!nr_total)
+	if (!nr_oid_total)
 		return;

+	if (nr_oid_total > 1)
+		j_pack_den = ((nr_oid_total + gh__cmd_opts.block_size - 1)
+			      / gh__cmd_opts.block_size);
+
 	oidset_iter_init(oids, &iter);

-	for (k = 0; k < nr_total; k += nr_taken) {
-		if (nr_total - k == 1 || gh__cmd_opts.block_size == 1) {
+	for (k = 0; k < nr_oid_total; k += nr_oid_taken) {
+		if (nr_oid_total - k == 1 || gh__cmd_opts.block_size == 1) {
 			oid = oidset_iter_next(&iter);
-			nr_taken = 1;
+			nr_oid_taken = 1;

 			do__loose__gvfs_object(status, oid);

@@ -1811,10 +2613,13 @@ static void do_fetch_oidset(struct gh__response_status *status,
 		} else {
 			strbuf_setlen(&output_filename, 0);

+			j_pack_num++;
+
 			do__packfile__gvfs_objects(status, &iter,
 						   gh__cmd_opts.block_size,
+						   j_pack_num, j_pack_den,
 						   &output_filename,
-						   &nr_taken);
+						   &nr_oid_taken);

 			/*
 			 * Because the oidset iterator has random
@@ -1930,6 +2735,8 @@ static enum gh__error_code do_sub_cmd__get(int argc, const char **argv)
 			   N_("number of objects to request at a time")),
 		OPT_INTEGER('d', "depth", &gh__cmd_opts.depth,
 			    N_("Commit depth")),
+		OPT_INTEGER('r', "max-retries", &gh__cmd_opts.max_retries,
+			    N_("retries for transient network errors")),
 		OPT_END(),
 	};

@@ -1937,7 +2744,7 @@ static enum gh__error_code do_sub_cmd__get(int argc, const char **argv)
 	struct oidset oids = OIDSET_INIT;
 	struct string_list result_list = STRING_LIST_INIT_DUP;
 	enum gh__error_code ec = GH__ERROR_CODE__OK;
-	unsigned long nr_total;
+	unsigned long nr_oid_total;
 	int k;

 	trace2_cmd_mode("get");
@@ -1948,14 +2755,16 @@ static enum gh__error_code do_sub_cmd__get(int argc, const char **argv)
 	argc = parse_options(argc, argv, NULL, get_options, get_usage, 0);
 	if (gh__cmd_opts.depth < 1)
 		gh__cmd_opts.depth = 1;
+	if (gh__cmd_opts.max_retries < 0)
+		gh__cmd_opts.max_retries = 0;

 	finish_init(1);

-	nr_total = read_stdin_from_rev_list(&oids);
+	nr_oid_total = read_stdin_from_rev_list(&oids);

 	trace2_region_enter("gvfs-helper", "get", NULL);
-	trace2_data_intmax("gvfs-helper", NULL, "get/nr_objects", nr_total);
-	do_fetch_oidset(&status, &oids, nr_total, &result_list);
+	trace2_data_intmax("gvfs-helper", NULL, "get/nr_objects", nr_oid_total);
+	do_fetch_oidset(&status, &oids, nr_oid_total, &result_list);
 	trace2_region_leave("gvfs-helper", "get", NULL);

 	ec = status.ec;
@@ -1990,7 +2799,7 @@ static enum gh__error_code do_server_subprocess_get(void)
 	int len;
 	int err;
 	int k;
-	unsigned long nr_total = 0;
+	unsigned long nr_oid_total = 0;

 	/*
 	 * Inside the "get" command, we expect a list of OIDs
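The packfile numbering in the progress messages above comes from a simple ceiling division over the total number of queued OIDs. As a small sketch (an illustration, not code from this patch; the real computation is inlined in do_fetch_oidset()):

	/*
	 * E.g. 9000 OIDs with a block size of 4000 yields 3 batches, so
	 * the progress messages read "Requesting packfile 1/3 ...",
	 * "... 2/3 ...", and "... 3/3 ...".
	 */
	static unsigned long nr_packfile_batches(unsigned long nr_oid_total,
						 unsigned long block_size)
	{
		return (nr_oid_total + block_size - 1) / block_size;
	}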
@@ -2008,10 +2817,10 @@ static enum gh__error_code do_server_subprocess_get(void)
 		}

 		if (!oidset_insert(&oids, &oid))
-			nr_total++;
+			nr_oid_total++;
 	}

-	if (!nr_total) {
+	if (!nr_oid_total) {
 		if (packet_write_fmt_gently(1, "ok\n")) {
 			error("server: cannot write 'get' result to client");
 			ec = GH__ERROR_CODE__SUBPROCESS_SYNTAX;
@@ -2021,8 +2830,8 @@ static enum gh__error_code do_server_subprocess_get(void)
 	}

 	trace2_region_enter("gvfs-helper", "server/get", NULL);
-	trace2_data_intmax("gvfs-helper", NULL, "server/get/nr_objects", nr_total);
-	do_fetch_oidset(&status, &oids, nr_total, &result_list);
+	trace2_data_intmax("gvfs-helper", NULL, "server/get/nr_objects", nr_oid_total);
+	do_fetch_oidset(&status, &oids, nr_oid_total, &result_list);
 	trace2_region_leave("gvfs-helper", "server/get", NULL);

 	/*
@@ -2180,6 +2989,8 @@ static enum gh__error_code do_sub_cmd__server(int argc, const char **argv)
 			   N_("number of objects to request at a time")),
 		OPT_INTEGER('d', "depth", &gh__cmd_opts.depth,
 			    N_("Commit depth")),
+		OPT_INTEGER('r', "max-retries", &gh__cmd_opts.max_retries,
+			    N_("retries for transient network errors")),
 		OPT_END(),
 	};

@@ -2196,6 +3007,8 @@ static enum gh__error_code do_sub_cmd__server(int argc, const char **argv)
 	argc = parse_options(argc, argv, NULL, server_options, server_usage, 0);
 	if (gh__cmd_opts.depth < 1)
 		gh__cmd_opts.depth = 1;
+	if (gh__cmd_opts.max_retries < 0)
+		gh__cmd_opts.max_retries = 0;

 	finish_init(1);

@@ -2285,13 +3098,23 @@ int cmd_main(int argc, const char **argv)

 	setup_git_directory_gently(NULL);

-	git_config(git_default_config, NULL);
-
 	/* Set any non-zero initial values in gh__cmd_opts. */
-	gh__cmd_opts.depth = 1;
+	gh__cmd_opts.depth = GH__DEFAULT_COMMIT_DEPTH;
 	gh__cmd_opts.block_size = GH__DEFAULT_BLOCK_SIZE;
+	gh__cmd_opts.max_retries = GH__DEFAULT_MAX_RETRIES;
+	gh__cmd_opts.max_transient_backoff_sec =
+		GH__DEFAULT_MAX_TRANSIENT_BACKOFF_SEC;
+
 	gh__cmd_opts.show_progress = !!isatty(2);

+	// TODO use existing gvfs config settings to override our GH__DEFAULT_
+	// TODO values in gh__cmd_opts. (And maybe add/remove our command line
+	// TODO options for them.)
+	// TODO
+	// TODO See "scalar.max-retries" (and maybe "gvfs.max-retries")
+
+	git_config(git_default_config, NULL);
+
 	argc = parse_options(argc, argv, NULL, main_options, main_usage,
 			     PARSE_OPT_STOP_AT_NON_OPTION);
 	if (argc == 0)

From ae0c9ccdfef5fb5dcdf1166f41e14dbb920c11a9 Mon Sep 17 00:00:00 2001
From: Jeff Hostetler
Date: Mon, 21 Oct 2019 12:29:19 -0400
Subject: [PATCH 34/36] gvfs-helper: expose gvfs/objects GET and POST semantics

Expose the differences in the semantics of GET and POST for
the "gvfs/objects" API:

    HTTP GET: fetches a single loose object over the network.
              When a commit object is requested, it just returns
              the single object.

    HTTP POST: fetches a batch of objects over the network.
               When the oid-set contains a commit object, all
               referenced trees are also included in the response.

gvfs-helper is updated to take "get" and "post" command line options.
The gvfs-helper "server" mode is updated to take "objects.get" and
"objects.post" verbs.

For convenience, the "get" option and the "objects.get" verb do allow
more than one object to be requested. gvfs-helper will automatically
issue a series of (single object) HTTP GET requests and create a
series of loose objects.

The "post" option and the "objects.post" verb will perform bulk object
fetching using the batch-size chunking. The response to an individual
HTTP POST containing more than one object will be created as a packfile.
An HTTP POST for a single object will create a loose object.

This commit also contains some refactoring to eliminate the
assumption that POST is always associated with packfiles.

In gvfs-helper-client.c, gh_client__get_immediate() now uses the
"objects.get" verb and ignores any currently queued objects.

In gvfs-helper-client.c, the OIDSET built by gh_client__queue_oid()
is only processed when gh_client__drain_queue() is called. The queue
is processed using the "objects.post" verb.

Signed-off-by: Jeff Hostetler
---
 gvfs-helper-client.c | 225 +++++++++-----
 gvfs-helper-client.h |  26 +-
 gvfs-helper.c        | 712 +++++++++++++++++++++++++++----------------
 3 files changed, 622 insertions(+), 341 deletions(-)

diff --git a/gvfs-helper-client.c b/gvfs-helper-client.c
index 1564610aae594b..3bcec53cfe2acc 100644
--- a/gvfs-helper-client.c
+++ b/gvfs-helper-client.c
@@ -13,7 +13,6 @@

 static struct oidset gh_client__oidset_queued = OIDSET_INIT;
 static unsigned long gh_client__oidset_count;
-static int gh_client__includes_immediate;

 struct gh_server__process {
 	struct subprocess_entry subprocess; /* must be first */
@@ -24,13 +23,20 @@ static int gh_server__subprocess_map_initialized;
 static struct hashmap gh_server__subprocess_map;
 static struct object_directory *gh_client__chosen_odb;

-#define CAP_GET      (1u<<1)
+/*
+ * The "objects" capability has 2 verbs: "get" and "post".
+ */
+#define CAP_OBJECTS      (1u<<1)
+#define CAP_OBJECTS_NAME "objects"
+
+#define CAP_OBJECTS__VERB_GET1_NAME "get"
+#define CAP_OBJECTS__VERB_POST_NAME "post"

 static int gh_client__start_fn(struct subprocess_entry *subprocess)
 {
 	static int versions[] = {1, 0};
 	static struct subprocess_capability capabilities[] = {
-		{ "get", CAP_GET },
+		{ CAP_OBJECTS_NAME, CAP_OBJECTS },
 		{ NULL, 0 }
 	};

@@ -42,14 +48,16 @@ static int gh_client__start_fn(struct subprocess_entry *subprocess)
 }

 /*
- * Send:
+ * Send the queued OIDs in the OIDSET to gvfs-helper for it to
+ * fetch from the cache-server or main Git server using "/gvfs/objects"
+ * POST semantics.
 *
- *     get LF
+ *     objects.post LF
 *     (<oid> LF)*
 *     <flush>
 *
 */
-static int gh_client__get__send_command(struct child_process *process)
+static int gh_client__send__objects_post(struct child_process *process)
 {
 	struct oidset_iter iter;
 	struct object_id *oid;
@@ -60,7 +68,9 @@ static int gh_client__send__objects_post(struct child_process *process)
 	 * so that we don't have to.
 	 */

-	err = packet_write_fmt_gently(process->in, "get\n");
+	err = packet_write_fmt_gently(
+		process->in,
+		(CAP_OBJECTS_NAME "." CAP_OBJECTS__VERB_POST_NAME "\n"));
 	if (err)
 		return err;

@@ -79,6 +89,46 @@ static int gh_client__send__objects_post(struct child_process *process)
 	return 0;
 }

+/*
+ * Send the given OID to gvfs-helper for it to fetch from the
+ * cache-server or main Git server using "/gvfs/objects" GET
+ * semantics.
+ *
+ * This ignores any queued OIDs.
+ *
+ *     objects.get LF
+ *     <oid> LF
+ *     <flush>
+ *
+ */
+static int gh_client__send__objects_get(struct child_process *process,
+					const struct object_id *oid)
+{
+	int err;
+
+	/*
+	 * We assume that all of the packet_ routines call error()
+	 * so that we don't have to.
+	 */
+
+	err = packet_write_fmt_gently(
+		process->in,
+		(CAP_OBJECTS_NAME "." CAP_OBJECTS__VERB_GET1_NAME "\n"));
+	if (err)
+		return err;
+
+	err = packet_write_fmt_gently(process->in, "%s\n",
+				      oid_to_hex(oid));
+	if (err)
+		return err;
+
+	err = packet_flush_gently(process->in);
+	if (err)
+		return err;
+
+	return 0;
+}
+
 /*
 * Verify that the pathname found in the "odb" response line matches
 * what we requested.
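To make the two client paths concrete, here is a short usage sketch (an illustration based on the API in this patch; `oids`, `nr`, and `oid` stand for the caller's variables, and error handling is reduced to die()). Queued OIDs are sent only when the queue is drained, as one or more "objects.post" batches; an immediate request bypasses the queue with a single "objects.get".

	enum gh_client__created ghc;

	gh_client__queue_oid_array(oids, nr);      /* queued only; no I/O yet */
	if (gh_client__drain_queue(&ghc))          /* "objects.post" round trip(s) */
		die("bulk fetch failed");

	if (gh_client__get_immediate(&oid, &ghc))  /* single "objects.get" */
		die("loose fetch failed");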
@@ -148,7 +198,7 @@ static void gh_client__update_packed_git(const char *line)
 }

 /*
- * We expect:
+ * Both CAP_OBJECTS verbs return the same format response:
 *
 *
 *
 *
@@ -179,7 +229,7 @@ static void gh_client__update_packed_git(const char *line)
 * grouped with a queued request for a blob. The tree-walk *might* be
 * able to continue and let the 404 blob be handled later.
 */
-static int gh_client__get__receive_response(
+static int gh_client__objects__receive_response(
 	struct child_process *process,
 	enum gh_client__created *p_ghc,
 	int *p_nr_loose, int *p_nr_packfile)
@@ -259,17 +309,12 @@ static void gh_client__choose_odb(void)
 	}
 }

-static int gh_client__get(enum gh_client__created *p_ghc)
+static struct gh_server__process *gh_client__find_long_running_process(
+	unsigned int cap_needed)
 {
 	struct gh_server__process *entry;
-	struct child_process *process;
 	struct argv_array argv = ARGV_ARRAY_INIT;
 	struct strbuf quoted = STRBUF_INIT;
-	int nr_loose = 0;
-	int nr_packfile = 0;
-	int err = 0;
-
-	trace2_region_enter("gh-client", "get", the_repository);

 	gh_client__choose_odb();

@@ -285,6 +330,11 @@ static int gh_client__get(enum gh_client__created *p_ghc)

 	sq_quote_argv_pretty(&quoted, argv.argv);

+	/*
+	 * Find an existing long-running process with the above command
+	 * line -or- create a new long-running process for this and
+	 * subsequent 'get' requests.
+	 */
 	if (!gh_server__subprocess_map_initialized) {
 		gh_server__subprocess_map_initialized = 1;
 		hashmap_init(&gh_server__subprocess_map,
@@ -298,70 +348,24 @@ static int gh_client__get(enum gh_client__created *p_ghc)
 		entry = xmalloc(sizeof(*entry));
 		entry->supported_capabilities = 0;

-		err = subprocess_start_argv(
-			&gh_server__subprocess_map, &entry->subprocess, 1,
-			&argv, gh_client__start_fn);
-		if (err) {
-			free(entry);
-			goto leave_region;
-		}
+		if (subprocess_start_argv(&gh_server__subprocess_map,
+					  &entry->subprocess, 1,
+					  &argv, gh_client__start_fn))
+			FREE_AND_NULL(entry);
 	}

-	process = &entry->subprocess.process;
-
-	if (!(CAP_GET & entry->supported_capabilities)) {
-		error("gvfs-helper: does not support GET");
+	if (entry &&
+	    (entry->supported_capabilities & cap_needed) != cap_needed) {
+		error("gvfs-helper: does not support needed capabilities");
 		subprocess_stop(&gh_server__subprocess_map,
 				(struct subprocess_entry *)entry);
-		free(entry);
-		err = -1;
-		goto leave_region;
+		FREE_AND_NULL(entry);
 	}

-	sigchain_push(SIGPIPE, SIG_IGN);
-
-	err = gh_client__get__send_command(process);
-	if (!err)
-		err = gh_client__get__receive_response(process, p_ghc,
-						       &nr_loose, &nr_packfile);
-
-	sigchain_pop(SIGPIPE);
-
-	if (err) {
-		subprocess_stop(&gh_server__subprocess_map,
-				(struct subprocess_entry *)entry);
-		free(entry);
-	}
-
-leave_region:
 	argv_array_clear(&argv);
 	strbuf_release(&quoted);

-	trace2_data_intmax("gh-client", the_repository,
-			   "get/immediate", gh_client__includes_immediate);
-
-	trace2_data_intmax("gh-client", the_repository,
-			   "get/nr_objects", gh_client__oidset_count);
-
-	if (nr_loose)
-		trace2_data_intmax("gh-client", the_repository,
-				   "get/nr_loose", nr_loose);
-
-	if (nr_packfile)
-		trace2_data_intmax("gh-client", the_repository,
-				   "get/nr_packfile", nr_packfile);
-
-	if (err)
-		trace2_data_intmax("gh-client", the_repository,
-				   "get/error", err);
-
-	trace2_region_leave("gh-client", "get", the_repository);
-
-	oidset_clear(&gh_client__oidset_queued);
-	gh_client__oidset_count = 0;
-	gh_client__includes_immediate = 0;
-
-	return err;
+	return entry;
 }

 void gh_client__queue_oid(const struct object_id *oid)
@@ -388,28 +392,97 @@ void gh_client__queue_oid_array(const struct object_id *oids, int oid_nr)
gh_client__queue_oid_array(const struct object_id *oids, int oid_nr) gh_client__queue_oid(&oids[k]); } +/* + * Bulk fetch all of the queued OIDs in the OIDSET. + */ int gh_client__drain_queue(enum gh_client__created *p_ghc) { + struct gh_server__process *entry; + struct child_process *process; + int nr_loose = 0; + int nr_packfile = 0; + int err = 0; + *p_ghc = GHC__CREATED__NOTHING; if (!gh_client__oidset_count) return 0; - return gh_client__get(p_ghc); + entry = gh_client__find_long_running_process(CAP_OBJECTS); + if (!entry) + return -1; + + trace2_region_enter("gh-client", "objects/post", the_repository); + + process = &entry->subprocess.process; + + sigchain_push(SIGPIPE, SIG_IGN); + + err = gh_client__send__objects_post(process); + if (!err) + err = gh_client__objects__receive_response( + process, p_ghc, &nr_loose, &nr_packfile); + + sigchain_pop(SIGPIPE); + + if (err) { + subprocess_stop(&gh_server__subprocess_map, + (struct subprocess_entry *)entry); + FREE_AND_NULL(entry); + } + + trace2_data_intmax("gh-client", the_repository, + "objects/post/nr_objects", gh_client__oidset_count); + trace2_region_leave("gh-client", "objects/post", the_repository); + + oidset_clear(&gh_client__oidset_queued); + gh_client__oidset_count = 0; + + return err; } +/* + * Get exactly 1 object immediately. + * Ignore any queued objects. + */ int gh_client__get_immediate(const struct object_id *oid, enum gh_client__created *p_ghc) { - gh_client__includes_immediate = 1; + struct gh_server__process *entry; + struct child_process *process; + int nr_loose = 0; + int nr_packfile = 0; + int err = 0; // TODO consider removing this trace2. it is useful for interactive // TODO debugging, but may generate way too much noise for a data // TODO event. trace2_printf("gh_client__get_immediate: %s", oid_to_hex(oid)); - if (!oidset_insert(&gh_client__oidset_queued, oid)) - gh_client__oidset_count++; + entry = gh_client__find_long_running_process(CAP_OBJECTS); + if (!entry) + return -1; + + trace2_region_enter("gh-client", "objects/get", the_repository); - return gh_client__drain_queue(p_ghc); + process = &entry->subprocess.process; + + sigchain_push(SIGPIPE, SIG_IGN); + + err = gh_client__send__objects_get(process, oid); + if (!err) + err = gh_client__objects__receive_response( + process, p_ghc, &nr_loose, &nr_packfile); + + sigchain_pop(SIGPIPE); + + if (err) { + subprocess_stop(&gh_server__subprocess_map, + (struct subprocess_entry *)entry); + FREE_AND_NULL(entry); + } + + trace2_region_leave("gh-client", "objects/get", the_repository); + + return err; } diff --git a/gvfs-helper-client.h b/gvfs-helper-client.h index 5f5e824bb0771a..a5a951ff5b5bfe 100644 --- a/gvfs-helper-client.h +++ b/gvfs-helper-client.h @@ -32,13 +32,14 @@ enum gh_client__created { }; /* - * Ask `gvfs-helper server` to immediately fetch an object. - * Wait for the response. + * Ask `gvfs-helper server` to immediately fetch a single object + * using "/gvfs/objects" GET semantics. * - * This may also fetch any queued (non-immediate) objects and - * so may create one or more loose objects and/or packfiles. - * It is undefined whether the requested OID will be loose or - * in a packfile. + * A long-running background process is used to make subsequent + * requests more efficient. + * + * A loose object will be created in the shared-cache ODB and + * in-memory cache updated. 
 */
 int gh_client__get_immediate(const struct object_id *oid,
			     enum gh_client__created *p_ghc);
 
@@ -47,16 +48,21 @@ int gh_client__get_immediate(const struct object_id *oid,
  * Queue this OID for a future fetch using `gvfs-helper service`.
  * It does not wait.
  *
- * The GHC layer is free to process this queue in any way it wants,
- * including individual fetches, bulk fetches, and batching.  And
- * it may add queued objects to immediate requests.
- *
  * Callers should not rely on the queued object being on disk until
  * the queue has been drained.
  */
 void gh_client__queue_oid(const struct object_id *oid);
 void gh_client__queue_oid_array(const struct object_id *oids, int oid_nr);
 
+/*
+ * Ask `gvfs-helper server` to fetch the set of queued OIDs using
+ * "/gvfs/objects" POST semantics.
+ *
+ * A long-running background process is used to make subsequent
+ * requests more efficient.
+ *
+ * One or more packfiles will be created in the shared-cache ODB.
+ */
 int gh_client__drain_queue(enum gh_client__created *p_ghc);
 
 #endif /* GVFS_HELPER_CLIENT_H */
diff --git a/gvfs-helper.c b/gvfs-helper.c
index f13eace2c3cca0..34d98ff3807774 100644
--- a/gvfs-helper.c
+++ b/gvfs-helper.c
@@ -46,7 +46,28 @@
 //
 //     get
 //
-//          Fetch 1 or more objects.  If a cache-server is configured,
+//          Fetch 1 or more objects one at a time using a "/gvfs/objects"
+//          GET request.
+//
+//          If a cache-server is configured,
+//          try it first.  Optionally fallback to the main Git server.
+//
+//          The set of objects is given on stdin and is assumed to be
+//          a list of <oid>, one per line.
+//
+//          <get-options>:
+//
+//               --max-retries=<n>      // defaults to "6"
+//
+//                     Number of retries after transient network errors.
+//                     Set to zero to disable such retries.
+//
+//     post
+//
+//          Fetch 1 or more objects in bulk using a "/gvfs/objects" POST
+//          request.
+//
+//          If a cache-server is configured,
 //          try it first.  Optionally fallback to the main Git server.
 //
 //          The set of objects is given on stdin and is assumed to be
@@ -78,7 +99,8 @@
 //               --block-size=<n>       // defaults to "4000"
 //
 //                     Request objects from server in batches of at
-//                     most n objects (not bytes).
+//                     most n objects (not bytes) when using POST
+//                     requests.
 //
 //               --depth=<depth>        // defaults to "1"
 //
@@ -87,17 +109,27 @@
 //                     Number of retries after transient network errors.
 //                     Set to zero to disable such retries.
 //
-//     Interactive verb: get
+//     Interactive verb: objects.get
+//
+//          Fetch 1 or more objects, one at a time, using
+//          "/gvfs/objects" GET requests.
+//
+//          Each object will be created as a loose object in the ODB.
+//
+//     Interactive verb: objects.post
 //
-//          Fetch 1 or more objects.  If a cache-server is configured,
-//          try it first.  Optionally fallback to the main Git server.
+//          Fetch 1 or more objects, in bulk, using one or more
+//          "/gvfs/objects" POST requests.
+//
+//          For both verbs, if a cache-server is configured, try it first.
+//          Optionally fallback to the main Git server.
 //
 //          Create 1 or more loose objects and/or packfiles in the
 //          shared-cache ODB.  (The pathname of the selected ODB is
 //          reported at the beginning of the response; this should
 //          match the pathname given on the command line).
 //
-//          git> get
+//          git> objects.get | objects.post
 //          git> <oid>
 //          git> <oid>
 //          git> ...
@@ -116,20 +148,6 @@
 //     [2] Documentation/technical/long-running-process-protocol.txt
 //     [3] See GIT_TRACE_PACKET
 //
-// Example:
-//
-//     $ git -c core.virtualizeobjects=false -c core.usegvfshelper=false
-//           rev-list --objects --no-walk --missing=print HEAD
-//     | grep "^?"
-//     | sed 's/^?//'
-//     | git gvfs-helper get-missing
-//
-// Note: In this example, we need to turn off "core.virtualizeobjects" and
-//       "core.usegvfshelper" when building the list of objects.  This prevents
-//       rev-list (in oid_object_info_extended() from automatically fetching
-//       them with read-object-hook or "gvfs-helper server" sub-process (and
-//       defeating the whole purpose of this example).
-//
 //////////////////////////////////////////////////////////////////
 
 #include "cache.h"
@@ -162,15 +180,21 @@
 static const char * const main_usage[] = {
	N_("git gvfs-helper [<main_options>] config [<options>]"),
	N_("git gvfs-helper [<main_options>] get [<options>]"),
+	N_("git gvfs-helper [<main_options>] post [<options>]"),
	N_("git gvfs-helper [<main_options>] server [<options>]"),
	NULL
 };
 
-static const char *const get_usage[] = {
+static const char *const objects_get_usage[] = {
	N_("git gvfs-helper [<main_options>] get [<options>]"),
	NULL
 };
 
+static const char *const objects_post_usage[] = {
+	N_("git gvfs-helper [<main_options>] post [<options>]"),
+	NULL
+};
+
 static const char *const server_usage[] = {
	N_("git gvfs-helper [<main_options>] server [<options>]"),
	NULL
@@ -179,12 +203,12 @@ static const char *const server_usage[] = {
 /*
  * "commitDepth" field in gvfs protocol
  */
-#define GH__DEFAULT_COMMIT_DEPTH 1
+#define GH__DEFAULT__OBJECTS_POST__COMMIT_DEPTH 1
 
 /*
  * Chunk/block size in number of objects we request in each packfile
  */
-#define GH__DEFAULT_BLOCK_SIZE 4000
+#define GH__DEFAULT__OBJECTS_POST__BLOCK_SIZE 4000
 
 /*
  * Retry attempts (after the initial request) for transient errors and 429s.
@@ -278,6 +302,28 @@ static const char *gh__server_type_label[GH__SERVER_TYPE__NR] = {
	"(cs)"
 };
 
+enum gh__objects_mode {
+	/*
+	 * Bulk fetch objects.
+	 *
+	 * But also, force the use of HTTP POST regardless of how many
+	 * objects we are requesting.
+	 *
+	 * The GVFS Protocol treats requests for commit objects
+	 * differently in GET and POST requests WRT whether it
+	 * automatically also fetches the referenced trees.
+	 */
+	GH__OBJECTS_MODE__POST,
+
+	/*
+	 * Fetch objects one at a time using HTTP GET.
+	 *
+	 * Force the use of GET (primarily because of the commit
+	 * object treatment).
+	 */
+	GH__OBJECTS_MODE__GET,
+};
+
 struct gh__azure_throttle {
	unsigned long tstu_limit;
@@ -333,7 +379,20 @@ enum gh__progress_state {
  * Parameters to drive an HTTP request (with any necessary retries).
  */
 struct gh__request_params {
-	int b_is_post;            /* POST=1 or GET=0 */
+	/*
+	 * b_is_post indicates if the current HTTP request is a POST=1 or
+	 * a GET=0.  This is a lower level field used to set up CURL and
+	 * the tempfile used to receive the content.
+	 *
+	 * It is related to, but different from the GH__OBJECTS_MODE__
+	 * field that we present to the gvfs-helper client or in the CLI
+	 * (which only concerns the semantics of the /gvfs/objects protocol
+	 * on the set of requested OIDs).
+	 *
+	 * For example, we use an HTTP GET to get the /gvfs/config data
+	 * into a buffer.
+ */ + int b_is_post; int b_write_to_file; /* write to file=1 or strbuf=0 */ int b_permit_cache_server_if_defined; @@ -496,8 +555,6 @@ enum gh__retry_mode { struct gh__response_status { struct strbuf error_message; struct strbuf content_type; - long response_code; /* http response code */ - CURLcode curl_code; enum gh__error_code ec; enum gh__retry_mode retry; intmax_t bytes_received; @@ -507,8 +564,6 @@ struct gh__response_status { #define GH__RESPONSE_STATUS_INIT { \ .error_message = STRBUF_INIT, \ .content_type = STRBUF_INIT, \ - .response_code = 0, \ - .curl_code = CURLE_OK, \ .ec = GH__ERROR_CODE__OK, \ .retry = GH__RETRY_MODE__SUCCESS, \ .bytes_received = 0, \ @@ -519,8 +574,6 @@ static void gh__response_status__zero(struct gh__response_status *s) { strbuf_setlen(&s->error_message, 0); strbuf_setlen(&s->content_type, 0); - s->response_code = 0; - s->curl_code = CURLE_OK; s->ec = GH__ERROR_CODE__OK; s->retry = GH__RETRY_MODE__SUCCESS; s->bytes_received = 0; @@ -574,15 +627,14 @@ static void log_e2eid(struct gh__request_params *params, } /* - * Normalize a few error codes before we try to decide + * Normalize a few HTTP response codes before we try to decide * how to dispatch on them. */ -static void gh__response_status__normalize_odd_codes( - struct gh__request_params *params, - struct gh__response_status *status) +static long gh__normalize_odd_codes(struct gh__request_params *params, + long http_response_code) { if (params->server_type == GH__SERVER_TYPE__CACHE && - status->response_code == 400) { + http_response_code == 400) { /* * The cache-server sends a somewhat bogus 400 instead of * the normal 401 when AUTH is required. Fixup the status @@ -594,16 +646,18 @@ static void gh__response_status__normalize_odd_codes( * TODO 401 for now. We should confirm the expected * TODO error message in the response-body. */ - status->response_code = 401; + return 401; } - if (status->response_code == 203) { + if (http_response_code == 203) { /* * A proxy server transformed a 200 from the origin server * into a 203. We don't care about the subtle distinction. */ - status->response_code = 200; + return 200; } + + return http_response_code; } /* @@ -613,9 +667,10 @@ static void gh__response_status__normalize_odd_codes( * https://docs.microsoft.com/en-us/azure/devops/integrate/concepts/rate-limits?view=azure-devops */ static void compute_retry_mode_from_http_response( - struct gh__response_status *status) + struct gh__response_status *status, + long http_response_code) { - switch (status->response_code) { + switch (http_response_code) { case 200: status->retry = GH__RETRY_MODE__SUCCESS; @@ -687,7 +742,7 @@ static void compute_retry_mode_from_http_response( hard_fail: strbuf_addf(&status->error_message, "(http:%d) Other [hard_fail]", - (int)status->response_code); + (int)http_response_code); status->retry = GH__RETRY_MODE__HARD_FAIL; status->ec = GH__ERROR_CODE__HTTP_OTHER; @@ -713,9 +768,10 @@ static void compute_retry_mode_from_http_response( * so I'm not going to fight that. 
 */
 static void compute_retry_mode_from_curl_error(
-	struct gh__response_status *status)
+	struct gh__response_status *status,
+	CURLcode curl_code)
 {
-	switch (status->curl_code) {
+	switch (curl_code) {
	case CURLE_OK:
		status->retry = GH__RETRY_MODE__SUCCESS;
		status->ec = GH__ERROR_CODE__OK;
@@ -820,8 +876,7 @@ static void compute_retry_mode_from_curl_error(
 hard_fail:
	strbuf_addf(&status->error_message, "(curl:%d) %s [hard_fail]",
-		    status->curl_code,
-		    curl_easy_strerror(status->curl_code));
+		    curl_code, curl_easy_strerror(curl_code));
	status->retry = GH__RETRY_MODE__HARD_FAIL;
	status->ec = GH__ERROR_CODE__CURL_ERROR;
 
@@ -831,8 +886,7 @@ static void compute_retry_mode_from_curl_error(
 transient:
	strbuf_addf(&status->error_message, "(curl:%d) %s [transient]",
-		    status->curl_code,
-		    curl_easy_strerror(status->curl_code));
+		    curl_code, curl_easy_strerror(curl_code));
	status->retry = GH__RETRY_MODE__TRANSIENT;
	status->ec = GH__ERROR_CODE__CURL_ERROR;
 
@@ -851,26 +905,31 @@ static void gh__response_status__set_from_slot(
	struct gh__response_status *status,
	const struct active_request_slot *slot)
 {
-	status->curl_code = slot->results->curl_result;
+	long http_response_code;
+	CURLcode curl_code;
+
+	curl_code = slot->results->curl_result;
	gh__curlinfo_strbuf(slot->curl, CURLINFO_CONTENT_TYPE,
			    &status->content_type);
	curl_easy_getinfo(slot->curl, CURLINFO_RESPONSE_CODE,
-			  &status->response_code);
+			  &http_response_code);
 
	strbuf_setlen(&status->error_message, 0);
 
-	gh__response_status__normalize_odd_codes(params, status);
+	http_response_code = gh__normalize_odd_codes(params,
						     http_response_code);
 
	/*
	 * Use normalized response/status codes from curl/http to decide
	 * how to set the error-code we propagate *AND* to decide if
	 * we should retry because of transient network problems.
	 */
-	if (status->curl_code == CURLE_OK ||
-	    status->curl_code == CURLE_HTTP_RETURNED_ERROR)
-		compute_retry_mode_from_http_response(status);
+	if (curl_code == CURLE_OK ||
+	    curl_code == CURLE_HTTP_RETURNED_ERROR)
+		compute_retry_mode_from_http_response(status,
+						      http_response_code);
	else
-		compute_retry_mode_from_curl_error(status);
+		compute_retry_mode_from_curl_error(status, curl_code);
 
	if (status->ec != GH__ERROR_CODE__OK)
		status->bytes_received = 0;
@@ -1028,8 +1087,8 @@ static void gh__run_one_slot(struct active_request_slot *slot,
	trace2_region_enter("gvfs-helper", key.buf, NULL);
 
	if (!start_active_slot(slot)) {
-		status->curl_code = CURLE_FAILED_INIT; /* a bit of a lie */
-		compute_retry_mode_from_curl_error(status);
+		compute_retry_mode_from_curl_error(status,
+						   CURLE_FAILED_INIT);
	} else {
		run_active_slot(slot);
		if (params->b_write_to_file)
@@ -1060,7 +1119,7 @@ static void gh__run_one_slot(struct active_request_slot *slot,
	stop_progress(&params->progress);
 
	if (status->ec == GH__ERROR_CODE__OK && params->b_write_to_file) {
-		if (params->b_is_post)
+		if (params->b_is_post && params->object_count > 1)
			install_packfile(params, status);
		else
			install_loose(params, status);
@@ -1216,8 +1275,8 @@ static void lookup_main_url(void)
	trace2_data_string("gvfs-helper", NULL, "remote/url", gh__global.main_url);
 }
 
-static void do__gvfs_config(struct gh__response_status *status,
-			    struct strbuf *config_data);
+static void do__http_get__gvfs_config(struct gh__response_status *status,
+				      struct strbuf *config_data);
 
 /*
  * Find the URL of the cache-server, if we have one.
@@ -1276,7 +1335,7 @@ static void select_cache_server(void)
	 * well-known by the main Git server.
	 */
-	do__gvfs_config(&status, &config_data);
+	do__http_get__gvfs_config(&status, &config_data);
 
	if (status.ec == GH__ERROR_CODE__OK) {
		/*
@@ -1334,13 +1393,10 @@ static void select_cache_server(void)
  * Read stdin until EOF (or a blank line) and add the desired OIDs
  * to the oidset.
  *
- * Stdin should contain a list of OIDs.  It may have additional
- * decoration that we need to strip out.
- *
- * We expect:
- *     <oid> [<path>]   // present OIDs
+ * Stdin should contain a list of OIDs.  Lines may have additional
+ * text following the OID that we ignore.
  */
-static unsigned long read_stdin_from_rev_list(struct oidset *oids)
+static unsigned long read_stdin_for_oids(struct oidset *oids)
 {
	struct object_id oid;
	struct strbuf buf_stdin = STRBUF_INIT;
@@ -1362,17 +1418,23 @@ static unsigned long read_stdin_from_rev_list(struct oidset *oids)
 
 /*
  * Build a complete JSON payload for a gvfs/objects POST request
- * containing the first n OIDs in an OIDSET index by the iterator.
+ * containing the first `nr_in_block` OIDs found in the OIDSET
+ * indexed by the given iterator.
  *
  * https://github.com/microsoft/VFSForGit/blob/master/Protocol.md
+ *
+ * Return the number of OIDs we actually put into the payload.
+ * If only 1 OID was found, also return it.
  */
 static unsigned long build_json_payload__gvfs_objects(
	struct json_writer *jw_req,
	struct oidset_iter *iter,
-	unsigned long nr_in_block)
+	unsigned long nr_in_block,
+	struct object_id *oid_out)
 {
	unsigned long k;
	const struct object_id *oid;
+	const struct object_id *oid_prev = NULL;
 
	k = 0;
 
@@ -1383,10 +1445,18 @@ static unsigned long build_json_payload__gvfs_objects(
	while (k < nr_in_block && (oid = oidset_iter_next(iter))) {
		jw_array_string(jw_req, oid_to_hex(oid));
		k++;
+		oid_prev = oid;
	}
	jw_end(jw_req);
	jw_end(jw_req);
 
+	if (oid_out) {
+		if (k == 1)
+			oidcpy(oid_out, oid_prev);
+		else
+			oidclr(oid_out);
+	}
+
	return k;
 }
 
@@ -1627,6 +1697,33 @@ static void create_tempfile_for_packfile(
	strbuf_release(&basename);
 }
 
+/*
+ * Create a pathname to the loose object in the shared-cache ODB
+ * with the given OID.  Try to "mkdir -p" to ensure the parent
+ * directories exist.
+ */
+static int create_loose_pathname_in_odb(struct strbuf *buf_path,
+					const struct object_id *oid)
+{
+	enum scld_error scld;
+	const char *hex;
+
+	hex = oid_to_hex(oid);
+
+	strbuf_setlen(buf_path, 0);
+	strbuf_addbuf(buf_path, &gh__global.buf_odb_path);
+	strbuf_complete(buf_path, '/');
+	strbuf_add(buf_path, hex, 2);
+	strbuf_addch(buf_path, '/');
+	strbuf_addstr(buf_path, hex+2);
+
+	scld = safe_create_leading_directories(buf_path->buf);
+	if (scld != SCLD_OK && scld != SCLD_EXISTS)
+		return -1;
+
+	return 0;
+}
+
 /*
  * Create a tempfile to stream a loose object into.
  *
@@ -1641,19 +1738,10 @@ static void create_tempfile_for_loose(
 {
	static int nth = 0;
	struct strbuf buf_path = STRBUF_INIT;
-	const char *hex;
 
	gh__response_status__zero(status);
 
-	hex = oid_to_hex(&params->loose_oid);
-
-	strbuf_addbuf(&buf_path, &gh__global.buf_odb_path);
-	strbuf_complete(&buf_path, '/');
-	strbuf_add(&buf_path, hex, 2);
-
-	if (!file_exists(buf_path.buf) &&
-	    mkdir(buf_path.buf, 0777) == -1 &&
-	    !file_exists(buf_path.buf)) {
+	if (create_loose_pathname_in_odb(&buf_path, &params->loose_oid)) {
		strbuf_addf(&status->error_message,
			    "cannot create directory for loose object '%s'",
			    buf_path.buf);
@@ -1661,9 +1749,6 @@ static void create_tempfile_for_loose(
		goto cleanup;
	}
 
-	strbuf_addch(&buf_path, '/');
-	strbuf_addstr(&buf_path, hex+2);
-
	/* Remember the full path of the final destination.
	 */
	strbuf_setlen(&params->loose_path, 0);
	strbuf_addbuf(&params->loose_path, &buf_path);
 
@@ -1705,7 +1790,7 @@ static void install_packfile(struct gh__request_params *params,
	if (strcmp(status->content_type.buf,
		   "application/x-git-packfile")) {
		strbuf_addf(&status->error_message,
-			    "received unknown content-type '%s'",
+			    "install_packfile: received unknown content-type '%s'",
			    status->content_type.buf);
		status->ec = GH__ERROR_CODE__UNEXPECTED_CONTENT_TYPE;
		goto cleanup;
@@ -1783,6 +1868,19 @@ static void install_loose(struct gh__request_params *params,
 {
	struct strbuf tmp_path = STRBUF_INIT;
 
+	/*
+	 * We expect a loose object when we do a GET -or- when we
+	 * do a POST with only 1 object.
+	 */
+	if (strcmp(status->content_type.buf,
+		   "application/x-git-loose-object")) {
+		strbuf_addf(&status->error_message,
+			    "install_loose: received unknown content-type '%s'",
+			    status->content_type.buf);
+		status->ec = GH__ERROR_CODE__UNEXPECTED_CONTENT_TYPE;
+		return;
+	}
+
	gh__response_status__zero(status);
 
	/*
@@ -1879,10 +1977,8 @@ static size_t parse_resp_hdr(char *buffer, size_t size, size_t nitems,
	 * The following X- headers are specific to AzureDevOps.
	 * Other servers have similar sets of values, but I haven't
	 * compared them in depth.
-	 *
-	 * TODO Remove this.
	 */
-	trace2_printf("Throttle: %s %s", key.buf, val.buf);
+	// trace2_printf("Throttle: %s %s", key.buf, val.buf);
 
	if (!strcmp(key.buf, "X-RateLimit-Resource")) {
		/*
@@ -2104,10 +2200,11 @@ static void do_req(const char *url_base,
		if (params->tempfile)
			delete_tempfile(&params->tempfile);
 
-		if (params->b_is_post)
+		if (params->b_is_post && params->object_count > 1)
			create_tempfile_for_packfile(params, status);
		else
			create_tempfile_for_loose(params, status);
+
		if (!params->tempfile || status->ec != GH__ERROR_CODE__OK)
			return;
	} else {
@@ -2308,7 +2405,7 @@ static void do_req__to_main(const char *url_component,
				    &gh__global.main_creds,
				    params, status);
 
-	if (status->response_code == 401) {
+	if (status->retry == GH__RETRY_MODE__HTTP_401) {
		refresh_main_creds();
 
		do_req__with_robust_retry(gh__global.main_url, url_component,
@@ -2316,7 +2413,7 @@ static void do_req__to_main(const char *url_component,
					  params, status);
	}
 
-	if (status->response_code == 200)
+	if (status->retry == GH__RETRY_MODE__SUCCESS)
		approve_main_creds();
 }
 
@@ -2332,7 +2429,7 @@ static void do_req__to_cache_server(const char *url_component,
				    &gh__global.cache_creds,
				    params, status);
 
-	if (status->response_code == 401) {
+	if (status->retry == GH__RETRY_MODE__HTTP_401) {
		refresh_cache_server_creds();
 
		do_req__with_robust_retry(gh__global.cache_server_url,
@@ -2341,7 +2438,7 @@ static void do_req__to_cache_server(const char *url_component,
					  params, status);
	}
 
-	if (status->response_code == 200)
+	if (status->retry == GH__RETRY_MODE__SUCCESS)
		approve_cache_server_creds();
 }
 
@@ -2356,7 +2453,7 @@ static void do_req__with_fallback(const char *url_component,
	    params->b_permit_cache_server_if_defined) {
		do_req__to_cache_server(url_component, params, status);
 
-		if (status->response_code == 200)
+		if (status->retry == GH__RETRY_MODE__SUCCESS)
			return;
 
		if (!gh__cmd_opts.try_fallback)
@@ -2371,7 +2468,7 @@ static void do_req__with_fallback(const char *url_component,
		 * Falling-back would likely just cause the 3rd (or maybe
		 * 4th) cred prompt.
		 */
-		if (status->response_code == 401)
+		if (status->retry == GH__RETRY_MODE__HTTP_401)
			return;
	}
 
@@ -2383,8 +2480,8 @@ static void do_req__with_fallback(const char *url_component,
  *
  * Return server's response buffer.  This is probably a raw JSON string.
  */
-static void do__gvfs_config(struct gh__response_status *status,
-			    struct strbuf *config_data)
+static void do__http_get__gvfs_config(struct gh__response_status *status,
+				      struct strbuf *config_data)
 {
	struct gh__request_params params = GH__REQUEST_PARAMS_INIT;
@@ -2424,12 +2521,34 @@
	gh__request_params__release(&params);
 }
 
+static void setup_gvfs_objects_progress(struct gh__request_params *params,
+					unsigned long num, unsigned long den)
+{
+	if (!gh__cmd_opts.show_progress)
+		return;
+
+	if (params->b_is_post && params->object_count > 1) {
+		strbuf_addf(&params->progress_base_phase2_msg,
+			    "Requesting packfile %ld/%ld with %ld objects",
+			    num, den, params->object_count);
+		strbuf_addf(&params->progress_base_phase3_msg,
+			    "Receiving packfile %ld/%ld with %ld objects",
+			    num, den, params->object_count);
+	} else {
+		strbuf_addf(&params->progress_base_phase3_msg,
+			    "Receiving %ld/%ld loose object",
+			    num, den);
+	}
+}
+
 /*
  * Call "gvfs/objects/<oid>" REST API to fetch a loose object
  * and write it to the ODB.
  */
-static void do__loose__gvfs_object(struct gh__response_status *status,
-				   const struct object_id *oid)
+static void do__http_get__gvfs_object(struct gh__response_status *status,
+				      const struct object_id *oid,
+				      unsigned long l_num, unsigned long l_den,
+				      struct string_list *result_list)
 {
	struct gh__request_params params = GH__REQUEST_PARAMS_INIT;
	struct strbuf component_url = STRBUF_INIT;
@@ -2454,44 +2573,52 @@ static void do__loose__gvfs_object(struct gh__response_status *status,
 
	oidcpy(&params.loose_oid, oid);
 
-	if (gh__cmd_opts.show_progress) {
-		/*
-		 * Likewise, a gvfs/objects/{oid} has a very small reqest
-		 * payload, so I don't see any need to report progress on
-		 * the upload side of the GET.  So just report progress
-		 * on the download side.
-		 */
-		strbuf_addstr(&params.progress_base_phase3_msg,
-			      "Receiving 1 loose object");
-	}
+	setup_gvfs_objects_progress(&params, l_num, l_den);
 
	do_req__with_fallback(component_url.buf, &params, status);
 
+	if (status->ec == GH__ERROR_CODE__OK) {
+		struct strbuf msg = STRBUF_INIT;
+
+		strbuf_addf(&msg, "loose %s",
+			    oid_to_hex(&params.loose_oid));
+
+		string_list_append(result_list, msg.buf);
+		strbuf_release(&msg);
+	}
+
	gh__request_params__release(&params);
	strbuf_release(&component_url);
 }
 
 /*
- * Call "gvfs/objects" POST REST API to fetch a packfile containing
- * the objects in the requested OIDSET.  Returns the filename (not
- * pathname) to the new packfile.
+ * Call "gvfs/objects" POST REST API to fetch a batch of objects
+ * from the OIDSET.  Normally, this results in a packfile containing
+ * `nr_wanted_in_block` objects.  We return the number actually
+ * consumed (along with the filename of the resulting packfile).
+ *
+ * However, if we only have 1 oid (remaining) in the OIDSET, the
+ * server will respond to our POST with a loose object rather than
+ * a packfile with 1 object.
+ *
+ * Append a message to the result_list describing the result.
+ *
+ * Return the number of OIDs consumed from the OIDSET.
  */
-static void do__packfile__gvfs_objects(struct gh__response_status *status,
-				       struct oidset_iter *iter,
-				       unsigned long nr_wanted_in_block,
-				       int j_pack_num, int j_pack_den,
-				       struct strbuf *output_filename,
-				       unsigned long *nr_oid_taken)
+static void do__http_post__gvfs_objects(struct gh__response_status *status,
+					struct oidset_iter *iter,
+					unsigned long nr_wanted_in_block,
+					int j_pack_num, int j_pack_den,
+					struct string_list *result_list,
+					unsigned long *nr_oid_taken)
 {
	struct json_writer jw_req = JSON_WRITER_INIT;
	struct gh__request_params params = GH__REQUEST_PARAMS_INIT;
 
-	strbuf_setlen(output_filename, 0);
-
	gh__response_status__zero(status);
 
	params.object_count = build_json_payload__gvfs_objects(
-		&jw_req, iter, nr_wanted_in_block);
+		&jw_req, iter, nr_wanted_in_block, &params.loose_oid);
	*nr_oid_taken = params.object_count;
 
	strbuf_addstr(&params.tr2_label, "POST/objects");
@@ -2511,7 +2638,7 @@ static void do__packfile__gvfs_objects(struct gh__response_status *status,
					   "Content-Type: application/json");
	/*
	 * We really always want a packfile.  But if the payload only
-	 * requests 1 OID, the server will/may send us a single loose
+	 * requests 1 OID, the server will send us a single loose
	 * object instead.  (Apparently the server ignores us when we
	 * only send application/x-git-packfile and does it anyway.)
	 *
@@ -2523,156 +2650,172 @@ static void do__packfile__gvfs_objects(struct gh__response_status *status,
	params.headers = curl_slist_append(params.headers,
					   "Accept: application/x-git-loose-object");
 
-	if (gh__cmd_opts.show_progress) {
-		strbuf_addf(&params.progress_base_phase2_msg,
-			    "Requesting packfile %d/%d with %ld objects",
-			    j_pack_num, j_pack_den,
-			    params.object_count);
-		strbuf_addf(&params.progress_base_phase3_msg,
-			    "Receiving packfile %d/%d with %ld objects",
-			    j_pack_num, j_pack_den,
-			    params.object_count);
-	}
+	setup_gvfs_objects_progress(&params, j_pack_num, j_pack_den);
 
	do_req__with_fallback("gvfs/objects", &params, status);
-	if (status->ec == GH__ERROR_CODE__OK)
-		strbuf_addbuf(output_filename, &params.final_packfile_filename);
+
+	if (status->ec == GH__ERROR_CODE__OK) {
+		struct strbuf msg = STRBUF_INIT;
+
+		if (params.object_count > 1)
+			strbuf_addf(&msg, "packfile %s",
+				    params.final_packfile_filename.buf);
+		else
+			strbuf_addf(&msg, "loose %s",
+				    oid_to_hex(&params.loose_oid));
+
+		string_list_append(result_list, msg.buf);
+		strbuf_release(&msg);
+	}
 
	gh__request_params__release(&params);
	jw_release(&jw_req);
 }
 
 /*
- * Bulk or individually fetch a list of objects in one or more http requests.
- * Create one or more packfiles and/or loose objects.
+ * Drive one or more HTTP GET requests to fetch the objects
+ * in the given OIDSET.  These are received into loose objects.
  *
- * We accumulate results for each request in `result_list` until we get a
+ * Accumulate results for each request in `result_list` until we get a
  * hard error and have to stop.
*/ -static void do_fetch_oidset(struct gh__response_status *status, - struct oidset *oids, - unsigned long nr_oid_total, - struct string_list *result_list) +static void do__http_get__fetch_oidset(struct gh__response_status *status, + struct oidset *oids, + unsigned long nr_oid_total, + struct string_list *result_list) { struct oidset_iter iter; - struct strbuf output_filename = STRBUF_INIT; - struct strbuf msg = STRBUF_INIT; struct strbuf err404 = STRBUF_INIT; const struct object_id *oid; unsigned long k; - unsigned long nr_oid_taken; int had_404 = 0; - int j_pack_den = 0; - int j_pack_num = 0; gh__response_status__zero(status); if (!nr_oid_total) return; - if (nr_oid_total > 1) - j_pack_den = ((nr_oid_total + gh__cmd_opts.block_size - 1) - / gh__cmd_opts.block_size); - oidset_iter_init(oids, &iter); - for (k = 0; k < nr_oid_total; k += nr_oid_taken) { - if (nr_oid_total - k == 1 || gh__cmd_opts.block_size == 1) { - oid = oidset_iter_next(&iter); - nr_oid_taken = 1; + for (k = 0; k < nr_oid_total; k++) { + oid = oidset_iter_next(&iter); - do__loose__gvfs_object(status, oid); + do__http_get__gvfs_object(status, oid, k+1, nr_oid_total, + result_list); + /* + * If we get a 404 for an individual object, ignore + * it and get the rest. We'll fixup the 'ec' later. + */ + if (status->ec == GH__ERROR_CODE__HTTP_404) { + if (!err404.len) + strbuf_addf(&err404, "%s: from GET %s", + status->error_message.buf, + oid_to_hex(oid)); /* - * If we get a 404 for an individual object, ignore - * it and get the rest. We'll fixup the 'ec' later. + * Mark the fetch as "incomplete", but don't + * stop trying to get other chunks. */ - if (status->ec == GH__ERROR_CODE__HTTP_404) { - if (!err404.len) - strbuf_addf(&err404, "%s: loose object %s", - status->error_message.buf, - oid_to_hex(oid)); - /* - * Mark the fetch as "incomplete", but don't - * stop trying to get other chunks. - */ - had_404 = 1; - continue; - } + had_404 = 1; + continue; + } - if (status->ec != GH__ERROR_CODE__OK) { - /* Stop at the first hard error. */ - strbuf_addf(&status->error_message, ": loose %s", - oid_to_hex(oid)); - goto cleanup; - } + if (status->ec != GH__ERROR_CODE__OK) { + /* Stop at the first hard error. */ + strbuf_addf(&status->error_message, ": from GET %s", + oid_to_hex(oid)); + goto cleanup; + } + } - strbuf_setlen(&msg, 0); - strbuf_addf(&msg, "loose %s", oid_to_hex(oid)); - string_list_append(result_list, msg.buf); +cleanup: + if (had_404 && status->ec == GH__ERROR_CODE__OK) { + strbuf_setlen(&status->error_message, 0); + strbuf_addbuf(&status->error_message, &err404); + status->ec = GH__ERROR_CODE__HTTP_404; + } - } else { - strbuf_setlen(&output_filename, 0); + strbuf_release(&err404); +} - j_pack_num++; +/* + * Drive one or more HTTP POST requests to bulk fetch the objects in + * the given OIDSET. Create one or more packfiles and/or loose objects. + * + * Accumulate results for each request in `result_list` until we get a + * hard error and have to stop. 
+ */ +static void do__http_post__fetch_oidset(struct gh__response_status *status, + struct oidset *oids, + unsigned long nr_oid_total, + struct string_list *result_list) +{ + struct oidset_iter iter; + struct strbuf err404 = STRBUF_INIT; + unsigned long k; + unsigned long nr_oid_taken; + int j_pack_den = 0; + int j_pack_num = 0; + int had_404 = 0; + + gh__response_status__zero(status); + if (!nr_oid_total) + return; - do__packfile__gvfs_objects(status, &iter, - gh__cmd_opts.block_size, - j_pack_num, j_pack_den, - &output_filename, - &nr_oid_taken); + oidset_iter_init(oids, &iter); + + j_pack_den = ((nr_oid_total + gh__cmd_opts.block_size - 1) + / gh__cmd_opts.block_size); + + for (k = 0; k < nr_oid_total; k += nr_oid_taken) { + j_pack_num++; + do__http_post__gvfs_objects(status, &iter, + gh__cmd_opts.block_size, + j_pack_num, j_pack_den, + result_list, + &nr_oid_taken); + + /* + * Because the oidset iterator has random + * order, it does no good to say the k-th or + * n-th chunk was incomplete; the client + * cannot use that index for anything. + * + * We get a 404 when at least one object in + * the chunk was not found. + * + * For now, ignore the 404 and go on to the + * next chunk and then fixup the 'ec' later. + */ + if (status->ec == GH__ERROR_CODE__HTTP_404) { + if (!err404.len) + strbuf_addf(&err404, + "%s: from POST", + status->error_message.buf); /* - * Because the oidset iterator has random - * order, it does no good to say the k-th or - * n-th chunk was incomplete; the client - * cannot use that index for anything. - * - * We get a 404 when at least one object in - * the chunk was not found. - * - * TODO Consider various retry strategies (such as - * TODO loose or bisect) on the members within this - * TODO chunk to reduce the impact of the miss. - * - * For now, ignore the 404 and go on to the - * next chunk and then fixup the 'ec' later. + * Mark the fetch as "incomplete", but don't + * stop trying to get other chunks. */ - if (status->ec == GH__ERROR_CODE__HTTP_404) { - if (!err404.len) - strbuf_addf(&err404, - "%s: packfile object", - status->error_message.buf); - /* - * Mark the fetch as "incomplete", but don't - * stop trying to get other chunks. - */ - had_404 = 1; - continue; - } - - if (status->ec != GH__ERROR_CODE__OK) { - /* Stop at the first hard error. */ - strbuf_addstr(&status->error_message, - ": in packfile"); - goto cleanup; - } + had_404 = 1; + continue; + } - strbuf_setlen(&msg, 0); - strbuf_addf(&msg, "packfile %s", output_filename.buf); - string_list_append(result_list, msg.buf); + if (status->ec != GH__ERROR_CODE__OK) { + /* Stop at the first hard error. 
*/ + strbuf_addstr(&status->error_message, + ": from POST"); + goto cleanup; } } cleanup: - strbuf_release(&msg); - strbuf_release(&err404); - strbuf_release(&output_filename); - if (had_404 && status->ec == GH__ERROR_CODE__OK) { strbuf_setlen(&status->error_message, 0); - strbuf_addstr(&status->error_message, "404 Not Found"); + strbuf_addbuf(&status->error_message, &err404); status->ec = GH__ERROR_CODE__HTTP_404; } + + strbuf_release(&err404); } /* @@ -2710,7 +2853,7 @@ static enum gh__error_code do_sub_cmd__config(int argc, const char **argv) finish_init(0); - do__gvfs_config(&status, &config_data); + do__http_get__gvfs_config(&status, &config_data); ec = status.ec; if (ec == GH__ERROR_CODE__OK) @@ -2725,12 +2868,61 @@ static enum gh__error_code do_sub_cmd__config(int argc, const char **argv) } /* - * Read a list of objects from stdin and fetch them in a single request (or - * multiple block-size requests). + * Read a list of objects from stdin and fetch them as a series of + * single object HTTP GET requests. */ static enum gh__error_code do_sub_cmd__get(int argc, const char **argv) { static struct option get_options[] = { + OPT_INTEGER('r', "max-retries", &gh__cmd_opts.max_retries, + N_("retries for transient network errors")), + OPT_END(), + }; + + struct gh__response_status status = GH__RESPONSE_STATUS_INIT; + struct oidset oids = OIDSET_INIT; + struct string_list result_list = STRING_LIST_INIT_DUP; + enum gh__error_code ec = GH__ERROR_CODE__OK; + unsigned long nr_oid_total; + int k; + + trace2_cmd_mode("get"); + + if (argc > 1 && !strcmp(argv[1], "-h")) + usage_with_options(objects_get_usage, get_options); + + argc = parse_options(argc, argv, NULL, get_options, objects_get_usage, 0); + if (gh__cmd_opts.max_retries < 0) + gh__cmd_opts.max_retries = 0; + + finish_init(1); + + nr_oid_total = read_stdin_for_oids(&oids); + + do__http_get__fetch_oidset(&status, &oids, nr_oid_total, &result_list); + + ec = status.ec; + + for (k = 0; k < result_list.nr; k++) + printf("%s\n", result_list.items[k].string); + + if (ec != GH__ERROR_CODE__OK) + error("get: %s", status.error_message.buf); + + gh__response_status__release(&status); + oidset_clear(&oids); + string_list_clear(&result_list, 0); + + return ec; +} + +/* + * Read a list of objects from stdin and fetch them in a single request (or + * multiple block-size requests) using one or more HTTP POST requests. 
+ */ +static enum gh__error_code do_sub_cmd__post(int argc, const char **argv) +{ + static struct option post_options[] = { OPT_MAGNITUDE('b', "block-size", &gh__cmd_opts.block_size, N_("number of objects to request at a time")), OPT_INTEGER('d', "depth", &gh__cmd_opts.depth, @@ -2747,12 +2939,12 @@ static enum gh__error_code do_sub_cmd__get(int argc, const char **argv) unsigned long nr_oid_total; int k; - trace2_cmd_mode("get"); + trace2_cmd_mode("post"); if (argc > 1 && !strcmp(argv[1], "-h")) - usage_with_options(get_usage, get_options); + usage_with_options(objects_post_usage, post_options); - argc = parse_options(argc, argv, NULL, get_options, get_usage, 0); + argc = parse_options(argc, argv, NULL, post_options, objects_post_usage, 0); if (gh__cmd_opts.depth < 1) gh__cmd_opts.depth = 1; if (gh__cmd_opts.max_retries < 0) @@ -2760,12 +2952,9 @@ static enum gh__error_code do_sub_cmd__get(int argc, const char **argv) finish_init(1); - nr_oid_total = read_stdin_from_rev_list(&oids); + nr_oid_total = read_stdin_for_oids(&oids); - trace2_region_enter("gvfs-helper", "get", NULL); - trace2_data_intmax("gvfs-helper", NULL, "get/nr_objects", nr_oid_total); - do_fetch_oidset(&status, &oids, nr_oid_total, &result_list); - trace2_region_leave("gvfs-helper", "get", NULL); + do__http_post__fetch_oidset(&status, &oids, nr_oid_total, &result_list); ec = status.ec; @@ -2773,7 +2962,7 @@ static enum gh__error_code do_sub_cmd__get(int argc, const char **argv) printf("%s\n", result_list.items[k].string); if (ec != GH__ERROR_CODE__OK) - error("get: %s", status.error_message.buf); + error("post: %s", status.error_message.buf); gh__response_status__release(&status); oidset_clear(&oids); @@ -2783,12 +2972,14 @@ static enum gh__error_code do_sub_cmd__get(int argc, const char **argv) } /* - * Handle the 'get' command when in "server mode". Only call error() and set ec - * for hard errors where we cannot communicate correctly with the foreground - * client process. Pass any actual data errors (such as 404's or 401's from - * the fetch back to the client process. + * Handle the 'objects.get' and 'objects.post' verbs in "server mode". + * + * Only call error() and set ec for hard errors where we cannot + * communicate correctly with the foreground client process. Pass any + * actual data errors (such as 404's or 401's from the fetch) back to + * the client process. */ -static enum gh__error_code do_server_subprocess_get(void) +static enum gh__error_code do_server_subprocess__objects(const char *verb_line) { struct gh__response_status status = GH__RESPONSE_STATUS_INIT; struct oidset oids = OIDSET_INIT; @@ -2799,12 +2990,19 @@ static enum gh__error_code do_server_subprocess_get(void) int len; int err; int k; + enum gh__objects_mode objects_mode; unsigned long nr_oid_total = 0; - /* - * Inside the "get" command, we expect a list of OIDs - * and a flush. 
-	 */
+	if (!strcmp(verb_line, "objects.get"))
+		objects_mode = GH__OBJECTS_MODE__GET;
+	else if (!strcmp(verb_line, "objects.post"))
+		objects_mode = GH__OBJECTS_MODE__POST;
+	else {
+		error("server: unexpected objects-mode verb '%s'", verb_line);
+		ec = GH__ERROR_CODE__SUBPROCESS_SYNTAX;
+		goto cleanup;
+	}
+
	while (1) {
		len = packet_read_line_gently(0, NULL, &line);
		if (len < 0 || !line)
@@ -2829,10 +3027,10 @@ static enum gh__error_code do_server_subprocess_get(void)
		goto cleanup;
	}
 
-	trace2_region_enter("gvfs-helper", "server/get", NULL);
-	trace2_data_intmax("gvfs-helper", NULL, "server/get/nr_objects", nr_oid_total);
-	do_fetch_oidset(&status, &oids, nr_oid_total, &result_list);
-	trace2_region_leave("gvfs-helper", "server/get", NULL);
+	if (objects_mode == GH__OBJECTS_MODE__GET)
+		do__http_get__fetch_oidset(&status, &oids, nr_oid_total, &result_list);
+	else
+		do__http_post__fetch_oidset(&status, &oids, nr_oid_total, &result_list);
 
	/*
	 * Write pathname of the ODB where we wrote all of the objects
@@ -2887,7 +3085,7 @@
	return ec;
 }
 
-typedef enum gh__error_code (fn_subprocess_cmd)(void);
+typedef enum gh__error_code (fn_subprocess_cmd)(const char *verb_line);
 
 struct subprocess_capability {
	const char *name;
@@ -2896,7 +3094,7 @@ struct subprocess_capability {
 };
 
 static struct subprocess_capability caps[] = {
-	{ "get", 0, do_server_subprocess_get },
+	{ "objects", 0, do_server_subprocess__objects },
	{ NULL, 0, NULL },
 };
 
@@ -3027,8 +3225,9 @@ static enum gh__error_code do_sub_cmd__server(int argc, const char **argv)
	}
 
	for (k = 0; caps[k].name; k++) {
-		if (caps[k].client_has && !strcmp(line, caps[k].name)) {
-			ec = (caps[k].pfn)();
+		if (caps[k].client_has &&
+		    starts_with(line, caps[k].name)) {
+			ec = (caps[k].pfn)(line);
			if (ec != GH__ERROR_CODE__OK)
				goto cleanup;
			goto top_of_loop;
@@ -3049,6 +3248,9 @@ static enum gh__error_code do_sub_cmd(int argc, const char **argv)
	if (!strcmp(argv[0], "get"))
		return do_sub_cmd__get(argc, argv);
 
+	if (!strcmp(argv[0], "post"))
+		return do_sub_cmd__post(argc, argv);
+
	if (!strcmp(argv[0], "config"))
		return do_sub_cmd__config(argc, argv);
 
@@ -3099,8 +3301,8 @@ int cmd_main(int argc, const char **argv)
	setup_git_directory_gently(NULL);
 
	/* Set any non-zero initial values in gh__cmd_opts. */
-	gh__cmd_opts.depth = GH__DEFAULT_COMMIT_DEPTH;
-	gh__cmd_opts.block_size = GH__DEFAULT_BLOCK_SIZE;
+	gh__cmd_opts.depth = GH__DEFAULT__OBJECTS_POST__COMMIT_DEPTH;
+	gh__cmd_opts.block_size = GH__DEFAULT__OBJECTS_POST__BLOCK_SIZE;
	gh__cmd_opts.max_retries = GH__DEFAULT_MAX_RETRIES;
	gh__cmd_opts.max_transient_backoff_sec =
		GH__DEFAULT_MAX_TRANSIENT_BACKOFF_SEC;

From 740de11def56c0c9fb4fed297673a2313365bbfb Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Thu, 24 Oct 2019 08:11:25 -0400
Subject: [PATCH 35/36] gvfs-helper: dramatically reduce progress noise

During development, it was very helpful to see the gvfs-helper do its
work to request a pack-file or download a loose object. When these
messages appear during normal use, they lead to very noisy terminal
output.

Remove all progress indicators when downloading loose objects. We know
that these can number in the thousands during certain kinds of history
calls, and would litter the terminal output with noise. This happens
during 'git fetch' or 'git pull' as well when the tip commits are
checked for the new refs.

Remove the "Requesting packfile with %ld objects" message, as this
operation is very fast.
We quickly follow up with the more valuable "Receiving packfile
%ld/%ld with %ld objects". When a large "git checkout" causes many
pack-file downloads, it is good to know that Git is asking for data
from the server.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 gvfs-helper.c  | 8 +-------
 unpack-trees.c | 2 +-
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/gvfs-helper.c b/gvfs-helper.c
index 34d98ff3807774..16fe8d1222b104 100644
--- a/gvfs-helper.c
+++ b/gvfs-helper.c
@@ -2528,17 +2528,11 @@ static void setup_gvfs_objects_progress(struct gh__request_params *params,
		return;
 
	if (params->b_is_post && params->object_count > 1) {
-		strbuf_addf(&params->progress_base_phase2_msg,
-			    "Requesting packfile %ld/%ld with %ld objects",
-			    num, den, params->object_count);
		strbuf_addf(&params->progress_base_phase3_msg,
			    "Receiving packfile %ld/%ld with %ld objects",
			    num, den, params->object_count);
-	} else {
-		strbuf_addf(&params->progress_base_phase3_msg,
-			    "Receiving %ld/%ld loose object",
-			    num, den);
	}
+	/* If requesting only one object, then do not show progress */
 }
 
 /*
diff --git a/unpack-trees.c b/unpack-trees.c
index bff730d73808bf..1f48ccc4abc035 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1547,7 +1547,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options
	memset(&pl, 0, sizeof(pl));
	if (!core_apply_sparse_checkout || !o->update)
		o->skip_sparse_checkout = 1;
-	if (!o->skip_sparse_checkout) {
+	if (!o->skip_sparse_checkout && !o->pl) {
		if (core_virtualfilesystem) {
			o->pl = &pl;
		} else {

From 8cf9286267368ede5a4f1011857ac8644be61056 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Mon, 4 Nov 2019 14:47:57 -0500
Subject: [PATCH 36/36] gvfs-helper-client.h: define struct object_id

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 gvfs-helper-client.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gvfs-helper-client.h b/gvfs-helper-client.h
index a5a951ff5b5bfe..c1e38fad75f841 100644
--- a/gvfs-helper-client.h
+++ b/gvfs-helper-client.h
@@ -3,6 +3,7 @@
 struct repository;
 struct commit;
+struct object_id;
 
 enum gh_client__created {
	/*