diff --git a/Documentation/git-multi-pack-index.txt b/Documentation/git-multi-pack-index.txt index cc63531cc02b77..233b2b786271cc 100644 --- a/Documentation/git-multi-pack-index.txt +++ b/Documentation/git-multi-pack-index.txt @@ -37,15 +37,21 @@ expire:: afterward to remove all references to these pack-files. repack:: - Collect a batch of pack-files whose size are all at most the - size given by --batch-size, but whose sizes sum to larger - than --batch-size. The batch is selected by greedily adding - small pack-files starting with the oldest pack-files that fit - the size. Create a new pack-file containing the objects the - multi-pack-index indexes into those pack-files, and rewrite - the multi-pack-index to contain that pack-file. A later run - of 'git multi-pack-index expire' will delete the pack-files - that were part of this batch. + Create a new pack-file containing objects in small pack-files + referenced by the multi-pack-index. If the size given by the + `--batch-size=<size>` argument is zero, then create a pack + containing all objects referenced by the multi-pack-index. For + a non-zero batch size, select the pack-files by examining packs + from oldest-to-newest, computing the "expected size" by counting + the number of objects in the pack referenced by the + multi-pack-index, then dividing by the total number of objects in + the pack and multiplying by the pack size. We select packs with + expected size below the batch size until the set of packs has + total expected size at least the batch size. If the total size + does not reach the batch size, then do nothing. If a new + pack-file is created, rewrite the multi-pack-index to reference the + new pack-file. A later run of 'git multi-pack-index expire' will + delete the pack-files that were part of this batch. 
EXAMPLES diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c index 3d47f6dc1733b5..b1ea1a6aa17724 100644 --- a/builtin/multi-pack-index.c +++ b/builtin/multi-pack-index.c @@ -49,7 +49,7 @@ int cmd_multi_pack_index(int argc, const char **argv, if (!strcmp(argv[0], "repack")) return midx_repack(the_repository, opts.object_dir, (size_t)opts.batch_size); if (opts.batch_size) - die(_("--batch-size option is only for 'repack' verb")); + die(_("--batch-size option is only for 'repack' subcommand")); if (!strcmp(argv[0], "write")) return write_midx_file(opts.object_dir); @@ -58,5 +58,5 @@ int cmd_multi_pack_index(int argc, const char **argv, if (!strcmp(argv[0], "expire")) return expire_midx_packs(the_repository, opts.object_dir); - die(_("unrecognized verb: %s"), argv[0]); + die(_("unrecognized subcommand: %s"), argv[0]); } diff --git a/midx.c b/midx.c index b268b703392af9..3b28564e9eb9d2 100644 --- a/midx.c +++ b/midx.c @@ -36,6 +36,8 @@ #define MIDX_CHUNK_LARGE_OFFSET_WIDTH (sizeof(uint64_t)) #define MIDX_LARGE_OFFSET_NEEDED 0x80000000 +#define PACK_EXPIRED UINT_MAX + static char *get_midx_filename(const char *object_dir) { return xstrfmt("%s/pack/multi-pack-index", object_dir); @@ -429,27 +431,24 @@ static size_t write_midx_header(struct hashfile *f, return MIDX_HEADER_SIZE; } -struct midx_info { +struct pack_info { uint32_t orig_pack_int_id; - uint32_t new_pack_int_id; char *pack_name; struct packed_git *p; unsigned expired : 1; }; -static int midx_info_compare(const void *_a, const void *_b) +static int pack_info_compare(const void *_a, const void *_b) { - struct midx_info *a = (struct midx_info *)_a; - struct midx_info *b = (struct midx_info *)_b; + struct pack_info *a = (struct pack_info *)_a; + struct pack_info *b = (struct pack_info *)_b; return strcmp(a->pack_name, b->pack_name); } struct pack_list { - struct midx_info *info; - uint32_t *perm; + struct pack_info *info; uint32_t nr; uint32_t alloc; - size_t pack_name_concat_len; struct 
multi_pack_index *m; }; @@ -465,9 +464,8 @@ static void add_pack_to_midx(const char *full_path, size_t full_path_len, ALLOC_GROW(packs->info, packs->nr + 1, packs->alloc); packs->info[packs->nr].p = add_packed_git(full_path, - full_path_len, - 0); - packs->info[packs->nr].expired = 0; + full_path_len, + 0); if (!packs->info[packs->nr].p) { warning(_("failed to add packfile '%s'"), @@ -485,6 +483,7 @@ static void add_pack_to_midx(const char *full_path, size_t full_path_len, packs->info[packs->nr].pack_name = xstrdup(file_name); packs->info[packs->nr].orig_pack_int_id = packs->nr; + packs->info[packs->nr].expired = 0; packs->nr++; } } @@ -555,7 +554,7 @@ static void fill_pack_entry(uint32_t pack_int_id, * of a packfile containing the object). */ static struct pack_midx_entry *get_sorted_entries(struct multi_pack_index *m, - struct midx_info *info, + struct pack_info *info, uint32_t nr_packs, uint32_t *nr_objects) { @@ -636,7 +635,7 @@ static struct pack_midx_entry *get_sorted_entries(struct multi_pack_index *m, } static size_t write_midx_pack_names(struct hashfile *f, - struct midx_info *info, + struct pack_info *info, uint32_t num_packs) { uint32_t i; @@ -649,13 +648,12 @@ static size_t write_midx_pack_names(struct hashfile *f, if (info[i].expired) continue; - writelen = strlen(info[i].pack_name) + 1; - if (i && strcmp(info[i].pack_name, info[i - 1].pack_name) <= 0) BUG("incorrect pack-file order: %s before %s", info[i - 1].pack_name, info[i].pack_name); + writelen = strlen(info[i].pack_name) + 1; hashwrite(f, info[i].pack_name, writelen); written += writelen; } @@ -736,13 +734,13 @@ static size_t write_midx_object_offsets(struct hashfile *f, int large_offset_nee for (i = 0; i < nr_objects; i++) { struct pack_midx_entry *obj = list++; - int pack_int_id = perm[obj->pack_int_id]; - if (pack_int_id == UINT_MAX) - BUG("tried to write an object %s with expired pack-int-id", - oid_to_hex(&obj->oid)); + if (perm[obj->pack_int_id] == PACK_EXPIRED) + BUG("object %s is in an 
expired pack with int-id %d", + oid_to_hex(&obj->oid), + obj->pack_int_id); - hashwrite_be32(f, pack_int_id); + hashwrite_be32(f, perm[obj->pack_int_id]); if (large_offset_needed && obj->offset >> 31) hashwrite_be32(f, MIDX_LARGE_OFFSET_NEEDED | nr_large_offset++); @@ -793,16 +791,19 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * { unsigned char cur_chunk, num_chunks = 0; char *midx_name; - uint32_t i, drop_count; + uint32_t i; struct hashfile *f = NULL; struct lock_file lk; struct pack_list packs; + uint32_t *pack_perm = NULL; uint64_t written = 0; uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1]; uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1]; uint32_t nr_entries, num_large_offsets = 0; struct pack_midx_entry *entries = NULL; int large_offsets_needed = 0; + int pack_name_concat_len = 0; + int dropped_packs = 0; int result = 0; midx_name = get_midx_filename(object_dir); @@ -820,16 +821,14 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * packs.nr = 0; packs.alloc = packs.m ? 
packs.m->num_packs : 16; packs.info = NULL; - packs.perm = NULL; - packs.pack_name_concat_len = 0; ALLOC_ARRAY(packs.info, packs.alloc); if (packs.m) { for (i = 0; i < packs.m->num_packs; i++) { ALLOC_GROW(packs.info, packs.nr + 1, packs.alloc); - packs.info[packs.nr].pack_name = xstrdup(packs.m->pack_names[i]); packs.info[packs.nr].orig_pack_int_id = i; + packs.info[packs.nr].pack_name = xstrdup(packs.m->pack_names[i]); packs.info[packs.nr].p = NULL; packs.info[packs.nr].expired = 0; packs.nr++; @@ -850,7 +849,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * large_offsets_needed = 1; } - QSORT(packs.info, packs.nr, midx_info_compare); + QSORT(packs.info, packs.nr, pack_info_compare); if (packs_to_drop && packs_to_drop->nr) { int drop_index = 0; @@ -869,6 +868,8 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * drop_index++; missing_drops++; i--; + } else { + packs.info[i].expired = 0; } } @@ -878,31 +879,30 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * } } - drop_count = 0; - for (i = 0; i < packs.nr; i++) { - if (packs.info[i].expired) - drop_count++; - else - packs.info[i].new_pack_int_id = i - drop_count; - } - - packs.perm = xcalloc(packs.nr, sizeof(uint32_t)); + /* + * pack_perm stores a permutation between pack-int-ids from the + * previous multi-pack-index to the new one we are writing: + * + * pack_perm[old_id] = new_id + */ + ALLOC_ARRAY(pack_perm, packs.nr); for (i = 0; i < packs.nr; i++) { - if (packs.info[i].expired) - packs.perm[packs.info[i].orig_pack_int_id] = UINT_MAX; - else - packs.perm[packs.info[i].orig_pack_int_id] = - packs.info[i].new_pack_int_id; + if (packs.info[i].expired) { + dropped_packs++; + pack_perm[packs.info[i].orig_pack_int_id] = PACK_EXPIRED; + } else { + pack_perm[packs.info[i].orig_pack_int_id] = i - dropped_packs; + } } for (i = 0; i < packs.nr; i++) { if (!packs.info[i].expired) - packs.pack_name_concat_len += 
strlen(packs.info[i].pack_name) + 1; + pack_name_concat_len += strlen(packs.info[i].pack_name) + 1; } - if (packs.pack_name_concat_len % MIDX_CHUNK_ALIGNMENT) - packs.pack_name_concat_len += MIDX_CHUNK_ALIGNMENT - - (packs.pack_name_concat_len % MIDX_CHUNK_ALIGNMENT); + if (pack_name_concat_len % MIDX_CHUNK_ALIGNMENT) + pack_name_concat_len += MIDX_CHUNK_ALIGNMENT - + (pack_name_concat_len % MIDX_CHUNK_ALIGNMENT); hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR); f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf); @@ -914,14 +914,14 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * cur_chunk = 0; num_chunks = large_offsets_needed ? 5 : 4; - written = write_midx_header(f, num_chunks, packs.nr - drop_count); + written = write_midx_header(f, num_chunks, packs.nr - dropped_packs); chunk_ids[cur_chunk] = MIDX_CHUNKID_PACKNAMES; chunk_offsets[cur_chunk] = written + (num_chunks + 1) * MIDX_CHUNKLOOKUP_WIDTH; cur_chunk++; chunk_ids[cur_chunk] = MIDX_CHUNKID_OIDFANOUT; - chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + packs.pack_name_concat_len; + chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + pack_name_concat_len; cur_chunk++; chunk_ids[cur_chunk] = MIDX_CHUNKID_OIDLOOKUP; @@ -981,7 +981,7 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * break; case MIDX_CHUNKID_OBJECTOFFSETS: - written += write_midx_object_offsets(f, large_offsets_needed, packs.perm, entries, nr_entries); + written += write_midx_object_offsets(f, large_offsets_needed, pack_perm, entries, nr_entries); break; case MIDX_CHUNKID_LARGEOFFSETS: @@ -1012,8 +1012,8 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index * } free(packs.info); - free(packs.perm); free(entries); + free(pack_perm); free(midx_name); return result; } @@ -1229,17 +1229,18 @@ int expire_midx_packs(struct repository *r, const char *object_dir) return result; } -struct time_and_id { +struct repack_info { 
timestamp_t mtime; + uint32_t referenced_objects; uint32_t pack_int_id; }; static int compare_by_mtime(const void *a_, const void *b_) { - const struct time_and_id *a, *b; + const struct repack_info *a, *b; - a = (const struct time_and_id *)a_; - b = (const struct time_and_id *)b_; + a = (const struct repack_info *)a_; + b = (const struct repack_info *)b_; if (a->mtime < b->mtime) return -1; @@ -1248,50 +1249,92 @@ static int compare_by_mtime(const void *a_, const void *b_) return 0; } -int midx_repack(struct repository *r, const char *object_dir, size_t batch_size) +static int fill_included_packs_all(struct multi_pack_index *m, + unsigned char *include_pack) { - int result = 0; - uint32_t i, packs_to_repack; - size_t total_size; - struct time_and_id *pack_ti; - unsigned char *include_pack; - struct child_process cmd = CHILD_PROCESS_INIT; - struct strbuf base_name = STRBUF_INIT; - struct multi_pack_index *m = load_multi_pack_index(object_dir, 1); + uint32_t i; - if (!m) - return 0; + for (i = 0; i < m->num_packs; i++) + include_pack[i] = 1; - include_pack = xcalloc(m->num_packs, sizeof(unsigned char)); - pack_ti = xcalloc(m->num_packs, sizeof(struct time_and_id)); + return m->num_packs < 2; +} + +static int fill_included_packs_batch(struct repository *r, + struct multi_pack_index *m, + unsigned char *include_pack, + size_t batch_size) +{ + uint32_t i, packs_to_repack; + size_t total_size; + struct repack_info *pack_info = xcalloc(m->num_packs, sizeof(struct repack_info)); for (i = 0; i < m->num_packs; i++) { - pack_ti[i].pack_int_id = i; + pack_info[i].pack_int_id = i; if (prepare_midx_pack(r, m, i)) continue; - pack_ti[i].mtime = m->packs[i]->mtime; + pack_info[i].mtime = m->packs[i]->mtime; } - QSORT(pack_ti, m->num_packs, compare_by_mtime); + + for (i = 0; batch_size && i < m->num_objects; i++) { + uint32_t pack_int_id = nth_midxed_pack_int_id(m, i); + pack_info[pack_int_id].referenced_objects++; + } + + QSORT(pack_info, m->num_packs, compare_by_mtime); 
total_size = 0; packs_to_repack = 0; for (i = 0; total_size < batch_size && i < m->num_packs; i++) { - int pack_int_id = pack_ti[i].pack_int_id; + int pack_int_id = pack_info[i].pack_int_id; struct packed_git *p = m->packs[pack_int_id]; + size_t expected_size; if (!p) continue; - if (p->pack_size >= batch_size) + if (open_pack_index(p) || !p->num_objects) + continue; + + expected_size = (size_t)(p->pack_size + * pack_info[i].referenced_objects); + expected_size /= p->num_objects; + + if (expected_size >= batch_size) continue; packs_to_repack++; - total_size += p->pack_size; + total_size += expected_size; include_pack[pack_int_id] = 1; } + free(pack_info); + if (total_size < batch_size || packs_to_repack < 2) + return 1; + + return 0; +} + +int midx_repack(struct repository *r, const char *object_dir, size_t batch_size) +{ + int result = 0; + uint32_t i; + unsigned char *include_pack; + struct child_process cmd = CHILD_PROCESS_INIT; + struct strbuf base_name = STRBUF_INIT; + struct multi_pack_index *m = load_multi_pack_index(object_dir, 1); + + if (!m) + return 0; + + include_pack = xcalloc(m->num_packs, sizeof(unsigned char)); + + if (batch_size) { + if (fill_included_packs_batch(r, m, include_pack, batch_size)) + goto cleanup; + } else if (fill_included_packs_all(m, include_pack)) goto cleanup; argv_array_push(&cmd.args, "pack-objects"); @@ -1336,6 +1379,5 @@ int midx_repack(struct repository *r, const char *object_dir, size_t batch_size) if (m) close_midx(m); free(include_pack); - free(pack_ti); return result; } diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh index 9d0bad8798d4a2..79bfaeafa9b172 100755 --- a/t/t5319-multi-pack-index.sh +++ b/t/t5319-multi-pack-index.sh @@ -368,6 +368,8 @@ test_expect_success 'setup expire tests' ' ( cd dup && git init && + test-tool genrandom "data" 4096 >large_file.txt && + git update-index --add large_file.txt && for i in $(test_seq 1 20) do test_commit $i @@ -377,26 +379,27 @@ test_expect_success 'setup 
expire tests' ' git branch C HEAD~13 && git branch D HEAD~16 && git branch E HEAD~18 && - git pack-objects --revs .git/objects/pack/pack-E <<-EOF && - refs/heads/E + git pack-objects --revs .git/objects/pack/pack-A <<-EOF && + refs/heads/A + ^refs/heads/B EOF - git pack-objects --revs .git/objects/pack/pack-D <<-EOF && - refs/heads/D - ^refs/heads/E + git pack-objects --revs .git/objects/pack/pack-B <<-EOF && + refs/heads/B + ^refs/heads/C EOF git pack-objects --revs .git/objects/pack/pack-C <<-EOF && refs/heads/C ^refs/heads/D EOF - git pack-objects --revs .git/objects/pack/pack-B <<-EOF && - refs/heads/B - ^refs/heads/C + git pack-objects --revs .git/objects/pack/pack-D <<-EOF && + refs/heads/D + ^refs/heads/E EOF - git pack-objects --revs .git/objects/pack/pack-A <<-EOF && - refs/heads/A - ^refs/heads/B + git pack-objects --revs .git/objects/pack/pack-E <<-EOF && + refs/heads/E EOF - git multi-pack-index write + git multi-pack-index write && + cp -r .git/objects/pack .git/objects/pack-backup ) ' @@ -421,13 +424,24 @@ test_expect_success 'expire removes unreferenced packs' ' ls .git/objects/pack | grep -v -e pack-[AB] >expect && git multi-pack-index expire && ls .git/objects/pack >actual && - test_cmp expect actual + test_cmp expect actual && + ls .git/objects/pack/ | grep idx >expect-idx && + test-tool read-midx .git/objects | grep idx >actual-midx && + test_cmp expect-idx actual-midx && + git multi-pack-index verify && + git fsck ) ' test_expect_success 'repack with minimum size does not alter existing packs' ' ( cd dup && + rm -rf .git/objects/pack && + mv .git/objects/pack-backup .git/objects/pack && + touch -m -t 201901010000 .git/objects/pack/pack-D* && + touch -m -t 201901010001 .git/objects/pack/pack-C* && + touch -m -t 201901010002 .git/objects/pack/pack-B* && + touch -m -t 201901010003 .git/objects/pack/pack-A* && ls .git/objects/pack >expect && MINSIZE=$(ls -l .git/objects/pack/*pack | awk "{print \$5;}" | sort -n | head -n 1) && git multi-pack-index 
repack --batch-size=$MINSIZE && @@ -439,25 +453,28 @@ test_expect_success 'repack with minimum size does not alter existing packs' ' test_expect_success 'repack creates a new pack' ' ( cd dup && - SECOND_SMALLEST_SIZE=$(ls -l .git/objects/pack/*pack | awk "{print \$5;}" | sort -n | head -n 2 | tail -n 1) && - BATCH_SIZE=$(($SECOND_SMALLEST_SIZE + 1)) && - git multi-pack-index repack --batch-size=$BATCH_SIZE && ls .git/objects/pack/*idx >idx-list && test_line_count = 5 idx-list && + THIRD_SMALLEST_SIZE=$(ls -l .git/objects/pack/*pack | awk "{print \$5;}" | sort -n | head -n 3 | tail -n 1) && + BATCH_SIZE=$(($THIRD_SMALLEST_SIZE + 1)) && + git multi-pack-index repack --batch-size=$BATCH_SIZE && + ls .git/objects/pack/*idx >idx-list && + test_line_count = 6 idx-list && test-tool read-midx .git/objects | grep idx >midx-list && - test_line_count = 5 midx-list + test_line_count = 6 midx-list ) ' test_expect_success 'expire removes repacked packs' ' ( cd dup && - ls -S .git/objects/pack/*pack | head -n 3 >expect && + ls -al .git/objects/pack/*pack && + ls -S .git/objects/pack/*pack | head -n 4 >expect && git multi-pack-index expire && ls -S .git/objects/pack/*pack >actual && test_cmp expect actual && test-tool read-midx .git/objects | grep idx >midx-list && - test_line_count = 3 midx-list + test_line_count = 4 midx-list ) ' @@ -477,16 +494,57 @@ test_expect_success 'expire works when adding new packs' ' ^refs/heads/D EOF git multi-pack-index write && - git pack-objects --revs .git/objects/pack/pack-a <<-EOF && + git pack-objects --revs .git/objects/pack/a-pack <<-EOF && refs/heads/D ^refs/heads/E EOF git multi-pack-index write && - git pack-objects --revs .git/objects/pack/pack-z <<-EOF && + git pack-objects --revs .git/objects/pack/z-pack <<-EOF && refs/heads/E EOF - git multi-pack-index expire && + git multi-pack-index expire && + ls .git/objects/pack/ | grep idx >expect && + test-tool read-midx .git/objects | grep idx >actual && + test_cmp expect actual && git 
multi-pack-index verify ) ' + +test_expect_success 'expire respects .keep files' ' + ( + cd dup && + git pack-objects --revs .git/objects/pack/pack-all <<-EOF && + refs/heads/A + EOF + git multi-pack-index write && + PACKA=$(ls .git/objects/pack/a-pack*\.pack | sed s/\.pack\$//) && + touch $PACKA.keep && + git multi-pack-index expire && + ls -S .git/objects/pack/a-pack* | grep $PACKA >a-pack-files && + test_line_count = 3 a-pack-files && + test-tool read-midx .git/objects | grep idx >midx-list && + test_line_count = 2 midx-list + ) +' + +test_expect_success 'repack --batch-size=0 repacks everything' ' + ( + cd dup && + rm .git/objects/pack/*.keep && + ls .git/objects/pack/*idx >idx-list && + test_line_count = 2 idx-list && + git multi-pack-index repack --batch-size=0 && + ls .git/objects/pack/*idx >idx-list && + test_line_count = 3 idx-list && + test-tool read-midx .git/objects | grep idx >midx-list && + test_line_count = 3 midx-list && + git multi-pack-index expire && + ls -al .git/objects/pack/*idx >idx-list && + test_line_count = 1 idx-list && + git multi-pack-index repack --batch-size=0 && + ls -al .git/objects/pack/*idx >new-idx-list && + test_cmp idx-list new-idx-list + ) +' + test_done