Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions builtin/multi-pack-index.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "config.h"
#include "parse-options.h"
#include "midx.h"
#include "trace2.h"

static char const * const builtin_multi_pack_index_usage[] = {
N_("git multi-pack-index [--object-dir=<dir>] (write|verify|expire|repack --batch-size=<size>)"),
Expand Down Expand Up @@ -43,6 +44,8 @@ int cmd_multi_pack_index(int argc, const char **argv,
return 1;
}

trace2_cmd_mode(argv[0]);

if (!strcmp(argv[0], "repack"))
return midx_repack(opts.object_dir, (size_t)opts.batch_size);
if (opts.batch_size)
Expand Down
53 changes: 48 additions & 5 deletions midx.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "midx.h"
#include "progress.h"
#include "run-command.h"
#include "trace2.h"

#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
#define MIDX_VERSION 1
Expand Down Expand Up @@ -165,6 +166,9 @@ struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local
m->pack_names[i]);
}

trace2_data_intmax("midx", the_repository, "load/num_packs", m->num_packs);
trace2_data_intmax("midx", the_repository, "load/num_objects", m->num_objects);

return m;

cleanup_fail:
Expand Down Expand Up @@ -1001,9 +1005,29 @@ static void midx_report(const char *fmt, ...)
va_end(ap);
}

/*
 * Pairs an object's lexicographic position in the multi-pack-index
 * with the integer id of the pack that contains it, so the objects
 * can be sorted into pack order while remembering their original
 * midx position.
 */
struct pair_pos_vs_id
{
	uint32_t pos;
	uint32_t pack_int_id;
};

/*
 * qsort(3) comparator: order pairs by pack_int_id ascending, so that
 * all objects belonging to the same packfile are grouped together.
 * Ties (same pack) compare equal; qsort need not be stable here.
 */
static int compare_pair_pos_vs_id(const void *_a, const void *_b)
{
	/* No cast needed: const void * converts implicitly, and we must
	 * not cast away const from the comparator's arguments. */
	const struct pair_pos_vs_id *a = _a;
	const struct pair_pos_vs_id *b = _b;

	if (a->pack_int_id < b->pack_int_id)
		return -1;
	if (a->pack_int_id > b->pack_int_id)
		return 1;

	return 0;
}

int verify_midx_file(const char *object_dir)
{
uint32_t i;
struct pair_pos_vs_id *pairs = NULL;
uint32_t i, k;
struct progress *progress;
struct multi_pack_index *m = load_multi_pack_index(object_dir, 1);
verify_midx_error = 0;
Expand Down Expand Up @@ -1040,15 +1064,32 @@ int verify_midx_file(const char *object_dir)
}

progress = start_progress(_("Verifying object offsets"), m->num_objects);

/*
* Create an array mapping each object to its packfile id. Sort it

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, and follows a pattern that exists elsewhere in this file (ie midx_repack()). Clearly results in a huge perf win by avoiding opening/closing pack files all the time.

* to group the objects by packfile. Using this permutation to visit
* each of the objects only requires one packfile to be open at a time.
*/
ALLOC_ARRAY(pairs, m->num_objects);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For very large repos, this could be asking for many GB, right? Should a failure to alloc cause a failure in midx validation?

Copy link
Member

@jrbriggs jrbriggs Feb 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Jeff and I talked offline and I misunderstood the number of objects in play. We should ask for less than 500MB, which should be serviceable.

for (i = 0; i < m->num_objects; i++) {
pairs[i].pos = i;
pairs[i].pack_int_id = nth_midxed_pack_int_id(m, i);
}
QSORT(pairs, m->num_objects, compare_pair_pos_vs_id);

for (k = 0; k < m->num_objects; k++) {
struct object_id oid;
struct pack_entry e;
off_t m_offset, p_offset;

nth_midxed_object_oid(&oid, m, i);
if (k > 0 && pairs[k-1].pack_int_id != pairs[k].pack_int_id &&
m->packs[pairs[k-1].pack_int_id])
close_pack_fd(m->packs[pairs[k-1].pack_int_id]);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I'm reading this correctly, this is an (optional) optimization that will keep the pack files open to a minimum. I'm assuming without it, they would start being closed transparently as you reached some max threshold. Since you know they are sorted, makes sense to do the optimization here.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, the problem with max file descriptors that i fixed yesterday said we were holding 2000+ packfiles open when we started running out. fixing that caused us to still hold 2000+ open, but close and open packfiles as necessary to do the random access.

So yeah, this fix kinda eliminates the need for the previous fix. But i'm keeping that one in for now since it is harmless and just seems like the correct thing to do.


nth_midxed_object_oid(&oid, m, pairs[k].pos);
if (!fill_midx_entry(&oid, &e, m)) {
midx_report(_("failed to load pack entry for oid[%d] = %s"),
i, oid_to_hex(&oid));
pairs[k].pos, oid_to_hex(&oid));
continue;
}

Expand All @@ -1063,12 +1104,14 @@ int verify_midx_file(const char *object_dir)

if (m_offset != p_offset)
midx_report(_("incorrect object offset for oid[%d] = %s: %"PRIx64" != %"PRIx64),
i, oid_to_hex(&oid), m_offset, p_offset);
pairs[k].pos, oid_to_hex(&oid), m_offset, p_offset);

display_progress(progress, i + 1);
display_progress(progress, k + 1);
}
stop_progress(&progress);

free(pairs);

return verify_midx_error;
}

Expand Down
2 changes: 1 addition & 1 deletion packfile.c
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ void close_pack_windows(struct packed_git *p)
}
}

static int close_pack_fd(struct packed_git *p)
int close_pack_fd(struct packed_git *p)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This wouldn't be needed without the optimization above but I don't see any problem making this public.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right. the new verify loop completely verifies all objects in one packfile before moving to the next packfile (because of the sort). But when we hit 2000+ packfiles in the directory, visiting the next packfile requires us to free up a fd, and
this triggers the LRU search in close_one_pack(). So by closing the previous packfile in that loop, we'll only have 1
packfile open and avoid all of the LRU searching (which is O(n^2)).

{
if (p->pack_fd < 0)
return 0;
Expand Down
2 changes: 2 additions & 0 deletions packfile.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ extern int open_pack_index(struct packed_git *);
*/
extern void close_pack_index(struct packed_git *);

int close_pack_fd(struct packed_git *p);

extern uint32_t get_pack_fanout(struct packed_git *p, uint32_t value);

extern unsigned char *use_pack(struct packed_git *, struct pack_window **, off_t, unsigned long *);
Expand Down