From 1cbdbf3bef7e9167794cb2c12f9af1a450584ebe Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 7 Dec 2020 14:11:02 -0500 Subject: commit-graph: drop count_distinct_commits() function When writing a commit graph, we collect a list of object ids in an array, which we'll eventually copy into an array of "struct commit" pointers. Before we do that, though, we count the number of distinct commit entries. There's a subtle bug in this step, though. We eliminate not only duplicate oids, but also in split mode, any oids which are not commits or which are already in a graph file. However, the loop starts at index 1, always counting index 0 as distinct. And indeed it can't be a duplicate, since we check for those by comparing against the previous entry, and there isn't one for index 0. But it could be a commit that's already in a graph file, and we'd overcount the number of commits by 1 in that case. That turns out not to be a problem, though. The only things we do with the count are: - check if our count will overflow our data structures. But the limit there is 2^31 commits, so while this is a useful check, the off-by-one is not likely to matter. - pre-allocate the array of commit pointers. But over-allocating by one isn't a problem; we'll just waste a few extra bytes. The bug would be easy enough to fix, but we can observe that neither of those steps is necessary. After building the actual commit array, we'll likewise check its count for overflow. So the extra check of the distinct commit count here is redundant. And likewise we use ALLOC_GROW() when building the commit array, so there's no need to preallocate it (it's possible that doing so is slightly more efficient, but if we care we can just optimistically allocate one slot for each oid; I didn't bother here). So count_distinct_commits() isn't doing anything useful. Let's just get rid of that step. Note that a side effect of the function was that we sorted the list of oids, which we do rely on in copy_oids_to_commits(), since it must also skip the duplicates. So we'll move the qsort there. I didn't copy the "TODO" about adding more progress meters. It's actually quite hard to make a repository large enough for this qsort would take an appreciable amount of time, so this doesn't seem like a useful note. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit-graph.c | 43 ++----------------------------------------- 1 file changed, 2 insertions(+), 41 deletions(-) (limited to 'commit-graph.c') diff --git a/commit-graph.c b/commit-graph.c index 6f62a07313..1ac3516cf5 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -1588,35 +1588,6 @@ static void fill_oids_from_all_packs(struct write_commit_graph_context *ctx) stop_progress(&ctx->progress); } -static uint32_t count_distinct_commits(struct write_commit_graph_context *ctx) -{ - uint32_t i, count_distinct = 1; - - if (ctx->report_progress) - ctx->progress = start_delayed_progress( - _("Counting distinct commits in commit graph"), - ctx->oids.nr); - display_progress(ctx->progress, 0); /* TODO: Measure QSORT() progress */ - QSORT(ctx->oids.list, ctx->oids.nr, oid_compare); - - for (i = 1; i < ctx->oids.nr; i++) { - display_progress(ctx->progress, i + 1); - if (!oideq(&ctx->oids.list[i - 1], &ctx->oids.list[i])) { - if (ctx->split) { - struct commit *c = lookup_commit(ctx->r, &ctx->oids.list[i]); - - if (!c || commit_graph_position(c) != COMMIT_NOT_FROM_GRAPH) - continue; - } - - count_distinct++; - } - } - stop_progress(&ctx->progress); - - return count_distinct; -} - static void copy_oids_to_commits(struct write_commit_graph_context *ctx) { uint32_t i; @@ -1628,6 +1599,7 @@ static void copy_oids_to_commits(struct write_commit_graph_context *ctx) ctx->progress = start_delayed_progress( _("Finding extra edges in commit graph"), ctx->oids.nr); + QSORT(ctx->oids.list, ctx->oids.nr, oid_compare); for (i = 0; i < ctx->oids.nr; i++) { unsigned int num_parents; @@ -2155,7 +2127,7 @@ int write_commit_graph(struct object_directory *odb, const struct commit_graph_opts *opts) { struct write_commit_graph_context *ctx; - uint32_t i, count_distinct = 0; + uint32_t i; int res = 0; int replace = 0; struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS; @@ -2268,17 +2240,6 @@ int write_commit_graph(struct object_directory *odb, close_reachable(ctx); - count_distinct = count_distinct_commits(ctx); - - if (count_distinct >= GRAPH_EDGE_LAST_MASK) { - error(_("the commit graph format cannot write %d commits"), count_distinct); - res = -1; - goto cleanup; - } - - ctx->commits.alloc = count_distinct; - ALLOC_ARRAY(ctx->commits.list, ctx->commits.alloc); - copy_oids_to_commits(ctx); if (ctx->commits.nr >= GRAPH_EDGE_LAST_MASK) { -- cgit v1.2.3 From a5f1c448998fba5f5a1b00e1b9a779176ec665a6 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 7 Dec 2020 14:11:05 -0500 Subject: commit-graph: replace packed_oid_list with oid_array Our custom packed_oid_list data structure is really just an oid_array in disguise. Let's switch to using the generic structure, which shortens and simplifies the code slightly. There's one slightly awkward part: in the old code we copied a hash straight from the mmap'd on-disk data into the final object_id. And now we'll copy to a temporary oid, which we'll then pass to oid_array_append(). But this is an operation we have to do all over the commit-graph code already, since it mostly uses object_id structs internally. I also measured "git commit-graph --append", which triggers this code path, and it showed no difference. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit-graph.c | 62 ++++++++++++++-------------------------------------------- 1 file changed, 15 insertions(+), 47 deletions(-) (limited to 'commit-graph.c') diff --git a/commit-graph.c b/commit-graph.c index 1ac3516cf5..4a718fd6e6 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -936,17 +936,11 @@ struct packed_commit_list { int alloc; }; -struct packed_oid_list { - struct object_id *list; - int nr; - int alloc; -}; - struct write_commit_graph_context { struct repository *r; struct object_directory *odb; char *graph_name; - struct packed_oid_list oids; + struct oid_array oids; struct packed_commit_list commits; int num_extra_edges; unsigned long approx_nr_objects; @@ -1240,13 +1234,6 @@ static int write_graph_chunk_bloom_data(struct hashfile *f, return 0; } -static int oid_compare(const void *_a, const void *_b) -{ - const struct object_id *a = (const struct object_id *)_a; - const struct object_id *b = (const struct object_id *)_b; - return oidcmp(a, b); -} - static int add_packed_commits(const struct object_id *oid, struct packed_git *pack, uint32_t pos, @@ -1267,10 +1254,7 @@ static int add_packed_commits(const struct object_id *oid, if (type != OBJ_COMMIT) return 0; - ALLOC_GROW(ctx->oids.list, ctx->oids.nr + 1, ctx->oids.alloc); - oidcpy(&(ctx->oids.list[ctx->oids.nr]), oid); - ctx->oids.nr++; - + oid_array_append(&ctx->oids, oid); set_commit_pos(ctx->r, oid); return 0; @@ -1281,9 +1265,7 @@ static void add_missing_parents(struct write_commit_graph_context *ctx, struct c struct commit_list *parent; for (parent = commit->parents; parent; parent = parent->next) { if (!(parent->item->object.flags & REACHABLE)) { - ALLOC_GROW(ctx->oids.list, ctx->oids.nr + 1, ctx->oids.alloc); - oidcpy(&ctx->oids.list[ctx->oids.nr], &(parent->item->object.oid)); - ctx->oids.nr++; + oid_array_append(&ctx->oids, &parent->item->object.oid); parent->item->object.flags |= REACHABLE; } } @@ -1302,7 +1284,7 @@ static void close_reachable(struct write_commit_graph_context *ctx) ctx->oids.nr); for (i = 0; i < ctx->oids.nr; i++) { display_progress(ctx->progress, i + 1); - commit = lookup_commit(ctx->r, &ctx->oids.list[i]); + commit = lookup_commit(ctx->r, &ctx->oids.oid[i]); if (commit) commit->object.flags |= REACHABLE; } @@ -1319,7 +1301,7 @@ static void close_reachable(struct write_commit_graph_context *ctx) 0); for (i = 0; i < ctx->oids.nr; i++) { display_progress(ctx->progress, i + 1); - commit = lookup_commit(ctx->r, &ctx->oids.list[i]); + commit = lookup_commit(ctx->r, &ctx->oids.oid[i]); if (!commit) continue; @@ -1339,7 +1321,7 @@ static void close_reachable(struct write_commit_graph_context *ctx) ctx->oids.nr); for (i = 0; i < ctx->oids.nr; i++) { display_progress(ctx->progress, i + 1); - commit = lookup_commit(ctx->r, &ctx->oids.list[i]); + commit = lookup_commit(ctx->r, &ctx->oids.oid[i]); if (commit) commit->object.flags &= ~REACHABLE; @@ -1567,9 +1549,7 @@ static int fill_oids_from_commits(struct write_commit_graph_context *ctx, oidset_iter_init(commits, &iter); while ((oid = oidset_iter_next(&iter))) { - ALLOC_GROW(ctx->oids.list, ctx->oids.nr + 1, ctx->oids.alloc); - oidcpy(&ctx->oids.list[ctx->oids.nr], oid); - ctx->oids.nr++; + oid_array_append(&ctx->oids, oid); } return 0; @@ -1599,16 +1579,14 @@ static void copy_oids_to_commits(struct write_commit_graph_context *ctx) ctx->progress = start_delayed_progress( _("Finding extra edges in commit graph"), ctx->oids.nr); - QSORT(ctx->oids.list, ctx->oids.nr, oid_compare); - for (i = 0; i < ctx->oids.nr; i++) { + oid_array_sort(&ctx->oids); + for (i = 0; i < ctx->oids.nr; i = oid_array_next_unique(&ctx->oids, i)) { unsigned int num_parents; display_progress(ctx->progress, i + 1); - if (i > 0 && oideq(&ctx->oids.list[i - 1], &ctx->oids.list[i])) - continue; ALLOC_GROW(ctx->commits.list, ctx->commits.nr + 1, ctx->commits.alloc); - ctx->commits.list[ctx->commits.nr] = lookup_commit(ctx->r, &ctx->oids.list[i]); + ctx->commits.list[ctx->commits.nr] = lookup_commit(ctx->r, &ctx->oids.oid[i]); if (ctx->split && flags != COMMIT_GRAPH_SPLIT_REPLACE && commit_graph_position(ctx->commits.list[ctx->commits.nr]) != COMMIT_NOT_FROM_GRAPH) @@ -2199,26 +2177,16 @@ int write_commit_graph(struct object_directory *odb, } ctx->approx_nr_objects = approximate_object_count(); - ctx->oids.alloc = ctx->approx_nr_objects / 32; - if (ctx->split && opts && ctx->oids.alloc > opts->max_commits) - ctx->oids.alloc = opts->max_commits; - - if (ctx->append) { + if (ctx->append) prepare_commit_graph_one(ctx->r, ctx->odb); - if (ctx->r->objects->commit_graph) - ctx->oids.alloc += ctx->r->objects->commit_graph->num_commits; - } - - if (ctx->oids.alloc < 1024) - ctx->oids.alloc = 1024; - ALLOC_ARRAY(ctx->oids.list, ctx->oids.alloc); if (ctx->append && ctx->r->objects->commit_graph) { struct commit_graph *g = ctx->r->objects->commit_graph; for (i = 0; i < g->num_commits; i++) { - const unsigned char *hash = g->chunk_oid_lookup + g->hash_len * i; - hashcpy(ctx->oids.list[ctx->oids.nr++].hash, hash); + struct object_id oid; + hashcpy(oid.hash, g->chunk_oid_lookup + g->hash_len * i); + oid_array_append(&ctx->oids, &oid); } } @@ -2274,7 +2242,7 @@ int write_commit_graph(struct object_directory *odb, cleanup: free(ctx->graph_name); free(ctx->commits.list); - free(ctx->oids.list); + oid_array_clear(&ctx->oids); if (ctx->commit_graph_filenames_after) { for (i = 0; i < ctx->num_commit_graphs_after; i++) { -- cgit v1.2.3 From 3361390cbedc962adcddcfd98676695c125fa180 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 7 Dec 2020 14:11:08 -0500 Subject: commit-graph: use size_t for array allocation and indexing Our packed_commit_list is an array of pointers to commit structs. We use "int" for the allocation, which is 32-bit even on 64-bit platforms. This isn't likely to overflow in practice (we're writing commit graphs, so you'd need to actually have billions of unique commits in the repository). But it's good practice to use size_t for allocations. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- commit-graph.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'commit-graph.c') diff --git a/commit-graph.c b/commit-graph.c index 4a718fd6e6..06f8dc1d89 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -932,8 +932,8 @@ struct tree *get_commit_tree_in_graph(struct repository *r, const struct commit struct packed_commit_list { struct commit **list; - int nr; - int alloc; + size_t nr; + size_t alloc; }; struct write_commit_graph_context { -- cgit v1.2.3