From a56b9464cd0a49317fafde080ae4e73c5430ac9b Mon Sep 17 00:00:00 2001 From: Garima Singh Date: Mon, 6 Apr 2020 16:59:52 +0000 Subject: revision.c: use Bloom filters to speed up path based revision walks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Revision walk will now use Bloom filters for commits to speed up revision walks for a particular path (for computing history for that path), if they are present in the commit-graph file. We load the Bloom filters during the prepare_revision_walk step, currently only when dealing with a single pathspec. Extending it to work with multiple pathspecs can be explored and built on top of this series in the future. While comparing trees in rev_compare_trees(), if the Bloom filter says that the file is not different between the two trees, we don't need to compute the expensive diff. This is where we get our performance gains. The other response of the Bloom filter is '`:maybe', in which case we fall back to the full diff calculation to determine if the path was changed in the commit. We do not try to use Bloom filters when the '--walk-reflogs' option is specified. The '--walk-reflogs' option does not walk the commit ancestry chain like the rest of the options. Incorporating the performance gains when walking reflog entries would add more complexity, and can be explored in a later series. Performance Gains: We tested the performance of `git log -- ` on the git repo, the linux and some internal large repos, with a variety of paths of varying depths. On the git and linux repos: - we observed a 2x to 5x speed up. On a large internal repo with files seated 6-10 levels deep in the tree: - we observed 10x to 20x speed ups, with some paths going up to 28 times faster. Helped-by: Derrick Stolee Helped-by: Jonathan Tan Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano --- revision.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 83 insertions(+), 2 deletions(-) (limited to 'revision.c') diff --git a/revision.c b/revision.c index 8136929e23..d3fcb7c6ff 100644 --- a/revision.c +++ b/revision.c @@ -29,6 +29,7 @@ #include "prio-queue.h" #include "hashmap.h" #include "utf8.h" +#include "bloom.h" volatile show_early_output_fn_t show_early_output; @@ -624,11 +625,80 @@ static void file_change(struct diff_options *options, options->flags.has_changes = 1; } +static void prepare_to_use_bloom_filter(struct rev_info *revs) +{ + struct pathspec_item *pi; + char *path_alloc = NULL; + const char *path; + int last_index; + int len; + + if (!revs->commits) + return; + + repo_parse_commit(revs->repo, revs->commits->item); + + if (!revs->repo->objects->commit_graph) + return; + + revs->bloom_filter_settings = revs->repo->objects->commit_graph->bloom_filter_settings; + if (!revs->bloom_filter_settings) + return; + + pi = &revs->pruning.pathspec.items[0]; + last_index = pi->len - 1; + + /* remove single trailing slash from path, if needed */ + if (pi->match[last_index] == '/') { + path_alloc = xstrdup(pi->match); + path_alloc[last_index] = '\0'; + path = path_alloc; + } else + path = pi->match; + + len = strlen(path); + + revs->bloom_key = xmalloc(sizeof(struct bloom_key)); + fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings); + + free(path_alloc); +} + +static int check_maybe_different_in_bloom_filter(struct rev_info *revs, + struct commit *commit) +{ + struct bloom_filter *filter; + int result; + + if (!revs->repo->objects->commit_graph) + return -1; + + if (commit->generation == GENERATION_NUMBER_INFINITY) + return -1; + + filter = get_bloom_filter(revs->repo, commit, 0); + + if (!filter) { + return -1; + } + + if (!filter->len) { + return -1; + } + + result = bloom_filter_contains(filter, + revs->bloom_key, + revs->bloom_filter_settings); + + return result; +} + static int rev_compare_tree(struct rev_info *revs, - struct commit *parent, struct commit *commit) + struct commit *parent, struct commit *commit, int nth_parent) { struct tree *t1 = get_commit_tree(parent); struct tree *t2 = get_commit_tree(commit); + int bloom_ret = 1; if (!t1) return REV_TREE_NEW; @@ -653,11 +723,19 @@ static int rev_compare_tree(struct rev_info *revs, return REV_TREE_SAME; } + if (revs->bloom_key && !nth_parent) { + bloom_ret = check_maybe_different_in_bloom_filter(revs, commit); + + if (bloom_ret == 0) + return REV_TREE_SAME; + } + tree_difference = REV_TREE_SAME; revs->pruning.flags.has_changes = 0; if (diff_tree_oid(&t1->object.oid, &t2->object.oid, "", &revs->pruning) < 0) return REV_TREE_DIFFERENT; + return tree_difference; } @@ -855,7 +933,7 @@ static void try_to_simplify_commit(struct rev_info *revs, struct commit *commit) die("cannot simplify commit %s (because of %s)", oid_to_hex(&commit->object.oid), oid_to_hex(&p->object.oid)); - switch (rev_compare_tree(revs, p, commit)) { + switch (rev_compare_tree(revs, p, commit, nth_parent)) { case REV_TREE_SAME: if (!revs->simplify_history || !relevant_commit(p)) { /* Even if a merge with an uninteresting @@ -3362,6 +3440,8 @@ int prepare_revision_walk(struct rev_info *revs) FOR_EACH_OBJECT_PROMISOR_ONLY); } + if (revs->pruning.pathspec.nr == 1 && !revs->reflog_info) + prepare_to_use_bloom_filter(revs); if (revs->no_walk != REVISION_WALK_NO_WALK_UNSORTED) commit_list_sort_by_date(&revs->commits); if (revs->no_walk) @@ -3379,6 +3459,7 @@ int prepare_revision_walk(struct rev_info *revs) simplify_merges(revs); if (revs->children.name) set_children(revs); + return 0; } -- cgit v1.2.3 From 42e50e78c6fd8978c2218bbd7b3483ae51d5e3f9 Mon Sep 17 00:00:00 2001 From: Garima Singh Date: Mon, 6 Apr 2020 16:59:53 +0000 Subject: revision.c: add trace2 stats around Bloom filter usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add trace2 statistics around Bloom filter usage and behavior for 'git log -- path' commands that are hoping to benefit from the presence of computed changed paths Bloom filters. These statistics are great for performance analysis work and for formal testing, which we will see in the commit following this one. Helped-by: Derrick Stolee Helped-by: Jonathan Tan Signed-off-by: Garima Singh Signed-off-by: Junio C Hamano --- revision.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'revision.c') diff --git a/revision.c b/revision.c index d3fcb7c6ff..2b06ee739c 100644 --- a/revision.c +++ b/revision.c @@ -30,6 +30,7 @@ #include "hashmap.h" #include "utf8.h" #include "bloom.h" +#include "json-writer.h" volatile show_early_output_fn_t show_early_output; @@ -625,6 +626,30 @@ static void file_change(struct diff_options *options, options->flags.has_changes = 1; } +static int bloom_filter_atexit_registered; +static unsigned int count_bloom_filter_maybe; +static unsigned int count_bloom_filter_definitely_not; +static unsigned int count_bloom_filter_false_positive; +static unsigned int count_bloom_filter_not_present; +static unsigned int count_bloom_filter_length_zero; + +static void trace2_bloom_filter_statistics_atexit(void) +{ + struct json_writer jw = JSON_WRITER_INIT; + + jw_object_begin(&jw, 0); + jw_object_intmax(&jw, "filter_not_present", count_bloom_filter_not_present); + jw_object_intmax(&jw, "zero_length_filter", count_bloom_filter_length_zero); + jw_object_intmax(&jw, "maybe", count_bloom_filter_maybe); + jw_object_intmax(&jw, "definitely_not", count_bloom_filter_definitely_not); + jw_object_intmax(&jw, "false_positive", count_bloom_filter_false_positive); + jw_end(&jw); + + trace2_data_json("bloom", the_repository, "statistics", &jw); + + jw_release(&jw); +} + static void prepare_to_use_bloom_filter(struct rev_info *revs) { struct pathspec_item *pi; @@ -661,6 +686,11 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs) revs->bloom_key = xmalloc(sizeof(struct bloom_key)); fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings); + if (trace2_is_enabled() && !bloom_filter_atexit_registered) { + atexit(trace2_bloom_filter_statistics_atexit); + bloom_filter_atexit_registered = 1; + } + free(path_alloc); } @@ -679,10 +709,12 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs, filter = get_bloom_filter(revs->repo, commit, 0); if (!filter) { + count_bloom_filter_not_present++; return -1; } if (!filter->len) { + count_bloom_filter_length_zero++; return -1; } @@ -690,6 +722,11 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs, revs->bloom_key, revs->bloom_filter_settings); + if (result) + count_bloom_filter_maybe++; + else + count_bloom_filter_definitely_not++; + return result; } @@ -736,6 +773,10 @@ static int rev_compare_tree(struct rev_info *revs, &revs->pruning) < 0) return REV_TREE_DIFFERENT; + if (!nth_parent) + if (bloom_ret == 1 && tree_difference == REV_TREE_SAME) + count_bloom_filter_false_positive++; + return tree_difference; } -- cgit v1.2.3