From 53632befe3c2b5d77cb9e063d3cf32db8018034a Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 6 Sep 2024 14:16:13 -0400 Subject: [PATCH 1/9] revision: create mark_trees_uninteresting_dense() The sparse tree walk algorithm was created in d5d2e93577e (revision: implement sparse algorithm, 2019-01-16) and involves using the mark_trees_uninteresting_sparse() method. This method takes a repository and an oidset of tree IDs, some of which have the UNINTERESTING flag and some of which do not. Create a method that has an equivalent set of preconditions but uses a "dense" walk (recursively visits all reachable trees, as long as they have not previously been marked UNINTERESTING). This is an important difference from mark_tree_uninteresting(), which short-circuits if the given tree has the UNINTERESTING flag. A use of this method will be added in a later change, with a condition set whether the sparse or dense approach should be used. Signed-off-by: Derrick Stolee --- revision.c | 15 +++++++++++++++ revision.h | 1 + 2 files changed, 16 insertions(+) diff --git a/revision.c b/revision.c index 1c0192f522507e..61a9f6f6de26f8 100644 --- a/revision.c +++ b/revision.c @@ -219,6 +219,21 @@ static void add_children_by_path(struct repository *r, free_tree_buffer(tree); } +void mark_trees_uninteresting_dense(struct repository *r, + struct oidset *trees) +{ + struct object_id *oid; + struct oidset_iter iter; + + oidset_iter_init(trees, &iter); + while ((oid = oidset_iter_next(&iter))) { + struct tree *tree = lookup_tree(r, oid); + + if (tree->object.flags & UNINTERESTING) + mark_tree_contents_uninteresting(r, tree); + } +} + void mark_trees_uninteresting_sparse(struct repository *r, struct oidset *trees) { diff --git a/revision.h b/revision.h index 0e470d1df19f69..6c3df8e42bfa6d 100644 --- a/revision.h +++ b/revision.h @@ -487,6 +487,7 @@ void put_revision_mark(const struct rev_info *revs, void mark_parents_uninteresting(struct rev_info *revs, struct commit *commit); void mark_tree_uninteresting(struct repository *r, struct tree *tree); +void mark_trees_uninteresting_dense(struct repository *r, struct oidset *trees); void mark_trees_uninteresting_sparse(struct repository *r, struct oidset *trees); void show_object_with_name(FILE *, struct object *, const char *); From 3e9b6712b9cf4c5ba3b1961719cda257c6e23272 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 3 Sep 2024 21:55:47 -0400 Subject: [PATCH 2/9] path-walk: add prune_all_uninteresting option This option causes the path-walk API to act like the sparse tree-walk algorithm implemented by mark_trees_uninteresting_sparse() in list-objects.c. Starting from the commits marked as UNINTERESTING, their root trees and all objects reachable from those trees are UNINTERSTING, at least as we walk path-by-path. When we reach a path where all objects associated with that path are marked UNINTERESTING, then do no continue walking the children of that path. We need to be careful to pass the UNINTERESTING flag in a deep way on the UNINTERESTING objects before we start the path-walk, or else the depth-first search for the path-walk API may accidentally report some objects as interesting. Signed-off-by: Derrick Stolee --- path-walk.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++--- path-walk.h | 8 +++++++ 2 files changed, 73 insertions(+), 3 deletions(-) diff --git a/path-walk.c b/path-walk.c index 65f9856afa2f66..08de29614f7299 100644 --- a/path-walk.c +++ b/path-walk.c @@ -23,6 +23,7 @@ struct type_and_oid_list { enum object_type type; struct oid_array oids; + int maybe_interesting; }; #define TYPE_AND_OID_LIST_INIT { \ @@ -139,6 +140,9 @@ static int add_children(struct path_walk_context *ctx, list->type = type; strmap_put(&ctx->paths_to_lists, path.buf, list); string_list_append(&ctx->path_stack, path.buf); + + if (!(o->flags & UNINTERESTING)) + list->maybe_interesting = 1; } oid_array_append(&list->oids, &entry.oid); } @@ -161,6 +165,40 @@ static int walk_path(struct path_walk_context *ctx, list = strmap_get(&ctx->paths_to_lists, path); + if (ctx->info->prune_all_uninteresting) { + /* + * This is true if all objects were UNINTERESTING + * when added to the list. + */ + if (!list->maybe_interesting) + return 0; + + /* + * But it's still possible that the objects were set + * as UNINTERESTING after being added. Do a quick check. + */ + list->maybe_interesting = 0; + for (size_t i = 0; + !list->maybe_interesting && i < list->oids.nr; + i++) { + if (list->type == OBJ_TREE) { + struct tree *t = lookup_tree(ctx->repo, + &list->oids.oid[i]); + if (t && !(t->object.flags & UNINTERESTING)) + list->maybe_interesting = 1; + } else { + struct blob *b = lookup_blob(ctx->repo, + &list->oids.oid[i]); + if (b && !(b->object.flags & UNINTERESTING)) + list->maybe_interesting = 1; + } + } + + /* We have confirmed that all objects are UNINTERESTING. */ + if (!list->maybe_interesting) + return 0; + } + /* Evaluate function pointer on this data, if requested. */ if ((list->type == OBJ_TREE && ctx->info->trees) || (list->type == OBJ_BLOB && ctx->info->blobs)) @@ -203,7 +241,7 @@ static void clear_strmap(struct strmap *map) int walk_objects_by_path(struct path_walk_info *info) { const char *root_path = ""; - int ret = 0; + int ret = 0, has_uninteresting = 0; size_t commits_nr = 0, paths_nr = 0; struct commit *c; struct type_and_oid_list *root_tree_list; @@ -215,6 +253,7 @@ int walk_objects_by_path(struct path_walk_info *info) .path_stack = STRING_LIST_INIT_DUP, .paths_to_lists = STRMAP_INIT }; + struct oidset root_tree_set = OIDSET_INIT; struct oid_array tagged_tree_list = OID_ARRAY_INIT; struct oid_array tagged_blob_list = OID_ARRAY_INIT; @@ -227,7 +266,9 @@ int walk_objects_by_path(struct path_walk_info *info) /* Insert a single list for the root tree into the paths. */ CALLOC_ARRAY(root_tree_list, 1); root_tree_list->type = OBJ_TREE; + root_tree_list->maybe_interesting = 1; strmap_put(&ctx.paths_to_lists, root_path, root_tree_list); + if (prepare_revision_walk(info->revs)) die(_("failed to setup revision walk")); @@ -247,11 +288,17 @@ int walk_objects_by_path(struct path_walk_info *info) oid = get_commit_tree_oid(c); t = lookup_tree(info->revs->repo, oid); - if (t) + if (t) { + oidset_insert(&root_tree_set, oid); oid_array_append(&root_tree_list->oids, oid); - else + } else { warning("could not find tree %s", oid_to_hex(oid)); + } + if (t && (c->object.flags & UNINTERESTING)) { + t->object.flags |= UNINTERESTING; + has_uninteresting = 1; + } } trace2_data_intmax("path-walk", ctx.repo, "commits", commits_nr); @@ -318,6 +365,21 @@ int walk_objects_by_path(struct path_walk_info *info) oid_array_clear(&tagged_blob_list); } + /* + * Before performing a DFS of our paths and emitting them as interesting, + * do a full walk of the trees to distribute the UNINTERESTING bit. Use + * the sparse algorithm if prune_all_uninteresting was set. + */ + if (has_uninteresting) { + trace2_region_enter("path-walk", "uninteresting-walk", info->revs->repo); + if (info->prune_all_uninteresting) + mark_trees_uninteresting_sparse(ctx.repo, &root_tree_set); + else + mark_trees_uninteresting_dense(ctx.repo, &root_tree_set); + trace2_region_leave("path-walk", "uninteresting-walk", info->revs->repo); + } + oidset_clear(&root_tree_set); + string_list_append(&ctx.path_stack, root_path); trace2_region_enter("path-walk", "path-walk", info->revs->repo); diff --git a/path-walk.h b/path-walk.h index 637d3b0cabb69a..7c02bca71561aa 100644 --- a/path-walk.h +++ b/path-walk.h @@ -50,6 +50,14 @@ struct path_walk_info { * the sparse-checkout patterns. */ struct pattern_list *pl; + + /** + * When 'prune_all_uninteresting' is set and a path has all objects + * marked as UNINTERESTING, then the path-walk will not visit those + * objects. It will not call path_fn on those objects and will not + * walk the children of such trees. + */ + int prune_all_uninteresting; }; #define PATH_WALK_INFO_INIT { \ From d192ae7162406fc5bf6494d37f5bda65e3a56b4a Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 5 Sep 2024 10:04:51 -0400 Subject: [PATCH 3/9] pack-objects: add --path-walk option In order to more easily compute delta bases among objects that appear at the exact same path, add a --path-walk option to 'git pack-objects'. This option will use the path-walk API instead of the object walk given by the revision machinery. Since objects will be provided in batches representing a common path, those objects can be tested for delta bases immediately instead of waiting for a sort of the full object list by name-hash. This has multiple benefits, including avoiding collisions by name-hash. The objects marked as UNINTERESTING are included in these batches, so we are guaranteeing some locality to find good delta bases. After the individual passes are done on a per-path basis, the default name-hash is used to find other opportunistic delta bases that did not match exactly by the full path name. RFC TODO: It is important to note that this option is inherently incompatible with using a bitmap index. This walk probably also does not work with other advanced features, such as delta islands. Getting ahead of myself, this option compares well with --full-name-hash when the packfile is large enough, but also performs at least as well as the default in all cases that I've seen. RFC TODO: this should probably be recording the batch locations to another list so they could be processed in a second phase using threads. RFC TODO: list some examples of how this outperforms previous pack-objects strategies. (This is coming in later commits that include performance test changes.) Signed-off-by: Derrick Stolee --- builtin/pack-objects.c | 209 ++++++++++++++++++++++++++++++++++------- path-walk.c | 2 +- 2 files changed, 177 insertions(+), 34 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index f395488971f9c5..50f5ae5bc06996 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -39,6 +39,9 @@ #include "promisor-remote.h" #include "pack-mtimes.h" #include "parse-options.h" +#include "blob.h" +#include "tree.h" +#include "path-walk.h" /* * Objects we are going to pack are collected in the `to_pack` structure. @@ -215,6 +218,7 @@ static int delta_search_threads; static int pack_to_stdout; static int sparse; static int thin; +static int path_walk; static int num_preferred_base; static struct progress *progress_state; @@ -3139,6 +3143,38 @@ static int add_ref_tag(const char *tag UNUSED, const struct object_id *oid, return 0; } +static int should_attempt_deltas(struct object_entry *entry) +{ + if (DELTA(entry)) + /* This happens if we decided to reuse existing + * delta from a pack. "reuse_delta &&" is implied. + */ + return 0; + + if (!entry->type_valid || + oe_size_less_than(&to_pack, entry, 50)) + return 0; + + if (entry->no_try_delta) + return 0; + + if (!entry->preferred_base) { + if (oe_type(entry) < 0) + die(_("unable to get type of object %s"), + oid_to_hex(&entry->idx.oid)); + } else { + if (oe_type(entry) < 0) { + /* + * This object is not found, but we + * don't have to include it anyway. + */ + return 0; + } + } + + return 1; +} + static void prepare_pack(int window, int depth) { struct object_entry **delta_list; @@ -3169,33 +3205,11 @@ static void prepare_pack(int window, int depth) for (i = 0; i < to_pack.nr_objects; i++) { struct object_entry *entry = to_pack.objects + i; - if (DELTA(entry)) - /* This happens if we decided to reuse existing - * delta from a pack. "reuse_delta &&" is implied. - */ + if (!should_attempt_deltas(entry)) continue; - if (!entry->type_valid || - oe_size_less_than(&to_pack, entry, 50)) - continue; - - if (entry->no_try_delta) - continue; - - if (!entry->preferred_base) { + if (!entry->preferred_base) nr_deltas++; - if (oe_type(entry) < 0) - die(_("unable to get type of object %s"), - oid_to_hex(&entry->idx.oid)); - } else { - if (oe_type(entry) < 0) { - /* - * This object is not found, but we - * don't have to include it anyway. - */ - continue; - } - } delta_list[n++] = entry; } @@ -4109,6 +4123,117 @@ static void mark_bitmap_preferred_tips(void) } } +static inline int is_oid_interesting(struct repository *repo, + struct object_id *oid, + enum object_type type) +{ + if (type == OBJ_TAG) { + struct tag *t = lookup_tag(repo, oid); + return t && !(t->object.flags & UNINTERESTING); + } + + if (type == OBJ_COMMIT) { + struct commit *c = lookup_commit(repo, oid); + return c && !(c->object.flags & UNINTERESTING); + } + + if (type == OBJ_TREE) { + struct tree *t = lookup_tree(repo, oid); + return t && !(t->object.flags & UNINTERESTING); + } + + if (type == OBJ_BLOB) { + struct blob *b = lookup_blob(repo, oid); + return b && !(b->object.flags & UNINTERESTING); + } + + return 0; +} + +static int add_objects_by_path(const char *path, + struct oid_array *oids, + enum object_type type, + void *data) +{ + struct object_entry **delta_list; + size_t oe_start = to_pack.nr_objects; + size_t oe_end; + unsigned int sub_list_size; + unsigned int *processed = data; + + /* + * First, add all objects to the packing data, including the ones + * marked UNINTERESTING (translated to 'exclude') as they can be + * used as delta bases. + */ + for (size_t i = 0; i < oids->nr; i++) { + struct object_id *oid = &oids->oid[i]; + int exclude = !is_oid_interesting(the_repository, oid, type); + add_object_entry(oid, type, path, exclude); + } + + oe_end = to_pack.nr_objects; + + /* We can skip delta calculations if it is a no-op. */ + if (oe_end == oe_start || !window) + return 0; + + sub_list_size = 0; + ALLOC_ARRAY(delta_list, oe_end - oe_start); + + for (size_t i = 0; i < oe_end - oe_start; i++) { + struct object_entry *entry = to_pack.objects + oe_start + i; + + if (!should_attempt_deltas(entry)) + continue; + + delta_list[sub_list_size++] = entry; + } + + /* + * Find delta bases among this list of objects that all match the same + * path. This causes the delta compression to be interleaved in the + * object walk, which can lead to confusing progress indicators. This is + * also incompatible with threaded delta calculations. In the future, + * consider creating a list of regions in the full to_pack.objects array + * that could be picked up by the threaded delta computation. + */ + if (sub_list_size && window) { + QSORT(delta_list, sub_list_size, type_size_sort); + find_deltas(delta_list, &sub_list_size, window, depth, processed); + } + + free(delta_list); + return 0; +} + +static void get_object_list_path_walk(struct rev_info *revs) +{ + struct path_walk_info info = PATH_WALK_INFO_INIT; + unsigned int processed = 0; + + info.revs = revs; + + info.revs->tag_objects = 1; + info.tags = 1; + info.commits = 1; + info.trees = 1; + info.blobs = 1; + info.path_fn = add_objects_by_path; + info.path_fn_data = &processed; + + /* + * Allow the --[no-]sparse option to be interesting here, if only + * for testing purposes. Paths with no interesting objects will not + * contribute to the resulting pack, but only create noisy preferred + * base objects. + */ + info.prune_all_uninteresting = sparse; + + if (walk_objects_by_path(&info)) + die(_("failed to pack objects via path-walk")); +} + static void get_object_list(struct rev_info *revs, int ac, const char **av) { struct setup_revision_opt s_r_opt = { @@ -4155,7 +4280,7 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av) warn_on_object_refname_ambiguity = save_warning; - if (use_bitmap_index && !get_object_list_from_bitmap(revs)) + if (use_bitmap_index && !path_walk && !get_object_list_from_bitmap(revs)) return; if (use_delta_islands) @@ -4164,15 +4289,19 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av) if (write_bitmap_index) mark_bitmap_preferred_tips(); - if (prepare_revision_walk(revs)) - die(_("revision walk setup failed")); - mark_edges_uninteresting(revs, show_edge, sparse); - if (!fn_show_object) fn_show_object = show_object; - traverse_commit_list(revs, - show_commit, fn_show_object, - NULL); + + if (path_walk) { + get_object_list_path_walk(revs); + } else { + if (prepare_revision_walk(revs)) + die(_("revision walk setup failed")); + mark_edges_uninteresting(revs, show_edge, sparse); + traverse_commit_list(revs, + show_commit, fn_show_object, + NULL); + } if (unpack_unreachable_expiration) { revs->ignore_missing_links = 1; @@ -4367,6 +4496,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) N_("use the sparse reachability algorithm")), OPT_BOOL(0, "thin", &thin, N_("create thin packs")), + OPT_BOOL(0, "path-walk", &path_walk, + N_("use the path-walk API to walk objects when possible")), OPT_BOOL(0, "shallow", &shallow, N_("create packs suitable for shallow fetches")), OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep_on_disk, @@ -4447,7 +4578,19 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) window = 0; strvec_push(&rp, "pack-objects"); - if (thin) { + + if (path_walk && filter_options.choice) { + warning(_("cannot use --filter with --path-walk")); + path_walk = 0; + } + if (path_walk) { + strvec_push(&rp, "--boundary"); + /* + * We must disable the bitmaps because we are removing + * the --objects / --objects-edge[-aggressive] options. + */ + use_bitmap_index = 0; + } else if (thin) { use_internal_rev_list = 1; strvec_push(&rp, shallow ? "--objects-edge-aggressive" diff --git a/path-walk.c b/path-walk.c index 08de29614f7299..9391e0579aea4b 100644 --- a/path-walk.c +++ b/path-walk.c @@ -306,7 +306,7 @@ int walk_objects_by_path(struct path_walk_info *info) /* Track all commits. */ if (info->commits) - ret = info->path_fn("", &commit_list->oids, OBJ_COMMIT, + ret = info->path_fn("initial", &commit_list->oids, OBJ_COMMIT, info->path_fn_data); oid_array_clear(&commit_list->oids); free(commit_list); From ab0bc080725917836685870802cfaf0525ff4128 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 6 Sep 2024 15:24:10 -0400 Subject: [PATCH 4/9] pack-objects: extract should_attempt_deltas() Signed-off-by: Derrick Stolee --- builtin/pack-objects.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 50f5ae5bc06996..7214c3a3ec7b3f 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -3151,8 +3151,7 @@ static int should_attempt_deltas(struct object_entry *entry) */ return 0; - if (!entry->type_valid || - oe_size_less_than(&to_pack, entry, 50)) + if (!entry->type_valid || oe_size_less_than(&to_pack, entry, 50)) return 0; if (entry->no_try_delta) @@ -3162,14 +3161,12 @@ static int should_attempt_deltas(struct object_entry *entry) if (oe_type(entry) < 0) die(_("unable to get type of object %s"), oid_to_hex(&entry->idx.oid)); - } else { - if (oe_type(entry) < 0) { - /* - * This object is not found, but we - * don't have to include it anyway. - */ - return 0; - } + } else if (oe_type(entry) < 0) { + /* + * This object is not found, but we + * don't have to include it anyway. + */ + return 0; } return 1; From c6d4832e0723f9938fac7ea54aacc7a60e7f76da Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Fri, 6 Sep 2024 09:18:43 -0400 Subject: [PATCH 5/9] pack-objects: introduce GIT_TEST_PACK_PATH_WALK There are many tests that validate whether 'git pack-objects' works as expected. Instead of duplicating these tests, add a new test environment variable, GIT_TEST_PACK_PATH_WALK, that implies --path-walk by default when specified. This was useful in testing the implementation of the --path-walk implementation, especially in conjunction with test such as: - t5322-pack-objects-sparse.sh : This demonstrates the effectiveness of the --sparse option and how it combines with --path-walk. RFC TODO: list other helpful test cases, as well as the ones where the behavior breaks if this is enabled... Signed-off-by: Derrick Stolee --- builtin/pack-objects.c | 1 + t/README | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 7214c3a3ec7b3f..de6d7046e432b4 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -4533,6 +4533,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) disable_replace_refs(); + path_walk = git_env_bool("GIT_TEST_PACK_PATH_WALK", 0); sparse = git_env_bool("GIT_TEST_PACK_SPARSE", -1); if (the_repository->gitdir) { prepare_repo_settings(the_repository); diff --git a/t/README b/t/README index cd884fdcd4dfe2..c0271042c03e83 100644 --- a/t/README +++ b/t/README @@ -433,6 +433,10 @@ GIT_TEST_PACK_SPARSE= if disabled will default the pack-objects builtin to use the non-sparse object walk. This can still be overridden by the --sparse command-line argument. +GIT_TEST_PACK_PATH_WALK= if enabled will default the pack-objects +builtin to use the path-walk API for the object walk. This can still be +overridden by the --no-path-walk command-line argument. + GIT_TEST_PRELOAD_INDEX= exercises the preload-index code path by overriding the minimum number of cache entries required per thread. From c2092f0ef9acdc14b7e260a1addebf8bda2da725 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Wed, 28 Aug 2024 12:07:42 -0400 Subject: [PATCH 6/9] p5313: add size comparison test To test the benefits of the new --path-walk option in 'git pack-objects', create a performance test that times the process but also compares the size of the output. Against the microsoft/fluentui repo [1] against a particular commit [2], this has reproducible results of a similar scale: Test this tree --------------------------------------------------------------- 5313.2: thin pack 0.39(0.48+0.03) 5313.3: thin pack size 1.2M 5313.4: thin pack with --path-walk 0.09(0.07+0.01) 5313.5: thin pack size with --path-walk 20.8K 5313.6: big recent pack 2.13(8.29+0.26) 5313.7: big recent pack size 17.7M 5313.8: big recent pack with --path-walk 3.18(4.21+0.22) 5313.9: big recent pack size with --path-walk 15.0M [1] https://github.com/microsoft/reactui [2] e70848ebac1cd720875bccaa3026f4a9ed700e08 RFC TODO: Note that the path-walk version is slower for the big case, but the delta calculation is single-threaded with the current implementation! It's still faster for the small case that mimics a typical push. Signed-off-by: Derrick Stolee --- t/perf/p5313-pack-objects.sh | 55 ++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100755 t/perf/p5313-pack-objects.sh diff --git a/t/perf/p5313-pack-objects.sh b/t/perf/p5313-pack-objects.sh new file mode 100755 index 00000000000000..fdcdf188f952ad --- /dev/null +++ b/t/perf/p5313-pack-objects.sh @@ -0,0 +1,55 @@ +#!/bin/sh + +test_description='Tests pack performance using bitmaps' +. ./perf-lib.sh + +GIT_TEST_PASSING_SANITIZE_LEAK=0 +export GIT_TEST_PASSING_SANITIZE_LEAK + +test_perf_large_repo + +test_expect_success 'create rev input' ' + cat >in-thin <<-EOF && + $(git rev-parse HEAD) + ^$(git rev-parse HEAD~1) + EOF + + cat >in-big-recent <<-EOF + $(git rev-parse HEAD) + ^$(git rev-parse HEAD~1000) + EOF +' + +test_perf 'thin pack' ' + git pack-objects --thin --stdout --revs --sparse out +' + +test_size 'thin pack size' ' + wc -c out +' + +test_size 'thin pack size with --path-walk' ' + wc -c out +' + +test_size 'big recent pack size' ' + wc -c out +' + +test_size 'big recent pack size with --path-walk' ' + wc -c Date: Thu, 5 Sep 2024 09:49:23 -0400 Subject: [PATCH 7/9] repack: add --path-walk option Since 'git pack-objects' supports a --path-walk option, allow passing it through in 'git repack'. This presents interesting testing opportunities for comparing the different repacking strategies against each other. For the microsoft/fluentui repo [1], the results are very interesting: Test this tree ------------------------------------------------------------------- 5313.10: full repack 97.91(663.47+2.83) 5313.11: full repack size 449.1K 5313.12: full repack with --path-walk 105.42(120.49+0.95) 5313.13: full repack size with --path-walk 159.1K [1] https://github.com/microsoft/fluentui This repo suffers from having a lot of paths that collide in the name hash, so examining them in groups by path leads to better deltas. Also, in this case, the single-threaded implementation is competitive with the full repack. This is saving time diffing files that have significant differences from each other. Signed-off-by: Derrick Stolee --- builtin/repack.c | 5 +++++ t/perf/p5313-pack-objects.sh | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/builtin/repack.c b/builtin/repack.c index f0317fa94ad46c..fe1454d32dd80a 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -57,6 +57,7 @@ struct pack_objects_args { int no_reuse_object; int quiet; int local; + int path_walk; struct list_objects_filter_options filter_options; }; @@ -288,6 +289,8 @@ static void prepare_pack_objects(struct child_process *cmd, strvec_pushf(&cmd->args, "--no-reuse-delta"); if (args->no_reuse_object) strvec_pushf(&cmd->args, "--no-reuse-object"); + if (args->path_walk) + strvec_pushf(&cmd->args, "--path-walk"); if (args->local) strvec_push(&cmd->args, "--local"); if (args->quiet) @@ -1157,6 +1160,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix) N_("pass --no-reuse-delta to git-pack-objects")), OPT_BOOL('F', NULL, &po_args.no_reuse_object, N_("pass --no-reuse-object to git-pack-objects")), + OPT_BOOL(0, "path-walk", &po_args.path_walk, + N_("pass --path-walk to git-pack-objects")), OPT_NEGBIT('n', NULL, &run_update_server_info, N_("do not run git-update-server-info"), 1), OPT__QUIET(&po_args.quiet, N_("be quiet")), diff --git a/t/perf/p5313-pack-objects.sh b/t/perf/p5313-pack-objects.sh index fdcdf188f952ad..48fc05bb6c6ce3 100755 --- a/t/perf/p5313-pack-objects.sh +++ b/t/perf/p5313-pack-objects.sh @@ -52,4 +52,24 @@ test_size 'big recent pack size with --path-walk' ' wc -c Date: Thu, 5 Sep 2024 09:50:06 -0400 Subject: [PATCH 8/9] pack-objects: enable --path-walk via config Users may want to enable the --path-walk option for 'git pack-objects' by default, especially underneath commands like 'git push' or 'git repack'. This should be limited to client repositories, since the --path-walk option disables bitmap walks, so would be bad to include in Git servers when serving fetches and clones. There is potential that it may be helpful to consider when repacking the repository, to take advantage of improved deltas across historical versions of the same files. Much like how "pack.useSparse" was introduced and included in "feature.experimental" before being enabled by default, use the repository settings infrastructure to make the new "pack.usePathWalk" config enabled by "feature.experimental" and "feature.manyFiles". Signed-off-by: Derrick Stolee --- Documentation/config/pack.txt | 8 ++++++++ builtin/pack-objects.c | 4 +++- repo-settings.c | 4 ++++ repository.h | 1 + 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/Documentation/config/pack.txt b/Documentation/config/pack.txt index da527377fafcb6..08d06271177006 100644 --- a/Documentation/config/pack.txt +++ b/Documentation/config/pack.txt @@ -155,6 +155,14 @@ pack.useSparse:: commits contain certain types of direct renames. Default is `true`. +pack.usePathWalk:: + When true, git will default to using the '--path-walk' option in + 'git pack-objects' when the '--revs' option is present. This + algorithm groups objects by path to maximize the ability to + compute delta chains across historical versions of the same + object. This may disable other options, such as using bitmaps to + enumerate objects. + pack.preferBitmapTips:: When selecting which commits will receive bitmaps, prefer a commit at the tip of any reference that is a suffix of any value diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index de6d7046e432b4..d510c095d64eb8 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -4533,12 +4533,14 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) disable_replace_refs(); - path_walk = git_env_bool("GIT_TEST_PACK_PATH_WALK", 0); + path_walk = git_env_bool("GIT_TEST_PACK_PATH_WALK", -1); sparse = git_env_bool("GIT_TEST_PACK_SPARSE", -1); if (the_repository->gitdir) { prepare_repo_settings(the_repository); if (sparse < 0) sparse = the_repository->settings.pack_use_sparse; + if (path_walk < 0) + path_walk = the_repository->settings.pack_use_path_walk; if (the_repository->settings.pack_use_multi_pack_reuse) allow_pack_reuse = MULTI_PACK_REUSE; } diff --git a/repo-settings.c b/repo-settings.c index f104e6fc73c38f..7c152cebae7b96 100644 --- a/repo-settings.c +++ b/repo-settings.c @@ -71,11 +71,14 @@ void prepare_repo_settings(struct repository *r) repo_config_get_maybe_bool(r, "core.fsmonitor", &value) > 0 && repo_config_get_bool(r, "core.useBuiltinFSMonitor", &value)) fsm_settings__set_ipc(r); + + r->settings.pack_use_path_walk = 1; } if (manyfiles) { r->settings.index_version = 4; r->settings.index_skip_hash = 1; r->settings.core_untracked_cache = UNTRACKED_CACHE_WRITE; + r->settings.pack_use_path_walk = 1; } /* Commit graph config or default, does not cascade (simple) */ @@ -90,6 +93,7 @@ void prepare_repo_settings(struct repository *r) /* Boolean config or default, does not cascade (simple) */ repo_cfg_bool(r, "pack.usesparse", &r->settings.pack_use_sparse, 1); + repo_cfg_bool(r, "pack.usepathwalk", &r->settings.pack_use_path_walk, 0); repo_cfg_bool(r, "core.multipackindex", &r->settings.core_multi_pack_index, 1); repo_cfg_bool(r, "index.sparse", &r->settings.sparse_index, 0); repo_cfg_bool(r, "index.skiphash", &r->settings.index_skip_hash, r->settings.index_skip_hash); diff --git a/repository.h b/repository.h index af6ea0a62cdb70..2ae9c2b1741528 100644 --- a/repository.h +++ b/repository.h @@ -62,6 +62,7 @@ struct repo_settings { enum untracked_cache_setting core_untracked_cache; int pack_use_sparse; + int pack_use_path_walk; enum fetch_negotiation_setting fetch_negotiation_algorithm; int core_multi_pack_index; From e43582c6df4c197a79fac2d8e382772db602006a Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 5 Sep 2024 09:51:33 -0400 Subject: [PATCH 9/9] scalar: enable path-walk during push via config Repositories registered with Scalar are expected to be client-only repositories that are rather large. This means that they are more likely to be good candidates for using the --path-walk option when running 'git pack-objects', especially under the hood of 'git push'. Enable this config in Scalar repositories. Signed-off-by: Derrick Stolee --- scalar.c | 1 + 1 file changed, 1 insertion(+) diff --git a/scalar.c b/scalar.c index 1fe8a93e6529a2..1c41a2916227c1 100644 --- a/scalar.c +++ b/scalar.c @@ -170,6 +170,7 @@ static int set_recommended_config(int reconfigure) { "core.autoCRLF", "false" }, { "core.safeCRLF", "false" }, { "fetch.showForcedUpdates", "false" }, + { "push.usePathWalk", "true" }, { NULL, NULL }, }; int i;