From c145b9e6b3c52055ba0f5677d62374c81de899f1 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sat, 7 Sep 2024 19:43:45 -0400 Subject: [PATCH 1/4] pack-objects: add --full-name-hash option RFC NOTE: this is essentially the same as the patch introduced independently of the RFC, but now is on top of the --path-walk option instead. This is included in the RFC for comparison purposes. RFC NOTE: As you can see from the details below, the --full-name-hash option essentially attempts to do similar things as the --path-walk option, but sometimes misses the mark. Collisions still happen with the --full-name-hash option, leading to some misses. However, in cases where the default name-hash algorithm has low collision rates and deltas are actually desired across objects with similar names but different full names, the --path-walk option can still take advantage of the default name hash approach. Here are the new performance details simulating a single push in an internal monorepo using a lot of paths that collide in the default name hash. We can see that --full-name-hash gets close to the --path-walk option's size. Test this tree -------------------------------------------------------------- 5313.2: thin pack 2.43(2.92+0.14) 5313.3: thin pack size 4.5M 5313.4: thin pack with --full-name-hash 0.31(0.49+0.12) 5313.5: thin pack size with --full-name-hash 15.5K 5313.6: thin pack with --path-walk 0.35(0.31+0.04) 5313.7: thin pack size with --path-walk 14.2K However, when simulating pushes on repositories that do not have issues with name-hash collisions, the --full-name-hash option presents a potential of worse delta calculations, such as this example using my local Git repository: Test this tree -------------------------------------------------------------- 5313.2: thin pack 0.03(0.01+0.01) 5313.3: thin pack size 475 5313.4: thin pack with --full-name-hash 0.02(0.01+0.01) 5313.5: thin pack size with --full-name-hash 14.8K 5313.6: thin pack with --path-walk 0.02(0.01+0.01) 5313.7: thin pack size with --path-walk 475 Note that the path-walk option found the same delta bases as the default options in this case. In the full repack case, the --full-name-hash option may be preferable because it interacts well with other advanced features, such as using bitmap indexes and tracking delta islands. Signed-off-by: Derrick Stolee --- builtin/pack-objects.c | 20 +++++++++++++++----- builtin/repack.c | 5 +++++ pack-objects.h | 20 ++++++++++++++++++++ t/perf/p5313-pack-objects.sh | 26 ++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 5 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index d510c095d64eb8..094d253fa8d8d3 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -270,6 +270,14 @@ struct configured_exclusion { static struct oidmap configured_exclusions; static struct oidset excluded_by_config; +static int use_full_name_hash; + +static inline uint32_t pack_name_hash_fn(const char *name) +{ + if (use_full_name_hash) + return pack_full_name_hash(name); + return pack_name_hash(name); +} /* * stats @@ -1674,7 +1682,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type, return 0; } - create_object_entry(oid, type, pack_name_hash(name), + create_object_entry(oid, type, pack_name_hash_fn(name), exclude, name && no_try_delta(name), found_pack, found_offset); return 1; @@ -1888,7 +1896,7 @@ static void add_preferred_base_object(const char *name) { struct pbase_tree *it; size_t cmplen; - unsigned hash = pack_name_hash(name); + unsigned hash = pack_name_hash_fn(name); if (!num_preferred_base || check_pbase_path(hash)) return; @@ -3405,7 +3413,7 @@ static void show_object_pack_hint(struct object *object, const char *name, * here using a now in order to perhaps improve the delta selection * process. */ - oe->hash = pack_name_hash(name); + oe->hash = pack_name_hash_fn(name); oe->no_try_delta = name && no_try_delta(name); stdin_packs_hints_nr++; @@ -3555,7 +3563,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type entry = packlist_find(&to_pack, oid); if (entry) { if (name) { - entry->hash = pack_name_hash(name); + entry->hash = pack_name_hash_fn(name); entry->no_try_delta = no_try_delta(name); } } else { @@ -3578,7 +3586,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type return; } - entry = create_object_entry(oid, type, pack_name_hash(name), + entry = create_object_entry(oid, type, pack_name_hash_fn(name), 0, name && no_try_delta(name), pack, offset); } @@ -4525,6 +4533,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) OPT_STRING_LIST(0, "uri-protocol", &uri_protocols, N_("protocol"), N_("exclude any configured uploadpack.blobpackfileuri with this protocol")), + OPT_BOOL(0, "full-name-hash", &use_full_name_hash, + N_("optimize delta compression across identical path names over time")), OPT_END(), }; diff --git a/builtin/repack.c b/builtin/repack.c index fe1454d32dd80a..575dfdab4e78dc 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -58,6 +58,7 @@ struct pack_objects_args { int quiet; int local; int path_walk; + int full_name_hash; struct list_objects_filter_options filter_options; }; @@ -291,6 +292,8 @@ static void prepare_pack_objects(struct child_process *cmd, strvec_pushf(&cmd->args, "--no-reuse-object"); if (args->path_walk) strvec_pushf(&cmd->args, "--path-walk"); + if (args->full_name_hash) + strvec_pushf(&cmd->args, "--full-name-hash"); if (args->local) strvec_push(&cmd->args, "--local"); if (args->quiet) @@ -1162,6 +1165,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix) N_("pass --no-reuse-object to git-pack-objects")), OPT_BOOL(0, "path-walk", &po_args.path_walk, N_("pass --path-walk to git-pack-objects")), + OPT_BOOL(0, "full-name-hash", &po_args.full_name_hash, + N_("pass --full-name-hash to git-pack-objects")), OPT_NEGBIT('n', NULL, &run_update_server_info, N_("do not run git-update-server-info"), 1), OPT__QUIET(&po_args.quiet, N_("be quiet")), diff --git a/pack-objects.h b/pack-objects.h index b9898a4e64b8b4..50097552d03f20 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -207,6 +207,26 @@ static inline uint32_t pack_name_hash(const char *name) return hash; } +static inline uint32_t pack_full_name_hash(const char *name) +{ + const uint32_t bigp = 1234572167U; + uint32_t c, hash = bigp; + + if (!name) + return 0; + + /* + * Just do the dumbest thing possible: add random multiples of a + * large prime number with a binary shift. Goal is not cryptographic, + * but generally uniformly distributed. + */ + while ((c = *name++) != 0) { + hash += c * bigp; + hash = (hash >> 5) | (hash << 27); + } + return hash; +} + static inline enum object_type oe_type(const struct object_entry *e) { return e->type_valid ? e->type_ : OBJ_BAD; diff --git a/t/perf/p5313-pack-objects.sh b/t/perf/p5313-pack-objects.sh index 48fc05bb6c6ce3..b3b7fff8abf3bf 100755 --- a/t/perf/p5313-pack-objects.sh +++ b/t/perf/p5313-pack-objects.sh @@ -28,6 +28,14 @@ test_size 'thin pack size' ' wc -c out +' + +test_size 'thin pack size with --full-name-hash' ' + wc -c out ' @@ -44,6 +52,14 @@ test_size 'big recent pack size' ' wc -c out +' + +test_size 'big recent pack size with --full-name-hash' ' + wc -c out ' @@ -62,6 +78,16 @@ test_size 'full repack size' ' sort -nr | head -n 1 ' +test_perf 'full repack with --full-name-hash' ' + git repack -adf --no-write-bitmap-index --full-name-hash +' + +test_size 'full repack size with --full-name-hash' ' + du -a .git/objects/pack | \ + awk "{ print \$1; }" | \ + sort -nr | head -n 1 +' + test_perf 'full repack with --path-walk' ' git repack -adf --no-write-bitmap-index --path-walk ' From 72191a0cba8acce3b3b7ba0c6d704198a2c86689 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 8 Sep 2024 19:30:57 -0400 Subject: [PATCH 2/4] test-name-hash: add helper to compute name-hash functions Using this tool, we can count how many distinct name-hash values exist within a list of paths. Examples include git ls-tree -r --name-only HEAD | \ test-tool name-hash | \ awk "{print \$1;}" | \ sort -ns | uniq | wc -l which outputs the number of distinct name-hash values that appear at HEAD. Or, the following which presents the resulting name-hash values of maximum multiplicity: git ls-tree -r --name-only HEAD | \ test-tool name-hash | \ awk "{print \$1;}" | \ sort -n | uniq -c | sort -nr | head -n 25 For an internal monorepo with around a quarter million paths at HEAD, the highest multiplicity for the standard name-hash function was 14,424 while the full name-hash algorithm had only seven hash values with any collision, with a maximum multiplicity of two. Signed-off-by: Derrick Stolee --- Makefile | 1 + t/helper/test-name-hash.c | 23 +++++++++++++++++++++++ t/helper/test-tool.c | 1 + t/helper/test-tool.h | 1 + 4 files changed, 26 insertions(+) create mode 100644 t/helper/test-name-hash.c diff --git a/Makefile b/Makefile index 076f7581541740..ef73d24a969efe 100644 --- a/Makefile +++ b/Makefile @@ -814,6 +814,7 @@ TEST_BUILTINS_OBJS += test-lazy-init-name-hash.o TEST_BUILTINS_OBJS += test-match-trees.o TEST_BUILTINS_OBJS += test-mergesort.o TEST_BUILTINS_OBJS += test-mktemp.o +TEST_BUILTINS_OBJS += test-name-hash.o TEST_BUILTINS_OBJS += test-oid-array.o TEST_BUILTINS_OBJS += test-online-cpus.o TEST_BUILTINS_OBJS += test-pack-mtimes.o diff --git a/t/helper/test-name-hash.c b/t/helper/test-name-hash.c new file mode 100644 index 00000000000000..c82ccd7cefd860 --- /dev/null +++ b/t/helper/test-name-hash.c @@ -0,0 +1,23 @@ +/* + * test-name-hash.c: Read a list of paths over stdin and report on their + * name-hash and full name-hash. + */ + +#include "test-tool.h" +#include "git-compat-util.h" +#include "pack-objects.h" +#include "strbuf.h" + +int cmd__name_hash(int argc, const char **argv) +{ + struct strbuf line = STRBUF_INIT; + + while (!strbuf_getline(&line, stdin)) { + uint32_t name_hash = pack_name_hash(line.buf); + uint32_t full_hash = pack_full_name_hash(line.buf); + + printf("%10"PRIu32"\t%10"PRIu32"\t%s\n", name_hash, full_hash, line.buf); + } + + return 0; +} diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c index af909330f0eab6..eeb10fae59cf8b 100644 --- a/t/helper/test-tool.c +++ b/t/helper/test-tool.c @@ -44,6 +44,7 @@ static struct test_cmd cmds[] = { { "match-trees", cmd__match_trees }, { "mergesort", cmd__mergesort }, { "mktemp", cmd__mktemp }, + { "name-hash", cmd__name_hash }, { "oid-array", cmd__oid_array }, { "online-cpus", cmd__online_cpus }, { "pack-mtimes", cmd__pack_mtimes }, diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h index 6b6a514323ff10..bd5f26d8a7c3a2 100644 --- a/t/helper/test-tool.h +++ b/t/helper/test-tool.h @@ -38,6 +38,7 @@ int cmd__lazy_init_name_hash(int argc, const char **argv); int cmd__match_trees(int argc, const char **argv); int cmd__mergesort(int argc, const char **argv); int cmd__mktemp(int argc, const char **argv); +int cmd__name_hash(int argc, const char **argv); int cmd__online_cpus(int argc, const char **argv); int cmd__pack_mtimes(int argc, const char **argv); int cmd__parse_options(int argc, const char **argv); From 5039f0348dc67ee31484002fa65ecd07f1a033b4 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 8 Sep 2024 21:04:52 -0400 Subject: [PATCH 3/4] p5314: add a size test for name-hash collisions This test helps inform someone as to the behavior of the name-hash algorithms for their repo based on the paths at HEAD. For example, the microsoft/fluentui repo had these statistics at time of committing: Test this tree ----------------------------------------------------------------- 5314.1: paths at head 19.6K 5314.2: number of distinct name-hashes 8.2K 5314.3: number of distinct full-name-hashes 19.6K 5314.4: maximum multiplicity of name-hashes 279 5314.5: maximum multiplicity of fullname-hashes 1 That demonstrates that of the nearly twenty thousand path names, they are assigned around eight thousand distinct values. 279 paths are assigned to a single value, leading the packing algorithm to sort objects from those paths together, by size. In this repository, no collisions occur for the full-name-hash algorithm. In a more extreme example, an internal monorepo had a much worse collision rate: Test this tree ----------------------------------------------------------------- 5314.1: paths at head 221.6K 5314.2: number of distinct name-hashes 72.0K 5314.3: number of distinct full-name-hashes 221.6K 5314.4: maximum multiplicity of name-hashes 14.4K 5314.5: maximum multiplicity of fullname-hashes 2 Even in this repository with many more paths at HEAD, the collision rate was low and the maximum number of paths being grouped into a single bucket by the full-path-name algorithm was two. Signed-off-by: Derrick Stolee --- t/perf/p5314-name-hash.sh | 41 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100755 t/perf/p5314-name-hash.sh diff --git a/t/perf/p5314-name-hash.sh b/t/perf/p5314-name-hash.sh new file mode 100755 index 00000000000000..9fe26612facc1d --- /dev/null +++ b/t/perf/p5314-name-hash.sh @@ -0,0 +1,41 @@ +#!/bin/sh + +test_description='Tests pack performance using bitmaps' +. ./perf-lib.sh + +GIT_TEST_PASSING_SANITIZE_LEAK=0 +export GIT_TEST_PASSING_SANITIZE_LEAK + +test_perf_large_repo + +test_size 'paths at head' ' + git ls-tree -r --name-only HEAD >path-list && + wc -l name-hashes && + cat name-hashes | awk "{ print \$1; }" | sort -n | uniq -c >name-hash-count && + wc -l full-name-hash-count && + wc -l Date: Wed, 28 Aug 2024 12:06:43 -0400 Subject: [PATCH 4/4] pack-objects: output debug info about deltas In order to debug what is going on during delta calculations, add a --debug-file= option to 'git pack-objects'. This leads to sending a JSON-formatted description of the delta information to that file. Signed-off-by: Derrick Stolee --- builtin/pack-objects.c | 69 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 094d253fa8d8d3..080b3f98bef32f 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -50,6 +50,9 @@ */ static struct packing_data to_pack; +static FILE *delta_file; +static int delta_file_nr; + static inline struct object_entry *oe_delta( const struct packing_data *pack, const struct object_entry *e) @@ -516,6 +519,14 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent hdrlen = encode_in_pack_object_header(header, sizeof(header), type, size); + if (delta_file) { + if (delta_file_nr++) + fprintf(delta_file, ",\n"); + fprintf(delta_file, "\t\t{\n"); + fprintf(delta_file, "\t\t\t\"oid\" : \"%s\",\n", oid_to_hex(&entry->idx.oid)); + fprintf(delta_file, "\t\t\t\"size\" : %"PRIuMAX",\n", datalen); + } + if (type == OBJ_OFS_DELTA) { /* * Deltas with relative base contain an additional @@ -536,6 +547,11 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent hashwrite(f, header, hdrlen); hashwrite(f, dheader + pos, sizeof(dheader) - pos); hdrlen += sizeof(dheader) - pos; + if (delta_file) { + fprintf(delta_file, "\t\t\t\"delta_type\" : \"OFS\",\n"); + fprintf(delta_file, "\t\t\t\"offset\" : %"PRIuMAX",\n", ofs); + fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid)); + } } else if (type == OBJ_REF_DELTA) { /* * Deltas with a base reference contain @@ -550,6 +566,10 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent hashwrite(f, header, hdrlen); hashwrite(f, DELTA(entry)->idx.oid.hash, hashsz); hdrlen += hashsz; + if (delta_file) { + fprintf(delta_file, "\t\t\t\"delta_type\" : \"REF\",\n"); + fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid)); + } } else { if (limit && hdrlen + datalen + hashsz >= limit) { if (st) @@ -559,6 +579,10 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent } hashwrite(f, header, hdrlen); } + + if (delta_file) + fprintf(delta_file, "\t\t\t\"reused\" : false\n\t\t}"); + if (st) { datalen = write_large_blob_data(st, f, &entry->idx.oid); close_istream(st); @@ -619,6 +643,14 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, return write_no_reuse_object(f, entry, limit, usable_delta); } + if (delta_file) { + if (delta_file_nr++) + fprintf(delta_file, ",\n"); + fprintf(delta_file, "\t\t{\n"); + fprintf(delta_file, "\t\t\t\"oid\" : \"%s\",\n", oid_to_hex(&entry->idx.oid)); + fprintf(delta_file, "\t\t\t\"size\" : %"PRIuMAX",\n", entry_size); + } + if (type == OBJ_OFS_DELTA) { off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset; unsigned pos = sizeof(dheader) - 1; @@ -633,6 +665,12 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, hashwrite(f, dheader + pos, sizeof(dheader) - pos); hdrlen += sizeof(dheader) - pos; reused_delta++; + + if (delta_file) { + fprintf(delta_file, "\t\t\t\"delta_type\" : \"OFS\",\n"); + fprintf(delta_file, "\t\t\t\"offset\" : %"PRIuMAX",\n", ofs); + fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid)); + } } else if (type == OBJ_REF_DELTA) { if (limit && hdrlen + hashsz + datalen + hashsz >= limit) { unuse_pack(&w_curs); @@ -642,6 +680,10 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, hashwrite(f, DELTA(entry)->idx.oid.hash, hashsz); hdrlen += hashsz; reused_delta++; + if (delta_file) { + fprintf(delta_file, "\t\t\t\"delta_type\" : \"REF\",\n"); + fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid)); + } } else { if (limit && hdrlen + datalen + hashsz >= limit) { unuse_pack(&w_curs); @@ -652,6 +694,10 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, copy_pack_data(f, p, &w_curs, offset, datalen); unuse_pack(&w_curs); reused++; + + if (delta_file) + fprintf(delta_file, "\t\t\t\"reused\" : true\n\t\t}"); + return hdrlen + datalen; } @@ -1264,6 +1310,11 @@ static void write_pack_file(void) ALLOC_ARRAY(written_list, to_pack.nr_objects); write_order = compute_write_order(); + if (delta_file) { + fprintf(delta_file, "{\n\t\"num_objects\" : %"PRIu32",\n", to_pack.nr_objects); + fprintf(delta_file, "\t\"objects\" : [\n"); + } + do { unsigned char hash[GIT_MAX_RAWSZ]; char *pack_tmp_name = NULL; @@ -1412,6 +1463,9 @@ static void write_pack_file(void) written, nr_result); trace2_data_intmax("pack-objects", the_repository, "write_pack_file/wrote", nr_result); + + if (delta_file) + fprintf(delta_file, "\n\t]\n}"); } static int no_try_delta(const char *path) @@ -4429,6 +4483,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) struct string_list keep_pack_list = STRING_LIST_INIT_NODUP; struct list_objects_filter_options filter_options = LIST_OBJECTS_FILTER_INIT; + const char *delta_file_name = NULL; struct option pack_objects_options[] = { OPT_CALLBACK_F('q', "quiet", &progress, NULL, @@ -4535,6 +4590,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) N_("exclude any configured uploadpack.blobpackfileuri with this protocol")), OPT_BOOL(0, "full-name-hash", &use_full_name_hash, N_("optimize delta compression across identical path names over time")), + OPT_STRING(0, "delta-file", &delta_file_name, + N_("filename"), + N_("output delta compression details to the given file")), OPT_END(), }; @@ -4572,6 +4630,12 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (pack_to_stdout != !base_name || argc) usage_with_options(pack_usage, pack_objects_options); + if (delta_file_name) { + delta_file = fopen(delta_file_name, "w"); + if (!delta_file) + die_errno("failed to open '%s'", delta_file_name); + trace2_printf("opened '%s' for writing deltas", delta_file_name); + } if (depth < 0) depth = 0; if (depth >= (1 << OE_DEPTH_BITS)) { @@ -4795,5 +4859,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) list_objects_filter_release(&filter_options); strvec_clear(&rp); + if (delta_file) { + fflush(delta_file); + fclose(delta_file); + } + return 0; }