Skip to content

Commit

Permalink
Merge branch 'path-walk'
Browse files Browse the repository at this point in the history
This is a simplified version of the patch series that was split out by
itself earlier for full review. This was split out on its own partly
because it doesn't actually use the path-walk API. This has benefits and
drawbacks, but it seems like a quick win for many scenarios.
  • Loading branch information
dscho committed Sep 15, 2024
2 parents f172042 + 88fee5b commit 6161ea8
Show file tree
Hide file tree
Showing 9 changed files with 202 additions and 5 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -814,6 +814,7 @@ TEST_BUILTINS_OBJS += test-lazy-init-name-hash.o
TEST_BUILTINS_OBJS += test-match-trees.o
TEST_BUILTINS_OBJS += test-mergesort.o
TEST_BUILTINS_OBJS += test-mktemp.o
TEST_BUILTINS_OBJS += test-name-hash.o
TEST_BUILTINS_OBJS += test-oid-array.o
TEST_BUILTINS_OBJS += test-online-cpus.o
TEST_BUILTINS_OBJS += test-pack-mtimes.o
Expand Down
89 changes: 84 additions & 5 deletions builtin/pack-objects.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@
*/
static struct packing_data to_pack;

static FILE *delta_file;
static int delta_file_nr;

static inline struct object_entry *oe_delta(
const struct packing_data *pack,
const struct object_entry *e)
Expand Down Expand Up @@ -270,6 +273,14 @@ struct configured_exclusion {
static struct oidmap configured_exclusions;

static struct oidset excluded_by_config;
static int use_full_name_hash;

/*
 * Compute the name-hash for `name` with whichever hash function is
 * currently selected: pack_full_name_hash() when the file-scope flag
 * `use_full_name_hash` is non-zero, pack_name_hash() otherwise.
 *
 * `name` is forwarded untouched to the chosen function.
 */
static inline uint32_t pack_name_hash_fn(const char *name)
{
	return use_full_name_hash ? pack_full_name_hash(name)
				  : pack_name_hash(name);
}

/*
* stats
Expand Down Expand Up @@ -508,6 +519,14 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
hdrlen = encode_in_pack_object_header(header, sizeof(header),
type, size);

if (delta_file) {
if (delta_file_nr++)
fprintf(delta_file, ",\n");
fprintf(delta_file, "\t\t{\n");
fprintf(delta_file, "\t\t\t\"oid\" : \"%s\",\n", oid_to_hex(&entry->idx.oid));
fprintf(delta_file, "\t\t\t\"size\" : %"PRIuMAX",\n", datalen);
}

if (type == OBJ_OFS_DELTA) {
/*
* Deltas with relative base contain an additional
Expand All @@ -528,6 +547,11 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
hashwrite(f, header, hdrlen);
hashwrite(f, dheader + pos, sizeof(dheader) - pos);
hdrlen += sizeof(dheader) - pos;
if (delta_file) {
fprintf(delta_file, "\t\t\t\"delta_type\" : \"OFS\",\n");
fprintf(delta_file, "\t\t\t\"offset\" : %"PRIuMAX",\n", ofs);
fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid));
}
} else if (type == OBJ_REF_DELTA) {
/*
* Deltas with a base reference contain
Expand All @@ -542,6 +566,10 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
hashwrite(f, header, hdrlen);
hashwrite(f, DELTA(entry)->idx.oid.hash, hashsz);
hdrlen += hashsz;
if (delta_file) {
fprintf(delta_file, "\t\t\t\"delta_type\" : \"REF\",\n");
fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid));
}
} else {
if (limit && hdrlen + datalen + hashsz >= limit) {
if (st)
Expand All @@ -551,6 +579,10 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
}
hashwrite(f, header, hdrlen);
}

if (delta_file)
fprintf(delta_file, "\t\t\t\"reused\" : false\n\t\t}");

if (st) {
datalen = write_large_blob_data(st, f, &entry->idx.oid);
close_istream(st);
Expand Down Expand Up @@ -611,6 +643,14 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
return write_no_reuse_object(f, entry, limit, usable_delta);
}

if (delta_file) {
if (delta_file_nr++)
fprintf(delta_file, ",\n");
fprintf(delta_file, "\t\t{\n");
fprintf(delta_file, "\t\t\t\"oid\" : \"%s\",\n", oid_to_hex(&entry->idx.oid));
fprintf(delta_file, "\t\t\t\"size\" : %"PRIuMAX",\n", entry_size);
}

if (type == OBJ_OFS_DELTA) {
off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
unsigned pos = sizeof(dheader) - 1;
Expand All @@ -625,6 +665,12 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
hashwrite(f, dheader + pos, sizeof(dheader) - pos);
hdrlen += sizeof(dheader) - pos;
reused_delta++;

if (delta_file) {
fprintf(delta_file, "\t\t\t\"delta_type\" : \"OFS\",\n");
fprintf(delta_file, "\t\t\t\"offset\" : %"PRIuMAX",\n", ofs);
fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid));
}
} else if (type == OBJ_REF_DELTA) {
if (limit && hdrlen + hashsz + datalen + hashsz >= limit) {
unuse_pack(&w_curs);
Expand All @@ -634,6 +680,10 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
hashwrite(f, DELTA(entry)->idx.oid.hash, hashsz);
hdrlen += hashsz;
reused_delta++;
if (delta_file) {
fprintf(delta_file, "\t\t\t\"delta_type\" : \"REF\",\n");
fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid));
}
} else {
if (limit && hdrlen + datalen + hashsz >= limit) {
unuse_pack(&w_curs);
Expand All @@ -644,6 +694,10 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
copy_pack_data(f, p, &w_curs, offset, datalen);
unuse_pack(&w_curs);
reused++;

if (delta_file)
fprintf(delta_file, "\t\t\t\"reused\" : true\n\t\t}");

return hdrlen + datalen;
}

Expand Down Expand Up @@ -1256,6 +1310,11 @@ static void write_pack_file(void)
ALLOC_ARRAY(written_list, to_pack.nr_objects);
write_order = compute_write_order();

if (delta_file) {
fprintf(delta_file, "{\n\t\"num_objects\" : %"PRIu32",\n", to_pack.nr_objects);
fprintf(delta_file, "\t\"objects\" : [\n");
}

do {
unsigned char hash[GIT_MAX_RAWSZ];
char *pack_tmp_name = NULL;
Expand Down Expand Up @@ -1404,6 +1463,9 @@ static void write_pack_file(void)
written, nr_result);
trace2_data_intmax("pack-objects", the_repository,
"write_pack_file/wrote", nr_result);

if (delta_file)
fprintf(delta_file, "\n\t]\n}");
}

static int no_try_delta(const char *path)
Expand Down Expand Up @@ -1674,7 +1736,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
return 0;
}

create_object_entry(oid, type, pack_name_hash(name),
create_object_entry(oid, type, pack_name_hash_fn(name),
exclude, name && no_try_delta(name),
found_pack, found_offset);
return 1;
Expand Down Expand Up @@ -1888,7 +1950,7 @@ static void add_preferred_base_object(const char *name)
{
struct pbase_tree *it;
size_t cmplen;
unsigned hash = pack_name_hash(name);
unsigned hash = pack_name_hash_fn(name);

if (!num_preferred_base || check_pbase_path(hash))
return;
Expand Down Expand Up @@ -3405,7 +3467,7 @@ static void show_object_pack_hint(struct object *object, const char *name,
* here using a now in order to perhaps improve the delta selection
* process.
*/
oe->hash = pack_name_hash(name);
oe->hash = pack_name_hash_fn(name);
oe->no_try_delta = name && no_try_delta(name);

stdin_packs_hints_nr++;
Expand Down Expand Up @@ -3555,7 +3617,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
entry = packlist_find(&to_pack, oid);
if (entry) {
if (name) {
entry->hash = pack_name_hash(name);
entry->hash = pack_name_hash_fn(name);
entry->no_try_delta = no_try_delta(name);
}
} else {
Expand All @@ -3578,7 +3640,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
return;
}

entry = create_object_entry(oid, type, pack_name_hash(name),
entry = create_object_entry(oid, type, pack_name_hash_fn(name),
0, name && no_try_delta(name),
pack, offset);
}
Expand Down Expand Up @@ -4421,6 +4483,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
struct string_list keep_pack_list = STRING_LIST_INIT_NODUP;
struct list_objects_filter_options filter_options =
LIST_OBJECTS_FILTER_INIT;
const char *delta_file_name = NULL;

struct option pack_objects_options[] = {
OPT_CALLBACK_F('q', "quiet", &progress, NULL,
Expand Down Expand Up @@ -4525,6 +4588,11 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
OPT_STRING_LIST(0, "uri-protocol", &uri_protocols,
N_("protocol"),
N_("exclude any configured uploadpack.blobpackfileuri with this protocol")),
OPT_BOOL(0, "full-name-hash", &use_full_name_hash,
N_("optimize delta compression across identical path names over time")),
OPT_STRING(0, "delta-file", &delta_file_name,
N_("filename"),
N_("output delta compression details to the given file")),
OPT_END(),
};

Expand Down Expand Up @@ -4562,6 +4630,12 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
if (pack_to_stdout != !base_name || argc)
usage_with_options(pack_usage, pack_objects_options);

if (delta_file_name) {
delta_file = fopen(delta_file_name, "w");
if (!delta_file)
die_errno("failed to open '%s'", delta_file_name);
trace2_printf("opened '%s' for writing deltas", delta_file_name);
}
if (depth < 0)
depth = 0;
if (depth >= (1 << OE_DEPTH_BITS)) {
Expand Down Expand Up @@ -4785,5 +4859,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
list_objects_filter_release(&filter_options);
strvec_clear(&rp);

if (delta_file) {
fflush(delta_file);
fclose(delta_file);
}

return 0;
}
5 changes: 5 additions & 0 deletions builtin/repack.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ struct pack_objects_args {
int quiet;
int local;
int path_walk;
int full_name_hash;
struct list_objects_filter_options filter_options;
};

Expand Down Expand Up @@ -291,6 +292,8 @@ static void prepare_pack_objects(struct child_process *cmd,
strvec_pushf(&cmd->args, "--no-reuse-object");
if (args->path_walk)
strvec_pushf(&cmd->args, "--path-walk");
if (args->full_name_hash)
strvec_pushf(&cmd->args, "--full-name-hash");
if (args->local)
strvec_push(&cmd->args, "--local");
if (args->quiet)
Expand Down Expand Up @@ -1162,6 +1165,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
N_("pass --no-reuse-object to git-pack-objects")),
OPT_BOOL(0, "path-walk", &po_args.path_walk,
N_("pass --path-walk to git-pack-objects")),
OPT_BOOL(0, "full-name-hash", &po_args.full_name_hash,
N_("pass --full-name-hash to git-pack-objects")),
OPT_NEGBIT('n', NULL, &run_update_server_info,
N_("do not run git-update-server-info"), 1),
OPT__QUIET(&po_args.quiet, N_("be quiet")),
Expand Down
20 changes: 20 additions & 0 deletions pack-objects.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,26 @@ static inline uint32_t pack_name_hash(const char *name)
return hash;
}

/*
 * Hash every byte of the NUL-terminated path `name` into a 32-bit value.
 *
 * Returns 0 when `name` is NULL; an empty string hashes to the seed
 * constant. The result is not cryptographic; it only needs to spread
 * different paths across the 32-bit range.
 *
 * Bytes are read as `unsigned char` so that paths containing bytes
 * >= 0x80 hash to the same value regardless of whether plain `char`
 * is signed on the target platform.
 */
static inline uint32_t pack_full_name_hash(const char *name)
{
	const uint32_t bigp = 1234572167U;
	uint32_t c, hash = bigp;

	if (!name)
		return 0;

	/*
	 * Just do the dumbest thing possible: for each byte, add that
	 * byte's multiple of a large constant, then rotate the hash
	 * right by five bits. Unsigned arithmetic wraps modulo 2^32,
	 * which is intended here.
	 */
	while ((c = (unsigned char)*name++) != 0) {
		hash += c * bigp;
		hash = (hash >> 5) | (hash << 27);
	}
	return hash;
}

static inline enum object_type oe_type(const struct object_entry *e)
{
return e->type_valid ? e->type_ : OBJ_BAD;
Expand Down
23 changes: 23 additions & 0 deletions t/helper/test-name-hash.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*
* test-name-hash.c: Read a list of paths over stdin and report on their
* name-hash and full name-hash.
*/

#include "test-tool.h"
#include "git-compat-util.h"
#include "pack-objects.h"
#include "strbuf.h"

/*
 * Test helper: read paths from stdin, one per line, and for each print
 * its pack_name_hash() and pack_full_name_hash() values followed by the
 * path itself, separated by tabs (hashes are right-aligned to width 10).
 *
 * `argc` and `argv` are not consulted. Always returns 0.
 */
int cmd__name_hash(int argc, const char **argv)
{
	struct strbuf line = STRBUF_INIT;

	while (!strbuf_getline(&line, stdin)) {
		uint32_t name_hash = pack_name_hash(line.buf);
		uint32_t full_hash = pack_full_name_hash(line.buf);

		printf("%10"PRIu32"\t%10"PRIu32"\t%s\n", name_hash, full_hash, line.buf);
	}

	/* Free the line buffer so leak-checking runs stay clean. */
	strbuf_release(&line);
	return 0;
}
1 change: 1 addition & 0 deletions t/helper/test-tool.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ static struct test_cmd cmds[] = {
{ "match-trees", cmd__match_trees },
{ "mergesort", cmd__mergesort },
{ "mktemp", cmd__mktemp },
{ "name-hash", cmd__name_hash },
{ "oid-array", cmd__oid_array },
{ "online-cpus", cmd__online_cpus },
{ "pack-mtimes", cmd__pack_mtimes },
Expand Down
1 change: 1 addition & 0 deletions t/helper/test-tool.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ int cmd__lazy_init_name_hash(int argc, const char **argv);
int cmd__match_trees(int argc, const char **argv);
int cmd__mergesort(int argc, const char **argv);
int cmd__mktemp(int argc, const char **argv);
int cmd__name_hash(int argc, const char **argv);
int cmd__online_cpus(int argc, const char **argv);
int cmd__pack_mtimes(int argc, const char **argv);
int cmd__parse_options(int argc, const char **argv);
Expand Down
26 changes: 26 additions & 0 deletions t/perf/p5313-pack-objects.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ test_size 'thin pack size' '
wc -c <out
'

test_perf 'thin pack with --full-name-hash' '
git pack-objects --thin --stdout --revs --sparse --full-name-hash <in-thin >out
'

test_size 'thin pack size with --full-name-hash' '
wc -c <out
'

test_perf 'thin pack with --path-walk' '
git pack-objects --thin --stdout --revs --sparse --path-walk <in-thin >out
'
Expand All @@ -44,6 +52,14 @@ test_size 'big recent pack size' '
wc -c <out
'

test_perf 'big recent pack with --full-name-hash' '
git pack-objects --stdout --revs --full-name-hash <in-big-recent >out
'

test_size 'big recent pack size with --full-name-hash' '
wc -c <out
'

test_perf 'big recent pack with --path-walk' '
git pack-objects --stdout --revs --path-walk <in-big-recent >out
'
Expand All @@ -62,6 +78,16 @@ test_size 'full repack size' '
sort -nr | head -n 1
'

test_perf 'full repack with --full-name-hash' '
git repack -adf --no-write-bitmap-index --full-name-hash
'

test_size 'full repack size with --full-name-hash' '
du -a .git/objects/pack | \
awk "{ print \$1; }" | \
sort -nr | head -n 1
'

test_perf 'full repack with --path-walk' '
git repack -adf --no-write-bitmap-index --path-walk
'
Expand Down
Loading

0 comments on commit 6161ea8

Please sign in to comment.