Skip to content

Commit

Permalink
Merge pull request facebook#4288 from facebook/stronger_patchfrom
Browse files Browse the repository at this point in the history
Improve compression ratio of the --patch-from mode
  • Loading branch information
Cyan4973 authored Feb 10, 2025
2 parents 5b8575a + d2c562b commit d84d70b
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 52 deletions.
34 changes: 24 additions & 10 deletions lib/compress/zstd_ldm.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include "zstd_double_fast.h" /* ZSTD_fillDoubleHashTable() */
#include "zstd_ldm_geartab.h"

#define LDM_BUCKET_SIZE_LOG 3
#define LDM_BUCKET_SIZE_LOG 4
#define LDM_MIN_MATCH_LENGTH 64
#define LDM_HASH_RLOG 7

Expand Down Expand Up @@ -133,21 +133,35 @@ static size_t ZSTD_ldm_gear_feed(ldmRollingHashState_t* state,
}

void ZSTD_ldm_adjustParameters(ldmParams_t* params,
ZSTD_compressionParameters const* cParams)
const ZSTD_compressionParameters* cParams)
{
params->windowLog = cParams->windowLog;
ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX);
DEBUGLOG(4, "ZSTD_ldm_adjustParameters");
if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG;
if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH;
if (params->hashRateLog == 0) {
if (params->hashLog > 0) {
/* if params->hashLog is set, derive hashRateLog from it */
assert(params->hashLog <= ZSTD_HASHLOG_MAX);
if (params->windowLog > params->hashLog) {
params->hashRateLog = params->windowLog - params->hashLog;
}
} else {
assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9);
/* mapping from [fast, rate7] to [btultra2, rate4] */
params->hashRateLog = 7 - (cParams->strategy/3);
}
}
if (params->hashLog == 0) {
params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG);
assert(params->hashLog <= ZSTD_HASHLOG_MAX);
params->hashLog = BOUNDED(ZSTD_HASHLOG_MIN, params->windowLog - params->hashRateLog, ZSTD_HASHLOG_MAX);
}
if (params->hashRateLog == 0) {
params->hashRateLog = params->windowLog < params->hashLog
? 0
: params->windowLog - params->hashLog;
if (params->minMatchLength == 0) {
params->minMatchLength = LDM_MIN_MATCH_LENGTH;
if (cParams->strategy >= ZSTD_btultra)
params->minMatchLength /= 2;
}
if (params->bucketSizeLog==0) {
assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9);
params->bucketSizeLog = BOUNDED(LDM_BUCKET_SIZE_LOG, (U32)cParams->strategy, ZSTD_LDM_BUCKETSIZELOG_MAX);
}
params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog);
}
Expand Down
11 changes: 6 additions & 5 deletions programs/fileio.c
Original file line number Diff line number Diff line change
Expand Up @@ -1100,11 +1100,12 @@ static void FIO_adjustParamsForPatchFromMode(FIO_prefs_t* const prefs,
FIO_setLdmFlag(prefs, 1);
}
if (cParams.strategy >= ZSTD_btopt) {
DISPLAYLEVEL(3, "[Optimal parser notes] Consider the following to improve patch size at the cost of speed:\n");
DISPLAYLEVEL(3, "- Use --single-thread mode in the zstd cli\n");
DISPLAYLEVEL(3, "- Set a larger targetLength (e.g. --zstd=targetLength=4096)\n");
DISPLAYLEVEL(3, "- Set a larger chainLog (e.g. --zstd=chainLog=%u)\n", ZSTD_CHAINLOG_MAX);
DISPLAYLEVEL(3, "Also consider playing around with searchLog and hashLog\n");
DISPLAYLEVEL(4, "[Optimal parser notes] Consider the following to improve patch size at the cost of speed:\n");
DISPLAYLEVEL(4, "- Set a larger targetLength (e.g. --zstd=targetLength=4096)\n");
DISPLAYLEVEL(4, "- Set a larger chainLog (e.g. --zstd=chainLog=%u)\n", ZSTD_CHAINLOG_MAX);
DISPLAYLEVEL(4, "- Set a larger LDM hashLog (e.g. --zstd=ldmHashLog=%u)\n", ZSTD_LDM_HASHLOG_MAX);
DISPLAYLEVEL(4, "- Set a smaller LDM rateLog (e.g. --zstd=ldmHashRateLog=%u)\n", ZSTD_LDM_HASHRATELOG_MIN);
DISPLAYLEVEL(4, "Also consider playing around with searchLog and hashLog\n");
}
}

Expand Down
27 changes: 14 additions & 13 deletions programs/zstd.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,17 @@ The list of available _options_:
Value 0 is special and means "default": _ovlog_ is automatically determined by `zstd`.
In that case, _ovlog_ will range from 6 to 9, depending on the selected _strat_.

- `ldmHashRateLog`=_lhrlog_, `lhrlog`=_lhrlog_:
Specify the frequency of inserting entries into the long distance matching
hash table.

This option is ignored unless long distance matching is enabled.

Larger values will improve compression speed. Deviating far from the
default value will likely result in a decrease in compression ratio.

The default value varies between 4 and 7, depending on `strategy`.

- `ldmHashLog`=_lhlog_, `lhlog`=_lhlog_:
Specify the maximum size for a hash table used for long distance matching.

Expand All @@ -463,7 +474,7 @@ The list of available _options_:
Bigger hash tables usually improve compression ratio at the expense of more
memory during compression and a decrease in compression speed.

The minimum _lhlog_ is 6 and the maximum is 30 (default: 20).
The minimum _lhlog_ is 6 and the maximum is 30 (default: `windowLog - ldmHashRateLog`).

- `ldmMinMatch`=_lmml_, `lmml`=_lmml_:
Specify the minimum searched length of a match for long distance matching.
Expand All @@ -472,7 +483,7 @@ The list of available _options_:

Both larger values and very small values usually decrease compression ratio.

The minimum _lmml_ is 4 and the maximum is 4096 (default: 64).
The minimum _lmml_ is 4 and the maximum is 4096 (default: 64, or 32 when `strategy` is `btultra` or stronger).

- `ldmBucketSizeLog`=_lblog_, `lblog`=_lblog_:
Specify the size of each bucket for the hash table used for long distance
Expand All @@ -483,18 +494,8 @@ The list of available _options_:
Larger bucket sizes improve collision resolution but decrease compression
speed.

The minimum _lblog_ is 1 and the maximum is 8 (default: 3).

- `ldmHashRateLog`=_lhrlog_, `lhrlog`=_lhrlog_:
Specify the frequency of inserting entries into the long distance matching
hash table.

This option is ignored unless long distance matching is enabled.

Larger values will improve compression speed. Deviating far from the
default value will likely result in a decrease in compression ratio.
The minimum _lblog_ is 1 and the maximum is 8 (default: 4 to 8, depending on `strategy`).

The default value is `wlog - lhlog`.

### Example
The following parameters set advanced compression options to something
Expand Down
Loading

0 comments on commit d84d70b

Please sign in to comment.