Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

loosen version upgrade checks to allow for lts version upgrades #31079

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions ci/nightly/pipeline.template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -888,6 +888,18 @@ steps:
composition: platform-checks
args: [--scenario=UpgradeEntireMz, "--seed=$BUILDKITE_JOB_ID"]

- id: checks-lts-upgrade
label: "Checks LTS upgrade, whole-Mz restart"
depends_on: build-aarch64
timeout_in_minutes: 180
parallelism: 2
agents:
queue: hetzner-aarch64-16cpu-32gb
plugins:
- ./ci/plugins/mzcompose:
composition: platform-checks
args: [--scenario=UpgradeEntireMzFromLatestLTS, "--seed=$BUILDKITE_JOB_ID"]

- id: checks-preflight-check-rollback
label: "Checks preflight-check and roll back upgrade"
depends_on: build-aarch64
Expand Down
36 changes: 35 additions & 1 deletion misc/python/materialize/checks/scenarios_upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from materialize.checks.scenarios import Scenario
from materialize.mz_version import MzVersion
from materialize.mzcompose.services.materialized import LEADER_STATUS_HEALTHCHECK
from materialize.version_list import get_published_minor_mz_versions
from materialize.version_list import LTS_VERSIONS, get_published_minor_mz_versions

# late initialization
_minor_versions: list[MzVersion] | None = None
Expand Down Expand Up @@ -83,6 +83,40 @@ def start_mz_read_only(
)


class UpgradeEntireMzFromLatestLTS(Scenario):
"""Upgrade the entire Mz instance from the last LTS version without any intermediate steps. This makes sure our LTS releases for self-managed Materialize stay upgradable."""

def base_version(self) -> MzVersion:
return LTS_VERSIONS[-1]

def actions(self) -> list[Action]:
print(f"Upgrading from tag {self.base_version()}")
return [
StartMz(
self,
tag=self.base_version(),
),
Initialize(self),
Manipulate(self, phase=1),
KillMz(
capture_logs=True
), # We always use True here otherwise docker-compose will lose the pre-upgrade logs
StartMz(
self,
tag=None,
),
Manipulate(self, phase=2),
Validate(self),
# A second restart while already on the new version
KillMz(capture_logs=True),
StartMz(
self,
tag=None,
),
Validate(self),
]


class UpgradeEntireMz(Scenario):
"""Upgrade the entire Mz instance from the last released version."""

Expand Down
5 changes: 5 additions & 0 deletions misc/python/materialize/version_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@

MZ_ROOT = Path(os.environ["MZ_ROOT"])

LTS_VERSIONS = [
MzVersion.parse_mz("v0.130.1"), # v25.1.0
# Put new versions at the bottom
]

# not released on Docker
INVALID_VERSIONS = {
MzVersion.parse_mz("v0.52.1"),
Expand Down
56 changes: 55 additions & 1 deletion src/persist-client/src/cfg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ use crate::operators::STORAGE_SOURCE_DECODE_FUEL;
use crate::project::OPTIMIZE_IGNORED_DATA_DECODE;
use crate::read::READER_LEASE_DURATION;

const LTS_VERSIONS: &[Version] = &[
// 25.1
Version::new(0, 130, 0),
Copy link
Contributor

@def- def- Jan 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the patch number matter here? (It shouldn't because we might have to bump it a few times)

];

/// The tunable knobs for persist.
///
/// Tuning inputs:
Expand Down Expand Up @@ -609,6 +614,10 @@ impl BlobKnobs for PersistConfig {
}
}

pub fn check_data_version(code_version: &Version, data_version: &Version) -> Result<(), String> {
check_data_version_with_lts_versions(code_version, data_version, LTS_VERSIONS)
}

// If persist gets some encoded ProtoState from the future (e.g. two versions of
// code are running simultaneously against the same shard), it might have a
// field that the current code doesn't know about. This would be silently
Expand All @@ -633,7 +642,51 @@ impl BlobKnobs for PersistConfig {
// data we read is going to be because we fetched it using a pointer stored in
// some persist state. If we can handle the state, we can handle the blobs it
// references, too.
pub fn check_data_version(code_version: &Version, data_version: &Version) -> Result<(), String> {
pub(crate) fn check_data_version_with_lts_versions(
code_version: &Version,
data_version: &Version,
lts_versions: &[Version],
) -> Result<(), String> {
// Allow upgrades specifically between consecutive LTS releases.
if data_version >= code_version {
for window in lts_versions.windows(2) {
if window[0]
== (Version {
patch: 0,
..code_version.clone()
})
&& data_version
<= (&Version {
patch: u64::MAX,
..window[1].clone()
})
{
eprintln!(
"{}-{} {} {}",
window[0], window[1], code_version, data_version,
);
return Ok(());
} else {
eprintln!(
"not {}-{} {} {}",
window[0], window[1], code_version, data_version,
);
}
}

if let Some(last) = lts_versions.last() {
if last
== (&Version {
patch: 0,
..code_version.clone()
})
{
eprintln!("{}- {} {}", last, code_version, data_version);
return Ok(());
}
}
}

// Allow one minor version of forward compatibility. We could avoid the
// clone with some nested comparisons of the semver fields, but this code
// isn't particularly performance sensitive and I find this impl easier to
Expand All @@ -649,6 +702,7 @@ pub fn check_data_version(code_version: &Version, data_version: &Version) -> Res
"{code_version} received persist state from the future {data_version}",
))
} else {
eprintln!("{} {}", code_version, data_version);
Ok(())
}
}
146 changes: 146 additions & 0 deletions src/persist-client/src/internal/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2059,6 +2059,152 @@ mod tests {
proptest!(|(state in any_state::<u64>(0..3))| testcase(state));
}

#[mz_ore::test]
fn check_data_versions_with_lts_versions() {
#[track_caller]
fn testcase(code: &str, data: &str, lts_versions: &[Version], expected: Result<(), ()>) {
let code = Version::parse(code).unwrap();
let data = Version::parse(data).unwrap();
let actual = cfg::check_data_version_with_lts_versions(&code, &data, lts_versions)
.map_err(|_| ());
assert_eq!(actual, expected);
}

let none = [];
let one = [Version::new(0, 130, 0)];
let two = [Version::new(0, 130, 0), Version::new(0, 140, 0)];
let three = [
Version::new(0, 130, 0),
Version::new(0, 140, 0),
Version::new(0, 150, 0),
];

testcase("0.130.0", "0.128.0", &none, Ok(()));
testcase("0.130.0", "0.129.0", &none, Ok(()));
testcase("0.130.0", "0.130.0", &none, Ok(()));
testcase("0.130.0", "0.130.1", &none, Ok(()));
testcase("0.130.1", "0.130.0", &none, Ok(()));
testcase("0.130.0", "0.131.0", &none, Ok(()));
testcase("0.130.0", "0.132.0", &none, Err(()));

testcase("0.129.0", "0.127.0", &none, Ok(()));
testcase("0.129.0", "0.128.0", &none, Ok(()));
testcase("0.129.0", "0.129.0", &none, Ok(()));
testcase("0.129.0", "0.129.1", &none, Ok(()));
testcase("0.129.1", "0.129.0", &none, Ok(()));
testcase("0.129.0", "0.130.0", &none, Ok(()));
testcase("0.129.0", "0.131.0", &none, Err(()));

testcase("0.130.0", "0.128.0", &one, Ok(()));
testcase("0.130.0", "0.129.0", &one, Ok(()));
testcase("0.130.0", "0.130.0", &one, Ok(()));
testcase("0.130.0", "0.130.1", &one, Ok(()));
testcase("0.130.1", "0.130.0", &one, Ok(()));
testcase("0.130.0", "0.131.0", &one, Ok(()));
testcase("0.130.0", "0.132.0", &one, Ok(()));

testcase("0.129.0", "0.127.0", &one, Ok(()));
testcase("0.129.0", "0.128.0", &one, Ok(()));
testcase("0.129.0", "0.129.0", &one, Ok(()));
testcase("0.129.0", "0.129.1", &one, Ok(()));
testcase("0.129.1", "0.129.0", &one, Ok(()));
testcase("0.129.0", "0.130.0", &one, Ok(()));
testcase("0.129.0", "0.131.0", &one, Err(()));

testcase("0.131.0", "0.129.0", &one, Ok(()));
testcase("0.131.0", "0.130.0", &one, Ok(()));
testcase("0.131.0", "0.131.0", &one, Ok(()));
testcase("0.131.0", "0.131.1", &one, Ok(()));
testcase("0.131.1", "0.131.0", &one, Ok(()));
testcase("0.131.0", "0.132.0", &one, Ok(()));
testcase("0.131.0", "0.133.0", &one, Err(()));

testcase("0.130.0", "0.128.0", &two, Ok(()));
testcase("0.130.0", "0.129.0", &two, Ok(()));
testcase("0.130.0", "0.130.0", &two, Ok(()));
testcase("0.130.0", "0.130.1", &two, Ok(()));
testcase("0.130.1", "0.130.0", &two, Ok(()));
testcase("0.130.0", "0.131.0", &two, Ok(()));
testcase("0.130.0", "0.132.0", &two, Ok(()));
testcase("0.130.0", "0.135.0", &two, Ok(()));
testcase("0.130.0", "0.138.0", &two, Ok(()));
testcase("0.130.0", "0.139.0", &two, Ok(()));
testcase("0.130.0", "0.140.0", &two, Ok(()));
testcase("0.130.9", "0.140.0", &two, Ok(()));
testcase("0.130.0", "0.140.1", &two, Ok(()));
testcase("0.130.3", "0.140.1", &two, Ok(()));
testcase("0.130.3", "0.140.9", &two, Ok(()));
testcase("0.130.0", "0.141.0", &two, Err(()));
testcase("0.129.0", "0.133.0", &two, Err(()));
testcase("0.129.0", "0.140.0", &two, Err(()));
testcase("0.131.0", "0.133.0", &two, Err(()));
testcase("0.131.0", "0.140.0", &two, Err(()));

testcase("0.130.0", "0.128.0", &three, Ok(()));
testcase("0.130.0", "0.129.0", &three, Ok(()));
testcase("0.130.0", "0.130.0", &three, Ok(()));
testcase("0.130.0", "0.130.1", &three, Ok(()));
testcase("0.130.1", "0.130.0", &three, Ok(()));
testcase("0.130.0", "0.131.0", &three, Ok(()));
testcase("0.130.0", "0.132.0", &three, Ok(()));
testcase("0.130.0", "0.135.0", &three, Ok(()));
testcase("0.130.0", "0.138.0", &three, Ok(()));
testcase("0.130.0", "0.139.0", &three, Ok(()));
testcase("0.130.0", "0.140.0", &three, Ok(()));
testcase("0.130.9", "0.140.0", &three, Ok(()));
testcase("0.130.0", "0.140.1", &three, Ok(()));
testcase("0.130.3", "0.140.1", &three, Ok(()));
testcase("0.130.3", "0.140.9", &three, Ok(()));
testcase("0.130.0", "0.141.0", &three, Err(()));
testcase("0.129.0", "0.133.0", &three, Err(()));
testcase("0.129.0", "0.140.0", &three, Err(()));
testcase("0.131.0", "0.133.0", &three, Err(()));
testcase("0.131.0", "0.140.0", &three, Err(()));
testcase("0.130.0", "0.150.0", &three, Err(()));

testcase("0.140.0", "0.138.0", &three, Ok(()));
testcase("0.140.0", "0.139.0", &three, Ok(()));
testcase("0.140.0", "0.140.0", &three, Ok(()));
testcase("0.140.0", "0.140.1", &three, Ok(()));
testcase("0.140.1", "0.140.0", &three, Ok(()));
testcase("0.140.0", "0.141.0", &three, Ok(()));
testcase("0.140.0", "0.142.0", &three, Ok(()));
testcase("0.140.0", "0.145.0", &three, Ok(()));
testcase("0.140.0", "0.148.0", &three, Ok(()));
testcase("0.140.0", "0.149.0", &three, Ok(()));
testcase("0.140.0", "0.150.0", &three, Ok(()));
testcase("0.140.9", "0.150.0", &three, Ok(()));
testcase("0.140.0", "0.150.1", &three, Ok(()));
testcase("0.140.3", "0.150.1", &three, Ok(()));
testcase("0.140.3", "0.150.9", &three, Ok(()));
testcase("0.140.0", "0.151.0", &three, Err(()));
testcase("0.139.0", "0.143.0", &three, Err(()));
testcase("0.139.0", "0.150.0", &three, Err(()));
testcase("0.141.0", "0.143.0", &three, Err(()));
testcase("0.141.0", "0.150.0", &three, Err(()));

testcase("0.150.0", "0.148.0", &three, Ok(()));
testcase("0.150.0", "0.149.0", &three, Ok(()));
testcase("0.150.0", "0.150.0", &three, Ok(()));
testcase("0.150.0", "0.150.1", &three, Ok(()));
testcase("0.150.1", "0.150.0", &three, Ok(()));
testcase("0.150.0", "0.151.0", &three, Ok(()));
testcase("0.150.0", "0.152.0", &three, Ok(()));
testcase("0.150.0", "0.155.0", &three, Ok(()));
testcase("0.150.0", "0.158.0", &three, Ok(()));
testcase("0.150.0", "0.159.0", &three, Ok(()));
testcase("0.150.0", "0.160.0", &three, Ok(()));
testcase("0.150.9", "0.160.0", &three, Ok(()));
testcase("0.150.0", "0.160.1", &three, Ok(()));
testcase("0.150.3", "0.160.1", &three, Ok(()));
testcase("0.150.3", "0.160.9", &three, Ok(()));
testcase("0.150.0", "0.161.0", &three, Ok(()));
testcase("0.149.0", "0.153.0", &three, Err(()));
testcase("0.149.0", "0.160.0", &three, Err(()));
testcase("0.151.0", "0.153.0", &three, Err(()));
testcase("0.151.0", "0.160.0", &three, Err(()));
}

#[mz_ore::test]
fn check_data_versions() {
#[track_caller]
Expand Down
13 changes: 12 additions & 1 deletion test/legacy-upgrade/mzcompose.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from materialize.mzcompose.services.testdrive import Testdrive
from materialize.mzcompose.services.zookeeper import Zookeeper
from materialize.version_list import (
LTS_VERSIONS,
VersionsFromDocs,
get_all_published_mz_versions,
get_published_minor_mz_versions,
Expand Down Expand Up @@ -131,6 +132,15 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None:
test_upgrade_from_version(
c, "current_source", priors=[], filter=args.filter, zero_downtime=False
)
# Direct upgrade from latest LTS version without any inbetween versions
test_upgrade_from_version(
c,
f"{LTS_VERSIONS[-1]}",
priors=[],
filter=args.filter,
zero_downtime=False,
lts_upgrade=True,
)


def get_all_and_latest_two_minor_mz_versions(
Expand All @@ -152,6 +162,7 @@ def test_upgrade_from_version(
priors: list[MzVersion],
filter: str,
zero_downtime: bool,
lts_upgrade: bool = False,
) -> None:
print(
f"+++ Testing {'0dt upgrade' if zero_downtime else 'regular upgrade'} from Materialize {from_version} to current_source."
Expand Down Expand Up @@ -241,7 +252,7 @@ def test_upgrade_from_version(
c.kill(mz_service)
c.rm(mz_service, "testdrive")

if from_version != "current_source":
if from_version != "current_source" and not lts_upgrade:
# We can't skip in-between minor versions anymore, so go through all of them
for version in get_published_minor_mz_versions(newest_first=False):
if version <= from_version:
Expand Down
Loading