From ea89904c943c2ef837a4df04762384d90a70cfc9 Mon Sep 17 00:00:00 2001 From: "Dima K." Date: Thu, 23 Jan 2025 16:43:39 -0800 Subject: [PATCH] [Docs] upgrade/chain halt recovery (#837) ## Summary Performed the first upgrade on the Alpha TestNet. Add some documentation changes to prevent some issues in the future. ## Issue N/A ## Type of change Select one or more from the following: - [ ] New feature, functionality or library - [ ] Consensus breaking; add the `consensus-breaking` label if so. See #791 for details - [ ] Bug fix - [x] Code health or cleanup - [x] Documentation - [ ] Other (specify) ## Testing - [x] **Documentation**: `make docusaurus_start`; only needed if you make doc changes - [ ] **Unit Tests**: `make go_develop_and_test` - [ ] **LocalNet E2E Tests**: `make test_e2e` - [ ] **DevNet E2E Tests**: Add the `devnet-test-e2e` label to the PR. ## Sanity Checklist - [ ] I have tested my changes using the available tooling - [ ] I have commented my code - [ ] I have performed a self-review of my own code; both comments & source code - [ ] I create and reference any new tickets, if applicable - [ ] I have left TODOs throughout the codebase, if applicable --------- Co-authored-by: DK Co-authored-by: Daniel Olshansky Co-authored-by: Bryan White --- Dockerfile.release | 1 - app/upgrades/historical.go | 10 + .../chain_halt_troubleshooting.md | 23 +- .../recovery_from_chain_halt.md | 196 +++++++++++++++++ .../protocol/upgrades/contigency_plans.md | 100 +++++++++ .../docs/protocol/upgrades/release_process.md | 13 +- .../protocol/upgrades/upgrade_procedure.md | 205 +++++++++++++++--- makefiles/localnet.mk | 8 + .../upgrades/authz_cancel_upgrade_tx.json | 10 + tools/scripts/upgrades/upgrade_tx_v0.0.9.json | 15 ++ 10 files changed, 543 insertions(+), 38 deletions(-) create mode 100644 docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md create mode 100644 docusaurus/docs/protocol/upgrades/contigency_plans.md create mode 100644 
tools/scripts/upgrades/authz_cancel_upgrade_tx.json create mode 100644 tools/scripts/upgrades/upgrade_tx_v0.0.9.json diff --git a/Dockerfile.release b/Dockerfile.release index 35d2a659c..efd5d2f44 100644 --- a/Dockerfile.release +++ b/Dockerfile.release @@ -8,7 +8,6 @@ RUN apt-get update && \ apt-get install -y --no-install-recommends ca-certificates && \ rm -rf /var/lib/apt/lists/* - # Use `1025` G/UID so users can switch between this and `heighliner` image without a need to chown the files. RUN groupadd -g 1025 pocket && useradd -u 1025 -g pocket -m -s /sbin/nologin pocket diff --git a/app/upgrades/historical.go b/app/upgrades/historical.go index 2c0740652..35393ad02 100644 --- a/app/upgrades/historical.go +++ b/app/upgrades/historical.go @@ -17,6 +17,7 @@ import ( "github.com/cosmos/cosmos-sdk/types/module" consensusparamtypes "github.com/cosmos/cosmos-sdk/x/consensus/types" + cosmostypes "github.com/cosmos/cosmos-sdk/types" "github.com/pokt-network/poktroll/app/keepers" ) @@ -29,6 +30,8 @@ func defaultUpgradeHandler( configurator module.Configurator, ) upgradetypes.UpgradeHandler { return func(ctx context.Context, plan upgradetypes.Plan, vm module.VersionMap) (module.VersionMap, error) { + logger := cosmostypes.UnwrapSDKContext(ctx).Logger() + logger.Info("Starting the migration in defaultUpgradeHandler") return mm.RunMigrations(ctx, configurator, vm) } } @@ -87,3 +90,10 @@ var Upgrade_0_0_4 = Upgrade{ // No changes to the KVStore in this upgrade. StoreUpgrades: storetypes.StoreUpgrades{}, } + +// Upgrade_0_0_9 is a small upgrade on TestNet. 
+var Upgrade_0_0_9 = Upgrade{ + PlanName: "v0.0.9", + CreateUpgradeHandler: defaultUpgradeHandler, + StoreUpgrades: storetypes.StoreUpgrades{}, +} diff --git a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md index 72da1f4f3..5b32f5cda 100644 --- a/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md +++ b/docusaurus/docs/develop/developer_guide/chain_halt_troubleshooting.md @@ -8,13 +8,15 @@ title: Chain Halt Troubleshooting - [Understanding Chain Halts](#understanding-chain-halts) - [Definition and Causes](#definition-and-causes) - [Impact on Network](#impact-on-network) -- [Troubleshooting Process](#troubleshooting-process) +- [Troubleshooting `wrong Block.Header.AppHash`](#troubleshooting-wrong-blockheaderapphash) - [Step 1: Identifying the Issue](#step-1-identifying-the-issue) - [Step 2: Collecting Node Data](#step-2-collecting-node-data) - [Step 3: Analyzing Discrepancies](#step-3-analyzing-discrepancies) - [Step 4: Decoding and Interpreting Data](#step-4-decoding-and-interpreting-data) - [Step 5: Comparing Records](#step-5-comparing-records) - [Step 6: Investigation and Resolution](#step-6-investigation-and-resolution) +- [Troubleshooting `wrong Block.Header.LastResultsHash`](#troubleshooting-wrong-blockheaderlastresultshash) +- [Syncing from genesis](#syncing-from-genesis) ## Understanding Chain Halts @@ -40,7 +42,7 @@ Chain halts can have severe consequences for the network: Given these impacts, swift and effective troubleshooting is crucial to maintain network health and user trust. -## Troubleshooting Process +## Troubleshooting `wrong Block.Header.AppHash` ### Step 1: Identifying the Issue @@ -94,3 +96,20 @@ Based on the identified discrepancies: 2. Develop a fix or patch to address the issue. 3. If necessary, initiate discussions with the validator community to reach social consensus on how to proceed. 4. 
Implement the agreed-upon solution and monitor the network closely during and after the fix. + +## Troubleshooting `wrong Block.Header.LastResultsHash` + +Errors like the following can occur from using the incorrect binary version at a certain height. + +```bash +reactor validation error: wrong Block.Header.LastResultsHash. +``` + +The solution is to use the correct binary version to sync the full node at the correct height. + +Tools like [cosmovisor](https://docs.cosmos.network/v0.45/run-node/cosmovisor.html) make it easier +to sync a node from genesis by automatically using the appropriate binary for each range of block heights. + +## Syncing from genesis + +If you're encountering any of the errors mentioned above while trying to sync the historical blocks, make sure you're running the correct version of the binary in accordance with this table [Upgrade List](../../protocol/upgrades/upgrade_list.md). diff --git a/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md new file mode 100644 index 000000000..d1ca5a069 --- /dev/null +++ b/docusaurus/docs/develop/developer_guide/recovery_from_chain_halt.md @@ -0,0 +1,196 @@ +--- +sidebar_position: 7 +title: Chain Halt Recovery +--- + +## Chain Halt Recovery + +This document describes how to recover from a chain halt. + +It assumes that the cause of the chain halt has been identified, and that the +new release has been created and verified to function correctly. + +:::tip + +See [Chain Halt Troubleshooting](./chain_halt_troubleshooting.md) for more information on identifying the cause of a chain halt. 
+ +::: + +- [Background](#background) +- [Resolving halts during a network upgrade](#resolving-halts-during-a-network-upgrade) + - [Manual binary replacement (preferred)](#manual-binary-replacement-preferred) + - [Rollback, fork and upgrade](#rollback-fork-and-upgrade) + - [Troubleshooting](#troubleshooting) + - [Data rollback - retrieving snapshot at a specific height (step 5)](#data-rollback---retrieving-snapshot-at-a-specific-height-step-5) + - [Validator Isolation - risks (step 6)](#validator-isolation---risks-step-6) + +## Background + +Pocket network is built on top of `cosmos-sdk`, which utilizes the CometBFT consensus engine. +Comet's Byzantine Fault Tolerant (BFT) consensus algorithm requires that **more than** 2/3 of Validators +are online and voting for the same block to reach a consensus. In order to maintain liveness +and avoid a chain-halt, we need the majority (> 2/3) of Validators to participate +and use the same version of the software. + +## Resolving halts during a network upgrade + +If the halt is caused by the network upgrade, it is possible the solution can be as simple as +skipping an upgrade (i.e. `unsafe-skip-upgrade`) and creating a new (fixed) upgrade. + +Read more about [upgrade contingency plans](../../protocol/upgrades/contigency_plans.md). + +### Manual binary replacement (preferred) + +:::note + +This is the preferred way of resolving consensus-breaking issues. + +**Significant side effect**: this breaks the ability to sync from genesis **without manual interventions**. +For example, when a consensus-breaking issue occurs on a node that is syncing from the first block, node operators need +to manually replace the binary with the new one. There are efforts underway to mitigate this issue, including +configuration for `cosmovisor` that could automate the process. + + + +::: + +Since the chain is not moving, **it is impossible** to issue an automatic upgrade with an upgrade plan. 
Instead, +we need **social consensus** to manually replace the binary and get the chain moving. + +The steps to doing so are: + +1. Prepare and verify a new binary that addresses the consensus-breaking issue. +2. Reach out to the community and validators so they can upgrade the binary manually. +3. Update [the documentation](../../protocol/upgrades/upgrade_list.md) to include a range of heights when the binary needs + to be replaced. + +:::warning + +TODO_MAINNET(@okdas): + +1. **For step 2**: Investigate if the CometBFT rounds/steps need to be aligned as in Morse chain halts. See [this ref](https://docs.cometbft.com/v1.0/spec/consensus/consensus). +2. **For step 3**: Add `cosmovisor` documentation so it's configured to automatically replace the binary when syncing from genesis. + +::: + +```mermaid +sequenceDiagram + participant DevTeam + participant Community + participant Validators + participant Documentation + participant Network + + DevTeam->>DevTeam: 1. Prepare and verify new binary + DevTeam->>Community: 2. Announce new binary and instructions + DevTeam->>Validators: 2. Notify validators to upgrade manually + Validators->>Validators: 2. Manually replace the binary + Validators->>Network: 2. Restart nodes with new binary + DevTeam->>Documentation: 3. Update documentation (GitHub Release and Upgrade List to include instructions) + Validators-->>Network: Network resumes operation + +``` + +### Rollback, fork and upgrade + +:::info + +These instructions are only relevant to Pocket Network's Shannon release. + +We do not currently use `x/gov` or on-chain voting for upgrades. +Instead, all participants in our DAO vote on upgrades off-chain, and the Foundation +executes transactions on their behalf. + +::: + +:::warning + +This should be avoided or more testing is required. In our tests, the full nodes were +propagating the existing blocks signed by the Validators, making it hard to rollback. 
+ +::: + +**Performing a rollback is analogous to forking the network at the older height.** + +However, if necessary, the instructions to follow are: + +1. Prepare & verify a new binary that addresses the consensus-breaking issue. +2. [Create a release](../../protocol/upgrades/release_process.md). +3. [Prepare an upgrade transaction](../../protocol/upgrades/upgrade_procedure.md#writing-an-upgrade-transaction) to the new version. +4. Disconnect the `Validator set` from the rest of the network **3 blocks** prior to the height of the chain halt. For example: + - Assume an issue at height `103`. + - Revert the `validator set` to height `100`. + - Submit an upgrade transaction at `101`. + - Upgrade the chain at height `102`. + - Avoid the issue at height `103`. +5. Ensure all validators rolled back to the same height and use the same snapshot ([how to get a snapshot](#data-rollback---retrieving-snapshot-at-a-specific-height-step-5)) + - The snapshot should be imported into each Validator's data directory. + - This is necessary to ensure data continuity and prevent forks. +6. Isolate the `validator set` from full nodes - ([why this is necessary](#validator-isolation---risks-step-6)). + - This is necessary to avoid full nodes from gossiping blocks that have been rolled back. + - This may require using a firewall or a private network. + - Validators should only be permitted to gossip blocks amongst themselves. +7. Start the `validator set` and perform the upgrade. For example, reiterating the process above: + - Start all Validators at height `100`. + - On block `101`, submit the `MsgSoftwareUpgrade` transaction with a `Plan.height` set to `102`. + - `x/upgrade` will perform the upgrade in the `EndBlocker` of block `102`. + - The node will stop climbing with an error waiting for the upgrade to be performed. + - Cosmovisor deployments automatically replace the binary. + - Manual deployments will require a manual replacement at this point. + - Start the node back up. +8. 
Wait for the network to reach the height of the previous ledger (`104`+). +9. Allow validators to open their network to full nodes again. + - **Note**: full nodes will need to perform the rollback or use a snapshot as well. + +```mermaid +sequenceDiagram + participant DevTeam + participant Foundation + participant Validators + participant FullNodes + %% participant Network + + DevTeam->>DevTeam: 1. Prepare & verify new binary + DevTeam->>DevTeam: 2 & 3. Create a release & prepare upgrade transaction + Validators->>Validators: 4 & 5. Roll back to height before issue or import snapshot + Validators->>Validators: 6. Isolate from Full Nodes + Foundation->>Validators: 7. Distribute upgrade transaction + Validators->>Validators: 7. Start network and perform upgrade + + break + Validators->>Validators: 8. Wait until previously problematic height elapses + end + + Validators-->FullNodes: 9. Open network connections + FullNodes-->>Validators: 9. Sync with updated network + note over Validators,FullNodes: Network resumes operation +``` + +### Troubleshooting + +#### Data rollback - retrieving snapshot at a specific height (step 5) + +There are two ways to get a snapshot from a prior height: + +1. Execute + + ```bash + poktrolld rollback --hard + ``` + + repeatedly, until the command responds with the desired block number. + +2. Use a snapshot from below the halt height (e.g. `100`) and start the node with `--halt-height=100` parameter so it only syncs up to a certain height and then + gracefully shuts down. Add this argument to `poktrolld start` like this: + + ```bash + poktrolld start --halt-height=100 + ``` + +#### Validator Isolation - risks (step 6) + +Having at least one node that has knowledge of the forking ledger can jeopardize the whole process. 
In particular, the +following errors in logs are the sign of the nodes syncing blocks from the wrong fork: + +- `found conflicting vote from ourselves; did you unsafe_reset a validator?` +- `conflicting votes from validator` diff --git a/docusaurus/docs/protocol/upgrades/contigency_plans.md b/docusaurus/docs/protocol/upgrades/contigency_plans.md new file mode 100644 index 000000000..260f37823 --- /dev/null +++ b/docusaurus/docs/protocol/upgrades/contigency_plans.md @@ -0,0 +1,100 @@ +--- +title: Failed upgrade contingency plan +sidebar_position: 5 +--- + +:::tip + +This documentation covers failed upgrade contingency for `poktroll` - a `cosmos-sdk` based chain. + +While this can be helpful for other blockchain networks, it is not guaranteed to work for other chains. + +::: + +## Contingency plans + +There's always a chance the upgrade will fail. + +This document is intended to help you recover without significant downtime. + +- [Option 0: The bug is discovered before the upgrade height is reached](#option-0-the-bug-is-discovered-before-the-upgrade-height-is-reached) +- [Option 1: The migration didn't start (i.e. migration halt)](#option-1-the-migration-didnt-start-ie-migration-halt) +- [Option 2: The migration is stuck (i.e. incomplete/partial migration)](#option-2-the-migration-is-stuck-ie-incompletepartial-migration) +- [Option 3: The migration succeed but the network is stuck (i.e. migration had a bug)](#option-3-the-migration-succeed-but-the-network-is-stuck-ie-migration-had-a-bug) +- [MANDATORY Checklist of Documentation \& Scripts to Update](#mandatory-checklist-of-documentation--scripts-to-update) + +### Option 0: The bug is discovered before the upgrade height is reached + +**Cancel the upgrade plan!** + +See the instructions of [how to do that here](./upgrade_procedure.md#cancelling-the-upgrade-plan). + +### Option 1: The migration didn't start (i.e. 
migration halt) + +**This is unlikely to happen.** + +Possible reasons for this are if the name of the upgrade handler is different +from the one specified in the upgrade plan, or if the binary suggested by the +upgrade plan is wrong. + +If the nodes on the network stopped at the upgrade height and the migration did not +start yet (i.e. there are no logs indicating the upgrade handler and store migrations are being executed), +we **MUST** gather social consensus to restart validators with the `--unsafe-skip-upgrade=$upgradeHeightNumber` flag. + +This will skip the upgrade process, allowing the chain to continue and the protocol team to plan another release. + +`--unsafe-skip-upgrade` simply skips the upgrade handler and store migrations. +The chain continues as if the upgrade plan was never set. +The upgrade needs to be fixed, and then a new plan needs to be submitted to the network. + +:::caution + +`--unsafe-skip-upgrade` needs to be documented in the list of upgrades and added +to the scripts so the next time somebody tries to sync the network from genesis, +they will automatically skip the failed upgrade. +[Documentation and scripts to update](#mandatory-checklist-of-documentation--scripts-to-update) + + + +::: + +### Option 2: The migration is stuck (i.e. incomplete/partial migration) + +If the migration is stuck, there's always a chance the upgrade handler was executed on-chain as scheduled, but the migration didn't complete. 
+ +In such a case, we need: + +- **All full nodes and validators**: Roll back validators to the backup + + - A snapshot is taken by `cosmovisor` automatically prior to upgrade when `UNSAFE_SKIP_BACKUP` is set to `false` (the default recommended value; + [more information](https://docs.cosmos.network/main/build/tooling/cosmovisor#command-line-arguments-and-environment-variables)) + +- **All full nodes and validators**: skip the upgrade + + - Add the `--unsafe-skip-upgrade=$upgradeHeightNumber` argument to `poktrolld start` command like so: + + ```bash + poktrolld start --unsafe-skip-upgrade=$upgradeHeightNumber # ... the rest of the arguments + ``` + +- **Protocol team**: Resolve the issue with an upgrade and schedule a new plan. + + - The upgrade needs to be fixed, and then a new plan needs to be submitted to the network. + +- **Protocol team**: document the failed upgrade + + - Document and add `--unsafe-skip-upgrade=$upgradeHeightNumber` to the scripts (such as docker-compose and cosmovisor installer) + - The next time somebody tries to sync the network from genesis they will automatically skip the failed upgrade; see [documentation and scripts to update](#mandatory-checklist-of-documentation--scripts-to-update) + + + +### Option 3: The migration succeed but the network is stuck (i.e. migration had a bug) + +This should be treated as a consensus or non-determinism bug that is unrelated to the upgrade. See [Recovery From Chain Halt](../../develop/developer_guide/recovery_from_chain_halt.md) for more information on how to handle such issues. + +### MANDATORY Checklist of Documentation & Scripts to Update + +- [ ] The [upgrade list](./upgrade_list.md) should reflect a failed upgrade and provide a range of heights that are served by each version. +- [ ] Systemd service should include the `--unsafe-skip-upgrade=$upgradeHeightNumber` argument in its start command [here](https://github.com/pokt-network/poktroll/blob/main/tools/installer/full-node.sh). 
+- [ ] The [Helm chart](https://github.com/pokt-network/helm-charts/blob/main/charts/poktrolld/templates/StatefulSet.yaml) should point to the latest version; consider exposing via a `values.yaml` file +- [ ] The [docker-compose](https://github.com/pokt-network/poktroll-docker-compose-example/tree/main/scripts) examples should point to the latest version diff --git a/docusaurus/docs/protocol/upgrades/release_process.md b/docusaurus/docs/protocol/upgrades/release_process.md index 2845f4c84..398d56c05 100644 --- a/docusaurus/docs/protocol/upgrades/release_process.md +++ b/docusaurus/docs/protocol/upgrades/release_process.md @@ -16,13 +16,6 @@ sidebar_position: 4 This document is for the Pocket Network protocol team's internal use only. ::: -- [1. Determine if the Release is Consensus-Breaking](#1-determine-if-the-release-is-consensus-breaking) -- [2. Create a GitHub Release](#2-create-a-github-release) - - [Legend](#legend) -- [3. Write an Upgrade Plan](#3-write-an-upgrade-plan) -- [4. Issue Upgrade on TestNet](#4-issue-upgrade-on-testnet) -- [5. Issue Upgrade on MainNet](#5-issue-upgrade-on-mainnet) - ### 1. Determine if the Release is Consensus-Breaking :::note @@ -59,12 +52,18 @@ You can find an example [here](https://github.com/pokt-network/poktroll/releases ```text ## Protocol Upgrades + + - **Planned Upgrade:** ❌ Not applicable for this release. - **Breaking Change:** ❌ Not applicable for this release. - **Manual Intervention Required:** ✅ Yes, but only for Alpha TestNet participants. If you are participating, please follow the [instructions provided here](https://dev.poktroll.com/operate/quickstart/docker_compose_walkthrough#restarting-a-full-node-after-re-genesis-) for restarting your full node after re-genesis. - **Upgrade Height:** ❌ Not applicable for this release. 
## What's Changed + ``` diff --git a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md index 8dd572ece..91dfc12bf 100644 --- a/docusaurus/docs/protocol/upgrades/upgrade_procedure.md +++ b/docusaurus/docs/protocol/upgrades/upgrade_procedure.md @@ -6,22 +6,32 @@ sidebar_position: 2 # Upgrade procedure :::warning -This page describes the protocol upgrade process, which is internal to the protocol team. If you're interested in upgrading your Pocket Network node, please check our [releases page](https://github.com/pokt-network/poktroll/releases) for upgrade instructions and changelogs. + +This page describes the protocol upgrade process, intended for the protocol team's internal use. + +If you're interested in upgrading your Pocket Network node, please check our [releases page](https://github.com/pokt-network/poktroll/releases) for upgrade instructions and changelogs. + ::: - [When is an Upgrade Warranted?](#when-is-an-upgrade-warranted) - [Implementing the Upgrade](#implementing-the-upgrade) - [Writing an Upgrade Transaction](#writing-an-upgrade-transaction) + - [Validate the URLs (live network only)](#validate-the-urls-live-network-only) - [Submitting the upgrade onchain](#submitting-the-upgrade-onchain) +- [Cancelling the upgrade plan](#cancelling-the-upgrade-plan) - [Testing the Upgrade](#testing-the-upgrade) - - [LocalNet](#localnet) - - [DevNet](#devnet) - - [TestNet](#testnet) - - [Mainnet](#mainnet) + - [LocalNet Upgrades](#localnet-upgrades) + - [LocalNet Upgrade Cheat Sheet](#localnet-upgrade-cheat-sheet) + - [DevNet Upgrades](#devnet-upgrades) + - [TestNet Upgrades](#testnet-upgrades) + - [Mainnet Upgrades](#mainnet-upgrades) ## Overview -When a consensus-breaking change is made to the protocol, we must carefully evaluate and implement an upgrade path that allows existing nodes to transition safely from one software version to another without disruption. 
This process involves several key steps: +When a consensus-breaking change is made to the protocol, we must carefully evaluate and implement an upgrade path that +allows existing nodes to transition safely from one software version to another without disruption. + +This process involves several key steps: 1. **Proposal**: The DAO drafts an upgrade proposal using our offchain governance system. 2. **Implementation**: The proposed changes are implemented in the codebase. @@ -36,16 +46,34 @@ An upgrade is necessary whenever there's an API, State Machine, or other Consens ## Implementing the Upgrade -1. When a new version includes a consensus-breaking change, plan for the next protocol upgrade: - - If there's a change to a specific module, bump that module's consensus version. +1. When a new version includes a `consensus-breaking` change, plan for the next protocol upgrade: + + - If there's a change to a specific module -> bump that module's consensus version. - Note any potential parameter changes to include in the upgrade. + 2. Create a new upgrade in `app/upgrades`: - Refer to `historical.go` for past upgrades and examples. - - Consult Cosmos-sdk documentation on upgrades for additional guidance [here](https://docs.cosmos.network/main/build/building-apps/app-upgrade) and [here](https://docs.cosmos.network/main/build/modules/upgrade). + - Consult Cosmos-sdk documentation on upgrades for additional guidance on [building-apps/app-upgrade](https://docs.cosmos.network/main/build/building-apps/app-upgrade) and [modules/upgrade](https://docs.cosmos.network/main/build/modules/upgrade). + +:::info + +Creating a new upgrade plan **MUST BE DONE** even if there are no state changes. + +::: ## Writing an Upgrade Transaction -An upgrade transaction includes a [Plan](https://github.com/cosmos/cosmos-sdk/blob/0fda53f265de4bcf4be1a13ea9fad450fc2e66d4/x/upgrade/proto/cosmos/upgrade/v1beta1/upgrade.proto#L14) with specific details about the upgrade. 
This information helps schedule the upgrade on the network and provides necessary data for automatic upgrades via `Cosmovisor`. A typical upgrade transaction will look like the following: +An upgrade transaction includes a [Plan](https://github.com/cosmos/cosmos-sdk/blob/0fda53f265de4bcf4be1a13ea9fad450fc2e66d4/x/upgrade/proto/cosmos/upgrade/v1beta1/upgrade.proto#L14) with specific details about the upgrade. + +This information helps schedule the upgrade on the network and provides necessary data for automatic upgrades via `Cosmovisor`. + +A typical upgrade transaction includes: + +- `name`: Name of the upgrade. It should match the `PlanName` of `upgrades.Upgrade`. +- `height`: The height at which an upgrade should be executed and the node will be restarted. +- `info`: Can be empty. **Only needed for live networks where we want cosmovisor to upgrade nodes automatically**. + +And looks like the following as an example: ```json { @@ -65,52 +93,171 @@ An upgrade transaction includes a [Plan](https://github.com/cosmos/cosmos-sdk/bl } ``` -- `name`: Name of the upgrade. It should match the `VersionName` of `upgrades.Upgrade`. -- `height`: The height at which an upgrade should be executed and the node will be restarted. -- `info`: While this field can theoretically contain any information about the upgrade, in practice, `cosmovisor`uses it to obtain information about the binaries. When`cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in this field and perform a hash verification (which is optional). +:::tip + +When `cosmovisor` is configured to automatically download binaries, it will pull the binary from the link provided in +the object above and perform a hash verification (which is also optional). + +**NOTE THAT** we only know the hashes **AFTER** the release has been cut and CI created artifacts for this version. 
+ +::: + +### Validate the URLs (live network only) + +The URLs of the binaries contain checksums. It is critical to ensure they are correct. +Otherwise Cosmovisor won't be able to download the binaries and go through the upgrade. + +The command below (using tools built by the authors of Cosmovisor) can be used to achieve the above: + +```bash +jq -r '.body.messages[0].plan.info | fromjson | .binaries[]' $PATH_TO_UPGRADE_TRANSACTION_JSON | while IFS= read -r url; do + go-getter "$url" . +done +``` + +The output should look like this: + +```text +2024/09/24 12:40:40 success! +2024/09/24 12:40:42 success! +2024/09/24 12:40:44 success! +2024/09/24 12:40:46 success! +``` + +:::tip + +`go-getter` can be installed using the following command: + +```bash +go install github.com/hashicorp/go-getter/cmd/go-getter@latest +``` + +::: ## Submitting the upgrade onchain The `MsgSoftwareUpgrade` can be submitted using the following command: ```bash -poktrolld tx authz exec PATH_TO_TRANSACTION_JSON --from pnf +poktrolld tx authz exec $PATH_TO_UPGRADE_TRANSACTION_JSON --from=pnf ``` -If the transaction has been accepted, upgrade plan can be viewed with this command: +If the transaction has been accepted, the upgrade plan can be viewed with this command: ```bash poktrolld query upgrade plan ``` +## Cancelling the upgrade plan + +It is possible to cancel the upgrade before the upgrade plan height is reached. To do so, execute the following make target: + +```bash +make localnet_cancel_upgrade +``` + ## Testing the Upgrade :::warning -Note that for local testing, `cosmovisor` won't pull the binary from the info field. +Note that for local testing, `cosmovisor` won't pull the binary from the upgrade Plan's info field. ::: -### LocalNet +### LocalNet Upgrades + +LocalNet **DOES NOT** support `cosmovisor` and automatic upgrades at the moment. + +However, **IT IS NOT NEEDED** to simulate and test the upgrade procedure. 
+ +#### LocalNet Upgrade Cheat Sheet + +For a hypothetical scenario to upgrade from `0.1` to `0.2`: + +1. **Stop LocalNet** to prevent interference. Pull the `poktroll` repo into two separate directories. Let's name them `old` and `new`. It is recommended to open at least two tabs/shell panels in each directory for easier switching between directories. + +2. **(`old` repo)** - Check out the old version. For the test to be accurate, we need to upgrade from the correct version. + + ```bash + git checkout v0.1 + ``` + +3. **(`new` repo)** + + ```bash + git checkout -b branch_to_test + ``` -LocalNet currently does not support `cosmovisor` and automatic upgrades. However, we have provided scripts to facilitate local testing in the `tools/scripts/upgrades` directory: + Replace `branch_to_test` with the actual branch you want to test. -1. Modify `tools/scripts/upgrades/authz_upgrade_tx_example_v0.0.4_height_30.json` to reflect the name of the upgrade and the height at which it should be scheduled. + :::note + This branch should have an upgrade implemented per the docs in [Implementing the Upgrade](#implementing-the-upgrade). + Here, the upgrade should be named `v0.2`. + ::: -2. Check and update the `tools/scripts/upgrades/cosmovisor-start-node.sh` to point to the correct binaries: +4. **(BOTH repos)** - We'll use binaries from both versions - old and new. - - The old binary should be compiled to work before the upgrade. - - The new binary should contain the upgrade logic to be executed immediately after the node is started using the new binary. + ```bash + make go_develop ignite_release ignite_release_extract_binaries + ``` -3. Run `bash tools/scripts/upgrades/cosmovisor-start-node.sh` to wipe the `~/.poktroll` directory and place binaries in the correct locations. + :::note + The binary produced by these commands in the old repo should result in the same binary as it was downloaded from [production releases](https://github.com/pokt-network/poktroll/releases). 
You can use them as an alternative to building the binary from source. + ::: -4. Execute the transaction as shown in [Submitting the upgrade onchain](#submitting-the-upgrade-onchain) section above. +5. **(`old` repo)** - Clean up and generate an empty genesis using the old version. -### DevNet + ```bash + rm -rf ~/.poktroll && ./release_binaries/poktroll_darwin_arm64 comet unsafe-reset-all && make localnet_regenesis + ``` + +6. **(`old` repo)** Write and save [an upgrade transaction](#writing-an-upgrade-transaction) for `v0.2`. The upgrade plan should be named after the version to which you're upgrading. + +7. **(`old` repo)** Start the node: + + ```bash + ./release_binaries/poktroll_darwin_arm64 start + ``` + + The validator node should run and produce blocks as expected. + +8. **(`old` repo)** Submit the upgrade transaction. **NOTE THAT** the upgrade height in the transaction should be higher than the current block height. Adjust and submit if necessary: + + ```bash + ./release_binaries/poktroll_darwin_arm64 tx authz exec tools/scripts/upgrades/local_test_v0.2.json --from=pnf + ``` + + Replace the path to the JSON transaction with your prepared upgrade transaction. Verify the upgrade plan was submitted and accepted: + + ```bash + ./release_binaries/poktroll_darwin_arm64 query upgrade plan + ``` + +9. Wait for the upgrade height to be reached on the old version. The old version should stop working since it has no knowledge of the `v0.2` upgrade. This simulates a real-world scenario. Stop the old node, and switch to the new version. + +10. **(`new` repo)** + + ```bash + ./release_binaries/poktroll_darwin_arm64 start + ``` + +11. **Observe the output:** + + - A successful upgrade should output `applying upgrade "v0.2" at height: 20 module=x/upgrade`. + - The node on the new version should continue producing blocks. + - If there were errors during the upgrade, investigate and address them. + +12. 
**(`new` repo, optional)** - If parameters were changed during the upgrade, test if these changes were applied. For example: + + ```bash + ./release_binaries/poktroll_darwin_arm64 q application params + ``` + +### DevNet Upgrades DevNets currently do not support `cosmovisor`. We use Kubernetes to manage software versions, including validators. Introducing another component to manage versions would be complex, requiring a re-architecture of our current solution to accommodate this change. -### TestNet +### TestNet Upgrades We currently deploy TestNet validators using Kubernetes with helm charts, which prevents us from managing the validator with `cosmovisor`. We do not control what other TestNet participants are running. However, if participants have deployed their nodes using the [cosmovisor guide](../../operate/run_a_node/full_node_walkthrough.md), their nodes will upgrade automatically. @@ -121,9 +268,11 @@ Until we transition to [cosmos-operator](https://github.com/strangelove-ventures 3. Monitor validator node(s) as they start and begin producing blocks. :::tip -If you are a member of Grove, you can find the instructions to access the infrastructure [here](https://www.notion.so/buildwithgrove/How-to-re-genesis-a-Shannon-TestNet-a6230dd8869149c3a4c21613e3cfad15?pvs=4). + +If you are a member of Grove, you can find the instructions to access the infrastructure [on Notion](https://www.notion.so/buildwithgrove/How-to-re-genesis-a-Shannon-TestNet-a6230dd8869149c3a4c21613e3cfad15?pvs=4). + ::: -### Mainnet +### Mainnet Upgrades The Mainnet upgrade process is to be determined. We aim to develop and implement improved tooling for this environment. 
diff --git a/makefiles/localnet.mk b/makefiles/localnet.mk index d43cdccb6..9278457e6 100644 --- a/makefiles/localnet.mk +++ b/makefiles/localnet.mk @@ -29,3 +29,11 @@ localnet_regenesis: check_yq warn_message_acc_initialize_pubkeys ## Regenerate t .PHONY: cosmovisor_start_node cosmovisor_start_node: ## Starts the node using cosmovisor that waits for an upgrade plan bash tools/scripts/upgrades/cosmovisor-start-node.sh + +.PHONY: localnet_cancel_upgrade +localnet_cancel_upgrade: ## Cancels the planned upgrade on local node + poktrolld tx authz exec tools/scripts/upgrades/authz_cancel_upgrade_tx.json --gas=auto --from=pnf + +.PHONY: localnet_show_upgrade_plan +localnet_show_upgrade_plan: ## Shows the upgrade plan on local node + poktrolld query upgrade plan diff --git a/tools/scripts/upgrades/authz_cancel_upgrade_tx.json b/tools/scripts/upgrades/authz_cancel_upgrade_tx.json new file mode 100644 index 000000000..014eaac60 --- /dev/null +++ b/tools/scripts/upgrades/authz_cancel_upgrade_tx.json @@ -0,0 +1,10 @@ +{ + "body": { + "messages": [ + { + "@type": "/cosmos.upgrade.v1beta1.MsgCancelUpgrade", + "authority": "pokt10d07y265gmmuvt4z0w9aw880jnsr700j8yv32t" + } + ] + } +} \ No newline at end of file diff --git a/tools/scripts/upgrades/upgrade_tx_v0.0.9.json b/tools/scripts/upgrades/upgrade_tx_v0.0.9.json new file mode 100644 index 000000000..c945229d9 --- /dev/null +++ b/tools/scripts/upgrades/upgrade_tx_v0.0.9.json @@ -0,0 +1,15 @@ +{ + "body": { + "messages": [ + { + "@type": "/cosmos.upgrade.v1beta1.MsgSoftwareUpgrade", + "authority": "pokt10d07y265gmmuvt4z0w9aw880jnsr700j8yv32t", + "plan": { + "name": "v0.0.9", + "height": "15510", + "info": 
"{\"binaries\":{\"linux/amd64\":\"https://github.com/pokt-network/poktroll/releases/download/v0.0.9/poktroll_linux_amd64.tar.gz?checksum=sha256:ab5b99ca0bc4bfbdd7031378d5a01c2a9f040ff310b745866a4dee7e62321c94\",\"linux/arm64\":\"https://github.com/pokt-network/poktroll/releases/download/v0.0.9/poktroll_linux_arm64.tar.gz?checksum=sha256:4b68c2ad326da055d43af1ad1a580158cec0f229d2ec6d9e18280d065260b622\",\"darwin/amd64\":\"https://github.com/pokt-network/poktroll/releases/download/v0.0.9/poktroll_darwin_amd64.tar.gz?checksum=sha256:c81aabddeb190044b979412e5a518bbf5c88305272f72a47e32e13aa765c3330\",\"darwin/arm64\":\"https://github.com/pokt-network/poktroll/releases/download/v0.0.9/poktroll_darwin_arm64.tar.gz?checksum=sha256:e683c55ac13902d107d7a726ed4a5c5affb2af1be3c67dd131ec2072a2cfbcb2\"}}" + } + } + ] + } +} \ No newline at end of file