From 9b1b32f9d5b87a69cb00283500884521f66d6626 Mon Sep 17 00:00:00 2001
From: kaxxa123
Date: Wed, 11 Sep 2024 15:02:43 +0200
Subject: [PATCH] research: sparse merkle tree optimizations proposed by vitalik

---
 vitalik_merkle_optimizations/.gitignore       | 162 ++++++++++++++++++
 vitalik_merkle_optimizations/bintrie_test.py  |  53 +++++
 vitalik_merkle_optimizations/new_bintrie.py   | 147 ++++++++++++++
 .../new_bintrie_hex.py                        | 142 ++++++++++++++
 .../new_bintrie_optimized.py                  | 143 ++++++++++++++
 .../new_bintrie_test.py                       |  32 +++
 6 files changed, 679 insertions(+)
 create mode 100644 vitalik_merkle_optimizations/.gitignore
 create mode 100644 vitalik_merkle_optimizations/bintrie_test.py
 create mode 100644 vitalik_merkle_optimizations/new_bintrie.py
 create mode 100644 vitalik_merkle_optimizations/new_bintrie_hex.py
 create mode 100644 vitalik_merkle_optimizations/new_bintrie_optimized.py
 create mode 100644 vitalik_merkle_optimizations/new_bintrie_test.py

diff --git a/vitalik_merkle_optimizations/.gitignore b/vitalik_merkle_optimizations/.gitignore
new file mode 100644
index 0000000..efa407c
--- /dev/null
+++ b/vitalik_merkle_optimizations/.gitignore
@@ -0,0 +1,162 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
\ No newline at end of file
diff --git a/vitalik_merkle_optimizations/bintrie_test.py b/vitalik_merkle_optimizations/bintrie_test.py
new file mode 100644
index 0000000..e0c2460
--- /dev/null
+++ b/vitalik_merkle_optimizations/bintrie_test.py
@@ -0,0 +1,53 @@
+# Benchmark the three sparse Merkle tree variants on the same 1000 keys:
+# the naive binary tree, the DB-optimized binary tree and the hex-grouped tree.
+import new_bintrie as t1
+import new_bintrie_optimized as t2
+import new_bintrie_hex as t3
+import time
+import binascii
+
+# 10000 keys, each the hash of a two-byte big-endian counter
+keys = [t1.sha3(bytes([i // 256, i % 256])) for i in range(10000)]
+
+# Naive tree: every update walks and rewrites all 256 levels
+d = t1.EphemDB()
+r = t1.new_tree(d)
+a = time.time()
+for k in keys[:1000]:
+    r = t1.update(d, r, k, k)
+print("Naive bintree time to update: %.4f" % (time.time() - a))
+print("Root: %s" % binascii.hexlify(r))
+
+# DB-optimized tree: single-key subtrees are stored as one leaf record
+d = t2.EphemDB()
+r = t2.new_tree(d)
+a = time.time()
+for k in keys[:1000]:
+    r = t2.update(d, r, k, k)
+print("DB-optimized bintree time to update: %.4f" % (time.time() - a))
+print("Root: %s" % binascii.hexlify(r))
+print("Writes: %d, reads: %d" % (d.writes, d.reads))
+# Count only the reads made by the lookups below
+d.reads = 0
+for k in keys[:500]:
+    assert t2.get(d, r, k) == k
+# Keys that were never inserted read back as 32 zero bytes
+for k in keys[-500:]:
+    assert t2.get(d, r, k) == b'\x00' * 32
+print("Reads: %d" % d.reads)
+
+# Hex tree: same leaf records, but inner nodes store 16 children at once
+d = t3.EphemDB()
+r = t3.new_tree(d)
+a = time.time()
+for k in keys[:1000]:
+    r = t3.update(d, r, k, k)
+print("Hex-optimized bintree time to update: %.4f" % (time.time() - a))
+print("Root: %s" % binascii.hexlify(r))
+print("Writes: %d, reads: %d" % (d.writes, d.reads))
+d.reads = 0
+for k in keys[:500]:
+    assert t3.get(d, r, k) == k
+for k in keys[-500:]:
+    assert t3.get(d, r, k) == b'\x00' * 32
+print("Reads: %d" % d.reads)
diff --git a/vitalik_merkle_optimizations/new_bintrie.py b/vitalik_merkle_optimizations/new_bintrie.py
new file mode 100644
index 0000000..a12c4c5
--- /dev/null
+++ b/vitalik_merkle_optimizations/new_bintrie.py
@@ -0,0 +1,147 @@
+from ethereum.utils import sha3, encode_hex
+
+class EphemDB():
+    """In-memory key/value store standing in for a node database."""
+
+    def __init__(self, kv=None):
+        # Test against None, not truthiness, so that a caller-supplied empty
+        # dict is adopted (and shared) instead of being silently replaced.
+        self.kv = kv if kv is not None else {}
+
+    def get(self, k):
+        """Return the value stored under k, or None if k is absent."""
+        return self.kv.get(k, None)
+
+    def put(self, k, v):
+        """Store v under k, overwriting any previous value."""
+        self.kv[k] = v
+
+    def delete(self, k):
+        """Remove k; raises KeyError if k is absent."""
+        del self.kv[k]
+
+# zerohashes[d] is the root hash of an empty subtree whose root sits at
+# depth d (0 = whole tree, 256 = a single empty leaf of 32 zero bytes)
+zerohashes = [b'\x00' * 32]
+for i in range(256):
+    zerohashes.insert(0, sha3(zerohashes[0] + zerohashes[0]))
+
+def new_tree(db):
+    """Write every node of the empty 256-level tree to db; return its root."""
+    h = b'\x00' * 32
+    for i in range(256):
+        newh = sha3(h + h)
+        db.put(newh, h + h)
+        h = newh
+    return h
+
+def key_to_path(k):
+    """Convert a binary key into an integer path (big-endian)."""
+    return int.from_bytes(k, 'big')
+
+def descend(db, root, *path):
+    """Follow path from root (truthy step = right child) and return the node hash reached."""
+    v = root
+    for p in path:
+        if p:
+            v = db.get(v)[32:]
+        else:
+            v = db.get(v)[:32]
+    return v
+
+def get(db, root, key):
+    """Return the value stored at key in the tree rooted at root."""
+    v = root
+    path = key_to_path(key)
+    for i in range(256):
+        # Bit 255 of the shifted path selects the right (1) or left (0) child
+        if (path >> 255) & 1:
+            v = db.get(v)[32:]
+        else:
+            v = db.get(v)[:32]
+        path <<= 1
+    return v
+
+def update(db, root, key, value):
+    """Set key to value, store the rewritten branch in db and return the new root."""
+    v = root
+    path = path2 = key_to_path(key)
+    sidenodes = []
+    # Walk down from the root, recording the sibling hash at every level
+    for i in range(256):
+        node = db.get(v)  # read each node once and slice both halves from it
+        if (path >> 255) & 1:
+            sidenodes.append(node[:32])
+            v = node[32:]
+        else:
+            sidenodes.append(node[32:])
+            v = node[:32]
+        path <<= 1
+    # Rebuild the branch bottom-up; the lowest key bit belongs to the deepest level
+    v = value
+    for i in range(256):
+        if (path2 & 1):
+            newv = sha3(sidenodes[-1] + v)
+            db.put(newv, sidenodes[-1] + v)
+        else:
+            newv = sha3(v + sidenodes[-1])
+            db.put(newv, v + sidenodes[-1])
+        path2 >>= 1
+        v = newv
+        sidenodes.pop()
+    return v
+
+def make_merkle_proof(db, root, key):
+    """Return the 256 sibling hashes on the path to key, ordered from the root down."""
+    v = root
+    path = key_to_path(key)
+    sidenodes = []
+    for i in range(256):
+        node = db.get(v)  # read each node once and slice both halves from it
+        if (path >> 255) & 1:
+            sidenodes.append(node[:32])
+            v = node[32:]
+        else:
+            sidenodes.append(node[32:])
+            v = node[:32]
+        path <<= 1
+    return sidenodes
+
+def verify_proof(proof, root, key, value):
+    """Return True if proof shows that key maps to value under root."""
+    path = key_to_path(key)
+    v = value
+    for i in range(256):
+        # Consume the proof from its end (deepest sibling) back towards the root
+        if (path & 1):
+            newv = sha3(proof[-1-i] + v)
+        else:
+            newv = sha3(v + proof[-1-i])
+        path >>= 1
+        v = newv
+    return root == v
+
+def compress_proof(proof):
+    """Replace empty-subtree siblings in proof by a 32-byte bitfield; return bitfield + remaining hashes."""
+    bits = bytearray(32)
+    oproof = b''
+    for i, p in enumerate(proof):
+        # proof[i] sits at depth i+1, so its empty-subtree hash is zerohashes[i+1]
+        if p == zerohashes[i+1]:
+            bits[i // 8] ^= 1 << i % 8
+        else:
+            oproof += p
+    return bytes(bits) + oproof
+
+def decompress_proof(oproof):
+    """Invert compress_proof: expand the bitfield back into the full 256-entry proof."""
+    proof = []
+    bits = bytearray(oproof[:32])
+    pos = 32
+    for i in range(256):
+        if bits[i // 8] & (1 << (i % 8)):
+            proof.append(zerohashes[i+1])
+        else:
+            proof.append(oproof[pos: pos + 32])
+            pos += 32
+    return proof
diff --git a/vitalik_merkle_optimizations/new_bintrie_hex.py b/vitalik_merkle_optimizations/new_bintrie_hex.py
new file mode 100644
index 0000000..354e0cb
--- /dev/null
+++ b/vitalik_merkle_optimizations/new_bintrie_hex.py
@@ -0,0 +1,142 @@
+from ethereum.utils import sha3, encode_hex
+
+# In-memory key/value store that counts reads and writes
+class EphemDB():
+    def __init__(self, kv=None):
+        self.reads = 0
+        self.writes = 0
+        # Test against None, not truthiness, so that a caller-supplied empty
+        # dict is adopted (and shared) instead of being silently replaced.
+        self.kv = kv if kv is not None else {}
+
+    def get(self, k):
+        self.reads += 1
+        return self.kv.get(k, None)
+
+    def put(self, k, v):
+        self.writes += 1
+        self.kv[k] = v
+
+    # Raises KeyError if k is absent
+    def delete(self, k):
+        del self.kv[k]
+
+# Hashes of empty subtrees: zerohashes[d] is the root of an empty subtree
+# at depth d (zerohashes[256] is the empty 32-byte value itself)
+zerohashes = [b'\x00' * 32]
+for i in range(256):
+    zerohashes.insert(0, sha3(zerohashes[0] + zerohashes[0]))
+
+# Create a new empty tree (nothing is written to the DB)
+def new_tree(db):
+    return zerohashes[0]
+
+# Convert a binary key into an integer path value
+def key_to_path(k):
+    return int.from_bytes(k, 'big')
+
+tt256m1 = 2**256 - 1
+
+# And convert back (bits shifted above bit 255 are dropped)
+def path_to_key(k):
+    return (k & tt256m1).to_bytes(32, 'big')
+
+# Read a key from a given tree; absent keys read as 32 zero bytes.
+# A DB record is either a 65-byte leaf (b'\x01' + remaining path + value)
+# for a subtree holding a single key, or the 16 concatenated hashes of the
+# subtrees four levels further down.
+def get(db, root, key):
+    v = root
+    path = key_to_path(key)
+    for i in range(0, 256, 4):
+        if v == zerohashes[i]:
+            return b'\x00' * 32
+        child = db.get(v)
+        if len(child) == 65:
+            if (path % 2**256) == key_to_path(child[1:33]):
+                return child[33:]
+            else:
+                return b'\x00' * 32
+        else:
+            index = (path >> 252) & 15
+            v = child[32*index: 32*index+32]
+        path <<= 4
+    return v
+
+# Make a root hash of a (sub)tree with a single key/value pair
+def make_single_key_hash(path, depth, value):
+    if depth == 256:
+        return value
+    elif (path >> 255) & 1:
+        return sha3(zerohashes[depth+1] + make_single_key_hash(path << 1, depth + 1, value))
+    else:
+        return sha3(make_single_key_hash(path << 1, depth + 1, value) + zerohashes[depth+1])
+
+# Hash together 16 elements (four rounds of pairwise hashing)
+def hash_16_els(vals):
+    assert len(vals) == 16
+    for _ in range(4):
+        vals = [sha3(vals[i] + vals[i+1]) for i in range(0, len(vals), 2)]
+    return vals[0]
+
+# Make a root hash of a (sub)tree with two key/value pairs, and save intermediate nodes in the DB
+def make_double_key_hash(db, path1, path2, depth, value1, value2):
+    if depth == 256:
+        raise Exception("Cannot fit two values into one slot!")
+    # Same nibble: both keys stay together in one child, four levels down
+    if ((path1 >> 252) & 15) == ((path2 >> 252) & 15):
+        children = [zerohashes[depth+4]] * 16
+        children[(path1 >> 252) & 15] = make_double_key_hash(db, path1 << 4, path2 << 4, depth + 4, value1, value2)
+    # Different nibbles: each key becomes a single-key leaf of its own
+    else:
+        Lkey = ((path1 >> 252) & 15)
+        L = make_single_key_hash(path1 << 4, depth + 4, value1)
+        Rkey = ((path2 >> 252) & 15)
+        R = make_single_key_hash(path2 << 4, depth + 4, value2)
+        db.put(L, b'\x01' + path_to_key(path1 << 4) + value1)
+        db.put(R, b'\x01' + path_to_key(path2 << 4) + value2)
+        children = [zerohashes[depth+4]] * 16
+        children[Lkey] = L
+        children[Rkey] = R
+    h = hash_16_els(children)
+    db.put(h, b''.join(children))
+    return h
+
+# Update a tree with a given key/value pair
+def update(db, root, key, value):
+    return _update(db, root, key_to_path(key), 0, value)
+
+def _update(db, root, path, depth, value):
+    if depth == 256:
+        return value
+    # Update an empty subtree: make a single-key subtree
+    if root == zerohashes[depth]:
+        k = make_single_key_hash(path, depth, value)
+        db.put(k, b'\x01' + path_to_key(path) + value)
+        return k
+    child = db.get(root)
+    if len(child) == 65:
+        origpath, origvalue = key_to_path(child[1:33]), child[33:]
+        # Overwrite of the key already stored here: replace the leaf, since
+        # two identical paths can never be split into a double-key subtree
+        if (path & tt256m1) == origpath:
+            k = make_single_key_hash(path, depth, value)
+            db.put(k, b'\x01' + path_to_key(path) + value)
+            return k
+        # Update a single-key subtree: make a double-key subtree
+        return make_double_key_hash(db, path, origpath, depth, value, origvalue)
+    # Update a multi-key subtree: recurse down
+    else:
+        assert len(child) == 512
+        index = (path >> 252) & 15
+        new_value = _update(db, child[index*32: index*32+32], path << 4, depth + 4, value)
+        new_children = [new_value if i == index else child[32*i:32*i+32] for i in range(16)]
+        h = hash_16_els(new_children)
+        db.put(h, b''.join(new_children))
+        return h
+
+# Update a tree with several key/value pairs, applied in order
+def multi_update(db, root, keys, values):
+    for k, v in zip(keys, values):
+        root = update(db, root, k, v)
+    return root
diff --git a/vitalik_merkle_optimizations/new_bintrie_optimized.py b/vitalik_merkle_optimizations/new_bintrie_optimized.py
new file mode 100644
index 0000000..9748b5c
--- /dev/null
+++ b/vitalik_merkle_optimizations/new_bintrie_optimized.py
@@ -0,0 +1,143 @@
+from ethereum.utils import sha3, encode_hex
+
+# In-memory key/value store that counts reads and writes
+class EphemDB():
+    def __init__(self, kv=None):
+        self.reads = 0
+        self.writes = 0
+        # Test against None, not truthiness, so that a caller-supplied empty
+        # dict is adopted (and shared) instead of being silently replaced.
+        self.kv = kv if kv is not None else {}
+
+    def get(self, k):
+        self.reads += 1
+        return self.kv.get(k, None)
+
+    def put(self, k, v):
+        self.writes += 1
+        self.kv[k] = v
+
+    # Raises KeyError if k is absent
+    def delete(self, k):
+        del self.kv[k]
+
+# Hashes of empty subtrees: zerohashes[d] is the root of an empty subtree
+# at depth d (zerohashes[256] is the empty 32-byte value itself)
+zerohashes = [b'\x00' * 32]
+for i in range(256):
+    zerohashes.insert(0, sha3(zerohashes[0] + zerohashes[0]))
+
+# Create a new empty tree (nothing is written to the DB)
+def new_tree(db):
+    return zerohashes[0]
+
+# Convert a binary key into an integer path value
+def key_to_path(k):
+    return int.from_bytes(k, 'big')
+
+tt256m1 = 2**256 - 1
+
+# And convert back (bits shifted above bit 255 are dropped)
+def path_to_key(k):
+    return (k & tt256m1).to_bytes(32, 'big')
+
+# Read a key from a given tree; absent keys read as 32 zero bytes.
+# A DB record is either a 65-byte leaf (b'\x01' + remaining path + value)
+# for a subtree holding a single key, or the 64 bytes left hash + right hash.
+def get(db, root, key):
+    v = root
+    path = key_to_path(key)
+    for i in range(256):
+        if v == zerohashes[i]:
+            return b'\x00' * 32
+        child = db.get(v)
+        if len(child) == 65:
+            if (path % 2**256) == key_to_path(child[1:33]):
+                return child[33:]
+            else:
+                return b'\x00' * 32
+        else:
+            if (path >> 255) & 1:
+                v = child[32:]
+            else:
+                v = child[:32]
+        path <<= 1
+    return v
+
+# Make a root hash of a (sub)tree with a single key/value pair
+def make_single_key_hash(path, depth, value):
+    if depth == 256:
+        return value
+    elif (path >> 255) & 1:
+        return sha3(zerohashes[depth+1] + make_single_key_hash(path << 1, depth + 1, value))
+    else:
+        return sha3(make_single_key_hash(path << 1, depth + 1, value) + zerohashes[depth+1])
+
+# Make a root hash of a (sub)tree with two key/value pairs, and save intermediate nodes in the DB
+def make_double_key_hash(db, path1, path2, depth, value1, value2):
+    if depth == 256:
+        raise Exception("Cannot fit two values into one slot!")
+    if (path1 >> 255) & 1:
+        if (path2 >> 255) & 1:
+            # Both keys go right: the left sibling is an empty subtree
+            child = zerohashes[depth+1] + make_double_key_hash(db, path1 << 1, path2 << 1, depth + 1, value1, value2)
+        else:
+            # The keys part ways here: each becomes a single-key leaf
+            L = make_single_key_hash(path2 << 1, depth + 1, value2)
+            R = make_single_key_hash(path1 << 1, depth + 1, value1)
+            db.put(L, b'\x01' + path_to_key(path2 << 1) + value2)
+            db.put(R, b'\x01' + path_to_key(path1 << 1) + value1)
+            child = L + R
+    else:
+        if (path2 >> 255) & 1:
+            L = make_single_key_hash(path1 << 1, depth + 1, value1)
+            R = make_single_key_hash(path2 << 1, depth + 1, value2)
+            db.put(L, b'\x01' + path_to_key(path1 << 1) + value1)
+            db.put(R, b'\x01' + path_to_key(path2 << 1) + value2)
+            child = L + R
+        else:
+            # Both keys go left: the right sibling is an empty subtree
+            child = make_double_key_hash(db, path1 << 1, path2 << 1, depth + 1, value1, value2) + zerohashes[depth+1]
+    # Hash the new node once and reuse it as both DB key and return value
+    h = sha3(child)
+    db.put(h, child)
+    return h
+
+# Update a tree with a given key/value pair
+def update(db, root, key, value):
+    return _update(db, root, key_to_path(key), 0, value)
+
+def _update(db, root, path, depth, value):
+    if depth == 256:
+        return value
+    # Update an empty subtree: make a single-key subtree
+    if root == zerohashes[depth]:
+        k = make_single_key_hash(path, depth, value)
+        db.put(k, b'\x01' + path_to_key(path) + value)
+        return k
+    child = db.get(root)
+    if len(child) == 65:
+        origpath, origvalue = key_to_path(child[1:33]), child[33:]
+        # Overwrite of the key already stored here: replace the leaf, since
+        # two identical paths can never be split into a double-key subtree
+        if (path & tt256m1) == origpath:
+            k = make_single_key_hash(path, depth, value)
+            db.put(k, b'\x01' + path_to_key(path) + value)
+            return k
+        # Update a single-key subtree: make a double-key subtree
+        return make_double_key_hash(db, path, origpath, depth, value, origvalue)
+    # Update a multi-key subtree: recurse down
+    elif (path >> 255) & 1:
+        new_child = child[:32] + _update(db, child[32:], path << 1, depth + 1, value)
+    else:
+        new_child = _update(db, child[:32], path << 1, depth + 1, value) + child[32:]
+    # Hash the new node once and reuse it as both DB key and return value
+    h = sha3(new_child)
+    db.put(h, new_child)
+    return h
+
+# Update a tree with several key/value pairs, applied in order
+def multi_update(db, root, keys, values):
+    for k, v in zip(keys, values):
+        root = update(db, root, k, v)
+    return root
diff --git a/vitalik_merkle_optimizations/new_bintrie_test.py b/vitalik_merkle_optimizations/new_bintrie_test.py
new file mode 100644
index 0000000..4e0553d
--- /dev/null
+++ b/vitalik_merkle_optimizations/new_bintrie_test.py
@@ -0,0 +1,32 @@
+# Functional test for new_bintrie: inserts, lookups, Merkle proofs and proof compression
+from new_bintrie import EphemDB, new_tree, get, update, make_merkle_proof, verify_proof, compress_proof, decompress_proof
+import random
+from ethereum.utils import sha3
+
+KEYS = 500
+
+db = EphemDB()
+t = new_tree(db)
+for i in range(KEYS):
+    t = update(db, t, sha3(str(i)), sha3(str(i**3)))
+print('%d elements added' % KEYS)
+for i in range(KEYS):
+    assert get(db, t, sha3(str(i))) == sha3(str(i**3))
+print('Get requests for present elements successful')
+# Indices KEYS .. 2*KEYS-1 were never inserted (start at KEYS so none is skipped)
+for i in range(KEYS, KEYS * 2):
+    assert get(db, t, sha3(str(i))) == b'\x00' * 32
+print('Get requests for absent elements successful')
+
+# Prove presence for the first KEYS indices and absence for the next KEYS,
+# accumulating the compressed proof sizes
+TL = 0
+for i in range(KEYS * 2):
+    key = sha3(str(i))
+    value = sha3(str(i ** 3)) if i < KEYS else b'\x00' * 32
+    proof = make_merkle_proof(db, t, key)
+    assert verify_proof(proof, t, key, value)
+    compressed = compress_proof(proof)  # compress once, reuse for both checks
+    assert decompress_proof(compressed) == proof
+    TL += len(compressed)
+print('Average total length at %d keys: %d, %d including key' % (KEYS, TL // KEYS // 2, TL // KEYS // 2 + 32))