From 0ddeaaecfee37fe6e0ce0afb21b811e303fea334 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Fri, 3 Jan 2025 19:53:17 +0100 Subject: [PATCH 01/36] [WIP] Udpate cachedict for fewer/more flexible files --- exca/cachedict.py | 69 ++++++++++++++++++++++++++++++------------ exca/test_cachedict.py | 14 ++++++--- 2 files changed, 59 insertions(+), 24 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index 1574067..8a97470 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -8,9 +8,12 @@ Disk, RAM caches """ import hashlib +import json import logging import shutil +import socket import subprocess +import threading import typing as tp from concurrent import futures from pathlib import Path @@ -91,8 +94,8 @@ def __init__( except Exception as e: msg = f"Failed to set permission to {self.permissions} on {self.folder}\n({e})" logger.warning(msg) - self._ram_data: tp.Dict[str, X] = {} - self._key_uid: tp.Dict[str, str] = {} + self._ram_data: dict[str, X] = {} + self._key_info: dict[str, dict[str, tp.Any]] = {} self.cache_type = cache_type self._set_cache_type(cache_type) @@ -101,9 +104,16 @@ def __repr__(self) -> str: keep_in_ram = self._keep_in_ram return f"{name}({self.folder},{keep_in_ram=})" + @property + def _write_fp(self) -> Path: + if self.folder is None: + raise RuntimeError("No write filepath with no provided folder") + name = f"{socket.gethostname()}-{threading.get_native_id()}-info.jsonl" + return Path(self.folder) / name + def clear(self) -> None: self._ram_data.clear() - self._key_uid.clear() + self._key_info.clear() if self.folder is not None: # let's remove content but not the folder to keep same permissions for sub in self.folder.iterdir(): @@ -118,10 +128,11 @@ def __len__(self) -> int: def keys(self) -> tp.Iterator[str]: keys = set(self._ram_data) if self.folder is not None: + folder = Path(self.folder) # read all existing key files as fast as possible (pathlib.glob is slow) try: out = subprocess.check_output( - 'find . -type f -name "*.key"', shell=True, cwd=self.folder + 'find . -type f -name "*.key"', shell=True, cwd=folder ).decode("utf8") except subprocess.CalledProcessError as e: out = e.output.decode("utf8") # stderr contains missing tmp files @@ -130,12 +141,27 @@ def keys(self) -> tp.Iterator[str]: # parallelize content reading with futures.ThreadPoolExecutor() as ex: jobs = { - name[:-4]: ex.submit((self.folder / name).read_text, "utf8") + name[:-4]: ex.submit((folder / name).read_text, "utf8") for name in names - if name[:-4] not in self._key_uid + if name[:-4] not in self._key_info } - self._key_uid.update({j.result(): name for name, j in jobs.items()}) - keys |= set(self._key_uid) + self._key_info.update({j.result(): {"uid": name} for name, j in jobs.items()}) + keys |= set(self._key_info) + # read all existing jsonl files + try: + out = subprocess.check_output( + 'find . 
-type f -name "*-info.jsonl"', shell=True, cwd=folder + ).decode("utf8") + except subprocess.CalledProcessError as e: + out = e.output.decode("utf8") # stderr contains missing tmp files + names = out.splitlines() + for name in names: + lines = (folder / name).read_text().splitlines() + for line in lines: + if not line: + continue + info = json.loads(line) + self._key_info[info.pop("key")] = info return iter(keys) def values(self) -> tp.Iterable[X]: @@ -156,9 +182,9 @@ def __getitem__(self, key: str) -> X: # necessarily in file cache folder from now on if self.folder is None: raise RuntimeError("This should not happen") - if key not in self._key_uid: + if key not in self._key_info: _ = key in self - if key not in self._key_uid: + if key not in self._key_info: # trigger folder cache update: # https://stackoverflow.com/questions/3112546/os-path-exists-lies/3112717 self.folder.chmod(self.folder.stat().st_mode) @@ -167,11 +193,12 @@ def __getitem__(self, key: str) -> X: self.check_cache_type() if self.cache_type is None: raise RuntimeError(f"Could not figure cache_type in {self.folder}") - uid = self._key_uid[key] + info = self._key_info[key] + uid = info["uid"] loader = DumperLoader.CLASSES[self.cache_type] loaded = loader.load(self.folder / uid) if self._keep_in_ram: - self._ram_data[key] = loaded # type: ignore + self._ram_data[key] = loaded return loaded # type: ignore def _set_cache_type(self, cache_type: str | None) -> None: @@ -209,12 +236,16 @@ def __setitem__(self, key: str, value: X) -> None: self._ram_data[key] = value if self.folder is not None: uid = _string_uid(key) # use a safe mapping - self._key_uid[key] = uid + self._key_info[key] = {"uid": uid} dumper = DumperLoader.CLASSES[self.cache_type]() dumper.dump(self.folder / uid, value) dumpfile = dumper.filepath(self.folder / uid) keyfile = self.folder / (uid + ".key") keyfile.write_text(key, encoding="utf8") + # new write + info = {"key": key, "uid": uid} + with self._write_fp.open("a") as f: + f.write(json.dumps(info) + "\n") # reading will reload to in-memory cache if need be # (since dumping may have loaded the underlying data, let's not keep it) if self.permissions is not None: @@ -226,7 +257,7 @@ def __setitem__(self, key: str, value: X) -> None: def __delitem__(self, key: str) -> None: # necessarily in file cache folder from now on - if key not in self._key_uid: + if key not in self._key_info: _ = key in self self._ram_data.pop(key, None) if self.folder is None: @@ -235,11 +266,11 @@ def __delitem__(self, key: str) -> None: self.check_cache_type() if self.cache_type is None: raise RuntimeError(f"Could not figure cache_type in {self.folder}") - uid = self._key_uid.pop(key) - keyfile = self.folder / (uid + ".key") + info = self._key_info.pop(key) + keyfile = self.folder / (info["uid"] + ".key") keyfile.unlink() dumper = DumperLoader.CLASSES[self.cache_type]() - fp = dumper.filepath(self.folder / uid) + fp = dumper.filepath(self.folder / info["uid"]) with utils.fast_unlink(fp): # moves then delete to avoid weird effects pass @@ -249,12 +280,12 @@ def __contains__(self, key: str) -> bool: return True if self.folder is not None: # in folder (already read once) - if key in self._key_uid: + if key in self._key_info: return True # maybe in folder (never read it) uid = _string_uid(key) fp = self.folder / f"{uid}.key" if fp.exists(): - self._key_uid[key] = uid + self._key_info[key] = {"uid": uid} return True return False # lazy check diff --git a/exca/test_cachedict.py b/exca/test_cachedict.py index b120080..5e9c998 100644 --- 
a/exca/test_cachedict.py +++ b/exca/test_cachedict.py @@ -83,12 +83,16 @@ def test_data_dump_suffix(tmp_path: Path, data: tp.Any) -> None: cache: cd.CacheDict[np.ndarray] = cd.CacheDict(folder=tmp_path, keep_in_ram=False) cache["blublu.tmp"] = data assert cache.cache_type not in [None, "Pickle"] - name1, name2 = [fp.name for fp in tmp_path.iterdir() if not fp.name.startswith(".")] - if name2.endswith(".key"): - name1, name2 = name2, name1 - num = len(name1) - 4 - assert name1[:num] == name2[:num], f"Non-matching names {name1} and {name2}" + names = [fp.name for fp in tmp_path.iterdir() if not fp.name.startswith(".")] + assert len(names) == 3 + k_name = [n for n in names if n.endswith(".key")][0] + j_name = [n for n in names if n.endswith("-info.jsonl")][0] + v_name = [n for n in names if not n.endswith((".key", "-info.jsonl"))][0] + num = len(k_name) - 4 + assert k_name[:num] == k_name[:num], f"Non-matching names {k_name} and {v_name}" assert isinstance(cache["blublu.tmp"], type(data)) + print((tmp_path / j_name).read_text()) + raise @pytest.mark.parametrize( From 952a9d31a208e8834bb90ec0d8b79d775bd84885 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Wed, 8 Jan 2025 10:39:08 +0100 Subject: [PATCH 02/36] wip --- exca/cachedict.py | 54 +++++++++++++++++++++++++++--------------- exca/test_cachedict.py | 23 ++++++++++++++++-- 2 files changed, 56 insertions(+), 21 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index 8a97470..ad716e1 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -79,10 +79,12 @@ def __init__( keep_in_ram: bool = False, cache_type: None | str = None, permissions: int | None = 0o777, + _write_legacy_key_files: bool = False, ) -> None: self.folder = None if folder is None else Path(folder) self.permissions = permissions self._keep_in_ram = keep_in_ram + self._write_legacy_key_files = _write_legacy_key_files if self.folder is None and not keep_in_ram: raise ValueError("At least folder or keep_in_ram should be activated") @@ -123,6 +125,7 @@ def clear(self) -> None: sub.unlink() def __len__(self) -> int: + print("in len") return len(list(self.keys())) # inefficient, but correct def keys(self) -> tp.Iterator[str]: @@ -146,24 +149,31 @@ def keys(self) -> tp.Iterator[str]: if name[:-4] not in self._key_info } self._key_info.update({j.result(): {"uid": name} for name, j in jobs.items()}) + self._load_info_files() keys |= set(self._key_info) - # read all existing jsonl files - try: - out = subprocess.check_output( - 'find . -type f -name "*-info.jsonl"', shell=True, cwd=folder - ).decode("utf8") - except subprocess.CalledProcessError as e: - out = e.output.decode("utf8") # stderr contains missing tmp files - names = out.splitlines() - for name in names: - lines = (folder / name).read_text().splitlines() - for line in lines: - if not line: - continue - info = json.loads(line) - self._key_info[info.pop("key")] = info return iter(keys) + def _load_info_files(self) -> None: + if self.folder is None: + return + folder = Path(self.folder) + # read all existing jsonl files + try: + out = subprocess.check_output( + 'find . 
-type f -name "*-info.jsonl"', shell=True, cwd=folder + ).decode("utf8") + except subprocess.CalledProcessError as e: + out = e.output.decode("utf8") # stderr contains missing tmp files + print("JSON", out) + names = out.splitlines() + for name in names: + lines = (folder / name).read_text().splitlines() + for line in lines: + if not line: + continue + info = json.loads(line) + self._key_info[info.pop("key")] = info + def values(self) -> tp.Iterable[X]: for key in self: yield self[key] @@ -240,16 +250,21 @@ def __setitem__(self, key: str, value: X) -> None: dumper = DumperLoader.CLASSES[self.cache_type]() dumper.dump(self.folder / uid, value) dumpfile = dumper.filepath(self.folder / uid) - keyfile = self.folder / (uid + ".key") - keyfile.write_text(key, encoding="utf8") + files = [dumpfile] + if self._write_legacy_key_files: + keyfile = self.folder / (uid + ".key") + keyfile.write_text(key, encoding="utf8") + files.append(keyfile) # new write info = {"key": key, "uid": uid} - with self._write_fp.open("a") as f: + write_fp = self._write_fp + with write_fp.open("a") as f: f.write(json.dumps(info) + "\n") + files.append(write_fp) # reading will reload to in-memory cache if need be # (since dumping may have loaded the underlying data, let's not keep it) if self.permissions is not None: - for fp in [dumpfile, keyfile]: + for fp in files: try: fp.chmod(self.permissions) except Exception: # pylint: disable=broad-except @@ -278,6 +293,7 @@ def __contains__(self, key: str) -> bool: # in-memory cache if key in self._ram_data: return True + self._load_info_files() if self.folder is not None: # in folder (already read once) if key in self._key_info: diff --git a/exca/test_cachedict.py b/exca/test_cachedict.py index 5e9c998..90baa1d 100644 --- a/exca/test_cachedict.py +++ b/exca/test_cachedict.py @@ -91,8 +91,6 @@ def test_data_dump_suffix(tmp_path: Path, data: tp.Any) -> None: num = len(k_name) - 4 assert k_name[:num] == k_name[:num], f"Non-matching names {k_name} and {v_name}" assert isinstance(cache["blublu.tmp"], type(data)) - print((tmp_path / j_name).read_text()) - raise @pytest.mark.parametrize( @@ -110,3 +108,24 @@ def test_specialized_dump(tmp_path: Path, data: tp.Any, cache_type: str) -> None ) cache["x"] = data assert isinstance(cache["x"], type(data)) + + +@pytest.mark.parametrize("legacy_write", (True, False)) +def test_info_jsonl(tmp_path: Path, legacy_write: bool) -> None: + cache: cd.CacheDict[int] = cd.CacheDict( + folder=tmp_path, keep_in_ram=False, _write_legacy_key_files=legacy_write + ) + cache["x"] = 12 + cache["y"] = 3 + # check files + fps = list(tmp_path.iterdir()) + info_path = [fp for fp in fps if fp.name.endswith("-info.jsonl")][0] + print(info_path.read_text()) + # restore + cache = cd.CacheDict(folder=tmp_path, keep_in_ram=False) + assert cache["x"] == 12 + cache = cd.CacheDict(folder=tmp_path, keep_in_ram=False) + assert "y" in cache + print("CHECKING SIZE") + cache = cd.CacheDict(folder=tmp_path, keep_in_ram=False) + assert len(cache) == 2 From 0a7488e3b1fa5c2ca2c6c693cf39f356aca5fdd2 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Wed, 8 Jan 2025 10:46:27 +0100 Subject: [PATCH 03/36] wip --- exca/cachedict.py | 2 -- exca/test_cachedict.py | 9 ++++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index ad716e1..d27cde7 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -125,7 +125,6 @@ def clear(self) -> None: sub.unlink() def __len__(self) -> int: - print("in len") return len(list(self.keys())) # 
inefficient, but correct def keys(self) -> tp.Iterator[str]: @@ -164,7 +163,6 @@ def _load_info_files(self) -> None: ).decode("utf8") except subprocess.CalledProcessError as e: out = e.output.decode("utf8") # stderr contains missing tmp files - print("JSON", out) names = out.splitlines() for name in names: lines = (folder / name).read_text().splitlines() diff --git a/exca/test_cachedict.py b/exca/test_cachedict.py index 90baa1d..9348044 100644 --- a/exca/test_cachedict.py +++ b/exca/test_cachedict.py @@ -110,8 +110,10 @@ def test_specialized_dump(tmp_path: Path, data: tp.Any, cache_type: str) -> None assert isinstance(cache["x"], type(data)) -@pytest.mark.parametrize("legacy_write", (True, False)) -def test_info_jsonl(tmp_path: Path, legacy_write: bool) -> None: +@pytest.mark.parametrize( + "legacy_write,remove_jsonl", ((True, True), (True, False), (False, False)) +) +def test_info_jsonl(tmp_path: Path, legacy_write: bool, remove_jsonl: bool) -> None: cache: cd.CacheDict[int] = cd.CacheDict( folder=tmp_path, keep_in_ram=False, _write_legacy_key_files=legacy_write ) @@ -121,11 +123,12 @@ def test_info_jsonl(tmp_path: Path, legacy_write: bool) -> None: fps = list(tmp_path.iterdir()) info_path = [fp for fp in fps if fp.name.endswith("-info.jsonl")][0] print(info_path.read_text()) + if remove_jsonl: + info_path.unlink() # restore cache = cd.CacheDict(folder=tmp_path, keep_in_ram=False) assert cache["x"] == 12 cache = cd.CacheDict(folder=tmp_path, keep_in_ram=False) assert "y" in cache - print("CHECKING SIZE") cache = cd.CacheDict(folder=tmp_path, keep_in_ram=False) assert len(cache) == 2 From e8c3030ea2e57fa64447f3ea01ecf12906e50752 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Wed, 8 Jan 2025 11:58:00 +0100 Subject: [PATCH 04/36] pool --- .gitignore | 1 + exca/test_cachedict.py | 22 +++++++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index fb02a55..436edf5 100644 --- a/.gitignore +++ b/.gitignore @@ -112,6 +112,7 @@ ENV/ exp_local outputs data +tmp # adding output from unit-tests here *-raw.fif diff --git a/exca/test_cachedict.py b/exca/test_cachedict.py index 9348044..e4b4fd1 100644 --- a/exca/test_cachedict.py +++ b/exca/test_cachedict.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. 
import typing as tp +from concurrent import futures from pathlib import Path import nibabel as nib @@ -113,22 +114,29 @@ def test_specialized_dump(tmp_path: Path, data: tp.Any, cache_type: str) -> None @pytest.mark.parametrize( "legacy_write,remove_jsonl", ((True, True), (True, False), (False, False)) ) -def test_info_jsonl(tmp_path: Path, legacy_write: bool, remove_jsonl: bool) -> None: +@pytest.mark.parametrize("process", (False,)) # add True for more (slower) tests +def test_info_jsonl( + tmp_path: Path, legacy_write: bool, remove_jsonl: bool, process: bool +) -> None: cache: cd.CacheDict[int] = cd.CacheDict( folder=tmp_path, keep_in_ram=False, _write_legacy_key_files=legacy_write ) - cache["x"] = 12 - cache["y"] = 3 + Pool = futures.ProcessPoolExecutor if process else futures.ThreadPoolExecutor + with Pool(max_workers=2) as ex: + ex.submit(cache.__setitem__, "x", 12) + ex.submit(cache.__setitem__, "y", 3) + ex.submit(cache.__setitem__, "z", 24) # check files fps = list(tmp_path.iterdir()) - info_path = [fp for fp in fps if fp.name.endswith("-info.jsonl")][0] - print(info_path.read_text()) + info_paths = [fp for fp in fps if fp.name.endswith("-info.jsonl")] + assert len(info_paths) == 2 if remove_jsonl: - info_path.unlink() + for ipath in info_paths: + ipath.unlink() # restore cache = cd.CacheDict(folder=tmp_path, keep_in_ram=False) assert cache["x"] == 12 cache = cd.CacheDict(folder=tmp_path, keep_in_ram=False) assert "y" in cache cache = cd.CacheDict(folder=tmp_path, keep_in_ram=False) - assert len(cache) == 2 + assert len(cache) == 3 From e1032718d323da07b7e19eae2a48860dc8d0d5f6 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Thu, 9 Jan 2025 11:59:05 +0100 Subject: [PATCH 05/36] wip --- exca/cachedict.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index d27cde7..93c410e 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -79,7 +79,7 @@ def __init__( keep_in_ram: bool = False, cache_type: None | str = None, permissions: int | None = 0o777, - _write_legacy_key_files: bool = False, + _write_legacy_key_files: bool = True, ) -> None: self.folder = None if folder is None else Path(folder) self.permissions = permissions @@ -165,12 +165,17 @@ def _load_info_files(self) -> None: out = e.output.decode("utf8") # stderr contains missing tmp files names = out.splitlines() for name in names: - lines = (folder / name).read_text().splitlines() - for line in lines: - if not line: - continue - info = json.loads(line) - self._key_info[info.pop("key")] = info + fp = folder / name + num = 0 + with fp.open("rb") as f: + for line in f: + line = line.strip() + if not line: + continue + info = json.loads(line.decode("utf8")) + info.update(_jsonl=fp, _byterange=(num, num + len(line))) + self._key_info[info.pop("key")] = info + num += len(line) def values(self) -> tp.Iterable[X]: for key in self: @@ -244,7 +249,6 @@ def __setitem__(self, key: str, value: X) -> None: self._ram_data[key] = value if self.folder is not None: uid = _string_uid(key) # use a safe mapping - self._key_info[key] = {"uid": uid} dumper = DumperLoader.CLASSES[self.cache_type]() dumper.dump(self.folder / uid, value) dumpfile = dumper.filepath(self.folder / uid) @@ -256,8 +260,11 @@ def __setitem__(self, key: str, value: X) -> None: # new write info = {"key": key, "uid": uid} write_fp = self._write_fp - with write_fp.open("a") as f: - f.write(json.dumps(info) + "\n") + with write_fp.open("ab") as f: + b = json.dumps(info).encode("utf8") + 
f.write(b + b"\n") + info.update(_jsonl=write_fp, _byterange=(0, 0)) + self._key_info[key] = info files.append(write_fp) # reading will reload to in-memory cache if need be # (since dumping may have loaded the underlying data, let's not keep it) @@ -281,7 +288,7 @@ def __delitem__(self, key: str) -> None: raise RuntimeError(f"Could not figure cache_type in {self.folder}") info = self._key_info.pop(key) keyfile = self.folder / (info["uid"] + ".key") - keyfile.unlink() + keyfile.unlink(missing_ok=True) dumper = DumperLoader.CLASSES[self.cache_type]() fp = dumper.filepath(self.folder / info["uid"]) with utils.fast_unlink(fp): # moves then delete to avoid weird effects From bf15e40a0bd9926e2e2361c849c6f234edb36465 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Thu, 9 Jan 2025 16:32:31 +0100 Subject: [PATCH 06/36] wip --- exca/cachedict.py | 13 ++++++++++--- exca/test_cachedict.py | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index 93c410e..f09a690 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -169,13 +169,14 @@ def _load_info_files(self) -> None: num = 0 with fp.open("rb") as f: for line in f: + count = len(line) line = line.strip() if not line: continue info = json.loads(line.decode("utf8")) - info.update(_jsonl=fp, _byterange=(num, num + len(line))) + info.update(_jsonl=fp, _byterange=(num, num + count)) self._key_info[info.pop("key")] = info - num += len(line) + num += count def values(self) -> tp.Iterable[X]: for key in self: @@ -262,8 +263,9 @@ def __setitem__(self, key: str, value: X) -> None: write_fp = self._write_fp with write_fp.open("ab") as f: b = json.dumps(info).encode("utf8") + current = f.tell() f.write(b + b"\n") - info.update(_jsonl=write_fp, _byterange=(0, 0)) + info.update(_jsonl=write_fp, _byterange=(current, current + len(b) + 1)) self._key_info[key] = info files.append(write_fp) # reading will reload to in-memory cache if need be @@ -289,6 +291,11 @@ def __delitem__(self, key: str) -> None: info = self._key_info.pop(key) keyfile = self.folder / (info["uid"] + ".key") keyfile.unlink(missing_ok=True) + jsonl = Path(info["_jsonl"]) + brange = info["_byterange"] + with jsonl.open("rb+") as f: + f.seek(brange[0]) + f.write(b" " * (brange[1] - brange[0] - 1) + b"\n") dumper = DumperLoader.CLASSES[self.cache_type]() fp = dumper.filepath(self.folder / info["uid"]) with utils.fast_unlink(fp): # moves then delete to avoid weird effects diff --git a/exca/test_cachedict.py b/exca/test_cachedict.py index e4b4fd1..9ccbf65 100644 --- a/exca/test_cachedict.py +++ b/exca/test_cachedict.py @@ -140,3 +140,42 @@ def test_info_jsonl( assert "y" in cache cache = cd.CacheDict(folder=tmp_path, keep_in_ram=False) assert len(cache) == 3 + cache.clear() + assert not cache + assert not list(tmp_path.iterdir()) + + +@pytest.mark.parametrize( + "legacy_write,remove_jsonl", ((True, True), (True, False), (False, False)) +) +def test_info_jsonl_deletion( + tmp_path: Path, legacy_write: bool, remove_jsonl: bool +) -> None: + for k, v in [("x", 12), ("blüblû", 3)]: + cache: cd.CacheDict[str] = cd.CacheDict( + folder=tmp_path, keep_in_ram=False, _write_legacy_key_files=legacy_write + ) + cache[k] = v + _ = cache.keys() # listing + info = cache._key_info + cache: cd.CacheDict[int] = cd.CacheDict( + folder=tmp_path, keep_in_ram=False, _write_legacy_key_files=legacy_write + ) + _ = cache.keys() # listing + assert cache._key_info == info + for sub in info.values(): + fp = Path(sub["_jsonl"]) + r = 
sub["_byterange"] + with fp.open("rb") as f: + f.seek(r[0]) + out = f.read(r[1] - r[0]) + assert out.startswith(b"{") and out.endswith(b"}\n") + print(out) + # remove one + chosen = np.random.choice(["x", "blüblû"]) + del cache[chosen] + assert len(cache) == 1 + cache: cd.CacheDict[int] = cd.CacheDict( + folder=tmp_path, keep_in_ram=False, _write_legacy_key_files=legacy_write + ) + assert len(cache) == 1 From bbda15eb557a9e63a9be557334b135a0ff37c0b2 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Thu, 9 Jan 2025 17:09:04 +0100 Subject: [PATCH 07/36] working --- exca/cachedict.py | 12 +++++++----- exca/test_cachedict.py | 24 +++++++++++++++--------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index f09a690..15c8009 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -291,11 +291,13 @@ def __delitem__(self, key: str) -> None: info = self._key_info.pop(key) keyfile = self.folder / (info["uid"] + ".key") keyfile.unlink(missing_ok=True) - jsonl = Path(info["_jsonl"]) - brange = info["_byterange"] - with jsonl.open("rb+") as f: - f.seek(brange[0]) - f.write(b" " * (brange[1] - brange[0] - 1) + b"\n") + if "_jsonl" in info: + jsonl = Path(info["_jsonl"]) + brange = info["_byterange"] + # overwrite with whitespaces + with jsonl.open("rb+") as f: + f.seek(brange[0]) + f.write(b" " * (brange[1] - brange[0] - 1) + b"\n") dumper = DumperLoader.CLASSES[self.cache_type]() fp = dumper.filepath(self.folder / info["uid"]) with utils.fast_unlink(fp): # moves then delete to avoid weird effects diff --git a/exca/test_cachedict.py b/exca/test_cachedict.py index 9ccbf65..16d1e67 100644 --- a/exca/test_cachedict.py +++ b/exca/test_cachedict.py @@ -151,14 +151,15 @@ def test_info_jsonl( def test_info_jsonl_deletion( tmp_path: Path, legacy_write: bool, remove_jsonl: bool ) -> None: - for k, v in [("x", 12), ("blüblû", 3)]: - cache: cd.CacheDict[str] = cd.CacheDict( + keys = ("x", "blüblû", "stuff") + for k in keys: + cache: cd.CacheDict[int] = cd.CacheDict( folder=tmp_path, keep_in_ram=False, _write_legacy_key_files=legacy_write ) - cache[k] = v + cache[k] = 12 if k == "x" else 3 _ = cache.keys() # listing info = cache._key_info - cache: cd.CacheDict[int] = cd.CacheDict( + cache = cd.CacheDict( folder=tmp_path, keep_in_ram=False, _write_legacy_key_files=legacy_write ) _ = cache.keys() # listing @@ -170,12 +171,17 @@ def test_info_jsonl_deletion( f.seek(r[0]) out = f.read(r[1] - r[0]) assert out.startswith(b"{") and out.endswith(b"}\n") - print(out) + if remove_jsonl: + for ipath in tmp_path.glob("*.jsonl"): + ipath.unlink() + cache = cd.CacheDict( + folder=tmp_path, keep_in_ram=False, _write_legacy_key_files=legacy_write + ) # remove one - chosen = np.random.choice(["x", "blüblû"]) + chosen = np.random.choice(keys) del cache[chosen] - assert len(cache) == 1 - cache: cd.CacheDict[int] = cd.CacheDict( + assert len(cache) == 2 + cache = cd.CacheDict( folder=tmp_path, keep_in_ram=False, _write_legacy_key_files=legacy_write ) - assert len(cache) == 1 + assert len(cache) == 2 From 6576354973203559ad4969b32cb8c96bd5c203bf Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Thu, 9 Jan 2025 17:12:14 +0100 Subject: [PATCH 08/36] fix --- exca/cachedict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index 15c8009..4a5d76c 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -259,7 +259,7 @@ def __setitem__(self, key: str, value: X) -> None: keyfile.write_text(key, encoding="utf8") 
files.append(keyfile) # new write - info = {"key": key, "uid": uid} + info: dict[str, tp.Any] = {"key": key, "uid": uid} write_fp = self._write_fp with write_fp.open("ab") as f: b = json.dumps(info).encode("utf8") From ecde5d8da399185bd904542562ea5a28da9a4755 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Mon, 13 Jan 2025 13:50:47 +0100 Subject: [PATCH 09/36] wip --- exca/test_cachedict.py | 1 + 1 file changed, 1 insertion(+) diff --git a/exca/test_cachedict.py b/exca/test_cachedict.py index 16d1e67..e585d47 100644 --- a/exca/test_cachedict.py +++ b/exca/test_cachedict.py @@ -92,6 +92,7 @@ def test_data_dump_suffix(tmp_path: Path, data: tp.Any) -> None: num = len(k_name) - 4 assert k_name[:num] == k_name[:num], f"Non-matching names {k_name} and {v_name}" assert isinstance(cache["blublu.tmp"], type(data)) + assert (tmp_path / j_name).read_text().startswith("{") @pytest.mark.parametrize( From 9efc145843f8956731573a21e920b091c8e3aec8 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Mon, 13 Jan 2025 14:01:02 +0100 Subject: [PATCH 10/36] factor --- exca/cachedict.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index 4a5d76c..cb65b4c 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -132,13 +132,12 @@ def keys(self) -> tp.Iterator[str]: if self.folder is not None: folder = Path(self.folder) # read all existing key files as fast as possible (pathlib.glob is slow) + find_cmd = 'find . -type f -name "*.key"' try: - out = subprocess.check_output( - 'find . -type f -name "*.key"', shell=True, cwd=folder - ).decode("utf8") + out = subprocess.check_output(find_cmd, shell=True, cwd=folder) except subprocess.CalledProcessError as e: - out = e.output.decode("utf8") # stderr contains missing tmp files - names = out.splitlines() + out = e.output + names = out.decode("utf8").splitlines() jobs = {} # parallelize content reading with futures.ThreadPoolExecutor() as ex: From e4186e60d959dc663f25339febb05016f478bb9a Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Mon, 13 Jan 2025 15:17:19 +0100 Subject: [PATCH 11/36] nit --- exca/cachedict.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index cb65b4c..1f6291b 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -156,13 +156,12 @@ def _load_info_files(self) -> None: return folder = Path(self.folder) # read all existing jsonl files + find_cmd = 'find . -type f -name "*-info.jsonl"' try: - out = subprocess.check_output( - 'find . 
-type f -name "*-info.jsonl"', shell=True, cwd=folder - ).decode("utf8") + out = subprocess.check_output(find_cmd, shell=True, cwd=folder) except subprocess.CalledProcessError as e: - out = e.output.decode("utf8") # stderr contains missing tmp files - names = out.splitlines() + out = e.output # stderr contains missing tmp files + names = out.decode("utf8").splitlines() for name in names: fp = folder / name num = 0 @@ -260,13 +259,14 @@ def __setitem__(self, key: str, value: X) -> None: # new write info: dict[str, tp.Any] = {"key": key, "uid": uid} write_fp = self._write_fp + if not write_fp.exists(): + files.append(write_fp) # no need to update the file permission again with write_fp.open("ab") as f: b = json.dumps(info).encode("utf8") current = f.tell() f.write(b + b"\n") info.update(_jsonl=write_fp, _byterange=(current, current + len(b) + 1)) self._key_info[key] = info - files.append(write_fp) # reading will reload to in-memory cache if need be # (since dumping may have loaded the underlying data, let's not keep it) if self.permissions is not None: From bc0b8601b39d6b6514b02c697463aabd20437a43 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Mon, 13 Jan 2025 16:48:22 +0100 Subject: [PATCH 12/36] dict --- exca/map.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/exca/map.py b/exca/map.py index 773e65d..e5c6c8c 100644 --- a/exca/map.py +++ b/exca/map.py @@ -461,8 +461,8 @@ def _method_override_futures(self, items: tp.Sequence[tp.Any]) -> tp.Iterator[tp def _call_and_store( self, items: tp.Sequence[tp.Any], use_cache_dict: bool = True - ) -> tp.Dict[str, tp.Any]: - d: tp.Dict[str, tp.Any] = self.cache_dict if use_cache_dict else {} # type: ignore + ) -> dict[str, tp.Any]: + d: dict[str, tp.Any] = self.cache_dict if use_cache_dict else {} # type: ignore imethod = self._infra_method if imethod is None: raise RuntimeError(f"Infra was not applied: {self!r}") From 197fabf571195fe87df0265253bf50d8b5ced18a Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Tue, 14 Jan 2025 15:45:00 +0100 Subject: [PATCH 13/36] wip --- exca/cachedict.py | 19 ++++++++++++------ exca/dumperloader.py | 47 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 6 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index 1f6291b..f74b91f 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -11,15 +11,13 @@ import json import logging import shutil -import socket import subprocess -import threading import typing as tp from concurrent import futures from pathlib import Path from . 
import utils -from .dumperloader import DumperLoader +from .dumperloader import DumperLoader, host_pid X = tp.TypeVar("X") Y = tp.TypeVar("Y") @@ -85,6 +83,7 @@ def __init__( self.permissions = permissions self._keep_in_ram = keep_in_ram self._write_legacy_key_files = _write_legacy_key_files + self._loaders: dict[str, DumperLoader] = {} # loaders are persistent if self.folder is None and not keep_in_ram: raise ValueError("At least folder or keep_in_ram should be activated") @@ -110,7 +109,7 @@ def __repr__(self) -> str: def _write_fp(self) -> Path: if self.folder is None: raise RuntimeError("No write filepath with no provided folder") - name = f"{socket.gethostname()}-{threading.get_native_id()}-info.jsonl" + name = f"{host_pid()}-info.jsonl" return Path(self.folder) / name def clear(self) -> None: @@ -207,12 +206,20 @@ def __getitem__(self, key: str) -> X: raise RuntimeError(f"Could not figure cache_type in {self.folder}") info = self._key_info[key] uid = info["uid"] - loader = DumperLoader.CLASSES[self.cache_type] + loader = self._get_loader() loaded = loader.load(self.folder / uid) if self._keep_in_ram: self._ram_data[key] = loaded return loaded # type: ignore + def _get_loader(self) -> DumperLoader: + key = host_pid() # make sure we dont use a loader from another thread + if self.folder is None: + raise RuntimeError("Cannot get loader with no folder") + if key not in self._loaders: + self._loaders[key] = DumperLoader.CLASSES[self.cache_type](self.folder) + return self._loaders[key] + def _set_cache_type(self, cache_type: str | None) -> None: if self.folder is None: return # not needed @@ -248,7 +255,7 @@ def __setitem__(self, key: str, value: X) -> None: self._ram_data[key] = value if self.folder is not None: uid = _string_uid(key) # use a safe mapping - dumper = DumperLoader.CLASSES[self.cache_type]() + dumper = self._get_loader() dumper.dump(self.folder / uid, value) dumpfile = dumper.filepath(self.folder / uid) files = [dumpfile] diff --git a/exca/dumperloader.py b/exca/dumperloader.py index 056a7f0..b890062 100644 --- a/exca/dumperloader.py +++ b/exca/dumperloader.py @@ -5,7 +5,10 @@ # LICENSE file in the root directory of this source tree. 
import pickle +import socket +import threading import typing as tp +import uuid import warnings from pathlib import Path @@ -17,11 +20,18 @@ Y = tp.TypeVar("Y", bound=tp.Type[tp.Any]) +def host_pid() -> str: + return f"{socket.gethostname()}-{threading.get_native_id()}" + + class DumperLoader(tp.Generic[X]): CLASSES: tp.MutableMapping[str, "tp.Type[DumperLoader[tp.Any]]"] = {} DEFAULTS: tp.MutableMapping[tp.Any, "tp.Type[DumperLoader[tp.Any]]"] = {} SUFFIX = "" + def __init__(self, folder: str | Path = "") -> None: + self.folder = Path(folder) + @classmethod def filepath(cls, basepath: str | Path) -> Path: basepath = Path(basepath) @@ -98,6 +108,43 @@ def load(cls, basepath: Path) -> np.ndarray: return np.load(cls.filepath(basepath), mmap_mode="r") # type: ignore +class NumpyMemmapArrayDumper(DumperLoader[np.ndarray]): + + def __init__(self, folder: Path | str) -> None: + super().__init__(folder) + self.size = 1 + self.row = 0 + self._name: str | None = None + + def load(self, filename: str, row: int) -> np.ndarray: + return np.load(self.folder / filename, mmap_mode="r")[row] # type: ignore + + def dump(self, key: str, value: np.ndarray) -> dict[str, tp.Any]: + if not isinstance(value, np.ndarray): + raise TypeError(f"Expected numpy array but got {value} ({type(value)})") + mode = "r+" + shape: tp.Any = None + if self._name is None: + self.size = max(self.size, 1) + shape = (self.size,) + value.shape + self._name = f"{host_pid()}-{uuid.uuid4().hex[:8]}.npy" + self.row = 0 + mode = "w+" + fp = self.folder / self._name + memmap = np.lib.format.open_memmap( + filename=fp, mode=mode, dtype=float, shape=shape + ) + memmap[self.row] = value + memmap.flush() + memmap.close() + out = {"row": self.row, "filename": str(fp)} + self.row += 1 + if self.size == self.row: + self.size = max(self.size + 1, int(round(1.2 * self.size))) + self._name = None + return out + + try: import pandas as pd except ImportError: From c997ca35691ea271ebf4c8cb21e55845f82ffce6 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Tue, 14 Jan 2025 17:47:33 +0100 Subject: [PATCH 14/36] working --- exca/cachedict.py | 52 +++++++------- exca/dumperloader.py | 142 +++++++++++++++++++++----------------- exca/test_cachedict.py | 20 ------ exca/test_dumperloader.py | 20 ++++++ 4 files changed, 121 insertions(+), 113 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index f74b91f..dc7658a 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -7,7 +7,6 @@ """ Disk, RAM caches """ -import hashlib import json import logging import shutil @@ -17,23 +16,14 @@ from pathlib import Path from . import utils -from .dumperloader import DumperLoader, host_pid +from .dumperloader import DumperLoader, StaticDumperLoader, _string_uid, host_pid X = tp.TypeVar("X") Y = tp.TypeVar("Y") -UNSAFE_TABLE = {ord(char): "-" for char in "/\\\n\t "} logger = logging.getLogger(__name__) -def _string_uid(string: str) -> str: - out = string.translate(UNSAFE_TABLE) - if len(out) > 80: - out = out[:40] + "[.]" + out[-40:] - h = hashlib.md5(string.encode("utf8")).hexdigest()[:8] - return f"{out}-{h}" - - class CacheDict(tp.Generic[X]): """Dictionary-like object that caches and loads data on disk and ram. 
@@ -172,7 +162,7 @@ def _load_info_files(self) -> None: continue info = json.loads(line.decode("utf8")) info.update(_jsonl=fp, _byterange=(num, num + count)) - self._key_info[info.pop("key")] = info + self._key_info[info.pop("_key")] = info num += count def values(self) -> tp.Iterable[X]: @@ -205,9 +195,8 @@ def __getitem__(self, key: str) -> X: if self.cache_type is None: raise RuntimeError(f"Could not figure cache_type in {self.folder}") info = self._key_info[key] - uid = info["uid"] loader = self._get_loader() - loaded = loader.load(self.folder / uid) + loaded = loader.load(**{x: y for x, y in info.items() if not x.startswith("_")}) if self._keep_in_ram: self._ram_data[key] = loaded return loaded # type: ignore @@ -254,17 +243,15 @@ def __setitem__(self, key: str, value: X) -> None: if self._keep_in_ram and self.folder is None: self._ram_data[key] = value if self.folder is not None: - uid = _string_uid(key) # use a safe mapping dumper = self._get_loader() - dumper.dump(self.folder / uid, value) - dumpfile = dumper.filepath(self.folder / uid) - files = [dumpfile] - if self._write_legacy_key_files: - keyfile = self.folder / (uid + ".key") + info = dumper.dump(key, value) + files = [self.folder / info["filename"]] + if self._write_legacy_key_files and isinstance(dumper, StaticDumperLoader): + keyfile = self.folder / (info["filename"][: -len(dumper.SUFFIX)] + ".key") keyfile.write_text(key, encoding="utf8") files.append(keyfile) # new write - info: dict[str, tp.Any] = {"key": key, "uid": uid} + info["_key"] = key write_fp = self._write_fp if not write_fp.exists(): files.append(write_fp) # no need to update the file permission again @@ -294,9 +281,11 @@ def __delitem__(self, key: str) -> None: self.check_cache_type() if self.cache_type is None: raise RuntimeError(f"Could not figure cache_type in {self.folder}") + loader = self._get_loader() info = self._key_info.pop(key) - keyfile = self.folder / (info["uid"] + ".key") - keyfile.unlink(missing_ok=True) + if isinstance(loader, StaticDumperLoader): + keyfile = self.folder / (info["filename"][: -len(loader.SUFFIX)] + ".key") + keyfile.unlink(missing_ok=True) if "_jsonl" in info: jsonl = Path(info["_jsonl"]) brange = info["_byterange"] @@ -304,10 +293,11 @@ def __delitem__(self, key: str) -> None: with jsonl.open("rb+") as f: f.seek(brange[0]) f.write(b" " * (brange[1] - brange[0] - 1) + b"\n") - dumper = DumperLoader.CLASSES[self.cache_type]() - fp = dumper.filepath(self.folder / info["uid"]) - with utils.fast_unlink(fp): # moves then delete to avoid weird effects - pass + info = {x: y for x, y in info.items() if not x.startswith("_")} + if len(info) == 1: # only filename -> we can remove it as it is not shared + # moves then delete to avoid weird effects + with utils.fast_unlink(Path(self.folder) / info["filename"]): + pass def __contains__(self, key: str) -> bool: # in-memory cache @@ -322,6 +312,12 @@ def __contains__(self, key: str) -> bool: uid = _string_uid(key) fp = self.folder / f"{uid}.key" if fp.exists(): - self._key_info[key] = {"uid": uid} + loader = self._get_loader() + if not isinstance(loader, StaticDumperLoader): + raise RuntimeError( + "Cannot regenerate info from non-static dumper-loader" + ) + filename = _string_uid(key) + loader.SUFFIX + self._key_info[key] = {"filename": filename} return True return False # lazy check diff --git a/exca/dumperloader.py b/exca/dumperloader.py index b890062..41e378f 100644 --- a/exca/dumperloader.py +++ b/exca/dumperloader.py @@ -4,6 +4,7 @@ # This source code is licensed under the license 
found in the # LICENSE file in the root directory of this source tree. +import hashlib import pickle import socket import threading @@ -19,6 +20,16 @@ X = tp.TypeVar("X") Y = tp.TypeVar("Y", bound=tp.Type[tp.Any]) +UNSAFE_TABLE = {ord(char): "-" for char in "/\\\n\t "} + + +def _string_uid(string: str) -> str: + out = string.translate(UNSAFE_TABLE) + if len(out) > 80: + out = out[:40] + "[.]" + out[-40:] + h = hashlib.md5(string.encode("utf8")).hexdigest()[:8] + return f"{out}-{h}" + def host_pid() -> str: return f"{socket.gethostname()}-{threading.get_native_id()}" @@ -27,29 +38,19 @@ def host_pid() -> str: class DumperLoader(tp.Generic[X]): CLASSES: tp.MutableMapping[str, "tp.Type[DumperLoader[tp.Any]]"] = {} DEFAULTS: tp.MutableMapping[tp.Any, "tp.Type[DumperLoader[tp.Any]]"] = {} - SUFFIX = "" def __init__(self, folder: str | Path = "") -> None: self.folder = Path(folder) - @classmethod - def filepath(cls, basepath: str | Path) -> Path: - basepath = Path(basepath) - if not cls.SUFFIX: - raise RuntimeError(f"Default filepath used with no suffix for class {cls}") - return basepath.parent / f"{basepath.name}{cls.SUFFIX}" - @classmethod def __init_subclass__(cls, **kwargs: tp.Any) -> None: super().__init_subclass__(**kwargs) DumperLoader.CLASSES[cls.__name__] = cls - @classmethod - def load(cls, basepath: Path) -> X: + def load(self, filename: str, **kwargs: tp.Any) -> X: raise NotImplementedError - @classmethod - def dump(cls, basepath: Path, value: X) -> None: + def dump(self, key: str, value: X) -> dict[str, tp.Any]: raise NotImplementedError @staticmethod @@ -65,36 +66,55 @@ def default_class(type_: Y) -> tp.Type["DumperLoader[Y]"]: return Cls # type: ignore -class Pickle(DumperLoader[tp.Any]): +class StaticDumperLoader(DumperLoader[X]): + SUFFIX = "" + + def load(self, filename: str) -> X: + filepath = self.folder / filename + return self.static_load(filepath) + + def dump(self, key: str, value: X) -> dict[str, tp.Any]: + uid = _string_uid(key) + filename = uid + self.SUFFIX + self.static_dump(filepath=self.folder / filename, value=value) + return {"filename": filename} + + @classmethod + def static_load(cls, filepath: Path) -> X: + raise NotImplementedError + + @classmethod + def static_dump(cls, filepath: Path, value: X) -> None: + raise NotImplementedError + + +class Pickle(StaticDumperLoader[tp.Any]): SUFFIX = ".pkl" @classmethod - def load(cls, basepath: Path) -> tp.Any: - fp = cls.filepath(basepath) - with fp.open("rb") as f: + def static_load(cls, filepath: Path) -> tp.Any: + with filepath.open("rb") as f: return pickle.load(f) @classmethod - def dump(cls, basepath: Path, value: tp.Any) -> None: - fp = cls.filepath(basepath) - with utils.temporary_save_path(fp) as tmp: + def static_dump(cls, filepath: Path, value: tp.Any) -> None: + with utils.temporary_save_path(filepath) as tmp: with tmp.open("wb") as f: pickle.dump(value, f) -class NumpyArray(DumperLoader[np.ndarray]): +class NumpyArray(StaticDumperLoader[np.ndarray]): SUFFIX = ".npy" @classmethod - def load(cls, basepath: Path) -> np.ndarray: - return np.load(cls.filepath(basepath)) # type: ignore + def static_load(cls, filepath: Path) -> np.ndarray: + return np.load(filepath) # type: ignore @classmethod - def dump(cls, basepath: Path, value: np.ndarray) -> None: + def static_dump(cls, filepath: Path, value: np.ndarray) -> None: if not isinstance(value, np.ndarray): raise TypeError(f"Expected numpy array but got {value} ({type(value)})") - fp = cls.filepath(basepath) - with utils.temporary_save_path(fp) as tmp: + with 
utils.temporary_save_path(filepath) as tmp: np.save(tmp, value) @@ -104,8 +124,8 @@ def dump(cls, basepath: Path, value: np.ndarray) -> None: class NumpyMemmapArray(NumpyArray): @classmethod - def load(cls, basepath: Path) -> np.ndarray: - return np.load(cls.filepath(basepath), mmap_mode="r") # type: ignore + def static_load(cls, filepath: Path) -> np.ndarray: + return np.load(filepath, mmap_mode="r") # type: ignore class NumpyMemmapArrayDumper(DumperLoader[np.ndarray]): @@ -131,7 +151,7 @@ def dump(self, key: str, value: np.ndarray) -> dict[str, tp.Any]: self.row = 0 mode = "w+" fp = self.folder / self._name - memmap = np.lib.format.open_memmap( + memmap = np.lib.format.open_memmap( # type: ignore filename=fp, mode=mode, dtype=float, shape=shape ) memmap[self.row] = value @@ -151,18 +171,18 @@ def dump(self, key: str, value: np.ndarray) -> dict[str, tp.Any]: pass else: - class PandasDataFrame(DumperLoader[pd.DataFrame]): + class PandasDataFrame(StaticDumperLoader[pd.DataFrame]): SUFFIX = ".csv" @classmethod - def load(cls, basepath: Path) -> pd.DataFrame: - fp = cls.filepath(basepath) - return pd.read_csv(fp, index_col=0, keep_default_na=False, na_values=[""]) + def static_load(cls, filepath: Path) -> pd.DataFrame: + return pd.read_csv( + filepath, index_col=0, keep_default_na=False, na_values=[""] + ) @classmethod - def dump(cls, basepath: Path, value: pd.DataFrame) -> None: - fp = cls.filepath(basepath) - with utils.temporary_save_path(fp) as tmp: + def static_dump(cls, filepath: Path, value: pd.DataFrame) -> None: + with utils.temporary_save_path(filepath) as tmp: value.to_csv(tmp, index=True) DumperLoader.DEFAULTS[pd.DataFrame] = PandasDataFrame @@ -174,21 +194,19 @@ def dump(cls, basepath: Path, value: pd.DataFrame) -> None: pass else: - class ParquetPandasDataFrame(DumperLoader[pd.DataFrame]): + class ParquetPandasDataFrame(StaticDumperLoader[pd.DataFrame]): SUFFIX = ".parquet" @classmethod - def load(cls, basepath: Path) -> pd.DataFrame: - fp = cls.filepath(basepath) - if not fp.exists(): + def static_load(cls, filepath: Path) -> pd.DataFrame: + if not filepath.exists(): # fallback to csv for compatibility when updating to parquet - return PandasDataFrame.load(basepath) + return PandasDataFrame.static_load(filepath.with_suffix(".csv")) return pd.read_parquet(fp, dtype_backend="numpy_nullable") @classmethod - def dump(cls, basepath: Path, value: pd.DataFrame) -> None: - fp = cls.filepath(basepath) - with utils.temporary_save_path(fp) as tmp: + def static_dump(cls, filepath: Path, value: pd.DataFrame) -> None: + with utils.temporary_save_path(filepath) as tmp: value.to_parquet(tmp) @@ -198,24 +216,22 @@ def dump(cls, basepath: Path, value: pd.DataFrame) -> None: pass else: - class MneRaw(DumperLoader[mne.io.Raw]): + class MneRaw(StaticDumperLoader[mne.io.Raw]): SUFFIX = "-raw.fif" @classmethod - def load(cls, basepath: Path) -> mne.io.Raw: - fp = cls.filepath(basepath) + def static_load(cls, filepath: Path) -> mne.io.Raw: try: - return mne.io.read_raw_fif(fp, verbose=False, allow_maxshield=False) + return mne.io.read_raw_fif(filepath, verbose=False, allow_maxshield=False) except ValueError: - raw = mne.io.read_raw_fif(fp, verbose=False, allow_maxshield=True) + raw = mne.io.read_raw_fif(filepath, verbose=False, allow_maxshield=True) msg = "MaxShield data detected, consider applying Maxwell filter and interpolating bad channels" warnings.warn(msg) return raw @classmethod - def dump(cls, basepath: Path, value: mne.io.Raw) -> None: - fp = cls.filepath(basepath) - with 
utils.temporary_save_path(fp) as tmp: + def static_dump(cls, filepath: Path, value: mne.io.Raw) -> None: + with utils.temporary_save_path(filepath) as tmp: value.save(tmp) DumperLoader.DEFAULTS[(mne.io.Raw, mne.io.RawArray)] = MneRaw @@ -229,18 +245,16 @@ def dump(cls, basepath: Path, value: mne.io.Raw) -> None: Nifti = nibabel.Nifti1Image | nibabel.Nifti2Image - class NibabelNifti(DumperLoader[Nifti]): + class NibabelNifti(StaticDumperLoader[Nifti]): SUFFIX = ".nii.gz" @classmethod - def load(cls, basepath: Path) -> mne.io.Raw: - fp = cls.filepath(basepath) - return nibabel.load(fp, mmap=True) + def static_load(cls, filepath: Path) -> mne.io.Raw: + return nibabel.load(filepath, mmap=True) @classmethod - def dump(cls, basepath: Path, value: Nifti) -> None: - fp = cls.filepath(basepath) - with utils.temporary_save_path(fp) as tmp: + def static_dump(cls, filepath: Path, value: Nifti) -> None: + with utils.temporary_save_path(filepath) as tmp: nibabel.save(value, tmp) DumperLoader.DEFAULTS[(nibabel.Nifti1Image, nibabel.Nifti2Image)] = NibabelNifti @@ -264,22 +278,20 @@ def is_view(x: torch.Tensor) -> bool: storage_size = len(x.untyped_storage()) // x.dtype.itemsize return storage_size != x.numel() or not x.is_contiguous() - class TorchTensor(DumperLoader[torch.Tensor]): + class TorchTensor(StaticDumperLoader[torch.Tensor]): SUFFIX = ".pt" @classmethod - def load(cls, basepath: Path) -> torch.Tensor: - fp = cls.filepath(basepath) - return torch.load(fp, map_location="cpu") # type: ignore + def static_load(cls, filepath: Path) -> torch.Tensor: + return torch.load(filepath, map_location="cpu") # type: ignore @classmethod - def dump(cls, basepath: Path, value: torch.Tensor) -> None: + def static_dump(cls, filepath: Path, value: torch.Tensor) -> None: if not isinstance(value, torch.Tensor): raise TypeError(f"Expected torch Tensor but got {value} ({type(value)}") if is_view(value): value = value.clone() - fp = cls.filepath(basepath) - with utils.temporary_save_path(fp) as tmp: + with utils.temporary_save_path(filepath) as tmp: torch.save(value.detach().cpu(), tmp) DumperLoader.DEFAULTS[torch.Tensor] = TorchTensor diff --git a/exca/test_cachedict.py b/exca/test_cachedict.py index e585d47..8753b26 100644 --- a/exca/test_cachedict.py +++ b/exca/test_cachedict.py @@ -51,26 +51,6 @@ def test_array_cache(tmp_path: Path, in_ram: bool) -> None: assert not cache2 -@pytest.mark.parametrize( - "string,expected", - [ - ( - "whave\t-er I want/to\nput i^n there", - "whave--er-I-want-to-put-i^n-there-391137b5", - ), - ( - "whave\t-er I want/to put i^n there", # same but space instead of line return - "whave--er-I-want-to-put-i^n-there-cef06284", - ), - (50 * "a" + 50 * "b", 40 * "a" + "[.]" + 40 * "b" + "-932620a9"), - (51 * "a" + 50 * "b", 40 * "a" + "[.]" + 40 * "b" + "-86bb658a"), # longer - ], -) -def test_string_uid(string: str, expected: str) -> None: - out = cd._string_uid(string) - assert out == expected - - @pytest.mark.parametrize( "data", ( diff --git a/exca/test_dumperloader.py b/exca/test_dumperloader.py index b97c189..cf0134c 100644 --- a/exca/test_dumperloader.py +++ b/exca/test_dumperloader.py @@ -92,3 +92,23 @@ def test_dump_torch_view(tmp_path: Path) -> None: def test_default_class() -> None: out = dumperloader.DumperLoader.default_class(int | None) # type: ignore assert out is dumperloader.Pickle + + +@pytest.mark.parametrize( + "string,expected", + [ + ( + "whave\t-er I want/to\nput i^n there", + "whave--er-I-want-to-put-i^n-there-391137b5", + ), + ( + "whave\t-er I want/to put i^n there", # 
same but space instead of line return + "whave--er-I-want-to-put-i^n-there-cef06284", + ), + (50 * "a" + 50 * "b", 40 * "a" + "[.]" + 40 * "b" + "-932620a9"), + (51 * "a" + 50 * "b", 40 * "a" + "[.]" + 40 * "b" + "-86bb658a"), # longer + ], +) +def test_string_uid(string: str, expected: str) -> None: + out = dumperloader._string_uid(string) + assert out == expected From 2f187be234074bd966e09e5afdf70561efd94028 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Tue, 14 Jan 2025 17:55:34 +0100 Subject: [PATCH 15/36] wip --- exca/dumperloader.py | 2 +- exca/test_dumperloader.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/exca/dumperloader.py b/exca/dumperloader.py index 41e378f..5c0645c 100644 --- a/exca/dumperloader.py +++ b/exca/dumperloader.py @@ -202,7 +202,7 @@ def static_load(cls, filepath: Path) -> pd.DataFrame: if not filepath.exists(): # fallback to csv for compatibility when updating to parquet return PandasDataFrame.static_load(filepath.with_suffix(".csv")) - return pd.read_parquet(fp, dtype_backend="numpy_nullable") + return pd.read_parquet(filepath, dtype_backend="numpy_nullable") @classmethod def static_dump(cls, filepath: Path, value: pd.DataFrame) -> None: diff --git a/exca/test_dumperloader.py b/exca/test_dumperloader.py index cf0134c..704a3b3 100644 --- a/exca/test_dumperloader.py +++ b/exca/test_dumperloader.py @@ -40,10 +40,10 @@ def test_data_dump_suffix(tmp_path: Path, data: tp.Any) -> None: Cls = dumperloader.DumperLoader.default_class(type(data)) if not isinstance(data, str): assert Cls is not dumperloader.Pickle - dl = Cls() + dl = Cls(tmp_path) # test with an extension, as it's easy to mess the new name with Path.with_suffix - dl.dump(tmp_path / "blublu.ext", data) - reloaded = dl.load(tmp_path / "blublu.ext") + info = dl.dump("blublu.ext", data) + reloaded = dl.load(**info) ExpectedCls = type(data) if ExpectedCls is mne.io.RawArray: ExpectedCls = mne.io.Raw @@ -55,9 +55,9 @@ def test_text_df(tmp_path: Path, name: str) -> None: df = pd.DataFrame( [{"type": "Word", "text": "None"}, {"type": "Something", "number": 12}] ) - dl = dumperloader.DumperLoader.CLASSES[name]() - dl.dump(tmp_path / "blublu", df) - reloaded = dl.load(tmp_path / "blublu") + dl = dumperloader.DumperLoader.CLASSES[name](tmp_path) + info = dl.dump("blublu", df) + reloaded = dl.load(**info) assert reloaded.loc[0, "text"] == "None" assert pd.isna(reloaded.loc[1, "text"]) # type: ignore assert pd.isna(reloaded.loc[0, "number"]) # type: ignore @@ -83,9 +83,9 @@ def test_dump_torch_view(tmp_path: Path) -> None: data = torch.arange(8)[:2] assert dumperloader.is_view(data) # reloading it should not be a view as it was cloned - dl = dumperloader.TorchTensor() - dl.dump(tmp_path / "blublu", data) - reloaded = dl.load(tmp_path / "blublu") + dl = dumperloader.TorchTensor(tmp_path) + info = dl.dump("blublu", data) + reloaded = dl.load(**info) assert not dumperloader.is_view(reloaded) From 69ceee81f496ffdfa9df1791f56a10f3136cf39b Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Tue, 14 Jan 2025 18:02:18 +0100 Subject: [PATCH 16/36] wip --- exca/cachedict.py | 2 ++ exca/dumperloader.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index dc7658a..2474e36 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -205,6 +205,8 @@ def _get_loader(self) -> DumperLoader: key = host_pid() # make sure we dont use a loader from another thread if self.folder is None: raise RuntimeError("Cannot get loader with no 
folder") + if self.cache_type is None: + raise RuntimeError("Shouldn't get called with no cache type") if key not in self._loaders: self._loaders[key] = DumperLoader.CLASSES[self.cache_type](self.folder) return self._loaders[key] diff --git a/exca/dumperloader.py b/exca/dumperloader.py index 5c0645c..530cd21 100644 --- a/exca/dumperloader.py +++ b/exca/dumperloader.py @@ -69,7 +69,7 @@ def default_class(type_: Y) -> tp.Type["DumperLoader[Y]"]: class StaticDumperLoader(DumperLoader[X]): SUFFIX = "" - def load(self, filename: str) -> X: + def load(self, filename: str) -> X: # type: ignore filepath = self.folder / filename return self.static_load(filepath) @@ -136,7 +136,7 @@ def __init__(self, folder: Path | str) -> None: self.row = 0 self._name: str | None = None - def load(self, filename: str, row: int) -> np.ndarray: + def load(self, filename: str, row: int) -> np.ndarray: # type: ignore return np.load(self.folder / filename, mmap_mode="r")[row] # type: ignore def dump(self, key: str, value: np.ndarray) -> dict[str, tp.Any]: From a07e8ebda4813cbd9e476ff0f3e7ef0f8f981a19 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Tue, 14 Jan 2025 18:15:23 +0100 Subject: [PATCH 17/36] fix --- exca/dumperloader.py | 5 ++--- exca/test_dumperloader.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/exca/dumperloader.py b/exca/dumperloader.py index 530cd21..628af0a 100644 --- a/exca/dumperloader.py +++ b/exca/dumperloader.py @@ -128,7 +128,7 @@ def static_load(cls, filepath: Path) -> np.ndarray: return np.load(filepath, mmap_mode="r") # type: ignore -class NumpyMemmapArrayDumper(DumperLoader[np.ndarray]): +class MultiMemmapArray(DumperLoader[np.ndarray]): def __init__(self, folder: Path | str) -> None: super().__init__(folder) @@ -156,8 +156,7 @@ def dump(self, key: str, value: np.ndarray) -> dict[str, tp.Any]: ) memmap[self.row] = value memmap.flush() - memmap.close() - out = {"row": self.row, "filename": str(fp)} + out = {"row": self.row, "filename": fp.name} self.row += 1 if self.size == self.row: self.size = max(self.size + 1, int(round(1.2 * self.size))) diff --git a/exca/test_dumperloader.py b/exca/test_dumperloader.py index 704a3b3..1215cd7 100644 --- a/exca/test_dumperloader.py +++ b/exca/test_dumperloader.py @@ -112,3 +112,16 @@ def test_default_class() -> None: def test_string_uid(string: str, expected: str) -> None: out = dumperloader._string_uid(string) assert out == expected + + +def test_multi_memmap_array(tmp_path: Path) -> None: + dl = dumperloader.MultiMemmapArray(folder=tmp_path) + dl.size = 2 + info = [] + x = np.random.rand(2, 3) + info.append(dl.dump("x", x)) + info.append(dl.dump("y", np.random.rand(2, 3))) + info.append(dl.dump("z", np.random.rand(2, 3))) + assert info[0]["filename"] == info[1]["filename"] + assert info[2]["filename"] != info[1]["filename"] + np.testing.assert_array_equal(dl.load(**info[0]), x) From c6f6c7aebb281a261fdf7e1cd9fb79e874456444 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Tue, 14 Jan 2025 18:41:29 +0100 Subject: [PATCH 18/36] HACK --- exca/cachedict.py | 7 +++++++ exca/map.py | 2 ++ 2 files changed, 9 insertions(+) diff --git a/exca/cachedict.py b/exca/cachedict.py index 2474e36..94483d8 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -89,6 +89,10 @@ def __init__( self._key_info: dict[str, dict[str, tp.Any]] = {} self.cache_type = cache_type self._set_cache_type(cache_type) + self._informed_size: int | None = None + + def inform_size(self, size: int) -> None: + self._informed_size = size def 
__repr__(self) -> str: name = self.__class__.__name__ @@ -209,6 +213,9 @@ def _get_loader(self) -> DumperLoader: raise RuntimeError("Shouldn't get called with no cache type") if key not in self._loaders: self._loaders[key] = DumperLoader.CLASSES[self.cache_type](self.folder) + if self._informed_size is not None: + if hasattr(self._loaders[key], "size"): + self._loaders[key].size = self._informed_size return self._loaders[key] def _set_cache_type(self, cache_type: str | None) -> None: diff --git a/exca/map.py b/exca/map.py index e5c6c8c..b299b5e 100644 --- a/exca/map.py +++ b/exca/map.py @@ -466,6 +466,8 @@ def _call_and_store( imethod = self._infra_method if imethod is None: raise RuntimeError(f"Infra was not applied: {self!r}") + if isinstance(d, CacheDict): + d.inform_size(len(items)) item_uid = imethod.item_uid if items: # make sure some overlapping job did not already run stuff keys = set(d) # update cache dict From cf6ebc0bc8ff401be53ee2cc6d3d405638e28c81 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Wed, 15 Jan 2025 10:29:22 +0100 Subject: [PATCH 19/36] simplify --- exca/cachedict.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index 94483d8..88e6488 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -99,13 +99,6 @@ def __repr__(self) -> str: keep_in_ram = self._keep_in_ram return f"{name}({self.folder},{keep_in_ram=})" - @property - def _write_fp(self) -> Path: - if self.folder is None: - raise RuntimeError("No write filepath with no provided folder") - name = f"{host_pid()}-info.jsonl" - return Path(self.folder) / name - def clear(self) -> None: self._ram_data.clear() self._key_info.clear() @@ -214,8 +207,9 @@ def _get_loader(self) -> DumperLoader: if key not in self._loaders: self._loaders[key] = DumperLoader.CLASSES[self.cache_type](self.folder) if self._informed_size is not None: - if hasattr(self._loaders[key], "size"): - self._loaders[key].size = self._informed_size + if hasattr(self._loaders[key], "size"): # HACKY! 
+ self._loaders[key].size = self._informed_size # type: ignore + self._informed_size = None return self._loaders[key] def _set_cache_type(self, cache_type: str | None) -> None: @@ -261,9 +255,11 @@ def __setitem__(self, key: str, value: X) -> None: files.append(keyfile) # new write info["_key"] = key - write_fp = self._write_fp - if not write_fp.exists(): - files.append(write_fp) # no need to update the file permission again + info_fp = Path(self.folder) / f"{host_pid()}-info.jsonl" + if not info_fp.exists(): + files.append( + write_fp + ) # no need to update the file permission if already there with write_fp.open("ab") as f: b = json.dumps(info).encode("utf8") current = f.tell() From 1bf76d34de4636825993483f5c9265860f65a227 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Wed, 15 Jan 2025 11:22:38 +0100 Subject: [PATCH 20/36] fix --- exca/cachedict.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index 88e6488..b5f60df 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -67,7 +67,7 @@ def __init__( keep_in_ram: bool = False, cache_type: None | str = None, permissions: int | None = 0o777, - _write_legacy_key_files: bool = True, + _write_legacy_key_files: bool = False, ) -> None: self.folder = None if folder is None else Path(folder) self.permissions = permissions @@ -257,14 +257,13 @@ def __setitem__(self, key: str, value: X) -> None: info["_key"] = key info_fp = Path(self.folder) / f"{host_pid()}-info.jsonl" if not info_fp.exists(): - files.append( - write_fp - ) # no need to update the file permission if already there - with write_fp.open("ab") as f: + # no need to update the file permission if pre-existing + files.append(info_fp) + with info_fp.open("ab") as f: b = json.dumps(info).encode("utf8") current = f.tell() f.write(b + b"\n") - info.update(_jsonl=write_fp, _byterange=(current, current + len(b) + 1)) + info.update(_jsonl=info_fp, _byterange=(current, current + len(b) + 1)) self._key_info[key] = info # reading will reload to in-memory cache if need be # (since dumping may have loaded the underlying data, let's not keep it) From ca2021c0013b6387525fb2e0bf149fc7ac73f958 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Wed, 15 Jan 2025 11:35:00 +0100 Subject: [PATCH 21/36] doc --- exca/cachedict.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index b5f60df..d271b2c 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -57,8 +57,9 @@ class CacheDict(tp.Generic[X]): Note ---- - Each item is cached as 1 file, with an additional .key file with the same name holding - the actual key for the item (which can differ from the file name) + Dicts write to .jsonl files to hold keys and how to read the + corresponding item. Different threads write to different jsonl + files to avoid interferences. 
""" def __init__( From 60f4e0828348521368a6b4d6519ebe1cf0e717d2 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Wed, 15 Jan 2025 11:35:23 +0100 Subject: [PATCH 22/36] fix --- exca/cachedict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index d271b2c..1ec9a78 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -68,7 +68,7 @@ def __init__( keep_in_ram: bool = False, cache_type: None | str = None, permissions: int | None = 0o777, - _write_legacy_key_files: bool = False, + _write_legacy_key_files: bool = True, ) -> None: self.folder = None if folder is None else Path(folder) self.permissions = permissions From 5ee64069fce7f77cd4e458721722b92045dc13da Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Wed, 15 Jan 2025 12:13:13 +0100 Subject: [PATCH 23/36] simplify --- exca/cachedict.py | 16 +--------------- exca/dumperloader.py | 6 ++++++ 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index 1ec9a78..2dbd3f9 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -188,8 +188,6 @@ def __getitem__(self, key: str) -> X: # https://stackoverflow.com/questions/3112546/os-path-exists-lies/3112717 self.folder.chmod(self.folder.stat().st_mode) _ = key in self - if self.cache_type is None: - self.check_cache_type() if self.cache_type is None: raise RuntimeError(f"Could not figure cache_type in {self.folder}") info = self._key_info[key] @@ -220,11 +218,8 @@ def _set_cache_type(self, cache_type: str | None) -> None: if cache_type is None: if fp.exists(): cache_type = fp.read_text() - if cache_type not in DumperLoader.CLASSES: - logger.warning("Ignoring cache_type file providing: %s", cache_type) - cache_type = None - self.check_cache_type(cache_type) if cache_type is not None: + DumperLoader.check_valid_cache_type(cache_type) self.cache_type = cache_type if not fp.exists(): self.folder.mkdir(exist_ok=True) @@ -232,13 +227,6 @@ def _set_cache_type(self, cache_type: str | None) -> None: if self.permissions is not None: fp.chmod(self.permissions) - @staticmethod - def check_cache_type(cache_type: None | str = None) -> None: - if cache_type is not None: - if cache_type not in DumperLoader.CLASSES: - avail = list(DumperLoader.CLASSES) - raise ValueError(f"Unknown {cache_type=}, use one of {avail}") - def __setitem__(self, key: str, value: X) -> None: if self.cache_type is None: cls = DumperLoader.default_class(type(value)) @@ -282,8 +270,6 @@ def __delitem__(self, key: str) -> None: self._ram_data.pop(key, None) if self.folder is None: return - if self.cache_type is None: - self.check_cache_type() if self.cache_type is None: raise RuntimeError(f"Could not figure cache_type in {self.folder}") loader = self._get_loader() diff --git a/exca/dumperloader.py b/exca/dumperloader.py index 628af0a..31a7e5b 100644 --- a/exca/dumperloader.py +++ b/exca/dumperloader.py @@ -65,6 +65,12 @@ def default_class(type_: Y) -> tp.Type["DumperLoader[Y]"]: pass return Cls # type: ignore + @classmethod + def check_valid_cache_type(cls, cache_type: str) -> None: + if cache_type not in DumperLoader.CLASSES: + avail = list(DumperLoader.CLASSES) + raise ValueError(f"Unknown {cache_type=}, use one of {avail}") + class StaticDumperLoader(DumperLoader[X]): SUFFIX = "" From 0a7ea3b185efa5d1ab353445f0d0bb94dfe7ec1f Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Thu, 16 Jan 2025 14:15:04 +0100 Subject: [PATCH 24/36] fix --- exca/cachedict.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff 
--git a/exca/cachedict.py b/exca/cachedict.py index 2dbd3f9..35ab568 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -151,17 +151,21 @@ def _load_info_files(self) -> None: names = out.decode("utf8").splitlines() for name in names: fp = folder / name - num = 0 + last = 0 + meta = {} with fp.open("rb") as f: - for line in f: + for k, line in enumerate(f): count = len(line) + last = last + count line = line.strip() if not line: continue info = json.loads(line.decode("utf8")) - info.update(_jsonl=fp, _byterange=(num, num + count)) + if not k: # metadata + meta = info + continue + info.update(_jsonl=fp, _byterange=(last - count, last), **meta) self._key_info[info.pop("_key")] = info - num += count def values(self) -> tp.Iterable[X]: for key in self: @@ -245,10 +249,13 @@ def __setitem__(self, key: str, value: X) -> None: # new write info["_key"] = key info_fp = Path(self.folder) / f"{host_pid()}-info.jsonl" - if not info_fp.exists(): - # no need to update the file permission if pre-existing - files.append(info_fp) + first_write = not info_fp.exists() with info_fp.open("ab") as f: + if first_write: + # no need to update the file permission if pre-existing + files.append(info_fp) + meta = {"_cache_type": dumper.__class__.__name__} + f.write(json.dumps(meta).encode("utf8") + b"\n") b = json.dumps(info).encode("utf8") current = f.tell() f.write(b + b"\n") From bf0649ceac07041e47ac4e91499347b8c3a2e77e Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Thu, 16 Jan 2025 15:40:30 +0100 Subject: [PATCH 25/36] wip --- exca/cachedict.py | 77 +++++++++++++++++++++++++++++------------- exca/test_cachedict.py | 4 +-- 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index 35ab568..33afa71 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -7,6 +7,7 @@ """ Disk, RAM caches """ +import dataclasses import json import logging import shutil @@ -24,6 +25,13 @@ logger = logging.getLogger(__name__) +@dataclasses.dataclass +class DumpInfo: + byte_range: tuple[int, int] + jsonl: Path + cache_type: str + + class CacheDict(tp.Generic[X]): """Dictionary-like object that caches and loads data on disk and ram. 
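# [Editor's note: illustration only, not part of the patch.] With DumpInfo in
# place, every writer appends to its own "{host_pid()}-info.jsonl" file: the
# first line carries metadata such as {"cache_type": "<DumperLoader class name>"}
# and each following line carries one entry such as
# {"filename": "<file written by the dumper>", "_key": "blublu"} (placeholder
# values). DumpInfo.byte_range records where that entry line sits in the jsonl
# file, so a later deletion can blank it out in place with whitespace.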
@@ -74,7 +82,7 @@ def __init__( self.permissions = permissions self._keep_in_ram = keep_in_ram self._write_legacy_key_files = _write_legacy_key_files - self._loaders: dict[str, DumperLoader] = {} # loaders are persistent + self._loaders: dict[tuple[str, str], DumperLoader] = {} # loaders are persistent if self.folder is None and not keep_in_ram: raise ValueError("At least folder or keep_in_ram should be activated") @@ -126,6 +134,10 @@ def keys(self) -> tp.Iterator[str]: out = e.output names = out.decode("utf8").splitlines() jobs = {} + if self.cache_type is None: + self._set_cache_type(None) + if self.cache_type is None: + raise RuntimeError("cache_type should have been detected") # parallelize content reading with futures.ThreadPoolExecutor() as ex: jobs = { @@ -133,7 +145,18 @@ def keys(self) -> tp.Iterator[str]: for name in names if name[:-4] not in self._key_info } - self._key_info.update({j.result(): {"uid": name} for name, j in jobs.items()}) + info = { + j.result(): { + "uid": name, + "_dump_info": DumpInfo( + byte_range=(0, 0), + jsonl=folder / (name + ".key"), + cache_type=self.cache_type, + ), + } + for name, j in jobs.items() + } + self._key_info.update(info) self._load_info_files() keys |= set(self._key_info) return iter(keys) @@ -164,7 +187,8 @@ def _load_info_files(self) -> None: if not k: # metadata meta = info continue - info.update(_jsonl=fp, _byterange=(last - count, last), **meta) + dinfo = DumpInfo(jsonl=fp, byte_range=(last - count, last), **meta) + info["_dump_info"] = dinfo self._key_info[info.pop("_key")] = info def values(self) -> tp.Iterable[X]: @@ -195,20 +219,22 @@ def __getitem__(self, key: str) -> X: if self.cache_type is None: raise RuntimeError(f"Could not figure cache_type in {self.folder}") info = self._key_info[key] - loader = self._get_loader() + dinfo: DumpInfo = info["_dump_info"] + loader = self._get_loader(dinfo.cache_type) loaded = loader.load(**{x: y for x, y in info.items() if not x.startswith("_")}) if self._keep_in_ram: self._ram_data[key] = loaded return loaded # type: ignore - def _get_loader(self) -> DumperLoader: - key = host_pid() # make sure we dont use a loader from another thread + def _get_loader(self, cache_type: str) -> DumperLoader: + key = ( + cache_type, + host_pid(), + ) # make sure we dont use a loader from another thread if self.folder is None: raise RuntimeError("Cannot get loader with no folder") - if self.cache_type is None: - raise RuntimeError("Shouldn't get called with no cache type") if key not in self._loaders: - self._loaders[key] = DumperLoader.CLASSES[self.cache_type](self.folder) + self._loaders[key] = DumperLoader.CLASSES[cache_type](self.folder) if self._informed_size is not None: if hasattr(self._loaders[key], "size"): # HACKY! 
self._loaders[key].size = self._informed_size # type: ignore @@ -239,7 +265,7 @@ def __setitem__(self, key: str, value: X) -> None: if self._keep_in_ram and self.folder is None: self._ram_data[key] = value if self.folder is not None: - dumper = self._get_loader() + dumper = self._get_loader(self.cache_type) info = dumper.dump(key, value) files = [self.folder / info["filename"]] if self._write_legacy_key_files and isinstance(dumper, StaticDumperLoader): @@ -250,16 +276,18 @@ def __setitem__(self, key: str, value: X) -> None: info["_key"] = key info_fp = Path(self.folder) / f"{host_pid()}-info.jsonl" first_write = not info_fp.exists() + meta = {"cache_type": dumper.__class__.__name__} with info_fp.open("ab") as f: if first_write: - # no need to update the file permission if pre-existing files.append(info_fp) - meta = {"_cache_type": dumper.__class__.__name__} f.write(json.dumps(meta).encode("utf8") + b"\n") b = json.dumps(info).encode("utf8") current = f.tell() f.write(b + b"\n") - info.update(_jsonl=info_fp, _byterange=(current, current + len(b) + 1)) + dinfo = DumpInfo( + jsonl=info_fp, byte_range=(current, current + len(b) + 1), **meta + ) + info["_dump_info"] = dinfo self._key_info[key] = info # reading will reload to in-memory cache if need be # (since dumping may have loaded the underlying data, let's not keep it) @@ -279,18 +307,17 @@ def __delitem__(self, key: str) -> None: return if self.cache_type is None: raise RuntimeError(f"Could not figure cache_type in {self.folder}") - loader = self._get_loader() info = self._key_info.pop(key) - if isinstance(loader, StaticDumperLoader): - keyfile = self.folder / (info["filename"][: -len(loader.SUFFIX)] + ".key") - keyfile.unlink(missing_ok=True) - if "_jsonl" in info: - jsonl = Path(info["_jsonl"]) - brange = info["_byterange"] + dinfo: DumpInfo = info["_dump_info"] + loader = self._get_loader(dinfo.cache_type) + if dinfo.jsonl.suffix == ".key": + dinfo.jsonl.unlink(missing_ok=True) + else: + brange = dinfo.byte_range # overwrite with whitespaces - with jsonl.open("rb+") as f: + with dinfo.jsonl.open("rb+") as f: f.seek(brange[0]) - f.write(b" " * (brange[1] - brange[0] - 1) + b"\n") + f.write(b" " * (brange[1] - brange[0] - 1)) info = {x: y for x, y in info.items() if not x.startswith("_")} if len(info) == 1: # only filename -> we can remove it as it is not shared # moves then delete to avoid weird effects @@ -309,8 +336,10 @@ def __contains__(self, key: str) -> bool: # maybe in folder (never read it) uid = _string_uid(key) fp = self.folder / f"{uid}.key" - if fp.exists(): - loader = self._get_loader() + if self.cache_type is None: + self._set_cache_type(None) + if fp.exists() and self.cache_type is not None: + loader = self._get_loader(self.cache_type) if not isinstance(loader, StaticDumperLoader): raise RuntimeError( "Cannot regenerate info from non-static dumper-loader" diff --git a/exca/test_cachedict.py b/exca/test_cachedict.py index 8753b26..1d10f33 100644 --- a/exca/test_cachedict.py +++ b/exca/test_cachedict.py @@ -146,8 +146,8 @@ def test_info_jsonl_deletion( _ = cache.keys() # listing assert cache._key_info == info for sub in info.values(): - fp = Path(sub["_jsonl"]) - r = sub["_byterange"] + fp = sub["_dump_info"].jsonl + r = sub["_dump_info"].byte_range with fp.open("rb") as f: f.seek(r[0]) out = f.read(r[1] - r[0]) From 5fd03c88d16c378968066581fb085b7d3da2e914 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Thu, 16 Jan 2025 15:53:40 +0100 Subject: [PATCH 26/36] working --- exca/cachedict.py | 16 +++++++++------- 1 file 
changed, 9 insertions(+), 7 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index 33afa71..2b939f6 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -136,8 +136,6 @@ def keys(self) -> tp.Iterator[str]: jobs = {} if self.cache_type is None: self._set_cache_type(None) - if self.cache_type is None: - raise RuntimeError("cache_type should have been detected") # parallelize content reading with futures.ThreadPoolExecutor() as ex: jobs = { @@ -145,6 +143,8 @@ def keys(self) -> tp.Iterator[str]: for name in names if name[:-4] not in self._key_info } + if jobs and self.cache_type is None: + raise RuntimeError("cache_type should have been detected") info = { j.result(): { "uid": name, @@ -310,10 +310,11 @@ def __delitem__(self, key: str) -> None: info = self._key_info.pop(key) dinfo: DumpInfo = info["_dump_info"] loader = self._get_loader(dinfo.cache_type) - if dinfo.jsonl.suffix == ".key": - dinfo.jsonl.unlink(missing_ok=True) - else: - brange = dinfo.byte_range + if isinstance(loader, StaticDumperLoader): # legacy + keyfile = self.folder / (info["filename"][: -len(loader.SUFFIX)] + ".key") + keyfile.unlink(missing_ok=True) + brange = dinfo.byte_range + if brange[0] != brange[1]: # overwrite with whitespaces with dinfo.jsonl.open("rb+") as f: f.seek(brange[0]) @@ -345,6 +346,7 @@ def __contains__(self, key: str) -> bool: "Cannot regenerate info from non-static dumper-loader" ) filename = _string_uid(key) + loader.SUFFIX - self._key_info[key] = {"filename": filename} + dinfo = DumpInfo(byte_range=(0, 0), jsonl=fp, cache_type=self.cache_type) + self._key_info[key] = {"filename": filename, "_dump_info": dinfo} return True return False # lazy check From 76b5639e5304d93a1b82116d6d1b8026f143043b Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Thu, 16 Jan 2025 16:00:20 +0100 Subject: [PATCH 27/36] fix --- exca/cachedict.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index 2b939f6..8b0ae20 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -133,9 +133,13 @@ def keys(self) -> tp.Iterator[str]: except subprocess.CalledProcessError as e: out = e.output names = out.decode("utf8").splitlines() + if not names: + return iter(keys) jobs = {} if self.cache_type is None: self._set_cache_type(None) + if self.cache_type is None: + raise RuntimeError("cache_type should have been detected") # parallelize content reading with futures.ThreadPoolExecutor() as ex: jobs = { @@ -143,8 +147,6 @@ def keys(self) -> tp.Iterator[str]: for name in names if name[:-4] not in self._key_info } - if jobs and self.cache_type is None: - raise RuntimeError("cache_type should have been detected") info = { j.result(): { "uid": name, From 4a4c988055aacdca0b93c7029944775088e5a53f Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Fri, 17 Jan 2025 13:59:21 +0100 Subject: [PATCH 28/36] fix --- exca/cachedict.py | 6 ++---- exca/test_cachedict.py | 1 + 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index 8b0ae20..c0cc804 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -133,12 +133,10 @@ def keys(self) -> tp.Iterator[str]: except subprocess.CalledProcessError as e: out = e.output names = out.decode("utf8").splitlines() - if not names: - return iter(keys) jobs = {} if self.cache_type is None: self._set_cache_type(None) - if self.cache_type is None: + if names and self.cache_type is None: raise RuntimeError("cache_type should have been detected") # parallelize content reading with 
futures.ThreadPoolExecutor() as ex: @@ -153,7 +151,7 @@ def keys(self) -> tp.Iterator[str]: "_dump_info": DumpInfo( byte_range=(0, 0), jsonl=folder / (name + ".key"), - cache_type=self.cache_type, + cache_type=self.cache_type, # type: ignore ), } for name, j in jobs.items() diff --git a/exca/test_cachedict.py b/exca/test_cachedict.py index 1d10f33..0ef31a9 100644 --- a/exca/test_cachedict.py +++ b/exca/test_cachedict.py @@ -152,6 +152,7 @@ def test_info_jsonl_deletion( f.seek(r[0]) out = f.read(r[1] - r[0]) assert out.startswith(b"{") and out.endswith(b"}\n") + if remove_jsonl: for ipath in tmp_path.glob("*.jsonl"): ipath.unlink() From 74fa85481983bfde7fac12995e1ce8a23f9f45b7 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Fri, 17 Jan 2025 14:32:05 +0100 Subject: [PATCH 29/36] simpler --- exca/dumperloader.py | 22 ++++++++++++++++++++++ exca/test_dumperloader.py | 12 ++++++++++++ 2 files changed, 34 insertions(+) diff --git a/exca/dumperloader.py b/exca/dumperloader.py index 31a7e5b..21a2e1f 100644 --- a/exca/dumperloader.py +++ b/exca/dumperloader.py @@ -170,6 +170,28 @@ def dump(self, key: str, value: np.ndarray) -> dict[str, tp.Any]: return out +class MultiMemmapArray64(DumperLoader[np.ndarray]): + + def load(self, filename: str, offset: int, shape: tuple[int, ...]) -> np.ndarray: # type: ignore + return np.memmap( + self.folder / filename, + dtype=np.float64, + mode="r", + offset=offset, + shape=shape, + order="C", + ) + + def dump(self, key: str, value: np.ndarray) -> dict[str, tp.Any]: + if not isinstance(value, np.ndarray): + raise TypeError(f"Expected numpy array but got {value} ({type(value)})") + name = f"{host_pid()}.data" + with (self.folder / name).open("ab") as f: + offset = f.tell() + f.write(np.ascontiguousarray(value, dtype=np.float64).data) + return {"filename": name, "offset": offset, "shape": tuple(value.shape)} + + try: import pandas as pd except ImportError: diff --git a/exca/test_dumperloader.py b/exca/test_dumperloader.py index 1215cd7..6a68b31 100644 --- a/exca/test_dumperloader.py +++ b/exca/test_dumperloader.py @@ -125,3 +125,15 @@ def test_multi_memmap_array(tmp_path: Path) -> None: assert info[0]["filename"] == info[1]["filename"] assert info[2]["filename"] != info[1]["filename"] np.testing.assert_array_equal(dl.load(**info[0]), x) + + +def test_multi_memmap_array64(tmp_path: Path) -> None: + dl = dumperloader.MultiMemmapArray64(folder=tmp_path) + info = [] + x = np.random.rand(2, 3) + info.append(dl.dump("x", x)) + info.append(dl.dump("y", np.random.rand(3, 3))) + info.append(dl.dump("z", np.random.rand(4, 3))) + assert info[0]["filename"] == info[1]["filename"] + np.testing.assert_array_equal(dl.load(**info[0]), x) + assert dl.load(**info[1]).shape == (3, 3) From e1f7a6f6774a9e1f02d2b851fe4418c0d755aaa4 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Fri, 17 Jan 2025 17:28:53 +0100 Subject: [PATCH 30/36] robusttest --- exca/dumperloader.py | 15 ++++++++++----- exca/test_dumperloader.py | 8 +++++--- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/exca/dumperloader.py b/exca/dumperloader.py index 21a2e1f..6ce49ae 100644 --- a/exca/dumperloader.py +++ b/exca/dumperloader.py @@ -170,12 +170,12 @@ def dump(self, key: str, value: np.ndarray) -> dict[str, tp.Any]: return out -class MultiMemmapArray64(DumperLoader[np.ndarray]): +class MemmapArrayFile(DumperLoader[np.ndarray]): - def load(self, filename: str, offset: int, shape: tuple[int, ...]) -> np.ndarray: # type: ignore + def load(self, filename: str, offset: int, shape: tuple[int, ...], dtype: 
str) -> np.ndarray: # type: ignore return np.memmap( self.folder / filename, - dtype=np.float64, + dtype=dtype, mode="r", offset=offset, shape=shape, @@ -188,8 +188,13 @@ def dump(self, key: str, value: np.ndarray) -> dict[str, tp.Any]: name = f"{host_pid()}.data" with (self.folder / name).open("ab") as f: offset = f.tell() - f.write(np.ascontiguousarray(value, dtype=np.float64).data) - return {"filename": name, "offset": offset, "shape": tuple(value.shape)} + f.write(np.ascontiguousarray(value).data) + return { + "filename": name, + "offset": offset, + "shape": tuple(value.shape), + "dtype": str(value.dtype), + } try: diff --git a/exca/test_dumperloader.py b/exca/test_dumperloader.py index 6a68b31..c7fd8bf 100644 --- a/exca/test_dumperloader.py +++ b/exca/test_dumperloader.py @@ -127,13 +127,15 @@ def test_multi_memmap_array(tmp_path: Path) -> None: np.testing.assert_array_equal(dl.load(**info[0]), x) -def test_multi_memmap_array64(tmp_path: Path) -> None: - dl = dumperloader.MultiMemmapArray64(folder=tmp_path) +def test_multi_memmap_array_file(tmp_path: Path) -> None: + dl = dumperloader.MemmapArrayFile(folder=tmp_path) info = [] x = np.random.rand(2, 3) info.append(dl.dump("x", x)) info.append(dl.dump("y", np.random.rand(3, 3))) info.append(dl.dump("z", np.random.rand(4, 3))) assert info[0]["filename"] == info[1]["filename"] - np.testing.assert_array_equal(dl.load(**info[0]), x) + x2 = dl.load(**info[0]) + info.append(dl.dump("w", np.random.rand(5, 3))) # write in between reads + np.testing.assert_array_equal(x2, x) assert dl.load(**info[1]).shape == (3, 3) From 2ee43d64a0990062a0edd3e0e420035fcb3770eb Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Fri, 17 Jan 2025 17:52:28 +0100 Subject: [PATCH 31/36] wip --- exca/cachedict.py | 3 ++- exca/map.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index c0cc804..b3c0f81 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -76,7 +76,7 @@ def __init__( keep_in_ram: bool = False, cache_type: None | str = None, permissions: int | None = 0o777, - _write_legacy_key_files: bool = True, + _write_legacy_key_files: bool = False, ) -> None: self.folder = None if folder is None else Path(folder) self.permissions = permissions @@ -168,6 +168,7 @@ def _load_info_files(self) -> None: # read all existing jsonl files find_cmd = 'find . 
-type f -name "*-info.jsonl"' try: + print(find_cmd, folder) out = subprocess.check_output(find_cmd, shell=True, cwd=folder) except subprocess.CalledProcessError as e: out = e.output # stderr contains missing tmp files diff --git a/exca/map.py b/exca/map.py index b299b5e..ff1bd5b 100644 --- a/exca/map.py +++ b/exca/map.py @@ -290,7 +290,8 @@ def _find_missing(self, items: tp.Dict[str, tp.Any]) -> tp.Dict[str, tp.Any]: except ValueError: pass # no caching else: - missing = {k: item for k, item in missing.items() if k not in cache} + keys = list(cache.keys()) + missing = {k: item for k, item in missing.items() if k not in keys} self._check_configs(write=True) # if there is a cache, check config or write it executor = self.executor() if not hasattr(self, "mode"): # compatibility From 7ec7eaf660de63ee7c670f367d7fd94bb606e4bd Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Fri, 17 Jan 2025 18:08:34 +0100 Subject: [PATCH 32/36] wip --- exca/map.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/exca/map.py b/exca/map.py index ff1bd5b..4363b6b 100644 --- a/exca/map.py +++ b/exca/map.py @@ -407,6 +407,8 @@ def _method_override_futures(self, items: tp.Sequence[tp.Any]) -> tp.Iterator[tp np.random.shuffle(missing) if pool is None: # run locally + msg = "Computing %s missing items" + logger.debug(msg, len(missing)) cached = self.folder is not None out = self._call_and_store( [ki[1] for ki in missing], use_cache_dict=cached From dcfeab5ccf7c4e9a85770d4f093052138b6e252c Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Fri, 17 Jan 2025 21:34:09 +0100 Subject: [PATCH 33/36] wip --- exca/cachedict.py | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index b3c0f81..2f6ba6b 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -32,6 +32,21 @@ class DumpInfo: cache_type: str +def _write_info(info_fp: Path, info: dict, cache_type: str) -> DumpInfo: + first_write = not info_fp.exists() + meta = {"cache_type": cache_type} + with info_fp.open("ab") as f: + if first_write: + f.write(json.dumps(meta).encode("utf8") + b"\n") + b = json.dumps(info).encode("utf8") + current = f.tell() + f.write(b + b"\n") + dinfo = DumpInfo( + jsonl=info_fp, byte_range=(current, current + len(b) + 1), **meta + ) + return dinfo + + class CacheDict(tp.Generic[X]): """Dictionary-like object that caches and loads data on disk and ram. 
@@ -276,19 +291,20 @@ def __setitem__(self, key: str, value: X) -> None: # new write info["_key"] = key info_fp = Path(self.folder) / f"{host_pid()}-info.jsonl" - first_write = not info_fp.exists() - meta = {"cache_type": dumper.__class__.__name__} - with info_fp.open("ab") as f: - if first_write: - files.append(info_fp) - f.write(json.dumps(meta).encode("utf8") + b"\n") - b = json.dumps(info).encode("utf8") - current = f.tell() - f.write(b + b"\n") - dinfo = DumpInfo( - jsonl=info_fp, byte_range=(current, current + len(b) + 1), **meta - ) - info["_dump_info"] = dinfo + # first_write = not info_fp.exists() + # meta = {"cache_type": dumper.__class__.__name__} + # with info_fp.open("ab") as f: + # if first_write: + # files.append(info_fp) + # f.write(json.dumps(meta).encode("utf8") + b"\n") + # b = json.dumps(info).encode("utf8") + # current = f.tell() + # f.write(b + b"\n") + # dinfo = DumpInfo( + # jsonl=info_fp, byte_range=(current, current + len(b) + 1), **meta + # ) + dinfo = _write_info(info_fp, info, self.cache_type) + info["_dump_info"] = dinfo self._key_info[key] = info # reading will reload to in-memory cache if need be # (since dumping may have loaded the underlying data, let's not keep it) From 4445d7b16ec18bc801bb041b9fa3457b092ffa79 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Sat, 18 Jan 2025 18:15:11 +0100 Subject: [PATCH 34/36] rminformed --- exca/cachedict.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index 2f6ba6b..f89c705 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -113,10 +113,6 @@ def __init__( self._key_info: dict[str, dict[str, tp.Any]] = {} self.cache_type = cache_type self._set_cache_type(cache_type) - self._informed_size: int | None = None - - def inform_size(self, size: int) -> None: - self._informed_size = size def __repr__(self) -> str: name = self.__class__.__name__ @@ -251,10 +247,6 @@ def _get_loader(self, cache_type: str) -> DumperLoader: raise RuntimeError("Cannot get loader with no folder") if key not in self._loaders: self._loaders[key] = DumperLoader.CLASSES[cache_type](self.folder) - if self._informed_size is not None: - if hasattr(self._loaders[key], "size"): # HACKY! 
- self._loaders[key].size = self._informed_size # type: ignore - self._informed_size = None return self._loaders[key] def _set_cache_type(self, cache_type: str | None) -> None: From ed42aa08ba67fb71754273ddf5fa5fee43cf0682 Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Sun, 19 Jan 2025 21:31:32 +0100 Subject: [PATCH 35/36] wip --- exca/cachedict.py | 82 +++++++++++++++++++++++---------------- exca/dumperloader.py | 30 +++++++++++--- exca/map.py | 18 +++++---- exca/test_cachedict.py | 16 +++++--- exca/test_dumperloader.py | 10 +++-- 5 files changed, 101 insertions(+), 55 deletions(-) diff --git a/exca/cachedict.py b/exca/cachedict.py index f89c705..aa066a2 100644 --- a/exca/cachedict.py +++ b/exca/cachedict.py @@ -7,7 +7,9 @@ """ Disk, RAM caches """ +import contextlib import dataclasses +import io import json import logging import shutil @@ -32,21 +34,6 @@ class DumpInfo: cache_type: str -def _write_info(info_fp: Path, info: dict, cache_type: str) -> DumpInfo: - first_write = not info_fp.exists() - meta = {"cache_type": cache_type} - with info_fp.open("ab") as f: - if first_write: - f.write(json.dumps(meta).encode("utf8") + b"\n") - b = json.dumps(info).encode("utf8") - current = f.tell() - f.write(b + b"\n") - dinfo = DumpInfo( - jsonl=info_fp, byte_range=(current, current + len(b) + 1), **meta - ) - return dinfo - - class CacheDict(tp.Generic[X]): """Dictionary-like object that caches and loads data on disk and ram. @@ -113,6 +100,11 @@ def __init__( self._key_info: dict[str, dict[str, tp.Any]] = {} self.cache_type = cache_type self._set_cache_type(cache_type) + # write mode + self._info_f: io.BufferedWriter | None = None + self._info_fp: Path | None = None + self._estack: contextlib.ExitStack | None = None + self._dumper: DumperLoader | None = None def __repr__(self) -> str: name = self.__class__.__name__ @@ -265,37 +257,61 @@ def _set_cache_type(self, cache_type: str | None) -> None: if self.permissions is not None: fp.chmod(self.permissions) + @contextlib.contextmanager + def write_mode(self) -> tp.Iterator[None]: + if self.folder is None: + yield + return + info_fp = Path(self.folder) / f"{host_pid()}-info.jsonl" + try: + with contextlib.ExitStack() as estack: + f = estack.enter_context(info_fp.open("ab")) + self._estack = estack + self._info_f = f + self._info_fp = info_fp + yield + except: + pass + finally: + self._info_f = None + self._estack = None + self._info_fp = None + def __setitem__(self, key: str, value: X) -> None: if self.cache_type is None: cls = DumperLoader.default_class(type(value)) self.cache_type = cls.__name__ self._set_cache_type(self.cache_type) + if self._keep_in_ram and self.folder is None: self._ram_data[key] = value if self.folder is not None: - dumper = self._get_loader(self.cache_type) - info = dumper.dump(key, value) + if self._info_f is None or self._estack is None or self._info_fp is None: + raise RuntimeError("Cannot write without a write_mode context") + if self._dumper is None: + self._dumper = self._get_loader(self.cache_type) + self._estack.enter_context(self._dumper.write_mode()) + info = self._dumper.dump(key, value) files = [self.folder / info["filename"]] - if self._write_legacy_key_files and isinstance(dumper, StaticDumperLoader): - keyfile = self.folder / (info["filename"][: -len(dumper.SUFFIX)] + ".key") + if self._write_legacy_key_files and isinstance( + self._dumper, StaticDumperLoader + ): + keyfile = self.folder / ( + info["filename"][: -len(self._dumper.SUFFIX)] + ".key" + ) keyfile.write_text(key, encoding="utf8") 
files.append(keyfile) # new write info["_key"] = key - info_fp = Path(self.folder) / f"{host_pid()}-info.jsonl" - # first_write = not info_fp.exists() - # meta = {"cache_type": dumper.__class__.__name__} - # with info_fp.open("ab") as f: - # if first_write: - # files.append(info_fp) - # f.write(json.dumps(meta).encode("utf8") + b"\n") - # b = json.dumps(info).encode("utf8") - # current = f.tell() - # f.write(b + b"\n") - # dinfo = DumpInfo( - # jsonl=info_fp, byte_range=(current, current + len(b) + 1), **meta - # ) - dinfo = _write_info(info_fp, info, self.cache_type) + meta = {"cache_type": self.cache_type} + if not self._info_f.tell(): + self._info_f.write(json.dumps(meta).encode("utf8") + b"\n") + b = json.dumps(info).encode("utf8") + current = self._info_f.tell() + self._info_f.write(b + b"\n") + dinfo = DumpInfo( + jsonl=self._info_fp, byte_range=(current, current + len(b) + 1), **meta + ) info["_dump_info"] = dinfo self._key_info[key] = info # reading will reload to in-memory cache if need be diff --git a/exca/dumperloader.py b/exca/dumperloader.py index 6ce49ae..2ba5d16 100644 --- a/exca/dumperloader.py +++ b/exca/dumperloader.py @@ -4,7 +4,9 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. +import contextlib import hashlib +import io import pickle import socket import threading @@ -42,6 +44,10 @@ class DumperLoader(tp.Generic[X]): def __init__(self, folder: str | Path = "") -> None: self.folder = Path(folder) + @contextlib.contextmanager + def write_mode(self) -> tp.Iterator[None]: + yield + @classmethod def __init_subclass__(cls, **kwargs: tp.Any) -> None: super().__init_subclass__(**kwargs) @@ -172,6 +178,20 @@ def dump(self, key: str, value: np.ndarray) -> dict[str, tp.Any]: class MemmapArrayFile(DumperLoader[np.ndarray]): + def __init__(self, folder: str | Path = "") -> None: + super().__init__(folder) + self._f: io.BufferedWriter | None = None + self._name: str | None = None + + @contextlib.contextmanager + def write_mode(self) -> tp.Iterator[None]: + self._name = f"{host_pid()}.data" + with (self.folder / self._name).open("ab") as f: + self._f = f + yield + self._f = None + self._name = None + def load(self, filename: str, offset: int, shape: tuple[int, ...], dtype: str) -> np.ndarray: # type: ignore return np.memmap( self.folder / filename, @@ -183,14 +203,14 @@ def load(self, filename: str, offset: int, shape: tuple[int, ...], dtype: str) - ) def dump(self, key: str, value: np.ndarray) -> dict[str, tp.Any]: + if self._f is None or self._name is None: + raise RuntimeError("Need a write_mode context") if not isinstance(value, np.ndarray): raise TypeError(f"Expected numpy array but got {value} ({type(value)})") - name = f"{host_pid()}.data" - with (self.folder / name).open("ab") as f: - offset = f.tell() - f.write(np.ascontiguousarray(value).data) + offset = self._f.tell() + self._f.write(np.ascontiguousarray(value).data) return { - "filename": name, + "filename": self._name, "offset": offset, "shape": tuple(value.shape), "dtype": str(value.dtype), diff --git a/exca/map.py b/exca/map.py index 4363b6b..2c1350d 100644 --- a/exca/map.py +++ b/exca/map.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. 
import collections +import contextlib import dataclasses import functools import inspect @@ -469,8 +470,6 @@ def _call_and_store( imethod = self._infra_method if imethod is None: raise RuntimeError(f"Infra was not applied: {self!r}") - if isinstance(d, CacheDict): - d.inform_size(len(items)) item_uid = imethod.item_uid if items: # make sure some overlapping job did not already run stuff keys = set(d) # update cache dict @@ -481,12 +480,15 @@ def _call_and_store( method = functools.partial(imethod.method, self._obj) outputs = method(items) sentinel = base.Sentinel() - for item, output in itertools.zip_longest(items, outputs, fillvalue=sentinel): - if item is sentinel or output is sentinel: - raise RuntimeError( - f"Cached function did not yield exactly once per item: {item=!r}, {output=!r}" - ) - d[item_uid(item)] = output + with contextlib.ExitStack() as estack: + if isinstance(d, CacheDict): + estack.enter_context(d.write_mode()) + for item, output in itertools.zip_longest(items, outputs, fillvalue=sentinel): + if item is sentinel or output is sentinel: + raise RuntimeError( + f"Cached function did not yield exactly once per item: {item=!r}, {output=!r}" + ) + d[item_uid(item)] = output # don't return the whole cache dict if data is cached return {} if use_cache_dict else d diff --git a/exca/test_cachedict.py b/exca/test_cachedict.py index 0ef31a9..ddd9d35 100644 --- a/exca/test_cachedict.py +++ b/exca/test_cachedict.py @@ -25,7 +25,8 @@ def test_array_cache(tmp_path: Path, in_ram: bool) -> None: assert not list(cache.keys()) assert not len(cache) assert not cache - cache["blublu"] = x + with cache.write_mode(): + cache["blublu"] = x assert "blublu" in cache assert cache np.testing.assert_almost_equal(cache["blublu"], x) @@ -33,7 +34,8 @@ def test_array_cache(tmp_path: Path, in_ram: bool) -> None: assert set(cache.keys()) == {"blublu"} assert bool(cache._ram_data) is in_ram cache2: cd.CacheDict[tp.Any] = cd.CacheDict(folder=folder) - cache2["blabla"] = 2 * x + with cache.write_mode(): + cache2["blabla"] = 2 * x assert "blabla" in cache assert set(cache.keys()) == {"blublu", "blabla"} d = dict(cache2.items()) @@ -62,7 +64,8 @@ def test_array_cache(tmp_path: Path, in_ram: bool) -> None: ) def test_data_dump_suffix(tmp_path: Path, data: tp.Any) -> None: cache: cd.CacheDict[np.ndarray] = cd.CacheDict(folder=tmp_path, keep_in_ram=False) - cache["blublu.tmp"] = data + with cache.write_mode(): + cache["blublu.tmp"] = data assert cache.cache_type not in [None, "Pickle"] names = [fp.name for fp in tmp_path.iterdir() if not fp.name.startswith(".")] assert len(names) == 3 @@ -88,7 +91,8 @@ def test_specialized_dump(tmp_path: Path, data: tp.Any, cache_type: str) -> None cache: cd.CacheDict[np.ndarray] = cd.CacheDict( folder=tmp_path, keep_in_ram=False, cache_type=cache_type ) - cache["x"] = data + with cache.write_mode(): + cache["x"] = data assert isinstance(cache["x"], type(data)) @@ -99,6 +103,7 @@ def test_specialized_dump(tmp_path: Path, data: tp.Any, cache_type: str) -> None def test_info_jsonl( tmp_path: Path, legacy_write: bool, remove_jsonl: bool, process: bool ) -> None: + pytest.skip() cache: cd.CacheDict[int] = cd.CacheDict( folder=tmp_path, keep_in_ram=False, _write_legacy_key_files=legacy_write ) @@ -137,7 +142,8 @@ def test_info_jsonl_deletion( cache: cd.CacheDict[int] = cd.CacheDict( folder=tmp_path, keep_in_ram=False, _write_legacy_key_files=legacy_write ) - cache[k] = 12 if k == "x" else 3 + with cache.write_mode(): + cache[k] = 12 if k == "x" else 3 _ = cache.keys() # listing info 
= cache._key_info cache = cd.CacheDict( diff --git a/exca/test_dumperloader.py b/exca/test_dumperloader.py index c7fd8bf..eba16b7 100644 --- a/exca/test_dumperloader.py +++ b/exca/test_dumperloader.py @@ -131,11 +131,13 @@ def test_multi_memmap_array_file(tmp_path: Path) -> None: dl = dumperloader.MemmapArrayFile(folder=tmp_path) info = [] x = np.random.rand(2, 3) - info.append(dl.dump("x", x)) - info.append(dl.dump("y", np.random.rand(3, 3))) - info.append(dl.dump("z", np.random.rand(4, 3))) + with dl.write_mode(): + info.append(dl.dump("x", x)) + info.append(dl.dump("y", np.random.rand(3, 3))) + info.append(dl.dump("z", np.random.rand(4, 3))) assert info[0]["filename"] == info[1]["filename"] x2 = dl.load(**info[0]) - info.append(dl.dump("w", np.random.rand(5, 3))) # write in between reads + with dl.write_mode(): + info.append(dl.dump("w", np.random.rand(5, 3))) # write in between reads np.testing.assert_array_equal(x2, x) assert dl.load(**info[1]).shape == (3, 3) From 99e43e196a81c1f90fd6a65c5c7b6af7db8d58ad Mon Sep 17 00:00:00 2001 From: Jeremy Rapin Date: Sun, 19 Jan 2025 21:38:18 +0100 Subject: [PATCH 36/36] wipscript --- script_bench.py | 78 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 script_bench.py diff --git a/script_bench.py b/script_bench.py new file mode 100644 index 0000000..ce4eadd --- /dev/null +++ b/script_bench.py @@ -0,0 +1,78 @@ +import cProfile +import logging +import time +import typing as tp + +import numpy as np +import pydantic + +import exca as xk + +logging.getLogger("exca").setLevel(logging.DEBUG) + + +class DumpMap(pydantic.BaseModel): + param: int = 12 + infra: xk.MapInfra = xk.MapInfra(version="1") + + @infra.apply( + item_uid=str, + cache_type="MemmapArrayFile", + ) + def process(self, items: tp.Iterable[str]) -> tp.Iterator[np.ndarray]: + for item in items: + yield np.random.rand(200, self.param) + + +profiler = cProfile.Profile() +profiler.enable() +cfg = DumpMap(infra={"folder": "./cache-test-memfile-context", "keep_in_ram": False}) +t0 = time.time() +cfg.process([str(k) for k in range(4000)]) +t1 = time.time() +print(t1 - t0) # 0.3s creation, 0.15ss read +profiler.disable() +profiler.dump_stats("dump-file.prof") + +profiler = cProfile.Profile() +profiler.enable() +cfg = DumpMap(infra={"folder": "./cache-test-memfile", "keep_in_ram": False}) +t0 = time.time() +cfg.process([str(k) for k in range(4000)]) +t1 = time.time() +print(t1 - t0) # 23s creation, 0.2s read +profiler.disable() +profiler.dump_stats("dump-file.prof") + +profiler = cProfile.Profile() +profiler.enable() +cfg = DumpMap(infra={"folder": "./cache-test-jsonl", "keep_in_ram": False}) +t0 = time.time() +cfg.process([str(k) for k in range(4000)]) +t1 = time.time() +print(t1 - t0) # 18s creation, 0.2s read +profiler.disable() +profiler.dump_stats("dump-file.prof") + + +cfg = DumpMap(infra={"folder": "./cache-test-main", "keep_in_ram": False}) +t0 = time.time() +cfg.process([str(k) for k in range(4000)]) +t1 = time.time() +print(t1 - t0) # 5.6s creation, 0.03s check + + +t0 = time.time() +for x, y in cfg.infra.cache_dict.items(): + print(x) +t1 = time.time() +print(t1 - t0) # 0.5s (memfile) 11s + +keys = [str(k) for k in range(4000)] +t0 = time.time() +for x in keys: + y = cfg.infra.cache_dict[x] + _ = np.array(y) + # print(x) +t1 = time.time() # 0.5 (memfile) 13s (standard) +print(t1 - t0) # 0.5s (memfile) 11s
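
Editor's addition (not part of the patch series): a minimal usage sketch of the
reworked CacheDict, mirroring what the tests and the benchmark script above
exercise. The folder path and the keys are placeholders, and it assumes
exca.cachedict imports as shown and that the dumper stays registered under the
name "MemmapArrayFile".

import numpy as np

from exca import cachedict as cd

# writing: write_mode() keeps the per-process "*-info.jsonl" (and the dumper's
# own append-only data file) open for the whole batch of writes
cache: cd.CacheDict[np.ndarray] = cd.CacheDict(
    folder="./cache-demo", keep_in_ram=False, cache_type="MemmapArrayFile"
)
with cache.write_mode():
    for k in range(3):
        cache[str(k)] = np.random.rand(200, 12)

# reading from a fresh instance: listing keys reloads the jsonl entries, then
# __getitem__ memmaps each array back from the recorded filename/offset/shape/dtype
cache2: cd.CacheDict[np.ndarray] = cd.CacheDict(folder="./cache-demo")
print(sorted(cache2.keys()))   # ['0', '1', '2']
print(cache2["1"].shape)       # (200, 12)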