ensure iterable gets closed in AsyncMapper
In `AsyncMapper`, generators are executed on a separate thread.
However, if an exception occurs, the `__exit__` method gets executed
in the main thread.

This can lead to issues with SQLite. Specifically, attempting to
close a connection outside the thread where it was created results
in the following error:

```console
sqlite3.ProgrammingError: SQLite objects created in a thread can only be used in that same thread.
The object was created in thread id 123145530425344 and this is thread id 140704344640320.
```

This PR modifies `AsyncMapper` to ensure that `__exit__` is executed in
the same thread where the generator runs.
Additionally, since the main thread might still attempt to
close the generator, I have added safeguards to avoid calling
`sqlite3.Connection.close()` more than once.

Whether the main thread also calls `__exit__` is beyond `AsyncMapper`'s control,
since it accepts arbitrary `Iterable` inputs.
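
For reference, the error above can be reproduced with the standard library alone. A minimal sketch (independent of DataChain, using sqlite3's default `check_same_thread=True` and a hypothetical database file):

```python
import sqlite3
import threading

holder = {}

def open_connection() -> None:
    # The connection is created in this worker thread, so it may only be
    # used, including closed, from this thread.
    holder["conn"] = sqlite3.connect("example.db")

worker = threading.Thread(target=open_connection)
worker.start()
worker.join()

# Closing from the main thread raises:
# sqlite3.ProgrammingError: SQLite objects created in a thread can only be
# used in that same thread.
holder["conn"].close()
```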
skshetry committed Jan 1, 2025
1 parent 7adfc0a commit b9ee297
Showing 4 changed files with 41 additions and 16 deletions.
15 changes: 10 additions & 5 deletions src/datachain/asyn.py

```diff
@@ -14,6 +14,8 @@
 
 from fsspec.asyn import get_loop
 
+from datachain.utils import safe_closing
+
 ASYNC_WORKERS = 20
 
 InputT = TypeVar("InputT", contravariant=True)  # noqa: PLC0105
@@ -64,11 +66,14 @@ def start_task(self, coro: Coroutine) -> asyncio.Task:
         return task
 
     def _produce(self) -> None:
-        for item in self.iterable:
-            if self._shutdown_producer.is_set():
-                return
-            fut = asyncio.run_coroutine_threadsafe(self.work_queue.put(item), self.loop)
-            fut.result()  # wait until the item is in the queue
+        with safe_closing(self.iterable):
+            for item in self.iterable:
+                if self._shutdown_producer.is_set():
+                    return
+                fut = asyncio.run_coroutine_threadsafe(
+                    self.work_queue.put(item), self.loop
+                )
+                fut.result()  # wait until the item is in the queue
 
     async def produce(self) -> None:
         await self.to_thread(self._produce)
```
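
To see the effect of this change outside DataChain, here is a standalone sketch. It uses `contextlib.closing`, which behaves like the new `safe_closing` for objects that do have a `close()` method; the point is that the thread running the loop is also the thread that closes the iterable, even when it exits early:

```python
import threading
from contextlib import closing

def rows():
    try:
        yield from range(5)
    finally:
        # Runs when the generator is closed and records which thread did it.
        print("generator closed in thread", threading.get_ident())

def produce(iterable):
    # Mirrors the patched _produce: the consuming thread closes the iterable.
    with closing(iterable):
        for _item in iterable:
            return  # simulate an early shutdown

gen = rows()
worker = threading.Thread(target=produce, args=(gen,))
worker.start()
worker.join()
print("main thread is", threading.get_ident())
```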
2 changes: 2 additions & 0 deletions src/datachain/data_storage/sqlite.py

```diff
@@ -239,6 +239,8 @@ def cursor(self, factory=None):
         return self.db.cursor(factory)
 
     def close(self) -> None:
+        if self.is_closed:
+            return
         self.db.close()
         self.is_closed = True
 
```
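
The guard presumably matters because `close()` can now be reached twice: once from the producer thread via `safe_closing`, and possibly again from the main thread during teardown. With the flag, the second call returns early instead of touching the already-closed `sqlite3` connection from another thread. A simplified sketch of the pattern (not the actual engine class):

```python
import sqlite3

class Engine:
    """Toy engine using the same double-close guard."""

    def __init__(self) -> None:
        self.db = sqlite3.connect(":memory:")
        self.is_closed = False

    def close(self) -> None:
        if self.is_closed:  # later calls become no-ops
            return
        self.db.close()
        self.is_closed = True

engine = Engine()
engine.close()
engine.close()  # safe: the guard short-circuits the second call
```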
25 changes: 16 additions & 9 deletions src/datachain/lib/udf.py

```diff
@@ -1,6 +1,7 @@
 import contextlib
 import sys
 import traceback
+from collections.abc import Generator as GeneratorType
 from collections.abc import Iterable, Iterator, Mapping, Sequence
 from typing import TYPE_CHECKING, Any, Callable, Optional
 
@@ -21,10 +22,9 @@
     Partition,
     RowsOutputBatch,
 )
+from datachain.utils import safe_closing
 
 if TYPE_CHECKING:
-    from collections import abc
-
     from typing_extensions import Self
 
     from datachain.catalog import Catalog
@@ -295,10 +295,13 @@ def run(
     ) -> Iterator[Iterable[UDFResult]]:
         self.catalog = catalog
         self.setup()
-        prepared_inputs: abc.Generator[Sequence[Any], None, None] = (
-            self._prepare_row_and_id(row, udf_fields, cache, download_cb)
-            for row in udf_inputs
-        )
+
+        def row_iter() -> GeneratorType[Sequence[Any], None, None]:
+            with safe_closing(udf_inputs) as rows:
+                for row in rows:
+                    yield self._prepare_row_and_id(row, udf_fields, cache, download_cb)
+
+        prepared_inputs = row_iter()
         if self.prefetch > 0:
             _cache = self.catalog.cache if cache else None
             prepared_inputs = rows_prefetcher(
@@ -378,9 +381,13 @@ def run(
     ) -> Iterator[Iterable[UDFResult]]:
         self.catalog = catalog
         self.setup()
-        prepared_inputs: abc.Generator[Sequence[Any], None, None] = (
-            self._prepare_row(row, udf_fields, cache, download_cb) for row in udf_inputs
-        )
+
+        def row_iter() -> GeneratorType[Sequence[Any], None, None]:
+            with safe_closing(udf_inputs) as rows:
+                for row in rows:
+                    yield self._prepare_row(row, udf_fields, cache, download_cb)
+
+        prepared_inputs = row_iter()
         if self.prefetch > 0:
             _cache = self.catalog.cache if cache else None
             prepared_inputs = rows_prefetcher(
```
15 changes: 13 additions & 2 deletions src/datachain/utils.py

```diff
@@ -9,6 +9,7 @@
 import sys
 import time
 from collections.abc import Iterable, Iterator, Sequence
+from contextlib import contextmanager
 from datetime import date, datetime, timezone
 from itertools import chain, islice
 from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
@@ -22,6 +23,7 @@
 
 if TYPE_CHECKING:
     import pandas as pd
+    from typing_extensions import Self
 
 NUL = b"\0"
 TIME_ZERO = datetime.fromtimestamp(0, tz=timezone.utc)
@@ -33,7 +35,7 @@
 STUDIO_URL = "https://studio.datachain.ai"
 
 
-T = TypeVar("T", bound="DataChainDir")
+T = TypeVar("T")
 
 
 class DataChainDir:
@@ -90,7 +92,7 @@ def default_root(cls) -> str:
         return osp.join(root_dir, cls.DEFAULT)
 
     @classmethod
-    def find(cls: type[T], create: bool = True) -> T:
+    def find(cls, create: bool = True) -> "Self":
         try:
             root = os.environ[cls.ENV_VAR]
         except KeyError:
@@ -479,3 +481,12 @@ def row_to_nested_dict(
     for h, v in zip(headers, row):
         nested_dict_path_set(result, h, v)
     return result
+
+
+@contextmanager
+def safe_closing(thing: T) -> Iterator[T]:
+    try:
+        yield thing
+    finally:
+        if hasattr(thing, "close"):
+            thing.close()
```
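`safe_closing` is deliberately tolerant: it only calls `close()` if the wrapped object has one, so it works for generators, DB-API cursors, and plain lists alike. A small usage sketch (with a hypothetical `Rows` class) showing how closing a wrapping generator, as `row_iter` does in `udf.py`, propagates to the underlying input:

```python
from datachain.utils import safe_closing  # added by this commit

class Rows:
    """Stand-in for an input that must be released, e.g. a cursor."""

    def __iter__(self):
        yield from range(3)

    def close(self):
        print("inner close() called")

def row_iter(rows):
    with safe_closing(rows) as rs:
        for row in rs:
            yield row

prepared = row_iter(Rows())
next(prepared)    # consume one item
prepared.close()  # GeneratorExit exits the with block, so Rows.close() runs
```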