prefetch: use a separate temporary cache for prefetching #730

Open · wants to merge 7 commits into base: main
4 changes: 2 additions & 2 deletions examples/get_started/torch-loader.py
@@ -56,7 +56,7 @@ def forward(self, x):
if __name__ == "__main__":
    ds = (
        DataChain.from_storage(STORAGE, type="image")
-        .settings(cache=True, prefetch=25)
+        .settings(prefetch=25)
        .filter(C("file.path").glob("*.jpg"))
        .map(
            label=lambda path: label_to_int(basename(path)[:3], CLASSES),
@@ -68,7 +68,7 @@ def forward(self, x):
    train_loader = DataLoader(
        ds.to_pytorch(transform=transform),
        batch_size=25,
-        num_workers=max(4, os.cpu_count() or 2),
+        num_workers=min(4, os.cpu_count() or 2),
        persistent_workers=True,
        multiprocessing_context=multiprocessing.get_context("spawn"),
    )
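A quick sanity check on the `max` → `min` fix: `max(4, cpus)` forces at least 4 workers even on a 2-core machine, while `min(4, cpus)` caps the worker count at 4. A minimal illustration:

```python
import os

cpus = os.cpu_count() or 2  # e.g. 2 on a small CI runner
print(max(4, cpus))  # old: 4 workers even with 2 cores (oversubscribed)
print(min(4, cpus))  # new: at most 4 workers, never more than the cores allow
```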
3 changes: 2 additions & 1 deletion src/datachain/asyn.py
@@ -8,7 +8,7 @@
    Iterable,
    Iterator,
)
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, wait
from heapq import heappop, heappush
from typing import Any, Callable, Generic, Optional, TypeVar

@@ -179,6 +179,7 @@ def iterate(self, timeout=None) -> Generator[ResultT, None, None]:
            self.shutdown_producer()
            if not async_run.done():
                async_run.cancel()
+                wait([async_run])
Member Author commented:

.cancel() does not immediately cancel the underlying asyncio task.

We could add a .result() call to wait for the future, but that does not seem to work for the cancelled future from run_coroutine_threadsafe(). See python/cpython#105836.

So, I have added wait(...), as it seems to wait for the cancelled future and for the underlying asyncio task.

Alternatively, we could add an asyncio.Event and wait for it.
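A minimal standalone sketch of the pattern the comment describes (the event loop and coroutine here are illustrative, not DataChain code):

```python
import asyncio
import threading
from concurrent.futures import wait

async def produce():
    while True:
        await asyncio.sleep(0.1)

loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()

fut = asyncio.run_coroutine_threadsafe(produce(), loop)
fut.cancel()  # only *requests* cancellation of the underlying asyncio task
wait([fut])   # blocks until the future settles, unlike fut.result(), which
              # can misbehave for a cancelled run_coroutine_threadsafe() future
loop.call_soon_threadsafe(loop.stop)
```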


    def __iter__(self):
        return self.iterate()
40 changes: 31 additions & 9 deletions src/datachain/cache.py
@@ -1,8 +1,12 @@
import os
+from collections.abc import Iterator
+from contextlib import contextmanager
+from tempfile import mkdtemp
from typing import TYPE_CHECKING, Optional

from dvc_data.hashfile.db.local import LocalHashFileDB
from dvc_objects.fs.local import LocalFileSystem
+from dvc_objects.fs.utils import remove
from fsspec.callbacks import Callback, TqdmCallback

from .progress import Tqdm
@@ -20,6 +24,23 @@ def try_scandir(path):
        pass


+def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "DataChainCache":
+    cache_dir = mkdtemp(prefix=prefix, dir=tmp_dir)
+    return DataChainCache(cache_dir, tmp_dir=tmp_dir)
+
+
+@contextmanager
+def temporary_cache(
+    tmp_dir: str, prefix: Optional[str] = None, delete: bool = True
+) -> Iterator["DataChainCache"]:
+    cache = get_temp_cache(tmp_dir, prefix=prefix)
+    try:
+        yield cache
+    finally:
+        if delete:
+            cache.destroy()


class DataChainCache:
    def __init__(self, cache_dir: str, tmp_dir: str):
        self.odb = LocalHashFileDB(
@@ -28,6 +49,9 @@ def __init__(self, cache_dir: str, tmp_dir: str):
            tmp_dir=tmp_dir,
        )

+    def __eq__(self, other) -> bool:
+        return self.odb == other.odb
+
    @property
    def cache_dir(self):
        return self.odb.path
@@ -82,20 +106,18 @@ async def download(
            os.unlink(tmp_info)

    def store_data(self, file: "File", contents: bytes) -> None:
-        checksum = file.get_hash()
-        dst = self.path_from_checksum(checksum)
-        if not os.path.exists(dst):
-            # Create the file only if it's not already in cache
-            os.makedirs(os.path.dirname(dst), exist_ok=True)
-            with open(dst, mode="wb") as f:
-                f.write(contents)
+        self.odb.add_bytes(file.get_hash(), contents)

-    def clear(self):
+    def clear(self) -> None:
        """
        Completely clear the cache.
        """
        self.odb.clear()

+    def destroy(self) -> None:
+        # `clear` leaves the prefix directory structure intact.
+        remove(self.cache_dir)
+
    def get_total_size(self) -> int:
        total = 0
        for subdir in try_scandir(self.odb.path):
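Usage note: `temporary_cache` is a context manager, so the throwaway cache directory is removed even if the body raises. A hypothetical sketch (the `/tmp/datachain` path is made up):

```python
from datachain.cache import temporary_cache

with temporary_cache("/tmp/datachain", prefix="prefetch-") as cache:
    ...  # downloads routed through `cache` land in a fresh mkdtemp() directory
# cache.destroy() has run here (skipped when delete=False is passed)
```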
6 changes: 6 additions & 0 deletions src/datachain/catalog/catalog.py
@@ -534,6 +534,12 @@ def find_column_to_str(  # noqa: PLR0911
return ""


def clone_catalog_with_cache(catalog: "Catalog", cache: "DataChainCache") -> "Catalog":
clone = catalog.copy()
clone.cache = cache
return clone


class Catalog:
def __init__(
self,
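Presumably this combines with the cache helpers above roughly as follows (illustrative wiring, not code from this PR):

```python
from datachain.cache import temporary_cache
from datachain.catalog.catalog import clone_catalog_with_cache

def with_prefetch_cache(catalog):
    # Route prefetch downloads to a throwaway cache while leaving the
    # user's persistent cache untouched.
    with temporary_cache(catalog.cache.tmp_dir, prefix="prefetch-") as tmp:
        prefetch_catalog = clone_catalog_with_cache(catalog, tmp)
        ...  # run prefetching against prefetch_catalog
```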
17 changes: 13 additions & 4 deletions src/datachain/lib/file.py
@@ -269,10 +269,19 @@
        client = self._catalog.get_client(self.source)
        client.download(self, callback=self._download_cb)

-    async def _prefetch(self) -> None:
-        if self._caching_enabled:
-            client = self._catalog.get_client(self.source)
-            await client._download(self, callback=self._download_cb)
+    async def _prefetch(self, download_cb: Optional["Callback"] = None) -> bool:
+        from datachain.client.hf import HfClient
+
+        if self._catalog is None:
+            raise RuntimeError("cannot prefetch file because catalog is not setup")
+
+        client = self._catalog.get_client(self.source)
+        if client.protocol == HfClient.protocol:
+            return False
+
+        await client._download(self, callback=download_cb or self._download_cb)
+        self._download_cb = DEFAULT_CALLBACK
+        return True

    def get_local_path(self) -> Optional[str]:
        """Return path to a file in a local cache.
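A rough sketch of how a caller can use the new boolean result, mirroring the `after_prefetch` hook wired up in pytorch.py below (the loop itself is hypothetical):

```python
async def prefetch_all(files, download_cb):
    for file in files:
        # HuggingFace URIs return False and are excluded from the count
        if await file._prefetch(download_cb=download_cb):
            download_cb.increment_file_count()
```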
65 changes: 52 additions & 13 deletions src/datachain/lib/pytorch.py
@@ -1,5 +1,8 @@
import logging
-from collections.abc import Iterator
+import os
+import weakref
+from collections.abc import Generator, Iterable, Iterator
+from contextlib import closing
from typing import TYPE_CHECKING, Any, Callable, Optional

from PIL import Image
@@ -9,11 +12,13 @@
from torchvision.transforms import v2

from datachain import Session
-from datachain.asyn import AsyncMapper
+from datachain.cache import get_temp_cache
from datachain.catalog import Catalog, get_catalog
from datachain.lib.dc import DataChain
from datachain.lib.settings import Settings
from datachain.lib.text import convert_text
+from datachain.progress import CombinedDownloadCallback
+from datachain.query.dataset import get_download_callback

if TYPE_CHECKING:
from torchvision.transforms.v2 import Transform
@@ -75,6 +80,18 @@
        if (prefetch := dc_settings.prefetch) is not None:
            self.prefetch = prefetch

+        if self.cache or not self.prefetch:
+            self._cache = catalog.cache
+        else:
+            tmp_dir = catalog.cache.tmp_dir
+            assert tmp_dir
+            self._cache = get_temp_cache(tmp_dir, prefix="prefetch-")
+            weakref.finalize(self, self.close)
+
+    def close(self) -> None:
+        if not self.cache:
+            self._cache.destroy()
+
    def _init_catalog(self, catalog: "Catalog"):
        # For compatibility with multiprocessing,
        # we can only store params in __init__(), as Catalog isn't picklable
@@ -89,9 +106,15 @@
        ms = ms_cls(*ms_args, **ms_kwargs)
        wh_cls, wh_args, wh_kwargs = self._wh_params
        wh = wh_cls(*wh_args, **wh_kwargs)
-        return Catalog(ms, wh, **self._catalog_params)
+        catalog = Catalog(ms, wh, **self._catalog_params)
+        catalog.cache = self._cache
+        return catalog

-    def _rows_iter(self, total_rank: int, total_workers: int):
+    def _row_iter(
+        self,
+        total_rank: int,
+        total_workers: int,
+    ) -> Generator[tuple[Any, ...], None, None]:
        catalog = self._get_catalog()
        session = Session("PyTorch", catalog=catalog)
        ds = DataChain.from_dataset(
@@ -104,16 +127,32 @@
        ds = ds.chunk(total_rank, total_workers)
        yield from ds.collect()

-    def __iter__(self) -> Iterator[Any]:
-        total_rank, total_workers = self.get_rank_and_workers()
-        rows = self._rows_iter(total_rank, total_workers)
-        if self.prefetch > 0:
-            from datachain.lib.udf import _prefetch_input
-
-            rows = AsyncMapper(_prefetch_input, rows, workers=self.prefetch).iterate()
-        yield from map(self._process_row, rows)
+    def _iter_with_prefetch(self) -> Generator[tuple[Any], None, None]:
+        from datachain.lib.udf import _prefetch_inputs
+
+        total_rank, total_workers = self.get_rank_and_workers()
+        download_cb = CombinedDownloadCallback()
+        if os.getenv("DATACHAIN_SHOW_PREFETCH_PROGRESS"):
+            download_cb = get_download_callback(
+                f"{total_rank}/{total_workers}", position=total_rank
+            )
Member Author commented on lines +135 to +138:

This shows a prefetch download progress bar for each worker, which will be useful for debugging.

We cannot enable this by default, as it would mess up the user's progress bars due to multiprocessing.
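For example, to turn the per-worker progress bars on for a debugging run (any non-empty value should work, judging by the os.getenv() truthiness check):

```python
import os

# set before iterating the DataLoader so spawned workers inherit it
os.environ["DATACHAIN_SHOW_PREFETCH_PROGRESS"] = "1"
```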


+        rows = self._row_iter(total_rank, total_workers)
+        rows = _prefetch_inputs(
+            rows,
+            self.prefetch,
+            download_cb=download_cb,
+            after_prefetch=download_cb.increment_file_count,
+        )
+
+        with download_cb, closing(rows):
+            yield from rows
+
+    def __iter__(self) -> Iterator[list[Any]]:
+        with closing(self._iter_with_prefetch()) as rows:
+            yield from map(self._process_row, rows)

-    def _process_row(self, row_features):
+    def _process_row(self, row_features: Iterable[Any]) -> list[Any]:
        row = []
        for fr in row_features:
            if hasattr(fr, "read"):
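The weakref.finalize(self, self.close) registration above ties cleanup of the temporary prefetch cache to the dataset's lifetime. A standalone sketch of the pattern (hypothetical class; a common variant passes a plain function and its arguments so the finalizer itself does not keep the object alive):

```python
import shutil
import tempfile
import weakref

class TempDirOwner:
    def __init__(self) -> None:
        self.tmp = tempfile.mkdtemp(prefix="prefetch-")
        # runs on garbage collection or at interpreter exit, whichever is first
        self._finalizer = weakref.finalize(self, shutil.rmtree, self.tmp)

    def close(self) -> None:
        self._finalizer()  # idempotent: shutil.rmtree runs at most once

owner = TempDirOwner()
owner.close()  # the directory is gone; GC/exit would otherwise clean it up
```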