import argparse
import logging
from typing import TYPE_CHECKING

from funcy import compact, log_durations

from dvc.cli.command import CmdBase
from dvc.cli.utils import append_doc_link, fix_subparsers
from dvc.ui import ui

if TYPE_CHECKING:
    from dvc.repo.data import Status as DataStatus


logger = logging.getLogger(__name__)


class CmdDataStatus(CmdBase):
    """Implement ``dvc data status``.

    Fetches the status mapping from ``self.repo.data_status`` and prints it
    either as JSON or as labelled, coloured sections.
    """

    # Colour applied to the entries of a section; sections without an entry
    # here fall back to "normal" in `_show_status`.
    COLORS = {
        "not_in_cache": "red",
        "committed": "green",
        "uncommitted": "yellow",
        "untracked": "cyan",
    }
    # Section headers; unknown sections get "<Name> files" in `_show_status`.
    LABELS = {
        "not_in_cache": "Not in cache",
        "committed": "DVC committed changes",
        "uncommitted": "DVC uncommitted changes",
        "untracked": "Untracked files",
        "unchanged": "DVC unchanged files",
    }
    # One-line hint printed under a section header. "git_dirty" is not a
    # section: it is a format string printed at the very end.
    # NOTE(review): the "untracked" hint has an unbalanced double quote in
    # front of `dvc add`. The unit test for this command pins the current
    # text, so the literal is left untouched here; fix both together.
    # NOTE(review): the bare `...` in these hints looks like a placeholder
    # (e.g. a file argument) that was stripped from this copy -- confirm
    # against upstream before relying on the exact wording.
    HINTS = {
        "not_in_cache": 'use "dvc pull ..." '
        "to update your local storage",
        "committed": "git commit the corresponding dvc files "
        "to update the repo",
        "uncommitted": 'use "dvc commit ..." to track changes',
        "untracked": 'use "git add ..." or '
        'dvc add ..." to commit to git or to dvc',
        "git_dirty": "there are {}changes not tracked by dvc, "
        'use "git status" to see',
    }

    @staticmethod
    def _process_status(status: "DataStatus"):
        """Flatten stage status, and filter empty stage status contents.

        Yields ``(stage, items)`` pairs. A dict-valued section
        (``{state: [files]}``) is inverted into ``{file: state}``; any other
        value is passed through unchanged. Sections whose items are empty
        are skipped.
        """
        for stage, stage_status in status.items():
            items = stage_status
            if isinstance(stage_status, dict):
                items = {
                    file: state
                    for state, files in stage_status.items()
                    for file in files
                }
            if not items:
                continue
            yield stage, items

    @classmethod
    def _show_status(cls, status: "DataStatus") -> int:
        """Print ``status`` as human-readable sections and return ``0``.

        The ``"git"`` entry is not rendered as a section; it only drives the
        "empty git repo" suffix and the trailing "git_dirty" hint. ``status``
        is not modified, and a missing ``"git"`` entry is treated as empty.
        """
        # Read the git metadata without popping it, so the caller's mapping
        # stays intact and an absent key does not raise.
        git_info = status.get("git") or {}
        sections = {
            key: value for key, value in status.items() if key != "git"
        }
        result = dict(
            cls._process_status(sections)  # type: ignore[arg-type]
        )
        if not result:
            no_changes = "No changes"
            if git_info.get("is_empty", False):
                no_changes += " in an empty git repo"
            ui.write(f"{no_changes}.")

        for idx, (stage, stage_status) in enumerate(result.items()):
            if idx:
                # separator between consecutive sections
                ui.write()

            label = cls.LABELS.get(stage, stage.capitalize() + " files")
            header = f"{label}:"
            color = cls.COLORS.get(stage, "normal")

            ui.write(header)
            # NOTE(review): whitespace runs appear collapsed in this copy of
            # the file; confirm the hint's leading indentation upstream.
            if hint := cls.HINTS.get(stage):
                ui.write(f" ({hint})")

            if isinstance(stage_status, dict):
                # flattened {file: state} -> "state: file"
                items = [
                    ": ".join([state, file])
                    for file, state in stage_status.items()
                ]
            else:
                items = stage_status

            # NOTE(review): `item` is interpolated unescaped into styled
            # markup; a path containing "[...]" may be parsed as a style tag.
            for item in items:
                ui.write(f"\t[{color}]{item}[/]".expandtabs(8), styled=True)

        if (hint := cls.HINTS.get("git_dirty")) and git_info.get("is_dirty"):
            # "other " is only inserted when DVC-level changes were listed.
            message = hint.format("other " if result else "")
            ui.write(f"[blue]({message})[/]", styled=True)
        return 0

    def run(self) -> int:
        """Compute the data status and print it; returns ``0``.

        ``unchanged`` is dropped unless ``--unchanged`` was given and
        ``untracked`` is dropped when ``--untracked-files`` is ``"no"``.
        With ``--json`` the ``git`` entry is removed and the remaining
        non-empty entries are written as JSON.
        """
        with log_durations(logger.trace, "in data_status"):  # type: ignore
            status = self.repo.data_status(
                granular=self.args.granular,
                untracked_files=self.args.untracked_files,
                with_dirs=self.args.with_dirs,
            )

        if not self.args.unchanged:
            status.pop("unchanged")  # type: ignore[misc]
        if self.args.untracked_files == "no":
            status.pop("untracked")
        if self.args.json:
            status.pop("git")  # type: ignore[misc]
            ui.write_json(compact(status))
            return 0
        return self._show_status(status)


def add_parser(subparsers, parent_parser):
    """Register the ``data`` command group and its ``status`` subcommand.

    ``--show-json``, and ``--with-dirs`` are accepted but hidden from the
    help output (``argparse.SUPPRESS``); ``--show-json`` writes to the same
    destination as ``--json``.
    """
    data_parser = subparsers.add_parser(
        "data",
        parents=[parent_parser],
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    data_subparsers = data_parser.add_subparsers(
        dest="cmd",
        help="Use `dvc data CMD --help` to display command-specific help.",
    )
    fix_subparsers(data_subparsers)

    DATA_STATUS_HELP = (
        "Show changes between the last git commit, "
        "the dvcfiles and the workspace."
    )
    data_status_parser = data_subparsers.add_parser(
        "status",
        parents=[parent_parser],
        description=append_doc_link(DATA_STATUS_HELP, "data/status"),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        help=DATA_STATUS_HELP,
    )
    data_status_parser.add_argument(
        "--json",
        action="store_true",
        default=False,
        help="Show output in JSON format.",
    )
    data_status_parser.add_argument(
        "--show-json",
        action="store_true",
        default=False,
        dest="json",
        help=argparse.SUPPRESS,
    )
    data_status_parser.add_argument(
        "--granular",
        action="store_true",
        default=False,
        help="Show granular file-level info for DVC-tracked directories.",
    )
    data_status_parser.add_argument(
        "--unchanged",
        action="store_true",
        default=False,
        help="Show unmodified DVC-tracked files.",
    )
    # Bare `--untracked-files` means "all" (const); omitting it means "no".
    data_status_parser.add_argument(
        "--untracked-files",
        choices=["no", "all"],
        default="no",
        const="all",
        nargs="?",
        help="Show untracked files.",
    )
    data_status_parser.add_argument(
        "--with-dirs",
        action="store_true",
        default=False,
        help=argparse.SUPPRESS,
    )
    data_status_parser.set_defaults(func=CmdDataStatus)
import os
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypedDict, cast

if TYPE_CHECKING:
    from scmrepo.base import Base

    from dvc.output import Output
    from dvc.repo import Repo
    from dvc_data.hashfile.db import HashFileDB
    from dvc_data.hashfile.obj import HashFile


def _in_cache(obj: Optional["HashFile"], cache: "HashFileDB") -> bool:
    """Return whether ``obj``'s hash value passes ``cache.check``.

    ``False`` for a falsy ``obj``, for an object without a hash value, and
    when the check raises ``FileNotFoundError`` or ``ObjectFormatError``.
    """
    # Trivial rejections first: the lazy import below is only needed when a
    # cache lookup actually happens.
    if not obj or not obj.hash_info.value:
        return False

    from dvc_objects.errors import ObjectFormatError

    try:
        cache.check(obj.hash_info.value)
    except (FileNotFoundError, ObjectFormatError):
        return False
    return True


def _shallow_diff(
    root: str,
    old_obj: Optional["HashFile"],
    new_obj: Optional["HashFile"],
    cache: "HashFileDB",
) -> Dict[str, List[str]]:
    """Classify ``root`` as a whole, without descending into directories.

    Returns a mapping with at most one of ``added``/``deleted``/
    ``modified``/``unchanged`` (decided by comparing ``hash_info``), plus
    ``not_in_cache`` when ``old_obj`` fails `_in_cache`. When ``new_obj`` is
    a ``Tree`` the reported path gets a trailing path separator.
    """
    # TODO: add support for shallow diff in dvc-data
    # TODO: we may want to recursively do in_cache check
    from dvc_data.objects.tree import Tree

    d: Dict[str, List[str]] = {}

    if isinstance(new_obj, Tree):
        root = os.path.sep.join([root, ""])

    if not _in_cache(old_obj, cache):
        d["not_in_cache"] = [root]

    if old_obj is None and new_obj is None:
        return d
    if old_obj is None:
        return {"added": [root], **d}
    if new_obj is None:
        return {"deleted": [root], **d}
    if old_obj.hash_info != new_obj.hash_info:
        return {"modified": [root], **d}
    return {"unchanged": [root], **d}


def _granular_diff(
    root: str,
    old_obj: Optional["HashFile"],
    new_obj: Optional["HashFile"],
    cache: "HashFileDB",
    with_dirs: bool = False,
) -> Dict[str, List[str]]:
    """Classify every entry reported by ``dvc_data.diff.diff``.

    Returns a ``defaultdict`` with the keys ``added``, ``deleted``,
    ``modified``, ``unchanged`` and ``not_in_cache``. When ``new_obj`` is a
    ``Tree`` entry paths are ``root`` joined with the entry key, and the
    tree's own ROOT entry is omitted unless ``with_dirs`` is true; otherwise
    every entry is reported as ``root``.
    """
    from dvc_data.diff import ROOT
    from dvc_data.diff import diff as odiff
    from dvc_data.objects.tree import Tree

    is_tree = isinstance(new_obj, Tree)

    def path_join(*paths: str) -> str:
        if not is_tree:
            return root
        return os.path.sep.join([root, *paths])

    diff_data = odiff(old_obj, new_obj, cache)
    drop_root = not with_dirs and is_tree

    output: Dict[str, List[str]] = defaultdict(list)
    for state in ("added", "deleted", "modified", "unchanged"):
        # Apply the ROOT filter once; both lists below use the same items.
        items = [
            item
            for item in getattr(diff_data, state)
            if not (drop_root and item.new.key == ROOT)
        ]
        output[state].extend(path_join(*item.new.key) for item in items)
        # TODO: PERF: diff is checking not_in_cache for each even if we only
        # need it for the index.
        # BUG: not_in_cache file also shows up as modified in staged and
        # unstaged. We currently don't know if it is really modified.
        output["not_in_cache"].extend(
            path_join(*item.new.key)
            for item in items
            if state != "added" and not item.old.in_cache
        )
    return output


def _diff(
    root: str,
    old_obj: Optional["HashFile"],
    new_obj: Optional["HashFile"],
    cache: "HashFileDB",
    granular: bool = False,
    with_dirs: bool = False,
) -> Dict[str, List[str]]:
    """Dispatch to `_granular_diff` or `_shallow_diff`.

    ``with_dirs`` is only forwarded in granular mode.
    """
    if granular:
        return _granular_diff(
            root, old_obj, new_obj, cache, with_dirs=with_dirs
        )
    return _shallow_diff(root, old_obj, new_obj, cache)


class GitInfo(TypedDict, total=False):
    # All keys are optional: `_git_info` returns an empty mapping for NoSCM.
    staged: Dict[str, List[str]]
    unstaged: Dict[str, List[str]]
    untracked: List[str]
    is_empty: bool
    is_dirty: bool


def _git_info(scm: "Base", untracked_files: str = "all") -> GitInfo:
    """Collect git-level status for ``scm``.

    Returns ``{}`` for a ``NoSCM`` instance. ``is_empty`` is true when
    ``scm.get_rev()`` raises ``SCMError``; ``is_dirty`` is true when any of
    staged, unstaged or untracked is non-empty.
    """
    from scmrepo.exceptions import SCMError

    from dvc.scm import NoSCM

    if isinstance(scm, NoSCM):
        return {}

    try:
        scm.get_rev()
    except SCMError:
        empty_repo = True
    else:
        empty_repo = False

    staged, unstaged, untracked = scm.status(untracked_files=untracked_files)
    # NOTE: order is important here.
    return GitInfo(
        staged=staged,
        unstaged=unstaged,
        untracked=untracked,
        is_empty=empty_repo,
        is_dirty=any([staged, unstaged, untracked]),
    )


def _merge_non_empty(
    target: Dict[str, List[str]], diff: Dict[str, List[str]]
) -> None:
    """Append every non-empty state list of ``diff`` onto ``target``."""
    for state, items in diff.items():
        if items:
            target[state].extend(items)


def _diff_index_to_wtree(repo: "Repo", **kwargs: Any) -> Dict[str, List[str]]:
    """Diff each cached output of ``repo.index`` against the workspace.

    The workspace object is built with ``dry_run=True``; a path that raises
    ``FileNotFoundError`` is diffed as ``None``. Extra keyword arguments are
    forwarded to `_diff`. Only non-empty states appear in the result.
    """
    from dvc_data.build import build

    unstaged_diff: Dict[str, List[str]] = defaultdict(list)
    for out in repo.index.outs:
        out = cast("Output", out)
        if not out.use_cache:
            continue

        try:
            _, _, new = build(
                out.odb,
                out.fs_path,
                out.fs,
                out.fs.PARAM_CHECKSUM,
                ignore=out.dvcignore,
                dry_run=True,
            )
        except FileNotFoundError:
            new = None

        cache = repo.odb.local
        root = str(out)
        old = out.get_obj()
        _merge_non_empty(
            unstaged_diff, _diff(root, old, new, cache, **kwargs)
        )
    return unstaged_diff


def _diff_head_to_index(
    repo: "Repo", head: str = "HEAD", **kwargs: Any
) -> Dict[str, List[str]]:
    """Diff cached outputs as recorded at ``head`` against the index.

    Iterates ``repo.brancher(revs=[head])``; objects seen under the
    ``"workspace"`` revision are treated as the index side, all others as
    the ``head`` side. Extra keyword arguments are forwarded to `_diff`.
    """
    # we need to store objects from index and the HEAD to diff later
    objs: Dict[str, Dict[str, "HashFile"]] = defaultdict(dict)
    staged_diff: Dict[str, List[str]] = defaultdict(list)
    for rev in repo.brancher(revs=[head]):
        for out in repo.index.outs:
            out = cast("Output", out)
            if not out.use_cache:
                continue

            root = str(out)
            typ = "index" if rev == "workspace" else head
            objs[root][typ] = out.get_obj()

    cache = repo.odb.local
    for root, obj_d in objs.items():
        old = obj_d.get(head, None)
        new = obj_d.get("index", None)
        _merge_non_empty(staged_diff, _diff(root, old, new, cache, **kwargs))

    return staged_diff


class Status(TypedDict):
    not_in_cache: List[str]
    committed: Dict[str, Any]
    uncommitted: Dict[str, Any]
    untracked: List[str]
    unchanged: List[str]
    git: GitInfo


def _transform_git_paths_to_dvc(repo: "Repo", files: List[str]) -> List[str]:
    """Transform files rel. to Git root to DVC root, and drop outside files.

    When the DVC root and the Git root coincide, ``files`` is returned
    as is (the same list object).
    """
    rel = repo.fs.path.relpath(repo.root_dir, repo.scm.root_dir).rstrip("/")
    if rel in (os.curdir, ""):
        return files

    prefix = rel + os.sep
    length = len(prefix)
    return [file[length:] for file in files if file.startswith(prefix)]


def status(repo: "Repo", untracked_files: str = "no", **kwargs: Any) -> Status:
    """Build the combined data status for ``repo``.

    ``head`` (default ``"HEAD"``) is taken out of ``kwargs``; the remaining
    keyword arguments are forwarded to the diff helpers. ``unchanged`` holds
    the paths unchanged in the index-to-workspace diff and, when the
    head-to-index diff succeeds, also unchanged there. If that diff raises
    ``SCMError`` or ``NoSCMError``, ``committed`` is empty.
    """
    from scmrepo.exceptions import SCMError

    from dvc.scm import NoSCMError

    head = kwargs.pop("head", "HEAD")
    uncommitted_diff = _diff_index_to_wtree(repo, **kwargs)
    not_in_cache = uncommitted_diff.pop("not_in_cache", [])
    unchanged = set(uncommitted_diff.pop("unchanged", []))

    try:
        committed_diff = _diff_head_to_index(repo, head=head, **kwargs)
    except (SCMError, NoSCMError):
        committed_diff = {}
    else:
        # we don't care about not-in-cache between the head and the index.
        committed_diff.pop("not_in_cache", None)
        unchanged &= set(committed_diff.pop("unchanged", []))

    git_info = _git_info(repo.scm, untracked_files=untracked_files)
    untracked = git_info.get("untracked", [])
    untracked = _transform_git_paths_to_dvc(repo, untracked)
    # order matters here
    return Status(
        not_in_cache=not_in_cache,
        committed=committed_diff,
        uncommitted=uncommitted_diff,
        untracked=untracked,
        # A set has no stable iteration order across interpreter runs; sort
        # so CLI and JSON output are reproducible.
        unchanged=sorted(unchanged),
        git=git_info,
    )
# ---- tests/func/test_data_status.py (continued) ----
from dvc.utils.fs import remove

# Status of a repo with nothing to report; individual tests override the
# keys they care about.
EMPTY_STATUS = {
    "committed": {},
    "uncommitted": {},
    "git": {},
    "not_in_cache": [],
    "unchanged": [],
    "untracked": [],
}


def test_file(M, tmp_dir, scm, dvc):
    """A tracked file that is re-generated and then removed.

    Expects "modified" under committed and "deleted" under uncommitted, with
    identical results in shallow and granular mode.
    """
    tmp_dir.dvc_gen("foo", "foo", commit="add foo")
    tmp_dir.dvc_gen("foo", "foobar")
    remove(tmp_dir / "foo")

    expected = {
        "committed": {"modified": ["foo"]},
        "uncommitted": {"deleted": ["foo"]},
        "git": M.instance_of(dict),
        "not_in_cache": [],
        "unchanged": [],
        "untracked": [],
    }
    assert dvc.data_status() == expected
    assert dvc.data_status(granular=True) == expected


def test_directory(M, tmp_dir, scm, dvc):
    """A tracked directory changed both in the index and in the workspace.

    Shallow mode reports the directory (with trailing separator) as modified
    on both sides; granular mode reports per-file states and, with
    untracked_files="all", the untracked file.
    """
    tmp_dir.dvc_gen({"dir": {"foo": "foo"}}, commit="add dir")
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar", "foobar": "foobar"}})
    remove(tmp_dir / "dir")
    (tmp_dir / "dir").gen({"foo": "foo", "bar": "barr", "baz": "baz"})
    tmp_dir.gen("untracked", "untracked")

    assert dvc.data_status() == {
        "committed": {"modified": [join("dir", "")]},
        "uncommitted": {"modified": [join("dir", "")]},
        "git": M.instance_of(dict),
        "not_in_cache": [],
        "unchanged": [],
        "untracked": [],
    }

    assert dvc.data_status(granular=True, untracked_files="all") == {
        "committed": {
            "added": M.unordered(
                join("dir", "bar"),
                join("dir", "foobar"),
            )
        },
        "uncommitted": {
            "added": [join("dir", "baz")],
            "modified": [join("dir", "bar")],
            "deleted": [join("dir", "foobar")],
        },
        "git": M.instance_of(dict),
        "not_in_cache": [],
        "unchanged": [join("dir", "foo")],
        "untracked": ["untracked"],
    }


def test_new_empty_git_repo(M, tmp_dir, scm):
    """A freshly initialised DVC repo reports git as empty and dirty."""
    dvc = Repo.init()
    assert dvc.data_status() == {
        **EMPTY_STATUS,
        "git": M.dict(
            is_empty=True,
            is_dirty=True,
        ),
    }


def test_noscm_repo(dvc):
    """Without the `scm` fixture the status equals EMPTY_STATUS."""
    assert dvc.data_status() == EMPTY_STATUS


def test_unchanged(M, tmp_dir, scm, dvc):
    """Committed, untouched data shows up only under "unchanged".

    Shallow mode lists the directory itself; granular mode lists the file
    inside it.
    """
    tmp_dir.dvc_gen({"dir": {"foo": "foo"}}, commit="add dir")
    tmp_dir.dvc_gen("bar", "bar", commit="add foo")

    assert dvc.data_status() == {
        **EMPTY_STATUS,
        "git": M.instance_of(dict),
        "unchanged": M.unordered("bar", join("dir", "")),
    }
    assert dvc.data_status(granular=True) == {
        **EMPTY_STATUS,
        "git": M.instance_of(dict),
        "unchanged": M.unordered("bar", join("dir", "foo")),
    }


def test_not_in_cache(M, tmp_dir, scm, dvc):
    """Status after the local cache directory has been removed.

    Pins the current shallow and granular output (see the TODO below: the
    results themselves are under question).
    """
    # TODO: investigation required, might return wrong results
    tmp_dir.dvc_gen({"dir": {"foo": "foo"}}, commit="add dir")
    tmp_dir.dvc_gen("bar", "bar", commit="add foo")
    remove(dvc.odb.local.cache_dir)

    assert dvc.data_status() == {
        **EMPTY_STATUS,
        "not_in_cache": M.unordered("bar", join("dir", "")),
        "git": M.instance_of(dict),
        "unchanged": ["bar"],
        "uncommitted": {"added": [join("dir", "")]},
    }
    assert dvc.data_status(granular=True) == {
        **EMPTY_STATUS,
        "not_in_cache": M.unordered("bar"),
        "git": M.instance_of(dict),
        "unchanged": [],
        "committed": {"modified": ["bar"]},
        "uncommitted": {"modified": ["bar"], "added": [join("dir", "foo")]},
    }


def test_withdirs(M, tmp_dir, scm, dvc):
    """with_dirs=True additionally lists the directory entry itself."""
    tmp_dir.dvc_gen({"dir": {"foo": "foo"}}, commit="add dir")
    tmp_dir.dvc_gen("bar", "bar", commit="add foo")
    assert dvc.data_status(granular=True, with_dirs=True) == {
        **EMPTY_STATUS,
        "git": M.instance_of(dict),
        "unchanged": M.unordered("bar", join("dir", "foo"), join("dir", "")),
    }


def test_skip_uncached_pipeline_outputs(tmp_dir, dvc, run_copy_metrics):
    """An output declared via metrics_no_cache does not appear in status."""
    tmp_dir.gen({"m_temp.yaml": str(5)})
    run_copy_metrics(
        "m_temp.yaml",
        "m.yaml",
        metrics_no_cache=["m.yaml"],
        name="copy-metrics",
    )
    assert dvc.data_status() == EMPTY_STATUS
    assert (
        dvc.data_status(granular=True, untracked_files="all") == EMPTY_STATUS
    )


def test_output_with_newly_added_stage(tmp_dir, dvc):
    """The output of a stage that was only added is reported not_in_cache."""
    dvc.stage.add(deps=["bar"], outs=["foo"], name="copy", cmd="cp foo bar")
    assert dvc.data_status() == {**EMPTY_STATUS, "not_in_cache": ["foo"]}


def test_subdir(M, tmp_dir, scm):
    """Status of a DVC repo living in a subdirectory of the git repo.

    Untracked paths are expected relative to the DVC root ("untracked"),
    not to the git root.
    """
    subrepo = tmp_dir / "sub"
    make_subrepo(subrepo, scm)

    with subrepo.chdir():
        subrepo.dvc_gen({"dir": {"foo": "foo"}}, commit="add dir")
        subrepo.dvc_gen("bar", "bar", commit="add foo")
        subrepo.gen("untracked", "untracked")

        dvc = subrepo.dvc
        assert dvc.data_status(granular=True, untracked_files="all") == {
            **EMPTY_STATUS,
            "git": M.instance_of(dict),
            "unchanged": M.unordered("bar", join("dir", "foo")),
            "untracked": ["untracked"],
        }


# ---- tests/unit/command/test_data_status.py ----
import json

import pytest
from funcy import omit

from dvc.cli import main, parse_args
from dvc.commands.data import CmdDataStatus
from dvc.repo import Repo
from dvc.repo.data import Status


@pytest.fixture
def mocked_status():
    """Canned Status with one entry in every section and a dirty git."""
    yield Status(
        not_in_cache=["notincache"],
        committed={"added": ["dir/bar", "dir/foo"]},
        uncommitted={
            "added": ["dir/baz"],
            "modified": ["dir/bar"],
            "deleted": ["dir/foobar"],
        },
        untracked=["untracked"],
        unchanged=["dir/foo"],
        git={"is_dirty": True},
    )


def test_cli(dvc, mocker, mocked_status):
    """CLI flags are parsed and forwarded to Repo.data_status.

    A bare --untracked-files is expected to arrive as "all".
    """
    status = mocker.patch(
        "dvc.repo.Repo.data_status", return_value=mocked_status
    )

    cli_args = parse_args(
        [
            "data",
            "status",
            "--json",
            "--unchanged",
            "--untracked-files",
            "--with-dirs",
            "--granular",
        ]
    )

    assert cli_args.func == CmdDataStatus
    cmd = cli_args.func(cli_args)
    assert cmd.run() == 0
    status.assert_called_once_with(
        untracked_files="all",
        granular=True,
        with_dirs=True,
    )


@pytest.mark.parametrize(
    "args, to_omit",
    [
        ([], ["untracked", "unchanged"]),
        (["--unchanged"], ["untracked"]),
        (["--unchanged", "--untracked-files"], []),
    ],
)
def test_json(dvc, mocker, capsys, mocked_status, args, to_omit):
    """--json prints the status minus "git" and minus the omitted keys."""
    mocker.patch("dvc.repo.Repo.data_status", return_value=mocked_status)
    assert main(["data", "status", "--json", *args]) == 0
    out, err = capsys.readouterr()
    assert out.rstrip() == json.dumps(omit(mocked_status, [*to_omit, "git"]))
    assert not err


def test_no_changes_repo(dvc, scm, capsys):
    """A repo with nothing to report prints exactly "No changes."."""
    assert main(["data", "status"]) == 0
    assert capsys.readouterr() == ("No changes.\n", "")


def test_empty_scm_repo(tmp_dir, capsys):
    """An empty git repo gets the "empty git repo" suffix and dirty hint."""
    tmp_dir.init(scm=True)
    Repo.init()

    assert main(["data", "status"]) == 0
    out, err = capsys.readouterr()
    assert (
        out
        == """\
No changes in an empty git repo.
(there are changes not tracked by dvc, use "git status" to see)
"""
    )
    assert not err


@pytest.mark.parametrize(
    "args",
    [
        ("--untracked-files",),
        ("--unchanged",),
        ("--untracked-files", "--unchanged"),
    ],
)
@pytest.mark.parametrize("is_dirty", [True, False])
def test_show_status(dvc, scm, mocker, capsys, mocked_status, args, is_dirty):
    """Human-readable output for each flag combination and dirty state."""
    mocked_status["git"]["is_dirty"] = is_dirty
    mocker.patch("dvc.repo.Repo.data_status", return_value=mocked_status)
    assert main(["data", "status", *args]) == 0
    out, err = capsys.readouterr()
    # NOTE(review): whitespace runs were collapsed in this copy of the file,
    # so the leading indentation inside the expected strings below is
    # reconstructed (one space before hints, eight before entries) -- confirm
    # against upstream before trusting these literals.
    expected_out = """\
Not in cache:
 (use "dvc pull ..." to update your local storage)
        notincache

DVC committed changes:
 (git commit the corresponding dvc files to update the repo)
        added: dir/bar
        added: dir/foo

DVC uncommitted changes:
 (use "dvc commit ..." to track changes)
        added: dir/baz
        modified: dir/bar
        deleted: dir/foobar
"""
    if "--untracked-files" in args:
        expected_out += """
Untracked files:
 (use "git add ..." or dvc add ..." to commit to git or to dvc)
        untracked
"""
    if "--unchanged" in args:
        expected_out += """
DVC unchanged files:
        dir/foo
"""

    if is_dirty:
        expected_out += """\
(there are other changes not tracked by dvc, use "git status" to see)
"""
    assert out == expected_out
    assert not err