Skip to content

Commit

Permalink
Merge pull request #371 from DESm1th/blacklisting
Browse files Browse the repository at this point in the history
[ENH] Keep blacklisted files + handle blacklisted files in bids folder
  • Loading branch information
DESm1th authored Jan 29, 2025
2 parents 170adfe + a505222 commit e727a65
Show file tree
Hide file tree
Showing 5 changed files with 176 additions and 52 deletions.
4 changes: 2 additions & 2 deletions assets/config_templates/main_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ IgnoreHeaderFields: # Dicom Header fields to exclude from gold
# study's resources folder. (Optional).
# Default: 'behav|\.edat2'

# BlacklistDel: [nii, mnc, nrrd, resources] # Indicate which directories to
# BlacklistDel: [nii, bids, resources] # Indicate which directories to
# delete blacklisted data from.
# Default: [nii, mnc, nrrd,
# Default: [nii, bids,
# resources]
152 changes: 107 additions & 45 deletions bin/dm_blacklist_rm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python
"""
Searches the file system for any data on the blacklist and deletes them.
Searches the file system for any data on the blacklist and removes them
from subject data folders to avoid pipeline failures etc.
Usage:
dm_blacklist_rm.py [options] [--path=KEY]... <project>
Expand All @@ -9,6 +10,8 @@
<project> The name of a datman managed project
Options:
--keep If provided, the blacklisted scans will be moved to
a 'blacklisted' subdir instead of being deleted.
--path KEY If provided overrides the 'BlacklistDel' setting
from the config files, which defines the directories
to delete blacklisted items from. 'KEY' may be the name
Expand All @@ -22,15 +25,14 @@
"""
import os
import glob
import logging
import shutil

from docopt import docopt

import datman.config
import datman.scan
import datman.utils
from datman.scanid import ParseException

logging.basicConfig(level=logging.WARN,
format="[%(name)s] %(levelname)s: %(message)s")
Expand All @@ -43,6 +45,7 @@ def main():
global DRYRUN
arguments = docopt(__doc__)
project = arguments['<project>']
keep = arguments['--keep']
override_paths = arguments['--path']
verbose = arguments['--verbose']
debug = arguments['--debug']
Expand All @@ -58,13 +61,30 @@ def main():

config = datman.config.config(study=project)
metadata = datman.utils.get_subject_metadata(config, allow_partial=True)
base_paths = get_base_paths(config, override_paths)
search_paths = get_search_paths(config, override_paths)

remove_blacklisted_items(metadata, base_paths)
for sub in metadata:
if not metadata[sub]:
continue

logger.debug(f"Working on {sub}")
session = datman.scan.Scan(sub, config)
handle_blacklisted_scans(
session, metadata[sub], search_paths, keep=keep
)


def get_search_paths(config, user_paths=None):
"""Get path types to search for blacklisted files in.
Args:
config (:obj:`datman.config.config`): A datman config object for the
current study.
user_paths (:obj:`list`): A list of path types to search through.
Optional. Causes the configuration file setting to be ignored.
def get_base_paths(config, user_paths):
"""Get the full path to each base directory to search for blacklisted data.
Returns:
list: A list of path types that will be searched.
"""
if user_paths:
path_keys = user_paths
Expand All @@ -73,52 +93,94 @@ def get_base_paths(config, user_paths):
path_keys = config.get_key("BlacklistDel")
except datman.config.UndefinedSetting:
# Fall back to the default
path_keys = ['nii', 'mnc', 'nrrd', 'resources']
path_keys = ['nii', 'bids', 'resources']
return path_keys


def handle_blacklisted_scans(session, bl_scans, search_paths, keep=False):
"""Move or delete all blacklisted scans for the given path types.
Args:
session (:obj:`datman.scan.Scan`): A datman scan object for the
current session.
bl_scans (:obj:`list`): A list of strings each representing a
blacklisted scan.
search_paths (:obj:`list`): A list of path types to move/delete
blacklisted scans from. Each path type must exist in the
datman config files.
keep (bool): Whether to move files instead of deleting them. Optional,
default False.
"""
for scan in bl_scans:
for path_type in search_paths:
found = session.find_files(scan, format=path_type)

base_paths = []
for key in path_keys:
try:
found = config.get_path(key)
except datman.config.UndefinedSetting:
logger.warning(f"Given undefined path type - {key}. Ignoring.")
continue
if not found:
continue

if os.path.exists(found):
base_paths.append(found)
if is_already_handled(found):
continue

return base_paths
logger.debug(f"Files found for removal: {found}")

if DRYRUN:
logger.info("DRYRUN - Leaving files in place.")
continue

def remove_blacklisted_items(metadata, base_paths):
for sub in metadata:
blacklist_entries = metadata[sub]
if not blacklist_entries:
continue
for item in found:
if keep:
path = getattr(session, f"{path_type}_path")
logger.info(
f"Moving blacklisted files to {path}/blacklisted"
)
move_file(path, item)
else:
delete_file(item)

logger.debug(f"Working on {sub}")
for path in base_paths:
for sub_dir in glob.glob(os.path.join(path, sub + "*")):
for entry in blacklist_entries:
remove_matches(sub_dir, entry)


def remove_matches(path, fname):
matches = find_files(path, fname)
if matches:
logger.info(f"Files found for deletion: {matches}")
if DRYRUN:
return
for item in matches:
try:
os.remove(item)
except FileNotFoundError:
pass
except (PermissionError, IsADirectoryError):
logger.error(f"Failed to delete blacklisted item {item}.")

def is_already_handled(found_files):
"""Checks if the found files have already been moved or removed.
Split series scans get 'found' for each blacklist entry the first time
the script encounters the blacklist entries. This checks if all 'found'
items have already been removed so unnecessary error messages can be
avoided.
Args:
found_files (:obj:`list`): A list of files to check.
def find_files(path, fname):
return glob.glob(os.path.join(path, fname + "*"))
Returns:
bool
"""
return all([not os.path.exists(item) for item in found_files])


def move_file(path, item):
"""Move a file to a 'blacklisted' subdir inside the given path.
Args:
path (:obj:`str`): The path to move put the 'blacklisted' folder.
item (:obj:`str`): The full path to a blacklisted file to move.
"""
bl_dir = os.path.join(path, "blacklisted")
try:
os.mkdir(bl_dir)
except FileExistsError:
pass

try:
shutil.move(item, bl_dir)
except shutil.Error as e:
logger.error(f"Failed to move blacklisted file {item} - {e}")


def delete_file(file_path):
    """Delete a single blacklisted file.

    A file that no longer exists is ignored silently. A permissions problem,
    or a directory found at the given path, is logged as an error and the
    item is left in place.

    Args:
        file_path (:obj:`str`): The full path to the file to delete.
    """
    try:
        os.remove(file_path)
    except (FileNotFoundError, PermissionError, IsADirectoryError) as err:
        # An already-missing file needs no report; the other cases do.
        if not isinstance(err, FileNotFoundError):
            logger.error(f"Failed to delete blacklisted item {file_path}.")


if __name__ == "__main__":
Expand Down
7 changes: 3 additions & 4 deletions datman/exporters.py
Original file line number Diff line number Diff line change
Expand Up @@ -972,15 +972,14 @@ def get_error_file(self, dm_file):

def outputs_exist(self):
for dm_name in self.name_map:
if read_blacklist(scan=dm_name, config=self.config):
continue

if self.name_map[dm_name] == "missing":
if not os.path.exists(self.get_error_file(dm_name)):
return False
continue

bl_entry = read_blacklist(scan=dm_name, config=self.config)
if bl_entry:
continue

full_path = os.path.join(self.output_dir, dm_name + self.ext)
if not os.path.exists(full_path):
return False
Expand Down
63 changes: 63 additions & 0 deletions datman/scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def __init__(self, subject_id, config, bids_root=None):

self.bids_root = bids_root
self.bids_path = self.__get_bids()
self._bids_inventory = self._make_bids_inventory()

# This one lists all existing resource folders for the timepoint.
self.resources = self._get_resources(config)
Expand All @@ -149,14 +150,76 @@ def _get_ident(self, subid):
return ident

def find_files(self, file_stem, format="nii"):
    """Find files belonging to the session matching a given file name.

    Args:
        file_stem (:obj:`str`): A valid datman-style file name, with or
            without the extension and preceding path.
        format (:obj:`str`): The configured datman folder path to search
            through. Default: 'nii'

    Returns:
        :obj:`list`: a list of full paths to matching files, if any. Or
            an empty list if none are found, if the session has no usable
            path for the given format, or if the file name is empty.

    Raises:
        datman.scanid.ParseException: May be raised by the 'bids' lookup
            if an invalid datman file name is given.
    """
    if format == 'bids':
        return self._find_bids_files(file_stem)

    # A missing attribute and an unset (None/empty) path are both treated
    # as "nothing to search".
    base_path = getattr(self, f"{format}_path", None)
    if not base_path or not os.path.exists(base_path):
        return []

    # Drop any preceding path so the search always stays inside this
    # session's folder, as the accepted input format promises.
    stem = os.path.basename(file_stem)
    if not stem:
        # An empty name would otherwise glob every file in the folder.
        return []

    return glob.glob(os.path.join(base_path, stem + "*"))

def _find_bids_files(self, file_stem):
    """Look up the bids files inventoried for a datman file name's series.

    Args:
        file_stem (:obj:`str`): A datman-style file name, parsed with
            datman.scanid.parse_filename.

    Returns:
        :obj:`list`: The bids inventory entry for the parsed series number.
            An empty list if the name belongs to a different session or
            the series is not in the inventory.
    """
    ident, _, series_num, _ = datman.scanid.parse_filename(file_stem)
    if ident.session != self.session:
        return []
    return self._bids_inventory.get(int(series_num), [])

def _make_bids_inventory(self):
    """Map series numbers to this session's files in the bids folder.

    Walks the bids folder, reads every JSON sidecar and groups all files
    that share a sidecar's base name under the sidecar's 'SeriesNumber'.
    Sidecars whose 'Repeat' value ('01' when the field is absent) differs
    from this session are ignored. Anything inside a 'blacklisted' folder
    is ignored as well.

    Returns:
        :obj:`dict`: Series numbers (int) mapped to a list of full paths
            to the files found for that series. Empty if the session has
            no bids path.
    """
    if not self.bids_path:
        return {}

    inventory = {}
    for path, dirs, files in os.walk(self.bids_path):
        # Prune in place so os.walk never descends into a 'blacklisted'
        # folder (or anything nested inside one). Matching the exact
        # folder name avoids skipping unrelated folders that merely end
        # with the same characters.
        dirs[:] = [entry for entry in dirs if entry != "blacklisted"]

        for item in files:
            if not item.endswith(".json"):
                continue

            json_path = os.path.join(path, item)
            contents = datman.utils.read_json(json_path)
            if not isinstance(contents, dict):
                # Not a sidecar-style JSON object; nothing to read from it.
                continue

            repeat = contents['Repeat'] if 'Repeat' in contents else '01'
            if repeat != self.session:
                continue

            try:
                series = int(contents['SeriesNumber'])
            except (KeyError, TypeError, ValueError):
                # Ignore sidecars with a missing or unusable series
                # number field.
                continue
            base_fname = os.path.splitext(json_path)[0]

            inventory.setdefault(series, []).extend(
                glob.glob(base_fname + "*")
            )

    return inventory

def get_tagged_nii(self, tag):
try:
matched_niftis = self.__nii_dict[tag]
Expand Down
2 changes: 1 addition & 1 deletion docs/datman_conf.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ Optional
* Accepted values: A list of path names, where each path name has already
been defined in `Paths`_.
* Default value: If omitted ``dm_blacklist_rm`` will delete blacklisted
scans from ``nii``, ``mnc``, ``nrrd``, and ``resources``, if these
scans from ``nii``, ``bids``, and ``resources``, if these
directories exist.

Example
Expand Down

0 comments on commit e727a65

Please sign in to comment.