From 335c4eb39ad8556070e769fa9491ec5de22ee455 Mon Sep 17 00:00:00 2001 From: Paul Saxe Date: Thu, 3 Mar 2022 16:37:39 -0500 Subject: [PATCH 1/2] Adding ability to read files from within tar files. --- read_structure_step/read_structure.py | 232 ++++++++++++++++++-------- 1 file changed, 167 insertions(+), 65 deletions(-) diff --git a/read_structure_step/read_structure.py b/read_structure_step/read_structure.py index 84ce4e0..ed3f222 100644 --- a/read_structure_step/read_structure.py +++ b/read_structure_step/read_structure.py @@ -13,13 +13,14 @@ import logging from pathlib import PurePath, Path +import tarfile +import tempfile import textwrap from .formats.registries import get_format_metadata import read_structure_step from .read import read import seamm -from seamm import data # noqa: F401 from seamm_util import ureg, Q_ # noqa: F401 import seamm_util.printing as printing from seamm_util.printing import FormattedText as __ @@ -123,78 +124,179 @@ def run(self): context=seamm.flowchart_variables._data ) - # What type of file? 
+ # Check for tar files, potentially compressed if isinstance(P["file"], Path): - filename = str(P["file"]) + path = P["file"] else: - filename = ["file"].strip() - file_type = P["file type"] + path = Path(["file"].strip()) - if file_type != "from extension": - extension = file_type.split()[0] + extensions = path.suffixes + if ".tar" in extensions or ".tgz" in extensions: + self.read_tarfile(path, P) else: - path = PurePath(filename) - extension = path.suffix - if extension == ".gz": - extension = path.stem.suffix - - if extension == "": - extension = guess_extension(filename, use_file_name=False) - P["file type"] = extension - - # Print what we are doing - printer.important(self.description_text(P)) - - # Read the file into the system - system_db = self.get_variable("_system_db") - system, configuration = self.get_system_configuration( - P, structure_handling=True - ) - - read( - filename, - configuration, - extension=extension, - add_hydrogens=P["add hydrogens"], - system_db=system_db, - system=system, - indices=P["indices"], - subsequent_as_configurations=( - P["subsequent structure handling"] == "Create a new configuration" - ), - system_name=P["system name"], - configuration_name=P["configuration name"], - printer=printer.important, - references=self.references, - bibliography=self._bibliography, - ) + # What type of file? + if isinstance(P["file"], Path): + filename = str(P["file"]) + else: + filename = ["file"].strip() + file_type = P["file type"] - # Finish the output - if configuration.periodicity == 3: - space_group = configuration.symmetry.group - if space_group == "": - symmetry_info = "" + if file_type != "from extension": + extension = file_type.split()[0] else: - symmetry_info = f" The space group is {space_group}." 
- printer.important( - __( - f"\n Created a periodic structure with {configuration.n_atoms} " - f"atoms.{symmetry_info}" - f"\n System name = {system.name}" - f"\n Configuration name = {configuration.name}", - indent=4 * " ", - ) + path = PurePath(filename) + extension = path.suffix + if extension == ".gz": + extension = path.stem.suffix + + if extension == "": + extension = guess_extension(filename, use_file_name=False) + P["file type"] = extension + + # Print what we are doing + printer.important(self.description_text(P)) + + # Read the file into the system + system_db = self.get_variable("_system_db") + system, configuration = self.get_system_configuration( + P, structure_handling=True ) - else: - printer.important( - __( - f"\n Created a molecular structure with {configuration.n_atoms} " - "atoms." - f"\n System name = {system.name}" - f"\n Configuration name = {configuration.name}", - indent=4 * " ", - ) + + read( + filename, + configuration, + extension=extension, + add_hydrogens=P["add hydrogens"], + system_db=system_db, + system=system, + indices=P["indices"], + subsequent_as_configurations=( + P["subsequent structure handling"] == "Create a new configuration" + ), + system_name=P["system name"], + configuration_name=P["configuration name"], + printer=printer.important, + references=self.references, + bibliography=self._bibliography, ) + + # Finish the output + if configuration.periodicity == 3: + space_group = configuration.symmetry.group + if space_group == "": + symmetry_info = "" + else: + symmetry_info = f" The space group is {space_group}." + printer.important( + __( + "\n Created a periodic structure with " + f"{configuration.n_atoms} atoms. {symmetry_info}" + f"\n System name = {system.name}" + f"\n Configuration name = {configuration.name}", + indent=4 * " ", + ) + ) + else: + printer.important( + __( + "\n Created a molecular structure with " + "{configuration.n_atoms} atoms." 
+ f"\n System name = {system.name}" + f"\n Configuration name = {configuration.name}", + indent=4 * " ", + ) + ) + printer.important("") return next_node + + def read_tarfile(self, tarfile_path, P): + """Read structures from a tarfile. + + Parameters + ---------- + path : pathlib.Path + The path to the tarfile. + P : {str: str} + Dictionary of control parameters for this step. + """ + file_type = P["file type"] + if file_type != "from extension": + extensions = [file_type.split()[0]] + + as_configurations = ( + P["subsequent structure handling"] == "Create a new configuration" + ) + + n = 0 + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_dir_path = Path(tmp_dir) + with tarfile.open(tarfile_path.expanduser(), "r") as tar: + for member in tar: + if not member.isfile(): + continue + + if member.name[0] == ".": + continue + + path = PurePath(member.name) + if path.name[0] == ".": + continue + extension = path.suffix + + # If explicit extension does not match, skip. + if file_type != "from extension" and extension not in extensions: + continue + + # For the time being write the contents to a file. Eventually should + # rewrite all the routines to handle text as well as files. 
+ fd = tar.extractfile(member) + if fd is None: + fd.close() + continue + + data = fd.read() + fd.close() + + tmp_path = tmp_dir_path / path.name + tmp_path.write_bytes(data) + + filename = str(tmp_path) + + if extension == "": + extension = guess_extension(filename) + + # Read the file into the system + system_db = self.get_variable("_system_db") + system, configuration = self.get_system_configuration( + P, structure_handling=True + ) + + read( + filename, + configuration, + extension=extension, + add_hydrogens=P["add hydrogens"], + system_db=system_db, + system=system, + indices=P["indices"], + subsequent_as_configurations=as_configurations, + system_name=P["system name"], + configuration_name=P["configuration name"], + printer=printer.important, + references=self.references, + bibliography=self._bibliography, + ) + + tmp_path.unlink() + n += 1 + if n % 1000 == 0: + print(n) + + printer.important( + __( + f"\n Created {n} structures from the tarfile {tarfile}", + indent=4 * " ", + ) + ) From 2233c4f416f4dd45bf570f05bbd37e84cec2a948 Mon Sep 17 00:00:00 2001 From: Paul Saxe Date: Fri, 6 May 2022 20:07:08 -0400 Subject: [PATCH 2/2] bug fixes and additions to correctly handle PDB (mm)cif files. 
--- read_structure_step/formats/cif/cif.py | 16 +++---- read_structure_step/formats/cif/mmcif.py | 58 +++++++++++++++--------- read_structure_step/read_structure.py | 5 +- 3 files changed, 48 insertions(+), 31 deletions(-) diff --git a/read_structure_step/formats/cif/cif.py b/read_structure_step/formats/cif/cif.py index a8e25a6..141c735 100644 --- a/read_structure_step/formats/cif/cif.py +++ b/read_structure_step/formats/cif/cif.py @@ -149,7 +149,7 @@ def load_cif( # Set the system name if system_name is not None and system_name != "": - lower_name = system_name.lower() + lower_name = str(system_name).lower() if "from file" in lower_name: system.name = block_name elif "file name" in lower_name: @@ -159,11 +159,11 @@ def load_cif( elif "empirical formula" in lower_name: system.name = configuration.formula()[1] else: - system.name = system_name + system.name = str(system_name) # And the configuration name if configuration_name is not None and configuration_name != "": - lower_name = configuration_name.lower() + lower_name = str(configuration_name).lower() if "from file" in lower_name: configuration.name = block_name elif "file name" in lower_name: @@ -173,7 +173,7 @@ def load_cif( elif "empirical formula" in lower_name: configuration.name = configuration.formula()[1] else: - configuration.name = configuration_name + configuration.name = str(configuration_name) logger.debug(f" added system {system_db.n_systems}: {block_name}") block_name = line[5:].strip() lines = [] @@ -194,7 +194,7 @@ def load_cif( # Set the system name if system_name is not None and system_name != "": - lower_name = system_name.lower() + lower_name = str(system_name).lower() if "from file" in lower_name: system.name = block_name elif "file name" in lower_name: @@ -204,11 +204,11 @@ def load_cif( elif "empirical formula" in lower_name: system.name = configuration.formula()[1] else: - system.name = system_name + system.name = str(system_name) # And the configuration name if configuration_name is not 
None and configuration_name != "": - lower_name = configuration_name.lower() + lower_name = str(configuration_name).lower() if "from file" in lower_name: configuration.name = block_name elif "file name" in lower_name: @@ -218,4 +218,4 @@ def load_cif( elif "empirical formula" in lower_name: configuration.name = configuration.formula()[1] else: - configuration.name = configuration_name + configuration.name = str(configuration_name) diff --git a/read_structure_step/formats/cif/mmcif.py b/read_structure_step/formats/cif/mmcif.py index 01d65ef..b521812 100644 --- a/read_structure_step/formats/cif/mmcif.py +++ b/read_structure_step/formats/cif/mmcif.py @@ -138,19 +138,26 @@ def load_mmcif( block_name = line[5:].strip() else: structure_no += 1 - if structure_no > 1: - if subsequent_as_configurations: - configuration = system.create_configuration() - else: - system = system_db.create_system() - configuration = system.create_configuration() + # Check for NMR ensemble + text = "\n".join(lines) + if "_pdbx_nmr_ensemble.conformers_submitted_total_number" in text: + system = system_db.create_system() + system.from_mmcif_text(text) + else: + if structure_no > 1: + if subsequent_as_configurations: + configuration = system.create_configuration() + else: + system = system_db.create_system() + configuration = system.create_configuration() + + configuration.from_mmcif_text(text) - configuration.from_mmcif_text("\n".join(lines)) logger.debug(f" added system {system_db.n_systems}: {block_name}") # Set the system name if system_name is not None and system_name != "": - lower_name = system_name.lower() + lower_name = str(system_name).lower() if "from file" in lower_name: system.name = block_name elif "file name" in lower_name: @@ -160,11 +167,11 @@ def load_mmcif( elif "empirical formula" in lower_name: system.name = configuration.formula()[1] else: - system.name = system_name + system.name = str(system_name) # And the configuration name if configuration_name is not None and 
configuration_name != "": - lower_name = configuration_name.lower() + lower_name = str(configuration_name).lower() if "from file" in lower_name: configuration.name = block_name elif "file name" in lower_name: @@ -174,7 +181,7 @@ def load_mmcif( elif "empirical formula" in lower_name: configuration.name = configuration.formula()[1] else: - configuration.name = configuration_name + configuration.name = str(configuration_name) logger.debug(f" added system {system_db.n_systems}: {block_name}") block_name = line[5:].strip() lines = [] @@ -183,19 +190,26 @@ def load_mmcif( if len(lines) > 0: # The last block just ends at the end of the file structure_no += 1 - if structure_no > 1: - if subsequent_as_configurations: - configuration = system.create_configuration() - else: - system = system_db.create_system() - configuration = system.create_configuration() + # Check for NMR ensemble + text = "\n".join(lines) + if "_pdbx_nmr_ensemble.conformers_submitted_total_number" in text: + system = system_db.create_system() + system.from_mmcif_text(text) + else: + if structure_no > 1: + if subsequent_as_configurations: + configuration = system.create_configuration() + else: + system = system_db.create_system() + configuration = system.create_configuration() + + configuration.from_mmcif_text(text) - configuration.from_mmcif_text("\n".join(lines)) logger.debug(f" added system {system_db.n_systems}: {block_name}") # Set the system name if system_name is not None and system_name != "": - lower_name = system_name.lower() + lower_name = str(system_name).lower() if "from file" in lower_name: system.name = block_name elif "file name" in lower_name: @@ -205,11 +219,11 @@ def load_mmcif( elif "empirical formula" in lower_name: system.name = configuration.formula()[1] else: - system.name = system_name + system.name = str(system_name) # And the configuration name if configuration_name is not None and configuration_name != "": - lower_name = configuration_name.lower() + lower_name = 
str(configuration_name).lower() if "from file" in lower_name: configuration.name = block_name elif "file name" in lower_name: @@ -219,4 +233,4 @@ def load_mmcif( elif "empirical formula" in lower_name: configuration.name = configuration.formula()[1] else: - configuration.name = configuration_name + configuration.name = str(configuration_name) diff --git a/read_structure_step/read_structure.py b/read_structure_step/read_structure.py index ed3f222..964bfac 100644 --- a/read_structure_step/read_structure.py +++ b/read_structure_step/read_structure.py @@ -181,6 +181,9 @@ def run(self): ) # Finish the output + system, configuration = self.get_system_configuration( + P, structure_handling=False + ) if configuration.periodicity == 3: space_group = configuration.symmetry.group if space_group == "": @@ -200,7 +203,7 @@ def run(self): printer.important( __( "\n Created a molecular structure with " - "{configuration.n_atoms} atoms." + f"{configuration.n_atoms} atoms." f"\n System name = {system.name}" f"\n Configuration name = {configuration.name}", indent=4 * " ",