diff --git a/HISTORY.rst b/HISTORY.rst index 3e87ea4..113faf4 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,10 @@ History ======= +2023.7.27 -- Support for .gz and .bz2 files, and multi-structure .xyz files + * Handle .gz and .bz2 files for .sdf and .xyz extensions. + * Handle multi-structure XYZ files with a blank line between records. + 2023.7.6 -- Bugfixes * Fixed output of number of structures written to SDF files, which was off by 1. * Cleaned up the output for the write-structure step diff --git a/read_structure_step/formats/sdf/sdf.py b/read_structure_step/formats/sdf/sdf.py index d35bfc1..51c0c0b 100644 --- a/read_structure_step/formats/sdf/sdf.py +++ b/read_structure_step/formats/sdf/sdf.py @@ -2,6 +2,7 @@ Implementation of the reader for SDF files using OpenBabel """ +import bz2 import gzip from pathlib import Path import shutil @@ -133,10 +134,15 @@ def load_sdf( path.expanduser().resolve() # Get the information for progress output, if requested. - compress = path.suffix == ".gz" if printer is not None: n_structures = 0 - with gzip.open(path, mode="rt") if compress else open(path, "r") as fd: + with ( + gzip.open(path, mode="rt") + if path.suffix == ".gz" + else bz2.open(path, mode="rt") + if path.suffix == ".bz2" + else open(path, "r") + ) as fd: for line in fd: if line[0:4] == "$$$$": n_structures += 1 @@ -154,7 +160,13 @@ def load_sdf( n_errors = 0 obMol = openbabel.OBMol() text = "" - with gzip.open(path, mode="rt") if compress else open(path, "r") as fd: + with ( + gzip.open(path, mode="rt") + if path.suffix == ".gz" + else bz2.open(path, mode="rt") + if path.suffix == ".bz2" + else open(path, "r") + ) as fd: for line in fd: text += line diff --git a/read_structure_step/formats/xyz/xyz.py b/read_structure_step/formats/xyz/xyz.py index 8b47e55..3fe38c8 100644 --- a/read_structure_step/formats/xyz/xyz.py +++ b/read_structure_step/formats/xyz/xyz.py @@ -2,6 +2,8 @@ Implementation of the reader for XYZ files using OpenBabel """ +import bz2 +import gzip import logging import os from pathlib import Path @@ -119,7 +121,6 @@ def load_xyz( printer=None, references=None, bibliography=None, - save_data=True, **kwargs, ): """Read an XYZ input file. @@ -132,6 +133,49 @@ def load_xyz( configuration : molsystem.Configuration The configuration to put the imported structure into. + extension : str, optional, default: None + The extension, including initial dot, defining the format. + + add_hydrogens : bool = True + Whether to add any missing hydrogen atoms. + + system_db : System_DB = None + The system database, used if multiple structures in the file. + + system : System = None + The system to use if adding subsequent structures as configurations. + + indices : str = "1:end" + The generalized indices (slices, SMARTS, etc.) to select structures + from a file containing multiple structures. + + subsequent_as_configurations : bool = False + Normally and subsequent structures are loaded into new systems; however, + if this option is True, they will be added as configurations. + + system_name : str = "from file" + The name for systems. Can be directives like "SMILES" or + "Canonical SMILES". If None, no name is given. + + configuration_name : str = "sequential" + The name for configurations. Can be directives like "SMILES" or + "Canonical SMILES". If None, no name is given. + + printer : Logger or Printer + A function that prints to the appropriate place, used for progress. + + references : ReferenceHandler = None + The reference handler object or None + + bibliography : dict + The bibliography as a dictionary. + + Returns + ------- + [Configuration] + The list of configurations created. + + We'll use OpenBabel to read the file; however, OpenBabel is somewhat limited, so we'll first preprocess the file to extract extra data and also to fit it to the format that OpenBabel can handle. @@ -143,6 +187,12 @@ def load_xyz( #. symbol, x, y, z #. ... + Some XYZ files, for instance those from tmQM encode data in the comment line:: + + CSD_code = WIXKOE | q = 0 | S = 0 | Stoichiometry = C47H65LaN2O | MND = 7 + + We'll try to handle this type of comment. + The Minnesota Solvation database uses a slightly modified form: #. A comment, often the structure name and provenance @@ -164,151 +214,267 @@ def load_xyz( path = Path(file_name) else: path = file_name - path.expanduser().resolve() - lines = path.read_text().splitlines() - - # Examine the first lines and see what the format might be - file_type = "unknown" - n_lines = len(lines) - line1 = lines[0].strip() - fields1 = line1.split() - n_fields1 = len(fields1) - - if n_lines <= 1: - line2 = None - fields2 = [] - n_fields2 = 0 - else: - line2 = lines[1].strip() - fields2 = line2.split() - n_fields2 = len(fields2) - - if n_lines <= 2: - line3 = None - fields3 = [] - n_fields3 = 0 - else: - line3 = lines[2].strip() - fields3 = line3.split() - n_fields3 = len(fields3) - - # Check for "standard" file - if n_fields1 == 1: - try: - n_atoms = int(fields1[0]) - except Exception: - pass - else: - # Might be traditional file. Check 3rd line for atom - if n_fields3 == 4: - file_type = "standard" - elif n_fields1 == 0: - # Might be standard file without atom count. - if n_fields3 == 4: - file_type = "standard" - n_atoms = 0 - for line in lines[2:]: - n_fields = len(line.split()) - if n_fields == 0: - break + path = path.expanduser().resolve() + + print(f"In xyz.py, path = {path}") + + # Get the information for progress output, if requested. + n_structures = 0 + last_line = 0 + with ( + gzip.open(path, mode="rt") + if path.suffix == ".gz" + else bz2.open(path, mode="rt") + if path.suffix == ".bz2" + else open(path, "r") + ) as fd: + for line in fd: + last_line += 1 + if line.strip() == "": + n_structures += 1 + # may not have blank line at end + if line.strip() != "": + n_structures += 1 + if printer is not None: + printer("") + printer(f" The XYZ file contains {n_structures} structures.") + last_percent = 0 + t0 = time.time() + last_t = t0 + + obConversion = openbabel.OBConversion() + obConversion.SetInFormat("xyz") + + configurations = [] + structure_no = 0 + n_errors = 0 + obMol = openbabel.OBMol() + + total_lines = 0 + with ( + gzip.open(path, mode="rt") + if path.suffix == ".gz" + else bz2.open(path, mode="rt") + if path.suffix == ".bz2" + else open(path, "r") + ) as fd: + print(f"{fd=}") + lines = [] + line_no = 0 + for line in fd: + # lines have \n at end. It is not stripped as is more normal. + total_lines += 1 + line_no += 1 + lines.append(line) + if total_lines == last_line or line_no > 3 and line.strip() == "": + # End of block, so examine the first lines and see which format + file_type = "unknown" + n_lines = len(lines) + print(f"Found block of {n_lines} lines") + line1 = lines[0].strip() + fields1 = line1.split() + n_fields1 = len(fields1) + + if n_lines <= 1: + line2 = None + fields2 = [] + n_fields2 = 0 else: - n_atoms += 1 - # Put the count in line 1 - lines[0] = str(n_atoms) - - # And Minnesota variant with three headers. - if n_lines > 3 and n_fields3 == 2: - try: - charge = int(fields3[0]) - multiplicity = int(fields3[1]) - except Exception: - pass - else: - file_type = "Minnesota" - if n_fields2 != 0: - logger.warning( - f"Minnesota style XYZ file, 2nd line is not blank:\n\t{lines[1]}" - ) - # Count atoms - n_atoms = 0 - for line in lines[3:]: - n_fields = len(line.split()) - if n_fields == 0: - break + line2 = lines[1].strip() + fields2 = line2.split() + n_fields2 = len(fields2) + + if n_lines <= 2: + line3 = None + fields3 = [] + n_fields3 = 0 else: - n_atoms += 1 - # Move comment to 2nd line - lines[1] = lines[0] - # Put the count in line 1 - lines[0] = str(n_atoms) - # Remove 3rd line with charge and multiplicity - del lines[2] - - # Reassemble an input file. - input_data = "\n".join(lines) - logger.info(f"Input data:\n\n{input_data}\n") - - title = lines[1].strip() - - # Now try to convert using OpenBabel - out = OutputGrabber(sys.stderr) - with out: - obConversion = openbabel.OBConversion() - obConversion.SetInFormat("xyz") - obMol = openbabel.OBMol() - - success = obConversion.ReadString(obMol, input_data) - if not success: - raise RuntimeError("obConversion failed") - - if add_hydrogens: - obMol.AddHydrogens() - - configuration.from_OBMol(obMol) - - # Check any stderr information from obabel. - if out.capturedtext != "": - tmp = out.capturedtext - if "Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders" not in tmp: - logger.warning(tmp) - - if file_type == "Minnesota": - # Record the charge, and the spin state - configuration.charge = charge - configuration.spin_multiplicity = multiplicity - - logger.info(f"{charge=} {multiplicity=}") - - # Set the system name - if system_name is not None and system_name != "": - lower_name = system_name.lower() - if "from file" in lower_name: - system.name = str(path) - elif lower_name == "title": - if len(title) > 0: - system.name = title - else: - system.name = str(path) - elif "canonical smiles" in lower_name: - system.name = configuration.canonical_smiles - elif "smiles" in lower_name: - system.name = configuration.smiles - else: - system.name = system_name - - # And the configuration name - if configuration_name is not None and configuration_name != "": - lower_name = configuration_name.lower() - if "from file" in lower_name: - configuration.name = obMol.GetTitle() - elif "canonical smiles" in lower_name: - configuration.name = configuration.canonical_smiles - elif "smiles" in lower_name: - configuration.name = configuration.smiles - elif lower_name == "sequential": - configuration.name = "1" - else: - configuration.name = configuration_name + line3 = lines[2].strip() + fields3 = line3.split() + n_fields3 = len(fields3) + + # Check for "standard" file + if n_fields1 == 1: + try: + n_atoms = int(fields1[0]) + except Exception: + pass + else: + # Might be traditional file. Check 3rd line for atom + if n_fields3 == 4: + file_type = "standard" + elif n_fields1 == 0: + # Might be standard file without atom count. + if n_fields3 == 4: + file_type = "standard" + n_atoms = 0 + for line in lines[2:]: + n_fields = len(line.split()) + if n_fields == 0: + break + else: + n_atoms += 1 + # Put the count in line 1 + lines[0] = str(n_atoms) + + # And Minnesota variant with three headers. + if n_lines > 3 and n_fields3 == 2: + try: + charge = int(fields3[0]) + multiplicity = int(fields3[1]) + except Exception: + pass + else: + file_type = "Minnesota" + if n_fields2 != 0: + logger.warning( + "Minnesota style XYZ file, 2nd line is not blank:" + f"\n\t{lines[1]}" + ) + # Count atoms + n_atoms = 0 + for line in lines[3:]: + n_fields = len(line.split()) + if n_fields == 0: + break + else: + n_atoms += 1 + # Move comment to 2nd line + lines[1] = lines[0] + # Put the count in line 1 + lines[0] = str(n_atoms) + "\n" + # Remove 3rd line with charge and multiplicity + del lines[2] + + # Reassemble an input file. + input_data = "".join(lines) + + print("Input data") + print(input_data) + print("---------") + + logger.info(f"Input data:\n\n{input_data}\n") + + title = lines[1].strip() + + lines = [] + line_no = 0 + + # Now try to convert using OpenBabel + out = OutputGrabber(sys.stderr) + with out: + success = obConversion.ReadString(obMol, input_data) + if not success: + raise RuntimeError("obConversion failed") + + if add_hydrogens: + obMol.AddHydrogens() + + structure_no += 1 + if structure_no > 1: + if subsequent_as_configurations: + configuration = system.create_configuration() + else: + system = system_db.create_system() + configuration = system.create_configuration() + + configuration.from_OBMol(obMol) + configurations.append(configuration) + + # Check any stderr information from obabel. + if out.capturedtext != "": + tmp = out.capturedtext + if ( + "Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders" + not in tmp + ): + logger.warning(f"{structure_no}: {tmp}") + + if file_type == "Minnesota": + # Record the charge, and the spin state + configuration.charge = charge + configuration.spin_multiplicity = multiplicity + + logger.info(f"{charge=} {multiplicity=}") + elif "|" in title: + for tmp in title.split("|"): + if "=" in tmp: + key, val = tmp.split(maxsplit=1) + key = key.strip() + val = val.strip() + if key == "q": + try: + configuration.charge = float(val) + except Exception: + pass + elif key == "S": + try: + configuration.spin_multiplicity = int(val) + except Exception: + pass + elif key == "CSD_code": + title = val + + # Set the system name + if system_name is not None and system_name != "": + lower_name = system_name.lower() + if "from file" in lower_name: + system.name = str(path) + elif lower_name == "title": + if len(title) > 0: + system.name = title + else: + system.name = str(path) + elif "canonical smiles" in lower_name: + system.name = configuration.canonical_smiles + elif "smiles" in lower_name: + system.name = configuration.smiles + else: + system.name = system_name + + # And the configuration name + if configuration_name is not None and configuration_name != "": + lower_name = configuration_name.lower() + if "from file" in lower_name: + configuration.name = obMol.GetTitle() + elif lower_name == "title": + if len(title) > 0: + configuration.name = title + else: + configuration.name = str(path) + elif "canonical smiles" in lower_name: + configuration.name = configuration.canonical_smiles + elif "smiles" in lower_name: + configuration.name = configuration.smiles + elif lower_name == "sequential": + configuration.name = str(structure_no) + else: + configuration.name = configuration_name + + if printer: + percent = int(100 * structure_no / n_structures) + if percent > last_percent: + t1 = time.time() + if t1 - last_t >= 60: + t = int(t1 - t0) + rate = structure_no / (t1 - t0) + t_left = int((n_structures - structure_no) / rate) + printer( + f"\t{structure_no:6} ({percent}%) structures read in " + f"{t} seconds. About {t_left} seconds remaining." + ) + last_t = t1 + last_percent = percent + + if printer: + t1 = time.time() + rate = structure_no / (t1 - t0) + printer( + f" Read {structure_no - n_errors - 1} structures in {t1 - t0:.1f} " + f"seconds = {rate:.2f} per second" + ) + if n_errors > 0: + printer(f" {n_errors} structures could not be read due to errors.") if references: # Add the citations for Open Babel @@ -368,4 +534,4 @@ def load_xyz( except Exception: pass - return [configuration] + return configurations diff --git a/read_structure_step/utils.py b/read_structure_step/utils.py index ccc5f1d..87cc655 100644 --- a/read_structure_step/utils.py +++ b/read_structure_step/utils.py @@ -1,4 +1,4 @@ -import os +from pathlib import Path from . import formats import re @@ -8,6 +8,8 @@ def guess_extension(file_name, use_file_name=False): Returns the file format. It can either use the file name extension or guess based on signatures found in the file. + Correctly handles .gz and .bz2 files. + Parameters ---------- file_name: str @@ -24,8 +26,13 @@ def guess_extension(file_name, use_file_name=False): """ if use_file_name is True: - (root, ext) = os.path.splitext(file_name) - + path = Path(file_name) + suffixes = path.suffixes + ext = "" + if len(suffixes) > 0: + ext = suffixes[-1] + if ext in (".gz", ".bz2") and len(suffixes) > 1: + ext = suffixes[-2] if ext == "": return None diff --git a/tests/data/3TR_model.sdf.bz2 b/tests/data/3TR_model.sdf.bz2 new file mode 100644 index 0000000..6fc989e Binary files /dev/null and b/tests/data/3TR_model.sdf.bz2 differ diff --git a/tests/data/3TR_model.sdf.gz b/tests/data/3TR_model.sdf.gz new file mode 100644 index 0000000..a9d33a1 Binary files /dev/null and b/tests/data/3TR_model.sdf.gz differ diff --git a/tests/data/tmQM_test.xyz.bz2 b/tests/data/tmQM_test.xyz.bz2 new file mode 100644 index 0000000..b3e10fe Binary files /dev/null and b/tests/data/tmQM_test.xyz.bz2 differ diff --git a/tests/data/tmQM_test.xyz.gz b/tests/data/tmQM_test.xyz.gz new file mode 100644 index 0000000..e8e18b9 Binary files /dev/null and b/tests/data/tmQM_test.xyz.gz differ diff --git a/tests/test_formats.py b/tests/test_formats.py index a7a25b8..d938dc5 100644 --- a/tests/test_formats.py +++ b/tests/test_formats.py @@ -42,7 +42,7 @@ 5 2 3 1""" -@pytest.fixture(scope="module") +@pytest.fixture() def configuration(): """Create a system db, system and configuration.""" db = SystemDB(filename="file:seamm_db?mode=memory&cache=shared") @@ -66,6 +66,8 @@ def configuration(): "3TR_model_MN.xyz", "3TR_model.pdb", "3TR_model.sdf", + "3TR_model.sdf.gz", + "3TR_model.sdf.bz2", ], ) def test_format(configuration, structure): @@ -370,3 +372,25 @@ def test_mopac_8(configuration): assert configuration.atoms.symbols == check_symbols assert smiles == check_smiles + + +@pytest.mark.parametrize( + "structure", + [ + "tmQM_test.xyz.gz", + "tmQM_test.xyz.bz2", + ], +) +def test_compressed(configuration, structure): + file_name = build_filenames.build_data_filename(structure) + system = configuration.system + system_db = system.system_db + read_structure_step.read( + file_name, + configuration, + system_db=system_db, + system=system, + subsequent_as_configurations=False, + ) + + assert system_db.n_systems == 7