Source code for luna.wrappers.rdkit

from rdkit.Chem import MolFromMol2File, MolFromPDBFile, MolFromMolFile, MolFromMolBlock, MolFromMol2Block, SanitizeFlags, SanitizeMol
from xopen import xopen

from luna.util.file import get_file_format
from luna.util.exceptions import IllegalArgumentError

import re
import logging

logger = logging.getLogger()

RDKIT_FORMATS = ("mol2", "mol", "mdl", "sdf", "sd", "pdb")

REGEX_MOL_FILE = re.compile(r' (V2000|V3000)$')


[docs]def read_mol_from_file(mol_file, mol_format, sanitize=True, removeHs=True): """Read a molecule from ``mol_file`` using RDKit. Parameters ---------- mol_file : str The pathname of the molecular file to be read. mol_format : str The molecular format of ``mol_file``. sanitize : bool If True (the default), sanitize the molecule. removeHs : bool If True (the default), remove explict hydrogens from the molecule. Returns ------- : :class:`rdkit.Chem.rdchem.Mol` or None The parsed molecule or None in case the sanitization process fails. Raises ------ IllegalArgumentError If the provided molecular format is not accepted by RDKit. Examples -------- >>> from luna.util.default_values import LUNA_PATH >>> from luna.wrappers.rdkit import read_mol_from_file >>> rdk_mol = read_mol_from_file(mol_file=f"{LUNA_PATH}/tutorial/inputs/ZINC000007786517.mol", ... mol_format="mol") >>> print(rdk_mol.GetProp("_Name")) ZINC000007786517 """ if mol_format == "mol2": # First it creates the molecule without applying the sanitization function. rdk_mol = MolFromMol2File(mol_file, sanitize=False, removeHs=removeHs) elif mol_format == "pdb": # First it creates the molecule without applying the sanitization function. rdk_mol = MolFromPDBFile(mol_file, sanitize=False, removeHs=removeHs) elif mol_format in RDKIT_FORMATS: # First it creates the molecule without applying the sanitization function. rdk_mol = MolFromMolFile(mol_file, sanitize=False, removeHs=removeHs) else: raise IllegalArgumentError("Invalid '%s' format. The accepted formats are: %s." % (mol_format, ",".join(RDKIT_FORMATS))) # Now it sanitizes the molecule if necessary. # We do it here in order to catch an Exception while performing the sanitization. if sanitize: try: SanitizeMol(rdk_mol, SanitizeFlags.SANITIZE_ALL) except Exception as e: logger.exception(e) return None return rdk_mol
[docs]def new_mol_from_block(block, mol_format, sanitize=True, removeHs=True): """Read a molecule from a string block using RDKit. Parameters ---------- block : str The molecular string block. mol_format : str The molecular format of ``block``. sanitize : bool If True (the default), sanitize the molecule. removeHs : bool If True (the default), remove explict hydrogens from the molecule. Returns ------- : :class:`rdkit.Chem.rdchem.Mol` or None The parsed molecule or None in case the sanitization process fails. Raises ------ IllegalArgumentError If the provided molecular format is not accepted by RDKit. Examples -------- >>> from luna.util.default_values import LUNA_PATH >>> from luna.wrappers.rdkit import new_mol_from_block >>> with open(f"{LUNA_PATH}/tutorial/inputs/GLY.sdf", "r") as IN: ... mol_block = IN.read() >>> rdk_mol = new_mol_from_block(block=mol_block, mol_format="sdf") >>> print(rdk_mol.GetProp("_Name")) GLY """ if mol_format == "mol2": # First it creates the molecule without applying the sanitization function. rdk_mol = MolFromMol2Block(block, sanitize=False, removeHs=removeHs) elif mol_format in RDKIT_FORMATS: # First it creates the molecule without applying the sanitization function. rdk_mol = MolFromMolBlock(block, sanitize=False, removeHs=removeHs) else: raise IllegalArgumentError("Invalid '%s' format. The accepted formats are: %s." % (mol_format, ",".join(RDKIT_FORMATS))) # Now it sanitizes the molecule if necessary. # We do it here in order to catch an Exception while performing the sanitization. if sanitize: try: SanitizeMol(rdk_mol, SanitizeFlags.SANITIZE_ALL) except Exception as e: logger.exception(e) return None return rdk_mol
[docs]def read_multimol_file(mol_file, targets=None, mol_format=None, sanitize=True, removeHs=True): """Read molecules from a multimolecular file using RDKit. Parameters ---------- mol_file : str The pathname of the molecular file to be read. targets : iterable of str Only parses molecules, given by their ids, defined in ``targets``. mol_format : str, optional The molecular format of ``mol_file``. If not provided, the format will be defined by the file extension. sanitize : bool If True (the default), sanitize the molecule. removeHs : bool If True (the default), remove explict hydrogens from the molecule. Yields ------- : tuple of (:class:`rdkit.Chem.rdchem.Mol`, int) A tuple containing the parsed molecule and its id. Raises ------ IllegalArgumentError If the provided or identified molecular format is not accepted by RDKit. Examples -------- In this first example, we will read all molecules from a multimolecular file. >>> from luna.util.default_values import LUNA_PATH >>> from luna.wrappers.rdkit import read_multimol_file >>> for mol_tuple in read_multimol_file(mol_file=f"{LUNA_PATH}/tutorial/inputs/ligands.mol2"): ... print(mol_tuple[0].GetProp("_Name")) ZINC000343043015 ZINC000065293174 ZINC000575033470 ZINC000096459890 ZINC000012442563 Now, we will only read two molecules (ZINC000065293174, ZINC000096459890) from the same multimolecular file. >>> from luna.util.default_values import LUNA_PATH >>> from luna.wrappers.rdkit import read_multimol_file >>> for mol_tuple in read_multimol_file(mol_file=f"{LUNA_PATH}/tutorial/inputs/ligands.mol2", ... targets=["ZINC000065293174", "ZINC000096459890"]): ... print(mol_tuple[0].GetProp("_Name")) ZINC000065293174 ZINC000096459890 """ def apply_mol_format(lines): # Check if the MOL file contains a valid header comprising a Title line, Program/file timestamp line, and a Comment line. # Thus, the Counts line must be in line 4 (index 3). If it's not, add blank lines to match the format. # This amendment is necessary otherwise RDKit will crash. if not REGEX_MOL_FILE.search(lines[3]): logger.warning("While parsing the file '%s', we found a molecule starting at line #%d that does not contain a valid header. " "We will add empty lines to its header to match the MOL file format." % (mol_file, mol_starts_at)) counts_line_pos = None for i, line in enumerate(lines): if REGEX_MOL_FILE.search(line): counts_line_pos = i break missing_lines = ["\n"] * (3 - counts_line_pos) lines = missing_lines + lines if lines[-1].strip() != "M END": logger.warning("While parsing the file '%s', we found a molecule starting at line #%d that does not contain the END line. " "We will add it to the block end to match the MOL file format." % (mol_file, mol_starts_at)) lines.append("M END\n") return lines ext = mol_format or get_file_format(mol_file, ignore_compression=True) if ext not in RDKIT_FORMATS: raise IllegalArgumentError("Format '%s' informed or assumed from the filename is invalid. The accepted formats are: %s." % (ext, ",".join(RDKIT_FORMATS))) with xopen(mol_file, "r") as IN: if targets is not None: targets = set(targets) mol = [] ignore_lines = False line_count = 0 while True: try: line = IN.readline() line_count += 1 # readline() returns empty strings when EOF is reached. if not line: raise StopIteration # Ignore new lines before a molecule block. if len(mol) == 0 and line == "\n": continue # Save the line in which the new molecule starts. if len(mol) == 0: mol_starts_at = line_count if ext == "mol2": if line.startswith("#"): continue # New molecule block definition is starting... if line.startswith("@<TRIPOS>MOLECULE"): # New molecule identified but an old one already exists. if mol: mol_id = mol[1].strip() # If a target list is not informed, create a new molecule object. if targets is None: # Create a new RDKit object rdk_mol = new_mol_from_block("".join(mol), ext, sanitize, removeHs) yield ((rdk_mol, mol_id)) # Otherwise, create a new molecule only if it is in the list. elif mol_id in targets: targets.remove(mol_id) # Create a new RDKit object rdk_mol = new_mol_from_block("".join(mol), ext, sanitize, removeHs) yield ((rdk_mol, mol_id)) # Restart the molecule block. mol = [] mol.append(line) else: if line.startswith("M END"): mol.append(line) # Fix the MOL block in cases where the header or end lines do not match the MOL file format. mol = apply_mol_format(mol) mol_id = mol[0].strip() # If a target list is informed, create a new molecule only if it is in the list. if targets is None: # Create a new RDKit object rdk_mol = new_mol_from_block("".join(mol), ext, sanitize, removeHs) yield((rdk_mol, mol_id)) elif mol[0].strip() in targets: targets.remove(mol_id) # Create a new RDKit object rdk_mol = new_mol_from_block("".join(mol), ext, sanitize, removeHs) yield((rdk_mol, mol_id)) # Restart the molecule block. mol = [] # After finding a molecule block, ignore any following lines until it finds a line "$$$$". ignore_lines = True elif line.startswith("$$$$") is False: # Ignore lines util it finds a line "$$$$". if ignore_lines: continue mol.append(line) else: ignore_lines = False except StopIteration: if mol: if ext == "mol": # Fix the MOL block in cases where the header or end lines do not match the MOL file format. mol = apply_mol_format(mol) mol_id = mol[1].strip() if ext == "mol2" else mol[0].strip() # If a target list is informed, create a new molecule only if it is in the list. if targets is None or (targets is not None and mol_id in targets): # Create a new RDKit object rdk_mol = new_mol_from_block("".join(mol), ext, sanitize, removeHs) yield ((rdk_mol, mol_id)) break # If all target compounds were already found, just break the loop. if targets is not None and len(targets) == 0: break