Source code for luna.mol.entry

import re
import logging
from operator import xor
from os.path import exists
from collections import defaultdict

from rdkit.Chem import Mol as RDMol
from openbabel import OBMol
from openbabel.pybel import readfile
from openbabel.pybel import Molecule as PybelMol
from openbabel.pybel import informats as OB_FORMATS

from luna.wrappers.rdkit import RDKIT_FORMATS, read_multimol_file, read_mol_from_file
from luna.wrappers.base import MolWrapper
from luna.util.default_values import ACCEPTED_MOL_OBJ_TYPES, ENTRY_SEPARATOR, ARTIFACTS_LIST
from luna.util.file import get_file_format, get_filename
from luna.util.exceptions import InvalidEntry, IllegalArgumentError, MoleculeObjectError, MoleculeObjectTypeError, MoleculeNotFoundError
from luna.MyBio.PDB.PDBParser import PDBParser, WATER_NAMES, DEFAULT_CHAIN_ID
from luna.MyBio.PDB.Entity import Entity


logger = logging.getLogger()

PCI_ENTRY_REGEX = re.compile(r'^.{1,255}:\w:\w[\w+\-]{0,2}:\-?\d{1,4}[a-zA-z]?$')
PPI_ENTRY_REGEX = re.compile(r'^.{1,255}:\w$')

REGEX_RESNUM_ICODE = re.compile(r'^([\-\+]?\d+)([a-zA-z]?)$')


[docs]class Entry:
    """Entries determine the target molecule to which interactions and other properties
    will be calculated. They can be ligands, chains, etc, and can be defined in a number of ways.
    Each entry has an associated PDB file that may contain macromolecules (protein, RNA, DNA) and
    other small molecules, water, and ions.
    The PDB file provides the context to where the interactions with the target molecule will be calculated.

    Parameters
    ----------
    pdb_id : str
        A 4-symbols structure id from PDB or a local PDB filename. Example: '3QL8' or 'file1'.
    chain_id : str
        A 1-symbol chain id. Example: 'A'.
    comp_name : str, optional
        A 1 to 3-symbols compound name (residue name in the PDB format).
        Obligatory if ``is_hetatm`` is True. Example: 'X01'.
    comp_num : int, optional
        A valid 4-digits integer (residue sequence number in the PDB format).
        Obligatory if ``is_hetatm`` is True. Example: 300 or -1.
    comp_icode : str, optional
        A 1-character compound insertion code (residue insertion code in the PDB format). Example: 'A'.
    is_hetatm : bool
        If the compound is a ligand or not. The default value is True.
    sep : str
        A separator character to format the entry string. The default value is ':'.
    parser : :class:`~luna.MyBio.PDB.PDBParser.PDBParser` or :class:`~luna.MyBio.PDB.FTMapParser.FTMapParser`, optional
            Define a PDB parser object. If not provided, the default parser will be used.

    Raises
    ------
    IllegalArgumentError
        If ``is_hetatm`` is True, but the compound name and number are not provided.
        If the compound number is provided but it is not an integer.
        If ``comp_icode`` is provided but it is not a valid character.
    InvalidEntry
        If the provided information does not match the PDB format.

    Examples
    --------

    Chain entry: can be used to calculate interactions with a given chain.

    >>> from luna.mol.entry import Entry
    >>> e = Entry(pdb_id="3QL8", chain_id="A")
    >>> print(e)
    <Entry: 3QL8:A>

    Compound entry: can be used to calculate interactions with a given compound (residue or nucleotide).

    >>> from luna.mol.entry import Entry
    >>> e = Entry(pdb_id="3QL8", chain_id="A", comp_name="HIS", comp_num=125, is_hetatm=False)
    >>> print(e)
    <Entry: 3QL8:A:HIS:125>

    Ligand entry: can be used to calculate interactions with a given ligand.

    >>> from luna.mol.entry import Entry
    >>> e = Entry(pdb_id="3QL8", chain_id="A", comp_name="X01", comp_num=300, is_hetatm=True)
    >>> print(e)
    <Entry: 3QL8:A:X01:300>

    You can use a different character separator for the entries. For example:

    >>> from luna.mol.entry import Entry
    >>> e = Entry(pdb_id="3QL8", chain_id="A", comp_name="X01", comp_num=300, is_hetatm=True, sep="/")
    >>> print(e)
    <Entry: 3QL8/A/X01/300>
    """

    def __init__(self, pdb_id, chain_id, comp_name=None, comp_num=None, comp_icode=None,
                 is_hetatm=True, sep=ENTRY_SEPARATOR, parser=None):

        if xor(comp_name is None, comp_num is None):
            raise IllegalArgumentError("You tried to define a compound, so you must inform its name and number.")

        if comp_num is not None:
            try:
                assert float(comp_num).is_integer()
                comp_num = int(comp_num)
            except (ValueError, AssertionError):
                raise IllegalArgumentError("The informed compound number '%s' is invalid. It must be an integer." % str(comp_num))

        if comp_icode is not None:
            comp_icode = str(comp_icode)
            if comp_icode.isdigit() or len(comp_icode) > 1:
                raise IllegalArgumentError("The informed compound icode '%s' is invalid. It must be a character." % str(comp_icode))

        self._pdb_id = pdb_id
        self._chain_id = chain_id
        self._comp_name = comp_name
        self._comp_num = comp_num
        self._comp_icode = comp_icode
        self.is_hetatm = is_hetatm
        self.sep = sep
        self.parser = parser

        if not self.is_valid():
            raise InvalidEntry("Entry '%s' does not match the PDB format." % self.to_string())

[docs]    @classmethod
    def from_string(cls, entry_str, is_hetatm=True, sep=ENTRY_SEPARATOR):
        """Initialize from a string.

        Parameters
        ----------
        entry_str : str
            A string representing the entry. Example: '3QL8:A:X01:300'.
        is_hetatm : bool
            Defines if the compound is a ligand or not. The default value is True.
        sep : str
            The separator character used in ``entry_str``. The default value is ':'.
            For example: if ``entry_str`` is set to '3QL8|A|X01|300', then ``sep`` should be defined as '|'.

        Returns
        -------
         : `Entry`

        Raises
        ------
        IllegalArgumentError
            If the fields in ``entry_str`` do not match the format expected to define a
            chain (ChainEntry) or a compound (MolEntry).

        Examples
        --------

        Chain entry: can be used to calculate interactions with a given chain.

        >>> from luna.mol.entry import Entry
        >>> e = Entry.from_string("3QL8:A", sep=":")
        >>> print(e)
        <Entry: 3QL8:A>

        Compound entry: can be used to calculate interactions with a given compound (residue or nucleotide).

        >>> from luna.mol.entry import Entry
        >>> e = Entry.from_string("3QL8:A:HIS:125", sep=":")
        >>> print(e)
        <Entry: 3QL8:A:HIS:125>

        Ligand entry: can be used to calculate interactions with a given ligand.

        >>> from luna.mol.entry import Entry
        >>> e = Entry.from_string("3QL8:A:X01:300", sep=":")
        >>> print(e)
        <Entry: 3QL8:A:X01:300>
        """

        entries = entry_str.split(sep)

        # Try to initialize a new ChainEntry.
        if len(entries) == 2:
            if any([str(i).strip() == "" for i in entries]):
                raise IllegalArgumentError("The number of fields in the informed string '%s' is incorrect. A valid ChainEntry must contain "
                                           "two obligatory fields: PDB and chain id." % entry_str)

            return cls(*entries, is_hetatm=False, sep=sep)

        # Try to initialize a new MolEntry.
        elif len(entries) == 4:
            if any([str(i).strip() == "" for i in entries]):
                raise IllegalArgumentError("The number of fields in the informed string '%s' is incorrect. A valid MolEntry "
                                           "must contain four obligatory fields: PDB, chain id, compound name, and compound "
                                           "number followed by its insertion code when applicable." % entry_str)

            # Separate ligand number from insertion code.
            matched = REGEX_RESNUM_ICODE.match(entries[3])
            if matched:
                comp_num = matched.group(1)
                try:
                    assert float(comp_num).is_integer()
                    comp_num = int(comp_num)
                except (ValueError, AssertionError):
                    raise IllegalArgumentError("The informed compound number '%s' is invalid. It must be an integer." % str(comp_num))

                icode = None if matched.group(2) == "" else matched.group(2)
                entries = entries[0:3] + [comp_num, icode]
            else:
                raise IllegalArgumentError("The compound number and its insertion code (if applicable) '%s' is invalid. "
                                           "It must be an integer followed by one insertion code character when applicable."
                                           % entries[3])
            return cls(*entries, is_hetatm=is_hetatm, sep=sep)

        else:
            raise IllegalArgumentError("The number of fields in the informed string '%s' is incorrect. A valid string must contain "
                                       "two obligatory fields (PDB and chain id) and it may contain two optional fields (compound name "
                                       "and compound number followed by its insertion code when applicable)." % entry_str)

    @property
    def pdb_id(self):
        """str, read-only: the pdb id."""
        return self._pdb_id

    @property
    def chain_id(self):
        """str, read-only: the chain id."""
        return self._chain_id

    @property
    def comp_name(self):
        """str, read-only: the compound name."""
        return self._comp_name

    @property
    def comp_num(self):
        """int, read-only: the compound number."""
        return self._comp_num

    @property
    def comp_icode(self):
        """str, read-only: the compound insertion code."""
        if isinstance(self._comp_icode, str):
            return self._comp_icode
        else:
            return ' '

    @property
    def full_id(self):
        """tuple, read-only: The full id of the entry is the tuple (PDB id or filename, chain id)
        for entries representing chains and (PDB id or filename, chain id,
        compound name, compound number, insertion code) for entries representing compounds."""

        entry = [self.pdb_id, self.chain_id]
        if self.comp_name is not None and self.comp_num is not None:
            entry.append(self.comp_name)
            entry.append(self.comp_num)
            entry.append(self.comp_icode)
        return tuple(entry)

[docs]    def to_string(self, sep=None):
        """ Convert the entry to a string using ``sep`` as a separator character.

        Parameters
        ----------
        sep : str or None
            If None (the default), use the separator character defined during the entry object creation.
            Otherwise, uses ``sep`` as the separator character.

        Examples
        --------

        >>> from luna.mol.entry import Entry
        >>> e = Entry(pdb_id="3QL8", chain_id="A", comp_name="X01", comp_num=300, is_hetatm=True, sep=":")
        >>> print(e.to_string("/"))
        3QL8/A/X01/300
        """

        full_id = self.full_id

        # An entry object will always have a PDB and chain id.
        entry = list(full_id[0:2])

        # If it contains additional information about the compound it will also include them.
        if len(full_id) > 2:
            if full_id[2] is not None and full_id[3] is not None:
                comp_name = str(full_id[2]).strip()
                comp_num_and_icode = str(full_id[3]).strip() + str(full_id[4]).strip()
                entry += [comp_name, comp_num_and_icode]

        sep = sep or self.sep

        return sep.join(entry)

[docs]    def is_valid(self):
        """Check if the entry matches the expected format for protein-protein or protein-compound complexes.

        Returns
        -------
         : bool
        """
        full_id = self.full_id

        # Regex for ChainEntry (pdb_id, chain_id).
        if len(full_id) == 2:
            return PPI_ENTRY_REGEX.match(self.to_string(":")) is not None

        # Regex for MolEntry (pdb_id, chain_id, comp_name, comp_num, icode).
        elif len(full_id) == 5:
            return PCI_ENTRY_REGEX.match(self.to_string(":")) is not None

        # Return False for anything else
        return False

[docs]    def get_biopython_key(self, full_id=False):
        """Represent the entry as a key to select chains or compounds from Biopython Entity objects.

        Parameters
        full_id : bool
            If True, return the full id of a chain or ligand.
            For chains, it consists of a tuple containing the PDB and the chain id.
            For ligands, it consists of a tuple containing the PDB, the chain, and the ligand id.
            The default value is False.

        Returns
        -------
         : str or tuple
            Return str if the entry represents a chain and if ``full_id`` is False.
            Otherwise, return a tuple.

        Examples
        --------

        >>> from luna.mol.entry import Entry
        >>> e = Entry(pdb_id="3QL8", chain_id="A", comp_name="X01", comp_num=300, is_hetatm=True, sep=":")
        >>> print(e.get_biopython_key())
        ('H_X01', 300, ' ')

        """
        key = []
        if full_id:
            key = [self.pdb_id, 0, self.chain_id]

        if self.comp_name is not None and self.comp_num is not None:
            if self.comp_name == 'HOH' or self.comp_name == 'WAT':
                comp_id = ('W', self.comp_num, self.comp_icode)
            elif self.is_hetatm:
                comp_id = ('H_%s' % self.comp_name, self.comp_num, self.comp_icode)
            else:
                comp_id = (' ', self.comp_num, self.comp_icode)

            if full_id:
                key.append(comp_id)
                return tuple(key)
            return comp_id

        if full_id:
            return tuple(key)

        return self.chain_id

    def __repr__(self):
        return '<%s: %s>' % (self.__class__.__name__, self.to_string(self.sep))


[docs]class ChainEntry(Entry):
    """Define a chain.

    Parameters
    ----------
    pdb_id : str
        A 4-symbols structure id from PDB or a local PDB filename. Example: '3QL8' or 'file1'.
    chain_id : str
        A 1-symbol chain id. Example: 'A'.
    sep : str
        A separator character to format the entry string. The default value is ':'.

    Raises
    ------
    InvalidEntry
        If the provided information does not match the PDB format.

    Examples
    --------
    >>> from luna.mol.entry import ChainEntry
    >>> e = ChainEntry(pdb_id="3QL8", chain_id="A")
    >>> print(e)
    <ChainEntry: 3QL8:A>

    """

    def __init__(self, pdb_id, chain_id, sep=ENTRY_SEPARATOR, parser=None):
        super().__init__(pdb_id, chain_id, is_hetatm=False, sep=sep, parser=parser)

[docs]    @classmethod
    def from_string(cls, entry_str, sep=ENTRY_SEPARATOR):
        """Initialize from a string.

        Parameters
        ----------
        entry_str : str
            A string representing the entry. Example: '3QL8:A'.
        sep : str
            The separator character used in ``entry_str``. The default value is ':'.
            For example: if ``entry_str`` is set to '3QL8|A', then ``sep`` should be defined as '|'.

        Returns
        -------
         : `Entry`

        Raises
        ------
        IllegalArgumentError
            If the fields in ``entry_str`` do not match the format expected to define a chain.

        Examples
        --------
        >>> from luna.mol.entry import ChainEntry
        >>> e = ChainEntry.from_string("3QL8:A", sep=":")
        >>> print(e)
        <ChainEntry: 3QL8:A>

        """
        entries = entry_str.split(sep)
        if len(entries) == 2:
            return cls(*entries, sep=sep)
        else:
            raise IllegalArgumentError("The number of fields in the informed string '%s' is incorrect. A valid string must contain "
                                       "two obligatory fields: PDB and chain id." % entry_str)

    @property
    def full_id(self):
        """tuple, read-only: The full id of the entry is the tuple (PDB id or filename, chain id)."""
        return (self.pdb_id, self.chain_id)


[docs]class MolEntry(Entry):
    """Define a compound from a PDB file, which can be a residue, nucleotide, or ligand.

    Parameters
    ----------
    pdb_id : str
        A 4-symbols structure id from PDB or a local PDB filename. Example: '3QL8' or 'file1'.
    chain_id : str
        A 1-symbol chain id. Example: 'A'.
    comp_name : str
        A 1 to 3-symbols compound name (residue name in the PDB format). Example: 'X01'.
    comp_num : int
        A valid 4-digits integer (residue sequence number in the PDB format). Example: 300 or -1.
    comp_icode : str, optional
        A 1-character compound insertion code (residue insertion code in the PDB format). Example: 'A'.
    sep : str
        A separator character to format the entry string. The default value is ':'.

    Raises
    ------
    InvalidEntry
        If the provided information does not match the PDB format.

    Examples
    --------

    Compound entry: can be used to calculate interactions with a given compound (residue or nucleotide).

    >>> from luna.mol.entry import MolEntry
    >>> e = MolEntry(pdb_id="3QL8", chain_id="A", comp_name="HIS", comp_num=125, is_hetatm=False)
    >>> print(e)
    <MolEntry: 3QL8:A:HIS:125>

    Ligand entry: can be used to calculate interactions with a given ligand.

    >>> from luna.mol.entry import MolEntry
    >>> e = MolEntry(pdb_id="3QL8", chain_id="A", comp_name="X01", comp_num=300, is_hetatm=True)
    >>> print(e)
    <MolEntry: 3QL8:A:X01:300>

    """

    def __init__(self, pdb_id, chain_id, comp_name, comp_num, comp_icode=None, sep=ENTRY_SEPARATOR, parser=None):
        super().__init__(pdb_id, chain_id, comp_name, comp_num, comp_icode, is_hetatm=True, sep=sep, parser=parser)

[docs]    @classmethod
    def from_string(cls, entry_str, sep=ENTRY_SEPARATOR):
        entries = entry_str.split(sep)
        if len(entries) == 4:
            # Separate ligand number from insertion code.
            matched = REGEX_RESNUM_ICODE.match(entries[3])
            if matched:
                comp_num = matched.group(1)
                icode = None if matched.group(2) == "" else matched.group(2)
                entries = entries[0:3] + [comp_num, icode]
            else:
                raise IllegalArgumentError("The compound number and its insertion code (if applicable) '%s' is invalid. "
                                           "It must be an integer followed by one insertion code character when applicable." % entries[3])
            return cls(*entries, sep=sep)
        else:
            raise IllegalArgumentError("The number of fields in the informed string '%s' is incorrect. A valid compound entry must contain "
                                       "four obligatory fields: PDB, chain id, compound name, and compound number followed by its "
                                       "insertion code when applicable." % entry_str)

[docs]    @classmethod
    def from_file(cls, input_file, sep=":"):
        """Initialize from a list of strings representing compounds.

        Parameters
        ----------
        input_file : str
            The file from where the list of strings (one per line) will be read from.
        sep : str
            The separator character used in ``input_file``. The default value is ':'.
            For example: if entries from ``input_file`` use '|' as the separator, then ``sep`` should be defined as '|'.

        Yields
        ------
        `MolEntry`
            An entry recovered from ``input_file``.
        """
        with open(input_file, "r") as IN:
            for row in IN:
                entry_str = row.strip()
                if entry_str == "":
                    continue

                yield cls.from_string(entry_str)


[docs]class MolFileEntry(Entry):

    """Define a ligand from a molecular file.
    This class should be used for docking and molecular dynamics campaigns where usually one has
    the protein structure in the PDB format and the ligand structure in a separate molecular file.

    Parameters
    ----------
    pdb_id : str
        A 4-symbols structure id from PDB or a local PDB filename. Example: '3QL8' or 'file1'.
    mol_id : str
        The ligand id in the molecular file.
    sep : str
        A separator character to format the entry string. The default value is ':'.

    Attributes
    ----------
    mol_id : str
        The ligand id.
    mol_file : str
        Pathname of the molecular file.
    mol_file_ext : str
        The molecular file format.
        If not provided, try to recover the molecular file extension directly from ``mol_file``.
    mol_obj_type : {'rdkit', 'openbabel'}
        Define which library (RDKit or Open Babel) to use to parse the molecular file.
    overwrite_mol_name : bool
        If True, substitute the ligand name in the parsed molecular object with ``mol_id``.
        Only works for single-molecule files (``is_multimol_file`` = False)
        as in these cases ``mol_id`` does not need to match the ligand name in the molecular file.
    is_multimol_file : bool
        If ``mol_file`` contains multiple molecules or not.
        If True, ``mol_id`` should match some ligand name in ``mol_file``.
    """

    def __init__(self, pdb_id, mol_id, sep=ENTRY_SEPARATOR):

        self.mol_id = mol_id

        #
        # Initialize empty properties.
        #
        self._mol_obj = None
        self.mol_file = None
        # TODO: Find a way to assume the Mol file type when not provided
        self.mol_file_ext = None
        self.mol_obj_type = None
        self.overwrite_mol_name = None
        self.is_multimol_file = None

        super().__init__(pdb_id, DEFAULT_CHAIN_ID, "LIG", 9999, is_hetatm=True, sep=sep)

[docs]    @classmethod
    def from_mol_obj(cls, pdb_id, mol_id, mol_obj, sep=ENTRY_SEPARATOR):
        """Initialize from an already loaded molecular object.

        This function is useful in cases where a molecular object is parsed and
        pre-processed using a different protocol defined by the user.

        Parameters
        ----------
        pdb_id : str
            A 4-symbols structure id from PDB or a local PDB filename. Example: '3QL8' or 'file1'.
        mol_id : str
            The ligand id.
            As the molecular object is already provided, the ligand id does not need to match the ligand name in the molecular object.
        mol_obj : :class:`~luna.wrappers.base.MolWrapper`, :class:`rdkit.Chem.rdchem.Mol`, or :class:`openbabel.pybel.Molecule`
            The molecular object.
        sep : str
            A separator character to format the entry string. The default value is ':'.

        Returns
        -------
         : `MolFileEntry`

        Raises
        ------
        MoleculeObjectTypeError
            If the molecular object is not an instance
            of :class:`~luna.wrappers.base.MolWrapper`, :class:`rdkit.Chem.rdchem.Mol`, or :class:`openbabel.pybel.Molecule`.
        IllegalArgumentError
            If ``entity`` is not a valid Biopython object.

        Examples
        --------

        In this example, we will initialize a new MolFileEntry with the ligand 'ZINC000007786517' and
        the structure located in a PDB file of name 'D4', which is the structure used for docking the molecule.

        First, let's parse the molecular file.

        >>> from luna.wrappers.rdkit import read_mol_from_file
        >>> mol_obj = read_mol_from_file("tutorial/inputs/ZINC000007786517.mol", mol_format="mol")

        Now, we create the new MolFileEntry object as follows:

        >>> e = MolFileEntry.from_mol_obj("D4", "ZINC000007786517", mol_obj, sep=ENTRY_SEPARATOR)
        >>> print(e)
        <MolFileEntry: D4:ZINC000007786517>
        >>> print(e.mol_obj.to_smiles())
        Cc1cccc(NC(=O)C[N@@H+](C)C2CCCCC2)c1C
        """

        if mol_obj is not None:
            if isinstance(mol_obj, MolWrapper):
                mol_obj = mol_obj.unwrap()
            elif isinstance(mol_obj, PybelMol):
                mol_obj = mol_obj.OBMol

            if isinstance(mol_obj, RDMol):
                mol_obj_type = "rdkit"
            elif isinstance(mol_obj, OBMol):
                mol_obj_type = "openbabel"
            else:
                logger.exception("Objects of type '%s' are not currently accepted." % mol_obj.__class__)
                raise MoleculeObjectTypeError("Objects of type '%s' are not currently accepted." % mol_obj.__class__)
        else:
            if mol_obj_type not in ACCEPTED_MOL_OBJ_TYPES:
                raise IllegalArgumentError("Objects of type '%s' are not currently accepted. "
                                           "The available options are: %s." % (mol_obj_type, ", ".join(ACCEPTED_MOL_OBJ_TYPES)))

        entry = cls(pdb_id, mol_id, sep)
        entry.mol_obj = mol_obj
        entry.mol_obj_type = mol_obj_type

        # TODO: Find a way to assume the Mol file type when not provided
        # mol_file_ext

        return entry

[docs]    @classmethod
    def from_mol_file(cls, pdb_id, mol_id, mol_file, is_multimol_file, mol_file_ext=None, mol_obj_type='rdkit',
                      autoload=False, overwrite_mol_name=False, sep=ENTRY_SEPARATOR):
        """Initialize from a molecular file.

        Parameters
        ----------
        pdb_id : str
            A 4-symbols structure id from PDB or a local PDB filename. Example: '3QL8' or 'file1'.
        mol_id : str
            The ligand id in the molecular file.
        mol_file : str
            Pathname of the molecular file.
        is_multimol_file : bool
            If ``mol_file`` contains multiple molecules or not.
            If True, ``mol_id`` should match some ligand name in ``mol_file``.
        mol_file_ext : str, optional
            The molecular file format.
            If not provided, try to recover the molecular file extension directly from ``mol_file``.
        mol_obj_type : {'rdkit', 'openbabel'}
            If "rdkit", parse the converted molecule with RDKit and return an instance of :class:`rdkit.Chem.rdchem.Mol`.
            If "openbabel", parse the converted molecule with Open Babel and return an instance of
            :class:`openbabel.pybel.Molecule`. The default value is 'rdkit'.
        autoload : bool
            If True, parse the ligand from the molecular file during the entry initialization.
            Otherwise, only load the ligand when first used.
        overwrite_mol_name : bool
            If True, substitute the ligand name in the parsed molecular object with ``mol_id``.
            Only works for single-molecule files (``is_multimol_file`` = False)
            as in these cases ``mol_id`` does not need to match the ligand name in the molecular file.
        sep : str
            A separator character to format the entry string. The default value is ':'.

        Returns
        -------
         : `MolFileEntry`

        Raises
        ------
        FileNotFoundError
            If ``mol_file`` does not exist.
        IllegalArgumentError
            If ``mol_obj_type`` is not either 'rdkit' nor 'openbabel'.
        MoleculeObjectError
            If any errors occur while parsing the molecular file.
            Detailed information about the errors can be found in the logging outputs.
        MoleculeNotFoundError
            If the ligand ``mol_id`` was not found in the input file and ``is_multimol_file`` is True.

        Examples
        --------

        In this first example, we will read the ligand 'ZINC000007786517' from a single-molecule file.
        As we are working with a single-molecule file, ``mol_id`` can be any value you prefer.

        >>> from luna.mol.entry import MolFileEntry
        >>> e = MolFileEntry.from_mol_file(pdb_id="D4", mol_id="Ligand", mol_file="tutorial/inputs/ZINC000007786517.mol",
        ...                                mol_obj_type='rdkit', is_multimol_file=False)
        >>> print(e)
        D4:Ligand
        >>> print(e.mol_obj.to_smiles())
        Cc1cccc(NC(=O)C[N@@H+](C)C2CCCCC2)c1C

        Now, let's say we need to read the ligand 'ZINC000096459890' from a multi-molecular file and that we
        want to use Open Babel to parse the molecule.
        To do so, remember that it should exist a ligand with the name ``mol_id`` in ``mol_file``.
        Otherwise, it will raise the exception MoleculeNotFoundError.

        >>> from luna.mol.entry import MolFileEntry
        >>> e = MolFileEntry.from_mol_file(pdb_id="D4", mol_id="ZINC000096459890", mol_file="tutorial/inputs/ligands.mol2",
        ...                                mol_obj_type='openbabel', is_multimol_file=True)
        >>> print(e)
        <MolFileEntry: D4:ZINC000096459890>
        >>> print(e.mol_obj.to_smiles())
        O=C(OCCCN1C=CC=CC1=O)c1ccc2ccc(Cl)cc2n1

        Below, we show what happens if ``mol_id`` does not exist in ``mol_file``. Observe we set ``autoload`` to True
        to parse the molecule right away.

        >>> from luna.mol.entry import MolFileEntry
        >>> e = MolFileEntry.from_mol_file(pdb_id="D4", mol_id="Ligand", mol_file="tutorial/inputs/ligands.mol2",
        ...                                mol_obj_type='openbabel', is_multimol_file=True, autoload=True)
        luna.util.exceptions.MoleculeNotFoundError: "The ligand 'Ligand' was not found in the input file \
        or generated errors while parsing it with Open Babel."
        """

        entry = cls(pdb_id, mol_id, sep)

        entry.mol_file = mol_file
        entry.is_multimol_file = is_multimol_file
        entry.mol_file_ext = mol_file_ext or get_file_format(mol_file)
        entry.mol_obj_type = mol_obj_type
        entry.overwrite_mol_name = overwrite_mol_name

        if autoload:
            entry._load_mol_from_file()

        return entry

[docs]    @classmethod
    def from_file(cls, input_file, pdb_id, mol_file, **kwargs):
        """Initialize from a list of ligand names.

        Parameters
        ----------
        input_file : str
            The file from where the list of ligand names (one per line) will be read from.
        pdb_id : str
            A 4-symbols structure id from PDB or a local PDB filename. Example: '3QL8' or 'file1'.
        mol_file : str
            Pathname of a multi-molecular file.
        mol_file_ext : str, optional
            The molecular file format.
            If not provided, try to recover the molecular file extension directly from ``mol_file``.
        mol_obj_type : {'rdkit', 'openbabel'}
            If "rdkit", parse the converted molecule with RDKit and return an instance of :class:`rdkit.Chem.rdchem.Mol`.
            If "openbabel", parse the converted molecule with Open Babel and return an instance of
            :class:`openbabel.pybel.Molecule`. The default value is 'rdkit'.
        autoload : bool
            If True, parse the ligand from the molecular file during the entry initialization.
            Otherwise, only load the ligand when first used.
        sep : str
            A separator character to format the entry string. The default value is ':'.

        Yields
        ------
        `MolFileEntry`
            An entry recovered from ``input_file``.

        Raises
        ------
        FileNotFoundError
            If ``mol_file`` does not exist.
        IllegalArgumentError
            If ``mol_obj_type`` is not either 'rdkit' nor 'openbabel'.
        MoleculeObjectError
            If any errors occur while parsing the molecular file.
            Detailed information about the errors can be found in the logging outputs.
        MoleculeNotFoundError
            If some ligand from ``input_file`` was not found in ``mol_file``.

        Examples
        --------

        >>> from luna.mol.entry import MolFileEntry
        >>> entries = MolFileEntry.from_file(input_file="tutorial/inputs/MolEntries.txt",
        ...                                  pdb_id="D4", mol_file="tutorial/inputs/ligands.mol2",
        ...                                  mol_obj_type="openbabel", autoload=True)
        >>> for e in entries:
        >>>     print(e)
        <MolFileEntry: D4:ZINC000012442563>
        <MolFileEntry: D4:ZINC000065293174>
        <MolFileEntry: D4:ZINC000096459890>
        <MolFileEntry: D4:ZINC000343043015>
        <MolFileEntry: D4:ZINC000575033470>
        """
        with open(input_file, "r") as IN:
            for row in IN:
                ligand_id = row.strip()
                if ligand_id == "":
                    continue

                yield cls.from_mol_file(pdb_id, ligand_id, mol_file,
                                        is_multimol_file=True, overwrite_mol_name=False, **kwargs)

    @property
    def full_id(self):
        """tuple, read-only: The full id of the entry is the tuple (PDB id or filename, ligand id)."""
        return (self.pdb_id, self.mol_id)

    @property
    def mol_obj(self):
        """:class:`~luna.wrappers.base.MolWrapper`, :class:`rdkit.Chem.rdchem.Mol`, or :class:`openbabel.pybel.Molecule`: The molecule."""
        if self._mol_obj is None and self.mol_file is not None:
            self._load_mol_from_file()
        return self._mol_obj

    @mol_obj.setter
    def mol_obj(self, mol_obj):
        self._mol_obj = MolWrapper(mol_obj)

[docs]    def is_valid(self):
        """Check if the entry represents a valid protein-ligand complex.

        Returns
        -------
         : bool
        """
        return True

[docs]    def is_mol_obj_loaded(self):
        """Check if the molecular object has already been loaded.

        Returns
        -------
         : bool
        """
        return self._mol_obj is not None

    def _load_mol_from_file(self):
        logger.debug("It will try to load the molecule '%s'." % self.mol_id)

        if self.mol_file is None:
            raise IllegalArgumentError("It cannot load the molecule as no molecular file was provided.")

        available_formats = OB_FORMATS if self.mol_obj_type == "openbabel" else RDKIT_FORMATS
        tool = "Open Babel" if self.mol_obj_type == "openbabel" else "RDKit"
        if self.mol_file_ext not in available_formats:
            raise IllegalArgumentError("Extension '%s' informed or assumed from the filename is not a format "
                                       "recognized by %s." % (self.mol_file_ext, tool))

        if not exists(self.mol_file):
            raise FileNotFoundError("The file '%s' was not found." % self.mol_file)

        try:
            if self.mol_obj_type == "openbabel":
                mols = readfile(self.mol_file_ext, self.mol_file)
                # If it is a multimol file, then we need to loop over the molecules to find the target one.
                # Note that in this case, the ids must match.
                if self.is_multimol_file:
                    for ob_mol in mols:
                        if self.mol_id == get_filename(ob_mol.OBMol.GetTitle()):
                            self.mol_obj = ob_mol
                            break
                else:
                    self.mol_obj = mols.__next__()
            else:
                if self.mol_file_ext == "pdb":
                    self.mol_obj = read_mol_from_file(self.mol_file, mol_format=self.mol_file_ext, removeHs=False)
                else:
                    # If 'targets' is None, then the entire Mol file will be read.
                    targets = None
                    # If it is a multimol file than loop through it until the informed molecule (by its mol_id) is found.
                    if self.is_multimol_file:
                        targets = [self.mol_id]

                    for rdk_mol, mol_id in read_multimol_file(self.mol_file, mol_format=self.mol_file_ext, targets=targets, removeHs=False):
                        # It returns None if the molecule parsing generated errors.
                        self.mol_obj = rdk_mol
                        break
        except Exception as e:
            logger.exception(e)
            raise MoleculeObjectError("An error occurred while parsing the molecular file with %s and the molecule "
                                      "object for the entry '%s' could not be created. Check the logs for more information."
                                      % (tool, self.to_string()))

        if self._mol_obj is None:
            raise MoleculeNotFoundError("The ligand '%s' was not found in the input file or generated errors while parsing it with %s."
                                        % (self.mol_id, tool))
        else:
            if not self.mol_obj.has_name() or self.overwrite_mol_name:
                self.mol_obj.set_name(self.mol_id)

        logger.debug("Molecule '%s' was successfully loaded." % self.mol_id)

[docs]    def get_biopython_structure(self, entity=None, parser=None):
        """Transform the molecular object into a Biopython Entity object.

        If ``entity`` is provided, the molecular object is appended to it, i.e., this function
        can be used to join a ligand and the structure used during docking or molecular dynamics.

        By default, the ligand is added to a chain of id `z`.

        Parameters
        ----------
        entity : :class:`~luna.MyBio.PDB.Entity.Entity`, optional
            Append the molecular object to ``entity``.
            If not provided, a new :class:`~luna.MyBio.PDB.Entity.Entity` is created.
        parser : :class:`~luna.MyBio.PDB.PDBParser.PDBParser`, optional
            Define a PDB parser object. If not provided, the default parser will be used.

        Returns
        -------
         : :class:`~luna.MyBio.PDB.Entity.Entity`

        Raises
        ------
        IllegalArgumentError
            If ``entity`` is not a valid Biopython object.

        Examples
        --------

        In this example, we will demonstrate how to join a protein structure and a ligand docked against it.

        First, let's parse the PDB file.

        >>> from luna.MyBio.PDB.PDBParser import PDBParser
        >>> pdb_parser = PDBParser(PERMISSIVE=True, QUIET=True)
        >>> structure = pdb_parser.get_structure("Protein", "tutorial/inputs/D4.pdb")

        Observe that the list of chains in the parsed structure contains only one element.

        >>> print(structure[0].child_list)
        [<Chain id=A>]

        Now, we will read the ligand and append it to the existing protein structure.

        >>> from luna.mol.entry import MolFileEntry
        >>> e = MolFileEntry.from_mol_file("D4", "ZINC000007786517", "tutorial/inputs/ZINC000007786517.mol",
        ...                                mol_obj_type='rdkit', is_multimol_file=False)
        >>> joined_structure = e.get_biopython_structure(structure)

        Observe that now the list of chains contains chains 'A' and 'z', which is the default chain where
        ligands are added.

        >>> print(joined_structure[0].child_list)
        [<Chain id=A>, <Chain id=z>]

        If we loop over the residues in chain 'z', we will find our ligand.

        >>> for r in structure[0]["z"]:
        >>>     print(r)
        <Residue LIG het=H_LIG resseq=9999 icode= >
        """

        if parser is None:
            parser = PDBParser(PERMISSIVE=True, QUIET=True, FIX_EMPTY_CHAINS=True,
                               FIX_ATOM_NAME_CONFLICT=True, FIX_OBABEL_FLAGS=False)

        mol_file_ext = self.mol_file_ext
        if mol_file_ext is None and self.mol_file is not None:
            mol_file_ext = get_file_format(self.mol_file)

        if self.mol_obj_type == "openbabel":
            pdb_block = self.mol_obj.to_pdb_block()

            atm = self.mol_obj.unwrap().GetFirstAtom()
            residue_info = atm.GetResidue()

            # When the PDBParser finds an empty chain, it automatically replace it by 'z'.
            chain_id = residue_info.GetChain() if residue_info.GetChain().strip() != "" else self.chain_id
            comp_num = residue_info.GetNum()

            if residue_info.GetName() in WATER_NAMES:
                comp_name = "W"
            elif residue_info.IsHetAtom(atm):
                comp_name = "H_%s" % residue_info.GetName()
            else:
                comp_name = " "

            if mol_file_ext == "pdb":
                self.chain_id = chain_id
                self.comp_name = residue_info.GetName()
                self.comp_num = comp_num
                self.is_hetatm = residue_info.IsHetAtom(atm)
        else:
            pdb_block = self.mol_obj.to_pdb_block()

            if mol_file_ext == "pdb":
                residue_info = self.mol_obj.unwrap().GetAtoms()[0].GetPDBResidueInfo()
                # When the PDBParser finds an empty chain, it automatically replace it by 'z'.
                chain_id = residue_info.GetChainId() if residue_info.GetChainId().strip() != "" else self.chain_id
                comp_num = residue_info.GetResidueNumber()

                if residue_info.GetResidueName() in WATER_NAMES:
                    comp_name = "W"
                elif residue_info.GetIsHeteroAtom():
                    comp_name = "H_%s" % residue_info.GetResidueName()
                else:
                    comp_name = " "

                self.chain_id = chain_id
                self.comp_name = residue_info.GetResidueName()
                self.comp_num = comp_num
                self.is_hetatm = residue_info.GetIsHeteroAtom()
            else:
                # When the PDBParser finds an empty chain, it automatically replace it by 'z'.
                chain_id = self.chain_id
                comp_name = "H_UNL"
                comp_num = 1

        comp_structure = parser.get_structure_from_pdb_block(self.pdb_id, pdb_block)

        chain = comp_structure[0][chain_id]
        if self.chain_id != chain.id:
            chain.id = self.chain_id

        lig = chain[(comp_name, comp_num, " ")]

        # It only substitutes the ligand id if it is different from the id defined by the MolFileEntry object property.
        # This update will never happen when the ligand file is a PDB file as the ids are guaranteed to be equal.
        if lig.id != ("H_%s" % self.comp_name, self.comp_num, " "):
            lig.id = ("H_%s" % self.comp_name, self.comp_num, " ")

        lig.resname = self.comp_name

        if entity is not None:
            if isinstance(entity, Entity):
                structure = entity.get_parent_by_level('S')
                if self.chain_id not in structure[0].child_dict:
                    chain = chain.copy()
                    structure[0].add(chain)
                else:
                    lig = lig.copy()
                    # Update the ligand index according to the number of compounds already present in the chain.
                    lig.idx = len(structure[0][self.chain_id].child_list)
                    structure[0][self.chain_id].add(lig)
            else:
                raise IllegalArgumentError("The informed entity is not a valid Biopython object.")
        else:
            entity = comp_structure

        return entity

    def __repr__(self):
        return '<MolFileEntry: %s%s%s>' % (self.pdb_id, self.sep, self.mol_id)

    def __getstate__(self):
        if self._mol_obj is not None:
            self.mol_obj = MolWrapper(self.mol_obj)
        return self.__dict__

    def __setstate__(self, state):
        self.__dict__.update(state)


[docs]def recover_entries_from_entity(entity, get_small_molecules=True, get_chains=False,
                                ignore_artifacts=True, by_cluster=False, sep=ENTRY_SEPARATOR):
    """ Search for chains and small molecules in ``entity`` and return them as strings.

    Parameters
    ----------
    entity : :class:`~luna.MyBio.PDB.Entity.Entity`
        An entity from where chains or small molecules will be recovered.
    get_small_molecules : bool
        If True, identify small molecules and return them as `MolEntry` objects. The default value is True.
    get_chains : bool
        If True, identify chains and return them as `ChainEntry` objects. The default value is False.
    ignore_artifacts : bool
        If True, ignore the following crystallography artifacts: ACE, ACT, BME, CSD, CSW, EDO, FMT, GOL, MSE,
        NAG, NO3, PO4, SGM, SO4, or TPO. The default value is True.
    by_cluster : bool
        If True, aggregate entries by cluster. Cluster ids are exclusive to :class:`~luna.MyBio.PDB.Residue.Residue` instances and are
        automatically set by :class:`~luna.MyBio.PDB.FTMapParser.FTMapParser`, a parser for FTMap results.
        By default, the cluster id of :class:`~luna.MyBio.PDB.Residue.Residue` instances are set to None, therefore,
        if the cluster id is not explicitly defined, all entries will be aggregated to the same key ``None``.
    sep : str
        A separator character to format the entry string. The default value is ':'.

    Returns
    ------
     : list or dict
        If ``by_cluster`` is set to False, a list of `ChainEntry` or `MolEntry` objects is returned.
        Otherwise, a dict is returned, in which keys are clusters and values are lists of `ChainEntry` or `MolEntry` objects.
        When no cluster information is available, all entries are aggregated in a key of value ``None``.
        Cluster ids are exclusive to :class:`~luna.MyBio.PDB.Residue.Residue` instances, therefore, `ChainEntry` objects
        are always placed in a key of value ``None``.

    Examples
    --------

    First, let's parse a PDB file.

    >>> from luna.MyBio.PDB.PDBParser import PDBParser
    >>> pdb_parser = PDBParser(PERMISSIVE=True, QUIET=True)
    >>> structure = pdb_parser.get_structure("Protein", "tutorial/inputs/3QQK.pdb")

    Now, we can recover entries from the parsed PDB file:

    >>> from luna.mol.entry import recover_entries_from_entity
    >>> entries = recover_entries_from_entity(structure, get_chains=True)
    >>> for e in entries:
    >>>     print(e)
    <MolEntry: Protein:A:X02:497>
    <ChainEntry: Protein:A>

    """

    clusters = defaultdict(list)
    entries = []

    if entity.level == "S":
        if get_small_molecules:
            residues = entity[0].get_residues()
        if get_chains:
            chains = entity[0].get_chains()

    elif entity.level == "M":
        if get_small_molecules:
            residues = entity.get_residues()
        if get_chains:
            chains = entity.get_chains()
    else:
        if get_small_molecules:
            # If the entity is already a Chain, get_parent_by_level() returns the same object.
            # But, if the entity is a Residue or Atom, it will return its corresponding chain parent.
            residues = entity.get_parent_by_level("C").get_residues()
        if get_chains:
            chains = entity.get_parent_by_level("M").get_chains()

    if get_small_molecules:
        pdb_id = entity.get_parent_by_level("S").id
        for res in residues:
            if res.is_hetatm():
                if ignore_artifacts and res.resname in ARTIFACTS_LIST:
                    continue

                comp_num_and_icode = ""
                if isinstance(res.id[1], int):
                    comp_num_and_icode = str(res.id[1])
                comp_num_and_icode += str(res.id[2]) if res.id[2].strip() else ""

                entry = MolEntry.from_string(sep.join([pdb_id, res.parent.id,
                                                       res.resname, comp_num_and_icode]))
                if by_cluster:
                    clusters[res.cluster_id].append(entry)
                else:
                    entries.append(entry)

    if get_chains:
        pdb_id = entity.get_parent_by_level("S").id
        for chain in chains:
            entry = ChainEntry.from_string(sep.join([pdb_id, chain.id]))
            if by_cluster:
                clusters[None].append(entry)
            else:
                entries.append(entry)

    if by_cluster:
        return clusters
    return entries