Source code for luna.align.tmalign

from os import remove
import math
import glob
import subprocess
import os.path
import warnings

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Alphabet import generic_protein

from luna.MyBio.util import parse_from_file, save_to_file
from luna.util.exceptions import InvalidSuperpositionFileError
from luna.util.file import is_directory_valid


import logging
logger = logging.getLogger()


TMALIGN = "/bin/tmalign"


[docs]class TMAlignment(MultipleSeqAlignment):

    """ Store TM-align results, including sequence alignment and TM-score.

    Parameters
    ----------
    score : float
        TM-score.
    records : iterable of :class:`Bio.SeqRecord.SeqRecord`
        Same length protein sequences. This may be an empty list.
    **kwargs : dict, optional
        Extra arguments to `TMAlignment`.
        Refer to :class:`Bio.Align.MultipleSeqAlignment` documentation for a
        list of all possible arguments.
    """

    def __init__(self, score, records, **kwargs):
        self.score = score
        super().__init__(records, **kwargs)


[docs]def align_structures(pdb_to_align, ref_pdb, output_path=None, tmalign=None):
    """Align two PDB files with TM-align.

    .. warning::

        TM-align performs pair-wise structural alignments.
        Therefore, if your PDB file contains multiple structures, it is recommended
        you extract the chains first and align them separately, otherwise the alignment
        may not produce the expected results.

        To extract chains you can use :class:`~luna.MyBio.extractor.Extractor`.

    Parameters
    ----------
    pdb_to_align : str
        The PDB structure that will be aligned to ``ref_pdb``.
    ref_pdb : str
        Reference PDB file, i.e., align ``pdb_to_align`` to ``ref_pdb``.
    output_path : str
        Where to save TM-align output structures.
    tmalign : str
        Pathname to TM-align binary. The default value is '/bin/tmalign'.

    Returns
    -------
     : `TMAlignment`
     Results of TM-align, including sequence alignment and TM-score.

    Examples
    --------

    >>> from luna.util.default_values import LUNA_PATH
    >>> from luna.align.tmalign import align_structures
    >>> pdb1 = "{LUNA_PATH}/tutorial/inputs/3QQF.pdb"
    >>> pdb2 = "{LUNA_PATH}/tutorial/inputs/3QQK.pdb"
    >>> alignment = align_structures(pdb1, pdb2, "./", tmalign="/media/data/Workspace/Softwares/TMalignc/TMalign")
    >>> print(alignment.score)
    0.98582
    """

    tmalign = tmalign or TMALIGN

    logger.info("TM-align will try to align the files %s and %s." % (pdb_to_align, ref_pdb))

    output = _run_tmalign(pdb_to_align, ref_pdb, output_path, tmalign)

    seq_pair, tm_score = get_seq_records(output, pdb_to_align, ref_pdb)

    alignment = TMAlignment(tm_score, records=seq_pair, alphabet=generic_protein)

    logger.info("TM-align finished successfully. %s." % alignment[0].description)

    return alignment


def _run_tmalign(file1, file2, output_path, tmalign):

    logger.debug("It will try to execute the command: '%s %s %s'." % (tmalign, file1, file2))

    for fname in (file1, file2):
        if not os.path.isfile(fname):
            logging.error("Missing file: %s", fname)
            raise FileNotFoundError("Missing file: %s", fname)

    try:
        if output_path is not None and output_path.strip() != "":
            if is_directory_valid(output_path):
                logger.debug("The superposition files will be saved "
                             "at the directory '%s'" % output_path)

                filename = os.path.split(os.path.basename(file1))[1]
                output_file = "%s/%s.sup" % (output_path, filename)
                args = [tmalign, file1, file2, "-o", output_file]
        else:
            args = [tmalign, file1, file2]

        output = subprocess.check_output(args)
    except subprocess.CalledProcessError as e:
        logger.exception(e)

        raise RuntimeError("TMalign failed for PDB files: %s and %s" % (file1, file2))

    return output.decode()


[docs]def get_seq_records(tm_output, aligned_pdb_id, ref_pdb_id):
    """Create a pair of :class:`Bio.SeqRecord.SeqRecord` from a TM-align output.

    Parameters
    ----------
    tm_output : str
        The output produced by TM-align.
    aligned_pdb_id : str
        An identifier for the aligned PDB file.
    ref_pdb_id : str
        An identifier for the reference PDB file.
    """
    logger.debug("Parsing the TMalign output.")

    lines = tm_output.splitlines()

    # Extract the TM-score (measure of structure similarity)
    # Take the mean of the (two) given TM-scores -- not sure which is reference
    tm_scores = []
    for line in lines:
        if line.startswith('TM-score'):
            # TMalign v. 2012/05/07 or earlier
            tm_scores.append(float(line.split(None, 2)[1]))
        elif 'TM-score=' in line:
            # TMalign v. 2013/05/11 or so
            tokens = line.split()
            for token in tokens:
                if token.startswith('TM-score='):
                    _key, _val = token.split('=')
                    tm_scores.append(float(_val.rstrip(',')))
                    break

    tm_score = math.fsum(tm_scores) / len(tm_scores)
    # Extract the sequence alignment
    last_lines = lines[-7:]

    assert last_lines[0].startswith('(":"')  # (":" denotes the residues pairs
    assert last_lines[-1].startswith('Total running time is')

    aligned_seq, ref_seq = last_lines[1].strip(), last_lines[3].strip()

    return (SeqRecord(Seq(aligned_seq), id=aligned_pdb_id, description="TM-score=%f" % tm_score),
            SeqRecord(Seq(ref_seq), id=ref_pdb_id, description="TM-score=%f" % tm_score)), tm_score


[docs]def extract_chain_from_sup(sup_file, extract_chain, output_file, new_chain_id=None, QUIET=True):
    """ Extract a chain from the superposition file generated by TM-align.

        .. warning::
            TM-align modifies the id of the original chains, so it is highly
            recommended to rename it to match the original chain id.
            To do so, use the parameter ``new_chain_id``.

        Parameters
        ----------
        sup_file: str
            A superposition file generated by TM-align.
        extract_chain: {'A', 'B'}
            Target chain id to be extracted. TM-align performs pair-wise structural alignment,
            so the output file will always contain two chains 'A' and 'B',
            where 'A' and 'B' are the aligned and reference structures, respectively.
        output_file: str
            Save the extracted chain to this file.
        new_chain_id: str, optional
            The new chain id of the extracted chain.
            If not provided, the chain ids will be maintained as it is in the TM-align output.
        QUIET: bool
            If True (the default), mute warning messages generated by Biopython.
    """
    if QUIET:
        try:
            warnings.filterwarnings("ignore")
            logger.debug("Quiet mode activated. From now on, no warning will be printed.")
        except Exception:
            logger.warning("Quiet mode could not be activated.")

    if extract_chain not in ["A", "B"]:
        raise ValueError("Valid values for 'extract_chain' are 'A' and 'B'.")

    try:
        logger.debug("Trying to parse the file '%s'." % sup_file)

        structure = parse_from_file("SUP", sup_file)

        model = structure[0]
        if (len(model.child_list) != 2):
            raise InvalidSuperpositionFileError("This structure has %d chains. The file generated "
                                                "by TM-align must have 2 chains." % len(model.child_list))

        chain_to_remove = 'B' if extract_chain == "A" else "A"
        model.detach_child(chain_to_remove)

        if new_chain_id is not None and extract_chain != new_chain_id:
            model[extract_chain].id = new_chain_id
            logger.debug("Modifications completed.")

        save_to_file(structure, output_file)

        logger.debug("File '%s' created successfully." % output_file)
    except Exception as e:
        logger.exception(e)
        raise


[docs]def remove_sup_files(path):
    """ Remove all superposition files created by TM-align at a defined directory ``path``."""
    try:
        if (is_directory_valid(path)):
            targets = ['*.sup', '*.sup_atm', '*.sup_all', '*.sup_all_atm', '*.sup_all_atm_lig']

            for target in targets:
                files = glob.glob('%s/%s' % (path, target))
                for file in files:
                    try:
                        remove(file)
                    except Exception as e:
                        logger.exception(e)
                        logger.warning("File %s not removed." % file)
    except Exception as e:
        logger.exception(e)
        raise