from os import remove
import math
import glob
import subprocess
import os.path
import warnings
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Alphabet import generic_protein
from luna.MyBio.util import parse_from_file, save_to_file
from luna.util.exceptions import InvalidSuperpositionFileError
from luna.util.file import is_directory_valid
import logging
logger = logging.getLogger()
TMALIGN = "/bin/tmalign"
[docs]class TMAlignment(MultipleSeqAlignment):
""" Store TM-align results, including sequence alignment and TM-score.
Parameters
----------
score : float
TM-score.
records : iterable of :class:`Bio.SeqRecord.SeqRecord`
Same length protein sequences. This may be an empty list.
**kwargs : dict, optional
Extra arguments to `TMAlignment`.
Refer to :class:`Bio.Align.MultipleSeqAlignment` documentation for a
list of all possible arguments.
"""
def __init__(self, score, records, **kwargs):
self.score = score
super().__init__(records, **kwargs)
[docs]def align_structures(pdb_to_align, ref_pdb, output_path=None, tmalign=None):
"""Align two PDB files with TM-align.
.. warning::
TM-align performs pair-wise structural alignments.
Therefore, if your PDB file contains multiple structures, it is recommended
you extract the chains first and align them separately, otherwise the alignment
may not produce the expected results.
To extract chains you can use :class:`~luna.MyBio.extractor.Extractor`.
Parameters
----------
pdb_to_align : str
The PDB structure that will be aligned to ``ref_pdb``.
ref_pdb : str
Reference PDB file, i.e., align ``pdb_to_align`` to ``ref_pdb``.
output_path : str
Where to save TM-align output structures.
tmalign : str
Pathname to TM-align binary. The default value is '/bin/tmalign'.
Returns
-------
: `TMAlignment`
Results of TM-align, including sequence alignment and TM-score.
Examples
--------
>>> from luna.util.default_values import LUNA_PATH
>>> from luna.align.tmalign import align_structures
>>> pdb1 = "{LUNA_PATH}/tutorial/inputs/3QQF.pdb"
>>> pdb2 = "{LUNA_PATH}/tutorial/inputs/3QQK.pdb"
>>> alignment = align_structures(pdb1, pdb2, "./", tmalign="/media/data/Workspace/Softwares/TMalignc/TMalign")
>>> print(alignment.score)
0.98582
"""
tmalign = tmalign or TMALIGN
logger.info("TM-align will try to align the files %s and %s." % (pdb_to_align, ref_pdb))
output = _run_tmalign(pdb_to_align, ref_pdb, output_path, tmalign)
seq_pair, tm_score = get_seq_records(output, pdb_to_align, ref_pdb)
alignment = TMAlignment(tm_score, records=seq_pair, alphabet=generic_protein)
logger.info("TM-align finished successfully. %s." % alignment[0].description)
return alignment
def _run_tmalign(file1, file2, output_path, tmalign):
logger.debug("It will try to execute the command: '%s %s %s'." % (tmalign, file1, file2))
for fname in (file1, file2):
if not os.path.isfile(fname):
logging.error("Missing file: %s", fname)
raise FileNotFoundError("Missing file: %s", fname)
try:
if output_path is not None and output_path.strip() != "":
if is_directory_valid(output_path):
logger.debug("The superposition files will be saved "
"at the directory '%s'" % output_path)
filename = os.path.split(os.path.basename(file1))[1]
output_file = "%s/%s.sup" % (output_path, filename)
args = [tmalign, file1, file2, "-o", output_file]
else:
args = [tmalign, file1, file2]
output = subprocess.check_output(args)
except subprocess.CalledProcessError as e:
logger.exception(e)
raise RuntimeError("TMalign failed for PDB files: %s and %s" % (file1, file2))
return output.decode()
[docs]def get_seq_records(tm_output, aligned_pdb_id, ref_pdb_id):
"""Create a pair of :class:`Bio.SeqRecord.SeqRecord` from a TM-align output.
Parameters
----------
tm_output : str
The output produced by TM-align.
aligned_pdb_id : str
An identifier for the aligned PDB file.
ref_pdb_id : str
An identifier for the reference PDB file.
"""
logger.debug("Parsing the TMalign output.")
lines = tm_output.splitlines()
# Extract the TM-score (measure of structure similarity)
# Take the mean of the (two) given TM-scores -- not sure which is reference
tm_scores = []
for line in lines:
if line.startswith('TM-score'):
# TMalign v. 2012/05/07 or earlier
tm_scores.append(float(line.split(None, 2)[1]))
elif 'TM-score=' in line:
# TMalign v. 2013/05/11 or so
tokens = line.split()
for token in tokens:
if token.startswith('TM-score='):
_key, _val = token.split('=')
tm_scores.append(float(_val.rstrip(',')))
break
tm_score = math.fsum(tm_scores) / len(tm_scores)
# Extract the sequence alignment
last_lines = lines[-7:]
assert last_lines[0].startswith('(":"') # (":" denotes the residues pairs
assert last_lines[-1].startswith('Total running time is')
aligned_seq, ref_seq = last_lines[1].strip(), last_lines[3].strip()
return (SeqRecord(Seq(aligned_seq), id=aligned_pdb_id, description="TM-score=%f" % tm_score),
SeqRecord(Seq(ref_seq), id=ref_pdb_id, description="TM-score=%f" % tm_score)), tm_score
[docs]def remove_sup_files(path):
""" Remove all superposition files created by TM-align at a defined directory ``path``."""
try:
if (is_directory_valid(path)):
targets = ['*.sup', '*.sup_atm', '*.sup_all', '*.sup_all_atm', '*.sup_all_atm_lig']
for target in targets:
files = glob.glob('%s/%s' % (path, target))
for file in files:
try:
remove(file)
except Exception as e:
logger.exception(e)
logger.warning("File %s not removed." % file)
except Exception as e:
logger.exception(e)
raise