import re
import logging
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
from luna.util.exceptions import IllegalArgumentError
logger = logging.getLogger()
[docs]def available_similarity_functions():
"""Return a list of all similarity metrics available at RDKit."""
regex = re.compile("Bulk([a-zA-Z]+)Similarity", flags=0)
return list(filter(regex.match, dir(DataStructs)))
[docs]def calc_distance_matrix(fps, similarity_func="BulkTanimotoSimilarity"):
"""Calculate the pairwise distance (dissimilarity) between fingerprints in ``fps`` using
the similarity metric ``similarity_func``.
Parameters
----------
fps : iterable of RDKit :class:`~rdkit.DataStructs.cDataStructs.ExplicitBitVect` or :class:`~rdkit.DataStructs.cDataStructs.SparseBitVect`
A sequence of fingerprints.
similarity_func : str
A similarity metric to calculate the distance between the provided fingerprints. The default value is 'BulkTanimotoSimilarity'.
To check out the list of available similarity metrics, call the function :py:meth:`available_similarity_functions`.
Examples
--------
First, let's define a set of molecules.
>>> from luna.wrappers.base import MolWrapper
>>> mols = [MolWrapper.from_smiles("CCCCCC").unwrap(),
... MolWrapper.from_smiles("CCCCCCCC").unwrap(),
... MolWrapper.from_smiles("CCCCCCCCO").unwrap()]
Now, we generate fingerprints for those molecules.
>>> from luna.mol.fingerprint import generate_fp_for_mols
>>> fps = [d["fp"] for d in generate_fp_for_mols(mols, "morgan_fp")]
Finally, calculate the distance between the molecules based on their fingerprints.
>>> from luna.mol.clustering import calc_distance_matrix
>>> print(calc_distance_matrix(fps))
[0.125, 0.46153846153846156, 0.3846153846153846]
Returns
-------
distances : list of float
Flattened diagonal matrix.
"""
funcs = available_similarity_functions()
if similarity_func not in funcs:
raise IllegalArgumentError("Similarity function not available.")
dists = []
for i in range(1, len(fps)):
if (similarity_func == "BulkTverskySimilarity"):
params = [fps[i], fps[:i], 0, 1]
else:
params = [fps[i], fps[:i]]
sims = getattr(DataStructs, similarity_func)(*params)
dists.extend([1 - x for x in sims])
return dists
[docs]def cluster_fps(fps, cutoff=0.2, similarity_func="BulkTanimotoSimilarity"):
"""Clusterize molecules based on fingerprints using the Butina clustering algorithm.
Parameters
----------
fps : iterable of RDKit :class:`~rdkit.DataStructs.cDataStructs.ExplicitBitVect` or :class:`~rdkit.DataStructs.cDataStructs.SparseBitVect`
A sequence of fingerprints.
cutoff : float
Elements within this range of each other are considered to be neighbors.
similarity_func : str
A similarity metric to calculate the distance between the provided fingerprints. The default value is 'BulkTanimotoSimilarity'.
To check out the list of available similarity metrics, call the function :py:meth:`available_similarity_functions`.
Examples
--------
First, let's define a set of molecules.
>>> from luna.wrappers.base import MolWrapper
>>> mols = [MolWrapper.from_smiles("CCCCCC").unwrap(),
... MolWrapper.from_smiles("CCCCCCCC").unwrap(),
... MolWrapper.from_smiles("CCCCCCCCO").unwrap()]
Now, we generate fingerprints for those molecules.
>>> from luna.mol.fingerprint import generate_fp_for_mols
>>> fps = [d["fp"] for d in generate_fp_for_mols(mols, "morgan_fp")]
Finally, clusterize the molecules based on their fingerprints.
>>> from luna.mol.clustering import cluster_fps
>>> print(cluster_fps(fps, cutoff=0.2))
((1, 0), (2,))
Returns
-------
clusters : tuple of tuples
Each cluster is defined as a tuple of tuples, where the first element for each cluster is its centroid.
"""
logger.debug("Trying to clusterize %d molecules." % len(fps))
logger.debug("Defined cutoff: %.2f. Defined similarity function: %s." % (cutoff, similarity_func))
# first generate the distance matrix.
dists = calc_distance_matrix(fps, similarity_func)
logger.debug("Distance matrix created.")
# now cluster the data.
cs = Butina.ClusterData(dists, len(fps), cutoff, isDistData=True)
logger.debug("Number of cluster(s) created: %d." % len(cs))
return cs