Source code for luna.mol.clustering

import re
import logging

from rdkit import DataStructs
from rdkit.ML.Cluster import Butina

from luna.util.exceptions import IllegalArgumentError

# Logger shared by the functions in this module. ``logging.getLogger()``
# called without a name returns the root logger.
logger = logging.getLogger()


def available_similarity_functions():
    """List the bulk similarity functions exposed by RDKit.

    The names are collected by scanning the attributes of
    :mod:`rdkit.DataStructs` for those that begin with the pattern
    ``Bulk<Name>Similarity``, where ``<Name>`` is one or more ASCII letters.

    Returns
    -------
    names : list of str
        Matching attribute names, in the order produced by ``dir()``.
    """
    pattern = re.compile("Bulk([a-zA-Z]+)Similarity", flags=0)
    return [name for name in dir(DataStructs) if pattern.match(name)]


def calc_distance_matrix(fps, similarity_func="BulkTanimotoSimilarity"):
    """Compute pairwise distances (``1 - similarity``) between fingerprints.

    Each fingerprint at position ``i`` (starting from the second one) is
    compared against all fingerprints that precede it, and the resulting
    similarities are converted to distances.

    Parameters
    ----------
    fps : sequence of RDKit \
            :class:`~rdkit.DataStructs.cDataStructs.ExplicitBitVect` or \
            :class:`~rdkit.DataStructs.cDataStructs.SparseBitVect`
        The fingerprints to compare. The object must support ``len()``,
        indexing, and slicing.
    similarity_func : str
        Name of the RDKit bulk similarity function used to compare the
        fingerprints. The default value is 'BulkTanimotoSimilarity'.
        Call :py:meth:`available_similarity_functions` to list the
        accepted names.

    Returns
    -------
    distances : list of float
        The lower triangle of the distance matrix (diagonal excluded),
        flattened row by row: ``d(1,0), d(2,0), d(2,1), d(3,0), ...``.

    Raises
    ------
    IllegalArgumentError
        If ``similarity_func`` is not one of the names returned by
        :py:meth:`available_similarity_functions`.

    Examples
    --------

    First, let's define a set of molecules.

    >>> from luna.wrappers.base import MolWrapper
    >>> mols = [MolWrapper.from_smiles("CCCCCC").unwrap(),
    ...         MolWrapper.from_smiles("CCCCCCCC").unwrap(),
    ...         MolWrapper.from_smiles("CCCCCCCCO").unwrap()]

    Now, we generate fingerprints for those molecules.

    >>> from luna.mol.fingerprint import generate_fp_for_mols
    >>> fps = [d["fp"] for d in generate_fp_for_mols(mols, "morgan_fp")]

    Finally, calculate the distance between the molecules based on
    their fingerprints.

    >>> from luna.mol.clustering import calc_distance_matrix
    >>> print(calc_distance_matrix(fps))
    [0.125, 0.46153846153846156, 0.3846153846153846]
    """
    if similarity_func not in available_similarity_functions():
        raise IllegalArgumentError("Similarity function not available.")

    bulk_similarity = getattr(DataStructs, similarity_func)

    # The Tversky function receives two extra positional arguments (0 and 1);
    # presumably its alpha/beta weights -- confirm against the RDKit docs.
    if similarity_func == "BulkTverskySimilarity":
        extra_args = (0, 1)
    else:
        extra_args = ()

    distances = []
    for idx in range(1, len(fps)):
        similarities = bulk_similarity(fps[idx], fps[:idx], *extra_args)
        distances.extend(1 - sim for sim in similarities)
    return distances


def cluster_fps(fps, cutoff=0.2, similarity_func="BulkTanimotoSimilarity"):
    """Clusterize molecules based on fingerprints using the Butina
    clustering algorithm.

    Parameters
    ----------
    fps : sequence of RDKit \
            :class:`~rdkit.DataStructs.cDataStructs.ExplicitBitVect` or \
            :class:`~rdkit.DataStructs.cDataStructs.SparseBitVect`
        The fingerprints to cluster. The object must support ``len()``.
    cutoff : float
        Elements within this range of each other are considered
        to be neighbors.
    similarity_func : str
        Name of the similarity metric used to calculate the distance between
        the provided fingerprints. The default value is
        'BulkTanimotoSimilarity'. To check out the list of available
        similarity metrics, call the function
        :py:meth:`available_similarity_functions`.

    Returns
    -------
    clusters : tuple of tuples
        One inner tuple per cluster. In the example below each inner tuple
        holds positions in ``fps``; the first element of each cluster is
        its centroid.

    Raises
    ------
    IllegalArgumentError
        Propagated from :py:meth:`calc_distance_matrix` when
        ``similarity_func`` is not an available similarity metric.

    Examples
    --------

    First, let's define a set of molecules.

    >>> from luna.wrappers.base import MolWrapper
    >>> mols = [MolWrapper.from_smiles("CCCCCC").unwrap(),
    ...         MolWrapper.from_smiles("CCCCCCCC").unwrap(),
    ...         MolWrapper.from_smiles("CCCCCCCCO").unwrap()]

    Now, we generate fingerprints for those molecules.

    >>> from luna.mol.fingerprint import generate_fp_for_mols
    >>> fps = [d["fp"] for d in generate_fp_for_mols(mols, "morgan_fp")]

    Finally, clusterize the molecules based on their fingerprints.

    >>> from luna.mol.clustering import cluster_fps
    >>> print(cluster_fps(fps, cutoff=0.2))
    ((1, 0), (2,))
    """
    # Pass the values as logging arguments so that the messages are only
    # formatted when DEBUG records are actually emitted.
    logger.debug("Trying to clusterize %d molecules.", len(fps))
    logger.debug("Defined cutoff: %.2f. Defined similarity function: %s.",
                 cutoff, similarity_func)

    # First generate the (flattened) distance matrix.
    dists = calc_distance_matrix(fps, similarity_func)
    logger.debug("Distance matrix created.")

    # Now cluster the data; isDistData=True tells Butina that ``dists``
    # already holds distances rather than raw data points.
    clusters = Butina.ClusterData(dists, len(fps), cutoff, isDistData=True)
    logger.debug("Number of cluster(s) created: %d.", len(clusters))

    return clusters