Source code for luna.interaction.fp.fingerprint

import numpy as np
from rdkit.DataStructs.cDataStructs import ExplicitBitVect, SparseBitVect
from scipy.sparse import issparse, csr_matrix
from collections import defaultdict
from rdkit import DataStructs

from luna.util.exceptions import (BitsValueError, InvalidFingerprintType, IllegalArgumentError, FingerprintCountsError)
from luna.version import __version__


import logging

# No name is passed to getLogger(), so this is the root logger.
logger = logging.getLogger()

# Default ``fp_length`` (total number of bits) of the fingerprint classes below.
DEFAULT_FP_LENGTH = 2**32
# Default ``new_length`` used by the ``fold()`` methods below.
DEFAULT_FOLDED_FP_LENGTH = 4096
# Default ``dtype`` of the vectors produced by ``to_vector()``.
DEFAULT_FP_DTYPE = np.int32


class Fingerprint:
    """A fingerprint that stores indices of "on" bits.

    Parameters
    ----------
    indices : array_like of int
        Indices of "on" bits.
    fp_length : int
        The fingerprint length (total number of bits).
        The default value is :math:`2^{32}`.
    unfolded_fp : `Fingerprint` or None
        The unfolded version of this fingerprint.
        If None, this fingerprint may have not been folded yet.
    unfolding_map : dict, optional
        A mapping between current indices and indices from the unfolded
        version of this fingerprint what makes it possible to trace folded
        bits back to the original shells (features).
    props: dict, optional
        Custom properties of the fingerprint, consisting of a string keyword
        and some value. It can be used, for instance, to save the ligand name
        and parameters used to generate shells (IFP features).

    Raises
    ------
    BitsValueError
        If any index is negative or not smaller than ``fp_length``.
    """

    def __init__(self, indices, fp_length=DEFAULT_FP_LENGTH, unfolded_fp=None,
                 unfolding_map=None, props=None):
        # An explicit 64-bit dtype is used instead of ``np.long``: that alias
        # is not available in every NumPy release and, where it maps to a C
        # long, may be too narrow for indices close to the default 2**32.
        indices = np.asarray(indices, dtype=np.int64)

        if np.any(np.logical_or(indices < 0, indices >= fp_length)):
            logger.error("Provided indices are in a different bit scale.")
            raise BitsValueError("Provided indices are in a different bit scale.")

        # np.unique() both removes duplicates and sorts the indices.
        self._indices = np.unique(indices)
        self._fp_length = fp_length
        self._unfolded_fp = unfolded_fp
        self._unfolding_map = unfolding_map or {}
        self._props = props or {}

        self.version = __version__

    @classmethod
    def from_indices(cls, indices, fp_length=DEFAULT_FP_LENGTH, **kwargs):
        """Initialize from an array of indices.

        Parameters
        ----------
        indices : array_like of int
            Indices of "on" bits.
        fp_length : int
            The fingerprint length (total number of bits).
            The default value is :math:`2^{32}`.
        **kwargs : dict, optional
            Extra arguments to `Fingerprint`. Refer to the documentation for a
            list of all possible arguments.

        Returns
        -------
         : `Fingerprint`

        Examples
        --------
        >>> from luna.interaction.fp.fingerprint import Fingerprint
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> on_bits = 8
        >>> fp_length = 32
        >>> indices = np.random.randint(0, fp_length, on_bits)
        >>> print(indices)
        [12 15 21  0  3 27  3  7]
        >>> fp = Fingerprint.from_indices(indices, fp_length=fp_length)
        >>> print(fp.indices)
        [ 0  3  7 12 15 21 27]
        >>> print(fp.to_vector(compressed=False))
        [1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0]
        """
        return cls(indices, fp_length, **kwargs)

    @classmethod
    def from_vector(cls, vector, fp_length=None, **kwargs):
        """Initialize from a vector.

        Parameters
        ----------
        vector : :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Array of bits.
        fp_length : int, optional
            The fingerprint length (total number of bits).
            If not provided, the fingerprint length will be defined based on
            the ``vector`` shape.
        **kwargs : dict, optional
            Extra arguments to `Fingerprint`. Refer to the documentation for a
            list of all possible arguments.

        Returns
        -------
         : `Fingerprint`

        Examples
        --------
        >>> from luna.interaction.fp.fingerprint import Fingerprint
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> fp_length = 32
        >>> vector = np.random.choice([0, 1], size=(fp_length,), p=[0.8, 0.2])
        >>> print(vector)
        [0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0]
        >>> fp = Fingerprint.from_vector(vector)
        >>> print(fp.indices)
        [ 7  8 13 17 19 20 27]
        >>> print(fp.fp_length)
        32
        """
        if not issparse(vector):
            # Accept any array_like (e.g. a plain list), not only ndarrays.
            vector = np.asarray(vector)

        if fp_length is None:
            # Row vectors of shape (1, N) use the second axis; flat vectors
            # of shape (N,) fall back to the first one.
            try:
                fp_length = vector.shape[1]
            except IndexError:
                fp_length = vector.shape[0]

        if issparse(vector):
            indices = vector.indices.astype(np.int64)
        else:
            # flatnonzero() yields the positions in the flattened vector, so
            # a dense (1, N) row vector produces column indices only.
            indices = np.flatnonzero(vector).astype(np.int64)

        return cls.from_indices(indices, fp_length, **kwargs)

    @classmethod
    def from_bit_string(cls, bit_string, fp_length=None, **kwargs):
        """Initialize from a bit string (e.g. '0010100110').

        Parameters
        ----------
        bit_string : str
            String of 0s and 1s.
        fp_length : int, optional
            The fingerprint length (total number of bits).
            If not provided, the fingerprint length will be defined based on
            the string length.
        **kwargs : dict, optional
            Extra arguments to `Fingerprint`. Refer to the documentation for a
            list of all possible arguments.

        Returns
        -------
         : `Fingerprint`

        Examples
        --------
        >>> from luna.interaction.fp.fingerprint import Fingerprint
        >>> fp = Fingerprint.from_bit_string("0010100110000010")
        >>> print(fp.indices)
        [ 2  4  7  8 14]
        >>> print(fp.fp_length)
        16
        """
        # Every character other than '0' is treated as an "on" bit.
        indices = [i for i, char in enumerate(bit_string) if char != '0']
        if fp_length is None:
            fp_length = len(bit_string)

        return cls.from_indices(indices, fp_length, **kwargs)

    @classmethod
    def from_rdkit(cls, rdkit_fp, **kwargs):
        """Initialize from an RDKit fingerprint.

        Parameters
        ----------
        rdkit_fp : :class:`~rdkit.DataStructs.cDataStructs.ExplicitBitVect` or \
                   :class:`~rdkit.DataStructs.cDataStructs.SparseBitVect`
            An existing RDKit fingerprint.
        **kwargs : dict, optional
            Extra arguments to `Fingerprint`. Refer to the documentation for a
            list of all possible arguments.

        Returns
        -------
         : `Fingerprint`

        Raises
        ------
        TypeError
            If ``rdkit_fp`` is neither an ExplicitBitVect nor a SparseBitVect.
        """
        if not isinstance(rdkit_fp, (ExplicitBitVect, SparseBitVect)):
            logger.error("Invalid fingerprint type. RDKit only accepts a SparseBitVect or ExplicitBitVect object.")
            raise TypeError("Invalid fingerprint type. RDKit only accepts a SparseBitVect or ExplicitBitVect object.")

        fp_length = rdkit_fp.GetNumBits()
        indices = np.asarray(rdkit_fp.GetOnBits(), dtype=np.int64)

        return cls.from_indices(indices, fp_length, **kwargs)

    @classmethod
    def from_fingerprint(cls, fp, **kwargs):
        """Initialize from an existing fingerprint.

        Parameters
        ----------
        fp : `Fingerprint`
            An existing fingerprint.
        **kwargs : dict, optional
            Extra arguments to `Fingerprint`. Refer to the documentation for a
            list of all possible arguments. Values given here take precedence
            over the ones copied from ``fp``.

        Returns
        -------
         : `Fingerprint`

        Raises
        ------
        InvalidFingerprintType
            If ``fp`` is not an instance of `Fingerprint`.
        """
        if not isinstance(fp, Fingerprint):
            error_msg = "Informed fingerprint is not an instance of %s." % Fingerprint.__name__
            logger.error(error_msg)
            raise InvalidFingerprintType(error_msg)

        # The private attribute is read directly: the ``unfolded_fp`` property
        # logs a warning for fingerprints that were never folded.
        parent_fp = fp._unfolded_fp
        if parent_fp is not None:
            parent_fp = fp.__class__.from_fingerprint(parent_fp)

        params = {
            "indices": fp.indices,
            "fp_length": fp.fp_length,
            "unfolded_fp": parent_fp,
            "unfolding_map": dict(fp.unfolding_map),
            "props": dict(fp.props),
        }
        params.update(kwargs)

        return cls.from_indices(**params)

    @property
    def indices(self):
        """array_like of int, read-only: Indices of "on" bits."""
        return self._indices

    @property
    def bit_count(self):
        """int, read-only: Number of "on" bits."""
        return self.indices.shape[0]

    @property
    def density(self):
        """float, read-only: Proportion of "on" bits in fingerprint."""
        return self.bit_count / self.fp_length

    @property
    def counts(self):
        """dict, read-only: Mapping between each index in ``indices`` to the
        number of counts, which is always 1 for bit fingerprints."""
        return {k: 1 for k in self.indices}

    @property
    def fp_length(self):
        """int, read-only: The fingerprint length (total number of bits)."""
        return self._fp_length

    @property
    def unfolded_fp(self):
        """`Fingerprint` or None, read-only: The unfolded version of this
        fingerprint. If None, this fingerprint may have not been folded yet."""
        if self._unfolded_fp is None:
            logger.warning("This fingerprint was not previously folded.")
            return None
        return self._unfolded_fp

    @property
    def unfolded_indices(self):
        """array_like of int or None, read-only: Indices of "on" bits in the
        unfolded fingerprint. None if this fingerprint was not folded."""
        # The unfolded fingerprint itself is what may be missing; the
        # unfolding map is always a dict after construction.
        if self._unfolded_fp is None:
            logger.warning("This fingerprint was not previously folded.")
            return None
        return self._unfolded_fp.indices

    @property
    def unfolding_map(self):
        """dict, read-only: The mapping between current indices and indices
        from the unfolded version of this fingerprint what makes it possible
        to trace folded bits back to the original shells (features)."""
        if self._unfolding_map is None:
            logger.warning("This fingerprint was not previously folded.")
            return None
        return self._unfolding_map

    @property
    def props(self):
        """dict, read-only: The custom properties of the fingerprint."""
        return self._props

    @property
    def name(self):
        """str: The property 'name'. If it was not provided, then return an
        empty string."""
        return self.props.get("name", "")

    @name.setter
    def name(self, name):
        self.props["name"] = str(name)

    @property
    def num_levels(self):
        """int: The property 'num_levels' used to generate this fingerprint
        (see :class:`~luna.interaction.fp.shell.ShellGenerator`).
        If it was not provided, then return None.
        Values assigned through this property are stored as strings."""
        return self.props.get("num_levels", None)

    @num_levels.setter
    def num_levels(self, num_levels):
        self.props["num_levels"] = str(num_levels)

    @property
    def radius_step(self):
        """float: The property 'radius_step' used to generate this fingerprint
        (see :class:`~luna.interaction.fp.shell.ShellGenerator`).
        If it was not provided, then return None.
        Values assigned through this property are stored as strings."""
        return self.props.get("radius_step", None)

    @radius_step.setter
    def radius_step(self, radius_step):
        self.props["radius_step"] = str(radius_step)

    @property
    def num_shells(self):
        """int: The property 'num_shells'
        (see :class:`~luna.interaction.fp.shell.ShellGenerator`).
        If it was not provided, then return None.
        Values assigned through this property are stored as strings."""
        return self.props.get("num_shells", None)

    @num_shells.setter
    def num_shells(self, num_shells):
        self.props["num_shells"] = str(num_shells)

    def get_prop(self, key):
        """Get value of the property ``key``.
        If not set, log a warning and return None."""
        try:
            return self.props[key]
        except KeyError:
            logger.warning("Key '%s' does not exist." % key)
            return None

    def set_prop(self, key, value):
        """Set value to the property ``key``."""
        self.props[key] = value

    def get_num_bits(self):
        """Get the fingerprint length (total number of bits)."""
        return self.fp_length

    def get_num_on_bits(self):
        """Get the number of "on" bits."""
        return self.bit_count

    def get_num_off_bits(self):
        """Get the number of "off" bits."""
        return self.get_num_bits() - self.get_num_on_bits()

    def get_bit(self, index):
        """Get the bit/count value at index ``index``.

        Raises
        ------
        BitsValueError
            If the provided index is in a different bit scale.
        """
        # ``counts`` may be rebuilt on every access, so fetch it only once.
        counts = self.counts
        if index in counts:
            return counts[index]
        if 0 <= index < self.fp_length:
            return 0

        logger.error("The provided index is in a different bit scale.")
        raise BitsValueError("The provided index is in a different bit scale.")

    def get_on_bits(self):
        """Get "on" bits.

        Returns
        -------
         : :class:`numpy.ndarray`
        """
        return np.array([k for (k, v) in self.counts.items() if v > 0])

    def to_vector(self, compressed=True, dtype=DEFAULT_FP_DTYPE):
        """Convert this fingerprint to a vector of bits/counts.

        .. warning::
            This function may raise a `MemoryError` exception when using huge
            indices vectors. If you found this issue, you may want to try a
            different data type or apply a folding operation before calling
            `to_vector`.

        Parameters
        -------
        compressed : bool
            If True, build a compressed sparse matrix
            (scipy.sparse.csr_matrix).
        dtype : data-type
            The default value is np.int32.

        Returns
        -------
         : :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Vector of bits/counts.
            Return a compressed sparse matrix (`scipy.sparse.csr_matrix`)
            if ``compressed`` is True. Otherwise, return a Numpy array
            (:class:`numpy.ndarray`)

        Raises
        ------
        BitsValueError
            If some of the fingerprint indices are greater than the
            fingerprint length.
        MemoryError
            If the operation ran out of memory.
        """
        counts = self.counts
        data = [counts[i] for i in self.indices]

        if compressed:
            try:
                # Single-row matrix: every stored value lives in row 0.
                row = np.zeros(self.bit_count, dtype=np.int64)
                col = self.indices
                vector = csr_matrix((data, (row, col)),
                                    shape=(1, self.fp_length), dtype=dtype)
            except ValueError as e:
                logger.exception(e)
                raise BitsValueError("Sparse matrix construction failed. Invalid indices or input data.") from e
        else:
            try:
                # This allocation causes a MemoryError when using a 2**32
                # vector.
                vector = np.zeros(self.fp_length, dtype=dtype)
            except MemoryError as e:
                logger.exception(e)
                raise MemoryError("Huge indices vector detected. An operation ran out of memory. "
                                  "Use a different data type or apply a folding operation.") from e

            try:
                vector[self.indices] = data
            except IndexError as e:
                logger.exception(e)
                raise BitsValueError("Some of the provided indices are greater than the fingerprint length.") from e

        return vector

    def to_bit_vector(self, compressed=True):
        """Convert this fingerprint to a vector of bits.

        .. warning::
            This function may raise a `MemoryError` exception when using huge
            indices vectors. If you found this issue, you may want to try a
            different data type or apply a folding operation before calling
            `to_bit_vector`.

        Parameters
        -------
        compressed : bool
            If True, build a compressed sparse matrix
            (scipy.sparse.csr_matrix).

        Returns
        -------
         : :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Vector of bits/counts.
            Return a compressed sparse matrix (`scipy.sparse.csr_matrix`)
            if ``compressed`` is True. Otherwise, return a Numpy array
            (:class:`numpy.ndarray`)

        Raises
        ------
        BitsValueError
            If some of the fingerprint indices are greater than the
            fingerprint length.
        MemoryError
            If the operation ran out of memory.
        """
        # Going through bool collapses any count greater than 1 to 1.
        return self.to_vector(compressed=compressed, dtype=np.bool_).astype(np.int8)

    def to_bit_string(self):
        """Convert this fingerprint to a string of bits.

        .. warning::
            This function may raise a `MemoryError` exception when using huge
            indices vectors. If you found this issue, you may want to try a
            different data type or apply a folding operation before calling
            `to_bit_string`.

        Returns
        -------
         : str

        Raises
        ------
        MemoryError
            If the operation ran out of memory.
        """
        try:
            # This call causes a MemoryError when using a 2**32 vector.
            bit_vector = self.to_bit_vector(compressed=False).astype(np.int8)
            return "".join(map(str, bit_vector))
        except MemoryError as e:
            logger.exception(e)
            raise MemoryError("Huge indices vector detected. An operation ran out of memory. "
                              "Use a different data type or apply a folding operation.") from e

    def to_rdkit(self, rdkit_fp_cls=None):
        """Convert this fingerprint to an RDKit fingerprint.

        .. note::
            If the fingerprint length exceeds the maximum RDKit fingerprint
            length (:math:`2^{31} - 1`), this fingerprint will be folded to
            length :math:`2^{31} - 1` before conversion.

        Parameters
        ----------
        rdkit_fp_cls : class, optional
            The RDKit bit vector class to instantiate. If None, it is chosen
            from ``fp_length`` as described in Returns.

        Returns
        -------
         : :class:`~rdkit.DataStructs.cDataStructs.ExplicitBitVect` or \
           :class:`~rdkit.DataStructs.cDataStructs.SparseBitVect`
            If ``fp_length`` is less than :math:`1e5`,
            :class:`~rdkit.DataStructs.cDataStructs.ExplicitBitVect` is used.
            Otherwise,
            :class:`~rdkit.DataStructs.cDataStructs.SparseBitVect` is used.
        """
        if rdkit_fp_cls is None:
            # Classes to store explicit bit vectors: ExplicitBitVect or
            # SparseBitVect. ExplicitBitVect is most useful for situations
            # where the size of the vector is relatively small (tens of
            # thousands or smaller). For larger vectors, use the
            # SparseBitVect class instead.
            if self.fp_length < 1e5:
                rdkit_fp_cls = ExplicitBitVect
            else:
                rdkit_fp_cls = SparseBitVect

        # RDKit data structure defines fingerprints as a std:set composed of
        # ints (signed int). Since we always have values higher than 0 and
        # since the data structure contains only signed ints, then the max
        # length for a RDKit fingerprint is 2^31 - 1.
        # C signed int (32 bit) ranges: [-2^31, 2^31-1].
        max_rdkit_fp_length = 2**31 - 1
        fp_length = self.fp_length
        if max_rdkit_fp_length < fp_length:
            logger.warning("The current fingerprint will be folded as its size is higher than the maximum "
                           "size accepted by RDKit, which is 2**31 - 1.")
            fp_length = max_rdkit_fp_length
        # A no-op for indices that already fit into the RDKit range.
        indices = self.indices % max_rdkit_fp_length

        rdkit_fp = rdkit_fp_cls(fp_length)
        rdkit_fp.SetBitsFromList(indices.tolist())
        return rdkit_fp

    def fold(self, new_length=DEFAULT_FOLDED_FP_LENGTH):
        """Fold this fingerprint to size ``new_length``.

        Parameters
        ----------
        new_length : int
            Length of the new fingerprint. The current length divided by
            ``new_length`` must be a power of 2.
            The default value is 4096.

        Returns
        -------
         : `Fingerprint`
            Folded `Fingerprint`.

        Raises
        ------
        BitsValueError
            If the new fingerprint length is not positive, is greater than the
            existing fingerprint length, or if the current length divided by
            the new one is not a power of 2.

        Examples
        --------
        >>> from luna.interaction.fp.fingerprint import Fingerprint
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> on_bits = 8
        >>> fp_length = 32
        >>> indices = np.random.randint(0, fp_length, on_bits)
        >>> print(indices)
        [12 15 21  0  3 27  3  7]
        >>> fp = Fingerprint.from_indices(indices, fp_length=fp_length)
        >>> print(fp.indices)
        [ 0  3  7 12 15 21 27]
        >>> print(fp.to_vector(compressed=False))
        [1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0]
        >>> folded_fp = fp.fold(8)
        >>> print(folded_fp.indices)
        [0 3 4 5 7]
        >>> print(folded_fp.to_vector(compressed=False))
        [1 0 0 1 1 1 0 1]
        """
        if new_length <= 0:
            error_msg = "The new fingerprint length must be a positive number."
            logger.error(error_msg)
            raise BitsValueError(error_msg)

        if new_length > self.fp_length:
            error_msg = ("The new fingerprint length must be smaller than the existing fingerprint length.")
            logger.error(error_msg)
            raise BitsValueError(error_msg)

        if not np.log2(self.fp_length / new_length).is_integer():
            error_msg = ("It is not possible to fold the current fingerprint into the informed new length. "
                         "The current length divided by the new one is not a power of 2 number.")
            logger.error(error_msg)
            raise BitsValueError(error_msg)

        folded_indices = self.indices % new_length

        # Record which original indices collapsed onto each folded index.
        unfolding_map = defaultdict(set)
        for k, v in sorted(zip(folded_indices, self.indices)):
            unfolding_map[k].add(v)

        props = dict(self.props)
        if "fp_length" in props:
            props["fp_length"] = new_length

        new_fp = self.__class__(indices=folded_indices, fp_length=new_length,
                                unfolded_fp=self, unfolding_map=unfolding_map,
                                props=props)

        return new_fp

    def unfold(self):
        """Unfold this fingerprint and return its parent fingerprint.

        Returns
        -------
         : `Fingerprint`
        """
        return self.unfolded_fp

    def _check_compatible(self, other):
        """Ensure ``other`` is a `Fingerprint` of the same length as this one.

        Raises
        ------
        InvalidFingerprintType
            If the informed fingerprint is not an instance of `Fingerprint`.
        BitsValueError
            If the fingerprints have different lengths.
        """
        if not isinstance(other, Fingerprint):
            error_msg = "The informed fingerprint is not an instance of %s." % Fingerprint.__name__
            logger.error(error_msg)
            raise InvalidFingerprintType(error_msg)

        if self.fp_length != other.fp_length:
            raise BitsValueError("Fingerprints are in a different bit scale")

    def union(self, other):
        """Return the union of indices of two fingerprints.

        Returns
        -------
         : :class:`numpy.ndarray`

        Raises
        ------
        InvalidFingerprintType
            If the informed fingerprint is not an instance of `Fingerprint`.
        BitsValueError
            If the fingerprints have different lengths.
        """
        self._check_compatible(other)
        return np.union1d(self.indices, other.indices)

    def intersection(self, other):
        """Return the intersection between indices of two fingerprints.

        Returns
        -------
         : :class:`numpy.ndarray`

        Raises
        ------
        InvalidFingerprintType
            If the informed fingerprint is not an instance of `Fingerprint`.
        BitsValueError
            If the fingerprints have different lengths.
        """
        self._check_compatible(other)
        return np.intersect1d(self.indices, other.indices, assume_unique=True)

    def difference(self, other):
        """Return indices in this fingerprint but not in ``other``.

        Returns
        -------
         : :class:`numpy.ndarray`

        Raises
        ------
        InvalidFingerprintType
            If the informed fingerprint is not an instance of `Fingerprint`.
        BitsValueError
            If the fingerprints have different lengths.
        """
        self._check_compatible(other)
        return np.setdiff1d(self.indices, other.indices, assume_unique=True)

    def symmetric_difference(self, other):
        """Return indices in either this fingerprint or ``other`` but not both.

        Returns
        -------
         : :class:`numpy.ndarray`

        Raises
        ------
        InvalidFingerprintType
            If the informed fingerprint is not an instance of `Fingerprint`.
        BitsValueError
            If the fingerprints have different lengths.
        """
        self._check_compatible(other)
        return np.setxor1d(self.indices, other.indices, assume_unique=True)

    def calc_similarity(self, other):
        """Calculates the Tanimoto similarity between this fingeprint and
        ``other``.

        Both fingerprints are converted with `to_rdkit` before comparison.

        Returns
        -------
         : float

        Examples
        --------
        >>> from luna.interaction.fp.fingerprint import Fingerprint
        >>> fp1 = Fingerprint.from_bit_string("0010101110000010")
        >>> fp2 = Fingerprint.from_bit_string("1010100110010010")
        >>> print(fp1.calc_similarity(fp2))
        0.625
        """
        return DataStructs.FingerprintSimilarity(self.to_rdkit(), other.to_rdkit())

    def __repr__(self):
        return ("<%s: indices=%s length=%d>"
                % (self.__class__,
                   repr(self.indices).replace('\n', '').replace(' ', ''),
                   self.fp_length))

    def __eq__(self, other):
        if not isinstance(other, Fingerprint):
            return False
        # ``indices`` is sorted and unique on both sides, so an element-wise
        # comparison is a symmetric equality test (a subset test is not).
        return bool(self.__class__ == other.__class__
                    and self.fp_length == other.fp_length
                    and np.array_equal(self.indices, other.indices))

    def __ne__(self, other):
        return not self.__eq__(other)

    def __or__(self, other):
        return self.union(other)

    def __and__(self, other):
        return self.intersection(other)

    def __sub__(self, other):
        return self.difference(other)

    def __xor__(self, other):
        return self.symmetric_difference(other)


class CountFingerprint(Fingerprint):
    """A fingerprint that stores the number of occurrences of each index.

    Parameters
    ----------
    indices : array_like of int, optional
        Indices of "on" bits. It is optional if ``counts`` is provided.
    counts : dict, optional
        Mapping between each index in ``indices`` to the number of counts.
        If not provided, the number of occurrences of each index in
        ``indices`` is used instead (1 for indices that appear once).
    fp_length : int
        The fingerprint length (total number of bits).
        The default value is :math:`2^{32}`.
    unfolded_fp : `Fingerprint` or None
        The unfolded version of this fingerprint.
        If None, this fingerprint may have not been folded yet.
    unfolding_map : dict, optional
        A mapping between current indices and indices from the unfolded
        version of this fingerprint what makes it possible to trace folded
        bits back to the original shells (features).
    props: dict, optional
        Custom properties of the fingerprint, consisting of a string keyword
        and some value. It can be used, for instance, to save the ligand name
        and parameters used to generate shells (IFP features).

    Raises
    ------
    IllegalArgumentError
        If neither ``indices`` nor ``counts`` is provided.
    BitsValueError
        If any index is negative or not smaller than ``fp_length``.
    FingerprintCountsError
        If ``indices`` and the keys of ``counts`` do not match.
    """

    def __init__(self, indices=None, counts=None, fp_length=DEFAULT_FP_LENGTH,
                 unfolded_fp=None, unfolding_map=None, props=None):

        if indices is None and counts is None:
            logger.error("Indices or counts must be provided.")
            raise IllegalArgumentError("Indices or counts must be provided.")

        if indices is not None:
            # An explicit 64-bit dtype is used instead of ``np.long``: that
            # alias is not available in every NumPy release and, where it
            # maps to a C long, may be too narrow for indices close to 2**32.
            indices = np.asarray(indices, dtype=np.int64)

            if np.any(np.logical_or(indices < 0, indices >= fp_length)):
                logger.error("Provided indices are in a different bit scale.")
                raise BitsValueError("Provided indices are in a different bit scale.")

            if counts is None:
                # Derive the counts from how often each index was given.
                indices, counts = np.unique(indices, return_counts=True)
                counts = dict(zip(indices, counts))
            else:
                indices = np.unique(indices)
                index_set = set(indices.tolist())

                if not index_set.issuperset(counts):
                    logger.error("At least one index from 'counts' is not in 'indices'.")
                    raise FingerprintCountsError("At least one index from 'counts' is not in 'indices'.")

                if index_set.difference(counts):
                    logger.error("At least one index in 'indices' is not in 'counts'.")
                    raise FingerprintCountsError("At least one index in 'indices' is not in 'counts'.")
        else:
            indices = np.asarray(sorted(counts.keys()), dtype=np.int64)

            if np.any(np.logical_or(indices < 0, indices >= fp_length)):
                logger.error("Provided indices are in a different bit scale.")
                raise BitsValueError("Provided indices are in a different bit scale.")

        self._counts = counts
        super().__init__(indices, fp_length, unfolded_fp, unfolding_map, props)

    @classmethod
    def from_indices(cls, indices=None, counts=None,
                     fp_length=DEFAULT_FP_LENGTH, **kwargs):
        """Initialize from an array of indices.

        Parameters
        ----------
        indices : array_like of int, optional
            Indices of "on" bits. It is optional if ``counts`` is provided.
        counts : dict, optional
            Mapping between each index in ``indices`` to the number of counts.
            If not provided, the number of occurrences of each index in
            ``indices`` is used instead.
        fp_length : int
            The fingerprint length (total number of bits).
            The default value is :math:`2^{32}`.
        **kwargs : dict, optional
            Extra arguments to `CountFingerprint`. Refer to the documentation
            for a list of all possible arguments.

        Returns
        -------
         : `CountFingerprint`

        Examples
        --------
        >>> from luna.interaction.fp.fingerprint import CountFingerprint
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> on_bits = 8
        >>> fp_length = 32
        >>> indices, counts = np.unique(np.random.randint(0, fp_length, on_bits), return_counts=True)
        >>> counts = dict(zip(indices, counts))
        >>> print(counts)
        {0: 1, 3: 2, 7: 1, 12: 1, 15: 1, 21: 1, 27: 1}
        >>> fp = CountFingerprint.from_indices(indices, counts=counts, fp_length=fp_length)
        >>> print(fp.indices)
        [ 0  3  7 12 15 21 27]
        >>> print(fp.to_vector(compressed=False))
        [1 0 0 2 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0]
        """
        return cls(indices=indices, counts=counts, fp_length=fp_length, **kwargs)

    @classmethod
    def from_counts(cls, counts, fp_length=DEFAULT_FP_LENGTH, **kwargs):
        """Initialize from a counting map.

        Parameters
        ----------
        counts : dict
            Mapping between each index in ``indices`` to the number of counts.
        fp_length : int
            The fingerprint length (total number of bits).
            The default value is :math:`2^{32}`.
        **kwargs : dict, optional
            Extra arguments to `CountFingerprint`. Refer to the documentation
            for a list of all possible arguments.

        Returns
        -------
         : `CountFingerprint`

        Examples
        --------
        >>> from luna.interaction.fp.fingerprint import CountFingerprint
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> on_bits = 8
        >>> fp_length = 32
        >>> counts = dict(zip(*np.unique(np.random.randint(0, fp_length, on_bits),
        ...                              return_counts=True)))
        >>> print(counts)
        {0: 1, 3: 2, 7: 1, 12: 1, 15: 1, 21: 1, 27: 1}
        >>> fp = CountFingerprint.from_counts(counts=counts, fp_length=fp_length)
        >>> print(fp.indices)
        [ 0  3  7 12 15 21 27]
        >>> print(fp.to_vector(compressed=False))
        [1 0 0 2 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0]
        """
        return cls(counts=counts, fp_length=fp_length, **kwargs)

    @classmethod
    def from_bit_string(cls, bit_string, counts=None, fp_length=None, **kwargs):
        """Initialize from a bit string (e.g. '0010100110').

        Parameters
        ----------
        bit_string : str
            String of 0s and 1s.
        counts : dict, optional
            Mapping between each index in ``indices`` to the number of counts.
            If not provided, the default count value of 1 will be used
            instead.
        fp_length : int, optional
            The fingerprint length (total number of bits).
            If not provided, the fingerprint length will be defined based on
            the string length.
        **kwargs : dict, optional
            Extra arguments to `Fingerprint`. Refer to the documentation for a
            list of all possible arguments.

        Returns
        -------
         : `CountFingerprint`

        Examples
        --------
        >>> from luna.interaction.fp.fingerprint import CountFingerprint
        >>> fp = CountFingerprint.from_bit_string("0010100110000010",
        ...                                       counts={2: 5, 4: 1, 7: 3, 8: 1, 14: 2})
        >>> print(fp.indices)
        [ 2  4  7  8 14]
        >>> print(fp.counts)
        {2: 5, 4: 1, 7: 3, 8: 1, 14: 2}
        """
        # Every character other than '0' is treated as an "on" bit.
        indices = [i for i, char in enumerate(bit_string) if char != '0']
        if fp_length is None:
            fp_length = len(bit_string)

        return cls.from_indices(indices, counts, fp_length, **kwargs)

    @classmethod
    def from_vector(cls, vector, fp_length=None, **kwargs):
        """Initialize from a vector.

        Parameters
        ----------
        vector : :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Array of counts.
        fp_length : int, optional
            The fingerprint length (total number of bits).
            If not provided, the fingerprint length will be defined based on
            the ``vector`` shape.
        **kwargs : dict, optional
            Extra arguments to `Fingerprint`. Refer to the documentation for a
            list of all possible arguments.

        Returns
        -------
         : `CountFingerprint`

        Examples
        --------
        >>> from luna.interaction.fp.fingerprint import CountFingerprint
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> fp_length = 32
        >>> vector = np.random.choice(5, size=(fp_length,), p=[0.76, 0.1, 0.1, 0.02, 0.02])
        >>> print(vector)
        [0 0 0 0 2 3 0 1 0 0 2 0 0 0 1 1 2 3 1 0 1 0 0 0 2 0 0 0 1 0 0 0]
        >>> fp = CountFingerprint.from_vector(vector)
        >>> print(fp.indices)
        [ 4  5  7 10 14 15 16 17 18 20 24 28]
        >>> print(fp.counts)
        {4: 2, 5: 3, 7: 1, 10: 2, 14: 1, 15: 1, 16: 2, 17: 3, 18: 1, 20: 1, 24: 2, 28: 1}
        """
        if not issparse(vector):
            # Accept any array_like (e.g. a plain list), not only ndarrays.
            vector = np.asarray(vector)

        if fp_length is None:
            # Row vectors of shape (1, N) use the second axis; flat vectors
            # of shape (N,) fall back to the first one.
            try:
                fp_length = vector.shape[1]
            except IndexError:
                fp_length = vector.shape[0]

        if issparse(vector):
            indices = vector.indices.astype(np.int64)
            values = vector.data
        else:
            # Work on the flattened vector so that a dense (1, N) row vector
            # yields column positions and their matching values.
            flat_vector = np.ravel(vector)
            indices = np.flatnonzero(flat_vector).astype(np.int64)
            values = flat_vector[indices]

        counts = dict(zip(indices, values))

        return cls.from_indices(indices, counts, fp_length, **kwargs)

    @classmethod
    def from_fingerprint(cls, fp, **kwargs):
        """Initialize from an existing fingerprint.

        Parameters
        ----------
        fp : `Fingerprint`
            An existing fingerprint.
        **kwargs : dict, optional
            Extra arguments to `Fingerprint`. Refer to the documentation for a
            list of all possible arguments. Values given here take precedence
            over the ones copied from ``fp``.

        Returns
        -------
         : `CountFingerprint`

        Raises
        ------
        InvalidFingerprintType
            If ``fp`` is not an instance of `Fingerprint`.
        """
        if not isinstance(fp, Fingerprint):
            error_msg = "Informed fingerprint is not an instance of %s." % Fingerprint.__name__
            logger.error(error_msg)
            raise InvalidFingerprintType(error_msg)

        counts = {i: c for i, c in fp.counts.items() if c > 0}

        # The private attribute is read directly: the ``unfolded_fp`` property
        # logs a warning for fingerprints that were never folded.
        parent_fp = fp._unfolded_fp
        if parent_fp is not None:
            parent_fp = fp.__class__.from_fingerprint(parent_fp)

        params = {
            "counts": counts,
            "fp_length": fp.fp_length,
            "unfolded_fp": parent_fp,
            "unfolding_map": dict(fp.unfolding_map),
            "props": dict(fp.props),
        }
        params.update(kwargs)

        return cls.from_counts(**params)

    @property
    def counts(self):
        """dict, read-only: Mapping between each index in ``indices`` to the
        number of counts."""
        return self._counts

    def get_count(self, index):
        """Get the count value at index ``index``.
        Return 0 if index is not in ``counts``."""
        return self.counts.get(index, 0)

    def fold(self, new_length=DEFAULT_FOLDED_FP_LENGTH):
        """Fold this fingerprint to size ``new_length``.

        The count of each folded index is the sum of the counts of all the
        original indices mapped onto it.

        Parameters
        ----------
        new_length : int
            Length of the new fingerprint. The current length divided by
            ``new_length`` must be a power of 2.
            The default value is 4096.

        Returns
        -------
         : `Fingerprint`
            Folded `Fingerprint`.

        Raises
        ------
        BitsValueError
            If the new fingerprint length is greater than the existing
            fingerprint length or if the current length divided by the new
            one is not a power of 2.

        Examples
        --------
        >>> from luna.interaction.fp.fingerprint import CountFingerprint
        >>> import numpy as np
        >>> np.random.seed(0)
        >>> on_bits = 8
        >>> fp_length = 32
        >>> indices, counts = np.unique(np.random.randint(0, fp_length, on_bits), return_counts=True)
        >>> counts = dict(zip(indices, counts))
        >>> print(counts)
        {0: 1, 3: 2, 7: 1, 12: 1, 15: 1, 21: 1, 27: 1}
        >>> fp = CountFingerprint.from_indices(indices, counts=counts, fp_length=fp_length)
        >>> print(fp.indices)
        [ 0  3  7 12 15 21 27]
        >>> print(fp.to_vector(compressed=False))
        [1 0 0 2 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0]
        >>> folded_fp = fp.fold(8)
        >>> print(folded_fp.indices)
        [0 3 4 5 7]
        >>> print(folded_fp.to_vector(compressed=False))
        [1 0 0 3 1 1 0 2]
        """
        new_fp = super().fold(new_length)

        # The parent constructor only counted occurrences of folded indices;
        # replace that with the summed counts of the original indices.
        new_fp._counts = {
            folded_idx: sum([self.get_count(x) for x in unfolded_set])
            for folded_idx, unfolded_set in new_fp.unfolding_map.items()
        }

        return new_fp

    def __repr__(self):
        return ("<%s: counts={%s} length=%d>"
                % (self.__class__,
                   tuple([(k, v) for k, v in self.counts.items()]),
                   self.fp_length))

    def __eq__(self, other):
        if not isinstance(other, Fingerprint):
            return False
        # ``indices`` is sorted and unique on both sides, so an element-wise
        # comparison is a symmetric equality test.
        return bool(self.__class__ == other.__class__
                    and self.fp_length == other.fp_length
                    and self.counts == other.counts
                    and np.array_equal(self.indices, other.indices))