
Compute Jaccard distances on sparse matrix


Vectorization is relatively easy if you use matrix multiplication to calculate the set intersections and then the rule |union(a, b)| == |a| + |b| - |intersection(a, b)| to determine the unions:

# Not actually necessary for sparse matrices, but it is for
# dense matrices and ndarrays, if X.dtype is integer.
from __future__ import division

def pairwise_jaccard(X):
    """Computes the Jaccard distance between the rows of `X`."""
    X = X.astype(bool).astype(int)
    intrsct = X.dot(X.T)
    row_sums = intrsct.diagonal()
    unions = row_sums[:, None] + row_sums - intrsct
    dist = 1.0 - intrsct / unions
    return dist

Note the cast to bool and then int: the dtype of X must be large enough to accumulate twice the maximum row sum, and the entries of X must be either zero or one. The downside of this code is that it's heavy on RAM, because unions and dist are dense matrices.
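As a quick sanity check (my own example, not part of the answer): the first two rows below, {0, 1} and {0, 2}, share one of three columns in total, so their Jaccard distance is 2/3.

import numpy as np

X = np.array([[1, 1, 0],
              [1, 0, 1],
              [1, 1, 1]])
print(pairwise_jaccard(X))
# [[0.         0.66666667 0.33333333]
#  [0.66666667 0.         0.33333333]
#  [0.33333333 0.33333333 0.        ]]

The same call works on a scipy.sparse matrix, in which case the intersection matrix stays sparse but unions and dist are still dense.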

If you're only interested in distances smaller than some cut-off epsilon, the code can be tuned for sparse matrices:

import numpy as np
from scipy.sparse import csr_matrix

def pairwise_jaccard_sparse(csr, epsilon):
    """Computes the Jaccard distance between the rows of `csr`,
    smaller than the cut-off distance `epsilon`.
    """
    assert(0 < epsilon < 1)
    csr = csr_matrix(csr).astype(bool).astype(int)

    csr_rownnz = csr.getnnz(axis=1)
    intrsct = csr.dot(csr.T)

    nnz_i = np.repeat(csr_rownnz, intrsct.getnnz(axis=1))
    unions = nnz_i + csr_rownnz[intrsct.indices] - intrsct.data
    dists = 1.0 - intrsct.data / unions

    mask = (dists > 0) & (dists <= epsilon)
    data = dists[mask]
    indices = intrsct.indices[mask]

    rownnz = np.add.reduceat(mask, intrsct.indptr[:-1])
    indptr = np.r_[0, np.cumsum(rownnz)]

    out = csr_matrix((data, indices, indptr), intrsct.shape)
    return out
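For illustration (my own example, not from the answer), with a cut-off of 0.7 only the pairs at positive distance up to 0.7 end up as stored entries of the returned CSR matrix; here the pair at distance 0.75 and the disjoint pair are simply absent:

import numpy as np
from scipy.sparse import csr_matrix

X = csr_matrix(np.array([[1, 1, 0, 0],
                         [1, 1, 1, 0],
                         [0, 0, 1, 1]]))
D = pairwise_jaccard_sparse(X, 0.7)
print(D.toarray())
# [[0.         0.33333333 0.        ]
#  [0.33333333 0.         0.        ]
#  [0.         0.         0.        ]]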

If this still takes too much RAM, you could try to vectorize over one dimension and Python-loop over the other, as sketched below.
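A minimal sketch of that idea, reusing the same intersection/union arithmetic as pairwise_jaccard_sparse above (the function name and the block_size parameter are hypothetical, not from the answer): process block_size rows at a time against the full matrix and stack the sparse results.

import numpy as np
from scipy.sparse import csr_matrix, vstack

def pairwise_jaccard_sparse_blocked(csr, epsilon, block_size=1000):
    """Same output as pairwise_jaccard_sparse, but computed in row blocks so
    that only `block_size` rows of intermediates are held in RAM at a time."""
    assert 0 < epsilon < 1
    csr = csr_matrix(csr).astype(bool).astype(int)
    rownnz = csr.getnnz(axis=1)
    out_blocks = []
    for start in range(0, csr.shape[0], block_size):
        block = csr[start:start + block_size]
        # Sparse intersection counts of this block of rows against all rows.
        intrsct = block.dot(csr.T).tocsr()
        nnz_i = np.repeat(rownnz[start:start + block_size], intrsct.getnnz(axis=1))
        unions = nnz_i + rownnz[intrsct.indices] - intrsct.data
        dists = 1.0 - intrsct.data / unions
        # Keep only positive distances within the cut-off, as before.
        mask = (dists > 0) & (dists <= epsilon)
        block_rownnz = np.add.reduceat(mask, intrsct.indptr[:-1])
        indptr = np.r_[0, np.cumsum(block_rownnz)]
        out_blocks.append(csr_matrix((dists[mask], intrsct.indices[mask], indptr),
                                     shape=intrsct.shape))
    return vstack(out_blocks, format="csr")

The block size trades peak memory against the overhead of the Python loop; with block_size equal to the number of rows this degenerates to the fully vectorized version.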


To add to the accepted answer: I needed a weighted version of the above method, which can be implemented simply as:

import numpy
import scipy.sparse

def pairwise_jaccard_sparse_weighted(csr, epsilon, weight):
    csr = scipy.sparse.csr_matrix(csr).astype(bool).astype(int)
    csr_w = csr * scipy.sparse.diags(weight)
    csr_rowsum = numpy.array(csr_w.sum(axis=1)).flatten()
    intrsct = csr.dot(csr_w.T)

    rowsum_i = numpy.repeat(csr_rowsum, intrsct.getnnz(axis=1))
    unions = rowsum_i + csr_rowsum[intrsct.indices] - intrsct.data
    dists = 1.0 - 1.0 * intrsct.data / unions

    mask = (dists > 0) & (dists <= epsilon)
    data = dists[mask]
    indices = intrsct.indices[mask]

    rownnz = numpy.add.reduceat(mask, intrsct.indptr[:-1])
    indptr = numpy.r_[0, numpy.cumsum(rownnz)]

    out = scipy.sparse.csr_matrix((data, indices, indptr), intrsct.shape)
    return out

I doubt this is the most efficient implementation, but it's a damn sight quicker than the dense implementation in scipy.spatial.distance.jaccard.
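As a quick sanity check (my own example, not part of the answer), a weight vector of all ones should reproduce the unweighted result from the accepted answer:

import numpy as np
import scipy.sparse

X = scipy.sparse.csr_matrix(np.array([[1, 1, 0, 0],
                                      [1, 0, 1, 0],
                                      [0, 1, 1, 1]]))
w = np.ones(X.shape[1])

# With unit weights, the weighted row sums equal the row nonzero counts,
# so both functions compute the same distances.
unweighted = pairwise_jaccard_sparse(X, 0.95)
weighted = pairwise_jaccard_sparse_weighted(X, 0.95, w)
assert np.allclose(unweighted.toarray(), weighted.toarray())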


Here is a solution with a scikit-learn-like API.

import numpy as np

def pairwise_sparse_jaccard_distance(X, Y=None):
    """
    Computes the Jaccard distance between two sparse matrices or between all pairs in
    one sparse matrix.

    Args:
        X (scipy.sparse.csr_matrix): A sparse matrix.
        Y (scipy.sparse.csr_matrix, optional): A sparse matrix.

    Returns:
        numpy.ndarray: A distance matrix.
    """
    if Y is None:
        Y = X
    assert X.shape[1] == Y.shape[1]

    X = X.astype(bool).astype(int)
    Y = Y.astype(bool).astype(int)
    intersect = X.dot(Y.T)

    x_sum = X.sum(axis=1).A1
    y_sum = Y.sum(axis=1).A1
    xx, yy = np.meshgrid(x_sum, y_sum)
    union = ((xx + yy).T - intersect)

    return (1 - intersect / union).A

Here is some testing and benchmarking:

>>> import timeit
>>> import numpy as np
>>> from scipy.sparse import csr_matrix
>>> from sklearn.metrics import pairwise_distances

>>> X = csr_matrix(np.random.choice(a=[False, True], size=(10000, 1000), p=[0.9, 0.1]))
>>> Y = csr_matrix(np.random.choice(a=[False, True], size=(1000, 1000), p=[0.9, 0.1]))

Asserting that all results are approximately equivalent:

>>> custom_jaccard_distance = pairwise_sparse_jaccard_distance(X, Y)
>>> sklearn_jaccard_distance = pairwise_distances(X.todense(), Y.todense(), "jaccard")

>>> np.allclose(custom_jaccard_distance, sklearn_jaccard_distance)
True

Benchmarking runtime (from a Jupyter notebook):

>>> %timeit pairwise_sparse_jaccard_distance(X, Y)
795 ms ± 58.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

>>> %timeit pairwise_distances(X.todense(), Y.todense(), "jaccard")
14.7 s ± 694 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)