Constructing a co-occurrence matrix in python pandas Constructing a co-occurrence matrix in python pandas pandas pandas

Constructing a co-occurrence matrix in python pandas


It's a simple linear algebra, you multiply matrix with its transpose (your example contains strings, don't forget to convert them to integer):

>>> df_asint = df.astype(int)>>> coocc = df_asint.T.dot(df_asint)>>> coocc       Dop  Snack  TransDop      4      2      3Snack    2      3      2Trans    3      2      4

if, as in R answer, you want to reset diagonal, you can use numpy's fill_diagonal:

>>> import numpy as np>>> np.fill_diagonal(coocc.values, 0)>>> coocc       Dop  Snack  TransDop      0      2      3Snack    2      0      2Trans    3      2      0


Demo in NumPy:

import numpy as npnp.random.seed(3) # for reproducibility# Generate data: 5 labels, 10 examples, binary.label_headers = 'Alice Bob Carol Dave Eve'.split(' ')label_data = np.random.randint(0,2,(10,5)) # binary here but could be any integer.print('labels:\n{0}'.format(label_data))# Compute cooccurrence matrix cooccurrence_matrix = np.dot(label_data.transpose(),label_data)print('\ncooccurrence_matrix:\n{0}'.format(cooccurrence_matrix)) # Compute cooccurrence matrix in percentage# FYI: http://stackoverflow.com/questions/19602187/numpy-divide-each-row-by-a-vector-element#      http://stackoverflow.com/questions/26248654/numpy-return-0-with-divide-by-zero/32106804#32106804cooccurrence_matrix_diagonal = np.diagonal(cooccurrence_matrix)with np.errstate(divide='ignore', invalid='ignore'):    cooccurrence_matrix_percentage = np.nan_to_num(np.true_divide(cooccurrence_matrix, cooccurrence_matrix_diagonal[:, None]))print('\ncooccurrence_matrix_percentage:\n{0}'.format(cooccurrence_matrix_percentage))

Output:

labels:[[0 0 1 1 0] [0 0 1 1 1] [0 1 1 1 0] [1 1 0 0 0] [0 1 1 0 0] [0 1 0 0 0] [0 1 0 1 1] [0 1 0 0 1] [1 0 0 1 0] [1 0 1 1 1]]cooccurrence_matrix:[[3 1 1 2 1] [1 6 2 2 2] [1 2 5 4 2] [2 2 4 6 3] [1 2 2 3 4]]cooccurrence_matrix_percentage:[[ 1.          0.33333333  0.33333333  0.66666667  0.33333333] [ 0.16666667  1.          0.33333333  0.33333333  0.33333333] [ 0.2         0.4         1.          0.8         0.4       ] [ 0.33333333  0.33333333  0.66666667  1.          0.5       ] [ 0.25        0.5         0.5         0.75        1.        ]]

With a heatmap using matplotlib:

import numpy as npnp.random.seed(3) # for reproducibilityimport matplotlib.pyplot as pltdef show_values(pc, fmt="%.2f", **kw):    '''    Heatmap with text in each cell with matplotlib's pyplot    Source: http://stackoverflow.com/a/25074150/395857     By HYRY    '''    from itertools import izip    pc.update_scalarmappable()    ax = pc.get_axes()    for p, color, value in izip(pc.get_paths(), pc.get_facecolors(), pc.get_array()):        x, y = p.vertices[:-2, :].mean(0)        if np.all(color[:3] > 0.5):            color = (0.0, 0.0, 0.0)        else:            color = (1.0, 1.0, 1.0)        ax.text(x, y, fmt % value, ha="center", va="center", color=color, **kw)def cm2inch(*tupl):    '''    Specify figure size in centimeter in matplotlib    Source: http://stackoverflow.com/a/22787457/395857    By gns-ank    '''    inch = 2.54    if type(tupl[0]) == tuple:        return tuple(i/inch for i in tupl[0])    else:        return tuple(i/inch for i in tupl)def heatmap(AUC, title, xlabel, ylabel, xticklabels, yticklabels):    '''    Inspired by:    - http://stackoverflow.com/a/16124677/395857     - http://stackoverflow.com/a/25074150/395857    '''    # Plot it out    fig, ax = plt.subplots()        c = ax.pcolor(AUC, edgecolors='k', linestyle= 'dashed', linewidths=0.2, cmap='RdBu', vmin=0.0, vmax=1.0)    # put the major ticks at the middle of each cell    ax.set_yticks(np.arange(AUC.shape[0]) + 0.5, minor=False)    ax.set_xticks(np.arange(AUC.shape[1]) + 0.5, minor=False)    # set tick labels    #ax.set_xticklabels(np.arange(1,AUC.shape[1]+1), minor=False)    ax.set_xticklabels(xticklabels, minor=False)    ax.set_yticklabels(yticklabels, minor=False)    # set title and x/y labels    plt.title(title)    plt.xlabel(xlabel)    plt.ylabel(ylabel)          # Remove last blank column    plt.xlim( (0, AUC.shape[1]) )    # Turn off all the ticks    ax = plt.gca()        for t in ax.xaxis.get_major_ticks():        t.tick1On = False        t.tick2On = False    for t in ax.yaxis.get_major_ticks():        t.tick1On = False        t.tick2On = False    # Add color bar    plt.colorbar(c)    # Add text in each cell     show_values(c)    # Proper orientation (origin at the top left instead of bottom left)    ax.invert_yaxis()    ax.xaxis.tick_top()    # resize     fig = plt.gcf()    fig.set_size_inches(cm2inch(40, 20))def main():    # Generate data: 5 labels, 10 examples, binary.    label_headers = 'Alice Bob Carol Dave Eve'.split(' ')    label_data = np.random.randint(0,2,(10,5)) # binary here but could be any integer.    print('labels:\n{0}'.format(label_data))    # Compute cooccurrence matrix     cooccurrence_matrix = np.dot(label_data.transpose(),label_data)    print('\ncooccurrence_matrix:\n{0}'.format(cooccurrence_matrix))     # Compute cooccurrence matrix in percentage    # FYI: http://stackoverflow.com/questions/19602187/numpy-divide-each-row-by-a-vector-element    #      http://stackoverflow.com/questions/26248654/numpy-return-0-with-divide-by-zero/32106804#32106804    cooccurrence_matrix_diagonal = np.diagonal(cooccurrence_matrix)    with np.errstate(divide='ignore', invalid='ignore'):        cooccurrence_matrix_percentage = np.nan_to_num(np.true_divide(cooccurrence_matrix, cooccurrence_matrix_diagonal[:, None]))    print('\ncooccurrence_matrix_percentage:\n{0}'.format(cooccurrence_matrix_percentage))    # Add count in labels    label_header_with_count = [ '{0} ({1})'.format(label_header, cooccurrence_matrix_diagonal[label_number]) for label_number, label_header in enumerate(label_headers)]      print('\nlabel_header_with_count: {0}'.format(label_header_with_count))    # Plotting    x_axis_size = cooccurrence_matrix_percentage.shape[0]    y_axis_size = cooccurrence_matrix_percentage.shape[1]    title = "Co-occurrence matrix\n"    xlabel= ''#"Labels"    ylabel= ''#"Labels"    xticklabels = label_header_with_count    yticklabels = label_header_with_count    heatmap(cooccurrence_matrix_percentage, title, xlabel, ylabel, xticklabels, yticklabels)    plt.savefig('image_output.png', dpi=300, format='png', bbox_inches='tight') # use format='svg' or 'pdf' for vectorial pictures    #plt.show()if __name__ == "__main__":    main()    #cProfile.run('main()') # if you want to do some profiling

enter image description here

(PS: a neat visualization of a co-occurrence matrix in D3.js.)


In case that you have larger corpus and term-frequency matrix, using sparse matrix multiplication might be more efficient. I use the same trick of matrix multiplication refered to algo answer on this page.

import scipy.sparse as spX = sp.csr_matrix(df.astype(int).values) # convert dataframe to sparse matrixXc = X.T * X # multiply sparse matrix # Xc.setdiag(0) # reset diagonalprint(Xc.todense()) # to print co-occurence matrix in dense format

Xc here will be the co-occurence matrix in sparse csr format