
Fast logarithm calculation


Note that all arrays below are float32, not double precision.

UPDATE: I've ditched gcc completely in favour of Intel's icc. It makes all the difference when performance is critical and you don't have time to fine-tune "compiler hints" to coax gcc into vectorizing (see, e.g., here).

log_omp.c:

GCC: gcc -o log_omp.so -fopenmp log_omp.c -lm -O3 -fPIC -shared -std=c99

ICC: icc -o log_omp.so -openmp log_omp.c -lm -O3 -fPIC -shared -std=c99 -vec-report1 -xAVX -I/opt/intel/composer/mkl/include

#include <math.h>
#include "omp.h"
#include "mkl_vml.h"

#define restrict __restrict   /* map the C99 keyword to the compiler builtin */

inline void log_omp(int m, float * restrict a, float * restrict c);

/* Natural log of c[] into a[], parallelised with OpenMP.
   Note that log() computes in double precision and the result is rounded
   back to float; logf() would stay in single precision throughout. */
void log_omp(int m, float * restrict a, float * restrict c)
{
    int i;
#pragma omp parallel for default(none) shared(m,a,c) private(i)
    for (i = 0; i < m; i++) {
        a[i] = log(c[i]);
    }
}

/* VML / icc only: split the array into 14 chunks and hand one to each thread.
   Beware that vsLog10 computes the base-10 logarithm; the commented-out
   vsLn / vmsLn calls are the natural-log equivalents. */
void log_VML(int m, float * restrict a, float * restrict c)
{
    int i;
    int split_to = 14;
    int iter = m / split_to;
    int additional = m % split_to;

//  vsLn(m, c, a);
#pragma omp parallel for default(none) shared(m,a,c,additional,iter) private(i) num_threads(split_to)
    for (i = 0; i < (m - additional); i += iter)
        vsLog10(iter, c + i, a + i);
        //vmsLn(iter, c + i, a + i, VML_HA);
    if (additional > 0)
        vsLog10(additional, c + m - additional, a + m - additional);
        //vmsLn(additional, c + m - additional, a + m - additional, VML_HA);
}

in python:

import numpy as np
from ctypes import CDLL, c_int, c_void_p

# Load the shared library once, at import time, rather than on every call.
lib = CDLL('./log_omp.so')
lib.log_omp.argtypes = [c_int,
                        np.ctypeslib.ndpointer(dtype=np.float32),
                        np.ctypeslib.ndpointer(dtype=np.float32)]
lib.log_omp.restype = c_void_p

def log_omp(xs, out=None):
    n = xs.shape[0]
    if out is None:
        out = np.empty(n, np.float32)
    lib.log_omp(n, out, xs)   # C signature log_omp(m, a, c): writes log(c) into a
    return out
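A quick smoke test for the wrapper (a sketch, assuming log_omp.so was built with one of the compiler lines above and sits in the working directory):

x = np.abs(np.random.randn(1000).astype(np.float32))
y = log_omp(x)
# The C code stores double-precision log() results into float32, so agreement
# with np.log is only to single precision.
assert np.allclose(y, np.log(x), rtol=1e-5)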

Cython code (in ipython notebook, hence the %% magic):

%%cython --compile-args=-fopenmp --link-args=-fopenmp
import numpy as np
cimport numpy as np
from libc.math cimport log
from cython.parallel cimport prange
import cython

@cython.boundscheck(False)
def cylog(np.ndarray[np.float32_t, ndim=1] a not None,
          np.ndarray[np.float32_t, ndim=1] out=None):
    if out is None:
        out = np.empty((a.shape[0]), dtype=a.dtype)
    cdef Py_ssize_t i
    with nogil:
        for i in prange(a.shape[0]):
            out[i] = log(a[i])
    return out
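Outside the notebook the same extension can be built with a small setup script; this is a minimal sketch, assuming the Cython code above is saved as cylog.pyx (the module and file names are assumptions):

# setup.py
from setuptools import setup
from setuptools.extension import Extension
from Cython.Build import cythonize

ext = Extension(
    "cylog",
    ["cylog.pyx"],                    # assumed file name
    extra_compile_args=["-fopenmp"],  # OpenMP is required for prange
    extra_link_args=["-fopenmp"],
)
setup(ext_modules=cythonize([ext]))

Build in place with: python setup.py build_ext --inplace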

Timings:

numexpr.detect_number_of_cores() // 2
28
%env OMP_NUM_THREADS=28
x = np.abs(np.random.randn(50000000).astype('float32'))
y = x.copy()

# GCC
%timeit log_omp(x, y)
10 loops, best of 3: 21.6 ms per loop

# ICC
%timeit log_omp(x, y)
100 loops, best of 3: 9.6 ms per loop

%timeit log_VML(x, y)
100 loops, best of 3: 10 ms per loop

%timeit cylog(x, out=y)
10 loops, best of 3: 21.7 ms per loop

numexpr.set_num_threads(28)
%timeit out = numexpr.evaluate('log(x)')
100 loops, best of 3: 13 ms per loop
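One caveat when reading the log_VML number: as noted in the C source, log_VML calls vsLog10, so its output is the base-10 logarithm, while every other variant computes the natural log. A quick consistency check, using log10(x) = ln(x)/ln(10) and a hypothetical log_vml Python wrapper built exactly like log_omp above:

y_vml = log_vml(x)  # hypothetical ctypes wrapper around the C log_VML
# Rescale the base-10 output back to natural log before comparing.
assert np.allclose(y_vml * np.float32(np.log(10.0)), np.log(x), rtol=1e-4)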

So numexpr seems to do a better job than the (poorly vectorized) gcc-compiled code, but icc wins.

Some resources I found useful and shamelessly borrowed code from:

http://people.duke.edu/~ccc14/sta-663/Optimization_Bakeoff.html

https://gist.github.com/zed/2051661