numpy 1D array: mask elements that repeat more than n times
Disclaimer: this is just a sounder implementation of @FlorianH's idea:
def f(a,N): mask = np.empty(a.size,bool) mask[:N] = True np.not_equal(a[N:],a[:-N],out=mask[N:]) return mask
For larger arrays this makes a huge difference:
a = np.arange(1000).repeat(np.random.randint(0,10,1000))N = 3print(timeit(lambda:f(a,N),number=1000)*1000,"us")# 5.443050000394578 us# compare toprint(timeit(lambda:[True for _ in range(N)] + list(bins[:-N] != bins[N:]),number=1000)*1000,"us")# 76.18969900067896 us
Approach #1 : Here's a vectorized way -
from scipy.ndimage.morphology import binary_dilationdef keep_N_per_group(a, N): k = np.ones(N,dtype=bool) m = np.r_[True,a[:-1]!=a[1:]] return a[binary_dilation(m,k,origin=-(N//2))]
Sample run -
In [42]: aOut[42]: array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])In [43]: keep_N_per_group(a, N=3)Out[43]: array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5])
Approach #2 : A bit more compact version -
def keep_N_per_group_v2(a, N): k = np.ones(N,dtype=bool) return a[binary_dilation(np.ediff1d(a,to_begin=a[0])!=0,k,origin=-(N//2))]
Approach #3 : Using the grouped-counts and np.repeat
(won't give us the mask though) -
def keep_N_per_group_v3(a, N): m = np.r_[True,a[:-1]!=a[1:],True] idx = np.flatnonzero(m) c = np.diff(idx) return np.repeat(a[idx[:-1]],np.minimum(c,N))
Approach #4 : With a view-based
method -
from skimage.util import view_as_windowsdef keep_N_per_group_v4(a, N): m = np.r_[True,a[:-1]!=a[1:]] w = view_as_windows(m,N) idx = np.flatnonzero(m) v = idx<len(w) w[idx[v]] = 1 if v.all()==0: m[idx[v.argmin()]:] = 1 return a[m]
Approach #5 : With a view-based
method without indices from flatnonzero
-
def keep_N_per_group_v5(a, N): m = np.r_[True,a[:-1]!=a[1:]] w = view_as_windows(m,N) last_idx = len(a)-m[::-1].argmax()-1 w[m[:-N+1]] = 1 m[last_idx:last_idx+N] = 1 return a[m]
I want to present a solution using numba which should be fairly easy to understand. I assume that you want to "mask" consecutive repeating items:
import numpy as npimport numba as nb@nb.njitdef mask_more_n(arr, n): mask = np.ones(arr.shape, np.bool_) current = arr[0] count = 0 for idx, item in enumerate(arr): if item == current: count += 1 else: current = item count = 1 mask[idx] = count <= n return mask
For example:
>>> bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])>>> bins[mask_more_n(bins, 3)]array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5])>>> bins[mask_more_n(bins, 2)]array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5])
Performance:
Using simple_benchmark
- however I haven't included all approaches. It's a log-log scale:
It seems like the numba solution cannot beat the solution from Paul Panzer which seems to be faster for large arrays by a bit (and doesn't require an additional dependency).
However both seem to outperform the other solutions, but they do return a mask instead of the "filtered" array.
import numpy as npimport numba as nbfrom simple_benchmark import BenchmarkBuilder, MultiArgumentb = BenchmarkBuilder()bins = np.array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5])@nb.njitdef mask_more_n(arr, n): mask = np.ones(arr.shape, np.bool_) current = arr[0] count = 0 for idx, item in enumerate(arr): if item == current: count += 1 else: current = item count = 1 mask[idx] = count <= n return mask@b.add_function(warmups=True)def MSeifert(arr, n): return mask_more_n(arr, n)from scipy.ndimage.morphology import binary_dilation@b.add_function()def Divakar_1(a, N): k = np.ones(N,dtype=bool) m = np.r_[True,a[:-1]!=a[1:]] return a[binary_dilation(m,k,origin=-(N//2))]@b.add_function()def Divakar_2(a, N): k = np.ones(N,dtype=bool) return a[binary_dilation(np.ediff1d(a,to_begin=a[0])!=0,k,origin=-(N//2))]@b.add_function()def Divakar_3(a, N): m = np.r_[True,a[:-1]!=a[1:],True] idx = np.flatnonzero(m) c = np.diff(idx) return np.repeat(a[idx[:-1]],np.minimum(c,N))from skimage.util import view_as_windows@b.add_function()def Divakar_4(a, N): m = np.r_[True,a[:-1]!=a[1:]] w = view_as_windows(m,N) idx = np.flatnonzero(m) v = idx<len(w) w[idx[v]] = 1 if v.all()==0: m[idx[v.argmin()]:] = 1 return a[m]@b.add_function()def Divakar_5(a, N): m = np.r_[True,a[:-1]!=a[1:]] w = view_as_windows(m,N) last_idx = len(a)-m[::-1].argmax()-1 w[m[:-N+1]] = 1 m[last_idx:last_idx+N] = 1 return a[m]@b.add_function()def PaulPanzer(a,N): mask = np.empty(a.size,bool) mask[:N] = True np.not_equal(a[N:],a[:-N],out=mask[N:]) return maskimport random@b.add_arguments('array size')def argument_provider(): for exp in range(2, 20): size = 2**exp yield size, MultiArgument([np.array([random.randint(0, 5) for _ in range(size)]), 3])r = b.run()import matplotlib.pyplot as pltplt.figure(figsize=[10, 8])r.plot()