Split a list into nested lists on a value Split a list into nested lists on a value python python

Split a list into nested lists on a value


>>> def isplit(iterable,splitters):    return [list(g) for k,g in itertools.groupby(iterable,lambda x:x in splitters) if not k]>>> isplit(L,(None,))[[1, 4], [6, 9], [3, 9, 4]]>>> isplit(L,(None,9))[[1, 4], [6], [3], [4]]

benchmark code:

import timeit    kabie=("isplit_kabie","""import itertoolsdef isplit_kabie(iterable,splitters):    return [list(g) for k,g in itertools.groupby(iterable,lambda x:x in splitters) if not k]""" )ssplit=("ssplit","""def ssplit(seq,splitters):    seq=list(seq)    if splitters and seq:        result=[]        begin=0        for end in range(len(seq)):            if seq[end] in splitters:                if end > begin:                    result.append(seq[begin:end])                begin=end+1        if begin<len(seq):            result.append(seq[begin:])        return result    return [seq]""" )ssplit2=("ssplit2","""def ssplit2(seq,splitters):    seq=list(seq)    if splitters and seq:        splitters=set(splitters).intersection(seq)        if splitters:            result=[]            begin=0            for end in range(len(seq)):                if seq[end] in splitters:                    if end > begin:                        result.append(seq[begin:end])                    begin=end+1            if begin<len(seq):                result.append(seq[begin:])            return result    return [seq]""" )emile=("magicsplit","""def _itersplit(l, *splitters):    current = []    for item in l:        if item in splitters:            yield current            current = []        else:            current.append(item)    yield currentdef magicsplit(l, splitters):    return [subl for subl in _itersplit(l, *splitters) if subl]""" )emile_improved=("magicsplit2","""def _itersplit(l, *splitters):    current = []    for item in l:        if item in splitters:            if current:                yield current                current = []        else:            current.append(item)    if current:        yield currentdef magicsplit2(l, splitters):    if splitters and l:        return [i for i in _itersplit(l, *splitters)]    return [list(l)]""" )karl=("ssplit_karl","""def ssplit_karl(original,splitters):    indices = [i for (i, x) in enumerate(original) if x in splitters]    ends = indices + [len(original)]    begins = [0] + [x + 1 for x in indices]    return [original[begin:end] for (begin, end) in zip(begins, ends)]""" )ryan=("split_on","""from functools import reducedef split_on (seq, delims, remove_empty=True):    '''Split seq into lists using delims as a delimiting elements.    For example, split_on(delims=2, list=xrange(0,5)) yields [ [0,1], [3,4] ].    delims can be either a single delimiting element or a list or    tuple of multiple delimiting elements. If you wish to use a list    or tuple as a delimiter, you must enclose it in another list or    tuple.    If remove_empty is False, then consecutive delimiter elements or delimiter elements at the beginning or end of the longlist'''    delims=set(delims)    def reduce_fun(lists, elem):        if elem in delims:            if remove_empty and lists[-1] == []:                # Avoid adding multiple empty lists                pass            else:                lists.append([])        else:            lists[-1].append(elem)        return lists    result_list = reduce(reduce_fun, seq, [ [], ])    # Maybe remove trailing empty list    if remove_empty and result_list[-1] == []:        result_list.pop()    return result_list""" )cases=(kabie, emile, emile_improved, ssplit ,ssplit2 ,ryan)data=(    ([1, 4, None, 6, 9, None, 3, 9, 4 ],(None,)),    ([1, 4, None, 6, 9, None, 3, 9, 4 ]*5,{None,9,7}),    ((),()),    (range(1000),()),    ("Split me",('','')),    ("split me "*100,' '),    ("split me,"*100,' ,'*20),    ("split me, please!"*100,' ,!'),    (range(100),range(100)),    (range(100),range(101,1000)),    (range(100),range(50,150)),    (list(range(100))*30,(99,)),    )params="seq,splitters"def benchmark(func,code,data,params='',times=10000,rounds=3,debug=''):    assert(func.isidentifier())    tester = timeit.Timer(stmt='{func}({params})'.format(                                func=func,params=params),                          setup="{code}\n".format(code=code)+            (params and "{params}={data}\n".format(params=params,data=data)) +            (debug and """ret=repr({func}({params}))print({func}.__name__.rjust(16),":",ret[:30]+"..."+ret[-15:] if len(ret)>50 else ret)                       """.format(func=func,params=params)))    results = [tester.timeit(times) for i in range(rounds)]    if not debug:        print("{:>16s} takes:{:6.4f},avg:{:.2e},best:{:.4f},worst:{:.4f}".format(            func,sum(results),sum(results)/times/rounds,min(results),max(results)))def testAll(cases,data,params='',times=10000,rounds=3,debug=''):    if debug:        times,rounds = 1,1    for dat in data:        sdat = tuple(map(repr,dat))        print("{}x{} times:".format(times,rounds),              ','.join("{}".format(d[:8]+"..."+d[-5:] if len(d)>16 else d)for d in map(repr,dat)))        for func,code in cases:            benchmark(func,code,dat,params,times,rounds,debug)if __name__=='__main__':    testAll(cases,data,params,500,10)#,debug=True)

Output on i3-530, Windows7, Python 3.1.2:

500x10 times: [1, 4, N...9, 4],(None,)    isplit_kabie takes:0.0605,avg:1.21e-05,best:0.0032,worst:0.0074      magicsplit takes:0.0287,avg:5.74e-06,best:0.0016,worst:0.0036     magicsplit2 takes:0.0174,avg:3.49e-06,best:0.0017,worst:0.0018          ssplit takes:0.0149,avg:2.99e-06,best:0.0015,worst:0.0016         ssplit2 takes:0.0198,avg:3.96e-06,best:0.0019,worst:0.0021        split_on takes:0.0229,avg:4.59e-06,best:0.0023,worst:0.0024500x10 times: [1, 4, N...9, 4],{9, None, 7}    isplit_kabie takes:0.1448,avg:2.90e-05,best:0.0144,worst:0.0146      magicsplit takes:0.0636,avg:1.27e-05,best:0.0063,worst:0.0065     magicsplit2 takes:0.0891,avg:1.78e-05,best:0.0064,worst:0.0162          ssplit takes:0.0593,avg:1.19e-05,best:0.0058,worst:0.0061         ssplit2 takes:0.1004,avg:2.01e-05,best:0.0069,worst:0.0142        split_on takes:0.0929,avg:1.86e-05,best:0.0090,worst:0.0096500x10 times: (),()    isplit_kabie takes:0.0041,avg:8.14e-07,best:0.0004,worst:0.0004      magicsplit takes:0.0040,avg:8.04e-07,best:0.0004,worst:0.0004     magicsplit2 takes:0.0022,avg:4.35e-07,best:0.0002,worst:0.0002          ssplit takes:0.0023,avg:4.59e-07,best:0.0002,worst:0.0003         ssplit2 takes:0.0023,avg:4.53e-07,best:0.0002,worst:0.0002        split_on takes:0.0072,avg:1.45e-06,best:0.0007,worst:0.0009500x10 times: range(0, 1000),()    isplit_kabie takes:0.8892,avg:1.78e-04,best:0.0881,worst:0.0895      magicsplit takes:0.6614,avg:1.32e-04,best:0.0654,worst:0.0673     magicsplit2 takes:0.0958,avg:1.92e-05,best:0.0094,worst:0.0099          ssplit takes:0.0943,avg:1.89e-05,best:0.0093,worst:0.0095         ssplit2 takes:0.0943,avg:1.89e-05,best:0.0093,worst:0.0096        split_on takes:1.3348,avg:2.67e-04,best:0.1328,worst:0.1340500x10 times: 'Split me',('', '')    isplit_kabie takes:0.0234,avg:4.68e-06,best:0.0023,worst:0.0024      magicsplit takes:0.0126,avg:2.52e-06,best:0.0012,worst:0.0013     magicsplit2 takes:0.0138,avg:2.76e-06,best:0.0013,worst:0.0015          ssplit takes:0.0119,avg:2.39e-06,best:0.0012,worst:0.0012         ssplit2 takes:0.0075,avg:1.50e-06,best:0.0007,worst:0.0008        split_on takes:0.0191,avg:3.83e-06,best:0.0018,worst:0.0023500x10 times: 'split m... me ',' '    isplit_kabie takes:2.0803,avg:4.16e-04,best:0.2060,worst:0.2098      magicsplit takes:0.9219,avg:1.84e-04,best:0.0920,worst:0.0925     magicsplit2 takes:1.0221,avg:2.04e-04,best:0.1018,worst:0.1034          ssplit takes:0.8294,avg:1.66e-04,best:0.0818,worst:0.0834         ssplit2 takes:0.9911,avg:1.98e-04,best:0.0983,worst:0.1014        split_on takes:1.5672,avg:3.13e-04,best:0.1543,worst:0.1694500x10 times: 'split m... me,',' , , , ... , ,'    isplit_kabie takes:2.1847,avg:4.37e-04,best:0.2164,worst:0.2275      magicsplit takes:3.7135,avg:7.43e-04,best:0.3693,worst:0.3783     magicsplit2 takes:3.8104,avg:7.62e-04,best:0.3795,worst:0.3884          ssplit takes:0.9522,avg:1.90e-04,best:0.0939,worst:0.0956         ssplit2 takes:1.0140,avg:2.03e-04,best:0.1009,worst:0.1023        split_on takes:1.5747,avg:3.15e-04,best:0.1563,worst:0.1615500x10 times: 'split m...ase!',' ,!'    isplit_kabie takes:3.3443,avg:6.69e-04,best:0.3324,worst:0.3380      magicsplit takes:2.0594,avg:4.12e-04,best:0.2054,worst:0.2076     magicsplit2 takes:2.1850,avg:4.37e-04,best:0.2180,worst:0.2191          ssplit takes:1.4881,avg:2.98e-04,best:0.1484,worst:0.1493         ssplit2 takes:1.8779,avg:3.76e-04,best:0.1868,worst:0.1920        split_on takes:2.9596,avg:5.92e-04,best:0.2946,worst:0.2980500x10 times: range(0, 100),range(0, 100)    isplit_kabie takes:0.9445,avg:1.89e-04,best:0.0933,worst:0.1023      magicsplit takes:0.5878,avg:1.18e-04,best:0.0583,worst:0.0593     magicsplit2 takes:0.5597,avg:1.12e-04,best:0.0554,worst:0.0588          ssplit takes:0.8568,avg:1.71e-04,best:0.0852,worst:0.0874         ssplit2 takes:0.1399,avg:2.80e-05,best:0.0121,worst:0.0242        split_on takes:0.1462,avg:2.92e-05,best:0.0145,worst:0.0148500x10 times: range(0, 100),range(101, 1000)    isplit_kabie takes:19.9749,avg:3.99e-03,best:1.9789,worst:2.0330      magicsplit takes:9.4997,avg:1.90e-03,best:0.9369,worst:0.9640     magicsplit2 takes:9.4394,avg:1.89e-03,best:0.9267,worst:0.9665          ssplit takes:19.2363,avg:3.85e-03,best:1.8936,worst:1.9516         ssplit2 takes:0.2032,avg:4.06e-05,best:0.0201,worst:0.0205        split_on takes:0.3329,avg:6.66e-05,best:0.0323,worst:0.0344500x10 times: range(0, 100),range(50, 150)    isplit_kabie takes:1.1394,avg:2.28e-04,best:0.1130,worst:0.1153      magicsplit takes:0.7288,avg:1.46e-04,best:0.0721,worst:0.0760     magicsplit2 takes:0.7220,avg:1.44e-04,best:0.0705,worst:0.0774          ssplit takes:1.0835,avg:2.17e-04,best:0.1059,worst:0.1116         ssplit2 takes:0.1092,avg:2.18e-05,best:0.0105,worst:0.0116        split_on takes:0.1639,avg:3.28e-05,best:0.0162,worst:0.0168500x10 times: [0, 1, 2..., 99],(99,)    isplit_kabie takes:3.2579,avg:6.52e-04,best:0.3225,worst:0.3360      magicsplit takes:2.2937,avg:4.59e-04,best:0.2274,worst:0.2344     magicsplit2 takes:2.6054,avg:5.21e-04,best:0.2587,worst:0.2642          ssplit takes:1.5251,avg:3.05e-04,best:0.1495,worst:0.1729         ssplit2 takes:1.7298,avg:3.46e-04,best:0.1696,worst:0.1858        split_on takes:4.1041,avg:8.21e-04,best:0.4033,worst:0.4291

Slightly modified Ryan's code, hope you don't mind.ssplit was based on the idea of Karl. Added statements handling some special cases to became ssplit2 which is the best solution I may provide.


Something like this:

def _itersplit(l, splitters):    current = []    for item in l:        if item in splitters:            yield current            current = []        else:            current.append(item)    yield currentdef magicsplit(l, *splitters):    return [subl for subl in _itersplit(l, splitters) if subl]

gets me:

>>> l = [1, 4, None, 6, 9, None, 3, 9, 4 ]>>> magicsplit(l, None)[[1, 4], [6, 9], [3, 9, 4]]>>> magicsplit(l, None, 9)[[1, 4], [6], [3], [4]]


You could find the indices of the "delimiter" elements. For example, the indices of None are 2 and 5 (zero-based).You could use these, along with the length of the list, to construct a list of tuples [ (0,1), (3,4), (6,8) ] representing the start indices and end indices of the sublists. Then you can use a list comprehension over this list of tuples to extract sublists.