Pandas alternative to apply - to create new column based on multiple columns
I see a reasonable performance improvement by using .loc
rather than chained indexing:
"""Benchmark: chained indexing vs. a single ``.loc`` call.

For each sampled row, pick a random ``c`` value from the rows of ``df`` whose
``a`` and ``c`` differ from the sampled row but whose ``b`` matches it.
``get_new`` selects with chained indexing (``df[mask]['c']``); ``get_new2``
selects with one ``.loc`` call (``df.loc[mask, 'c']``).
"""
import random
import timeit

import numpy as np
import pandas as pd

N_COPIES = 1_000_000

_template = pd.DataFrame(
    [[4, 5, 19], [1, 2, 0], [2, 5, 9], [8, 2, 5]],
    columns=['a', 'b', 'c'],
)

# Same content as pd.concat([_template] * N_COPIES): the four rows repeated in
# order, with the index labels 0..3 repeating as well.  Tiling the underlying
# array once avoids concatenating a million tiny frames.
df = pd.DataFrame(
    np.tile(_template.to_numpy(), (N_COPIES, 1)),
    columns=_template.columns,
    index=np.tile(_template.index.to_numpy(), N_COPIES),
)

x = df.sample(n=2)


def _choose(values):
    """Return one uniformly random element of the 1-D array *values*.

    The array is indexed directly instead of being passed to
    ``random.choice``: some Python versions test the sequence's truth value
    there, which raises ``ValueError`` for a NumPy array with more than one
    element.

    Raises:
        IndexError: if *values* is empty.
    """
    if len(values) == 0:
        raise IndexError('Cannot choose from an empty sequence')
    return values[random.randrange(len(values))]


def get_new(row, frame=None):
    """Pick a random replacement ``c`` using chained indexing.

    Args:
        row: an iterable that unpacks to ``(a, b, c)``.
        frame: DataFrame with columns ``a``, ``b`` and ``c`` to search;
            defaults to the module-level ``df``.

    Returns:
        A ``c`` value taken from a row of *frame* where ``a != row.a``,
        ``b == row.b`` and ``c != row.c``.

    Raises:
        IndexError: if no row of *frame* satisfies the condition.
    """
    if frame is None:
        frame = df
    a, b, c = row
    mask = (frame['a'] != a) & (frame['b'] == b) & (frame['c'] != c)
    # Chained indexing on purpose: filter all columns first, then take 'c'.
    return _choose(frame[mask]['c'].values)


def get_new2(row, frame=None):
    """Pick a random replacement ``c`` using a single ``.loc`` selection.

    Same contract as ``get_new``; only the selection differs, so the two can
    be timed against each other.

    Args:
        row: an iterable that unpacks to ``(a, b, c)``.
        frame: DataFrame with columns ``a``, ``b`` and ``c`` to search;
            defaults to the module-level ``df``.

    Returns:
        A ``c`` value taken from a row of *frame* where ``a != row.a``,
        ``b == row.b`` and ``c != row.c``.

    Raises:
        IndexError: if no row of *frame* satisfies the condition.
    """
    if frame is None:
        frame = df
    a, b, c = row
    mask = (frame['a'] != a) & (frame['b'] == b) & (frame['c'] != c)
    # Rows and column are selected in one step, so only 'c' is materialised.
    return _choose(frame.loc[mask, 'c'].values)


if __name__ == '__main__':
    # IPython equivalents, with the timings reported in the original answer:
    #   %timeit x.apply(get_new, axis=1)    # 159ms
    #   %timeit x.apply(get_new2, axis=1)   # 119ms
    runs = 10
    for label, func in (('chained indexing', get_new), ('.loc', get_new2)):
        seconds = timeit.timeit(lambda: x.apply(func, axis=1), number=runs)
        print(f'{label}: {seconds / runs * 1e3:.0f} ms per loop')