How to one-hot-encode from a pandas column containing a list?
We can also use sklearn.preprocessing.MultiLabelBinarizer:
Often we want to use sparse DataFrame for the real world data in order to save a lot of RAM.
Sparse solution (for Pandas v0.25.0+)
from sklearn.preprocessing import MultiLabelBinarizermlb = MultiLabelBinarizer(sparse_output=True)df = df.join( pd.DataFrame.sparse.from_spmatrix( mlb.fit_transform(df.pop('Col3')), index=df.index, columns=mlb.classes_))
result:
In [38]: dfOut[38]: Col1 Col2 Apple Banana Grape Orange0 C 33.0 1 1 0 11 A 2.5 1 0 1 02 B 42.0 0 1 0 0In [39]: df.dtypesOut[39]:Col1 objectCol2 float64Apple Sparse[int32, 0]Banana Sparse[int32, 0]Grape Sparse[int32, 0]Orange Sparse[int32, 0]dtype: objectIn [40]: df.memory_usage()Out[40]:Index 128Col1 24Col2 24Apple 16 # <--- NOTE!Banana 16 # <--- NOTE!Grape 8 # <--- NOTE!Orange 8 # <--- NOTE!dtype: int64
Dense solution
mlb = MultiLabelBinarizer()df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('Col3')), columns=mlb.classes_, index=df.index))
Result:
In [77]: dfOut[77]: Col1 Col2 Apple Banana Grape Orange0 C 33.0 1 1 0 11 A 2.5 1 0 1 02 B 42.0 0 1 0 0
Option 1
Short Answerpir_slow
df.drop('Col3', 1).join(df.Col3.str.join('|').str.get_dummies()) Col1 Col2 Apple Banana Grape Orange0 C 33.0 1 1 0 11 A 2.5 1 0 1 02 B 42.0 0 1 0 0
Option 2
Fast Answerpir_fast
v = df.Col3.valuesl = [len(x) for x in v.tolist()]f, u = pd.factorize(np.concatenate(v))n, m = len(v), u.sizei = np.arange(n).repeat(l)dummies = pd.DataFrame( np.bincount(i * m + f, minlength=n * m).reshape(n, m), df.index, u)df.drop('Col3', 1).join(dummies) Col1 Col2 Apple Orange Banana Grape0 C 33.0 1 1 1 01 A 2.5 1 0 0 12 B 42.0 0 0 1 0
Option 3pir_alt1
df.drop('Col3', 1).join( pd.get_dummies( pd.DataFrame(df.Col3.tolist()).stack() ).astype(int).sum(level=0)) Col1 Col2 Apple Orange Banana Grape0 C 33.0 1 1 1 01 A 2.5 1 0 0 12 B 42.0 0 0 1 0
Timing Results
Code Below
def maxu(df): mlb = MultiLabelBinarizer() d = pd.DataFrame( mlb.fit_transform(df.Col3.values) , df.index, mlb.classes_ ) return df.drop('Col3', 1).join(d)def bos(df): return df.drop('Col3', 1).assign(**pd.get_dummies(df.Col3.apply(lambda x:pd.Series(x)).stack().reset_index(level=1,drop=True)).sum(level=0))def psi(df): return pd.concat([ df.drop("Col3", 1), df.Col3.apply(lambda x: pd.Series(1, x)).fillna(0) ], axis=1)def alex(df): return df[['Col1', 'Col2']].assign(**{fruit: [1 if fruit in cell else 0 for cell in df.Col3] for fruit in set(fruit for fruits in df.Col3 for fruit in fruits)})def pir_slow(df): return df.drop('Col3', 1).join(df.Col3.str.join('|').str.get_dummies())def pir_alt1(df): return df.drop('Col3', 1).join(pd.get_dummies(pd.DataFrame(df.Col3.tolist()).stack()).astype(int).sum(level=0))def pir_fast(df): v = df.Col3.values l = [len(x) for x in v.tolist()] f, u = pd.factorize(np.concatenate(v)) n, m = len(v), u.size i = np.arange(n).repeat(l) dummies = pd.DataFrame( np.bincount(i * m + f, minlength=n * m).reshape(n, m), df.index, u ) return df.drop('Col3', 1).join(dummies)results = pd.DataFrame( index=(1, 3, 10, 30, 100, 300, 1000, 3000), columns='maxu bos psi alex pir_slow pir_fast pir_alt1'.split())for i in results.index: d = pd.concat([df] * i, ignore_index=True) for j in results.columns: stmt = '{}(d)'.format(j) setp = 'from __main__ import d, {}'.format(j) results.set_value(i, j, timeit(stmt, setp, number=10))