Pandas column reformatting
This is a little bit complicate
(df.set_index('Code') .Items.str.split(';',expand=True) .stack() .str.split('-',expand=True) .set_index(0,append=True)[1] .unstack() .fillna('') .sum(level=0))0 ca co eq go tpCode 123 hk 147 ml ml 258 us 321 ch 369 usmy 456 eu lbe654 au au 741 us 789 us 852 eu 963 ml ml ml 987 jp # using str split to get unnest the column, #then we do stack, and str split again , then set the first column to index # after unstack we yield the result
List comprehensions work better (read: much faster) for string problems like this which require multiple levels of splitting.
df2 = pd.DataFrame([ dict(y.split('-') for y in x.split('; ')) for x in df.Items]).fillna('')df2.insert(0, 'Code', df.Code)print(df2) Code ca co eq go tp0 123 hk 1 456 eu lbe2 789 us 3 321 ch 4 654 au au 5 987 jp 6 147 ml ml 7 258 us # Should be "us,my"... see below.8 369 my 9 741 us 10 852 eu 11 963 ml ml ml
This does not handle the situation where multiple items with the same key can be present in a row. For that, a slightly more involved solution is needed.
from itertools import chainv = [x.split('; ') for x in df.Items] X = pd.Series(df.Code.values.repeat([len(x) for x in v]))Y = pd.DataFrame([x.split('-') for x in chain.from_iterable(v)])df2 = pd.concat([X, Y], axis=1, ignore_index=True)(df2.set_index([0, 1, 3])[2] .unstack(1) .fillna('') .groupby(level=0) .agg(lambda x: ','.join(x).strip(','))1 ca co eq go tp0 123 hk 147 ml ml 258 us 321 ch 369 us,my 456 eu lbe654 au au 741 us 789 us 852 eu 963 ml ml ml 987 jp
import pandas as pddf = pd.DataFrame([ ('123', 'eq-hk'), ('456', 'ca-eu; tp-lbe'), ('789', 'ca-us'), ('321', 'go-ch'), ('654', 'ca-au; go-au'), ('987', 'go-jp'), ('147', 'co-ml; go-ml'), ('258', 'ca-us'), ('369', 'ca-us; ca-my'), ('741', 'ca-us'), ('852', 'ca-eu'), ('963', 'ca-ml; co-ml; go-ml')], columns=['Code', 'Items'])# Get item type list from each row, sum (concatenate) the lists and convert# to a set to remove duplicates item_types = set(df['Items'].str.findall('(\w+)-').sum())print(item_types)# {'ca', 'co', 'eq', 'go', 'tp'}# Generate a column for each item typedf1 = pd.DataFrame(df['Code'])for t in item_types: df1[t] = df['Items'].str.findall('%s-(\w+)' % t).apply(lambda x: ''.join(x))print(df1)# Code ca tp eq co go#0 123 hk #1 456 eu lbe #2 789 us #3 321 ch#4 654 au au#5 987 jp#6 147 ml ml#7 258 us #8 369 usmy #9 741 us #10 852 eu #11 963 ml ml ml