Delete rows from SQL Server based on content in dataframe (pandas)

Delete rows from SQL Server based on content in dataframe


Pandas doesn't support deletion of SQL rows based on specific conditions. You have to tell SQL Server which rows you want to delete:

import sqlalchemy as sa

engine = sa.create_engine('mssql+pyodbc://...')
meta = sa.MetaData()

# Map the Inventory table in your database to a SQLAlchemy object
inventory = sa.Table('Inventory', meta, autoload=True, autoload_with=engine)

# Build the WHERE clause of your DELETE statement from rows in the dataframe.
# Equivalent T-SQL:
#   WHERE (Year = ... AND Month = ...) OR (Year = ... AND Month = ...) OR ...
row_conditions = df.apply(
    lambda row: sa.and_(
        inventory.c['Year'] == row['Year'],
        inventory.c['Month'] == row['Month'],
    ),
    axis=1,
)
cond = sa.or_(*row_conditions)

# Define and execute the DELETE
delete = inventory.delete().where(cond)
with engine.connect() as conn:
    conn.execute(delete)

# Now you can insert the new data
df.to_sql('Inventory', engine, if_exists='append', index=False)


I think you have two good options.

1) working in Pandas. Query the existing table with pandas.read_sql_table(), import the csv file as a second DataFrame and merge-update the old with the new. Then write the updated DataFrame back, for example using df.to_sql(..., if_exists='replace'). (Note: pandas' if_exists parameter accepts only 'fail', 'replace' or 'append'; there is no 'update' option.)

2) use sqlalchemy and work in the database, particularly if you want to preserve the schema or other conditions.

Below is a brief and general example based on these two solutions. Other, more specific solutions are likely possible, but these are two starting points.

import sqlalchemy as sa
import sqlalchemy.ext.declarative as sa_dec
import sqlalchemy.orm as sa_orm
import pandas as pd
from sqlalchemy import update
from sqlalchemy import and_

# con = sqlite3.connect('hyp.db')
# cur = con.cursor()

# --- General pandas solution: outer-merge old and new, prefer new values ---
t1 = pd.DataFrame({'year': [1, 2, 3], 'month': [4, 5, 6], 'value': [2, 2, 2]})
t2 = pd.DataFrame({'year': [1, 5, 3], 'month': [4, 9, 9], 'value': [1, 5, 10]})
c = pd.merge(t1, t2, how='outer', on=['year', 'month'], suffixes=['', '_t2'])
# Wherever the new table supplied a value, overwrite the old one.
c.loc[c['value_t2'].notnull(), 'value'] = c.loc[c['value_t2'].notnull(), 'value_t2']
c = c.drop('value_t2', axis=1)
print(c)

# --- Same idea, using Series.update to copy non-NaN new values over old ---
t1 = pd.DataFrame({'year': [1, 2, 3], 'month': [4, 5, 6], 'value': [2, 2, 2]})
t2 = pd.DataFrame({'year': [1, 5, 3], 'month': [4, 9, 9], 'value': [1, 5, 10]})
c = pd.merge(t1, t2, how='outer', on=['year', 'month'], suffixes=['', '_t2'])
c['value'].update(c['value_t2'])
c = c.drop('value_t2', axis=1)
print(c)
# then c.to_sql(...)

# --- sqlalchemy solution: insert, fall back to UPDATE on unique conflict ---
Name = 'try.db'
Type = 'sqlite'
Url = sa.engine.url.URL(Type, database=Name)
Engine = sa.engine.create_engine(Url)
Base = sa_dec.declarative_base()
Session = sa_orm.sessionmaker(bind=Engine)


class Info(Base):
    __tablename__ = 'Inventory'
    id = sa.Column(sa.Integer, primary_key=True)
    # (Year, Month) is the natural key: duplicates raise IntegrityError,
    # which is what triggers the UPDATE path below.
    __table_args__ = (sa.UniqueConstraint('Year', 'Month'),)
    Year = sa.Column(sa.String(250))
    Month = sa.Column(sa.String(250))
    Value = sa.Column(sa.Float)


Base.metadata.create_all(Engine)

# change values of year and month to test
t = pd.DataFrame({'Year': [1, 2, 5], 'Month': ['Jun', 'July', 'Dec'], 'Value': [3, 3, 3]})

# This isn't very efficient, but it is here to give you a comprehensive
# example where you have good control on what is happening.
for i, r in t.iterrows():
    newdata = Info()
    for col, val in r.items():
        setattr(newdata, col, val)
    con = Engine.connect()
    session = Session()  # open sqlalchemy-sqlite session
    session.add(newdata)  # add Info instance to session to insert
    try:
        session.flush()  # test insert, to see if there is any error
    except sa.exc.IntegrityError:  # unique constraint hit: row already in db
        print('already in')
        session.rollback()  # rollback to remove the blocked instance
        # BUG FIX: the original compared Info.Year against r['Month'] here;
        # the second condition must match on the Month column.
        stmt = (
            update(Info)
            .where(and_(Info.Year == r['Year'], Info.Month == r['Month']))
            .values(Value=r['Value'])
        )
        con.execute(stmt)
    else:
        session.commit()  # commit changes to db
    finally:
        session.close()  # close session; it will be re-opened for new data
        con.close()

I tested the two solutions; they seem to work, but further tests are needed.


I ended up doing it like this:

import sqlalchemy as sa

# Derive a month-end posting date from Year + Posting_period (format YYYYMM).
Inventory['Posting_date'] = pd.to_datetime(
    Inventory.Year.astype('str') + Inventory.Posting_period.astype('str'),
    format="%Y%m",
) + MonthEnd(1)

# Delete periods already existing in the database in order to avoid
# duplicates when reloading etc.
delete_date_inv = (
    Inventory.Posting_date.drop_duplicates().astype('str').to_list()
)

# Apply the deletion on the destination table.
# Use a bound, "expanding" IN parameter instead of interpolating the list
# repr into the SQL string: safer (no injection / quoting problems) and
# compatible with SQLAlchemy 1.4/2.x, which require text() for raw SQL.
stmt = sa.text(
    "delete from Inventory where Posting_date in :dates"
).bindparams(sa.bindparam("dates", expanding=True))
with engine.begin() as connection:  # commits on success, always closes
    connection.execute(stmt, {"dates": delete_date_inv})

# Load to the database
Inventory.to_sql('Inventory', schema='dbo', con=engine,
                 if_exists='append', index=False, chunksize=10000)

I am not sure if the above is more efficient than the other answers, but it works :-)

Thank you all for your comments.