Does a Spark DataFrame have an equivalent of the pandas merge indicator option? (tags: pandas)

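Does a Spark DataFrame have an equivalent of the `indicator` option of pandas' `merge`? In pandas, `merge(..., indicator=True)` adds a `_merge` column whose value is `left_only`, `right_only`, or `both` for each row.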


I adapted LostInOverflow's answer and got this working:

# Emulate pandas' merge(indicator=True) on Spark DataFrames: full outer join
# two frames, then derive a "_merge" column from which side's key is null.
#
# Assumes `sc` (SparkContext) and `hc` (HiveContext / SQLContext) are already
# provided by the Spark shell or notebook -- they are not created here.
from pyspark.sql import Row
# Import only the functions used; a wildcard import of pyspark.sql.functions
# shadows Python builtins such as sum, min, max and round.
from pyspark.sql.functions import coalesce, when
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

ASchema = StructType([
    StructField('ida', IntegerType(), nullable=False),
    StructField('name', StringType(), nullable=False),
])
BSchema = StructType([
    StructField('idb', IntegerType(), nullable=False),
    StructField('role', StringType(), nullable=False),
])

AData = sc.parallelize([
    Row(1, 'michel'),
    Row(2, 'diederik'),
    Row(3, 'rok'),
    Row(4, 'piet'),
])
BData = sc.parallelize([
    Row(1, 'engineer'),
    Row(2, 'lead'),
    Row(3, 'scientist'),
    Row(5, 'manager'),
])

ADF = hc.createDataFrame(AData, ASchema)
BDF = hc.createDataFrame(BData, BSchema)

# Full outer join keeps unmatched rows from both sides; the missing side's
# columns come back as null, which is what the indicator is derived from.
DFJOIN = ADF.join(BDF, ADF['ida'] == BDF['idb'], "outer")
DFJOIN.show()
# +----+--------+----+---------+
# | ida|    name| idb|     role|
# +----+--------+----+---------+
# |   1|  michel|   1| engineer|
# |   2|diederik|   2|     lead|
# |   3|     rok|   3|scientist|
# |   4|    piet|null|     null|
# |null|    null|   5|  manager|
# +----+--------+----+---------+

# Left key null -> row exists only on the right; right key null -> only on
# the left; otherwise the row matched on both sides.
merge_indicator = (
    when(DFJOIN["ida"].isNull(), "right_only")
    .when(DFJOIN["idb"].isNull(), "left_only")
    .otherwise("both")
)

# Collapse the two key columns into a single "id" and drop the originals.
DFJOINMERGE = (
    DFJOIN
    .withColumn("_merge", merge_indicator)
    .withColumn("id", coalesce(DFJOIN["ida"], DFJOIN["idb"]))
    .drop(DFJOIN["ida"])
    .drop(DFJOIN["idb"])
)

# DFJOINMERGE.show()
DFJOINMERGE.groupBy("_merge").count().show()
# +----------+-----+
# |    _merge|count|
# +----------+-----+
# |right_only|    1|
# | left_only|    1|
# |      both|    3|
# +----------+-----+


Try this:

>>> from pyspark.sql.functions import *>>> sdf1 = sqlContext.createDataFrame(df1)>>> sdf2 = sqlContext.createDataFrame(df2)>>> sdf = sdf1.join(sdf2, sdf1["col1"] == sdf2["col1"], "outer")>>> sdf.withColumn("_merge", when(sdf1["col1"].isNull(), "right_only").when(sdf2["col1"].isNull(), "left_only").otherwise("both"))\...  .withColumn("col1", coalesce(sdf1["col1"], sdf2["col1"]))\...   .drop(sdf1["col1"])\...   .drop(sdf2["col1"])