Does a Spark DataFrame have an equivalent of pandas' merge `indicator` option?
I adapted LostInOverflow's answer and got this working:
# Emulate the pandas merge indicator on Spark DataFrames: full-outer-join two
# frames, then derive a "_merge" column from which side's join key is null.
from pyspark.sql import Row
from pyspark.sql.functions import coalesce, when
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# NOTE(review): `sc` and `hc` are not defined in this snippet; presumably an
# existing SparkContext and HiveContext/SQLContext -- confirm in your session.

ASchema = StructType([
    StructField('ida', IntegerType(), nullable=False),
    StructField('name', StringType(), nullable=False),
])
BSchema = StructType([
    StructField('idb', IntegerType(), nullable=False),
    StructField('role', StringType(), nullable=False),
])

AData = sc.parallelize([
    Row(1, 'michel'),
    Row(2, 'diederik'),
    Row(3, 'rok'),
    Row(4, 'piet'),
])
BData = sc.parallelize([
    Row(1, 'engineer'),
    Row(2, 'lead'),
    Row(3, 'scientist'),
    Row(5, 'manager'),
])

ADF = hc.createDataFrame(AData, ASchema)
BDF = hc.createDataFrame(BData, BSchema)

# Full outer join keeps unmatched rows from both sides, with nulls filling
# the columns of the side that has no match.
DFJOIN = ADF.join(BDF, ADF['ida'] == BDF['idb'], "outer")
DFJOIN.show()
# +----+--------+----+---------+
# | ida|    name| idb|     role|
# +----+--------+----+---------+
# |   1|  michel|   1| engineer|
# |   2|diederik|   2|     lead|
# |   3|     rok|   3|scientist|
# |   4|    piet|null|     null|
# |null|    null|   5|  manager|
# +----+--------+----+---------+

# A null left key means the row exists only on the right, a null right key
# means it exists only on the left; otherwise the row matched on both sides.
DFJOINMERGE = (
    DFJOIN
    .withColumn(
        "_merge",
        when(DFJOIN["ida"].isNull(), "right_only")
        .when(DFJOIN["idb"].isNull(), "left_only")
        .otherwise("both"),
    )
    # Collapse the two key columns into a single "id" column.
    .withColumn("id", coalesce(ADF["ida"], BDF["idb"]))
    .drop(DFJOIN["ida"])
    .drop(DFJOIN["idb"])
)

# DFJOINMERGE.show()
DFJOINMERGE.groupBy("_merge").count().show()
# +----------+-----+
# |    _merge|count|
# +----------+-----+
# |right_only|    1|
# | left_only|    1|
# |      both|    3|
# +----------+-----+
Try this:
>>> from pyspark.sql.functions import *>>> sdf1 = sqlContext.createDataFrame(df1)>>> sdf2 = sqlContext.createDataFrame(df2)>>> sdf = sdf1.join(sdf2, sdf1["col1"] == sdf2["col1"], "outer")>>> sdf.withColumn("_merge", when(sdf1["col1"].isNull(), "right_only").when(sdf2["col1"].isNull(), "left_only").otherwise("both"))\... .withColumn("col1", coalesce(sdf1["col1"], sdf2["col1"]))\... .drop(sdf1["col1"])\... .drop(sdf2["col1"])