Take n rows from a Spark dataframe and pass to toPandas() — python

Take n rows from a spark dataframe and pass to toPandas()


You can use the limit(n) function:

# Build a small Spark DataFrame, keep only the first two rows,
# derive an extra column, and convert the result to pandas.
rows = [('Alice', 1), ('Jim', 2), ('Sandra', 3)]
df = sqlContext.createDataFrame(rows, ['name', 'age'])
df.limit(2).withColumn('age2', df.age + 2).toPandas()

Or:

# Same result as above, but the derived column is added first and the
# row cap is applied afterwards, just before collecting to pandas.
rows = [('Alice', 1), ('Jim', 2), ('Sandra', 3)]
df = sqlContext.createDataFrame(rows, ['name', 'age'])
df.withColumn('age2', df.age + 2).limit(2).toPandas()


You could get first rows of Spark DataFrame with head and then create Pandas DataFrame:

# Collect the first n rows with head() (a list of Row objects) and
# build a pandas DataFrame from them, reusing the Spark column names.
rows = [('Alice', 1), ('Jim', 2), ('Sandra', 3)]
df = sqlContext.createDataFrame(rows, ['name', 'age'])
df_pandas = pd.DataFrame(df.head(3), columns=df.columns)

# In [4]: df_pandas
# Out[4]:
#      name  age
# 0   Alice    1
# 1     Jim    2
# 2  Sandra    3


Try it:

def showDf(df, count=None, percent=None, maxColumns=0):
    """Display a Spark DataFrame as a pandas DataFrame inside IPython.

    Exactly one display mode is used per call:
      * count   -- show the first `count` rows (defaults to 10 when neither
                   count nor percent is given; 0 means "all rows").
      * percent -- show a random sample, `percent` being a fraction in
                   [0.0, 1.0] of the whole DataFrame.

    maxColumns: pandas display.max_columns setting; 0 means "all columns",
    a negative value leaves the current pandas setting untouched.

    Returns None. Does nothing when df is None.
    """
    # Guard clause: `is None`, not `== None` -- on a Spark DataFrame the
    # equality operator does not reliably test for None.
    if df is None:
        return
    import pandas
    from IPython.display import display
    # NOTE(review): 'display.encoding' was removed from recent pandas
    # versions -- confirm against the pandas version in use.
    pandas.set_option('display.encoding', 'UTF-8')
    # Pandas dataframe that will eventually be displayed
    dfp = None
    # maxColumns param
    if maxColumns >= 0:
        if maxColumns == 0:
            maxColumns = len(df.columns)
        pandas.set_option('display.max_columns', maxColumns)
    # count param
    if count is None and percent is None:
        count = 10  # Default count
    if count is not None:
        count = int(count)
        if count == 0:
            count = df.count()
        pandas.set_option('display.max_rows', count)
        dfp = pandas.DataFrame(df.head(count), columns=df.columns)
        display(dfp)
    # percent param
    elif percent is not None:
        percent = float(percent)
        if 0.0 <= percent <= 1.0:
            import datetime
            # Time-of-day seed so repeated calls sample different rows.
            # int(), not long(): long() does not exist in Python 3.
            now = datetime.datetime.now()
            seed = int(now.strftime("%H%M%S"))
            dfs = df.sample(False, percent, seed)
            count = df.count()
            pandas.set_option('display.max_rows', count)
            dfp = dfs.toPandas()
            display(dfp)

Examples of usages are:

# Shows the ten first rows of the Spark dataframe
showDf(df)
showDf(df, 10)
showDf(df, count=10)

# Shows a random sample which represents 15% of the Spark dataframe
showDf(df, percent=0.15)