Ben Chuanlong Du's Blog

It is never too late to learn.

Coalesce and Repartition in Spark DataFrame

In [1]:
%%classpath add mvn
org.apache.spark spark-core_2.11 2.3.1
org.apache.spark spark-sql_2.11 2.3.1
In [2]:
// Bring in the SparkSession entry point and Spark's built-in column functions.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Build (or reuse, via getOrCreate) a SparkSession running locally with 2 threads.
val spark = SparkSession.builder()
    .master("local[2]")
    .appName("Spark Column Example")
    .config("spark.some.config.option", "some-value")  // placeholder config entry
    .getOrCreate()

// Enables implicit conversions such as .toDF/.toDS on Scala collections and $"col" syntax.
import spark.implicits._
Out[2]:
org.apache.spark.sql.SparkSession$implicits$@60ad5f12
In [3]:
// Load the sample JSON file into a DataFrame (schema is inferred from the data).
// NOTE(review): path is relative to the notebook's working directory — adjust if rerunning.
val df = spark.read.json("../../data/people.json")
df.show
+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

Out[3]:
null

Get Number of Partitions

In [4]:
df.rdd.getNumPartitions
Out[4]:
1

Repartition

In [6]:
val df2 = df.repartition(4)
Out[6]:
[age: bigint, name: string]
In [7]:
df2.rdd.getNumPartitions
Out[7]:
4
In [ ]:

Comments