Ben Chuanlong Du's Blog

It is never too late to learn.

The filter Function in Spark DataFrame

In [1]:
%%classpath add mvn
org.apache.spark spark-core_2.11 2.1.1
org.apache.spark spark-sql_2.11 2.1.1
In [2]:
import org.apache.spark.sql.SparkSession

// Build (or reuse) a local SparkSession; `config` just demonstrates setting an
// arbitrary option — the key "spark.some.config.option" has no special meaning.
val spark = SparkSession.
    builder().
    master("local").
    appName("Spark SQL basic example").
    config("spark.some.config.option", "some-value").
    getOrCreate()
// Echo the session so the notebook displays it.
spark

import spark.implicits._
Out[2]:
org.apache.spark.sql.SparkSession$implicits$@7e12bd6b
In [3]:
// Load a line-delimited JSON file; the schema (age: long nullable, name: string)
// is inferred, as the table printed below shows.
val df = spark.read.json("../../data/people.json")
df.show
+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

Out[3]:
null

Filtering

In [12]:
df.filter($"age" > 21).show()
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

Out[12]:
null
In [14]:
import org.apache.spark.sql.Column


/** Returns a boolean Column that is true where `column` is greater than `v`. */
def greaterThan(column: Column, v: Int): Column =
  column > v
Out[14]:
import org.apache.spark.sql.Column
greaterThan: (column: org.apache.spark.sql.Column, v: Int)org.apache.spark.sql.Column
In [15]:
df.filter(greaterThan(df("age"), 21)).show
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

Out[15]:
null
In [16]:
df.filter(greaterThan($"age", 21)).show
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

Out[16]:
null
In [17]:
df.filter($"age" === 30).show
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

Out[17]:
null
In [18]:
df.filter($"age" === null).show
+---+----+
|age|name|
+---+----+
+---+----+

Out[18]:
null
In [19]:
df.filter($"age" <=> 30).show
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

Out[19]:
null
In [20]:
df.filter($"age" <=> null).show
+----+-------+
| age|   name|
+----+-------+
|null|Michael|
+----+-------+

Out[20]:
null
In [ ]:

Comments