Ben Chuanlong Du's Blog

It is never too late to learn.

Construct Simple Spark DataFrames Using Seq

Seq.toDF

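toDF() provides a concise syntax for creating DataFrames from local Scala collections such as Seq. It becomes available after importing the implicits of an existing SparkSession instance (here a value named spark):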

import spark.implicits._

SparkSession.createDataFrame

In [4]:
%%classpath add mvn
org.apache.spark spark-core_2.11 2.1.1
org.apache.spark spark-sql_2.11 2.1.1
In [33]:
import org.apache.spark.sql.{SparkSession, Row}
import org.apache.spark.sql.types._


// Create (or reuse, if one already exists) a SparkSession running in local mode.
// It is the entry point for every example below.
val spark = SparkSession.builder()
  .master("local")
  .appName("createDF example")
  // NOTE(review): looks like a placeholder option with no effect on these examples — confirm and drop if unused.
  .config("spark.some.config.option", "some-value")
  .getOrCreate()

// The implicits live on the session *instance*, so this import must come after
// `spark` is defined. It brings the toDF() conversions used below into scope.
import spark.implicits._
Out[33]:
org.apache.spark.sql.SparkSession$implicits$@2d2d2d87
In [24]:
// Every (Int, String) pair becomes one row; the names passed to toDF are
// assigned to the columns in order.
val someDF = {
  val numberedWords = Seq(8 -> "bat", 64 -> "mouse", -27 -> "horse")
  numberedWords.toDF("Number", "Words")
}
Out[24]:
[Number: int, Words: string]
In [29]:
// Print the rows as an ASCII table, then the schema (column names, types and nullability).
someDF.show()
someDF.printSchema()
+------+-----+
|Number|Words|
+------+-----+
|     8|  bat|
|    64|mouse|
|   -27|horse|
+------+-----+

root
 |-- Number: integer (nullable = false)
 |-- Words: string (nullable = true)

Out[29]:
null
In [34]:
// Rows carry no static type information, so the values are wrapped in Row
// objects here and described by an explicit schema below.
val someData = Seq((8, "bat"), (64, "mouse"), (-27, "horse")).map {
  case (number, word) => Row(number, word)
}

// One StructField per column: name, Spark SQL type, and whether nulls are allowed.
val someSchema = List(
  StructField(name = "number", dataType = IntegerType, nullable = true),
  StructField(name = "word", dataType = StringType, nullable = true)
)

// createDataFrame needs an RDD[Row] plus a StructType describing its columns.
val someDF = {
  val rowRDD = spark.sparkContext.parallelize(someData)
  spark.createDataFrame(rowRDD, StructType(someSchema))
}
Out[34]:
[number: int, word: string]
In [ ]:
// A Seq of plain values yields a single-column DataFrame; with no names given,
// the column gets Spark's default name (`value`).
// toDF() and show() are declared with empty parameter lists (show() is side-effecting),
// so they are called with parentheses, matching the other cells.
Seq(1, 2, 3).toDF().show()

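createDF() is not part of Spark itself; it is defined in the third-party spark-daria library and allows for the following terse syntax. The example is left commented out because only spark-core and spark-sql were added to the classpath above.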

In [40]:
// val someDF = spark.createDF(
//   List(
//     (8, "bat"),
//     (64, "mouse"),
//     (-27, "horse")
//   ), List(
//     ("number", IntegerType, true),
//     ("word", StringType, true)
//   )
// )
input is incomplete
In [ ]:

Comments