Comment¶
You can use withColumnRenamed
to rename a column in a DataFrame.
You can also do renaming using alias
when selecting columns.
In [4]:
import pandas as pd
In [5]:
from pathlib import Path

import findspark

# Point findspark at the local Spark install *before* importing pyspark.
# Alternative: discover the install directory automatically:
# findspark.init(str(next(Path("/opt").glob("spark-3*"))))
findspark.init("/opt/spark-3.0.1-bin-hadoop3.2/")
# findspark.init("/opt/spark-2.3.0-bin-hadoop2.7")

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *  # NOTE: shadows builtins such as max/min/sum
from pyspark.sql.types import (
    ArrayType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# getOrCreate reuses an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName("PySpark_Str_Func")
    .enableHiveSupport()
    .getOrCreate()
)
In [8]:
# Toy frame: an integer key, two string columns, and a double column.
pdf = pd.DataFrame(
    [
        (1, "a", "foo", 3.0),
        (1, "b", "bar", 4.0),
        (3, "c", "foo", 5.0),
        (4, "d", "bar", 7.0),
    ],
    columns=["_0", "_1", "_2", "_3"],
)
df = spark.createDataFrame(pdf)
df.show()
Get Column Name¶
There is no public API to get the name of a Column, unfortunately;
the private `Column._jc.toString()` is the usual workaround.
In [9]:
# Attribute access on the DataFrame returns a Column object (an expression),
# not the column's data.
df._0
Out[9]:
In [10]:
# str() of a Column gives its repr string, not a clean column name.
str(df._0)
Out[10]:
In [11]:
# The workaround from the note above: go through the private Java Column
# handle (`_jc`) to get a printable representation.
df._0._jc.toString()
Out[11]:
In [13]:
# NOTE: `max` here is pyspark.sql.functions.max (pulled in by the wildcard
# import), which shadows the Python builtin max.
c = max(df._0).alias("_0_max")
c
Out[13]:
In [14]:
# Same private-API trick applied to the aliased aggregate expression.
c._jc.toString()
Out[14]:
In [15]:
# Exploratory: alias() called with no arguments — not a documented use;
# result is whatever Out[15] showed, and may vary by pyspark version.
c.alias()
Out[15]:
In [16]:
# str() on the aliased Column, for comparison with the _jc.toString() form.
str(c)
Out[16]:
Drop Columns¶
Note: the remaining cells are a separate Scala Spark session (note the `$` column syntax and `val` declarations), not PySpark.
In [5]:
// Produce a new DataFrame without the listed columns; df itself is unchanged.
df.drop("_1", "_3").show()
Renaming One Column Using withColumnRenamed
¶
In [8]:
// Rename a single column; a non-matching old name would be silently ignored.
df.withColumnRenamed("_1", "x1").show()
Renaming One Column Using alias
¶
In [9]:
// Rename `_1` to `x1` during the select; the other columns pass through as-is.
df.select(
  $"_1".as("x1"),
  $"_2",
  $"_3",
  $"_4"
).show()
Batch Renaming Using withColumnRenamed
¶
In [12]:
// Old-name -> new-name pairs driving the batch renames in the cells below.
val lookup = Map("_1" -> "x1", "_2" -> "x2", "_3" -> "x3", "_4" -> "x4")
In [13]:
// Thread the DataFrame through foldLeft, applying one withColumnRenamed per pair.
lookup
  .foldLeft(df) { case (acc, (oldName, newName)) =>
    acc.withColumnRenamed(oldName, newName)
  }
  .show()
Batch Renaming Using alias
¶
In [14]:
// Alias every column in one select; names absent from `lookup` keep their original name.
val renamedCols = df.columns.map(c => col(c).alias(lookup.getOrElse(c, c)))
df.select(renamedCols: _*).show()