In [1]:
import pandas as pd
In [5]:
from pathlib import Path

import findspark

# Point findspark at the local Spark installation before importing pyspark.
# findspark.init(str(next(Path("/opt").glob("spark-3*"))))
findspark.init("/opt/spark-3.0.1-bin-hadoop3.2/")
# findspark.init("/opt/spark-2.3.0-bin-hadoop2.7")

from pyspark.sql import SparkSession, DataFrame
# Explicit imports instead of `from pyspark.sql.functions import *`:
# the wildcard pollutes the namespace (it shadows builtins such as
# `sum`, `min`, `max`, `abs`) and hides where names come from.
# Only these three functions are used in this notebook.
from pyspark.sql.functions import col, expr, lit
from pyspark.sql.types import StructType

spark = (
    SparkSession.builder
    .appName("PySpark_Str_Func")
    .enableHiveSupport()
    .getOrCreate()
)
In [3]:
# Build a small demo frame; Spark infers the schema from the pandas dtypes.
pdf = pd.DataFrame(data=[(5.0, 11), (8.0, 20), (1.0, -33)], columns=["x", "y"])
df = spark.createDataFrame(pdf)
df.show()
In [5]:
df.withColumn("z", col("y") + 1).show()
In [6]:
df.withColumn("z", col("y") - 1).show()
In [7]:
df.withColumn("z", col("y") * 2).show()
In [8]:
df.withColumn("z", col("y") / 2).show()
## Modulus (`%`)
In [16]:
df.withColumn("z", col("y") % 5).show()
In [17]:
df.withColumn("z", expr("y % 5")).show()
In [18]:
df.withColumn("z", expr("mod(y, 5)")).show()
In [19]:
df.withColumn("z", expr("pmod(y, 5)")).show()
In [20]:
df.createOrReplaceTempView("df")
In [25]:
spark.sql("select x, y, y % 5 as z from df").show()
In [26]:
spark.sql("select x, y, mod(y, 5) as z from df").show()
In [27]:
spark.sql("select x, y, pmod(y, 5) as z from df").show()
In Scala/Spark, a primitive value must appear after the operator. An expression will not compile if the primitive value comes before the arithmetic operator. There are two ways to resolve the issue:

- Use `lit` to create a constant column expression.
- Reorder the expression so that the primitive value comes after the operator.

However, in PySpark primitive types can be used before the operator.
In [4]:
df.withColumn("z", 1 - col("y")).show()
In [6]:
df.withColumn("z", lit(1) - col("y")).show()
In [ ]: