In [1]:
import pandas as pd
In [5]:
from pathlib import Path

import findspark

# Point findspark at the local Spark installation before importing pyspark.
# findspark.init(str(next(Path("/opt").glob("spark-3*"))))
findspark.init("/opt/spark-3.0.1-bin-hadoop3.2/")
# findspark.init("/opt/spark-2.3.0-bin-hadoop2.7")

from pyspark.sql import SparkSession, DataFrame
# Explicit imports instead of `from pyspark.sql.functions import *`:
# the wildcard pollutes the namespace (it shadows builtins such as
# `sum`, `min`, `max`, `abs`) and hides where names come from.
# Only these three functions are used in this notebook.
from pyspark.sql.functions import col, expr, lit
from pyspark.sql.types import StructType

spark = (
    SparkSession.builder
    .appName("PySpark_Str_Func")
    .enableHiveSupport()
    .getOrCreate()
)
In [3]:
# Build a small demo frame; Spark infers the schema from the pandas dtypes.
pdf = pd.DataFrame(data=[(5.0, 11), (8.0, 20), (1.0, -33)], columns=["x", "y"])
df = spark.createDataFrame(pdf)
df.show()
In [5]:
df.withColumn("z", col("y") + 1).show()
In [6]:
df.withColumn("z", col("y") - 1).show()
In [7]:
df.withColumn("z", col("y") * 2).show()
In [8]:
df.withColumn("z", col("y") / 2).show()
## Modulus (`%`)
In [16]:
df.withColumn("z", col("y") % 5).show()
In [17]:
df.withColumn("z", expr("y % 5")).show()
In [18]:
df.withColumn("z", expr("mod(y, 5)")).show()
In [19]:
df.withColumn("z", expr("pmod(y, 5)")).show()
In [20]:
df.createOrReplaceTempView("df")
In [25]:
spark.sql("select x, y, y % 5 as z from df").show()
In [26]:
spark.sql("select x, y, mod(y, 5) as z from df").show()
In [27]:
spark.sql("select x, y, pmod(y, 5) as z from df").show()
In Scala/Spark, a primitive value must appear after the operator. An expression will not compile if the primitive value comes before the arithmetic operator. There are two ways to resolve the issue:

- Use `lit` to create a constant column expression.
- Reorder the expression so that the primitive value comes after the operator.

However, in PySpark primitive types can be used before the operator.
In [4]:
df.withColumn("z", 1 - col("y")).show()
In [6]:
df.withColumn("z", lit(1) - col("y")).show()
In [ ]: