Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!
In [1]:
from typing import List, Tuple
import pandas as pd
In [2]:
from pathlib import Path
import findspark
# Locate the Spark 3.x installation under /opt and register it with findspark
# before the pyspark imports that follow.
spark_home = next(Path("/opt").glob("spark-3*"), None)
if spark_home is None:
    # A bare next() on an empty glob would raise StopIteration with no message.
    raise FileNotFoundError(
        "No Spark 3.x installation matching /opt/spark-3* was found."
    )
findspark.init(str(spark_home))
# findspark.init("/opt/spark-2.3.0-bin-hadoop2.7")
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import (
IntegerType,
StringType,
StructType,
StructField,
ArrayType,
)
# Build (or reuse) a Spark session with Hive support for the examples below.
spark = (
    SparkSession.builder
    .appName("PySpark_Str_Func")
    .enableHiveSupport()
    .getOrCreate()
)
In [3]:
# Three-row demo frame: col1 holds lists of ints, col2 strings, col3 ints.
demo_rows = [([1, 2], "how", 1), ([2, 3], "are", 2), ([3, 4], "you", 3)]
demo_pdf = pd.DataFrame(data=demo_rows, columns=["col1", "col2", "col3"])
df = spark.createDataFrame(demo_pdf)
df.show()
Not (~) for Column Expressions
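Use `~` to negate a boolean column expression.
Note that you cannot use the Python `not` keyword here: `not` tries to convert the Column into a single Python bool, which raises an error.
For the same reason, combine column expressions with `&` and `|` rather than `and` and `or`.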
In [4]:
# Negate the predicate with ~ to keep only the rows where col3 >= 3 is false.
col3_at_least_3 = col("col3") >= 3
df.filter(~col3_at_least_3).show()
In [5]:
# Combine two boolean column expressions with & (logical AND).
col3_below_3 = col("col3") < 3
col2_is_how = col("col2") == "how"
df.filter(col3_below_3 & col2_is_how).show()
In [6]:
# Combine two boolean column expressions with | (logical OR).
col3_at_least_3 = col("col3") >= 3
col2_is_how = col("col2") == "how"
df.filter(col3_at_least_3 | col2_is_how).show()
In [ ]: