In [9]:
import pandas as pd
In [8]:
import socket
import findspark
findspark.init("/opt/spark-3.2.0-bin-hadoop3.2")
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
spark = SparkSession.builder.appName("PySpark").enableHiveSupport().getOrCreate()
In [11]:
df = spark.createDataFrame(
    pd.DataFrame(
        data=(
            ("Ben", "Du", 1),
            ("Ben", "Du", 2),
            ("Ben", "Tu", 3),
            ("Ben", "Tu", 4),
            ("Ken", "Xu", 1),
            ("Ken", "Xu", 9),
        ),
        columns=("fname", "lname", "score"),
    )
)
df.show()
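For comparison, the same DataFrame can be built without going through pandas by passing the rows and column names directly to spark.createDataFrame. A minimal sketch (df2 is just an illustrative name):

# Equivalent construction without pandas: rows as tuples plus a column-name schema.
df2 = spark.createDataFrame(
    [
        ("Ben", "Du", 1),
        ("Ben", "Du", 2),
        ("Ben", "Tu", 3),
        ("Ben", "Tu", 4),
        ("Ken", "Xu", 1),
        ("Ken", "Xu", 9),
    ],
    schema=["fname", "lname", "score"],
)
df2.show()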
DataFrame.stat.approxQuantile
Notice that approxQuantile(col, probabilities, relativeError) returns a list of floats (a Double array on the JVM side), with one value per requested probability. The relativeError argument trades accuracy for cost: smaller values give more accurate quantiles but are more expensive, and 0 computes exact quantiles.
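Because probabilities is a list, several quantiles can be requested in one pass; the probabilities below are just an example:

# Approximate 25th, 50th, and 75th percentiles of "score" in a single call;
# the result is a list with one float per requested probability.
quartiles = df.stat.approxQuantile("score", [0.25, 0.5, 0.75], 0.01)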
In [16]:
df.stat.approxQuantile("score", [0.5], 0.1)
Out[16]:
In [15]:
df.stat.approxQuantile("score", [0.5], 0.001)
Out[15]:
In [18]:
df.stat.approxQuantile("score", [0.5], 0.5)
Out[18]:
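Two related usages, sketched under the assumptions noted in the comments: passing relativeError=0 requests exact quantiles (potentially expensive on large data), and passing a list of numeric column names returns one list of quantiles per column.

# relativeError=0 computes the exact median, which can be expensive on big DataFrames.
exact_median = df.stat.approxQuantile("score", [0.5], 0.0)

# With a list of numeric columns, the result is a list of lists (one per column);
# "col1" and "col2" here are hypothetical numeric columns, not from the example data.
# quantiles = some_df.stat.approxQuantile(["col1", "col2"], [0.5], 0.01)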