Ben Chuanlong Du's Blog

It is never too late to learn.

Read Parquet Files Using Polars in Rust

Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!

In [2]:
:timing
:sccache 1
:dep polars = { version = "0.42.0", features = ["lazy", "parquet"] }
Out[2]:
Timing: true
sccache: true. Warning: dynamic linking disabled, use :cache instead to preserve dynamic linking
Out[2]:
Took 123503ms
In [5]:
use polars::df;
use polars::prelude::*;
use polars::datatypes::DataType;
use std::fs::File;
use std::io::BufWriter;
use std::io::Write;
Out[5]:
Took 220ms
In [6]:
// Lazily scan the Parquet file, then materialize the whole table in memory.
// `?` hands any scan/collect error back to the notebook instead of panicking.
// The binding is immutable on purpose: nothing in this notebook mutates `frame`,
// so the former `mut` only produced an `unused_mut` lint.
let frame = LazyFrame::scan_parquet(
        "part-000.parquet",
        ScanArgsParquet::default(),
    )?
    .collect()?;
// Leave the frame as the cell's final expression so the notebook renders it.
frame
Out[6]:
shape: (10_498_456, 4)
┌──────────────────┬───────────────────┬──────────────────┬─────────────┐
│ id0              ┆ id1               ┆ id2              ┆ score_r4_it │
│ ---              ┆ ---               ┆ ---              ┆ ---         │
│ u64              ┆ u64               ┆ u64              ┆ f64         │
╞══════════════════╪═══════════════════╪══════════════════╪═════════════╡
│ 33701888         ┆ 13510798882120448 ┆ 4101             ┆ -0.954137   │
│ 28               ┆ 4505798920142848  ┆ 2097282          ┆ -11.599546  │
│ 41943042         ┆ 275951782400      ┆ 336              ┆ -3.984118   │
│ 18939904         ┆ 4503599637331969  ┆ 26               ┆ -1.175188   │
│ 416              ┆ 4503599628682241  ┆ 74               ┆ -0.97172    │
│ …                ┆ …                 ┆ …                ┆ …           │
│ 2195456          ┆ 9007199523700769  ┆ 4503599627370502 ┆ 1.033871    │
│ 14               ┆ 557840            ┆ 224              ┆ -0.068309   │
│ 4503599627374600 ┆ 1074528352        ┆ 131600           ┆ 18.639906   │
│ 8388674          ┆ 70901825536       ┆ 4736             ┆ -4.103395   │
│ 1057             ┆ 274917753088      ┆ 4612             ┆ -10.065635  │
└──────────────────┴───────────────────┴──────────────────┴─────────────┘
Out[6]:
Took 2955ms

Count the Total Number of Rows of All Parquet Files

In [15]:
LazyFrame::scan_parquet(
        "part-000.parquet",
        ScanArgsParquet::default(),
    ).unwrap().count().collect()
Out[15]:
Ok(shape: (1, 4)
┌──────────┬──────────┬──────────┬─────────────┐
│ id0      ┆ id1      ┆ id2      ┆ score_r4_it │
│ ---      ┆ ---      ┆ ---      ┆ ---         │
│ u32      ┆ u32      ┆ u32      ┆ u32         │
╞══════════╪══════════╪══════════╪═════════════╡
│ 10498456 ┆ 10498456 ┆ 10498456 ┆ 10498456    │
└──────────┴──────────┴──────────┴─────────────┘)
Out[15]:
Took 2817ms
In [12]:
LazyFrame::scan_parquet(
        "part-000.parquet",
        ScanArgsParquet::default(),
    ).unwrap().select(
        &[col("*").count().cast(DataType::UInt64)]
    ).collect()
Out[12]:
Ok(shape: (1, 4)
┌──────────┬──────────┬──────────┬─────────────┐
│ id0      ┆ id1      ┆ id2      ┆ score_r4_it │
│ ---      ┆ ---      ┆ ---      ┆ ---         │
│ u64      ┆ u64      ┆ u64      ┆ u64         │
╞══════════╪══════════╪══════════╪═════════════╡
│ 10498456 ┆ 10498456 ┆ 10498456 ┆ 10498456    │
└──────────┴──────────┴──────────┴─────────────┘)
Out[12]:
Took 2965ms
In [16]:
LazyFrame::scan_parquet(
        "part-000.parquet",
        ScanArgsParquet::default(),
    ).unwrap().select(
        &[col("id0").count().cast(DataType::UInt64).alias("n")]
    ).collect().unwrap()["n"].u64().unwrap().get(0).unwrap()
Out[16]:
10498456
Out[16]:
Took 3088ms
In [17]:
LazyFrame::scan_parquet(
        "part-000.parquet",
        ScanArgsParquet::default(),
    ).unwrap().select(
        &[lit(1).count().cast(DataType::UInt64).alias("n")]
    ).collect().unwrap()["n"].u64().unwrap().get(0).unwrap()
Out[17]:
1
Out[17]:
Took 4176ms
In [ ]:

Comments