Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!
Comments¶
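- Notice that a cell in a Parquet table has the type `Field` (`parquet::record::Field`), which is an enum with one variant per supported value type (e.g. `Field::Long`, `Field::ULong`, `Field::Double`).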
In [2]:
:timing
:sccache 1
:dep parquet = ">=14.0.0"
In [4]:
use std::fs::File;
use std::path::Path;
use parquet::file::reader::{FileReader, SerializedFileReader};
// Print every cell of `./bench.parquet` together with its column position and name.
let file = File::open("./bench.parquet").unwrap();
let reader = SerializedFileReader::new(file).unwrap();
for record in reader.get_row_iter(None).unwrap() {
    // Count the columns by hand; the counter restarts at zero for every row.
    let mut position = 0usize;
    for (column, value) in record.get_column_iter() {
        println!("column index: {}, column name: {}, column value: {}", position, column, value);
        position += 1;
    }
}
Out[4]:
Parse a Parquet file without knowing the order of columns.
In [25]:
// Check the round-4 score of every row against the stored `score_r4_it` value.
// Columns are looked up by name, so their order in the file does not matter.
//
// Panics if the file cannot be opened or read, if one of the columns
// `id0`, `id1`, `id2`, `score_r4_it` is missing or has an unexpected type,
// or if the computed score differs from the stored one by 1E-8 or more.
let mut play = PlayRounds::default();
let file = File::open("/workdir/ofcp_test_data/test_data_11_dedup/part-000.parquet").unwrap();
let reader = SerializedFileReader::new(file).unwrap();
for row in reader.get_row_iter(None).unwrap() {
    // Read an id that is stored as a signed 64-bit integer in this file.
    let read_id = |column: &str, field: &Field| -> u64 {
        match field {
            // `as` keeps the bit pattern of the i64 when converting to u64.
            Field::Long(id) => *id as u64,
            _ => panic!("Wrong type for {column}!"),
        }
    };
    // `None` means "column not seen yet", so a missing column is reported
    // explicitly instead of silently being treated as 0.
    let mut id0 = None;
    let mut id1 = None;
    let mut id2 = None;
    let mut score_it = None;
    for (name, field) in row.get_column_iter() {
        match name.as_str() {
            "id0" => id0 = Some(read_id("id0", field)),
            "id1" => id1 = Some(read_id("id1", field)),
            "id2" => id2 = Some(read_id("id2", field)),
            "score_r4_it" => {
                score_it = Some(match field {
                    Field::Double(s) => *s,
                    _ => panic!("Wrong type for score!"),
                });
            }
            // Any other column is irrelevant for this check.
            _ => {}
        }
    }
    let id0 = id0.expect("Column id0 is missing!");
    let id1 = id1.expect("Column id1 is missing!");
    let id2 = id2.expect("Column id2 is missing!");
    let score_it = score_it.expect("Column score_r4_it is missing!");
    play.set_ids(id0, id1, id2);
    let mut method = BruteForceMethod::Iteration;
    let s = play.score_r4(&from_id(ALL ^ id0 ^ id1 ^ id2).unwrap(), 2.0, &mut method);
    let delta = (score_it - s).abs();
    assert!(
        delta < 1E-8,
        "Round 4 score for the following case is not calculated correctly!
id0: {id0}, id1: {id1}, id2: {id2}, score_it: {score_it}, score_ana: {s}, delta: {delta}
",
    );
}
Out[25]:
Things become much easier if you know the exact order of the columns.
In [9]:
// Check the round-4 score of every row, reading the columns by position:
// the first three must be `ULong` ids (id0, id1, id2) and the fourth a `Double` score.
let mut play = PlayRounds::default();
let file = File::open("/workdir/ofcp_test_data/clean/test_data_11_dedup/part-000.parquet").unwrap();
let reader = SerializedFileReader::new(file).unwrap();
for row in reader.get_row_iter(None).unwrap() {
    let mut columns = row.get_column_iter();
    // Each `next()` consumes one column; the column name is not needed here.
    let (_, first) = columns.next().unwrap();
    let id0 = if let Field::ULong(id) = first {
        *id
    } else {
        panic!("Wrong type for id0!")
    };
    let (_, second) = columns.next().unwrap();
    let id1 = if let Field::ULong(id) = second {
        *id
    } else {
        panic!("Wrong type for id1!")
    };
    let (_, third) = columns.next().unwrap();
    let id2 = if let Field::ULong(id) = third {
        *id
    } else {
        panic!("Wrong type for id2!")
    };
    let (_, fourth) = columns.next().unwrap();
    let score_it = if let Field::Double(score) = fourth {
        *score
    } else {
        panic!("Wrong type for score!")
    };
    play.set_ids(id0, id1, id2);
    let r4_input = from_id(ALL ^ id0 ^ id1 ^ id2).unwrap();
    let mut brute_force = BruteForceMethod::Iteration;
    let s = play.score_r4(&r4_input, 2.0, &mut brute_force);
    let delta = (score_it - s).abs();
    // The stored and the computed score must agree to within 1E-8.
    assert!(
        delta < 1E-8,
        "Round 4 score for the following case is not calculated correctly!
id0: {id0}, id1: {id1}, id2: {id2}, score_it: {score_it}, score_ana: {s}, delta: {delta}
",
    );
}
Out[9]:
References¶
In [ ]: