Describe the bug
The is_null and is_not_null kernels are incorrect for UnionArrays
I believe the core problem is that UnionArray does not implement Array::logical_nulls
|
impl Array for UnionArray { |
|
fn as_any(&self) -> &dyn Any { |
|
self |
|
} |
|
|
|
fn to_data(&self) -> ArrayData { |
|
self.clone().into() |
|
} |
|
|
|
fn into_data(self) -> ArrayData { |
|
self.into() |
|
} |
|
|
|
fn data_type(&self) -> &DataType { |
|
&self.data_type |
|
} |
|
|
|
fn slice(&self, offset: usize, length: usize) -> ArrayRef { |
|
Arc::new(self.slice(offset, length)) |
|
} |
|
|
|
fn len(&self) -> usize { |
|
self.type_ids.len() |
|
} |
|
|
|
fn is_empty(&self) -> bool { |
|
self.type_ids.is_empty() |
|
} |
|
|
|
fn offset(&self) -> usize { |
|
0 |
|
} |
|
|
|
fn nulls(&self) -> Option<&NullBuffer> { |
|
None |
|
} |
|
|
|
/// Union types always return non null as there is no validity buffer. |
|
/// To check validity correctly you must check the underlying vector. |
|
fn is_null(&self, _index: usize) -> bool { |
|
false |
|
} |
|
|
|
/// Union types always return non null as there is no validity buffer. |
|
/// To check validity correctly you must check the underlying vector. |
|
fn is_valid(&self, _index: usize) -> bool { |
|
true |
|
} |
|
|
|
/// Union types always return 0 null count as there is no validity buffer. |
|
/// To get null count correctly you must check the underlying vector. |
|
fn null_count(&self) -> usize { |
|
0 |
|
} |
|
|
|
fn get_buffer_memory_size(&self) -> usize { |
|
let mut sum = self.type_ids.inner().capacity(); |
|
if let Some(o) = self.offsets.as_ref() { |
|
sum += o.inner().capacity() |
|
} |
|
self.fields |
|
.iter() |
|
.flat_map(|x| x.as_ref().map(|x| x.get_buffer_memory_size())) |
|
.sum::<usize>() |
|
+ sum |
|
} |
|
|
|
fn get_array_memory_size(&self) -> usize { |
|
let mut sum = self.type_ids.inner().capacity(); |
|
if let Some(o) = self.offsets.as_ref() { |
|
sum += o.inner().capacity() |
|
} |
|
std::mem::size_of::<Self>() |
|
+ self |
|
.fields |
|
.iter() |
|
.flat_map(|x| x.as_ref().map(|x| x.get_array_memory_size())) |
|
.sum::<usize>() |
|
+ sum |
|
} |
|
} |
And instead falls back to the default implementation (which calls Array::nulls):
|
fn logical_nulls(&self) -> Option<NullBuffer> { |
|
self.nulls().cloned() |
|
} |
To Reproduce
Run these tests in is_null.rs:
#[test]
fn test_null_array_is_not_null() {
let a = NullArray::new(3);
let res = is_not_null(&a).unwrap();
let expected = BooleanArray::from(vec![false, false, false]);
assert_eq!(expected, res);
assert!(res.nulls().is_none());
}
#[test]
fn test_dense_union_is_null() {
// union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
let int_array = Int32Array::from(vec![Some(1), None]);
let float_array = Float64Array::from(vec![Some(3.2), None]);
let str_array = StringArray::from(vec![Some("a"), None]);
let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();
let offsets = [0, 1, 0, 1, 0, 1]
.into_iter()
.collect::<ScalarBuffer<i32>>();
let children = vec![
Arc::new(int_array) as Arc<dyn Array>,
Arc::new(float_array),
Arc::new(str_array),
];
let array =
UnionArray::try_new(union_fields(), type_ids, Some(offsets), children)
.unwrap();
let result = is_null(&array).unwrap();
let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
assert_eq!(expected, &result);
}
#[test]
fn test_sparse_union_is_null() {
// union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
let int_array = Int32Array::from(vec![Some(1), None, None, None, None, None]);
let float_array =
Float64Array::from(vec![None, None, Some(3.2), None, None, None]);
let str_array = StringArray::from(vec![None, None, None, None, Some("a"), None]);
let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();
let children = vec![
Arc::new(int_array) as Arc<dyn Array>,
Arc::new(float_array),
Arc::new(str_array),
];
let array =
UnionArray::try_new(union_fields(), type_ids, None, children).unwrap();
let result = is_null(&array).unwrap();
let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
assert_eq!(expected, &result);
}
fn union_fields() -> UnionFields {
[
(0, Arc::new(Field::new("A", DataType::Int32, true))),
(1, Arc::new(Field::new("B", DataType::Float64, true))),
(2, Arc::new(Field::new("C", DataType::Utf8, true))),
]
.into_iter()
.collect()
}
Full diff
diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs
index ea8e12abbe2..0dd74a2d0b6 100644
--- a/arrow-arith/src/boolean.rs
+++ b/arrow-arith/src/boolean.rs
@@ -354,6 +354,8 @@ pub fn is_not_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
mod tests {
use super::*;
use std::sync::Arc;
+ use arrow_buffer::ScalarBuffer;
+ use arrow_schema::{DataType, Field, UnionFields};
#[test]
fn test_bool_array_and() {
@@ -911,4 +913,65 @@ mod tests {
assert_eq!(expected, res);
assert!(res.nulls().is_none());
}
+
+ #[test]
+ fn test_dense_union_is_null() {
+ // union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
+ let int_array = Int32Array::from(vec![Some(1), None]);
+ let float_array = Float64Array::from(vec![Some(3.2), None]);
+ let str_array = StringArray::from(vec![Some("a"), None]);
+ let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();
+ let offsets = [0, 1, 0, 1, 0, 1]
+ .into_iter()
+ .collect::<ScalarBuffer<i32>>();
+
+ let children = vec![
+ Arc::new(int_array) as Arc<dyn Array>,
+ Arc::new(float_array),
+ Arc::new(str_array),
+ ];
+
+ let array =
+ UnionArray::try_new(union_fields(), type_ids, Some(offsets), children)
+ .unwrap();
+
+ let result = is_null(&array).unwrap();
+
+ let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
+ assert_eq!(expected, &result);
+ }
+
+ #[test]
+ fn test_sparse_union_is_null() {
+ // union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
+ let int_array = Int32Array::from(vec![Some(1), None, None, None, None, None]);
+ let float_array =
+ Float64Array::from(vec![None, None, Some(3.2), None, None, None]);
+ let str_array = StringArray::from(vec![None, None, None, None, Some("a"), None]);
+ let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();
+
+ let children = vec![
+ Arc::new(int_array) as Arc<dyn Array>,
+ Arc::new(float_array),
+ Arc::new(str_array),
+ ];
+
+ let array =
+ UnionArray::try_new(union_fields(), type_ids, None, children).unwrap();
+
+ let result = is_null(&array).unwrap();
+
+ let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
+ assert_eq!(expected, &result);
+ }
+
+ fn union_fields() -> UnionFields {
+ [
+ (0, Arc::new(Field::new("A", DataType::Int32, true))),
+ (1, Arc::new(Field::new("B", DataType::Float64, true))),
+ (2, Arc::new(Field::new("C", DataType::Utf8, true))),
+ ]
+ .into_iter()
+ .collect()
+ }
}
Expected behavior
The tests should pass
Instead they currently error as is_null always returns false and is_not_null always returns true:
assertion `left == right` failed
left: BooleanArray
[
false,
true,
false,
true,
false,
true,
]
right: BooleanArray
[
false,
false,
false,
false,
false,
false,
]
Additional context
This was found by @samuelcolvin on apache/datafusion#11162
The reproducers are from his PR on apache/datafusion#11321
Describe the bug
The
is_nullandis_not_nullkernels are incorrect forUnionArraysI believe the core problem is that
UnionArraydoes not implementArray::logical_nullsarrow-rs/arrow-array/src/array/union_array.rs
Lines 445 to 525 in b9562b9
And instead falls back to the default implementation (which calls
Array::nulls):arrow-rs/arrow-array/src/array/mod.rs
Lines 212 to 214 in b9562b9
To Reproduce
Run these tests in
is_null.rs:Full diff
Expected behavior
The tests should pass
Instead they currently error as
is_nullalways returns false andis_not_nullalways returns true:Additional context
This was found by @samuelcolvin on apache/datafusion#11162
The reproducers are from his PR on apache/datafusion#11321