Skip to content

Incorrect values for is_null and is_not_null on UnionArray #6017

@alamb

Description

@alamb

Describe the bug
The is_null and is_not_null kernels are incorrect for UnionArrays

I believe the core problem is that UnionArray does not implement Array::logical_nulls

impl Array for UnionArray {
fn as_any(&self) -> &dyn Any {
self
}
fn to_data(&self) -> ArrayData {
self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
fn data_type(&self) -> &DataType {
&self.data_type
}
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
fn len(&self) -> usize {
self.type_ids.len()
}
fn is_empty(&self) -> bool {
self.type_ids.is_empty()
}
fn offset(&self) -> usize {
0
}
fn nulls(&self) -> Option<&NullBuffer> {
None
}
/// Union types always return non null as there is no validity buffer.
/// To check validity correctly you must check the underlying vector.
fn is_null(&self, _index: usize) -> bool {
false
}
/// Union types always return non null as there is no validity buffer.
/// To check validity correctly you must check the underlying vector.
fn is_valid(&self, _index: usize) -> bool {
true
}
/// Union types always return 0 null count as there is no validity buffer.
/// To get null count correctly you must check the underlying vector.
fn null_count(&self) -> usize {
0
}
fn get_buffer_memory_size(&self) -> usize {
let mut sum = self.type_ids.inner().capacity();
if let Some(o) = self.offsets.as_ref() {
sum += o.inner().capacity()
}
self.fields
.iter()
.flat_map(|x| x.as_ref().map(|x| x.get_buffer_memory_size()))
.sum::<usize>()
+ sum
}
fn get_array_memory_size(&self) -> usize {
let mut sum = self.type_ids.inner().capacity();
if let Some(o) = self.offsets.as_ref() {
sum += o.inner().capacity()
}
std::mem::size_of::<Self>()
+ self
.fields
.iter()
.flat_map(|x| x.as_ref().map(|x| x.get_array_memory_size()))
.sum::<usize>()
+ sum
}
}

And instead falls back to the default implementation (which calls Array::nulls):

fn logical_nulls(&self) -> Option<NullBuffer> {
self.nulls().cloned()
}

To Reproduce
Run these tests in is_null.rs:

    #[test]
    fn test_null_array_is_not_null() {
        let a = NullArray::new(3);

        let res = is_not_null(&a).unwrap();

        let expected = BooleanArray::from(vec![false, false, false]);

        assert_eq!(expected, res);
        assert!(res.nulls().is_none());
    }

    #[test]
    fn test_dense_union_is_null() {
        // union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
        let int_array = Int32Array::from(vec![Some(1), None]);
        let float_array = Float64Array::from(vec![Some(3.2), None]);
        let str_array = StringArray::from(vec![Some("a"), None]);
        let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();
        let offsets = [0, 1, 0, 1, 0, 1]
            .into_iter()
            .collect::<ScalarBuffer<i32>>();

        let children = vec![
            Arc::new(int_array) as Arc<dyn Array>,
            Arc::new(float_array),
            Arc::new(str_array),
        ];

        let array =
            UnionArray::try_new(union_fields(), type_ids, Some(offsets), children)
                .unwrap();

        let result = is_null(&array).unwrap();

        let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
        assert_eq!(expected, &result);
    }

    #[test]
    fn test_sparse_union_is_null() {
        // union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
        let int_array = Int32Array::from(vec![Some(1), None, None, None, None, None]);
        let float_array =
            Float64Array::from(vec![None, None, Some(3.2), None, None, None]);
        let str_array = StringArray::from(vec![None, None, None, None, Some("a"), None]);
        let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();

        let children = vec![
            Arc::new(int_array) as Arc<dyn Array>,
            Arc::new(float_array),
            Arc::new(str_array),
        ];

        let array =
            UnionArray::try_new(union_fields(), type_ids, None, children).unwrap();

        let result = is_null(&array).unwrap();

        let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
        assert_eq!(expected, &result);
    }

    fn union_fields() -> UnionFields {
        [
            (0, Arc::new(Field::new("A", DataType::Int32, true))),
            (1, Arc::new(Field::new("B", DataType::Float64, true))),
            (2, Arc::new(Field::new("C", DataType::Utf8, true))),
        ]
            .into_iter()
            .collect()
    }
Full diff

diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs
index ea8e12abbe2..0dd74a2d0b6 100644
--- a/arrow-arith/src/boolean.rs
+++ b/arrow-arith/src/boolean.rs
@@ -354,6 +354,8 @@ pub fn is_not_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
 mod tests {
     use super::*;
     use std::sync::Arc;
+    use arrow_buffer::ScalarBuffer;
+    use arrow_schema::{DataType, Field, UnionFields};

     #[test]
     fn test_bool_array_and() {
@@ -911,4 +913,65 @@ mod tests {
         assert_eq!(expected, res);
         assert!(res.nulls().is_none());
     }
+
+    #[test]
+    fn test_dense_union_is_null() {
+        // union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
+        let int_array = Int32Array::from(vec![Some(1), None]);
+        let float_array = Float64Array::from(vec![Some(3.2), None]);
+        let str_array = StringArray::from(vec![Some("a"), None]);
+        let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();
+        let offsets = [0, 1, 0, 1, 0, 1]
+            .into_iter()
+            .collect::<ScalarBuffer<i32>>();
+
+        let children = vec![
+            Arc::new(int_array) as Arc<dyn Array>,
+            Arc::new(float_array),
+            Arc::new(str_array),
+        ];
+
+        let array =
+            UnionArray::try_new(union_fields(), type_ids, Some(offsets), children)
+                .unwrap();
+
+        let result = is_null(&array).unwrap();
+
+        let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
+        assert_eq!(expected, &result);
+    }
+
+    #[test]
+    fn test_sparse_union_is_null() {
+        // union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
+        let int_array = Int32Array::from(vec![Some(1), None, None, None, None, None]);
+        let float_array =
+            Float64Array::from(vec![None, None, Some(3.2), None, None, None]);
+        let str_array = StringArray::from(vec![None, None, None, None, Some("a"), None]);
+        let type_ids = [0, 0, 1, 1, 2, 2].into_iter().collect::<ScalarBuffer<i8>>();
+
+        let children = vec![
+            Arc::new(int_array) as Arc<dyn Array>,
+            Arc::new(float_array),
+            Arc::new(str_array),
+        ];
+
+        let array =
+            UnionArray::try_new(union_fields(), type_ids, None, children).unwrap();
+
+        let result = is_null(&array).unwrap();
+
+        let expected = &BooleanArray::from(vec![false, true, false, true, false, true]);
+        assert_eq!(expected, &result);
+    }
+
+    fn union_fields() -> UnionFields {
+        [
+            (0, Arc::new(Field::new("A", DataType::Int32, true))),
+            (1, Arc::new(Field::new("B", DataType::Float64, true))),
+            (2, Arc::new(Field::new("C", DataType::Utf8, true))),
+        ]
+            .into_iter()
+            .collect()
+    }
 }

Expected behavior
The tests should pass

Instead they currently error as is_null always returns false and is_not_null always returns true:

assertion `left == right` failed
  left: BooleanArray
[
  false,
  true,
  false,
  true,
  false,
  true,
]
 right: BooleanArray
[
  false,
  false,
  false,
  false,
  false,
  false,
]

Additional context
This was found by @samuelcolvin on apache/datafusion#11162

The reproducers are from his PR on apache/datafusion#11321

Metadata

Metadata

Assignees

No one assigned

    Labels

    arrowChanges to the arrow cratebug

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions