Skip to content

Column projection: filling in default values is broken for tables that don't have exactly 1 row #1766

@lidavidm

Description

@lidavidm

Apache Iceberg version

main (development)

Please describe the bug 🐞

This happens on pyiceberg 0.9.0

result_batch = result_batch.set_column(index, name, [value])

This `set_column` call always adds a 1-row column. That is wrong (and PyArrow rejects it): the new column must have the same length as the record batch's other columns.

Reproducer
import os
import tempfile

import pyarrow.parquet
import pyiceberg.catalog
import pyiceberg.catalog.memory
import pyiceberg.io
import pyiceberg.io.pyarrow
import pyiceberg.partitioning
import pyiceberg.schema
import pyiceberg.table
import pyiceberg.transforms
import pyiceberg.typedef


# Table schema under test: one long data column plus the string "month"
# column that the partition spec below projects.  Both optional, no
# identifier fields.
_fields = [
    pyiceberg.schema.NestedField(
        field_id=fid,
        name=fname,
        field_type=ftype,
        required=False,
    )
    for fid, fname, ftype in (
        (1, "o_orderkey", pyiceberg.schema.LongType()),
        (2, "month", pyiceberg.schema.StringType()),
    )
]
schema = pyiceberg.schema.Schema(*_fields, schema_id=0, identifier_field_ids=[])

# Identity-partition on the "month" column (schema field id 2), so every
# data file carries a constant partition value that scans must project.
partition_spec = pyiceberg.partitioning.PartitionSpec(
    pyiceberg.partitioning.PartitionField(
        name="month",
        source_id=2,
        field_id=1000,
        transform=pyiceberg.transforms.IdentityTransform(),
    )
)


with tempfile.TemporaryDirectory() as warehouse_dir:
    print("Warehouse in", warehouse_dir)

    # In-memory catalog backed by the temp directory as its warehouse.
    catalog = pyiceberg.catalog.memory.InMemoryCatalog(
        "session",
        **{pyiceberg.io.WAREHOUSE: warehouse_dir},
    )
    catalog.create_namespace("session")

    # Write a 3-row parquet file that only has the data column, not the
    # partition column — the scan must fill "month" in as a constant.
    source_table = pyarrow.table({"o_orderkey": [1, 2, 3]})
    parquet_path = os.path.join(warehouse_dir, "orders.parquet")
    with open(parquet_path, "wb") as sink:
        pyarrow.parquet.write_table(source_table, sink)

    orders = catalog.create_table(
        identifier="session.orders",
        schema=schema,
        partition_spec=partition_spec,
    )

    # Work around lack of native support for doing this (I may have missed something)
    data_files = list(
        pyiceberg.io.pyarrow.parquet_files_to_data_files(
            orders.io, orders.metadata, [parquet_path]
        )
    )
    for data_file in data_files:
        data_file.partition = pyiceberg.typedef.Record(month="1992-02")

    with orders.transaction() as txn:
        # Ensure a name mapping exists so the file's columns resolve by name.
        if txn.table_metadata.name_mapping() is None:
            mapping_json = txn.table_metadata.schema().name_mapping.model_dump_json()
            txn.set_properties(
                **{
                    pyiceberg.table.TableProperties.DEFAULT_NAME_MAPPING: mapping_json,
                }
            )
        with txn.update_snapshot().fast_append() as snapshot_update:
            for data_file in data_files:
                snapshot_update.append_data_file(data_file)

    # Triggers the reported bug: projection inserts the constant partition
    # value as a 1-row column regardless of the batch's actual row count.
    print(orders.scan().to_arrow())
Output
Warehouse in /tmp/tmpy69j5uf6
Traceback (most recent call last):
  File "/home/lidavidm/Code/repro.py", line 80, in <module>
    print(scan.to_arrow())
          ^^^^^^^^^^^^^^^
  File "/home/lidavidm/Code/venv/lib/python3.12/site-packages/pyiceberg/table/__init__.py", line 1763, in to_arrow
    ).to_table(self.plan_files())
      ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/lidavidm/Code/venv/lib/python3.12/site-packages/pyiceberg/io/pyarrow.py", line 1575, in to_table
    if table_result := future.result():
                       ^^^^^^^^^^^^^^^
  File "/home/lidavidm/miniforge3/lib/python3.12/concurrent/futures/_base.py", line 449, in result
    return self.__get_result()
           ^^^^^^^^^^^^^^^^^^^
  File "/home/lidavidm/miniforge3/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result
    raise self._exception
  File "/home/lidavidm/miniforge3/lib/python3.12/concurrent/futures/thread.py", line 59, in run
    result = self.fn(*self.args, **self.kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/lidavidm/Code/venv/lib/python3.12/site-packages/pyiceberg/io/pyarrow.py", line 1556, in _table_from_scan_task
    batches = list(self._record_batches_from_scan_tasks_and_deletes([task], deletes_per_file))
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/lidavidm/Code/venv/lib/python3.12/site-packages/pyiceberg/io/pyarrow.py", line 1637, in _record_batches_from_scan_tasks_and_deletes
    for batch in batches:
                 ^^^^^^^
  File "/home/lidavidm/Code/venv/lib/python3.12/site-packages/pyiceberg/io/pyarrow.py", line 1441, in _task_to_record_batches
    result_batch = result_batch.set_column(index, name, [value])
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "pyarrow/table.pxi", line 2969, in pyarrow.lib.RecordBatch.set_column
  File "pyarrow/error.pxi", line 155, in pyarrow.lib.pyarrow_internal_check_status
  File "pyarrow/error.pxi", line 92, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: Added column's length must match record batch's length. Expected length 3 but got length 1
venv
annotated-types==0.7.0
cachetools==5.5.2
certifi==2025.1.31
charset-normalizer==3.4.1
click==8.1.8
fsspec==2025.2.0
greenlet==3.1.1
idna==3.10
markdown-it-py==3.0.0
mdurl==0.1.2
mmh3==5.1.0
pyarrow==19.0.1
pydantic==2.10.6
pydantic_core==2.27.2
Pygments==2.19.1
pyiceberg==0.9.0
pyparsing==3.2.1
python-dateutil==2.9.0.post0
requests==2.32.3
rich==13.9.4
six==1.17.0
sortedcontainers==2.4.0
SQLAlchemy==2.0.38
strictyaml==1.7.3
tenacity==9.0.0
typing_extensions==4.12.2
urllib3==2.3.0

Willingness to contribute

  • I can contribute a fix for this bug independently
  • I would be willing to contribute a fix for this bug with guidance from the Iceberg community
  • I cannot contribute a fix for this bug at this time

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions