|
def test_delete_threshold(session_catalog: Catalog) -> None:
    """Integration test: build a day-partitioned "default.scores" table and random data.

    Visible setup: defines a 3-column schema, recreates the table from scratch,
    then generates random columns for 100 rows.

    NOTE(review): the function appears truncated in this chunk — the write and
    delete steps that the test name implies are not visible here.
    """
    # Schema: required long id, optional date, optional double score.
    schema = Schema(
        NestedField(field_id=101, name="id", field_type=LongType(), required=True),
        NestedField(field_id=103, name="created_at", field_type=DateType(), required=False),
        NestedField(field_id=104, name="relevancy_score", field_type=DoubleType(), required=False),
    )

    # Partition by day of the created_at column (source_id=103).
    partition_spec = PartitionSpec(PartitionField(source_id=103, field_id=2000, transform=DayTransform(), name="created_at_day"))

    # Drop any leftover table from a previous run; absence is not an error.
    try:
        session_catalog.drop_table(
            identifier="default.scores",
        )
    except NoSuchTableError:
        pass

    # Fresh, empty, day-partitioned table.
    session_catalog.create_table(
        identifier="default.scores",
        schema=schema,
        partition_spec=partition_spec,
    )

    # Parameters
    num_rows = 100  # Number of rows in the dataframe
    id_min, id_max = 1, 10000
    date_start, date_end = date(2024, 1, 1), date(2024, 2, 1)

    # Generate the 'id' column (random ints in [id_min, id_max)).
    id_column = np.random.randint(id_min, id_max, num_rows)

    # Generate the 'created_at' column as dates only
    date_range = pd.date_range(start=date_start, end=date_end, freq="D")  # Daily frequency for dates
    created_at_column = np.random.choice(date_range, num_rows)  # Sample with replacement from the daily range

    # Generate the 'relevancy_score' column with a peak around 0.1
    relevancy_score_column = np.random.beta(a=2, b=20, size=num_rows)  # Adjusting parameters to peak around 0.1
Feature Request / Improvement
#1256 mentioned removing `numpy` as a dependency. `numpy` is currently only used in 1 function, iceberg-python/pyiceberg/io/pyarrow.py
Lines 810 to 815 in 583a7e9
and 2 tests
iceberg-python/tests/integration/test_writes/test_writes.py
Lines 1348 to 1383 in 583a7e9
iceberg-python/tests/integration/test_writes/test_writes.py
Lines 1404 to 1412 in 583a7e9
It seems like we can replace the `_combine_positional_deletes` use of `numpy` with pure Python operators or PyArrow operators.