Skip to content

Repack Library

The owid-repack library handles data packaging, compression, and format optimization.

You can install it via pip:

pip install owid-repack

owid.repack

Functions:

  • repack_frame

    Convert the DataFrame's columns to the most compact types possible.

  • series_eq

    Check that series are equal, but unlike normal floating point checks where NaN != NaN, missing or null values are reported as equal to each other.

  • shrink_float

    Take a Float64 series and make it as small as possible.

  • shrink_integer

    Take an Int64 series and make it as small as possible.

  • to_safe_types

    Convert numeric columns to Float64 and Int64 and categorical columns to string[pyarrow].

repack_frame

repack_frame(
    df: DataFrame,
    remap: Optional[Dict[str, str]] = None,
    dtypes: Optional[Dict[str, Any]] = {},
) -> DataFrame

Convert the DataFrame's columns to the most compact types possible. Rename columns if necessary during the repacking. The column renames work even if the column is part of the index.

Parameters:

Source code in lib/repack/owid/repack/__init__.py
def repack_frame(
    df: pd.DataFrame,
    remap: Optional[Dict[str, str]] = None,
    dtypes: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    """
    Convert the DataFrame's columns to the most compact types possible.
    Rename columns if necessary during the repacking. The column renames
    work even if the column is part of the index.

    Args:
        df: frame to repack; must have a default (unnamed) index
        remap: remap column names
        dtypes: dictionary of fixed dtypes to use; columns listed here are
            kept as given instead of being repacked

    Returns:
        A new DataFrame with compacted column dtypes.

    Raises:
        ValueError: if the frame has a named index, or if any column is
            still of ``object`` dtype after repacking.
    """
    # NOTE: the default for `dtypes` used to be a mutable `{}`, which is a
    # Python anti-pattern (shared across calls); `None` is equivalent here
    # because of the `dtypes or {}` normalisation below.
    if df.index.names != [None]:
        raise ValueError("repacking is lost for index columns")

    remap = remap or {}
    dtypes = dtypes or {}

    # unwind the primary key
    # NOTE: given the guard above, the index is always unnamed here, so
    # `primary_key` ends up empty; the else-branch is kept for safety.
    if len(df.index.names) == 1 and not df.index.names[0]:
        primary_key = []
    else:
        primary_key = cast(List[str], df.index.names)
        df.reset_index(inplace=True)

    # repack each column into the best dtype we can give it; columns with a
    # fixed dtype requested are passed through untouched
    df = pd.concat(
        [repack_series(df.loc[:, col]) if col not in dtypes else df[col] for col in df.columns],
        axis=1,
    )

    # use given dtypes
    if dtypes:
        df = df.astype(dtypes)

    # remap all column names, including those in the primary key
    for from_, to_ in remap.items():
        if from_ in df.columns:
            df.rename(columns={from_: to_}, inplace=True)
    primary_key = [remap.get(k, k) for k in primary_key]

    # object columns defeat the point of repacking; force callers to convert
    for col in df.columns:
        if df[col].dtype == "object":
            raise ValueError(f"Column {col} is still object. Consider converting it to str.")

    # set the primary key back again
    if primary_key:
        df.set_index(primary_key, inplace=True)

    return df

series_eq

series_eq(
    lhs: Series,
    rhs: Series,
    rtol: float = 1e-05,
    atol: float = 1e-08,
) -> bool

Check that series are equal, but unlike normal floating point checks where NaN != NaN, we want missing or null values to be reported as equal to each other.

Source code in lib/repack/owid/repack/__init__.py
def series_eq(lhs: pd.Series, rhs: pd.Series, rtol: float = 1e-5, atol: float = 1e-8) -> bool:
    """
    Check that series are equal, but unlike normal floating point checks where
    NaN != NaN, we want missing or null values to be reported as equal to each
    other.
    """
    # Different lengths can never be equal, and allclose would raise on them.
    if len(lhs) != len(rhs):
        return False

    # equal_nan=True makes NaN compare equal to NaN, which is what we want
    # for missing data.
    # NOTE: this could be speeded up with numpy methods or smarter comparison,
    # but it's not bottleneck at the moment
    close = np.allclose(lhs, rhs, rtol=rtol, atol=atol, equal_nan=True)
    return close

shrink_float

shrink_float(s: Series) -> Series

Take a Float64 series and make it as small as possible.

Source code in lib/repack/owid/repack/__init__.py
def shrink_float(s: pd.Series) -> pd.Series:
    """
    Take a Float64 series and make it as small as possible.

    Args:
        s: series with a float64 / Float64 / double[pyarrow] dtype

    Returns:
        The same data as Float32 when that round-trips losslessly
        (per series_eq, which treats NaN == NaN), otherwise Float64.

    Raises:
        ValueError: if no candidate dtype preserves the values
            (unreachable in practice, since Float64 is lossless).
    """
    # Accept plain numpy, pandas nullable, and pyarrow-backed float dtypes.
    assert s.dtype.name.replace("[pyarrow]", "") in ("float64", "Float64", "double"), s.dtype

    # Try the narrowest dtype first; fall back to Float64.
    options = ["Float32", "Float64"]
    for dtype in options:
        v = s.astype(dtype)

        if series_eq(s, v):
            return v

    # Give the caller something to go on instead of a bare ValueError().
    raise ValueError(f"could not shrink float series with dtype {s.dtype}")

shrink_integer

shrink_integer(s: Series) -> Series

Take an Int64 series and make it as small as possible.

Source code in lib/repack/owid/repack/__init__.py
def shrink_integer(s: pd.Series) -> pd.Series:
    """
    Take an Int64 series and make it as small as possible.
    """
    assert s.dtype == "Int64"

    # A column of nothing but NA fits in the narrowest nullable integer.
    if s.isnull().all():
        return s.astype("Int8")

    # Negative values need a signed dtype; otherwise prefer unsigned ones.
    candidates = ["Int32", "Int16", "Int8"] if s.min() < 0 else ["UInt32", "UInt16", "UInt8"]

    # Narrow progressively, keeping the last dtype that preserves the values.
    best = s
    for dtype in candidates:
        narrowed = best.astype(dtype)
        if not (narrowed == best).all():
            break

        best = narrowed

    return best

to_safe_types

to_safe_types(t: DataFrame) -> DataFrame

Convert numeric columns to Float64 and Int64 and categorical columns to string[pyarrow].

Source code in lib/repack/owid/repack/__init__.py
def to_safe_types(t: pd.DataFrame) -> pd.DataFrame:
    """Convert numeric columns to Float64 and Int64 and categorical
    columns to string[pyarrow]."""
    # Map every column through _safe_dtype in a single astype call.
    safe_columns = {name: _safe_dtype(t[name].dtype) for name in t.columns}
    t = t.astype(safe_columns)

    # The index gets the same treatment; MultiIndex levels are converted one by one.
    if isinstance(t.index, pd.MultiIndex):
        safe_levels = [level.astype(_safe_dtype(level.dtype)) for level in t.index.levels]
        t.index = t.index.set_levels(safe_levels)
    else:
        t.index = t.index.astype(_safe_dtype(t.index.dtype))

    return t