Skip to content

Repack Library

The owid-repack library handles data packaging, compression, and format optimization.

You can install it via pip:

pip install owid-repack

owid.repack

Functions:

  • repack_frame

    Convert the DataFrame's columns to the most compact types possible.

  • series_eq

    Check that series are equal, but unlike normal floating point checks where NaN != NaN, missing or null values are reported as equal to each other.

  • shrink_float

    Take a Float64 series and make it as small as possible.

  • shrink_integer

    Take an Int64 series and make it as small as possible.

  • to_safe_types

    Convert numeric columns to Float64 and Int64 and categorical columns to string[pyarrow].

repack_frame

repack_frame(
    df: DataFrame,
    remap: Optional[Dict[str, str]] = None,
    dtypes: Optional[Dict[str, Any]] = {},
) -> DataFrame

Convert the DataFrame's columns to the most compact types possible. Rename columns if necessary during the repacking. The column renames work even if the column is part of the index.

Parameters:

Source code in lib/repack/owid/repack/__init__.py
def repack_frame(
    df: pd.DataFrame,
    remap: Optional[Dict[str, str]] = None,
    dtypes: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    """
    Convert the DataFrame's columns to the most compact types possible.
    Rename columns if necessary during the repacking. The column renames
    work even if the column is part of the index.

    Args:
        df: frame to repack; must have a default (unnamed) index
        remap: remap column names
        dtypes: dictionary of fixed dtypes to use; columns listed here are
            kept as given instead of being repacked

    Returns:
        A new DataFrame with compacted column dtypes.

    Raises:
        ValueError: if the frame has a named index, or if any column is
            still of ``object`` dtype after repacking.
    """
    # NOTE: the default for `dtypes` used to be a mutable `{}`, which is a
    # Python anti-pattern (shared across calls); `None` is equivalent here
    # because of the `dtypes or {}` normalisation below.
    if df.index.names != [None]:
        raise ValueError("repacking is lost for index columns")

    remap = remap or {}
    dtypes = dtypes or {}

    # unwind the primary key
    # NOTE: given the guard above, the index is always unnamed here, so
    # `primary_key` ends up empty; the else-branch is kept for safety.
    if len(df.index.names) == 1 and not df.index.names[0]:
        primary_key = []
    else:
        primary_key = cast(List[str], df.index.names)
        df.reset_index(inplace=True)

    # repack each column into the best dtype we can give it; columns with a
    # fixed dtype requested are passed through untouched
    df = pd.concat(
        [repack_series(df.loc[:, col]) if col not in dtypes else df[col] for col in df.columns],
        axis=1,
    )

    # use given dtypes
    if dtypes:
        df = df.astype(dtypes)

    # remap all column names, including those in the primary key
    for from_, to_ in remap.items():
        if from_ in df.columns:
            df.rename(columns={from_: to_}, inplace=True)
    primary_key = [remap.get(k, k) for k in primary_key]

    # object columns defeat the point of repacking; force callers to convert
    for col in df.columns:
        if df[col].dtype == "object":
            raise ValueError(f"Column {col} is still object. Consider converting it to str.")

    # set the primary key back again
    if primary_key:
        df.set_index(primary_key, inplace=True)

    return df

series_eq

series_eq(
    lhs: Series,
    rhs: Series,
    rtol: float = 1e-05,
    atol: float = 1e-08,
) -> bool

Check that series are equal, but unlike normal floating point checks where NaN != NaN, we want missing or null values to be reported as equal to each other.

Source code in lib/repack/owid/repack/__init__.py
def series_eq(lhs: pd.Series, rhs: pd.Series, rtol: float = 1e-5, atol: float = 1e-8) -> bool:
    """
    Check that series are equal, but unlike normal floating point checks where
    NaN != NaN, we want missing or null values to be reported as equal to each
    other.
    """
    # Different lengths can never be equal, and allclose would raise on them.
    if len(lhs) != len(rhs):
        return False

    # equal_nan=True makes NaN compare equal to NaN, which is what we want
    # for missing data.
    # NOTE: this could be speeded up with numpy methods or smarter comparison,
    # but it's not bottleneck at the moment
    close = np.allclose(lhs, rhs, rtol=rtol, atol=atol, equal_nan=True)
    return close

shrink_float

shrink_float(s: Series) -> Series

Take a Float64 series and make it as small as possible.

Source code in lib/repack/owid/repack/__init__.py
def shrink_float(s: pd.Series) -> pd.Series:
    """
    Take a Float64 series and make it as small as possible.

    Args:
        s: series with a float64 / Float64 / double[pyarrow] dtype

    Returns:
        The same data as Float32 when that round-trips losslessly
        (per series_eq, which treats NaN == NaN), otherwise Float64.

    Raises:
        ValueError: if no candidate dtype preserves the values
            (unreachable in practice, since Float64 is lossless).
    """
    # Accept plain numpy, pandas nullable, and pyarrow-backed float dtypes.
    assert s.dtype.name.replace("[pyarrow]", "") in ("float64", "Float64", "double"), s.dtype

    # Try the narrowest dtype first; fall back to Float64.
    options = ["Float32", "Float64"]
    for dtype in options:
        v = s.astype(dtype)

        if series_eq(s, v):
            return v

    # Give the caller something to go on instead of a bare ValueError().
    raise ValueError(f"could not shrink float series with dtype {s.dtype}")

shrink_integer

shrink_integer(s: Series) -> Series

Take an Int64 series and make it as small as possible.

Source code in lib/repack/owid/repack/__init__.py
def shrink_integer(s: pd.Series) -> pd.Series:
    """
    Take an Int64 series and make it as small as possible.
    """
    assert s.dtype == "Int64"

    # A column of nothing but NA fits in the narrowest nullable integer.
    if s.isnull().all():
        return s.astype("Int8")

    # Negative values need a signed dtype; otherwise prefer unsigned ones.
    candidates = ["Int32", "Int16", "Int8"] if s.min() < 0 else ["UInt32", "UInt16", "UInt8"]

    # Narrow progressively, keeping the last dtype that preserves the values.
    best = s
    for dtype in candidates:
        narrowed = best.astype(dtype)
        if not (narrowed == best).all():
            break

        best = narrowed

    return best

to_safe_types

to_safe_types(t: DataFrame) -> DataFrame

Convert numeric columns to Float64 and Int64 and categorical columns to string[pyarrow].

Source code in lib/repack/owid/repack/__init__.py
def to_safe_types(t: pd.DataFrame) -> pd.DataFrame:
    """Convert numeric columns to Float64 and Int64 and categorical
    columns to string[pyarrow]."""
    # Map every column through _safe_dtype in a single astype call.
    safe_columns = {name: _safe_dtype(t[name].dtype) for name in t.columns}
    t = t.astype(safe_columns)

    # The index gets the same treatment; MultiIndex levels are converted one by one.
    if isinstance(t.index, pd.MultiIndex):
        safe_levels = [level.astype(_safe_dtype(level.dtype)) for level in t.index.levels]
        t.index = t.index.set_levels(safe_levels)
    else:
        t.index = t.index.astype(_safe_dtype(t.index.dtype))

    return t