# Convert the DataFrame's columns to the most compact types possible.
# Rename columns if necessary during the repacking. The column renames
# work even if the column is part of the index.
def repack_frame(
    df: pd.DataFrame,
    remap: Optional[Dict[str, str]] = None,
    dtypes: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    """
    Convert the DataFrame's columns to the most compact types possible.
    Rename columns if necessary during the repacking. The column renames
    work even if the column is part of the index.

    Args:
        remap: remap column names
        dtypes: dictionary of fixed dtypes to use

    Raises:
        ValueError: if the frame has a named index, or if a column is still
            object-dtyped after repacking.
    """
    # NOTE(review): this guard rejects any frame with a named index, which makes
    # the primary-key unwinding below dead code for such frames — confirm intent.
    if df.index.names != [None]:
        raise ValueError("repacking is lost for index columns")

    remap = remap or {}
    # Fixed: the default was a mutable `{}` (shared across calls); use None and
    # normalize here instead.
    dtypes = dtypes or {}

    # unwind the primary key
    if len(df.index.names) == 1 and not df.index.names[0]:
        primary_key = []
    else:
        primary_key = cast(List[str], df.index.names)

    # NOTE(review): inplace reset mutates the caller's frame and, for a default
    # RangeIndex, inserts an "index" column — verify callers expect this.
    df.reset_index(inplace=True)

    # repack each column into the best dtype we can give it; columns with a
    # fixed dtype requested are passed through untouched
    df = pd.concat(
        [
            repack_series(df.loc[:, col]) if col not in dtypes else df[col]
            for col in df.columns
        ],
        axis=1,
    )

    # use given dtypes
    if dtypes:
        df = df.astype(dtypes)

    # remap all column names, including those in the primary key
    for from_, to_ in remap.items():
        if from_ in df.columns:
            df.rename(columns={from_: to_}, inplace=True)
    primary_key = [remap.get(k, k) for k in primary_key]

    # object columns defeat the point of repacking; fail loudly
    for col in df.columns:
        if df[col].dtype == "object":
            raise ValueError(f"Column {col} is still object. Consider converting it to str.")

    # set the primary key back again
    if primary_key:
        df.set_index(primary_key, inplace=True)

    return df
# Check that series are equal, but unlike normal floating point checks where
# NaN != NaN, we want missing or null values to be reported as equal to each
# other.
def series_eq(lhs: pd.Series, rhs: pd.Series, rtol: float = 1e-5, atol: float = 1e-8) -> bool:
    """
    Check that series are equal, but unlike normal floating point checks where
    NaN != NaN, we want missing or null values to be reported as equal to each
    other.

    Args:
        lhs: left-hand series to compare
        rhs: right-hand series to compare
        rtol: relative tolerance passed to np.allclose
        atol: absolute tolerance passed to np.allclose
    """
    # NOTE: this could be speeded up with numpy methods or smarter comparison,
    # but it's not bottleneck at the moment
    if len(lhs) != len(rhs):
        return False

    # Fixed: cast to plain float64 first so pandas nullable values (pd.NA in
    # Float64/Int64 series) become np.nan; np.allclose raises a TypeError on
    # the object arrays that nullable series otherwise convert to.
    lhs_values = lhs.astype("float64").to_numpy()
    rhs_values = rhs.astype("float64").to_numpy()
    return np.allclose(lhs_values, rhs_values, rtol=rtol, atol=atol, equal_nan=True)
def shrink_float(s: pd.Series) -> pd.Series:
    """
    Take a Float64 series and make it as small as possible.

    Returns the narrowest nullable float dtype that still compares equal to
    the input (within series_eq tolerances).
    """
    # accept float64 / Float64 / double, with or without a pyarrow suffix
    base_name = s.dtype.name.replace("[pyarrow]", "")
    assert base_name in ("float64", "Float64", "double"), s.dtype

    # try the narrower dtype first, falling back to the full-width one
    for candidate in ("Float32", "Float64"):
        shrunk = s.astype(candidate)
        if series_eq(s, shrunk):
            return shrunk

    # unreachable in practice: Float64 always round-trips to itself
    raise ValueError()
def shrink_integer(s: pd.Series) -> pd.Series:
    """
    Take an Int64 series and make it as small as possible.

    All-null series shrink to Int8; otherwise the series is progressively
    narrowed (signed if it contains negatives, unsigned otherwise) until a
    narrower dtype would lose information.
    """
    assert s.dtype == "Int64"

    if s.isnull().all():
        # shrink all NaNs to Int8
        return s.astype("Int8")

    if s.min() < 0:
        candidates = ["Int32", "Int16", "Int8"]
    else:
        candidates = ["UInt32", "UInt16", "UInt8"]

    for dtype in candidates:
        try:
            v = s.astype(dtype)
        except (TypeError, OverflowError, ValueError):
            # Fixed: modern pandas raises on a lossy nullable-int cast instead
            # of silently wrapping, so the equality check below never ran and
            # the function crashed; treat a failed cast as "cannot shrink".
            break
        if not (v == s).all():
            # values wrapped around (older pandas): keep the last safe dtype
            break
        s = v

    return s
def to_safe_types(t: pd.DataFrame) -> pd.DataFrame:
    """Convert numeric columns to Float64 and Int64 and categorical columns to
    string[pyarrow]."""
    # map every column through _safe_dtype in one astype call
    column_dtypes = {col: _safe_dtype(t[col].dtype) for col in t.columns}
    t = t.astype(column_dtypes)

    # the index needs the same treatment; MultiIndex levels are converted one
    # by one, a flat index directly
    if isinstance(t.index, pd.MultiIndex):
        safe_levels = [
            level.astype(_safe_dtype(level.dtype)) for level in t.index.levels
        ]
        t.index = t.index.set_levels(safe_levels)
    else:
        t.index = t.index.astype(_safe_dtype(t.index.dtype))

    return t