1. Downcast dtypes (Biggest Win)
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to the smallest fitting dtype.

    Signed-integer, unsigned-integer and float columns are each passed to
    ``pd.to_numeric`` with the matching ``downcast`` mode, which picks the
    narrowest dtype able to represent every value (floats go no lower than
    float32). Columns of any other kind (object/string, bool, datetime,
    timedelta, categorical, ...) are left untouched.

    Args:
        df: The DataFrame to shrink. It is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    # NumPy dtype "kind" code -> ``pd.to_numeric`` downcast mode.
    downcast_by_kind = {'i': 'integer', 'u': 'unsigned', 'f': 'float'}
    for col in df.columns:
        downcast = downcast_by_kind.get(df[col].dtype.kind)
        if downcast is None:
            # Not a plain numeric column: converting it would change its
            # meaning (e.g. datetimes would become integer nanoseconds).
            continue
        df[col] = pd.to_numeric(df[col], downcast=downcast)
    return df
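Typically saves 50-70% of memory on DataFrames dominated by numeric columns; the exact saving depends on the value ranges, and downcasting floats to float32 reduces precision.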
2. Use Categoricals for Low-Cardinality Strings
# A categorical column stores each distinct value once plus a small integer
# code per row. The saving depends on cardinality; ~10x is a rough figure for
# columns with few unique values relative to their length.
df['city'] = df['city'].astype('category')
3. Vectorize Instead of Apply
# SLOW: apply(axis=1) calls the Python lambda once per row.
df['result'] = df.apply(lambda r: r['a'] * r['b'], axis=1)
# FAST: multiplies the two columns element-wise in one vectorized operation.
df['result'] = df['a'] * df['b']