Examples¶
All examples use the same synthetic exam scores dataset. Run this setup block first:
import numpy as np
import pandas as pd
from interlace import fit
rng = np.random.default_rng(42)
n = 300
df = pd.DataFrame({
"score": 60 + rng.normal(0, 8, n),
"hours_studied": rng.uniform(0, 10, n),
"prior_gpa": rng.uniform(2.0, 4.0, n),
"student_id": [f"s{i}" for i in rng.integers(1, 31, n)],
"school_id": [f"sch{i}" for i in rng.integers(1, 11, n)],
})
result = fit(
formula="score ~ hours_studied + prior_gpa",
data=df,
groups=["student_id", "school_id"],
)
interlace emits UserWarning: boundary (singular) fit: see help(interlace.isSingular) here. That is expected: the synthetic scores are generated independently of the predictors and grouping factors, so the group variance components are estimated at (numerically) zero.
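The fitted result exposes one estimated variance component per grouping factor:
# Both components are ~0 for this singular fit
print(result.variance_components)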
Single grouping factor¶
When you only have one source of clustering, pass a single column name:
result_single = fit(
formula="score ~ hours_studied + prior_gpa",
data=df,
groups="student_id",
)
print(result_single.variance_components)
{'student_id': 5.56870629733604e-15}
The singular-fit warning is emitted again: with no real student-level structure in the data, the student_id variance component is numerically zero.
Residuals¶
hlm_resid returns a DataFrame with .resid and .fitted columns alongside the
original data. Use type="marginal" to ignore random effects, or
type="conditional" to subtract predicted BLUPs.
from interlace import hlm_resid
# Marginal residuals: y - Xβ
marginal = hlm_resid(result, type="marginal")
print(marginal[[".resid", ".fitted"]].describe())
# Conditional residuals: y - Xβ - Zû
conditional = hlm_resid(result, type="conditional")
# Standardised
std_resid = hlm_resid(result, type="conditional", standardized=True)
# Group-level random effects
school_re = hlm_resid(result, level="school_id")
print(school_re.head())
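Since conditional residuals additionally subtract Zû, the elementwise difference between the marginal and conditional residuals recovers the predicted random-effect contribution. A quick sanity check (a sketch that assumes the .resid column name described above):
# Zû is the gap between the two residual types
zu_hat = marginal[".resid"] - conditional[".resid"]
print(zu_hat.abs().max())  # ~0 here: the singular fit shrinks all BLUPs to zero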
Leverage¶
The hat-matrix diagonal is decomposed into fixed-effect and random-effect components following Demidenko & Stukel (2005) and Nobre & Singer (2007).
from interlace import leverage
lev = leverage(result)
print(lev.columns)
# High-leverage observations
high_lev = lev[lev["overall"] > 2 * lev["overall"].mean()]
print(f"{len(high_lev)} high-leverage observations")
Influence diagnostics¶
hlm_influence fits the model n times with one observation (or group) deleted,
computing Cook’s D, MDFFITS, COVTRACE, COVRATIO, and RVC for each deletion.
from interlace import hlm_influence
# Observation-level influence
infl = hlm_influence(result, level=1)
print(infl.columns)
# Group-level influence (delete one school at a time)
school_infl = hlm_influence(result, level="school_id")
print(school_infl.head())
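Sorting the group-level frame by a diagnostic shows which clusters drive the fit; the "cooksd" column name matches the diag keys used by the plotting helpers later on:
# Schools ranked by Cook's distance, most influential first
print(school_infl.sort_values("cooksd", ascending=False).head())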
Cook’s distance and MDFFITS¶
from interlace import cooks_distance, mdffits
cd = cooks_distance(result) # np.ndarray, shape (n,)
mdf = mdffits(result) # np.ndarray, shape (n,)
print(f"Max Cook's D: {cd.max():.4f}")
Count and measure influential observations¶
from interlace import n_influential, tau_gap
# Count observations exceeding the 4/n heuristic threshold
n_inf = n_influential(result)
print(f"{n_inf} influential observations (Cook's D > 4/n)")
# Change in random-effects std devs after removing influential observations
gap = tau_gap(result)
print(gap)
Combined augmented DataFrame¶
hlm_augment is a convenience wrapper that returns a single DataFrame containing the
original data, conditional residuals, fitted values, and all influence statistics.
Useful for exploratory analysis or downstream filtering.
from interlace import hlm_augment
aug = hlm_augment(result)
print(aug.columns.tolist())
# Find the most influential observations
aug.nlargest(5, "cooksd")[["student_id", "school_id", "score", "cooksd"]]
# Skip the influence refit loop (faster, residuals only)
aug_fast = hlm_augment(result, include_influence=False)
print(aug_fast.columns.tolist())
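A sketch of the downstream-filtering workflow the wrapper is meant for: drop the rows flagged by the 4/n rule and refit with the same call as in the setup block:
# Refit without the flagged observations
clean = aug[aug["cooksd"] <= 4 / len(df)]
result_clean = fit(
    formula="score ~ hours_studied + prior_gpa",
    data=clean,
    groups=["student_id", "school_id"],
)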
Prediction on new data¶
df_new = pd.DataFrame({
"hours_studied": [3.0, 7.0, 5.0],
"prior_gpa": [2.5, 3.8, 3.1],
"student_id": ["s1", "s2", "s_new"], # s_new is unseen
"school_id": ["sch1", "sch1", "sch_new"],
})
# Conditional prediction (known BLUPs applied, unknown → 0)
y_hat = result.predict(newdata=df_new)
print(y_hat)
# Fixed-effects only (population-level)
y_fe = result.predict(newdata=df_new, include_re=False)
print(y_fe)
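The gap between the two predictions is exactly the random-effect contribution: zero by construction for the unseen s_new/sch_new row, and equal to the estimated BLUPs for the known groups (here also near zero, since the fit above is singular):
# Per-row random-effect contribution; the last entry is 0 by construction
print(y_hat - y_fe)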
Plotting¶
All plots return plotnine.ggplot objects and can be further customised with
standard plotnine layers.
Residual plots¶
from interlace import plot_resid
resid_df = hlm_resid(result, type="conditional")
plot_resid(resid_df, type="resid_vs_fitted")
plot_resid(resid_df, type="qq")
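Because these are ordinary plotnine objects, extra layers attach with +:
from plotnine import labs, theme_minimal
plot_resid(resid_df, type="qq") + labs(title="Conditional residual Q-Q plot") + theme_minimal()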
Influence index plot¶
from interlace import plot_influence
plot_influence(infl, diag="cooksd")
plot_influence(infl, diag="mdffits")
Ranked dotplot with outlier labels¶
dotplot_diag ranks observations by the chosen diagnostic and labels any that
exceed 3 × IQR above Q3.
from interlace import dotplot_diag
dotplot_diag(infl, diag="cooksd", cutoff="internal", name="index")
dotplot_diag(infl, diag="cooksd", cutoff=4 / len(df))
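Like any plotnine figure, the result can be written to disk with plotnine's standard .save() method:
# Save the ranked dotplot to a file
p = dotplot_diag(infl, diag="cooksd", cutoff="internal", name="index")
p.save("cooksd_dotplot.png", width=8, height=5, dpi=150)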