Examples¶
All examples use the same synthetic exam scores dataset. Run this setup block first:
import numpy as np
import pandas as pd
from interlace import fit
rng = np.random.default_rng(42)
n = 300
df = pd.DataFrame({
"score": 60 + rng.normal(0, 8, n),
"hours_studied": rng.uniform(0, 10, n),
"prior_gpa": rng.uniform(2.0, 4.0, n),
"student_id": [f"s{i}" for i in rng.integers(1, 31, n)],
"school_id": [f"sch{i}" for i in rng.integers(1, 11, n)],
})
result = fit(
formula="score ~ hours_studied + prior_gpa",
data=df,
groups=["student_id", "school_id"],
)
interlace emits UserWarning: boundary (singular) fit: see help(interlace.isSingular) here. That is expected: the synthetic scores are generated independently of the predictors and grouping factors, so the group variance components are estimated at (numerically) zero.
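The fitted result exposes one estimated variance component per grouping factor:
# Both components are ~0 for this singular fit
print(result.variance_components)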
Single grouping factor¶
When you only have one source of clustering, pass a single column name:
result_single = fit(
formula="score ~ hours_studied + prior_gpa",
data=df,
groups="student_id",
)
print(result_single.variance_components)
{'student_id': 5.56870629733604e-15}
The singular-fit warning is emitted again: with no real student-level structure in the data, the student_id variance component is numerically zero.
Residuals¶
hlm_resid returns a DataFrame with .resid and .fitted columns alongside the
original data. Use type="marginal" to ignore random effects, or
type="conditional" to subtract predicted BLUPs.
from interlace import hlm_resid
# Marginal residuals: y - Xβ
marginal = hlm_resid(result, type="marginal")
print(marginal[[".resid", ".fitted"]].describe())
# Conditional residuals: y - Xβ - Zû
conditional = hlm_resid(result, type="conditional")
# Standardised
std_resid = hlm_resid(result, type="conditional", standardized=True)
# Group-level random effects
school_re = hlm_resid(result, level="school_id")
print(school_re.head())
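Since conditional residuals additionally subtract Zû, the elementwise difference between the marginal and conditional residuals recovers the predicted random-effect contribution. A quick sanity check (a sketch that assumes the .resid column name described above):
# Zû is the gap between the two residual types
zu_hat = marginal[".resid"] - conditional[".resid"]
print(zu_hat.abs().max())  # ~0 here: the singular fit shrinks all BLUPs to zero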
Leverage¶
The hat-matrix diagonal is decomposed into fixed-effect and random-effect components following Demidenko & Stukel (2005) and Nobre & Singer (2007).
from interlace import leverage
lev = leverage(result)
print(lev.columns)
# High-leverage observations
high_lev = lev[lev["overall"] > 2 * lev["overall"].mean()]
print(f"{len(high_lev)} high-leverage observations")
Influence diagnostics¶
hlm_influence fits the model n times with one observation (or group) deleted,
computing Cook’s D, MDFFITS, COVTRACE, COVRATIO, and RVC for each deletion.
from interlace import hlm_influence
# Observation-level influence
infl = hlm_influence(result, level=1)
print(infl.columns)
# Group-level influence (delete one school at a time)
school_infl = hlm_influence(result, level="school_id")
print(school_infl.head())
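Sorting the group-level frame by a diagnostic shows which clusters drive the fit; the "cooksd" column name matches the diag keys used by the plotting helpers later on:
# Schools ranked by Cook's distance, most influential first
print(school_infl.sort_values("cooksd", ascending=False).head())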
Cook’s distance and MDFFITS¶
from interlace import cooks_distance, mdffits
cd = cooks_distance(result) # np.ndarray, shape (n,)
mdf = mdffits(result) # np.ndarray, shape (n,)
print(f"Max Cook's D: {cd.max():.4f}")
Count and measure influential observations¶
from interlace import n_influential, tau_gap
# Count observations exceeding the 4/n heuristic threshold
n_inf = n_influential(result)
print(f"{n_inf} influential observations (Cook's D > 4/n)")
# Change in random-effects std devs after removing influential observations
gap = tau_gap(result)
print(gap)
Combined augmented DataFrame¶
hlm_augment is a convenience wrapper that returns a single DataFrame containing the
original data, conditional residuals, fitted values, and all influence statistics.
Useful for exploratory analysis or downstream filtering.
from interlace import hlm_augment
aug = hlm_augment(result)
print(aug.columns.tolist())
# Find the most influential observations
aug.nlargest(5, "cooksd")[["student_id", "school_id", "score", "cooksd"]]
# Skip the influence refit loop (faster, residuals only)
aug_fast = hlm_augment(result, include_influence=False)
print(aug_fast.columns.tolist())
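A sketch of the downstream-filtering workflow the wrapper is meant for: drop the rows flagged by the 4/n rule and refit with the same call as in the setup block:
# Refit without the flagged observations
clean = aug[aug["cooksd"] <= 4 / len(df)]
result_clean = fit(
    formula="score ~ hours_studied + prior_gpa",
    data=clean,
    groups=["student_id", "school_id"],
)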
Prediction on new data¶
df_new = pd.DataFrame({
"hours_studied": [3.0, 7.0, 5.0],
"prior_gpa": [2.5, 3.8, 3.1],
"student_id": ["s1", "s2", "s_new"], # s_new is unseen
"school_id": ["sch1", "sch1", "sch_new"],
})
# Conditional prediction (known BLUPs applied, unknown → 0)
y_hat = result.predict(newdata=df_new)
print(y_hat)
# Fixed-effects only (population-level)
y_fe = result.predict(newdata=df_new, include_re=False)
print(y_fe)
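The gap between the two predictions is exactly the random-effect contribution: zero by construction for the unseen s_new/sch_new row, and equal to the estimated BLUPs for the known groups (here also near zero, since the fit above is singular):
# Per-row random-effect contribution; the last entry is 0 by construction
print(y_hat - y_fe)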
Plotting¶
All plots return plotnine.ggplot objects and can be further customised with
standard plotnine layers.
Residual plots¶
from interlace import plot_resid
resid_df = hlm_resid(result, type="conditional")
plot_resid(resid_df, type="resid_vs_fitted")
plot_resid(resid_df, type="qq")
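Because these are ordinary plotnine objects, extra layers attach with +:
from plotnine import labs, theme_minimal
plot_resid(resid_df, type="qq") + labs(title="Conditional residual Q-Q plot") + theme_minimal()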
Influence index plot¶
from interlace import plot_influence
plot_influence(infl, diag="cooksd")
plot_influence(infl, diag="mdffits")
Ranked dotplot with outlier labels¶
dotplot_diag ranks observations by the chosen diagnostic and labels any that
exceed 3 × IQR above Q3.
from interlace import dotplot_diag
dotplot_diag(infl, diag="cooksd", cutoff="internal", name="index")
dotplot_diag(infl, diag="cooksd", cutoff=4 / len(df))
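Like any plotnine figure, the result can be written to disk with plotnine's standard .save() method:
# Save the ranked dotplot to a file
p = dotplot_diag(infl, diag="cooksd", cutoff="internal", name="index")
p.save("cooksd_dotplot.png", width=8, height=5, dpi=150)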