Source code for interlace.ols

"""OLS fitting with formulaic design matrices and HC3 robust standard errors.

Provides a statsmodels-compatible interface without depending on statsmodels
or pandas for the fitting step. Any narwhals-compatible frame (pandas, polars,
…) is accepted as input data.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any

import formulaic
import narwhals as nw
import numpy as np
import pandas as pd


@dataclass
class _OLSDataWrapper:
    """Mirrors result.model.data.frame (statsmodels compat)."""

    frame: Any  # native type as passed by the caller (pandas, polars, etc.)


@dataclass
class _OLSModelInfo:
    """Container for OLS model matrices and metadata."""

    exog: np.ndarray
    endog: np.ndarray
    exog_names: list[str]
    formula: str
    data: _OLSDataWrapper
    _rhs_model_spec: Any  # formulaic ModelSpec for predict()


class OLSResult:
    """Result of an OLS fit via formulaic + numpy.

    Attributes match the statsmodels OLS interface so that this is a
    drop-in replacement.

    Attributes
    ----------
    params : pd.Series
        Estimated regression coefficients, indexed by term name.
    resid : np.ndarray
        Residuals ``y - X @ params``, shape ``(n,)``.
    fittedvalues : np.ndarray
        In-sample fitted values ``X @ params``, shape ``(n,)``.
    normalized_cov_params : np.ndarray
        ``(X'X)^{-1}``, shape ``(p, p)``.  Multiply by ``scale`` to obtain
        the OLS covariance matrix.
    model : _OLSModelInfo
        Container for the design matrix, response vector, term names,
        formula string, and original data frame.

    Examples
    --------
    >>> result = interlace.ols("y ~ x", df)
    >>> result.params
    >>> result.hc3_bse()
    """

    def __init__(
        self,
        params: pd.Series,
        resid: np.ndarray,
        fittedvalues: np.ndarray,
        normalized_cov_params: np.ndarray,
        model: _OLSModelInfo,
    ) -> None:
        self.params = params
        self.resid = resid
        self.fittedvalues = fittedvalues
        self.normalized_cov_params = normalized_cov_params
        self.model = model

    @property
    def df_resid(self) -> float:
        """Residual degrees of freedom: n - p, matching statsmodels OLS df_resid."""
        return float(len(self.resid) - len(self.params))

    @property
    def mse_resid(self) -> float:
        """Mean squared error of residuals: RSS / (n - p), matching statsmodels."""
        return float(np.sum(self.resid**2) / self.df_resid)

    @property
    def scale(self) -> float:
        """Residual variance: RSS / (n - p), matching statsmodels OLS scale."""
        return self.mse_resid

    def hc3_bse(self) -> np.ndarray:
        """HC3 heteroskedasticity-consistent standard errors.

        Computes:
            h_ii = diag(X (X'X)^{-1} X')
            d_i  = e_i^2 / (1 - h_ii)^2
            cov_HC3 = (X'X)^{-1} (X' diag(d) X) (X'X)^{-1}
            se_HC3  = sqrt(diag(cov_HC3))

        Returns
        -------
        np.ndarray, shape (p,)
        """
        X = self.model.exog
        e = self.resid
        XtX_inv = self.normalized_cov_params
        # Hat matrix diagonal: h_ii = (X (X'X)^{-1} X')_ii
        # Computed without forming full hat matrix: h_ii = sum_j (X @ XtX_inv * X)_ij
        h = np.einsum("ij,jk,ik->i", X, XtX_inv, X)
        d = e**2 / (1.0 - h) ** 2
        meat = X.T @ (d[:, None] * X)
        cov_hc3 = XtX_inv @ meat @ XtX_inv
        return np.sqrt(np.diag(cov_hc3))

    def predict(self, data: Any) -> np.ndarray:
        """Predict on new data by re-evaluating the RHS formula.

        Parameters
        ----------
        data:
            DataFrame containing the predictor columns. Any narwhals-compatible
            frame is accepted.

        Returns
        -------
        np.ndarray, shape (n,)
        """
        nw_data = nw.from_native(data, eager_only=True)
        rhs_spec = self.model._rhs_model_spec
        X_new = np.asarray(formulaic.model_matrix(rhs_spec, nw_data), dtype=float)
        return np.asarray(X_new @ self.params.values, dtype=float)


[docs] def ols(formula: str, data: Any) -> OLSResult: """Fit an OLS model using a formulaic formula string. Parameters ---------- formula: Formula string, e.g. ``"y ~ x1 + x2"``. data: DataFrame containing all variables. Any narwhals-compatible frame (pandas, polars, …) is accepted. Returns ------- OLSResult Drop-in replacement for statsmodels OLS results. Examples -------- >>> import interlace >>> result = interlace.ols("y ~ x1 + x2", df) >>> result.params """ nw_data = nw.from_native(data, eager_only=True) matrices = formulaic.model_matrix(formula, nw_data) X = np.asarray(matrices.rhs, dtype=float) y = np.asarray(matrices.lhs, dtype=float).squeeze() term_names: list[str] = list(matrices.rhs.columns) rhs_model_spec = matrices.rhs.model_spec # QR-based least squares beta, _, _, _ = np.linalg.lstsq(X, y, rcond=None) fittedvalues = X @ beta resid = y - fittedvalues normalized_cov_params = np.linalg.inv(X.T @ X) params = pd.Series(beta, index=term_names) model = _OLSModelInfo( exog=X, endog=y, exog_names=term_names, formula=formula, data=_OLSDataWrapper(frame=data), _rhs_model_spec=rhs_model_spec, ) return OLSResult( params=params, resid=resid, fittedvalues=fittedvalues, normalized_cov_params=normalized_cov_params, model=model, )