diff --git a/CHANGELOG.md b/CHANGELOG.md index 283a416..90f9082 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,29 @@ # Changelog + + +## Unreleased + +### Added + +- `tidy_summary()` gains an `interpolation=` parameter controlling how `Q1`/`Q3` + are computed. The default is unchanged from 0.1.0 (`"nearest"`); pass + `interpolation="linear"` for R's `quantile()` type 7 — also NumPy's default and + the quartiles drawn by Plotly/ggplot2 boxplots. **Non-breaking** (default + preserved). +- `chisq_test()` gains a `correct=` parameter for Yates' continuity correction on + the test of independence. The default is unchanged from 0.1.0 (`correct=False`, + the uncorrected Pearson statistic, matching the simulation-based + `calculate(stat="Chisq")`); pass `correct=True` to match R's + `chisq.test`/`prop_test`. As in R, the correction only affects 2x2 tables. + **Non-breaking** (default preserved). + ## 0.1.0 (2026-06-20) Initial release of the Python companion to **ModernDive: Statistical Inference diff --git a/RELEASING.md b/RELEASING.md index c556294..a02c005 100644 --- a/RELEASING.md +++ b/RELEASING.md @@ -46,6 +46,10 @@ unzip -l dist/*.whl | grep -c parquet # sanity: bundled datasets are present ## Cutting a release 1. **Pick the version** (PyPI versions are immutable — you can't re-upload one). + Any change that can alter existing users' results is **breaking**: it needs a + dedicated `### ⚠️ Breaking changes` section in `CHANGELOG.md` (what changed, + how to restore the old behavior, why) and a **minor/major** bump — never a + patch. Prefer adding an opt-in parameter with the old default to avoid breaking. 2. **Bump `version`** in `pyproject.toml`. 3. **Update `CHANGELOG.md`**: rename the `## Unreleased` section to `## X.Y.Z (YYYY-MM-DD)` and start a fresh empty `## Unreleased` above it. 
diff --git a/moderndive/infer/wrappers.py b/moderndive/infer/wrappers.py index a1de893..6d60e63 100644 --- a/moderndive/infer/wrappers.py +++ b/moderndive/infer/wrappers.py @@ -181,6 +181,7 @@ def chisq_test( response: str | None = None, explanatory: str | None = None, p: dict | None = None, + correct: bool = False, ) -> pl.DataFrame: """Tidy chi-squared test. @@ -188,6 +189,13 @@ def chisq_test( response and a ``p={level: probability, ...}`` mapping, it is a **goodness-of-fit** test against those hypothesized proportions. Returns ``statistic``, ``chisq_df``, ``p_value``. + + ``correct`` applies Yates' continuity correction to the test of independence. + It defaults to ``False`` — the uncorrected Pearson statistic, matching + moderndive 0.1.0 and the simulation-based ``calculate(stat="Chisq")``. Pass + ``correct=True`` to match R's ``chisq.test``/``prop_test`` default; like R, the + correction only affects 2x2 tables (one degree of freedom) and never the + goodness-of-fit case. """ from scipy import stats @@ -210,7 +218,7 @@ def chisq_test( ) sub = data.select(resp, expl).drop_nulls() table = sub.to_pandas().pivot_table(index=resp, columns=expl, aggfunc="size", fill_value=0) - chi2, pval, dof, _ = stats.chi2_contingency(table.to_numpy(), correction=False) + chi2, pval, dof, _ = stats.chi2_contingency(table.to_numpy(), correction=correct) return pl.DataFrame( {"statistic": [float(chi2)], "chisq_df": [int(dof)], "p_value": [float(pval)]} ) diff --git a/moderndive/modeling.py b/moderndive/modeling.py index c4834f5..72c3d75 100644 --- a/moderndive/modeling.py +++ b/moderndive/modeling.py @@ -362,13 +362,24 @@ def get_regression_summaries(model, digits: int = 3) -> pl.DataFrame: return table.with_columns(pl.col(float_cols).round(digits)) -def tidy_summary(data, columns: list[str] | None = None, digits: int = 3) -> pl.DataFrame: +def tidy_summary( + data, + columns: list[str] | None = None, + digits: int = 3, + interpolation: str = "nearest", +) -> pl.DataFrame: 
"""Per-variable summary statistics for the selected columns. Mirrors the R ``moderndive::tidy_summary`` column layout: ``column, n, group, type, min, Q1, mean, median, Q3, max, sd``. Numeric columns get the five-number summary + mean/sd; non-numeric columns report ``n`` and ``type`` with the numeric fields left null. + + ``interpolation`` selects how ``Q1``/``Q3`` are computed when a quartile falls + between two observations. The default ``"nearest"`` matches moderndive 0.1.0 + (polars' default). Pass ``interpolation="linear"`` for R's ``quantile()`` type + 7 — also NumPy's default and the quartiles drawn by Plotly/ggplot2 boxplots — + or any other polars quantile method. """ df = data if isinstance(data, pl.DataFrame) else pl.from_pandas(data) columns = columns or df.columns @@ -395,10 +406,10 @@ def tidy_summary(data, columns: list[str] | None = None, digits: int = 3) -> pl. s = series.drop_nulls() row.update( min=round(float(s.min()), digits), - Q1=round(float(s.quantile(0.25)), digits), + Q1=round(float(s.quantile(0.25, interpolation=interpolation)), digits), mean=round(float(s.mean()), digits), median=round(float(s.median()), digits), - Q3=round(float(s.quantile(0.75)), digits), + Q3=round(float(s.quantile(0.75, interpolation=interpolation)), digits), max=round(float(s.max()), digits), sd=round(float(s.std()), digits), ) diff --git a/tests/test_infer_parity.py b/tests/test_infer_parity.py index 5c75e64..1cd32c2 100644 --- a/tests/test_infer_parity.py +++ b/tests/test_infer_parity.py @@ -127,9 +127,15 @@ def test_t_test_one_sample_tidy_columns(): def test_chisq_test_df_and_stat(): + # Default is the uncorrected Pearson statistic (matches moderndive 0.1.0 and + # the simulation-based calculate(stat="Chisq")) — strictly positive here. 
out = chisq_test(_yawn(), formula="yawn ~ group") assert out["chisq_df"][0] == 1 assert out["statistic"][0] > 0 + # Opt into Yates' continuity correction (R's chisq.test default); on this weak + # 2x2 association the corrected statistic is smaller. + corrected = chisq_test(_yawn(), formula="yawn ~ group", correct=True) + assert corrected["statistic"][0] < out["statistic"][0] # --- bias-corrected CI ----------------------------------------------------