Missing Values by Reason" not in html
+
+ def test_heatmap_mode_styling(self, tbl_pl, specs):
+ html = pb.missing_vals_tbl(tbl_pl, missing=specs, as_heatmap=True).as_raw_html()
+ assert "IBM Plex Mono" in html
+ assert "
Missing Pattern Heatmap" not in html
+
+
+class TestNonApplicableReasons:
+ """Reasons not defined in a column's spec should render as an em dash, not '0 (0%)'."""
+
+ def test_table_mode_em_dash(self, tbl_pl, specs):
+ html = pb.missing_vals_tbl(tbl_pl, missing=specs).as_raw_html()
+ # age has no "below_threshold"; income has no "refused"/"dont_know" -> 3 em dashes
+ assert html.count("—") == 3
+ # age DOES define "dont_know" but observes none -> should still show "0 (0%)"
+ assert "0 (0%)" in html
+
+ def test_heatmap_mode_em_dash(self, tbl_pl, specs):
+ html = pb.missing_vals_tbl(tbl_pl, missing=specs, as_heatmap=True).as_raw_html()
+ assert html.count("—") == 3
+
+ def test_single_spec_no_em_dash(self):
+ # With one spec, every reason in the union applies -> no em dashes
+ tbl = pl.DataFrame({"age": [34, -98, 41, -99]})
+ spec = {"age": pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"})}
+ html = pb.missing_vals_tbl(tbl, missing=spec).as_raw_html()
+ assert "—" not in html
diff --git a/tests/test_validate.py b/tests/test_validate.py
index 64e8718d0..6ff03cdfa 100644
--- a/tests/test_validate.py
+++ b/tests/test_validate.py
@@ -833,6 +833,7 @@ def test_validation_plan_and_interrogation(request, tbl_fixture) -> None:
"values",
"inclusive",
"na_pass",
+ "missing",
"pre",
"segments",
"thresholds",
@@ -915,6 +916,7 @@ def test_validation_plan_and_interrogation(request, tbl_fixture) -> None:
"values",
"inclusive",
"na_pass",
+ "missing",
"pre",
"segments",
"thresholds",
diff --git a/tests/test_yaml_missing_specs.py b/tests/test_yaml_missing_specs.py
new file mode 100644
index 000000000..9937bd939
--- /dev/null
+++ b/tests/test_yaml_missing_specs.py
@@ -0,0 +1,127 @@
+import polars as pl
+import pytest
+
+import pointblank as pb
+from pointblank.yaml import YAMLValidationError, yaml_interrogate, yaml_to_python
+
+
+def _write_csv(tmp_path, df):
+ p = tmp_path / "survey.csv"
+ df.write_csv(p)
+ return str(p)
+
+
+@pytest.fixture
+def survey_csv(tmp_path):
+ df = pl.DataFrame({"age": [34, -98, 41, -99, 29, -98, 55, 38]})
+ return _write_csv(tmp_path, df)
+
+
+def test_named_missing_spec_pct(survey_csv):
+ yaml_str = f"""
+tbl: {survey_csv}
+missing_specs:
+ standard_survey:
+ reasons:
+ -99: not_asked
+ -98: refused
+ -97: dont_know
+ categories:
+ nonresponse: [refused, dont_know]
+steps:
+ - col_pct_missing:
+ columns: age
+ missing: standard_survey
+ max_pct: 0.5
+ - col_pct_missing:
+ columns: age
+ missing: standard_survey
+ reason: refused
+ max_pct: 0.30
+"""
+ result = yaml_interrogate(yaml_str)
+ assert len(result.validation_info) == 2
+ # overall 3/8=0.375 <= 0.5 pass; refused 2/8=0.25 <= 0.30 pass
+ assert result.validation_info[0].all_passed is True
+ assert result.validation_info[1].all_passed is True
+
+
+def test_named_missing_spec_coded(tmp_path):
+ df = pl.DataFrame({"age": [34, -98, 41, None, 29, -99, 55, 38]})
+ csv = _write_csv(tmp_path, df)
+ yaml_str = f"""
+tbl: {csv}
+missing_specs:
+ survey:
+ reasons:
+ -99: not_asked
+ -98: refused
+steps:
+ - col_missing_coded:
+ columns: age
+ missing: survey
+"""
+ result = yaml_interrogate(yaml_str)
+ info = result.validation_info[0]
+ assert info.n_failed == 1 # one raw null
+
+
+def test_inline_missing_spec(survey_csv):
+ yaml_str = f"""
+tbl: {survey_csv}
+steps:
+ - col_pct_missing:
+ columns: age
+ missing:
+ reasons:
+ -99: not_asked
+ -98: refused
+ max_pct: 0.5
+"""
+ result = yaml_interrogate(yaml_str)
+ assert result.validation_info[0].all_passed is True
+
+
+def test_unknown_spec_reference_raises(survey_csv):
+ yaml_str = f"""
+tbl: {survey_csv}
+steps:
+ - col_pct_missing:
+ columns: age
+ missing: nonexistent
+ max_pct: 0.5
+"""
+ with pytest.raises(YAMLValidationError, match="Unknown missing spec"):
+ yaml_interrogate(yaml_str)
+
+
+def test_missing_specs_must_be_dict(survey_csv):
+ yaml_str = f"""
+tbl: {survey_csv}
+missing_specs:
+ - not_a_mapping
+steps:
+ - rows_distinct
+"""
+ with pytest.raises(YAMLValidationError):
+ yaml_interrogate(yaml_str)
+
+
+def test_yaml_to_python_renders_missing_spec(survey_csv):
+ yaml_str = f"""
+tbl: {survey_csv}
+missing_specs:
+ survey:
+ reasons:
+ -99: not_asked
+ -98: refused
+steps:
+ - col_pct_missing:
+ columns: age
+ missing: survey
+ max_pct: 0.5
+"""
+ code = yaml_to_python(yaml_str)
+ assert "pb.MissingSpec(" in code
+ assert "col_pct_missing" in code
+ assert "reasons=" in code
diff --git a/user_guide/01-validation-plan/02-validation-methods.qmd b/user_guide/01-validation-plan/02-validation-methods.qmd
index 2cc85e623..c420961c1 100644
--- a/user_guide/01-validation-plan/02-validation-methods.qmd
+++ b/user_guide/01-validation-plan/02-validation-methods.qmd
@@ -293,6 +293,78 @@ In summary, `na_pass=` works like this:
- `na_pass=True`: missing values pass validation regardless of the condition being tested
- `na_pass=False` (the default): missing values fail validation
+### Structured Missingness with `missing=`
+
+`na_pass=` treats missingness as binary, but real-world data often encodes *why* a value is absent
+using sentinel codes (e.g., `-99` for "not asked", `-98` for "refused"). The
+[`MissingSpec`](`pointblank.MissingSpec`) class captures these codes and their reasons, and most
+validation methods accept a `missing=` argument that uses it.
+
+When you pass `missing=` to a `col_vals_*()` method, declared sentinel values (and, by default,
+`Null` values) are *excluded* from the check, so only the "real" values are validated:
+
+```{python}
+import polars as pl
+
+tbl = pl.DataFrame({"age": [34, -98, 41, -99, 29, 200, 55, None]})
+
+age_missing = pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"})
+
+validation = (
+ pb.Validate(data=tbl)
+ .col_vals_between(columns="age", left=0, right=120, missing=age_missing)
+ .interrogate()
+)
+
+validation
+```
+
+Only the real value `200` is out of range; the sentinel codes and the `Null` are excluded and pass.
+In the report, such steps are marked with a compact `MISSING-AWARE` badge, and a one-line summary of
+the codes appears in the step's notes.
+
+Pointblank also provides dedicated missingness validation methods that use a `MissingSpec`:
+
+- [`Validate.col_pct_missing()`](`Validate.col_pct_missing`): assert the percentage of missing
+ values stays within a limit, optionally filtered by a specific `reason=` or `category=`.
+- [`Validate.col_missing_coded()`](`Validate.col_missing_coded`): assert every absence is expressed
+ as a documented code (no uncoded raw `Null` values).
+- [`Validate.col_missing_only_coded()`](`Validate.col_missing_only_coded`): assert a column contains
+ only documented codes and legitimate values (catching undocumented codes like a stray `-95`),
+ paired with an `allowed=` set or a `min_val=`/`max_val=` range.
+- [`Validate.col_missing_consistent()`](`Validate.col_missing_consistent`): assert related columns
+ share a consistent missingness pattern for a given reason (e.g., a survey skip pattern).
+
+```{python}
+income_missing = pb.MissingSpec(reasons={-99: "not_asked", -1: "below_threshold"})
+
+survey = pl.DataFrame(
+ {
+ "income_source": [1, -99, 2, -99],
+ "income_amount": [50000, -99, 42000, 38000],
+ }
+)
+
+validation = (
+ pb.Validate(data=survey)
+ # No more than 30% of income values may be "not_asked"
+ .col_pct_missing(columns="income_amount", missing=income_missing, reason="not_asked", max_pct=0.30)
+ # If income wasn't asked, both related columns should be coded together
+ .col_missing_consistent(
+ columns=["income_source", "income_amount"], missing=income_missing, when_reason="not_asked"
+ )
+ .interrogate()
+)
+
+validation
+```
+
+`MissingSpec` also offers pre-built factories for common standards (e.g.,
+`pb.MissingSpec.from_cdisc_null_flavors()`, `pb.MissingSpec.from_sas()`,
+`pb.MissingSpec.from_spss()`), and importing metadata from SPSS/Stata/SAS files can auto-generate
+specs via [`MetadataImport.missing_specs()`](`pointblank.MetadataImport`). For a fuller treatment of
+structured-missingness *reporting*, see the *Missing Values Reporting* article.
+
## 2. Row-based Validations
Row-based validations focus on examining properties that span across entire rows rather than
diff --git a/user_guide/03-yaml/02-yaml-reference.qmd b/user_guide/03-yaml/02-yaml-reference.qmd
index cbe3341b2..a13408beb 100644
--- a/user_guide/03-yaml/02-yaml-reference.qmd
+++ b/user_guide/03-yaml/02-yaml-reference.qmd
@@ -40,6 +40,11 @@ actions: # OPTIONAL: Global failure actions
final_actions: # OPTIONAL: Actions triggered after all steps complete
warning: "Post-validation warning"
error: "Post-validation error"
+missing_specs: # OPTIONAL: Named structured-missingness specs
+ standard_survey:
+ reasons:
+ -99: not_asked
+ -98: refused
steps: # REQUIRED: List of validation steps
- validation_method_name
- validation_method_name:
@@ -191,6 +196,62 @@ Template variables available for action strings:
- `{level}`: severity level ('warning'/'error'/'critical')
- `{time}`: timestamp of validation
+### Structured Missingness (`missing_specs`)
+
+The optional top-level `missing_specs` key defines named [`MissingSpec`](`pointblank.MissingSpec`)
+objects that steps can reference. Each named spec maps sentinel values to reason labels, and may
+declare `categories`, `null_is_missing`, and `null_reason`:
+
+```yaml
+missing_specs:
+ standard_survey:
+ reasons:
+ -99: not_asked
+ -98: refused
+ -97: dont_know
+ categories:
+ nonresponse: [refused, dont_know]
+ null_is_missing: true # OPTIONAL (default true)
+ null_reason: unknown # OPTIONAL (default "unknown")
+```
+
+Steps reference a named spec by name through the `missing:` parameter. This works both on the
+`col_vals_*` methods (to exclude sentinel values from a check) and on the dedicated missingness
+methods (`col_pct_missing`, `col_missing_coded`, `col_missing_only_coded`, `col_missing_consistent`):
+
+```yaml
+missing_specs:
+ standard_survey:
+ reasons:
+ -99: not_asked
+ -98: refused
+
+steps:
+ - col_vals_between:
+ columns: age
+ left: 0
+ right: 120
+ missing: standard_survey # excludes -99/-98 (and nulls) from the range check
+ - col_pct_missing:
+ columns: age
+ missing: standard_survey
+ reason: refused
+ max_pct: 0.30
+```
+
+A step can also define a spec inline (an anonymous mapping) instead of referencing a named one:
+
+```yaml
+steps:
+ - col_pct_missing:
+ columns: age
+ max_pct: 0.5
+ missing:
+ reasons:
+ -99: not_asked
+ -98: refused
+```
+
## Validation Methods Reference
### Column Value Validations
diff --git a/user_guide/05-data-inspection/03-missing-vals-tbl.qmd b/user_guide/05-data-inspection/03-missing-vals-tbl.qmd
index 951392099..f47c264ff 100644
--- a/user_guide/05-data-inspection/03-missing-vals-tbl.qmd
+++ b/user_guide/05-data-inspection/03-missing-vals-tbl.qmd
@@ -81,3 +81,90 @@ pb.missing_vals_tbl(game_revenue)
We see nothing but light blue in this report! The header also indicates that there are no missing
values by displaying a large green check mark (the other report tables provided a count of total
missing values across all columns).
+
+## Structured Missingness by Reason
+
+So far we've treated missingness as binary: a value is either `Null` or it isn't. But real-world
+data often encodes *why* a value is absent. Survey data distinguishes *refused* from *not asked*
+from *don't know*; clinical and statistical-package data use sentinel codes like `-99`, `".A"`, or
+`"NOT DONE"`. Pointblank captures this with the [`MissingSpec`](`pointblank.MissingSpec`) class,
+which maps sentinel values to human-readable *reasons*.
+
+When you pass a `missing=` mapping of column names to `MissingSpec` objects, `missing_vals_tbl()`
+switches from the sector heatmap to a *structured breakdown*: one row per column with the count and
+percentage of complete values and of each missing reason.
+
+::: {.callout-note}
+## Supplying `missing=` produces a different report
+
+The structured breakdown is a *distinct visualization*, not an annotated version of the default
+sector heatmap. Adding `missing=` changes the table's whole layout. The report title changes too
+(from "Missing Values" to "Missing Values by Reason", or "Missing Pattern Heatmap" with
+`as_heatmap=True`), and the shared title styling and monospaced column list keep the two views
+recognizably part of the same family.
+:::
+
+```{python}
+import polars as pl
+
+survey = pl.DataFrame(
+ {
+ "age": [34, -98, 41, -99, 29, -98, 55, None],
+ "income": [50000, -99, -1, None, 42000, -99, 38000, 61000],
+ }
+)
+
+specs = {
+ "age": pb.MissingSpec(reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}),
+ "income": pb.MissingSpec(reasons={-99: "not_asked", -1: "below_threshold"}),
+}
+
+pb.missing_vals_tbl(survey, missing=specs)
+```
+
+Each `MissingSpec` declares the sentinel values for a column and the reason each one represents.
+Those declared (coded) reasons are grouped under the **Missing Reasons** spanner. By default, actual
+`Null` values are also counted as missing; because those are raw `Null`/`None`/`NA` values and *not*
+part of the spec, they're tallied in a fixed **Null** column at the far right (styled like
+**Complete**), rather than as a reason. Set `null_is_missing=False` on the spec if raw nulls should
+be treated as real values instead — then there's no **Null** column at all.
+
+The reason columns are the *union* of reasons across all the specs you provide. When a reason isn't
+defined for a particular column, that cell shows an em dash (`—`) rather than `0 (0%)`. This
+signals "not applicable to this column", as distinct from a reason that *is* defined but simply
+wasn't observed (which shows `0 (0%)`).
+
+### Viewing the pattern as a heatmap
+
+For a more visual read of *where* missingness concentrates, pass `as_heatmap=True`. The reason
+columns are then shaded from light to dark by the proportion missing:
+
+```{python}
+pb.missing_vals_tbl(survey, missing=specs, as_heatmap=True)
+```
+
+### Pre-built specs for common standards
+
+You don't always have to define reasons by hand. `MissingSpec` provides factory methods for common
+encodings, including CDISC/HL7 null flavors and SAS special missing values:
+
+```{python}
+cdisc = pb.MissingSpec.from_cdisc_null_flavors()
+print("NASK ->", cdisc.reason_for("NASK")) # not_asked
+print("UNK ->", cdisc.reason_for("UNK")) # unknown
+```
+
+When metadata is imported from SPSS, Stata, or SAS files (see the *Metadata Import* section),
+[`MetadataImport.missing_specs()`](`pointblank.MetadataImport`) auto-generates a
+`{column: MissingSpec}` mapping from the variables' declared missing values, ready to pass
+straight to `missing_vals_tbl()`.
+
+::: {.callout-note}
+The same `MissingSpec` objects power missingness-aware *validation*, not just reporting. You can
+pass `missing=` to the `col_vals_*()` methods (to exclude sentinel values from a check) and use the
+dedicated [`col_pct_missing()`](`pointblank.Validate.col_pct_missing`),
+[`col_missing_coded()`](`pointblank.Validate.col_missing_coded`),
+[`col_missing_only_coded()`](`pointblank.Validate.col_missing_only_coded`), and
+[`col_missing_consistent()`](`pointblank.Validate.col_missing_consistent`) validation steps. See the
+*Validation Methods* article for details.
+:::
diff --git a/user_guide/11-metadata-import/02-statistical-packages.qmd b/user_guide/11-metadata-import/02-statistical-packages.qmd
index 25d85e1eb..9b40338b2 100644
--- a/user_guide/11-metadata-import/02-statistical-packages.qmd
+++ b/user_guide/11-metadata-import/02-statistical-packages.qmd
@@ -130,6 +130,36 @@ them appropriately. When validation is generated, these codes are documented in
rather than generating explicit exclusion rules, since the correct handling depends on your
analysis context.
+#### Turning missing codes into `MissingSpec` objects
+
+To put these codes to work in validation and reporting, convert them into
+[`MissingSpec`](`pointblank.MissingSpec`) objects. The
+[`MetadataImport.missing_specs()`](`pointblank.MetadataImport`) method does this for every variable
+that declares missing values, returning a `{column: MissingSpec}` mapping (the reason labels are
+derived from the variables' value labels):
+
+```python
+meta = pb.import_metadata("survey.sav")
+
+# Auto-generate a {column: MissingSpec} mapping from the declared missing values
+specs = meta.missing_specs()
+
+# Use the specs in a structured missingness report...
+pb.missing_vals_tbl(data, missing=specs)
+
+# ...or in missingness-aware validation
+validation = (
+ pb.Validate(data=data)
+ .col_vals_between(columns="age", left=0, right=120, missing=specs["age"])
+ .interrogate()
+)
+```
+
+You can also build a spec for a single variable with
+[`VariableMetadata.to_missing_spec()`](`pointblank.VariableMetadata`), or construct one directly
+from SPSS-style values via `pb.MissingSpec.from_spss(missing_values=[...], labels={...})`. See the
+*Missing Values Reporting* and *Validation Methods* articles for what you can do with these specs.
+
### Type Detection from Formats
SPSS stores numeric variables with format strings that indicate how they should be displayed. These
diff --git a/user_guide/11-metadata-import/03-cdisc-validation.qmd b/user_guide/11-metadata-import/03-cdisc-validation.qmd
index eb1a32e21..e8d8d4308 100644
--- a/user_guide/11-metadata-import/03-cdisc-validation.qmd
+++ b/user_guide/11-metadata-import/03-cdisc-validation.qmd
@@ -531,6 +531,27 @@ This layered approach gives you the flexibility to apply different levels of val
on your needs. The Define-XML checks enforce what was specifically documented for your study,
while the SDTM template checks enforce the broader standard requirements that apply universally.
+## Null Flavors and Structured Missingness
+
+Clinical data uses standardized HL7/CDISC *null flavors* to record *why* a value is absent (e.g.,
+`"NASK"` = not asked, `"UNK"` = unknown, `"NA"` = not applicable). Pointblank ships a pre-built
+[`MissingSpec`](`pointblank.MissingSpec`) for these codes via
+`MissingSpec.from_cdisc_null_flavors()`:
+
+```{python}
+cdisc = pb.MissingSpec.from_cdisc_null_flavors()
+
+print("NASK ->", cdisc.reason_for("NASK")) # not_asked
+print("UNK ->", cdisc.reason_for("UNK")) # unknown
+print("boundary codes:", cdisc.values_for_category("boundary"))
+```
+
+This spec can be passed to `missing_vals_tbl()` for a reason-by-reason breakdown, or to the
+`col_vals_*()` and dedicated missingness validation methods (`col_pct_missing()`,
+`col_missing_coded()`, `col_missing_only_coded()`, `col_missing_consistent()`) to validate data
+while accounting for the null flavor codes. See the *Missing Values Reporting* and *Validation
+Methods* articles for the full set of capabilities.
+
## Conclusion
CDISC data validation with Pointblank covers the full spectrum of clinical trial data management: