From b22247779fba8346731f4005d9663ccf0f1d874e Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 14:58:54 -0400 Subject: [PATCH 01/55] Add col_pct_missing() --- pointblank/validate.py | 172 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) diff --git a/pointblank/validate.py b/pointblank/validate.py index d58e33caa..d0b78c277 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -10404,6 +10404,178 @@ def col_pct_null( return self + def col_pct_missing( + self, + columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, + missing: MissingSpec, + max_pct: float, + reason: str | None = None, + category: str | None = None, + thresholds: int | float | None | bool | tuple | dict | Thresholds = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: + """ + Validate that the percentage of *structured* missing values stays within a limit. + + The `col_pct_missing()` validation method checks whether the percentage of missing values + in a column is at most `max_pct=`. Unlike [`col_pct_null()`](`pointblank.Validate.col_pct_null`), + which only considers actual null values, this method uses a + [`MissingSpec`](`pointblank.MissingSpec`) to define which values count as missing: declared + sentinel values (e.g., `-99` for `"refused"`) and, when `null_is_missing=True`, actual null + values. This validation operates at the column level, generating a single validation step + per column that passes when the missing percentage does not exceed `max_pct=`. + + You can narrow the check to a single reason (via `reason=`) or a category of reasons (via + `category=`), making it possible to assert things like "at most 10% of values were refused" + or "at most 15% are item nonresponse". + + Parameters + ---------- + columns + A single column or a list of columns to validate. 
Can also use + [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If + multiple columns are supplied or resolved, there will be a separate validation step + generated for each column. + missing + A [`MissingSpec`](`pointblank.MissingSpec`) describing the sentinel values (and their + reasons) that encode missingness for this column. + max_pct + The maximum allowable percentage of missing values, expressed as a decimal between + `0.0` and `1.0`. For example, `max_pct=0.20` means at most 20% of values may be missing. + reason + If provided, only count missing values whose reason matches this label. Cannot be + combined with `category=`. + category + If provided, only count missing values whose reason falls in this category (as defined + in `MissingSpec.categories`). Cannot be combined with `reason=`. + thresholds + Set threshold failure levels for reporting and reacting to exceedences of the levels. + The thresholds are set at the step level and will override any global thresholds set in + `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will + be set locally and global thresholds (if any) will take effect. + actions + Optional actions to take when the validation step(s) meets or exceeds any set threshold + levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to + define the actions. + brief + An optional brief description of the validation step that will be displayed in the + reporting table. You can use the templating elements like `"{step}"` to insert + the step number, or `"{auto}"` to include an automatically generated brief. If `True` + the entire brief will be automatically generated. If `None` (the default) then there + won't be a brief. + active + A boolean value or callable that determines whether the validation step should be + active. Using `False` will make the validation step inactive (still reporting its + presence and keeping indexes for the steps unchanged). 
+ + Returns + ------- + Validate + The `Validate` object with the added validation step. + + Examples + -------- + ```{python} + #| echo: false + #| output: false + import pointblank as pb + pb.config(report_incl_header=False, report_incl_footer_timings=False, preview_incl_header=False) + ``` + Survey data often encodes missingness with sentinel values rather than nulls. Here, the + `age` column uses `-99` (`"not_asked"`), `-98` (`"refused"`), and `-97` (`"dont_know"`): + + ```{python} + import pointblank as pb + import polars as pl + + tbl = pl.DataFrame( + {"age": [34, -98, 41, -99, 29, -98, 55, 38]}, + ) + + age_missing = pb.MissingSpec( + reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}, + categories={"item_nonresponse": ["refused", "dont_know"]}, + ) + + validation = ( + pb.Validate(data=tbl) + .col_pct_missing(columns="age", missing=age_missing, max_pct=0.5) + .col_pct_missing(columns="age", missing=age_missing, reason="refused", max_pct=0.30) + .interrogate() + ) + + validation + ``` + """ + assertion_type = _get_fn_name() + + _check_column(column=columns) + _check_thresholds(thresholds=thresholds) + _check_active_input(param=active, param_name="active") + + if not isinstance(missing, MissingSpec): + raise TypeError( + f"`missing=` must be a MissingSpec, got {type(missing).__name__}." 
+ ) + + if reason is not None and category is not None: + raise ValueError("Only one of `reason=` or `category=` can be specified.") + + if not 0.0 <= max_pct <= 1.0: + raise ValueError(f"`max_pct=` must be between 0.0 and 1.0, got {max_pct}.") + + # Resolve which sentinel values (and whether nulls) count as missing for this step + if reason is not None: + sentinels = missing.values_for_reason(reason) + count_null = missing.null_is_missing and missing.null_reason == reason + elif category is not None: + sentinels = missing.values_for_category(category) + cat_reasons = (missing.categories or {}).get(category, []) + count_null = missing.null_is_missing and missing.null_reason in cat_reasons + else: + sentinels = missing.sentinel_values() + count_null = missing.null_is_missing + + # Determine threshold to use (global or local) and normalize a local `thresholds=` value + thresholds = ( + self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + ) + + # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later + # resolve the columns + if isinstance(columns, (ColumnSelector, nw.selectors.Selector)): + columns = col(columns) + + # If `columns` is Column value or a string, place it in a list for iteration + if isinstance(columns, (Column, str)): + columns = [columns] + + # Determine brief to use (global or local) and transform any shorthands of `brief=` + brief = self.brief if brief is None else _transform_auto_brief(brief=brief) + + # Iterate over the columns and create a validation step for each + for column in columns: + val_info = _ValidationInfo( + assertion_type=assertion_type, + column=column, + values={ + "sentinels": sentinels, + "count_null": count_null, + "max_pct": max_pct, + "reason": reason, + "category": category, + }, + thresholds=thresholds, + actions=actions, + brief=brief, + active=active, + ) + + self._add_validation(validation_info=val_info) + + return self def rows_distinct( self, columns_subset: 
str | list[str] | None = None, From 00c571f866ae392efee17cbbc02fb3b233cf1de4 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 14:59:12 -0400 Subject: [PATCH 02/55] Add col_missing_coded() --- pointblank/validate.py | 139 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) diff --git a/pointblank/validate.py b/pointblank/validate.py index d0b78c277..ac4cdc28e 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -10576,6 +10576,145 @@ def col_pct_missing( self._add_validation(validation_info=val_info) return self + + def col_missing_coded( + self, + columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, + missing: MissingSpec, + pre: Callable | None = None, + segments: SegmentSpec | None = None, + thresholds: int | float | bool | tuple | dict | Thresholds | None = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: + """ + Validate that all missing values in a column are *coded* (no uncoded nulls). + + The `col_missing_coded()` validation method checks that every absent value in a column is + expressed with an explicit missing-value code, rather than a raw null. Under the structured + missingness model (see [`MissingSpec`](`pointblank.MissingSpec`)), every absence should + carry a *reason* — encoded as a sentinel value such as `-99` for `"not_asked"`. A raw null + represents *uncoded* (unknown) missingness, so this validation treats raw nulls as failing + test units while declared sentinel values and real values pass. + + This validation operates over the number of test units equal to the number of rows in the + table (determined after any `pre=` mutation has been applied). + + Parameters + ---------- + columns + A single column or a list of columns to validate. Can also use + [`col()`](`pointblank.col`) with column selectors to specify one or more columns. 
If + multiple columns are supplied or resolved, there will be a separate validation step + generated for each column. + missing + A [`MissingSpec`](`pointblank.MissingSpec`) describing the sentinel values (and their + reasons) that encode missingness for this column. The spec documents which codes are + considered valid expressions of missingness. + pre + An optional preprocessing function or lambda to apply to the data table during + interrogation. This function should take a table as input and return a modified table. + segments + An optional directive on segmentation, which serves to split a validation step into + multiple (one step per segment). + thresholds + Set threshold failure levels for reporting and reacting to exceedences of the levels. + The thresholds are set at the step level and will override any global thresholds set in + `Validate(thresholds=...)`. + actions + Optional actions to take when the validation step(s) meets or exceeds any set threshold + levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to + define the actions. + brief + An optional brief description of the validation step that will be displayed in the + reporting table. You can use the templating elements like `"{step}"` to insert + the step number, or `"{auto}"` to include an automatically generated brief. If `True` + the entire brief will be automatically generated. If `None` (the default) then there + won't be a brief. + active + A boolean value or callable that determines whether the validation step should be + active. Using `False` will make the validation step inactive (still reporting its + presence and keeping indexes for the steps unchanged). + + Returns + ------- + Validate + The `Validate` object with the added validation step. 
+ + Examples + -------- + ```{python} + #| echo: false + #| output: false + import pointblank as pb + pb.config(report_incl_header=False, report_incl_footer_timings=False, preview_incl_header=False) + ``` + Here, the `age` column codes its missingness with sentinel values, except for one row that + has a raw null (an uncoded absence): + + ```{python} + import pointblank as pb + import polars as pl + + tbl = pl.DataFrame({"age": [34, -98, 41, None, 29, -99, 55, 38]}) + + age_missing = pb.MissingSpec( + reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}, + ) + + validation = ( + pb.Validate(data=tbl) + .col_missing_coded(columns="age", missing=age_missing) + .interrogate() + ) + + validation + ``` + + The validation reports a single failing test unit: the row where `age` is a raw null, which + represents missingness without a documented reason. + """ + assertion_type = _get_fn_name() + + _check_column(column=columns) + _check_pre(pre=pre) + _check_thresholds(thresholds=thresholds) + _check_active_input(param=active, param_name="active") + + if not isinstance(missing, MissingSpec): + raise TypeError( + f"`missing=` must be a MissingSpec, got {type(missing).__name__}." 
+ ) + + # Determine threshold to use (global or local) and normalize a local `thresholds=` value + thresholds = ( + self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + ) + + columns = _resolve_columns(columns) + + # Determine brief to use (global or local) and transform any shorthands of `brief=` + brief = self.brief if brief is None else _transform_auto_brief(brief=brief) + + # Iterate over the columns and create a validation step for each + for column in columns: + val_info = _ValidationInfo( + assertion_type=assertion_type, + column=column, + values=missing, + pre=pre, + segments=segments, + thresholds=thresholds, + actions=actions, + brief=brief, + active=active, + ) + + self._add_validation(validation_info=val_info) + + return self + def rows_distinct( self, columns_subset: str | list[str] | None = None, From 17aaf5f4658c591371a6cd959b03d1e5eaf68a07 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 14:59:29 -0400 Subject: [PATCH 03/55] Add _create_text_col_pct_missing() util fn --- pointblank/validate.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/pointblank/validate.py b/pointblank/validate.py index ac4cdc28e..4f64bbe3b 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -20004,6 +20004,40 @@ def _create_text_col_pct_null( return text +def _create_text_col_pct_missing( + lang: str, + column: str | None, + value: dict, + for_failure: bool = False, + locale: str | None = None, +) -> str: + """Create autobrief/failure text for col_pct_missing validation.""" + type_ = _expect_failure_type(for_failure=for_failure) + + column_text = _prep_column_text(column=column) + + fmt_locale = locale if locale else lang + + max_pct_value = value.get("max_pct", 0) * 100 # Convert to percentage + max_pct_formatted = _format_number_safe(max_pct_value, decimals=1, locale=fmt_locale) + + return EXPECT_FAIL_TEXT[f"col_pct_missing_{type_}_text"][lang].format( + 
column_text=column_text, + max_pct=max_pct_formatted, + ) + + +def _create_text_col_missing_coded(lang: str, column: str | None, for_failure: bool = False) -> str: + """Create autobrief/failure text for col_missing_coded validation.""" + type_ = _expect_failure_type(for_failure=for_failure) + + column_text = _prep_column_text(column=column) + + return EXPECT_FAIL_TEXT[f"col_missing_coded_{type_}_text"][lang].format( + column_text=column_text, + ) + + def _create_text_conjointly(lang: str, for_failure: bool = False) -> str: type_ = _expect_failure_type(for_failure=for_failure) From db9228729d4d045055026299ef567bd003f42141 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 14:59:41 -0400 Subject: [PATCH 04/55] Update validate.py --- pointblank/validate.py | 49 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/pointblank/validate.py b/pointblank/validate.py index 4f64bbe3b..f9cc73c7d 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -63,6 +63,7 @@ SpeciallyValidation, col_count_match, col_exists, + col_pct_missing, col_pct_null, col_schema_match, col_vals_expr, @@ -75,6 +76,7 @@ interrogate_le, interrogate_lt, interrogate_ne, + interrogate_missing_coded, interrogate_not_null, interrogate_notin, interrogate_null, @@ -85,6 +87,7 @@ rows_complete, ) from pointblank._typing import SegmentSpec +from pointblank.missing import MissingSpec from pointblank._utils import ( _check_any_df_lib, _check_invalid_fields, @@ -14080,6 +14083,7 @@ def interrogate( "col_vals_le", "col_vals_null", "col_vals_not_null", + "col_missing_coded", "col_vals_increasing", "col_vals_decreasing", "col_vals_between", @@ -14122,6 +14126,8 @@ def interrogate( results_tbl = interrogate_null(tbl=tbl, column=column) elif assertion_method == "not_null": results_tbl = interrogate_not_null(tbl=tbl, column=column) + elif assertion_method == "missing_coded": + results_tbl = interrogate_missing_coded(tbl=tbl, column=column) elif 
assertion_type == "col_vals_increasing": from pointblank._interrogation import interrogate_increasing @@ -14208,6 +14214,22 @@ def interrogate( results_tbl = None + elif assertion_type == "col_pct_missing": + result_bool = col_pct_missing( + data_tbl=data_tbl_step, + column=column, + sentinels=value["sentinels"], + count_null=value["count_null"], + max_pct=value["max_pct"], + ) + + validation.all_passed = result_bool + validation.n = 1 + validation.n_passed = int(result_bool) + validation.n_failed = 1 - int(result_bool) + + results_tbl = None + elif assertion_type == "col_vals_expr": results_tbl = col_vals_expr( data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type @@ -17267,6 +17289,7 @@ def get_tabular_report( elif assertion_type[i] in [ "col_vals_null", "col_vals_not_null", + "col_missing_coded", "col_exists", "rows_distinct", "rows_complete", @@ -17282,6 +17305,16 @@ def get_tabular_report( tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0 values_upd.append(f"p = {p_value}
tol = {tol_value}") + elif assertion_type[i] in ["col_pct_missing"]: + # Format the max_pct and any reason/category filter for display + max_pct_value = value["max_pct"] + filter_line = "" + if value.get("reason") is not None: + filter_line = f"
reason = {value['reason']}" + elif value.get("category") is not None: + filter_line = f"
category = {value['category']}" + values_upd.append(f"max_pct = {max_pct_value}{filter_line}") + elif assertion_type[i] in ["data_freshness"]: # Format max_age nicely for display max_age = value.get("max_age") @@ -19595,6 +19628,22 @@ def _create_autobrief_or_failure_text( n_rows=n_rows, ) + if assertion_type == "col_pct_missing": + return _create_text_col_pct_missing( + lang=lang, + column=column, + value=values, + for_failure=for_failure, + locale=locale if locale else lang, + ) + + if assertion_type == "col_missing_coded": + return _create_text_col_missing_coded( + lang=lang, + column=column, + for_failure=for_failure, + ) + if assertion_type == "conjointly": return _create_text_conjointly(lang=lang, for_failure=for_failure) From 81ef682e9a3949a8aafcdc857a569c1441f129ab Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:00:03 -0400 Subject: [PATCH 05/55] Add the MissingSpec class --- pointblank/missing.py | 242 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 pointblank/missing.py diff --git a/pointblank/missing.py b/pointblank/missing.py new file mode 100644 index 000000000..04f3150a6 --- /dev/null +++ b/pointblank/missing.py @@ -0,0 +1,242 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +__all__ = [ + "MissingSpec", +] + + +@dataclass +class MissingSpec: + """ + Specification for structured missing values in a column. + + Real-world data rarely encodes missingness as a single `null` value. Survey data distinguishes + *refused* from *don't know* from *not applicable*; clinical data uses codes like `"NOT DONE"`; + statistical packages use sentinel values such as `-99`, `".A"`, or `""`. A `MissingSpec` + captures these sentinel values, the *reason* each one represents, and how they should be + handled during validation and analysis. 
+ + This brings the idea of *structured missingness* (a missing value carries a reason for its + absence) into Pointblank's runtime validation layer. Once defined, a `MissingSpec` can be + passed to validation methods (via `missing=`) to automatically exclude sentinel values from + constraint checks, or used with dedicated methods like + [`Validate.col_missing_coded()`](`pointblank.Validate.col_missing_coded`) and + [`Validate.col_pct_missing()`](`pointblank.Validate.col_pct_missing`). + + Parameters + ---------- + reasons + A dictionary mapping sentinel values to reason labels. Keys are the actual values present + in the data (e.g., `-99`, `"NA"`, `".A"`). Values are human-readable reason identifiers + (e.g., `"refused"`, `"not_asked"`). + categories + Optional grouping of reasons into categories (e.g., an `"item_nonresponse"` category that + groups `"refused"` and `"dont_know"`). Useful for aggregate reporting and for checking + missingness rates by category. Each value is a list of reason labels that appear in + `reasons`. Default is `None`. + null_is_missing + Whether actual null/`None`/`NaN` values should also be treated as missing (with reason + given by `null_reason`). Default is `True`. + null_reason + The reason label assigned to actual null values when `null_is_missing=True`. Default is + `"unknown"`. + description + Optional human-readable description of the overall missingness pattern. Default is `None`. + + Returns + ------- + MissingSpec + A missing-value specification that can be attached to a `Field` (via `missing=`) or passed + to validation methods. 
+ + Examples + -------- + Define the missing-value codes for a survey `age` variable: + + ```python + import pointblank as pb + + age_missing = pb.MissingSpec( + reasons={ + -99: "not_asked", # Question wasn't asked to this participant + -98: "refused", # Participant declined to answer + -97: "dont_know", # Participant didn't know + -96: "not_applicable", # Question doesn't apply + }, + categories={ + "item_nonresponse": ["refused", "dont_know"], + "design": ["not_asked", "not_applicable"], + }, + ) + ``` + + The spec can then answer questions about its own structure: + + ```python + age_missing.sentinel_values() # [-99, -98, -97, -96] + age_missing.reason_for(-98) # "refused" + age_missing.values_for_reason("refused") # [-98] + age_missing.values_for_category("item_nonresponse") # [-98, -97] + ``` + """ + + reasons: dict[Any, str] + categories: dict[str, list[str]] | None = None + null_is_missing: bool = True + null_reason: str = "unknown" + description: str | None = field(default=None) + + def __post_init__(self) -> None: + self._validate() + + def _validate(self) -> None: + """Validate that the missing specification is internally consistent.""" + if not isinstance(self.reasons, dict): + raise TypeError( + f"reasons must be a dict mapping sentinel values to reason labels, " + f"got {type(self.reasons).__name__}" + ) + + if len(self.reasons) == 0 and not self.null_is_missing: + raise ValueError( + "A MissingSpec must define at least one sentinel value in `reasons`, " + "or set `null_is_missing=True`." + ) + + for value, reason in self.reasons.items(): + if not isinstance(reason, str): + raise TypeError( + f"Reason labels must be strings, got {type(reason).__name__} " + f"for sentinel value {value!r}." + ) + + if not isinstance(self.null_reason, str): + raise TypeError( + f"null_reason must be a string, got {type(self.null_reason).__name__}." 
+ ) + + if self.categories is not None: + if not isinstance(self.categories, dict): + raise TypeError( + f"categories must be a dict mapping category names to lists of reason " + f"labels, got {type(self.categories).__name__}." + ) + + known_reasons = set(self.reasons.values()) + if self.null_is_missing: + known_reasons.add(self.null_reason) + + for category, reason_list in self.categories.items(): + if not isinstance(reason_list, (list, tuple)): + raise TypeError( + f"Category '{category}' must map to a list of reason labels, " + f"got {type(reason_list).__name__}." + ) + unknown = [r for r in reason_list if r not in known_reasons] + if unknown: + raise ValueError( + f"Category '{category}' references unknown reason label(s) {unknown}. " + f"Known reasons are {sorted(known_reasons)}." + ) + + def sentinel_values(self) -> list: + """Get all sentinel values that encode missingness. + + Returns + ------- + list + The keys of `reasons` (the actual values in the data that represent missingness). + Note that this does *not* include `None` even when `null_is_missing=True`; use + [`is_missing()`](`pointblank.MissingSpec.is_missing`) to test individual values. + """ + return list(self.reasons.keys()) + + def reason_for(self, value: Any) -> str | None: + """Get the reason label for a specific value. + + Parameters + ---------- + value + A value from the data. + + Returns + ------- + str | None + The reason label if `value` is a declared sentinel value, `null_reason` if `value` + is `None` and `null_is_missing=True`, or `None` if the value is not considered + missing. + """ + if value is None: + return self.null_reason if self.null_is_missing else None + return self.reasons.get(value) + + def is_missing(self, value: Any) -> bool: + """Check whether a value should be considered missing under this spec. + + Parameters + ---------- + value + A value from the data. 
+ + Returns + ------- + bool + `True` if `value` is a declared sentinel value, or if `value` is `None` and + `null_is_missing=True`. + """ + if value is None: + return self.null_is_missing + return value in self.reasons + + def values_for_reason(self, reason: str) -> list: + """Get all sentinel values that correspond to a given reason. + + Parameters + ---------- + reason + A reason label. + + Returns + ------- + list + All sentinel values mapped to `reason`. + """ + return [v for v, r in self.reasons.items() if r == reason] + + def values_for_category(self, category: str) -> list: + """Get all sentinel values whose reason falls in a given category. + + Parameters + ---------- + category + A category name defined in `categories`. + + Returns + ------- + list + All sentinel values whose reason label is in the given category. Returns an empty + list if `categories` is `None` or the category is undefined. + """ + if self.categories is None: + return [] + reasons_in_cat = self.categories.get(category, []) + return [v for v, r in self.reasons.items() if r in reasons_in_cat] + + def reasons_list(self) -> list[str]: + """Get the distinct reason labels defined by this spec. + + Returns + ------- + list[str] + The distinct reason labels (in first-seen order), including `null_reason` when + `null_is_missing=True`. 
+ """ + seen: dict[str, None] = {} + for r in self.reasons.values(): + seen.setdefault(r, None) + if self.null_is_missing: + seen.setdefault(self.null_reason, None) + return list(seen.keys()) From 6a8439aac27678d088b70348eca961976da4f8a4 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:00:07 -0400 Subject: [PATCH 06/55] Update __init__.py --- pointblank/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pointblank/__init__.py b/pointblank/__init__.py index e8d37b872..231e8233f 100644 --- a/pointblank/__init__.py +++ b/pointblank/__init__.py @@ -57,6 +57,7 @@ from pointblank.generate.base import GeneratorConfig from pointblank.inspect import has_columns, has_rows from pointblank.integrations.otel import emit_otel +from pointblank.missing import MissingSpec from pointblank.metadata import ( ADaMDatasetTemplate, ADaMVariableSpec, @@ -120,6 +121,7 @@ "PipelineResult", "DataScan", "DraftValidation", + "MissingSpec", "col", "ref", "expr_col", From 97c39b9dc16e5baec6e4e66c57fba87ae71820a2 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:00:45 -0400 Subject: [PATCH 07/55] Add compatible dtypes for missing_coded --- pointblank/_constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pointblank/_constants.py b/pointblank/_constants.py index 204ef412e..287e97e8d 100644 --- a/pointblank/_constants.py +++ b/pointblank/_constants.py @@ -21,6 +21,7 @@ "within_spec": ["str"], "null": ["str", "numeric", "bool", "datetime", "duration"], "not_null": ["str", "numeric", "bool", "datetime", "duration"], + "missing_coded": ["str", "numeric", "bool", "datetime", "duration"], } ASSERTION_TYPE_METHOD_MAP: dict[str, str] = { From 4a9a8d65565b6fccf0c2fdfa77d775122ba96afe Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:01:13 -0400 Subject: [PATCH 08/55] Add to assertion-type/method map --- pointblank/_constants.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pointblank/_constants.py 
b/pointblank/_constants.py index 287e97e8d..36b477237 100644 --- a/pointblank/_constants.py +++ b/pointblank/_constants.py @@ -26,6 +26,8 @@ ASSERTION_TYPE_METHOD_MAP: dict[str, str] = { "col_pct_null": "pct_null", + "col_pct_missing": "pct_missing", + "col_missing_coded": "missing_coded", "col_vals_gt": "gt", "col_vals_lt": "lt", "col_vals_eq": "eq", From 5c8f6a728689c8f6785a162dc3100c656edef8c5 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:01:32 -0400 Subject: [PATCH 09/55] Declare col_missing_coded() as row-based --- pointblank/_constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pointblank/_constants.py b/pointblank/_constants.py index 36b477237..78b6b6163 100644 --- a/pointblank/_constants.py +++ b/pointblank/_constants.py @@ -94,6 +94,7 @@ "col_vals_decreasing", "col_vals_null", "col_vals_not_null", + "col_missing_coded", "col_vals_expr", "conjointly", "prompt", From 0cb11a4a6d04f3a8f206b09aef85a1d5defb8070 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:01:46 -0400 Subject: [PATCH 10/55] Add icons for reporting outputs --- pointblank/_constants.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pointblank/_constants.py b/pointblank/_constants.py index 78b6b6163..8e644d7f3 100644 --- a/pointblank/_constants.py +++ b/pointblank/_constants.py @@ -644,6 +644,18 @@ +""", + "col_pct_missing": """ + + pct_missing + + + + + + + + """, "col_vals_not_null": """ From 30fb8853a1d12eb7654d24aa31d503d0c4afc5c7 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:01:54 -0400 Subject: [PATCH 11/55] Update _constants.py --- pointblank/_constants.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pointblank/_constants.py b/pointblank/_constants.py index 8e644d7f3..d898a7495 100644 --- a/pointblank/_constants.py +++ b/pointblank/_constants.py @@ -668,6 +668,16 @@ +""", + "col_missing_coded": """ + + col_missing_coded + + + + + + """, "col_vals_regex": """ From 
cdfdc457d2ec638f2ef8ecb1ab4151c57fd9eb61 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:02:14 -0400 Subject: [PATCH 12/55] Add translations for missing validations --- pointblank/_constants_translations.py | 168 ++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) diff --git a/pointblank/_constants_translations.py b/pointblank/_constants_translations.py index cb968d0ff..920d494e8 100644 --- a/pointblank/_constants_translations.py +++ b/pointblank/_constants_translations.py @@ -1049,6 +1049,174 @@ "th": "เปอร์เซ็นต์ของค่า null ใน {column_text} ไม่อยู่ภายใน [{lower}%, {upper}%]", "fa": "درصد مقادیر null در {column_text} در محدوده [{lower}%, {upper}%] نبود.", }, + "col_pct_missing_expectation_text": { + "en": "Expect that the percentage of missing values in {column_text} is at most {max_pct}%.", + "fr": "On s'attend à ce que le pourcentage de valeurs manquantes dans {column_text} soit d'au plus {max_pct}%.", + "de": "Erwarten Sie, dass der Prozentsatz der fehlenden Werte in {column_text} höchstens {max_pct}% beträgt.", + "it": "Aspettatevi che la percentuale di valori mancanti in {column_text} sia al massimo {max_pct}%.", + "es": "Se espera que el porcentaje de valores faltantes en {column_text} sea como máximo {max_pct}%.", + "pt": "Espera-se que a porcentagem de valores ausentes em {column_text} seja no máximo {max_pct}%.", + "ro": "Se așteaptă ca procentul valorilor lipsă în {column_text} să fie cel mult {max_pct}%.", + "tr": "{column_text} içindeki eksik değerlerin yüzdesinin en fazla {max_pct}% olmasını bekleyin.", + "zh-Hans": "预期{column_text}中缺失值的百分比最多为{max_pct}%。", + "zh-Hant": "{column_text}中缺失值的百分比應最多為{max_pct}%。", + "ja": "{column_text}の欠損値の割合が最大{max_pct}%であることを期待します。", + "ko": "{column_text}의 결측값 비율이 최대 {max_pct}%이어야 합니다.", + "vi": "Kỳ vọng tỷ lệ phần trăm giá trị thiếu trong {column_text} tối đa là {max_pct}%.", + "ru": "Ожидается, что процент отсутствующих значений в {column_text} составит не более {max_pct}%.", + "cs": 
"Očekává se, že procento chybějících hodnot ve sloupci {column_text} bude nejvýše {max_pct}%.", + "pl": "Oczekuje się, że procent brakujących wartości w {column_text} wyniesie co najwyżej {max_pct}%.", + "da": "Forvent, at procentdelen af manglende værdier i {column_text} højst er {max_pct}%.", + "sv": "Förvänta dig att andelen saknade värden i {column_text} är högst {max_pct}%.", + "nb": "Forvent at prosentandelen av manglende verdier i {column_text} er høyst {max_pct}%.", + "nl": "Verwacht dat het percentage ontbrekende waarden in {column_text} hoogstens {max_pct}% is.", + "fi": "Odota, että puuttuvien arvojen prosenttiosuus sarakkeessa {column_text} on enintään {max_pct}%.", + "is": "Væntir þess að hlutfall vantandi gilda í {column_text} sé í mesta lagi {max_pct}%.", + "ar": "توقع أن تكون نسبة القيم المفقودة في {column_text} {max_pct}% على الأكثر.", + "hi": "अपेक्षा है कि {column_text} में अनुपस्थित मानों का प्रतिशत अधिकतम {max_pct}% होना चाहिए।", + "el": "Αναμένεται το ποσοστό των ελλιπών τιμών στη στήλη {column_text} να είναι το πολύ {max_pct}%.", + "id": "Mengharapkan bahwa persentase nilai yang hilang dalam {column_text} paling banyak {max_pct}%.", + "uk": "Очікується, що відсоток відсутніх значень в {column_text} становитиме не більше {max_pct}%.", + "bg": "Очаква се процентът на липсващите стойности в {column_text} да бъде най-много {max_pct}%.", + "hr": "Očekuje se da postotak nedostajućih vrijednosti u {column_text} bude najviše {max_pct}%.", + "et": "Eeldatakse, et puuduvate väärtuste protsent veerus {column_text} on kõige rohkem {max_pct}%.", + "hu": "Elvárás, hogy a hiányzó értékek aránya a {column_text} oszlopban legfeljebb {max_pct}% legyen.", + "ga": "Táthar ag súil go mbeadh céatadán na luachanna ar iarraidh i {column_text} ar a mhéad {max_pct}%.", + "lv": "Tiek sagaidīts, ka trūkstošo vērtību procents {column_text} būs ne vairāk kā {max_pct}%.", + "lt": "Tikimasi, kad trūkstamų reikšmių procentas stulpelyje {column_text} bus ne daugiau kaip 
{max_pct}%.", + "mt": "Mistenni li l-perċentwal ta' valuri nieqsa f'{column_text} huwa l-aktar {max_pct}%.", + "sk": "Očakáva sa, že percento chýbajúcich hodnôt v {column_text} bude najviac {max_pct}%.", + "sl": "Pričakuje se, da bo odstotek manjkajočih vrednosti v {column_text} največ {max_pct}%.", + "he": "צפוי שאחוז הערכים החסרים ב{column_text} יהיה לכל היותר {max_pct}%.", + "th": "คาดหวังว่าเปอร์เซ็นต์ของค่าที่หายไปใน {column_text} จะไม่เกิน {max_pct}%", + "fa": "انتظار می‌رود که درصد مقادیر مفقود در {column_text} حداکثر {max_pct}% باشد.", + }, + "col_pct_missing_failure_text": { + "en": "The percentage of missing values in {column_text} exceeded {max_pct}%.", + "fr": "Le pourcentage de valeurs manquantes dans {column_text} a dépassé {max_pct}%.", + "de": "Der Prozentsatz der fehlenden Werte in {column_text} überschritt {max_pct}%.", + "it": "La percentuale di valori mancanti in {column_text} ha superato {max_pct}%.", + "es": "El porcentaje de valores faltantes en {column_text} superó {max_pct}%.", + "pt": "A porcentagem de valores ausentes em {column_text} excedeu {max_pct}%.", + "ro": "Procentul valorilor lipsă în {column_text} a depășit {max_pct}%.", + "tr": "{column_text} içindeki eksik değerlerin yüzdesi {max_pct}% değerini aştı.", + "zh-Hans": "{column_text}中缺失值的百分比超过了{max_pct}%。", + "zh-Hant": "{column_text}中缺失值的百分比超過了{max_pct}%。", + "ja": "{column_text}の欠損値の割合が{max_pct}%を超えました。", + "ko": "{column_text}의 결측값 비율이 {max_pct}%를 초과했습니다.", + "vi": "Tỷ lệ phần trăm giá trị thiếu trong {column_text} đã vượt quá {max_pct}%.", + "ru": "Процент отсутствующих значений в {column_text} превысил {max_pct}%.", + "cs": "Procento chybějících hodnot ve sloupci {column_text} překročilo {max_pct}%.", + "pl": "Procent brakujących wartości w {column_text} przekroczył {max_pct}%.", + "da": "Procentdelen af manglende værdier i {column_text} oversteg {max_pct}%.", + "sv": "Andelen saknade värden i {column_text} översteg {max_pct}%.", + "nb": "Prosentandelen av manglende verdier i 
{column_text} oversteg {max_pct}%.", + "nl": "Het percentage ontbrekende waarden in {column_text} overschreed {max_pct}%.", + "fi": "Puuttuvien arvojen prosenttiosuus sarakkeessa {column_text} ylitti {max_pct}%.", + "is": "Hlutfall vantandi gilda í {column_text} fór yfir {max_pct}%.", + "ar": "تجاوزت نسبة القيم المفقودة في {column_text} {max_pct}%.", + "hi": "{column_text} में अनुपस्थित मानों का प्रतिशत {max_pct}% से अधिक था।", + "el": "Το ποσοστό των ελλιπών τιμών στη στήλη {column_text} ξεπέρασε {max_pct}%.", + "id": "Persentase nilai yang hilang dalam {column_text} melebihi {max_pct}%.", + "uk": "Відсоток відсутніх значень в {column_text} перевищив {max_pct}%.", + "bg": "Процентът на липсващите стойности в {column_text} надхвърли {max_pct}%.", + "hr": "Postotak nedostajućih vrijednosti u {column_text} premašio je {max_pct}%.", + "et": "Puuduvate väärtuste protsent veerus {column_text} ületas {max_pct}%.", + "hu": "A hiányzó értékek aránya a {column_text} oszlopban meghaladta a {max_pct}%-ot.", + "ga": "Sháraigh céatadán na luachanna ar iarraidh i {column_text} {max_pct}%.", + "lv": "Trūkstošo vērtību procents {column_text} pārsniedza {max_pct}%.", + "lt": "Trūkstamų reikšmių procentas stulpelyje {column_text} viršijo {max_pct}%.", + "mt": "Il-perċentwal ta' valuri nieqsa f'{column_text} qabeż {max_pct}%.", + "sk": "Percento chýbajúcich hodnôt v {column_text} prekročilo {max_pct}%.", + "sl": "Odstotek manjkajočih vrednosti v {column_text} je presegel {max_pct}%.", + "he": "אחוז הערכים החסרים ב{column_text} חרג מ-{max_pct}%.", + "th": "เปอร์เซ็นต์ของค่าที่หายไปใน {column_text} เกิน {max_pct}%", + "fa": "درصد مقادیر مفقود در {column_text} از {max_pct}% فراتر رفت.", + }, + "col_missing_coded_expectation_text": { + "en": "Expect that all missing values in {column_text} are coded (no uncoded Null values).", + "fr": "On s'attend à ce que toutes les valeurs manquantes dans {column_text} soient codées (aucune valeur nulle non codée).", + "de": "Erwarten Sie, dass alle 
fehlenden Werte in {column_text} kodiert sind (keine unkodierten Nullwerte).", + "it": "Aspettatevi che tutti i valori mancanti in {column_text} siano codificati (nessun valore nullo non codificato).", + "es": "Se espera que todos los valores faltantes en {column_text} estén codificados (sin valores nulos no codificados).", + "pt": "Espera-se que todos os valores ausentes em {column_text} estejam codificados (sem valores nulos não codificados).", + "ro": "Se așteaptă ca toate valorile lipsă în {column_text} să fie codificate (fără valori nule necodificate).", + "tr": "{column_text} içindeki tüm eksik değerlerin kodlanmış olmasını bekleyin (kodlanmamış boş değer yok).", + "zh-Hans": "预期{column_text}中所有缺失值都已编码(没有未编码的空值)。", + "zh-Hant": "{column_text}中所有缺失值都應已編碼(沒有未編碼的空值)。", + "ja": "{column_text}のすべての欠損値がコード化されていることを期待します(コード化されていないnull値がない)。", + "ko": "{column_text}의 모든 결측값이 코드화되어 있어야 합니다(코드화되지 않은 null 값 없음).", + "vi": "Kỳ vọng tất cả giá trị thiếu trong {column_text} đều được mã hóa (không có giá trị null chưa mã hóa).", + "ru": "Ожидается, что все отсутствующие значения в {column_text} закодированы (нет незакодированных нулевых значений).", + "cs": "Očekává se, že všechny chybějící hodnoty ve sloupci {column_text} jsou zakódované (žádné nezakódované null hodnoty).", + "pl": "Oczekuje się, że wszystkie brakujące wartości w {column_text} są zakodowane (brak niezakodowanych wartości null).", + "da": "Forvent, at alle manglende værdier i {column_text} er kodede (ingen ukodede null-værdier).", + "sv": "Förvänta dig att alla saknade värden i {column_text} är kodade (inga okodade null-värden).", + "nb": "Forvent at alle manglende verdier i {column_text} er kodet (ingen ukodede null-verdier).", + "nl": "Verwacht dat alle ontbrekende waarden in {column_text} gecodeerd zijn (geen ongecodeerde null-waarden).", + "fi": "Odota, että kaikki puuttuvat arvot sarakkeessa {column_text} on koodattu (ei koodaamattomia null-arvoja).", + "is": "Væntir þess að öll vantandi gildi í 
{column_text} séu kóðuð (engin ókóðuð null-gildi).", + "ar": "توقع أن تكون جميع القيم المفقودة في {column_text} مرمّزة (لا توجد قيم فارغة غير مرمّزة).", + "hi": "अपेक्षा है कि {column_text} में सभी अनुपस्थित मान कोडित हों (कोई बिना कोडित null मान नहीं)।", + "el": "Αναμένεται όλες οι ελλιπείς τιμές στη στήλη {column_text} να είναι κωδικοποιημένες (καμία μη κωδικοποιημένη null τιμή).", + "id": "Mengharapkan bahwa semua nilai yang hilang dalam {column_text} dikodekan (tidak ada nilai null yang tidak dikodekan).", + "uk": "Очікується, що всі відсутні значення в {column_text} закодовані (немає незакодованих нульових значень).", + "bg": "Очаква се всички липсващи стойности в {column_text} да са кодирани (без некодирани null стойности).", + "hr": "Očekuje se da su sve nedostajuće vrijednosti u {column_text} kodirane (bez nekodiranih null vrijednosti).", + "et": "Eeldatakse, et kõik puuduvad väärtused veerus {column_text} on kodeeritud (kodeerimata null-väärtusi pole).", + "hu": "Elvárás, hogy a {column_text} oszlopban minden hiányzó érték kódolt legyen (nincs kódolatlan null érték).", + "ga": "Táthar ag súil go mbeadh gach luach ar iarraidh i {column_text} códaithe (gan aon luachanna null gan chódú).", + "lv": "Tiek sagaidīts, ka visas trūkstošās vērtības {column_text} ir kodētas (nav nekodētu null vērtību).", + "lt": "Tikimasi, kad visos trūkstamos reikšmės stulpelyje {column_text} yra užkoduotos (nėra neužkoduotų null reikšmių).", + "mt": "Mistenni li l-valuri nieqsa kollha f'{column_text} huma kodifikati (l-ebda valuri null mhux kodifikati).", + "sk": "Očakáva sa, že všetky chýbajúce hodnoty v {column_text} sú zakódované (žiadne nezakódované null hodnoty).", + "sl": "Pričakuje se, da so vse manjkajoče vrednosti v {column_text} kodirane (brez nekodiranih null vrednosti).", + "he": "צפוי שכל הערכים החסרים ב{column_text} יהיו מקודדים (אין ערכי null לא מקודדים).", + "th": "คาดหวังว่าค่าที่หายไปทั้งหมดใน {column_text} จะถูกเข้ารหัส (ไม่มีค่า null ที่ไม่ได้เข้ารหัส)", + 
"fa": "انتظار می‌رود که همه مقادیر مفقود در {column_text} کدگذاری شده باشند (هیچ مقدار null کدگذاری‌نشده‌ای وجود نداشته باشد).", + }, + "col_missing_coded_failure_text": { + "en": "Uncoded missing values (raw Null values) were present in {column_text}.", + "fr": "Des valeurs manquantes non codées (valeurs nulles brutes) étaient présentes dans {column_text}.", + "de": "Unkodierte fehlende Werte (rohe Nullwerte) waren in {column_text} vorhanden.", + "it": "Erano presenti valori mancanti non codificati (valori nulli grezzi) in {column_text}.", + "es": "Había valores faltantes no codificados (valores nulos sin procesar) en {column_text}.", + "pt": "Havia valores ausentes não codificados (valores nulos brutos) em {column_text}.", + "ro": "Valori lipsă necodificate (valori nule brute) au fost prezente în {column_text}.", + "tr": "{column_text} içinde kodlanmamış eksik değerler (ham boş değerler) mevcuttu.", + "zh-Hans": "{column_text}中存在未编码的缺失值(原始空值)。", + "zh-Hant": "{column_text}中存在未編碼的缺失值(原始空值)。", + "ja": "{column_text}にコード化されていない欠損値(生のnull値)が存在しました。", + "ko": "{column_text}에 코드화되지 않은 결측값(원시 null 값)이 있었습니다.", + "vi": "Có giá trị thiếu chưa mã hóa (giá trị null thô) trong {column_text}.", + "ru": "В {column_text} присутствовали незакодированные отсутствующие значения (необработанные нулевые значения).", + "cs": "Ve sloupci {column_text} byly přítomny nezakódované chybějící hodnoty (surové null hodnoty).", + "pl": "W {column_text} obecne były niezakodowane brakujące wartości (surowe wartości null).", + "da": "Ukodede manglende værdier (rå null-værdier) var til stede i {column_text}.", + "sv": "Okodade saknade värden (råa null-värden) förekom i {column_text}.", + "nb": "Ukodede manglende verdier (rå null-verdier) var til stede i {column_text}.", + "nl": "Ongecodeerde ontbrekende waarden (ruwe null-waarden) waren aanwezig in {column_text}.", + "fi": "Sarakkeessa {column_text} oli koodaamattomia puuttuvia arvoja (raakoja null-arvoja).", + "is": "Ókóðuð vantandi gildi (hrá 
null-gildi) voru til staðar í {column_text}.", + "ar": "كانت هناك قيم مفقودة غير مرمّزة (قيم فارغة خام) في {column_text}.", + "hi": "{column_text} में बिना कोडित अनुपस्थित मान (कच्चे null मान) मौजूद थे।", + "el": "Μη κωδικοποιημένες ελλιπείς τιμές (ακατέργαστες null τιμές) υπήρχαν στη στήλη {column_text}.", + "id": "Nilai yang hilang tidak dikodekan (nilai null mentah) ada dalam {column_text}.", + "uk": "У {column_text} були присутні незакодовані відсутні значення (необроблені нульові значення).", + "bg": "В {column_text} присъстваха некодирани липсващи стойности (необработени null стойности).", + "hr": "U {column_text} bile su prisutne nekodirane nedostajuće vrijednosti (sirove null vrijednosti).", + "et": "Veerus {column_text} esinesid kodeerimata puuduvad väärtused (toored null-väärtused).", + "hu": "A {column_text} oszlopban kódolatlan hiányzó értékek (nyers null értékek) voltak jelen.", + "ga": "Bhí luachanna ar iarraidh gan chódú (luachanna null amha) i láthair i {column_text}.", + "lv": "{column_text} bija nekodētas trūkstošās vērtības (neapstrādātas null vērtības).", + "lt": "Stulpelyje {column_text} buvo neužkoduotų trūkstamų reikšmių (neapdorotų null reikšmių).", + "mt": "Valuri nieqsa mhux kodifikati (valuri null mhux ipproċessati) kienu preżenti f'{column_text}.", + "sk": "V {column_text} sa vyskytli nezakódované chýbajúce hodnoty (surové null hodnoty).", + "sl": "V {column_text} so bile prisotne nekodirane manjkajoče vrednosti (surove null vrednosti).", + "he": "ערכים חסרים לא מקודדים (ערכי null גולמיים) היו נוכחים ב{column_text}.", + "th": "มีค่าที่หายไปที่ไม่ได้เข้ารหัส (ค่า null ดิบ) อยู่ใน {column_text}", + "fa": "مقادیر مفقود کدگذاری‌نشده (مقادیر null خام) در {column_text} وجود داشت.", + }, "regex_expectation_text": { "en": "Expect that values in {column_text} should match the regular expression: {values_text}.", "fr": "On s'attend à ce que les valeurs de {column_text} correspondent à l'expression régulière : {values_text}.", From 
d855d60cf107ece96193d57bebb26d5aac3472ce Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:02:32 -0400 Subject: [PATCH 13/55] Add interrogation functions for missing valdns --- pointblank/_interrogation.py | 59 ++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/pointblank/_interrogation.py b/pointblank/_interrogation.py index 28cd2bf34..8485ef96e 100644 --- a/pointblank/_interrogation.py +++ b/pointblank/_interrogation.py @@ -755,6 +755,52 @@ def col_pct_null( return n_null >= (abs_target - lower_bound) and n_null <= (abs_target + upper_bound) +def col_pct_missing( + data_tbl: IntoFrame, + column: str, + sentinels: list, + count_null: bool, + max_pct: float, +) -> bool: + """Check that the percentage of missing values in a column does not exceed `max_pct`. + + Missing values are those equal to one of the `sentinels` and, when `count_null=True`, actual + null values. The percentage is computed over the total number of rows. + """ + nw_frame = nw.from_native(data_tbl) + + # Build a boolean expression that flags missing values + missing_expr = None + if sentinels: + missing_expr = nw.col(column).is_in(sentinels) + if count_null: + null_expr = nw.col(column).is_null() + missing_expr = null_expr if missing_expr is None else (missing_expr | null_expr) + + if missing_expr is None: + # Nothing counts as missing under this spec/filter + return 0.0 <= max_pct + + # Cast boolean to Int32 before sum to support PySpark which can't sum booleans + if is_narwhals_lazyframe(nw_frame): + stats = nw_frame.select( + total_rows=nw.len(), + n_missing=missing_expr.cast(nw.Int32).sum(), + ).collect() + total_rows: int = int(stats["total_rows"][0]) + n_missing: int = int(stats["n_missing"][0]) + else: + assert is_narwhals_dataframe(nw_frame) + total_rows = int(nw_frame.select(nw.len()).item()) + n_missing = int(nw_frame.select(missing_expr.cast(nw.Int32).sum()).item()) + + if total_rows == 0: + return True + + pct_missing = n_missing / 
total_rows + return pct_missing <= max_pct + + def col_count_match(data_tbl: IntoFrame, count: Any, inverse: bool) -> bool: """ Check if DataFrame column count matches expected count. @@ -2534,6 +2580,19 @@ def interrogate_not_null(tbl: IntoFrame, column: str) -> Any: return result_tbl.to_native() +def interrogate_missing_coded(tbl: IntoFrame, column: str) -> Any: + """Missing-coded interrogation. + + A row passes when its value is *not* a raw null. Under the structured-missingness model, every + absence should be expressed with an explicit sentinel code (which is non-null), so a raw null + represents *uncoded* missingness and fails the test unit. + """ + nw_tbl = nw.from_native(tbl) + assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame)) + result_tbl = nw_tbl.with_columns(pb_is_good_=~nw.col(column).is_null()) + return result_tbl.to_native() + + def interrogate_increasing( tbl: IntoFrame, column: str, allow_stationary: bool, decreasing_tol: float, na_pass: bool ) -> Any: From 47b23a0ab33728281f393e6b34a2b198bd61951e Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:02:35 -0400 Subject: [PATCH 14/55] Update validate.pyi --- pointblank/validate.pyi | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pointblank/validate.pyi b/pointblank/validate.pyi index 82a7664de..47124b150 100644 --- a/pointblank/validate.pyi +++ b/pointblank/validate.pyi @@ -7,6 +7,7 @@ from pathlib import Path from pointblank._typing import SegmentSpec, Tolerance from pointblank._utils import _PBUnresolvedColumn from pointblank.column import Column, ColumnSelector, ColumnSelectorNarwhals, ReferenceColumn +from pointblank.missing import MissingSpec from pointblank.schema import Schema from pointblank.thresholds import Actions, FinalActions, Thresholds from typing import Any, Callable, Literal, ParamSpec, TypeVar @@ -394,6 +395,29 @@ class Validate: brief: str | bool | None = None, active: bool | Callable = True, ) -> Validate: ... 
+ def col_pct_missing( + self, + columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, + missing: MissingSpec, + max_pct: float, + reason: str | None = None, + category: str | None = None, + thresholds: int | float | None | bool | tuple | dict | Thresholds = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: ... + def col_missing_coded( + self, + columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, + missing: MissingSpec, + pre: Callable | None = None, + segments: SegmentSpec | None = None, + thresholds: int | float | bool | tuple | dict | Thresholds | None = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: ... def rows_distinct( self, columns_subset: str | list[str] | None = None, From c68bc5ecdcbd94f3ab4cc7718936d44d5fac8974 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:02:43 -0400 Subject: [PATCH 15/55] Create test_col_missing_coded.py --- tests/test_col_missing_coded.py | 96 +++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 tests/test_col_missing_coded.py diff --git a/tests/test_col_missing_coded.py b/tests/test_col_missing_coded.py new file mode 100644 index 000000000..c4911eaae --- /dev/null +++ b/tests/test_col_missing_coded.py @@ -0,0 +1,96 @@ +import polars as pl +import pytest + +import pointblank as pb + + +@pytest.fixture +def age_missing(): + return pb.MissingSpec(reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}) + + +class TestColMissingCoded: + def test_passes_when_all_coded(self, age_missing): + # All absence expressed as sentinels; no raw nulls + tbl = pl.DataFrame({"age": [34, -98, 41, -99, 29, -97, 55, 38]}) + validation = ( + pb.Validate(data=tbl) + .col_missing_coded(columns="age", missing=age_missing) + .interrogate() + ) + info = validation.validation_info[0] + assert 
info.n == 8 + assert info.n_failed == 0 + assert info.all_passed is True + + def test_fails_on_raw_null(self, age_missing): + tbl = pl.DataFrame({"age": [34, -98, 41, None, 29, -99, 55, None]}) + validation = ( + pb.Validate(data=tbl) + .col_missing_coded(columns="age", missing=age_missing) + .interrogate() + ) + info = validation.validation_info[0] + assert info.n == 8 + assert info.n_failed == 2 # two raw nulls + assert info.all_passed is False + + def test_sentinels_pass(self, age_missing): + # only sentinels and reals, no nulls -> all pass + tbl = pl.DataFrame({"age": [-99, -98, -97, -99]}) + validation = ( + pb.Validate(data=tbl) + .col_missing_coded(columns="age", missing=age_missing) + .interrogate() + ) + assert validation.validation_info[0].all_passed is True + + def test_missing_must_be_missingspec(self): + tbl = pl.DataFrame({"age": [1, 2, 3]}) + with pytest.raises(TypeError): + pb.Validate(data=tbl).col_missing_coded(columns="age", missing={-99: "x"}) + + def test_multiple_columns(self, age_missing): + tbl = pl.DataFrame({"a": [1, None, 3], "b": [-99, 2, 3]}) + validation = ( + pb.Validate(data=tbl) + .col_missing_coded(columns=["a", "b"], missing=age_missing) + .interrogate() + ) + assert len(validation.validation_info) == 2 + assert validation.validation_info[0].n_failed == 1 # column a has a null + assert validation.validation_info[1].n_failed == 0 # column b has none + + def test_report_renders_with_brief(self, age_missing): + tbl = pl.DataFrame({"age": [34, None, 41]}) + validation = ( + pb.Validate(data=tbl) + .col_missing_coded(columns="age", missing=age_missing, brief=True) + .interrogate() + ) + gt = validation.get_tabular_report() + assert gt is not None + + +class TestAutobriefTranslations: + """Exercise the autobrief text builders across languages (no KeyError).""" + + @pytest.mark.parametrize("lang", ["en", "fr", "de", "ja", "ar", "zh-Hans", "fa", "he"]) + def test_col_missing_coded_brief_langs(self, age_missing, lang): + tbl = 
pl.DataFrame({"age": [34, None, 41]}) + validation = ( + pb.Validate(data=tbl, lang=lang) + .col_missing_coded(columns="age", missing=age_missing, brief=True) + .interrogate() + ) + assert validation.validation_info[0].autobrief + + @pytest.mark.parametrize("lang", ["en", "fr", "de", "ja", "ar", "zh-Hans", "fa", "he"]) + def test_col_pct_missing_brief_langs(self, age_missing, lang): + tbl = pl.DataFrame({"age": [34, -98, 41, -99]}) + validation = ( + pb.Validate(data=tbl, lang=lang) + .col_pct_missing(columns="age", missing=age_missing, max_pct=0.5, brief=True) + .interrogate() + ) + assert validation.validation_info[0].autobrief From e3351a5d13a7770ee0eee3b0d7a36105f44b5387 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:02:45 -0400 Subject: [PATCH 16/55] Create test_col_pct_missing.py --- tests/test_col_pct_missing.py | 144 ++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 tests/test_col_pct_missing.py diff --git a/tests/test_col_pct_missing.py b/tests/test_col_pct_missing.py new file mode 100644 index 000000000..a1c7d4b78 --- /dev/null +++ b/tests/test_col_pct_missing.py @@ -0,0 +1,144 @@ +import polars as pl +import pytest + +import pointblank as pb + + +@pytest.fixture +def survey_tbl(): + # 8 rows: ages with sentinel codes + # -99 = not_asked, -98 = refused, -97 = dont_know + # values: 34, -98, 41, -99, 29, -98, 55, 38 + # -> 2 refused, 1 not_asked, 0 dont_know, 5 real -> 3/8 = 0.375 missing + return pl.DataFrame({"age": [34, -98, 41, -99, 29, -98, 55, 38]}) + + +@pytest.fixture +def age_missing(): + return pb.MissingSpec( + reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}, + categories={"item_nonresponse": ["refused", "dont_know"], "design": ["not_asked"]}, + ) + + +def _single_step_passed(validation): + info = validation.validation_info[0] + return info.all_passed + + +class TestColPctMissing: + def test_overall_pass(self, survey_tbl, age_missing): + validation = ( + 
pb.Validate(data=survey_tbl) + .col_pct_missing(columns="age", missing=age_missing, max_pct=0.5) + .interrogate() + ) + assert _single_step_passed(validation) is True + + def test_overall_fail(self, survey_tbl, age_missing): + validation = ( + pb.Validate(data=survey_tbl) + .col_pct_missing(columns="age", missing=age_missing, max_pct=0.30) + .interrogate() + ) + # 3/8 = 0.375 > 0.30 -> fail + assert _single_step_passed(validation) is False + + def test_by_reason_refused(self, survey_tbl, age_missing): + # 2/8 = 0.25 refused + passing = ( + pb.Validate(data=survey_tbl) + .col_pct_missing(columns="age", missing=age_missing, reason="refused", max_pct=0.25) + .interrogate() + ) + failing = ( + pb.Validate(data=survey_tbl) + .col_pct_missing(columns="age", missing=age_missing, reason="refused", max_pct=0.20) + .interrogate() + ) + assert _single_step_passed(passing) is True + assert _single_step_passed(failing) is False + + def test_by_reason_zero(self, survey_tbl, age_missing): + # no dont_know values -> 0% always passes + validation = ( + pb.Validate(data=survey_tbl) + .col_pct_missing(columns="age", missing=age_missing, reason="dont_know", max_pct=0.0) + .interrogate() + ) + assert _single_step_passed(validation) is True + + def test_by_category(self, survey_tbl, age_missing): + # item_nonresponse = refused + dont_know = 2/8 = 0.25 + passing = ( + pb.Validate(data=survey_tbl) + .col_pct_missing( + columns="age", missing=age_missing, category="item_nonresponse", max_pct=0.25 + ) + .interrogate() + ) + assert _single_step_passed(passing) is True + + def test_nulls_counted(self, age_missing): + tbl = pl.DataFrame({"age": [34, None, 41, -98, 29, 38, 55, 38]}) + # null_is_missing=True by default: 1 null + 1 refused = 2/8 = 0.25 + validation = ( + pb.Validate(data=tbl) + .col_pct_missing(columns="age", missing=age_missing, max_pct=0.25) + .interrogate() + ) + assert _single_step_passed(validation) is True + + def test_nulls_excluded_when_spec_says_so(self): + spec = 
pb.MissingSpec(reasons={-98: "refused"}, null_is_missing=False) + tbl = pl.DataFrame({"age": [34, None, None, -98, 29, 38, 55, 38]}) + # only -98 counts: 1/8 = 0.125 + validation = ( + pb.Validate(data=tbl) + .col_pct_missing(columns="age", missing=spec, max_pct=0.125) + .interrogate() + ) + assert _single_step_passed(validation) is True + + def test_reason_and_category_mutually_exclusive(self, survey_tbl, age_missing): + with pytest.raises(ValueError, match="Only one of"): + pb.Validate(data=survey_tbl).col_pct_missing( + columns="age", + missing=age_missing, + reason="refused", + category="item_nonresponse", + max_pct=0.5, + ) + + def test_max_pct_bounds(self, survey_tbl, age_missing): + with pytest.raises(ValueError, match="max_pct"): + pb.Validate(data=survey_tbl).col_pct_missing( + columns="age", missing=age_missing, max_pct=1.5 + ) + + def test_missing_must_be_missingspec(self, survey_tbl): + with pytest.raises(TypeError): + pb.Validate(data=survey_tbl).col_pct_missing( + columns="age", missing={-99: "not_asked"}, max_pct=0.5 + ) + + def test_multiple_columns(self, age_missing): + tbl = pl.DataFrame( + {"a": [1, -98, 3, 4], "b": [-99, -99, 3, 4]} + ) + validation = ( + pb.Validate(data=tbl) + .col_pct_missing(columns=["a", "b"], missing=age_missing, max_pct=0.5) + .interrogate() + ) + assert len(validation.validation_info) == 2 + + def test_report_renders(self, survey_tbl, age_missing): + # The validation report should build without error (exercises icon + value rendering) + validation = ( + pb.Validate(data=survey_tbl) + .col_pct_missing(columns="age", missing=age_missing, max_pct=0.5, brief=True) + .interrogate() + ) + gt = validation.get_tabular_report() + assert gt is not None From 6e3a5727a8cee672c0511c21bf77ddf65f16cc6a Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:02:48 -0400 Subject: [PATCH 17/55] Create test_missing.py --- tests/test_missing.py | 124 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 
insertions(+) create mode 100644 tests/test_missing.py diff --git a/tests/test_missing.py b/tests/test_missing.py new file mode 100644 index 000000000..53b62099a --- /dev/null +++ b/tests/test_missing.py @@ -0,0 +1,124 @@ +import pytest + +import pointblank as pb +from pointblank.missing import MissingSpec + + +class TestMissingSpecConstruction: + """Tests for MissingSpec construction and validation.""" + + def test_minimal_spec(self): + spec = MissingSpec(reasons={-99: "not_asked"}) + assert spec.reasons == {-99: "not_asked"} + assert spec.categories is None + assert spec.null_is_missing is True + assert spec.null_reason == "unknown" + assert spec.description is None + + def test_full_spec(self): + spec = MissingSpec( + reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}, + categories={"item_nonresponse": ["refused", "dont_know"], "design": ["not_asked"]}, + null_is_missing=False, + null_reason="system", + description="Standard survey codes", + ) + assert spec.null_is_missing is False + assert spec.null_reason == "system" + assert spec.description == "Standard survey codes" + + def test_exported_from_top_level(self): + assert pb.MissingSpec is MissingSpec + + def test_reasons_must_be_dict(self): + with pytest.raises(TypeError): + MissingSpec(reasons=[-99, -98]) # type: ignore[arg-type] + + def test_empty_reasons_requires_null_is_missing(self): + # OK: empty reasons but null_is_missing=True + MissingSpec(reasons={}, null_is_missing=True) + # Not OK: empty reasons and null_is_missing=False + with pytest.raises(ValueError): + MissingSpec(reasons={}, null_is_missing=False) + + def test_reason_labels_must_be_strings(self): + with pytest.raises(TypeError): + MissingSpec(reasons={-99: 1}) # type: ignore[dict-item] + + def test_category_must_reference_known_reasons(self): + with pytest.raises(ValueError, match="unknown reason"): + MissingSpec( + reasons={-99: "not_asked"}, + categories={"bad": ["nonexistent"]}, + ) + + def 
test_category_can_reference_null_reason(self): + spec = MissingSpec( + reasons={-99: "not_asked"}, + categories={"all_absent": ["not_asked", "unknown"]}, + null_is_missing=True, + ) + assert spec.values_for_category("all_absent") == [-99] + + def test_categories_must_be_dict(self): + with pytest.raises(TypeError): + MissingSpec(reasons={-99: "not_asked"}, categories=["not_asked"]) # type: ignore[arg-type] + + +class TestMissingSpecMethods: + @pytest.fixture + def spec(self): + return MissingSpec( + reasons={-99: "not_asked", -98: "refused", -97: "dont_know", -96: "not_applicable"}, + categories={ + "item_nonresponse": ["refused", "dont_know"], + "design": ["not_asked", "not_applicable"], + }, + ) + + def test_sentinel_values(self, spec): + assert spec.sentinel_values() == [-99, -98, -97, -96] + + def test_reason_for(self, spec): + assert spec.reason_for(-98) == "refused" + assert spec.reason_for(5) is None + + def test_reason_for_null(self, spec): + assert spec.reason_for(None) == "unknown" + spec_no_null = MissingSpec(reasons={-99: "not_asked"}, null_is_missing=False) + assert spec_no_null.reason_for(None) is None + + def test_is_missing(self, spec): + assert spec.is_missing(-99) is True + assert spec.is_missing(42) is False + assert spec.is_missing(None) is True + + def test_is_missing_null_excluded(self): + spec = MissingSpec(reasons={-99: "not_asked"}, null_is_missing=False) + assert spec.is_missing(None) is False + + def test_values_for_reason(self, spec): + assert spec.values_for_reason("refused") == [-98] + assert spec.values_for_reason("nonexistent") == [] + + def test_values_for_category(self, spec): + assert spec.values_for_category("item_nonresponse") == [-98, -97] + assert spec.values_for_category("design") == [-99, -96] + assert spec.values_for_category("nonexistent") == [] + + def test_values_for_category_no_categories(self): + spec = MissingSpec(reasons={-99: "not_asked"}) + assert spec.values_for_category("anything") == [] + + def 
test_reasons_list(self, spec): + assert spec.reasons_list() == [ + "not_asked", + "refused", + "dont_know", + "not_applicable", + "unknown", + ] + + def test_reasons_list_no_null(self): + spec = MissingSpec(reasons={-99: "a", -98: "b"}, null_is_missing=False) + assert spec.reasons_list() == ["a", "b"] From e1b6faa1d339b3580577ee07e74ce34c5e312d06 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:39:17 -0400 Subject: [PATCH 18/55] Apply structured missingness to valdn methods --- pointblank/validate.py | 186 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 185 insertions(+), 1 deletion(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index f9cc73c7d..78bdeeced 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -61,6 +61,7 @@ from pointblank._interrogation import ( NumberOfTestUnits, SpeciallyValidation, + apply_missing_exclusion, col_count_match, col_exists, col_pct_missing, @@ -2677,7 +2678,136 @@ def _generate_display_table( return gt_tbl -def missing_vals_tbl(data: Any) -> GT: +def _prettify_reason_label(reason: str) -> str: + """Turn a snake_case reason label into a Title Case display label (e.g. 'not_asked' -> + 'Not Asked').""" + return reason.replace("_", " ").title() + + +def _build_structured_missing_tbl(data: Any, missing: dict[str, MissingSpec]) -> GT: + """Build a structured-missingness breakdown table (one row per column, columns for the count + and percentage of complete values and of each missing reason).""" + if not isinstance(missing, dict): + raise TypeError( + f"`missing=` must be a dict mapping column names to MissingSpec objects, " + f"got {type(missing).__name__}." + ) + for col_name, spec in missing.items(): + if not isinstance(spec, MissingSpec): + raise TypeError( + f"`missing[{col_name!r}]` must be a MissingSpec, got {type(spec).__name__}." 
+ ) + + nw_frame = nw.from_native(data) + is_lazy = isinstance(nw_frame, nw.LazyFrame) + + available_columns = list(nw_frame.columns) + + # Build the ordered union of reason labels across all specs (first-seen order) + reason_order: list[str] = [] + for spec in missing.values(): + for r in spec.reasons_list(): + if r not in reason_order: + reason_order.append(r) + + records: list[dict[str, Any]] = [] + for column, spec in missing.items(): + if column not in available_columns: + raise ValueError( + f"Column '{column}' given in `missing=` was not found in the table." + ) + + # Build one aggregation per reason that has an expression (sentinels and/or nulls) + select_exprs: dict[str, Any] = {"__total__": nw.len()} + reason_alias: dict[str, str] = {} + for i, r in enumerate(spec.reasons_list()): + sentinels = spec.values_for_reason(r) + expr = None + if sentinels: + expr = nw.col(column).is_in(sentinels) + if r == spec.null_reason and spec.null_is_missing: + null_expr = nw.col(column).is_null() + expr = null_expr if expr is None else (expr | null_expr) + if expr is not None: + alias = f"__r{i}__" + reason_alias[r] = alias + select_exprs[alias] = expr.cast(nw.Int32).sum() + + out = nw_frame.select(**select_exprs) + if is_lazy: + out = out.collect() + + total = int(out["__total__"][0]) + counts: dict[str, int] = {} + for r in spec.reasons_list(): + counts[r] = int(out[reason_alias[r]][0]) if r in reason_alias else 0 + + total_missing = sum(counts.values()) + complete = total - total_missing + + def _fmt(count: int) -> str: + pct = round(100 * count / total) if total > 0 else 0 + return f"{count} ({pct}%)" + + record: dict[str, Any] = { + "columns": column, + "total_n": str(total), + "complete": _fmt(complete), + } + # Fill every reason column in the union (0 for reasons this spec doesn't define) + for r in reason_order: + record[r] = _fmt(counts.get(r, 0)) + records.append(record) + + # Build a DataFrame from the records using the available DataFrame library + df_lib_gt = 
_select_df_lib(preference="polars") + if df_lib_gt.__name__ == "polars": + import polars as pl + + breakdown_df = pl.DataFrame(records) + else: + import pandas as pd + + breakdown_df = pd.DataFrame(records) + + title = "Missing Values by Reason" + subtitle = "Counts and percentages of complete values and each missing reason, per column." + + cols_labels = { + "columns": "Column", + "total_n": "Total N", + "complete": "Complete", + } + for r in reason_order: + cols_labels[r] = _prettify_reason_label(r) + + value_columns = ["total_n", "complete"] + reason_order + + gt_tbl = ( + GT(breakdown_df) + .tab_header(title=html(f"
{title}
"), subtitle=subtitle)
+        .opt_table_font(font=google_font(name="IBM Plex Sans"))
+        .opt_align_table_header(align="left")
+        .cols_label(cases=cols_labels)
+        .cols_align(align="right", columns=value_columns)
+        .cols_align(align="left", columns="columns")
+        .tab_style(
+            style=style.text(font=google_font(name="IBM Plex Mono"), size="12px"),
+            locations=loc.body(columns=value_columns),
+        )
+        .tab_style(
+            style=style.text(weight="bold"),
+            locations=loc.body(columns="columns"),
+        )
+    )
+    # Compare (major, minor) numerically: as plain strings, "0.9.0" >= "0.17.0" would be True
+    if tuple(int(p) for p in version("great_tables").split(".")[:2] if p.isdigit()) >= (0, 17):
+        gt_tbl = gt_tbl.tab_options(quarto_disable_processing=True)
+
+    return gt_tbl
+
+
+def missing_vals_tbl(data: Any, missing: dict[str, MissingSpec] | None = None) -> GT:
     """
     Display a table that shows the missing values in the input table.
 
@@ -2685,12 +2815,23 @@ def missing_vals_tbl(data: Any) -> GT:
     table. The table is displayed using the Great Tables API, which allows for further
     customization of the table's appearance if so desired.
 
+    By default, missingness is treated as binary (a value is either Null or it isn't) and the
+    function renders a sector-based heatmap of the proportion of Null values across the rows of each
+    column. When a `missing=` mapping of columns to [`MissingSpec`](`pointblank.MissingSpec`) objects
+    is supplied, the function instead renders a *structured missingness* breakdown: one row per
+    column with the count and percentage of complete values and of each missing *reason* (e.g.,
+    "Refused", "Not Asked", "Unknown").
+
     Parameters
     ----------
     data
         The table for which to display the missing values. This could be a DataFrame object, an
         Ibis table object, a CSV file path, a Parquet file path, or a database connection string.
         Read the *Supported Input Table Types* section for details on the supported table types.
+    missing
+        An optional dictionary mapping column names to [`MissingSpec`](`pointblank.MissingSpec`)
+        objects.
When provided, the function renders a structured breakdown of missingness by + reason for the specified columns (rather than the default sector heatmap). Returns ------- @@ -2768,6 +2909,12 @@ def missing_vals_tbl(data: Any) -> GT: if "pyspark" not in tbl_type: data = copy.deepcopy(data) + # If a `missing=` spec mapping is provided, render the structured missingness breakdown + # (count and percentage of complete values and each missing reason, per column) instead of + # the default sector heatmap + if missing is not None: + return _build_structured_missing_tbl(data=data, missing=missing) + # Get the number of rows in the table n_rows = get_row_count(data) @@ -3818,6 +3965,7 @@ def from_agg_validator( values: Any | list[Any] | tuple | None = None inclusive: tuple[bool, bool] | None = None na_pass: bool | None = None + missing: Any | None = None pre: Callable | None = None segments: Any | None = None thresholds: Thresholds | None = None @@ -5211,6 +5359,7 @@ def col_vals_gt( columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -5483,6 +5632,7 @@ def col_vals_gt( column=column, values=value, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -5500,6 +5650,7 @@ def col_vals_lt( columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -5779,6 +5930,7 @@ def col_vals_lt( column=column, values=value, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -5796,6 +5948,7 @@ def col_vals_eq( columns: 
str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -6075,6 +6228,7 @@ def col_vals_eq( column=column, values=value, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -6092,6 +6246,7 @@ def col_vals_ne( columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -6369,6 +6524,7 @@ def col_vals_ne( column=column, values=value, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -6386,6 +6542,7 @@ def col_vals_ge( columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -6666,6 +6823,7 @@ def col_vals_ge( column=column, values=value, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -6683,6 +6841,7 @@ def col_vals_le( columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -6963,6 +7122,7 @@ def col_vals_le( column=column, values=value, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -6982,6 +7142,7 @@ def col_vals_between( 
right: float | int | Column, inclusive: tuple[bool, bool] = (True, True), na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -7288,6 +7449,7 @@ def col_vals_between( values=value, inclusive=inclusive, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -7307,6 +7469,7 @@ def col_vals_outside( right: float | int | Column, inclusive: tuple[bool, bool] = (True, True), na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -7613,6 +7776,7 @@ def col_vals_outside( values=value, inclusive=inclusive, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -7629,6 +7793,7 @@ def col_vals_in_set( self, columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, set: Collection[Any], + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -7935,6 +8100,7 @@ class Color(Enum): assertion_type=assertion_type, column=column, values=set, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -7951,6 +8117,7 @@ def col_vals_not_in_set( self, columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, set: Collection[Any], + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -8229,6 +8396,7 @@ class InvalidStatus(Enum): assertion_type=assertion_type, column=column, values=set, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -8247,6 +8415,7 @@ def col_vals_increasing( allow_stationary: bool = 
False, decreasing_tol: float | None = None, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -8425,6 +8594,7 @@ def col_vals_increasing( column=column, values="", na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -8447,6 +8617,7 @@ def col_vals_decreasing( allow_stationary: bool = False, increasing_tol: float | None = None, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -8625,6 +8796,7 @@ def col_vals_decreasing( column=column, values="", na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -9143,6 +9315,7 @@ def col_vals_regex( pattern: str, na_pass: bool = False, inverse: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -9394,6 +9567,7 @@ def col_vals_regex( column=column, values=values, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -9411,6 +9585,7 @@ def col_vals_within_spec( columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, spec: str, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -9684,6 +9859,7 @@ def col_vals_within_spec( column=column, values=values, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -14199,6 +14375,14 @@ def interrogate( tbl=tbl, column=column, values=value, na_pass=na_pass ) + # Apply structured-missingness exclusion: any row whose value is a + # 
declared sentinel (or a null when `null_is_missing=True`) is treated + # as a passing test unit, so only the "real" values are validated + if validation.missing is not None and results_tbl is not None: + results_tbl = apply_missing_exclusion( + results_tbl=results_tbl, column=column, spec=validation.missing + ) + elif assertion_type == "col_pct_null": result_bool = col_pct_null( data_tbl=data_tbl_step, From 4fedb0e1e198c4d1c96752ead0dabcc3ec065079 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:39:45 -0400 Subject: [PATCH 19/55] Add the apply_missing_exclusion() util function --- pointblank/_interrogation.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/pointblank/_interrogation.py b/pointblank/_interrogation.py index 8485ef96e..2e1e8e8af 100644 --- a/pointblank/_interrogation.py +++ b/pointblank/_interrogation.py @@ -2580,6 +2580,36 @@ def interrogate_not_null(tbl: IntoFrame, column: str) -> Any: return result_tbl.to_native() +def apply_missing_exclusion(results_tbl: IntoFrame, column: str, spec: Any) -> Any: + """Mark rows with structured-missing values as passing. + + Given a `results_tbl` that already carries a boolean `pb_is_good_` column, force that column to + `True` for any row whose value in `column` is a declared sentinel of `spec` (a `MissingSpec`), + or a null when `spec.null_is_missing` is `True`. This implements the `missing=` exclusion on + `col_vals_*` validation methods: sentinel/missing values are excluded from the check (they pass) + so that only the "real" values are validated. + """ + sentinels = spec.sentinel_values() + + # Build a null-free boolean mask. Note `is_in()` yields null for null inputs, and OR-ing a null + # into `pb_is_good_` would corrupt a failing row (False | null = null under Kleene logic), so the + # sentinel mask is explicitly filled with `False` for null rows. 
+ mask = None + if sentinels: + mask = nw.col(column).is_in(sentinels).fill_null(False) + if spec.null_is_missing: + null_expr = nw.col(column).is_null() + mask = null_expr if mask is None else (mask | null_expr) + + if mask is None: + return results_tbl + + nw_tbl = nw.from_native(results_tbl) + assert isinstance(nw_tbl, (nw.DataFrame, nw.LazyFrame)) + nw_tbl = nw_tbl.with_columns(pb_is_good_=(nw.col("pb_is_good_") | mask)) + return nw_tbl.to_native() + + def interrogate_missing_coded(tbl: IntoFrame, column: str) -> Any: """Missing-coded interrogation. From 9ccaa5914a78d6b89a0d6bbeb40535b887aa3ea4 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:39:47 -0400 Subject: [PATCH 20/55] Update validate.pyi --- pointblank/validate.pyi | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pointblank/validate.pyi b/pointblank/validate.pyi index 47124b150..359686cf2 100644 --- a/pointblank/validate.pyi +++ b/pointblank/validate.pyi @@ -78,7 +78,7 @@ def preview( min_tbl_width: int = 500, incl_header: bool | None = None, ) -> GT: ... -def missing_vals_tbl(data: Any) -> GT: ... +def missing_vals_tbl(data: Any, missing: dict[str, MissingSpec] | None = None) -> GT: ... def get_column_count(data: Any) -> int: ... def get_row_count(data: Any) -> int: ... 
@dataclass @@ -179,6 +179,7 @@ class Validate: columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -191,6 +192,7 @@ class Validate: columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -203,6 +205,7 @@ class Validate: columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -215,6 +218,7 @@ class Validate: columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -227,6 +231,7 @@ class Validate: columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -239,6 +244,7 @@ class Validate: columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, 
thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -253,6 +259,7 @@ class Validate: right: float | int | Column, inclusive: tuple[bool, bool] = (True, True), na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -267,6 +274,7 @@ class Validate: right: float | int | Column, inclusive: tuple[bool, bool] = (True, True), na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -278,6 +286,7 @@ class Validate: self, columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, set: Collection[Any], + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -289,6 +298,7 @@ class Validate: self, columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, set: Collection[Any], + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -302,6 +312,7 @@ class Validate: allow_stationary: bool = False, decreasing_tol: float | None = None, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -315,6 +326,7 @@ class Validate: allow_stationary: bool = False, increasing_tol: float | None = None, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -348,6 +360,7 @@ class Validate: pattern: str, 
na_pass: bool = False, inverse: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -360,6 +373,7 @@ class Validate: columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, spec: str, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, From b9aaf1f938b3c1c627fb7f1713005d8168c64cbc Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:39:54 -0400 Subject: [PATCH 21/55] Update yaml.py --- pointblank/yaml.py | 96 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 3 deletions(-) diff --git a/pointblank/yaml.py b/pointblank/yaml.py index 4bdd67cd2..455d6744f 100644 --- a/pointblank/yaml.py +++ b/pointblank/yaml.py @@ -8,9 +8,39 @@ from pointblank._agg import is_valid_agg from pointblank._utils import _is_lib_present +from pointblank.missing import MissingSpec from pointblank.thresholds import Actions from pointblank.validate import Validate, load_dataset + +def _missing_spec_from_dict(spec_def: dict) -> MissingSpec: + """Build a `MissingSpec` from a YAML mapping.""" + if not isinstance(spec_def, dict): + raise YAMLValidationError( + f"A missing spec must be a mapping, got {type(spec_def).__name__}." 
+ ) + return MissingSpec( + reasons=spec_def.get("reasons", {}), + categories=spec_def.get("categories"), + null_is_missing=spec_def.get("null_is_missing", True), + null_reason=spec_def.get("null_reason", "unknown"), + description=spec_def.get("description"), + ) + + +def _missing_spec_to_code(spec: MissingSpec) -> str: + """Render a `MissingSpec` as a `pb.MissingSpec(...)` constructor call for code generation.""" + parts = [f"reasons={spec.reasons!r}"] + if spec.categories is not None: + parts.append(f"categories={spec.categories!r}") + if spec.null_is_missing is not True: + parts.append(f"null_is_missing={spec.null_is_missing!r}") + if spec.null_reason != "unknown": + parts.append(f"null_reason={spec.null_reason!r}") + if spec.description is not None: + parts.append(f"description={spec.description!r}") + return f"pb.MissingSpec({', '.join(parts)})" + if TYPE_CHECKING: from typing import Literal @@ -243,6 +273,8 @@ class YAMLValidator: "col_vals_decreasing": "col_vals_decreasing", "col_vals_within_spec": "col_vals_within_spec", "col_pct_null": "col_pct_null", + "col_pct_missing": "col_pct_missing", + "col_missing_coded": "col_missing_coded", "rows_distinct": "rows_distinct", "rows_complete": "rows_complete", "col_count_match": "col_count_match", @@ -332,6 +364,7 @@ def _validate_schema(self, config: dict) -> None: "steps", "tbl_name", "label", + "missing_specs", "thresholds", "actions", "final_actions", @@ -608,10 +641,45 @@ def _parse_schema_spec(self, schema_spec: Any) -> Any: f"Schema specification must be a dictionary, got: {type(schema_spec)}" ) + def _parse_missing_specs(self, config: dict) -> dict[str, MissingSpec]: + """Parse the top-level `missing_specs` block into named `MissingSpec` objects.""" + raw = config.get("missing_specs") + if raw is None: + return {} + if not isinstance(raw, dict): + raise YAMLValidationError("'missing_specs' must be a dictionary of named specs") + return {name: _missing_spec_from_dict(spec_def) for name, spec_def in 
raw.items()} + + def _resolve_missing( + self, value: Any, missing_specs: Optional[dict[str, MissingSpec]] + ) -> MissingSpec: + """Resolve a step's `missing=` value to a `MissingSpec`. + + The value can be a named reference into the top-level `missing_specs` block, an inline + mapping defining a spec, or an already-constructed `MissingSpec`. + """ + if isinstance(value, MissingSpec): + return value + if isinstance(value, str): + if not missing_specs or value not in missing_specs: + available = sorted(missing_specs.keys()) if missing_specs else [] + raise YAMLValidationError( + f"Unknown missing spec '{value}'. Define it under the top-level " + f"'missing_specs' block. Available: {available}" + ) + return missing_specs[value] + if isinstance(value, dict): + return _missing_spec_from_dict(value) + raise YAMLValidationError( + f"Invalid 'missing' value: {value!r}. Use a named reference, an inline mapping, " + "or a MissingSpec." + ) + def _parse_validation_step( self, step_config: Union[str, dict], namespaces: Optional[Union[Iterable[str], Mapping[str, str]]] = None, + missing_specs: Optional[dict[str, MissingSpec]] = None, ) -> tuple[str, dict]: """Parse a single validation step from YAML configuration. 
@@ -676,6 +744,10 @@ def _parse_validation_step( # (e.g., `active: pb.has_columns("col_a")` or `active: false`) elif key == "active" and isinstance(value, str): processed_parameters[key] = _safe_eval_python_code(value, namespaces=namespaces) + elif key == "missing": + # Pass the raw value through (a spec name, inline mapping, or MissingSpec); it is + # resolved to a MissingSpec below, after the loop + processed_parameters[key] = value else: # Normal processing (requires python: block syntax) processed_parameters[key] = _process_python_expressions( @@ -683,6 +755,11 @@ def _parse_validation_step( ) parameters = processed_parameters + # Resolve a `missing=` parameter (used by col_pct_missing, col_missing_coded) into a + # MissingSpec, looking up named references in the top-level `missing_specs` block + if "missing" in parameters: + parameters["missing"] = self._resolve_missing(parameters["missing"], missing_specs) + # Convert `columns=` specification if "columns" in parameters: parameters["columns"] = self._parse_column_spec(parameters["columns"]) @@ -832,10 +909,13 @@ def build_validation( validation = Validate(data, **validate_kwargs) + # Parse any named missing specs declared at the top level + missing_specs = self._parse_missing_specs(config) + # Add validation steps for step_config in config["steps"]: method_name, parameters = self._parse_validation_step( - step_config, namespaces=namespaces + step_config, namespaces=namespaces, missing_specs=missing_specs ) # Get the method from the validation object @@ -1644,6 +1724,9 @@ def extract_python_expressions(obj, path=""): validator = YAMLValidator() config = validator.load_config(yaml) + # Parse any named missing specs so steps referencing them can be rendered + missing_specs = validator._parse_missing_specs(config) + # Start building the Python code code_lines = [] @@ -1780,7 +1863,9 @@ def extract_python_expressions(obj, path=""): # Handle string steps (parameterless methods like "rows_distinct") if 
isinstance(step_config, str): - method_name, parameters = validator._parse_validation_step(step_config, namespaces=None) + method_name, parameters = validator._parse_validation_step( + step_config, namespaces=None, missing_specs=missing_specs + ) code_lines.append(f" .{method_name}()") continue @@ -1802,7 +1887,9 @@ def extract_python_expressions(obj, path=""): elif isinstance(step_params["expr"], str): original_expressions["expr"] = step_params["expr"] - method_name, parameters = validator._parse_validation_step(step_config, namespaces=None) + method_name, parameters = validator._parse_validation_step( + step_config, namespaces=None, missing_specs=missing_specs + ) # Apply the original expressions to override the converted lambda functions if method_name == "conjointly" and "expressions" in original_expressions: @@ -1852,6 +1939,9 @@ def extract_python_expressions(obj, path=""): param_parts.append(f"{key}={columns_str}") else: param_parts.append(f'{key}="{value}"') # pragma: no cover + elif key == "missing" and isinstance(value, MissingSpec): + # Render a resolved MissingSpec as a `pb.MissingSpec(...)` constructor call + param_parts.append(f"missing={_missing_spec_to_code(value)}") elif key == "brief": # Handle `brief=` parameter: can be a boolean or a string if isinstance(value, bool): From b1ab501690f07bedf6e8e8f89c6711e33484de59 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:39:59 -0400 Subject: [PATCH 22/55] Create test_col_vals_missing_param.py --- tests/test_col_vals_missing_param.py | 94 ++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 tests/test_col_vals_missing_param.py diff --git a/tests/test_col_vals_missing_param.py b/tests/test_col_vals_missing_param.py new file mode 100644 index 000000000..ea86b4dde --- /dev/null +++ b/tests/test_col_vals_missing_param.py @@ -0,0 +1,94 @@ +import polars as pl +import pandas as pd +import pytest + +import pointblank as pb + + +@pytest.fixture +def spec(): + 
return pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) + + +@pytest.fixture +def spec_no_null(): + return pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}, null_is_missing=False) + + +def _info(v): + return v.validation_info[0] + + +class TestMissingExclusion: + def test_between_excludes_sentinels_and_nulls(self, spec): + tbl = pl.DataFrame({"age": [34, -98, 41, -99, 29, 200, 55, None]}) + v = ( + pb.Validate(data=tbl) + .col_vals_between(columns="age", left=0, right=120, missing=spec) + .interrogate() + ) + info = _info(v) + assert info.n == 8 + # only 200 is a real out-of-range value + assert info.n_failed == 1 + + def test_gt_excludes(self, spec): + tbl = pl.DataFrame({"age": [34, -98, 41, -99, 29, 55]}) + v = pb.Validate(data=tbl).col_vals_gt(columns="age", value=0, missing=spec).interrogate() + assert _info(v).n_failed == 0 + + def test_null_not_excluded_when_spec_says_so(self, spec_no_null): + # null_is_missing=False -> nulls are NOT excluded; with na_pass default False, null fails gt + tbl = pl.DataFrame({"age": [34, -98, None, 41]}) + v = ( + pb.Validate(data=tbl) + .col_vals_gt(columns="age", value=0, missing=spec_no_null) + .interrogate() + ) + # -98 excluded (passes); null fails (na_pass False); reals pass -> 1 failure + assert _info(v).n_failed == 1 + + def test_in_set_excludes_sentinels(self, spec): + tbl = pl.DataFrame({"grade": [1, 2, -99, 3, -98, 9]}) + v = ( + pb.Validate(data=tbl) + .col_vals_in_set(columns="grade", set=[1, 2, 3], missing=spec) + .interrogate() + ) + # 9 is the only real value not in the set + assert _info(v).n_failed == 1 + + def test_regex_excludes_string_sentinels(self): + spec = pb.MissingSpec(reasons={"N/A": "not_applicable", "REF": "refused"}) + tbl = pl.DataFrame({"code": ["AB12", "N/A", "CD34", "REF", "bad code"]}) + v = ( + pb.Validate(data=tbl) + .col_vals_regex(columns="code", pattern=r"^[A-Z]{2}[0-9]{2}$", missing=spec) + .interrogate() + ) + # "bad code" is the only real non-matching value + assert 
_info(v).n_failed == 1 + + def test_no_missing_param_unchanged(self): + tbl = pl.DataFrame({"age": [34, -98, 41]}) + v = pb.Validate(data=tbl).col_vals_gt(columns="age", value=0).interrogate() + # -98 is a real value < 0 -> fails when missing= not used + assert _info(v).n_failed == 1 + + def test_pandas_backend(self, spec): + tbl = pd.DataFrame({"age": [34, -98, 41, -99, 200]}) + v = ( + pb.Validate(data=tbl) + .col_vals_between(columns="age", left=0, right=120, missing=spec) + .interrogate() + ) + assert _info(v).n_failed == 1 + + def test_report_renders(self, spec): + tbl = pl.DataFrame({"age": [34, -98, 41, 200]}) + v = ( + pb.Validate(data=tbl) + .col_vals_between(columns="age", left=0, right=120, missing=spec) + .interrogate() + ) + assert v.get_tabular_report() is not None From 7ef7d3503ae2a4710d5f0efe780ec867349d48c2 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:40:01 -0400 Subject: [PATCH 23/55] Create test_missing_vals_tbl_structured.py --- tests/test_missing_vals_tbl_structured.py | 86 +++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 tests/test_missing_vals_tbl_structured.py diff --git a/tests/test_missing_vals_tbl_structured.py b/tests/test_missing_vals_tbl_structured.py new file mode 100644 index 000000000..30c2f9907 --- /dev/null +++ b/tests/test_missing_vals_tbl_structured.py @@ -0,0 +1,86 @@ +import polars as pl +import pandas as pd +import pytest +from great_tables import GT + +import pointblank as pb + + +@pytest.fixture +def tbl_pl(): + return pl.DataFrame( + { + "age": [34, -98, 41, -99, 29, -98, 55, None], + "income": [50000, -99, -1, None, 42000, -99, 38000, 61000], + } + ) + + +@pytest.fixture +def specs(): + return { + "age": pb.MissingSpec(reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}), + "income": pb.MissingSpec(reasons={-99: "not_asked", -1: "below_threshold"}), + } + + +class TestStructuredMissingTbl: + def test_returns_gt(self, tbl_pl, specs): + result = 
pb.missing_vals_tbl(tbl_pl, missing=specs) + assert isinstance(result, GT) + + def test_reason_columns_present(self, tbl_pl, specs): + html = pb.missing_vals_tbl(tbl_pl, missing=specs).as_raw_html() + for token in [ + "Not Asked", + "Refused", + "Dont Know", + "Below Threshold", + "Unknown", + "Complete", + "Total N", + ]: + assert token in html + + def test_counts_correct(self, tbl_pl): + # age: total 8 -> refused 2 (25%), not_asked 1 (12%), dont_know 0 (0%), + # unknown/null 1 (12%), complete 4 (50%) + spec = pb.MissingSpec(reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}) + html = pb.missing_vals_tbl(tbl_pl, missing={"age": spec}).as_raw_html() + assert "4 (50%)" in html # complete + assert "2 (25%)" in html # refused + assert "1 (12%)" in html # not_asked / unknown + assert "0 (0%)" in html # dont_know + + def test_null_excluded_when_spec_says_so(self): + # null_is_missing=False -> the null is counted as complete, no Unknown column + tbl = pl.DataFrame({"age": [34, -98, 41, None]}) + spec = pb.MissingSpec(reasons={-98: "refused"}, null_is_missing=False) + html = pb.missing_vals_tbl(tbl, missing={"age": spec}).as_raw_html() + assert "Unknown" not in html + # complete = 3 (null + 2 reals) of 4 = 75% + assert "3 (75%)" in html + + def test_pandas_input(self, specs): + tbl = pd.DataFrame( + { + "age": [34, -98, 41, -99, 29, -98, 55, None], + "income": [50000, -99, -1, None, 42000, -99, 38000, 61000], + } + ) + result = pb.missing_vals_tbl(tbl, missing=specs) + assert isinstance(result, GT) + + def test_default_behavior_unchanged(self, tbl_pl): + # No missing= -> the original sector heatmap path + result = pb.missing_vals_tbl(tbl_pl) + assert isinstance(result, GT) + + def test_missing_must_be_dict_of_specs(self, tbl_pl): + with pytest.raises(TypeError): + pb.missing_vals_tbl(tbl_pl, missing={"age": {-99: "x"}}) + + def test_unknown_column_raises(self, tbl_pl): + spec = pb.MissingSpec(reasons={-99: "not_asked"}) + with pytest.raises(ValueError, 
match="not found"): + pb.missing_vals_tbl(tbl_pl, missing={"nonexistent": spec}) From 40b33006ce67396863232f7af6c5c78a8d1c8970 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:40:04 -0400 Subject: [PATCH 24/55] Update test_validate.py --- tests/test_validate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_validate.py b/tests/test_validate.py index 64e8718d0..6ff03cdfa 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -833,6 +833,7 @@ def test_validation_plan_and_interrogation(request, tbl_fixture) -> None: "values", "inclusive", "na_pass", + "missing", "pre", "segments", "thresholds", @@ -915,6 +916,7 @@ def test_validation_plan_and_interrogation(request, tbl_fixture) -> None: "values", "inclusive", "na_pass", + "missing", "pre", "segments", "thresholds", From 274c2e32f10f2bd7847c770d1b2179771462a5aa Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 15:40:08 -0400 Subject: [PATCH 25/55] Create test_yaml_missing_specs.py --- tests/test_yaml_missing_specs.py | 127 +++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 tests/test_yaml_missing_specs.py diff --git a/tests/test_yaml_missing_specs.py b/tests/test_yaml_missing_specs.py new file mode 100644 index 000000000..9937bd939 --- /dev/null +++ b/tests/test_yaml_missing_specs.py @@ -0,0 +1,127 @@ +import polars as pl +import pytest + +import pointblank as pb +from pointblank.yaml import YAMLValidationError, yaml_interrogate, yaml_to_python + + +def _write_csv(tmp_path, df): + p = tmp_path / "survey.csv" + df.write_csv(p) + return str(p) + + +@pytest.fixture +def survey_csv(tmp_path): + df = pl.DataFrame({"age": [34, -98, 41, -99, 29, -98, 55, 38]}) + return _write_csv(tmp_path, df) + + +def test_named_missing_spec_pct(survey_csv): + yaml_str = f""" +tbl: {survey_csv} +missing_specs: + standard_survey: + reasons: + -99: not_asked + -98: refused + -97: dont_know + categories: + nonresponse: [refused, dont_know] 
+steps: + - col_pct_missing: + columns: age + missing: standard_survey + max_pct: 0.5 + - col_pct_missing: + columns: age + missing: standard_survey + reason: refused + max_pct: 0.30 +""" + result = yaml_interrogate(yaml_str) + assert len(result.validation_info) == 2 + # overall 3/8=0.375 <= 0.5 pass; refused 2/8=0.25 <= 0.30 pass + assert result.validation_info[0].all_passed is True + assert result.validation_info[1].all_passed is True + + +def test_named_missing_spec_coded(tmp_path): + df = pl.DataFrame({"age": [34, -98, 41, None, 29, -99, 55, 38]}) + csv = _write_csv(tmp_path, df) + yaml_str = f""" +tbl: {csv} +missing_specs: + survey: + reasons: + -99: not_asked + -98: refused +steps: + - col_missing_coded: + columns: age + missing: survey +""" + result = yaml_interrogate(yaml_str) + info = result.validation_info[0] + assert info.n_failed == 1 # one raw null + + +def test_inline_missing_spec(survey_csv): + yaml_str = f""" +tbl: {survey_csv} +steps: + - col_pct_missing: + columns: age + missing: + reasons: + -99: not_asked + -98: refused + max_pct: 0.5 +""" + result = yaml_interrogate(yaml_str) + assert result.validation_info[0].all_passed is True + + +def test_unknown_spec_reference_raises(survey_csv): + yaml_str = f""" +tbl: {survey_csv} +steps: + - col_pct_missing: + columns: age + missing: nonexistent + max_pct: 0.5 +""" + with pytest.raises(YAMLValidationError, match="Unknown missing spec"): + yaml_interrogate(yaml_str) + + +def test_missing_specs_must_be_dict(survey_csv): + yaml_str = f""" +tbl: {survey_csv} +missing_specs: + - not_a_mapping +steps: + - rows_distinct +""" + with pytest.raises(YAMLValidationError): + yaml_interrogate(yaml_str) + + +def test_yaml_to_python_renders_missing_spec(survey_csv): + yaml_str = f""" +tbl: {survey_csv} +missing_specs: + survey: + reasons: + -99: not_asked + -98: refused +steps: + - col_pct_missing: + columns: age + missing: survey + max_pct: 0.5 +""" + code = yaml_to_python(yaml_str) + assert "pb.MissingSpec(" in code 
+ assert "col_pct_missing" in code + assert "reasons=" in code From a138223873dbeb2ba9dcb00697a720937b4a8ac1 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 18:33:10 -0400 Subject: [PATCH 26/55] Update _constants_translations.py --- pointblank/_constants_translations.py | 168 ++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) diff --git a/pointblank/_constants_translations.py b/pointblank/_constants_translations.py index 920d494e8..14f57c58a 100644 --- a/pointblank/_constants_translations.py +++ b/pointblank/_constants_translations.py @@ -1217,6 +1217,174 @@ "th": "มีค่าที่หายไปที่ไม่ได้เข้ารหัส (ค่า null ดิบ) อยู่ใน {column_text}", "fa": "مقادیر مفقود کدگذاری‌نشده (مقادیر null خام) در {column_text} وجود داشت.", }, + "col_missing_only_coded_expectation_text": { + "en": "Expect that {column_text} contains only documented missing codes and legitimate values.", + "fr": "On s'attend à ce que {column_text} ne contienne que des codes de valeurs manquantes documentés et des valeurs légitimes.", + "de": "Erwarten Sie, dass {column_text} nur dokumentierte fehlende Codes und legitime Werte enthält.", + "it": "Aspettatevi che {column_text} contenga solo codici mancanti documentati e valori legittimi.", + "es": "Se espera que {column_text} contenga solo códigos de valores faltantes documentados y valores legítimos.", + "pt": "Espera-se que {column_text} contenha apenas códigos de valores ausentes documentados e valores legítimos.", + "ro": "Se așteaptă ca {column_text} să conțină doar coduri de valori lipsă documentate și valori legitime.", + "tr": "{column_text} öğesinin yalnızca belgelenmiş eksik kodları ve geçerli değerleri içermesini bekleyin.", + "zh-Hans": "预期{column_text}仅包含已记录的缺失值代码和合法值。", + "zh-Hant": "{column_text}應僅包含已記錄的缺失值代碼和合法值。", + "ja": "{column_text}に文書化された欠損コードと正当な値のみが含まれていることを期待します。", + "ko": "{column_text}에 문서화된 결측 코드와 정당한 값만 포함되어 있어야 합니다.", + "vi": "Kỳ vọng {column_text} chỉ chứa các mã thiếu đã được ghi nhận và các giá trị 
hợp lệ.", + "ru": "Ожидается, что {column_text} содержит только задокументированные коды отсутствия и допустимые значения.", + "cs": "Očekává se, že {column_text} obsahuje pouze zdokumentované chybějící kódy a legitimní hodnoty.", + "pl": "Oczekuje się, że {column_text} zawiera tylko udokumentowane kody braków i prawidłowe wartości.", + "da": "Forvent, at {column_text} kun indeholder dokumenterede manglende koder og legitime værdier.", + "sv": "Förvänta dig att {column_text} endast innehåller dokumenterade saknade koder och legitima värden.", + "nb": "Forvent at {column_text} bare inneholder dokumenterte manglende koder og legitime verdier.", + "nl": "Verwacht dat {column_text} alleen gedocumenteerde ontbrekende codes en legitieme waarden bevat.", + "fi": "Odota, että {column_text} sisältää vain dokumentoituja puuttuvien arvojen koodeja ja kelvollisia arvoja.", + "is": "Væntir þess að {column_text} innihaldi aðeins skráða vantandi kóða og lögmæt gildi.", + "ar": "توقع أن يحتوي {column_text} على رموز القيم المفقودة الموثقة والقيم المشروعة فقط.", + "hi": "अपेक्षा है कि {column_text} में केवल प्रलेखित अनुपस्थित कोड और वैध मान हों।", + "el": "Αναμένεται η στήλη {column_text} να περιέχει μόνο τεκμηριωμένους κωδικούς ελλιπών τιμών και έγκυρες τιμές.", + "id": "Mengharapkan bahwa {column_text} hanya berisi kode nilai yang hilang yang terdokumentasi dan nilai yang sah.", + "uk": "Очікується, що {column_text} містить лише задокументовані коди відсутності та допустимі значення.", + "bg": "Очаква се {column_text} да съдържа само документирани кодове за липсващи стойности и легитимни стойности.", + "hr": "Očekuje se da {column_text} sadrži samo dokumentirane kodove za nedostajuće vrijednosti i legitimne vrijednosti.", + "et": "Eeldatakse, et {column_text} sisaldab ainult dokumenteeritud puuduvate väärtuste koode ja õiguspäraseid väärtusi.", + "hu": "Elvárás, hogy a {column_text} csak dokumentált hiányzó kódokat és érvényes értékeket tartalmazzon.", + "ga": "Táthar ag súil nach 
mbeadh i {column_text} ach cóid ar iarraidh dhoiciméadaithe agus luachanna dlisteanacha.", + "lv": "Tiek sagaidīts, ka {column_text} satur tikai dokumentētus trūkstošo vērtību kodus un likumīgas vērtības.", + "lt": "Tikimasi, kad {column_text} yra tik dokumentuoti trūkstamų reikšmių kodai ir teisėtos reikšmės.", + "mt": "Mistenni li {column_text} ikun fih biss kodiċijiet ta' valuri nieqsa dokumentati u valuri leġittimi.", + "sk": "Očakáva sa, že {column_text} obsahuje iba zdokumentované chýbajúce kódy a legitímne hodnoty.", + "sl": "Pričakuje se, da {column_text} vsebuje samo dokumentirane kode manjkajočih vrednosti in legitimne vrednosti.", + "he": "צפוי ש{column_text} יכיל רק קודי ערכים חסרים מתועדים וערכים לגיטימיים.", + "th": "คาดหวังว่า {column_text} จะมีเฉพาะรหัสค่าที่หายไปที่มีการบันทึกไว้และค่าที่ถูกต้องเท่านั้น", + "fa": "انتظار می‌رود که {column_text} فقط شامل کدهای مفقود مستندشده و مقادیر معتبر باشد.", + }, + "col_missing_only_coded_failure_text": { + "en": "Undocumented codes were present in {column_text}.", + "fr": "Des codes non documentés étaient présents dans {column_text}.", + "de": "Undokumentierte Codes waren in {column_text} vorhanden.", + "it": "Erano presenti codici non documentati in {column_text}.", + "es": "Había códigos no documentados en {column_text}.", + "pt": "Havia códigos não documentados em {column_text}.", + "ro": "Coduri nedocumentate au fost prezente în {column_text}.", + "tr": "{column_text} içinde belgelenmemiş kodlar mevcuttu.", + "zh-Hans": "{column_text}中存在未记录的代码。", + "zh-Hant": "{column_text}中存在未記錄的代碼。", + "ja": "{column_text}に文書化されていないコードが存在しました。", + "ko": "{column_text}에 문서화되지 않은 코드가 있었습니다.", + "vi": "Có các mã chưa được ghi nhận trong {column_text}.", + "ru": "В {column_text} присутствовали незадокументированные коды.", + "cs": "Ve sloupci {column_text} byly přítomny nezdokumentované kódy.", + "pl": "W {column_text} obecne były nieudokumentowane kody.", + "da": "Udokumenterede koder var til stede i {column_text}.", + 
"sv": "Odokumenterade koder fanns i {column_text}.", + "nb": "Udokumenterte koder var til stede i {column_text}.", + "nl": "Er waren ongedocumenteerde codes aanwezig in {column_text}.", + "fi": "Sarakkeessa {column_text} oli dokumentoimattomia koodeja.", + "is": "Óskráðir kóðar voru til staðar í {column_text}.", + "ar": "كانت هناك رموز غير موثقة في {column_text}.", + "hi": "{column_text} में बिना प्रलेखित कोड मौजूद थे।", + "el": "Υπήρχαν μη τεκμηριωμένοι κωδικοί στη στήλη {column_text}.", + "id": "Terdapat kode yang tidak terdokumentasi dalam {column_text}.", + "uk": "У {column_text} були наявні незадокументовані коди.", + "bg": "В {column_text} присъстваха недокументирани кодове.", + "hr": "U {column_text} bili su prisutni nedokumentirani kodovi.", + "et": "Veerus {column_text} esines dokumenteerimata koode.", + "hu": "A {column_text} oszlopban dokumentálatlan kódok voltak jelen.", + "ga": "Bhí cóid neamhdhoiciméadaithe i láthair i {column_text}.", + "lv": "{column_text} bija nedokumentēti kodi.", + "lt": "{column_text} buvo nedokumentuotų kodų.", + "mt": "Kien hemm kodiċijiet mhux dokumentati f'{column_text}.", + "sk": "V {column_text} sa vyskytli nezdokumentované kódy.", + "sl": "V {column_text} so bili prisotni nedokumentirani kodi.", + "he": "היו קודים לא מתועדים ב{column_text}.", + "th": "พบรหัสที่ไม่มีการบันทึกไว้ใน {column_text}", + "fa": "کدهای مستندنشده در {column_text} وجود داشت.", + }, + "col_missing_consistent_expectation_text": { + "en": "Expect consistent missingness for reason {reason} across columns {columns_text}.", + "fr": "On s'attend à une absence cohérente pour la raison {reason} dans les colonnes {columns_text}.", + "de": "Erwarten Sie eine konsistente Fehlendheit für den Grund {reason} über die Spalten {columns_text} hinweg.", + "it": "Aspettatevi una mancanza coerente per il motivo {reason} tra le colonne {columns_text}.", + "es": "Se espera una ausencia coherente por el motivo {reason} en las columnas {columns_text}.", + "pt": "Espera-se 
uma ausência consistente pelo motivo {reason} nas colunas {columns_text}.", + "ro": "Se așteaptă o lipsă consecventă pentru motivul {reason} în coloanele {columns_text}.", + "tr": "{columns_text} sütunlarında {reason} nedeniyle tutarlı eksiklik bekleyin.", + "zh-Hans": "预期各列 {columns_text} 中因 {reason} 导致的缺失情况一致。", + "zh-Hant": "預期各欄 {columns_text} 中因 {reason} 導致的缺失情況一致。", + "ja": "列 {columns_text} 全体で理由 {reason} による欠損が一貫していることを期待します。", + "ko": "{columns_text} 열 전체에서 사유 {reason}에 대한 일관된 결측을 기대합니다.", + "vi": "Kỳ vọng sự thiếu hụt nhất quán cho lý do {reason} trên các cột {columns_text}.", + "ru": "Ожидается согласованная пропущенность по причине {reason} в столбцах {columns_text}.", + "cs": "Očekává se konzistentní chybějící hodnoty z důvodu {reason} napříč sloupci {columns_text}.", + "pl": "Oczekuje się spójnego braku danych z powodu {reason} w kolumnach {columns_text}.", + "da": "Forvent konsistent manglende data af årsagen {reason} på tværs af kolonnerne {columns_text}.", + "sv": "Förvänta dig konsekvent saknad data av orsaken {reason} över kolumnerna {columns_text}.", + "nb": "Forvent konsistent manglende data av årsaken {reason} på tvers av kolonnene {columns_text}.", + "nl": "Verwacht consistente ontbrekendheid om reden {reason} in de kolommen {columns_text}.", + "fi": "Odota johdonmukaista puuttuvuutta syystä {reason} sarakkeissa {columns_text}.", + "is": "Væntir samkvæmrar vöntunar af ástæðunni {reason} yfir dálkana {columns_text}.", + "ar": "توقع غيابًا متسقًا للسبب {reason} عبر الأعمدة {columns_text}.", + "hi": "अपेक्षा है कि कारण {reason} के लिए स्तंभों {columns_text} में सुसंगत अनुपस्थिति हो।", + "el": "Αναμένεται συνεπής έλλειψη για τον λόγο {reason} στις στήλες {columns_text}.", + "id": "Mengharapkan ketiadaan yang konsisten karena alasan {reason} di seluruh kolom {columns_text}.", + "uk": "Очікується узгоджена відсутність даних з причини {reason} у стовпцях {columns_text}.", + "bg": "Очаква се последователна липса по причина {reason} в колоните 
{columns_text}.", + "hr": "Očekuje se dosljedno nedostajanje za razlog {reason} u stupcima {columns_text}.", + "et": "Eeldatakse järjepidevat puudumist põhjusel {reason} veergudes {columns_text}.", + "hu": "Elvárás, hogy a {reason} okból következetes hiány legyen a(z) {columns_text} oszlopokban.", + "ga": "Táthar ag súil le heaspa chomhsheasmhach ar an gcúis {reason} ar fud na gcolún {columns_text}.", + "lv": "Tiek sagaidīts konsekvents trūkums iemesla {reason} dēļ kolonnās {columns_text}.", + "lt": "Tikimasi nuoseklaus trūkumo dėl priežasties {reason} stulpeliuose {columns_text}.", + "mt": "Mistenni nuqqas konsistenti għar-raġuni {reason} fil-kolonni {columns_text}.", + "sk": "Očakávajú sa konzistentné chýbajúce hodnoty z dôvodu {reason} v stĺpcoch {columns_text}.", + "sl": "Pričakuje se dosledno manjkanje zaradi razloga {reason} v stolpcih {columns_text}.", + "he": "צפויה חוסר עקבי מסיבה {reason} בעמודות {columns_text}.", + "th": "คาดหวังว่าการขาดหายไปด้วยเหตุผล {reason} จะสอดคล้องกันในคอลัมน์ {columns_text}", + "fa": "انتظار می‌رود فقدان سازگار به دلیل {reason} در ستون‌های {columns_text} وجود داشته باشد.", + }, + "col_missing_consistent_failure_text": { + "en": "Inconsistent missingness for reason {reason} was found across columns {columns_text}.", + "fr": "Une absence incohérente pour la raison {reason} a été trouvée dans les colonnes {columns_text}.", + "de": "Inkonsistente Fehlendheit für den Grund {reason} wurde über die Spalten {columns_text} hinweg gefunden.", + "it": "È stata rilevata una mancanza incoerente per il motivo {reason} tra le colonne {columns_text}.", + "es": "Se encontró una ausencia incoherente por el motivo {reason} en las columnas {columns_text}.", + "pt": "Foi encontrada uma ausência inconsistente pelo motivo {reason} nas colunas {columns_text}.", + "ro": "A fost găsită o lipsă inconsecventă pentru motivul {reason} în coloanele {columns_text}.", + "tr": "{columns_text} sütunlarında {reason} nedeniyle tutarsız eksiklik bulundu.", + 
"zh-Hans": "在各列 {columns_text} 中发现因 {reason} 导致的缺失情况不一致。", + "zh-Hant": "在各欄 {columns_text} 中發現因 {reason} 導致的缺失情況不一致。", + "ja": "列 {columns_text} 全体で理由 {reason} による欠損が一貫していないことが見つかりました。", + "ko": "{columns_text} 열 전체에서 사유 {reason}에 대한 일관되지 않은 결측이 발견되었습니다.", + "vi": "Đã tìm thấy sự thiếu hụt không nhất quán cho lý do {reason} trên các cột {columns_text}.", + "ru": "В столбцах {columns_text} обнаружена несогласованная пропущенность по причине {reason}.", + "cs": "Napříč sloupci {columns_text} byly nalezeny nekonzistentní chybějící hodnoty z důvodu {reason}.", + "pl": "W kolumnach {columns_text} znaleziono niespójny brak danych z powodu {reason}.", + "da": "Inkonsistent manglende data af årsagen {reason} blev fundet på tværs af kolonnerne {columns_text}.", + "sv": "Inkonsekvent saknad data av orsaken {reason} hittades över kolumnerna {columns_text}.", + "nb": "Inkonsistent manglende data av årsaken {reason} ble funnet på tvers av kolonnene {columns_text}.", + "nl": "Inconsistente ontbrekendheid om reden {reason} werd aangetroffen in de kolommen {columns_text}.", + "fi": "Sarakkeissa {columns_text} havaittiin epäjohdonmukaista puuttuvuutta syystä {reason}.", + "is": "Ósamkvæm vöntun af ástæðunni {reason} fannst yfir dálkana {columns_text}.", + "ar": "تم العثور على غياب غير متسق للسبب {reason} عبر الأعمدة {columns_text}.", + "hi": "कारण {reason} के लिए स्तंभों {columns_text} में असंगत अनुपस्थिति पाई गई।", + "el": "Βρέθηκε ασυνεπής έλλειψη για τον λόγο {reason} στις στήλες {columns_text}.", + "id": "Ketiadaan yang tidak konsisten karena alasan {reason} ditemukan di seluruh kolom {columns_text}.", + "uk": "У стовпцях {columns_text} виявлено неузгоджену відсутність даних з причини {reason}.", + "bg": "Установена е непоследователна липса по причина {reason} в колоните {columns_text}.", + "hr": "Pronađeno je nedosljedno nedostajanje za razlog {reason} u stupcima {columns_text}.", + "et": "Veergudes {columns_text} leiti ebajärjepidev puudumine põhjusel {reason}.", + "hu": 
"A(z) {columns_text} oszlopokban következetlen hiány található a {reason} okból.", + "ga": "Fuarthas easpa neamhchomhsheasmhach ar an gcúis {reason} ar fud na gcolún {columns_text}.", + "lv": "Kolonnās {columns_text} tika atrasts nekonsekvents trūkums iemesla {reason} dēļ.", + "lt": "Stulpeliuose {columns_text} rastas nenuoseklus trūkumas dėl priežasties {reason}.", + "mt": "Instab nuqqas inkonsistenti għar-raġuni {reason} fil-kolonni {columns_text}.", + "sk": "V stĺpcoch {columns_text} sa našli nekonzistentné chýbajúce hodnoty z dôvodu {reason}.", + "sl": "V stolpcih {columns_text} je bilo najdeno nedosledno manjkanje zaradi razloga {reason}.", + "he": "נמצאה חוסר לא עקבי מסיבה {reason} בעמודות {columns_text}.", + "th": "พบการขาดหายไปด้วยเหตุผล {reason} ที่ไม่สอดคล้องกันในคอลัมน์ {columns_text}", + "fa": "فقدان ناسازگار به دلیل {reason} در ستون‌های {columns_text} یافت شد.", + }, "regex_expectation_text": { "en": "Expect that values in {column_text} should match the regular expression: {values_text}.", "fr": "On s'attend à ce que les valeurs de {column_text} correspondent à l'expression régulière : {values_text}.", From afa7cc5d2d8f754c0b2017f849129f87366a4cc2 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 18:33:16 -0400 Subject: [PATCH 27/55] Update _constants.py --- pointblank/_constants.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pointblank/_constants.py b/pointblank/_constants.py index d898a7495..9b67872a2 100644 --- a/pointblank/_constants.py +++ b/pointblank/_constants.py @@ -22,12 +22,15 @@ "null": ["str", "numeric", "bool", "datetime", "duration"], "not_null": ["str", "numeric", "bool", "datetime", "duration"], "missing_coded": ["str", "numeric", "bool", "datetime", "duration"], + "missing_only_coded": ["str", "numeric", "bool", "datetime", "duration"], } ASSERTION_TYPE_METHOD_MAP: dict[str, str] = { "col_pct_null": "pct_null", "col_pct_missing": "pct_missing", "col_missing_coded": "missing_coded", + 
"col_missing_only_coded": "missing_only_coded", + "col_missing_consistent": "missing_consistent", "col_vals_gt": "gt", "col_vals_lt": "lt", "col_vals_eq": "eq", @@ -95,6 +98,7 @@ "col_vals_null", "col_vals_not_null", "col_missing_coded", + "col_missing_only_coded", "col_vals_expr", "conjointly", "prompt", @@ -678,6 +682,26 @@ +""", + "col_missing_only_coded": """ + + col_missing_only_coded + + + + + + +""", + "col_missing_consistent": """ + + col_missing_consistent + + + + + + """, "col_vals_regex": """ From 61718391967fef0782080df4a0de6e5badb785d2 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 18:33:22 -0400 Subject: [PATCH 28/55] Update _interrogation.py --- pointblank/_interrogation.py | 76 ++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/pointblank/_interrogation.py b/pointblank/_interrogation.py index 2e1e8e8af..218a705e1 100644 --- a/pointblank/_interrogation.py +++ b/pointblank/_interrogation.py @@ -2610,6 +2610,82 @@ def apply_missing_exclusion(results_tbl: IntoFrame, column: str, spec: Any) -> A return nw_tbl.to_native() +def interrogate_missing_only_coded( + tbl: IntoFrame, + column: str, + sentinels: list, + count_null: bool, + allowed: list | None, + min_val: Any, + max_val: Any, +) -> Any: + """Missing-only-coded interrogation. + + A row passes when its value is either a declared sentinel (a documented missing code), a null + (when `count_null=True`), or a legitimate "real" value — one in `allowed` or within the + `[min_val, max_val]` range. Any other value is treated as an *undocumented* code and fails. 
+ """ + nw_tbl = nw.from_native(tbl) + + good = None + + def _or(expr): + nonlocal good + good = expr if good is None else (good | expr) + + if sentinels: + _or(nw.col(column).is_in(sentinels).fill_null(False)) + if count_null: + _or(nw.col(column).is_null()) + if allowed: + _or(nw.col(column).is_in(allowed).fill_null(False)) + if min_val is not None or max_val is not None: + range_expr = nw.lit(True) + if min_val is not None: + range_expr = range_expr & (nw.col(column) >= min_val) + if max_val is not None: + range_expr = range_expr & (nw.col(column) <= max_val) + _or(range_expr.fill_null(False)) + + if good is None: + good = nw.lit(False) + + result_tbl = nw_tbl.with_columns(pb_is_good_=good) + return result_tbl.to_native() + + +def interrogate_missing_consistent( + tbl: IntoFrame, columns: list[str], sentinels: list, count_null: bool +) -> Any: + """Cross-column missing-consistency interrogation. + + Given a set of related `columns`, a row passes when the "missing for a given reason" status is + consistent across all of them: either *none* of the columns carry the reason, or *all* of them + do. A row fails when some-but-not-all of the columns are missing for that reason. Missingness + for the reason is encoded by the `sentinels` values (and, when `count_null=True`, actual nulls). 
+ """ + nw_tbl = nw.from_native(tbl) + n_cols = len(columns) + + count_expr = None + for c in columns: + if sentinels: + col_expr = nw.col(c).is_in(sentinels).fill_null(False) + else: + col_expr = nw.lit(False) # noqa + if count_null: + col_expr = col_expr | nw.col(c).is_null() + col_count = col_expr.cast(nw.Int32) + count_expr = col_count if count_expr is None else (count_expr + col_count) + + result_tbl = nw_tbl.with_columns(_n_reason_=count_expr) + result_tbl = result_tbl.with_columns( + pb_is_good_=((nw.col("_n_reason_") == 0) | (nw.col("_n_reason_") == n_cols)) + ) + result_tbl = result_tbl.drop("_n_reason_") + return result_tbl.to_native() + + def interrogate_missing_coded(tbl: IntoFrame, column: str) -> Any: """Missing-coded interrogation. From 8c36fc661ea3d2304191ee7ec917028ef41e1873 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 18:33:27 -0400 Subject: [PATCH 29/55] Update validate.py --- pointblank/validate.py | 585 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 550 insertions(+), 35 deletions(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index 78bdeeced..d3a0b0120 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -78,6 +78,8 @@ interrogate_lt, interrogate_ne, interrogate_missing_coded, + interrogate_missing_consistent, + interrogate_missing_only_coded, interrogate_not_null, interrogate_notin, interrogate_null, @@ -2684,9 +2686,15 @@ def _prettify_reason_label(reason: str) -> str: return reason.replace("_", " ").title() -def _build_structured_missing_tbl(data: Any, missing: dict[str, MissingSpec]) -> GT: +def _build_structured_missing_tbl( + data: Any, missing: dict[str, MissingSpec], as_heatmap: bool = False +) -> GT: """Build a structured-missingness breakdown table (one row per column, columns for the count - and percentage of complete values and of each missing reason).""" + and percentage of complete values and of each missing reason). 
+ + When `as_heatmap=True`, render the reason proportions as a color-coded heatmap (cells shaded + from light to dark by the proportion missing for each reason) instead of count/percent text. + """ if not isinstance(missing, dict): raise TypeError( f"`missing=` must be a dict mapping column names to MissingSpec objects, " @@ -2745,18 +2753,32 @@ def _build_structured_missing_tbl(data: Any, missing: dict[str, MissingSpec]) -> total_missing = sum(counts.values()) complete = total - total_missing - def _fmt(count: int) -> str: - pct = round(100 * count / total) if total > 0 else 0 - return f"{count} ({pct}%)" + def _prop(count: int) -> float: + return (count / total) if total > 0 else 0.0 - record: dict[str, Any] = { - "columns": column, - "total_n": str(total), - "complete": _fmt(complete), - } - # Fill every reason column in the union (0 for reasons this spec doesn't define) - for r in reason_order: - record[r] = _fmt(counts.get(r, 0)) + if as_heatmap: + # Numeric proportions (0..1) so cells can be color-shaded by missingness + record: dict[str, Any] = { + "columns": column, + "total_n": str(total), + "complete": _prop(complete), + } + for r in reason_order: + record[r] = _prop(counts.get(r, 0)) + else: + + def _fmt(count: int) -> str: + pct = round(100 * count / total) if total > 0 else 0 + return f"{count} ({pct}%)" + + record = { + "columns": column, + "total_n": str(total), + "complete": _fmt(complete), + } + # Fill every reason column in the union (0 for reasons this spec doesn't define) + for r in reason_order: + record[r] = _fmt(counts.get(r, 0)) records.append(record) # Build a DataFrame from the records using the available DataFrame library @@ -2770,9 +2792,6 @@ def _fmt(count: int) -> str: breakdown_df = pd.DataFrame(records) - title = "Missing Values by Reason" - subtitle = "Counts and percentages of complete values and each missing reason, per column." 
- cols_labels = { "columns": "Column", "total_n": "Total N", @@ -2783,23 +2802,55 @@ def _fmt(count: int) -> str: value_columns = ["total_n", "complete"] + reason_order - gt_tbl = ( - GT(breakdown_df) - .tab_header(title=html(f"
{title}
"), subtitle=subtitle) - .opt_table_font(font=google_font(name="IBM Plex Sans")) - .opt_align_table_header(align="left") - .cols_label(cases=cols_labels) - .cols_align(align="right", columns=value_columns) - .cols_align(align="left", columns="columns") - .tab_style( - style=style.text(font=google_font(name="IBM Plex Mono"), size="12px"), - locations=loc.body(columns=value_columns), + if as_heatmap: + title = "Missing Pattern Heatmap" + subtitle = "Proportion of each missing reason per column (darker = more missing)." + prop_columns = ["complete"] + reason_order + + gt_tbl = ( + GT(breakdown_df) + .tab_header( + title=html(f"
{title}
"), subtitle=subtitle + ) + .opt_table_font(font=google_font(name="IBM Plex Sans")) + .opt_align_table_header(align="left") + .cols_label(cases=cols_labels) + .cols_align(align="center", columns=value_columns) + .cols_align(align="left", columns="columns") + .fmt_percent(columns=prop_columns, decimals=0) + .data_color( + columns=reason_order, + palette=["#F5F5F5", "#000000"], + domain=[0, 1], + ) + .tab_style( + style=style.text(weight="bold"), + locations=loc.body(columns="columns"), + ) ) - .tab_style( - style=style.text(weight="bold"), - locations=loc.body(columns="columns"), + else: + title = "Missing Values by Reason" + subtitle = "Counts and percentages of complete values and each missing reason, per column." + + gt_tbl = ( + GT(breakdown_df) + .tab_header( + title=html(f"
{title}
"), subtitle=subtitle + ) + .opt_table_font(font=google_font(name="IBM Plex Sans")) + .opt_align_table_header(align="left") + .cols_label(cases=cols_labels) + .cols_align(align="right", columns=value_columns) + .cols_align(align="left", columns="columns") + .tab_style( + style=style.text(font=google_font(name="IBM Plex Mono"), size="12px"), + locations=loc.body(columns=value_columns), + ) + .tab_style( + style=style.text(weight="bold"), + locations=loc.body(columns="columns"), + ) ) - ) if version("great_tables") >= "0.17.0": gt_tbl = gt_tbl.tab_options(quarto_disable_processing=True) @@ -2807,7 +2858,9 @@ def _fmt(count: int) -> str: return gt_tbl -def missing_vals_tbl(data: Any, missing: dict[str, MissingSpec] | None = None) -> GT: +def missing_vals_tbl( + data: Any, missing: dict[str, MissingSpec] | None = None, as_heatmap: bool = False +) -> GT: """ Display a table that shows the missing values in the input table. @@ -2832,6 +2885,10 @@ def missing_vals_tbl(data: Any, missing: dict[str, MissingSpec] | None = None) - An optional dictionary mapping column names to [`MissingSpec`](`pointblank.MissingSpec`) objects. When provided, the function renders a structured breakdown of missingness by reason for the specified columns (rather than the default sector heatmap). + as_heatmap + Only applies when `missing=` is provided. When `True`, render the per-reason proportions as + a color-coded heatmap (cells shaded from light to dark by the proportion missing) instead of + the count/percentage text breakdown. Default is `False`. 
Returns ------- @@ -2913,7 +2970,7 @@ def missing_vals_tbl(data: Any, missing: dict[str, MissingSpec] | None = None) - # (count and percentage of complete values and each missing reason, per column) instead of # the default sector heatmap if missing is not None: - return _build_structured_missing_tbl(data=data, missing=missing) + return _build_structured_missing_tbl(data=data, missing=missing, as_heatmap=as_heatmap) # Get the number of rows in the table n_rows = get_row_count(data) @@ -10894,6 +10951,168 @@ def col_missing_coded( return self + def col_missing_only_coded( + self, + columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, + missing: MissingSpec, + allowed: Collection[Any] | None = None, + min_val: float | int | None = None, + max_val: float | int | None = None, + pre: Callable | None = None, + segments: SegmentSpec | None = None, + thresholds: int | float | bool | tuple | dict | Thresholds | None = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: + """ + Validate that a column contains only documented codes and legitimate values. + + The `col_missing_only_coded()` method checks that every value in a column is *accounted + for*: it is either a declared missing-value code (a sentinel in the + [`MissingSpec`](`pointblank.MissingSpec`), or a null when `null_is_missing=True`), or a + legitimate "real" value. Legitimate real values are defined by `allowed=` (an explicit set) + and/or a `[min_val, max_val]` range. Any value that is neither a documented code nor a + legitimate real value is flagged — this catches *undocumented* sentinel codes (e.g., a + stray `-95`) that aren't part of the spec. + + At least one of `allowed=`, `min_val=`, or `max_val=` must be provided so that legitimate + real values can be distinguished from undocumented codes. This validation operates over the + number of test units equal to the number of rows in the table. 
+ + Parameters + ---------- + columns + A single column or a list of columns to validate. Can also use + [`col()`](`pointblank.col`) with column selectors to specify one or more columns. + missing + A [`MissingSpec`](`pointblank.MissingSpec`) declaring the documented sentinel codes. + allowed + An explicit set of legitimate real values. A value in this set passes. Can be combined + with `min_val=`/`max_val=` (a value passes if it satisfies either constraint). + min_val + Lower bound (inclusive) of the legitimate real-value range. + max_val + Upper bound (inclusive) of the legitimate real-value range. + pre + An optional preprocessing function or lambda to apply to the data table during + interrogation. This function should take a table as input and return a modified table. + segments + An optional directive on segmentation, which serves to split a validation step into + multiple (one step per segment). + thresholds + Set threshold failure levels for reporting and reacting to exceedences of the levels. + The thresholds are set at the step level and will override any global thresholds set in + `Validate(thresholds=...)`. + actions + Optional actions to take when the validation step meets or exceeds any set threshold + levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to + define the actions. + brief + An optional brief description of the validation step that will be displayed in the + reporting table. You can use the templating elements like `"{step}"` to insert + the step number, or `"{auto}"` to include an automatically generated brief. If `True` + the entire brief will be automatically generated. If `None` (the default) then there + won't be a brief. + active + A boolean value or callable that determines whether the validation step should be + active. Using `False` will make the validation step inactive (still reporting its + presence and keeping indexes for the steps unchanged). 
+ + Returns + ------- + Validate + The `Validate` object with the added validation step. + + Examples + -------- + ```{python} + #| echo: false + #| output: false + import pointblank as pb + pb.config(report_incl_header=False, report_incl_footer_timings=False, preview_incl_header=False) + ``` + The `age` column should contain real ages in `[0, 120]` or the documented codes `-99`/`-98`. + The value `-95` is an *undocumented* code and should be flagged: + + ```{python} + import pointblank as pb + import polars as pl + + tbl = pl.DataFrame({"age": [34, -98, 41, -95, 29, -99, 55]}) + + age_missing = pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) + + validation = ( + pb.Validate(data=tbl) + .col_missing_only_coded(columns="age", missing=age_missing, min_val=0, max_val=120) + .interrogate() + ) + + validation + ``` + + The validation reports one failing test unit: the row where `age` is `-95`, which is + neither a real age in range nor a declared sentinel. + """ + assertion_type = _get_fn_name() + + _check_column(column=columns) + _check_pre(pre=pre) + _check_thresholds(thresholds=thresholds) + _check_active_input(param=active, param_name="active") + + if not isinstance(missing, MissingSpec): + raise TypeError( + f"`missing=` must be a MissingSpec, got {type(missing).__name__}." + ) + + if allowed is None and min_val is None and max_val is None: + raise ValueError( + "`col_missing_only_coded()` requires at least one of `allowed=`, `min_val=`, or " + "`max_val=` so that legitimate real values can be distinguished from undocumented " + "codes." 
+ ) + + sentinels = missing.sentinel_values() + count_null = missing.null_is_missing + allowed_list = list(allowed) if allowed is not None else None + + # Determine threshold to use (global or local) and normalize a local `thresholds=` value + thresholds = ( + self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + ) + + columns = _resolve_columns(columns) + + # Determine brief to use (global or local) and transform any shorthands of `brief=` + brief = self.brief if brief is None else _transform_auto_brief(brief=brief) + + # Iterate over the columns and create a validation step for each + for column in columns: + val_info = _ValidationInfo( + assertion_type=assertion_type, + column=column, + values={ + "sentinels": sentinels, + "count_null": count_null, + "allowed": allowed_list, + "min_val": min_val, + "max_val": max_val, + "spec": missing, + }, + pre=pre, + segments=segments, + thresholds=thresholds, + actions=actions, + brief=brief, + active=active, + ) + + self._add_validation(validation_info=val_info) + + return self + def rows_distinct( self, columns_subset: str | list[str] | None = None, @@ -11386,6 +11605,167 @@ def rows_complete( return self + def col_missing_consistent( + self, + columns: list[str], + missing: MissingSpec, + when_reason: str, + pre: Callable | None = None, + segments: SegmentSpec | None = None, + thresholds: int | float | bool | tuple | dict | Thresholds | None = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: + """ + Validate that related columns share a consistent missingness pattern for a given reason. + + The `col_missing_consistent()` method checks that, across a set of related columns, the + "missing for a specific reason" status is *consistent*: for each row, either *none* of the + columns are missing for `when_reason=`, or *all* of them are. 
This is useful for structured + survey or clinical data where a skip pattern should propagate across related fields — for + example, if a question wasn't asked (`"not_asked"`) then all of its dependent fields should + also be coded `"not_asked"`. + + A value is considered "missing for the reason" when it is one of the sentinel values mapped + to `when_reason=` in the [`MissingSpec`](`pointblank.MissingSpec`) (and, when the reason is + the spec's `null_reason` and `null_is_missing=True`, an actual null). This validation + operates over the number of test units equal to the number of rows in the table. A row fails + when some — but not all — of the columns are missing for the given reason. + + Parameters + ---------- + columns + A list of related columns to check for consistent missingness. + missing + A [`MissingSpec`](`pointblank.MissingSpec`) describing the sentinel values and their + reasons for the columns. + when_reason + The reason label whose presence should be consistent across `columns=`. If one column + in a row is missing for this reason, all of them should be. + pre + An optional preprocessing function or lambda to apply to the data table during + interrogation. This function should take a table as input and return a modified table. + segments + An optional directive on segmentation, which serves to split a validation step into + multiple (one step per segment). + thresholds + Set threshold failure levels for reporting and reacting to exceedences of the levels. + The thresholds are set at the step level and will override any global thresholds set in + `Validate(thresholds=...)`. + actions + Optional actions to take when the validation step meets or exceeds any set threshold + levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to + define the actions. + brief + An optional brief description of the validation step that will be displayed in the + reporting table. 
You can use the templating elements like `"{step}"` to insert + the step number, or `"{auto}"` to include an automatically generated brief. If `True` + the entire brief will be automatically generated. If `None` (the default) then there + won't be a brief. + active + A boolean value or callable that determines whether the validation step should be + active. Using `False` will make the validation step inactive (still reporting its + presence and keeping indexes for the steps unchanged). + + Returns + ------- + Validate + The `Validate` object with the added validation step. + + Examples + -------- + ```{python} + #| echo: false + #| output: false + import pointblank as pb + pb.config(report_incl_header=False, report_incl_footer_timings=False, preview_incl_header=False) + ``` + Here, `income_source` and `income_amount` should both be coded `"not_asked"` (`-99`) together + when the income question wasn't asked. The last row is inconsistent — only one field is + coded `-99`: + + ```{python} + import pointblank as pb + import polars as pl + + tbl = pl.DataFrame( + { + "income_source": [1, -99, 2, -99], + "income_amount": [50000, -99, 42000, 38000], + } + ) + + income_missing = pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) + + validation = ( + pb.Validate(data=tbl) + .col_missing_consistent( + columns=["income_source", "income_amount"], + missing=income_missing, + when_reason="not_asked", + ) + .interrogate() + ) + + validation + ``` + + The validation reports one failing test unit: the final row, where `income_source` is coded + `-99` (`"not_asked"`) but `income_amount` is a real value. + """ + assertion_type = _get_fn_name() + + _check_pre(pre=pre) + _check_thresholds(thresholds=thresholds) + _check_active_input(param=active, param_name="active") + + if not isinstance(missing, MissingSpec): + raise TypeError( + f"`missing=` must be a MissingSpec, got {type(missing).__name__}." 
+ ) + + if isinstance(columns, str): + columns = [columns] + columns = list(columns) + if len(columns) < 2: + raise ValueError( + "`col_missing_consistent()` requires at least two columns to compare." + ) + + # Resolve which sentinel values (and whether nulls) represent `when_reason` + sentinels = missing.values_for_reason(when_reason) + count_null = missing.null_is_missing and missing.null_reason == when_reason + + # Determine threshold to use (global or local) and normalize a local `thresholds=` value + thresholds = ( + self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + ) + + # Determine brief to use (global or local) and transform any shorthands of `brief=` + brief = self.brief if brief is None else _transform_auto_brief(brief=brief) + + val_info = _ValidationInfo( + assertion_type=assertion_type, + column=columns, + values={ + "sentinels": sentinels, + "count_null": count_null, + "when_reason": when_reason, + "spec": missing, + }, + pre=pre, + segments=segments, + thresholds=thresholds, + actions=actions, + brief=brief, + active=active, + ) + + self._add_validation(validation_info=val_info) + + return self + def prompt( self, prompt: str, @@ -14260,6 +14640,7 @@ def interrogate( "col_vals_null", "col_vals_not_null", "col_missing_coded", + "col_missing_only_coded", "col_vals_increasing", "col_vals_decreasing", "col_vals_between", @@ -14304,6 +14685,16 @@ def interrogate( results_tbl = interrogate_not_null(tbl=tbl, column=column) elif assertion_method == "missing_coded": results_tbl = interrogate_missing_coded(tbl=tbl, column=column) + elif assertion_method == "missing_only_coded": + results_tbl = interrogate_missing_only_coded( + tbl=tbl, + column=column, + sentinels=value["sentinels"], + count_null=value["count_null"], + allowed=value["allowed"], + min_val=value["min_val"], + max_val=value["max_val"], + ) elif assertion_type == "col_vals_increasing": from pointblank._interrogation import interrogate_increasing @@ -14427,6 
+14818,14 @@ def interrogate( elif assertion_type == "rows_complete": results_tbl = rows_complete(data_tbl=data_tbl_step, columns_subset=column) + elif assertion_type == "col_missing_consistent": + results_tbl = interrogate_missing_consistent( + tbl=data_tbl_step, + columns=column, + sentinels=value["sentinels"], + count_null=value["count_null"], + ) + elif assertion_type == "prompt": from pointblank._interrogation import interrogate_prompt @@ -15048,7 +15447,8 @@ def interrogate( if ( collect_extracts and assertion_type - in ROW_BASED_VALIDATION_TYPES + ["rows_distinct", "rows_complete"] + in ROW_BASED_VALIDATION_TYPES + + ["rows_distinct", "rows_complete", "col_missing_consistent"] and tbl_type not in IBIS_BACKENDS ): # Add row numbers to the results table @@ -17480,6 +17880,20 @@ def get_tabular_report( ]: values_upd.append("—") + elif assertion_type[i] in ["col_missing_consistent"]: + # Show the reason being checked for cross-column consistency + values_upd.append(f"when_reason = {value.get('when_reason')}") + + elif assertion_type[i] in ["col_missing_only_coded"]: + # Show the allowed real values and/or range used to define legitimate values + parts = [] + if value.get("allowed") is not None: + allowed_str = str(value["allowed"])[1:-1].replace("'", "") + parts.append(f"allowed = {allowed_str}") + if value.get("min_val") is not None or value.get("max_val") is not None: + parts.append(f"[{value.get('min_val')}, {value.get('max_val')}]") + values_upd.append("
".join(parts) if parts else "—") + elif assertion_type[i] in ["col_pct_null"]: # Extract p and tol from the values dict for nice formatting p_value = value["p"] @@ -17633,6 +18047,20 @@ def get_tabular_report( else: # pragma: no cover values_upd.append(str(value)) # pragma: no cover + # Annotate `col_vals_*` steps that carry a `missing=` MissingSpec so the report shows that + # structured-missing values (sentinels and, optionally, nulls) were excluded from the check. + # The `missing` spec is fetched directly from the validation steps (it isn't a report field). + missing_specs = [getattr(v, "missing", None) for v in self.validation_info] + for i, spec in enumerate(missing_specs): + if spec is None or i >= len(values_upd): + continue + reasons = ", ".join(spec.reasons_list()) if hasattr(spec, "reasons_list") else "" + annotation = ( + "
" + f"missing-aware: {reasons}" + ) + values_upd[i] = f"{values_upd[i]}{annotation}" + # Remove the `inclusive` entry from the dictionary validation_info_dict.pop("inclusive") @@ -18357,7 +18785,7 @@ def get_step_report( # if get_row_count(extract) == 0: # return "No rows were extracted." - if assertion_type in ROW_BASED_VALIDATION_TYPES + ["rows_complete"]: + if assertion_type in ROW_BASED_VALIDATION_TYPES + ["rows_complete", "col_missing_consistent"]: # Get the extracted data for the step extract = self.get_data_extracts(i=i, frame=True) @@ -18439,6 +18867,20 @@ def get_step_report( else: step_report = None # pragma: no cover + # If the step is associated with a MissingSpec, append a legend of the missing-value codes + # and their reasons so that sentinel values appearing in the failing rows can be interpreted + step_spec = getattr(self.validation_info[i - 1], "missing", None) + if step_spec is None and isinstance(values, MissingSpec): + # col_missing_coded stores the spec directly in `values` + step_spec = values + if step_spec is None and isinstance(values, dict) and isinstance(values.get("spec"), MissingSpec): + # col_missing_only_coded and col_missing_consistent stash the spec under `values["spec"]` + step_spec = values["spec"] + if step_spec is not None and step_report is not None: + legend_html = _missing_legend_html(step_spec) + if legend_html and hasattr(step_report, "tab_source_note"): + step_report = step_report.tab_source_note(source_note=html(legend_html)) + return step_report def get_dataframe_report( @@ -19828,6 +20270,21 @@ def _create_autobrief_or_failure_text( for_failure=for_failure, ) + if assertion_type == "col_missing_only_coded": + return _create_text_col_missing_only_coded( + lang=lang, + column=column, + for_failure=for_failure, + ) + + if assertion_type == "col_missing_consistent": + return _create_text_col_missing_consistent( + lang=lang, + columns=column, + value=values, + for_failure=for_failure, + ) + if assertion_type == 
"conjointly": return _create_text_conjointly(lang=lang, for_failure=for_failure) @@ -20271,6 +20728,38 @@ def _create_text_col_missing_coded(lang: str, column: str | None, for_failure: b ) +def _create_text_col_missing_only_coded( + lang: str, column: str | None, for_failure: bool = False +) -> str: + """Create autobrief/failure text for col_missing_only_coded validation.""" + type_ = _expect_failure_type(for_failure=for_failure) + + column_text = _prep_column_text(column=column) + + return EXPECT_FAIL_TEXT[f"col_missing_only_coded_{type_}_text"][lang].format( + column_text=column_text, + ) + + +def _create_text_col_missing_consistent( + lang: str, columns: Any, value: dict, for_failure: bool = False +) -> str: + """Create autobrief/failure text for col_missing_consistent validation.""" + type_ = _expect_failure_type(for_failure=for_failure) + + if isinstance(columns, (list, tuple)): + columns_text = _prep_values_text(values=list(columns), lang=lang, limit=5) + else: + columns_text = _prep_column_text(column=columns) + + reason = value.get("when_reason") if isinstance(value, dict) else None + + return EXPECT_FAIL_TEXT[f"col_missing_consistent_{type_}_text"][lang].format( + columns_text=columns_text, + reason=reason, + ) + + def _create_text_conjointly(lang: str, for_failure: bool = False) -> str: type_ = _expect_failure_type(for_failure=for_failure) @@ -20618,6 +21107,21 @@ def _apply_segments(data_tbl: Any, segments_expr: tuple[str, str]) -> Any: return data_tbl +def _missing_legend_html(spec: Any) -> str: + """Build an HTML legend of a MissingSpec's sentinel codes and their reasons, for step reports.""" + if not hasattr(spec, "reasons"): + return "" + items = [f"{value} → {reason}" for value, reason in spec.reasons.items()] + if getattr(spec, "null_is_missing", False): + items.append(f"null → {spec.null_reason}") + if not items: + return "" + return ( + "
" + "Missing codes: " + "; ".join(items) + "
" + ) + + def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict: """ Convert a `_ValidationInfo` object to a dictionary. @@ -22344,6 +22848,17 @@ def _step_report_row_based( text = STEP_REPORT_TEXT["rows_complete_all"][lang] else: text = STEP_REPORT_TEXT["rows_complete_subset"][lang] + elif assertion_type == "col_missing_coded": + text = f"{column} is missing-coded" + elif assertion_type == "col_missing_only_coded": + text = f"{column} only documented codes" + elif assertion_type == "col_missing_consistent": + cols = ", ".join(column) if isinstance(column, (list, tuple)) else str(column) + reason = values.get("when_reason") if isinstance(values, dict) else None + text = f"consistent “{reason}” across {{{cols}}}" + else: + # Fallback for any other assertion type: show the assertion type name + text = str(assertion_type) # Wrap assertion text in a tag text = ( From 0b2c73a1452a90a9e1c1a0e1c724d5aa113de44f Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 18:33:32 -0400 Subject: [PATCH 30/55] Update validate.pyi --- pointblank/validate.pyi | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/pointblank/validate.pyi b/pointblank/validate.pyi index 359686cf2..25e1714b9 100644 --- a/pointblank/validate.pyi +++ b/pointblank/validate.pyi @@ -78,7 +78,9 @@ def preview( min_tbl_width: int = 500, incl_header: bool | None = None, ) -> GT: ... -def missing_vals_tbl(data: Any, missing: dict[str, MissingSpec] | None = None) -> GT: ... +def missing_vals_tbl( + data: Any, missing: dict[str, MissingSpec] | None = None, as_heatmap: bool = False +) -> GT: ... def get_column_count(data: Any) -> int: ... def get_row_count(data: Any) -> int: ... @dataclass @@ -432,6 +434,20 @@ class Validate: brief: str | bool | None = None, active: bool | Callable = True, ) -> Validate: ... 
+ def col_missing_only_coded( + self, + columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, + missing: MissingSpec, + allowed: Collection[Any] | None = None, + min_val: float | int | None = None, + max_val: float | int | None = None, + pre: Callable | None = None, + segments: SegmentSpec | None = None, + thresholds: int | float | bool | tuple | dict | Thresholds | None = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: ... def rows_distinct( self, columns_subset: str | list[str] | None = None, @@ -452,6 +468,18 @@ class Validate: brief: str | bool | None = None, active: bool | Callable = True, ) -> Validate: ... + def col_missing_consistent( + self, + columns: list[str], + missing: MissingSpec, + when_reason: str, + pre: Callable | None = None, + segments: SegmentSpec | None = None, + thresholds: int | float | bool | tuple | dict | Thresholds | None = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: ... 
def prompt( self, prompt: str, From 2832838b2a84963cdbe07c6c9ec35c5b05f432d2 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 18:33:38 -0400 Subject: [PATCH 31/55] Update yaml.py --- pointblank/yaml.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pointblank/yaml.py b/pointblank/yaml.py index 455d6744f..6a9e39c26 100644 --- a/pointblank/yaml.py +++ b/pointblank/yaml.py @@ -275,6 +275,8 @@ class YAMLValidator: "col_pct_null": "col_pct_null", "col_pct_missing": "col_pct_missing", "col_missing_coded": "col_missing_coded", + "col_missing_only_coded": "col_missing_only_coded", + "col_missing_consistent": "col_missing_consistent", "rows_distinct": "rows_distinct", "rows_complete": "rows_complete", "col_count_match": "col_count_match", From 9488596a12c3cdb917208a8ebea6aeb156d9db6f Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 18:33:43 -0400 Subject: [PATCH 32/55] Create test_col_missing_consistent.py --- tests/test_col_missing_consistent.py | 114 +++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 tests/test_col_missing_consistent.py diff --git a/tests/test_col_missing_consistent.py b/tests/test_col_missing_consistent.py new file mode 100644 index 000000000..50c424ba4 --- /dev/null +++ b/tests/test_col_missing_consistent.py @@ -0,0 +1,114 @@ +import polars as pl +import pandas as pd +import pytest + +import pointblank as pb + + +@pytest.fixture +def spec(): + return pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) + + +def _info(v): + return v.validation_info[0] + + +class TestColMissingConsistent: + def test_basic_inconsistency(self, spec): + tbl = pl.DataFrame( + {"income_source": [1, -99, 2, -99], "income_amount": [50000, -99, 42000, 38000]} + ) + v = ( + pb.Validate(data=tbl) + .col_missing_consistent( + columns=["income_source", "income_amount"], missing=spec, when_reason="not_asked" + ) + .interrogate() + ) + info = _info(v) + assert info.n == 4 + assert info.n_failed == 1 # 
last row: only one column is -99 + + def test_all_consistent_passes(self, spec): + tbl = pl.DataFrame( + {"a": [1, -99, 2, -99], "b": [5, -99, 6, -99]} + ) + v = ( + pb.Validate(data=tbl) + .col_missing_consistent(columns=["a", "b"], missing=spec, when_reason="not_asked") + .interrogate() + ) + assert _info(v).n_failed == 0 + + def test_null_reason_consistency(self): + # when_reason == null_reason, null_is_missing True -> nulls count + spec = pb.MissingSpec(reasons={-98: "refused"}, null_reason="unknown") + tbl = pl.DataFrame({"a": [1, None, None], "b": [5, None, 6]}) + v = ( + pb.Validate(data=tbl) + .col_missing_consistent(columns=["a", "b"], missing=spec, when_reason="unknown") + .interrogate() + ) + # row2 both null -> ok; row3 only a null -> fail + assert _info(v).n_failed == 1 + + def test_three_columns(self, spec): + tbl = pl.DataFrame( + {"a": [-99, 1, -99], "b": [-99, 2, -99], "c": [-99, 3, 7]} + ) + v = ( + pb.Validate(data=tbl) + .col_missing_consistent(columns=["a", "b", "c"], missing=spec, when_reason="not_asked") + .interrogate() + ) + # row1 all -99 ok; row2 none ok; row3 a,b -99 but c=7 -> fail + assert _info(v).n_failed == 1 + + def test_requires_two_columns(self, spec): + tbl = pl.DataFrame({"a": [1, 2]}) + with pytest.raises(ValueError, match="at least two columns"): + pb.Validate(data=tbl).col_missing_consistent( + columns=["a"], missing=spec, when_reason="not_asked" + ) + + def test_missing_must_be_spec(self): + tbl = pl.DataFrame({"a": [1], "b": [2]}) + with pytest.raises(TypeError): + pb.Validate(data=tbl).col_missing_consistent( + columns=["a", "b"], missing={-99: "x"}, when_reason="not_asked" + ) + + def test_pandas_backend(self, spec): + tbl = pd.DataFrame( + {"a": [1, -99, -99], "b": [5, -99, 6]} + ) + v = ( + pb.Validate(data=tbl) + .col_missing_consistent(columns=["a", "b"], missing=spec, when_reason="not_asked") + .interrogate() + ) + assert _info(v).n_failed == 1 + + def test_report_and_step_report(self, spec): + tbl = 
pl.DataFrame({"a": [1, -99, -99], "b": [5, -99, 6]}) + v = ( + pb.Validate(data=tbl) + .col_missing_consistent(columns=["a", "b"], missing=spec, when_reason="not_asked") + .interrogate() + ) + assert v.get_tabular_report() is not None + # step report (row-based extract path) should build without error + assert v.get_step_report(i=1) is not None + + @pytest.mark.parametrize("lang", ["en", "fr", "de", "ja", "ar", "zh-Hans"]) + def test_brief_langs(self, spec, lang): + tbl = pl.DataFrame({"a": [1, -99, -99], "b": [5, -99, 6]}) + v = ( + pb.Validate(data=tbl, lang=lang) + .col_missing_consistent( + columns=["a", "b"], missing=spec, when_reason="not_asked", brief=True + ) + .interrogate() + ) + assert _info(v).autobrief From a21533edb0e24b40717058a4bbd0dbac0bb62b31 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 18:33:48 -0400 Subject: [PATCH 33/55] Create test_col_missing_only_coded.py --- tests/test_col_missing_only_coded.py | 115 +++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 tests/test_col_missing_only_coded.py diff --git a/tests/test_col_missing_only_coded.py b/tests/test_col_missing_only_coded.py new file mode 100644 index 000000000..7bb1bbbbb --- /dev/null +++ b/tests/test_col_missing_only_coded.py @@ -0,0 +1,115 @@ +import polars as pl +import pandas as pd +import pytest + +import pointblank as pb + + +@pytest.fixture +def spec(): + return pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) + + +def _info(v): + return v.validation_info[0] + + +class TestColMissingOnlyCoded: + def test_flags_undocumented_code(self, spec): + # -95 is undocumented; reals in [0,120]; -99/-98 documented + tbl = pl.DataFrame({"age": [34, -98, 41, -95, 29, -99, 55]}) + v = ( + pb.Validate(data=tbl) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120) + .interrogate() + ) + info = _info(v) + assert info.n == 7 + assert info.n_failed == 1 # only -95 + + def test_all_documented_or_real_passes(self, 
spec): + tbl = pl.DataFrame({"age": [34, -98, 41, -99, 29]}) + v = ( + pb.Validate(data=tbl) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120) + .interrogate() + ) + assert _info(v).n_failed == 0 + + def test_allowed_set(self, spec): + tbl = pl.DataFrame({"grade": [1, 2, -99, 3, -95, -98]}) + v = ( + pb.Validate(data=tbl) + .col_missing_only_coded(columns="grade", missing=spec, allowed=[1, 2, 3]) + .interrogate() + ) + # -95 is undocumented -> 1 failure + assert _info(v).n_failed == 1 + + def test_null_documented_when_null_is_missing(self): + spec = pb.MissingSpec(reasons={-99: "not_asked"}, null_is_missing=True) + tbl = pl.DataFrame({"age": [34, None, -99, 200]}) + v = ( + pb.Validate(data=tbl) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120) + .interrogate() + ) + # null passes (documented as unknown), -99 passes, 200 out of range -> fail + assert _info(v).n_failed == 1 + + def test_null_fails_when_not_missing(self): + spec = pb.MissingSpec(reasons={-99: "not_asked"}, null_is_missing=False) + tbl = pl.DataFrame({"age": [34, None, -99, 41]}) + v = ( + pb.Validate(data=tbl) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120) + .interrogate() + ) + # null is neither documented nor a real value -> fail + assert _info(v).n_failed == 1 + + def test_requires_a_real_value_constraint(self, spec): + tbl = pl.DataFrame({"age": [1, 2, 3]}) + with pytest.raises(ValueError, match="at least one of"): + pb.Validate(data=tbl).col_missing_only_coded(columns="age", missing=spec) + + def test_missing_must_be_spec(self): + tbl = pl.DataFrame({"age": [1, 2, 3]}) + with pytest.raises(TypeError): + pb.Validate(data=tbl).col_missing_only_coded( + columns="age", missing={-99: "x"}, min_val=0, max_val=10 + ) + + def test_pandas_backend(self, spec): + tbl = pd.DataFrame({"age": [34, -98, -95, 200]}) + v = ( + pb.Validate(data=tbl) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, 
max_val=120) + .interrogate() + ) + # -95 undocumented, 200 out of range -> 2 failures + assert _info(v).n_failed == 2 + + def test_report_and_step_report(self, spec): + tbl = pl.DataFrame({"age": [34, -98, -95, 41]}) + v = ( + pb.Validate(data=tbl) + .col_missing_only_coded( + columns="age", missing=spec, min_val=0, max_val=120, brief=True + ) + .interrogate() + ) + assert v.get_tabular_report() is not None + assert v.get_step_report(i=1) is not None + + @pytest.mark.parametrize("lang", ["en", "fr", "de", "ja", "ar", "zh-Hans"]) + def test_brief_langs(self, spec, lang): + tbl = pl.DataFrame({"age": [34, -95]}) + v = ( + pb.Validate(data=tbl, lang=lang) + .col_missing_only_coded( + columns="age", missing=spec, min_val=0, max_val=120, brief=True + ) + .interrogate() + ) + assert _info(v).autobrief From 4557687a1533e3d3a569a2fd8ca1677fc26c3a0f Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 18:33:53 -0400 Subject: [PATCH 34/55] Create test_missing_report_integration.py --- tests/test_missing_report_integration.py | 100 +++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 tests/test_missing_report_integration.py diff --git a/tests/test_missing_report_integration.py b/tests/test_missing_report_integration.py new file mode 100644 index 000000000..33babf327 --- /dev/null +++ b/tests/test_missing_report_integration.py @@ -0,0 +1,100 @@ +import polars as pl + +import pointblank as pb + + +def test_tabular_report_annotates_missing_aware_steps(): + tbl = pl.DataFrame({"age": [34, -98, 41, 200]}) + spec = pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) + v = ( + pb.Validate(data=tbl) + .col_vals_between(columns="age", left=0, right=120, missing=spec) + .interrogate() + ) + html = v.get_tabular_report().as_raw_html() + assert "missing-aware" in html + assert "refused" in html and "not_asked" in html + + +def test_tabular_report_no_annotation_without_missing(): + tbl = pl.DataFrame({"age": [34, -98, 41, 200]}) + v = ( + 
pb.Validate(data=tbl) + .col_vals_between(columns="age", left=0, right=120) + .interrogate() + ) + assert "missing-aware" not in v.get_tabular_report().as_raw_html() + + +def test_dedicated_methods_show_context(): + tbl = pl.DataFrame({"age": [34, -98, 41, -99]}) + spec = pb.MissingSpec( + reasons={-99: "not_asked", -98: "refused"}, + categories={"nonresponse": ["refused"]}, + ) + v = ( + pb.Validate(data=tbl) + .col_pct_missing(columns="age", missing=spec, reason="refused", max_pct=0.5) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120) + .interrogate() + ) + html = v.get_tabular_report().as_raw_html() + # col_pct_missing shows the reason filter; col_missing_only_coded shows the range + assert "reason = refused" in html + assert "[0, 120]" in html + + +def test_step_report_shows_missing_codes_legend(): + spec = pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) + + # col_vals_* with missing= + tbl = pl.DataFrame({"age": [34, -98, 200, -99, 300]}) + v = ( + pb.Validate(data=tbl) + .col_vals_between(columns="age", left=0, right=120, missing=spec) + .interrogate() + ) + h = v.get_step_report(i=1).as_raw_html() + assert "Missing codes" in h and "not_asked" in h and "refused" in h + + # col_missing_coded (spec in values) + tbl2 = pl.DataFrame({"age": [34, None, 41]}) + v2 = pb.Validate(data=tbl2).col_missing_coded(columns="age", missing=spec).interrogate() + assert "Missing codes" in v2.get_step_report(i=1).as_raw_html() + + # col_missing_only_coded (spec stashed in values dict) + tbl3 = pl.DataFrame({"age": [34, -98, -95, 41]}) + v3 = ( + pb.Validate(data=tbl3) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120) + .interrogate() + ) + assert "Missing codes" in v3.get_step_report(i=1).as_raw_html() + + # col_missing_consistent + tbl4 = pl.DataFrame({"a": [1, -99, -99], "b": [5, -99, 6]}) + v4 = ( + pb.Validate(data=tbl4) + .col_missing_consistent(columns=["a", "b"], missing=spec, when_reason="not_asked") + 
.interrogate() + ) + assert "Missing codes" in v4.get_step_report(i=1).as_raw_html() + + +def test_step_report_no_legend_without_missing(): + tbl = pl.DataFrame({"age": [34, 200, 41]}) + v = pb.Validate(data=tbl).col_vals_between(columns="age", left=0, right=120).interrogate() + assert "Missing codes" not in v.get_step_report(i=1).as_raw_html() + + +def test_report_renders_with_mixed_steps(): + tbl = pl.DataFrame({"a": [1, -99, 3], "b": [-99, -99, 3]}) + spec = pb.MissingSpec(reasons={-99: "not_asked"}) + v = ( + pb.Validate(data=tbl) + .col_vals_gt(columns="a", value=0, missing=spec) + .col_missing_consistent(columns=["a", "b"], missing=spec, when_reason="not_asked") + .col_missing_coded(columns="a", missing=spec) + .interrogate() + ) + assert v.get_tabular_report() is not None From 1f8558a12a2b5d589d2d76894e174081ee0a77c1 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Tue, 16 Jun 2026 18:33:58 -0400 Subject: [PATCH 35/55] Update test_missing_vals_tbl_structured.py --- tests/test_missing_vals_tbl_structured.py | 25 +++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/test_missing_vals_tbl_structured.py b/tests/test_missing_vals_tbl_structured.py index 30c2f9907..fde123bbe 100644 --- a/tests/test_missing_vals_tbl_structured.py +++ b/tests/test_missing_vals_tbl_structured.py @@ -84,3 +84,28 @@ def test_unknown_column_raises(self, tbl_pl): spec = pb.MissingSpec(reasons={-99: "not_asked"}) with pytest.raises(ValueError, match="not found"): pb.missing_vals_tbl(tbl_pl, missing={"nonexistent": spec}) + + +class TestMissingHeatmap: + def test_heatmap_returns_gt(self, tbl_pl, specs): + result = pb.missing_vals_tbl(tbl_pl, missing=specs, as_heatmap=True) + assert isinstance(result, GT) + + def test_heatmap_title_and_labels(self, tbl_pl, specs): + html = pb.missing_vals_tbl(tbl_pl, missing=specs, as_heatmap=True).as_raw_html() + assert "Missing Pattern Heatmap" in html + assert "Refused" in html and "Below Threshold" in html + assert "%" in html 
# proportions formatted as percentages + + def test_heatmap_pandas(self, specs): + tbl = pd.DataFrame( + { + "age": [34, -98, 41, -99, 29, -98, 55, None], + "income": [50000, -99, -1, None, 42000, -99, 38000, 61000], + } + ) + assert isinstance(pb.missing_vals_tbl(tbl, missing=specs, as_heatmap=True), GT) + + def test_as_heatmap_ignored_without_missing(self, tbl_pl): + # as_heatmap only applies with missing=; default sector view still returned + assert isinstance(pb.missing_vals_tbl(tbl_pl, as_heatmap=True), GT) From 9de4244bcd466d1fa84cc9de16fffb2caed85ca8 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 10:49:31 -0400 Subject: [PATCH 36/55] Use report notes to present MissingSpec details --- pointblank/validate.py | 114 +++++++++++++++++++++++++++++++++-------- 1 file changed, 93 insertions(+), 21 deletions(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index d3a0b0120..f1b1ae49e 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -10802,6 +10802,7 @@ def col_pct_missing( "max_pct": max_pct, "reason": reason, "category": category, + "spec": missing, }, thresholds=thresholds, actions=actions, @@ -14310,6 +14311,15 @@ def interrogate( validation.autobrief = autobrief + # If the step carries structured-missingness context (a `missing=` spec or a dedicated + # missing method), attach a one-line note summarizing the codes and any reason/range + # filter. This keeps the VALUES cell minimal while surfacing detail in the Notes section. 
+ missing_note = _build_missing_note(validation) + if missing_note is not None: + validation._add_note( + key="missing_spec", markdown=missing_note[0], text=missing_note[1] + ) + # ------------------------------------------------ # Bypassing the validation step if conditions met # ------------------------------------------------ @@ -17881,18 +17891,16 @@ def get_tabular_report( values_upd.append("—") elif assertion_type[i] in ["col_missing_consistent"]: - # Show the reason being checked for cross-column consistency - values_upd.append(f"when_reason = {value.get('when_reason')}") + # Minimal cell: a compact badge (the reason and columns live in the step note) + values_upd.append( + "CONSISTENT" + ) elif assertion_type[i] in ["col_missing_only_coded"]: - # Show the allowed real values and/or range used to define legitimate values - parts = [] - if value.get("allowed") is not None: - allowed_str = str(value["allowed"])[1:-1].replace("'", "") - parts.append(f"allowed = {allowed_str}") - if value.get("min_val") is not None or value.get("max_val") is not None: - parts.append(f"[{value.get('min_val')}, {value.get('max_val')}]") - values_upd.append("
".join(parts) if parts else "—") + # Minimal cell: a compact badge (allowed values/range live in the step note) + values_upd.append( + "ONLY CODED" + ) elif assertion_type[i] in ["col_pct_null"]: # Extract p and tol from the values dict for nice formatting @@ -17904,14 +17912,8 @@ def get_tabular_report( values_upd.append(f"p = {p_value}
tol = {tol_value}") elif assertion_type[i] in ["col_pct_missing"]: - # Format the max_pct and any reason/category filter for display - max_pct_value = value["max_pct"] - filter_line = "" - if value.get("reason") is not None: - filter_line = f"
reason = {value['reason']}" - elif value.get("category") is not None: - filter_line = f"
category = {value['category']}" - values_upd.append(f"max_pct = {max_pct_value}{filter_line}") + # Minimal cell: just the threshold (reason/category detail lives in the step note) + values_upd.append(f"≤ {value['max_pct']}") elif assertion_type[i] in ["data_freshness"]: # Format max_age nicely for display @@ -18054,10 +18056,10 @@ def get_tabular_report( for i, spec in enumerate(missing_specs): if spec is None or i >= len(values_upd): continue - reasons = ", ".join(spec.reasons_list()) if hasattr(spec, "reasons_list") else "" + # Keep the cell minimal: a compact badge. The reason/code detail lives in the step note. annotation = ( - "
" - f"missing-aware: {reasons}" + "
MISSING-AWARE" ) values_upd[i] = f"{values_upd[i]}{annotation}" @@ -21107,6 +21109,76 @@ def _apply_segments(data_tbl: Any, segments_expr: tuple[str, str]) -> Any: return data_tbl +def _resolve_step_missing_spec(validation: Any) -> Any: + """Return the `MissingSpec` associated with a validation step, if any. + + The spec lives in different places depending on the method: on `validation.missing` for + `col_vals_*` steps that used `missing=`; directly in `validation.values` for `col_missing_coded`; + and under `validation.values["spec"]` for `col_pct_missing`, `col_missing_only_coded`, and + `col_missing_consistent`. + """ + spec = getattr(validation, "missing", None) + if spec is not None: + return spec + vals = getattr(validation, "values", None) + if isinstance(vals, MissingSpec): + return vals + if isinstance(vals, dict) and isinstance(vals.get("spec"), MissingSpec): + return vals["spec"] + return None + + +def _build_missing_note(validation: Any) -> tuple[str, str] | None: + """Build a one-line (markdown, text) note summarizing a step's structured-missingness context. + + Returns `None` when the step has no associated `MissingSpec`. + """ + spec = _resolve_step_missing_spec(validation) + if spec is None or not hasattr(spec, "reasons"): + return None + + codes_md = ", ".join(f"`{value}`→{reason}" for value, reason in spec.reasons.items()) + codes_tx = ", ".join(f"{value}->{reason}" for value, reason in spec.reasons.items()) + if getattr(spec, "null_is_missing", False): + codes_md += f", `null`→{spec.null_reason}" + codes_tx += f", null->{spec.null_reason}" + + md = f"**Missing codes:** {codes_md}" + tx = f"Missing codes: {codes_tx}" + + # Method-specific context appended to the one-line summary + assertion_type = getattr(validation, "assertion_type", None) + vals = getattr(validation, "values", None) + + if assertion_type == "col_pct_missing" and isinstance(vals, dict): + if vals.get("reason") is not None: + md += f". 
Counting reason `{vals['reason']}`" + tx += f". Counting reason {vals['reason']}" + elif vals.get("category") is not None: + md += f". Counting category `{vals['category']}`" + tx += f". Counting category {vals['category']}" + elif assertion_type == "col_missing_only_coded" and isinstance(vals, dict): + bits_md = [] + bits_tx = [] + if vals.get("allowed") is not None: + allowed_str = ", ".join(str(a) for a in vals["allowed"]) + bits_md.append(f"allowed {{{allowed_str}}}") + bits_tx.append(f"allowed {{{allowed_str}}}") + if vals.get("min_val") is not None or vals.get("max_val") is not None: + rng = f"[{vals.get('min_val')}, {vals.get('max_val')}]" + bits_md.append(f"range {rng}") + bits_tx.append(f"range {rng}") + if bits_md: + md += f". Legitimate values: {', '.join(bits_md)}" + tx += f". Legitimate values: {', '.join(bits_tx)}" + elif assertion_type == "col_missing_consistent" and isinstance(vals, dict): + if vals.get("when_reason") is not None: + md += f". Consistency required for reason `{vals['when_reason']}`" + tx += f". 
Consistency required for reason {vals['when_reason']}" + + return md, tx + + def _missing_legend_html(spec: Any) -> str: """Build an HTML legend of a MissingSpec's sentinel codes and their reasons, for step reports.""" if not hasattr(spec, "reasons"): From 17b0ea8b463320e54103c6a92a018be172fe8ece Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 10:49:35 -0400 Subject: [PATCH 37/55] Update test_missing_report_integration.py --- tests/test_missing_report_integration.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tests/test_missing_report_integration.py b/tests/test_missing_report_integration.py index 33babf327..edd08cda2 100644 --- a/tests/test_missing_report_integration.py +++ b/tests/test_missing_report_integration.py @@ -12,7 +12,9 @@ def test_tabular_report_annotates_missing_aware_steps(): .interrogate() ) html = v.get_tabular_report().as_raw_html() - assert "missing-aware" in html + # The VALUES cell carries a compact badge; the reason/code detail goes to the step note + assert "MISSING-AWARE" in html + assert "Missing codes" in html assert "refused" in html and "not_asked" in html @@ -23,10 +25,12 @@ def test_tabular_report_no_annotation_without_missing(): .col_vals_between(columns="age", left=0, right=120) .interrogate() ) - assert "missing-aware" not in v.get_tabular_report().as_raw_html() + html = v.get_tabular_report().as_raw_html() + assert "MISSING-AWARE" not in html + assert "Missing codes" not in html -def test_dedicated_methods_show_context(): +def test_dedicated_methods_show_minimal_cell_and_note(): tbl = pl.DataFrame({"age": [34, -98, 41, -99]}) spec = pb.MissingSpec( reasons={-99: "not_asked", -98: "refused"}, @@ -39,9 +43,15 @@ def test_dedicated_methods_show_context(): .interrogate() ) html = v.get_tabular_report().as_raw_html() - # col_pct_missing shows the reason filter; col_missing_only_coded shows the range - assert "reason = refused" in html - assert "[0, 120]" in html + # 
Compact VALUES cells: a threshold for col_pct_missing and an "ONLY CODED" badge + assert "ONLY CODED" in html + # Detail is surfaced via the auto Notes system + assert "Missing codes" in html + assert "Counting reason" in html and "refused" in html + assert "Legitimate values" in html and "[0, 120]" in html + # The old verbose VALUES strings should no longer be present + assert "reason = refused" not in html + assert "max_pct = " not in html def test_step_report_shows_missing_codes_legend(): From 41ede89d02474dde81ebfa22f8f62ea2a3efa10f Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 11:36:47 -0400 Subject: [PATCH 38/55] Update missing.py --- pointblank/missing.py | 256 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 256 insertions(+) diff --git a/pointblank/missing.py b/pointblank/missing.py index 04f3150a6..de533d560 100644 --- a/pointblank/missing.py +++ b/pointblank/missing.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re from dataclasses import dataclass, field from typing import Any @@ -8,6 +9,31 @@ ] +# Standard HL7/CDISC null flavors mapped to snake_case reason labels +_CDISC_NULL_FLAVORS: dict[str, str] = { + "NI": "no_information", + "NA": "not_applicable", + "UNK": "unknown", + "ASKU": "asked_but_unknown", + "NAV": "temporarily_unavailable", + "NASK": "not_asked", + "OTH": "other", + "PINF": "positive_infinity", + "NINF": "negative_infinity", + "MSK": "masked", + "DER": "derived", + "QS": "sufficient_quantity", + "TRC": "trace", + "NP": "not_present", +} + + +def _slugify(label: Any) -> str: + """Convert a human-readable label into a snake_case reason identifier.""" + slug = re.sub(r"[^0-9a-zA-Z]+", "_", str(label).strip().lower()).strip("_") + return slug or "missing" + + @dataclass class MissingSpec: """ @@ -240,3 +266,233 @@ def reasons_list(self) -> list[str]: if self.null_is_missing: seen.setdefault(self.null_reason, None) return list(seen.keys()) + + # 
------------------------------------------------------------------ + # Factory methods (pre-built specs and metadata-import integration) + # ------------------------------------------------------------------ + + @classmethod + def from_cdisc_null_flavors( + cls, + null_is_missing: bool = True, + null_reason: str = "no_information", + description: str | None = "CDISC/HL7 null flavors", + ) -> "MissingSpec": + """Create a `MissingSpec` for the standard HL7/CDISC *null flavors*. + + Clinical data uses standardized null flavor codes to record *why* a value is absent (e.g., + `"NASK"` for "not asked", `"UNK"` for "unknown"). This returns a ready-to-use spec mapping + those codes to reason labels. + + Parameters + ---------- + null_is_missing + Whether actual null values should also be treated as missing. Default is `True`. + null_reason + The reason label for actual null values. Default is `"no_information"`. + description + Optional description. Default identifies the spec as CDISC/HL7 null flavors. + + Returns + ------- + MissingSpec + A spec with the standard null flavor codes. 
+ + Examples + -------- + ```python + import pointblank as pb + + cdisc_missing = pb.MissingSpec.from_cdisc_null_flavors() + cdisc_missing.reason_for("NASK") # "not_asked" + ``` + """ + reasons = dict(_CDISC_NULL_FLAVORS) + categories = { + "unknown": ["no_information", "unknown", "asked_but_unknown", "temporarily_unavailable"], + "not_applicable": ["not_applicable", "not_asked", "not_present"], + "boundary": ["positive_infinity", "negative_infinity"], + } + return cls( + reasons=reasons, + categories=categories, + null_is_missing=null_is_missing, + null_reason=null_reason, + description=description, + ) + + # Convenient short alias + @classmethod + def from_cdisc(cls, **kwargs: Any) -> "MissingSpec": + """Alias for [`from_cdisc_null_flavors()`](`pointblank.MissingSpec.from_cdisc_null_flavors`).""" + return cls.from_cdisc_null_flavors(**kwargs) + + @classmethod + def from_sas( + cls, + reasons: dict[str, str] | None = None, + include_underscore: bool = True, + null_is_missing: bool = True, + null_reason: str = "system_missing", + description: str | None = "SAS special missing values", + ) -> "MissingSpec": + """Create a `MissingSpec` for SAS special missing values. + + SAS encodes missingness with `"."` (system missing), `"._"`, and `".A"` through `".Z"` (27 + user-defined missing codes). This returns a spec covering all of them; you can override the + reason label for any specific code via `reasons=`. + + Parameters + ---------- + reasons + Optional mapping of specific SAS missing codes to custom reason labels (e.g., + `{".A": "not_applicable", ".B": "below_detection"}`). These override the defaults. + include_underscore + Whether to include the `"._"` special missing code. Default is `True`. + null_is_missing + Whether actual null values should also be treated as missing. Default is `True`. + null_reason + The reason label for actual null values. Default is `"system_missing"`. + description + Optional description. 
Default identifies the spec as SAS special missing values. + + Returns + ------- + MissingSpec + A spec covering the SAS special missing values. + + Examples + -------- + ```python + import pointblank as pb + + sas_missing = pb.MissingSpec.from_sas( + reasons={".A": "not_applicable", ".B": "below_detection"} + ) + sas_missing.reason_for(".A") # "not_applicable" + sas_missing.reason_for(".C") # "user_missing_c" + ``` + """ + built: dict[Any, str] = {".": "system_missing"} + if include_underscore: + built["._"] = "system_missing" + for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": + built[f".{letter}"] = f"user_missing_{letter.lower()}" + if reasons: + for code, label in reasons.items(): + built[code] = label + return cls( + reasons=built, + null_is_missing=null_is_missing, + null_reason=null_reason, + description=description, + ) + + @classmethod + def from_spss( + cls, + missing_values: list, + labels: dict[Any, str] | None = None, + null_is_missing: bool = True, + null_reason: str = "unknown", + description: str | None = "SPSS user-defined missing values", + ) -> "MissingSpec": + """Create a `MissingSpec` from SPSS-style user-defined missing values. + + SPSS supports up to 3 user-defined missing values per variable (plus a range). Pass the + missing values (and optionally their value labels) to build a spec. Reason labels are + derived from the labels when available, otherwise a `"missing_<value>"` placeholder is used. + + Parameters + ---------- + missing_values + The sentinel values that SPSS marks as missing for the variable (e.g., `[-99, -98]`). + labels + Optional mapping of sentinel value to human-readable label (e.g., `{-99: "Refused"}`). + Labels are slugified into reason identifiers (e.g., `"Refused"` -> `"refused"`). + null_is_missing + Whether actual null values should also be treated as missing. Default is `True`. + null_reason + The reason label for actual null values. Default is `"unknown"`. + description + Optional description.
Default identifies the spec as SPSS user-defined missing values. + + Returns + ------- + MissingSpec + A spec built from the SPSS missing values. + + Examples + -------- + ```python + import pointblank as pb + + spss_missing = pb.MissingSpec.from_spss( + missing_values=[-99, -98], + labels={-99: "Not asked", -98: "Refused"}, + ) + spss_missing.reason_for(-98) # "refused" + ``` + """ + labels = labels or {} + reasons = { + value: (_slugify(labels[value]) if value in labels else f"missing_{_slugify(value)}") + for value in missing_values + } + return cls( + reasons=reasons, + null_is_missing=null_is_missing, + null_reason=null_reason, + description=description, + ) + + @classmethod + def from_variable_metadata( + cls, + variable: Any, + null_is_missing: bool = True, + null_reason: str = "unknown", + ) -> "MissingSpec | None": + """Create a `MissingSpec` from an imported variable's metadata. + + This works with a [`VariableMetadata`](`pointblank.VariableMetadata`) object (as produced by + [`import_metadata()`](`pointblank.import_metadata`) for SPSS, Stata, and SAS files). It reads + the variable's `missing_values` and derives reason labels from `missing_value_labels` or + `value_labels` when available. + + Parameters + ---------- + variable + A variable-metadata object exposing `missing_values` and (optionally) + `missing_value_labels` / `value_labels` attributes. + null_is_missing + Whether actual null values should also be treated as missing. Default is `True`. + null_reason + The reason label for actual null values. Default is `"unknown"`. + + Returns + ------- + MissingSpec | None + A spec built from the variable's missing values, or `None` if the variable declares no + missing values. 
+ """ + missing_values = getattr(variable, "missing_values", None) or [] + if not missing_values: + return None + + labels = getattr(variable, "missing_value_labels", None) or {} + value_labels = getattr(variable, "value_labels", None) or {} + + reasons: dict[Any, str] = {} + for value in missing_values: + label = labels.get(value) + if label is None: + label = value_labels.get(value) + reasons[value] = _slugify(label) if label else f"missing_{_slugify(value)}" + + return cls( + reasons=reasons, + null_is_missing=null_is_missing, + null_reason=null_reason, + description=f"Imported missing values for '{getattr(variable, 'name', 'variable')}'", + ) From e9bfabe6280fc0e1421c8d7dccac88f5eb0cb3bd Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 11:36:55 -0400 Subject: [PATCH 39/55] Update _types.py --- pointblank/metadata/_types.py | 48 +++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/pointblank/metadata/_types.py b/pointblank/metadata/_types.py index 04e6b83b1..c70e9a78d 100644 --- a/pointblank/metadata/_types.py +++ b/pointblank/metadata/_types.py @@ -223,6 +223,21 @@ class VariableMetadata: unit: str | None = None unit_system: str | None = None + def to_missing_spec(self) -> Any: + """Build a [`MissingSpec`](`pointblank.MissingSpec`) from this variable's missing values. + + Reads `missing_values` and derives reason labels from `missing_value_labels` or + `value_labels` when available. + + Returns + ------- + MissingSpec | None + A `MissingSpec` for the variable, or `None` if no missing values are declared. 
+ """ + from pointblank.missing import MissingSpec + + return MissingSpec.from_variable_metadata(self) + @dataclass class MetadataImport: @@ -340,6 +355,39 @@ def get_variable(self, name: str) -> VariableMetadata: return var raise KeyError(f"No variable named '{name}' in imported metadata") + def missing_specs(self) -> dict[str, Any]: + """Auto-generate [`MissingSpec`](`pointblank.MissingSpec`) objects for all variables. + + Builds a mapping of column name to `MissingSpec` for every imported variable that declares + missing values (e.g., SPSS user-defined missing values, SAS special missing). The result + can be passed directly to validation methods (via `missing=`) or to + [`missing_vals_tbl()`](`pointblank.missing_vals_tbl`). + + Returns + ------- + dict[str, MissingSpec] + A mapping of column name to `MissingSpec`. Variables without declared missing values + are omitted. + + Examples + -------- + ```python + import pointblank as pb + + meta = pb.import_metadata("survey.sav", format="spss") + specs = meta.missing_specs() + + # Use the auto-generated specs in a missingness report + pb.missing_vals_tbl(data, missing=specs) + ``` + """ + specs: dict[str, Any] = {} + for var in self.variables: + spec = var.to_missing_spec() + if spec is not None: + specs[var.name] = spec + return specs + def get_codelist(self, name: str) -> Codelist: """Get a specific codelist by name. 
From 5aa2d1c027d5093ad71383e9f3fef573bdc9fb0b Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 11:36:57 -0400 Subject: [PATCH 40/55] Create test_missing_factories.py --- tests/test_missing_factories.py | 145 ++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 tests/test_missing_factories.py diff --git a/tests/test_missing_factories.py b/tests/test_missing_factories.py new file mode 100644 index 000000000..7cdeb0610 --- /dev/null +++ b/tests/test_missing_factories.py @@ -0,0 +1,145 @@ +import pytest + +import pointblank as pb +from pointblank.missing import MissingSpec, _slugify +from pointblank.metadata import VariableMetadata, MetadataImport + + +class TestSlugify: + @pytest.mark.parametrize( + "label,expected", + [ + ("Refused", "refused"), + ("Not Applicable", "not_applicable"), + ("DON'T KNOW", "don_t_know"), + (" spaced ", "spaced"), + (-99, "99"), + ("", "missing"), + ], + ) + def test_slugify(self, label, expected): + assert _slugify(label) == expected + + +class TestFromCdisc: + def test_standard_codes(self): + spec = MissingSpec.from_cdisc_null_flavors() + assert spec.reason_for("NASK") == "not_asked" + assert spec.reason_for("UNK") == "unknown" + assert spec.reason_for("PINF") == "positive_infinity" + assert spec.reason_for("NA") == "not_applicable" + + def test_categories(self): + spec = MissingSpec.from_cdisc_null_flavors() + assert set(spec.values_for_category("boundary")) == {"PINF", "NINF"} + assert "NASK" in spec.values_for_category("not_applicable") + + def test_alias(self): + assert MissingSpec.from_cdisc().reason_for("MSK") == "masked" + + def test_null_handling(self): + spec = MissingSpec.from_cdisc_null_flavors() + assert spec.null_is_missing is True + assert spec.reason_for(None) == "no_information" + + def test_exported_via_top_level(self): + assert pb.MissingSpec.from_cdisc_null_flavors().reason_for("NI") == "no_information" + + +class TestFromSas: + def test_defaults(self): + spec = 
MissingSpec.from_sas() + assert spec.reason_for(".") == "system_missing" + assert spec.reason_for(".A") == "user_missing_a" + assert spec.reason_for(".Z") == "user_missing_z" + assert spec.reason_for("._") == "system_missing" + + def test_overrides(self): + spec = MissingSpec.from_sas(reasons={".A": "not_applicable", ".B": "below_detection"}) + assert spec.reason_for(".A") == "not_applicable" + assert spec.reason_for(".B") == "below_detection" + assert spec.reason_for(".C") == "user_missing_c" # default preserved + + def test_no_underscore(self): + spec = MissingSpec.from_sas(include_underscore=False) + assert spec.reason_for("._") is None + # 26 letters + "." = 27 sentinels + assert len(spec.sentinel_values()) == 27 + + +class TestFromSpss: + def test_with_labels(self): + spec = MissingSpec.from_spss( + missing_values=[-99, -98], labels={-99: "Not asked", -98: "Refused"} + ) + assert spec.reason_for(-99) == "not_asked" + assert spec.reason_for(-98) == "refused" + + def test_without_labels(self): + spec = MissingSpec.from_spss(missing_values=[-99, -1]) + assert spec.reason_for(-99) == "missing_99" + assert spec.reason_for(-1) == "missing_1" + + +class TestFromVariableMetadata: + def test_uses_missing_value_labels(self): + var = VariableMetadata( + name="age", + dtype="Int64", + missing_values=[-99, -98], + missing_value_labels={-99: "Not asked", -98: "Refused"}, + ) + spec = MissingSpec.from_variable_metadata(var) + assert spec.reason_for(-98) == "refused" + + def test_falls_back_to_value_labels(self): + var = VariableMetadata( + name="age", + dtype="Int64", + missing_values=[-99], + value_labels={-99: "Not Asked", 1: "Yes"}, + ) + spec = MissingSpec.from_variable_metadata(var) + assert spec.reason_for(-99) == "not_asked" + + def test_no_missing_returns_none(self): + var = VariableMetadata(name="id", dtype="Int64") + assert MissingSpec.from_variable_metadata(var) is None + + def test_to_missing_spec_method(self): + var = VariableMetadata(name="age", dtype="Int64", 
missing_values=[-99]) + assert var.to_missing_spec().is_missing(-99) is True + + +class TestMetadataImportMissingSpecs: + def test_missing_specs_mapping(self): + v1 = VariableMetadata( + name="age", + dtype="Int64", + missing_values=[-99, -98], + missing_value_labels={-99: "Not asked", -98: "Refused"}, + ) + v2 = VariableMetadata(name="id", dtype="Int64") # no missing values + meta = MetadataImport(source_format="spss", variables=[v1, v2]) + + specs = meta.missing_specs() + assert list(specs.keys()) == ["age"] # id omitted (no missing values) + assert specs["age"].reason_for(-99) == "not_asked" + + def test_specs_usable_in_validation(self): + import polars as pl + + v = VariableMetadata( + name="age", dtype="Int64", missing_values=[-99], missing_value_labels={-99: "Not asked"} + ) + meta = MetadataImport(source_format="spss", variables=[v]) + specs = meta.missing_specs() + + tbl = pl.DataFrame({"age": [34, -99, 200]}) + validation = ( + pb.Validate(data=tbl) + .col_vals_between(columns="age", left=0, right=120, missing=specs["age"]) + .interrogate() + ) + # -99 excluded; only 200 fails + assert validation.validation_info[0].n_failed == 1 From ba0daf21a7057bf4b5b0ee82e3c0b0aea94c552d Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 12:46:29 -0400 Subject: [PATCH 41/55] Add missing sections in existing docstrings --- pointblank/validate.py | 160 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) diff --git a/pointblank/validate.py b/pointblank/validate.py index f1b1ae49e..4d3f9464e 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -10711,6 +10711,34 @@ def col_pct_missing( Validate The `Validate` object with the added validation step. + Thresholds + ---------- + The `thresholds=` parameter is used to set the failure-condition levels for the validation + step. If they are set here at the step level, these thresholds will override any thresholds + set at the global level in `Validate(thresholds=...)`. 
+ + There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values + can either be set as a proportion failing of all test units (a value between `0` and `1`), + or as the absolute number of failing test units (as an integer that's `1` or greater). + + Thresholds can be defined using one of these input schemes: + + 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create + thresholds) + 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is + the 'error' level, and position `2` is the 'critical' level + 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and + 'critical' + 4. a single integer/float value denoting absolute number or fraction of failing test units + for the 'warning' level only + + If the number of failing test units exceeds set thresholds, the validation step will be + marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be + set, you're free to set any combination of them. + + Aside from reporting failure conditions, thresholds can be used to determine the actions to + take for each level of failure (using the `actions=` parameter). + Examples -------- ```{python} @@ -10879,6 +10907,50 @@ def col_missing_coded( Validate The `Validate` object with the added validation step. + Preprocessing + ------------- + The `pre=` argument allows for a preprocessing function or lambda to be applied to the data + table during interrogation. This function should take a table as input and return a modified + table. This is useful for performing any necessary transformations or filtering on the data + before the validation step is applied. + + Segmentation + ------------ + The `segments=` argument allows for the segmentation of a validation step into multiple + segments. This is useful for applying the same validation step to different subsets of the + data.
The segmentation can be done based on a single column or specific fields within a + column. Providing a single column name results in a separate validation step for each unique + value in that column; a tuple of `(column, values)` restricts segmentation to the listed + values. The segmentation is performed after any `pre=` preprocessing. + + Thresholds + ---------- + The `thresholds=` parameter is used to set the failure-condition levels for the validation + step. If they are set here at the step level, these thresholds will override any thresholds + set at the global level in `Validate(thresholds=...)`. + + There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values + can either be set as a proportion failing of all test units (a value between `0` and `1`), + or as the absolute number of failing test units (as an integer that's `1` or greater). + + Thresholds can be defined using one of these input schemes: + + 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create + thresholds) + 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is + the 'error' level, and position `2` is the 'critical' level + 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and + 'critical' + 4. a single integer/float value denoting absolute number or fraction of failing test units + for the 'warning' level only + + If the number of failing test units exceeds set thresholds, the validation step will be + marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be + set, you're free to set any combination of them. + + Aside from reporting failure conditions, thresholds can be used to determine the actions to + take for each level of failure (using the `actions=` parameter). + Examples -------- ```{python} @@ -11025,6 +11097,50 @@ def col_missing_only_coded( Validate The `Validate` object with the added validation step.
+ Preprocessing + ------------- + The `pre=` argument allows for a preprocessing function or lambda to be applied to the data + table during interrogation. This function should take a table as input and return a modified + table. This is useful for performing any necessary transformations or filtering on the data + before the validation step is applied. + + Segmentation + ------------ + The `segments=` argument allows for the segmentation of a validation step into multiple + segments. This is useful for applying the same validation step to different subsets of the + data. The segmentation can be done based on a single column or specific fields within a + column. Providing a single column name results in a separate validation step for each unique + value in that column; a tuple of `(column, values)` restricts segmentation to the listed + values. The segmentation is performed after any `pre=` preprocessing. + + Thresholds + ---------- + The `thresholds=` parameter is used to set the failure-condition levels for the validation + step. If they are set here at the step level, these thresholds will override any thresholds + set at the global level in `Validate(thresholds=...)`. + + There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values + can either be set as a proportion failing of all test units (a value between `0` and `1`), + or as the absolute number of failing test units (as an integer that's `1` or greater). + + Thresholds can be defined using one of these input schemes: + + 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create + thresholds) + 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is + the 'error' level, and position `2` is the 'critical' level + 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and + 'critical' + 4.
a single integer/float value denoting absolute number or fraction of failing test units + for the 'warning' level only + + If the number of failing test units exceeds set thresholds, the validation step will be + marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be + set, you're free to set any combination of them. + + Aside from reporting failure conditions, thresholds can be used to determine the actions to + take for each level of failure (using the `actions=` parameter). + Examples -------- ```{python} @@ -11674,6 +11790,50 @@ def col_missing_consistent( Validate The `Validate` object with the added validation step. + Preprocessing + ------------- + The `pre=` argument allows for a preprocessing function or lambda to be applied to the data + table during interrogation. This function should take a table as input and return a modified + table. This is useful for performing any necessary transformations or filtering on the data + before the validation step is applied. + + Segmentation + ------------ + The `segments=` argument allows for the segmentation of a validation step into multiple + segments. This is useful for applying the same validation step to different subsets of the + data. The segmentation can be done based on a single column or specific fields within a + column. Providing a single column name results in a separate validation step for each unique + value in that column; a tuple of `(column, values)` restricts segmentation to the listed + values. The segmentation is performed after any `pre=` preprocessing. + + Thresholds + ---------- + The `thresholds=` parameter is used to set the failure-condition levels for the validation + step. If they are set here at the step level, these thresholds will override any thresholds + set at the global level in `Validate(thresholds=...)`. + + There are three threshold levels: 'warning', 'error', and 'critical'. 
The threshold values + can either be set as a proportion failing of all test units (a value between `0` and `1`), + or as the absolute number of failing test units (as an integer that's `1` or greater). + + Thresholds can be defined using one of these input schemes: + + 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create + thresholds) + 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is + the 'error' level, and position `2` is the 'critical' level + 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and + 'critical' + 4. a single integer/float value denoting absolute number or fraction of failing test units + for the 'warning' level only + + If the number of failing test units exceeds set thresholds, the validation step will be + marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be + set, you're free to set any combination of them. + + Aside from reporting failure conditions, thresholds can be used to determine the actions to + take for each level of failure (using the `actions=` parameter).
+ Examples -------- ```{python} From 273999cc9167edfc23b8aa465a6ca7c4d62e8ecf Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 12:46:48 -0400 Subject: [PATCH 42/55] Add objects to reference: section --- great-docs.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/great-docs.yml b/great-docs.yml index b1e6377e7..4f445c591 100644 --- a/great-docs.yml +++ b/great-docs.yml @@ -131,6 +131,8 @@ reference: members: false - name: DraftValidation members: false + - name: MissingSpec + members: true - title: Contracts and Pipelines desc: > @@ -189,6 +191,10 @@ reference: - Validate.col_vals_expr - Validate.col_exists - Validate.col_pct_null + - Validate.col_pct_missing + - Validate.col_missing_coded + - Validate.col_missing_only_coded + - Validate.col_missing_consistent - Validate.rows_distinct - Validate.rows_complete - Validate.col_schema_match From e46039555769dab63ff76c8e593dc8d46a82b197 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 12:46:50 -0400 Subject: [PATCH 43/55] Update 02-validation-methods.qmd --- .../02-validation-methods.qmd | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/user_guide/01-validation-plan/02-validation-methods.qmd b/user_guide/01-validation-plan/02-validation-methods.qmd index 2cc85e623..c420961c1 100644 --- a/user_guide/01-validation-plan/02-validation-methods.qmd +++ b/user_guide/01-validation-plan/02-validation-methods.qmd @@ -293,6 +293,78 @@ In summary, `na_pass=` works like this: - `na_pass=True`: missing values pass validation regardless of the condition being tested - `na_pass=False` (the default): missing values fail validation +### Structured Missingness with `missing=` + +`na_pass=` treats missingness as binary, but real-world data often encodes *why* a value is absent +using sentinel codes (e.g., `-99` for "not asked", `-98` for "refused"). 
The +[`MissingSpec`](`pointblank.MissingSpec`) class captures these codes and their reasons, and most +validation methods accept a `missing=` argument that uses it. + +When you pass `missing=` to a `col_vals_*()` method, declared sentinel values (and, by default, +`Null` values) are *excluded* from the check, so only the "real" values are validated: + +```{python} +import polars as pl + +tbl = pl.DataFrame({"age": [34, -98, 41, -99, 29, 200, 55, None]}) + +age_missing = pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) + +validation = ( + pb.Validate(data=tbl) + .col_vals_between(columns="age", left=0, right=120, missing=age_missing) + .interrogate() +) + +validation +``` + +Only the real value `200` is out of range; the sentinel codes and the `Null` are excluded and pass. +In the report, such steps are marked with a compact `MISSING-AWARE` badge, and a one-line summary of +the codes appears in the step's notes. + +Pointblank also provides dedicated missingness validation methods that use a `MissingSpec`: + +- [`Validate.col_pct_missing()`](`Validate.col_pct_missing`): assert the percentage of missing + values stays within a limit, optionally filtered by a specific `reason=` or `category=`. +- [`Validate.col_missing_coded()`](`Validate.col_missing_coded`): assert every absence is expressed + as a documented code (no uncoded raw `Null` values). +- [`Validate.col_missing_only_coded()`](`Validate.col_missing_only_coded`): assert a column contains + only documented codes and legitimate values (catching undocumented codes like a stray `-95`), + paired with an `allowed=` set or a `min_val`/`max_val` range. +- [`Validate.col_missing_consistent()`](`Validate.col_missing_consistent`): assert related columns + share a consistent missingness pattern for a given reason (e.g., a survey skip pattern). 
+ +```{python} +income_missing = pb.MissingSpec(reasons={-99: "not_asked", -1: "below_threshold"}) + +survey = pl.DataFrame( + { + "income_source": [1, -99, 2, -99], + "income_amount": [50000, -99, 42000, 38000], + } +) + +validation = ( + pb.Validate(data=survey) + # No more than 30% of income values may be "not_asked" + .col_pct_missing(columns="income_amount", missing=income_missing, reason="not_asked", max_pct=0.30) + # If income wasn't asked, both related columns should be coded together + .col_missing_consistent( + columns=["income_source", "income_amount"], missing=income_missing, when_reason="not_asked" + ) + .interrogate() +) + +validation +``` + +`MissingSpec` also offers pre-built factories for common standards (e.g., +`pb.MissingSpec.from_cdisc_null_flavors()`, `pb.MissingSpec.from_sas()`, +`pb.MissingSpec.from_spss()`), and importing metadata from SPSS/Stata/SAS files can auto-generate +specs via [`MetadataImport.missing_specs()`](`pointblank.MetadataImport`). For a fuller treatment of +structured-missingness *reporting*, see the *Missing Values Reporting* article. + ## 2. 
Row-based Validations Row-based validations focus on examining properties that span across entire rows rather than From 9e0e1f32f6af82a7df120ac960fe332b9fa2339f Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 12:46:52 -0400 Subject: [PATCH 44/55] Update 02-yaml-reference.qmd --- user_guide/03-yaml/02-yaml-reference.qmd | 61 ++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/user_guide/03-yaml/02-yaml-reference.qmd b/user_guide/03-yaml/02-yaml-reference.qmd index cbe3341b2..a13408beb 100644 --- a/user_guide/03-yaml/02-yaml-reference.qmd +++ b/user_guide/03-yaml/02-yaml-reference.qmd @@ -40,6 +40,11 @@ actions: # OPTIONAL: Global failure actions final_actions: # OPTIONAL: Actions triggered after all steps complete warning: "Post-validation warning" error: "Post-validation error" +missing_specs: # OPTIONAL: Named structured-missingness specs + standard_survey: + reasons: + -99: not_asked + -98: refused steps: # REQUIRED: List of validation steps - validation_method_name - validation_method_name: @@ -191,6 +196,62 @@ Template variables available for action strings: - `{level}`: severity level ('warning'/'error'/'critical') - `{time}`: timestamp of validation +### Structured Missingness (`missing_specs`) + +The optional top-level `missing_specs` key defines named [`MissingSpec`](`pointblank.MissingSpec`) +objects that steps can reference. Each named spec maps sentinel values to reason labels, and may +declare `categories`, `null_is_missing`, and `null_reason`: + +```yaml +missing_specs: + standard_survey: + reasons: + -99: not_asked + -98: refused + -97: dont_know + categories: + nonresponse: [refused, dont_know] + null_is_missing: true # OPTIONAL (default true) + null_reason: unknown # OPTIONAL (default "unknown") +``` + +Steps reference a named spec by name through the `missing:` parameter. 
This works both on the +`col_vals_*` methods (to exclude sentinel values from a check) and on the dedicated missingness +methods (`col_pct_missing`, `col_missing_coded`, `col_missing_only_coded`, `col_missing_consistent`): + +```yaml +missing_specs: + standard_survey: + reasons: + -99: not_asked + -98: refused + +steps: + - col_vals_between: + columns: age + left: 0 + right: 120 + missing: standard_survey # excludes -99/-98 (and nulls) from the range check + - col_pct_missing: + columns: age + missing: standard_survey + reason: refused + max_pct: 0.30 +``` + +A step can also define a spec inline (an anonymous mapping) instead of referencing a named one: + +```yaml +steps: + - col_pct_missing: + columns: age + max_pct: 0.5 + missing: + reasons: + -99: not_asked + -98: refused +``` + ## Validation Methods Reference ### Column Value Validations From b9f1e7a00eb542246cbc288fa5553fbf204b3d68 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 12:47:05 -0400 Subject: [PATCH 45/55] Update 03-missing-vals-tbl.qmd --- .../03-missing-vals-tbl.qmd | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/user_guide/05-data-inspection/03-missing-vals-tbl.qmd b/user_guide/05-data-inspection/03-missing-vals-tbl.qmd index 951392099..a4fafbf76 100644 --- a/user_guide/05-data-inspection/03-missing-vals-tbl.qmd +++ b/user_guide/05-data-inspection/03-missing-vals-tbl.qmd @@ -81,3 +81,72 @@ pb.missing_vals_tbl(game_revenue) We see nothing but light blue in this report! The header also indicates that there are no missing values by displaying a large green check mark (the other report tables provided a count of total missing values across all columns). + +## Structured Missingness by Reason + +So far we've treated missingness as binary: a value is either `Null` or it isn't. But real-world +data often encodes *why* a value is absent. 
Survey data distinguishes *refused* from *not asked* +from *don't know*; clinical and statistical-package data use sentinel codes like `-99`, `".A"`, or +`"NOT DONE"`. Pointblank captures this with the [`MissingSpec`](`pointblank.MissingSpec`) class, +which maps sentinel values to human-readable *reasons*. + +When you pass a `missing=` mapping of column names to `MissingSpec` objects, `missing_vals_tbl()` +switches from the sector heatmap to a *structured breakdown*: one row per column with the count and +percentage of complete values and of each missing reason. + +```{python} +import polars as pl + +survey = pl.DataFrame( + { + "age": [34, -98, 41, -99, 29, -98, 55, None], + "income": [50000, -99, -1, None, 42000, -99, 38000, 61000], + } +) + +specs = { + "age": pb.MissingSpec(reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}), + "income": pb.MissingSpec(reasons={-99: "not_asked", -1: "below_threshold"}), +} + +pb.missing_vals_tbl(survey, missing=specs) +``` + +Each `MissingSpec` declares the sentinel values for a column and the reason each one represents. By +default, actual `Null` values are also counted as missing (with the reason `"unknown"`); set +`null_is_missing=False` on the spec if raw nulls should be treated as real values instead. + +### Viewing the pattern as a heatmap + +For a more visual read of *where* missingness concentrates, pass `as_heatmap=True`. The reason +columns are then shaded from light to dark by the proportion missing: + +```{python} +pb.missing_vals_tbl(survey, missing=specs, as_heatmap=True) +``` + +### Pre-built specs for common standards + +You don't always have to define reasons by hand. 
`MissingSpec` provides factory methods for common +encodings, including CDISC/HL7 null flavors and SAS special missing values: + +```{python} +cdisc = pb.MissingSpec.from_cdisc_null_flavors() +print("NASK ->", cdisc.reason_for("NASK")) # not_asked +print("UNK ->", cdisc.reason_for("UNK")) # unknown +``` + +When metadata is imported from SPSS, Stata, or SAS files (see the *Metadata Import* section), +[`MetadataImport.missing_specs()`](`pointblank.MetadataImport`) auto-generates a `{column: +MissingSpec}` mapping from the variables' declared missing values, ready to pass straight to +`missing_vals_tbl()`. + +::: {.callout-note} +The same `MissingSpec` objects power missingness-aware *validation*, not just reporting. You can +pass `missing=` to the `col_vals_*()` methods (to exclude sentinel values from a check) and use the +dedicated [`col_pct_missing()`](`pointblank.Validate.col_pct_missing`), +[`col_missing_coded()`](`pointblank.Validate.col_missing_coded`), +[`col_missing_only_coded()`](`pointblank.Validate.col_missing_only_coded`), and +[`col_missing_consistent()`](`pointblank.Validate.col_missing_consistent`) validation steps. See the +*Validation Methods* article for details. +::: From b3d702b1e849aa96eb6ff6dc79e85724d3416b3d Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 12:47:11 -0400 Subject: [PATCH 46/55] Update 02-statistical-packages.qmd --- .../02-statistical-packages.qmd | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/user_guide/11-metadata-import/02-statistical-packages.qmd b/user_guide/11-metadata-import/02-statistical-packages.qmd index 25d85e1eb..9b40338b2 100644 --- a/user_guide/11-metadata-import/02-statistical-packages.qmd +++ b/user_guide/11-metadata-import/02-statistical-packages.qmd @@ -130,6 +130,36 @@ them appropriately. When validation is generated, these codes are documented in rather than generating explicit exclusion rules, since the correct handling depends on your analysis context. 
+#### Turning missing codes into `MissingSpec` objects + +To put these codes to work in validation and reporting, convert them into +[`MissingSpec`](`pointblank.MissingSpec`) objects. The +[`MetadataImport.missing_specs()`](`pointblank.MetadataImport`) method does this for every variable +that declares missing values, returning a `{column: MissingSpec}` mapping (the reason labels are +derived from the variables' value labels): + +```python +meta = pb.import_metadata("survey.sav") + +# Auto-generate a {column: MissingSpec} mapping from the declared missing values +specs = meta.missing_specs() + +# Use the specs in a structured missingness report... +pb.missing_vals_tbl(data, missing=specs) + +# ...or in missingness-aware validation +validation = ( + pb.Validate(data=data) + .col_vals_between(columns="age", left=0, right=120, missing=specs["age"]) + .interrogate() +) +``` + +You can also build a spec for a single variable with +[`VariableMetadata.to_missing_spec()`](`pointblank.VariableMetadata`), or construct one directly +from SPSS-style values via `pb.MissingSpec.from_spss(missing_values=[...], labels={...})`. See the +*Missing Values Reporting* and *Validation Methods* articles for what you can do with these specs. + ### Type Detection from Formats SPSS stores numeric variables with format strings that indicate how they should be displayed. 
These From 7ea9b688b80987742b86c5c8b4d525990b383dbf Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 12:47:13 -0400 Subject: [PATCH 47/55] Update 03-cdisc-validation.qmd --- .../03-cdisc-validation.qmd | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/user_guide/11-metadata-import/03-cdisc-validation.qmd b/user_guide/11-metadata-import/03-cdisc-validation.qmd index eb1a32e21..e8d8d4308 100644 --- a/user_guide/11-metadata-import/03-cdisc-validation.qmd +++ b/user_guide/11-metadata-import/03-cdisc-validation.qmd @@ -531,6 +531,27 @@ This layered approach gives you the flexibility to apply different levels of val on your needs. The Define-XML checks enforce what was specifically documented for your study, while the SDTM template checks enforce the broader standard requirements that apply universally. +## Null Flavors and Structured Missingness + +Clinical data uses standardized HL7/CDISC *null flavors* to record *why* a value is absent (e.g., +`"NASK"` = not asked, `"UNK"` = unknown, `"NA"` = not applicable). Pointblank ships a pre-built +[`MissingSpec`](`pointblank.MissingSpec`) for these codes via +`MissingSpec.from_cdisc_null_flavors()`: + +```{python} +cdisc = pb.MissingSpec.from_cdisc_null_flavors() + +print("NASK ->", cdisc.reason_for("NASK")) # not_asked +print("UNK ->", cdisc.reason_for("UNK")) # unknown +print("boundary codes:", cdisc.values_for_category("boundary")) +``` + +This spec can be passed to `missing_vals_tbl()` for a reason-by-reason breakdown, or to the +`col_vals_*()` and dedicated missingness validation methods (`col_pct_missing()`, +`col_missing_coded()`, `col_missing_only_coded()`, `col_missing_consistent()`) to validate data +while accounting for the null flavor codes. See the *Missing Values Reporting* and *Validation +Methods* articles for the full set of capabilities. 
+ ## Conclusion CDISC data validation with Pointblank covers the full spectrum of clinical trial data management: From 54861ae909ce2826e8c72f9e04df3e31bd0e9a0e Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 15:06:45 -0400 Subject: [PATCH 48/55] Improve appearance of missing vals table --- pointblank/validate.py | 166 +++++++++++------- tests/test_missing_vals_tbl_structured.py | 97 +++++++++- .../03-missing-vals-tbl.qmd | 24 ++- 3 files changed, 210 insertions(+), 77 deletions(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index 4d3f9464e..084211b98 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -2680,12 +2680,6 @@ def _generate_display_table( return gt_tbl -def _prettify_reason_label(reason: str) -> str: - """Turn a snake_case reason label into a Title Case display label (e.g. 'not_asked' -> - 'Not Asked').""" - return reason.replace("_", " ").title() - - def _build_structured_missing_tbl( data: Any, missing: dict[str, MissingSpec], as_heatmap: bool = False ) -> GT: @@ -2711,13 +2705,18 @@ def _build_structured_missing_tbl( available_columns = list(nw_frame.columns) - # Build the ordered union of reason labels across all specs (first-seen order) + # Build the ordered union of *declared* (coded) reason labels across all specs (first-seen + # order). Raw Null/None/NA values are tallied separately in a fixed "Null" column rather than + # being treated as a reason, since they are not part of any MissingSpec. 
reason_order: list[str] = [] for spec in missing.values(): - for r in spec.reasons_list(): + for r in spec.reasons.values(): if r not in reason_order: reason_order.append(r) + # A "Null" column is shown only if at least one spec counts raw nulls as missing + has_null_col = any(spec.null_is_missing for spec in missing.values()) + records: list[dict[str, Any]] = [] for column, spec in missing.items(): if column not in available_columns: @@ -2725,60 +2724,56 @@ def _build_structured_missing_tbl( f"Column '{column}' given in `missing=` was not found in the table." ) - # Build one aggregation per reason that has an expression (sentinels and/or nulls) + # One aggregation per declared reason (sentinel values only), plus a separate raw-null + # count when the spec treats nulls as missing; coded reasons and raw nulls are kept distinct + declared_reasons = list(dict.fromkeys(spec.reasons.values())) select_exprs: dict[str, Any] = {"__total__": nw.len()} reason_alias: dict[str, str] = {} - for i, r in enumerate(spec.reasons_list()): - sentinels = spec.values_for_reason(r) - expr = None - if sentinels: - expr = nw.col(column).is_in(sentinels) - if r == spec.null_reason and spec.null_is_missing: - null_expr = nw.col(column).is_null() - expr = null_expr if expr is None else (expr | null_expr) - if expr is not None: - alias = f"__r{i}__" - reason_alias[r] = alias - select_exprs[alias] = expr.cast(nw.Int32).sum() + for i, r in enumerate(declared_reasons): + reason_alias[r] = f"__r{i}__" + select_exprs[reason_alias[r]] = ( + nw.col(column).is_in(spec.values_for_reason(r)).cast(nw.Int32).sum() + ) + if spec.null_is_missing: + select_exprs["__null__"] = nw.col(column).is_null().cast(nw.Int32).sum() out = nw_frame.select(**select_exprs) if is_lazy: out = out.collect() total = int(out["__total__"][0]) - counts: dict[str, int] = {} - for r in spec.reasons_list(): - counts[r] = int(out[reason_alias[r]][0]) if r in reason_alias else 0 + coded_counts = {r: int(out[reason_alias[r]][0]) for r 
in declared_reasons} + n_null = int(out["__null__"][0]) if spec.null_is_missing else 0 - total_missing = sum(counts.values()) + total_missing = sum(coded_counts.values()) + n_null complete = total - total_missing + # A coded reason only *applies* to a column if its spec declares it; non-applicable reasons + # render as an em dash (not "0"). The "Null" column applies only when null_is_missing=True. + applicable = set(declared_reasons) + def _prop(count: int) -> float: return (count / total) if total > 0 else 0.0 if as_heatmap: - # Numeric proportions (0..1) so cells can be color-shaded by missingness - record: dict[str, Any] = { - "columns": column, - "total_n": str(total), - "complete": _prop(complete), - } + # Numeric proportions (0..1) so reason cells can be color-shaded; non-applicable cells + # are left as None (shown as an em dash, uncolored) + record: dict[str, Any] = {"columns": column, "complete": _prop(complete)} for r in reason_order: - record[r] = _prop(counts.get(r, 0)) + record[r] = _prop(coded_counts.get(r, 0)) if r in applicable else None + if has_null_col: + record["null"] = _prop(n_null) if spec.null_is_missing else None else: def _fmt(count: int) -> str: pct = round(100 * count / total) if total > 0 else 0 return f"{count} ({pct}%)" - record = { - "columns": column, - "total_n": str(total), - "complete": _fmt(complete), - } - # Fill every reason column in the union (0 for reasons this spec doesn't define) + record = {"columns": column, "complete": _fmt(complete)} for r in reason_order: - record[r] = _fmt(counts.get(r, 0)) + record[r] = _fmt(coded_counts.get(r, 0)) if r in applicable else "—" + if has_null_col: + record["null"] = _fmt(n_null) if spec.null_is_missing else "—" records.append(record) # Build a DataFrame from the records using the available DataFrame library @@ -2792,26 +2787,52 @@ def _fmt(count: int) -> str: breakdown_df = pd.DataFrame(records) - cols_labels = { - "columns": "Column", - "total_n": "Total N", - "complete": "Complete", 
- } - for r in reason_order: - cols_labels[r] = _prettify_reason_label(r) + # Reason columns keep their raw input form as labels (e.g. "not_asked", not "Not Asked"); the + # fixed columns are relabeled. The total row count is already shown in the header, so there's no + # redundant "Total N" column. Raw nulls appear in a fixed "Null" column (styled like "Complete"), + # not as a reason. + cols_labels = {"columns": "Column", "complete": "Complete"} + if has_null_col: + cols_labels["null"] = "Null" + + value_columns = ["complete"] + reason_order + (["null"] if has_null_col else []) + + # Build a header that matches the default `missing_vals_tbl()` look: a plain (large) title in + # IBM Plex Sans and a subtitle showing the table type and dimensions + tbl_type = _get_tbl_type(data=data) + n_rows_total = get_row_count(data) + table_type_html = _create_table_type_html(tbl_type=tbl_type, tbl_name=None, font_size="10px") + tbl_dims_html = _create_table_dims_html( + columns=len(available_columns), rows=n_rows_total, font_size="10px" + ) + combined_subtitle = ( + "
" + '
' + f"{table_type_html}" + f"{tbl_dims_html}" + "
" + "
" + ) + + # The left "Column" column is rendered in monospace, matching the default report's body font + column_name_style = style.text( + color="black", font=google_font(name="IBM Plex Mono"), size="12px" + ) + # The reason column labels keep their raw input form and are shown in monospace + reason_label_style = style.text(font=google_font(name="IBM Plex Mono"), size="12px") - value_columns = ["total_n", "complete"] + reason_order + # Columns that should show an em dash for non-applicable cells (reason columns + the Null column) + em_dash_columns = reason_order + (["null"] if has_null_col else []) if as_heatmap: title = "Missing Pattern Heatmap" - subtitle = "Proportion of each missing reason per column (darker = more missing)." - prop_columns = ["complete"] + reason_order + # "complete" and "null" are shown as plain percentages (uncolored, like the default report); + # only the coded reason columns are color-shaded by proportion + prop_columns = ["complete"] + reason_order + (["null"] if has_null_col else []) gt_tbl = ( GT(breakdown_df) - .tab_header( - title=html(f"
{title}
"), subtitle=subtitle - ) + .tab_header(title=title, subtitle=html(combined_subtitle)) .opt_table_font(font=google_font(name="IBM Plex Sans")) .opt_align_table_header(align="left") .cols_label(cases=cols_labels) @@ -2822,21 +2843,18 @@ def _fmt(count: int) -> str: columns=reason_order, palette=["#F5F5F5", "#000000"], domain=[0, 1], + na_color="#FFFFFF", ) - .tab_style( - style=style.text(weight="bold"), - locations=loc.body(columns="columns"), - ) + .sub_missing(columns=em_dash_columns, missing_text="—") + .tab_style(style=column_name_style, locations=loc.body(columns="columns")) + .tab_style(style=reason_label_style, locations=loc.column_labels(columns=reason_order)) ) else: title = "Missing Values by Reason" - subtitle = "Counts and percentages of complete values and each missing reason, per column." gt_tbl = ( GT(breakdown_df) - .tab_header( - title=html(f"
{title}
"), subtitle=subtitle - ) + .tab_header(title=title, subtitle=html(combined_subtitle)) .opt_table_font(font=google_font(name="IBM Plex Sans")) .opt_align_table_header(align="left") .cols_label(cases=cols_labels) @@ -2846,12 +2864,15 @@ def _fmt(count: int) -> str: style=style.text(font=google_font(name="IBM Plex Mono"), size="12px"), locations=loc.body(columns=value_columns), ) - .tab_style( - style=style.text(weight="bold"), - locations=loc.body(columns="columns"), - ) + .tab_style(style=column_name_style, locations=loc.body(columns="columns")) + .tab_style(style=reason_label_style, locations=loc.column_labels(columns=reason_order)) ) + # Group only the coded reasons under a "Missing Reasons" spanner. Raw nulls live in the fixed + # "Null" column (styled like "Complete"), so they aren't mistaken for declared spec reasons. + if reason_order: + gt_tbl = gt_tbl.tab_spanner(label="Missing Reasons", columns=reason_order) + if version("great_tables") >= "0.17.0": gt_tbl = gt_tbl.tab_options(quarto_disable_processing=True) @@ -2873,7 +2894,17 @@ def missing_vals_tbl( column. When a `missing=` mapping of columns to [`MissingSpec`](`pointblank.MissingSpec`) objects is supplied, the function instead renders a *structured missingness* breakdown: one row per column with the count and percentage of complete values and of each missing *reason* (e.g., - "Refused", "Not Asked", "Unknown"). + `refused`, `not_asked`). Declared (coded) reasons are grouped under a "Missing Reasons" spanner + and keep their raw input form as labels; actual `Null`/`None`/`NA` values (which are not part of + the spec) are tallied in a fixed "Null" column at the far right (styled like "Complete"), so + they aren't mistaken for declared reasons. + + Note that supplying `missing=` produces a *different report* than the default view: it is a + distinct visualization (a per-reason breakdown table, or a per-reason heatmap with + `as_heatmap=True`), not an annotated version of the default sector heatmap. 
The report titles + differ accordingly ("Missing Values" for the default, "Missing Values by Reason" or "Missing + Pattern Heatmap" for the structured views), and the shared header/title styling makes the family + resemblance clear. Parameters ---------- @@ -2884,7 +2915,10 @@ def missing_vals_tbl( missing An optional dictionary mapping column names to [`MissingSpec`](`pointblank.MissingSpec`) objects. When provided, the function renders a structured breakdown of missingness by - reason for the specified columns (rather than the default sector heatmap). + reason for the specified columns (rather than the default sector heatmap). The reason + columns are the union of reasons across the supplied specs; a reason that isn't defined for + a given column is shown as an em dash (not applicable), as distinct from a defined-but-unobserved + reason (shown as `0 (0%)`). as_heatmap Only applies when `missing=` is provided. When `True`, render the per-reason proportions as a color-coded heatmap (cells shaded from light to dark by the proportion missing) instead of diff --git a/tests/test_missing_vals_tbl_structured.py b/tests/test_missing_vals_tbl_structured.py index fde123bbe..ae46963b0 100644 --- a/tests/test_missing_vals_tbl_structured.py +++ b/tests/test_missing_vals_tbl_structured.py @@ -31,16 +31,55 @@ def test_returns_gt(self, tbl_pl, specs): def test_reason_columns_present(self, tbl_pl, specs): html = pb.missing_vals_tbl(tbl_pl, missing=specs).as_raw_html() + # Coded reason labels keep their raw input form (snake_case), grouped under a spanner for token in [ - "Not Asked", - "Refused", - "Dont Know", - "Below Threshold", - "Unknown", + "not_asked", + "refused", + "dont_know", + "below_threshold", "Complete", - "Total N", + "Null", # fixed column for raw nulls (not a reason) + "Missing Reasons", # spanner over the coded reason columns only ]: assert token in html + # Labels are not prettified to Title Case + assert "Not Asked" not in html and "Below Threshold" not in 
html + # The redundant "Total N" column was removed (row count is in the header) + assert "Total N" not in html + + def test_null_is_fixed_column_not_a_reason(self, tbl_pl, specs): + # Raw nulls appear in a fixed "Null" column, not as an "unknown" reason under the spanner + html = pb.missing_vals_tbl(tbl_pl, missing=specs).as_raw_html() + assert "Null" in html + assert "unknown" not in html # the null_reason label is not shown + # "Null" is a fixed column to the right of the coded reasons + gt = pb.missing_vals_tbl(tbl_pl, missing=specs) + cols = list(gt._tbl_data.columns) + assert cols[-1] == "null" + assert cols.index("null") > cols.index("below_threshold") + + def test_no_null_column_when_null_not_missing(self): + # null_is_missing=False -> no "Null" column and no "unknown" text + tbl = pl.DataFrame({"age": [34, -98, 41, None]}) + spec = {"age": pb.MissingSpec(reasons={-98: "refused"}, null_is_missing=False)} + gt = pb.missing_vals_tbl(tbl, missing=spec) + assert "null" not in list(gt._tbl_data.columns) + html = gt.as_raw_html() + assert "unknown" not in html + + def test_null_column_em_dash_when_not_applicable(self): + # When one spec counts nulls and another doesn't, the Null column shows an em dash for the + # column whose spec sets null_is_missing=False + tbl = pl.DataFrame({"a": [1, -99, None], "b": [1, -99, None]}) + specs = { + "a": pb.MissingSpec(reasons={-99: "not_asked"}), # null_is_missing=True + "b": pb.MissingSpec(reasons={-99: "not_asked"}, null_is_missing=False), + } + gt = pb.missing_vals_tbl(tbl, missing=specs) + null_vals = list(gt._tbl_data["null"]) + # column "a" counts its 1 null; column "b" is not applicable (em dash) + assert null_vals[0] == "1 (33%)" + assert null_vals[1] == "—" def test_counts_correct(self, tbl_pl): # age: total 8 -> refused 2 (25%), not_asked 1 (12%), dont_know 0 (0%), @@ -57,7 +96,7 @@ def test_null_excluded_when_spec_says_so(self): tbl = pl.DataFrame({"age": [34, -98, 41, None]}) spec = pb.MissingSpec(reasons={-98: 
"refused"}, null_is_missing=False) html = pb.missing_vals_tbl(tbl, missing={"age": spec}).as_raw_html() - assert "Unknown" not in html + assert "unknown" not in html # complete = 3 (null + 2 reals) of 4 = 75% assert "3 (75%)" in html @@ -94,7 +133,8 @@ def test_heatmap_returns_gt(self, tbl_pl, specs): def test_heatmap_title_and_labels(self, tbl_pl, specs): html = pb.missing_vals_tbl(tbl_pl, missing=specs, as_heatmap=True).as_raw_html() assert "Missing Pattern Heatmap" in html - assert "Refused" in html and "Below Threshold" in html + assert "refused" in html and "below_threshold" in html + assert "Missing Reasons" in html # spanner over reason columns assert "%" in html # proportions formatted as percentages def test_heatmap_pandas(self, specs): @@ -109,3 +149,44 @@ def test_heatmap_pandas(self, specs): def test_as_heatmap_ignored_without_missing(self, tbl_pl): # as_heatmap only applies with missing=; default sector view still returned assert isinstance(pb.missing_vals_tbl(tbl_pl, as_heatmap=True), GT) + + +class TestStyledLikeOriginal: + """The structured/heatmap outputs should reuse the original report's title style and the + monospaced left Column column.""" + + def test_table_mode_styling(self, tbl_pl, specs): + html = pb.missing_vals_tbl(tbl_pl, missing=specs).as_raw_html() + # Monospaced font present (left Column column + value columns) + assert "IBM Plex Mono" in html + # Header carries the table type + dimensions subtitle (as the default report does) + assert "rows" in html.lower() or "columns" in html.lower() + # Plain title (no shrunk font-size wrapper as before) + assert "
Missing Values by Reason" not in html + + def test_heatmap_mode_styling(self, tbl_pl, specs): + html = pb.missing_vals_tbl(tbl_pl, missing=specs, as_heatmap=True).as_raw_html() + assert "IBM Plex Mono" in html + assert "
Missing Pattern Heatmap" not in html + + +class TestNonApplicableReasons: + """Reasons not defined in a column's spec should render as an em dash, not '0 (0%)'.""" + + def test_table_mode_em_dash(self, tbl_pl, specs): + html = pb.missing_vals_tbl(tbl_pl, missing=specs).as_raw_html() + # age has no "below_threshold"; income has no "refused"/"dont_know" -> 3 em dashes + assert html.count("—") == 3 + # age DOES define "dont_know" but observes none -> should still show "0 (0%)" + assert "0 (0%)" in html + + def test_heatmap_mode_em_dash(self, tbl_pl, specs): + html = pb.missing_vals_tbl(tbl_pl, missing=specs, as_heatmap=True).as_raw_html() + assert html.count("—") == 3 + + def test_single_spec_no_em_dash(self): + # With one spec, every reason in the union applies -> no em dashes + tbl = pl.DataFrame({"age": [34, -98, 41, -99]}) + spec = {"age": pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"})} + html = pb.missing_vals_tbl(tbl, missing=spec).as_raw_html() + assert "—" not in html diff --git a/user_guide/05-data-inspection/03-missing-vals-tbl.qmd b/user_guide/05-data-inspection/03-missing-vals-tbl.qmd index a4fafbf76..f47c264ff 100644 --- a/user_guide/05-data-inspection/03-missing-vals-tbl.qmd +++ b/user_guide/05-data-inspection/03-missing-vals-tbl.qmd @@ -94,6 +94,16 @@ When you pass a `missing=` mapping of column names to `MissingSpec` objects, `mi switches from the sector heatmap to a *structured breakdown*: one row per column with the count and percentage of complete values and of each missing reason. +::: {.callout-note} +## Supplying `missing=` produces a different report + +The structured breakdown is a *distinct visualization*, not an annotated version of the default +sector heatmap. Adding `missing=` changes the table's whole layout. 
The report title changes too +(from "Missing Values" to "Missing Values by Reason", or "Missing Pattern Heatmap" with +`as_heatmap=True`), and the shared title styling and monospaced column list keep the two views +recognizably part of the same family. +::: + ```{python} import polars as pl @@ -112,9 +122,17 @@ specs = { pb.missing_vals_tbl(survey, missing=specs) ``` -Each `MissingSpec` declares the sentinel values for a column and the reason each one represents. By -default, actual `Null` values are also counted as missing (with the reason `"unknown"`); set -`null_is_missing=False` on the spec if raw nulls should be treated as real values instead. +Each `MissingSpec` declares the sentinel values for a column and the reason each one represents. +Those declared (coded) reasons are grouped under the **Missing Reasons** spanner. By default, actual +`Null` values are also counted as missing; because those are raw `Null`/`None`/`NA` values and *not* +part of the spec, they're tallied in a fixed **Null** column at the far right (styled like +**Complete**), rather than as a reason. Set `null_is_missing=False` on the spec if raw nulls should +be treated as real values instead — then there's no **Null** column at all. + +The reason columns are the *union* of reasons across all the specs you provide. When a reason isn't +defined for a particular column, that cell shows an em dash (`—`) rather than `0`. This signals +"not applicable to this column", as distinct from a reason that *is* defined but simply wasn't +observed (which shows `0 (0%)`). 
### Viewing the pattern as a heatmap From e5c12777d56f6477cba060dc21bfd3c61c56f2e8 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 15:37:45 -0400 Subject: [PATCH 49/55] Perform some code formatting --- pointblank/missing.py | 11 +++++++---- pointblank/validate.py | 35 +++++++++++++++-------------------- pointblank/yaml.py | 1 + 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/pointblank/missing.py b/pointblank/missing.py index de533d560..4d2814aac 100644 --- a/pointblank/missing.py +++ b/pointblank/missing.py @@ -140,9 +140,7 @@ def _validate(self) -> None: ) if not isinstance(self.null_reason, str): - raise TypeError( - f"null_reason must be a string, got {type(self.null_reason).__name__}." - ) + raise TypeError(f"null_reason must be a string, got {type(self.null_reason).__name__}.") if self.categories is not None: if not isinstance(self.categories, dict): @@ -309,7 +307,12 @@ def from_cdisc_null_flavors( """ reasons = dict(_CDISC_NULL_FLAVORS) categories = { - "unknown": ["no_information", "unknown", "asked_but_unknown", "temporarily_unavailable"], + "unknown": [ + "no_information", + "unknown", + "asked_but_unknown", + "temporarily_unavailable", + ], "not_applicable": ["not_applicable", "not_asked", "not_present"], "boundary": ["positive_infinity", "negative_infinity"], } diff --git a/pointblank/validate.py b/pointblank/validate.py index 084211b98..2787bed5d 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -2720,9 +2720,7 @@ def _build_structured_missing_tbl( records: list[dict[str, Any]] = [] for column, spec in missing.items(): if column not in available_columns: - raise ValueError( - f"Column '{column}' given in `missing=` was not found in the table." 
- ) + raise ValueError(f"Column '{column}' given in `missing=` was not found in the table.") # One aggregation per declared reason (sentinel values only), plus a separate raw-null # count when the spec treats nulls as missing; coded reasons and raw nulls are kept distinct @@ -10814,9 +10812,7 @@ def col_pct_missing( _check_active_input(param=active, param_name="active") if not isinstance(missing, MissingSpec): - raise TypeError( - f"`missing=` must be a MissingSpec, got {type(missing).__name__}." - ) + raise TypeError(f"`missing=` must be a MissingSpec, got {type(missing).__name__}.") if reason is not None and category is not None: raise ValueError("Only one of `reason=` or `category=` can be specified.") @@ -11026,9 +11022,7 @@ def col_missing_coded( _check_active_input(param=active, param_name="active") if not isinstance(missing, MissingSpec): - raise TypeError( - f"`missing=` must be a MissingSpec, got {type(missing).__name__}." - ) + raise TypeError(f"`missing=` must be a MissingSpec, got {type(missing).__name__}.") # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( @@ -11214,9 +11208,7 @@ def col_missing_only_coded( _check_active_input(param=active, param_name="active") if not isinstance(missing, MissingSpec): - raise TypeError( - f"`missing=` must be a MissingSpec, got {type(missing).__name__}." - ) + raise TypeError(f"`missing=` must be a MissingSpec, got {type(missing).__name__}.") if allowed is None and min_val is None and max_val is None: raise ValueError( @@ -11916,17 +11908,13 @@ def col_missing_consistent( _check_active_input(param=active, param_name="active") if not isinstance(missing, MissingSpec): - raise TypeError( - f"`missing=` must be a MissingSpec, got {type(missing).__name__}." 
- ) + raise TypeError(f"`missing=` must be a MissingSpec, got {type(missing).__name__}.") if isinstance(columns, str): columns = [columns] columns = list(columns) if len(columns) < 2: - raise ValueError( - "`col_missing_consistent()` requires at least two columns to compare." - ) + raise ValueError("`col_missing_consistent()` requires at least two columns to compare.") # Resolve which sentinel values (and whether nulls) represent `when_reason` sentinels = missing.values_for_reason(when_reason) @@ -18981,7 +18969,10 @@ def get_step_report( # if get_row_count(extract) == 0: # return "No rows were extracted." - if assertion_type in ROW_BASED_VALIDATION_TYPES + ["rows_complete", "col_missing_consistent"]: + if assertion_type in ROW_BASED_VALIDATION_TYPES + [ + "rows_complete", + "col_missing_consistent", + ]: # Get the extracted data for the step extract = self.get_data_extracts(i=i, frame=True) @@ -19069,7 +19060,11 @@ def get_step_report( if step_spec is None and isinstance(values, MissingSpec): # col_missing_coded stores the spec directly in `values` step_spec = values - if step_spec is None and isinstance(values, dict) and isinstance(values.get("spec"), MissingSpec): + if ( + step_spec is None + and isinstance(values, dict) + and isinstance(values.get("spec"), MissingSpec) + ): # col_missing_only_coded and col_missing_consistent stash the spec under `values["spec"]` step_spec = values["spec"] if step_spec is not None and step_report is not None: diff --git a/pointblank/yaml.py b/pointblank/yaml.py index 6a9e39c26..22ef133a8 100644 --- a/pointblank/yaml.py +++ b/pointblank/yaml.py @@ -41,6 +41,7 @@ def _missing_spec_to_code(spec: MissingSpec) -> str: parts.append(f"description={spec.description!r}") return f"pb.MissingSpec({', '.join(parts)})" + if TYPE_CHECKING: from typing import Literal From 09f75520851a314719ec45901e204a91ba553c8d Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 15:37:49 -0400 Subject: [PATCH 50/55] Update 
test_col_missing_consistent.py --- tests/test_col_missing_consistent.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/test_col_missing_consistent.py b/tests/test_col_missing_consistent.py index 50c424ba4..1bcedce26 100644 --- a/tests/test_col_missing_consistent.py +++ b/tests/test_col_missing_consistent.py @@ -31,9 +31,7 @@ def test_basic_inconsistency(self, spec): assert info.n_failed == 1 # last row: only one column is -99 def test_all_consistent_passes(self, spec): - tbl = pl.DataFrame( - {"a": [1, -99, 2, -99], "b": [5, -99, 6, -99]} - ) + tbl = pl.DataFrame({"a": [1, -99, 2, -99], "b": [5, -99, 6, -99]}) v = ( pb.Validate(data=tbl) .col_missing_consistent(columns=["a", "b"], missing=spec, when_reason="not_asked") @@ -54,9 +52,7 @@ def test_null_reason_consistency(self): assert _info(v).n_failed == 1 def test_three_columns(self, spec): - tbl = pl.DataFrame( - {"a": [-99, 1, -99], "b": [-99, 2, -99], "c": [-99, 3, 7]} - ) + tbl = pl.DataFrame({"a": [-99, 1, -99], "b": [-99, 2, -99], "c": [-99, 3, 7]}) v = ( pb.Validate(data=tbl) .col_missing_consistent(columns=["a", "b", "c"], missing=spec, when_reason="not_asked") @@ -80,9 +76,7 @@ def test_missing_must_be_spec(self): ) def test_pandas_backend(self, spec): - tbl = pd.DataFrame( - {"a": [1, -99, -99], "b": [5, -99, 6]} - ) + tbl = pd.DataFrame({"a": [1, -99, -99], "b": [5, -99, 6]}) v = ( pb.Validate(data=tbl) .col_missing_consistent(columns=["a", "b"], missing=spec, when_reason="not_asked") From 222bb6254bc0fedc02429b63a5b05e0f7e76b84e Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 15:37:51 -0400 Subject: [PATCH 51/55] Update test_col_missing_only_coded.py --- tests/test_col_missing_only_coded.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/test_col_missing_only_coded.py b/tests/test_col_missing_only_coded.py index 7bb1bbbbb..55bf83b80 100644 --- a/tests/test_col_missing_only_coded.py +++ 
b/tests/test_col_missing_only_coded.py @@ -94,9 +94,7 @@ def test_report_and_step_report(self, spec): tbl = pl.DataFrame({"age": [34, -98, -95, 41]}) v = ( pb.Validate(data=tbl) - .col_missing_only_coded( - columns="age", missing=spec, min_val=0, max_val=120, brief=True - ) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120, brief=True) .interrogate() ) assert v.get_tabular_report() is not None @@ -107,9 +105,7 @@ def test_brief_langs(self, spec, lang): tbl = pl.DataFrame({"age": [34, -95]}) v = ( pb.Validate(data=tbl, lang=lang) - .col_missing_only_coded( - columns="age", missing=spec, min_val=0, max_val=120, brief=True - ) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120, brief=True) .interrogate() ) assert _info(v).autobrief From 476934c0a4a17fa958d5ac1f26e46c437cc42194 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 15:37:54 -0400 Subject: [PATCH 52/55] Update test_col_pct_missing.py --- tests/test_col_pct_missing.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_col_pct_missing.py b/tests/test_col_pct_missing.py index a1c7d4b78..4dbce285f 100644 --- a/tests/test_col_pct_missing.py +++ b/tests/test_col_pct_missing.py @@ -123,9 +123,7 @@ def test_missing_must_be_missingspec(self, survey_tbl): ) def test_multiple_columns(self, age_missing): - tbl = pl.DataFrame( - {"a": [1, -98, 3, 4], "b": [-99, -99, 3, 4]} - ) + tbl = pl.DataFrame({"a": [1, -98, 3, 4], "b": [-99, -99, 3, 4]}) validation = ( pb.Validate(data=tbl) .col_pct_missing(columns=["a", "b"], missing=age_missing, max_pct=0.5) From 8bc28846fed8987626fa95d2bac3179ce0356272 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 15:37:58 -0400 Subject: [PATCH 53/55] Update test_missing_report_integration.py --- tests/test_missing_report_integration.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/test_missing_report_integration.py 
b/tests/test_missing_report_integration.py index edd08cda2..6751ec13e 100644 --- a/tests/test_missing_report_integration.py +++ b/tests/test_missing_report_integration.py @@ -20,11 +20,7 @@ def test_tabular_report_annotates_missing_aware_steps(): def test_tabular_report_no_annotation_without_missing(): tbl = pl.DataFrame({"age": [34, -98, 41, 200]}) - v = ( - pb.Validate(data=tbl) - .col_vals_between(columns="age", left=0, right=120) - .interrogate() - ) + v = pb.Validate(data=tbl).col_vals_between(columns="age", left=0, right=120).interrogate() html = v.get_tabular_report().as_raw_html() assert "MISSING-AWARE" not in html assert "Missing codes" not in html From 096027aac271c3b53a30d613c9d050b8ed9f64c1 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 15:46:36 -0400 Subject: [PATCH 54/55] Regenerate validate.pyi for the _ValidationInfo.missing field --- pointblank/validate.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/pointblank/validate.pyi b/pointblank/validate.pyi index 25e1714b9..72df3911c 100644 --- a/pointblank/validate.pyi +++ b/pointblank/validate.pyi @@ -106,6 +106,7 @@ class _ValidationInfo: values: Any | list[Any] | tuple | None = ... inclusive: tuple[bool, bool] | None = ... na_pass: bool | None = ... + missing: Any | None = ... pre: Callable | None = ... segments: Any | None = ... thresholds: Thresholds | None = ... 
From 5dfd4646019bc8964232069d553e06223864e54b Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Wed, 17 Jun 2026 16:11:15 -0400 Subject: [PATCH 55/55] Cap pandas <3 in test/dev deps to match the pinned duckdb --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2b4c897ac..314790628 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,7 @@ docs = [ "jupyter", "nbclient>=0.10.0", "nbformat>=5.10.4", - "pandas>=2.2.3", + "pandas>=2.2.3,<3", # <3: pandas 3.0's default `str` dtype is unsupported by the pinned duckdb (<1.3.3) "polars>=1.17.1", "pyspark==3.5.6", "openpyxl>=3.0.0", @@ -94,7 +94,7 @@ dev = [ "jupyter", "nbclient>=0.10.0", "nbformat>=5.10.4", - "pandas>=2.2.3", + "pandas>=2.2.3,<3", # <3: pandas 3.0's default `str` dtype is unsupported by the pinned duckdb (<1.3.3) "polars>=1.17.1", "pre-commit==2.15.0", "pyarrow",