diff --git a/great-docs.yml b/great-docs.yml index b1e6377e7..4f445c591 100644 --- a/great-docs.yml +++ b/great-docs.yml @@ -131,6 +131,8 @@ reference: members: false - name: DraftValidation members: false + - name: MissingSpec + members: true - title: Contracts and Pipelines desc: > @@ -189,6 +191,10 @@ reference: - Validate.col_vals_expr - Validate.col_exists - Validate.col_pct_null + - Validate.col_pct_missing + - Validate.col_missing_coded + - Validate.col_missing_only_coded + - Validate.col_missing_consistent - Validate.rows_distinct - Validate.rows_complete - Validate.col_schema_match diff --git a/pointblank/__init__.py b/pointblank/__init__.py index e8d37b872..231e8233f 100644 --- a/pointblank/__init__.py +++ b/pointblank/__init__.py @@ -57,6 +57,7 @@ from pointblank.generate.base import GeneratorConfig from pointblank.inspect import has_columns, has_rows from pointblank.integrations.otel import emit_otel +from pointblank.missing import MissingSpec from pointblank.metadata import ( ADaMDatasetTemplate, ADaMVariableSpec, @@ -120,6 +121,7 @@ "PipelineResult", "DataScan", "DraftValidation", + "MissingSpec", "col", "ref", "expr_col", diff --git a/pointblank/_constants.py b/pointblank/_constants.py index 204ef412e..9b67872a2 100644 --- a/pointblank/_constants.py +++ b/pointblank/_constants.py @@ -21,10 +21,16 @@ "within_spec": ["str"], "null": ["str", "numeric", "bool", "datetime", "duration"], "not_null": ["str", "numeric", "bool", "datetime", "duration"], + "missing_coded": ["str", "numeric", "bool", "datetime", "duration"], + "missing_only_coded": ["str", "numeric", "bool", "datetime", "duration"], } ASSERTION_TYPE_METHOD_MAP: dict[str, str] = { "col_pct_null": "pct_null", + "col_pct_missing": "pct_missing", + "col_missing_coded": "missing_coded", + "col_missing_only_coded": "missing_only_coded", + "col_missing_consistent": "missing_consistent", "col_vals_gt": "gt", "col_vals_lt": "lt", "col_vals_eq": "eq", @@ -91,6 +97,8 @@ "col_vals_decreasing", 
"col_vals_null", "col_vals_not_null", + "col_missing_coded", + "col_missing_only_coded", "col_vals_expr", "conjointly", "prompt", @@ -640,6 +648,18 @@ +""", + "col_pct_missing": """ + + pct_missing + + + + + + + + """, "col_vals_not_null": """ @@ -652,6 +672,36 @@ +""", + "col_missing_coded": """ + + col_missing_coded + + + + + + +""", + "col_missing_only_coded": """ + + col_missing_only_coded + + + + + + +""", + "col_missing_consistent": """ + + col_missing_consistent + + + + + + """, "col_vals_regex": """ diff --git a/pointblank/_constants_translations.py b/pointblank/_constants_translations.py index cb968d0ff..14f57c58a 100644 --- a/pointblank/_constants_translations.py +++ b/pointblank/_constants_translations.py @@ -1049,6 +1049,342 @@ "th": "เปอร์เซ็นต์ของค่า null ใน {column_text} ไม่อยู่ภายใน [{lower}%, {upper}%]", "fa": "درصد مقادیر null در {column_text} در محدوده [{lower}%, {upper}%] نبود.", }, + "col_pct_missing_expectation_text": { + "en": "Expect that the percentage of missing values in {column_text} is at most {max_pct}%.", + "fr": "On s'attend à ce que le pourcentage de valeurs manquantes dans {column_text} soit d'au plus {max_pct}%.", + "de": "Erwarten Sie, dass der Prozentsatz der fehlenden Werte in {column_text} höchstens {max_pct}% beträgt.", + "it": "Aspettatevi che la percentuale di valori mancanti in {column_text} sia al massimo {max_pct}%.", + "es": "Se espera que el porcentaje de valores faltantes en {column_text} sea como máximo {max_pct}%.", + "pt": "Espera-se que a porcentagem de valores ausentes em {column_text} seja no máximo {max_pct}%.", + "ro": "Se așteaptă ca procentul valorilor lipsă în {column_text} să fie cel mult {max_pct}%.", + "tr": "{column_text} içindeki eksik değerlerin yüzdesinin en fazla {max_pct}% olmasını bekleyin.", + "zh-Hans": "预期{column_text}中缺失值的百分比最多为{max_pct}%。", + "zh-Hant": "{column_text}中缺失值的百分比應最多為{max_pct}%。", + "ja": "{column_text}の欠損値の割合が最大{max_pct}%であることを期待します。", + "ko": "{column_text}의 결측값 비율이 최대 
{max_pct}%이어야 합니다.", + "vi": "Kỳ vọng tỷ lệ phần trăm giá trị thiếu trong {column_text} tối đa là {max_pct}%.", + "ru": "Ожидается, что процент отсутствующих значений в {column_text} составит не более {max_pct}%.", + "cs": "Očekává se, že procento chybějících hodnot ve sloupci {column_text} bude nejvýše {max_pct}%.", + "pl": "Oczekuje się, że procent brakujących wartości w {column_text} wyniesie co najwyżej {max_pct}%.", + "da": "Forvent, at procentdelen af manglende værdier i {column_text} højst er {max_pct}%.", + "sv": "Förvänta dig att andelen saknade värden i {column_text} är högst {max_pct}%.", + "nb": "Forvent at prosentandelen av manglende verdier i {column_text} er høyst {max_pct}%.", + "nl": "Verwacht dat het percentage ontbrekende waarden in {column_text} hoogstens {max_pct}% is.", + "fi": "Odota, että puuttuvien arvojen prosenttiosuus sarakkeessa {column_text} on enintään {max_pct}%.", + "is": "Væntir þess að hlutfall vantandi gilda í {column_text} sé í mesta lagi {max_pct}%.", + "ar": "توقع أن تكون نسبة القيم المفقودة في {column_text} {max_pct}% على الأكثر.", + "hi": "अपेक्षा है कि {column_text} में अनुपस्थित मानों का प्रतिशत अधिकतम {max_pct}% होना चाहिए।", + "el": "Αναμένεται το ποσοστό των ελλιπών τιμών στη στήλη {column_text} να είναι το πολύ {max_pct}%.", + "id": "Mengharapkan bahwa persentase nilai yang hilang dalam {column_text} paling banyak {max_pct}%.", + "uk": "Очікується, що відсоток відсутніх значень в {column_text} становитиме не більше {max_pct}%.", + "bg": "Очаква се процентът на липсващите стойности в {column_text} да бъде най-много {max_pct}%.", + "hr": "Očekuje se da postotak nedostajućih vrijednosti u {column_text} bude najviše {max_pct}%.", + "et": "Eeldatakse, et puuduvate väärtuste protsent veerus {column_text} on kõige rohkem {max_pct}%.", + "hu": "Elvárás, hogy a hiányzó értékek aránya a {column_text} oszlopban legfeljebb {max_pct}% legyen.", + "ga": "Táthar ag súil go mbeadh céatadán na luachanna ar iarraidh i {column_text} ar a 
mhéad {max_pct}%.", + "lv": "Tiek sagaidīts, ka trūkstošo vērtību procents {column_text} būs ne vairāk kā {max_pct}%.", + "lt": "Tikimasi, kad trūkstamų reikšmių procentas stulpelyje {column_text} bus ne daugiau kaip {max_pct}%.", + "mt": "Mistenni li l-perċentwal ta' valuri nieqsa f'{column_text} huwa l-aktar {max_pct}%.", + "sk": "Očakáva sa, že percento chýbajúcich hodnôt v {column_text} bude najviac {max_pct}%.", + "sl": "Pričakuje se, da bo odstotek manjkajočih vrednosti v {column_text} največ {max_pct}%.", + "he": "צפוי שאחוז הערכים החסרים ב{column_text} יהיה לכל היותר {max_pct}%.", + "th": "คาดหวังว่าเปอร์เซ็นต์ของค่าที่หายไปใน {column_text} จะไม่เกิน {max_pct}%", + "fa": "انتظار می‌رود که درصد مقادیر مفقود در {column_text} حداکثر {max_pct}% باشد.", + }, + "col_pct_missing_failure_text": { + "en": "The percentage of missing values in {column_text} exceeded {max_pct}%.", + "fr": "Le pourcentage de valeurs manquantes dans {column_text} a dépassé {max_pct}%.", + "de": "Der Prozentsatz der fehlenden Werte in {column_text} überschritt {max_pct}%.", + "it": "La percentuale di valori mancanti in {column_text} ha superato {max_pct}%.", + "es": "El porcentaje de valores faltantes en {column_text} superó {max_pct}%.", + "pt": "A porcentagem de valores ausentes em {column_text} excedeu {max_pct}%.", + "ro": "Procentul valorilor lipsă în {column_text} a depășit {max_pct}%.", + "tr": "{column_text} içindeki eksik değerlerin yüzdesi {max_pct}% değerini aştı.", + "zh-Hans": "{column_text}中缺失值的百分比超过了{max_pct}%。", + "zh-Hant": "{column_text}中缺失值的百分比超過了{max_pct}%。", + "ja": "{column_text}の欠損値の割合が{max_pct}%を超えました。", + "ko": "{column_text}의 결측값 비율이 {max_pct}%를 초과했습니다.", + "vi": "Tỷ lệ phần trăm giá trị thiếu trong {column_text} đã vượt quá {max_pct}%.", + "ru": "Процент отсутствующих значений в {column_text} превысил {max_pct}%.", + "cs": "Procento chybějících hodnot ve sloupci {column_text} překročilo {max_pct}%.", + "pl": "Procent brakujących wartości w {column_text} 
przekroczył {max_pct}%.", + "da": "Procentdelen af manglende værdier i {column_text} oversteg {max_pct}%.", + "sv": "Andelen saknade värden i {column_text} översteg {max_pct}%.", + "nb": "Prosentandelen av manglende verdier i {column_text} oversteg {max_pct}%.", + "nl": "Het percentage ontbrekende waarden in {column_text} overschreed {max_pct}%.", + "fi": "Puuttuvien arvojen prosenttiosuus sarakkeessa {column_text} ylitti {max_pct}%.", + "is": "Hlutfall vantandi gilda í {column_text} fór yfir {max_pct}%.", + "ar": "تجاوزت نسبة القيم المفقودة في {column_text} {max_pct}%.", + "hi": "{column_text} में अनुपस्थित मानों का प्रतिशत {max_pct}% से अधिक था।", + "el": "Το ποσοστό των ελλιπών τιμών στη στήλη {column_text} ξεπέρασε {max_pct}%.", + "id": "Persentase nilai yang hilang dalam {column_text} melebihi {max_pct}%.", + "uk": "Відсоток відсутніх значень в {column_text} перевищив {max_pct}%.", + "bg": "Процентът на липсващите стойности в {column_text} надхвърли {max_pct}%.", + "hr": "Postotak nedostajućih vrijednosti u {column_text} premašio je {max_pct}%.", + "et": "Puuduvate väärtuste protsent veerus {column_text} ületas {max_pct}%.", + "hu": "A hiányzó értékek aránya a {column_text} oszlopban meghaladta a {max_pct}%-ot.", + "ga": "Sháraigh céatadán na luachanna ar iarraidh i {column_text} {max_pct}%.", + "lv": "Trūkstošo vērtību procents {column_text} pārsniedza {max_pct}%.", + "lt": "Trūkstamų reikšmių procentas stulpelyje {column_text} viršijo {max_pct}%.", + "mt": "Il-perċentwal ta' valuri nieqsa f'{column_text} qabeż {max_pct}%.", + "sk": "Percento chýbajúcich hodnôt v {column_text} prekročilo {max_pct}%.", + "sl": "Odstotek manjkajočih vrednosti v {column_text} je presegel {max_pct}%.", + "he": "אחוז הערכים החסרים ב{column_text} חרג מ-{max_pct}%.", + "th": "เปอร์เซ็นต์ของค่าที่หายไปใน {column_text} เกิน {max_pct}%", + "fa": "درصد مقادیر مفقود در {column_text} از {max_pct}% فراتر رفت.", + }, + "col_missing_coded_expectation_text": { + "en": "Expect that all missing 
values in {column_text} are coded (no uncoded Null values).", + "fr": "On s'attend à ce que toutes les valeurs manquantes dans {column_text} soient codées (aucune valeur nulle non codée).", + "de": "Erwarten Sie, dass alle fehlenden Werte in {column_text} kodiert sind (keine unkodierten Nullwerte).", + "it": "Aspettatevi che tutti i valori mancanti in {column_text} siano codificati (nessun valore nullo non codificato).", + "es": "Se espera que todos los valores faltantes en {column_text} estén codificados (sin valores nulos no codificados).", + "pt": "Espera-se que todos os valores ausentes em {column_text} estejam codificados (sem valores nulos não codificados).", + "ro": "Se așteaptă ca toate valorile lipsă în {column_text} să fie codificate (fără valori nule necodificate).", + "tr": "{column_text} içindeki tüm eksik değerlerin kodlanmış olmasını bekleyin (kodlanmamış boş değer yok).", + "zh-Hans": "预期{column_text}中所有缺失值都已编码(没有未编码的空值)。", + "zh-Hant": "{column_text}中所有缺失值都應已編碼(沒有未編碼的空值)。", + "ja": "{column_text}のすべての欠損値がコード化されていることを期待します(コード化されていないnull値がない)。", + "ko": "{column_text}의 모든 결측값이 코드화되어 있어야 합니다(코드화되지 않은 null 값 없음).", + "vi": "Kỳ vọng tất cả giá trị thiếu trong {column_text} đều được mã hóa (không có giá trị null chưa mã hóa).", + "ru": "Ожидается, что все отсутствующие значения в {column_text} закодированы (нет незакодированных нулевых значений).", + "cs": "Očekává se, že všechny chybějící hodnoty ve sloupci {column_text} jsou zakódované (žádné nezakódované null hodnoty).", + "pl": "Oczekuje się, że wszystkie brakujące wartości w {column_text} są zakodowane (brak niezakodowanych wartości null).", + "da": "Forvent, at alle manglende værdier i {column_text} er kodede (ingen ukodede null-værdier).", + "sv": "Förvänta dig att alla saknade värden i {column_text} är kodade (inga okodade null-värden).", + "nb": "Forvent at alle manglende verdier i {column_text} er kodet (ingen ukodede null-verdier).", + "nl": "Verwacht dat alle ontbrekende waarden in 
{column_text} gecodeerd zijn (geen ongecodeerde null-waarden).", + "fi": "Odota, että kaikki puuttuvat arvot sarakkeessa {column_text} on koodattu (ei koodaamattomia null-arvoja).", + "is": "Væntir þess að öll vantandi gildi í {column_text} séu kóðuð (engin ókóðuð null-gildi).", + "ar": "توقع أن تكون جميع القيم المفقودة في {column_text} مرمّزة (لا توجد قيم فارغة غير مرمّزة).", + "hi": "अपेक्षा है कि {column_text} में सभी अनुपस्थित मान कोडित हों (कोई बिना कोडित null मान नहीं)।", + "el": "Αναμένεται όλες οι ελλιπείς τιμές στη στήλη {column_text} να είναι κωδικοποιημένες (καμία μη κωδικοποιημένη null τιμή).", + "id": "Mengharapkan bahwa semua nilai yang hilang dalam {column_text} dikodekan (tidak ada nilai null yang tidak dikodekan).", + "uk": "Очікується, що всі відсутні значення в {column_text} закодовані (немає незакодованих нульових значень).", + "bg": "Очаква се всички липсващи стойности в {column_text} да са кодирани (без некодирани null стойности).", + "hr": "Očekuje se da su sve nedostajuće vrijednosti u {column_text} kodirane (bez nekodiranih null vrijednosti).", + "et": "Eeldatakse, et kõik puuduvad väärtused veerus {column_text} on kodeeritud (kodeerimata null-väärtusi pole).", + "hu": "Elvárás, hogy a {column_text} oszlopban minden hiányzó érték kódolt legyen (nincs kódolatlan null érték).", + "ga": "Táthar ag súil go mbeadh gach luach ar iarraidh i {column_text} códaithe (gan aon luachanna null gan chódú).", + "lv": "Tiek sagaidīts, ka visas trūkstošās vērtības {column_text} ir kodētas (nav nekodētu null vērtību).", + "lt": "Tikimasi, kad visos trūkstamos reikšmės stulpelyje {column_text} yra užkoduotos (nėra neužkoduotų null reikšmių).", + "mt": "Mistenni li l-valuri nieqsa kollha f'{column_text} huma kodifikati (l-ebda valuri null mhux kodifikati).", + "sk": "Očakáva sa, že všetky chýbajúce hodnoty v {column_text} sú zakódované (žiadne nezakódované null hodnoty).", + "sl": "Pričakuje se, da so vse manjkajoče vrednosti v {column_text} kodirane (brez 
nekodiranih null vrednosti).", + "he": "צפוי שכל הערכים החסרים ב{column_text} יהיו מקודדים (אין ערכי null לא מקודדים).", + "th": "คาดหวังว่าค่าที่หายไปทั้งหมดใน {column_text} จะถูกเข้ารหัส (ไม่มีค่า null ที่ไม่ได้เข้ารหัส)", + "fa": "انتظار می‌رود که همه مقادیر مفقود در {column_text} کدگذاری شده باشند (هیچ مقدار null کدگذاری‌نشده‌ای وجود نداشته باشد).", + }, + "col_missing_coded_failure_text": { + "en": "Uncoded missing values (raw Null values) were present in {column_text}.", + "fr": "Des valeurs manquantes non codées (valeurs nulles brutes) étaient présentes dans {column_text}.", + "de": "Unkodierte fehlende Werte (rohe Nullwerte) waren in {column_text} vorhanden.", + "it": "Erano presenti valori mancanti non codificati (valori nulli grezzi) in {column_text}.", + "es": "Había valores faltantes no codificados (valores nulos sin procesar) en {column_text}.", + "pt": "Havia valores ausentes não codificados (valores nulos brutos) em {column_text}.", + "ro": "Valori lipsă necodificate (valori nule brute) au fost prezente în {column_text}.", + "tr": "{column_text} içinde kodlanmamış eksik değerler (ham boş değerler) mevcuttu.", + "zh-Hans": "{column_text}中存在未编码的缺失值(原始空值)。", + "zh-Hant": "{column_text}中存在未編碼的缺失值(原始空值)。", + "ja": "{column_text}にコード化されていない欠損値(生のnull値)が存在しました。", + "ko": "{column_text}에 코드화되지 않은 결측값(원시 null 값)이 있었습니다.", + "vi": "Có giá trị thiếu chưa mã hóa (giá trị null thô) trong {column_text}.", + "ru": "В {column_text} присутствовали незакодированные отсутствующие значения (необработанные нулевые значения).", + "cs": "Ve sloupci {column_text} byly přítomny nezakódované chybějící hodnoty (surové null hodnoty).", + "pl": "W {column_text} obecne były niezakodowane brakujące wartości (surowe wartości null).", + "da": "Ukodede manglende værdier (rå null-værdier) var til stede i {column_text}.", + "sv": "Okodade saknade värden (råa null-värden) förekom i {column_text}.", + "nb": "Ukodede manglende verdier (rå null-verdier) var til stede i {column_text}.", + 
"nl": "Ongecodeerde ontbrekende waarden (ruwe null-waarden) waren aanwezig in {column_text}.", + "fi": "Sarakkeessa {column_text} oli koodaamattomia puuttuvia arvoja (raakoja null-arvoja).", + "is": "Ókóðuð vantandi gildi (hrá null-gildi) voru til staðar í {column_text}.", + "ar": "كانت هناك قيم مفقودة غير مرمّزة (قيم فارغة خام) في {column_text}.", + "hi": "{column_text} में बिना कोडित अनुपस्थित मान (कच्चे null मान) मौजूद थे।", + "el": "Μη κωδικοποιημένες ελλιπείς τιμές (ακατέργαστες null τιμές) υπήρχαν στη στήλη {column_text}.", + "id": "Nilai yang hilang tidak dikodekan (nilai null mentah) ada dalam {column_text}.", + "uk": "У {column_text} були присутні незакодовані відсутні значення (необроблені нульові значення).", + "bg": "В {column_text} присъстваха некодирани липсващи стойности (необработени null стойности).", + "hr": "U {column_text} bile su prisutne nekodirane nedostajuće vrijednosti (sirove null vrijednosti).", + "et": "Veerus {column_text} esinesid kodeerimata puuduvad väärtused (toored null-väärtused).", + "hu": "A {column_text} oszlopban kódolatlan hiányzó értékek (nyers null értékek) voltak jelen.", + "ga": "Bhí luachanna ar iarraidh gan chódú (luachanna null amha) i láthair i {column_text}.", + "lv": "{column_text} bija nekodētas trūkstošās vērtības (neapstrādātas null vērtības).", + "lt": "Stulpelyje {column_text} buvo neužkoduotų trūkstamų reikšmių (neapdorotų null reikšmių).", + "mt": "Valuri nieqsa mhux kodifikati (valuri null mhux ipproċessati) kienu preżenti f'{column_text}.", + "sk": "V {column_text} sa vyskytli nezakódované chýbajúce hodnoty (surové null hodnoty).", + "sl": "V {column_text} so bile prisotne nekodirane manjkajoče vrednosti (surove null vrednosti).", + "he": "ערכים חסרים לא מקודדים (ערכי null גולמיים) היו נוכחים ב{column_text}.", + "th": "มีค่าที่หายไปที่ไม่ได้เข้ารหัส (ค่า null ดิบ) อยู่ใน {column_text}", + "fa": "مقادیر مفقود کدگذاری‌نشده (مقادیر null خام) در {column_text} وجود داشت.", + }, + 
"col_missing_only_coded_expectation_text": { + "en": "Expect that {column_text} contains only documented missing codes and legitimate values.", + "fr": "On s'attend à ce que {column_text} ne contienne que des codes de valeurs manquantes documentés et des valeurs légitimes.", + "de": "Erwarten Sie, dass {column_text} nur dokumentierte fehlende Codes und legitime Werte enthält.", + "it": "Aspettatevi che {column_text} contenga solo codici mancanti documentati e valori legittimi.", + "es": "Se espera que {column_text} contenga solo códigos de valores faltantes documentados y valores legítimos.", + "pt": "Espera-se que {column_text} contenha apenas códigos de valores ausentes documentados e valores legítimos.", + "ro": "Se așteaptă ca {column_text} să conțină doar coduri de valori lipsă documentate și valori legitime.", + "tr": "{column_text} öğesinin yalnızca belgelenmiş eksik kodları ve geçerli değerleri içermesini bekleyin.", + "zh-Hans": "预期{column_text}仅包含已记录的缺失值代码和合法值。", + "zh-Hant": "{column_text}應僅包含已記錄的缺失值代碼和合法值。", + "ja": "{column_text}に文書化された欠損コードと正当な値のみが含まれていることを期待します。", + "ko": "{column_text}에 문서화된 결측 코드와 정당한 값만 포함되어 있어야 합니다.", + "vi": "Kỳ vọng {column_text} chỉ chứa các mã thiếu đã được ghi nhận và các giá trị hợp lệ.", + "ru": "Ожидается, что {column_text} содержит только задокументированные коды отсутствия и допустимые значения.", + "cs": "Očekává se, že {column_text} obsahuje pouze zdokumentované chybějící kódy a legitimní hodnoty.", + "pl": "Oczekuje się, że {column_text} zawiera tylko udokumentowane kody braków i prawidłowe wartości.", + "da": "Forvent, at {column_text} kun indeholder dokumenterede manglende koder og legitime værdier.", + "sv": "Förvänta dig att {column_text} endast innehåller dokumenterade saknade koder och legitima värden.", + "nb": "Forvent at {column_text} bare inneholder dokumenterte manglende koder og legitime verdier.", + "nl": "Verwacht dat {column_text} alleen gedocumenteerde ontbrekende codes en legitieme waarden bevat.", + 
"fi": "Odota, että {column_text} sisältää vain dokumentoituja puuttuvien arvojen koodeja ja kelvollisia arvoja.", + "is": "Væntir þess að {column_text} innihaldi aðeins skráða vantandi kóða og lögmæt gildi.", + "ar": "توقع أن يحتوي {column_text} على رموز القيم المفقودة الموثقة والقيم المشروعة فقط.", + "hi": "अपेक्षा है कि {column_text} में केवल प्रलेखित अनुपस्थित कोड और वैध मान हों।", + "el": "Αναμένεται η στήλη {column_text} να περιέχει μόνο τεκμηριωμένους κωδικούς ελλιπών τιμών και έγκυρες τιμές.", + "id": "Mengharapkan bahwa {column_text} hanya berisi kode nilai yang hilang yang terdokumentasi dan nilai yang sah.", + "uk": "Очікується, що {column_text} містить лише задокументовані коди відсутності та допустимі значення.", + "bg": "Очаква се {column_text} да съдържа само документирани кодове за липсващи стойности и легитимни стойности.", + "hr": "Očekuje se da {column_text} sadrži samo dokumentirane kodove za nedostajuće vrijednosti i legitimne vrijednosti.", + "et": "Eeldatakse, et {column_text} sisaldab ainult dokumenteeritud puuduvate väärtuste koode ja õiguspäraseid väärtusi.", + "hu": "Elvárás, hogy a {column_text} csak dokumentált hiányzó kódokat és érvényes értékeket tartalmazzon.", + "ga": "Táthar ag súil nach mbeadh i {column_text} ach cóid ar iarraidh dhoiciméadaithe agus luachanna dlisteanacha.", + "lv": "Tiek sagaidīts, ka {column_text} satur tikai dokumentētus trūkstošo vērtību kodus un likumīgas vērtības.", + "lt": "Tikimasi, kad {column_text} yra tik dokumentuoti trūkstamų reikšmių kodai ir teisėtos reikšmės.", + "mt": "Mistenni li {column_text} ikun fih biss kodiċijiet ta' valuri nieqsa dokumentati u valuri leġittimi.", + "sk": "Očakáva sa, že {column_text} obsahuje iba zdokumentované chýbajúce kódy a legitímne hodnoty.", + "sl": "Pričakuje se, da {column_text} vsebuje samo dokumentirane kode manjkajočih vrednosti in legitimne vrednosti.", + "he": "צפוי ש{column_text} יכיל רק קודי ערכים חסרים מתועדים וערכים לגיטימיים.", + "th": "คาดหวังว่า 
{column_text} จะมีเฉพาะรหัสค่าที่หายไปที่มีการบันทึกไว้และค่าที่ถูกต้องเท่านั้น", + "fa": "انتظار می‌رود که {column_text} فقط شامل کدهای مفقود مستندشده و مقادیر معتبر باشد.", + }, + "col_missing_only_coded_failure_text": { + "en": "Undocumented codes were present in {column_text}.", + "fr": "Des codes non documentés étaient présents dans {column_text}.", + "de": "Undokumentierte Codes waren in {column_text} vorhanden.", + "it": "Erano presenti codici non documentati in {column_text}.", + "es": "Había códigos no documentados en {column_text}.", + "pt": "Havia códigos não documentados em {column_text}.", + "ro": "Coduri nedocumentate au fost prezente în {column_text}.", + "tr": "{column_text} içinde belgelenmemiş kodlar mevcuttu.", + "zh-Hans": "{column_text}中存在未记录的代码。", + "zh-Hant": "{column_text}中存在未記錄的代碼。", + "ja": "{column_text}に文書化されていないコードが存在しました。", + "ko": "{column_text}에 문서화되지 않은 코드가 있었습니다.", + "vi": "Có các mã chưa được ghi nhận trong {column_text}.", + "ru": "В {column_text} присутствовали незадокументированные коды.", + "cs": "Ve sloupci {column_text} byly přítomny nezdokumentované kódy.", + "pl": "W {column_text} obecne były nieudokumentowane kody.", + "da": "Udokumenterede koder var til stede i {column_text}.", + "sv": "Odokumenterade koder fanns i {column_text}.", + "nb": "Udokumenterte koder var til stede i {column_text}.", + "nl": "Er waren ongedocumenteerde codes aanwezig in {column_text}.", + "fi": "Sarakkeessa {column_text} oli dokumentoimattomia koodeja.", + "is": "Óskráðir kóðar voru til staðar í {column_text}.", + "ar": "كانت هناك رموز غير موثقة في {column_text}.", + "hi": "{column_text} में बिना प्रलेखित कोड मौजूद थे।", + "el": "Υπήρχαν μη τεκμηριωμένοι κωδικοί στη στήλη {column_text}.", + "id": "Terdapat kode yang tidak terdokumentasi dalam {column_text}.", + "uk": "У {column_text} були наявні незадокументовані коди.", + "bg": "В {column_text} присъстваха недокументирани кодове.", + "hr": "U {column_text} bili su prisutni nedokumentirani 
kodovi.", + "et": "Veerus {column_text} esines dokumenteerimata koode.", + "hu": "A {column_text} oszlopban dokumentálatlan kódok voltak jelen.", + "ga": "Bhí cóid neamhdhoiciméadaithe i láthair i {column_text}.", + "lv": "{column_text} bija nedokumentēti kodi.", + "lt": "{column_text} buvo nedokumentuotų kodų.", + "mt": "Kien hemm kodiċijiet mhux dokumentati f'{column_text}.", + "sk": "V {column_text} sa vyskytli nezdokumentované kódy.", + "sl": "V {column_text} so bili prisotni nedokumentirani kodi.", + "he": "היו קודים לא מתועדים ב{column_text}.", + "th": "พบรหัสที่ไม่มีการบันทึกไว้ใน {column_text}", + "fa": "کدهای مستندنشده در {column_text} وجود داشت.", + }, + "col_missing_consistent_expectation_text": { + "en": "Expect consistent missingness for reason {reason} across columns {columns_text}.", + "fr": "On s'attend à une absence cohérente pour la raison {reason} dans les colonnes {columns_text}.", + "de": "Erwarten Sie eine konsistente Fehlendheit für den Grund {reason} über die Spalten {columns_text} hinweg.", + "it": "Aspettatevi una mancanza coerente per il motivo {reason} tra le colonne {columns_text}.", + "es": "Se espera una ausencia coherente por el motivo {reason} en las columnas {columns_text}.", + "pt": "Espera-se uma ausência consistente pelo motivo {reason} nas colunas {columns_text}.", + "ro": "Se așteaptă o lipsă consecventă pentru motivul {reason} în coloanele {columns_text}.", + "tr": "{columns_text} sütunlarında {reason} nedeniyle tutarlı eksiklik bekleyin.", + "zh-Hans": "预期各列 {columns_text} 中因 {reason} 导致的缺失情况一致。", + "zh-Hant": "預期各欄 {columns_text} 中因 {reason} 導致的缺失情況一致。", + "ja": "列 {columns_text} 全体で理由 {reason} による欠損が一貫していることを期待します。", + "ko": "{columns_text} 열 전체에서 사유 {reason}에 대한 일관된 결측을 기대합니다.", + "vi": "Kỳ vọng sự thiếu hụt nhất quán cho lý do {reason} trên các cột {columns_text}.", + "ru": "Ожидается согласованная пропущенность по причине {reason} в столбцах {columns_text}.", + "cs": "Očekává se konzistentní chybějící hodnoty z důvodu 
{reason} napříč sloupci {columns_text}.", + "pl": "Oczekuje się spójnego braku danych z powodu {reason} w kolumnach {columns_text}.", + "da": "Forvent konsistent manglende data af årsagen {reason} på tværs af kolonnerne {columns_text}.", + "sv": "Förvänta dig konsekvent saknad data av orsaken {reason} över kolumnerna {columns_text}.", + "nb": "Forvent konsistent manglende data av årsaken {reason} på tvers av kolonnene {columns_text}.", + "nl": "Verwacht consistente ontbrekendheid om reden {reason} in de kolommen {columns_text}.", + "fi": "Odota johdonmukaista puuttuvuutta syystä {reason} sarakkeissa {columns_text}.", + "is": "Væntir samkvæmrar vöntunar af ástæðunni {reason} yfir dálkana {columns_text}.", + "ar": "توقع غيابًا متسقًا للسبب {reason} عبر الأعمدة {columns_text}.", + "hi": "अपेक्षा है कि कारण {reason} के लिए स्तंभों {columns_text} में सुसंगत अनुपस्थिति हो।", + "el": "Αναμένεται συνεπής έλλειψη για τον λόγο {reason} στις στήλες {columns_text}.", + "id": "Mengharapkan ketiadaan yang konsisten karena alasan {reason} di seluruh kolom {columns_text}.", + "uk": "Очікується узгоджена відсутність даних з причини {reason} у стовпцях {columns_text}.", + "bg": "Очаква се последователна липса по причина {reason} в колоните {columns_text}.", + "hr": "Očekuje se dosljedno nedostajanje za razlog {reason} u stupcima {columns_text}.", + "et": "Eeldatakse järjepidevat puudumist põhjusel {reason} veergudes {columns_text}.", + "hu": "Elvárás, hogy a {reason} okból következetes hiány legyen a(z) {columns_text} oszlopokban.", + "ga": "Táthar ag súil le heaspa chomhsheasmhach ar an gcúis {reason} ar fud na gcolún {columns_text}.", + "lv": "Tiek sagaidīts konsekvents trūkums iemesla {reason} dēļ kolonnās {columns_text}.", + "lt": "Tikimasi nuoseklaus trūkumo dėl priežasties {reason} stulpeliuose {columns_text}.", + "mt": "Mistenni nuqqas konsistenti għar-raġuni {reason} fil-kolonni {columns_text}.", + "sk": "Očakávajú sa konzistentné chýbajúce hodnoty z dôvodu {reason} v 
stĺpcoch {columns_text}.", + "sl": "Pričakuje se dosledno manjkanje zaradi razloga {reason} v stolpcih {columns_text}.", + "he": "צפויה חוסר עקבי מסיבה {reason} בעמודות {columns_text}.", + "th": "คาดหวังว่าการขาดหายไปด้วยเหตุผล {reason} จะสอดคล้องกันในคอลัมน์ {columns_text}", + "fa": "انتظار می‌رود فقدان سازگار به دلیل {reason} در ستون‌های {columns_text} وجود داشته باشد.", + }, + "col_missing_consistent_failure_text": { + "en": "Inconsistent missingness for reason {reason} was found across columns {columns_text}.", + "fr": "Une absence incohérente pour la raison {reason} a été trouvée dans les colonnes {columns_text}.", + "de": "Inkonsistente Fehlendheit für den Grund {reason} wurde über die Spalten {columns_text} hinweg gefunden.", + "it": "È stata rilevata una mancanza incoerente per il motivo {reason} tra le colonne {columns_text}.", + "es": "Se encontró una ausencia incoherente por el motivo {reason} en las columnas {columns_text}.", + "pt": "Foi encontrada uma ausência inconsistente pelo motivo {reason} nas colunas {columns_text}.", + "ro": "A fost găsită o lipsă inconsecventă pentru motivul {reason} în coloanele {columns_text}.", + "tr": "{columns_text} sütunlarında {reason} nedeniyle tutarsız eksiklik bulundu.", + "zh-Hans": "在各列 {columns_text} 中发现因 {reason} 导致的缺失情况不一致。", + "zh-Hant": "在各欄 {columns_text} 中發現因 {reason} 導致的缺失情況不一致。", + "ja": "列 {columns_text} 全体で理由 {reason} による欠損が一貫していないことが見つかりました。", + "ko": "{columns_text} 열 전체에서 사유 {reason}에 대한 일관되지 않은 결측이 발견되었습니다.", + "vi": "Đã tìm thấy sự thiếu hụt không nhất quán cho lý do {reason} trên các cột {columns_text}.", + "ru": "В столбцах {columns_text} обнаружена несогласованная пропущенность по причине {reason}.", + "cs": "Napříč sloupci {columns_text} byly nalezeny nekonzistentní chybějící hodnoty z důvodu {reason}.", + "pl": "W kolumnach {columns_text} znaleziono niespójny brak danych z powodu {reason}.", + "da": "Inkonsistent manglende data af årsagen {reason} blev fundet på tværs af kolonnerne 
{columns_text}.", + "sv": "Inkonsekvent saknad data av orsaken {reason} hittades över kolumnerna {columns_text}.", + "nb": "Inkonsistent manglende data av årsaken {reason} ble funnet på tvers av kolonnene {columns_text}.", + "nl": "Inconsistente ontbrekendheid om reden {reason} werd aangetroffen in de kolommen {columns_text}.", + "fi": "Sarakkeissa {columns_text} havaittiin epäjohdonmukaista puuttuvuutta syystä {reason}.", + "is": "Ósamkvæm vöntun af ástæðunni {reason} fannst yfir dálkana {columns_text}.", + "ar": "تم العثور على غياب غير متسق للسبب {reason} عبر الأعمدة {columns_text}.", + "hi": "कारण {reason} के लिए स्तंभों {columns_text} में असंगत अनुपस्थिति पाई गई।", + "el": "Βρέθηκε ασυνεπής έλλειψη για τον λόγο {reason} στις στήλες {columns_text}.", + "id": "Ketiadaan yang tidak konsisten karena alasan {reason} ditemukan di seluruh kolom {columns_text}.", + "uk": "У стовпцях {columns_text} виявлено неузгоджену відсутність даних з причини {reason}.", + "bg": "Установена е непоследователна липса по причина {reason} в колоните {columns_text}.", + "hr": "Pronađeno je nedosljedno nedostajanje za razlog {reason} u stupcima {columns_text}.", + "et": "Veergudes {columns_text} leiti ebajärjepidev puudumine põhjusel {reason}.", + "hu": "A(z) {columns_text} oszlopokban következetlen hiány található a {reason} okból.", + "ga": "Fuarthas easpa neamhchomhsheasmhach ar an gcúis {reason} ar fud na gcolún {columns_text}.", + "lv": "Kolonnās {columns_text} tika atrasts nekonsekvents trūkums iemesla {reason} dēļ.", + "lt": "Stulpeliuose {columns_text} rastas nenuoseklus trūkumas dėl priežasties {reason}.", + "mt": "Instab nuqqas inkonsistenti għar-raġuni {reason} fil-kolonni {columns_text}.", + "sk": "V stĺpcoch {columns_text} sa našli nekonzistentné chýbajúce hodnoty z dôvodu {reason}.", + "sl": "V stolpcih {columns_text} je bilo najdeno nedosledno manjkanje zaradi razloga {reason}.", + "he": "נמצאה חוסר לא עקבי מסיבה {reason} בעמודות {columns_text}.", + "th": 
"พบการขาดหายไปด้วยเหตุผล {reason} ที่ไม่สอดคล้องกันในคอลัมน์ {columns_text}", + "fa": "فقدان ناسازگار به دلیل {reason} در ستون‌های {columns_text} یافت شد.", + }, "regex_expectation_text": { "en": "Expect that values in {column_text} should match the regular expression: {values_text}.", "fr": "On s'attend à ce que les valeurs de {column_text} correspondent à l'expression régulière : {values_text}.", diff --git a/pointblank/_interrogation.py b/pointblank/_interrogation.py index 28cd2bf34..218a705e1 100644 --- a/pointblank/_interrogation.py +++ b/pointblank/_interrogation.py @@ -755,6 +755,52 @@ def col_pct_null( return n_null >= (abs_target - lower_bound) and n_null <= (abs_target + upper_bound) +def col_pct_missing( + data_tbl: IntoFrame, + column: str, + sentinels: list, + count_null: bool, + max_pct: float, +) -> bool: + """Check that the percentage of missing values in a column does not exceed `max_pct`. + + Missing values are those equal to one of the `sentinels` and, when `count_null=True`, actual + null values. The percentage is computed over the total number of rows. 
def apply_missing_exclusion(results_tbl: IntoFrame, column: str, spec: Any) -> Any:
    """Force structured-missing rows to pass in an already-interrogated results table.

    `results_tbl` must already contain the boolean `pb_is_good_` column. Every row whose value in
    `column` is one of `spec.sentinel_values()` (or is null, when `spec.null_is_missing` is
    truthy) has `pb_is_good_` OR-ed with `True`, so those rows pass and only the remaining "real"
    values are judged by the underlying `col_vals_*` check.

    Parameters
    ----------
    results_tbl
        The interrogated table carrying a `pb_is_good_` column.
    column
        Name of the column whose values are tested against the spec.
    spec
        An object exposing `sentinel_values()` and `null_is_missing` (a `MissingSpec`).

    Returns
    -------
    Any
        The native table with an updated `pb_is_good_` column, or `results_tbl` itself (untouched)
        when the spec declares neither sentinels nor null-as-missing.
    """
    declared = spec.sentinel_values()

    # Each entry is a null-free boolean expression. `is_in()` yields null for null inputs, and
    # `False | null` is null under Kleene logic, which would corrupt a failing row; hence the
    # explicit `fill_null(False)` on the sentinel test.
    exclusions = []
    if declared:
        exclusions.append(nw.col(column).is_in(declared).fill_null(False))
    if spec.null_is_missing:
        exclusions.append(nw.col(column).is_null())

    # Nothing is excluded under this spec: hand the input back unchanged
    if not exclusions:
        return results_tbl

    excluded = exclusions[0]
    for extra in exclusions[1:]:
        excluded = excluded | extra

    frame = nw.from_native(results_tbl)
    assert isinstance(frame, (nw.DataFrame, nw.LazyFrame))
    updated = frame.with_columns(pb_is_good_=(nw.col("pb_is_good_") | excluded))
    return updated.to_native()
def interrogate_missing_consistent(
    tbl: IntoFrame, columns: list[str], sentinels: list, count_null: bool
) -> Any:
    """Cross-column missing-consistency interrogation.

    Given a set of related `columns`, a row passes when the "missing for a given reason" status is
    consistent across all of them: either *none* of the columns carry the reason, or *all* of them
    do. A row fails when some-but-not-all of the columns are missing for that reason. Missingness
    for the reason is encoded by the `sentinels` values (and, when `count_null=True`, actual nulls).

    Parameters
    ----------
    tbl
        The table to interrogate.
    columns
        Names of the related columns to compare.
    sentinels
        Values that encode "missing for this reason".
    count_null
        Whether an actual null also counts as missing for this reason.

    Returns
    -------
    Any
        The native table with a boolean `pb_is_good_` column appended. No other column is added
        or removed.
    """
    nw_tbl = nw.from_native(tbl)
    n_cols = len(columns)

    # With no columns to compare, or with nothing that can mark a value as missing, the
    # per-row count is necessarily zero, so every row is (vacuously) consistent.
    if n_cols == 0 or not (sentinels or count_null):
        return nw_tbl.with_columns(pb_is_good_=nw.lit(True)).to_native()

    def _flag(name: str) -> Any:
        """Build a null-free boolean expression: is `name` missing for the reason?"""
        target = nw.col(name)
        if not sentinels:
            # Only reachable with `count_null=True` (see the guard above)
            return target.is_null()
        # `is_in()` yields null for null inputs; fill so the integer cast below is null-free
        coded = target.is_in(sentinels).fill_null(False)
        return (coded | target.is_null()) if count_null else coded

    # Per-row number of columns that are missing for the reason (Int32 so every backend can add)
    count_expr = None
    for c in columns:
        col_count = _flag(c).cast(nw.Int32)
        count_expr = col_count if count_expr is None else (count_expr + col_count)

    # Evaluate the count inline rather than via a scratch column: a temporary `_n_reason_`
    # column would overwrite (and then drop) a user column that happens to share that name.
    is_consistent = (count_expr == 0) | (count_expr == n_cols)
    return nw_tbl.with_columns(pb_is_good_=is_consistent).to_native()
+ """ + from pointblank.missing import MissingSpec + + return MissingSpec.from_variable_metadata(self) + @dataclass class MetadataImport: @@ -340,6 +355,39 @@ def get_variable(self, name: str) -> VariableMetadata: return var raise KeyError(f"No variable named '{name}' in imported metadata") + def missing_specs(self) -> dict[str, Any]: + """Auto-generate [`MissingSpec`](`pointblank.MissingSpec`) objects for all variables. + + Builds a mapping of column name to `MissingSpec` for every imported variable that declares + missing values (e.g., SPSS user-defined missing values, SAS special missing). The result + can be passed directly to validation methods (via `missing=`) or to + [`missing_vals_tbl()`](`pointblank.missing_vals_tbl`). + + Returns + ------- + dict[str, MissingSpec] + A mapping of column name to `MissingSpec`. Variables without declared missing values + are omitted. + + Examples + -------- + ```python + import pointblank as pb + + meta = pb.import_metadata("survey.sav", format="spss") + specs = meta.missing_specs() + + # Use the auto-generated specs in a missingness report + pb.missing_vals_tbl(data, missing=specs) + ``` + """ + specs: dict[str, Any] = {} + for var in self.variables: + spec = var.to_missing_spec() + if spec is not None: + specs[var.name] = spec + return specs + def get_codelist(self, name: str) -> Codelist: """Get a specific codelist by name. 
# Standard HL7/CDISC null flavors mapped to snake_case reason labels
_CDISC_NULL_FLAVORS: dict[str, str] = {
    "NI": "no_information",
    "NA": "not_applicable",
    "UNK": "unknown",
    "ASKU": "asked_but_unknown",
    "NAV": "temporarily_unavailable",
    "NASK": "not_asked",
    "OTH": "other",
    "PINF": "positive_infinity",
    "NINF": "negative_infinity",
    "MSK": "masked",
    "DER": "derived",
    "QS": "sufficient_quantity",
    "TRC": "trace",
    "NP": "not_present",
}


def _slugify(label: Any) -> str:
    """Convert a human-readable label into a snake_case reason identifier.

    The label is stringified, trimmed and lower-cased; every run of characters outside
    `[0-9a-zA-Z]` becomes a single underscore and leading/trailing underscores are removed. An
    empty result falls back to `"missing"`.

    Note that punctuation is discarded, so the sign of a numeric label is lost: both `-99` and
    `99` slugify to `"99"`.
    """
    slug = re.sub(r"[^0-9a-zA-Z]+", "_", str(label).strip().lower()).strip("_")
    return slug or "missing"


@dataclass
class MissingSpec:
    """
    Specification for structured missing values in a column.

    Real-world data rarely encodes missingness as a single `null` value. Survey data distinguishes
    *refused* from *don't know* from *not applicable*; clinical data uses codes like `"NOT DONE"`;
    statistical packages use sentinel values such as `-99`, `".A"`, or `""`. A `MissingSpec`
    captures these sentinel values, the *reason* each one represents, and how they should be
    handled during validation and analysis.

    This brings the idea of *structured missingness* (a missing value carries a reason for its
    absence) into Pointblank's runtime validation layer. Once defined, a `MissingSpec` can be
    passed to validation methods (via `missing=`) to automatically exclude sentinel values from
    constraint checks, or used with dedicated methods like
    [`Validate.col_missing_coded()`](`pointblank.Validate.col_missing_coded`) and
    [`Validate.col_pct_missing()`](`pointblank.Validate.col_pct_missing`).

    Parameters
    ----------
    reasons
        A dictionary mapping sentinel values to reason labels. Keys are the actual values present
        in the data (e.g., `-99`, `"NA"`, `".A"`). Values are human-readable reason identifiers
        (e.g., `"refused"`, `"not_asked"`).
    categories
        Optional grouping of reasons into categories (e.g., an `"item_nonresponse"` category that
        groups `"refused"` and `"dont_know"`). Useful for aggregate reporting and for checking
        missingness rates by category. Each value is a list of reason labels that appear in
        `reasons`. Default is `None`.
    null_is_missing
        Whether actual null/`None`/`NaN` values should also be treated as missing (with reason
        given by `null_reason`). Default is `True`.
    null_reason
        The reason label assigned to actual null values when `null_is_missing=True`. Default is
        `"unknown"`.
    description
        Optional human-readable description of the overall missingness pattern. Default is `None`.

    Returns
    -------
    MissingSpec
        A missing-value specification that can be attached to a `Field` (via `missing=`) or passed
        to validation methods.

    Raises
    ------
    TypeError
        If `reasons` or `categories` is not a dict, a reason label or `null_reason` is not a
        string, or a category does not map to a list/tuple.
    ValueError
        If `reasons` is empty while `null_is_missing=False`, or a category references a reason
        label that the spec does not define.

    Examples
    --------
    Define the missing-value codes for a survey `age` variable:

    ```python
    import pointblank as pb

    age_missing = pb.MissingSpec(
        reasons={
            -99: "not_asked",  # Question wasn't asked to this participant
            -98: "refused",  # Participant declined to answer
            -97: "dont_know",  # Participant didn't know
            -96: "not_applicable",  # Question doesn't apply
        },
        categories={
            "item_nonresponse": ["refused", "dont_know"],
            "design": ["not_asked", "not_applicable"],
        },
    )
    ```

    The spec can then answer questions about its own structure:

    ```python
    age_missing.sentinel_values()  # [-99, -98, -97, -96]
    age_missing.reason_for(-98)  # "refused"
    age_missing.values_for_reason("refused")  # [-98]
    age_missing.values_for_category("item_nonresponse")  # [-98, -97]
    ```
    """

    reasons: dict[Any, str]
    categories: dict[str, list[str]] | None = None
    null_is_missing: bool = True
    null_reason: str = "unknown"
    description: str | None = field(default=None)

    def __post_init__(self) -> None:
        self._validate()

    def _validate(self) -> None:
        """Validate that the missing specification is internally consistent."""
        if not isinstance(self.reasons, dict):
            raise TypeError(
                f"reasons must be a dict mapping sentinel values to reason labels, "
                f"got {type(self.reasons).__name__}"
            )

        # A spec that flags nothing at all (no sentinels, nulls not counted) is meaningless
        if len(self.reasons) == 0 and not self.null_is_missing:
            raise ValueError(
                "A MissingSpec must define at least one sentinel value in `reasons`, "
                "or set `null_is_missing=True`."
            )

        for value, reason in self.reasons.items():
            if not isinstance(reason, str):
                raise TypeError(
                    f"Reason labels must be strings, got {type(reason).__name__} "
                    f"for sentinel value {value!r}."
                )

        if not isinstance(self.null_reason, str):
            raise TypeError(f"null_reason must be a string, got {type(self.null_reason).__name__}.")

        if self.categories is not None:
            if not isinstance(self.categories, dict):
                raise TypeError(
                    f"categories must be a dict mapping category names to lists of reason "
                    f"labels, got {type(self.categories).__name__}."
                )

            # Categories may reference the null reason only when nulls actually count as missing
            known_reasons = set(self.reasons.values())
            if self.null_is_missing:
                known_reasons.add(self.null_reason)

            for category, reason_list in self.categories.items():
                if not isinstance(reason_list, (list, tuple)):
                    raise TypeError(
                        f"Category '{category}' must map to a list of reason labels, "
                        f"got {type(reason_list).__name__}."
                    )
                unknown = [r for r in reason_list if r not in known_reasons]
                if unknown:
                    raise ValueError(
                        f"Category '{category}' references unknown reason label(s) {unknown}. "
                        f"Known reasons are {sorted(known_reasons)}."
                    )

    def sentinel_values(self) -> list:
        """Get all sentinel values that encode missingness.

        Returns
        -------
        list
            The keys of `reasons` (the actual values in the data that represent missingness).
            Note that this does *not* include `None` even when `null_is_missing=True`; use
            [`is_missing()`](`pointblank.MissingSpec.is_missing`) to test individual values.
        """
        return list(self.reasons.keys())

    def reason_for(self, value: Any) -> str | None:
        """Get the reason label for a specific value.

        Parameters
        ----------
        value
            A value from the data.

        Returns
        -------
        str | None
            The reason label if `value` is a declared sentinel value, `null_reason` if `value`
            is `None` and `null_is_missing=True`, or `None` if the value is not considered
            missing. Unhashable values (e.g., lists or dicts from nested columns) can never be
            sentinels, so they yield `None`.
        """
        if value is None:
            return self.null_reason if self.null_is_missing else None
        try:
            return self.reasons.get(value)
        except TypeError:
            # Unhashable values cannot be dict keys, hence cannot be declared sentinels
            return None

    def is_missing(self, value: Any) -> bool:
        """Check whether a value should be considered missing under this spec.

        Parameters
        ----------
        value
            A value from the data.

        Returns
        -------
        bool
            `True` if `value` is a declared sentinel value, or if `value` is `None` and
            `null_is_missing=True`. Unhashable values (e.g., lists or dicts from nested columns)
            can never be sentinels, so they yield `False`.
        """
        if value is None:
            return self.null_is_missing
        try:
            return value in self.reasons
        except TypeError:
            # Unhashable values cannot be dict keys, hence cannot be declared sentinels
            return False

    def values_for_reason(self, reason: str) -> list:
        """Get all sentinel values that correspond to a given reason.

        Parameters
        ----------
        reason
            A reason label.

        Returns
        -------
        list
            All sentinel values mapped to `reason`.
        """
        return [v for v, r in self.reasons.items() if r == reason]

    def values_for_category(self, category: str) -> list:
        """Get all sentinel values whose reason falls in a given category.

        Parameters
        ----------
        category
            A category name defined in `categories`.

        Returns
        -------
        list
            All sentinel values whose reason label is in the given category. Returns an empty
            list if `categories` is `None` or the category is undefined.
        """
        if self.categories is None:
            return []
        reasons_in_cat = self.categories.get(category, [])
        return [v for v, r in self.reasons.items() if r in reasons_in_cat]

    def reasons_list(self) -> list[str]:
        """Get the distinct reason labels defined by this spec.

        Returns
        -------
        list[str]
            The distinct reason labels (in first-seen order), including `null_reason` when
            `null_is_missing=True`.
        """
        # A dict is used as an insertion-ordered set
        seen: dict[str, None] = {}
        for r in self.reasons.values():
            seen.setdefault(r, None)
        if self.null_is_missing:
            seen.setdefault(self.null_reason, None)
        return list(seen.keys())

    # ------------------------------------------------------------------
    # Factory methods (pre-built specs and metadata-import integration)
    # ------------------------------------------------------------------

    @classmethod
    def from_cdisc_null_flavors(
        cls,
        null_is_missing: bool = True,
        null_reason: str = "no_information",
        description: str | None = "CDISC/HL7 null flavors",
    ) -> "MissingSpec":
        """Create a `MissingSpec` for the standard HL7/CDISC *null flavors*.

        Clinical data uses standardized null flavor codes to record *why* a value is absent (e.g.,
        `"NASK"` for "not asked", `"UNK"` for "unknown"). This returns a ready-to-use spec mapping
        those codes to reason labels.

        Parameters
        ----------
        null_is_missing
            Whether actual null values should also be treated as missing. Default is `True`.
        null_reason
            The reason label for actual null values. Default is `"no_information"`.
        description
            Optional description. Default identifies the spec as CDISC/HL7 null flavors.

        Returns
        -------
        MissingSpec
            A spec with the standard null flavor codes.

        Examples
        --------
        ```python
        import pointblank as pb

        cdisc_missing = pb.MissingSpec.from_cdisc_null_flavors()
        cdisc_missing.reason_for("NASK")  # "not_asked"
        ```
        """
        # Copy so that the module-level table is never shared with (or mutated via) an instance
        reasons = dict(_CDISC_NULL_FLAVORS)
        categories = {
            "unknown": [
                "no_information",
                "unknown",
                "asked_but_unknown",
                "temporarily_unavailable",
            ],
            "not_applicable": ["not_applicable", "not_asked", "not_present"],
            "boundary": ["positive_infinity", "negative_infinity"],
        }
        return cls(
            reasons=reasons,
            categories=categories,
            null_is_missing=null_is_missing,
            null_reason=null_reason,
            description=description,
        )

    # Convenient short alias
    @classmethod
    def from_cdisc(cls, **kwargs: Any) -> "MissingSpec":
        """Alias for [`from_cdisc_null_flavors()`](`pointblank.MissingSpec.from_cdisc_null_flavors`)."""
        return cls.from_cdisc_null_flavors(**kwargs)

    @classmethod
    def from_sas(
        cls,
        reasons: dict[str, str] | None = None,
        include_underscore: bool = True,
        null_is_missing: bool = True,
        null_reason: str = "system_missing",
        description: str | None = "SAS special missing values",
    ) -> "MissingSpec":
        """Create a `MissingSpec` for SAS special missing values.

        SAS encodes missingness with `"."` (system missing), `"._"`, and `".A"` through `".Z"` (27
        user-defined missing codes). This returns a spec covering all of them; you can override the
        reason label for any specific code via `reasons=`.

        Parameters
        ----------
        reasons
            Optional mapping of specific SAS missing codes to custom reason labels (e.g.,
            `{".A": "not_applicable", ".B": "below_detection"}`). These override the defaults.
        include_underscore
            Whether to include the `"._"` special missing code. Default is `True`.
        null_is_missing
            Whether actual null values should also be treated as missing. Default is `True`.
        null_reason
            The reason label for actual null values. Default is `"system_missing"`.
        description
            Optional description. Default identifies the spec as SAS special missing values.

        Returns
        -------
        MissingSpec
            A spec covering the SAS special missing values.

        Examples
        --------
        ```python
        import pointblank as pb

        sas_missing = pb.MissingSpec.from_sas(
            reasons={".A": "not_applicable", ".B": "below_detection"}
        )
        sas_missing.reason_for(".A")  # "not_applicable"
        sas_missing.reason_for(".C")  # "user_missing_c"
        ```
        """
        built: dict[Any, str] = {".": "system_missing"}
        if include_underscore:
            built["._"] = "system_missing"
        for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
            built[f".{letter}"] = f"user_missing_{letter.lower()}"
        if reasons:
            # Caller-supplied labels win over the generated defaults
            built.update(reasons)
        return cls(
            reasons=built,
            null_is_missing=null_is_missing,
            null_reason=null_reason,
            description=description,
        )

    @classmethod
    def from_spss(
        cls,
        missing_values: list,
        labels: dict[Any, str] | None = None,
        null_is_missing: bool = True,
        null_reason: str = "unknown",
        description: str | None = "SPSS user-defined missing values",
    ) -> "MissingSpec":
        """Create a `MissingSpec` from SPSS-style user-defined missing values.

        SPSS supports up to 3 user-defined missing values per variable (plus a range). Pass the
        missing values (and optionally their value labels) to build a spec. Reason labels are
        derived from the labels when available; otherwise a `"missing_<value>"` placeholder (built
        from the slugified sentinel value) is used.

        Parameters
        ----------
        missing_values
            The sentinel values that SPSS marks as missing for the variable (e.g., `[-99, -98]`).
        labels
            Optional mapping of sentinel value to human-readable label (e.g., `{-99: "Refused"}`).
            Labels are slugified into reason identifiers (e.g., `"Refused"` -> `"refused"`).
        null_is_missing
            Whether actual null values should also be treated as missing. Default is `True`.
        null_reason
            The reason label for actual null values. Default is `"unknown"`.
        description
            Optional description. Default identifies the spec as SPSS user-defined missing values.

        Returns
        -------
        MissingSpec
            A spec built from the SPSS missing values.

        Examples
        --------
        ```python
        import pointblank as pb

        spss_missing = pb.MissingSpec.from_spss(
            missing_values=[-99, -98],
            labels={-99: "Not asked", -98: "Refused"},
        )
        spss_missing.reason_for(-98)  # "refused"
        ```
        """
        labels = labels or {}
        # NOTE(review): slugifying drops the sign, so unlabeled -99 and 99 would both be given
        # the placeholder "missing_99" and share one reason label.
        reasons = {
            value: (_slugify(labels[value]) if value in labels else f"missing_{_slugify(value)}")
            for value in missing_values
        }
        return cls(
            reasons=reasons,
            null_is_missing=null_is_missing,
            null_reason=null_reason,
            description=description,
        )

    @classmethod
    def from_variable_metadata(
        cls,
        variable: Any,
        null_is_missing: bool = True,
        null_reason: str = "unknown",
    ) -> "MissingSpec | None":
        """Create a `MissingSpec` from an imported variable's metadata.

        This works with a [`VariableMetadata`](`pointblank.VariableMetadata`) object (as produced by
        [`import_metadata()`](`pointblank.import_metadata`) for SPSS, Stata, and SAS files). It reads
        the variable's `missing_values` and derives reason labels from `missing_value_labels` or
        `value_labels` when available.

        Parameters
        ----------
        variable
            A variable-metadata object exposing `missing_values` and (optionally)
            `missing_value_labels` / `value_labels` attributes.
        null_is_missing
            Whether actual null values should also be treated as missing. Default is `True`.
        null_reason
            The reason label for actual null values. Default is `"unknown"`.

        Returns
        -------
        MissingSpec | None
            A spec built from the variable's missing values, or `None` if the variable declares no
            missing values.
        """
        # Absent attributes and empty/None values are all treated as "nothing declared"
        missing_values = getattr(variable, "missing_values", None) or []
        if not missing_values:
            return None

        labels = getattr(variable, "missing_value_labels", None) or {}
        value_labels = getattr(variable, "value_labels", None) or {}

        reasons: dict[Any, str] = {}
        for value in missing_values:
            # Prefer a dedicated missing-value label, then fall back to the general value label
            label = labels.get(value)
            if label is None:
                label = value_labels.get(value)
            reasons[value] = _slugify(label) if label else f"missing_{_slugify(value)}"

        return cls(
            reasons=reasons,
            null_is_missing=null_is_missing,
            null_reason=null_reason,
            description=f"Imported missing values for '{getattr(variable, 'name', 'variable')}'",
        )
+ + When `as_heatmap=True`, render the reason proportions as a color-coded heatmap (cells shaded + from light to dark by the proportion missing for each reason) instead of count/percent text. + """ + if not isinstance(missing, dict): + raise TypeError( + f"`missing=` must be a dict mapping column names to MissingSpec objects, " + f"got {type(missing).__name__}." + ) + for col_name, spec in missing.items(): + if not isinstance(spec, MissingSpec): + raise TypeError( + f"`missing[{col_name!r}]` must be a MissingSpec, got {type(spec).__name__}." + ) + + nw_frame = nw.from_native(data) + is_lazy = isinstance(nw_frame, nw.LazyFrame) + + available_columns = list(nw_frame.columns) + + # Build the ordered union of *declared* (coded) reason labels across all specs (first-seen + # order). Raw Null/None/NA values are tallied separately in a fixed "Null" column rather than + # being treated as a reason, since they are not part of any MissingSpec. + reason_order: list[str] = [] + for spec in missing.values(): + for r in spec.reasons.values(): + if r not in reason_order: + reason_order.append(r) + + # A "Null" column is shown only if at least one spec counts raw nulls as missing + has_null_col = any(spec.null_is_missing for spec in missing.values()) + + records: list[dict[str, Any]] = [] + for column, spec in missing.items(): + if column not in available_columns: + raise ValueError(f"Column '{column}' given in `missing=` was not found in the table.") + + # One aggregation per declared reason (sentinel values only), plus a separate raw-null + # count when the spec treats nulls as missing; coded reasons and raw nulls are kept distinct + declared_reasons = list(dict.fromkeys(spec.reasons.values())) + select_exprs: dict[str, Any] = {"__total__": nw.len()} + reason_alias: dict[str, str] = {} + for i, r in enumerate(declared_reasons): + reason_alias[r] = f"__r{i}__" + select_exprs[reason_alias[r]] = ( + nw.col(column).is_in(spec.values_for_reason(r)).cast(nw.Int32).sum() + ) + if 
spec.null_is_missing: + select_exprs["__null__"] = nw.col(column).is_null().cast(nw.Int32).sum() + + out = nw_frame.select(**select_exprs) + if is_lazy: + out = out.collect() + + total = int(out["__total__"][0]) + coded_counts = {r: int(out[reason_alias[r]][0]) for r in declared_reasons} + n_null = int(out["__null__"][0]) if spec.null_is_missing else 0 + + total_missing = sum(coded_counts.values()) + n_null + complete = total - total_missing + + # A coded reason only *applies* to a column if its spec declares it; non-applicable reasons + # render as an em dash (not "0"). The "Null" column applies only when null_is_missing=True. + applicable = set(declared_reasons) + + def _prop(count: int) -> float: + return (count / total) if total > 0 else 0.0 + + if as_heatmap: + # Numeric proportions (0..1) so reason cells can be color-shaded; non-applicable cells + # are left as None (shown as an em dash, uncolored) + record: dict[str, Any] = {"columns": column, "complete": _prop(complete)} + for r in reason_order: + record[r] = _prop(coded_counts.get(r, 0)) if r in applicable else None + if has_null_col: + record["null"] = _prop(n_null) if spec.null_is_missing else None + else: + + def _fmt(count: int) -> str: + pct = round(100 * count / total) if total > 0 else 0 + return f"{count} ({pct}%)" + + record = {"columns": column, "complete": _fmt(complete)} + for r in reason_order: + record[r] = _fmt(coded_counts.get(r, 0)) if r in applicable else "—" + if has_null_col: + record["null"] = _fmt(n_null) if spec.null_is_missing else "—" + records.append(record) + + # Build a DataFrame from the records using the available DataFrame library + df_lib_gt = _select_df_lib(preference="polars") + if df_lib_gt.__name__ == "polars": + import polars as pl + + breakdown_df = pl.DataFrame(records) + else: + import pandas as pd + + breakdown_df = pd.DataFrame(records) + + # Reason columns keep their raw input form as labels (e.g. "not_asked", not "Not Asked"); the + # fixed columns are relabeled. 
The total row count is already shown in the header, so there's no + # redundant "Total N" column. Raw nulls appear in a fixed "Null" column (styled like "Complete"), + # not as a reason. + cols_labels = {"columns": "Column", "complete": "Complete"} + if has_null_col: + cols_labels["null"] = "Null" + + value_columns = ["complete"] + reason_order + (["null"] if has_null_col else []) + + # Build a header that matches the default `missing_vals_tbl()` look: a plain (large) title in + # IBM Plex Sans and a subtitle showing the table type and dimensions + tbl_type = _get_tbl_type(data=data) + n_rows_total = get_row_count(data) + table_type_html = _create_table_type_html(tbl_type=tbl_type, tbl_name=None, font_size="10px") + tbl_dims_html = _create_table_dims_html( + columns=len(available_columns), rows=n_rows_total, font_size="10px" + ) + combined_subtitle = ( + "
" + '
' + f"{table_type_html}" + f"{tbl_dims_html}" + "
" + "
" + ) + + # The left "Column" column is rendered in monospace, matching the default report's body font + column_name_style = style.text( + color="black", font=google_font(name="IBM Plex Mono"), size="12px" + ) + # The reason column labels keep their raw input form and are shown in monospace + reason_label_style = style.text(font=google_font(name="IBM Plex Mono"), size="12px") + + # Columns that should show an em dash for non-applicable cells (reason columns + the Null column) + em_dash_columns = reason_order + (["null"] if has_null_col else []) + + if as_heatmap: + title = "Missing Pattern Heatmap" + # "complete" and "null" are shown as plain percentages (uncolored, like the default report); + # only the coded reason columns are color-shaded by proportion + prop_columns = ["complete"] + reason_order + (["null"] if has_null_col else []) + + gt_tbl = ( + GT(breakdown_df) + .tab_header(title=title, subtitle=html(combined_subtitle)) + .opt_table_font(font=google_font(name="IBM Plex Sans")) + .opt_align_table_header(align="left") + .cols_label(cases=cols_labels) + .cols_align(align="center", columns=value_columns) + .cols_align(align="left", columns="columns") + .fmt_percent(columns=prop_columns, decimals=0) + .data_color( + columns=reason_order, + palette=["#F5F5F5", "#000000"], + domain=[0, 1], + na_color="#FFFFFF", + ) + .sub_missing(columns=em_dash_columns, missing_text="—") + .tab_style(style=column_name_style, locations=loc.body(columns="columns")) + .tab_style(style=reason_label_style, locations=loc.column_labels(columns=reason_order)) + ) + else: + title = "Missing Values by Reason" + + gt_tbl = ( + GT(breakdown_df) + .tab_header(title=title, subtitle=html(combined_subtitle)) + .opt_table_font(font=google_font(name="IBM Plex Sans")) + .opt_align_table_header(align="left") + .cols_label(cases=cols_labels) + .cols_align(align="right", columns=value_columns) + .cols_align(align="left", columns="columns") + .tab_style( + style=style.text(font=google_font(name="IBM 
Plex Mono"), size="12px"), + locations=loc.body(columns=value_columns), + ) + .tab_style(style=column_name_style, locations=loc.body(columns="columns")) + .tab_style(style=reason_label_style, locations=loc.column_labels(columns=reason_order)) + ) + + # Group only the coded reasons under a "Missing Reasons" spanner. Raw nulls live in the fixed + # "Null" column (styled like "Complete"), so they aren't mistaken for declared spec reasons. + if reason_order: + gt_tbl = gt_tbl.tab_spanner(label="Missing Reasons", columns=reason_order) + + if version("great_tables") >= "0.17.0": + gt_tbl = gt_tbl.tab_options(quarto_disable_processing=True) + + return gt_tbl + + +def missing_vals_tbl( + data: Any, missing: dict[str, MissingSpec] | None = None, as_heatmap: bool = False +) -> GT: """ Display a table that shows the missing values in the input table. @@ -2682,12 +2887,40 @@ def missing_vals_tbl(data: Any) -> GT: table. The table is displayed using the Great Tables API, which allows for further customization of the table's appearance if so desired. + By default, missingness is treated as binary (a value is either Null or it isn't) and the + function renders a sector-based heatmap of the proportion of Null values across the rows of each + column. When a `missing=` mapping of columns to [`MissingSpec`](`pointblank.MissingSpec`) objects + is supplied, the function instead renders a *structured missingness* breakdown: one row per + column with the count and percentage of complete values and of each missing *reason* (e.g., + `refused`, `not_asked`). Declared (coded) reasons are grouped under a "Missing Reasons" spanner + and keep their raw input form as labels; actual `Null`/`None`/`NA` values (which are not part of + the spec) are tallied in a fixed "Null" column at the far right (styled like "Complete"), so + they aren't mistaken for declared reasons. 
+ + Note that supplying `missing=` produces a *different report* than the default view: it is a + distinct visualization (a per-reason breakdown table, or a per-reason heatmap with + `as_heatmap=True`), not an annotated version of the default sector heatmap. The report titles + differ accordingly ("Missing Values" for the default, "Missing Values by Reason" or "Missing + Pattern Heatmap" for the structured views), and the shared header/title styling makes the family + resemblance clear. + Parameters ---------- data The table for which to display the missing values. This could be a DataFrame object, an Ibis table object, a CSV file path, a Parquet file path, or a database connection string. Read the *Supported Input Table Types* section for details on the supported table types. + missing + An optional dictionary mapping column names to [`MissingSpec`](`pointblank.MissingSpec`) + objects. When provided, the function renders a structured breakdown of missingness by + reason for the specified columns (rather than the default sector heatmap). The reason + columns are the union of reasons across the supplied specs; a reason that isn't defined for + a given column is shown as an em dash (not applicable), as distinct from a defined-but-unobserved + reason (shown as `0 (0%)`). + as_heatmap + Only applies when `missing=` is provided. When `True`, render the per-reason proportions as + a color-coded heatmap (cells shaded from light to dark by the proportion missing) instead of + the count/percentage text breakdown. Default is `False`. 
Returns ------- @@ -2765,6 +2998,12 @@ def missing_vals_tbl(data: Any) -> GT: if "pyspark" not in tbl_type: data = copy.deepcopy(data) + # If a `missing=` spec mapping is provided, render the structured missingness breakdown + # (count and percentage of complete values and each missing reason, per column) instead of + # the default sector heatmap + if missing is not None: + return _build_structured_missing_tbl(data=data, missing=missing, as_heatmap=as_heatmap) + # Get the number of rows in the table n_rows = get_row_count(data) @@ -3815,6 +4054,7 @@ def from_agg_validator( values: Any | list[Any] | tuple | None = None inclusive: tuple[bool, bool] | None = None na_pass: bool | None = None + missing: Any | None = None pre: Callable | None = None segments: Any | None = None thresholds: Thresholds | None = None @@ -5208,6 +5448,7 @@ def col_vals_gt( columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -5480,6 +5721,7 @@ def col_vals_gt( column=column, values=value, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -5497,6 +5739,7 @@ def col_vals_lt( columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -5776,6 +6019,7 @@ def col_vals_lt( column=column, values=value, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -5793,6 +6037,7 @@ def col_vals_eq( columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: 
MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -6072,6 +6317,7 @@ def col_vals_eq( column=column, values=value, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -6089,6 +6335,7 @@ def col_vals_ne( columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -6366,6 +6613,7 @@ def col_vals_ne( column=column, values=value, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -6383,6 +6631,7 @@ def col_vals_ge( columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -6663,6 +6912,7 @@ def col_vals_ge( column=column, values=value, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -6680,6 +6930,7 @@ def col_vals_le( columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -6960,6 +7211,7 @@ def col_vals_le( column=column, values=value, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -6979,6 +7231,7 @@ def col_vals_between( right: float | int | Column, inclusive: tuple[bool, bool] = (True, True), na_pass: bool = False, + missing: MissingSpec | None = 
None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -7285,6 +7538,7 @@ def col_vals_between( values=value, inclusive=inclusive, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -7304,6 +7558,7 @@ def col_vals_outside( right: float | int | Column, inclusive: tuple[bool, bool] = (True, True), na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -7610,6 +7865,7 @@ def col_vals_outside( values=value, inclusive=inclusive, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -7626,6 +7882,7 @@ def col_vals_in_set( self, columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, set: Collection[Any], + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -7932,6 +8189,7 @@ class Color(Enum): assertion_type=assertion_type, column=column, values=set, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -7948,6 +8206,7 @@ def col_vals_not_in_set( self, columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, set: Collection[Any], + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -8226,6 +8485,7 @@ class InvalidStatus(Enum): assertion_type=assertion_type, column=column, values=set, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -8244,6 +8504,7 @@ def col_vals_increasing( allow_stationary: bool = False, decreasing_tol: float | None = None, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = 
None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -8422,6 +8683,7 @@ def col_vals_increasing( column=column, values="", na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -8444,6 +8706,7 @@ def col_vals_decreasing( allow_stationary: bool = False, increasing_tol: float | None = None, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -8622,6 +8885,7 @@ def col_vals_decreasing( column=column, values="", na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -9140,6 +9404,7 @@ def col_vals_regex( pattern: str, na_pass: bool = False, inverse: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -9391,6 +9656,7 @@ def col_vals_regex( column=column, values=values, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -9408,6 +9674,7 @@ def col_vals_within_spec( columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, spec: str, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -9681,6 +9948,7 @@ def col_vals_within_spec( column=column, values=values, na_pass=na_pass, + missing=missing, pre=pre, segments=segments, thresholds=thresholds, @@ -10404,48 +10672,59 @@ def col_pct_null( return self - def rows_distinct( + def col_pct_missing( self, - columns_subset: str | list[str] | None = None, - pre: Callable | None = None, - segments: SegmentSpec | None = None, - thresholds: int | float | bool | tuple | dict | Thresholds | None = None, + 
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, + missing: MissingSpec, + max_pct: float, + reason: str | None = None, + category: str | None = None, + thresholds: int | float | None | bool | tuple | dict | Thresholds = None, actions: Actions | None = None, brief: str | bool | None = None, active: bool | Callable = True, ) -> Validate: """ - Validate whether rows in the table are distinct. + Validate that the percentage of *structured* missing values stays within a limit. - The `rows_distinct()` method checks whether rows in the table are distinct. This validation - will operate over the number of test units that is equal to the number of rows in the table - (determined after any `pre=` mutation has been applied). + The `col_pct_missing()` validation method checks whether the percentage of missing values + in a column is at most `max_pct=`. Unlike [`col_pct_null()`](`pointblank.Validate.col_pct_null`), + which only considers actual null values, this method uses a + [`MissingSpec`](`pointblank.MissingSpec`) to define which values count as missing: declared + sentinel values (e.g., `-99` for `"refused"`) and, when `null_is_missing=True`, actual null + values. This validation operates at the column level, generating a single validation step + per column that passes when the missing percentage does not exceed `max_pct=`. + + You can narrow the check to a single reason (via `reason=`) or a category of reasons (via + `category=`), making it possible to assert things like "at most 10% of values were refused" + or "at most 15% are item nonresponse". Parameters ---------- - columns_subset - A single column or a list of columns to use as a subset for the distinct comparison. - If `None`, then all columns in the table will be used for the comparison. If multiple - columns are supplied, the distinct comparison will be made over the combination of - values in those columns. 
- pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. + columns + A single column or a list of columns to validate. Can also use + [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If + multiple columns are supplied or resolved, there will be a separate validation step + generated for each column. + missing + A [`MissingSpec`](`pointblank.MissingSpec`) describing the sentinel values (and their + reasons) that encode missingness for this column. + max_pct + The maximum allowable percentage of missing values, expressed as a decimal between + `0.0` and `1.0`. For example, `max_pct=0.20` means at most 20% of values may be missing. + reason + If provided, only count missing values whose reason matches this label. Cannot be + combined with `category=`. + category + If provided, only count missing values whose reason falls in this category (as defined + in `MissingSpec.categories`). Cannot be combined with `reason=`. thresholds Set threshold failure levels for reporting and reacting to exceedences of the levels. The thresholds are set at the step level and will override any global thresholds set in `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. 
+ be set locally and global thresholds (if any) will take effect. actions - Optional actions to take when the validation step meets or exceeds any set threshold + Optional actions to take when the validation step(s) meets or exceeds any set threshold levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to define the actions. brief @@ -10457,73 +10736,13 @@ def rows_distinct( active A boolean value or callable that determines whether the validation step should be active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. + presence and keeping indexes for the steps unchanged). Returns ------- Validate The `Validate` object with the added validation step. - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - columns via `columns_subset=` that are expected to be present in the transformed table, but - may not exist in the table before preprocessing. 
Regarding the lifetime of the transformed - table, it only exists during the validation step and is not stored in the `Validate` object - or used in subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). - - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. 
- - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - Thresholds ---------- The `thresholds=` parameter is used to set the failure-condition levels for the validation @@ -10560,57 +10779,936 @@ def rows_distinct( import pointblank as pb pb.config(report_incl_header=False, report_incl_footer_timings=False, preview_incl_header=False) ``` - For the examples here, we'll use a simple Polars DataFrame with three string columns - (`col_1`, `col_2`, and `col_3`). The table is shown below: + Survey data often encodes missingness with sentinel values rather than nulls. Here, the + `age` column uses `-99` (`"not_asked"`), `-98` (`"refused"`), and `-97` (`"dont_know"`): ```{python} import pointblank as pb import polars as pl tbl = pl.DataFrame( - { - "col_1": ["a", "b", "c", "d"], - "col_2": ["a", "a", "c", "d"], - "col_3": ["a", "a", "d", "e"], + {"age": [34, -98, 41, -99, 29, -98, 55, 38]}, + ) + + age_missing = pb.MissingSpec( + reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}, + categories={"item_nonresponse": ["refused", "dont_know"]}, + ) + + validation = ( + pb.Validate(data=tbl) + .col_pct_missing(columns="age", missing=age_missing, max_pct=0.5) + .col_pct_missing(columns="age", missing=age_missing, reason="refused", max_pct=0.30) + .interrogate() + ) + + validation + ``` + """ + assertion_type = _get_fn_name() + + _check_column(column=columns) + _check_thresholds(thresholds=thresholds) + _check_active_input(param=active, param_name="active") + + if not isinstance(missing, MissingSpec): + raise TypeError(f"`missing=` must be a MissingSpec, got {type(missing).__name__}.") + + if reason is not None and category is not None: + raise ValueError("Only one of 
`reason=` or `category=` can be specified.") + + if not 0.0 <= max_pct <= 1.0: + raise ValueError(f"`max_pct=` must be between 0.0 and 1.0, got {max_pct}.") + + # Resolve which sentinel values (and whether nulls) count as missing for this step + if reason is not None: + sentinels = missing.values_for_reason(reason) + count_null = missing.null_is_missing and missing.null_reason == reason + elif category is not None: + sentinels = missing.values_for_category(category) + cat_reasons = (missing.categories or {}).get(category, []) + count_null = missing.null_is_missing and missing.null_reason in cat_reasons + else: + sentinels = missing.sentinel_values() + count_null = missing.null_is_missing + + # Determine threshold to use (global or local) and normalize a local `thresholds=` value + thresholds = ( + self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + ) + + # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later + # resolve the columns + if isinstance(columns, (ColumnSelector, nw.selectors.Selector)): + columns = col(columns) + + # If `columns` is Column value or a string, place it in a list for iteration + if isinstance(columns, (Column, str)): + columns = [columns] + + # Determine brief to use (global or local) and transform any shorthands of `brief=` + brief = self.brief if brief is None else _transform_auto_brief(brief=brief) + + # Iterate over the columns and create a validation step for each + for column in columns: + val_info = _ValidationInfo( + assertion_type=assertion_type, + column=column, + values={ + "sentinels": sentinels, + "count_null": count_null, + "max_pct": max_pct, + "reason": reason, + "category": category, + "spec": missing, + }, + thresholds=thresholds, + actions=actions, + brief=brief, + active=active, + ) + + self._add_validation(validation_info=val_info) + + return self + + def col_missing_coded( + self, + columns: str | list[str] | Column | ColumnSelector | 
ColumnSelectorNarwhals, + missing: MissingSpec, + pre: Callable | None = None, + segments: SegmentSpec | None = None, + thresholds: int | float | bool | tuple | dict | Thresholds | None = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: + """ + Validate that all missing values in a column are *coded* (no uncoded nulls). + + The `col_missing_coded()` validation method checks that every absent value in a column is + expressed with an explicit missing-value code, rather than a raw null. Under the structured + missingness model (see [`MissingSpec`](`pointblank.MissingSpec`)), every absence should + carry a *reason* — encoded as a sentinel value such as `-99` for `"not_asked"`. A raw null + represents *uncoded* (unknown) missingness, so this validation treats raw nulls as failing + test units while declared sentinel values and real values pass. + + This validation operates over the number of test units equal to the number of rows in the + table (determined after any `pre=` mutation has been applied). + + Parameters + ---------- + columns + A single column or a list of columns to validate. Can also use + [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If + multiple columns are supplied or resolved, there will be a separate validation step + generated for each column. + missing + A [`MissingSpec`](`pointblank.MissingSpec`) describing the sentinel values (and their + reasons) that encode missingness for this column. The spec documents which codes are + considered valid expressions of missingness. + pre + An optional preprocessing function or lambda to apply to the data table during + interrogation. This function should take a table as input and return a modified table. + segments + An optional directive on segmentation, which serves to split a validation step into + multiple (one step per segment). 
+ thresholds + Set threshold failure levels for reporting and reacting to exceedances of the levels. + The thresholds are set at the step level and will override any global thresholds set in + `Validate(thresholds=...)`. + actions + Optional actions to take when the validation step(s) meets or exceeds any set threshold + levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to + define the actions. + brief + An optional brief description of the validation step that will be displayed in the + reporting table. You can use the templating elements like `"{step}"` to insert + the step number, or `"{auto}"` to include an automatically generated brief. If `True` + the entire brief will be automatically generated. If `None` (the default) then there + won't be a brief. + active + A boolean value or callable that determines whether the validation step should be + active. Using `False` will make the validation step inactive (still reporting its + presence and keeping indexes for the steps unchanged). + + Returns + ------- + Validate + The `Validate` object with the added validation step. + + Preprocessing + ------------- + The `pre=` argument allows for a preprocessing function or lambda to be applied to the data + table during interrogation. This function should take a table as input and return a modified + table. This is useful for performing any necessary transformations or filtering on the data + before the validation step is applied. + + Segmentation + ------------ + The `segments=` argument allows for the segmentation of a validation step into multiple + segments. This is useful for applying the same validation step to different subsets of the + data. The segmentation can be done based on a single column or specific fields within a + column. Providing a single column name results in a separate validation step for each unique + value in that column; a tuple of `(column, values)` restricts segmentation to the listed + values.
The segmentation is performed after any `pre=` preprocessing. + + Thresholds + ---------- + The `thresholds=` parameter is used to set the failure-condition levels for the validation + step. If they are set here at the step level, these thresholds will override any thresholds + set at the global level in `Validate(thresholds=...)`. + + There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values + can either be set as a proportion failing of all test units (a value between `0` to `1`), + or, the absolute number of failing test units (as integer that's `1` or greater). + + Thresholds can be defined using one of these input schemes: + + 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create + thresholds) + 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is + the 'error' level, and position `2` is the 'critical' level + 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and + 'critical' + 4. a single integer/float value denoting absolute number or fraction of failing test units + for the 'warning' level only + + If the number of failing test units exceeds set thresholds, the validation step will be + marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be + set, you're free to set any combination of them. + + Aside from reporting failure conditions, thresholds can be used to determine the actions to + take for each level of failure (using the `actions=` parameter).
+ + Examples + -------- + ```{python} + #| echo: false + #| output: false + import pointblank as pb + pb.config(report_incl_header=False, report_incl_footer_timings=False, preview_incl_header=False) + ``` + Here, the `age` column codes its missingness with sentinel values, except for one row that + has a raw null (an uncoded absence): + + ```{python} + import pointblank as pb + import polars as pl + + tbl = pl.DataFrame({"age": [34, -98, 41, None, 29, -99, 55, 38]}) + + age_missing = pb.MissingSpec( + reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}, + ) + + validation = ( + pb.Validate(data=tbl) + .col_missing_coded(columns="age", missing=age_missing) + .interrogate() + ) + + validation + ``` + + The validation reports a single failing test unit: the row where `age` is a raw null, which + represents missingness without a documented reason. + """ + assertion_type = _get_fn_name() + + _check_column(column=columns) + _check_pre(pre=pre) + _check_thresholds(thresholds=thresholds) + _check_active_input(param=active, param_name="active") + + if not isinstance(missing, MissingSpec): + raise TypeError(f"`missing=` must be a MissingSpec, got {type(missing).__name__}.") + + # Determine threshold to use (global or local) and normalize a local `thresholds=` value + thresholds = ( + self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + ) + + columns = _resolve_columns(columns) + + # Determine brief to use (global or local) and transform any shorthands of `brief=` + brief = self.brief if brief is None else _transform_auto_brief(brief=brief) + + # Iterate over the columns and create a validation step for each + for column in columns: + val_info = _ValidationInfo( + assertion_type=assertion_type, + column=column, + values=missing, + pre=pre, + segments=segments, + thresholds=thresholds, + actions=actions, + brief=brief, + active=active, + ) + + self._add_validation(validation_info=val_info) + + return self + + def 
col_missing_only_coded( + self, + columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, + missing: MissingSpec, + allowed: Collection[Any] | None = None, + min_val: float | int | None = None, + max_val: float | int | None = None, + pre: Callable | None = None, + segments: SegmentSpec | None = None, + thresholds: int | float | bool | tuple | dict | Thresholds | None = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: + """ + Validate that a column contains only documented codes and legitimate values. + + The `col_missing_only_coded()` method checks that every value in a column is *accounted + for*: it is either a declared missing-value code (a sentinel in the + [`MissingSpec`](`pointblank.MissingSpec`), or a null when `null_is_missing=True`), or a + legitimate "real" value. Legitimate real values are defined by `allowed=` (an explicit set) + and/or a `[min_val, max_val]` range. Any value that is neither a documented code nor a + legitimate real value is flagged — this catches *undocumented* sentinel codes (e.g., a + stray `-95`) that aren't part of the spec. + + At least one of `allowed=`, `min_val=`, or `max_val=` must be provided so that legitimate + real values can be distinguished from undocumented codes. This validation operates over the + number of test units equal to the number of rows in the table. + + Parameters + ---------- + columns + A single column or a list of columns to validate. Can also use + [`col()`](`pointblank.col`) with column selectors to specify one or more columns. + missing + A [`MissingSpec`](`pointblank.MissingSpec`) declaring the documented sentinel codes. + allowed + An explicit set of legitimate real values. A value in this set passes. Can be combined + with `min_val=`/`max_val=` (a value passes if it satisfies either constraint). + min_val + Lower bound (inclusive) of the legitimate real-value range. 
+ max_val + Upper bound (inclusive) of the legitimate real-value range. + pre + An optional preprocessing function or lambda to apply to the data table during + interrogation. This function should take a table as input and return a modified table. + segments + An optional directive on segmentation, which serves to split a validation step into + multiple (one step per segment). + thresholds + Set threshold failure levels for reporting and reacting to exceedances of the levels. + The thresholds are set at the step level and will override any global thresholds set in + `Validate(thresholds=...)`. + actions + Optional actions to take when the validation step meets or exceeds any set threshold + levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to + define the actions. + brief + An optional brief description of the validation step that will be displayed in the + reporting table. You can use the templating elements like `"{step}"` to insert + the step number, or `"{auto}"` to include an automatically generated brief. If `True` + the entire brief will be automatically generated. If `None` (the default) then there + won't be a brief. + active + A boolean value or callable that determines whether the validation step should be + active. Using `False` will make the validation step inactive (still reporting its + presence and keeping indexes for the steps unchanged). + + Returns + ------- + Validate + The `Validate` object with the added validation step. + + Preprocessing + ------------- + The `pre=` argument allows for a preprocessing function or lambda to be applied to the data + table during interrogation. This function should take a table as input and return a modified + table. This is useful for performing any necessary transformations or filtering on the data + before the validation step is applied. + + Segmentation + ------------ + The `segments=` argument allows for the segmentation of a validation step into multiple + segments.
This is useful for applying the same validation step to different subsets of the + data. The segmentation can be done based on a single column or specific fields within a + column. Providing a single column name results in a separate validation step for each unique + value in that column; a tuple of `(column, values)` restricts segmentation to the listed + values. The segmentation is performed after any `pre=` preprocessing. + + Thresholds + ---------- + The `thresholds=` parameter is used to set the failure-condition levels for the validation + step. If they are set here at the step level, these thresholds will override any thresholds + set at the global level in `Validate(thresholds=...)`. + + There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values + can either be set as a proportion failing of all test units (a value between `0` to `1`), + or, the absolute number of failing test units (as integer that's `1` or greater). + + Thresholds can be defined using one of these input schemes: + + 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create + thresholds) + 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is + the 'error' level, and position `2` is the 'critical' level + 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and + 'critical' + 4. a single integer/float value denoting absolute number or fraction of failing test units + for the 'warning' level only + + If the number of failing test units exceeds set thresholds, the validation step will be + marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be + set, you're free to set any combination of them. + + Aside from reporting failure conditions, thresholds can be used to determine the actions to + take for each level of failure (using the `actions=` parameter). 
+ + Examples + -------- + ```{python} + #| echo: false + #| output: false + import pointblank as pb + pb.config(report_incl_header=False, report_incl_footer_timings=False, preview_incl_header=False) + ``` + The `age` column should contain real ages in `[0, 120]` or the documented codes `-99`/`-98`. + The value `-95` is an *undocumented* code and should be flagged: + + ```{python} + import pointblank as pb + import polars as pl + + tbl = pl.DataFrame({"age": [34, -98, 41, -95, 29, -99, 55]}) + + age_missing = pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) + + validation = ( + pb.Validate(data=tbl) + .col_missing_only_coded(columns="age", missing=age_missing, min_val=0, max_val=120) + .interrogate() + ) + + validation + ``` + + The validation reports one failing test unit: the row where `age` is `-95`, which is + neither a real age in range nor a declared sentinel. + """ + assertion_type = _get_fn_name() + + _check_column(column=columns) + _check_pre(pre=pre) + _check_thresholds(thresholds=thresholds) + _check_active_input(param=active, param_name="active") + + if not isinstance(missing, MissingSpec): + raise TypeError(f"`missing=` must be a MissingSpec, got {type(missing).__name__}.") + + if allowed is None and min_val is None and max_val is None: + raise ValueError( + "`col_missing_only_coded()` requires at least one of `allowed=`, `min_val=`, or " + "`max_val=` so that legitimate real values can be distinguished from undocumented " + "codes." 
+ ) + + sentinels = missing.sentinel_values() + count_null = missing.null_is_missing + allowed_list = list(allowed) if allowed is not None else None + + # Determine threshold to use (global or local) and normalize a local `thresholds=` value + thresholds = ( + self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + ) + + columns = _resolve_columns(columns) + + # Determine brief to use (global or local) and transform any shorthands of `brief=` + brief = self.brief if brief is None else _transform_auto_brief(brief=brief) + + # Iterate over the columns and create a validation step for each + for column in columns: + val_info = _ValidationInfo( + assertion_type=assertion_type, + column=column, + values={ + "sentinels": sentinels, + "count_null": count_null, + "allowed": allowed_list, + "min_val": min_val, + "max_val": max_val, + "spec": missing, + }, + pre=pre, + segments=segments, + thresholds=thresholds, + actions=actions, + brief=brief, + active=active, + ) + + self._add_validation(validation_info=val_info) + + return self + + def rows_distinct( + self, + columns_subset: str | list[str] | None = None, + pre: Callable | None = None, + segments: SegmentSpec | None = None, + thresholds: int | float | bool | tuple | dict | Thresholds | None = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: + """ + Validate whether rows in the table are distinct. + + The `rows_distinct()` method checks whether rows in the table are distinct. This validation + will operate over the number of test units that is equal to the number of rows in the table + (determined after any `pre=` mutation has been applied). + + Parameters + ---------- + columns_subset + A single column or a list of columns to use as a subset for the distinct comparison. + If `None`, then all columns in the table will be used for the comparison. 
If multiple + columns are supplied, the distinct comparison will be made over the combination of + values in those columns. + pre + An optional preprocessing function or lambda to apply to the data table during + interrogation. This function should take a table as input and return a modified table. + Have a look at the *Preprocessing* section for more information on how to use this + argument. + segments + An optional directive on segmentation, which serves to split a validation step into + multiple (one step per segment). Can be a single column name, a tuple that specifies a + column name and its corresponding values to segment on, or a combination of both + (provided as a list). Read the *Segmentation* section for usage information. + thresholds + Set threshold failure levels for reporting and reacting to exceedences of the levels. + The thresholds are set at the step level and will override any global thresholds set in + `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will + be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* + section for information on how to set threshold levels. + actions + Optional actions to take when the validation step meets or exceeds any set threshold + levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to + define the actions. + brief + An optional brief description of the validation step that will be displayed in the + reporting table. You can use the templating elements like `"{step}"` to insert + the step number, or `"{auto}"` to include an automatically generated brief. If `True` + the entire brief will be automatically generated. If `None` (the default) then there + won't be a brief. + active + A boolean value or callable that determines whether the validation step should be + active. Using `False` will make the validation step inactive (still reporting its + presence and keeping indexes for the steps unchanged). 
A callable can also be + provided; it will receive the data table as its single argument and must return a + boolean value. The callable is evaluated *before* any `pre=` processing. Inspection + functions like [`has_columns()`](`pointblank.has_columns`) and + [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step + based on properties of the target table. + + Returns + ------- + Validate + The `Validate` object with the added validation step. + + Preprocessing + ------------- + The `pre=` argument allows for a preprocessing function or lambda to be applied to the data + table during interrogation. This function should take a table as input and return a modified + table. This is useful for performing any necessary transformations or filtering on the data + before the validation step is applied. + + The preprocessing function can be any callable that takes a table as input and returns a + modified table. For example, you could use a lambda function to filter the table based on + certain criteria or to apply a transformation to the data. Note that you can refer to + columns via `columns_subset=` that are expected to be present in the transformed table, but + may not exist in the table before preprocessing. Regarding the lifetime of the transformed + table, it only exists during the validation step and is not stored in the `Validate` object + or used in subsequent validation steps. + + Segmentation + ------------ + The `segments=` argument allows for the segmentation of a validation step into multiple + segments. This is useful for applying the same validation step to different subsets of the + data. The segmentation can be done based on a single column or specific fields within a + column. + + Providing a single column name will result in a separate validation step for each unique + value in that column. 
For example, if you have a column called `"region"` with values + `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each + region. + + Alternatively, you can provide a tuple that specifies a column name and its corresponding + values to segment on. For example, if you have a column called `"date"` and you want to + segment on only specific dates, you can provide a tuple like + `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded + (i.e., no validation steps will be created for them). + + A list with a combination of column names and tuples can be provided as well. This allows + for more complex segmentation scenarios. The following inputs are both valid: + + ``` + # Segments from all unique values in the `region` column + # and specific dates in the `date` column + segments=["region", ("date", ["2023-01-01", "2023-01-02"])] + + # Segments from all unique values in the `region` and `date` columns + segments=["region", "date"] + ``` + + The segmentation is performed during interrogation, and the resulting validation steps will + be numbered sequentially. Each segment will have its own validation step, and the results + will be reported separately. This allows for a more granular analysis of the data and helps + identify issues within specific segments. + + Importantly, the segmentation process will be performed after any preprocessing of the data + table. Because of this, one can conceivably use the `pre=` argument to generate a column + that can be used for segmentation. For example, you could create a new column called + `"segment"` through use of `pre=` and then use that column for segmentation. + + Thresholds + ---------- + The `thresholds=` parameter is used to set the failure-condition levels for the validation + step. If they are set here at the step level, these thresholds will override any thresholds + set at the global level in `Validate(thresholds=...)`. 
+ + There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values + can either be set as a proportion failing of all test units (a value between `0` to `1`), + or, the absolute number of failing test units (as integer that's `1` or greater). + + Thresholds can be defined using one of these input schemes: + + 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create + thresholds) + 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is + the 'error' level, and position `2` is the 'critical' level + 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and + 'critical' + 4. a single integer/float value denoting absolute number or fraction of failing test units + for the 'warning' level only + + If the number of failing test units exceeds set thresholds, the validation step will be + marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be + set, you're free to set any combination of them. + + Aside from reporting failure conditions, thresholds can be used to determine the actions to + take for each level of failure (using the `actions=` parameter). + + Examples + -------- + ```{python} + #| echo: false + #| output: false + import pointblank as pb + pb.config(report_incl_header=False, report_incl_footer_timings=False, preview_incl_header=False) + ``` + For the examples here, we'll use a simple Polars DataFrame with three string columns + (`col_1`, `col_2`, and `col_3`). The table is shown below: + + ```{python} + import pointblank as pb + import polars as pl + + tbl = pl.DataFrame( + { + "col_1": ["a", "b", "c", "d"], + "col_2": ["a", "a", "c", "d"], + "col_3": ["a", "a", "d", "e"], + } + ) + + pb.preview(tbl) + ``` + + Let's validate that the rows in the table are distinct with `rows_distinct()`. We'll + determine if this validation had any failing test units (there are four test units, one for + each row). 
A failing test units means that a given row is not distinct from every other row. + + ```{python} + validation = ( + pb.Validate(data=tbl) + .rows_distinct() + .interrogate() + ) + + validation + ``` + + From this validation table we see that there are no failing test units. All rows in the + table are distinct from one another. + + We can also use a subset of columns to determine distinctness. Let's specify the subset + using columns `col_2` and `col_3` for the next validation. + + ```{python} + validation = ( + pb.Validate(data=tbl) + .rows_distinct(columns_subset=["col_2", "col_3"]) + .interrogate() + ) + + validation + ``` + + The validation table reports two failing test units. The first and second rows are + duplicated when considering only the values in columns `col_2` and `col_3`. There's only + one set of duplicates but there are two failing test units since each row is compared to all + others. + """ + + assertion_type = _get_fn_name() + + _check_pre(pre=pre) + # TODO: add check for segments + # _check_segments(segments=segments) + _check_thresholds(thresholds=thresholds) + _check_active_input(param=active, param_name="active") + + # Determine threshold to use (global or local) and normalize a local `thresholds=` value + thresholds = ( + self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + ) + + if columns_subset is not None and isinstance(columns_subset, str): + columns_subset = [columns_subset] + + # TODO: incorporate Column object + + # Determine brief to use (global or local) and transform any shorthands of `brief=` + brief = self.brief if brief is None else _transform_auto_brief(brief=brief) + + val_info = _ValidationInfo( + assertion_type=assertion_type, + column=columns_subset, + pre=pre, + segments=segments, + thresholds=thresholds, + actions=actions, + brief=brief, + active=active, + ) + + self._add_validation(validation_info=val_info) + + return self + + def rows_complete( + self, + columns_subset: str | 
list[str] | None = None, + pre: Callable | None = None, + segments: SegmentSpec | None = None, + thresholds: int | float | bool | tuple | dict | Thresholds | None = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: + """ + Validate whether row data are complete by having no missing values. + + The `rows_complete()` method checks whether rows in the table are complete. Completeness + of a row means that there are no missing values within the row. This validation will operate + over the number of test units that is equal to the number of rows in the table (determined + after any `pre=` mutation has been applied). A subset of columns can be specified for the + completeness check. If no subset is provided, all columns in the table will be used. + + Parameters + ---------- + columns_subset + A single column or a list of columns to use as a subset for the completeness check. If + `None` (the default), then all columns in the table will be used. + pre + An optional preprocessing function or lambda to apply to the data table during + interrogation. This function should take a table as input and return a modified table. + Have a look at the *Preprocessing* section for more information on how to use this + argument. + segments + An optional directive on segmentation, which serves to split a validation step into + multiple (one step per segment). Can be a single column name, a tuple that specifies a + column name and its corresponding values to segment on, or a combination of both + (provided as a list). Read the *Segmentation* section for usage information. + thresholds + Set threshold failure levels for reporting and reacting to exceedences of the levels. + The thresholds are set at the step level and will override any global thresholds set in + `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will + be set locally and global thresholds (if any) will take effect. 
Look at the *Thresholds* + section for information on how to set threshold levels. + actions + Optional actions to take when the validation step meets or exceeds any set threshold + levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to + define the actions. + brief + An optional brief description of the validation step that will be displayed in the + reporting table. You can use the templating elements like `"{step}"` to insert + the step number, or `"{auto}"` to include an automatically generated brief. If `True` + the entire brief will be automatically generated. If `None` (the default) then there + won't be a brief. + active + A boolean value or callable that determines whether the validation step should be + active. Using `False` will make the validation step inactive (still reporting its + presence and keeping indexes for the steps unchanged). A callable can also be + provided; it will receive the data table as its single argument and must return a + boolean value. The callable is evaluated *before* any `pre=` processing. Inspection + functions like [`has_columns()`](`pointblank.has_columns`) and + [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step + based on properties of the target table. + + Returns + ------- + Validate + The `Validate` object with the added validation step. + + Preprocessing + ------------- + The `pre=` argument allows for a preprocessing function or lambda to be applied to the data + table during interrogation. This function should take a table as input and return a modified + table. This is useful for performing any necessary transformations or filtering on the data + before the validation step is applied. + + The preprocessing function can be any callable that takes a table as input and returns a + modified table. For example, you could use a lambda function to filter the table based on + certain criteria or to apply a transformation to the data. 
Note that you can refer to + columns via `columns_subset=` that are expected to be present in the transformed table, but + may not exist in the table before preprocessing. Regarding the lifetime of the transformed + table, it only exists during the validation step and is not stored in the `Validate` object + or used in subsequent validation steps. + + Segmentation + ------------ + The `segments=` argument allows for the segmentation of a validation step into multiple + segments. This is useful for applying the same validation step to different subsets of the + data. The segmentation can be done based on a single column or specific fields within a + column. + + Providing a single column name will result in a separate validation step for each unique + value in that column. For example, if you have a column called `"region"` with values + `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each + region. + + Alternatively, you can provide a tuple that specifies a column name and its corresponding + values to segment on. For example, if you have a column called `"date"` and you want to + segment on only specific dates, you can provide a tuple like + `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded + (i.e., no validation steps will be created for them). + + A list with a combination of column names and tuples can be provided as well. This allows + for more complex segmentation scenarios. The following inputs are both valid: + + ``` + # Segments from all unique values in the `region` column + # and specific dates in the `date` column + segments=["region", ("date", ["2023-01-01", "2023-01-02"])] + + # Segments from all unique values in the `region` and `date` columns + segments=["region", "date"] + ``` + + The segmentation is performed during interrogation, and the resulting validation steps will + be numbered sequentially. 
Each segment will have its own validation step, and the results + will be reported separately. This allows for a more granular analysis of the data and helps + identify issues within specific segments. + + Importantly, the segmentation process will be performed after any preprocessing of the data + table. Because of this, one can conceivably use the `pre=` argument to generate a column + that can be used for segmentation. For example, you could create a new column called + `"segment"` through use of `pre=` and then use that column for segmentation. + + Thresholds + ---------- + The `thresholds=` parameter is used to set the failure-condition levels for the validation + step. If they are set here at the step level, these thresholds will override any thresholds + set at the global level in `Validate(thresholds=...)`. + + There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values + can either be set as a proportion failing of all test units (a value between `0` to `1`), + or, the absolute number of failing test units (as integer that's `1` or greater). + + Thresholds can be defined using one of these input schemes: + + 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create + thresholds) + 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is + the 'error' level, and position `2` is the 'critical' level + 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and + 'critical' + 4. a single integer/float value denoting absolute number or fraction of failing test units + for the 'warning' level only + + If the number of failing test units exceeds set thresholds, the validation step will be + marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be + set, you're free to set any combination of them. 
+ + Aside from reporting failure conditions, thresholds can be used to determine the actions to + take for each level of failure (using the `actions=` parameter). + + Examples + -------- + ```{python} + #| echo: false + #| output: false + import pointblank as pb + pb.config(report_incl_header=False, report_incl_footer_timings=False, preview_incl_header=False) + ``` + For the examples here, we'll use a simple Polars DataFrame with three string columns + (`col_1`, `col_2`, and `col_3`). The table is shown below: + + ```{python} + import pointblank as pb + import polars as pl + + tbl = pl.DataFrame( + { + "col_1": ["a", None, "c", "d"], + "col_2": ["a", "a", "c", None], + "col_3": ["a", "a", "d", None], } ) pb.preview(tbl) ``` - Let's validate that the rows in the table are distinct with `rows_distinct()`. We'll + Let's validate that the rows in the table are complete with `rows_complete()`. We'll determine if this validation had any failing test units (there are four test units, one for - each row). A failing test units means that a given row is not distinct from every other row. + each row). A failing test units means that a given row is not complete (i.e., has at least + one missing value). ```{python} validation = ( pb.Validate(data=tbl) - .rows_distinct() + .rows_complete() .interrogate() ) validation ``` - From this validation table we see that there are no failing test units. All rows in the - table are distinct from one another. + From this validation table we see that there are two failing test units. This is because + two rows in the table have at least one missing value (the second row and the last row). - We can also use a subset of columns to determine distinctness. Let's specify the subset + We can also use a subset of columns to determine completeness. Let's specify the subset using columns `col_2` and `col_3` for the next validation. 
```{python} validation = ( pb.Validate(data=tbl) - .rows_distinct(columns_subset=["col_2", "col_3"]) + .rows_complete(columns_subset=["col_2", "col_3"]) .interrogate() ) validation ``` - The validation table reports two failing test units. The first and second rows are - duplicated when considering only the values in columns `col_2` and `col_3`. There's only - one set of duplicates but there are two failing test units since each row is compared to all + The validation table reports a single failing test units. The last row contains missing + values in both the `col_2` and `col_3` columns. others. """ @@ -10627,8 +11725,8 @@ def rows_distinct( self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) ) - if columns_subset is not None and isinstance(columns_subset, str): - columns_subset = [columns_subset] + if columns_subset is not None and isinstance(columns_subset, str): # pragma: no cover + columns_subset = [columns_subset] # pragma: no cover # TODO: incorporate Column object @@ -10650,9 +11748,11 @@ def rows_distinct( return self - def rows_complete( + def col_missing_consistent( self, - columns_subset: str | list[str] | None = None, + columns: list[str], + missing: MissingSpec, + when_reason: str, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -10661,35 +11761,41 @@ def rows_complete( active: bool | Callable = True, ) -> Validate: """ - Validate whether row data are complete by having no missing values. + Validate that related columns share a consistent missingness pattern for a given reason. - The `rows_complete()` method checks whether rows in the table are complete. Completeness - of a row means that there are no missing values within the row. This validation will operate - over the number of test units that is equal to the number of rows in the table (determined - after any `pre=` mutation has been applied). 
A subset of columns can be specified for the - completeness check. If no subset is provided, all columns in the table will be used. + The `col_missing_consistent()` method checks that, across a set of related columns, the + "missing for a specific reason" status is *consistent*: for each row, either *none* of the + columns are missing for `when_reason=`, or *all* of them are. This is useful for structured + survey or clinical data where a skip pattern should propagate across related fields — for + example, if a question wasn't asked (`"not_asked"`) then all of its dependent fields should + also be coded `"not_asked"`. + + A value is considered "missing for the reason" when it is one of the sentinel values mapped + to `when_reason=` in the [`MissingSpec`](`pointblank.MissingSpec`) (and, when the reason is + the spec's `null_reason` and `null_is_missing=True`, an actual null). This validation + operates over the number of test units equal to the number of rows in the table. A row fails + when some — but not all — of the columns are missing for the given reason. Parameters ---------- - columns_subset - A single column or a list of columns to use as a subset for the completeness check. If - `None` (the default), then all columns in the table will be used. + columns + A list of related columns to check for consistent missingness. + missing + A [`MissingSpec`](`pointblank.MissingSpec`) describing the sentinel values and their + reasons for the columns. + when_reason + The reason label whose presence should be consistent across `columns=`. If one column + in a row is missing for this reason, all of them should be. pre An optional preprocessing function or lambda to apply to the data table during interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. 
segments An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. + multiple (one step per segment). thresholds Set threshold failure levels for reporting and reacting to exceedences of the levels. The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. + `Validate(thresholds=...)`. actions Optional actions to take when the validation step meets or exceeds any set threshold levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to @@ -10703,12 +11809,7 @@ def rows_complete( active A boolean value or callable that determines whether the validation step should be active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. + presence and keeping indexes for the steps unchanged). Returns ------- @@ -10722,53 +11823,14 @@ def rows_complete( table. This is useful for performing any necessary transformations or filtering on the data before the validation step is applied. 
- The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - columns via `columns_subset=` that are expected to be present in the transformed table, but - may not exist in the table before preprocessing. Regarding the lifetime of the transformed - table, it only exists during the validation step and is not stored in the `Validate` object - or used in subsequent validation steps. - Segmentation ------------ The `segments=` argument allows for the segmentation of a validation step into multiple segments. This is useful for applying the same validation step to different subsets of the data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). - - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. 
The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. + column. Providing a single column name results in a separate validation step for each unique + value in that column; a tuple of `(column, values)` restricts segmentation to the listed + values. The segmentation is performed after any `pre=` preprocessing. Thresholds ---------- @@ -10806,8 +11868,9 @@ def rows_complete( import pointblank as pb pb.config(report_incl_header=False, report_incl_footer_timings=False, preview_incl_header=False) ``` - For the examples here, we'll use a simple Polars DataFrame with three string columns - (`col_1`, `col_2`, and `col_3`). The table is shown below: + Here, `income_source` and `income_amount` should both be coded `"not_asked"` (`-99`) together + when the income question wasn't asked. 
The last row is inconsistent — only one field is + coded `-99`: ```{python} import pointblank as pb @@ -10815,75 +11878,65 @@ def rows_complete( tbl = pl.DataFrame( { - "col_1": ["a", None, "c", "d"], - "col_2": ["a", "a", "c", None], - "col_3": ["a", "a", "d", None], + "income_source": [1, -99, 2, -99], + "income_amount": [50000, -99, 42000, 38000], } ) - pb.preview(tbl) - ``` - - Let's validate that the rows in the table are complete with `rows_complete()`. We'll - determine if this validation had any failing test units (there are four test units, one for - each row). A failing test units means that a given row is not complete (i.e., has at least - one missing value). - - ```{python} - validation = ( - pb.Validate(data=tbl) - .rows_complete() - .interrogate() - ) - - validation - ``` - - From this validation table we see that there are two failing test units. This is because - two rows in the table have at least one missing value (the second row and the last row). - - We can also use a subset of columns to determine completeness. Let's specify the subset - using columns `col_2` and `col_3` for the next validation. + income_missing = pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) - ```{python} validation = ( pb.Validate(data=tbl) - .rows_complete(columns_subset=["col_2", "col_3"]) + .col_missing_consistent( + columns=["income_source", "income_amount"], + missing=income_missing, + when_reason="not_asked", + ) .interrogate() ) validation ``` - The validation table reports a single failing test units. The last row contains missing - values in both the `col_2` and `col_3` columns. - others. + The validation reports one failing test unit: the final row, where `income_source` is coded + `-99` (`"not_asked"`) but `income_amount` is a real value. 
""" - assertion_type = _get_fn_name() _check_pre(pre=pre) - # TODO: add check for segments - # _check_segments(segments=segments) _check_thresholds(thresholds=thresholds) _check_active_input(param=active, param_name="active") + if not isinstance(missing, MissingSpec): + raise TypeError(f"`missing=` must be a MissingSpec, got {type(missing).__name__}.") + + if isinstance(columns, str): + columns = [columns] + columns = list(columns) + if len(columns) < 2: + raise ValueError("`col_missing_consistent()` requires at least two columns to compare.") + + # Resolve which sentinel values (and whether nulls) represent `when_reason` + sentinels = missing.values_for_reason(when_reason) + count_null = missing.null_is_missing and missing.null_reason == when_reason + # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) ) - if columns_subset is not None and isinstance(columns_subset, str): # pragma: no cover - columns_subset = [columns_subset] # pragma: no cover - - # TODO: incorporate Column object - # Determine brief to use (global or local) and transform any shorthands of `brief=` brief = self.brief if brief is None else _transform_auto_brief(brief=brief) val_info = _ValidationInfo( assertion_type=assertion_type, - column=columns_subset, + column=columns, + values={ + "sentinels": sentinels, + "count_null": count_null, + "when_reason": when_reason, + "spec": missing, + }, pre=pre, segments=segments, thresholds=thresholds, @@ -13440,6 +14493,15 @@ def interrogate( validation.autobrief = autobrief + # If the step carries structured-missingness context (a `missing=` spec or a dedicated + # missing method), attach a one-line note summarizing the codes and any reason/range + # filter. This keeps the VALUES cell minimal while surfacing detail in the Notes section. 
+ missing_note = _build_missing_note(validation) + if missing_note is not None: + validation._add_note( + key="missing_spec", markdown=missing_note[0], text=missing_note[1] + ) + # ------------------------------------------------ # Bypassing the validation step if conditions met # ------------------------------------------------ @@ -13769,6 +14831,8 @@ def interrogate( "col_vals_le", "col_vals_null", "col_vals_not_null", + "col_missing_coded", + "col_missing_only_coded", "col_vals_increasing", "col_vals_decreasing", "col_vals_between", @@ -13811,6 +14875,18 @@ def interrogate( results_tbl = interrogate_null(tbl=tbl, column=column) elif assertion_method == "not_null": results_tbl = interrogate_not_null(tbl=tbl, column=column) + elif assertion_method == "missing_coded": + results_tbl = interrogate_missing_coded(tbl=tbl, column=column) + elif assertion_method == "missing_only_coded": + results_tbl = interrogate_missing_only_coded( + tbl=tbl, + column=column, + sentinels=value["sentinels"], + count_null=value["count_null"], + allowed=value["allowed"], + min_val=value["min_val"], + max_val=value["max_val"], + ) elif assertion_type == "col_vals_increasing": from pointblank._interrogation import interrogate_increasing @@ -13882,6 +14958,14 @@ def interrogate( tbl=tbl, column=column, values=value, na_pass=na_pass ) + # Apply structured-missingness exclusion: any row whose value is a + # declared sentinel (or a null when `null_is_missing=True`) is treated + # as a passing test unit, so only the "real" values are validated + if validation.missing is not None and results_tbl is not None: + results_tbl = apply_missing_exclusion( + results_tbl=results_tbl, column=column, spec=validation.missing + ) + elif assertion_type == "col_pct_null": result_bool = col_pct_null( data_tbl=data_tbl_step, @@ -13897,6 +14981,22 @@ def interrogate( results_tbl = None + elif assertion_type == "col_pct_missing": + result_bool = col_pct_missing( + data_tbl=data_tbl_step, + column=column, + 
sentinels=value["sentinels"], + count_null=value["count_null"], + max_pct=value["max_pct"], + ) + + validation.all_passed = result_bool + validation.n = 1 + validation.n_passed = int(result_bool) + validation.n_failed = 1 - int(result_bool) + + results_tbl = None + elif assertion_type == "col_vals_expr": results_tbl = col_vals_expr( data_tbl=data_tbl_step, expr=value, tbl_type=tbl_type @@ -13910,6 +15010,14 @@ def interrogate( elif assertion_type == "rows_complete": results_tbl = rows_complete(data_tbl=data_tbl_step, columns_subset=column) + elif assertion_type == "col_missing_consistent": + results_tbl = interrogate_missing_consistent( + tbl=data_tbl_step, + columns=column, + sentinels=value["sentinels"], + count_null=value["count_null"], + ) + elif assertion_type == "prompt": from pointblank._interrogation import interrogate_prompt @@ -14531,7 +15639,8 @@ def interrogate( if ( collect_extracts and assertion_type - in ROW_BASED_VALIDATION_TYPES + ["rows_distinct", "rows_complete"] + in ROW_BASED_VALIDATION_TYPES + + ["rows_distinct", "rows_complete", "col_missing_consistent"] and tbl_type not in IBIS_BACKENDS ): # Add row numbers to the results table @@ -16956,12 +18065,25 @@ def get_tabular_report( elif assertion_type[i] in [ "col_vals_null", "col_vals_not_null", + "col_missing_coded", "col_exists", "rows_distinct", "rows_complete", ]: values_upd.append("—") + elif assertion_type[i] in ["col_missing_consistent"]: + # Minimal cell: a compact badge (the reason and columns live in the step note) + values_upd.append( + "CONSISTENT" + ) + + elif assertion_type[i] in ["col_missing_only_coded"]: + # Minimal cell: a compact badge (allowed values/range live in the step note) + values_upd.append( + "ONLY CODED" + ) + elif assertion_type[i] in ["col_pct_null"]: # Extract p and tol from the values dict for nice formatting p_value = value["p"] @@ -16971,6 +18093,10 @@ def get_tabular_report( tol_value = bound_finder.keywords.get("tol", 0) if bound_finder else 0 
values_upd.append(f"p = {p_value}
tol = {tol_value}") + elif assertion_type[i] in ["col_pct_missing"]: + # Minimal cell: just the threshold (reason/category detail lives in the step note) + values_upd.append(f"≤ {value['max_pct']}") + elif assertion_type[i] in ["data_freshness"]: # Format max_age nicely for display max_age = value.get("max_age") @@ -17105,6 +18231,20 @@ def get_tabular_report( else: # pragma: no cover values_upd.append(str(value)) # pragma: no cover + # Annotate `col_vals_*` steps that carry a `missing=` MissingSpec so the report shows that + # structured-missing values (sentinels and, optionally, nulls) were excluded from the check. + # The `missing` spec is fetched directly from the validation steps (it isn't a report field). + missing_specs = [getattr(v, "missing", None) for v in self.validation_info] + for i, spec in enumerate(missing_specs): + if spec is None or i >= len(values_upd): + continue + # Keep the cell minimal: a compact badge. The reason/code detail lives in the step note. + annotation = ( + "
MISSING-AWARE" + ) + values_upd[i] = f"{values_upd[i]}{annotation}" + # Remove the `inclusive` entry from the dictionary validation_info_dict.pop("inclusive") @@ -17829,7 +18969,10 @@ def get_step_report( # if get_row_count(extract) == 0: # return "No rows were extracted." - if assertion_type in ROW_BASED_VALIDATION_TYPES + ["rows_complete"]: + if assertion_type in ROW_BASED_VALIDATION_TYPES + [ + "rows_complete", + "col_missing_consistent", + ]: # Get the extracted data for the step extract = self.get_data_extracts(i=i, frame=True) @@ -17911,6 +19054,24 @@ def get_step_report( else: step_report = None # pragma: no cover + # If the step is associated with a MissingSpec, append a legend of the missing-value codes + # and their reasons so that sentinel values appearing in the failing rows can be interpreted + step_spec = getattr(self.validation_info[i - 1], "missing", None) + if step_spec is None and isinstance(values, MissingSpec): + # col_missing_coded stores the spec directly in `values` + step_spec = values + if ( + step_spec is None + and isinstance(values, dict) + and isinstance(values.get("spec"), MissingSpec) + ): + # col_missing_only_coded and col_missing_consistent stash the spec under `values["spec"]` + step_spec = values["spec"] + if step_spec is not None and step_report is not None: + legend_html = _missing_legend_html(step_spec) + if legend_html and hasattr(step_report, "tab_source_note"): + step_report = step_report.tab_source_note(source_note=html(legend_html)) + return step_report def get_dataframe_report( @@ -19284,6 +20445,37 @@ def _create_autobrief_or_failure_text( n_rows=n_rows, ) + if assertion_type == "col_pct_missing": + return _create_text_col_pct_missing( + lang=lang, + column=column, + value=values, + for_failure=for_failure, + locale=locale if locale else lang, + ) + + if assertion_type == "col_missing_coded": + return _create_text_col_missing_coded( + lang=lang, + column=column, + for_failure=for_failure, + ) + + if assertion_type == 
"col_missing_only_coded": + return _create_text_col_missing_only_coded( + lang=lang, + column=column, + for_failure=for_failure, + ) + + if assertion_type == "col_missing_consistent": + return _create_text_col_missing_consistent( + lang=lang, + columns=column, + value=values, + for_failure=for_failure, + ) + if assertion_type == "conjointly": return _create_text_conjointly(lang=lang, for_failure=for_failure) @@ -19693,6 +20885,72 @@ def _create_text_col_pct_null( return text +def _create_text_col_pct_missing( + lang: str, + column: str | None, + value: dict, + for_failure: bool = False, + locale: str | None = None, +) -> str: + """Create autobrief/failure text for col_pct_missing validation.""" + type_ = _expect_failure_type(for_failure=for_failure) + + column_text = _prep_column_text(column=column) + + fmt_locale = locale if locale else lang + + max_pct_value = value.get("max_pct", 0) * 100 # Convert to percentage + max_pct_formatted = _format_number_safe(max_pct_value, decimals=1, locale=fmt_locale) + + return EXPECT_FAIL_TEXT[f"col_pct_missing_{type_}_text"][lang].format( + column_text=column_text, + max_pct=max_pct_formatted, + ) + + +def _create_text_col_missing_coded(lang: str, column: str | None, for_failure: bool = False) -> str: + """Create autobrief/failure text for col_missing_coded validation.""" + type_ = _expect_failure_type(for_failure=for_failure) + + column_text = _prep_column_text(column=column) + + return EXPECT_FAIL_TEXT[f"col_missing_coded_{type_}_text"][lang].format( + column_text=column_text, + ) + + +def _create_text_col_missing_only_coded( + lang: str, column: str | None, for_failure: bool = False +) -> str: + """Create autobrief/failure text for col_missing_only_coded validation.""" + type_ = _expect_failure_type(for_failure=for_failure) + + column_text = _prep_column_text(column=column) + + return EXPECT_FAIL_TEXT[f"col_missing_only_coded_{type_}_text"][lang].format( + column_text=column_text, + ) + + +def 
_create_text_col_missing_consistent( + lang: str, columns: Any, value: dict, for_failure: bool = False +) -> str: + """Create autobrief/failure text for col_missing_consistent validation.""" + type_ = _expect_failure_type(for_failure=for_failure) + + if isinstance(columns, (list, tuple)): + columns_text = _prep_values_text(values=list(columns), lang=lang, limit=5) + else: + columns_text = _prep_column_text(column=columns) + + reason = value.get("when_reason") if isinstance(value, dict) else None + + return EXPECT_FAIL_TEXT[f"col_missing_consistent_{type_}_text"][lang].format( + columns_text=columns_text, + reason=reason, + ) + + def _create_text_conjointly(lang: str, for_failure: bool = False) -> str: type_ = _expect_failure_type(for_failure=for_failure) @@ -20040,6 +21298,91 @@ def _apply_segments(data_tbl: Any, segments_expr: tuple[str, str]) -> Any: return data_tbl +def _resolve_step_missing_spec(validation: Any) -> Any: + """Return the `MissingSpec` associated with a validation step, if any. + + The spec lives in different places depending on the method: on `validation.missing` for + `col_vals_*` steps that used `missing=`; directly in `validation.values` for `col_missing_coded`; + and under `validation.values["spec"]` for `col_pct_missing`, `col_missing_only_coded`, and + `col_missing_consistent`. + """ + spec = getattr(validation, "missing", None) + if spec is not None: + return spec + vals = getattr(validation, "values", None) + if isinstance(vals, MissingSpec): + return vals + if isinstance(vals, dict) and isinstance(vals.get("spec"), MissingSpec): + return vals["spec"] + return None + + +def _build_missing_note(validation: Any) -> tuple[str, str] | None: + """Build a one-line (markdown, text) note summarizing a step's structured-missingness context. + + Returns `None` when the step has no associated `MissingSpec`. 
+ """ + spec = _resolve_step_missing_spec(validation) + if spec is None or not hasattr(spec, "reasons"): + return None + + codes_md = ", ".join(f"`{value}`→{reason}" for value, reason in spec.reasons.items()) + codes_tx = ", ".join(f"{value}->{reason}" for value, reason in spec.reasons.items()) + if getattr(spec, "null_is_missing", False): + codes_md += f", `null`→{spec.null_reason}" + codes_tx += f", null->{spec.null_reason}" + + md = f"**Missing codes:** {codes_md}" + tx = f"Missing codes: {codes_tx}" + + # Method-specific context appended to the one-line summary + assertion_type = getattr(validation, "assertion_type", None) + vals = getattr(validation, "values", None) + + if assertion_type == "col_pct_missing" and isinstance(vals, dict): + if vals.get("reason") is not None: + md += f". Counting reason `{vals['reason']}`" + tx += f". Counting reason {vals['reason']}" + elif vals.get("category") is not None: + md += f". Counting category `{vals['category']}`" + tx += f". Counting category {vals['category']}" + elif assertion_type == "col_missing_only_coded" and isinstance(vals, dict): + bits_md = [] + bits_tx = [] + if vals.get("allowed") is not None: + allowed_str = ", ".join(str(a) for a in vals["allowed"]) + bits_md.append(f"allowed {{{allowed_str}}}") + bits_tx.append(f"allowed {{{allowed_str}}}") + if vals.get("min_val") is not None or vals.get("max_val") is not None: + rng = f"[{vals.get('min_val')}, {vals.get('max_val')}]" + bits_md.append(f"range {rng}") + bits_tx.append(f"range {rng}") + if bits_md: + md += f". Legitimate values: {', '.join(bits_md)}" + tx += f". Legitimate values: {', '.join(bits_tx)}" + elif assertion_type == "col_missing_consistent" and isinstance(vals, dict): + if vals.get("when_reason") is not None: + md += f". Consistency required for reason `{vals['when_reason']}`" + tx += f". 
Consistency required for reason {vals['when_reason']}" + + return md, tx + + +def _missing_legend_html(spec: Any) -> str: + """Build an HTML legend of a MissingSpec's sentinel codes and their reasons, for step reports.""" + if not hasattr(spec, "reasons"): + return "" + items = [f"{value} → {reason}" for value, reason in spec.reasons.items()] + if getattr(spec, "null_is_missing", False): + items.append(f"null → {spec.null_reason}") + if not items: + return "" + return ( + "
" + "Missing codes: " + "; ".join(items) + "
" + ) + + def _validation_info_as_dict(validation_info: _ValidationInfo) -> dict: """ Convert a `_ValidationInfo` object to a dictionary. @@ -21766,6 +23109,17 @@ def _step_report_row_based( text = STEP_REPORT_TEXT["rows_complete_all"][lang] else: text = STEP_REPORT_TEXT["rows_complete_subset"][lang] + elif assertion_type == "col_missing_coded": + text = f"{column} is missing-coded" + elif assertion_type == "col_missing_only_coded": + text = f"{column} only documented codes" + elif assertion_type == "col_missing_consistent": + cols = ", ".join(column) if isinstance(column, (list, tuple)) else str(column) + reason = values.get("when_reason") if isinstance(values, dict) else None + text = f"consistent “{reason}” across {{{cols}}}" + else: + # Fallback for any other assertion type: show the assertion type name + text = str(assertion_type) # Wrap assertion text in a tag text = ( diff --git a/pointblank/validate.pyi b/pointblank/validate.pyi index 82a7664de..72df3911c 100644 --- a/pointblank/validate.pyi +++ b/pointblank/validate.pyi @@ -7,6 +7,7 @@ from pathlib import Path from pointblank._typing import SegmentSpec, Tolerance from pointblank._utils import _PBUnresolvedColumn from pointblank.column import Column, ColumnSelector, ColumnSelectorNarwhals, ReferenceColumn +from pointblank.missing import MissingSpec from pointblank.schema import Schema from pointblank.thresholds import Actions, FinalActions, Thresholds from typing import Any, Callable, Literal, ParamSpec, TypeVar @@ -77,7 +78,9 @@ def preview( min_tbl_width: int = 500, incl_header: bool | None = None, ) -> GT: ... -def missing_vals_tbl(data: Any) -> GT: ... +def missing_vals_tbl( + data: Any, missing: dict[str, MissingSpec] | None = None, as_heatmap: bool = False +) -> GT: ... def get_column_count(data: Any) -> int: ... def get_row_count(data: Any) -> int: ... @dataclass @@ -103,6 +106,7 @@ class _ValidationInfo: values: Any | list[Any] | tuple | None = ... inclusive: tuple[bool, bool] | None = ... 
na_pass: bool | None = ... + missing: Any | None = ... pre: Callable | None = ... segments: Any | None = ... thresholds: Thresholds | None = ... @@ -178,6 +182,7 @@ class Validate: columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -190,6 +195,7 @@ class Validate: columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -202,6 +208,7 @@ class Validate: columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -214,6 +221,7 @@ class Validate: columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -226,6 +234,7 @@ class Validate: columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -238,6 +247,7 @@ class Validate: columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, value: float | int | 
Column, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -252,6 +262,7 @@ class Validate: right: float | int | Column, inclusive: tuple[bool, bool] = (True, True), na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -266,6 +277,7 @@ class Validate: right: float | int | Column, inclusive: tuple[bool, bool] = (True, True), na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -277,6 +289,7 @@ class Validate: self, columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, set: Collection[Any], + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -288,6 +301,7 @@ class Validate: self, columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, set: Collection[Any], + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -301,6 +315,7 @@ class Validate: allow_stationary: bool = False, decreasing_tol: float | None = None, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -314,6 +329,7 @@ class Validate: allow_stationary: bool = False, increasing_tol: float | None = None, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None 
= None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -347,6 +363,7 @@ class Validate: pattern: str, na_pass: bool = False, inverse: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -359,6 +376,7 @@ class Validate: columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, spec: str, na_pass: bool = False, + missing: MissingSpec | None = None, pre: Callable | None = None, segments: SegmentSpec | None = None, thresholds: int | float | bool | tuple | dict | Thresholds | None = None, @@ -394,6 +412,43 @@ class Validate: brief: str | bool | None = None, active: bool | Callable = True, ) -> Validate: ... + def col_pct_missing( + self, + columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, + missing: MissingSpec, + max_pct: float, + reason: str | None = None, + category: str | None = None, + thresholds: int | float | None | bool | tuple | dict | Thresholds = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: ... + def col_missing_coded( + self, + columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, + missing: MissingSpec, + pre: Callable | None = None, + segments: SegmentSpec | None = None, + thresholds: int | float | bool | tuple | dict | Thresholds | None = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: ... 
+ def col_missing_only_coded( + self, + columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals, + missing: MissingSpec, + allowed: Collection[Any] | None = None, + min_val: float | int | None = None, + max_val: float | int | None = None, + pre: Callable | None = None, + segments: SegmentSpec | None = None, + thresholds: int | float | bool | tuple | dict | Thresholds | None = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: ... def rows_distinct( self, columns_subset: str | list[str] | None = None, @@ -414,6 +469,18 @@ class Validate: brief: str | bool | None = None, active: bool | Callable = True, ) -> Validate: ... + def col_missing_consistent( + self, + columns: list[str], + missing: MissingSpec, + when_reason: str, + pre: Callable | None = None, + segments: SegmentSpec | None = None, + thresholds: int | float | bool | tuple | dict | Thresholds | None = None, + actions: Actions | None = None, + brief: str | bool | None = None, + active: bool | Callable = True, + ) -> Validate: ... def prompt( self, prompt: str, diff --git a/pointblank/yaml.py b/pointblank/yaml.py index 4bdd67cd2..22ef133a8 100644 --- a/pointblank/yaml.py +++ b/pointblank/yaml.py @@ -8,9 +8,40 @@ from pointblank._agg import is_valid_agg from pointblank._utils import _is_lib_present +from pointblank.missing import MissingSpec from pointblank.thresholds import Actions from pointblank.validate import Validate, load_dataset + +def _missing_spec_from_dict(spec_def: dict) -> MissingSpec: + """Build a `MissingSpec` from a YAML mapping.""" + if not isinstance(spec_def, dict): + raise YAMLValidationError( + f"A missing spec must be a mapping, got {type(spec_def).__name__}." 
+ ) + return MissingSpec( + reasons=spec_def.get("reasons", {}), + categories=spec_def.get("categories"), + null_is_missing=spec_def.get("null_is_missing", True), + null_reason=spec_def.get("null_reason", "unknown"), + description=spec_def.get("description"), + ) + + +def _missing_spec_to_code(spec: MissingSpec) -> str: + """Render a `MissingSpec` as a `pb.MissingSpec(...)` constructor call for code generation.""" + parts = [f"reasons={spec.reasons!r}"] + if spec.categories is not None: + parts.append(f"categories={spec.categories!r}") + if spec.null_is_missing is not True: + parts.append(f"null_is_missing={spec.null_is_missing!r}") + if spec.null_reason != "unknown": + parts.append(f"null_reason={spec.null_reason!r}") + if spec.description is not None: + parts.append(f"description={spec.description!r}") + return f"pb.MissingSpec({', '.join(parts)})" + + if TYPE_CHECKING: from typing import Literal @@ -243,6 +274,10 @@ class YAMLValidator: "col_vals_decreasing": "col_vals_decreasing", "col_vals_within_spec": "col_vals_within_spec", "col_pct_null": "col_pct_null", + "col_pct_missing": "col_pct_missing", + "col_missing_coded": "col_missing_coded", + "col_missing_only_coded": "col_missing_only_coded", + "col_missing_consistent": "col_missing_consistent", "rows_distinct": "rows_distinct", "rows_complete": "rows_complete", "col_count_match": "col_count_match", @@ -332,6 +367,7 @@ def _validate_schema(self, config: dict) -> None: "steps", "tbl_name", "label", + "missing_specs", "thresholds", "actions", "final_actions", @@ -608,10 +644,45 @@ def _parse_schema_spec(self, schema_spec: Any) -> Any: f"Schema specification must be a dictionary, got: {type(schema_spec)}" ) + def _parse_missing_specs(self, config: dict) -> dict[str, MissingSpec]: + """Parse the top-level `missing_specs` block into named `MissingSpec` objects.""" + raw = config.get("missing_specs") + if raw is None: + return {} + if not isinstance(raw, dict): + raise YAMLValidationError("'missing_specs' must be a 
dictionary of named specs") + return {name: _missing_spec_from_dict(spec_def) for name, spec_def in raw.items()} + + def _resolve_missing( + self, value: Any, missing_specs: Optional[dict[str, MissingSpec]] + ) -> MissingSpec: + """Resolve a step's `missing=` value to a `MissingSpec`. + + The value can be a named reference into the top-level `missing_specs` block, an inline + mapping defining a spec, or an already-constructed `MissingSpec`. + """ + if isinstance(value, MissingSpec): + return value + if isinstance(value, str): + if not missing_specs or value not in missing_specs: + available = sorted(missing_specs.keys()) if missing_specs else [] + raise YAMLValidationError( + f"Unknown missing spec '{value}'. Define it under the top-level " + f"'missing_specs' block. Available: {available}" + ) + return missing_specs[value] + if isinstance(value, dict): + return _missing_spec_from_dict(value) + raise YAMLValidationError( + f"Invalid 'missing' value: {value!r}. Use a named reference, an inline mapping, " + "or a MissingSpec." + ) + def _parse_validation_step( self, step_config: Union[str, dict], namespaces: Optional[Union[Iterable[str], Mapping[str, str]]] = None, + missing_specs: Optional[dict[str, MissingSpec]] = None, ) -> tuple[str, dict]: """Parse a single validation step from YAML configuration. 
@@ -676,6 +747,10 @@ def _parse_validation_step( # (e.g., `active: pb.has_columns("col_a")` or `active: false`) elif key == "active" and isinstance(value, str): processed_parameters[key] = _safe_eval_python_code(value, namespaces=namespaces) + elif key == "missing": + # Pass the raw value through (a spec name, inline mapping, or MissingSpec); it is + # resolved to a MissingSpec below, after the loop + processed_parameters[key] = value else: # Normal processing (requires python: block syntax) processed_parameters[key] = _process_python_expressions( @@ -683,6 +758,11 @@ def _parse_validation_step( ) parameters = processed_parameters + # Resolve a `missing=` parameter (used by col_pct_missing, col_missing_coded) into a + # MissingSpec, looking up named references in the top-level `missing_specs` block + if "missing" in parameters: + parameters["missing"] = self._resolve_missing(parameters["missing"], missing_specs) + # Convert `columns=` specification if "columns" in parameters: parameters["columns"] = self._parse_column_spec(parameters["columns"]) @@ -832,10 +912,13 @@ def build_validation( validation = Validate(data, **validate_kwargs) + # Parse any named missing specs declared at the top level + missing_specs = self._parse_missing_specs(config) + # Add validation steps for step_config in config["steps"]: method_name, parameters = self._parse_validation_step( - step_config, namespaces=namespaces + step_config, namespaces=namespaces, missing_specs=missing_specs ) # Get the method from the validation object @@ -1644,6 +1727,9 @@ def extract_python_expressions(obj, path=""): validator = YAMLValidator() config = validator.load_config(yaml) + # Parse any named missing specs so steps referencing them can be rendered + missing_specs = validator._parse_missing_specs(config) + # Start building the Python code code_lines = [] @@ -1780,7 +1866,9 @@ def extract_python_expressions(obj, path=""): # Handle string steps (parameterless methods like "rows_distinct") if 
isinstance(step_config, str): - method_name, parameters = validator._parse_validation_step(step_config, namespaces=None) + method_name, parameters = validator._parse_validation_step( + step_config, namespaces=None, missing_specs=missing_specs + ) code_lines.append(f" .{method_name}()") continue @@ -1802,7 +1890,9 @@ def extract_python_expressions(obj, path=""): elif isinstance(step_params["expr"], str): original_expressions["expr"] = step_params["expr"] - method_name, parameters = validator._parse_validation_step(step_config, namespaces=None) + method_name, parameters = validator._parse_validation_step( + step_config, namespaces=None, missing_specs=missing_specs + ) # Apply the original expressions to override the converted lambda functions if method_name == "conjointly" and "expressions" in original_expressions: @@ -1852,6 +1942,9 @@ def extract_python_expressions(obj, path=""): param_parts.append(f"{key}={columns_str}") else: param_parts.append(f'{key}="{value}"') # pragma: no cover + elif key == "missing" and isinstance(value, MissingSpec): + # Render a resolved MissingSpec as a `pb.MissingSpec(...)` constructor call + param_parts.append(f"missing={_missing_spec_to_code(value)}") elif key == "brief": # Handle `brief=` parameter: can be a boolean or a string if isinstance(value, bool): diff --git a/pyproject.toml b/pyproject.toml index 2b4c897ac..314790628 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,7 @@ docs = [ "jupyter", "nbclient>=0.10.0", "nbformat>=5.10.4", - "pandas>=2.2.3", + "pandas>=2.2.3,<3", # <3: pandas 3.0's default `str` dtype is unsupported by the pinned duckdb (<1.3.3) "polars>=1.17.1", "pyspark==3.5.6", "openpyxl>=3.0.0", @@ -94,7 +94,7 @@ dev = [ "jupyter", "nbclient>=0.10.0", "nbformat>=5.10.4", - "pandas>=2.2.3", + "pandas>=2.2.3,<3", # <3: pandas 3.0's default `str` dtype is unsupported by the pinned duckdb (<1.3.3) "polars>=1.17.1", "pre-commit==2.15.0", "pyarrow", diff --git a/tests/test_col_missing_coded.py 
b/tests/test_col_missing_coded.py new file mode 100644 index 000000000..c4911eaae --- /dev/null +++ b/tests/test_col_missing_coded.py @@ -0,0 +1,96 @@ +import polars as pl +import pytest + +import pointblank as pb + + +@pytest.fixture +def age_missing(): + return pb.MissingSpec(reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}) + + +class TestColMissingCoded: + def test_passes_when_all_coded(self, age_missing): + # All absence expressed as sentinels; no raw nulls + tbl = pl.DataFrame({"age": [34, -98, 41, -99, 29, -97, 55, 38]}) + validation = ( + pb.Validate(data=tbl) + .col_missing_coded(columns="age", missing=age_missing) + .interrogate() + ) + info = validation.validation_info[0] + assert info.n == 8 + assert info.n_failed == 0 + assert info.all_passed is True + + def test_fails_on_raw_null(self, age_missing): + tbl = pl.DataFrame({"age": [34, -98, 41, None, 29, -99, 55, None]}) + validation = ( + pb.Validate(data=tbl) + .col_missing_coded(columns="age", missing=age_missing) + .interrogate() + ) + info = validation.validation_info[0] + assert info.n == 8 + assert info.n_failed == 2 # two raw nulls + assert info.all_passed is False + + def test_sentinels_pass(self, age_missing): + # only sentinels and reals, no nulls -> all pass + tbl = pl.DataFrame({"age": [-99, -98, -97, -99]}) + validation = ( + pb.Validate(data=tbl) + .col_missing_coded(columns="age", missing=age_missing) + .interrogate() + ) + assert validation.validation_info[0].all_passed is True + + def test_missing_must_be_missingspec(self): + tbl = pl.DataFrame({"age": [1, 2, 3]}) + with pytest.raises(TypeError): + pb.Validate(data=tbl).col_missing_coded(columns="age", missing={-99: "x"}) + + def test_multiple_columns(self, age_missing): + tbl = pl.DataFrame({"a": [1, None, 3], "b": [-99, 2, 3]}) + validation = ( + pb.Validate(data=tbl) + .col_missing_coded(columns=["a", "b"], missing=age_missing) + .interrogate() + ) + assert len(validation.validation_info) == 2 + assert 
validation.validation_info[0].n_failed == 1 # column a has a null + assert validation.validation_info[1].n_failed == 0 # column b has none + + def test_report_renders_with_brief(self, age_missing): + tbl = pl.DataFrame({"age": [34, None, 41]}) + validation = ( + pb.Validate(data=tbl) + .col_missing_coded(columns="age", missing=age_missing, brief=True) + .interrogate() + ) + gt = validation.get_tabular_report() + assert gt is not None + + +class TestAutobriefTranslations: + """Exercise the autobrief text builders across languages (no KeyError).""" + + @pytest.mark.parametrize("lang", ["en", "fr", "de", "ja", "ar", "zh-Hans", "fa", "he"]) + def test_col_missing_coded_brief_langs(self, age_missing, lang): + tbl = pl.DataFrame({"age": [34, None, 41]}) + validation = ( + pb.Validate(data=tbl, lang=lang) + .col_missing_coded(columns="age", missing=age_missing, brief=True) + .interrogate() + ) + assert validation.validation_info[0].autobrief + + @pytest.mark.parametrize("lang", ["en", "fr", "de", "ja", "ar", "zh-Hans", "fa", "he"]) + def test_col_pct_missing_brief_langs(self, age_missing, lang): + tbl = pl.DataFrame({"age": [34, -98, 41, -99]}) + validation = ( + pb.Validate(data=tbl, lang=lang) + .col_pct_missing(columns="age", missing=age_missing, max_pct=0.5, brief=True) + .interrogate() + ) + assert validation.validation_info[0].autobrief diff --git a/tests/test_col_missing_consistent.py b/tests/test_col_missing_consistent.py new file mode 100644 index 000000000..1bcedce26 --- /dev/null +++ b/tests/test_col_missing_consistent.py @@ -0,0 +1,108 @@ +import polars as pl +import pandas as pd +import pytest + +import pointblank as pb + + +@pytest.fixture +def spec(): + return pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) + + +def _info(v): + return v.validation_info[0] + + +class TestColMissingConsistent: + def test_basic_inconsistency(self, spec): + tbl = pl.DataFrame( + {"income_source": [1, -99, 2, -99], "income_amount": [50000, -99, 42000, 38000]} + ) + v 
= ( + pb.Validate(data=tbl) + .col_missing_consistent( + columns=["income_source", "income_amount"], missing=spec, when_reason="not_asked" + ) + .interrogate() + ) + info = _info(v) + assert info.n == 4 + assert info.n_failed == 1 # last row: only one column is -99 + + def test_all_consistent_passes(self, spec): + tbl = pl.DataFrame({"a": [1, -99, 2, -99], "b": [5, -99, 6, -99]}) + v = ( + pb.Validate(data=tbl) + .col_missing_consistent(columns=["a", "b"], missing=spec, when_reason="not_asked") + .interrogate() + ) + assert _info(v).n_failed == 0 + + def test_null_reason_consistency(self): + # when_reason == null_reason, null_is_missing True -> nulls count + spec = pb.MissingSpec(reasons={-98: "refused"}, null_reason="unknown") + tbl = pl.DataFrame({"a": [1, None, None], "b": [5, None, 6]}) + v = ( + pb.Validate(data=tbl) + .col_missing_consistent(columns=["a", "b"], missing=spec, when_reason="unknown") + .interrogate() + ) + # row2 both null -> ok; row3 only a null -> fail + assert _info(v).n_failed == 1 + + def test_three_columns(self, spec): + tbl = pl.DataFrame({"a": [-99, 1, -99], "b": [-99, 2, -99], "c": [-99, 3, 7]}) + v = ( + pb.Validate(data=tbl) + .col_missing_consistent(columns=["a", "b", "c"], missing=spec, when_reason="not_asked") + .interrogate() + ) + # row1 all -99 ok; row2 none ok; row3 a,b -99 but c=7 -> fail + assert _info(v).n_failed == 1 + + def test_requires_two_columns(self, spec): + tbl = pl.DataFrame({"a": [1, 2]}) + with pytest.raises(ValueError, match="at least two columns"): + pb.Validate(data=tbl).col_missing_consistent( + columns=["a"], missing=spec, when_reason="not_asked" + ) + + def test_missing_must_be_spec(self): + tbl = pl.DataFrame({"a": [1], "b": [2]}) + with pytest.raises(TypeError): + pb.Validate(data=tbl).col_missing_consistent( + columns=["a", "b"], missing={-99: "x"}, when_reason="not_asked" + ) + + def test_pandas_backend(self, spec): + tbl = pd.DataFrame({"a": [1, -99, -99], "b": [5, -99, 6]}) + v = ( + 
pb.Validate(data=tbl) + .col_missing_consistent(columns=["a", "b"], missing=spec, when_reason="not_asked") + .interrogate() + ) + assert _info(v).n_failed == 1 + + def test_report_and_step_report(self, spec): + tbl = pl.DataFrame({"a": [1, -99, -99], "b": [5, -99, 6]}) + v = ( + pb.Validate(data=tbl) + .col_missing_consistent(columns=["a", "b"], missing=spec, when_reason="not_asked") + .interrogate() + ) + assert v.get_tabular_report() is not None + # step report (row-based extract path) should build without error + assert v.get_step_report(i=1) is not None + + @pytest.mark.parametrize("lang", ["en", "fr", "de", "ja", "ar", "zh-Hans"]) + def test_brief_langs(self, spec, lang): + tbl = pl.DataFrame({"a": [1, -99, -99], "b": [5, -99, 6]}) + v = ( + pb.Validate(data=tbl, lang=lang) + .col_missing_consistent( + columns=["a", "b"], missing=spec, when_reason="not_asked", brief=True + ) + .interrogate() + ) + assert _info(v).autobrief diff --git a/tests/test_col_missing_only_coded.py b/tests/test_col_missing_only_coded.py new file mode 100644 index 000000000..55bf83b80 --- /dev/null +++ b/tests/test_col_missing_only_coded.py @@ -0,0 +1,111 @@ +import polars as pl +import pandas as pd +import pytest + +import pointblank as pb + + +@pytest.fixture +def spec(): + return pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) + + +def _info(v): + return v.validation_info[0] + + +class TestColMissingOnlyCoded: + def test_flags_undocumented_code(self, spec): + # -95 is undocumented; reals in [0,120]; -99/-98 documented + tbl = pl.DataFrame({"age": [34, -98, 41, -95, 29, -99, 55]}) + v = ( + pb.Validate(data=tbl) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120) + .interrogate() + ) + info = _info(v) + assert info.n == 7 + assert info.n_failed == 1 # only -95 + + def test_all_documented_or_real_passes(self, spec): + tbl = pl.DataFrame({"age": [34, -98, 41, -99, 29]}) + v = ( + pb.Validate(data=tbl) + .col_missing_only_coded(columns="age", 
missing=spec, min_val=0, max_val=120) + .interrogate() + ) + assert _info(v).n_failed == 0 + + def test_allowed_set(self, spec): + tbl = pl.DataFrame({"grade": [1, 2, -99, 3, -95, -98]}) + v = ( + pb.Validate(data=tbl) + .col_missing_only_coded(columns="grade", missing=spec, allowed=[1, 2, 3]) + .interrogate() + ) + # -95 is undocumented -> 1 failure + assert _info(v).n_failed == 1 + + def test_null_documented_when_null_is_missing(self): + spec = pb.MissingSpec(reasons={-99: "not_asked"}, null_is_missing=True) + tbl = pl.DataFrame({"age": [34, None, -99, 200]}) + v = ( + pb.Validate(data=tbl) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120) + .interrogate() + ) + # null passes (documented as unknown), -99 passes, 200 out of range -> fail + assert _info(v).n_failed == 1 + + def test_null_fails_when_not_missing(self): + spec = pb.MissingSpec(reasons={-99: "not_asked"}, null_is_missing=False) + tbl = pl.DataFrame({"age": [34, None, -99, 41]}) + v = ( + pb.Validate(data=tbl) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120) + .interrogate() + ) + # null is neither documented nor a real value -> fail + assert _info(v).n_failed == 1 + + def test_requires_a_real_value_constraint(self, spec): + tbl = pl.DataFrame({"age": [1, 2, 3]}) + with pytest.raises(ValueError, match="at least one of"): + pb.Validate(data=tbl).col_missing_only_coded(columns="age", missing=spec) + + def test_missing_must_be_spec(self): + tbl = pl.DataFrame({"age": [1, 2, 3]}) + with pytest.raises(TypeError): + pb.Validate(data=tbl).col_missing_only_coded( + columns="age", missing={-99: "x"}, min_val=0, max_val=10 + ) + + def test_pandas_backend(self, spec): + tbl = pd.DataFrame({"age": [34, -98, -95, 200]}) + v = ( + pb.Validate(data=tbl) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120) + .interrogate() + ) + # -95 undocumented, 200 out of range -> 2 failures + assert _info(v).n_failed == 2 + + def 
test_report_and_step_report(self, spec): + tbl = pl.DataFrame({"age": [34, -98, -95, 41]}) + v = ( + pb.Validate(data=tbl) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120, brief=True) + .interrogate() + ) + assert v.get_tabular_report() is not None + assert v.get_step_report(i=1) is not None + + @pytest.mark.parametrize("lang", ["en", "fr", "de", "ja", "ar", "zh-Hans"]) + def test_brief_langs(self, spec, lang): + tbl = pl.DataFrame({"age": [34, -95]}) + v = ( + pb.Validate(data=tbl, lang=lang) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120, brief=True) + .interrogate() + ) + assert _info(v).autobrief diff --git a/tests/test_col_pct_missing.py b/tests/test_col_pct_missing.py new file mode 100644 index 000000000..4dbce285f --- /dev/null +++ b/tests/test_col_pct_missing.py @@ -0,0 +1,142 @@ +import polars as pl +import pytest + +import pointblank as pb + + +@pytest.fixture +def survey_tbl(): + # 8 rows: ages with sentinel codes + # -99 = not_asked, -98 = refused, -97 = dont_know + # values: 34, -98, 41, -99, 29, -98, 55, 38 + # -> 2 refused, 1 not_asked, 0 dont_know, 5 real -> 3/8 = 0.375 missing + return pl.DataFrame({"age": [34, -98, 41, -99, 29, -98, 55, 38]}) + + +@pytest.fixture +def age_missing(): + return pb.MissingSpec( + reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}, + categories={"item_nonresponse": ["refused", "dont_know"], "design": ["not_asked"]}, + ) + + +def _single_step_passed(validation): + info = validation.validation_info[0] + return info.all_passed + + +class TestColPctMissing: + def test_overall_pass(self, survey_tbl, age_missing): + validation = ( + pb.Validate(data=survey_tbl) + .col_pct_missing(columns="age", missing=age_missing, max_pct=0.5) + .interrogate() + ) + assert _single_step_passed(validation) is True + + def test_overall_fail(self, survey_tbl, age_missing): + validation = ( + pb.Validate(data=survey_tbl) + .col_pct_missing(columns="age", missing=age_missing, 
max_pct=0.30) + .interrogate() + ) + # 3/8 = 0.375 > 0.30 -> fail + assert _single_step_passed(validation) is False + + def test_by_reason_refused(self, survey_tbl, age_missing): + # 2/8 = 0.25 refused + passing = ( + pb.Validate(data=survey_tbl) + .col_pct_missing(columns="age", missing=age_missing, reason="refused", max_pct=0.25) + .interrogate() + ) + failing = ( + pb.Validate(data=survey_tbl) + .col_pct_missing(columns="age", missing=age_missing, reason="refused", max_pct=0.20) + .interrogate() + ) + assert _single_step_passed(passing) is True + assert _single_step_passed(failing) is False + + def test_by_reason_zero(self, survey_tbl, age_missing): + # no dont_know values -> 0% always passes + validation = ( + pb.Validate(data=survey_tbl) + .col_pct_missing(columns="age", missing=age_missing, reason="dont_know", max_pct=0.0) + .interrogate() + ) + assert _single_step_passed(validation) is True + + def test_by_category(self, survey_tbl, age_missing): + # item_nonresponse = refused + dont_know = 2/8 = 0.25 + passing = ( + pb.Validate(data=survey_tbl) + .col_pct_missing( + columns="age", missing=age_missing, category="item_nonresponse", max_pct=0.25 + ) + .interrogate() + ) + assert _single_step_passed(passing) is True + + def test_nulls_counted(self, age_missing): + tbl = pl.DataFrame({"age": [34, None, 41, -98, 29, 38, 55, 38]}) + # null_is_missing=True by default: 1 null + 1 refused = 2/8 = 0.25 + validation = ( + pb.Validate(data=tbl) + .col_pct_missing(columns="age", missing=age_missing, max_pct=0.25) + .interrogate() + ) + assert _single_step_passed(validation) is True + + def test_nulls_excluded_when_spec_says_so(self): + spec = pb.MissingSpec(reasons={-98: "refused"}, null_is_missing=False) + tbl = pl.DataFrame({"age": [34, None, None, -98, 29, 38, 55, 38]}) + # only -98 counts: 1/8 = 0.125 + validation = ( + pb.Validate(data=tbl) + .col_pct_missing(columns="age", missing=spec, max_pct=0.125) + .interrogate() + ) + assert _single_step_passed(validation) is 
True + + def test_reason_and_category_mutually_exclusive(self, survey_tbl, age_missing): + with pytest.raises(ValueError, match="Only one of"): + pb.Validate(data=survey_tbl).col_pct_missing( + columns="age", + missing=age_missing, + reason="refused", + category="item_nonresponse", + max_pct=0.5, + ) + + def test_max_pct_bounds(self, survey_tbl, age_missing): + with pytest.raises(ValueError, match="max_pct"): + pb.Validate(data=survey_tbl).col_pct_missing( + columns="age", missing=age_missing, max_pct=1.5 + ) + + def test_missing_must_be_missingspec(self, survey_tbl): + with pytest.raises(TypeError): + pb.Validate(data=survey_tbl).col_pct_missing( + columns="age", missing={-99: "not_asked"}, max_pct=0.5 + ) + + def test_multiple_columns(self, age_missing): + tbl = pl.DataFrame({"a": [1, -98, 3, 4], "b": [-99, -99, 3, 4]}) + validation = ( + pb.Validate(data=tbl) + .col_pct_missing(columns=["a", "b"], missing=age_missing, max_pct=0.5) + .interrogate() + ) + assert len(validation.validation_info) == 2 + + def test_report_renders(self, survey_tbl, age_missing): + # The validation report should build without error (exercises icon + value rendering) + validation = ( + pb.Validate(data=survey_tbl) + .col_pct_missing(columns="age", missing=age_missing, max_pct=0.5, brief=True) + .interrogate() + ) + gt = validation.get_tabular_report() + assert gt is not None diff --git a/tests/test_col_vals_missing_param.py b/tests/test_col_vals_missing_param.py new file mode 100644 index 000000000..ea86b4dde --- /dev/null +++ b/tests/test_col_vals_missing_param.py @@ -0,0 +1,94 @@ +import polars as pl +import pandas as pd +import pytest + +import pointblank as pb + + +@pytest.fixture +def spec(): + return pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) + + +@pytest.fixture +def spec_no_null(): + return pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}, null_is_missing=False) + + +def _info(v): + return v.validation_info[0] + + +class TestMissingExclusion: + def 
test_between_excludes_sentinels_and_nulls(self, spec): + tbl = pl.DataFrame({"age": [34, -98, 41, -99, 29, 200, 55, None]}) + v = ( + pb.Validate(data=tbl) + .col_vals_between(columns="age", left=0, right=120, missing=spec) + .interrogate() + ) + info = _info(v) + assert info.n == 8 + # only 200 is a real out-of-range value + assert info.n_failed == 1 + + def test_gt_excludes(self, spec): + tbl = pl.DataFrame({"age": [34, -98, 41, -99, 29, 55]}) + v = pb.Validate(data=tbl).col_vals_gt(columns="age", value=0, missing=spec).interrogate() + assert _info(v).n_failed == 0 + + def test_null_not_excluded_when_spec_says_so(self, spec_no_null): + # null_is_missing=False -> nulls are NOT excluded; with na_pass default False, null fails gt + tbl = pl.DataFrame({"age": [34, -98, None, 41]}) + v = ( + pb.Validate(data=tbl) + .col_vals_gt(columns="age", value=0, missing=spec_no_null) + .interrogate() + ) + # -98 excluded (passes); null fails (na_pass False); reals pass -> 1 failure + assert _info(v).n_failed == 1 + + def test_in_set_excludes_sentinels(self, spec): + tbl = pl.DataFrame({"grade": [1, 2, -99, 3, -98, 9]}) + v = ( + pb.Validate(data=tbl) + .col_vals_in_set(columns="grade", set=[1, 2, 3], missing=spec) + .interrogate() + ) + # 9 is the only real value not in the set + assert _info(v).n_failed == 1 + + def test_regex_excludes_string_sentinels(self): + spec = pb.MissingSpec(reasons={"N/A": "not_applicable", "REF": "refused"}) + tbl = pl.DataFrame({"code": ["AB12", "N/A", "CD34", "REF", "bad code"]}) + v = ( + pb.Validate(data=tbl) + .col_vals_regex(columns="code", pattern=r"^[A-Z]{2}[0-9]{2}$", missing=spec) + .interrogate() + ) + # "bad code" is the only real non-matching value + assert _info(v).n_failed == 1 + + def test_no_missing_param_unchanged(self): + tbl = pl.DataFrame({"age": [34, -98, 41]}) + v = pb.Validate(data=tbl).col_vals_gt(columns="age", value=0).interrogate() + # -98 is a real value < 0 -> fails when missing= not used + assert _info(v).n_failed == 1 + 
+ def test_pandas_backend(self, spec): + tbl = pd.DataFrame({"age": [34, -98, 41, -99, 200]}) + v = ( + pb.Validate(data=tbl) + .col_vals_between(columns="age", left=0, right=120, missing=spec) + .interrogate() + ) + assert _info(v).n_failed == 1 + + def test_report_renders(self, spec): + tbl = pl.DataFrame({"age": [34, -98, 41, 200]}) + v = ( + pb.Validate(data=tbl) + .col_vals_between(columns="age", left=0, right=120, missing=spec) + .interrogate() + ) + assert v.get_tabular_report() is not None diff --git a/tests/test_missing.py b/tests/test_missing.py new file mode 100644 index 000000000..53b62099a --- /dev/null +++ b/tests/test_missing.py @@ -0,0 +1,124 @@ +import pytest + +import pointblank as pb +from pointblank.missing import MissingSpec + + +class TestMissingSpecConstruction: + """Tests for MissingSpec construction and validation.""" + + def test_minimal_spec(self): + spec = MissingSpec(reasons={-99: "not_asked"}) + assert spec.reasons == {-99: "not_asked"} + assert spec.categories is None + assert spec.null_is_missing is True + assert spec.null_reason == "unknown" + assert spec.description is None + + def test_full_spec(self): + spec = MissingSpec( + reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}, + categories={"item_nonresponse": ["refused", "dont_know"], "design": ["not_asked"]}, + null_is_missing=False, + null_reason="system", + description="Standard survey codes", + ) + assert spec.null_is_missing is False + assert spec.null_reason == "system" + assert spec.description == "Standard survey codes" + + def test_exported_from_top_level(self): + assert pb.MissingSpec is MissingSpec + + def test_reasons_must_be_dict(self): + with pytest.raises(TypeError): + MissingSpec(reasons=[-99, -98]) # type: ignore[arg-type] + + def test_empty_reasons_requires_null_is_missing(self): + # OK: empty reasons but null_is_missing=True + MissingSpec(reasons={}, null_is_missing=True) + # Not OK: empty reasons and null_is_missing=False + with 
pytest.raises(ValueError): + MissingSpec(reasons={}, null_is_missing=False) + + def test_reason_labels_must_be_strings(self): + with pytest.raises(TypeError): + MissingSpec(reasons={-99: 1}) # type: ignore[dict-item] + + def test_category_must_reference_known_reasons(self): + with pytest.raises(ValueError, match="unknown reason"): + MissingSpec( + reasons={-99: "not_asked"}, + categories={"bad": ["nonexistent"]}, + ) + + def test_category_can_reference_null_reason(self): + spec = MissingSpec( + reasons={-99: "not_asked"}, + categories={"all_absent": ["not_asked", "unknown"]}, + null_is_missing=True, + ) + assert spec.values_for_category("all_absent") == [-99] + + def test_categories_must_be_dict(self): + with pytest.raises(TypeError): + MissingSpec(reasons={-99: "not_asked"}, categories=["not_asked"]) # type: ignore[arg-type] + + +class TestMissingSpecMethods: + @pytest.fixture + def spec(self): + return MissingSpec( + reasons={-99: "not_asked", -98: "refused", -97: "dont_know", -96: "not_applicable"}, + categories={ + "item_nonresponse": ["refused", "dont_know"], + "design": ["not_asked", "not_applicable"], + }, + ) + + def test_sentinel_values(self, spec): + assert spec.sentinel_values() == [-99, -98, -97, -96] + + def test_reason_for(self, spec): + assert spec.reason_for(-98) == "refused" + assert spec.reason_for(5) is None + + def test_reason_for_null(self, spec): + assert spec.reason_for(None) == "unknown" + spec_no_null = MissingSpec(reasons={-99: "not_asked"}, null_is_missing=False) + assert spec_no_null.reason_for(None) is None + + def test_is_missing(self, spec): + assert spec.is_missing(-99) is True + assert spec.is_missing(42) is False + assert spec.is_missing(None) is True + + def test_is_missing_null_excluded(self): + spec = MissingSpec(reasons={-99: "not_asked"}, null_is_missing=False) + assert spec.is_missing(None) is False + + def test_values_for_reason(self, spec): + assert spec.values_for_reason("refused") == [-98] + assert 
spec.values_for_reason("nonexistent") == [] + + def test_values_for_category(self, spec): + assert spec.values_for_category("item_nonresponse") == [-98, -97] + assert spec.values_for_category("design") == [-99, -96] + assert spec.values_for_category("nonexistent") == [] + + def test_values_for_category_no_categories(self): + spec = MissingSpec(reasons={-99: "not_asked"}) + assert spec.values_for_category("anything") == [] + + def test_reasons_list(self, spec): + assert spec.reasons_list() == [ + "not_asked", + "refused", + "dont_know", + "not_applicable", + "unknown", + ] + + def test_reasons_list_no_null(self): + spec = MissingSpec(reasons={-99: "a", -98: "b"}, null_is_missing=False) + assert spec.reasons_list() == ["a", "b"] diff --git a/tests/test_missing_factories.py b/tests/test_missing_factories.py new file mode 100644 index 000000000..7cdeb0610 --- /dev/null +++ b/tests/test_missing_factories.py @@ -0,0 +1,145 @@ +import pytest + +import pointblank as pb +from pointblank.missing import MissingSpec, _slugify +from pointblank.metadata import VariableMetadata, MetadataImport + + +class TestSlugify: + @pytest.mark.parametrize( + "label,expected", + [ + ("Refused", "refused"), + ("Not Applicable", "not_applicable"), + ("DON'T KNOW", "don_t_know"), + (" spaced ", "spaced"), + (-99, "99"), + ("", "missing"), + ], + ) + def test_slugify(self, label, expected): + assert _slugify(label) == expected + + +class TestFromCdisc: + def test_standard_codes(self): + spec = MissingSpec.from_cdisc_null_flavors() + assert spec.reason_for("NASK") == "not_asked" + assert spec.reason_for("UNK") == "unknown" + assert spec.reason_for("PINF") == "positive_infinity" + assert spec.reason_for("NA") == "not_applicable" + + def test_categories(self): + spec = MissingSpec.from_cdisc_null_flavors() + assert set(spec.values_for_category("boundary")) == {"PINF", "NINF"} + assert "NASK" in spec.values_for_category("not_applicable") + + def test_alias(self): + assert 
MissingSpec.from_cdisc().reason_for("MSK") == "masked" + + def test_null_handling(self): + spec = MissingSpec.from_cdisc_null_flavors() + assert spec.null_is_missing is True + assert spec.reason_for(None) == "no_information" + + def test_exported_via_top_level(self): + assert pb.MissingSpec.from_cdisc_null_flavors().reason_for("NI") == "no_information" + + +class TestFromSas: + def test_defaults(self): + spec = MissingSpec.from_sas() + assert spec.reason_for(".") == "system_missing" + assert spec.reason_for(".A") == "user_missing_a" + assert spec.reason_for(".Z") == "user_missing_z" + assert spec.reason_for("._") == "system_missing" + + def test_overrides(self): + spec = MissingSpec.from_sas(reasons={".A": "not_applicable", ".B": "below_detection"}) + assert spec.reason_for(".A") == "not_applicable" + assert spec.reason_for(".B") == "below_detection" + assert spec.reason_for(".C") == "user_missing_c" # default preserved + + def test_no_underscore(self): + spec = MissingSpec.from_sas(include_underscore=False) + assert spec.reason_for("._") is None + # 26 letters + "." 
= 27 sentinels + assert len(spec.sentinel_values()) == 27 + + +class TestFromSpss: + def test_with_labels(self): + spec = MissingSpec.from_spss( + missing_values=[-99, -98], labels={-99: "Not asked", -98: "Refused"} + ) + assert spec.reason_for(-99) == "not_asked" + assert spec.reason_for(-98) == "refused" + + def test_without_labels(self): + spec = MissingSpec.from_spss(missing_values=[-99, -1]) + assert spec.reason_for(-99) == "missing_99" + assert spec.reason_for(-1) == "missing_1" + + +class TestFromVariableMetadata: + def test_uses_missing_value_labels(self): + var = VariableMetadata( + name="age", + dtype="Int64", + missing_values=[-99, -98], + missing_value_labels={-99: "Not asked", -98: "Refused"}, + ) + spec = MissingSpec.from_variable_metadata(var) + assert spec.reason_for(-98) == "refused" + + def test_falls_back_to_value_labels(self): + var = VariableMetadata( + name="age", + dtype="Int64", + missing_values=[-99], + value_labels={-99: "Not Asked", 1: "Yes"}, + ) + spec = MissingSpec.from_variable_metadata(var) + assert spec.reason_for(-99) == "not_asked" + + def test_no_missing_returns_none(self): + var = VariableMetadata(name="id", dtype="Int64") + assert MissingSpec.from_variable_metadata(var) is None + + def test_to_missing_spec_method(self): + var = VariableMetadata(name="age", dtype="Int64", missing_values=[-99]) + assert var.to_missing_spec().is_missing(-99) is True + + +class TestMetadataImportMissingSpecs: + def test_missing_specs_mapping(self): + v1 = VariableMetadata( + name="age", + dtype="Int64", + missing_values=[-99, -98], + missing_value_labels={-99: "Not asked", -98: "Refused"}, + ) + v2 = VariableMetadata(name="id", dtype="Int64") # no missing values + meta = MetadataImport(source_format="spss", variables=[v1, v2]) + + specs = meta.missing_specs() + assert list(specs.keys()) == ["age"] # id omitted (no missing values) + assert specs["age"].reason_for(-99) == "not_asked" + + def test_specs_usable_in_validation(self): + import polars as 
pl + + v = VariableMetadata( + name="age", dtype="Int64", missing_values=[-99], missing_value_labels={-99: "Not asked"} + ) + meta = MetadataImport(source_format="spss", variables=[v]) + specs = meta.missing_specs() + + tbl = pl.DataFrame({"age": [34, -99, 200]}) + validation = ( + pb.Validate(data=tbl) + .col_vals_between(columns="age", left=0, right=120, missing=specs["age"]) + .interrogate() + ) + # -99 excluded; only 200 fails + assert validation.validation_info[0].n_failed == 1 diff --git a/tests/test_missing_report_integration.py b/tests/test_missing_report_integration.py new file mode 100644 index 000000000..6751ec13e --- /dev/null +++ b/tests/test_missing_report_integration.py @@ -0,0 +1,106 @@ +import polars as pl + +import pointblank as pb + + +def test_tabular_report_annotates_missing_aware_steps(): + tbl = pl.DataFrame({"age": [34, -98, 41, 200]}) + spec = pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) + v = ( + pb.Validate(data=tbl) + .col_vals_between(columns="age", left=0, right=120, missing=spec) + .interrogate() + ) + html = v.get_tabular_report().as_raw_html() + # The VALUES cell carries a compact badge; the reason/code detail goes to the step note + assert "MISSING-AWARE" in html + assert "Missing codes" in html + assert "refused" in html and "not_asked" in html + + +def test_tabular_report_no_annotation_without_missing(): + tbl = pl.DataFrame({"age": [34, -98, 41, 200]}) + v = pb.Validate(data=tbl).col_vals_between(columns="age", left=0, right=120).interrogate() + html = v.get_tabular_report().as_raw_html() + assert "MISSING-AWARE" not in html + assert "Missing codes" not in html + + +def test_dedicated_methods_show_minimal_cell_and_note(): + tbl = pl.DataFrame({"age": [34, -98, 41, -99]}) + spec = pb.MissingSpec( + reasons={-99: "not_asked", -98: "refused"}, + categories={"nonresponse": ["refused"]}, + ) + v = ( + pb.Validate(data=tbl) + .col_pct_missing(columns="age", missing=spec, reason="refused", max_pct=0.5) + 
.col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120) + .interrogate() + ) + html = v.get_tabular_report().as_raw_html() + # Compact VALUES cells: a threshold for col_pct_missing and an "ONLY CODED" badge + assert "ONLY CODED" in html + # Detail is surfaced via the auto Notes system + assert "Missing codes" in html + assert "Counting reason" in html and "refused" in html + assert "Legitimate values" in html and "[0, 120]" in html + # The old verbose VALUES strings should no longer be present + assert "reason = refused" not in html + assert "max_pct = " not in html + + +def test_step_report_shows_missing_codes_legend(): + spec = pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) + + # col_vals_* with missing= + tbl = pl.DataFrame({"age": [34, -98, 200, -99, 300]}) + v = ( + pb.Validate(data=tbl) + .col_vals_between(columns="age", left=0, right=120, missing=spec) + .interrogate() + ) + h = v.get_step_report(i=1).as_raw_html() + assert "Missing codes" in h and "not_asked" in h and "refused" in h + + # col_missing_coded (spec in values) + tbl2 = pl.DataFrame({"age": [34, None, 41]}) + v2 = pb.Validate(data=tbl2).col_missing_coded(columns="age", missing=spec).interrogate() + assert "Missing codes" in v2.get_step_report(i=1).as_raw_html() + + # col_missing_only_coded (spec stashed in values dict) + tbl3 = pl.DataFrame({"age": [34, -98, -95, 41]}) + v3 = ( + pb.Validate(data=tbl3) + .col_missing_only_coded(columns="age", missing=spec, min_val=0, max_val=120) + .interrogate() + ) + assert "Missing codes" in v3.get_step_report(i=1).as_raw_html() + + # col_missing_consistent + tbl4 = pl.DataFrame({"a": [1, -99, -99], "b": [5, -99, 6]}) + v4 = ( + pb.Validate(data=tbl4) + .col_missing_consistent(columns=["a", "b"], missing=spec, when_reason="not_asked") + .interrogate() + ) + assert "Missing codes" in v4.get_step_report(i=1).as_raw_html() + + +def test_step_report_no_legend_without_missing(): + tbl = pl.DataFrame({"age": [34, 200, 41]}) + v = 
pb.Validate(data=tbl).col_vals_between(columns="age", left=0, right=120).interrogate() + assert "Missing codes" not in v.get_step_report(i=1).as_raw_html() + + +def test_report_renders_with_mixed_steps(): + tbl = pl.DataFrame({"a": [1, -99, 3], "b": [-99, -99, 3]}) + spec = pb.MissingSpec(reasons={-99: "not_asked"}) + v = ( + pb.Validate(data=tbl) + .col_vals_gt(columns="a", value=0, missing=spec) + .col_missing_consistent(columns=["a", "b"], missing=spec, when_reason="not_asked") + .col_missing_coded(columns="a", missing=spec) + .interrogate() + ) + assert v.get_tabular_report() is not None diff --git a/tests/test_missing_vals_tbl_structured.py b/tests/test_missing_vals_tbl_structured.py new file mode 100644 index 000000000..ae46963b0 --- /dev/null +++ b/tests/test_missing_vals_tbl_structured.py @@ -0,0 +1,192 @@ +import polars as pl +import pandas as pd +import pytest +from great_tables import GT + +import pointblank as pb + + +@pytest.fixture +def tbl_pl(): + return pl.DataFrame( + { + "age": [34, -98, 41, -99, 29, -98, 55, None], + "income": [50000, -99, -1, None, 42000, -99, 38000, 61000], + } + ) + + +@pytest.fixture +def specs(): + return { + "age": pb.MissingSpec(reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}), + "income": pb.MissingSpec(reasons={-99: "not_asked", -1: "below_threshold"}), + } + + +class TestStructuredMissingTbl: + def test_returns_gt(self, tbl_pl, specs): + result = pb.missing_vals_tbl(tbl_pl, missing=specs) + assert isinstance(result, GT) + + def test_reason_columns_present(self, tbl_pl, specs): + html = pb.missing_vals_tbl(tbl_pl, missing=specs).as_raw_html() + # Coded reason labels keep their raw input form (snake_case), grouped under a spanner + for token in [ + "not_asked", + "refused", + "dont_know", + "below_threshold", + "Complete", + "Null", # fixed column for raw nulls (not a reason) + "Missing Reasons", # spanner over the coded reason columns only + ]: + assert token in html + # Labels are not prettified to Title 
Case + assert "Not Asked" not in html and "Below Threshold" not in html + # The redundant "Total N" column was removed (row count is in the header) + assert "Total N" not in html + + def test_null_is_fixed_column_not_a_reason(self, tbl_pl, specs): + # Raw nulls appear in a fixed "Null" column, not as an "unknown" reason under the spanner + html = pb.missing_vals_tbl(tbl_pl, missing=specs).as_raw_html() + assert "Null" in html + assert "unknown" not in html # the null_reason label is not shown + # "Null" is a fixed column to the right of the coded reasons + gt = pb.missing_vals_tbl(tbl_pl, missing=specs) + cols = list(gt._tbl_data.columns) + assert cols[-1] == "null" + assert cols.index("null") > cols.index("below_threshold") + + def test_no_null_column_when_null_not_missing(self): + # null_is_missing=False -> no "Null" column and no "unknown" text + tbl = pl.DataFrame({"age": [34, -98, 41, None]}) + spec = {"age": pb.MissingSpec(reasons={-98: "refused"}, null_is_missing=False)} + gt = pb.missing_vals_tbl(tbl, missing=spec) + assert "null" not in list(gt._tbl_data.columns) + html = gt.as_raw_html() + assert "unknown" not in html + + def test_null_column_em_dash_when_not_applicable(self): + # When one spec counts nulls and another doesn't, the Null column shows an em dash for the + # column whose spec sets null_is_missing=False + tbl = pl.DataFrame({"a": [1, -99, None], "b": [1, -99, None]}) + specs = { + "a": pb.MissingSpec(reasons={-99: "not_asked"}), # null_is_missing=True + "b": pb.MissingSpec(reasons={-99: "not_asked"}, null_is_missing=False), + } + gt = pb.missing_vals_tbl(tbl, missing=specs) + null_vals = list(gt._tbl_data["null"]) + # column "a" counts its 1 null; column "b" is not applicable (em dash) + assert null_vals[0] == "1 (33%)" + assert null_vals[1] == "—" + + def test_counts_correct(self, tbl_pl): + # age: total 8 -> refused 2 (25%), not_asked 1 (12%), dont_know 0 (0%), + # unknown/null 1 (12%), complete 4 (50%) + spec = pb.MissingSpec(reasons={-99: 
"not_asked", -98: "refused", -97: "dont_know"}) + html = pb.missing_vals_tbl(tbl_pl, missing={"age": spec}).as_raw_html() + assert "4 (50%)" in html # complete + assert "2 (25%)" in html # refused + assert "1 (12%)" in html # not_asked / unknown + assert "0 (0%)" in html # dont_know + + def test_null_excluded_when_spec_says_so(self): + # null_is_missing=False -> the null is counted as complete, no Unknown column + tbl = pl.DataFrame({"age": [34, -98, 41, None]}) + spec = pb.MissingSpec(reasons={-98: "refused"}, null_is_missing=False) + html = pb.missing_vals_tbl(tbl, missing={"age": spec}).as_raw_html() + assert "unknown" not in html + # complete = 3 (null + 2 reals) of 4 = 75% + assert "3 (75%)" in html + + def test_pandas_input(self, specs): + tbl = pd.DataFrame( + { + "age": [34, -98, 41, -99, 29, -98, 55, None], + "income": [50000, -99, -1, None, 42000, -99, 38000, 61000], + } + ) + result = pb.missing_vals_tbl(tbl, missing=specs) + assert isinstance(result, GT) + + def test_default_behavior_unchanged(self, tbl_pl): + # No missing= -> the original sector heatmap path + result = pb.missing_vals_tbl(tbl_pl) + assert isinstance(result, GT) + + def test_missing_must_be_dict_of_specs(self, tbl_pl): + with pytest.raises(TypeError): + pb.missing_vals_tbl(tbl_pl, missing={"age": {-99: "x"}}) + + def test_unknown_column_raises(self, tbl_pl): + spec = pb.MissingSpec(reasons={-99: "not_asked"}) + with pytest.raises(ValueError, match="not found"): + pb.missing_vals_tbl(tbl_pl, missing={"nonexistent": spec}) + + +class TestMissingHeatmap: + def test_heatmap_returns_gt(self, tbl_pl, specs): + result = pb.missing_vals_tbl(tbl_pl, missing=specs, as_heatmap=True) + assert isinstance(result, GT) + + def test_heatmap_title_and_labels(self, tbl_pl, specs): + html = pb.missing_vals_tbl(tbl_pl, missing=specs, as_heatmap=True).as_raw_html() + assert "Missing Pattern Heatmap" in html + assert "refused" in html and "below_threshold" in html + assert "Missing Reasons" in html # spanner 
over reason columns + assert "%" in html # proportions formatted as percentages + + def test_heatmap_pandas(self, specs): + tbl = pd.DataFrame( + { + "age": [34, -98, 41, -99, 29, -98, 55, None], + "income": [50000, -99, -1, None, 42000, -99, 38000, 61000], + } + ) + assert isinstance(pb.missing_vals_tbl(tbl, missing=specs, as_heatmap=True), GT) + + def test_as_heatmap_ignored_without_missing(self, tbl_pl): + # as_heatmap only applies with missing=; default sector view still returned + assert isinstance(pb.missing_vals_tbl(tbl_pl, as_heatmap=True), GT) + + +class TestStyledLikeOriginal: + """The structured/heatmap outputs should reuse the original report's title style and the + monospaced left Column column.""" + + def test_table_mode_styling(self, tbl_pl, specs): + html = pb.missing_vals_tbl(tbl_pl, missing=specs).as_raw_html() + # Monospaced font present (left Column column + value columns) + assert "IBM Plex Mono" in html + # Header carries the table type + dimensions subtitle (as the default report does) + assert "rows" in html.lower() or "columns" in html.lower() + # Plain title (no shrunk font-size wrapper as before) + assert "
Missing Values by Reason" not in html + + def test_heatmap_mode_styling(self, tbl_pl, specs): + html = pb.missing_vals_tbl(tbl_pl, missing=specs, as_heatmap=True).as_raw_html() + assert "IBM Plex Mono" in html + assert "
Missing Pattern Heatmap" not in html + + +class TestNonApplicableReasons: + """Reasons not defined in a column's spec should render as an em dash, not '0 (0%)'.""" + + def test_table_mode_em_dash(self, tbl_pl, specs): + html = pb.missing_vals_tbl(tbl_pl, missing=specs).as_raw_html() + # age has no "below_threshold"; income has no "refused"/"dont_know" -> 3 em dashes + assert html.count("—") == 3 + # age DOES define "dont_know" but observes none -> should still show "0 (0%)" + assert "0 (0%)" in html + + def test_heatmap_mode_em_dash(self, tbl_pl, specs): + html = pb.missing_vals_tbl(tbl_pl, missing=specs, as_heatmap=True).as_raw_html() + assert html.count("—") == 3 + + def test_single_spec_no_em_dash(self): + # With one spec, every reason in the union applies -> no em dashes + tbl = pl.DataFrame({"age": [34, -98, 41, -99]}) + spec = {"age": pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"})} + html = pb.missing_vals_tbl(tbl, missing=spec).as_raw_html() + assert "—" not in html diff --git a/tests/test_validate.py b/tests/test_validate.py index 64e8718d0..6ff03cdfa 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -833,6 +833,7 @@ def test_validation_plan_and_interrogation(request, tbl_fixture) -> None: "values", "inclusive", "na_pass", + "missing", "pre", "segments", "thresholds", @@ -915,6 +916,7 @@ def test_validation_plan_and_interrogation(request, tbl_fixture) -> None: "values", "inclusive", "na_pass", + "missing", "pre", "segments", "thresholds", diff --git a/tests/test_yaml_missing_specs.py b/tests/test_yaml_missing_specs.py new file mode 100644 index 000000000..9937bd939 --- /dev/null +++ b/tests/test_yaml_missing_specs.py @@ -0,0 +1,127 @@ +import polars as pl +import pytest + +import pointblank as pb +from pointblank.yaml import YAMLValidationError, yaml_interrogate, yaml_to_python + + +def _write_csv(tmp_path, df): + p = tmp_path / "survey.csv" + df.write_csv(p) + return str(p) + + +@pytest.fixture +def survey_csv(tmp_path): + df 
= pl.DataFrame({"age": [34, -98, 41, -99, 29, -98, 55, 38]}) + return _write_csv(tmp_path, df) + + +def test_named_missing_spec_pct(survey_csv): + yaml_str = f""" +tbl: {survey_csv} +missing_specs: + standard_survey: + reasons: + -99: not_asked + -98: refused + -97: dont_know + categories: + nonresponse: [refused, dont_know] +steps: + - col_pct_missing: + columns: age + missing: standard_survey + max_pct: 0.5 + - col_pct_missing: + columns: age + missing: standard_survey + reason: refused + max_pct: 0.30 +""" + result = yaml_interrogate(yaml_str) + assert len(result.validation_info) == 2 + # overall 3/8=0.375 <= 0.5 pass; refused 2/8=0.25 <= 0.30 pass + assert result.validation_info[0].all_passed is True + assert result.validation_info[1].all_passed is True + + +def test_named_missing_spec_coded(tmp_path): + df = pl.DataFrame({"age": [34, -98, 41, None, 29, -99, 55, 38]}) + csv = _write_csv(tmp_path, df) + yaml_str = f""" +tbl: {csv} +missing_specs: + survey: + reasons: + -99: not_asked + -98: refused +steps: + - col_missing_coded: + columns: age + missing: survey +""" + result = yaml_interrogate(yaml_str) + info = result.validation_info[0] + assert info.n_failed == 1 # one raw null + + +def test_inline_missing_spec(survey_csv): + yaml_str = f""" +tbl: {survey_csv} +steps: + - col_pct_missing: + columns: age + missing: + reasons: + -99: not_asked + -98: refused + max_pct: 0.5 +""" + result = yaml_interrogate(yaml_str) + assert result.validation_info[0].all_passed is True + + +def test_unknown_spec_reference_raises(survey_csv): + yaml_str = f""" +tbl: {survey_csv} +steps: + - col_pct_missing: + columns: age + missing: nonexistent + max_pct: 0.5 +""" + with pytest.raises(YAMLValidationError, match="Unknown missing spec"): + yaml_interrogate(yaml_str) + + +def test_missing_specs_must_be_dict(survey_csv): + yaml_str = f""" +tbl: {survey_csv} +missing_specs: + - not_a_mapping +steps: + - rows_distinct +""" + with pytest.raises(YAMLValidationError): + 
yaml_interrogate(yaml_str) + + +def test_yaml_to_python_renders_missing_spec(survey_csv): + yaml_str = f""" +tbl: {survey_csv} +missing_specs: + survey: + reasons: + -99: not_asked + -98: refused +steps: + - col_pct_missing: + columns: age + missing: survey + max_pct: 0.5 +""" + code = yaml_to_python(yaml_str) + assert "pb.MissingSpec(" in code + assert "col_pct_missing" in code + assert "reasons=" in code diff --git a/user_guide/01-validation-plan/02-validation-methods.qmd b/user_guide/01-validation-plan/02-validation-methods.qmd index 2cc85e623..c420961c1 100644 --- a/user_guide/01-validation-plan/02-validation-methods.qmd +++ b/user_guide/01-validation-plan/02-validation-methods.qmd @@ -293,6 +293,78 @@ In summary, `na_pass=` works like this: - `na_pass=True`: missing values pass validation regardless of the condition being tested - `na_pass=False` (the default): missing values fail validation +### Structured Missingness with `missing=` + +`na_pass=` treats missingness as binary, but real-world data often encodes *why* a value is absent +using sentinel codes (e.g., `-99` for "not asked", `-98` for "refused"). The +[`MissingSpec`](`pointblank.MissingSpec`) class captures these codes and their reasons, and most +validation methods accept a `missing=` argument that uses it. + +When you pass `missing=` to a `col_vals_*()` method, declared sentinel values (and, by default, +`Null` values) are *excluded* from the check, so only the "real" values are validated: + +```{python} +import polars as pl + +tbl = pl.DataFrame({"age": [34, -98, 41, -99, 29, 200, 55, None]}) + +age_missing = pb.MissingSpec(reasons={-99: "not_asked", -98: "refused"}) + +validation = ( + pb.Validate(data=tbl) + .col_vals_between(columns="age", left=0, right=120, missing=age_missing) + .interrogate() +) + +validation +``` + +Only the real value `200` is out of range; the sentinel codes and the `Null` are excluded and pass. 
+In the report, such steps are marked with a compact `MISSING-AWARE` badge, and a one-line summary of +the codes appears in the step's notes. + +Pointblank also provides dedicated missingness validation methods that use a `MissingSpec`: + +- [`Validate.col_pct_missing()`](`Validate.col_pct_missing`): assert the percentage of missing + values stays within a limit, optionally filtered by a specific `reason=` or `category=`. +- [`Validate.col_missing_coded()`](`Validate.col_missing_coded`): assert every absence is expressed + as a documented code (no uncoded raw `Null` values). +- [`Validate.col_missing_only_coded()`](`Validate.col_missing_only_coded`): assert a column contains + only documented codes and legitimate values (catching undocumented codes like a stray `-95`), + paired with an `allowed=` set or a `min_val`/`max_val` range. +- [`Validate.col_missing_consistent()`](`Validate.col_missing_consistent`): assert related columns + share a consistent missingness pattern for a given reason (e.g., a survey skip pattern). 
+ +```{python} +income_missing = pb.MissingSpec(reasons={-99: "not_asked", -1: "below_threshold"}) + +survey = pl.DataFrame( + { + "income_source": [1, -99, 2, -99], + "income_amount": [50000, -99, 42000, 38000], + } +) + +validation = ( + pb.Validate(data=survey) + # No more than 30% of income values may be "not_asked" + .col_pct_missing(columns="income_amount", missing=income_missing, reason="not_asked", max_pct=0.30) + # If income wasn't asked, both related columns should be coded together + .col_missing_consistent( + columns=["income_source", "income_amount"], missing=income_missing, when_reason="not_asked" + ) + .interrogate() +) + +validation +``` + +`MissingSpec` also offers pre-built factories for common standards (e.g., +`pb.MissingSpec.from_cdisc_null_flavors()`, `pb.MissingSpec.from_sas()`, +`pb.MissingSpec.from_spss()`), and importing metadata from SPSS/Stata/SAS files can auto-generate +specs via [`MetadataImport.missing_specs()`](`pointblank.MetadataImport`). For a fuller treatment of +structured-missingness *reporting*, see the *Missing Values Reporting* article. + ## 2. 
Row-based Validations Row-based validations focus on examining properties that span across entire rows rather than diff --git a/user_guide/03-yaml/02-yaml-reference.qmd b/user_guide/03-yaml/02-yaml-reference.qmd index cbe3341b2..a13408beb 100644 --- a/user_guide/03-yaml/02-yaml-reference.qmd +++ b/user_guide/03-yaml/02-yaml-reference.qmd @@ -40,6 +40,11 @@ actions: # OPTIONAL: Global failure actions final_actions: # OPTIONAL: Actions triggered after all steps complete warning: "Post-validation warning" error: "Post-validation error" +missing_specs: # OPTIONAL: Named structured-missingness specs + standard_survey: + reasons: + -99: not_asked + -98: refused steps: # REQUIRED: List of validation steps - validation_method_name - validation_method_name: @@ -191,6 +196,62 @@ Template variables available for action strings: - `{level}`: severity level ('warning'/'error'/'critical') - `{time}`: timestamp of validation +### Structured Missingness (`missing_specs`) + +The optional top-level `missing_specs` key defines named [`MissingSpec`](`pointblank.MissingSpec`) +objects that steps can reference. Each named spec maps sentinel values to reason labels, and may +declare `categories`, `null_is_missing`, and `null_reason`: + +```yaml +missing_specs: + standard_survey: + reasons: + -99: not_asked + -98: refused + -97: dont_know + categories: + nonresponse: [refused, dont_know] + null_is_missing: true # OPTIONAL (default true) + null_reason: unknown # OPTIONAL (default "unknown") +``` + +Steps reference a named spec by name through the `missing:` parameter. 
This works both on the +`col_vals_*` methods (to exclude sentinel values from a check) and on the dedicated missingness +methods (`col_pct_missing`, `col_missing_coded`, `col_missing_only_coded`, `col_missing_consistent`): + +```yaml +missing_specs: + standard_survey: + reasons: + -99: not_asked + -98: refused + +steps: + - col_vals_between: + columns: age + left: 0 + right: 120 + missing: standard_survey # excludes -99/-98 (and nulls) from the range check + - col_pct_missing: + columns: age + missing: standard_survey + reason: refused + max_pct: 0.30 +``` + +A step can also define a spec inline (an anonymous mapping) instead of referencing a named one: + +```yaml +steps: + - col_pct_missing: + columns: age + max_pct: 0.5 + missing: + reasons: + -99: not_asked + -98: refused +``` + ## Validation Methods Reference ### Column Value Validations diff --git a/user_guide/05-data-inspection/03-missing-vals-tbl.qmd b/user_guide/05-data-inspection/03-missing-vals-tbl.qmd index 951392099..f47c264ff 100644 --- a/user_guide/05-data-inspection/03-missing-vals-tbl.qmd +++ b/user_guide/05-data-inspection/03-missing-vals-tbl.qmd @@ -81,3 +81,90 @@ pb.missing_vals_tbl(game_revenue) We see nothing but light blue in this report! The header also indicates that there are no missing values by displaying a large green check mark (the other report tables provided a count of total missing values across all columns). + +## Structured Missingness by Reason + +So far we've treated missingness as binary: a value is either `Null` or it isn't. But real-world +data often encodes *why* a value is absent. Survey data distinguishes *refused* from *not asked* +from *don't know*; clinical and statistical-package data use sentinel codes like `-99`, `".A"`, or +`"NOT DONE"`. Pointblank captures this with the [`MissingSpec`](`pointblank.MissingSpec`) class, +which maps sentinel values to human-readable *reasons*. 
+ +When you pass a `missing=` mapping of column names to `MissingSpec` objects, `missing_vals_tbl()` +switches from the sector heatmap to a *structured breakdown*: one row per column with the count and +percentage of complete values and of each missing reason. + +::: {.callout-note} +## Supplying `missing=` produces a different report + +The structured breakdown is a *distinct visualization*, not an annotated version of the default +sector heatmap. Adding `missing=` changes the table's whole layout. The report title changes too +(from "Missing Values" to "Missing Values by Reason", or "Missing Pattern Heatmap" with +`as_heatmap=True`), and the shared title styling and monospaced column list keep the two views +recognizably part of the same family. +::: + +```{python} +import polars as pl + +survey = pl.DataFrame( + { + "age": [34, -98, 41, -99, 29, -98, 55, None], + "income": [50000, -99, -1, None, 42000, -99, 38000, 61000], + } +) + +specs = { + "age": pb.MissingSpec(reasons={-99: "not_asked", -98: "refused", -97: "dont_know"}), + "income": pb.MissingSpec(reasons={-99: "not_asked", -1: "below_threshold"}), +} + +pb.missing_vals_tbl(survey, missing=specs) +``` + +Each `MissingSpec` declares the sentinel values for a column and the reason each one represents. +Those declared (coded) reasons are grouped under the **Missing Reasons** spanner. By default, actual +`Null` values are also counted as missing; because those are raw `Null`/`None`/`NA` values and *not* +part of the spec, they're tallied in a fixed **Null** column at the far right (styled like +**Complete**), rather than as a reason. Set `null_is_missing=False` on the spec if raw nulls should +be treated as real values instead — then there's no **Null** column at all. + +The reason columns are the *union* of reasons across all the specs you provide. When a reason isn't +defined for a particular column, that cell shows an em dash (`—`) rather than `0`. 
This signals +"not applicable to this column", as distinct from a reason that *is* defined but simply wasn't +observed (which shows `0 (0%)`). + +### Viewing the pattern as a heatmap + +For a more visual read of *where* missingness concentrates, pass `as_heatmap=True`. The reason +columns are then shaded from light to dark by the proportion missing: + +```{python} +pb.missing_vals_tbl(survey, missing=specs, as_heatmap=True) +``` + +### Pre-built specs for common standards + +You don't always have to define reasons by hand. `MissingSpec` provides factory methods for common +encodings, including CDISC/HL7 null flavors and SAS special missing values: + +```{python} +cdisc = pb.MissingSpec.from_cdisc_null_flavors() +print("NASK ->", cdisc.reason_for("NASK")) # not_asked +print("UNK ->", cdisc.reason_for("UNK")) # unknown +``` + +When metadata is imported from SPSS, Stata, or SAS files (see the *Metadata Import* section), +[`MetadataImport.missing_specs()`](`pointblank.MetadataImport`) auto-generates a `{column: +MissingSpec}` mapping from the variables' declared missing values, ready to pass straight to +`missing_vals_tbl()`. + +::: {.callout-note} +The same `MissingSpec` objects power missingness-aware *validation*, not just reporting. You can +pass `missing=` to the `col_vals_*()` methods (to exclude sentinel values from a check) and use the +dedicated [`col_pct_missing()`](`pointblank.Validate.col_pct_missing`), +[`col_missing_coded()`](`pointblank.Validate.col_missing_coded`), +[`col_missing_only_coded()`](`pointblank.Validate.col_missing_only_coded`), and +[`col_missing_consistent()`](`pointblank.Validate.col_missing_consistent`) validation steps. See the +*Validation Methods* article for details. 
+::: diff --git a/user_guide/11-metadata-import/02-statistical-packages.qmd b/user_guide/11-metadata-import/02-statistical-packages.qmd index 25d85e1eb..9b40338b2 100644 --- a/user_guide/11-metadata-import/02-statistical-packages.qmd +++ b/user_guide/11-metadata-import/02-statistical-packages.qmd @@ -130,6 +130,36 @@ them appropriately. When validation is generated, these codes are documented in rather than generating explicit exclusion rules, since the correct handling depends on your analysis context. +#### Turning missing codes into `MissingSpec` objects + +To put these codes to work in validation and reporting, convert them into +[`MissingSpec`](`pointblank.MissingSpec`) objects. The +[`MetadataImport.missing_specs()`](`pointblank.MetadataImport`) method does this for every variable +that declares missing values, returning a `{column: MissingSpec}` mapping (the reason labels are +derived from the variables' value labels): + +```python +meta = pb.import_metadata("survey.sav") + +# Auto-generate a {column: MissingSpec} mapping from the declared missing values +specs = meta.missing_specs() + +# Use the specs in a structured missingness report... +pb.missing_vals_tbl(data, missing=specs) + +# ...or in missingness-aware validation +validation = ( + pb.Validate(data=data) + .col_vals_between(columns="age", left=0, right=120, missing=specs["age"]) + .interrogate() +) +``` + +You can also build a spec for a single variable with +[`VariableMetadata.to_missing_spec()`](`pointblank.VariableMetadata`), or construct one directly +from SPSS-style values via `pb.MissingSpec.from_spss(missing_values=[...], labels={...})`. See the +*Missing Values Reporting* and *Validation Methods* articles for what you can do with these specs. + ### Type Detection from Formats SPSS stores numeric variables with format strings that indicate how they should be displayed. 
These diff --git a/user_guide/11-metadata-import/03-cdisc-validation.qmd b/user_guide/11-metadata-import/03-cdisc-validation.qmd index eb1a32e21..e8d8d4308 100644 --- a/user_guide/11-metadata-import/03-cdisc-validation.qmd +++ b/user_guide/11-metadata-import/03-cdisc-validation.qmd @@ -531,6 +531,27 @@ This layered approach gives you the flexibility to apply different levels of val on your needs. The Define-XML checks enforce what was specifically documented for your study, while the SDTM template checks enforce the broader standard requirements that apply universally. +## Null Flavors and Structured Missingness + +Clinical data uses standardized HL7/CDISC *null flavors* to record *why* a value is absent (e.g., +`"NASK"` = not asked, `"UNK"` = unknown, `"NA"` = not applicable). Pointblank ships a pre-built +[`MissingSpec`](`pointblank.MissingSpec`) for these codes via +`MissingSpec.from_cdisc_null_flavors()`: + +```{python} +cdisc = pb.MissingSpec.from_cdisc_null_flavors() + +print("NASK ->", cdisc.reason_for("NASK")) # not_asked +print("UNK ->", cdisc.reason_for("UNK")) # unknown +print("boundary codes:", cdisc.values_for_category("boundary")) +``` + +This spec can be passed to `missing_vals_tbl()` for a reason-by-reason breakdown, or to the +`col_vals_*()` and dedicated missingness validation methods (`col_pct_missing()`, +`col_missing_coded()`, `col_missing_only_coded()`, `col_missing_consistent()`) to validate data +while accounting for the null flavor codes. See the *Missing Values Reporting* and *Validation +Methods* articles for the full set of capabilities. + ## Conclusion CDISC data validation with Pointblank covers the full spectrum of clinical trial data management: