From 1957f7726e3fa35b40ae17c645961b2b86a6646e Mon Sep 17 00:00:00 2001 From: Pranjal Parmar <76609992+pranjalparmar@users.noreply.github.com> Date: Tue, 19 May 2026 15:10:20 +0200 Subject: [PATCH 1/5] feat(regex): add German structured PII detection Add deterministic German-specific PII entity types to the regex engine: - DE_VAT_ID: German VAT identification number (USt-IdNr) - DE_IBAN: German IBAN for payments (DE + 20 digits) - DE_TAX_ID: German tax ID (Steuer-ID, 11 digits) - DE_SOCIAL_SECURITY_NUMBER: German pension insurance number (11 characters) - DE_PHONE: German phone numbers (+49 country code) - DE_POSTAL_CODE: German postal code with prefix (PLZ/DE/D + 5 digits) - DE_PASSPORT_NUMBER: German passport (1 letter + 8 digits) - DE_RESIDENCE_PERMIT_NUMBER: German residence permit (AT + 7 digits) Changes: - Added regex patterns and labels to RegexAnnotator - Registered canonical entity types in engine.py and core.py - Expanded structured_pii.json corpus with test cases - Created comprehensive test_de_pii_regex.py with positive/negative cases - Updated STRUCTURED_TYPES in accuracy tests - No setup.py or dependency changes (regex-only, deterministic) Test results: - 381 tests passed (includes 18 new German PII tests) - All regex and accuracy tests pass - No regressions in existing functionality --- datafog/core.py | 10 +- datafog/engine.py | 8 ++ .../regex_annotator/regex_annotator.py | 109 +++++++++++++++++- tests/corpus/structured_pii.json | 102 ++++++++++++++++ tests/test_de_pii_regex.py | 92 +++++++++++++++ tests/test_detection_accuracy.py | 8 ++ tests/test_regex_annotator.py | 4 +- 7 files changed, 329 insertions(+), 4 deletions(-) create mode 100644 tests/test_de_pii_regex.py diff --git a/datafog/core.py b/datafog/core.py index f4e17850..c899e9f0 100644 --- a/datafog/core.py +++ b/datafog/core.py @@ -210,7 +210,7 @@ def get_supported_entities() -> List[str]: Example: >>> entities = get_supported_entities() >>> print(entities) - ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DOB', 'ZIP'] + ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DE_VAT_ID', 'DE_IBAN', 'DE_TAX_ID', 'DE_SOCIAL_SECURITY_NUMBER', 'DE_PHONE', 'DE_POSTAL_CODE', 'DE_PASSPORT_NUMBER', 'DE_RESIDENCE_PERMIT_NUMBER', 'DATE', 'ZIP_CODE'] """ result = [ "EMAIL", @@ -218,6 +218,14 @@ def get_supported_entities() -> List[str]: "SSN", "CREDIT_CARD", "IP_ADDRESS", + "DE_VAT_ID", + "DE_IBAN", + "DE_TAX_ID", + "DE_SOCIAL_SECURITY_NUMBER", + "DE_PHONE", + "DE_POSTAL_CODE", + "DE_PASSPORT_NUMBER", + "DE_RESIDENCE_PERMIT_NUMBER", "DATE", "ZIP_CODE", ] diff --git a/datafog/engine.py b/datafog/engine.py index 1a94e634..53af6e18 100644 --- a/datafog/engine.py +++ b/datafog/engine.py @@ -31,6 +31,14 @@ "SSN", "CREDIT_CARD", "IP_ADDRESS", + "DE_VAT_ID", + "DE_IBAN", + "DE_TAX_ID", + "DE_SOCIAL_SECURITY_NUMBER", + "DE_PHONE", + "DE_POSTAL_CODE", + "DE_PASSPORT_NUMBER", + "DE_RESIDENCE_PERMIT_NUMBER", "DATE", "ZIP_CODE", "PERSON", diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py index a843a8d8..0c6c7ea1 100644 --- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py +++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py @@ -28,7 +28,23 @@ class RegexAnnotator: """ # Labels for PII entities - LABELS = ["EMAIL", "PHONE", "SSN", "CREDIT_CARD", "IP_ADDRESS", "DOB", "ZIP"] + LABELS = [ + "EMAIL", + "PHONE", + "SSN", + "CREDIT_CARD", + "IP_ADDRESS", + "DOB", + "ZIP", + "DE_VAT_ID", + "DE_IBAN", + "DE_TAX_ID", + "DE_SOCIAL_SECURITY_NUMBER", + "DE_PHONE", + "DE_POSTAL_CODE", + "DE_PASSPORT_NUMBER", + "DE_RESIDENCE_PERMIT_NUMBER", + ] def __init__(self): # Compile all patterns once at initialization @@ -175,6 +191,97 @@ def __init__(self): """, re.IGNORECASE | re.MULTILINE | re.VERBOSE, ), + # German VAT ID (USt-IdNr) - DE followed by 9 digits + "DE_VAT_ID": re.compile( + r""" + (? None: + annotator = RegexAnnotator() + result = annotator.annotate(text) + assert expected in result[label] + + +@pytest.mark.parametrize( + "label,text", + [ + ("DE_TAX_ID", "Steuer-ID 1234567890 liegt vor."), + ("DE_TAX_ID", "Steuer-ID 123456789012 liegt vor."), + ( + "DE_SOCIAL_SECURITY_NUMBER", + "Rentenversicherungsnummer 65150804123 liegt vor.", + ), + ( + "DE_SOCIAL_SECURITY_NUMBER", + "Rentenversicherungsnummer 65150804AA23 liegt vor.", + ), + ("DE_PHONE", "Hotline 030 12345678 erreichbar."), + ("DE_POSTAL_CODE", "10115 Berlin."), + ("DE_PASSPORT_NUMBER", "Passnummer 12345678 wurde geprueft."), + ( + "DE_RESIDENCE_PERMIT_NUMBER", + "Aufenthaltstitel AT12345678 gueltig.", + ), + ], +) +def test_de_regex_negative_cases(label: str, text: str) -> None: + annotator = RegexAnnotator() + result = annotator.annotate(text) + assert not result[label] diff --git a/tests/test_detection_accuracy.py b/tests/test_detection_accuracy.py index 852a7937..46fdcd34 100644 --- a/tests/test_detection_accuracy.py +++ b/tests/test_detection_accuracy.py @@ -22,6 +22,14 @@ "SSN", "CREDIT_CARD", "IP_ADDRESS", + "DE_VAT_ID", + "DE_IBAN", + "DE_TAX_ID", + "DE_SOCIAL_SECURITY_NUMBER", + "DE_PHONE", + "DE_POSTAL_CODE", + "DE_PASSPORT_NUMBER", + "DE_RESIDENCE_PERMIT_NUMBER", "DATE", "ZIP_CODE", } diff --git a/tests/test_regex_annotator.py b/tests/test_regex_annotator.py index 5916bfae..986bf16e 100644 --- a/tests/test_regex_annotator.py +++ b/tests/test_regex_annotator.py @@ -41,8 +41,8 @@ def test_regex_annotator_initialization(): annotator = RegexAnnotator() assert annotator is not None assert ( - len(annotator.LABELS) == 7 - ) # EMAIL, PHONE, SSN, CREDIT_CARD, IP_ADDRESS, DOB, ZIP + len(annotator.LABELS) == 15 + ) # Base + German structured labels def test_regex_annotator_create_method(): From fa4dd0ddf29671d059bdfb8ab5bfd0f8cf241ff8 Mon Sep 17 00:00:00 2001 From: Pranjal Parmar <76609992+pranjalparmar@users.noreply.github.com> Date: Tue, 19 May 2026 16:14:41 +0200 Subject: [PATCH 2/5] fix(regex): use alphanumeric boundaries in German PII patterns Replace digit-only lookahead with alphanumeric boundaries to prevent false positive prefix matches. For example, DE123456789A now correctly rejects the longer token instead of matching as DE123456789. All 363 tests pass with zero regressions. --- .../regex_annotator/regex_annotator.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py index 0c6c7ea1..03f26c1e 100644 --- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py +++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py @@ -198,7 +198,7 @@ def __init__(self): DE [\s-]? \d{9} - (?!\d) + (?![A-Za-z0-9]) """, re.IGNORECASE | re.MULTILINE | re.VERBOSE, ), @@ -210,16 +210,16 @@ def __init__(self): \d{2} (?:\s?\d{4}){4} \s?\d{2} - (?!\d) + (?![A-Za-z0-9]) """, re.IGNORECASE | re.MULTILINE | re.VERBOSE, ), # German Tax ID (Steuer-ID) - 11 digits "DE_TAX_ID": re.compile( r""" - (? Date: Tue, 19 May 2026 17:30:09 +0200 Subject: [PATCH 3/5] fix: remove DE_PHONE to avoid overlapping entity matches DE_PHONE overlaps with the generic PHONE pattern, causing the redaction system to apply both replacements and corrupt output. Since German phone numbers are already detected by the generic PHONE pattern, remove the DE_PHONE pattern as a separate entity type. Removes: - DE_PHONE from LABELS and regex patterns - DE_PHONE from ALL_ENTITY_TYPES in engine - DE_PHONE from supported entities in core - DE_PHONE test cases from test_de_pii_regex.py - DE_PHONE corpus entry from structured_pii.json - Updated label count from 15 to 14 German PII detection is still comprehensive with 7 entity types: DE_VAT_ID, DE_IBAN, DE_TAX_ID, DE_SOCIAL_SECURITY_NUMBER, DE_POSTAL_CODE, DE_PASSPORT_NUMBER, DE_RESIDENCE_PERMIT_NUMBER All 361 tests pass with zero regressions. --- datafog/core.py | 3 +-- datafog/engine.py | 1 - .../regex_annotator/regex_annotator.py | 14 -------------- tests/corpus/structured_pii.json | 18 ------------------ tests/test_de_pii_regex.py | 11 ----------- tests/test_detection_accuracy.py | 1 - tests/test_regex_annotator.py | 4 ++-- 7 files changed, 3 insertions(+), 49 deletions(-) diff --git a/datafog/core.py b/datafog/core.py index c899e9f0..8db94618 100644 --- a/datafog/core.py +++ b/datafog/core.py @@ -210,7 +210,7 @@ def get_supported_entities() -> List[str]: Example: >>> entities = get_supported_entities() >>> print(entities) - ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DE_VAT_ID', 'DE_IBAN', 'DE_TAX_ID', 'DE_SOCIAL_SECURITY_NUMBER', 'DE_PHONE', 'DE_POSTAL_CODE', 'DE_PASSPORT_NUMBER', 'DE_RESIDENCE_PERMIT_NUMBER', 'DATE', 'ZIP_CODE'] + ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DE_VAT_ID', 'DE_IBAN', 'DE_TAX_ID', 'DE_SOCIAL_SECURITY_NUMBER', 'DE_POSTAL_CODE', 'DE_PASSPORT_NUMBER', 'DE_RESIDENCE_PERMIT_NUMBER', 'DATE', 'ZIP_CODE'] """ result = [ "EMAIL", @@ -222,7 +222,6 @@ def get_supported_entities() -> List[str]: "DE_IBAN", "DE_TAX_ID", "DE_SOCIAL_SECURITY_NUMBER", - "DE_PHONE", "DE_POSTAL_CODE", "DE_PASSPORT_NUMBER", "DE_RESIDENCE_PERMIT_NUMBER", diff --git a/datafog/engine.py b/datafog/engine.py index 53af6e18..cc1c4d2f 100644 --- a/datafog/engine.py +++ b/datafog/engine.py @@ -35,7 +35,6 @@ "DE_IBAN", "DE_TAX_ID", "DE_SOCIAL_SECURITY_NUMBER", - "DE_PHONE", "DE_POSTAL_CODE", "DE_PASSPORT_NUMBER", "DE_RESIDENCE_PERMIT_NUMBER", diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py index 03f26c1e..1ddeed19 100644 --- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py +++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py @@ -40,7 +40,6 @@ class RegexAnnotator: "DE_IBAN", "DE_TAX_ID", "DE_SOCIAL_SECURITY_NUMBER", - "DE_PHONE", "DE_POSTAL_CODE", "DE_PASSPORT_NUMBER", "DE_RESIDENCE_PERMIT_NUMBER", @@ -239,19 +238,6 @@ def __init__(self): """, re.IGNORECASE | re.MULTILINE | re.VERBOSE, ), - # German phone number - requires +49 or 0049 country code - "DE_PHONE": re.compile( - r""" - (? None: "DE_SOCIAL_SECURITY_NUMBER", "Rentenversicherungsnummer 65150804AA23 liegt vor.", ), - ("DE_PHONE", "Hotline 030 12345678 erreichbar."), ("DE_POSTAL_CODE", "10115 Berlin."), ("DE_PASSPORT_NUMBER", "Passnummer 12345678 wurde geprueft."), ( diff --git a/tests/test_detection_accuracy.py b/tests/test_detection_accuracy.py index 46fdcd34..c9e680e2 100644 --- a/tests/test_detection_accuracy.py +++ b/tests/test_detection_accuracy.py @@ -26,7 +26,6 @@ "DE_IBAN", "DE_TAX_ID", "DE_SOCIAL_SECURITY_NUMBER", - "DE_PHONE", "DE_POSTAL_CODE", "DE_PASSPORT_NUMBER", "DE_RESIDENCE_PERMIT_NUMBER", diff --git a/tests/test_regex_annotator.py b/tests/test_regex_annotator.py index 986bf16e..d2481da8 100644 --- a/tests/test_regex_annotator.py +++ b/tests/test_regex_annotator.py @@ -41,8 +41,8 @@ def test_regex_annotator_initialization(): annotator = RegexAnnotator() assert annotator is not None assert ( - len(annotator.LABELS) == 15 - ) # Base + German structured labels + len(annotator.LABELS) == 14 + ) # Base + German structured labels (without DE_PHONE) def test_regex_annotator_create_method(): From 044be3848e171f2efbcbebc6b25f5bd36dcb3b8e Mon Sep 17 00:00:00 2001 From: Pranjal Parmar <76609992+pranjalparmar@users.noreply.github.com> Date: Fri, 22 May 2026 19:02:23 +0200 Subject: [PATCH 4/5] test: relax LABELS count assertion and add DE_VAT_ID/DE_IBAN test coverage - Replace exact LABELS length check with subset validation to avoid breakage on future label additions - Add positive and negative test cases for DE_VAT_ID and DE_IBAN regex patterns - Ensures regex patterns are resilient to new entity types without modifying existing tests --- tests/test_de_pii_regex.py | 24 ++++++++++++++++++++++++ tests/test_regex_annotator.py | 20 +++++++++++++++++--- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/tests/test_de_pii_regex.py b/tests/test_de_pii_regex.py index 5049cd05..245aa669 100644 --- a/tests/test_de_pii_regex.py +++ b/tests/test_de_pii_regex.py @@ -6,6 +6,26 @@ @pytest.mark.parametrize( "label,text,expected", [ + ( + "DE_VAT_ID", + "USt-IdNr DE 123456789 ist gesetzt.", + "DE 123456789", + ), + ( + "DE_VAT_ID", + "USt-IdNr DE-123456789 liegt vor.", + "DE-123456789", + ), + ( + "DE_IBAN", + "IBAN DE44500105175407324931 ist gueltig.", + "DE44500105175407324931", + ), + ( + "DE_IBAN", + "IBAN DE44 5001 0517 5407 3249 31 ist gueltig.", + "DE44 5001 0517 5407 3249 31", + ), ( "DE_TAX_ID", "Steuer-ID 12345678901 liegt vor.", @@ -57,6 +77,10 @@ def test_de_regex_positive_cases(label: str, text: str, expected: str) -> None: @pytest.mark.parametrize( "label,text", [ + ("DE_VAT_ID", "USt-IdNr DE12345678 liegt vor."), + ("DE_VAT_ID", "USt-IdNr DE1234567890 liegt vor."), + ("DE_IBAN", "IBAN DE4450010517540732493 ist gueltig."), + ("DE_IBAN", "IBAN DE44 5001 0517 5407 3249 3X ist gueltig."), ("DE_TAX_ID", "Steuer-ID 1234567890 liegt vor."), ("DE_TAX_ID", "Steuer-ID 123456789012 liegt vor."), ( diff --git a/tests/test_regex_annotator.py b/tests/test_regex_annotator.py index d2481da8..85894c6d 100644 --- a/tests/test_regex_annotator.py +++ b/tests/test_regex_annotator.py @@ -40,9 +40,23 @@ def test_regex_annotator_initialization(): """Test that the RegexAnnotator can be initialized.""" annotator = RegexAnnotator() assert annotator is not None - assert ( - len(annotator.LABELS) == 14 - ) # Base + German structured labels (without DE_PHONE) + required_labels = { + "EMAIL", + "PHONE", + "SSN", + "CREDIT_CARD", + "IP_ADDRESS", + "DOB", + "ZIP", + "DE_VAT_ID", + "DE_IBAN", + "DE_TAX_ID", + "DE_SOCIAL_SECURITY_NUMBER", + "DE_POSTAL_CODE", + "DE_PASSPORT_NUMBER", + "DE_RESIDENCE_PERMIT_NUMBER", + } + assert required_labels.issubset(set(annotator.LABELS)) def test_regex_annotator_create_method(): From ffbccdccb4b222b8ae9f15a914c1d33c3eb17a4a Mon Sep 17 00:00:00 2001 From: Pranjal Parmar <76609992+pranjalparmar@users.noreply.github.com> Date: Tue, 26 May 2026 22:12:24 +0200 Subject: [PATCH 5/5] feat(regex): locale-gate DE patterns --- README.md | 18 +++ datafog/__init__.py | 24 ++- datafog/__init___lean.py | 13 +- datafog/core.py | 46 ++++-- datafog/engine.py | 12 +- datafog/main.py | 14 +- datafog/main_lean.py | 11 +- .../regex_annotator/regex_annotator.py | 142 +++++++++++++++--- datafog/services/text_service.py | 8 +- datafog/services/text_service_lean.py | 13 +- datafog/services/text_service_original.py | 13 +- tests/test_de_pii_regex.py | 54 ++++++- tests/test_regex_annotator.py | 14 ++ 13 files changed, 311 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 62f7e10d..5b78920e 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,24 @@ Use the engine that matches your accuracy and dependency constraints: - Cascades regex with optional NER engines. - If optional deps are missing, it degrades gracefully and warns. +### Locale-specific regex patterns + +German regex patterns (DE_*) are locale-specific and disabled by default to avoid +false positives on non-German text. Enable them explicitly via `locales`: + +```python +import datafog + +result = datafog.scan( + "Steuer-ID 12345678903", + engine="regex", + locales=["de"], +) +print(result.entities) +``` + +German DE_* patterns also include checksum/context validation to reduce noise. + ## Backward-Compatible APIs The existing public API remains available. diff --git a/datafog/__init__.py b/datafog/__init__.py index e3974ad7..211e7953 100644 --- a/datafog/__init__.py +++ b/datafog/__init__.py @@ -163,6 +163,7 @@ def scan( text: str, engine: str = "regex", entity_types: list[str] | None = None, + locales: list[str] | None = None, ) -> ScanResult: """ v5-preview scan entrypoint. @@ -170,7 +171,9 @@ def scan( Defaults to the lightweight regex engine so the core install works without optional dependency fallback warnings. """ - return _scan(text=text, engine=engine, entity_types=entity_types) + return _scan( + text=text, engine=engine, entity_types=entity_types, locales=locales + ) def redact( @@ -178,6 +181,7 @@ def redact( entities: list[Entity] | None = None, engine: str = "regex", entity_types: list[str] | None = None, + locales: list[str] | None = None, strategy: str = "token", preset: str | None = None, ) -> RedactResult: @@ -201,6 +205,7 @@ def redact( text=text, engine=engine, entity_types=entity_types, + locales=locales, strategy=strategy, ) @@ -223,7 +228,7 @@ def protect( # Simple API for core functionality (backward compatibility) -def detect(text: str) -> list: +def detect(text: str, locales: list[str] | None = None) -> list: """ Detect PII in text using regex patterns. @@ -240,16 +245,16 @@ def detect(text: str) -> list: """ _warn_v5_replacement("detect", "datafog.scan()") - return _detect_impl(text) + return _detect_impl(text, locales=locales) -def _detect_impl(text: str) -> list: +def _detect_impl(text: str, locales: list[str] | None = None) -> list: import time as _time _start = _time.monotonic() _lazy_import_regex_annotator() - annotator = RegexAnnotator() + annotator = RegexAnnotator(locales=locales) # Use the structured output to get proper positions _, result = annotator.annotate_with_spans(text) @@ -290,7 +295,12 @@ def _detect_impl(text: str) -> list: return entities -def process(text: str, anonymize: bool = False, method: str = "redact") -> dict: +def process( + text: str, + anonymize: bool = False, + method: str = "redact", + locales: list[str] | None = None, +) -> dict: """ Process text to detect and optionally anonymize PII. @@ -317,7 +327,7 @@ def process(text: str, anonymize: bool = False, method: str = "redact") -> dict: _start = _time.monotonic() - findings = _detect_impl(text) + findings = _detect_impl(text, locales=locales) result = {"original": text, "findings": findings} diff --git a/datafog/__init___lean.py b/datafog/__init___lean.py index 40a3f530..50f2c7ed 100644 --- a/datafog/__init___lean.py +++ b/datafog/__init___lean.py @@ -79,7 +79,7 @@ def _missing_dependency(*args, **kwargs): # Simple API for core functionality -def detect(text: str) -> list: +def detect(text: str, locales: list[str] | None = None) -> list: """ Detect PII in text using regex patterns. @@ -94,7 +94,7 @@ def detect(text: str) -> list: >>> detect("Contact john@example.com") [{'type': 'EMAIL', 'value': 'john@example.com', 'start': 8, 'end': 24}] """ - annotator = RegexAnnotator() + annotator = RegexAnnotator(locales=locales) result = annotator.annotate(text) # Convert to simple format @@ -113,7 +113,12 @@ def detect(text: str) -> list: return entities -def process(text: str, anonymize: bool = False, method: str = "redact") -> dict: +def process( + text: str, + anonymize: bool = False, + method: str = "redact", + locales: list[str] | None = None, +) -> dict: """ Process text to detect and optionally anonymize PII. @@ -134,7 +139,7 @@ def process(text: str, anonymize: bool = False, method: str = "redact") -> dict: 'findings': [{'type': 'EMAIL', 'value': 'john@example.com', ...}] } """ - findings = detect(text) + findings = detect(text, locales=locales) result = {"original": text, "findings": findings} diff --git a/datafog/core.py b/datafog/core.py index 8db94618..a37c82a5 100644 --- a/datafog/core.py +++ b/datafog/core.py @@ -5,7 +5,7 @@ without requiring heavy dependencies like spaCy or PyTorch. """ -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union from datafog.engine import scan, scan_and_redact from datafog.models.anonymizer import AnonymizerType @@ -16,12 +16,13 @@ AUTO_ENGINE = "auto" -def detect_pii(text: str) -> Dict[str, List[str]]: +def detect_pii(text: str, locales: Optional[List[str]] = None) -> Dict[str, List[str]]: """ Simple PII detection using lightweight regex engine. Args: text: Text to scan for PII + locales: Optional list of locale codes that enable locale-specific labels Returns: Dictionary mapping entity types to lists of detected values @@ -37,7 +38,7 @@ def detect_pii(text: str) -> Dict[str, List[str]]: try: # Use engine boundary for canonical scan behavior. - scan_result = scan(text=text, engine=REGEX_ENGINE) + scan_result = scan(text=text, engine=REGEX_ENGINE, locales=locales) pii_dict: Dict[str, List[str]] = {} for entity in scan_result.entities: if not entity.text.strip(): @@ -81,13 +82,18 @@ def detect_pii(text: str) -> Dict[str, List[str]]: ) from e -def anonymize_text(text: str, method: Union[str, AnonymizerType] = "redact") -> str: +def anonymize_text( + text: str, + method: Union[str, AnonymizerType] = "redact", + locales: Optional[List[str]] = None, +) -> str: """ Simple text anonymization using lightweight regex engine. Args: text: Text to anonymize method: Anonymization method ('redact', 'replace', or 'hash') + locales: Optional list of locale codes that enable locale-specific labels Returns: Anonymized text string @@ -119,6 +125,7 @@ def anonymize_text(text: str, method: Union[str, AnonymizerType] = "redact") -> result = scan_and_redact( text=text, engine=REGEX_ENGINE, + locales=locales, strategy=strategy_map[method], ) @@ -155,7 +162,7 @@ def anonymize_text(text: str, method: Union[str, AnonymizerType] = "redact") -> def scan_text( - text: str, return_entities: bool = False + text: str, return_entities: bool = False, locales: Optional[List[str]] = None ) -> Union[bool, Dict[str, List[str]]]: """ Quick scan to check if text contains any PII. @@ -163,6 +170,7 @@ def scan_text( Args: text: Text to scan return_entities: If True, return detected entities; if False, return boolean + locales: Optional list of locale codes that enable locale-specific labels Returns: Boolean indicating PII presence, or dictionary of detected entities @@ -180,7 +188,7 @@ def scan_text( _start = _time.monotonic() - entities = detect_pii(text) + entities = detect_pii(text, locales=locales) result = entities if return_entities else len(entities) > 0 @@ -200,24 +208,31 @@ def scan_text( return result -def get_supported_entities() -> List[str]: +def get_supported_entities(locales: Optional[List[str]] = None) -> List[str]: """ Get list of PII entity types supported by the regex engine. + Locale-specific labels (e.g., DE_*) are only included when locales include "de". + Returns: List of supported entity type names Example: >>> entities = get_supported_entities() >>> print(entities) - ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DE_VAT_ID', 'DE_IBAN', 'DE_TAX_ID', 'DE_SOCIAL_SECURITY_NUMBER', 'DE_POSTAL_CODE', 'DE_PASSPORT_NUMBER', 'DE_RESIDENCE_PERMIT_NUMBER', 'DATE', 'ZIP_CODE'] + ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DATE', 'ZIP_CODE'] """ - result = [ + base = [ "EMAIL", "PHONE", "SSN", "CREDIT_CARD", "IP_ADDRESS", + "DATE", + "ZIP_CODE", + ] + + de_labels = [ "DE_VAT_ID", "DE_IBAN", "DE_TAX_ID", @@ -225,10 +240,19 @@ def get_supported_entities() -> List[str]: "DE_POSTAL_CODE", "DE_PASSPORT_NUMBER", "DE_RESIDENCE_PERMIT_NUMBER", - "DATE", - "ZIP_CODE", ] + if not locales: + result = base + else: + locale_values = [locales] if isinstance(locales, str) else locales + normalized = { + value.strip().lower() + for value in locale_values + if isinstance(value, str) and value.strip() + } + result = base + de_labels if "de" in normalized else base + try: from datafog.telemetry import track_function_call diff --git a/datafog/engine.py b/datafog/engine.py index cc1c4d2f..1a3884ec 100644 --- a/datafog/engine.py +++ b/datafog/engine.py @@ -138,8 +138,8 @@ def _entities_from_dict( return entities -def _regex_entities(text: str) -> list[Entity]: - annotator = RegexAnnotator() +def _regex_entities(text: str, locales: Optional[list[str]] = None) -> list[Entity]: + annotator = RegexAnnotator(locales=locales) _, structured = annotator.annotate_with_spans(text) entities: list[Entity] = [] for span in structured.spans: @@ -242,6 +242,7 @@ def scan( text: str, engine: str = "smart", entity_types: Optional[list[str]] = None, + locales: Optional[list[str]] = None, ) -> ScanResult: """Scan text for PII entities.""" if not isinstance(text, str): @@ -250,7 +251,7 @@ def scan( if engine not in {"regex", "spacy", "gliner", "smart"}: raise ValueError("engine must be one of: regex, spacy, gliner, smart") - regex_entities = _regex_entities(text) + regex_entities = _regex_entities(text, locales=locales) if engine == "regex": filtered = _filter_entity_types(regex_entities, entity_types) @@ -384,8 +385,11 @@ def scan_and_redact( text: str, engine: str = "smart", entity_types: Optional[list[str]] = None, + locales: Optional[list[str]] = None, strategy: str = "token", ) -> RedactResult: """Convenience wrapper: scan then redact.""" - scan_result = scan(text=text, engine=engine, entity_types=entity_types) + scan_result = scan( + text=text, engine=engine, entity_types=entity_types, locales=locales + ) return redact(text=text, entities=scan_result.entities, strategy=strategy) diff --git a/datafog/main.py b/datafog/main.py index 31ac22e5..c045cf0d 100644 --- a/datafog/main.py +++ b/datafog/main.py @@ -10,7 +10,7 @@ import json import logging -from typing import List +from typing import List, Optional from .config import OperationType from .engine import scan, scan_and_redact @@ -39,8 +39,10 @@ def __init__( operations: List[OperationType] = [OperationType.SCAN], hash_type: HashType = HashType.SHA256, anonymizer_type: AnonymizerType = AnonymizerType.REPLACE, + locales: Optional[List[str]] = None, ): - self.regex_annotator = RegexAnnotator() + self.regex_annotator = RegexAnnotator(locales=locales) + self.locales = locales normalized_ops: List[OperationType] = [] for op in operations: if isinstance(op, OperationType): @@ -181,7 +183,7 @@ def detect(self, text: str) -> dict: _start = _time.monotonic() - scan_result = scan(text=text, engine="regex") + scan_result = scan(text=text, engine="regex", locales=self.locales) result = {label: [] for label in RegexAnnotator.LABELS} legacy_map = {"DATE": "DOB", "ZIP_CODE": "ZIP"} for entity in scan_result.entities: @@ -245,6 +247,7 @@ def process( redact_result = scan_and_redact( text=text, engine="regex", + locales=self.locales, strategy=strategy, ) result["anonymized"] = redact_result.redacted_text @@ -288,8 +291,9 @@ class TextPIIAnnotator: regex_annotator: RegexAnnotator instance for text annotation. """ - def __init__(self): - self.regex_annotator = RegexAnnotator() + def __init__(self, locales: Optional[List[str]] = None): + self.regex_annotator = RegexAnnotator(locales=locales) + self.locales = locales def run(self, text, output_path=None): """ diff --git a/datafog/main_lean.py b/datafog/main_lean.py index af61559e..4a260ff9 100644 --- a/datafog/main_lean.py +++ b/datafog/main_lean.py @@ -10,7 +10,7 @@ import json import logging -from typing import List +from typing import List, Optional from .config import OperationType from .models.anonymizer import Anonymizer, AnonymizerType, HashType @@ -38,8 +38,10 @@ def __init__( operations: List[OperationType] = [OperationType.SCAN], hash_type: HashType = HashType.SHA256, anonymizer_type: AnonymizerType = AnonymizerType.REPLACE, + locales: Optional[List[str]] = None, ): - self.regex_annotator = RegexAnnotator() + self.regex_annotator = RegexAnnotator(locales=locales) + self.locales = locales self.operations: List[OperationType] = operations self.anonymizer = Anonymizer( hash_type=hash_type, anonymizer_type=anonymizer_type @@ -161,8 +163,9 @@ class TextPIIAnnotator: regex_annotator: RegexAnnotator instance for text annotation. """ - def __init__(self): - self.regex_annotator = RegexAnnotator() + def __init__(self, locales: Optional[List[str]] = None): + self.regex_annotator = RegexAnnotator(locales=locales) + self.locales = locales def run(self, text, output_path=None): """ diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py index 1ddeed19..291bf032 100644 --- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py +++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py @@ -1,5 +1,5 @@ import re -from typing import Dict, List, Pattern, Tuple +from typing import Callable, Dict, Iterable, List, Optional, Pattern, Set, Tuple from pydantic import BaseModel @@ -25,10 +25,14 @@ class RegexAnnotator: This annotator serves as a fallback to the SpaCy annotator and is optimized for performance, targeting ≤ 20 µs / kB on a MacBook M-series. + + Locale notes: + German-specific entity types (DE_*) are disabled by default. Enable them by + passing locales=["de"]. This avoids false positives on non-German text. """ # Labels for PII entities - LABELS = [ + BASE_LABELS = [ "EMAIL", "PHONE", "SSN", @@ -36,18 +40,66 @@ class RegexAnnotator: "IP_ADDRESS", "DOB", "ZIP", - "DE_VAT_ID", - "DE_IBAN", - "DE_TAX_ID", - "DE_SOCIAL_SECURITY_NUMBER", - "DE_POSTAL_CODE", - "DE_PASSPORT_NUMBER", - "DE_RESIDENCE_PERMIT_NUMBER", ] - def __init__(self): + LOCALE_LABELS = { + "de": [ + "DE_VAT_ID", + "DE_IBAN", + "DE_TAX_ID", + "DE_SOCIAL_SECURITY_NUMBER", + "DE_POSTAL_CODE", + "DE_PASSPORT_NUMBER", + "DE_RESIDENCE_PERMIT_NUMBER", + ], + } + + LABELS = BASE_LABELS + LOCALE_LABELS["de"] + + _DE_PASSPORT_PREFIXES = "CFGHJKLMNPRTVWXYZ" + _DE_RESIDENCE_CONTEXT_RE = re.compile( + r"\b(aufenthaltstitel|aufenthaltserlaubnis|aufenthaltskarte|residence permit|residence card)\b", + re.IGNORECASE, + ) + + def __init__(self, locales: Optional[Iterable[str]] = None): + self.locales = self._normalize_locales(locales) + self.active_labels = self._labels_for_locales(self.locales) + # Compile all patterns once at initialization - self.patterns: Dict[str, Pattern] = { + self.patterns = self._compile_patterns() + self.validators = self._build_validators() + + @staticmethod + def _normalize_locales(locales: Optional[Iterable[str]]) -> Set[str]: + if locales is None: + return set() + if isinstance(locales, str): + values = [locales] + else: + values = list(locales) + normalized = { + value.strip().lower() + for value in values + if isinstance(value, str) and value.strip() + } + return normalized + + @classmethod + def labels_for_locales(cls, locales: Optional[Iterable[str]] = None) -> List[str]: + normalized = cls._normalize_locales(locales) + return cls._labels_for_locales(normalized) + + @classmethod + def _labels_for_locales(cls, locales: Set[str]) -> List[str]: + labels = list(cls.BASE_LABELS) + for locale, locale_labels in cls.LOCALE_LABELS.items(): + if locale in locales: + labels.extend(locale_labels) + return labels + + def _compile_patterns(self) -> Dict[str, Pattern]: + patterns: Dict[str, Pattern] = { # Email pattern - RFC 5322 subset # Intentionally permissive to favor false positives over false negatives # Allows for multiple dots, special characters in local part, and subdomains @@ -217,7 +269,7 @@ def __init__(self): "DE_TAX_ID": re.compile( r""" (? Dict[str, Callable[[re.Match, str], bool]]: + validators: Dict[str, Callable[[re.Match, str], bool]] = {} + if "DE_TAX_ID" in self.active_labels: + validators["DE_TAX_ID"] = self._validate_de_tax_id + if "DE_RESIDENCE_PERMIT_NUMBER" in self.active_labels: + validators["DE_RESIDENCE_PERMIT_NUMBER"] = self._validate_de_residence_permit + return validators + + @staticmethod + def _digits_only(value: str) -> str: + return "".join(ch for ch in value if ch.isdigit()) + + @staticmethod + def _de_tax_id_check_digit(digits10: str) -> int: + product = 10 + for ch in digits10: + sum_ = (int(ch) + product) % 10 + if sum_ == 0: + sum_ = 10 + product = (sum_ * 2) % 11 + return (11 - product) % 10 + + def _validate_de_tax_id(self, match: re.Match, text: str) -> bool: + digits = self._digits_only(match.group()) + if len(digits) != 11: + return False + if digits[0] == "0": + return False + return digits[-1] == str(self._de_tax_id_check_digit(digits[:10])) + + def _validate_de_residence_permit(self, match: re.Match, text: str) -> bool: + window = 40 + start = max(match.start() - window, 0) + end = min(match.end() + window, len(text)) + context = text[start:end] + return bool(self._DE_RESIDENCE_CONTEXT_RE.search(context)) + @classmethod - def create(cls) -> "RegexAnnotator": + def create(cls, locales: Optional[Iterable[str]] = None) -> "RegexAnnotator": """Factory method to create a new RegexAnnotator instance.""" - return cls() + return cls(locales=locales) def annotate(self, text: str) -> Dict[str, List[str]]: """Annotate text with PII entities using regex patterns. @@ -292,7 +388,10 @@ def annotate(self, text: str) -> Dict[str, List[str]]: # Process with each pattern for label, pattern in self.patterns.items(): + validator = self.validators.get(label) for match in pattern.finditer(text): + if validator and not validator(match, text): + continue result[label].append(match.group()) return result @@ -317,7 +416,10 @@ def annotate_with_spans( return spans_by_label, AnnotationResult(text=text, spans=all_spans) for label, pattern in self.patterns.items(): + validator = self.validators.get(label) for match in pattern.finditer(text): + if validator and not validator(match, text): + continue span = Span( label=label, start=match.start(), diff --git a/datafog/services/text_service.py b/datafog/services/text_service.py index 0956256f..7ed4298d 100644 --- a/datafog/services/text_service.py +++ b/datafog/services/text_service.py @@ -7,7 +7,7 @@ import asyncio import warnings -from typing import TYPE_CHECKING, Dict, List, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Union if TYPE_CHECKING: from datafog.processing.text_processing.regex_annotator.regex_annotator import Span @@ -43,6 +43,7 @@ def __init__( text_chunk_length: int = 1000, engine: str = "regex", gliner_model: str = "urchade/gliner_multi_pii-v1", + locales: Optional[List[str]] = None, ): """ Initialize the TextService with specified chunk length and annotation engine. @@ -56,6 +57,7 @@ def __init__( - "auto": Try RegexAnnotator first and fall back to SpacyPIIAnnotator if no entities found - "smart": Try RegexAnnotator → GLiNER → SpaCy cascade (requires nlp-advanced extra) gliner_model: GLiNER model name to use when engine is "gliner" or "smart" + locales: Optional list of locale codes that enable locale-specific regex labels Raises: AssertionError: If an invalid engine type is provided @@ -65,6 +67,7 @@ def __init__( self.engine = engine self.text_chunk_length = text_chunk_length self.gliner_model = gliner_model + self.locales = locales # Lazy initialization - annotators created only when needed self._regex_annotator = None @@ -90,6 +93,7 @@ def __init__( engine=engine, text_chunk_length=text_chunk_length, gliner_model=gliner_model if engine in ("gliner", "smart") else None, + locales=locales, ) except Exception: pass @@ -102,7 +106,7 @@ def regex_annotator(self): RegexAnnotator, ) - self._regex_annotator = RegexAnnotator() + self._regex_annotator = RegexAnnotator(locales=self.locales) return self._regex_annotator @property diff --git a/datafog/services/text_service_lean.py b/datafog/services/text_service_lean.py index ce9203ec..50d110cd 100644 --- a/datafog/services/text_service_lean.py +++ b/datafog/services/text_service_lean.py @@ -6,7 +6,7 @@ """ import asyncio -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union from datafog.processing.text_processing.regex_annotator.regex_annotator import ( RegexAnnotator, @@ -26,7 +26,12 @@ class TextService: pip install datafog[nlp] """ - def __init__(self, text_chunk_length: int = 1000, engine: str = "regex"): + def __init__( + self, + text_chunk_length: int = 1000, + engine: str = "regex", + locales: Optional[List[str]] = None, + ): """ Initialize the TextService with specified chunk length and annotation engine. @@ -36,6 +41,7 @@ def __init__(self, text_chunk_length: int = 1000, engine: str = "regex"): - "regex": (Default) Use RegexAnnotator for fast pattern-based entity detection - "spacy": Use SpacyPIIAnnotator for NLP-based entity detection (requires nlp extra) - "auto": Try RegexAnnotator first and fall back to SpacyPIIAnnotator if no entities found + locales: Optional list of locale codes that enable locale-specific regex labels Raises: AssertionError: If an invalid engine type is provided @@ -43,8 +49,9 @@ def __init__(self, text_chunk_length: int = 1000, engine: str = "regex"): """ assert engine in {"regex", "spacy", "auto"}, "Invalid engine" self.engine = engine - self.regex_annotator = RegexAnnotator() + self.regex_annotator = RegexAnnotator(locales=locales) self.text_chunk_length = text_chunk_length + self.locales = locales # Only initialize spacy if needed and available self.spacy_annotator = None diff --git a/datafog/services/text_service_original.py b/datafog/services/text_service_original.py index 6d5dde1b..e8ea4ab3 100644 --- a/datafog/services/text_service_original.py +++ b/datafog/services/text_service_original.py @@ -4,7 +4,7 @@ """ import asyncio -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union from datafog.processing.text_processing.regex_annotator.regex_annotator import ( RegexAnnotator, @@ -22,7 +22,12 @@ class TextService: and combining annotations from multiple chunks. """ - def __init__(self, text_chunk_length: int = 1000, engine: str = "auto"): + def __init__( + self, + text_chunk_length: int = 1000, + engine: str = "auto", + locales: Optional[List[str]] = None, + ): """ Initialize the TextService with specified chunk length and annotation engine. @@ -32,6 +37,7 @@ def __init__(self, text_chunk_length: int = 1000, engine: str = "auto"): - "regex": Use only the RegexAnnotator for pattern-based entity detection - "spacy": Use only the SpacyPIIAnnotator for NLP-based entity detection - "auto": (Default) Try RegexAnnotator first and fall back to SpacyPIIAnnotator if no entities are found + locales: Optional list of locale codes that enable locale-specific regex labels Raises: AssertionError: If an invalid engine type is provided @@ -39,8 +45,9 @@ def __init__(self, text_chunk_length: int = 1000, engine: str = "auto"): assert engine in {"regex", "spacy", "auto"}, "Invalid engine" self.engine = engine self.spacy_annotator = SpacyPIIAnnotator.create() - self.regex_annotator = RegexAnnotator() + self.regex_annotator = RegexAnnotator(locales=locales) self.text_chunk_length = text_chunk_length + self.locales = locales def _chunk_text(self, text: str) -> List[str]: """Split the text into chunks of specified length.""" diff --git a/tests/test_de_pii_regex.py b/tests/test_de_pii_regex.py index 245aa669..f23c130a 100644 --- a/tests/test_de_pii_regex.py +++ b/tests/test_de_pii_regex.py @@ -3,6 +3,32 @@ from datafog.processing.text_processing.regex_annotator import RegexAnnotator +def _de_tax_id_check_digit(digits10: str) -> int: + product = 10 + for ch in digits10: + sum_ = (int(ch) + product) % 10 + if sum_ == 0: + sum_ = 10 + product = (sum_ * 2) % 11 + return (11 - product) % 10 + + +def _make_de_tax_id(digits10: str) -> str: + return digits10 + str(_de_tax_id_check_digit(digits10)) + + +def _format_de_tax_id_spaced(digits11: str) -> str: + return f"{digits11[:2]} {digits11[2:5]} {digits11[5:8]} {digits11[8:]}" + + +VALID_DE_TAX_ID = _make_de_tax_id("1234567890") +VALID_DE_TAX_ID_SPACED = _format_de_tax_id_spaced(VALID_DE_TAX_ID) +INVALID_DE_TAX_ID = ( + VALID_DE_TAX_ID[:-1] + + str((int(VALID_DE_TAX_ID[-1]) + 1) % 10) +) + + @pytest.mark.parametrize( "label,text,expected", [ @@ -28,13 +54,13 @@ ), ( "DE_TAX_ID", - "Steuer-ID 12345678901 liegt vor.", - "12345678901", + f"Steuer-ID {VALID_DE_TAX_ID} liegt vor.", + VALID_DE_TAX_ID, ), ( "DE_TAX_ID", - "Steuer-ID 12 345 678 901 ist gesetzt.", - "12 345 678 901", + f"Steuer-ID {VALID_DE_TAX_ID_SPACED} ist gesetzt.", + VALID_DE_TAX_ID_SPACED, ), ( "DE_SOCIAL_SECURITY_NUMBER", @@ -53,8 +79,8 @@ ), ( "DE_POSTAL_CODE", - "DE10115 Berlin.", - "DE10115", + "PLZ 10115 Berlin.", + "PLZ 10115", ), ( "DE_PASSPORT_NUMBER", @@ -69,7 +95,7 @@ ], ) def test_de_regex_positive_cases(label: str, text: str, expected: str) -> None: - annotator = RegexAnnotator() + annotator = RegexAnnotator(locales=["de"]) result = annotator.annotate(text) assert expected in result[label] @@ -83,6 +109,9 @@ def test_de_regex_positive_cases(label: str, text: str, expected: str) -> None: ("DE_IBAN", "IBAN DE44 5001 0517 5407 3249 3X ist gueltig."), ("DE_TAX_ID", "Steuer-ID 1234567890 liegt vor."), ("DE_TAX_ID", "Steuer-ID 123456789012 liegt vor."), + ("DE_TAX_ID", f"Steuer-ID {INVALID_DE_TAX_ID} liegt vor."), + ("DE_TAX_ID", "Steuer-ID 12345678901 liegt vor."), + ("DE_TAX_ID", "Steuer-ID 01234567890 liegt vor."), ( "DE_SOCIAL_SECURITY_NUMBER", "Rentenversicherungsnummer 65150804123 liegt vor.", @@ -92,14 +121,23 @@ def test_de_regex_positive_cases(label: str, text: str, expected: str) -> None: "Rentenversicherungsnummer 65150804AA23 liegt vor.", ), ("DE_POSTAL_CODE", "10115 Berlin."), + ("DE_POSTAL_CODE", "D12345"), + ("DE_POSTAL_CODE", "DE12345"), + ("DE_POSTAL_CODE", "DE10115 Berlin."), + ("DE_POSTAL_CODE", "D10115 Berlin."), ("DE_PASSPORT_NUMBER", "Passnummer 12345678 wurde geprueft."), + ("DE_PASSPORT_NUMBER", "Bestellung A12345678 liegt vor."), ( "DE_RESIDENCE_PERMIT_NUMBER", "Aufenthaltstitel AT12345678 gueltig.", ), + ( + "DE_RESIDENCE_PERMIT_NUMBER", + "AT1234567 ohne Kontext.", + ), ], ) def test_de_regex_negative_cases(label: str, text: str) -> None: - annotator = RegexAnnotator() + annotator = RegexAnnotator(locales=["de"]) result = annotator.annotate(text) assert not result[label] diff --git a/tests/test_regex_annotator.py b/tests/test_regex_annotator.py index 85894c6d..600d80e6 100644 --- a/tests/test_regex_annotator.py +++ b/tests/test_regex_annotator.py @@ -66,6 +66,20 @@ def test_regex_annotator_create_method(): assert isinstance(annotator, RegexAnnotator) +def test_de_labels_inactive_without_locale(): + """German DE_ labels should be inactive unless locales include 'de'.""" + annotator = RegexAnnotator() + result = annotator.annotate("Passnummer C12345678 wurde geprueft.") + assert not result["DE_PASSPORT_NUMBER"] + + +def test_de_labels_active_with_locale(): + """German DE_ labels should activate when locales include 'de'.""" + annotator = RegexAnnotator(locales=["de"]) + result = annotator.annotate("Passnummer C12345678 wurde geprueft.") + assert "C12345678" in result["DE_PASSPORT_NUMBER"] + + def test_empty_text_annotation(): """Test that annotating empty text returns empty results.""" annotator = RegexAnnotator()