Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion datafog/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,14 +210,21 @@ def get_supported_entities() -> List[str]:
Example:
>>> entities = get_supported_entities()
>>> print(entities)
['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DOB', 'ZIP']
['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DE_VAT_ID', 'DE_IBAN', 'DE_TAX_ID', 'DE_SOCIAL_SECURITY_NUMBER', 'DE_POSTAL_CODE', 'DE_PASSPORT_NUMBER', 'DE_RESIDENCE_PERMIT_NUMBER', 'DATE', 'ZIP_CODE']
"""
result = [
"EMAIL",
"PHONE",
"SSN",
"CREDIT_CARD",
"IP_ADDRESS",
"DE_VAT_ID",
"DE_IBAN",
"DE_TAX_ID",
"DE_SOCIAL_SECURITY_NUMBER",
"DE_POSTAL_CODE",
"DE_PASSPORT_NUMBER",
"DE_RESIDENCE_PERMIT_NUMBER",
"DATE",
"ZIP_CODE",
]
Expand Down
7 changes: 7 additions & 0 deletions datafog/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@
"SSN",
"CREDIT_CARD",
"IP_ADDRESS",
"DE_VAT_ID",
"DE_IBAN",
"DE_TAX_ID",
"DE_SOCIAL_SECURITY_NUMBER",
"DE_POSTAL_CODE",
"DE_PASSPORT_NUMBER",
"DE_RESIDENCE_PERMIT_NUMBER",
"DATE",
"ZIP_CODE",
"PERSON",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,22 @@ class RegexAnnotator:
"""

# Labels for PII entities
LABELS = ["EMAIL", "PHONE", "SSN", "CREDIT_CARD", "IP_ADDRESS", "DOB", "ZIP"]
LABELS = [
"EMAIL",
"PHONE",
"SSN",
"CREDIT_CARD",
"IP_ADDRESS",
"DOB",
"ZIP",
"DE_VAT_ID",
"DE_IBAN",
"DE_TAX_ID",
"DE_SOCIAL_SECURITY_NUMBER",
"DE_POSTAL_CODE",
"DE_PASSPORT_NUMBER",
"DE_RESIDENCE_PERMIT_NUMBER",
]

def __init__(self):
# Compile all patterns once at initialization
Expand Down Expand Up @@ -175,6 +190,84 @@ def __init__(self):
""",
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
),
# German VAT ID (USt-IdNr) - "DE", an optional space or hyphen, then 9 digits
"DE_VAT_ID": re.compile(
r"""
(?<![A-Za-z0-9])
DE
[\s-]?
\d{9}
(?![A-Za-z0-9])
""",
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
),
# German IBAN - DE followed by 20 digits (often grouped)
"DE_IBAN": re.compile(
r"""
(?<![A-Za-z0-9])
DE
\d{2}
(?:\s?\d{4}){4}
\s?\d{2}
(?![A-Za-z0-9])
""",
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
),
# German Tax ID (Steuer-ID) - 11 digits, contiguous or grouped 2-3-3-3 with optional spaces
"DE_TAX_ID": re.compile(
r"""
(?<![A-Za-z0-9])
(?:\d{11}|\d{2}\s?\d{3}\s?\d{3}\s?\d{3})
(?![A-Za-z0-9])
""",
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
),
# German Social Security Number (Rentenversicherungsnummer)
# Format: 2 digits + 6 digits (DOB) + 1 letter + 3 digits
"DE_SOCIAL_SECURITY_NUMBER": re.compile(
r"""
(?<![A-Za-z0-9])
\d{2}
\s?
\d{6}
\s?
[A-Z]
\s?
\d{3}
(?![A-Za-z0-9])
""",
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
),
# German postal code - prefixed format (PLZ/DE/D followed by 5 digits)
"DE_POSTAL_CODE": re.compile(
r"""
(?<![A-Za-z0-9])
(?:PLZ|DE|D)
\d{5}
(?![A-Za-z0-9])
""",
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
),
# German passport number - 1 letter followed by 8 digits
"DE_PASSPORT_NUMBER": re.compile(
r"""
(?<![A-Za-z0-9])
[A-Z]
\d{8}
(?![A-Za-z0-9])
""",
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
),
# German residence permit number - AT followed by 7 digits
"DE_RESIDENCE_PERMIT_NUMBER": re.compile(
r"""
(?<![A-Za-z0-9])
AT
\d{7}
(?![A-Za-z0-9])
""",
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
),
}

@classmethod
Expand Down
84 changes: 84 additions & 0 deletions tests/corpus/structured_pii.json
Original file line number Diff line number Diff line change
Expand Up @@ -733,5 +733,89 @@
"end": 5
}
]
},
{
"id": "de-vat-id-simple",
"input": "VAT number: DE123456789 for invoices.",
"expected_entities": [
{
"type": "DE_VAT_ID",
"text": "DE123456789",
"start": 12,
"end": 23
}
]
},
{
"id": "de-iban-formatted",
"input": "IBAN: DE89 3704 0044 0532 0130 00 for payments.",
"expected_entities": [
{
"type": "DE_IBAN",
"text": "DE89 3704 0044 0532 0130 00",
"start": 6,
"end": 33
}
]
},
{
"id": "de-tax-id-simple",
"input": "Steuer-ID 12345678901 liegt vor.",
"expected_entities": [
{
"type": "DE_TAX_ID",
"text": "12345678901",
"start": 10,
"end": 21
}
]
},
{
"id": "de-social-security-number",
"input": "Rentenversicherungsnummer 65150804A123 liegt vor.",
"expected_entities": [
{
"type": "DE_SOCIAL_SECURITY_NUMBER",
"text": "65150804A123",
"start": 26,
"end": 38
}
]
},
{
"id": "de-postal-code-prefixed",
"input": "PLZ10115 Berlin.",
"expected_entities": [
{
"type": "DE_POSTAL_CODE",
"text": "PLZ10115",
"start": 0,
"end": 8
}
]
},
{
"id": "de-passport-number",
"input": "Passnummer C12345678 wurde geprueft.",
"expected_entities": [
{
"type": "DE_PASSPORT_NUMBER",
"text": "C12345678",
"start": 11,
"end": 20
}
]
},
{
"id": "de-residence-permit-number",
"input": "Aufenthaltstitel AT1234567 gueltig.",
"expected_entities": [
{
"type": "DE_RESIDENCE_PERMIT_NUMBER",
"text": "AT1234567",
"start": 17,
"end": 26
}
]
}
]
105 changes: 105 additions & 0 deletions tests/test_de_pii_regex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import pytest

from datafog.processing.text_processing.regex_annotator import RegexAnnotator


# Each case is (entity label, sample sentence, substring expected under that label).
_DE_POSITIVE_CASES = [
    # VAT ID: space and hyphen separators after the "DE" prefix.
    ("DE_VAT_ID", "USt-IdNr DE 123456789 ist gesetzt.", "DE 123456789"),
    ("DE_VAT_ID", "USt-IdNr DE-123456789 liegt vor.", "DE-123456789"),
    # IBAN: contiguous and space-grouped spellings.
    (
        "DE_IBAN",
        "IBAN DE44500105175407324931 ist gueltig.",
        "DE44500105175407324931",
    ),
    (
        "DE_IBAN",
        "IBAN DE44 5001 0517 5407 3249 31 ist gueltig.",
        "DE44 5001 0517 5407 3249 31",
    ),
    # Tax ID: contiguous and space-grouped spellings.
    ("DE_TAX_ID", "Steuer-ID 12345678901 liegt vor.", "12345678901"),
    ("DE_TAX_ID", "Steuer-ID 12 345 678 901 ist gesetzt.", "12 345 678 901"),
    # Social security number: contiguous and space-grouped spellings.
    (
        "DE_SOCIAL_SECURITY_NUMBER",
        "Rentenversicherungsnummer 65150804A123 liegt vor.",
        "65150804A123",
    ),
    (
        "DE_SOCIAL_SECURITY_NUMBER",
        "Rentenversicherungsnummer 65 150804 A123 liegt vor.",
        "65 150804 A123",
    ),
    # Postal code: "PLZ" and "DE" prefixes.
    ("DE_POSTAL_CODE", "PLZ10115 Berlin.", "PLZ10115"),
    ("DE_POSTAL_CODE", "DE10115 Berlin.", "DE10115"),
    ("DE_PASSPORT_NUMBER", "Passnummer C12345678 wurde geprueft.", "C12345678"),
    (
        "DE_RESIDENCE_PERMIT_NUMBER",
        "Aufenthaltstitel AT1234567 gueltig.",
        "AT1234567",
    ),
]


@pytest.mark.parametrize("label,text,expected", _DE_POSITIVE_CASES)
def test_de_regex_positive_cases(label: str, text: str, expected: str) -> None:
    """Check that a well-formed German identifier is reported under its label.

    Args:
        label: Entity label to look up in the annotation result.
        text: Sentence passed to ``RegexAnnotator.annotate``.
        expected: Value that must be contained in the result for ``label``.
    """
    found_by_label = RegexAnnotator().annotate(text)
    assert expected in found_by_label[label]


# Each case is (entity label, sentence that must produce nothing for that label).
_DE_NEGATIVE_CASES = [
    # VAT ID: one digit too few, then one digit too many.
    ("DE_VAT_ID", "USt-IdNr DE12345678 liegt vor."),
    ("DE_VAT_ID", "USt-IdNr DE1234567890 liegt vor."),
    # IBAN: one digit short, then a non-digit in the final group.
    ("DE_IBAN", "IBAN DE4450010517540732493 ist gueltig."),
    ("DE_IBAN", "IBAN DE44 5001 0517 5407 3249 3X ist gueltig."),
    # Tax ID: 10 digits, then 12 digits.
    ("DE_TAX_ID", "Steuer-ID 1234567890 liegt vor."),
    ("DE_TAX_ID", "Steuer-ID 123456789012 liegt vor."),
    # Social security number: letter missing, then two letters.
    (
        "DE_SOCIAL_SECURITY_NUMBER",
        "Rentenversicherungsnummer 65150804123 liegt vor.",
    ),
    (
        "DE_SOCIAL_SECURITY_NUMBER",
        "Rentenversicherungsnummer 65150804AA23 liegt vor.",
    ),
    # Postal code without a prefix.
    ("DE_POSTAL_CODE", "10115 Berlin."),
    # Passport number without the leading letter.
    ("DE_PASSPORT_NUMBER", "Passnummer 12345678 wurde geprueft."),
    # Residence permit number with one digit too many.
    (
        "DE_RESIDENCE_PERMIT_NUMBER",
        "Aufenthaltstitel AT12345678 gueltig.",
    ),
]


@pytest.mark.parametrize("label,text", _DE_NEGATIVE_CASES)
def test_de_regex_negative_cases(label: str, text: str) -> None:
    """Check that a malformed German identifier yields nothing for its label.

    Args:
        label: Entity label to look up in the annotation result.
        text: Sentence passed to ``RegexAnnotator.annotate``.
    """
    found_by_label = RegexAnnotator().annotate(text)
    assert not found_by_label[label]
7 changes: 7 additions & 0 deletions tests/test_detection_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@
"SSN",
"CREDIT_CARD",
"IP_ADDRESS",
"DE_VAT_ID",
"DE_IBAN",
"DE_TAX_ID",
"DE_SOCIAL_SECURITY_NUMBER",
"DE_POSTAL_CODE",
"DE_PASSPORT_NUMBER",
"DE_RESIDENCE_PERMIT_NUMBER",
"DATE",
"ZIP_CODE",
}
Expand Down
20 changes: 17 additions & 3 deletions tests/test_regex_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,23 @@ def test_regex_annotator_initialization():
"""Test that the RegexAnnotator can be initialized."""
annotator = RegexAnnotator()
assert annotator is not None
assert (
len(annotator.LABELS) == 7
) # EMAIL, PHONE, SSN, CREDIT_CARD, IP_ADDRESS, DOB, ZIP
required_labels = {
"EMAIL",
"PHONE",
"SSN",
"CREDIT_CARD",
"IP_ADDRESS",
"DOB",
"ZIP",
"DE_VAT_ID",
"DE_IBAN",
"DE_TAX_ID",
"DE_SOCIAL_SECURITY_NUMBER",
"DE_POSTAL_CODE",
"DE_PASSPORT_NUMBER",
"DE_RESIDENCE_PERMIT_NUMBER",
}
assert required_labels.issubset(set(annotator.LABELS))


def test_regex_annotator_create_method():
Expand Down