diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 463cf894..0f1713f9 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.3.0 +current_version = 4.4.0a5 commit = True tag = True tag_name = v{new_version} @@ -20,7 +20,3 @@ values = [bumpversion:file:datafog/__about__.py] search = __version__ = "{current_version}" replace = __version__ = "{new_version}" - -[bumpversion:file:setup.py] -search = version="{current_version}" -replace = version="{new_version}" \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a3020390..dfcae0f4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,14 +31,6 @@ jobs: matrix: python-version: ["3.10", "3.11", "3.12", "3.13"] install-profile: ["core", "nlp", "nlp-advanced"] - exclude: - # v4.4.0 claims Python 3.13 support for core + CLI first. - # Optional heavyweight profiles remain validated separately before - # we advertise Python 3.13 support for them. 
- - python-version: "3.13" - install-profile: "nlp" - - python-version: "3.13" - install-profile: "nlp-advanced" steps: - uses: actions/checkout@v4 - name: Set up Python @@ -159,6 +151,7 @@ jobs: strategy: fail-fast: false matrix: + python-version: ["3.11"] install-profile: - core - cli @@ -167,18 +160,31 @@ jobs: - ocr - distributed - web + include: + - python-version: "3.13" + install-profile: nlp + - python-version: "3.13" + install-profile: nlp-advanced + - python-version: "3.13" + install-profile: ocr steps: - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.11" + python-version: ${{ matrix.python-version }} cache: "pip" - name: Upgrade pip run: | python -m pip install --upgrade pip + - name: Install Tesseract OCR + if: matrix.install-profile == 'ocr' + run: | + sudo apt-get update + sudo apt-get install -y tesseract-ocr libtesseract-dev + - name: Install dependencies (core) if: matrix.install-profile == 'core' run: | @@ -192,6 +198,7 @@ jobs: - name: Run install profile smoke test env: DATAFOG_INSTALL_PROFILE: ${{ matrix.install-profile }} + DATAFOG_REQUIRE_TESSERACT: ${{ matrix.install-profile == 'ocr' && '1' || '' }} run: | pytest tests/test_install_profiles.py -q diff --git a/.gitignore b/.gitignore index 2f62eff9..1316a1f3 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ error_log.txt # Environment .env .venv +.venv*/ venv/ env/ examples/venv/ @@ -58,14 +59,15 @@ docs/* !docs/conf.py !docs/Makefile !docs/make.bat +!docs/optional-surfaces.rst +!docs/agents/ +!docs/agents/** !docs/audit/ !docs/audit/** # Keep all directories but ignore their contents */**/__pycache__/ -# Keep all files but ignore their contents -Claude.md notes/benchmarking_notes.md Roadmap.md notes/* diff --git a/Claude.md b/AGENTS.md similarity index 82% rename from Claude.md rename to AGENTS.md index dcbe7934..a46c4402 100644 --- a/Claude.md +++ b/AGENTS.md @@ -1,18 +1,26 @@ -# DataFog - Claude Development Guide +# DataFog 
- Agent Development Guide ## Project Overview + **DataFog** is an open-source Python library for PII detection and anonymization with a focus on speed and lightweight architecture. ## Core Value Proposition + - **Ultra-Fast Performance**: 190x faster than spaCy for structured PII, 32x faster with GLiNER - **Lightweight Core**: <2MB package with optional ML extras - **Modern Engine Options**: Regex, GLiNER, spaCy, and smart cascading - **Production Ready**: Comprehensive testing, CI/CD, and performance validation ## Current Project Status -**Version: 4.3.0** + +**Stable version: 4.4.0** + +**Development version: 4.4.0a5** + +**Next minor target: 4.5.0** ### ✅ Recently Completed (Latest) + - **GLiNER Integration**: Modern NER engine with PII-specialized models - **Smart Cascading**: Intelligent regex → GLiNER → spaCy progression - **Enhanced CLI**: Model management with `--engine` flags @@ -43,6 +51,7 @@ python -c "from datafog.services.text_service import TextService; print('✅ All ## Architecture Overview ### Engine Ecosystem (Updated with GLiNER) + ```python from datafog.services.text_service import TextService @@ -59,21 +68,23 @@ auto_service = TextService(engine="auto") # Legacy: regex→spaCy ``` ### Performance Comparison (Validated) -| Engine | Speed vs spaCy | Accuracy | Use Case | Install | -|---------|----------------|----------|----------|---------| -| `regex` | **190x faster** | High (structured) | Emails, phones, SSNs | Core only | -| `gliner` | **32x faster** | Very High | Modern NER, custom entities | `[nlp-advanced]` | -| `spacy` | 1x (baseline) | Good | Traditional NLP | `[nlp]` | -| `smart` | **60x faster** | Highest | Best balance | `[nlp-advanced]` | + +| Engine | Speed vs spaCy | Accuracy | Use Case | Install | +| -------- | --------------- | ----------------- | --------------------------- | ---------------- | +| `regex` | **190x faster** | High (structured) | Emails, phones, SSNs | Core only | +| `gliner` | **32x faster** | Very High | Modern NER, 
custom entities | `[nlp-advanced]` | +| `spacy` | 1x (baseline) | Good | Traditional NLP | `[nlp]` | +| `smart` | **60x faster** | Highest | Best balance | `[nlp-advanced]` | ### Dependency Strategy + ```python # Lightweight core (<2MB) pip install datafog # Optional ML engines pip install datafog[nlp] # spaCy (traditional NLP) -pip install datafog[nlp-advanced] # GLiNER (modern NER) +pip install datafog[nlp-advanced] # GLiNER (modern NER) pip install datafog[ocr] # Image processing pip install datafog[all] # Everything ``` @@ -81,15 +92,18 @@ pip install datafog[all] # Everything ## GLiNER Integration (NEW) ### Overview + GLiNER (Generalist Model for Named Entity Recognition) provides modern, accurate NER capabilities optimized for PII detection. ### Key Features + - **PII-Specialized Models**: `urchade/gliner_multi_pii-v1` trained specifically for PII - **Custom Entity Types**: Configurable entity detection beyond default PII types - **Smart Cascading**: Automatically tries regex first, GLiNER second, spaCy last - **CLI Management**: Download and manage GLiNER models via CLI ### Usage Examples + ```python # GLiNER engine from datafog.services.text_service import TextService @@ -108,6 +122,7 @@ subprocess.run(["datafog", "list-models", "--engine", "gliner"]) ``` ### Available GLiNER Models + - `urchade/gliner_multi_pii-v1` - PII-specialized (recommended) - `urchade/gliner_base` - General purpose starter - `urchade/gliner_large-v2` - Higher accuracy @@ -116,17 +131,19 @@ subprocess.run(["datafog", "list-models", "--engine", "gliner"]) ## Development Workflow ### Git Branch Strategy + - **main**: Production releases only - **dev**: Main development branch (use this) -- **feature/***: New features from dev -- **fix/***: Bug fixes from dev +- **feature/\***: New features from dev +- **fix/\***: Bug fixes from dev ### Making Changes + ```bash # Start from dev git checkout dev && git pull origin dev -# Create feature branch +# Create feature branch git checkout -b 
feature/your-change # Make changes, test, commit @@ -137,6 +154,7 @@ git push -u origin feature/your-change ``` ### Testing + ```bash # Run specific test suites pytest tests/test_text_service.py -v # Core functionality @@ -149,13 +167,14 @@ PYTEST_DONUT=yes pytest tests/test_ocr_integration.py # OCR with real models # Performance requirements # - Regex: 150x+ faster than spaCy -# - GLiNER: 25x+ faster than spaCy +# - GLiNER: 25x+ faster than spaCy # - Package size: Core <2MB, full <8MB ``` ## Key Implementation Patterns ### Simple API (Recommended) + ```python # Always available, lightweight from datafog import detect, process @@ -164,6 +183,7 @@ result = process("john@example.com", method="redact") ``` ### Advanced Engine Selection + ```python # For specialized use cases from datafog.services.text_service import TextService @@ -173,7 +193,7 @@ service = TextService(engine="regex") # Modern NER with custom entities service = TextService( - engine="gliner", + engine="gliner", gliner_model="urchade/gliner_base" ) @@ -182,6 +202,7 @@ service = TextService(engine="smart") ``` ### Graceful Degradation + ```python # Handles missing dependencies elegantly try: @@ -194,18 +215,21 @@ except ImportError: ## Common Tasks ### Adding New Entity Types + 1. Update regex patterns in `regex_annotator.py` 2. Add GLiNER entity types in `gliner_annotator.py` 3. Update tests and benchmarks 4. Validate performance doesn't regress >10% ### Performance Optimization + 1. Profile with existing benchmarks 2. Maintain speed thresholds (regex 150x+, GLiNER 25x+) 3. Update baselines when making improvements 4. Test across all engines ### CLI Enhancements + 1. Update `client.py` with new commands 2. Support `--engine` flag for multi-engine commands 3. 
Add comprehensive help text and examples @@ -215,31 +239,36 @@ except ImportError: ### Workflow Architecture (3 workflows) -| Workflow | Purpose | Trigger | -|----------|---------|---------| -| `ci.yml` | Lint + Test + Coverage + Wheel size | Push/PR to main/dev | -| `release.yml` | Alpha/Beta/Stable publishing | Schedule + manual dispatch | -| `benchmark.yml` | Performance benchmarks | Push/PR/weekly | +| Workflow | Purpose | Trigger | +| --------------- | ----------------------------------- | -------------------------- | +| `ci.yml` | Lint + Test + Coverage + Wheel size | Push/PR to main/dev | +| `release.yml` | Alpha/Beta/Stable publishing | Schedule + manual dispatch | +| `benchmark.yml` | Performance benchmarks | Push/PR/weekly | ### Release Cadence + - **Alpha** (Mon-Wed 2AM UTC): Automatic from `dev`, date+commit versioning - **Beta** (Thursday 2AM UTC): Automatic from `dev`, incremental beta numbers - **Stable** (manual dispatch): From `main`, base version or override ### Release Pipeline + `determine-release` → `test` → `publish` → `cleanup` + - Tests are a hard gate — no tests = no publish - Stable releases check out `main`; alpha/beta check out `dev` - Old alphas pruned to 7, betas to 5 - `[skip ci]` in version bump commits to prevent loops ### Pre-commit Hooks + - **isort**, **black**, **flake8**, **ruff**: Code formatting and linting - **prettier**: Markdown, JSON, YAML formatting - **gitleaks**: Secret scanning - **pre-commit-hooks**: Large file checks, merge conflict detection, YAML validation ## Environment Variables + ```bash # Testing configuration export PYTEST_DONUT=yes # Enable real OCR testing @@ -250,33 +279,51 @@ export PYTHONPATH=$(pwd) # Local development imports ``` ## Performance Requirements + - **Core Package**: <2MB (from ~8MB in v4.0.x) - **Regex Engine**: 150x+ faster than spaCy (currently 190x) -- **GLiNER Engine**: 25x+ faster than spaCy (currently 32x) +- **GLiNER Engine**: 25x+ faster than spaCy (currently 32x) - **Memory 
Usage**: Graceful handling of large texts (1MB+ chunks) - **Model Loading**: Cache GLiNER models to avoid repeated downloads -## Best Practices for Claude Agents +## Agent skills + +### Issue tracker + +Issues and PRDs are tracked in Linear under the DFPY team. See `docs/agents/issue-tracker.md`. + +### Triage labels + +Use the default five-label triage vocabulary. See `docs/agents/triage-labels.md`. + +### Domain docs + +Single-context repo: use root `CONTEXT.md` and root `docs/adr/` when present. See `docs/agents/domain.md`. + +## Best Practices for Agents Before beginning any task please checkout a branch from `dev` and create a pull request to `dev`. ### Code Quality + - Follow existing patterns before implementing new approaches - Add comprehensive tests for all new functionality - Update documentation immediately with code changes - Run benchmarks for any text processing modifications ### GLiNER Development + - Use PII-specialized models when available (`urchade/gliner_multi_pii-v1`) - Test graceful degradation when GLiNER dependencies missing - Validate smart cascading thresholds with real data - Consider model download time and caching strategies ### Release Preparation + - Alpha/beta releases are automated via `release.yml` schedule - Stable releases: merge `dev` → `main`, then trigger `release.yml` with `stable` type - Use `dry_run: true` to validate before actual publish - Performance validation on realistic data sets -- In Release Notes or Comments, do not reference that it was authored by Claude (all code is anonymously authored) +- In Release Notes or Comments, do not reference that it was authored by an AI agent (all code is anonymously authored) -This guide provides the essential information for DataFog development while maintaining focus on current priorities and recent GLiNER integration work. 
\ No newline at end of file +This guide provides the essential information for DataFog development while maintaining focus on current priorities and recent GLiNER integration work. diff --git a/CHANGELOG.MD b/CHANGELOG.MD index 976e9cc5..45d042da 100644 --- a/CHANGELOG.MD +++ b/CHANGELOG.MD @@ -1,5 +1,70 @@ # ChangeLog +## [Unreleased] + +### `datafog-python` [4.5.0] + +#### Release Thesis + +- Frames 4.5.0 as a focused, lightweight text PII screening release rather + than a v5 package overhaul. +- Keeps the first path centered on core install, regex scanning/redaction, + CLI text commands, and agent-oriented guardrail helpers. +- Defers dedicated Sentry, OpenTelemetry, logging-framework, and cloud DLP + middleware adapters to v5 planning. + +#### Core Text PII Screening + +- Clarifies the live top-level APIs: `scan`, `redact`, `protect`, + `scan_prompt`, `filter_output`, `sanitize`, and guardrail helpers. +- Documents the current module map so users and contributors can distinguish + live 4.5 modules from historical compatibility and audit artifacts. +- Preserves backward-compatible `DataFog` and `TextService` entry points. + +#### German Structured PII + +- Adds regex-only German structured PII support without adding core + dependencies. +- Detects German VAT IDs and German IBANs by default because their country-code + structure is precise enough for default screening. +- Enables broader German identifiers only through `locales=["de"]` or explicit + entity selection, including German tax IDs, pension insurance numbers, + postal codes, passport numbers, and residence permit numbers. + +#### Optional Profiles And Python 3.13 + +- Certifies Python 3.13 support for the core SDK, CLI, `nlp`, `nlp-advanced`, + and `ocr` install profiles. +- Adds CI coverage for Python 3.13 `nlp` and `nlp-advanced` test profiles plus + 3.13 smoke checks for `nlp`, `nlp-advanced`, and `ocr`. +- Documents Donut OCR as requiring a local model before runtime use. 
+- Leaves `distributed` and `all` outside the new Python 3.13 certification + claim for 4.5.0. + +#### Optional OCR And Spark Surfaces + +- Documents OCR and Spark as supported optional surfaces, not deprecated + features and not the main 4.5 adoption path. +- Keeps local OCR behind `datafog[ocr]`, URL image inputs behind + `datafog[web,ocr]`, Donut behind `datafog[nlp-advanced,ocr]`, and Spark + behind `datafog[distributed]`. + +#### Telemetry And Privacy + +- Documents telemetry behavior without changing defaults. +- Telemetry remains disabled unless `DATAFOG_TELEMETRY=1` is set. +- `DATAFOG_NO_TELEMETRY=1` and `DO_NOT_TRACK=1` continue to force telemetry + off for tests, CI, and privacy-sensitive environments. + +#### Release Readiness + +- Adds a 4.5 release-readiness checklist covering docs build, formatting, + core no-network checks, install-profile smoke checks, German regex tests, + broad non-slow tests, package build checks, and final CI status. +- Clarifies the version alignment path: the development package remains + `4.4.0a5` until stable release promotion, and the final stable release should + publish as `4.5.0`. + ## [2026-02-13] ### `datafog-python` [4.3.0] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 483a2b3e..6e7e416e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,6 +27,11 @@ pushes except for explicit emergency maintenance. ## Local Development +The current contributor runbook for DataFog 4.5 lives in +[`docs/contributing.rst`](docs/contributing.rst). It includes supported Python +versions, install profiles, focused and broad test commands, docs-build +verification, and the 4.5 release-flow boundary. 
+ ```bash git clone https://github.com/datafog/datafog-python cd datafog-python @@ -43,17 +48,33 @@ package install: pip install -r requirements-dev.txt ``` -For optional NLP or OCR work, install the relevant extras: +For optional NLP, OCR, or distributed work, install the relevant extras: ```bash pip install -e ".[dev,cli,nlp]" pip install -e ".[dev,cli,nlp,nlp-advanced]" +pip install -e ".[dev,cli,ocr]" +pip install -e ".[dev,cli,distributed]" pip install -e ".[all,dev]" ``` ## Tests -Run the core test suite before opening a pull request: +Run focused tests for the area you changed before opening a pull request. For +core import and dependency-boundary work, use: + +```bash +DATAFOG_NO_TELEMETRY=1 DO_NOT_TRACK=1 \ + pytest tests/test_runtime_dependency_safety.py tests/test_no_network_core.py -q +``` + +For broader local confidence, run the non-slow suite: + +```bash +pytest -m "not slow" -q +``` + +To mimic the core CI profile, run: ```bash pytest tests/ -m "not slow" \ @@ -68,9 +89,12 @@ Run the focused test file for the area you changed whenever possible. For documentation-only changes, build the docs: ```bash -sphinx-build -b html docs docs/_build/html +python -m sphinx -b html docs docs/_build/html ``` +See [`docs/contributing.rst`](docs/contributing.rst) for optional-profile smoke +commands and release-prep checks. + ## Pull Request Checklist Before requesting review: diff --git a/README.md b/README.md index 62f7e10d..6f080e30 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,14 @@ It provides: - A simple agent-oriented API for LLM applications - Backward-compatible `DataFog` and `TextService` classes +## 4.5 Focus + +DataFog 4.5 is focused on lightweight text PII screening: a small core install, +fast regex-based scan/redact helpers, explicit optional extras, and a clearer +path toward future middleware use cases. Dedicated Sentry, OpenTelemetry, +logging-framework, and cloud DLP adapters are future-facing work and are not +part of the 4.5 release. 
+ ## Installation ```bash @@ -21,13 +29,20 @@ pip install datafog[nlp] # Add GLiNER + spaCy support pip install datafog[nlp-advanced] +# Add local OCR support +pip install datafog[ocr] + +# Add Spark/distributed support +pip install datafog[distributed] + # Everything pip install datafog[all] ``` -Python 3.13 support is certified for the core SDK and CLI. Optional extras such -as `nlp`, `nlp-advanced`, `ocr`, `distributed`, and `all` are available but not -yet certified on Python 3.13. +Python 3.13 support is certified for the core SDK, CLI, `nlp`, +`nlp-advanced`, and `ocr` install profiles. Donut OCR still requires a model +that is available locally before runtime use. `distributed` and `all` are not +newly certified on Python 3.13 in the 4.5 line. ## Quick Start @@ -62,6 +77,25 @@ print(datafog.sanitize("Card: 4111-1111-1111-1111", engine="regex")) # Card: [CREDIT_CARD_1] ``` +## German Structured PII + +German VAT IDs and German IBANs are detected by the default regex path because +their country-code structure is specific enough for default-on screening. +Broader German identifiers are available with explicit locale selection or +explicit entity-type filtering. + +```python +import datafog + +text = "Steuer-ID 12345678901 liegt vor." + +print(datafog.scan(text, engine="regex").entities) +# [] + +print(datafog.scan(text, engine="regex", locales=["de"]).entities) +# [Entity(type='DE_TAX_ID', text='12345678901', ...)] +``` + ### Guardrails ```python @@ -84,7 +118,8 @@ Use the engine that matches your accuracy and dependency constraints: - `regex`: - Fastest and always available. - - Best for structured entities: `EMAIL`, `PHONE`, `SSN`, `CREDIT_CARD`, `IP_ADDRESS`, `DATE`, `ZIP_CODE`. + - Best for structured entities: `EMAIL`, `PHONE`, `SSN`, `CREDIT_CARD`, `IP_ADDRESS`, `DE_VAT_ID`, `DE_IBAN`, `DATE`, `ZIP_CODE`. + - Use `locales=["de"]` for broader German structured IDs such as `DE_TAX_ID`, `DE_POSTAL_CODE`, and passport or residence permit numbers. 
- `spacy`: - Requires `pip install datafog[nlp]`. - Useful for unstructured entities like person and organization names. @@ -95,6 +130,31 @@ Use the engine that matches your accuracy and dependency constraints: - Cascades regex with optional NER engines. - If optional deps are missing, it degrades gracefully and warns. +## Optional OCR And Spark Surfaces + +DataFog 4.5 keeps the main package story centered on lightweight text PII +screening. OCR and Spark remain supported optional surfaces for users who +already rely on them, but they are not required for the core import, default +scan/redact helpers, or guardrail helpers. + +- OCR: + - Install `datafog[ocr]` for local image OCR helpers. + - URL-based image downloading also needs `datafog[web,ocr]`. + - Tesseract usage requires the system `tesseract` binary. + - Python 3.13 is validated for the OCR install profile, Pillow, + pytesseract, and system Tesseract smoke checks. + - Donut OCR requires `datafog[nlp-advanced,ocr]` and a model already available + locally. +- Spark: + - Install `datafog[distributed]` for `SparkService`. + - Spark PII UDF helpers also require `datafog[nlp]` and an installed spaCy + model. + - A Java runtime is required by PySpark. + +OCR and Spark are not deprecated. Their broader API and packaging overhaul is +deferred; the 4.5 goal is to keep them explicit, documented, and isolated from +the lightweight core path. + ## Backward-Compatible APIs The existing public API remains available. 
@@ -132,6 +192,9 @@ datafog replace-text "john@example.com" # Hash detected entities datafog hash-text "john@example.com" + +# Enable broader German regex identifiers +datafog redact-text "Steuer-ID 12345678901" --locale de ``` ## Telemetry diff --git a/datafog/__init__.py b/datafog/__init__.py index e3974ad7..4f7567a8 100644 --- a/datafog/__init__.py +++ b/datafog/__init__.py @@ -61,24 +61,6 @@ def _lazy_import_regex_annotator(): globals()["RegexAnnotator"] = RegexAnnotator -# Optional imports with graceful fallback -try: - from .client import app -except ImportError: - app = None - -try: - from .main import DataFog, TextPIIAnnotator -except ImportError: - DataFog = None - TextPIIAnnotator = None - -try: - from .services.text_service import TextService -except ImportError: - TextService = None - - def __getattr__(name: str): """Handle lazy imports for better lightweight performance.""" # Lazy import core models when first accessed @@ -98,46 +80,53 @@ def __getattr__(name: str): _lazy_import_regex_annotator() return globals()[name] - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") - + elif name in _LAZY_EXPORTS: + module_path, attr_name, extra_name = _LAZY_EXPORTS[name] + try: + module = __import__(module_path, fromlist=[attr_name]) + value = getattr(module, attr_name) + except ImportError: + if extra_name is None: + value = None + else: -# Optional heavy features - only import if dependencies available -def _optional_import(name, module_path, extra_name): - """Helper to import optional modules with helpful error messages.""" - try: - module = __import__(module_path, fromlist=[name]) - return getattr(module, name) - except ImportError: - - def _missing_dependency(*args, **kwargs): - raise ImportError( - f"{name} requires additional dependencies. " - f"Install with: pip install datafog[{extra_name}]" - ) + def _missing_dependency(*args, **kwargs): + raise ImportError( + f"{name} requires additional dependencies. 
" + f"Install with: pip install datafog[{extra_name}]" + ) - return _missing_dependency + value = _missing_dependency + globals()[name] = value + return value -# OCR/Image processing - requires 'ocr' extra -DonutProcessor = _optional_import( - "DonutProcessor", "datafog.processing.image_processing.donut_processor", "ocr" -) -PytesseractProcessor = _optional_import( - "PytesseractProcessor", - "datafog.processing.image_processing.pytesseract_processor", - "ocr", -) -ImageService = _optional_import("ImageService", "datafog.services.image_service", "ocr") + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") -# NLP processing - requires 'nlp' extra -SpacyPIIAnnotator = _optional_import( - "SpacyPIIAnnotator", "datafog.processing.text_processing.spacy_pii_annotator", "nlp" -) -# Distributed processing - requires 'distributed' extra -SparkService = _optional_import( - "SparkService", "datafog.services.spark_service", "distributed" -) +_LAZY_EXPORTS = { + "app": ("datafog.client", "app", None), + "DataFog": ("datafog.main", "DataFog", None), + "TextPIIAnnotator": ("datafog.main", "TextPIIAnnotator", None), + "TextService": ("datafog.services.text_service", "TextService", None), + "DonutProcessor": ( + "datafog.processing.image_processing.donut_processor", + "DonutProcessor", + "ocr", + ), + "PytesseractProcessor": ( + "datafog.processing.image_processing.pytesseract_processor", + "PytesseractProcessor", + "ocr", + ), + "ImageService": ("datafog.services.image_service", "ImageService", "ocr"), + "SpacyPIIAnnotator": ( + "datafog.processing.text_processing.spacy_pii_annotator", + "SpacyPIIAnnotator", + "nlp", + ), + "SparkService": ("datafog.services.spark_service", "SparkService", "distributed"), +} _REDACT_PRESETS = { @@ -163,6 +152,7 @@ def scan( text: str, engine: str = "regex", entity_types: list[str] | None = None, + locales: list[str] | None = None, ) -> ScanResult: """ v5-preview scan entrypoint. 
@@ -170,7 +160,7 @@ def scan( Defaults to the lightweight regex engine so the core install works without optional dependency fallback warnings. """ - return _scan(text=text, engine=engine, entity_types=entity_types) + return _scan(text=text, engine=engine, entity_types=entity_types, locales=locales) def redact( @@ -180,6 +170,7 @@ def redact( entity_types: list[str] | None = None, strategy: str = "token", preset: str | None = None, + locales: list[str] | None = None, ) -> RedactResult: """ v5-preview redaction entrypoint. @@ -202,6 +193,7 @@ def redact( engine=engine, entity_types=entity_types, strategy=strategy, + locales=locales, ) @@ -210,6 +202,7 @@ def protect( engine: str = "regex", strategy: str = "token", on_detect: str = "redact", + locales: list[str] | None = None, ): """ v5-preview guardrail factory. @@ -219,6 +212,7 @@ def protect( engine=engine, strategy=strategy, on_detect=on_detect, + locales=locales, ) diff --git a/datafog/__init___lean.py b/datafog/__init___lean.py index 40a3f530..5c25f9bc 100644 --- a/datafog/__init___lean.py +++ b/datafog/__init___lean.py @@ -1,4 +1,10 @@ -""" +"""Historical shadow package export module. + +This file is not the live DataFog 4.5 package export surface. Use +``datafog/__init__.py`` for current package exports, lazy optional imports, +top-level helpers, and compatibility shims. This snapshot remains importable +only as historical reference until legacy cleanup can remove it safely. + DataFog: Lightning-fast PII detection and anonymization library. Core package provides regex-based PII detection with 190x performance advantage. diff --git a/datafog/__init___original.py b/datafog/__init___original.py index 7838dd31..380511b1 100644 --- a/datafog/__init___original.py +++ b/datafog/__init___original.py @@ -1,3 +1,12 @@ +"""Historical shadow package export module. + +This file is not the live DataFog 4.5 package export surface. 
Use +``datafog/__init__.py`` for current package exports, lazy optional imports, +top-level helpers, and compatibility shims. This eager-export snapshot remains +importable only as historical reference until legacy cleanup can remove it +safely. +""" + from .__about__ import __version__ from .client import app from .config import OperationType, get_config diff --git a/datafog/agent.py b/datafog/agent.py index 58a84ed7..a031a147 100644 --- a/datafog/agent.py +++ b/datafog/agent.py @@ -27,11 +27,7 @@ class GuardrailWatch: def scan(self, text: str) -> ScanResult: """Scan text and increment detection counters.""" - result = scan( - text=text, - engine=self.guardrail.engine, - entity_types=self.guardrail.entity_types, - ) + result = self.guardrail.scan(text) if result.entities: self.detections += len(result.entities) return result @@ -51,7 +47,8 @@ class Guardrail: """Reusable text guardrail for wrapping LLM prompts and outputs.""" entity_types: Optional[list[str]] = None - engine: str = "smart" + locales: Optional[list[str]] = None + engine: str = "regex" strategy: str = "token" on_detect: str = "redact" @@ -61,7 +58,12 @@ def __post_init__(self) -> None: def scan(self, text: str) -> ScanResult: """Scan a text value for entities.""" - return scan(text=text, engine=self.engine, entity_types=self.entity_types) + return scan( + text=text, + engine=self.engine, + entity_types=self.entity_types, + locales=self.locales, + ) def filter(self, text: str) -> RedactResult: """Scan then enforce configured behavior.""" @@ -70,6 +72,7 @@ def filter(self, text: str) -> RedactResult: engine=self.engine, entity_types=self.entity_types, strategy=self.strategy, + locales=self.locales, ) if not result.entities: return result @@ -111,33 +114,34 @@ def watch(self) -> Iterator[GuardrailWatch]: yield watcher -def sanitize(text: str, **kwargs: Any) -> str: +def sanitize(text: str, engine: str = "regex", **kwargs: Any) -> str: """ One-liner PII removal. Returns the redacted text only. 
""" - result = scan_and_redact(text=text, **kwargs) + result = scan_and_redact(text=text, engine=engine, **kwargs) return result.redacted_text -def scan_prompt(prompt: str, **kwargs: Any) -> ScanResult: +def scan_prompt(prompt: str, engine: str = "regex", **kwargs: Any) -> ScanResult: """ Scan an LLM prompt for PII without modifying the input text. """ - return scan(prompt, **kwargs) + return scan(prompt, engine=engine, **kwargs) -def filter_output(output: str, **kwargs: Any) -> RedactResult: +def filter_output(output: str, engine: str = "regex", **kwargs: Any) -> RedactResult: """ Scan and redact PII from model output before returning to users. """ - return scan_and_redact(output, **kwargs) + return scan_and_redact(output, engine=engine, **kwargs) def create_guardrail( entity_types: Optional[list[str]] = None, - engine: str = "smart", + locales: Optional[list[str]] = None, + engine: str = "regex", strategy: str = "token", on_detect: str = "redact", ) -> Guardrail: @@ -146,6 +150,7 @@ def create_guardrail( """ return Guardrail( entity_types=entity_types, + locales=locales, engine=engine, strategy=strategy, on_detect=on_detect, diff --git a/datafog/client.py b/datafog/client.py index c3d493c8..eab203b8 100644 --- a/datafog/client.py +++ b/datafog/client.py @@ -6,7 +6,7 @@ import asyncio import logging -from typing import List +from typing import List, Optional import typer @@ -104,6 +104,11 @@ def scan_text( None, help="List of texts to extract text from" ), operations: str = typer.Option("scan", help="Operation to perform"), + locale: Optional[List[str]] = typer.Option( + None, + "--locale", + help="Regex locale to enable, for example de.", + ), ): """ Scan texts for PII. 
@@ -123,7 +128,7 @@ def scan_text( logging.basicConfig(level=logging.INFO) # Convert comma-separated string operations to a list of OperationType objects operation_list = [OperationType(op.strip()) for op in operations.split(",")] - text_client = DataFog(operations=operation_list) + text_client = DataFog(operations=operation_list, locales=locale) try: results = text_client.run_text_pipeline_sync(str_list=str_list) typer.echo(f"Text Pipeline Results: {results}") @@ -316,7 +321,14 @@ def list_entities(): @app.command() -def redact_text(text: str = typer.Argument(None, help="Text to redact")): +def redact_text( + text: str = typer.Argument(None, help="Text to redact"), + locale: Optional[List[str]] = typer.Option( + None, + "--locale", + help="Regex locale to enable, for example de.", + ), +): """ Redact PII in text. @@ -329,7 +341,12 @@ def redact_text(text: str = typer.Argument(None, help="Text to redact")): typer.echo("No text provided to redact.") raise typer.Exit(code=1) - result = scan_and_redact(text=text, engine="smart", strategy="token") + result = scan_and_redact( + text=text, + engine="smart", + strategy="token", + locales=locale, + ) typer.echo(result.redacted_text) try: @@ -346,7 +363,14 @@ def redact_text(text: str = typer.Argument(None, help="Text to redact")): @app.command() -def replace_text(text: str = typer.Argument(None, help="Text to replace PII")): +def replace_text( + text: str = typer.Argument(None, help="Text to replace PII"), + locale: Optional[List[str]] = typer.Option( + None, + "--locale", + help="Regex locale to enable, for example de.", + ), +): """ Replace PII in text with anonymized values. 
@@ -359,7 +383,12 @@ def replace_text(text: str = typer.Argument(None, help="Text to replace PII")): typer.echo("No text provided to replace PII.") raise typer.Exit(code=1) - result = scan_and_redact(text=text, engine="smart", strategy="pseudonymize") + result = scan_and_redact( + text=text, + engine="smart", + strategy="pseudonymize", + locales=locale, + ) typer.echo(result.redacted_text) try: @@ -379,6 +408,11 @@ def replace_text(text: str = typer.Argument(None, help="Text to replace PII")): def hash_text( text: str = typer.Argument(None, help="Text to hash PII"), hash_type: HashType = typer.Option(HashType.SHA256, help="Hash algorithm to use"), + locale: Optional[List[str]] = typer.Option( + None, + "--locale", + help="Regex locale to enable, for example de.", + ), ): """ Choose from SHA256, MD5, or SHA3-256 algorithms to hash detected PII in text. @@ -395,7 +429,12 @@ def hash_text( # HashType is retained for backward-compatible CLI signature. _ = hash_type - result = scan_and_redact(text=text, engine="smart", strategy="hash") + result = scan_and_redact( + text=text, + engine="smart", + strategy="hash", + locales=locale, + ) typer.echo(result.redacted_text) try: diff --git a/datafog/core.py b/datafog/core.py index f4e17850..aa207f6d 100644 --- a/datafog/core.py +++ b/datafog/core.py @@ -9,6 +9,7 @@ from datafog.engine import scan, scan_and_redact from datafog.models.anonymizer import AnonymizerType +from datafog.processing.text_processing.regex_annotator import RegexAnnotator # Engine types as constants REGEX_ENGINE = "regex" @@ -16,7 +17,10 @@ AUTO_ENGINE = "auto" -def detect_pii(text: str) -> Dict[str, List[str]]: +def detect_pii( + text: str, + locales: List[str] | None = None, +) -> Dict[str, List[str]]: """ Simple PII detection using lightweight regex engine. @@ -37,7 +41,7 @@ def detect_pii(text: str) -> Dict[str, List[str]]: try: # Use engine boundary for canonical scan behavior. 
- scan_result = scan(text=text, engine=REGEX_ENGINE) + scan_result = scan(text=text, engine=REGEX_ENGINE, locales=locales) pii_dict: Dict[str, List[str]] = {} for entity in scan_result.entities: if not entity.text.strip(): @@ -81,7 +85,11 @@ def detect_pii(text: str) -> Dict[str, List[str]]: ) from e -def anonymize_text(text: str, method: Union[str, AnonymizerType] = "redact") -> str: +def anonymize_text( + text: str, + method: Union[str, AnonymizerType] = "redact", + locales: List[str] | None = None, +) -> str: """ Simple text anonymization using lightweight regex engine. @@ -120,6 +128,7 @@ def anonymize_text(text: str, method: Union[str, AnonymizerType] = "redact") -> text=text, engine=REGEX_ENGINE, strategy=strategy_map[method], + locales=locales, ) try: @@ -155,7 +164,9 @@ def anonymize_text(text: str, method: Union[str, AnonymizerType] = "redact") -> def scan_text( - text: str, return_entities: bool = False + text: str, + return_entities: bool = False, + locales: List[str] | None = None, ) -> Union[bool, Dict[str, List[str]]]: """ Quick scan to check if text contains any PII. @@ -180,7 +191,7 @@ def scan_text( _start = _time.monotonic() - entities = detect_pii(text) + entities = detect_pii(text, locales=locales) result = entities if return_entities else len(entities) > 0 @@ -200,7 +211,7 @@ def scan_text( return result -def get_supported_entities() -> List[str]: +def get_supported_entities(locales: List[str] | None = None) -> List[str]: """ Get list of PII entity types supported by the regex engine. 
@@ -210,17 +221,15 @@ def get_supported_entities() -> List[str]: Example: >>> entities = get_supported_entities() >>> print(entities) - ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DOB', 'ZIP'] + ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'IP_ADDRESS', 'DE_VAT_ID', 'DE_IBAN', 'DATE', 'ZIP_CODE'] """ - result = [ - "EMAIL", - "PHONE", - "SSN", - "CREDIT_CARD", - "IP_ADDRESS", - "DATE", - "ZIP_CODE", - ] + annotator = RegexAnnotator(locales=locales) + legacy_map = {"DOB": "DATE", "ZIP": "ZIP_CODE"} + result = [] + for label in annotator.active_labels: + canonical = legacy_map.get(label, label) + if canonical not in result: + result.append(canonical) try: from datafog.telemetry import track_function_call diff --git a/datafog/engine.py b/datafog/engine.py index 1a94e634..250cd6fe 100644 --- a/datafog/engine.py +++ b/datafog/engine.py @@ -31,6 +31,13 @@ "SSN", "CREDIT_CARD", "IP_ADDRESS", + "DE_VAT_ID", + "DE_IBAN", + "DE_TAX_ID", + "DE_SOCIAL_SECURITY_NUMBER", + "DE_POSTAL_CODE", + "DE_PASSPORT_NUMBER", + "DE_RESIDENCE_PERMIT_NUMBER", "DATE", "ZIP_CODE", "PERSON", @@ -41,6 +48,20 @@ NER_ENTITY_TYPES = {"PERSON", "ORGANIZATION", "LOCATION", "ADDRESS"} +ENTITY_TYPE_PRIORITY = { + "DE_IBAN": 100, + "DE_VAT_ID": 100, + "DE_TAX_ID": 100, + "DE_SOCIAL_SECURITY_NUMBER": 100, + "DE_POSTAL_CODE": 100, + "DE_PASSPORT_NUMBER": 100, + "DE_RESIDENCE_PERMIT_NUMBER": 100, + "CREDIT_CARD": 90, + "IP_ADDRESS": 80, + "SSN": 70, + "PHONE": 60, +} + @dataclass(frozen=True) class _UnavailableAnnotator: @@ -131,8 +152,40 @@ def _entities_from_dict( return entities -def _regex_entities(text: str) -> list[Entity]: - annotator = RegexAnnotator() +def _entity_length(entity: Entity) -> int: + return max(entity.end - entity.start, 0) + + +def _entities_overlap(left: Entity, right: Entity) -> bool: + if left.start < 0 or right.start < 0: + return False + return left.start < right.end and right.start < left.end + + +def _suppress_overlapping_entities(entities: list[Entity]) -> 
list[Entity]: + selected: list[Entity] = [] + for entity in sorted( + entities, + key=lambda item: ( + -_entity_length(item), + -ENTITY_TYPE_PRIORITY.get(item.type, 0), + item.start, + item.end, + item.type, + ), + ): + if any(_entities_overlap(entity, kept) for kept in selected): + continue + selected.append(entity) + return sorted(selected, key=lambda item: (item.start, item.end, item.type)) + + +def _regex_entities( + text: str, + entity_types: Optional[list[str]] = None, + locales: Optional[list[str]] = None, +) -> list[Entity]: + annotator = RegexAnnotator(locales=locales, enabled_labels=entity_types) _, structured = annotator.annotate_with_spans(text) entities: list[Entity] = [] for span in structured.spans: @@ -148,7 +201,7 @@ def _regex_entities(text: str) -> list[Entity]: engine="regex", ) ) - return entities + return _suppress_overlapping_entities(entities) def _spacy_entities(text: str) -> list[Entity]: @@ -235,6 +288,7 @@ def scan( text: str, engine: str = "smart", entity_types: Optional[list[str]] = None, + locales: Optional[list[str]] = None, ) -> ScanResult: """Scan text for PII entities.""" if not isinstance(text, str): @@ -243,7 +297,11 @@ def scan( if engine not in {"regex", "spacy", "gliner", "smart"}: raise ValueError("engine must be one of: regex, spacy, gliner, smart") - regex_entities = _regex_entities(text) + regex_entities = _regex_entities( + text, + entity_types=entity_types, + locales=locales, + ) if engine == "regex": filtered = _filter_entity_types(regex_entities, entity_types) @@ -378,7 +436,13 @@ def scan_and_redact( engine: str = "smart", entity_types: Optional[list[str]] = None, strategy: str = "token", + locales: Optional[list[str]] = None, ) -> RedactResult: """Convenience wrapper: scan then redact.""" - scan_result = scan(text=text, engine=engine, entity_types=entity_types) + scan_result = scan( + text=text, + engine=engine, + entity_types=entity_types, + locales=locales, + ) return redact(text=text, 
entities=scan_result.entities, strategy=strategy) diff --git a/datafog/main.py b/datafog/main.py index 31ac22e5..a8b1bcba 100644 --- a/datafog/main.py +++ b/datafog/main.py @@ -39,8 +39,10 @@ def __init__( operations: List[OperationType] = [OperationType.SCAN], hash_type: HashType = HashType.SHA256, anonymizer_type: AnonymizerType = AnonymizerType.REPLACE, + locales: List[str] | None = None, ): - self.regex_annotator = RegexAnnotator() + self.locales = locales + self.regex_annotator = RegexAnnotator(locales=locales) normalized_ops: List[OperationType] = [] for op in operations: if isinstance(op, OperationType): @@ -181,7 +183,7 @@ def detect(self, text: str) -> dict: _start = _time.monotonic() - scan_result = scan(text=text, engine="regex") + scan_result = scan(text=text, engine="regex", locales=self.locales) result = {label: [] for label in RegexAnnotator.LABELS} legacy_map = {"DATE": "DOB", "ZIP_CODE": "ZIP"} for entity in scan_result.entities: @@ -246,6 +248,7 @@ def process( text=text, engine="regex", strategy=strategy, + locales=self.locales, ) result["anonymized"] = redact_result.redacted_text @@ -288,8 +291,8 @@ class TextPIIAnnotator: regex_annotator: RegexAnnotator instance for text annotation. """ - def __init__(self): - self.regex_annotator = RegexAnnotator() + def __init__(self, locales: List[str] | None = None): + self.regex_annotator = RegexAnnotator(locales=locales) def run(self, text, output_path=None): """ diff --git a/datafog/main_lean.py b/datafog/main_lean.py index af61559e..f67501e6 100644 --- a/datafog/main_lean.py +++ b/datafog/main_lean.py @@ -1,4 +1,10 @@ -""" +"""Historical shadow main module. + +This file is not the live DataFog 4.5 ``DataFog`` implementation. Use +``datafog/main.py`` for the current backward-compatible class and text/OCR +compatibility methods. This lean snapshot remains importable only as historical +reference until legacy cleanup can remove it safely. + Lean main module for DataFog core functionality. 
This module contains the lightweight core classes for DataFog: diff --git a/datafog/main_original.py b/datafog/main_original.py index 58224e59..9594c824 100644 --- a/datafog/main_original.py +++ b/datafog/main_original.py @@ -1,4 +1,11 @@ -""" +"""Historical shadow main module. + +This file is not the live DataFog 4.5 ``DataFog`` implementation. Use +``datafog/main.py`` for the current backward-compatible class and use the +optional OCR/Spark service modules directly when those surfaces are needed. +This original full-featured snapshot remains importable for legacy tests until +cleanup can remove it safely. + Main module for DataFog. This module contains the core classes for DataFog: diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py index a843a8d8..d478b43d 100644 --- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py +++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py @@ -1,5 +1,6 @@ import re -from typing import Dict, List, Pattern, Tuple +from collections.abc import Iterable +from typing import Dict, List, Match, Pattern, Tuple from pydantic import BaseModel @@ -27,12 +28,36 @@ class RegexAnnotator: performance, targeting ≤ 20 µs / kB on a MacBook M-series. 
""" - # Labels for PII entities - LABELS = ["EMAIL", "PHONE", "SSN", "CREDIT_CARD", "IP_ADDRESS", "DOB", "ZIP"] + BASE_LABELS = ["EMAIL", "PHONE", "SSN", "CREDIT_CARD", "IP_ADDRESS", "DOB", "ZIP"] + GERMAN_LABELS = [ + "DE_VAT_ID", + "DE_IBAN", + "DE_TAX_ID", + "DE_SOCIAL_SECURITY_NUMBER", + "DE_POSTAL_CODE", + "DE_PASSPORT_NUMBER", + "DE_RESIDENCE_PERMIT_NUMBER", + ] + DEFAULT_LOCALIZED_LABELS = ["DE_VAT_ID", "DE_IBAN"] + LABELS = BASE_LABELS + GERMAN_LABELS + DEFAULT_LABELS = BASE_LABELS + DEFAULT_LOCALIZED_LABELS + SUPPORTED_LOCALES = {"de", "de-de", "de_de"} + LOCALE_LABELS = { + "de": GERMAN_LABELS, + "de-de": GERMAN_LABELS, + "de_de": GERMAN_LABELS, + } + + def __init__( + self, + locales: str | Iterable[str] | None = None, + enabled_labels: Iterable[str] | None = None, + ): + self.locales = self._normalize_locales(locales) + self.active_labels = self._resolve_active_labels(enabled_labels) - def __init__(self): # Compile all patterns once at initialization - self.patterns: Dict[str, Pattern] = { + all_patterns: Dict[str, Pattern] = { # Email pattern - RFC 5322 subset # Intentionally permissive to favor false positives over false negatives # Allows for multiple dots, special characters in local part, and subdomains @@ -175,12 +200,171 @@ def __init__(self): """, re.IGNORECASE | re.MULTILINE | re.VERBOSE, ), + # German VAT ID (USt-IdNr) - DE followed by 9 digits. + "DE_VAT_ID": re.compile( + r""" + (? + (? + (? + (?: + PLZ[\s:-]? | + DE[\s-] | + D[\s-] + ) + \d{5} + ) + (?![A-Za-z0-9]) + """, + re.IGNORECASE | re.MULTILINE | re.VERBOSE, + ), + # German passport number - context required; bare A12345678 is too broad. + "DE_PASSPORT_NUMBER": re.compile( + r""" + (?: + Passnummer | + Reisepass(?:nummer)? | + Passport(?:\s+No\.?|\s+Number)? + ) + \s*[:#-]?\s* + (?P + (? + (? 
tuple[str, ...]: + if locales is None: + return () + if isinstance(locales, str): + locales = [locales] + normalized = [] + for locale in locales: + value = locale.strip().lower() + if not value: + continue + if value not in cls.SUPPORTED_LOCALES: + allowed = ", ".join(sorted(cls.SUPPORTED_LOCALES)) + raise ValueError(f"locale must be one of: {allowed}") + normalized.append(value) + return tuple(dict.fromkeys(normalized)) + + def _resolve_active_labels(self, enabled_labels: Iterable[str] | None) -> list[str]: + active = set(self.DEFAULT_LABELS) + for locale in self.locales: + active.update(self.LOCALE_LABELS[locale]) + if enabled_labels is not None: + active.update(label.strip().upper() for label in enabled_labels) + return [label for label in self.LABELS if label in active] + + @staticmethod + def _match_text(match: Match[str]) -> str: + return match.groupdict().get("value") or match.group() + + @staticmethod + def _match_span(match: Match[str]) -> tuple[int, int]: + if "value" in match.groupdict() and match.group("value") is not None: + return match.start("value"), match.end("value") + return match.start(), match.end() @classmethod - def create(cls) -> "RegexAnnotator": + def create(cls, **kwargs) -> "RegexAnnotator": """Factory method to create a new RegexAnnotator instance.""" - return cls() + return cls(**kwargs) def annotate(self, text: str) -> Dict[str, List[str]]: """Annotate text with PII entities using regex patterns. 
@@ -200,7 +384,7 @@ def annotate(self, text: str) -> Dict[str, List[str]]: # Process with each pattern for label, pattern in self.patterns.items(): for match in pattern.finditer(text): - result[label].append(match.group()) + result[label].append(self._match_text(match)) return result @@ -225,11 +409,12 @@ def annotate_with_spans( for label, pattern in self.patterns.items(): for match in pattern.finditer(text): + start, end = self._match_span(match) span = Span( label=label, - start=match.start(), - end=match.end(), - text=match.group(), + start=start, + end=end, + text=self._match_text(match), ) spans_by_label[label].append(span) all_spans.append(span) diff --git a/datafog/services/text_service.py b/datafog/services/text_service.py index 0956256f..1057eb6a 100644 --- a/datafog/services/text_service.py +++ b/datafog/services/text_service.py @@ -43,6 +43,7 @@ def __init__( text_chunk_length: int = 1000, engine: str = "regex", gliner_model: str = "urchade/gliner_multi_pii-v1", + locales: List[str] | None = None, ): """ Initialize the TextService with specified chunk length and annotation engine. @@ -56,6 +57,8 @@ def __init__( - "auto": Try RegexAnnotator first and fall back to SpacyPIIAnnotator if no entities found - "smart": Try RegexAnnotator → GLiNER → SpaCy cascade (requires nlp-advanced extra) gliner_model: GLiNER model name to use when engine is "gliner" or "smart" + locales: Optional locale tags for regex detection. Use ["de"] to enable + broader German structured identifiers. 
Raises: AssertionError: If an invalid engine type is provided @@ -65,6 +68,7 @@ def __init__( self.engine = engine self.text_chunk_length = text_chunk_length self.gliner_model = gliner_model + self.locales = locales # Lazy initialization - annotators created only when needed self._regex_annotator = None @@ -102,7 +106,7 @@ def regex_annotator(self): RegexAnnotator, ) - self._regex_annotator = RegexAnnotator() + self._regex_annotator = RegexAnnotator(locales=self.locales) return self._regex_annotator @property diff --git a/datafog/services/text_service_lean.py b/datafog/services/text_service_lean.py index ce9203ec..1262db3f 100644 --- a/datafog/services/text_service_lean.py +++ b/datafog/services/text_service_lean.py @@ -1,4 +1,11 @@ -"""Lean text processing service for PII annotation. +"""Historical shadow text service module. + +This file is not the live DataFog 4.5 text service. Use +``datafog/services/text_service.py`` for the current regex, spaCy, GLiNER, +auto, and smart engine boundary. This lean snapshot remains importable only as +historical reference until legacy cleanup can remove it safely. + +Lean text processing service for PII annotation. Provides synchronous and asynchronous methods for annotating text with personally identifiable information (PII) using regex patterns. Supports chunking long texts diff --git a/datafog/services/text_service_original.py b/datafog/services/text_service_original.py index 6d5dde1b..8ad576a9 100644 --- a/datafog/services/text_service_original.py +++ b/datafog/services/text_service_original.py @@ -1,4 +1,11 @@ -"""Text processing service for PII annotation. +"""Historical shadow text service module. + +This file is not the live DataFog 4.5 text service. Use +``datafog/services/text_service.py`` for the current regex, spaCy, GLiNER, +auto, and smart engine boundary. This original snapshot remains importable for +legacy tests until cleanup can remove it safely. + +Text processing service for PII annotation. 
Provides synchronous and asynchronous methods for annotating text with personally identifiable information (PII) using SpaCy or regex patterns. Supports chunking long texts and batch processing. """ diff --git a/docs/agents/domain.md b/docs/agents/domain.md new file mode 100644 index 00000000..4fbc0445 --- /dev/null +++ b/docs/agents/domain.md @@ -0,0 +1,31 @@ +# Domain Docs + +How the engineering skills should consume this repo's domain documentation when exploring the codebase. + +Configured layout: single-context. + +## Before exploring, read these + +- **`CONTEXT.md`** at the repo root. +- **`docs/adr/`** for ADRs that touch the area you're about to work in. + +If any of these files don't exist, proceed silently. Don't flag their absence; don't suggest creating them upfront. The producer skill (`/grill-with-docs`) creates them lazily when terms or decisions actually get resolved. + +## File structure + +```text +/ +|-- CONTEXT.md +|-- docs/adr/ +`-- datafog/ +``` + +## Use the glossary's vocabulary + +When your output names a domain concept in an issue title, refactor proposal, hypothesis, or test name, use the term as defined in `CONTEXT.md`. Don't drift to synonyms the glossary explicitly avoids. + +If the concept you need isn't in the glossary yet, that's a signal: either you're inventing language the project doesn't use, or there's a real gap to note for `/grill-with-docs`. + +## Flag ADR conflicts + +If your output contradicts an existing ADR, surface it explicitly rather than silently overriding. diff --git a/docs/agents/issue-tracker.md b/docs/agents/issue-tracker.md new file mode 100644 index 00000000..b07dfca3 --- /dev/null +++ b/docs/agents/issue-tracker.md @@ -0,0 +1,24 @@ +# Issue tracker: Linear + +Issues and PRDs for this repo live in Linear under the DFPY team: + +https://linear.app/threadfork/team/DFPY/all + +Use the Linear connector/app when available. Do not create GitHub or GitLab issues for this repo unless the user explicitly asks for that. 
+ +## Conventions + +- Create new issues in the DFPY team. +- Use the triage labels mapped in `docs/agents/triage-labels.md`. +- Keep issue titles concise and action-oriented. +- Include enough context, acceptance criteria, and verification notes for an AFK agent or human implementer to pick up the work. +- When referencing code, include repo-relative file paths and relevant symbols. +- When a task comes from a PRD, link related Linear issues together where possible. + +## When a skill says "publish to the issue tracker" + +Create a Linear issue in the DFPY team. + +## When a skill says "fetch the relevant ticket" + +Use the Linear connector/app to read the referenced Linear issue, including description, labels, status, comments, and linked issues. diff --git a/docs/agents/triage-labels.md b/docs/agents/triage-labels.md new file mode 100644 index 00000000..0806b2f8 --- /dev/null +++ b/docs/agents/triage-labels.md @@ -0,0 +1,13 @@ +# Triage Labels + +The skills speak in terms of five canonical triage roles. This file maps those roles to the actual label strings used in this repo's issue tracker. + +| Label in mattpocock/skills | Label in our tracker | Meaning | +| -------------------------- | -------------------- | ---------------------------------------- | +| `needs-triage` | `needs-triage` | Maintainer needs to evaluate this issue | +| `needs-info` | `needs-info` | Waiting on reporter for more information | +| `ready-for-agent` | `ready-for-agent` | Fully specified, ready for an AFK agent | +| `ready-for-human` | `ready-for-human` | Requires human implementation | +| `wontfix` | `wontfix` | Will not be actioned | + +When a skill mentions a role, use the corresponding label string from this table. diff --git a/docs/cli.rst b/docs/cli.rst index a4c67272..9742887d 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -7,6 +7,29 @@ Overview The main entrypoint for the CLI is through the DataFog client file, defined in :mod:`datafog.client`. 
We use Typer to build the CLI, with each command defined as a separate function. +Core text commands such as ``scan-text``, ``redact-text``, ``replace-text``, +and ``hash-text`` are the primary 4.5 CLI path. OCR commands remain available +for existing users, but they are optional: + +* Local image OCR requires ``datafog[ocr]`` and any needed system OCR binaries + such as Tesseract. +* URL-based image OCR also requires ``datafog[web,ocr]``. +* Donut OCR requires ``datafog[nlp-advanced,ocr]`` and a local model. + +Spark/distributed workflows are Python SDK surfaces rather than first-path CLI +commands. Install ``datafog[distributed]`` when using ``SparkService``. + +German locale support +--------------------- + +German VAT IDs and German IBANs are detected by the default regex path. Broader +German identifiers are opt-in through ``--locale de`` on the core text commands: + +.. code-block:: bash + + datafog scan-text "Steuer-ID 12345678901" --locale de + datafog redact-text "Passnummer C12345678" --locale de + Definitions ----------- .. automodule:: datafog.client diff --git a/docs/conf.py b/docs/conf.py index 1cb1c895..d71e76b0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -3,13 +3,18 @@ # For the full list of built-in configuration values, see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html +import re +from pathlib import Path + # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = "DataFog" copyright = "2024, DataFog Inc." 
author = "Sid Mohan" -release = "v4.1.1" +_version_file = Path(__file__).resolve().parents[1] / "datafog" / "__about__.py" +_version_match = re.search(r'^__version__ = "([^"]+)"', _version_file.read_text(), re.M) +release = f"v{_version_match.group(1)}" if _version_match else "v0.0.0" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/contributing.rst b/docs/contributing.rst new file mode 100644 index 00000000..b1b58777 --- /dev/null +++ b/docs/contributing.rst @@ -0,0 +1,211 @@ +======================================= +Contributor Setup And 4.5 Release Flow +======================================= + +This page is the contributor runbook for DataFog 4.5 work. It is meant for +humans and agents preparing local changes, choosing verification commands, and +understanding where the 4.5 release boundary sits. + +Version Frame +============= + +Current release planning uses this frame: + +* Stable package release: ``4.4.0``. +* Current development package version: ``4.4.0a5``. +* Next minor target: ``4.5.0``. + +Do not bump routine feature, documentation, or cleanup branches directly to +``4.5.0``. Keep the version stable during local release-prep work, then handle +the final version and release-note alignment in the release-readiness slice. + +Python Environments +=================== + +DataFog currently declares support for Python ``>=3.10,<3.14``. The CI matrix +tests core, NLP, and NLP-advanced installs on Python 3.10, 3.11, 3.12, and +3.13. OCR profile smoke checks also run on Python 3.13 with system Tesseract +installed. Distributed and all-profile Python 3.13 validation remain outside +the 4.5 support claim. + +Create one virtual environment per Python version when you need to compare +profiles locally: + +.. 
code-block:: bash + + python3.12 -m venv .venv312 + source .venv312/bin/activate + python -m pip install --upgrade pip + +For another version, keep the environment name explicit: + +.. code-block:: bash + + python3.10 -m venv .venv310 + python3.11 -m venv .venv311 + python3.13 -m venv .venv313 + +Install Profiles +================ + +Install the package in editable mode with the smallest profile that matches the +work you are doing: + +.. list-table:: + :header-rows: 1 + + * - Profile + - Command + - Notes + * - Core + - ``pip install -e .`` + - Lightweight regex engine and package import path. + * - Core test + CLI + - ``pip install -e ".[test,cli]" -r requirements-test.txt`` + - Matches the core CI test profile. + * - Docs + - ``pip install -e ".[docs]" -r requirements-docs.txt`` + - Enough to build Sphinx docs locally. + * - Local dev + - ``pip install -e ".[dev,cli]" && pip install -r requirements-dev.txt`` + - Test, docs, lint, formatting, and pre-commit tooling. + * - NLP + - ``pip install -e ".[test,cli,nlp]" -r requirements-test.txt`` + - Also install the spaCy model needed for NLP tests. + * - NLP advanced + - ``pip install -e ".[test,cli,nlp,nlp-advanced]" -r requirements-test.txt`` + - Also install spaCy and GLiNER models explicitly. + * - OCR + - ``pip install -e ".[test,ocr]" -r requirements-test.txt`` + - Tesseract workflows also need the system ``tesseract`` binary. + * - Distributed + - ``pip install -e ".[test,distributed]" -r requirements-test.txt`` + - Spark workflows also need a Java runtime. + * - All extras + - ``pip install -e ".[all,dev]"`` + - Use only when you deliberately want every optional surface locally. + +Optional model setup is explicit: + +.. code-block:: bash + + python -m spacy download en_core_web_lg + datafog download-model urchade/gliner_multi_pii-v1 --engine gliner + +Focused Verification +==================== + +Use focused checks for the area you touched before running broader suites. 
+Set the no-telemetry environment variables when testing core privacy and import +behavior: + +.. code-block:: bash + + export DATAFOG_NO_TELEMETRY=1 + export DO_NOT_TRACK=1 + +Core dependency and no-network checks: + +.. code-block:: bash + + python -m pytest tests/test_runtime_dependency_safety.py tests/test_no_network_core.py -q + +Run a changed test file directly when behavior changes: + +.. code-block:: bash + + python -m pytest tests/test_engine_api.py -q + python -m pytest tests/test_agent_api.py -q + python -m pytest tests/test_cli_smoke.py -q + +Docs build: + +.. code-block:: bash + + python -m sphinx -b html docs docs/_build/html + +Pre-commit on touched files: + +.. code-block:: bash + + pre-commit run --files README.md docs/index.rst --show-diff-on-failure + git diff --check + +Broad Verification +================== + +Run the broad non-slow suite when a change affects shared behavior, +public docs, imports, packaging, or release confidence: + +.. code-block:: bash + + python -m pytest -m "not slow" -q + +To mimic the core CI profile more closely: + +.. code-block:: bash + + python -m pytest tests/ \ + -m "not slow" \ + --ignore=tests/test_gliner_annotator.py \ + --ignore=tests/test_image_service.py \ + --ignore=tests/test_ocr_integration.py \ + --ignore=tests/test_spark_integration.py \ + --ignore=tests/test_text_service_integration.py + +Use optional-profile smoke checks when changing extras, dependency boundaries, +or install behavior: + +.. 
code-block:: bash + + DATAFOG_INSTALL_PROFILE=core python -m pytest tests/test_install_profiles.py -q + DATAFOG_INSTALL_PROFILE=cli python -m pytest tests/test_install_profiles.py -q + DATAFOG_INSTALL_PROFILE=nlp python -m pytest tests/test_install_profiles.py -q + DATAFOG_INSTALL_PROFILE=nlp-advanced python -m pytest tests/test_install_profiles.py -q + DATAFOG_INSTALL_PROFILE=ocr python -m pytest tests/test_install_profiles.py -q + DATAFOG_INSTALL_PROFILE=ocr DATAFOG_REQUIRE_TESSERACT=1 python -m pytest tests/test_install_profiles.py -q + DATAFOG_INSTALL_PROFILE=distributed python -m pytest tests/test_install_profiles.py -q + DATAFOG_INSTALL_PROFILE=web python -m pytest tests/test_install_profiles.py -q + +Live Modules +============ + +Use :doc:`live-module-map` before changing core package structure. It lists the +live 4.5 modules for each concept and the historical ``*_lean`` and +``*_original`` files that are kept only as non-live compatibility/audit +artifacts. + +4.5 Release Flow +================ + +The 4.5 work lands as focused pull requests into ``dev``. Keep feature and docs +branches narrow, and avoid mixing local cleanup, external PR review, and final +release mechanics in one branch. + +The release flow for 4.5 is: + +1. Land the local release-prep baseline and follow-up cleanup/docs slices. +2. Review the external German regex PR after the local release-prep baseline is + in place. +3. Integrate German regex support only if review says it fits the 4.5 + lightweight text screening thesis. +4. Validate optional Python 3.13 profiles before claiming support beyond core + SDK and CLI. +5. Prepare release readiness with :doc:`v45-release-readiness`: changelog and + release notes, package checks, docs build, CI state, and version alignment. +6. Bump or override the final stable release to ``4.5.0`` only during the + release-readiness and stable-release path. 
+ +The current release workflow strips prerelease suffixes from the package +version unless a manual stable ``version_override`` is provided. For the final +4.5 stable release, use a dedicated release-readiness change or the stable +workflow override so the published version is ``4.5.0`` rather than another +``4.4.0`` prerelease line. + +External PR Boundary +==================== + +The external German PII regex PR belongs after local baseline cleanup. Review +it as a 4.5 candidate, not as a v5 planning shortcut. If accepted, adapt it in +the German regex integration slice with tests, documentation of locale +coverage, and no new dependency burden on the core path. diff --git a/docs/getting-started.rst b/docs/getting-started.rst new file mode 100644 index 00000000..bfc42290 --- /dev/null +++ b/docs/getting-started.rst @@ -0,0 +1,141 @@ +================================ +Getting Started With DataFog 4.5 +================================ + +DataFog 4.5 focuses on lightweight text PII screening. A core install should +let you scan and redact common structured PII without installing OCR, Spark, +large NLP models, or middleware integrations. + +Install Profiles +================ + +Core text screening: + +.. code-block:: bash + + pip install datafog + +Optional extras are explicit: + +.. list-table:: + :header-rows: 1 + + * - Profile + - Install command + - Use when + * - Core + - ``pip install datafog`` + - You need regex-based text scanning, redaction, and guardrail helpers. + * - NLP + - ``pip install "datafog[nlp]"`` + - You need spaCy-backed named entity recognition. + * - Advanced NLP + - ``pip install "datafog[nlp-advanced]"`` + - You need GLiNER-backed named entity recognition. + * - OCR + - ``pip install "datafog[ocr]"`` + - You need local image text extraction before PII scanning. + * - OCR from URLs + - ``pip install "datafog[web,ocr]"`` + - You need DataFog to download image inputs before OCR. 
+ * - Spark + - ``pip install "datafog[distributed]"`` + - You need the optional ``SparkService`` surface. + * - Everything + - ``pip install "datafog[all]"`` + - You are developing or deliberately want every optional surface. + +Python Usage +============ + +Use the top-level helpers for the 4.5 core path: + +.. code-block:: python + + import datafog + + text = "Contact jane@example.com or call 415-555-1212" + + scan_result = datafog.scan(text, engine="regex") + print(scan_result.entities) + + redact_result = datafog.redact(text, engine="regex") + print(redact_result.redacted_text) + + print(datafog.sanitize("Card: 4111-1111-1111-1111")) + +Agent-oriented helpers use the same lightweight text path: + +.. code-block:: python + + import datafog + + prompt = "My SSN is 123-45-6789" + scan_result = datafog.scan_prompt(prompt, engine="regex") + + if scan_result.entities: + print("PII detected before sending the prompt") + + output = "Email me at jane.doe@example.com" + safe_output = datafog.filter_output(output, engine="regex") + print(safe_output.redacted_text) + +German Structured PII +===================== + +The core regex engine includes German VAT IDs and German IBANs by default +because they carry strong country-code structure: + +.. code-block:: python + + import datafog + + result = datafog.scan("USt-IdNr DE 123456789", engine="regex") + print([(entity.type, entity.text) for entity in result.entities]) + +Broader German identifiers such as ``DE_TAX_ID``, +``DE_SOCIAL_SECURITY_NUMBER``, ``DE_POSTAL_CODE``, +``DE_PASSPORT_NUMBER``, and ``DE_RESIDENCE_PERMIT_NUMBER`` require explicit +German locale selection or explicit ``entity_types`` filtering. This keeps +ordinary ticket, SKU, order, and invoice IDs from becoming default-on false +positives. + +.. code-block:: python + + text = "Steuer-ID 12345678901 liegt vor." 
+ + print(datafog.scan(text, engine="regex").entities) + print(datafog.scan(text, engine="regex", locales=["de"]).entities) + print(datafog.scan(text, engine="regex", entity_types=["DE_TAX_ID"]).entities) + +CLI Usage +========= + +The CLI core path is text-first: + +.. code-block:: bash + + datafog scan-text "Contact jane@example.com" + datafog redact-text "Contact jane@example.com" + datafog replace-text "Contact jane@example.com" + datafog hash-text "Contact jane@example.com" + datafog redact-text "Steuer-ID 12345678901" --locale de + +Image commands are optional. Install ``datafog[ocr]`` for local OCR and +``datafog[web,ocr]`` when the CLI needs to download image inputs. + +What 4.5 Is Not +=============== + +DataFog 4.5 prepares the package for future middleware use cases, but it does +not ship dedicated Sentry, OpenTelemetry, logging-framework, or cloud DLP +adapters. Those integrations are future-facing work built on the same core +text screening path. + +Next Pages +========== + +* :doc:`python-sdk` documents the Python API surface. +* :doc:`cli` documents command-line usage. +* :doc:`optional-surfaces` documents OCR and Spark install notes. +* :doc:`roadmap` explains how 4.5 leads toward later middleware work. diff --git a/docs/index.rst b/docs/index.rst index a22af1c5..d5cfdc66 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -2,107 +2,58 @@ DataFog Documentation ===================== -DataFog is an open-source tool for PII detection and anonymization of unstructured data. This documentation covers the CLI and Python SDK. +DataFog 4.5 is a lightweight text PII screening package for Python. The +primary path is a small core install, fast regex-based scanning and redaction, +agent-friendly guardrail helpers, and explicit optional extras when you need +NLP, OCR, Spark, or web inputs. + +Start with :doc:`getting-started` if you want the shortest route from install +to scanning text. 
The roadmap and historical planning pages remain available, +but the live user docs are the first path for 4.5. + +Use DataFog 4.5 +=============== .. toctree:: :maxdepth: 2 + :caption: Use DataFog 4.5 - important-concepts - cli + getting-started python-sdk - definitions - roadmap - v44-bridge-release - v5-product-brief - v5-compatibility-matrix - v5-cut-line - -===================== -Getting Started -===================== - -Installation ------------- - -Install DataFog via pip: - -.. code-block:: bash - - pip install datafog - -This installs the latest stable version with CLI support. - ---------------------- -CLI Usage ---------------------- - -For a list of available operations, run: - -.. code-block:: bash - - datafog --help - -Scan text for PII: - -.. code-block:: bash - - datafog scan-text "Your text here" - -Extract text from image: - -.. code-block:: bash - - datafog scan-image "path/to/image.png" --operations extract - -Scan for PII in image: - -.. code-block:: bash - - datafog scan-image "path/to/image.png" --operations scan - -For more information on the CLI, see :doc:`cli`. - ---------------------- -Python SDK Usage ---------------------- - -Scan text for PII: - -.. code-block:: python + cli + optional-surfaces + important-concepts - - import requests - from datafog import DataFog +Reference +========= - # For text annotation - client = DataFog(operations="scan") +.. toctree:: + :maxdepth: 2 + :caption: Reference - # Fetch sample medical record - doc_url = "https://gist.githubusercontent.com/sidmohan0/b43b72693226422bac5f083c941ecfdb/raw/b819affb51796204d59987893f89dee18428ed5d/note1.txt" - response = requests.get(doc_url) - text_lines = [line for line in response.text.splitlines() if line.strip()] + definitions - # Run annotation - annotations = client.run_text_pipeline_sync(str_list=text_lines) - print(annotations) - -Scan image for PII: +Contributing +============ -.. code-block:: python +.. 
toctree:: + :maxdepth: 2 + :caption: Contributing - - import asyncio - from datafog import DataFog + contributing + v45-release-readiness + live-module-map - # For OCR and PII annotation - ocr_client = DataFog(operations="extract,scan") +Planning And History +==================== - async def run_ocr_pipeline_demo(): - image_url = "https://s3.amazonaws.com/thumbnails.venngage.com/template/dc377004-1c2d-49f2-8ddf-d63f11c8d9c2.png" - results = await ocr_client.run_ocr_pipeline(image_urls=[image_url]) - print("OCR Pipeline Results:", results) +The pages below document release planning, migration history, and future +direction. They are useful context, but they are secondary to the live 4.5 +usage path above. - # Run the async function - asyncio.run(run_ocr_pipeline_demo()) +.. toctree:: + :maxdepth: 1 + :caption: Planning and history -For detailed information on the Python SDK, see :doc:`python-sdk`. + roadmap + planning-history diff --git a/docs/live-module-map.rst b/docs/live-module-map.rst new file mode 100644 index 00000000..8ed98bd7 --- /dev/null +++ b/docs/live-module-map.rst @@ -0,0 +1,99 @@ +=============== +Live Module Map +=============== + +This map identifies the live modules for DataFog 4.5 and the historical +shadow files that should not be used for new work. The goal is to make the +current code path obvious without removing importable legacy files that still +have test coverage or historical value. + +Live 4.5 Modules +================ + +.. list-table:: + :header-rows: 1 + + * - Concept + - Live module + - Notes + * - Package exports + - ``datafog/__init__.py`` + - Top-level ``scan``, ``redact``, guardrail helpers, compatibility shims, + and lazy optional exports. + * - Core engine + - ``datafog/engine.py`` + - Dataclass-based scan/redact path used by the 4.5 core helpers. + * - Agent helpers + - ``datafog/agent.py`` + - Prompt/output screening and guardrail helpers on the lightweight text + path. 
+ * - Backward-compatible ``DataFog`` class + - ``datafog/main.py`` + - Current public ``DataFog`` class and text/OCR compatibility methods. + * - Text service + - ``datafog/services/text_service.py`` + - Current service boundary for regex, spaCy, GLiNER, auto, and smart + engines. + * - CLI + - ``datafog/client.py`` + - Current command-line entrypoint. + * - OCR surface + - ``datafog/services/image_service.py`` and + ``datafog/processing/image_processing/`` + - Optional image/OCR surface behind explicit extras. + * - Spark surface + - ``datafog/services/spark_service.py`` and + ``datafog/processing/spark_processing/`` + - Optional distributed surface behind explicit extras. + * - Packaging + - ``setup.py`` and ``requirements-*.txt`` + - Current packaging and contributor dependency inputs. + +Historical Shadow Files +======================= + +The following files are historical snapshots or alternate implementation +lineage. They are kept importable for now, but new work should not add behavior +to them. + +.. list-table:: + :header-rows: 1 + + * - Historical file + - Live replacement + - 4.5 status + * - ``datafog/__init___lean.py`` + - ``datafog/__init__.py`` + - Historical package export snapshot. + * - ``datafog/__init___original.py`` + - ``datafog/__init__.py`` + - Historical eager-export package snapshot. + * - ``datafog/main_lean.py`` + - ``datafog/main.py`` + - Historical lightweight ``DataFog`` implementation. + * - ``datafog/main_original.py`` + - ``datafog/main.py`` plus optional OCR/Spark services + - Historical full-featured ``DataFog`` implementation still referenced by + legacy tests. + * - ``datafog/services/text_service_lean.py`` + - ``datafog/services/text_service.py`` + - Historical regex-first service variant. + * - ``datafog/services/text_service_original.py`` + - ``datafog/services/text_service.py`` + - Historical spaCy/regex service still referenced by legacy tests. + * - ``setup_lean.py`` + - ``setup.py`` + - Historical packaging snapshot. 
+ * - ``setup_original.py`` + - ``setup.py`` + - Historical packaging snapshot. + +Cleanup Boundary +================ + +This 4.5 slice marks the shadow files as non-live and documents their live +replacements. It does not remove importable modules because ``main_original`` +and ``text_service_original`` still have explicit legacy tests. A future +breaking cleanup can remove the shadow files after any remaining tested +behavior is either migrated to live modules or intentionally dropped with a +compatibility note. diff --git a/docs/optional-surfaces.rst b/docs/optional-surfaces.rst new file mode 100644 index 00000000..57ea5994 --- /dev/null +++ b/docs/optional-surfaces.rst @@ -0,0 +1,143 @@ +========================= +Optional OCR And Spark +========================= + +DataFog 4.5 keeps the core package focused on lightweight text PII screening. +The default path is: + +.. code-block:: bash + + pip install datafog + +.. code-block:: python + + import datafog + + result = datafog.redact("Email jane@example.com", engine="regex") + print(result.redacted_text) + +OCR and Spark are supported optional surfaces. They are useful for image and +distributed workflows, but they should not be treated as required for the core +install, package import, text scanning, text redaction, or guardrail helpers. + +OCR +--- + +Use OCR when you need to extract text from images before running PII detection. + +Install local OCR support: + +.. code-block:: bash + + pip install "datafog[ocr]" + +Use URL-based image downloads: + +.. code-block:: bash + + pip install "datafog[web,ocr]" + +Use Donut OCR: + +.. code-block:: bash + + pip install "datafog[nlp-advanced,ocr]" + +Notes: + +* Tesseract usage requires the system ``tesseract`` binary in addition to the + Python extra. +* Python 3.13 is validated for the OCR install profile, Pillow, pytesseract, + and system Tesseract smoke checks. +* Donut OCR requires a model that is already available locally. 
DataFog should + not download models implicitly during normal runtime usage. +* OCR is not deprecated. A broader OCR API and packaging overhaul is deferred + beyond the 4.5 focus release. + +Example local OCR flow: + +.. code-block:: python + + import asyncio + from datafog.services.image_service import ImageService + + async def main(): + service = ImageService(use_tesseract=True, use_donut=False) + extracted = await service.ocr_extract(["./invoice.png"]) + print(extracted) + + asyncio.run(main()) + +Spark +------ + +Use Spark when you need distributed processing around DataFog PII detection. + +Install Spark support: + +.. code-block:: bash + + pip install "datafog[distributed]" + +Use Spark PII UDF helpers: + +.. code-block:: bash + + pip install "datafog[distributed,nlp]" + +Notes: + +* ``SparkService`` requires PySpark and a Java runtime. +* Spark PII UDF helpers also require spaCy and an installed spaCy model. +* Spark is not deprecated. A broader Spark overhaul is deferred beyond the 4.5 + focus release. + +Example local Spark flow: + +.. code-block:: python + + from datafog.services.spark_service import SparkService + + service = SparkService(master="local[1]") + rows = service.read_json("./records.json") + print(rows) + +Core-path verification +---------------------- + +The repository includes tests that block optional dependency imports while +importing ``datafog`` and running the default text helpers. These checks verify +that OCR, Spark, NLP, model-loading, and web dependencies are not required for +the core path. + +Python 3.13 optional-profile status +----------------------------------- + +DataFog 4.5 validates Python 3.13 beyond the core/CLI path for the optional +profiles that currently have compatible wheels in the tested dependency set. + +.. list-table:: + :header-rows: 1 + + * - Profile + - Python 3.13 status + - Notes + * - ``nlp`` + - Supported + - spaCy imports and the profile smoke test pass on Python 3.13. 
+ * - ``nlp-advanced`` + - Supported + - GLiNER, torch, transformers, and onnxruntime import successfully on + Python 3.13. + * - ``ocr`` + - Supported + - Pillow, pytesseract, and the system Tesseract bridge validate on Python + 3.13 when the ``tesseract`` binary is installed. + * - ``nlp-advanced,ocr`` + - Supported with local model requirement + - Donut dependencies import on Python 3.13; runtime OCR still requires the + configured Donut model to be present locally. + * - ``distributed`` and ``all`` + - Not newly certified in 4.5 + - Keep using Python 3.10-3.12 for distributed/all-profile validation until + Spark and the full optional surface are audited separately. diff --git a/docs/planning-history.rst b/docs/planning-history.rst new file mode 100644 index 00000000..8237f38f --- /dev/null +++ b/docs/planning-history.rst @@ -0,0 +1,35 @@ +==================== +Planning And History +==================== + +These pages and artifacts are preserved for context, but they are not the +first path for using DataFog 4.5. Start with :doc:`getting-started` for live +user docs. + +Release Planning +================ + +.. toctree:: + :maxdepth: 1 + + v44-bridge-release + v45-release-readiness + v5-product-brief + v5-compatibility-matrix + v5-cut-line + +Additional planning artifacts: + +* :download:`v5 model selection requirements ` + +Audit Artifacts +=============== + +Historical audit notes remain available in the repository for maintainers who +need the detailed background: + +* :download:`Reconnaissance notes ` +* :download:`Coverage baseline ` +* :download:`Detection accuracy review ` +* :download:`Architecture review ` +* :download:`Final coverage notes ` diff --git a/docs/python-sdk.rst b/docs/python-sdk.rst index dbf1982d..b1093de0 100644 --- a/docs/python-sdk.rst +++ b/docs/python-sdk.rst @@ -4,8 +4,79 @@ DataFog Python SDK Overview -------- -The main entrypoint for the SDK is through the DataFog class, defined in :mod:`datafog.main`. 
-Here you can initialize the different services, including TextService, ImageService, and SparkService. +The primary 4.5 SDK path is lightweight text PII screening through the +top-level ``datafog`` helpers. These helpers use the regex engine by default +and do not require OCR, Spark, model downloads, or distributed dependencies. + +.. code-block:: python + + import datafog + + text = "Contact jane@example.com or call 415-555-1212" + + scan_result = datafog.scan(text, engine="regex") + print(scan_result.entities) + + redact_result = datafog.redact(text, engine="regex") + print(redact_result.redacted_text) + + print(datafog.sanitize(text)) + +The backward-compatible ``DataFog`` and ``TextService`` classes remain +available for existing users. ``TextService(engine="regex")`` is the +dependency-light service path; ``spacy``, ``gliner``, ``smart``, OCR, and Spark +surfaces require their explicit extras. + +German locale coverage +---------------------- + +DataFog 4.5 includes regex-only German structured PII support without adding +dependencies. German VAT IDs and German IBANs are active in the default regex +path. Broader German-only identifiers are opt-in because their raw shapes are +common in ordinary product, ticket, invoice, and order data. + +Use ``locales=["de"]`` to enable the broader German set: + +.. code-block:: python + + import datafog + + text = "Steuer-ID 12345678901 liegt vor." + result = datafog.scan(text, engine="regex", locales=["de"]) + print([(entity.type, entity.text) for entity in result.entities]) + +You can also request one German entity type directly: + +.. code-block:: python + + result = datafog.scan( + "Steuer-ID 12345678901 liegt vor.", + engine="regex", + entity_types=["DE_TAX_ID"], + ) + +The opt-in German set currently covers ``DE_TAX_ID``, +``DE_SOCIAL_SECURITY_NUMBER``, ``DE_POSTAL_CODE``, +``DE_PASSPORT_NUMBER``, and ``DE_RESIDENCE_PERMIT_NUMBER``. The default set +also covers ``DE_VAT_ID`` and ``DE_IBAN``. 
+ +Optional services +----------------- + +OCR and Spark are supported optional surfaces, not the primary 4.5 path: + +* Use ``datafog[ocr]`` for local OCR helpers such as ``ImageService`` and + ``PytesseractProcessor``. +* Use ``datafog[web,ocr]`` when OCR inputs must be downloaded from URLs. +* Use ``datafog[nlp-advanced,ocr]`` for Donut OCR, with the model already + available locally. +* Use ``datafog[distributed]`` for ``SparkService``. +* Use ``datafog[distributed,nlp]`` plus an installed spaCy model for Spark PII + UDF helpers. + +OCR and Spark are not deprecated. Their broader overhaul is deferred so the +4.5 release can keep the core package tight while preserving existing optional +usage. See :doc:`optional-surfaces` for install notes and limitations. Definitions ----------- diff --git a/docs/roadmap.rst b/docs/roadmap.rst index acf8b6a0..1ea8bbc4 100644 --- a/docs/roadmap.rst +++ b/docs/roadmap.rst @@ -24,9 +24,10 @@ v4.4.0 should focus on: * Targeted deprecation warnings with no warnings on import. * Migration docs and release notes that announce the v5 path. -Scope artifact: +Scope artifacts: * :doc:`v44-bridge-release` +* :doc:`v45-release-readiness` v5.0.0 - Offline PII Firewall for AI Apps ----------------------------------------- @@ -134,13 +135,29 @@ All features will remain backward compatible with the lightweight architecture. 4.5.0 ------ -Version ``4.5.0`` will introduce: - -* **Enterprise features** in dedicated extras -* **Advanced analytics** for PII detection patterns -* **Multi-language support** for international PII types -* **Cloud integration** helpers for AWS, GCP, Azure -* **Performance monitoring** and metrics collection +Version ``4.5.0`` is a focus release for lightweight text PII screening. It +should make the core package easier to install, reason about, test, and use +before larger v5 middleware work. + +4.5.0 should focus on: + +* Core text scanning, redaction, and guardrail helpers that stay dependency + light by default. 
+* Clear install-profile documentation for core, NLP, OCR, Spark, CLI, and web + surfaces. +* OCR and Spark as supported optional surfaces, not the main 4.5 adoption path. +* Documentation cleanup so users and contributors can find the current package + story without reading historical planning material first. +* German PII regex support if the external PR passes review and does not + compromise core precision. + +Deferred beyond 4.5.0: + +* Full middleware adapters for Sentry, OpenTelemetry, logging frameworks, or + cloud DLP services. +* OCR architecture overhaul. +* Spark architecture overhaul. +* Enterprise dashboards and analytics. -The lightweight core will remain unchanged, ensuring existing -integrations continue to work without modification. +The lightweight core remains the first path; optional surfaces should stay +explicit and isolated from default import, scan, redact, and guardrail usage. diff --git a/docs/v45-release-readiness.rst b/docs/v45-release-readiness.rst new file mode 100644 index 00000000..6d655c3b --- /dev/null +++ b/docs/v45-release-readiness.rst @@ -0,0 +1,139 @@ +====================== +v4.5 Release Readiness +====================== + +This page is the release-readiness artifact for DataFog 4.5.0. It summarizes +the intended release story, the final version alignment path, and the checks +that should be true before promoting the release. + +Release Position +================ + +DataFog 4.5.0 is a lightweight text PII screening focus release. It should make +the current package easier to install, read, test, and contribute to while +building toward a sharper v5 middleware direction. + +The 4.5 release includes: + +* Core text scanning, redaction, and guardrail helpers that stay dependency + light by default. +* Regex-only German structured PII support with broad German identifiers gated + behind explicit locale or entity selection. +* Clear optional-profile documentation for NLP, OCR, Spark, CLI, web, and + install-profile testing. 
+* Python 3.13 validation for the core SDK, CLI, ``nlp``, ``nlp-advanced``, and + ``ocr`` profiles. +* Telemetry documentation that states the existing opt-in behavior and opt-out + controls without changing runtime defaults. + +The 4.5 release does not include: + +* A v5 package break. +* Dedicated Sentry, OpenTelemetry, logging-framework, or cloud DLP middleware + adapters. +* An OCR or Spark architecture overhaul. +* Full certification of ``distributed`` or ``all`` install profiles on + Python 3.13. + +Release Notes Draft +=================== + +Use this framing for the GitHub release notes and package announcement: + + DataFog 4.5.0 is a focused release for lightweight text PII screening. The + core install remains dependency-light while the text APIs, CLI, guardrail + helpers, German structured PII coverage, optional-profile docs, and Python + 3.13 compatibility story become clearer and easier to verify. + +Call out these user-facing points: + +* German VAT IDs and German IBANs are detected by default in the regex engine. +* Broader German identifiers such as tax IDs, postal codes, passport numbers, + residence permit numbers, and pension insurance numbers require + ``locales=["de"]`` or explicit entity selection. +* OCR and Spark remain supported optional surfaces. They are not deprecated, + but their broader overhaul is deferred beyond 4.5. +* Telemetry remains disabled unless ``DATAFOG_TELEMETRY=1`` is set. + ``DATAFOG_NO_TELEMETRY=1`` and ``DO_NOT_TRACK=1`` continue to force it off. +* Python 3.13 is certified for core SDK, CLI, ``nlp``, ``nlp-advanced``, and + ``ocr``. Donut OCR still requires a model already available locally. + +Version Alignment +================= + +The source of truth for the package version is ``datafog/__about__.py``. +``setup.py`` reads that value during packaging, and ``docs/conf.py`` reads the +same value for the Sphinx ``release`` field. + +Before stable release promotion: + +* Stable package release: ``4.4.0``. 
+* Current development package version: ``4.4.0a5``. +* Next stable target: ``4.5.0``. + +Do not bump routine feature or documentation branches directly to ``4.5.0``. +For the stable release, promote the merged 4.5 stack and either: + +* trigger the release workflow with ``release_type=stable`` and + ``version_override=4.5.0``, or +* make a dedicated stable-release bump that updates ``datafog/__about__.py`` + and reruns the docs build so Sphinx reports ``v4.5.0``. + +After the bump path is chosen, verify: + +* ``python -c "import datafog; print(datafog.__version__)"`` prints + ``4.5.0`` from the release build environment. +* Built package metadata reports ``Version: 4.5.0``. +* Built docs report ``v4.5.0`` through ``docs/conf.py``. +* ``CHANGELOG.MD`` and the GitHub release notes both describe the 4.5 focus + release rather than v5 planning work. + +Readiness Checklist +=================== + +Run these gates before promoting 4.5.0: + +.. list-table:: + :header-rows: 1 + + * - Gate + - Command or evidence + * - Formatting and static checks + - ``pre-commit run --all-files --show-diff-on-failure`` and + ``git diff --check`` + * - Docs build + - ``python -m sphinx -b html docs docs/_build/html`` + * - Core no-network and dependency boundary + - ``DATAFOG_NO_TELEMETRY=1 DO_NOT_TRACK=1 python -m pytest tests/test_runtime_dependency_safety.py tests/test_no_network_core.py -q`` + * - German regex behavior + - ``python -m pytest tests/test_de_pii_regex.py tests/test_regex_annotator.py -q`` + * - Broad non-slow suite + - ``DATAFOG_NO_TELEMETRY=1 DO_NOT_TRACK=1 python -m pytest -m "not slow" -q`` + * - Install-profile smoke checks + - ``DATAFOG_INSTALL_PROFILE=$PROFILE python -m pytest tests/test_install_profiles.py -q`` with ``PROFILE`` set in turn to ``core``, ``cli``, ``nlp``, ``nlp-advanced``, ``ocr``, ``distributed``, and ``web`` + * - OCR system smoke + - ``DATAFOG_INSTALL_PROFILE=ocr DATAFOG_REQUIRE_TESSERACT=1 python -m pytest tests/test_install_profiles.py -q`` + * - Package build + - ``python -m
build`` and ``python scripts/check_wheel_size.py`` + * - GitHub CI + - The final release-readiness PR and the merged release branch have green + CI, including Python 3.13 profile coverage. + * - Stable release dry run + - Trigger ``release_type=stable``, ``version_override=4.5.0``, + ``dry_run=true`` before publishing. + +Review Notes +============ + +German regex support is included in 4.5 with documented default and opt-in +behavior. The external PR was treated as review input rather than merged +unchanged, because broad German identifiers need locale or contextual gating to +avoid noisy default detection. + +OCR and Spark remain documented as optional surfaces. They should not be +described as deprecated, and they should not be positioned as the primary 4.5 +adoption path. + +The v5 planning pages remain useful context, but the 4.5 release should not +claim middleware adapters, a package break, or a complete optional-surface +redesign. diff --git a/docs/v5-compatibility-matrix.rst b/docs/v5-compatibility-matrix.rst index b95483e6..a0e2691c 100644 --- a/docs/v5-compatibility-matrix.rst +++ b/docs/v5-compatibility-matrix.rst @@ -134,6 +134,8 @@ Compatibility Matrix - Parallel historical implementations. - Remove or make private after migration path. - Consolidate around the v5 core and delete duplicate runtime surfaces. + In 4.5 these files are marked as non-live shadow modules; removal is + deferred until legacy tests are migrated or intentionally dropped. 
Warning Policy -------------- diff --git a/scripts/generate_changelog.py b/scripts/generate_changelog.py index 23babcf8..114d037a 100755 --- a/scripts/generate_changelog.py +++ b/scripts/generate_changelog.py @@ -122,6 +122,43 @@ def generate_changelog(beta=False, alpha=False): "and `DO_NOT_TRACK=1` continue to force telemetry off.\n\n" ) + if not alpha and not beta and current_version == "4.5.0": + changelog += "## 4.5 Release Focus\n\n" + changelog += ( + "DataFog 4.5.0 is a focused release for lightweight text PII " + "screening. The core install remains dependency-light while the " + "text APIs, CLI, guardrail helpers, German structured PII coverage, " + "optional-profile docs, and Python 3.13 compatibility story become " + "clearer and easier to verify.\n\n" + ) + changelog += "## German Structured PII\n\n" + changelog += ( + "German VAT IDs and German IBANs are detected by default in the " + "regex engine. Broader German identifiers such as tax IDs, postal " + "codes, passport numbers, residence permit numbers, and pension " + 'insurance numbers require `locales=["de"]` or explicit entity ' + "selection.\n\n" + ) + changelog += "## Python 3.13 Optional Profiles\n\n" + changelog += ( + "Python 3.13 is certified for the core SDK, CLI, `nlp`, " + "`nlp-advanced`, and `ocr` install profiles. Donut OCR still " + "requires a model already available locally. `distributed` and " + "`all` are not newly certified on Python 3.13 in 4.5.0.\n\n" + ) + changelog += "## Optional OCR And Spark Surfaces\n\n" + changelog += ( + "OCR and Spark remain supported optional surfaces. They are not " + "deprecated, but their broader overhaul is deferred beyond 4.5.0 " + "so the core package can stay tight and text-first.\n\n" + ) + changelog += "## Telemetry Defaults\n\n" + changelog += ( + "Telemetry remains disabled unless `DATAFOG_TELEMETRY=1` is set. 
" + "`DATAFOG_NO_TELEMETRY=1` and `DO_NOT_TRACK=1` continue to force " + "telemetry off.\n\n" + ) + if categories["features"]: changelog += "## 🚀 New Features\n" for commit in categories["features"]: diff --git a/setup.py b/setup.py index f84c241a..39f01651 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ # Optional heavy dependencies nlp_deps = [ + "click>=8.0,<9.0", "spacy>=3.7.0,<4.0", ] @@ -57,6 +58,7 @@ ] cli_deps = [ + "click>=8.0,<9.0", "typer>=0.12.0", "pydantic-settings>=2.0.0", ] diff --git a/setup_lean.py b/setup_lean.py index 9cd06dd6..ece73a1f 100644 --- a/setup_lean.py +++ b/setup_lean.py @@ -1,5 +1,8 @@ from setuptools import find_packages, setup +# Historical shadow packaging snapshot. The live DataFog 4.5 packaging input is +# setup.py. Do not use this file for builds, releases, or dependency changes. + # Read README for the long description with open("README.md", "r") as f: long_description = f.read() diff --git a/setup_original.py b/setup_original.py index 1e6ca3af..d70db319 100644 --- a/setup_original.py +++ b/setup_original.py @@ -1,5 +1,8 @@ from setuptools import find_packages, setup +# Historical shadow packaging snapshot. The live DataFog 4.5 packaging input is +# setup.py. Do not use this file for builds, releases, or dependency changes. 
+ # Read README for the long description with open("README.md", "r") as f: long_description = f.read() diff --git a/tests/corpus/structured_pii.json b/tests/corpus/structured_pii.json index 672e7483..9c13dda1 100644 --- a/tests/corpus/structured_pii.json +++ b/tests/corpus/structured_pii.json @@ -733,5 +733,29 @@ "end": 5 } ] + }, + { + "id": "de-vat-id-default", + "input": "USt-IdNr DE 123456789 ist gesetzt.", + "expected_entities": [ + { + "type": "DE_VAT_ID", + "text": "DE 123456789", + "start": 9, + "end": 21 + } + ] + }, + { + "id": "de-iban-default", + "input": "IBAN DE44 5001 0517 5407 3249 31 ist gueltig.", + "expected_entities": [ + { + "type": "DE_IBAN", + "text": "DE44 5001 0517 5407 3249 31", + "start": 5, + "end": 32 + } + ] } ] diff --git a/tests/test_de_pii_regex.py b/tests/test_de_pii_regex.py new file mode 100644 index 00000000..f24d605c --- /dev/null +++ b/tests/test_de_pii_regex.py @@ -0,0 +1,146 @@ +import pytest + +import datafog +from datafog.core import get_supported_entities +from datafog.engine import scan, scan_and_redact +from datafog.processing.text_processing.regex_annotator import RegexAnnotator +from datafog.services.text_service import TextService + + +@pytest.mark.parametrize( + "label,text,expected", + [ + ("DE_VAT_ID", "USt-IdNr DE 123456789 ist gesetzt.", "DE 123456789"), + ("DE_VAT_ID", "USt-IdNr DE-123456789 liegt vor.", "DE-123456789"), + ( + "DE_IBAN", + "IBAN DE44500105175407324931 ist gueltig.", + "DE44500105175407324931", + ), + ( + "DE_IBAN", + "IBAN DE44 5001 0517 5407 3249 31 ist gueltig.", + "DE44 5001 0517 5407 3249 31", + ), + ], +) +def test_high_specificity_german_regex_default_cases( + label: str, text: str, expected: str +) -> None: + annotator = RegexAnnotator() + result = annotator.annotate(text) + assert expected in result[label] + + +@pytest.mark.parametrize( + "label,text,expected", + [ + ("DE_TAX_ID", "Steuer-ID 12345678901 liegt vor.", "12345678901"), + ("DE_TAX_ID", "Steuer-ID 12 345 678 901 ist 
gesetzt.", "12 345 678 901"), + ( + "DE_SOCIAL_SECURITY_NUMBER", + "Rentenversicherungsnummer 65150804A123 liegt vor.", + "65150804A123", + ), + ( + "DE_SOCIAL_SECURITY_NUMBER", + "Rentenversicherungsnummer 65 150804 A123 liegt vor.", + "65 150804 A123", + ), + ("DE_POSTAL_CODE", "PLZ10115 Berlin.", "PLZ10115"), + ("DE_POSTAL_CODE", "DE-10115 Berlin.", "DE-10115"), + ("DE_PASSPORT_NUMBER", "Passnummer C12345678 wurde geprueft.", "C12345678"), + ( + "DE_RESIDENCE_PERMIT_NUMBER", + "Aufenthaltstitel AT1234567 gueltig.", + "AT1234567", + ), + ], +) +def test_broad_german_regex_cases_require_german_locale( + label: str, text: str, expected: str +) -> None: + default_result = RegexAnnotator().annotate(text) + assert expected not in default_result[label] + + german_result = RegexAnnotator(locales=["de"]).annotate(text) + assert expected in german_result[label] + + +@pytest.mark.parametrize( + "label,text", + [ + ("DE_VAT_ID", "USt-IdNr DE12345678 liegt vor."), + ("DE_VAT_ID", "USt-IdNr DE1234567890 liegt vor."), + ("DE_VAT_ID", "USt-IdNr DE123456789A should not prefix-match."), + ("DE_IBAN", "IBAN DE4450010517540732493 ist gueltig."), + ("DE_IBAN", "IBAN DE44 5001 0517 5407 3249 3X ist gueltig."), + ("DE_TAX_ID", "Invoice 12345678901 was paid."), + ("DE_SOCIAL_SECURITY_NUMBER", "Build 65150804A123 failed."), + ("DE_POSTAL_CODE", "SKU D12345 is not a postcode."), + ("DE_POSTAL_CODE", "Release DE12345 shipped."), + ("DE_PASSPORT_NUMBER", "Ticket A12345678 was shipped."), + ("DE_RESIDENCE_PERMIT_NUMBER", "Order AT1234567 is internal."), + ], +) +def test_german_regex_false_positive_guards(label: str, text: str) -> None: + result = RegexAnnotator(locales=["de"]).annotate(text) + assert not result[label] + + +def test_scan_locale_and_explicit_entity_type_activation() -> None: + text = "Steuer-ID 12345678901 liegt vor." 
+ + default_result = scan(text, engine="regex") + assert "DE_TAX_ID" not in {entity.type for entity in default_result.entities} + + locale_result = scan(text, engine="regex", locales=["de"]) + assert [ + entity.text for entity in locale_result.entities if entity.type == "DE_TAX_ID" + ] == ["12345678901"] + + explicit_result = scan(text, engine="regex", entity_types=["DE_TAX_ID"]) + assert [(entity.type, entity.text) for entity in explicit_result.entities] == [ + ("DE_TAX_ID", "12345678901") + ] + + +def test_redaction_and_service_locale_support() -> None: + text = "Passnummer C12345678 wurde geprueft." + + default_redaction = scan_and_redact(text, engine="regex") + assert default_redaction.redacted_text == text + + locale_redaction = scan_and_redact(text, engine="regex", locales=["de"]) + assert "[DE_PASSPORT_NUMBER_1]" in locale_redaction.redacted_text + + service_result = TextService(locales=["de"]).annotate_text_sync(text) + assert service_result["DE_PASSPORT_NUMBER"] == ["C12345678"] + + +def test_german_vat_redaction_suppresses_inner_generic_ssn_match() -> None: + text = "USt-IdNr DE123456789 ist gesetzt." + + scan_result = scan(text, engine="regex") + assert [(entity.type, entity.text) for entity in scan_result.entities] == [ + ("DE_VAT_ID", "DE123456789") + ] + + redaction = scan_and_redact(text, engine="regex") + assert redaction.redacted_text == "USt-IdNr [DE_VAT_ID_1] ist gesetzt." 
+ + +def test_top_level_helpers_and_supported_entities_respect_locale() -> None: + default_entities = get_supported_entities() + assert "DE_VAT_ID" in default_entities + assert "DE_IBAN" in default_entities + assert "DE_TAX_ID" not in default_entities + + german_entities = get_supported_entities(locales=["de"]) + assert "DE_TAX_ID" in german_entities + assert "DE_RESIDENCE_PERMIT_NUMBER" in german_entities + + result = datafog.scan("Aufenthaltstitel AT1234567 gueltig.", locales=["de"]) + assert [(entity.type, entity.text) for entity in result.entities] == [ + ("DE_RESIDENCE_PERMIT_NUMBER", "AT1234567") + ] diff --git a/tests/test_detection_accuracy.py b/tests/test_detection_accuracy.py index 852a7937..6777d9dd 100644 --- a/tests/test_detection_accuracy.py +++ b/tests/test_detection_accuracy.py @@ -22,6 +22,8 @@ "SSN", "CREDIT_CARD", "IP_ADDRESS", + "DE_VAT_ID", + "DE_IBAN", "DATE", "ZIP_CODE", } diff --git a/tests/test_donut_lazy_import.py b/tests/test_donut_lazy_import.py index 80c9ec09..9b2a28f1 100644 --- a/tests/test_donut_lazy_import.py +++ b/tests/test_donut_lazy_import.py @@ -1,23 +1,36 @@ +import os +import subprocess import sys +from pathlib import Path -from datafog.services.image_service import ImageService + +def _run_isolated_python(script: str) -> subprocess.CompletedProcess[str]: + env = dict(os.environ) + env["PYTHONPATH"] = str(Path.cwd()) + env["DATAFOG_NO_TELEMETRY"] = "1" + env["DO_NOT_TRACK"] = "1" + return subprocess.run( + [sys.executable, "-c", script], + check=True, + env=env, + text=True, + capture_output=True, + ) def test_no_torch_import_when_donut_disabled(): """Test that torch is not imported when use_donut is False""" - # Remove torch and transformers from sys.modules if they're already imported - if "torch" in sys.modules: - del sys.modules["torch"] - if "transformers" in sys.modules: - del sys.modules["transformers"] + _run_isolated_python( + """ +import sys +from datafog.services.image_service import ImageService - # Create 
ImageService with use_donut=False - # The variable is used indirectly by creating the service which affects sys.modules - _ = ImageService(use_donut=False, use_tesseract=True) +_ = ImageService(use_donut=False, use_tesseract=True) - # Verify that torch and transformers were not imported - assert "torch" not in sys.modules - assert "transformers" not in sys.modules +assert "torch" not in sys.modules +assert "transformers" not in sys.modules +""" + ) def test_lazy_import_mechanism(): @@ -26,24 +39,16 @@ def test_lazy_import_mechanism(): # to use lazy imports. We don't need to actually test the imports themselves, # just that the structure is correct. - # First, ensure torch and transformers are not in sys.modules - if "torch" in sys.modules: - del sys.modules["torch"] - if "transformers" in sys.modules: - del sys.modules["transformers"] - - # Import the DonutProcessor directly - from datafog.processing.image_processing.donut_processor import DonutProcessor - - # Create a processor instance - processor = DonutProcessor() - - # Verify that torch and transformers were not imported just by creating the processor - assert "torch" not in sys.modules - assert "transformers" not in sys.modules + _run_isolated_python( + """ +import sys +from datafog.processing.image_processing.donut_processor import DonutProcessor - # Verify that the extract_text_from_image method exists - assert hasattr(processor, "extract_text_from_image") +processor = DonutProcessor() - # Runtime package installation helpers should not exist on the processor. 
- assert not hasattr(processor, "ensure_installed") +assert "torch" not in sys.modules +assert "transformers" not in sys.modules +assert hasattr(processor, "extract_text_from_image") +assert not hasattr(processor, "ensure_installed") +""" + ) diff --git a/tests/test_install_profiles.py b/tests/test_install_profiles.py index e17261be..2680543b 100644 --- a/tests/test_install_profiles.py +++ b/tests/test_install_profiles.py @@ -17,10 +17,13 @@ def test_install_profile_import_surface() -> None: assert datafog.scan("Email jane@example.com").entities assert datafog.redact("Email jane@example.com").redacted_text elif profile == "cli": + import click # noqa: F401 + from datafog.client import app assert app is not None elif profile == "nlp": + import click # noqa: F401 import spacy # noqa: F401 from datafog.models.spacy_nlp import SpacyAnnotator @@ -52,6 +55,8 @@ def test_install_profile_import_surface() -> None: assert DonutProcessor is not None assert ImageService is not None assert PytesseractProcessor is not None + if os.environ.get("DATAFOG_REQUIRE_TESSERACT"): + assert pytesseract.get_tesseract_version() elif profile == "distributed": from datafog.processing.spark_processing import pyspark_udfs from datafog.services.spark_service import SparkService diff --git a/tests/test_main.py b/tests/test_main.py index c35ed505..e9b1e385 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -29,7 +29,8 @@ ImageService = None TextService = None -# Try to import the full-featured DataFog for integration tests +# Keep historical main_original importable while legacy tests still cover it. +# New behavior belongs in datafog.main.DataFog. 
try: from datafog.main_original import DataFog as FullDataFog diff --git a/tests/test_no_network_core.py b/tests/test_no_network_core.py index 905984f4..f06e8360 100644 --- a/tests/test_no_network_core.py +++ b/tests/test_no_network_core.py @@ -63,6 +63,19 @@ def fail_optional_engine_probe(): guarded = guardrail.filter("Email jane@example.com") assert guarded.redacted_text == "Email [EMAIL_1]" + sanitized = datafog.sanitize("Email jane@example.com") + assert sanitized == "Email [EMAIL_1]" + + prompt_result = datafog.scan_prompt("Email jane@example.com") + assert [entity.type for entity in prompt_result.entities] == ["EMAIL"] + + output_result = datafog.filter_output("Email jane@example.com") + assert output_result.redacted_text == "Email [EMAIL_1]" + + agent_guardrail = datafog.create_guardrail() + agent_guarded = agent_guardrail.filter("Email jane@example.com") + assert agent_guarded.redacted_text == "Email [EMAIL_1]" + def test_import_probes_do_not_load_optional_models() -> None: _run_isolated_python( @@ -94,3 +107,42 @@ def from_pretrained(*_args, **_kwargs): assert datafog.scan("Email jane@example.com").entities """ ) + + +def test_core_path_does_not_import_optional_dependency_modules() -> None: + _run_isolated_python( + """ +import importlib.abc +import sys + +blocked = { + "aiohttp", + "certifi", + "gliner", + "PIL", + "pyspark", + "pytesseract", + "spacy", + "torch", + "transformers", +} + +class BlockOptionalImports(importlib.abc.MetaPathFinder): + def find_spec(self, fullname, path=None, target=None): + if fullname.split(".", 1)[0] in blocked: + raise AssertionError(f"optional dependency imported: {fullname}") + return None + +sys.meta_path.insert(0, BlockOptionalImports()) + +import datafog + +assert datafog.scan("Email jane@example.com").entities +assert datafog.redact("Email jane@example.com").redacted_text == "Email [EMAIL_1]" +assert datafog.protect().filter("Email jane@example.com").redacted_text == "Email [EMAIL_1]" +assert datafog.sanitize("Email 
jane@example.com") == "Email [EMAIL_1]" +assert datafog.scan_prompt("Email jane@example.com").entities +assert datafog.filter_output("Email jane@example.com").redacted_text == "Email [EMAIL_1]" +assert datafog.create_guardrail().filter("Email jane@example.com").redacted_text == "Email [EMAIL_1]" +""" + ) diff --git a/tests/test_regex_annotator.py b/tests/test_regex_annotator.py index 5916bfae..fc63dc2f 100644 --- a/tests/test_regex_annotator.py +++ b/tests/test_regex_annotator.py @@ -40,9 +40,9 @@ def test_regex_annotator_initialization(): """Test that the RegexAnnotator can be initialized.""" annotator = RegexAnnotator() assert annotator is not None - assert ( - len(annotator.LABELS) == 7 - ) # EMAIL, PHONE, SSN, CREDIT_CARD, IP_ADDRESS, DOB, ZIP + assert set(RegexAnnotator.BASE_LABELS).issubset(annotator.LABELS) + assert {"DE_VAT_ID", "DE_IBAN"}.issubset(annotator.active_labels) + assert "DE_TAX_ID" not in annotator.active_labels def test_regex_annotator_create_method(): diff --git a/tests/test_runtime_dependency_safety.py b/tests/test_runtime_dependency_safety.py index 9410ddc6..d34ceb8a 100644 --- a/tests/test_runtime_dependency_safety.py +++ b/tests/test_runtime_dependency_safety.py @@ -1,4 +1,6 @@ import importlib +import os +import subprocess import sys import types from pathlib import Path @@ -6,6 +8,20 @@ import pytest +def _run_isolated_python(script: str) -> subprocess.CompletedProcess[str]: + env = dict(os.environ) + env["PYTHONPATH"] = str(Path.cwd()) + env["DATAFOG_NO_TELEMETRY"] = "1" + env["DO_NOT_TRACK"] = "1" + return subprocess.run( + [sys.executable, "-c", script], + check=True, + env=env, + text=True, + capture_output=True, + ) + + def test_runtime_code_does_not_install_packages() -> None: blocked_snippets = [ "subprocess.check_call", @@ -25,6 +41,43 @@ def test_runtime_code_does_not_install_packages() -> None: assert offenders == [] +def test_ocr_and_spark_public_services_do_not_require_optional_imports() -> None: + _run_isolated_python( + """ 
+import importlib.abc +import sys + +blocked = { + "aiohttp", + "certifi", + "PIL", + "pyspark", + "pytesseract", + "torch", + "transformers", +} + +class BlockOptionalImports(importlib.abc.MetaPathFinder): + def find_spec(self, fullname, path=None, target=None): + if fullname.split(".", 1)[0] in blocked: + raise AssertionError(f"optional dependency imported: {fullname}") + return None + +sys.meta_path.insert(0, BlockOptionalImports()) + +import datafog +from datafog.services import ImageService, SparkService, TextService + +assert datafog.scan("Email jane@example.com").entities +assert ImageService is not None +assert SparkService is not None +assert TextService is not None +assert datafog.ImageService is ImageService +assert datafog.SparkService is SparkService +""" + ) + + def test_spacy_pii_missing_model_requires_explicit_download( monkeypatch: pytest.MonkeyPatch, ) -> None: diff --git a/tests/test_text_service.py b/tests/test_text_service.py index 9f02f3c8..74156082 100644 --- a/tests/test_text_service.py +++ b/tests/test_text_service.py @@ -2,7 +2,8 @@ import pytest -# Test the full-featured TextService from text_service_original +# Legacy coverage for the historical shadow TextService. New behavior belongs +# in datafog.services.text_service. from datafog.services.text_service_original import TextService