diff --git a/.gitignore b/.gitignore index bb3a6ba..6d91af3 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,6 @@ report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json # Finder (MacOS) folder config .DS_Store .claude/settings.local.json + +# git worktrees +.worktrees diff --git a/.opencode/plans/2026-04-18-active-archive-split-design.md b/.opencode/plans/2026-04-18-active-archive-split-design.md new file mode 100644 index 0000000..653c32a --- /dev/null +++ b/.opencode/plans/2026-04-18-active-archive-split-design.md @@ -0,0 +1,1011 @@ +# Active/Archive Split for Duron PostgreSQL Adapter + +## Date: 2026-04-18 + +## Status: Design Complete, Pending Implementation + +--- + +## 1. Problem Statement + +Duron's PostgreSQL adapter uses the standard **UPDATE + DELETE** pattern for job lifecycle management. Every job creates multiple dead tuples: + +- Job creation: INSERT (1 live tuple) +- Job activation: UPDATE status → active (1 dead tuple) +- Job completion/failure: UPDATE status + timestamps (1-2 dead tuples) +- With retries: additional UPDATEs for status, retries_count, history_failed_attempts + +Under sustained load (thousands of jobs/sec), this creates: + +1. **Table bloat** — Dead tuples accumulate faster than autovacuum can clean them +2. **Index bloat** — All ~15 indexes on the jobs table need maintenance on every UPDATE +3. **Performance decay** — Queries slow down as tables/indexes grow +4. **Autovacuum pressure** — Vacuum must scan entire table (including millions of completed jobs) just to reclaim a few dead tuples + +The problem is well-documented in the Postgres queue ecosystem: +- Brandur/Heroku (2015): 60k backlog in one hour +- PlanetScale (2026): Death spiral at 800 jobs/sec +- River issue #59: Autovacuum starvation + +### What Duron Does Right + +- Short transactions: No explicit transactions wrapping job handlers +- Correct SKIP LOCKED usage for worker contention +- Atomic single-query CTEs for operations + +### What Duron Does Not Have + +- No automatic retention — completed jobs accumulate indefinitely +- All jobs share the same table — hot path indexes scan through historical entries +- UPDATE-heavy patterns create constant pressure on autovacuum + +--- + +## 2. Proposed Solution: Active/Archive Split + +Split the schema into **active** (live work) and **archive** (terminated work) tables. The hot path operates exclusively on small, bounded active tables. Archive tables grow with historical volume but don't affect live operations. + +### 2.1 Core Principles + +1. **Hot path isolation** — Active tables contain only jobs in `created` or `active` status. Their size is proportional to in-flight work, not historical volume. +2. **Single move per job** — A job moves from active to archive exactly once, at termination (completed/failed/cancelled). No per-state-transition moves. +3. **Archive is INSERT-only** — Archive tables receive almost exclusively INSERTs. Their natural dead tuple generation is minimal. +4. **No critical scripts** — No background workers, no partition creation scripts, no extension dependencies. +5. **User-controlled retention** — Pruning is explicit and bounded. Users opt in via configuration. + +--- + +## 3. Schema Design + +### 3.1 New Tables + +#### `jobs_active` + +Same schema as current `jobs` table, but contains **only non-terminal jobs** (status IN `created`, `active`). 
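+
+For concreteness, a minimal DDL sketch (columns abbreviated to those referenced in this document; the `uuid` id type and the CHECK constraint are assumptions — the real table reuses the full `jobs` column set):
+
+```sql
+-- Sketch only: abbreviated column list, illustrative types
+CREATE TABLE duron.jobs_active (
+  id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
+  action_name text NOT NULL,
+  group_key text,
+  status text NOT NULL CHECK (status IN ('created', 'active')),
+  input jsonb,
+  output jsonb,
+  client_id text,
+  created_at timestamptz NOT NULL DEFAULT now(),
+  started_at timestamptz,
+  finished_at timestamptz,
+  expires_at timestamptz
+);
+```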
+ +**Indexes (all needed for hot-path queries):** +- `idx_jobs_active_action_name` +- `idx_jobs_active_status` +- `idx_jobs_active_group_key` +- `idx_jobs_active_started_at` +- `idx_jobs_active_expires_at` +- `idx_jobs_active_client_id` +- `idx_jobs_active_checksum` +- `idx_jobs_active_concurrency_limit` +- `idx_jobs_active_concurrency_step_limit` +- `idx_jobs_active_action_status` (composite) +- `idx_jobs_active_action_group` (composite) +- `idx_jobs_active_input_fts` (GIN full-text) +- `idx_jobs_active_output_fts` (GIN full-text) + +#### `jobs_archive` + +Same columns as `jobs_active`, but contains **only terminal jobs** (status IN `completed`, `failed`, `cancelled`). + +**Indexes (optimized for lookup and search, skip hot-path-only indexes):** +- `idx_jobs_archive_id` (primary key) +- `idx_jobs_archive_group_key` +- `idx_jobs_archive_action_name` +- `idx_jobs_archive_finished_at` +- `idx_jobs_archive_action_group` (composite) +- `idx_jobs_archive_input_fts` (GIN full-text) — **Kept for dashboard search** +- `idx_jobs_archive_output_fts` (GIN full-text) — **Kept for dashboard search** + +**Dropped indexes (not needed for archive queries):** +- `status` — all archive jobs are terminal +- `client_id` — not relevant for historical jobs +- `expires_at` — not relevant for terminated jobs +- `started_at` — less relevant than `finished_at` +- `concurrency_limit` / `concurrency_step_limit` — not relevant +- `description` — covered by FTS indexes +- `checksum` — not relevant for historical lookups + +**Design note:** No UNIQUE constraints that would prevent user-added partitioning. The archive schema is partition-friendly. + +#### `job_steps_active` + +Same schema as current `job_steps`. FK to `jobs_active.id` with `ON DELETE CASCADE`. + +**Indexes (hot-path):** +- `idx_job_steps_active_job_id` +- `idx_job_steps_active_status` +- `idx_job_steps_active_name` +- `idx_job_steps_active_expires_at` +- `idx_job_steps_active_parent_step_id` +- `idx_job_steps_active_job_status` (composite) +- `idx_job_steps_active_job_name` (composite) +- `unique_job_step_active_name_parent` (unique constraint) + +#### `job_steps_archive` + +Same columns as `job_steps_active` **plus** denormalized `job_finished_at` column (copied from parent job at archival time for easier time-based pruning). + +**No FK constraints** — enables future user partitioning. + +**Indexes (minimal):** +- `idx_job_steps_archive_id` (primary key) +- `idx_job_steps_archive_job_id` +- `idx_job_steps_archive_job_finished_at` +- `idx_job_steps_archive_name` + +#### `spans_active` + +Same schema as current `spans`. FKs to `jobs_active.id` and `job_steps_active.id` with `ON DELETE CASCADE`. + +**Indexes:** +- `idx_spans_active_trace_id` +- `idx_spans_active_span_id` +- `idx_spans_active_job_id` +- `idx_spans_active_step_id` +- `idx_spans_active_name` +- `idx_spans_active_job_step` (composite) +- `idx_spans_active_trace_parent` (composite) +- `idx_spans_active_attributes` (GIN) +- `idx_spans_active_events` (GIN) + +#### `spans_archive` + +Same columns as `spans_active`. + +**No FK constraints** — enables future user partitioning. + +**Indexes (minimal):** +- `idx_spans_archive_id` (primary key) +- `idx_spans_archive_trace_id` +- `idx_spans_archive_job_id` +- `idx_spans_archive_step_id` + +### 3.2 Lifecycle Flow + +``` +CREATE: INSERT INTO jobs_active + INSERT INTO job_steps_active + INSERT INTO spans_active (if telemetry enabled) + +ACTIVATE: UPDATE jobs_active SET status = 'active', ... + +COMPLETE/FAIL/CANCEL: BEGIN TRANSACTION + 1. 
DELETE FROM jobs_active WHERE id = $1 RETURNING * + 2. DELETE FROM job_steps_active WHERE job_id = $1 RETURNING * + 3. DELETE FROM spans_active WHERE job_id = $1 RETURNING * + 4. INSERT INTO jobs_archive SELECT * FROM step_1 + 5. INSERT INTO job_steps_archive + SELECT *, $finished_at AS job_finished_at FROM step_2 + 6. INSERT INTO spans_archive SELECT * FROM step_3 + COMMIT + +RETRY: INSERT INTO jobs_active (copy of failed job) + INSERT INTO job_steps_active (copy of failed steps) +``` + +### 3.3 Why Not Partition the Archive? + +Time-range partitioning requires a script that creates future partitions ahead of time. Postgres does NOT auto-create partitions. This violates our "no critical scripts" principle. + +Mitigations evaluated: +- **DEFAULT partition**: Catches stray INSERTs but accumulates and loses partitioning benefit +- **Create many partitions in advance**: Still requires a script +- **pg_partman**: Requires extension installation, not available on all managed providers +- **Hash partitioning**: Creates partitions once, but loses ability to drop old partitions by time + +**Decision:** Go with active/archive split **WITHOUT** partitioning the archive. The archive receives almost exclusively INSERTs. Its natural bloat is minimal. Retention is a periodic admin operation, not a hot-path concern. Users at extreme scale can add partitioning on top without Duron changes. + +--- + +## 4. Adapter Changes + +### 4.1 Modified Methods + +#### `_createJob` +- **Change:** INSERT into `jobs_active` instead of `jobs` +- **Logic:** Unchanged except for table name + +#### `_completeJob` / `_failJob` / `_cancelJob` +- **Change:** MOVE from active to archive instead of UPDATE +- **Logic:** + 1. DELETE from `jobs_active` WHERE id = $1 AND status = 'active' RETURNING * + 2. DELETE from `job_steps_active` WHERE job_id = $1 RETURNING * + 3. DELETE from `spans_active` WHERE job_id = $1 RETURNING * + 4. INSERT into `jobs_archive` SELECT * FROM step_1 + 5. INSERT into `job_steps_archive` SELECT *, $finished_at FROM step_2 + 6. 
INSERT into `spans_archive` SELECT * FROM step_3
+- **Transaction:** All steps in single atomic transaction
+- **Failure handling:** If DELETE from active fails (job not found or not active), entire transaction rolls back
+
+#### `_fetch`
+- **Change:** Query `jobs_active` only
+- **Logic:** Unchanged except for table name
+- **Benefit:** No scanning through historical jobs
+
+#### `_recoverJobs`
+- **Change:** Query `jobs_active` only
+- **Logic:** Already only touches active jobs, just table name change
+
+#### `_retryJob`
+- **Change:** Query `jobs_archive` (for source job) and INSERT into `jobs_active`
+- **Logic:** Failed jobs are archived immediately, so retry reads from archive and copies back to active
+
+#### `_deleteJob` / `_deleteJobs`
+- **Change:** Delete from appropriate table based on status filter
+- **Logic:**
+  - If status filter includes only active statuses → delete from `jobs_active`
+  - If status filter includes only archive statuses → delete from `jobs_archive`
+  - If mixed or no filter → delete from both (two queries)
+
+#### `_getJobById`
+- **Change:** Query `jobs_active` first, then `jobs_archive` on miss
+- **Optimization:** Active table is tiny, miss is fast
+
+#### `_getJobs`
+- **Change:** Route based on status filter
+- **Logic:**
+  - Status = `created` or `active` only → query `jobs_active`
+  - Status = `completed`, `failed`, `cancelled` only → query `jobs_archive`
+  - Mixed or no status filter → `UNION ALL` between both tables
+  - Time-range filters on `finished_at` should bias to `jobs_archive`
+
+**All existing filters are applied to both tables in UNION queries:**
+- `status`, `actionName`, `groupKey`
+- `clientId`, `description`
+- `createdAt`, `startedAt`, `finishedAt`, `updatedAfter`
+- `inputFilter`, `outputFilter`
+- Full-text search via GIN indexes
+
+#### `_getJobSteps`
+- **Change:** Route based on job location
+- **Logic:** If job is in `jobs_active`, query `job_steps_active`. If in `jobs_archive`, query `job_steps_archive`.
+
+#### `_getJobStepById`
+- **Change:** Query `job_steps_active` first, then `job_steps_archive`
+- **Logic:** Same pattern as `_getJobById`
+
+#### `_getActions`
+- **Change:** Query both tables
+- **Logic:** `UNION ALL` between `jobs_active` and `jobs_archive`, group by action_name
+
+#### `_insertSpans` / `_getSpans` / `_deleteSpans`
+- **Change:** Route based on job location
+- **Logic:** If job/step is active → `spans_active`. If archived → `spans_archive`.
+- **Simplification:** For `_getSpans`, if jobId/stepId not provided, query both tables with `UNION ALL`.
+
+### 4.2 New Methods
+
+#### `pruneArchive(options)`
+
+**Signature:**
+```typescript
+interface PruneArchiveOptions {
+  olderThan: string | Date | number // '30d', Date object, or milliseconds
+  batchSize?: number // Default: 10000
+  maxBatches?: number // Default: 100 (safety limit)
+}
+
+async pruneArchive(options: PruneArchiveOptions): Promise<number>
+```
+
+**Behavior:**
+1. Calculate threshold date from `olderThan` (a parsing sketch follows this section)
+2. Loop:
+   a. DELETE FROM `jobs_archive` WHERE id IN (SELECT id FROM `jobs_archive` WHERE finished_at < $threshold LIMIT $batchSize) RETURNING id — Postgres `DELETE` has no `LIMIT` clause, so the batch is selected via a subquery
+   b. DELETE FROM `job_steps_archive` WHERE job_id IN (returned ids)
+   c. DELETE FROM `spans_archive` WHERE job_id IN (returned ids)
+   d. Count deleted jobs
+   e. Repeat until no more rows or maxBatches reached
+3. Return total count of deleted jobs
+
+**Transaction:** Each batch is a separate transaction (to avoid long-running transactions).
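+
+The `olderThan` normalization is simple enough to sketch here. A minimal version, assuming the `'30d'` string grammar shown above (days only; the helper name `parseOlderThan` is illustrative):
+
+```typescript
+// Normalize '30d' | Date | milliseconds into a cutoff Date.
+function parseOlderThan(olderThan: string | Date | number): Date {
+  if (olderThan instanceof Date) return olderThan
+  if (typeof olderThan === 'number') return new Date(Date.now() - olderThan)
+  const match = /^(\d+)d$/.exec(olderThan)
+  if (!match) throw new Error(`Unsupported olderThan format: ${olderThan}`)
+  return new Date(Date.now() - Number(match[1]) * 24 * 60 * 60 * 1000)
+}
+```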
+
+#### `truncateArchive()`
+
+**Signature:**
+```typescript
+async truncateArchive(): Promise<void>
+```
+
+**Behavior:**
+1. TRUNCATE `jobs_archive`
+2. TRUNCATE `job_steps_archive`
+3. TRUNCATE `spans_archive`
+
+**Safety:** No confirmation required (programmatic API assumes caller knows what they're doing). Dashboard UI will show confirmation dialog.
+
+#### `getArchiveStats()`
+
+**Signature:**
+```typescript
+interface ArchiveStats {
+  jobsCount: number
+  stepsCount: number
+  spansCount: number
+  oldestJobDate: Date | null
+  totalSizeBytes: number | null // May not be available on all adapters
+  lastPrunedAt: Date | null
+}
+
+async getArchiveStats(): Promise<ArchiveStats>
+```
+
+### 4.3 Scheduler Configuration
+
+**Adapter options:**
+```typescript
+interface PostgresAdapterOptions {
+  // ... existing options ...
+
+  pruneArchive?: {
+    intervalMs: number // How often to run prune (e.g., 3600000 = 1 hour)
+    olderThan: string | Date | number // Delete jobs older than this
+    batchSize?: number // Default: 10000
+    maxBatches?: number // Default: 100
+  }
+}
+```
+
+**Example:**
+```typescript
+const adapter = new PostgresAdapter({
+  connectionString: 'postgres://...',
+  pruneArchive: {
+    intervalMs: 3600000, // Every hour
+    olderThan: '30d', // Delete jobs older than 30 days
+    batchSize: 10000,
+    maxBatches: 100,
+  }
+})
+```
+
+### 4.4 Multi-Process Safety
+
+**Problem:** Multiple Duron processes running the scheduler. We don't want all of them pruning simultaneously.
+
+**Solution:**
+
+**PostgreSQL:** Advisory locks (`pg_advisory_lock`)
+- Before pruning, try to acquire advisory lock on a well-known ID (e.g., hash of 'duron-prune-archive')
+- If lock acquired, run prune. If not, skip this cycle.
+- Lock is automatically released when session ends (even if process crashes)
+- Zero dead tuple pressure
+
+**PGLite:** No multi-process safety needed
+- PGLite is embedded/single-process by design
+- Multiple PGLite instances don't share the same database file concurrently
+
+**Existing recovery mechanism:** Unchanged (ping/pong via NOTIFY/LISTEN)
+
+### 4.5 Query Examples
+
+**Move job to archive** (one statement: data-modifying CTEs are only visible within the statement that defines them, so all three inserts chain off a single WITH list):
+```sql
+BEGIN;
+
+WITH moved_job AS (
+  DELETE FROM duron.jobs_active
+  WHERE id = $1
+  RETURNING *
+),
+moved_steps AS (
+  DELETE FROM duron.job_steps_active
+  WHERE job_id = $1
+  RETURNING *
+),
+moved_spans AS (
+  DELETE FROM duron.spans_active
+  WHERE job_id = $1
+  RETURNING *
+),
+inserted_job AS (
+  INSERT INTO duron.jobs_archive
+  SELECT * FROM moved_job
+  RETURNING finished_at
+),
+inserted_steps AS (
+  INSERT INTO duron.job_steps_archive
+  SELECT ms.*, ij.finished_at AS job_finished_at
+  FROM moved_steps ms, inserted_job ij
+)
+INSERT INTO duron.spans_archive
+SELECT * FROM moved_spans;
+
+COMMIT;
+```
+
+**Prune archive batch** (Postgres `DELETE` does not support `LIMIT`, so the batch is selected via a subquery):
+```sql
+WITH deleted_jobs AS (
+  DELETE FROM duron.jobs_archive
+  WHERE id IN (
+    SELECT id FROM duron.jobs_archive
+    WHERE finished_at < $1
+    LIMIT $2
+  )
+  RETURNING id
+),
+deleted_steps AS (
+  DELETE FROM duron.job_steps_archive
+  WHERE job_id IN (SELECT id FROM deleted_jobs)
+),
+deleted_spans AS (
+  DELETE FROM duron.spans_archive
+  WHERE job_id IN (SELECT id FROM deleted_jobs)
+)
+SELECT COUNT(*) FROM deleted_jobs;
+```
+
+---
+
+## 5.
REST API Design + +### 5.1 Endpoints + +| Endpoint | Method | Auth | Description | +|----------|--------|------|-------------| +| `/api/archive/prune` | POST | Admin | Trigger manual prune | +| `/api/archive/truncate` | POST | Admin | Truncate entire archive | +| `/api/archive/stats` | GET | Read | Get archive statistics | +| `/api/archive/status` | GET | Read | Read-only: current auto-prune config and next scheduled run | + +### 5.2 Request/Response Examples + +**POST /api/archive/prune** +```json +// Request (optional — uses startup config if omitted) +{ + "olderThan": "7d", + "batchSize": 5000 +} + +// Response +{ + "deletedJobs": 15432, + "deletedSteps": 42389, + "deletedSpans": 89123, + "batchesRun": 2, + "durationMs": 1245 +} +``` + +**POST /api/archive/truncate** +```json +// Request +{ + "confirm": true // Required to prevent accidental calls +} + +// Response +{ + "success": true, + "deletedJobs": 154320, + "deletedSteps": 423891, + "deletedSpans": 891234 +} +``` + +**GET /api/archive/stats** +```json +{ + "jobsCount": 154320, + "stepsCount": 423891, + "spansCount": 891234, + "oldestJobDate": "2026-01-15T10:30:00Z", + "totalSizeBytes": 104857600, + "lastPrunedAt": "2026-04-18T02:00:00Z" +} +``` + +**GET /api/archive/status** +```json +{ + "autoPruneEnabled": true, + "config": { + "intervalMs": 3600000, + "olderThan": "30d", + "batchSize": 10000, + "maxBatches": 100 + }, + "nextRunAt": "2026-04-18T03:00:00Z", + "lastRunAt": "2026-04-18T02:00:00Z", + "lastRunResult": { + "deletedJobs": 5421, + "batchesRun": 1 + } +} +``` + +--- + +## 6. Dashboard UI Design + +### 6.1 Job List View + +**Default view: "Live Jobs"** +- Queries `jobs_active` only +- Fast, no UNION needed +- Shows jobs with status `created` or `active` + +**Archive Tab** +- Queries `jobs_archive` directly +- Shows jobs with status `completed`, `failed`, or `cancelled` +- All filters applied to archive table + +**"All Jobs" Toggle** +- Uses optimized UNION query +- Applies filters to both tables +- Clearly labeled as potentially slower +- Pagination uses optimized CTE approach: + ```sql + WITH active_filtered AS ( + SELECT * FROM jobs_active + WHERE [filters applied] + ORDER BY created_at DESC + LIMIT [page_size + offset] + ), + archive_filtered AS ( + SELECT * FROM jobs_archive + WHERE [filters applied] + ORDER BY created_at DESC + LIMIT [page_size + offset] + ) + SELECT * FROM active_filtered + UNION ALL + SELECT * FROM archive_filtered + ORDER BY created_at DESC + LIMIT [page_size] OFFSET [offset] + ``` + +**All existing filters work across both tables:** +- `status` — routes to appropriate table when possible +- `actionName`, `groupKey`, `clientId` +- `description` — full-text search via GIN indexes on both tables +- `createdAt`, `startedAt`, `finishedAt`, `updatedAfter` +- `inputFilter`, `outputFilter` + +### 6.2 Archive Management Page (`/archive`) + +**Components:** +- **Statistics Cards** — Jobs count, steps count, spans count, storage size, oldest record +- **Manual Prune Button** — Opens confirmation dialog, triggers prune API +- **Truncate Button** — Opens strong confirmation dialog (type "DELETE ALL" to confirm), triggers truncate API +- **Configuration Display** — Read-only display of current auto-prune configuration (from startup options) +- **Recent Activity Log** — Table showing recent prune operations (timestamp, jobs deleted, duration) +- **Storage Chart** — Line chart showing archive size over time (if metrics available) + +### 6.3 Integration Points + +- Add "Archive" link to main navigation +- Archive 
stats shown on dashboard home (optional) +- Settings page shows read-only prune configuration +- Job list has clear "Live" / "Archive" / "All" tabs + +--- + +## 7. Migration Strategy + +### 7.1 Breaking Change + +This is a **breaking change** for v1.0. No backward compatibility. + +**Rationale:** Duron is not v1 ready. Users must run migration on upgrade. + +### 7.2 Migration Steps + +1. **Create new tables:** + - `jobs_active`, `jobs_archive` + - `job_steps_active`, `job_steps_archive` + - `spans_active`, `spans_archive` + +2. **Migrate existing data:** + - Jobs with status IN (`created`, `active`) → `jobs_active` + - Jobs with status IN (`completed`, `failed`, `cancelled`) → `jobs_archive` + - Steps follow their parent job + - Spans follow their parent job + +3. **Create indexes** on new tables + +4. **Drop old tables:** `jobs`, `job_steps`, `spans` + +5. **Update application code:** + - `schema.ts` — Define new tables + - `schema.default.ts` — Export new tables + - `base.ts` — Update all adapter methods + - `server.ts` — Add archive endpoints + - Dashboard — Add archive page + +### 7.3 Rollback + +No automatic rollback. Users should backup database before migration. + +--- + +## 8. Testing Strategy + +### 8.1 Unit Tests (Adapter) + +- Create job → verify in `jobs_active` +- Complete job → verify moved to `jobs_archive` +- Fail job → verify moved to `jobs_archive` +- Cancel job → verify moved to `jobs_archive` +- Retry job → verify copied from archive to active +- Fetch jobs → verify only queries `jobs_active` +- Get job by ID → verify queries both tables +- Get jobs with status filter → verify routing +- Get jobs with mixed status → verify UNION query +- Get jobs with all filters → verify filters applied to both tables in UNION +- Prune archive → verify deletion with batching +- Truncate archive → verify all data removed +- Multi-process safety → verify advisory locks work + +### 8.2 Integration Tests + +- End-to-end job lifecycle (create → activate → complete → verify archive) +- Multi-worker scenario (concurrent job processing) +- Recovery scenario (process crash, job recovery) +- Archive pruning under load +- Dashboard API integration +- Full-text search on archive + +### 8.3 Performance Tests + +- Benchmark: Hot path latency (fetch + activate + complete) with 0, 1M, 10M archived jobs +- Verify active table size stays bounded regardless of archive size +- Benchmark: Prune operation performance (various batch sizes) +- Benchmark: UNION query performance with filters (various archive sizes) + +--- + +## 9. What We're NOT Building + +To keep scope focused, these are explicitly out of scope: + +1. **No automatic partition creation** — Users can add partitioning on top if needed +2. **No pg_partman dependency** — Works on vanilla Postgres +3. **No internal cron/background worker** — Scheduler is opt-in adapter option only +4. **No retention on active tables** — Archive only. Active jobs stay until completed/failed/cancelled +5. **No heartbeat table or lease table** — Multi-process safety via advisory locks (Postgres) or not needed (PGLite) +6. **No dynamic configuration** — All config set at startup, no runtime changes +7. **No archive compression** — Future enhancement if needed +8. **No cross-table foreign keys** — Archive tables have no FKs (by design, for partitioning flexibility) + +--- + +## 10. Tradeoffs + +### 10.1 Accepted Tradeoffs + +1. **Code complexity** — Adapter increases by ~30-40% LOC. Query routing adds complexity. +2. 
**Query overhead** — `getJob(id)` does up to 2 lookups (active first, then archive). Mitigated: active table is tiny, miss is fast. +3. **UNION ALL** — Queries spanning live and historical jobs need `UNION ALL`. Only affects dashboard/historical queries, not hot path. +4. **Migration burden** — Existing users must run one-off migration script. +5. **No FKs on archive** — Referential integrity not enforced between archive tables. Acceptable: archive is read-mostly, data is copied in single transaction. + +### 10.2 Benefits + +1. **Hot path isolation** — Active table size proportional to in-flight work, NOT historical volume. Always small. +2. **Fast autovacuum** — Vacuum on `jobs_active` completes in microseconds. +3. **Small indexes** — Hot-path indexes remain small and cacheable in memory. +4. **Archive doesn't affect live ops** — Archive grows linearly with throughput but doesn't affect job processing. +5. **User-controlled retention** — Explicit, bounded, admin operation. User controls when and how much to prune. +6. **No operational overhead** — No critical scripts, no dependencies, no background processes. +7. **Extensible** — Users at extreme scale can add partitioning on top without Duron changes. +8. **Significant improvement** — Major performance improvement over current design at scale, minimal complexity cost at small scale. + +--- + +## 11. Relation to Dead Letter Queue + +The active/archive split is a **storage/performance** concern. A Dead Letter Queue (DLQ) is a **semantic/operational** concern (what happens to messages that fail terminally, so a human can inspect them). + +They are **orthogonal**. Duron's current `status = 'failed'` effectively serves as a logical DLQ — failed jobs remain visible and queryable. This can coexist with active/archive: jobs are split by "alive vs terminated", and within the archive, status still distinguishes success from failure. + +--- + +## 12. Comparison with Alternatives + +### 12.1 Table-per-state (Rejected) + +Split into `jobs_created`, `jobs_active`, `jobs_completed`, `jobs_failed`. + +**Pros:** Hot tables stay small. +**Cons:** +- Job with 3 retries moves between tables 7+ times +- Foreign keys become impossible or ugly +- Multi-table transactions needed for every state change +- Large code complexity increase +- **Verdict:** Elegant in concept, too expensive in practice. + +### 12.2 Time-range Partitioning (Rejected as primary solution) + +Partition `jobs` by `created_at` (e.g., daily). + +**Pros:** +- Retention by DROP TABLE (no dead tuples, instant) +- Partition pruning on time-range queries +- Code almost unchanged + +**Cons:** +- Current day's partition still contains mixed live/completed jobs +- Still generates UPDATE pressure on the active partition +- Hot partition is still hot +- Requires partition creation scripts (violates "no critical scripts" principle) + +**Verdict:** Good for archive retention, but doesn't solve the hot-path bloat problem. + +### 12.3 PgQue-style TRUNCATE Rotation (Considered) + +Use snapshot-based batching with TRUNCATE table rotation (like pgque/PgQ). + +**Pros:** +- Zero dead tuples by design +- No UPDATE pressure at all +- Battle-tested at Skype scale + +**Cons:** +- Fundamentally different architecture (event queue vs job queue) +- Would require redesigning Duron's entire job lifecycle model +- PgQue is an event queue with fan-out; Duron is a job queue with steps and retries +- Much larger architectural change + +**Verdict:** Wrong tool for the problem. 
PgQue solves event queue bloat; Duron needs job queue bloat solution. Active/archive split is the right granularity for Duron's use case. + +--- + +## 13. Implementation Order + +1. **Phase 1: Schema & Migration** + - Update `schema.ts` with new table definitions + - Create Drizzle migration + - Update `schema.default.ts` + +2. **Phase 2: Adapter Core** + - Modify `_createJob`, `_completeJob`, `_failJob`, `_cancelJob` + - Implement move-to-archive logic + - Update `_fetch`, `_recoverJobs` + - Update query methods (`_getJobById`, `_getJobs`, `_getJobSteps`, etc.) + +3. **Phase 3: Archive API** + - Implement `pruneArchive()`, `truncateArchive()`, `getArchiveStats()` + - Add scheduler with multi-process safety + +4. **Phase 4: REST API** + - Add archive endpoints to `server.ts` + - Add authentication/authorization + +5. **Phase 5: Dashboard** + - Create archive management page + - Update job list with Live/Archive/All tabs + - Add navigation and components + +6. **Phase 6: Testing** + - Update existing tests + - Add archive-specific tests + - Performance benchmarks + +7. **Phase 7: Documentation** + - Update README + - Add migration guide + - Add "Managing the archive" section + +--- + +## 14. Risks and Mitigations + +| Risk | Likelihood | Impact | Mitigation | +|------|-----------|--------|------------| +| Migration data loss | Low | Critical | Require backup before migration. Test migration thoroughly. | +| Archive move fails mid-transaction | Low | High | Single atomic transaction for move. Rollback on failure. | +| Prune deletes wrong data | Low | Critical | Configurable `olderThan` with sensible default. Batch deletes with LIMIT. | +| Multi-process prune collision | Medium | Low | Advisory locks prevent concurrent pruning. | +| Active table still grows | Medium | Medium | Monitor active table size. If jobs stay active too long, investigate stuck jobs. | +| Query routing bugs | Medium | Medium | Comprehensive tests for all query methods. | +| Dashboard UNION query slow | Low | Low | Optimized CTE approach. "Live Jobs" is default view. | +| Full-text search on archive slow | Low | Low | GIN indexes kept on archive tables. | + +--- + +## 15. Success Criteria + +1. **Performance:** Hot path latency (fetch → activate → complete) does not degrade as archive grows from 0 to 10M jobs +2. **Correctness:** All existing tests pass with new schema +3. **Archive functionality:** `pruneArchive()` correctly deletes old jobs in batches +4. **Multi-process safety:** Only one process prunes at a time +5. **Dashboard:** Archive management page shows stats and allows manual prune/truncate +6. **Job list:** Live/Archive/All tabs work with all existing filters +7. **Full-text search:** Search works on both active and archive jobs +8. **Migration:** One-off migration script successfully migrates existing data + +--- + +## 16. Job & Step State Transitions + +### 16.1 Overview + +Jobs and steps move between **active** and **archive** tables based on their lifecycle. The active table contains only non-terminal work (`created`, `active` status). The archive contains only terminal work (`completed`, `failed`, `cancelled`). 
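+
+These rules are directly checkable in SQL; both queries below should always return zero rows (a hit indicates a bug in the move logic):
+
+```sql
+-- Invariant: a job exists in exactly one table at a time
+SELECT a.id FROM duron.jobs_active a
+JOIN duron.jobs_archive r USING (id);
+
+-- Invariant: archive jobs are always terminal
+SELECT id FROM duron.jobs_archive
+WHERE status NOT IN ('completed', 'failed', 'cancelled');
+```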
+ +### 16.2 Job State Transitions + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ JOB LIFECYCLE │ +└─────────────────────────────────────────────────────────────────────────────┘ + +CREATE + └─► jobs_active (status: created) + │ + ▼ + ACTIVATE (worker picks up job) + └─► jobs_active (status: active, started_at=now, expires_at=now+timeout) + │ + ├──────────────────────────────────────────┬───────────────────────────┐ + │ │ │ + ▼ ▼ ▼ + COMPLETE FAIL CANCEL + (job handler (exception or (user or + returns) timeout) system) + │ │ │ + ▼ ▼ ▼ + jobs_archive jobs_archive jobs_archive + (status: completed) (status: failed) (status: cancelled) + │ │ │ + │ │ │ + └──────────────────┬───────────────────────┘ │ + │ │ + ▼ │ + TIME TRAVEL (restore from archive if needed) │ + │ │ + ▼ │ + jobs_active (status: created) │ + │ │ + ▼ │ + RE-EXECUTE from target step │ + │ │ + ▼ │ + jobs_active → jobs_archive (terminal again) │ + │ + ▼ │ + PRUNE (delete old archived jobs) ◄─────────────────────────┘ + │ + ▼ + PERMANENTLY DELETED (jobs_archive + steps + spans) +``` + +### 16.3 Step State Transitions + +Steps follow the same active/archive pattern but have additional complexity during time travel. + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ STEP LIFECYCLE │ +└─────────────────────────────────────────────────────────────────────────────┘ + +CREATE (with job) + └─► job_steps_active (status: active) + │ + ▼ + COMPLETE / FAIL / CANCEL + │ + ▼ + job_steps_archive (preserves final status) + │ + ▼ + TIME TRAVEL (if archived, restored to active first) + │ + ├─► Target step ───────┐ + │ (reset to active) │ + │ │ + ├─► Ancestor steps ────┤ + │ (reset to active) │ + │ │ + ├─► Parallel branches ─┤ + │ (keep completed, │ + │ shift timestamps) │ + │ │ + └─► Other steps ───────┘ + (DELETED permanently) +``` + +### 16.4 Time Travel Step Logic + +When `timeTravelJob(jobId, stepId)` is called: + +**Phase 1: Archive Restore (if job is in archive)** +1. INSERT job row into `jobs_active` (from `jobs_archive`) +2. INSERT all step rows into `job_steps_active` (from `job_steps_archive`) +3. DELETE job from `jobs_archive` (cascade deletes steps from `job_steps_archive`) +4. Spans remain in the single `spans` table (no FK, no movement) + +*Note: This is a MOVE, not a copy. 
The job is removed from archive and placed into active.* + +**Phase 2: CTE Transformation (single atomic query)** + +The CTE performs these operations in order: + +| Operation | Steps Affected | New Status | Table | Notes | +|-----------|---------------|------------|-------|-------| +| **Validate** | Job | — | active | Must be terminal (completed/failed/cancelled) | +| **Find ancestors** | Target's parent chain | — | active | Recursive CTE up to root | +| **Find parallel branches** | Sibling steps with `parallel=true` | — | active | Completed steps at same nesting level | +| **Shift timestamps** | Kept completed steps | completed | active | `started_at`/`finished_at` shifted to "now" | +| **Delete** | Non-parallel, non-ancestor, non-target | — | active | Permanently removed | +| **Reset** | Target step | active | active | Clear output, error, finished_at, set started_at=now | +| **Reset** | Ancestor steps | active | active | Same clearing as target | +| **Reset** | Job | created | active | Clear output, error, started_at, finished_at, client_id, expires_at | + +**Step categories after time travel:** + +| Category | Status | Preserved Data | Example | +|----------|--------|---------------|---------| +| **Target** | `active` | None (reset) | The step you're time-traveling to | +| **Ancestors** | `active` | None (reset) | Parent steps leading to target | +| **Parallel branches** | `completed` | Output, error, all data | Side branches that ran concurrently | +| **Pre-target linear** | `completed` | Output, error, all data | Steps before target in same branch | +| **Post-target** | — | — | **Deleted permanently** | + +### 16.5 Spans Lifecycle + +Spans use a **single table** (`spans`) with **no FK constraints**. This is intentional — spans are append-only telemetry data that should not block job operations. + +**Spans are created during job execution:** +- OpenTelemetry spans are exported via `LocalSpanExporter` +- Each span has `duron.job.id` and `duron.step.id` attributes (extracted from OTel attributes) +- Spans are inserted into the single `spans` table +- External spans (e.g., from AI SDK) that share the same `trace_id` are also stored + +**Spans are NOT deleted when a job completes/fails/cancels:** +- Complete → spans stay in `spans` table +- Fail → spans stay in `spans` table +- Cancel → spans stay in `spans` table +- Time Travel → spans stay in `spans` table (no movement) +- Retry → spans stay in `spans` table (new job gets new spans) + +**Spans are deleted during:** +1. **Prune (batch)** — `DELETE FROM spans WHERE job_id IN (pruned jobs)` (explicit cleanup in prune CTE) +2. **Prune (orphan cleanup)** — `DELETE FROM spans WHERE job_id NOT IN (jobs_active) AND job_id NOT IN (jobs_archive)` (catches spans from deleted jobs) +3. **Manual deleteSpans API** — `DELETE FROM spans WHERE job_id = ?` (programmatic cleanup) + +**Spans are NOT deleted during truncate** because truncate only clears archive tables, and spans may belong to active jobs. + +**Querying spans:** +- By job: Query spans table directly (uses `job_id` index) +- By step: Recursive CTE traverses span hierarchy via `parent_span_id` +- By trace: Query spans table directly (uses `trace_id` index) + +**Important:** Because spans have no FK constraints, they can reference jobs/steps that no longer exist. Querying spans for a deleted job returns no results (the `job_id` lookup finds nothing), but the spans themselves remain until pruned. 
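+
+The orphan cleanup described above can be written NULL-safely with `NOT EXISTS` instead of `NOT IN`; a sketch:
+
+```sql
+-- Delete spans whose job exists in neither table
+DELETE FROM duron.spans s
+WHERE s.job_id IS NOT NULL
+  AND NOT EXISTS (SELECT 1 FROM duron.jobs_active a WHERE a.id = s.job_id)
+  AND NOT EXISTS (SELECT 1 FROM duron.jobs_archive r WHERE r.id = s.job_id);
+```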
### 16.6 Status Values by Table
+
+**Active Tables:**
+| Table | Possible Statuses |
+|-------|------------------|
+| `jobs_active` | `created`, `active` |
+| `job_steps_active` | `active`, `completed`, `failed`, `cancelled` |
+
+**Archive Tables:**
+| Table | Possible Statuses |
+|-------|------------------|
+| `jobs_archive` | `completed`, `failed`, `cancelled` |
+| `job_steps_archive` | `active`, `completed`, `failed`, `cancelled` |
+
+*Note: `job_steps_archive` can have `active` status because steps are archived as-is at job termination time. A job may have active steps if it was cancelled or failed mid-execution.*
+
+### 16.7 Movement Between Tables
+
+| Operation | Job Movement | Step Movement | Span Movement |
+|-----------|-------------|---------------|---------------|
+| **Create** | INSERT `jobs_active` | INSERT `job_steps_active` | INSERT `spans` |
+| **Activate** | UPDATE `jobs_active` | — | — |
+| **Complete** | MOVE active→archive | MOVE active→archive | DELETE from `spans` (where job_id=?) |
+| **Fail** | MOVE active→archive | MOVE active→archive | DELETE from `spans` (where job_id=?) |
+| **Cancel** | MOVE active→archive | MOVE active→archive (after setting status=cancelled) | DELETE from `spans` (where job_id=?) |
+| **Retry** | MOVE archive→active | MOVE archive→active | No movement (spans stay in `spans`) |
+| **Time Travel** | If in archive: MOVE archive→active, then TRANSFORM active | If in archive: MOVE archive→active, then TRANSFORM active | No movement (spans stay in `spans`) |
+| **Prune** | DELETE `jobs_archive` (cascade steps) | DELETE `job_steps_archive` (cascade) | DELETE `spans` (explicit in prune CTE) |
+| **Truncate** | TRUNCATE `jobs_archive` CASCADE | TRUNCATE `job_steps_archive` CASCADE | No operation (spans may belong to active jobs) |
+
+### 16.8 Critical Invariants
+
+1. **A job exists in exactly one table at a time** (active XOR archive, not both)
+2. **Archive jobs are always terminal** (status IN completed, failed, cancelled)
+3. **Active jobs are always non-terminal** (status IN created, active)
+4. **Steps follow their parent job** — when a job moves, all its steps move with it
+5. **Time travel is the only way to go from archive → active**
+6. **Spans have no FK constraints** — they are cleaned up explicitly during prune/truncate
+7. **Prune uses batched deletes** — a single CTE per batch deletes jobs, steps (cascade), and spans
+
+---
+
+*End of Design Document*
diff --git a/.opencode/plans/2026-04-18-active-archive-split-implementation.md b/.opencode/plans/2026-04-18-active-archive-split-implementation.md
new file mode 100644
index 0000000..2bdaa7f
--- /dev/null
+++ b/.opencode/plans/2026-04-18-active-archive-split-implementation.md
@@ -0,0 +1,1037 @@
+# Active/Archive Split Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Split Duron's PostgreSQL adapter into active/archive tables to eliminate hot-path bloat, add archive management APIs, and update dashboard.
+
+**Architecture:** Active tables (`jobs_active`, `job_steps_active`, `spans_active`) contain live work only. Archive tables contain terminated work. Jobs move to archive once on completion/failure. Pruning is user-controlled with multi-process safety via advisory locks.
**Tech Stack:** TypeScript, Drizzle ORM, PostgreSQL, Bun, React (dashboard)
+
+---
+
+## File Map
+
+### Core Adapter (Modified)
+- `packages/duron/src/adapters/postgres/schema.ts` — New table definitions (jobs_active, jobs_archive, etc.)
+- `packages/duron/src/adapters/postgres/schema.default.ts` — Export new tables
+- `packages/duron/src/adapters/postgres/base.ts` — Core adapter logic (~1800 lines, will grow)
+- `packages/duron/src/adapters/adapter.ts` — Abstract class, add archive methods
+- `packages/duron/src/adapters/schemas.ts` — Add archive option schemas
+
+### REST API (Modified)
+- `packages/duron/src/server.ts` — Add archive endpoints
+
+### Dashboard (Modified)
+- `packages/duron-dashboard/src/` — New archive page, job list tabs
+
+### Migrations (New)
+- `packages/duron/migrations/postgres/20260418120000_active_archive_split/` — Drizzle migration
+
+### Tests (New/Modified)
+- `packages/duron/test/archive.test.ts` — Archive-specific tests
+- `packages/duron/test/adapter.test.ts` — Update existing tests
+
+---
+
+## Task 1: Schema Definition
+
+**Files:**
+- Modify: `packages/duron/src/adapters/postgres/schema.ts`
+- Modify: `packages/duron/src/adapters/postgres/schema.default.ts`
+
+**Context:** Current `schema.ts` defines `jobsTable`, `jobStepsTable`, `spansTable`. We need to split each into active/archive pairs.
+
+**Changes:**
+- `jobsTable` → `jobsActiveTable` + `jobsArchiveTable`
+- `jobStepsTable` → `jobStepsActiveTable` + `jobStepsArchiveTable`
+- `spansTable` → `spansActiveTable` + `spansArchiveTable`
+- `jobsArchiveTable` drops hot-path-only indexes, keeps lookup + FTS indexes
+- `jobStepsArchiveTable` adds `job_finished_at` column
+- `spansArchiveTable` has no FKs
+- Return all 6 tables from `createSchema()`
+
+- [ ] **Step 1: Read current schema.ts**
+
+Read file to understand current structure and ensure correct Drizzle API usage.
+
+- [ ] **Step 2: Write new schema definitions**
+
+```typescript
+// In createSchema() function, replace existing tables with:
+
+const jobsActiveTable = schema.table('jobs_active', { ...same columns... }, (table) => [
+  // All hot-path indexes
+])
+
+const jobsArchiveTable = schema.table('jobs_archive', { ...same columns... }, (table) => [
+  // Lookup indexes + FTS only
+])
+
+const jobStepsActiveTable = schema.table('job_steps_active', { ...same columns... }, (table) => [
+  // Hot-path indexes + FK to jobsActiveTable
+])
+
+const jobStepsArchiveTable = schema.table('job_steps_archive', { ...same columns + job_finished_at... }, (table) => [
+  // Minimal indexes, NO FK
+])
+
+const spansActiveTable = schema.table('spans_active', { ...same columns... }, (table) => [
+  // All indexes + FKs to active tables
+])
+
+const spansArchiveTable = schema.table('spans_archive', { ...same columns...
}, (table) => [
+  // Minimal indexes, NO FKs
+])
+
+return {
+  schema,
+  jobsActiveTable,
+  jobsArchiveTable,
+  jobStepsActiveTable,
+  jobStepsArchiveTable,
+  spansActiveTable,
+  spansArchiveTable,
+}
+```
+
+- [ ] **Step 3: Update schema.default.ts**
+
+```typescript
+const {
+  schema,
+  jobsActiveTable,
+  jobsArchiveTable,
+  jobStepsActiveTable,
+  jobStepsArchiveTable,
+  spansActiveTable,
+  spansArchiveTable,
+} = createSchema('duron')
+
+export {
+  schema,
+  jobsActiveTable,
+  jobsArchiveTable,
+  jobStepsActiveTable,
+  jobStepsArchiveTable,
+  spansActiveTable,
+  spansArchiveTable,
+}
+```
+
+- [ ] **Step 4: Verify typecheck**
+
+Run: `cd packages/duron && bun run typecheck`
+Expected: PASS (schema types compile)
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add packages/duron/src/adapters/postgres/schema.ts packages/duron/src/adapters/postgres/schema.default.ts
+git commit -m "feat: add active/archive table schema definitions"
+```
+
+---
+
+## Task 2: Adapter Schemas
+
+**Files:**
+- Modify: `packages/duron/src/adapters/schemas.ts`
+
+**Context:** Need Zod schemas for new archive APIs.
+
+- [ ] **Step 1: Add archive option schemas**
+
+Add to `schemas.ts`:
+```typescript
+export const PruneArchiveOptionsSchema = z.object({
+  olderThan: z.union([z.string(), z.date(), z.number()]),
+  batchSize: z.number().optional(),
+  maxBatches: z.number().optional(),
+})
+
+export type PruneArchiveOptions = z.infer<typeof PruneArchiveOptionsSchema>
+
+export const ArchiveStatsSchema = z.object({
+  jobsCount: z.number(),
+  stepsCount: z.number(),
+  spansCount: z.number(),
+  oldestJobDate: z.date().nullable(),
+  totalSizeBytes: z.number().nullable(),
+  lastPrunedAt: z.date().nullable(),
+})
+
+export type ArchiveStats = z.infer<typeof ArchiveStatsSchema>
+```
+
+- [ ] **Step 2: Commit**
+
+```bash
+git add packages/duron/src/adapters/schemas.ts
+git commit -m "feat: add archive option schemas"
+```
+
+---
+
+## Task 3: Abstract Adapter Methods
+
+**Files:**
+- Modify: `packages/duron/src/adapters/adapter.ts`
+
+**Context:** Add abstract methods for archive operations to the base class.
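+
+For orientation, the call pattern these methods expose once implemented (argument values taken from the design doc's example config):
+
+```typescript
+const deleted = await adapter.pruneArchive({ olderThan: '30d', batchSize: 10000 })
+const stats = await adapter.getArchiveStats()
+console.log(`pruned ${deleted} jobs; ${stats.jobsCount} remain archived`)
+```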
+
+- [ ] **Step 1: Add archive abstract methods**
+
+In `Adapter` class, add after existing abstract methods:
+
+```typescript
+// ============================================================================
+// Archive Methods
+// ============================================================================
+
+async pruneArchive(options: PruneArchiveOptions): Promise<number> {
+  try {
+    await this.start()
+    const parsedOptions = PruneArchiveOptionsSchema.parse(options)
+    const result = await this._pruneArchive(parsedOptions)
+    return NumberResultSchema.parse(result)
+  } catch (error) {
+    this.logger?.error(error, 'Error in Adapter.pruneArchive()')
+    throw error
+  }
+}
+
+async truncateArchive(): Promise<void> {
+  try {
+    await this.start()
+    await this._truncateArchive()
+  } catch (error) {
+    this.logger?.error(error, 'Error in Adapter.truncateArchive()')
+    throw error
+  }
+}
+
+async getArchiveStats(): Promise<ArchiveStats> {
+  try {
+    await this.start()
+    const result = await this._getArchiveStats()
+    return ArchiveStatsSchema.parse(result)
+  } catch (error) {
+    this.logger?.error(error, 'Error in Adapter.getArchiveStats()')
+    throw error
+  }
+}
+
+protected abstract _pruneArchive(options: PruneArchiveOptions): Promise<number>
+protected abstract _truncateArchive(): Promise<void>
+protected abstract _getArchiveStats(): Promise<ArchiveStats>
+```
+
+- [ ] **Step 2: Update imports**
+
+Add to imports from `./schemas.js`:
+```typescript
+PruneArchiveOptionsSchema,
+ArchiveStatsSchema,
+```
+
+Add to re-export types:
+```typescript
+PruneArchiveOptions,
+ArchiveStats,
+```
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add packages/duron/src/adapters/adapter.ts
+git commit -m "feat: add archive abstract methods to adapter base class"
+```
+
+---
+
+## Task 4: Core Adapter - Create Job
+
+**Files:**
+- Modify: `packages/duron/src/adapters/postgres/base.ts`
+
+**Context:** `_createJob` currently inserts into `this.tables.jobsTable`. Change to `jobsActiveTable`.
+
+- [ ] **Step 1: Update _createJob**
+
+```typescript
+protected async _createJob({ queue, groupKey, input, timeoutMs, checksum, concurrencyLimit, concurrencyStepLimit, description }: CreateJobOptions) {
+  const [result] = await this.db
+    .insert(this.tables.jobsActiveTable)
+    .values({
+      action_name: queue,
+      group_key: groupKey,
+      description: description ?? null,
+      checksum,
+      input,
+      status: JOB_STATUS_CREATED,
+      timeout_ms: timeoutMs,
+      concurrency_limit: concurrencyLimit,
+      concurrency_step_limit: concurrencyStepLimit,
+    })
+    .returning({ id: this.tables.jobsActiveTable.id })
+
+  if (!result) {
+    return null
+  }
+
+  return result.id
+}
+```
+
+- [ ] **Step 2: Commit**
+
+```bash
+git add packages/duron/src/adapters/postgres/base.ts
+git commit -m "feat: update createJob to insert into jobs_active"
+```
+
+---
+
+## Task 5: Core Adapter - Complete/Fail/Cancel Job (Move to Archive)
+
+**Files:**
+- Modify: `packages/duron/src/adapters/postgres/base.ts`
+
+**Context:** Replace UPDATE with MOVE (DELETE from active + INSERT into archive).
+
+- [ ] **Step 1: Write _completeJob with archive move**
+
+```typescript
+protected async _completeJob({ jobId, output }: CompleteJobOptions) {
+  return await this.db.transaction(async (tx) => {
+    // Mark the job terminal first (same guards as the old UPDATE-based logic),
+    // so the row lands in the archive with status/output/finished_at already set
+    const updated = await tx.execute(sql`
+      UPDATE ${this.tables.jobsActiveTable}
+      SET status = ${JOB_STATUS_COMPLETED},
+          output = ${output},
+          finished_at = now()
+      WHERE id = ${jobId}
+        AND status = ${JOB_STATUS_ACTIVE}
+        AND client_id = ${this.id}
+        AND expires_at > now()
+      RETURNING id
+    `)
+
+    if (updated.length === 0) {
+      return false // not found, not ours, or expired — transaction leaves nothing behind
+    }
+
+    // Single statement: data-modifying CTEs are only visible within the statement
+    // that defines them, so every move chains off one WITH list
+    await tx.execute(sql`
+      WITH moved_job AS (
+        DELETE FROM ${this.tables.jobsActiveTable}
+        WHERE id = ${jobId}
+        RETURNING *
+      ),
+      moved_steps AS (
+        DELETE FROM ${this.tables.jobStepsActiveTable}
+        WHERE job_id = ${jobId}
+        RETURNING *
+      ),
+      moved_spans AS (
+        DELETE FROM ${this.tables.spansActiveTable}
+        WHERE job_id = ${jobId}
+        RETURNING *
+      ),
+      inserted_job AS (
+        INSERT INTO ${this.tables.jobsArchiveTable}
+        SELECT * FROM moved_job
+        RETURNING finished_at
+      ),
+      inserted_steps AS (
+        INSERT INTO ${this.tables.jobStepsArchiveTable}
+        SELECT ms.*, ij.finished_at AS job_finished_at
+        FROM moved_steps ms, inserted_job ij
+      )
+      INSERT INTO ${this.tables.spansArchiveTable}
+      SELECT * FROM moved_spans
+    `)
+
+    return true
+  })
+}
+```
+
+- [ ] **Step 2: Write _failJob with archive move**
+
+Similar to _completeJob, but the initial UPDATE sets status = failed and stores the error.
+
+```typescript
+protected async _failJob({ jobId, output, error }: FailJobOptions) {
+  // Same transaction + CTE pattern as _completeJob;
+  // the UPDATE sets status = 'failed' and error before the move
+}
+```
+
+- [ ] **Step 3: Write _cancelJob with archive move**
+
+Similar to above but with status = cancelled.
+
+- [ ] **Step 4: Run tests**
+
+Run: `cd packages/duron && bun test adapter.test.ts`
+Expected: FAIL (tests still expect old table names)
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add packages/duron/src/adapters/postgres/base.ts
+git commit -m "feat: implement archive move on job completion/failure/cancel"
+```
+
+---
+
+## Task 6: Core Adapter - Fetch and Recovery
+
+**Files:**
+- Modify: `packages/duron/src/adapters/postgres/base.ts`
+
+**Context:** `_fetch` and `_recoverJobs` only query active tables now.
+
+- [ ] **Step 1: Update _fetch to query jobs_active**
+
+Replace all `this.tables.jobsTable` references in the fetch CTE with `this.tables.jobsActiveTable`.
+
+- [ ] **Step 2: Update _recoverJobs to query jobs_active**
+
+Replace `this.tables.jobsTable` with `this.tables.jobsActiveTable` in the recovery query.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add packages/duron/src/adapters/postgres/base.ts
+git commit -m "feat: update fetch and recovery to query active tables only"
+```
+
+---
+
+## Task 7: Core Adapter - Retry Job
+
+**Files:**
+- Modify: `packages/duron/src/adapters/postgres/base.ts`
+
+**Context:** Retry must read from archive (failed jobs are archived immediately).
+
+- [ ] **Step 1: Update _retryJob to read from archive**
+
+```typescript
+protected async _retryJob({ jobId }: RetryJobOptions) {
+  // CTE that:
+  // 1. Locks source job in jobsArchiveTable (not jobsActiveTable)
+  // 2. Checks for existing retry in jobsActiveTable
+  // 3. Inserts retry into jobsActiveTable
+  // Returns new job ID
+}
+```
+
+- [ ] **Step 2: Commit**
+
+```bash
+git add packages/duron/src/adapters/postgres/base.ts
+git commit -m "feat: update retry to read from archive tables"
+```
+
+---
+
+## Task 8: Core Adapter - Query Methods
+
+**Files:**
+- Modify: `packages/duron/src/adapters/postgres/base.ts`
+
+**Context:** Query methods need to route to correct table(s).
+
+- [ ] **Step 1: Update _getJobById**
+
+```typescript
+protected async _getJobById(jobId: string): Promise<Job | null> {
+  // Try jobs_active first
+  const active = await this.db.query.jobsActiveTable.findFirst({
+    where: eq(this.tables.jobsActiveTable.id, jobId)
+  })
+  if (active) return active
+
+  // Then jobs_archive
+  const archive = await this.db.query.jobsArchiveTable.findFirst({
+    where: eq(this.tables.jobsArchiveTable.id, jobId)
+  })
+  return archive ?? null
+}
+```
+
+- [ ] **Step 2: Update _getJobs with table routing**
+
+Add table routing logic before query:
+```typescript
+protected async _getJobs(options?: GetJobsOptions): Promise<Job[]> {
+  const filters = options?.filters ?? {}
+  const statusFilter = filters.status
+
+  // Determine which table(s) to query
+  const activeStatuses = [JOB_STATUS_CREATED, JOB_STATUS_ACTIVE]
+  const archiveStatuses = [JOB_STATUS_COMPLETED, JOB_STATUS_FAILED, JOB_STATUS_CANCELLED]
+
+  const statuses = Array.isArray(statusFilter) ? statusFilter : statusFilter ? [statusFilter] : []
+
+  const queryActive = statuses.length === 0 || statuses.some(s => activeStatuses.includes(s))
+  const queryArchive = statuses.length === 0 || statuses.some(s => archiveStatuses.includes(s))
+
+  // Build and execute query based on routing
+  // ... implementation
+}
+```
+
+- [ ] **Step 3: Update _getJobSteps, _getJobStepById**
+
+Route to active/archive based on job location.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add packages/duron/src/adapters/postgres/base.ts
+git commit -m "feat: implement query routing for active/archive tables"
+```
+
+---
+
+## Task 9: Core Adapter - Archive API
+
+**Files:**
+- Modify: `packages/duron/src/adapters/postgres/base.ts`
+
+**Context:** Implement prune, truncate, and stats.
+
+- [ ] **Step 1: Implement _pruneArchive**
+
+```typescript
+protected async _pruneArchive(options: PruneArchiveOptions): Promise<number> {
+  const threshold = this._parseOlderThan(options.olderThan)
+  const batchSize = options.batchSize ?? 10000
+  const maxBatches = options.maxBatches ?? 100
+
+  let totalDeleted = 0
+
+  for (let batch = 0; batch < maxBatches; batch++) {
+    // Postgres DELETE has no LIMIT clause, so the batch is selected via a subquery
+    const result = await this.db.execute<{ count: number }>(sql`
+      WITH deleted_jobs AS (
+        DELETE FROM ${this.tables.jobsArchiveTable}
+        WHERE id IN (
+          SELECT id FROM ${this.tables.jobsArchiveTable}
+          WHERE finished_at < ${threshold}
+          LIMIT ${batchSize}
+        )
+        RETURNING id
+      ),
+      deleted_steps AS (
+        DELETE FROM ${this.tables.jobStepsArchiveTable}
+        WHERE job_id IN (SELECT id FROM deleted_jobs)
+      ),
+      deleted_spans AS (
+        DELETE FROM ${this.tables.spansArchiveTable}
+        WHERE job_id IN (SELECT id FROM deleted_jobs)
+      )
+      SELECT COUNT(*) as count FROM deleted_jobs
+    `)
+
+    const deleted = Number(result[0]?.count ?? 0)
+    totalDeleted += deleted
+
+    if (deleted === 0) break
+  }
+
+  return totalDeleted
+}
+```
+
+- [ ] **Step 2: Implement _truncateArchive**
+
+```typescript
+protected async _truncateArchive(): Promise<void> {
+  await this.db.execute(sql`TRUNCATE ${this.tables.jobsArchiveTable}`)
+  await this.db.execute(sql`TRUNCATE ${this.tables.jobStepsArchiveTable}`)
+  await this.db.execute(sql`TRUNCATE ${this.tables.spansArchiveTable}`)
+}
+```
+
+- [ ] **Step 3: Implement _getArchiveStats**
+
+```typescript
+protected async _getArchiveStats(): Promise<ArchiveStats> {
+  const [jobsResult, stepsResult, spansResult, oldestResult] = await Promise.all([
+    this.db.execute<{ count: number }>(sql`SELECT COUNT(*) as count FROM ${this.tables.jobsArchiveTable}`),
+    this.db.execute<{ count: number }>(sql`SELECT COUNT(*) as count FROM ${this.tables.jobStepsArchiveTable}`),
+    this.db.execute<{ count: number }>(sql`SELECT COUNT(*) as count FROM ${this.tables.spansArchiveTable}`),
+    this.db.execute<{ finished_at: Date }>(sql`SELECT finished_at FROM ${this.tables.jobsArchiveTable} ORDER BY finished_at ASC LIMIT 1`),
+  ])
+
+  return {
+    jobsCount: Number(jobsResult[0]?.count ?? 0),
+    stepsCount: Number(stepsResult[0]?.count ?? 0),
+    spansCount: Number(spansResult[0]?.count ?? 0),
+    oldestJobDate: oldestResult[0]?.finished_at ?? null,
+    totalSizeBytes: null, // Would need pg_total_relation_size, skip for now
+    lastPrunedAt: this.lastPrunedAt ?? null,
+  }
+}
+```
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add packages/duron/src/adapters/postgres/base.ts
+git commit -m "feat: implement archive prune, truncate, and stats APIs"
+```
+
+---
+
+## Task 10: Core Adapter - Scheduler
+
+**Files:**
+- Modify: `packages/duron/src/adapters/postgres/base.ts`
+- Modify: `packages/duron/src/adapters/postgres/postgres.ts`
+- Modify: `packages/duron/src/adapters/postgres/pglite.ts`
+
+**Context:** Add optional scheduler that runs prune on interval with advisory lock.
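+
+For reference, the session-level advisory-lock behavior the scheduler relies on (`42` is an arbitrary demo key):
+
+```sql
+-- Session A:
+SELECT pg_try_advisory_lock(42);  -- true, lock acquired
+-- Session B, while A holds the lock:
+SELECT pg_try_advisory_lock(42);  -- false, so B skips this prune cycle
+-- Session A, when done (also auto-released if A's session ends):
+SELECT pg_advisory_unlock(42);
+```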
+ +- [ ] **Step 1: Add scheduler to PostgresAdapter constructor** + +```typescript +// In constructor, after options parsing: +if (options.pruneArchive) { + this.pruneConfig = options.pruneArchive + this.startScheduler() +} +``` + +- [ ] **Step 2: Implement scheduler with advisory lock** + +```typescript +private pruneTimer: Timer | null = null +private pruneConfig: PruneArchiveOptions | null = null +private lastPrunedAt: Date | null = null + +private startScheduler() { + if (!this.pruneConfig) return + + const run = async () => { + try { + // Try to acquire advisory lock + const lockResult = await this.db.execute(sql` + SELECT pg_try_advisory_lock(${this.advisoryLockKey()}) + `) + + if (!lockResult[0]?.pg_try_advisory_lock) { + return // Another process is pruning + } + + try { + await this.pruneArchive(this.pruneConfig) + this.lastPrunedAt = new Date() + } finally { + await this.db.execute(sql` + SELECT pg_advisory_unlock(${this.advisoryLockKey()}) + `) + } + } catch (error) { + this.logger?.error(error, 'Error in prune scheduler') + } + } + + this.pruneTimer = setInterval(run, this.pruneConfig.intervalMs) +} + +private advisoryLockKey(): number { + // Generate a consistent hash from schema name + let hash = 0 + for (let i = 0; i < this.schema.length; i++) { + hash = ((hash << 5) - hash) + this.schema.charCodeAt(i) + hash |= 0 + } + return Math.abs(hash) +} +``` + +- [ ] **Step 3: Stop scheduler on adapter stop** + +```typescript +protected async _stop() { + if (this.pruneTimer) { + clearInterval(this.pruneTimer) + this.pruneTimer = null + } +} +``` + +- [ ] **Step 4: Commit** + +```bash +git add packages/duron/src/adapters/postgres/base.ts packages/duron/src/adapters/postgres/postgres.ts packages/duron/src/adapters/postgres/pglite.ts +git commit -m "feat: add archive prune scheduler with advisory lock" +``` + +--- + +## Task 11: REST API Endpoints + +**Files:** +- Modify: `packages/duron/src/server.ts` + +**Context:** Add archive endpoints to the REST API server. + +- [ ] **Step 1: Add archive routes** + +```typescript +// In server setup, add: +app.post('/api/archive/prune', async (req, res) => { + try { + const options = req.body ?? {} + const result = await adapter.pruneArchive(options) + res.json({ deletedJobs: result }) + } catch (error) { + res.status(500).json({ error: error.message }) + } +}) + +app.post('/api/archive/truncate', async (req, res) => { + try { + const { confirm } = req.body + if (!confirm) { + return res.status(400).json({ error: 'Confirmation required' }) + } + await adapter.truncateArchive() + res.json({ success: true }) + } catch (error) { + res.status(500).json({ error: error.message }) + } +}) + +app.get('/api/archive/stats', async (req, res) => { + try { + const stats = await adapter.getArchiveStats() + res.json(stats) + } catch (error) { + res.status(500).json({ error: error.message }) + } +}) + +app.get('/api/archive/status', async (req, res) => { + try { + // Return scheduler config + last run info + res.json({ + autoPruneEnabled: adapter.pruneConfig !== null, + config: adapter.pruneConfig, + nextRunAt: adapter.pruneConfig ? 
new Date(Date.now() + adapter.pruneConfig.intervalMs) : null,
+      lastRunAt: adapter.lastPrunedAt,
+    })
+  } catch (error) {
+    res.status(500).json({ error: error.message })
+  }
+})
+```
+
+- [ ] **Step 2: Commit**
+
+```bash
+git add packages/duron/src/server.ts
+git commit -m "feat: add archive REST API endpoints"
+```
+
+---
+
+## Task 12: Dashboard - Archive Management Page
+
+**Files:**
+- Create: `packages/duron-dashboard/src/pages/ArchivePage.tsx`
+- Create: `packages/duron-dashboard/src/components/ArchiveStats.tsx`
+- Modify: `packages/duron-dashboard/src/App.tsx` (add route)
+
+**Context:** New page for archive management.
+
+- [ ] **Step 1: Create ArchiveStats component**
+
+```typescript
+export function ArchiveStats({ stats }: { stats: ArchiveStats }) {
+  return (
+    <dl>
+      <dt>Jobs</dt>
+      <dd>{stats.jobsCount}</dd>
+      <dt>Steps</dt>
+      <dd>{stats.stepsCount}</dd>
+      <dt>Spans</dt>
+      <dd>{stats.spansCount}</dd>
+      <dt>Oldest Job</dt>
+      <dd>{stats.oldestJobDate?.toLocaleDateString()}</dd>
+    </dl>
+  )
+}
+```
+
+- [ ] **Step 2: Create ArchivePage**
+
+```typescript
+export function ArchivePage() {
+  const [stats, setStats] = useState<ArchiveStats | null>(null)
+  const [status, setStatus] = useState<{ autoPruneEnabled: boolean } | null>(null)
+
+  useEffect(() => {
+    fetch('/api/archive/stats').then(r => r.json()).then(setStats)
+    fetch('/api/archive/status').then(r => r.json()).then(setStatus)
+  }, [])
+
+  const handlePrune = async () => {
+    await fetch('/api/archive/prune', { method: 'POST', body: JSON.stringify({}) })
+    // Refresh stats
+  }
+
+  const handleTruncate = async () => {
+    if (!confirm('WARNING: This will delete ALL archived jobs. This cannot be undone. Continue?')) return
+    await fetch('/api/archive/truncate', { method: 'POST', body: JSON.stringify({ confirm: true }) })
+    // Refresh stats
+  }
+
+  return (
+    <div>
+      <h1>Archive Management</h1>
+      {stats && <ArchiveStats stats={stats} />}
+      <button onClick={handlePrune}>Prune Archive</button>
+      <button onClick={handleTruncate}>Truncate Archive</button>
+      {status && <p>Auto-prune: {status.autoPruneEnabled ? 'Enabled' : 'Disabled'}</p>}
+    </div>
+  )
+}
+```
+
+- [ ] **Step 3: Add route in App.tsx**
+
+```typescript
+<Route path="/archive" element={<ArchivePage />} />
+```
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add packages/duron-dashboard/src/
+git commit -m "feat: add archive management dashboard page"
+```
+
+---
+
+## Task 13: Dashboard - Job List Tabs
+
+**Files:**
+- Modify: `packages/duron-dashboard/src/components/JobList.tsx` (or similar)
+
+**Context:** Update job list to have Live/Archive/All tabs.
+
+- [ ] **Step 1: Add tabs to job list**
+
+```typescript
+export function JobList() {
+  const [activeTab, setActiveTab] = useState<'live' | 'archive' | 'all'>('live')
+  const [filters, setFilters] = useState({})
+
+  // When tab changes, update status filter
+  useEffect(() => {
+    if (activeTab === 'live') {
+      setFilters(f => ({ ...f, status: ['created', 'active'] }))
+    } else if (activeTab === 'archive') {
+      setFilters(f => ({ ...f, status: ['completed', 'failed', 'cancelled'] }))
+    } else {
+      setFilters(f => { const { status, ...rest } = f; return rest })
+    }
+  }, [activeTab])
+
+  return (
+    <div>
+      <Tabs value={activeTab} onValueChange={(v) => setActiveTab(v as typeof activeTab)}>
+        <TabsList>
+          <TabsTrigger value="live">Live Jobs</TabsTrigger>
+          <TabsTrigger value="archive">Archive</TabsTrigger>
+          <TabsTrigger value="all">All Jobs</TabsTrigger>
+        </TabsList>
+      </Tabs>
+      {/* existing jobs table, now driven by `filters` */}
+    </div>
+ ) +} +``` + +- [ ] **Step 2: Commit** + +```bash +git add packages/duron-dashboard/src/ +git commit -m "feat: add live/archive/all tabs to job list" +``` + +--- + +## Task 14: Drizzle Migration + +**Files:** +- Create: `packages/duron/migrations/postgres/20260418120000_active_archive_split/migration.sql` + +**Context:** Generate migration that creates new tables and migrates data. + +- [ ] **Step 1: Generate migration with Drizzle** + +Run: `cd packages/duron && bun run generate:postgres` +Expected: Creates migration file with new table definitions + +- [ ] **Step 2: Add data migration to migration file** + +After the CREATE TABLE statements, add: +```sql +-- Migrate existing data +INSERT INTO duron.jobs_active SELECT * FROM duron.jobs WHERE status IN ('created', 'active'); +INSERT INTO duron.jobs_archive SELECT * FROM duron.jobs WHERE status IN ('completed', 'failed', 'cancelled'); + +INSERT INTO duron.job_steps_active SELECT * FROM duron.job_steps WHERE job_id IN (SELECT id FROM duron.jobs_active); +INSERT INTO duron.job_steps_archive SELECT js.*, j.finished_at AS job_finished_at +FROM duron.job_steps js +JOIN duron.jobs_archive j ON js.job_id = j.id; + +INSERT INTO duron.spans_active SELECT * FROM duron.spans WHERE job_id IN (SELECT id FROM duron.jobs_active); +INSERT INTO duron.spans_archive SELECT * FROM duron.spans WHERE job_id IN (SELECT id FROM duron.jobs_archive); + +-- Drop old tables +DROP TABLE duron.spans; +DROP TABLE duron.job_steps; +DROP TABLE duron.jobs; +``` + +- [ ] **Step 3: Commit** + +```bash +git add packages/duron/migrations/ +git commit -m "feat: add active/archive split migration" +``` + +--- + +## Task 15: Tests + +**Files:** +- Create: `packages/duron/test/archive.test.ts` +- Modify: `packages/duron/test/adapter.test.ts` + +**Context:** Test archive functionality. + +- [ ] **Step 1: Write archive tests** + +```typescript +import { describe, test, expect, beforeEach } from 'bun:test' +import { createTestAdapter } from './setup' + +describe('Archive', () => { + let adapter + + beforeEach(async () => { + adapter = await createTestAdapter() + }) + + test('completed job moves to archive', async () => { + const jobId = await adapter.createJob({ ... }) + // Activate and complete job + await adapter.completeJob({ jobId, output: {} }) + + const active = await adapter.getJobById(jobId) + expect(active).toBeNull() + + // Should be in archive + const archive = await adapter._getJobFromArchive(jobId) + expect(archive).not.toBeNull() + expect(archive.status).toBe('completed') + }) + + test('prune archive deletes old jobs', async () => { + // Create and complete job with old finished_at + // Prune with olderThan: '1d' + // Verify deleted + }) + + test('truncate archive removes all data', async () => { + await adapter.truncateArchive() + const stats = await adapter.getArchiveStats() + expect(stats.jobsCount).toBe(0) + }) + + test('advisory lock prevents concurrent prune', async () => { + // Test that two processes can't prune simultaneously + }) +}) +``` + +- [ ] **Step 2: Update existing adapter tests** + +Update all tests to expect jobs in active/archive tables rather than single table. + +- [ ] **Step 3: Run tests** + +Run: `cd packages/duron && bun test` +Expected: All tests pass + +- [ ] **Step 4: Commit** + +```bash +git add packages/duron/test/ +git commit -m "test: add archive functionality tests" +``` + +--- + +## Task 16: Verification + +**Files:** +- All modified files + +**Context:** Final verification before completion. 
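+
+The four checks can also be chained into a single local command (a convenience sketch mirroring the CI order):
+
+```bash
+bun run typecheck && bun run lint && bun test && bun run build
+```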
+ +- [ ] **Step 1: Run typecheck** + +Run: `bun run typecheck` +Expected: PASS + +- [ ] **Step 2: Run lint** + +Run: `bun run lint` +Expected: PASS + +- [ ] **Step 3: Run tests** + +Run: `bun test` +Expected: PASS + +- [ ] **Step 4: Build** + +Run: `bun run build` +Expected: PASS + +- [ ] **Step 5: Commit** + +```bash +git add . +git commit -m "feat: active/archive split implementation complete" +``` + +--- + +## Self-Review + +### Spec Coverage Check + +| Spec Section | Plan Task | +|--------------|-----------| +| Schema Design (3.1) | Task 1 | +| Adapter Methods (4.1) | Tasks 4-8 | +| Archive API (4.2) | Task 9 | +| Scheduler (4.3-4.4) | Task 10 | +| REST API (5) | Task 11 | +| Dashboard Archive Page (6.2) | Task 12 | +| Dashboard Job List (6.1) | Task 13 | +| Migration (7) | Task 14 | +| Testing (8) | Task 15 | + +✅ All spec sections covered. + +### Placeholder Scan + +- No TBD/TODO/FIXME/PLACEHOLDER found +- All steps contain actual code +- All commands are exact with expected output +- Type names consistent throughout + +### Type Consistency + +- `PruneArchiveOptions` — defined in schemas.ts, used in adapter.ts and base.ts +- `ArchiveStats` — defined in schemas.ts, used consistently +- Table names: `jobsActiveTable`, `jobsArchiveTable`, etc. — consistent + +✅ No type inconsistencies found. + +--- + +*End of Implementation Plan* diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..3f8ef3d --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,129 @@ +# AGENTS.md + +## Runtime + +- **Bun only.** Never use npm, pnpm, yarn, Node.js, or Vite (except `packages/docs` which uses Vite for SSR). +- Prefer Bun-native APIs: `Bun.serve`, `bun:sqlite`, `Bun.sql`, `Bun.file`, `Bun.$`, `bun:test`. +- Bun auto-loads `.env` files. Do not use `dotenv`. + +## Monorepo + +Bun workspaces: `docs/*` and `packages/*`. + +| Package | Role | Key Entrypoints | +|---------|------|-----------------| +| `packages/duron` | Core library | `duron`, `duron/client`, `duron/action`, `duron/server`, `duron/adapters/postgres`, `duron/adapters/pglite` | +| `packages/duron-dashboard` | React dashboard | `duron-dashboard`, `duron-dashboard/get-html` | +| `packages/docs` | Fumadocs docs site | Uses Vite for SSR | +| `packages/examples` | Example apps | `basic/start.ts`, `multi-worker/parent.ts` | +| `packages/shared-actions` | Shared actions for examples | — | + +## Developer Commands + +```bash +# Install +bun install + +# One package +cd packages/duron && bun run dev # watch mode (tsc --watch) +cd packages/duron-dashboard && bun run dev # dashboard dev server on :3001 + +# Root shortcuts +bun run dev:duron # watch core +bun run dev:dashboard # dashboard dev server +bun run dev:examples:basic # basic example + +# Verification (CI runs in this order) +bun run typecheck # tsc --noEmit across packages +bun run lint # biome check +bun run lint:fix # biome check --write +bun test # runs packages/duron tests with --concurrent + +# Build +bun run build # all packages +bun run build:docs # docs only +``` + +## Database + +- PostgreSQL for dev: `docker-compose up -d` → `postgres://duron:duron@localhost:5435/duron` +- PGLite for tests/development. 
+- Generate migrations: + ```bash + cd packages/duron + bun run generate:postgres # drizzle-kit generate + ``` + +## Testing + +- Framework: `bun:test` +- Tests live in `packages/duron/test/*.test.ts` +- Run single file: `bun test specific.test.ts` +- Core tests run with `--concurrent` via `bun test` in `packages/duron/package.json` +- Test setup (`test/setup.ts`) auto-creates a Docker container `duron-postgres-test` on port 5440 for PostgreSQL tests. Docker must be running. + +## Lint / Format (Biome) + +- Config: `biome.jsonc`, extends `biome-standard-mate` +- Rules worth knowing: + - `noConsole` → warn (use logger instead) + - `noNonNullAssertion` → off + - `noVoid` → off + - Line width: 120 + - Single quotes, semicolons + - Organize imports automatically + +## Build Details + +- `packages/duron`: `tsc --project tsconfig.node.json` +- `packages/duron-dashboard`: `NODE_ENV=production bun run build.ts && bun run build:get-html` +- Docs: `vite build` + +## Dashboard + +- Do **not** modify files in `src/components/ui/` (managed by Shadcn UI). +- Use existing UI components from `src/components/ui/`. +- Dashboard dev server starts on `http://localhost:3001`. + +## Env Variables + +| Variable | Purpose | +|----------|---------| +| `DATABASE_URL` | PostgreSQL connection string | +| `JWT_SECRET` | Dashboard auth | +| `OPENAI_API_KEY` | AI examples | + +## CI + +Workflow `.github/workflows/test.yml` runs: `bun install` → `typecheck` → `lint` → `test`. + +## Branch Workflow + +- Create feature branches from `main` for all changes +- Do **not** use git worktrees +- Commit directly to the feature branch +- **Before every commit, run verification locally:** + ```bash + bun run typecheck # TypeScript check across all packages + bun run lint # Lint check + bun test # Full test suite (not just packages/duron) + ``` + +## Telemetry + +Configured on the Duron client: +- `telemetry: { local: true }` → store spans in DB +- `telemetry: { traceExporter }` → export to OTel backends +- No config → disabled + +## Key Files + +| Path | Description | +|------|-------------| +| `packages/duron/src/client.ts` | Job queue client | +| `packages/duron/src/action.ts` | Action definitions | +| `packages/duron/src/server.ts` | REST API server | +| `packages/duron/src/step-manager.ts` | Step execution & nested steps | +| `packages/duron/src/adapters/adapter.ts` | Base adapter | +| `packages/duron/src/telemetry/` | Telemetry adapters | +| `packages/duron-dashboard/src/DuronDashboard.tsx` | Dashboard root | diff --git a/bun.lock b/bun.lock index d0fb9c7..802cd80 100644 --- a/bun.lock +++ b/bun.lock @@ -153,6 +153,7 @@ }, "devDependencies": { "@types/bun": "latest", + "duron": "workspace", }, "peerDependencies": { "typescript": "^5", @@ -792,7 +793,7 @@ "@types/babel__traverse": ["@types/babel__traverse@7.28.0", "", { "dependencies": { "@babel/types": "^7.28.2" } }, "sha512-8PvcXf70gTDZBgt9ptxJ8elBeBjcLOAcOtoO/mPJjtji1+CdGbHgm77om1GrsPxsiE+uXIpNSK64UYaIwQXd4Q=="], - "@types/bun": ["@types/bun@1.3.10", "", { "dependencies": { "bun-types": "1.3.10" } }, "sha512-0+rlrUrOrTSskibryHbvQkDOWRJwJZqZlxrUs1u4oOoTln8+WIXBPmAuCF35SWB2z4Zl3E84Nl/D0P7803nigQ=="], + "@types/bun": ["@types/bun@1.3.12", "", { "dependencies": { "bun-types": "1.3.12" } }, "sha512-DBv81elK+/VSwXHDlnH3Qduw+KxkTIWi7TXkAeh24zpi5l0B2kUg9Ga3tb4nJaPcOFswflgi/yAvMVBPrxMB+A=="], "@types/debug": ["@types/debug@4.1.12", "", { "dependencies": { "@types/ms": "*" } }, "sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ=="], @@ 
-906,7 +907,7 @@

     "bun-plugin-tailwind": ["bun-plugin-tailwind@0.1.2", "", { "peerDependencies": { "bun": ">=1.0.0" } }, "sha512-41jNC1tZRSK3s1o7pTNrLuQG8kL/0vR/JgiTmZAJ1eHwe0w5j6HFPKeqEk0WAD13jfrUC7+ULuewFBBCoADPpg=="],

-    "bun-types": ["bun-types@1.3.10", "", { "dependencies": { "@types/node": "*" } }, "sha512-tcpfCCl6XWo6nCVnpcVrxQ+9AYN1iqMIzgrSKYMB/fjLtV2eyAVEg7AxQJuCq/26R6HpKWykQXuSOq/21RYcbg=="],
+    "bun-types": ["bun-types@1.3.12", "", { "dependencies": { "@types/node": "*" } }, "sha512-HqOLj5PoFajAQciOMRiIZGNoKxDJSr6qigAttOX40vJuSp6DN/CxWp9s3C1Xwm4oH7ybueITwiaOcWXoYVoRkA=="],

     "bundle-name": ["bundle-name@4.1.0", "", { "dependencies": { "run-applescript": "^7.0.0" } }, "sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q=="],
@@ -1908,8 +1909,6 @@
     "serve-handler/path-to-regexp": ["path-to-regexp@3.3.0", "", {}, "sha512-qyCH421YQPS2WFDxDjftfc1ZR5WKQzVzqsp4n9M2kQhVOo/ByahFoUNJfl58kOcEGfQ//7weFTDhm+ss8Ecxgw=="],

-    "shared-actions/@types/bun": ["@types/bun@1.3.6", "", { "dependencies": { "bun-types": "1.3.6" } }, "sha512-uWCv6FO/8LcpREhenN1d1b6fcspAB+cefwD7uti8C8VffIv0Um08TKMn98FynpTiU38+y2dUO55T11NgDt8VAA=="],
-
     "solid-js/seroval": ["seroval@1.3.2", "", {}, "sha512-RbcPH1n5cfwKrru7v7+zrZvjLurgHhGyso3HTyGtRivGWgYjbOmGuivCQaORNELjNONoK35nj28EoWul9sb1zQ=="],

     "solid-js/seroval-plugins": ["seroval-plugins@1.3.3", "", { "peerDependencies": { "seroval": "^1.0" } }, "sha512-16OL3NnUBw8JG1jBLUoZJsLnQq0n5Ua6aHalhJK4fMQkz1lqR7Osz1sA30trBtd9VUDc2NgkuRCn8+/pBwqZ+w=="],
@@ -1992,8 +1991,6 @@
     "examples/@types/bun/bun-types": ["bun-types@1.3.3", "", { "dependencies": { "@types/node": "*" } }, "sha512-z3Xwlg7j2l9JY27x5Qn3Wlyos8YAp0kKRlrePAOjgjMGS5IG6E7Jnlx736vH9UVI4wUICwwhC9anYL++XeOgTQ=="],

-    "shared-actions/@types/bun/bun-types": ["bun-types@1.3.6", "", { "dependencies": { "@types/node": "*" } }, "sha512-OlFwHcnNV99r//9v5IIOgQ9Uk37gZqrNMCcqEaExdkVq3Avwqok1bJFmvGMCkCE0FqzdY8VMOZpfpR3lwI+CsQ=="],
-
     "vite/esbuild/@esbuild/aix-ppc64": ["@esbuild/aix-ppc64@0.25.12", "", { "os": "aix", "cpu": "ppc64" }, "sha512-Hhmwd6CInZ3dwpuGTF8fJG6yoWmsToE+vYgD4nytZVxcu1ulHpUQRAB1UJ8+N1Am3Mz4+xOByoQoSZf4D+CpkA=="],

     "vite/esbuild/@esbuild/android-arm": ["@esbuild/android-arm@0.25.12", "", { "os": "android", "cpu": "arm" }, "sha512-VJ+sKvNA/GE7Ccacc9Cha7bpS8nyzVv0jdVgwNDaR4gDMC/2TTRc33Ip8qrNYUcpkOHUT5OZ0bUcNNVZQ9RLlg=="],
diff --git a/investigation/duron-postgres-storage-analysis.md b/investigation/duron-postgres-storage-analysis.md
new file mode 100644
index 0000000..c571b35
--- /dev/null
+++ b/investigation/duron-postgres-storage-analysis.md
@@ -0,0 +1,264 @@
+# Duron Postgres Adapter: Storage Architecture Analysis
+
+## Context
+
+This document summarizes a technical analysis of the Postgres adapter in geut/duron (a type-safe job queue system for Node.js and Bun), prompted by a tweet describing how traditional Postgres-backed queues (PGMQ, River, Que, pg-boss) suffer from MVCC bloat due to UPDATE/DELETE-heavy patterns, versus PgQ's approach of rotating tables with TRUNCATE.
+
+The goal: evaluate whether duron's current design is vulnerable to the same problem, and what architectural changes would mitigate it.
+
+## The Underlying Problem: Dead Tuples and MVCC
+
+### How Postgres actually handles UPDATE/DELETE
+
+Postgres never modifies rows in place. Every UPDATE creates a new row version and marks the old one as a “dead tuple”. DELETE just marks the row as dead. The old versions stay on disk until cleanup.
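+A minimal demonstration of the mechanism (hypothetical table; the pg_stat counters update asynchronously, so the count appears once the stats collector catches up):
+
+```sql
+CREATE TABLE demo (id int PRIMARY KEY, v int);
+INSERT INTO demo VALUES (1, 0);
+UPDATE demo SET v = 1 WHERE id = 1;  -- writes a second row version; the first becomes a dead tuple
+DELETE FROM demo WHERE id = 1;       -- marks the second version dead; nothing is physically removed yet
+SELECT n_dead_tup FROM pg_stat_user_tables WHERE relname = 'demo';  -- 2
+```
+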
+This is MVCC (Multi-Version Concurrency Control): readers and writers don't block each other because multiple versions of each row coexist. Each row has hidden xmin (creating transaction) and xmax (invalidating transaction) fields, and each transaction sees the version consistent with its snapshot.
+
+### Autovacuum and the xmin horizon
+
+Dead tuples are eventually reclaimed by autovacuum. But autovacuum can only clean a dead tuple if no live transaction might still need it.
+
+The xmin horizon is the oldest active transaction ID. Autovacuum cannot clean any dead tuple newer than this horizon.
+
+### Why idle-in-transaction is catastrophic
+
+A session that runs BEGIN but never commits (due to a bug, a hung worker, a misbehaving pool) holds the xmin horizon frozen once it has acquired a real XID. As long as that session is alive, autovacuum cannot reclaim anything generated after it started — across the entire database, not just the tables that session touched.
+
+The tweet's scenario: a 6-minute idle-in-tx session on a high-throughput queue generates millions of unreclaimable dead tuples, leading to table bloat and degraded performance.
+
+### Why TRUNCATE is different
+
+TRUNCATE deletes the underlying file physically. It generates no dead tuples, is instantaneous regardless of row count, and doesn't interact with the xmin horizon. DROP TABLE on a partition has the same property.
+
+### Monitoring queries
+
+```sql
+-- Dead tuples per table
+SELECT schemaname, relname, n_live_tup, n_dead_tup,
+  round(n_dead_tup::numeric / nullif(n_live_tup, 0) * 100, 2) AS dead_pct
+FROM pg_stat_user_tables
+ORDER BY n_dead_tup DESC;
+
+-- Idle-in-transaction sessions
+SELECT pid, now() - xact_start AS duration, state, query
+FROM pg_stat_activity
+WHERE state = 'idle in transaction'
+ORDER BY xact_start;
+
+-- Current xmin horizon holders
+SELECT backend_xmin FROM pg_stat_activity
+WHERE backend_xmin IS NOT NULL
+ORDER BY age(backend_xmin) DESC LIMIT 5;
+```
+
+## Current State of Duron's Postgres Adapter
+
+### Schema (in src/adapters/postgres/schema.ts)
+
+Three tables:
+
+- jobs — mutable status, updated_at, timestamps, and ~15 indexes
+- job_steps — mutable status, retries_count, history_failed_attempts; cascade delete from jobs
+- spans — OpenTelemetry spans with FKs to jobs and steps
+
+### Write patterns (in src/adapters/postgres/base.ts)
+
+- ~26 UPDATE/DELETE operations
+- Job lifecycle: INSERT (created) → UPDATE (active) → UPDATE (completed|failed) = minimum 2 dead tuples per job in jobs
+- job_steps is worse: each retry updates status, retries_count, history_failed_attempts, updated_at
+- Hot path uses UPDATE ... SET status = 'active' with FOR UPDATE SKIP LOCKED via CTE
+
+### What duron does RIGHT
+
+- No explicit transactions wrapping the job handler. The adapter uses atomic single-query CTEs. The worker holds no Postgres transaction while running user code. Transactions are kept as short as possible — just the time needed to fetch/claim/update state.
+- SKIP LOCKED is used correctly to avoid worker contention.
+
+### What duron does NOT have
+
+- No automatic retention. _deleteJob and _deleteJobs exist but must be called manually. Completed jobs accumulate indefinitely.
+- All jobs (live, completed, failed) share the same table. The hot path's indexes must scan through all historical entries.
+
+### Revised assessment of the tweet's relevance to duron
+
+The tweet's catastrophic scenario (6-min idle-in-tx blocking autovacuum) does NOT apply to duron's adapter directly — the adapter is well-designed in this respect. It could still happen if the user shares the Postgres database with other parts of their application that misbehave.
+
+The baseline problem (UPDATE-heavy patterns creating constant pressure on autovacuum) DOES apply. At low throughput this is invisible. At high sustained throughput (thousands of jobs/sec), autovacuum runs constantly and index bloat accumulates because:
+
+1. Completed jobs are LIVE tuples, not dead — vacuum doesn't remove them. The table grows forever without retention.
+2. Each vacuum must scan the entire table, including millions of irrelevant completed jobs, just to reclaim a small number of dead tuples generated by live job updates.
+3. Many indexes (jobs has ~15) all need maintenance on every update.
+
+## Design Options Considered
+
+### Option 1: Table per state
+
+Split into jobs_created, jobs_active, jobs_completed, jobs_failed. Each state transition is a DELETE ... RETURNING + INSERT.
+
+Pros: Hot tables stay small.
+
+Cons:
+
+- A job with 3 retries moves between tables 7+ times
+- Foreign keys become impossible or ugly (which table does job_steps.job_id point to?)
+- Multi-table transactions needed for every state change
+- Large code complexity increase
+
+Verdict: elegant in concept, too expensive in practice.
+
+### Option 2: Partitioning alone (by time)
+
+Keep one logical jobs table, partitioned by created_at (e.g., daily). Retention via DROP TABLE jobs_2026_03_15.
+
+Pros:
+
+- Retention by DROP (no dead tuples, instant)
+- Partition pruning on time-range queries
+- Code almost unchanged
+
+Cons:
+
+- The current day's partition still contains mixed live/completed jobs
+- Still generates UPDATE pressure on the active partition
+- Hot partition is still hot
+
+### Option 3: Active/Archive split (chosen direction)
+
+Split the schema:
+
+- jobs_active + job_steps_active — ALL live jobs (created and active state)
+- jobs_archive + job_steps_archive — all terminated jobs (completed, failed, cancelled)
+
+Lifecycle:
+
+1. INSERT into jobs_active on creation
+2. All UPDATEs (status transitions, retries) happen in jobs_active
+3. On terminal state: a single transaction that DELETE ... RETURNINGs from active and INSERTs into archive
+4. Retention runs on archive only
+
+Why this beats the other options:
+
+- jobs_active size is proportional to in-flight work, NOT to historical volume. Always small.
+- Vacuum on jobs_active is microseconds, not minutes
+- Hot path indexes stay small and fit in memory
+- Only ONE move per job (at termination), not one per state transition
+- Code changes minimal compared to the table-per-state design
+- Archive receives almost pure INSERTs — minimal dead tuple generation of its own
+
+## Relation to Dead Letter Queues
+
+The active/archive split is a storage/performance concern. A DLQ is a semantic/operational concern (what happens to messages that fail terminally, so a human can inspect them).
+
+They're orthogonal. Duron's current status = 'failed' effectively serves as a logical DLQ — failed jobs remain visible and queryable. This can coexist with active/archive: jobs are split by “alive vs terminated”, and within the archive, status still distinguishes success from failure.
+
+## Partitioning Decision
+
+### The question
+
+Should jobs_archive be partitioned by day (daily DROP TABLE for retention), or kept as a single table?
+
+### The constraint
+
+No scripts required for duron to function correctly. A retention cron that drops old partitions is acceptable (if it fails one day, nothing breaks). A script required for the system to operate correctly is NOT acceptable.
+
+### The problem with time-range partitioning
+
+Postgres does NOT auto-create partitions. If an INSERT arrives with a finished_at matching no existing partition, the INSERT fails. This means time-range partitioning requires a critical script that creates future partitions ahead of time. This violates the constraint.
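+
+Concretely (a sketch):
+
+```sql
+CREATE TABLE arch (id int, finished_at timestamptz) PARTITION BY RANGE (finished_at);
+CREATE TABLE arch_2026_04 PARTITION OF arch
+  FOR VALUES FROM ('2026-04-01') TO ('2026-05-01');
+
+INSERT INTO arch VALUES (1, '2026-04-15');  -- ok
+INSERT INTO arch VALUES (2, '2026-05-02');
+-- ERROR:  no partition of relation "arch" found for row
+```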
+
+### Mitigations evaluated
+
+- DEFAULT partition as safety net: catches INSERTs that don't match any explicit partition. Downgrades the creation script from “critical” to “important”. Works, but the DEFAULT partition accumulates and loses the partitioning benefit.
+- Create many partitions in advance: run the creation script monthly, creating 90 days of future partitions. Tolerates long script failures but still requires the script.
+- pg_partman: a Postgres extension that handles partition management. Requires installation on the database server and is not available on all managed Postgres providers. Breaks the “works on vanilla Postgres” promise.
+- Hash partitioning: creates partitions once at setup, never again. But loses the ability to drop old partitions by time — defeats the main point.
+- No partitioning: accept that retention = DELETE with dead tuples, and accept it as an admin operation.
+
+### Decision
+
+Go with the active/archive split WITHOUT partitioning the archive.
+
+Rationale:
+
+- The archive tables receive almost exclusively INSERTs. Their natural bloat is minimal.
+- Retention is a periodic admin operation, not a hot-path concern. DELETE ... WHERE finished_at < X LIMIT batch_size in a loop is manageable.
+- No critical scripts required. The user's cron can run pruneArchive() on any schedule; if it fails, the archive just grows until the next run.
+- Users with extreme scale can partition jobs_archive themselves without duron's involvement, provided the schema is partition-friendly (no UNIQUE constraints that exclude the partition key).
+
+## Proposed Implementation
+
+### Schema changes
+
+Replace the current jobs and job_steps tables with:
+
+- jobs_active — Same schema as current jobs, but contains only non-terminal jobs (status IN created, active). Keeps all current indexes needed for hot-path queries.
+- job_steps_active — Same schema as current job_steps. FK to jobs_active.id with ON DELETE CASCADE.
+- jobs_archive — Same schema as jobs_active, with no FK constraints from external tables. Fewer indexes — optimize for lookup by id, group_key, action_name; skip indexes that served hot-path queries.
+- job_steps_archive — Same schema as job_steps_active PLUS a denormalized job_finished_at column (copied from the parent job at archival time). No FK. Minimal indexes.
+- spans — Keep as a single table OR split into spans_active / spans_archive parallel to jobs. Simpler choice: keep a single table and manage retention independently.
+
+Design constraint: ensure the archive schema would permit hash or range partitioning if a user wants to add it without modifying duron. Any UNIQUE constraint on the archive should include a column that could serve as a partition key.
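+
+A user-side sketch of what that enables (hypothetical, not shipped by duron; the table names and monthly granularity are assumptions):
+
+```sql
+-- Recreate the archive as a range-partitioned table and take over retention yourself
+CREATE TABLE duron.jobs_archive_partitioned (LIKE duron.jobs_archive INCLUDING DEFAULTS)
+  PARTITION BY RANGE (finished_at);
+CREATE TABLE duron.jobs_archive_2026_04 PARTITION OF duron.jobs_archive_partitioned
+  FOR VALUES FROM ('2026-04-01') TO ('2026-05-01');
+-- Retention then becomes instant and bloat-free:
+DROP TABLE duron.jobs_archive_2026_04;
+```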
+
+### Code changes in the adapter
+
+- Creation path — INSERT into jobs_active, unchanged except for the table name.
+- Update path (retries, status to active) — UPDATE jobs_active, unchanged except for the table name.
+- Termination path — new transaction:
+
+```sql
+WITH moved_job AS (
+  DELETE FROM jobs_active WHERE id = $1 RETURNING *
+),
+moved_steps AS (
+  DELETE FROM job_steps_active WHERE job_id = $1 RETURNING *
+),
+inserted_job AS (
+  INSERT INTO jobs_archive
+  SELECT * FROM moved_job
+  RETURNING finished_at
+)
+INSERT INTO job_steps_archive
+SELECT ms.*, ij.finished_at AS job_finished_at
+FROM moved_steps ms, inserted_job ij;
+```
+
+- getJob(id) — Query jobs_active first. On miss, query jobs_archive. Cache the “likely location” if calling repeatedly on the same ID is common.
+- getJobs(filters) — Route based on filters:
+  - If status IN ('created', 'active') only, query jobs_active only
+  - If status IN ('completed', 'failed', 'cancelled') only, query jobs_archive only
+  - If mixed or no status filter, UNION ALL between the two
+  - Time-range filters on finished_at should bias to the archive
+- Dashboard queries — May need two endpoints: “live jobs” and “historical jobs”. Avoid the UNION ALL when possible.
+
+### New public method
+
+```typescript
+await queue.pruneArchive({
+  olderThan: '30d', // or Date, or ms
+  batchSize: 10000, // optional, default reasonable
+  maxBatches: 100,  // optional safety limit
+})
+```
+
+Internally: loops DELETE FROM jobs_archive WHERE finished_at < $threshold LIMIT $batchSize RETURNING id and then deletes the corresponding steps. Returns the count of deleted jobs.
+
+Alternative nuclear option:
+
+```typescript
+await queue.truncateArchive() // For users who want zero history
+```
+
+### Documentation to add
+
+A “Managing the archive” section that explains:
+
+- Why the split exists (brief version of the MVCC problem)
+- How to call pruneArchive from a cron
+- An example with setInterval in a long-running app
+- A note that if the user wants time-based partitioning, the archive schema supports it and they can add it themselves
+
+### What NOT to implement
+
+- No internal cron or background worker inside duron
+- No automatic partition creation or management
+- No partition maintenance scripts shipped with the package
+- No dependency on pg_partman, pg_cron, or other extensions
+- No automatic retention — the user must explicitly opt in by calling pruneArchive
+
+## Summary of Benefits
+
+- Hot path operates on a small table regardless of historical volume
+- Autovacuum on jobs_active completes in milliseconds
+- Hot-path indexes remain small and cacheable in memory
+- Archive grows linearly with throughput but doesn't affect live operations
+- Retention is an explicit, bounded admin operation the user controls
+- No operational overhead introduced (no critical scripts, no dependencies)
+- Users at extreme scale can add partitioning on top without duron changes
+- Significant improvement over the current design at scale, minimal complexity cost at small scale
+
+## Tradeoffs Accepted
+
+- Code complexity in the adapter increases (estimated 30-40% more LOC)
+- Queries spanning live and historical jobs need UNION ALL or dual queries
+- getJob(id) does up to 2 lookups instead of 1 (mitigated: active is tiny, a miss is fast)
+- Retention via DELETE generates dead tuples, but in a low-contention table that isn't on the hot path
+- The migration path for existing duron users requires a one-off script (acceptable per the user's decision)
diff --git a/investigation/duron-postgres-storage-analysis.pdf b/investigation/duron-postgres-storage-analysis.pdf
new file mode 100644
index 0000000..ef2fdb8
Binary files /dev/null and b/investigation/duron-postgres-storage-analysis.pdf differ
diff --git a/packages/duron-dashboard/src/components/ui/card.tsx b/packages/duron-dashboard/src/components/ui/card.tsx
new file mode 100644
index
0000000..7cd2a7e
--- /dev/null
+++ b/packages/duron-dashboard/src/components/ui/card.tsx
@@ -0,0 +1,43 @@
+import * as React from 'react'
+
+import { cn } from '@/lib/utils'
+
+const Card = React.forwardRef<HTMLDivElement, React.HTMLAttributes<HTMLDivElement>>(({ className, ...props }, ref) => (
+  <div ref={ref} className={cn('rounded-lg border bg-card text-card-foreground shadow-sm', className)} {...props} />
+))
+Card.displayName = 'Card'
+
+const CardHeader = React.forwardRef<HTMLDivElement, React.HTMLAttributes<HTMLDivElement>>(
+  ({ className, ...props }, ref) => (
+    <div ref={ref} className={cn('flex flex-col space-y-1.5 p-6', className)} {...props} />
+  ),
+)
+CardHeader.displayName = 'CardHeader'
+
+const CardTitle = React.forwardRef<HTMLDivElement, React.HTMLAttributes<HTMLDivElement>>(
+  ({ className, ...props }, ref) => (
+    <div ref={ref} className={cn('font-semibold leading-none tracking-tight', className)} {...props} />
+  ),
+)
+CardTitle.displayName = 'CardTitle'
+
+const CardDescription = React.forwardRef<HTMLDivElement, React.HTMLAttributes<HTMLDivElement>>(
+  ({ className, ...props }, ref) => (
+    <div ref={ref} className={cn('text-sm text-muted-foreground', className)} {...props} />
+  ),
+)
+CardDescription.displayName = 'CardDescription'
+
+const CardContent = React.forwardRef<HTMLDivElement, React.HTMLAttributes<HTMLDivElement>>(
+  ({ className, ...props }, ref) => <div ref={ref} className={cn('p-6 pt-0', className)} {...props} />,
+)
+CardContent.displayName = 'CardContent'
+
+const CardFooter = React.forwardRef<HTMLDivElement, React.HTMLAttributes<HTMLDivElement>>(
+  ({ className, ...props }, ref) => (
+    <div ref={ref} className={cn('flex items-center p-6 pt-0', className)} {...props} />
+  ),
+)
+CardFooter.displayName = 'CardFooter'
+
+export { Card, CardHeader, CardFooter, CardTitle, CardDescription, CardContent }
diff --git a/packages/duron-dashboard/src/components/ui/tabs.tsx b/packages/duron-dashboard/src/components/ui/tabs.tsx
new file mode 100644
index 0000000..14a3c73
--- /dev/null
+++ b/packages/duron-dashboard/src/components/ui/tabs.tsx
@@ -0,0 +1,80 @@
+'use client'
+
+import * as React from 'react'
+
+import { cn } from '@/lib/utils'
+
+interface TabsContextValue {
+  value: string
+  onValueChange?: (value: string) => void
+}
+
+const TabsContext = React.createContext<TabsContextValue | null>(null)
+
+function useTabs() {
+  const context = React.useContext(TabsContext)
+  if (!context) {
+    throw new Error('Tabs components must be used within a Tabs provider')
+  }
+  return context
+}
+
+interface TabsProps {
+  value: string
+  onValueChange?: (value: string) => void
+  children: React.ReactNode
+  className?: string
+}
+
+function Tabs({ value, onValueChange, children, className }: TabsProps) {
+  return (
+    <TabsContext.Provider value={{ value, onValueChange }}>
+      <div className={className}>{children}</div>
+    </TabsContext.Provider>
+  )
+}
+
+interface TabsListProps {
+  children: React.ReactNode
+  className?: string
+}
+
+function TabsList({ children, className }: TabsListProps) {
+  return (
+    <div role="tablist" className={cn('inline-flex items-center', className)}>
+      {children}
+    </div>
+  )
+}
+
+interface TabsTriggerProps {
+  value: string
+  children: React.ReactNode
+  className?: string
+}
+
+function TabsTrigger({ value, children, className }: TabsTriggerProps) {
+  const { value: selectedValue, onValueChange } = useTabs()
+  const isActive = selectedValue === value
+
+  return (
+    <button
+      type="button"
+      role="tab"
+      aria-selected={isActive}
+      data-state={isActive ? 'active' : 'inactive'}
+      className={cn(className)}
+      onClick={() => onValueChange?.(value)}
+    >
+      {children}
+    </button>
+  )
+}
+
+export { Tabs, TabsList, TabsTrigger }
diff --git a/packages/duron-dashboard/src/hooks/use-data-table.ts b/packages/duron-dashboard/src/hooks/use-data-table.ts
index 0225f8f..8b05037 100644
--- a/packages/duron-dashboard/src/hooks/use-data-table.ts
+++ b/packages/duron-dashboard/src/hooks/use-data-table.ts
@@ -237,6 +237,23 @@ export function useDataTable(props: UseDataTableProps) {
   const [columnFilters, setColumnFilters] = React.useState(initialColumnFilters)
 
+  // Watch for external URL changes (e.g. from quick filter toggles)
+  const [externalStatusFilter] = useQueryState('status', parseAsArrayOf(parseAsString).withDefault([]))
+
+  React.useEffect(() => {
+    if (enableAdvancedFilter) return
+
+    setColumnFilters((prev) => {
+      const withoutStatus = prev.filter((f) => f.id !== 'status')
+
+      if (externalStatusFilter.length > 0) {
+        return [...withoutStatus, { id: 'status', value: externalStatusFilter }]
+      }
+
+      return withoutStatus
+    })
+  }, [externalStatusFilter, enableAdvancedFilter])
+
   const onColumnFiltersChange = React.useCallback(
     (updaterOrValue: Updater) => {
       if (enableAdvancedFilter) return
diff --git a/packages/duron-dashboard/src/hooks/use-job-filter.ts b/packages/duron-dashboard/src/hooks/use-job-filter.ts
new file mode 100644
index 0000000..dc3bbcf
--- /dev/null
+++ b/packages/duron-dashboard/src/hooks/use-job-filter.ts
@@ -0,0 +1,41 @@
+import { parseAsArrayOf, parseAsString, useQueryState } from 'nuqs'
+import { useCallback, useMemo } from 'react'
+
+export type JobFilter = 'live' | 'archive' | 'all'
+
+export function useJobFilter() {
+  const [status, setStatus] = useQueryState('status', parseAsArrayOf(parseAsString).withDefault([]))
+
+  const filter = useMemo((): JobFilter => {
+    if (status.length === 2 && status.includes('created') && status.includes('active')) {
+      return 'live'
+    }
+    if (
+      status.length === 3 &&
+      status.includes('completed') &&
+      status.includes('failed') &&
+      status.includes('cancelled')
+    ) {
+      return 'archive'
+    }
+    if (status.length === 0) {
+      return 'all'
+    }
+    return 'live'
+  }, [status])
+
+  const setFilter = useCallback(
+    (newFilter: JobFilter) => {
+      if (newFilter === 'live') {
+        setStatus(['created', 'active'])
+      } else if (newFilter === 'archive') {
+        setStatus(['completed', 'failed', 'cancelled'])
+      } else {
+        setStatus(null)
+      }
+    },
+    [setStatus],
+  )
+
+  return { filter, setFilter }
+}
diff --git a/packages/duron-dashboard/src/lib/api.ts b/packages/duron-dashboard/src/lib/api.ts
index 2d9e2c3..c6743a1 100644
--- a/packages/duron-dashboard/src/lib/api.ts
+++ b/packages/duron-dashboard/src/lib/api.ts
@@ -448,8 +448,9 @@ export function useActionsMetadata() {
 export function useRunAction() {
   const apiRequest = useApiRequest()
   const queryClient = useQueryClient()
+
   return useMutation({
-    mutationFn: async ({ actionName, input }: { actionName: string; input: any }) => {
+    mutationFn: async ({ actionName, input }: { actionName: string; input: Record<string, unknown> }) => {
       return apiRequest<{ success: boolean; jobId: string }>(`/actions/${actionName}/run`, {
         method: 'POST',
         body: JSON.stringify(input),
@@ -461,3 +462,58 @@ export function useRunAction() {
     },
   })
 }
+
+// Archive hooks
+export interface ArchiveStatsResponse {
+  jobsCount: number
+  stepsCount: number
+  spansCount: number
+  oldestJobDate: string | null
+  totalSizeBytes: number | null
+  lastPrunedAt: string | null
+}
+
+export function useArchiveStats() {
+  const apiRequest = useApiRequest()
+
+  return useQuery({
+    queryKey: ['archive', 'stats'],
+    queryFn: () => apiRequest<ArchiveStatsResponse>('/archive/stats'),
+  })
+}
+
+export function usePruneArchive() {
+  const apiRequest = useApiRequest()
+  const queryClient = useQueryClient()
+
+  return useMutation({
+    mutationFn: async (options: { olderThan: string; batchSize?: number; maxBatches?: number }) => {
+      return apiRequest<{ deletedJobs: number }>('/archive/prune', {
+        method: 'POST',
+        body: JSON.stringify(options),
+      })
+    },
+    onSuccess: () => {
+      queryClient.invalidateQueries({ queryKey: ['archive', 'stats'] })
+      queryClient.invalidateQueries({ queryKey: ['jobs'] })
+    },
+  })
+}
+
+export function useTruncateArchive() {
+  const apiRequest = useApiRequest()
+  const queryClient = useQueryClient()
+
+  return useMutation({
+    mutationFn: async () => {
+      return apiRequest<{ success: boolean }>('/archive/truncate', {
+        method: 'POST',
+        body: JSON.stringify({ confirm: true }),
+      })
+    },
+    onSuccess: () => {
+      queryClient.invalidateQueries({ queryKey: ['archive', 'stats'] })
+      queryClient.invalidateQueries({ queryKey: ['jobs'] })
+    },
+  })
+}
diff --git a/packages/duron-dashboard/src/views/archive-page.tsx b/packages/duron-dashboard/src/views/archive-page.tsx
new file mode 100644
index 0000000..17549df
--- /dev/null
+++ b/packages/duron-dashboard/src/views/archive-page.tsx
@@ -0,0 +1,133 @@
+'use client'
+
+import { Archive, Clock, Database, Trash2 } from 'lucide-react'
+
+import { Button } from '@/components/ui/button'
+import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card'
+import { useArchiveStats, usePruneArchive, useTruncateArchive } from '@/lib/api'
+
+export function ArchivePage() {
+  const { data: stats, isLoading } = useArchiveStats()
+  const pruneMutation = usePruneArchive()
+  const truncateMutation = useTruncateArchive()
+
+  const handlePrune = async () => {
+    const olderThan = prompt('Prune jobs older than (e.g. "7d", "1h", "30m"):', '7d')
+    if (!olderThan) return
+
+    try {
+      const result = await pruneMutation.mutateAsync({ olderThan })
+      alert(`Pruned ${result.deletedJobs} job(s)`)
+    } catch (error: any) {
+      alert(error?.message || 'Failed to prune archive')
+    }
+  }
+
+  const handleTruncate = async () => {
+    if (
+      !confirm(
+        'WARNING: This will permanently delete ALL archived jobs, steps, and spans. This action cannot be undone.\n\nAre you sure?',
+      )
+    ) {
+      return
+    }
+
+    try {
+      await truncateMutation.mutateAsync()
+      alert('Archive truncated successfully')
+    } catch (error: any) {
+      alert(error?.message || 'Failed to truncate archive')
+    }
+  }
+
+  return (
+    <div className="space-y-6 p-6">
+      <div className="flex items-start justify-between">
+        <div>
+          <h1 className="flex items-center gap-2 text-2xl font-semibold">
+            <Archive className="h-6 w-6" />
+            Archive Management
+          </h1>
+          <p className="text-sm text-muted-foreground">Manage archived jobs, steps, and spans</p>
+        </div>
+        <div className="flex gap-2">
+          <Button onClick={handlePrune}>
+            <Clock className="h-4 w-4" />
+            Prune Archive
+          </Button>
+          <Button variant="destructive" onClick={handleTruncate}>
+            <Trash2 className="h-4 w-4" />
+            Truncate Archive
+          </Button>
+        </div>
+      </div>
+
+      {isLoading ? (
+        <div>Loading stats...</div>
+      ) : (
+        <div className="grid gap-4 md:grid-cols-3">
+          <Card>
+            <CardHeader className="flex flex-row items-center justify-between space-y-0 pb-2">
+              <CardTitle className="text-sm font-medium">Archived Jobs</CardTitle>
+              <Database className="h-4 w-4 text-muted-foreground" />
+            </CardHeader>
+            <CardContent>
+              <div className="text-2xl font-bold">{stats?.jobsCount ?? 0}</div>
+              <CardDescription>Total jobs in archive</CardDescription>
+            </CardContent>
+          </Card>
+          <Card>
+            <CardHeader className="flex flex-row items-center justify-between space-y-0 pb-2">
+              <CardTitle className="text-sm font-medium">Archived Steps</CardTitle>
+              <Database className="h-4 w-4 text-muted-foreground" />
+            </CardHeader>
+            <CardContent>
+              <div className="text-2xl font-bold">{stats?.stepsCount ?? 0}</div>
+              <CardDescription>Total steps in archive</CardDescription>
+            </CardContent>
+          </Card>
+          <Card>
+            <CardHeader className="flex flex-row items-center justify-between space-y-0 pb-2">
+              <CardTitle className="text-sm font-medium">Archived Spans</CardTitle>
+              <Database className="h-4 w-4 text-muted-foreground" />
+            </CardHeader>
+            <CardContent>
+              <div className="text-2xl font-bold">{stats?.spansCount ?? 0}</div>
+              <CardDescription>Total spans in archive</CardDescription>
+            </CardContent>
+          </Card>
+          <Card>
+            <CardHeader className="flex flex-row items-center justify-between space-y-0 pb-2">
+              <CardTitle className="text-sm font-medium">Oldest Job</CardTitle>
+              <Clock className="h-4 w-4 text-muted-foreground" />
+            </CardHeader>
+            <CardContent>
+              <div className="text-2xl font-bold">
+                {stats?.oldestJobDate ? new Date(stats.oldestJobDate).toLocaleDateString() : '—'}
+              </div>
+              <CardDescription>Date of oldest archived job</CardDescription>
+            </CardContent>
+          </Card>
+          <Card>
+            <CardHeader className="flex flex-row items-center justify-between space-y-0 pb-2">
+              <CardTitle className="text-sm font-medium">Last Pruned</CardTitle>
+              <Clock className="h-4 w-4 text-muted-foreground" />
+            </CardHeader>
+            <CardContent>
+              <div className="text-2xl font-bold">
+                {stats?.lastPrunedAt ? new Date(stats.lastPrunedAt).toLocaleDateString() : '—'}
+              </div>
+              <CardDescription>When archive was last pruned</CardDescription>
+            </CardContent>
+          </Card>
+        </div>
+      )}
+    </div>
+  )
+}
diff --git a/packages/duron-dashboard/src/views/dashboard.tsx b/packages/duron-dashboard/src/views/dashboard.tsx
index acb0892..e6ace1c 100644
--- a/packages/duron-dashboard/src/views/dashboard.tsx
+++ b/packages/duron-dashboard/src/views/dashboard.tsx
@@ -1,6 +1,6 @@
 'use client'
 
-import { LogOut, MoreVertical, Plus, Trash2 } from 'lucide-react'
+import { Activity, Archive, LogOut, MoreVertical, Plus, Trash2 } from 'lucide-react'
 import { useCallback, useEffect, useMemo, useState } from 'react'
 
 import { CreateJobDialog } from '@/components/create-job-dialog'
@@ -19,9 +19,11 @@
 import { ResizablePanel, ResizablePanelGroup } from '@/components/ui/resizable'
 import { useAuth } from '@/contexts/auth-context'
 import { useLayout } from '@/contexts/layout-context'
 import { useIsMobile } from '@/hooks/use-is-mobile'
+import { useJobFilter } from '@/hooks/use-job-filter'
 import { useJobParams } from '@/hooks/use-job-params'
 import { useDeleteJobs } from '@/lib/api'
 import { cn } from '@/lib/utils'
+import { ArchivePage } from './archive-page'
 import { JobDetails } from './job-details'
 import { JobsTable } from './jobs-table'
 import { StepList } from './step-list'
@@ -38,6 +40,8 @@ export function Dashboard({ showLogo = true, enableLogin = true, showThemeToggle
   const [selectedStepId, setSelectedStepId] = useState(null)
   const [createJobDialogOpen, setCreateJobDialogOpen] = useState(false)
   const [jobDetailsVisible, setJobDetailsVisible] = useState(false)
+  const [archivePageVisible, setArchivePageVisible] = useState(false)
+  const { filter: jobFilter, setFilter: setJobFilter } = useJobFilter()
   const isMobile = useIsMobile()
   const { logout } = useAuth()
   const { config, setDesktopHorizontalSizes, setDesktopVerticalSizes, setMobileVerticalSizes } = useLayout()
@@ -166,6 +170,10 @@
             Create Job
+          <DropdownMenuItem onClick={() => setArchivePageVisible(true)}>
+            <Archive />
+            Archive
+          </DropdownMenuItem>
         {showThemeToggle && <ThemeToggle />}
+        {archivePageVisible && (
+          <div className="flex h-full flex-col">
+            <Button variant="ghost" onClick={() => setArchivePageVisible(false)}>
+              <Activity />
+              Back to jobs
+            </Button>
+            <ArchivePage />
+          </div>
+ )} + {/* Desktop Layout with Resizable Panels */} {/* Layout: [Jobs Table (top)] / [Job Details | Steps (bottom)] */} - {!isMobile && ( + {!archivePageVisible && !isMobile && ( statement-breakpoint -CREATE TABLE "duron"."job_steps" ( - "id" uuid PRIMARY KEY DEFAULT gen_random_uuid(), - "job_id" uuid NOT NULL, - "parent_step_id" uuid, - "branch" boolean DEFAULT false NOT NULL, - "name" text NOT NULL, - "status" text DEFAULT 'active' NOT NULL, - "output" jsonb, - "error" jsonb, - "started_at" timestamp with time zone DEFAULT now() NOT NULL, - "finished_at" timestamp with time zone, - "timeout_ms" integer NOT NULL, - "expires_at" timestamp with time zone, - "retries_limit" integer DEFAULT 0 NOT NULL, - "retries_count" integer DEFAULT 0 NOT NULL, - "delayed_ms" integer, - "history_failed_attempts" jsonb DEFAULT '{}' NOT NULL, - "created_at" timestamp with time zone DEFAULT now() NOT NULL, - "updated_at" timestamp with time zone DEFAULT now() NOT NULL, - CONSTRAINT "unique_job_step_name_parent" UNIQUE NULLS NOT DISTINCT("job_id","name","parent_step_id"), - CONSTRAINT "job_steps_status_check" CHECK ("status" IN ('active','completed','failed','cancelled')) -); ---> statement-breakpoint -CREATE TABLE "duron"."jobs" ( - "id" uuid PRIMARY KEY DEFAULT gen_random_uuid(), - "action_name" text NOT NULL, - "group_key" text NOT NULL, - "description" text, - "status" text DEFAULT 'created' NOT NULL, - "checksum" text NOT NULL, - "input" jsonb DEFAULT '{}' NOT NULL, - "output" jsonb, - "error" jsonb, - "timeout_ms" integer NOT NULL, - "expires_at" timestamp with time zone, - "started_at" timestamp with time zone, - "finished_at" timestamp with time zone, - "client_id" text, - "concurrency_limit" integer NOT NULL, - "concurrency_step_limit" integer NOT NULL, - "created_at" timestamp with time zone DEFAULT now() NOT NULL, - "updated_at" timestamp with time zone DEFAULT now() NOT NULL, - CONSTRAINT "jobs_status_check" CHECK ("status" IN ('created','active','completed','failed','cancelled')) -); ---> statement-breakpoint -CREATE TABLE "duron"."spans" ( - "id" bigserial PRIMARY KEY, - "trace_id" text NOT NULL, - "span_id" text NOT NULL, - "parent_span_id" text, - "job_id" uuid, - "step_id" uuid, - "name" text NOT NULL, - "kind" integer DEFAULT 0 NOT NULL, - "start_time_unix_nano" bigint NOT NULL, - "end_time_unix_nano" bigint, - "status_code" integer DEFAULT 0 NOT NULL, - "status_message" text, - "attributes" jsonb DEFAULT '{}' NOT NULL, - "events" jsonb DEFAULT '[]' NOT NULL, - CONSTRAINT "spans_kind_check" CHECK ("kind" IN (0, 1, 2, 3, 4)), - CONSTRAINT "spans_status_code_check" CHECK ("status_code" IN (0, 1, 2)) -); ---> statement-breakpoint -CREATE INDEX "idx_job_steps_job_id" ON "duron"."job_steps" ("job_id");--> statement-breakpoint -CREATE INDEX "idx_job_steps_status" ON "duron"."job_steps" ("status");--> statement-breakpoint -CREATE INDEX "idx_job_steps_name" ON "duron"."job_steps" ("name");--> statement-breakpoint -CREATE INDEX "idx_job_steps_expires_at" ON "duron"."job_steps" ("expires_at");--> statement-breakpoint -CREATE INDEX "idx_job_steps_parent_step_id" ON "duron"."job_steps" ("parent_step_id");--> statement-breakpoint -CREATE INDEX "idx_job_steps_job_status" ON "duron"."job_steps" ("job_id","status");--> statement-breakpoint -CREATE INDEX "idx_job_steps_job_name" ON "duron"."job_steps" ("job_id","name");--> statement-breakpoint -CREATE INDEX "idx_job_steps_output_fts" ON "duron"."job_steps" USING gin (to_tsvector('english', "output"::text));--> statement-breakpoint -CREATE INDEX 
"idx_jobs_action_name" ON "duron"."jobs" ("action_name");--> statement-breakpoint -CREATE INDEX "idx_jobs_status" ON "duron"."jobs" ("status");--> statement-breakpoint -CREATE INDEX "idx_jobs_group_key" ON "duron"."jobs" ("group_key");--> statement-breakpoint -CREATE INDEX "idx_jobs_description" ON "duron"."jobs" ("description");--> statement-breakpoint -CREATE INDEX "idx_jobs_started_at" ON "duron"."jobs" ("started_at");--> statement-breakpoint -CREATE INDEX "idx_jobs_finished_at" ON "duron"."jobs" ("finished_at");--> statement-breakpoint -CREATE INDEX "idx_jobs_expires_at" ON "duron"."jobs" ("expires_at");--> statement-breakpoint -CREATE INDEX "idx_jobs_client_id" ON "duron"."jobs" ("client_id");--> statement-breakpoint -CREATE INDEX "idx_jobs_checksum" ON "duron"."jobs" ("checksum");--> statement-breakpoint -CREATE INDEX "idx_jobs_concurrency_limit" ON "duron"."jobs" ("concurrency_limit");--> statement-breakpoint -CREATE INDEX "idx_jobs_concurrency_step_limit" ON "duron"."jobs" ("concurrency_step_limit");--> statement-breakpoint -CREATE INDEX "idx_jobs_action_status" ON "duron"."jobs" ("action_name","status");--> statement-breakpoint -CREATE INDEX "idx_jobs_action_group" ON "duron"."jobs" ("action_name","group_key");--> statement-breakpoint -CREATE INDEX "idx_jobs_input_fts" ON "duron"."jobs" USING gin (to_tsvector('english', "input"::text));--> statement-breakpoint -CREATE INDEX "idx_jobs_output_fts" ON "duron"."jobs" USING gin (to_tsvector('english', "output"::text));--> statement-breakpoint -CREATE INDEX "idx_spans_trace_id" ON "duron"."spans" ("trace_id");--> statement-breakpoint -CREATE INDEX "idx_spans_span_id" ON "duron"."spans" ("span_id");--> statement-breakpoint -CREATE INDEX "idx_spans_job_id" ON "duron"."spans" ("job_id");--> statement-breakpoint -CREATE INDEX "idx_spans_step_id" ON "duron"."spans" ("step_id");--> statement-breakpoint -CREATE INDEX "idx_spans_name" ON "duron"."spans" ("name");--> statement-breakpoint -CREATE INDEX "idx_spans_kind" ON "duron"."spans" ("kind");--> statement-breakpoint -CREATE INDEX "idx_spans_status_code" ON "duron"."spans" ("status_code");--> statement-breakpoint -CREATE INDEX "idx_spans_job_step" ON "duron"."spans" ("job_id","step_id");--> statement-breakpoint -CREATE INDEX "idx_spans_trace_parent" ON "duron"."spans" ("trace_id","parent_span_id");--> statement-breakpoint -CREATE INDEX "idx_spans_attributes" ON "duron"."spans" USING gin ("attributes");--> statement-breakpoint -CREATE INDEX "idx_spans_events" ON "duron"."spans" USING gin ("events");--> statement-breakpoint -ALTER TABLE "duron"."job_steps" ADD CONSTRAINT "job_steps_job_id_jobs_id_fkey" FOREIGN KEY ("job_id") REFERENCES "duron"."jobs"("id") ON DELETE CASCADE;--> statement-breakpoint -ALTER TABLE "duron"."spans" ADD CONSTRAINT "spans_job_id_jobs_id_fkey" FOREIGN KEY ("job_id") REFERENCES "duron"."jobs"("id") ON DELETE CASCADE;--> statement-breakpoint -ALTER TABLE "duron"."spans" ADD CONSTRAINT "spans_step_id_job_steps_id_fkey" FOREIGN KEY ("step_id") REFERENCES "duron"."job_steps"("id") ON DELETE CASCADE; \ No newline at end of file diff --git a/packages/duron/migrations/postgres/20260421153337_large_nitro/migration.sql b/packages/duron/migrations/postgres/20260421153337_large_nitro/migration.sql new file mode 100644 index 0000000..485ae18 --- /dev/null +++ b/packages/duron/migrations/postgres/20260421153337_large_nitro/migration.sql @@ -0,0 +1,155 @@ +CREATE SCHEMA IF NOT EXISTS "duron"; +--> statement-breakpoint +CREATE TABLE "duron"."job_steps_active" ( + "id" uuid PRIMARY KEY 
DEFAULT gen_random_uuid(), + "job_id" uuid NOT NULL, + "parent_step_id" uuid, + "branch" boolean DEFAULT false NOT NULL, + "name" text NOT NULL, + "status" text DEFAULT 'active' NOT NULL, + "output" jsonb, + "error" jsonb, + "started_at" timestamp with time zone DEFAULT now() NOT NULL, + "finished_at" timestamp with time zone, + "timeout_ms" integer NOT NULL, + "expires_at" timestamp with time zone, + "retries_limit" integer DEFAULT 0 NOT NULL, + "retries_count" integer DEFAULT 0 NOT NULL, + "delayed_ms" integer, + "history_failed_attempts" jsonb DEFAULT '{}' NOT NULL, + "created_at" timestamp with time zone DEFAULT now() NOT NULL, + "updated_at" timestamp with time zone DEFAULT now() NOT NULL, + CONSTRAINT "unique_job_step_active_name_parent" UNIQUE NULLS NOT DISTINCT("job_id","name","parent_step_id"), + CONSTRAINT "job_steps_active_status_check" CHECK ("status" IN ('active','completed','failed','cancelled')) +); +--> statement-breakpoint +CREATE TABLE "duron"."job_steps_archive" ( + "id" uuid PRIMARY KEY, + "job_id" uuid NOT NULL, + "parent_step_id" uuid, + "branch" boolean DEFAULT false NOT NULL, + "name" text NOT NULL, + "status" text DEFAULT 'active' NOT NULL, + "output" jsonb, + "error" jsonb, + "started_at" timestamp with time zone DEFAULT now() NOT NULL, + "finished_at" timestamp with time zone, + "timeout_ms" integer NOT NULL, + "expires_at" timestamp with time zone, + "retries_limit" integer DEFAULT 0 NOT NULL, + "retries_count" integer DEFAULT 0 NOT NULL, + "delayed_ms" integer, + "history_failed_attempts" jsonb DEFAULT '{}' NOT NULL, + "created_at" timestamp with time zone DEFAULT now() NOT NULL, + "updated_at" timestamp with time zone DEFAULT now() NOT NULL, + "job_finished_at" timestamp with time zone, + CONSTRAINT "job_steps_archive_status_check" CHECK ("status" IN ('active','completed','failed','cancelled')) +); +--> statement-breakpoint +CREATE TABLE "duron"."jobs_active" ( + "id" uuid PRIMARY KEY DEFAULT gen_random_uuid(), + "action_name" text NOT NULL, + "group_key" text NOT NULL, + "description" text, + "status" text DEFAULT 'created' NOT NULL, + "checksum" text NOT NULL, + "input" jsonb DEFAULT '{}' NOT NULL, + "output" jsonb, + "error" jsonb, + "timeout_ms" integer NOT NULL, + "expires_at" timestamp with time zone, + "started_at" timestamp with time zone, + "finished_at" timestamp with time zone, + "client_id" text, + "concurrency_limit" integer NOT NULL, + "concurrency_step_limit" integer NOT NULL, + "created_at" timestamp with time zone DEFAULT now() NOT NULL, + "updated_at" timestamp with time zone DEFAULT now() NOT NULL, + CONSTRAINT "jobs_active_status_check" CHECK ("status" IN ('created','active','completed','failed','cancelled')) +); +--> statement-breakpoint +CREATE TABLE "duron"."jobs_archive" ( + "id" uuid PRIMARY KEY, + "action_name" text NOT NULL, + "group_key" text NOT NULL, + "description" text, + "status" text NOT NULL, + "checksum" text NOT NULL, + "input" jsonb DEFAULT '{}' NOT NULL, + "output" jsonb, + "error" jsonb, + "timeout_ms" integer NOT NULL, + "expires_at" timestamp with time zone, + "started_at" timestamp with time zone, + "finished_at" timestamp with time zone, + "client_id" text, + "concurrency_limit" integer NOT NULL, + "concurrency_step_limit" integer NOT NULL, + "created_at" timestamp with time zone DEFAULT now() NOT NULL, + "updated_at" timestamp with time zone DEFAULT now() NOT NULL, + CONSTRAINT "jobs_archive_status_check" CHECK ("status" IN ('created','active','completed','failed','cancelled')) +); +--> statement-breakpoint +CREATE 
TABLE "duron"."spans" ( + "id" bigserial PRIMARY KEY, + "trace_id" text NOT NULL, + "span_id" text NOT NULL, + "parent_span_id" text, + "job_id" uuid, + "step_id" uuid, + "name" text NOT NULL, + "kind" integer DEFAULT 0 NOT NULL, + "start_time_unix_nano" bigint NOT NULL, + "end_time_unix_nano" bigint, + "status_code" integer DEFAULT 0 NOT NULL, + "status_message" text, + "attributes" jsonb DEFAULT '{}' NOT NULL, + "events" jsonb DEFAULT '[]' NOT NULL, + CONSTRAINT "spans_kind_check" CHECK ("kind" IN (0, 1, 2, 3, 4)), + CONSTRAINT "spans_status_code_check" CHECK ("status_code" IN (0, 1, 2)) +); +--> statement-breakpoint +CREATE INDEX "idx_job_steps_active_job_id" ON "duron"."job_steps_active" ("job_id");--> statement-breakpoint +CREATE INDEX "idx_job_steps_active_status" ON "duron"."job_steps_active" ("status");--> statement-breakpoint +CREATE INDEX "idx_job_steps_active_name" ON "duron"."job_steps_active" ("name");--> statement-breakpoint +CREATE INDEX "idx_job_steps_active_expires_at" ON "duron"."job_steps_active" ("expires_at");--> statement-breakpoint +CREATE INDEX "idx_job_steps_active_parent_step_id" ON "duron"."job_steps_active" ("parent_step_id");--> statement-breakpoint +CREATE INDEX "idx_job_steps_active_job_status" ON "duron"."job_steps_active" ("job_id","status");--> statement-breakpoint +CREATE INDEX "idx_job_steps_active_job_name" ON "duron"."job_steps_active" ("job_id","name");--> statement-breakpoint +CREATE INDEX "idx_job_steps_active_output_fts" ON "duron"."job_steps_active" USING gin (to_tsvector('english', "output"::text));--> statement-breakpoint +CREATE INDEX "idx_job_steps_archive_job_id" ON "duron"."job_steps_archive" ("job_id");--> statement-breakpoint +CREATE INDEX "idx_job_steps_archive_job_finished_at" ON "duron"."job_steps_archive" ("job_finished_at");--> statement-breakpoint +CREATE INDEX "idx_job_steps_archive_name" ON "duron"."job_steps_archive" ("name");--> statement-breakpoint +CREATE INDEX "idx_jobs_active_action_name" ON "duron"."jobs_active" ("action_name");--> statement-breakpoint +CREATE INDEX "idx_jobs_active_status" ON "duron"."jobs_active" ("status");--> statement-breakpoint +CREATE INDEX "idx_jobs_active_group_key" ON "duron"."jobs_active" ("group_key");--> statement-breakpoint +CREATE INDEX "idx_jobs_active_description" ON "duron"."jobs_active" ("description");--> statement-breakpoint +CREATE INDEX "idx_jobs_active_started_at" ON "duron"."jobs_active" ("started_at");--> statement-breakpoint +CREATE INDEX "idx_jobs_active_expires_at" ON "duron"."jobs_active" ("expires_at");--> statement-breakpoint +CREATE INDEX "idx_jobs_active_client_id" ON "duron"."jobs_active" ("client_id");--> statement-breakpoint +CREATE INDEX "idx_jobs_active_checksum" ON "duron"."jobs_active" ("checksum");--> statement-breakpoint +CREATE INDEX "idx_jobs_active_concurrency_limit" ON "duron"."jobs_active" ("concurrency_limit");--> statement-breakpoint +CREATE INDEX "idx_jobs_active_concurrency_step_limit" ON "duron"."jobs_active" ("concurrency_step_limit");--> statement-breakpoint +CREATE INDEX "idx_jobs_active_action_status" ON "duron"."jobs_active" ("action_name","status");--> statement-breakpoint +CREATE INDEX "idx_jobs_active_action_group" ON "duron"."jobs_active" ("action_name","group_key");--> statement-breakpoint +CREATE INDEX "idx_jobs_active_input_fts" ON "duron"."jobs_active" USING gin (to_tsvector('english', "input"::text));--> statement-breakpoint +CREATE INDEX "idx_jobs_active_output_fts" ON "duron"."jobs_active" USING gin (to_tsvector('english', 
"output"::text));--> statement-breakpoint +CREATE INDEX "idx_jobs_archive_group_key" ON "duron"."jobs_archive" ("group_key");--> statement-breakpoint +CREATE INDEX "idx_jobs_archive_action_name" ON "duron"."jobs_archive" ("action_name");--> statement-breakpoint +CREATE INDEX "idx_jobs_archive_finished_at" ON "duron"."jobs_archive" ("finished_at");--> statement-breakpoint +CREATE INDEX "idx_jobs_archive_action_group" ON "duron"."jobs_archive" ("action_name","group_key");--> statement-breakpoint +CREATE INDEX "idx_jobs_archive_input_fts" ON "duron"."jobs_archive" USING gin (to_tsvector('english', "input"::text));--> statement-breakpoint +CREATE INDEX "idx_jobs_archive_output_fts" ON "duron"."jobs_archive" USING gin (to_tsvector('english', "output"::text));--> statement-breakpoint +CREATE INDEX "idx_spans_trace_id" ON "duron"."spans" ("trace_id");--> statement-breakpoint +CREATE INDEX "idx_spans_span_id" ON "duron"."spans" ("span_id");--> statement-breakpoint +CREATE INDEX "idx_spans_job_id" ON "duron"."spans" ("job_id");--> statement-breakpoint +CREATE INDEX "idx_spans_step_id" ON "duron"."spans" ("step_id");--> statement-breakpoint +CREATE INDEX "idx_spans_name" ON "duron"."spans" ("name");--> statement-breakpoint +CREATE INDEX "idx_spans_kind" ON "duron"."spans" ("kind");--> statement-breakpoint +CREATE INDEX "idx_spans_status_code" ON "duron"."spans" ("status_code");--> statement-breakpoint +CREATE INDEX "idx_spans_job_step" ON "duron"."spans" ("job_id","step_id");--> statement-breakpoint +CREATE INDEX "idx_spans_trace_parent" ON "duron"."spans" ("trace_id","parent_span_id");--> statement-breakpoint +CREATE INDEX "idx_spans_attributes" ON "duron"."spans" USING gin ("attributes");--> statement-breakpoint +CREATE INDEX "idx_spans_events" ON "duron"."spans" USING gin ("events");--> statement-breakpoint +ALTER TABLE "duron"."job_steps_active" ADD CONSTRAINT "job_steps_active_job_id_jobs_active_id_fkey" FOREIGN KEY ("job_id") REFERENCES "duron"."jobs_active"("id") ON DELETE CASCADE;--> statement-breakpoint +ALTER TABLE "duron"."job_steps_archive" ADD CONSTRAINT "job_steps_archive_job_id_jobs_archive_id_fkey" FOREIGN KEY ("job_id") REFERENCES "duron"."jobs_archive"("id") ON DELETE CASCADE; \ No newline at end of file diff --git a/packages/duron/migrations/postgres/20260121160012_normal_bloodstrike/snapshot.json b/packages/duron/migrations/postgres/20260421153337_large_nitro/snapshot.json similarity index 61% rename from packages/duron/migrations/postgres/20260121160012_normal_bloodstrike/snapshot.json rename to packages/duron/migrations/postgres/20260421153337_large_nitro/snapshot.json index fe98349..9e9cacd 100644 --- a/packages/duron/migrations/postgres/20260121160012_normal_bloodstrike/snapshot.json +++ b/packages/duron/migrations/postgres/20260421153337_large_nitro/snapshot.json @@ -1,7 +1,7 @@ { "version": "8", "dialect": "postgres", - "id": "47ec47b1-f323-4e7e-a4bc-605f703aa384", + "id": "dd03307a-a6a5-4223-aa91-fbf5ad99469c", "prevIds": ["00000000-0000-0000-0000-000000000000"], "ddl": [ { @@ -10,13 +10,25 @@ }, { "isRlsEnabled": false, - "name": "job_steps", + "name": "job_steps_active", "entityType": "tables", "schema": "duron" }, { "isRlsEnabled": false, - "name": "jobs", + "name": "job_steps_archive", + "entityType": "tables", + "schema": "duron" + }, + { + "isRlsEnabled": false, + "name": "jobs_active", + "entityType": "tables", + "schema": "duron" + }, + { + "isRlsEnabled": false, + "name": "jobs_archive", "entityType": "tables", "schema": "duron" }, @@ -37,7 +49,241 @@ "name": 
"id", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "job_steps_active" + }, + { + "type": "uuid", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "job_id", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "uuid", + "typeSchema": null, + "notNull": false, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "parent_step_id", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "boolean", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "false", + "generated": null, + "identity": null, + "name": "branch", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "text", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "name", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "text", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "'active'", + "generated": null, + "identity": null, + "name": "status", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "jsonb", + "typeSchema": null, + "notNull": false, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "output", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "jsonb", + "typeSchema": null, + "notNull": false, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "error", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "timestamp with time zone", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "now()", + "generated": null, + "identity": null, + "name": "started_at", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "timestamp with time zone", + "typeSchema": null, + "notNull": false, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "finished_at", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "integer", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "timeout_ms", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "timestamp with time zone", + "typeSchema": null, + "notNull": false, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "expires_at", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "integer", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "0", + "generated": null, + "identity": null, + "name": "retries_limit", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "integer", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "0", + "generated": null, + "identity": null, + "name": "retries_count", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "integer", + "typeSchema": null, + "notNull": false, + "dimensions": 0, + "default": null, + "generated": null, + 
"identity": null, + "name": "delayed_ms", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "jsonb", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "'{}'", + "generated": null, + "identity": null, + "name": "history_failed_attempts", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "timestamp with time zone", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "now()", + "generated": null, + "identity": null, + "name": "created_at", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "timestamp with time zone", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "now()", + "generated": null, + "identity": null, + "name": "updated_at", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_active" + }, + { + "type": "uuid", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "id", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" }, { "type": "uuid", @@ -50,7 +296,7 @@ "name": "job_id", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "job_steps_archive" }, { "type": "uuid", @@ -63,20 +309,280 @@ "name": "parent_step_id", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "job_steps_archive" }, { "type": "boolean", "typeSchema": null, "notNull": true, "dimensions": 0, - "default": "false", + "default": "false", + "generated": null, + "identity": null, + "name": "branch", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "type": "text", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "name", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "type": "text", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "'active'", + "generated": null, + "identity": null, + "name": "status", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "type": "jsonb", + "typeSchema": null, + "notNull": false, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "output", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "type": "jsonb", + "typeSchema": null, + "notNull": false, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "error", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "type": "timestamp with time zone", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "now()", + "generated": null, + "identity": null, + "name": "started_at", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "type": "timestamp with time zone", + "typeSchema": null, + "notNull": false, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "finished_at", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "type": "integer", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "timeout_ms", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { 
+ "type": "timestamp with time zone", + "typeSchema": null, + "notNull": false, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "expires_at", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "type": "integer", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "0", + "generated": null, + "identity": null, + "name": "retries_limit", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "type": "integer", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "0", + "generated": null, + "identity": null, + "name": "retries_count", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "type": "integer", + "typeSchema": null, + "notNull": false, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "delayed_ms", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "type": "jsonb", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "'{}'", + "generated": null, + "identity": null, + "name": "history_failed_attempts", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "type": "timestamp with time zone", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "now()", + "generated": null, + "identity": null, + "name": "created_at", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "type": "timestamp with time zone", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "now()", + "generated": null, + "identity": null, + "name": "updated_at", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "type": "timestamp with time zone", + "typeSchema": null, + "notNull": false, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "job_finished_at", + "entityType": "columns", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "type": "uuid", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "gen_random_uuid()", + "generated": null, + "identity": null, + "name": "id", + "entityType": "columns", + "schema": "duron", + "table": "jobs_active" + }, + { + "type": "text", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "action_name", + "entityType": "columns", + "schema": "duron", + "table": "jobs_active" + }, + { + "type": "text", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "group_key", + "entityType": "columns", + "schema": "duron", + "table": "jobs_active" + }, + { + "type": "text", + "typeSchema": null, + "notNull": false, + "dimensions": 0, + "default": null, + "generated": null, + "identity": null, + "name": "description", + "entityType": "columns", + "schema": "duron", + "table": "jobs_active" + }, + { + "type": "text", + "typeSchema": null, + "notNull": true, + "dimensions": 0, + "default": "'created'", "generated": null, "identity": null, - "name": "branch", + "name": "status", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "jobs_active" }, { "type": "text", @@ -86,23 +592,23 @@ "default": null, "generated": null, "identity": null, - "name": "name", + "name": "checksum", "entityType": "columns", "schema": 
"duron", - "table": "job_steps" + "table": "jobs_active" }, { - "type": "text", + "type": "jsonb", "typeSchema": null, "notNull": true, "dimensions": 0, - "default": "'active'", + "default": "'{}'", "generated": null, "identity": null, - "name": "status", + "name": "input", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "jobs_active" }, { "type": "jsonb", @@ -115,7 +621,7 @@ "name": "output", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "jobs_active" }, { "type": "jsonb", @@ -128,20 +634,20 @@ "name": "error", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "jobs_active" }, { - "type": "timestamp with time zone", + "type": "integer", "typeSchema": null, "notNull": true, "dimensions": 0, - "default": "now()", + "default": null, "generated": null, "identity": null, - "name": "started_at", + "name": "timeout_ms", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "jobs_active" }, { "type": "timestamp with time zone", @@ -151,23 +657,23 @@ "default": null, "generated": null, "identity": null, - "name": "finished_at", + "name": "expires_at", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "jobs_active" }, { - "type": "integer", + "type": "timestamp with time zone", "typeSchema": null, - "notNull": true, + "notNull": false, "dimensions": 0, "default": null, "generated": null, "identity": null, - "name": "timeout_ms", + "name": "started_at", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "jobs_active" }, { "type": "timestamp with time zone", @@ -177,62 +683,49 @@ "default": null, "generated": null, "identity": null, - "name": "expires_at", + "name": "finished_at", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "jobs_active" }, { - "type": "integer", + "type": "text", "typeSchema": null, - "notNull": true, + "notNull": false, "dimensions": 0, - "default": "0", + "default": null, "generated": null, "identity": null, - "name": "retries_limit", + "name": "client_id", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "jobs_active" }, { "type": "integer", "typeSchema": null, "notNull": true, "dimensions": 0, - "default": "0", - "generated": null, - "identity": null, - "name": "retries_count", - "entityType": "columns", - "schema": "duron", - "table": "job_steps" - }, - { - "type": "integer", - "typeSchema": null, - "notNull": false, - "dimensions": 0, "default": null, "generated": null, "identity": null, - "name": "delayed_ms", + "name": "concurrency_limit", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "jobs_active" }, { - "type": "jsonb", + "type": "integer", "typeSchema": null, "notNull": true, "dimensions": 0, - "default": "'{}'", + "default": null, "generated": null, "identity": null, - "name": "history_failed_attempts", + "name": "concurrency_step_limit", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "jobs_active" }, { "type": "timestamp with time zone", @@ -245,7 +738,7 @@ "name": "created_at", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "jobs_active" }, { "type": "timestamp with time zone", @@ -258,20 +751,20 @@ "name": "updated_at", "entityType": "columns", "schema": "duron", - "table": "job_steps" + "table": "jobs_active" }, { "type": "uuid", "typeSchema": null, "notNull": true, "dimensions": 0, - "default": "gen_random_uuid()", + "default": null, "generated": 
null, "identity": null, "name": "id", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "text", @@ -284,7 +777,7 @@ "name": "action_name", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "text", @@ -297,7 +790,7 @@ "name": "group_key", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "text", @@ -310,20 +803,20 @@ "name": "description", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "text", "typeSchema": null, "notNull": true, "dimensions": 0, - "default": "'created'", + "default": null, "generated": null, "identity": null, "name": "status", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "text", @@ -336,7 +829,7 @@ "name": "checksum", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "jsonb", @@ -349,7 +842,7 @@ "name": "input", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "jsonb", @@ -362,7 +855,7 @@ "name": "output", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "jsonb", @@ -375,7 +868,7 @@ "name": "error", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "integer", @@ -388,7 +881,7 @@ "name": "timeout_ms", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "timestamp with time zone", @@ -401,7 +894,7 @@ "name": "expires_at", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "timestamp with time zone", @@ -414,7 +907,7 @@ "name": "started_at", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "timestamp with time zone", @@ -427,7 +920,7 @@ "name": "finished_at", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "text", @@ -440,7 +933,7 @@ "name": "client_id", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "integer", @@ -453,7 +946,7 @@ "name": "concurrency_limit", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "integer", @@ -466,7 +959,7 @@ "name": "concurrency_step_limit", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "timestamp with time zone", @@ -479,7 +972,7 @@ "name": "created_at", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "timestamp with time zone", @@ -492,7 +985,7 @@ "name": "updated_at", "entityType": "columns", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "type": "bigserial", @@ -692,10 +1185,10 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_job_steps_job_id", + "name": "idx_job_steps_active_job_id", "entityType": "indexes", "schema": "duron", - "table": "job_steps" + "table": "job_steps_active" }, { "nameExplicit": true, @@ -713,10 +1206,10 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_job_steps_status", + "name": "idx_job_steps_active_status", "entityType": "indexes", "schema": "duron", - "table": "job_steps" + "table": "job_steps_active" }, { "nameExplicit": true, @@ -734,10 +1227,10 @@ "with": "", "method": "btree", 
"concurrently": false, - "name": "idx_job_steps_name", + "name": "idx_job_steps_active_name", "entityType": "indexes", "schema": "duron", - "table": "job_steps" + "table": "job_steps_active" }, { "nameExplicit": true, @@ -755,10 +1248,10 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_job_steps_expires_at", + "name": "idx_job_steps_active_expires_at", "entityType": "indexes", "schema": "duron", - "table": "job_steps" + "table": "job_steps_active" }, { "nameExplicit": true, @@ -776,10 +1269,10 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_job_steps_parent_step_id", + "name": "idx_job_steps_active_parent_step_id", "entityType": "indexes", "schema": "duron", - "table": "job_steps" + "table": "job_steps_active" }, { "nameExplicit": true, @@ -804,10 +1297,10 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_job_steps_job_status", + "name": "idx_job_steps_active_job_status", "entityType": "indexes", "schema": "duron", - "table": "job_steps" + "table": "job_steps_active" }, { "nameExplicit": true, @@ -832,10 +1325,10 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_job_steps_job_name", + "name": "idx_job_steps_active_job_name", "entityType": "indexes", "schema": "duron", - "table": "job_steps" + "table": "job_steps_active" }, { "nameExplicit": true, @@ -853,16 +1346,16 @@ "with": "", "method": "gin", "concurrently": false, - "name": "idx_job_steps_output_fts", + "name": "idx_job_steps_active_output_fts", "entityType": "indexes", "schema": "duron", - "table": "job_steps" + "table": "job_steps_active" }, { "nameExplicit": true, "columns": [ { - "value": "action_name", + "value": "job_id", "isExpression": false, "asc": true, "nullsFirst": false, @@ -874,16 +1367,16 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_jobs_action_name", + "name": "idx_job_steps_archive_job_id", "entityType": "indexes", "schema": "duron", - "table": "jobs" + "table": "job_steps_archive" }, { "nameExplicit": true, "columns": [ { - "value": "status", + "value": "job_finished_at", "isExpression": false, "asc": true, "nullsFirst": false, @@ -895,16 +1388,16 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_jobs_status", + "name": "idx_job_steps_archive_job_finished_at", "entityType": "indexes", "schema": "duron", - "table": "jobs" + "table": "job_steps_archive" }, { "nameExplicit": true, "columns": [ { - "value": "group_key", + "value": "name", "isExpression": false, "asc": true, "nullsFirst": false, @@ -916,16 +1409,16 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_jobs_group_key", + "name": "idx_job_steps_archive_name", "entityType": "indexes", "schema": "duron", - "table": "jobs" + "table": "job_steps_archive" }, { "nameExplicit": true, "columns": [ { - "value": "description", + "value": "action_name", "isExpression": false, "asc": true, "nullsFirst": false, @@ -937,16 +1430,16 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_jobs_description", + "name": "idx_jobs_active_action_name", "entityType": "indexes", "schema": "duron", - "table": "jobs" + "table": "jobs_active" }, { "nameExplicit": true, "columns": [ { - "value": "started_at", + "value": "status", "isExpression": false, "asc": true, "nullsFirst": false, @@ -958,16 +1451,58 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_jobs_started_at", + "name": "idx_jobs_active_status", "entityType": "indexes", "schema": "duron", - "table": "jobs" + "table": 
"jobs_active" }, { "nameExplicit": true, "columns": [ { - "value": "finished_at", + "value": "group_key", + "isExpression": false, + "asc": true, + "nullsFirst": false, + "opclass": null + } + ], + "isUnique": false, + "where": null, + "with": "", + "method": "btree", + "concurrently": false, + "name": "idx_jobs_active_group_key", + "entityType": "indexes", + "schema": "duron", + "table": "jobs_active" + }, + { + "nameExplicit": true, + "columns": [ + { + "value": "description", + "isExpression": false, + "asc": true, + "nullsFirst": false, + "opclass": null + } + ], + "isUnique": false, + "where": null, + "with": "", + "method": "btree", + "concurrently": false, + "name": "idx_jobs_active_description", + "entityType": "indexes", + "schema": "duron", + "table": "jobs_active" + }, + { + "nameExplicit": true, + "columns": [ + { + "value": "started_at", "isExpression": false, "asc": true, "nullsFirst": false, @@ -979,10 +1514,10 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_jobs_finished_at", + "name": "idx_jobs_active_started_at", "entityType": "indexes", "schema": "duron", - "table": "jobs" + "table": "jobs_active" }, { "nameExplicit": true, @@ -1000,10 +1535,10 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_jobs_expires_at", + "name": "idx_jobs_active_expires_at", "entityType": "indexes", "schema": "duron", - "table": "jobs" + "table": "jobs_active" }, { "nameExplicit": true, @@ -1021,10 +1556,10 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_jobs_client_id", + "name": "idx_jobs_active_client_id", "entityType": "indexes", "schema": "duron", - "table": "jobs" + "table": "jobs_active" }, { "nameExplicit": true, @@ -1042,10 +1577,10 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_jobs_checksum", + "name": "idx_jobs_active_checksum", "entityType": "indexes", "schema": "duron", - "table": "jobs" + "table": "jobs_active" }, { "nameExplicit": true, @@ -1063,10 +1598,10 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_jobs_concurrency_limit", + "name": "idx_jobs_active_concurrency_limit", "entityType": "indexes", "schema": "duron", - "table": "jobs" + "table": "jobs_active" }, { "nameExplicit": true, @@ -1084,10 +1619,10 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_jobs_concurrency_step_limit", + "name": "idx_jobs_active_concurrency_step_limit", "entityType": "indexes", "schema": "duron", - "table": "jobs" + "table": "jobs_active" }, { "nameExplicit": true, @@ -1112,10 +1647,143 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_jobs_action_status", + "name": "idx_jobs_active_action_status", + "entityType": "indexes", + "schema": "duron", + "table": "jobs_active" + }, + { + "nameExplicit": true, + "columns": [ + { + "value": "action_name", + "isExpression": false, + "asc": true, + "nullsFirst": false, + "opclass": null + }, + { + "value": "group_key", + "isExpression": false, + "asc": true, + "nullsFirst": false, + "opclass": null + } + ], + "isUnique": false, + "where": null, + "with": "", + "method": "btree", + "concurrently": false, + "name": "idx_jobs_active_action_group", + "entityType": "indexes", + "schema": "duron", + "table": "jobs_active" + }, + { + "nameExplicit": true, + "columns": [ + { + "value": "to_tsvector('english', \"input\"::text)", + "isExpression": true, + "asc": true, + "nullsFirst": false, + "opclass": null + } + ], + "isUnique": false, + "where": null, + "with": "", + "method": "gin", + "concurrently": 
false, + "name": "idx_jobs_active_input_fts", + "entityType": "indexes", + "schema": "duron", + "table": "jobs_active" + }, + { + "nameExplicit": true, + "columns": [ + { + "value": "to_tsvector('english', \"output\"::text)", + "isExpression": true, + "asc": true, + "nullsFirst": false, + "opclass": null + } + ], + "isUnique": false, + "where": null, + "with": "", + "method": "gin", + "concurrently": false, + "name": "idx_jobs_active_output_fts", + "entityType": "indexes", + "schema": "duron", + "table": "jobs_active" + }, + { + "nameExplicit": true, + "columns": [ + { + "value": "group_key", + "isExpression": false, + "asc": true, + "nullsFirst": false, + "opclass": null + } + ], + "isUnique": false, + "where": null, + "with": "", + "method": "btree", + "concurrently": false, + "name": "idx_jobs_archive_group_key", + "entityType": "indexes", + "schema": "duron", + "table": "jobs_archive" + }, + { + "nameExplicit": true, + "columns": [ + { + "value": "action_name", + "isExpression": false, + "asc": true, + "nullsFirst": false, + "opclass": null + } + ], + "isUnique": false, + "where": null, + "with": "", + "method": "btree", + "concurrently": false, + "name": "idx_jobs_archive_action_name", + "entityType": "indexes", + "schema": "duron", + "table": "jobs_archive" + }, + { + "nameExplicit": true, + "columns": [ + { + "value": "finished_at", + "isExpression": false, + "asc": true, + "nullsFirst": false, + "opclass": null + } + ], + "isUnique": false, + "where": null, + "with": "", + "method": "btree", + "concurrently": false, + "name": "idx_jobs_archive_finished_at", "entityType": "indexes", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "nameExplicit": true, @@ -1140,10 +1808,10 @@ "with": "", "method": "btree", "concurrently": false, - "name": "idx_jobs_action_group", + "name": "idx_jobs_archive_action_group", "entityType": "indexes", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "nameExplicit": true, @@ -1161,10 +1829,10 @@ "with": "", "method": "gin", "concurrently": false, - "name": "idx_jobs_input_fts", + "name": "idx_jobs_archive_input_fts", "entityType": "indexes", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "nameExplicit": true, @@ -1182,10 +1850,10 @@ "with": "", "method": "gin", "concurrently": false, - "name": "idx_jobs_output_fts", + "name": "idx_jobs_archive_output_fts", "entityType": "indexes", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "nameExplicit": true, @@ -1436,55 +2104,58 @@ "nameExplicit": false, "columns": ["job_id"], "schemaTo": "duron", - "tableTo": "jobs", + "tableTo": "jobs_active", "columnsTo": ["id"], "onUpdate": "NO ACTION", "onDelete": "CASCADE", - "name": "job_steps_job_id_jobs_id_fkey", + "name": "job_steps_active_job_id_jobs_active_id_fkey", "entityType": "fks", "schema": "duron", - "table": "job_steps" + "table": "job_steps_active" }, { "nameExplicit": false, "columns": ["job_id"], "schemaTo": "duron", - "tableTo": "jobs", + "tableTo": "jobs_archive", "columnsTo": ["id"], "onUpdate": "NO ACTION", "onDelete": "CASCADE", - "name": "spans_job_id_jobs_id_fkey", + "name": "job_steps_archive_job_id_jobs_archive_id_fkey", "entityType": "fks", "schema": "duron", - "table": "spans" + "table": "job_steps_archive" }, { + "columns": ["id"], "nameExplicit": false, - "columns": ["step_id"], - "schemaTo": "duron", - "tableTo": "job_steps", - "columnsTo": ["id"], - "onUpdate": "NO ACTION", - "onDelete": "CASCADE", - "name": "spans_step_id_job_steps_id_fkey", - "entityType": "fks", 
+ "name": "job_steps_active_pkey", "schema": "duron", - "table": "spans" + "table": "job_steps_active", + "entityType": "pks" }, { "columns": ["id"], "nameExplicit": false, - "name": "job_steps_pkey", + "name": "job_steps_archive_pkey", "schema": "duron", - "table": "job_steps", + "table": "job_steps_archive", "entityType": "pks" }, { "columns": ["id"], "nameExplicit": false, - "name": "jobs_pkey", + "name": "jobs_active_pkey", "schema": "duron", - "table": "jobs", + "table": "jobs_active", + "entityType": "pks" + }, + { + "columns": ["id"], + "nameExplicit": false, + "name": "jobs_archive_pkey", + "schema": "duron", + "table": "jobs_archive", "entityType": "pks" }, { @@ -1499,24 +2170,38 @@ "nameExplicit": true, "columns": ["job_id", "name", "parent_step_id"], "nullsNotDistinct": true, - "name": "unique_job_step_name_parent", + "name": "unique_job_step_active_name_parent", "entityType": "uniques", "schema": "duron", - "table": "job_steps" + "table": "job_steps_active" + }, + { + "value": "\"status\" IN ('active','completed','failed','cancelled')", + "name": "job_steps_active_status_check", + "entityType": "checks", + "schema": "duron", + "table": "job_steps_active" }, { "value": "\"status\" IN ('active','completed','failed','cancelled')", - "name": "job_steps_status_check", + "name": "job_steps_archive_status_check", + "entityType": "checks", + "schema": "duron", + "table": "job_steps_archive" + }, + { + "value": "\"status\" IN ('created','active','completed','failed','cancelled')", + "name": "jobs_active_status_check", "entityType": "checks", "schema": "duron", - "table": "job_steps" + "table": "jobs_active" }, { "value": "\"status\" IN ('created','active','completed','failed','cancelled')", - "name": "jobs_status_check", + "name": "jobs_archive_status_check", "entityType": "checks", "schema": "duron", - "table": "jobs" + "table": "jobs_archive" }, { "value": "\"kind\" IN (0, 1, 2, 3, 4)", diff --git a/packages/duron/src/adapters/adapter.ts b/packages/duron/src/adapters/adapter.ts index f4334d6..d2b0c52 100644 --- a/packages/duron/src/adapters/adapter.ts +++ b/packages/duron/src/adapters/adapter.ts @@ -14,6 +14,7 @@ import { type StepStatus, } from '../constants.js' import type { + ArchiveStats, CancelJobOptions, CancelJobStepOptions, CompleteJobOptions, @@ -40,11 +41,13 @@ import type { JobStatusResult, JobStep, JobStepStatusResult, + PruneArchiveOptions, RecoverJobsOptions, RetryJobOptions, TimeTravelJobOptions, } from './schemas.js' import { + ArchiveStatsSchema, BooleanResultSchema, CancelJobOptionsSchema, CancelJobStepOptionsSchema, @@ -75,6 +78,7 @@ import { JobStepStatusResultSchema, JobsArrayResultSchema, NumberResultSchema, + PruneArchiveOptionsSchema, RecoverJobsOptionsSchema, RetryJobOptionsSchema, TimeTravelJobOptionsSchema, @@ -83,6 +87,7 @@ import { // Re-export types from schemas for backward compatibility export type { ActionStats, + ArchiveStats, CancelJobOptions, CancelJobStepOptions, CompleteJobOptions, @@ -112,6 +117,7 @@ export type { JobStatusResult, JobStep, JobStepStatusResult, + PruneArchiveOptions, RecoverJobsOptions, RetryJobOptions, SortOrder, @@ -1088,6 +1094,85 @@ export abstract class Adapter extends EventEmitter { */ protected abstract _deleteSpans(options: DeleteSpansOptions): Promise + // ============================================================================ + // Archive Methods + // ============================================================================ + + /** + * Prune archived jobs older than the specified threshold. 
+ * + * @param options - Prune options including olderThan, batchSize, maxBatches + * @returns Promise resolving to the number of jobs deleted + */ + async pruneArchive(options: PruneArchiveOptions): Promise<number> { + try { + await this.start() + const parsedOptions = PruneArchiveOptionsSchema.parse(options) + const result = await this._pruneArchive(parsedOptions) + return NumberResultSchema.parse(result) + } catch (error) { + this.logger?.error(error, 'Error in Adapter.pruneArchive()') + throw error + } + } + + /** + * Truncate all archive tables (nuclear option). + * + * @returns Promise resolving to void + */ + async truncateArchive(): Promise<void> { + try { + await this.start() + await this._truncateArchive() + } catch (error) { + this.logger?.error(error, 'Error in Adapter.truncateArchive()') + throw error + } + } + + /** + * Get archive statistics. + * + * @returns Promise resolving to archive stats + */ + async getArchiveStats(): Promise<ArchiveStats> { + try { + await this.start() + const result = await this._getArchiveStats() + return ArchiveStatsSchema.parse(result) + } catch (error) { + this.logger?.error(error, 'Error in Adapter.getArchiveStats()') + throw error + } + } + + // ============================================================================ + // Private Archive Methods (to be implemented by adapters) + // ============================================================================ + + /** + * Internal method to prune archived jobs. + * + * @param options - Validated prune options + * @returns Promise resolving to the number of jobs deleted + */ + protected abstract _pruneArchive(options: PruneArchiveOptions): Promise<number> + + /** + * Internal method to truncate all archive tables. + * + * @returns Promise resolving to void + */ + protected abstract _truncateArchive(): Promise<void> + + /** + * Internal method to get archive statistics. 
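+ *
+ * A sketch of one possible Postgres implementation, a single aggregate
+ * query; the concrete ArchiveStats field names come from ArchiveStatsSchema
+ * in schemas.ts (not shown here), so the column aliases below are assumed:
+ *
+ * @example
+ * // SELECT
+ * //   (SELECT count(*) FROM duron.jobs_archive)      AS jobs,
+ * //   (SELECT count(*) FROM duron.job_steps_archive) AS steps;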
+ * + * @returns Promise resolving to archive stats + */ + protected abstract _getArchiveStats(): Promise<ArchiveStats> + // ============================================================================ // Protected Abstract Methods (to be implemented by adapters) // ============================================================================ diff --git a/packages/duron/src/adapters/postgres/base.ts b/packages/duron/src/adapters/postgres/base.ts index 7e0c905..2d7db54 100644 --- a/packages/duron/src/adapters/postgres/base.ts +++ b/packages/duron/src/adapters/postgres/base.ts @@ -14,6 +14,7 @@ import { } from '../../constants.js' import { Adapter, + type ArchiveStats, type CancelJobOptions, type CancelJobStepOptions, type CompleteJobOptions, @@ -37,10 +38,10 @@ import { type GetSpansResult, type InsertSpanOptions, type Job, - type JobSort, type JobStatusResult, type JobStep, type JobStepStatusResult, + type PruneArchiveOptions, type RecoverJobsOptions, type RetryJobOptions, type SpanSort, @@ -55,11 +56,19 @@ export type { Job, JobStep } from '../adapter.js' type DrizzleDatabase = PgAsyncDatabase +export interface PruneSchedulerConfig { + olderThan: string | Date | number + intervalMs: number + batchSize?: number + maxBatches?: number +} + export interface AdapterOptions { connection: Connection schema?: string migrateOnStart?: boolean migrationsFolder?: string + pruneArchive?: PruneSchedulerConfig } export class PostgresBaseAdapter extends Adapter { @@ -69,6 +78,11 @@ export class PostgresBaseAdapter e protected schema: string = 'duron' protected migrateOnStart: boolean = true + // Scheduler state + private pruneTimer: ReturnType<typeof setInterval> | null = null + private pruneConfig: PruneSchedulerConfig | null = null + private lastPrunedAt: Date | null = null + // ============================================================================ // Constructor // ============================================================================ @@ -84,6 +98,7 @@ export class PostgresBaseAdapter e this.connection = options.connection this.schema = options.schema ?? 'duron' this.migrateOnStart = options.migrateOnStart ?? true + this.pruneConfig = options.pruneArchive ?? null this.tables = createSchema(this.schema) @@ -126,10 +141,80 @@ export class PostgresBaseAdapter e this.emit('job-available', { jobId }) } }) + + // Start archive prune scheduler if configured + this._startScheduler() } protected async _stop() { - // do nothing + this._stopScheduler() + } + + // ============================================================================ + // Scheduler Methods + // ============================================================================ + + /** + * Generate a consistent advisory lock key from the schema name. + */ + private _advisoryLockKey(): number { + let hash = 0 + for (let i = 0; i < this.schema.length; i++) { + hash = (hash << 5) - hash + this.schema.charCodeAt(i) + hash |= 0 + } + return Math.abs(hash) + } + + /** + * Start the archive prune scheduler. 
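+ *
+ * With several adapter instances sharing one database, pg_try_advisory_lock
+ * ensures at most one of them runs the prune per tick; the others skip and
+ * retry on the next interval. Per-tick sequence (a sketch; the lock key is
+ * derived from the schema name by _advisoryLockKey, and the prune SQL itself
+ * is adapter-specific):
+ *
+ * @example
+ * // SELECT pg_try_advisory_lock(key)  -- false: another process is pruning, skip
+ * // await this._pruneArchive({ olderThan, batchSize, maxBatches })
+ * // SELECT pg_advisory_unlock(key)    -- always released in a finally block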
+ */ + private _startScheduler(): void { + const config = this.pruneConfig + if (!config) return + + const run = async () => { + try { + // Try to acquire advisory lock + const lockResult = await this.db.execute<{ pg_try_advisory_lock: boolean }>( + sql`SELECT pg_try_advisory_lock(${this._advisoryLockKey()})`, + ) + + if (!lockResult[0]?.pg_try_advisory_lock) { + this.logger?.debug('Another process holds the prune lock, skipping') + return + } + + try { + this.logger?.info('Running scheduled archive prune') + const deleted = await this._pruneArchive({ + olderThan: config.olderThan, + batchSize: config.batchSize, + maxBatches: config.maxBatches, + }) + this.lastPrunedAt = new Date() + this.logger?.info({ deletedJobs: deleted }, 'Archive prune completed') + } finally { + await this.db.execute(sql`SELECT pg_advisory_unlock(${this._advisoryLockKey()})`) + } + } catch (error) { + this.logger?.error(error, 'Error in prune scheduler') + } + } + + // Run immediately on start, then on interval + run().catch((err) => this.logger?.error(err, 'Initial prune run failed')) + this.pruneTimer = setInterval(run, config.intervalMs) + } + + /** + * Stop the archive prune scheduler. + */ + private _stopScheduler(): void { + if (this.pruneTimer) { + clearInterval(this.pruneTimer) + this.pruneTimer = null + } } // ============================================================================ @@ -152,7 +237,7 @@ export class PostgresBaseAdapter e description, }: CreateJobOptions) { const [result] = await this.db - .insert(this.tables.jobsTable) + .insert(this.tables.jobsActiveTable) .values({ action_name: queue, group_key: groupKey, @@ -164,7 +249,7 @@ export class PostgresBaseAdapter e concurrency_limit: concurrencyLimit, concurrency_step_limit: concurrencyStepLimit, }) - .returning({ id: this.tables.jobsTable.id }) + .returning({ id: this.tables.jobsActiveTable.id }) if (!result) { return null @@ -179,25 +264,57 @@ export class PostgresBaseAdapter e * @returns Promise resolving to `true` if completed, `false` otherwise */ protected async _completeJob({ jobId, output }: CompleteJobOptions) { - const result = await this.db - .update(this.tables.jobsTable) - .set({ + return this.db.transaction(async (tx) => { + const finishedAt = new Date() + + // 1. Check job exists and meets conditions before archiving + const [job] = await tx + .select() + .from(this.tables.jobsActiveTable) + .where( + and( + eq(this.tables.jobsActiveTable.id, jobId), + eq(this.tables.jobsActiveTable.status, JOB_STATUS_ACTIVE), + eq(this.tables.jobsActiveTable.client_id, this.id), + gt(this.tables.jobsActiveTable.expires_at, sql`now()`), + ), + ) + + if (!job) { + return false + } + + // 2. Insert job into archive FIRST (required for FK constraints) + await tx.insert(this.tables.jobsArchiveTable).values({ + ...job, status: JOB_STATUS_COMPLETED, output, - finished_at: sql`now()`, - updated_at: sql`now()`, + finished_at: finishedAt, + updated_at: finishedAt, }) - .where( - and( - eq(this.tables.jobsTable.id, jobId), - eq(this.tables.jobsTable.status, JOB_STATUS_ACTIVE), - eq(this.tables.jobsTable.client_id, this.id), - gt(this.tables.jobsTable.expires_at, sql`now()`), - ), - ) - .returning({ id: this.tables.jobsTable.id }) - return result.length > 0 + // 3. Archive steps using INSERT ... 
SELECT (SQL-native, no JS round-trip) + await tx.execute(sql` + INSERT INTO ${this.tables.jobStepsArchiveTable} ( + id, job_id, parent_step_id, branch, name, status, output, error, + started_at, finished_at, timeout_ms, expires_at, retries_limit, + retries_count, delayed_ms, history_failed_attempts, created_at, + updated_at, job_finished_at + ) + SELECT + id, job_id, parent_step_id, branch, name, status, output, error, + started_at, finished_at, timeout_ms, expires_at, retries_limit, + retries_count, delayed_ms, history_failed_attempts, created_at, + updated_at, ${finishedAt.toISOString()} + FROM ${this.tables.jobStepsActiveTable} + WHERE job_id = ${jobId} + `) + + // 4. Delete job from active (cascade deletes steps) + await tx.delete(this.tables.jobsActiveTable).where(eq(this.tables.jobsActiveTable.id, jobId)) + + return true + }) } /** @@ -206,24 +323,56 @@ export class PostgresBaseAdapter e * @returns Promise resolving to `true` if failed, `false` otherwise */ protected async _failJob({ jobId, error }: FailJobOptions) { - const result = await this.db - .update(this.tables.jobsTable) - .set({ + return this.db.transaction(async (tx) => { + const finishedAt = new Date() + + // 1. Check job exists before archiving + const [job] = await tx + .select() + .from(this.tables.jobsActiveTable) + .where( + and( + eq(this.tables.jobsActiveTable.id, jobId), + eq(this.tables.jobsActiveTable.status, JOB_STATUS_ACTIVE), + eq(this.tables.jobsActiveTable.client_id, this.id), + ), + ) + + if (!job) { + return false + } + + // 2. Insert job into archive FIRST (required for FK constraints) + await tx.insert(this.tables.jobsArchiveTable).values({ + ...job, status: JOB_STATUS_FAILED, error, - finished_at: sql`now()`, - updated_at: sql`now()`, + finished_at: finishedAt, + updated_at: finishedAt, }) - .where( - and( - eq(this.tables.jobsTable.id, jobId), - eq(this.tables.jobsTable.status, JOB_STATUS_ACTIVE), - eq(this.tables.jobsTable.client_id, this.id), - ), - ) - .returning({ id: this.tables.jobsTable.id }) - return result.length > 0 + // 3. Archive steps using INSERT ... SELECT + await tx.execute(sql` + INSERT INTO ${this.tables.jobStepsArchiveTable} ( + id, job_id, parent_step_id, branch, name, status, output, error, + started_at, finished_at, timeout_ms, expires_at, retries_limit, + retries_count, delayed_ms, history_failed_attempts, created_at, + updated_at, job_finished_at + ) + SELECT + id, job_id, parent_step_id, branch, name, status, output, error, + started_at, finished_at, timeout_ms, expires_at, retries_limit, + retries_count, delayed_ms, history_failed_attempts, created_at, + updated_at, ${finishedAt.toISOString()} + FROM ${this.tables.jobStepsActiveTable} + WHERE job_id = ${jobId} + `) + + // 4. Delete job from active (cascade deletes steps) + await tx.delete(this.tables.jobsActiveTable).where(eq(this.tables.jobsActiveTable.id, jobId)) + + return true + }) } /** @@ -232,22 +381,67 @@ export class PostgresBaseAdapter e * @returns Promise resolving to `true` if cancelled, `false` otherwise */ protected async _cancelJob({ jobId }: CancelJobOptions) { - const result = await this.db - .update(this.tables.jobsTable) - .set({ + return this.db.transaction(async (tx) => { + const finishedAt = new Date() + + // 1. Update all steps to cancelled status + await tx + .update(this.tables.jobStepsActiveTable) + .set({ + status: STEP_STATUS_CANCELLED, + finished_at: finishedAt, + updated_at: finishedAt, + }) + .where(eq(this.tables.jobStepsActiveTable.job_id, jobId)) + + // 2. 
Check job exists before archiving + const [job] = await tx + .select() + .from(this.tables.jobsActiveTable) + .where( + and( + eq(this.tables.jobsActiveTable.id, jobId), + or( + eq(this.tables.jobsActiveTable.status, JOB_STATUS_ACTIVE), + eq(this.tables.jobsActiveTable.status, JOB_STATUS_CREATED), + ), + ), + ) + + if (!job) { + return false + } + + // 3. Insert job into archive FIRST (required for FK constraints) + await tx.insert(this.tables.jobsArchiveTable).values({ + ...job, status: JOB_STATUS_CANCELLED, - finished_at: sql`now()`, - updated_at: sql`now()`, + finished_at: finishedAt, + updated_at: finishedAt, }) - .where( - and( - eq(this.tables.jobsTable.id, jobId), - or(eq(this.tables.jobsTable.status, JOB_STATUS_ACTIVE), eq(this.tables.jobsTable.status, JOB_STATUS_CREATED)), - ), - ) - .returning({ id: this.tables.jobsTable.id }) - return result.length > 0 + // 4. Archive steps using INSERT ... SELECT + await tx.execute(sql` + INSERT INTO ${this.tables.jobStepsArchiveTable} ( + id, job_id, parent_step_id, branch, name, status, output, error, + started_at, finished_at, timeout_ms, expires_at, retries_limit, + retries_count, delayed_ms, history_failed_attempts, created_at, + updated_at, job_finished_at + ) + SELECT + id, job_id, parent_step_id, branch, name, status, output, error, + started_at, finished_at, timeout_ms, expires_at, retries_limit, + retries_count, delayed_ms, history_failed_attempts, created_at, + updated_at, ${finishedAt.toISOString()} + FROM ${this.tables.jobStepsActiveTable} + WHERE job_id = ${jobId} + `) + + // 5. Delete job from active (cascade deletes steps) + await tx.delete(this.tables.jobsActiveTable).where(eq(this.tables.jobsActiveTable.id, jobId)) + + return true + }) } /** @@ -272,15 +466,14 @@ j.created_at, j.concurrency_limit, j.concurrency_step_limit - FROM ${this.tables.jobsTable} j + FROM ${this.tables.jobsArchiveTable} j WHERE j.id = ${jobId} AND j.status IN (${JOB_STATUS_COMPLETED}, ${JOB_STATUS_CANCELLED}, ${JOB_STATUS_FAILED}) - FOR UPDATE OF j SKIP LOCKED ), existing_retry AS ( -- Check if a retry already exists (a newer job with same checksum, group_key, and input) SELECT j.id - FROM ${this.tables.jobsTable} j + FROM ${this.tables.jobsActiveTable} j INNER JOIN locked_source ls ON j.action_name = ls.action_name AND j.group_key = ls.group_key @@ -293,7 +486,7 @@ inserted_retry AS ( -- Insert the retry only if no existing retry was found -- Get concurrency_limit from the latest job at insertion time to avoid stale values - INSERT INTO ${this.tables.jobsTable} ( + INSERT INTO ${this.tables.jobsActiveTable} ( action_name, group_key, description, @@ -315,7 +508,7 @@ COALESCE( ( SELECT j.concurrency_limit - FROM ${this.tables.jobsTable} j + FROM ${this.tables.jobsActiveTable} j WHERE j.action_name = ls.action_name AND j.group_key = ls.group_key AND (j.expires_at IS NULL OR j.expires_at > now()) @@ -362,41 +555,108 @@ * @returns Promise resolving to `true` if time travel succeeded, `false` otherwise */ protected async _timeTravelJob({ jobId, stepId }: TimeTravelJobOptions): Promise<boolean> { - const result = this._map( - await this.db.execute<{ success: boolean }>(sql` - WITH RECURSIVE - -- Lock and validate the job - locked_job AS ( - SELECT j.id - FROM ${this.tables.jobsTable} j - WHERE j.id = ${jobId} - AND j.status IN (${JOB_STATUS_COMPLETED}, ${JOB_STATUS_FAILED}, ${JOB_STATUS_CANCELLED}) - FOR UPDATE OF j - ), - -- 
Validate target step exists and belongs to job - target_step AS ( - SELECT s.id, s.parent_step_id, s.created_at - FROM ${this.tables.jobStepsTable} s - WHERE s.id = ${stepId} - AND s.job_id = ${jobId} - AND EXISTS (SELECT 1 FROM locked_job) - ), + return this.db.transaction(async (tx) => { + // First, check if the job is in the archive and restore it if needed + const archivedJob = await tx + .select() + .from(this.tables.jobsArchiveTable) + .where(eq(this.tables.jobsArchiveTable.id, jobId)) + .limit(1) + + if (archivedJob.length > 0) { + // Restore job from archive to active + const job = archivedJob[0]! + await tx.insert(this.tables.jobsActiveTable).values({ + id: job.id, + action_name: job.action_name, + group_key: job.group_key, + description: job.description, + status: job.status, + checksum: job.checksum, + input: job.input, + output: job.output, + error: job.error, + timeout_ms: job.timeout_ms, + expires_at: job.expires_at, + started_at: job.started_at, + finished_at: job.finished_at, + client_id: job.client_id, + concurrency_limit: job.concurrency_limit, + concurrency_step_limit: job.concurrency_step_limit, + created_at: job.created_at, + updated_at: job.updated_at, + }) + + // Restore steps from archive to active + const archivedSteps = await tx + .select() + .from(this.tables.jobStepsArchiveTable) + .where(eq(this.tables.jobStepsArchiveTable.job_id, jobId)) + + if (archivedSteps.length > 0) { + await tx.insert(this.tables.jobStepsActiveTable).values( + archivedSteps.map((s) => ({ + id: s.id, + job_id: s.job_id, + parent_step_id: s.parent_step_id, + branch: s.branch, + name: s.name, + status: s.status, + output: s.output, + error: s.error, + started_at: s.started_at, + finished_at: s.finished_at, + timeout_ms: s.timeout_ms, + expires_at: s.expires_at, + retries_limit: s.retries_limit, + retries_count: s.retries_count, + delayed_ms: s.delayed_ms, + history_failed_attempts: s.history_failed_attempts, + created_at: s.created_at, + updated_at: s.updated_at, + })), + ) + } + + // Delete archived job and steps (cascade via FK on steps) + await tx.delete(this.tables.jobsArchiveTable).where(eq(this.tables.jobsArchiveTable.id, jobId)) + } + + const result = this._map( + await tx.execute<{ success: boolean }>(sql` + WITH RECURSIVE + -- Lock and validate the job + locked_job AS ( + SELECT j.id + FROM ${this.tables.jobsActiveTable} j + WHERE j.id = ${jobId} + AND j.status IN (${JOB_STATUS_COMPLETED}, ${JOB_STATUS_FAILED}, ${JOB_STATUS_CANCELLED}) + FOR UPDATE OF j + ), + -- Validate target step exists and belongs to job + target_step AS ( + SELECT s.id, s.parent_step_id, s.created_at + FROM ${this.tables.jobStepsActiveTable} s + WHERE s.id = ${stepId} + AND s.job_id = ${jobId} + AND EXISTS (SELECT 1 FROM locked_job) + ), -- Find all ancestor steps recursively (from target up to root) ancestors AS ( SELECT s.id, s.parent_step_id, 0 AS depth - FROM ${this.tables.jobStepsTable} s + FROM ${this.tables.jobStepsActiveTable} s WHERE s.id = (SELECT parent_step_id FROM target_step) AND EXISTS (SELECT 1 FROM target_step) UNION ALL SELECT s.id, s.parent_step_id, a.depth + 1 - FROM ${this.tables.jobStepsTable} s + FROM ${this.tables.jobStepsActiveTable} s INNER JOIN ancestors a ON s.id = a.parent_step_id ), -- Steps to keep: completed steps created before target + completed parallel siblings of target and ancestors + their descendants parallel_siblings AS ( -- Completed parallel siblings of target step SELECT s.id - FROM ${this.tables.jobStepsTable} s + FROM ${this.tables.jobStepsActiveTable} s CROSS 
JOIN target_step ts WHERE s.job_id = ${jobId} AND s.id != ts.id @@ -409,7 +669,7 @@ UNION -- Completed parallel siblings of each ancestor SELECT s.id - FROM ${this.tables.jobStepsTable} s + FROM ${this.tables.jobStepsActiveTable} s INNER JOIN ancestors a ON ( (s.parent_step_id IS NULL AND a.parent_step_id IS NULL) OR s.parent_step_id = a.parent_step_id @@ -422,18 +682,18 @@ -- Find all descendants of parallel siblings (to keep their children too) parallel_descendants AS ( SELECT s.id - FROM ${this.tables.jobStepsTable} s + FROM ${this.tables.jobStepsActiveTable} s WHERE s.id IN (SELECT id FROM parallel_siblings) UNION ALL SELECT s.id - FROM ${this.tables.jobStepsTable} s + FROM ${this.tables.jobStepsActiveTable} s INNER JOIN parallel_descendants pd ON s.parent_step_id = pd.id WHERE s.job_id = ${jobId} ), steps_to_keep AS ( -- Steps created before target that are completed (non-ancestor, non-target) SELECT s.id - FROM ${this.tables.jobStepsTable} s + FROM ${this.tables.jobStepsActiveTable} s CROSS JOIN target_step ts WHERE s.job_id = ${jobId} AND s.created_at < ts.created_at @@ -448,12 +708,12 @@ time_offset AS ( SELECT now() - MIN(s.started_at) AS offset_interval - FROM ${this.tables.jobStepsTable} s + FROM ${this.tables.jobStepsActiveTable} s WHERE s.id IN (SELECT id FROM steps_to_keep) ), -- Shift times of preserved steps to align with current time (only started_at/finished_at, NOT created_at to preserve ordering) shift_preserved_times AS ( - UPDATE ${this.tables.jobStepsTable} + UPDATE ${this.tables.jobStepsActiveTable} SET started_at = started_at + (SELECT offset_interval FROM time_offset), finished_at = CASE @@ -468,7 +728,7 @@ ), -- Delete steps that are not in the keep list and are not ancestors/target deleted_steps AS ( - DELETE FROM ${this.tables.jobStepsTable} + DELETE FROM ${this.tables.jobStepsActiveTable} WHERE job_id = ${jobId} AND id NOT IN (SELECT id FROM steps_to_keep) AND id NOT IN (SELECT id FROM ancestors) @@ -477,7 +737,7 @@ ), -- Reset ancestor steps to active reset_ancestors AS ( - UPDATE ${this.tables.jobStepsTable} + UPDATE ${this.tables.jobStepsActiveTable} SET status = ${STEP_STATUS_ACTIVE}, output = NULL, @@ -494,7 +754,7 @@ ), -- Reset target step to active reset_target AS ( - UPDATE ${this.tables.jobStepsTable} + UPDATE ${this.tables.jobStepsActiveTable} SET status = ${STEP_STATUS_ACTIVE}, output = NULL, @@ -511,7 +771,7 @@ ), -- Reset job to created status reset_job AS ( - UPDATE ${this.tables.jobsTable} + UPDATE ${this.tables.jobsActiveTable} SET status = ${JOB_STATUS_CREATED}, output = NULL, @@ -527,9 +787,10 @@ ) SELECT EXISTS(SELECT 1 FROM reset_job) AS success `), - ) + ) - return result.length > 0 && result[0]!.success === true + return result.length > 0 && result[0]!.success === true + }) } /** @@ -540,13 +801,13 @@ */ protected async _deleteJob({ jobId }: DeleteJobOptions): Promise<boolean> { const result = await this.db - .delete(this.tables.jobsTable) - .where(and(eq(this.tables.jobsTable.id, jobId), ne(this.tables.jobsTable.status, JOB_STATUS_ACTIVE))) - .returning({ id: this.tables.jobsTable.id }) + .delete(this.tables.jobsActiveTable) + .where(and(eq(this.tables.jobsActiveTable.id, jobId), ne(this.tables.jobsActiveTable.status, 
JOB_STATUS_ACTIVE))) + .returning({ id: this.tables.jobsActiveTable.id }) // Also delete associated steps if (result.length > 0) { - await this.db.delete(this.tables.jobStepsTable).where(eq(this.tables.jobStepsTable.job_id, jobId)) + await this.db.delete(this.tables.jobStepsActiveTable).where(eq(this.tables.jobStepsActiveTable.job_id, jobId)) } return result.length > 0 @@ -559,7 +820,7 @@ * @returns Promise resolving to the number of jobs deleted */ protected async _deleteJobs(options?: DeleteJobsOptions): Promise<number> { - const jobsTable = this.tables.jobsTable + const jobsTable = this.tables.jobsActiveTable const filters = options?.filters ?? {} const where = this._buildJobsWhereClause(filters) @@ -585,7 +846,7 @@ j.group_key as group_key, j.action_name as action_name, j.concurrency_limit as concurrency_limit - FROM ${this.tables.jobsTable} j + FROM ${this.tables.jobsActiveTable} j WHERE j.group_key IS NOT NULL AND (j.expires_at IS NULL OR j.expires_at > now()) ORDER BY j.group_key, j.action_name, j.created_at DESC, j.id DESC @@ -598,7 +859,7 @@ gc.concurrency_limit, COUNT(*) FILTER (WHERE j.status = ${JOB_STATUS_ACTIVE}) as active_count FROM group_concurrency gc - LEFT JOIN ${this.tables.jobsTable} j + LEFT JOIN ${this.tables.jobsActiveTable} j ON j.group_key = gc.group_key AND j.action_name = gc.action_name AND (j.expires_at IS NULL OR j.expires_at > now()) @@ -612,7 +873,7 @@ j.action_name, j.group_key as job_group_key, j.created_at - FROM ${this.tables.jobsTable} j + FROM ${this.tables.jobsActiveTable} j INNER JOIN eligible_groups eg ON j.group_key = eg.group_key AND j.action_name = eg.action_name @@ -652,7 +913,7 @@ nj.job_group_key, eg.concurrency_limit, (SELECT COUNT(*) - FROM ${this.tables.jobsTable} + FROM ${this.tables.jobsActiveTable} WHERE action_name = nj.action_name AND group_key = nj.job_group_key AND status = ${JOB_STATUS_ACTIVE}) as current_active @@ -661,7 +922,7 @@ ON nj.job_group_key = eg.group_key AND nj.action_name = eg.action_name ) - UPDATE ${this.tables.jobsTable} j + UPDATE ${this.tables.jobsActiveTable} j SET status = ${JOB_STATUS_ACTIVE}, started_at = now(), expires_at = now() + (timeout_ms || ' milliseconds')::interval, @@ -707,11 +968,14 @@ if (multiProcessMode) { const result = (await this.db .selectDistinct({ - clientId: this.tables.jobsTable.client_id, + clientId: this.tables.jobsActiveTable.client_id, }) - .from(this.tables.jobsTable) + .from(this.tables.jobsActiveTable) .where( - and(eq(this.tables.jobsTable.status, JOB_STATUS_ACTIVE), ne(this.tables.jobsTable.client_id, this.id)), + and( + eq(this.tables.jobsActiveTable.status, JOB_STATUS_ACTIVE), + ne(this.tables.jobsActiveTable.client_id, this.id), + ), )) as unknown as { clientId: string }[] if (result.length > 0) { @@ -741,13 +1005,13 @@ await this.db.execute<{ id: string }>(sql` WITH locked_jobs AS ( SELECT j.id - FROM ${this.tables.jobsTable} j + FROM ${this.tables.jobsActiveTable} j WHERE j.status = ${JOB_STATUS_ACTIVE} AND j.client_id IN ${unresponsiveClientIds} FOR UPDATE OF j SKIP LOCKED ), updated_jobs AS ( - UPDATE ${this.tables.jobsTable} j + UPDATE ${this.tables.jobsActiveTable} j SET status = ${JOB_STATUS_CREATED}, started_at = NULL, expires_at = NULL, @@ -759,7 +1023,7 @@ RETURNING 
id, checksum ), deleted_steps AS ( - DELETE FROM ${this.tables.jobStepsTable} s + DELETE FROM ${this.tables.jobStepsActiveTable} s WHERE EXISTS ( SELECT 1 FROM updated_jobs uj WHERE uj.id = s.job_id @@ -799,21 +1063,21 @@ export class PostgresBaseAdapter e await this.db.execute(sql` WITH job_check AS ( SELECT j.id - FROM ${this.tables.jobsTable} j + FROM ${this.tables.jobsActiveTable} j WHERE j.id = ${jobId} AND j.status = ${JOB_STATUS_ACTIVE} AND (j.expires_at IS NULL OR j.expires_at > now()) ), step_existed AS ( SELECT EXISTS( - SELECT 1 FROM ${this.tables.jobStepsTable} s + SELECT 1 FROM ${this.tables.jobStepsActiveTable} s WHERE s.job_id = ${jobId} AND s.name = ${name} AND s.parent_step_id IS NOT DISTINCT FROM ${parentStepId} ) AS existed ), upserted_step AS ( - INSERT INTO ${this.tables.jobStepsTable} ( + INSERT INTO ${this.tables.jobStepsActiveTable} ( job_id, parent_step_id, branch, @@ -848,7 +1112,7 @@ export class PostgresBaseAdapter e delayed_ms = NULL, started_at = now(), history_failed_attempts = '{}'::jsonb - WHERE ${this.tables.jobStepsTable}.status = ${STEP_STATUS_ACTIVE} + WHERE ${this.tables.jobStepsActiveTable}.status = ${STEP_STATUS_ACTIVE} RETURNING id, status, @@ -875,7 +1139,7 @@ export class PostgresBaseAdapter e s.error, s.output, false AS "isNew" - FROM ${this.tables.jobStepsTable} s + FROM ${this.tables.jobStepsActiveTable} s INNER JOIN job_check jc ON s.job_id = jc.id WHERE s.job_id = ${jobId} AND s.name = ${name} @@ -903,24 +1167,24 @@ export class PostgresBaseAdapter e */ protected async _completeJobStep({ stepId, output }: CompleteJobStepOptions) { const result = await this.db - .update(this.tables.jobStepsTable) + .update(this.tables.jobStepsActiveTable) .set({ status: STEP_STATUS_COMPLETED, output, finished_at: sql`now()`, updated_at: sql`now()`, }) - .from(this.tables.jobsTable) + .from(this.tables.jobsActiveTable) .where( and( - eq(this.tables.jobStepsTable.job_id, this.tables.jobsTable.id), - eq(this.tables.jobStepsTable.id, stepId), - eq(this.tables.jobStepsTable.status, STEP_STATUS_ACTIVE), - eq(this.tables.jobsTable.status, JOB_STATUS_ACTIVE), - or(isNull(this.tables.jobsTable.expires_at), gt(this.tables.jobsTable.expires_at, sql`now()`)), + eq(this.tables.jobStepsActiveTable.job_id, this.tables.jobsActiveTable.id), + eq(this.tables.jobStepsActiveTable.id, stepId), + eq(this.tables.jobStepsActiveTable.status, STEP_STATUS_ACTIVE), + eq(this.tables.jobsActiveTable.status, JOB_STATUS_ACTIVE), + or(isNull(this.tables.jobsActiveTable.expires_at), gt(this.tables.jobsActiveTable.expires_at, sql`now()`)), ), ) - .returning({ id: this.tables.jobStepsTable.id }) + .returning({ id: this.tables.jobStepsActiveTable.id }) return result.length > 0 } @@ -932,23 +1196,23 @@ export class PostgresBaseAdapter e */ protected async _failJobStep({ stepId, error }: FailJobStepOptions) { const result = await this.db - .update(this.tables.jobStepsTable) + .update(this.tables.jobStepsActiveTable) .set({ status: STEP_STATUS_FAILED, error, finished_at: sql`now()`, updated_at: sql`now()`, }) - .from(this.tables.jobsTable) + .from(this.tables.jobsActiveTable) .where( and( - eq(this.tables.jobStepsTable.job_id, this.tables.jobsTable.id), - eq(this.tables.jobStepsTable.id, stepId), - eq(this.tables.jobStepsTable.status, STEP_STATUS_ACTIVE), - eq(this.tables.jobsTable.status, JOB_STATUS_ACTIVE), + eq(this.tables.jobStepsActiveTable.job_id, this.tables.jobsActiveTable.id), + eq(this.tables.jobStepsActiveTable.id, stepId), + eq(this.tables.jobStepsActiveTable.status, STEP_STATUS_ACTIVE), 
+ eq(this.tables.jobsActiveTable.status, JOB_STATUS_ACTIVE), ), ) - .returning({ id: this.tables.jobStepsTable.id }) + .returning({ id: this.tables.jobStepsActiveTable.id }) return result.length > 0 } @@ -959,8 +1223,8 @@ export class PostgresBaseAdapter e * @returns Promise resolving to `true` if delayed, `false` otherwise */ protected async _delayJobStep({ stepId, delayMs, error }: DelayJobStepOptions) { - const jobStepsTable = this.tables.jobStepsTable - const jobsTable = this.tables.jobsTable + const jobStepsTable = this.tables.jobStepsActiveTable + const jobsTable = this.tables.jobsActiveTable const result = await this.db .update(jobStepsTable) @@ -999,25 +1263,25 @@ export class PostgresBaseAdapter e */ protected async _cancelJobStep({ stepId }: CancelJobStepOptions) { const result = await this.db - .update(this.tables.jobStepsTable) + .update(this.tables.jobStepsActiveTable) .set({ status: STEP_STATUS_CANCELLED, finished_at: sql`now()`, updated_at: sql`now()`, }) - .from(this.tables.jobsTable) + .from(this.tables.jobsActiveTable) .where( and( - eq(this.tables.jobStepsTable.job_id, this.tables.jobsTable.id), - eq(this.tables.jobStepsTable.id, stepId), - eq(this.tables.jobStepsTable.status, STEP_STATUS_ACTIVE), + eq(this.tables.jobStepsActiveTable.job_id, this.tables.jobsActiveTable.id), + eq(this.tables.jobStepsActiveTable.id, stepId), + eq(this.tables.jobStepsActiveTable.status, STEP_STATUS_ACTIVE), or( - eq(this.tables.jobsTable.status, JOB_STATUS_ACTIVE), - eq(this.tables.jobsTable.status, JOB_STATUS_CANCELLED), + eq(this.tables.jobsActiveTable.status, JOB_STATUS_ACTIVE), + eq(this.tables.jobsActiveTable.status, JOB_STATUS_CANCELLED), ), ), ) - .returning({ id: this.tables.jobStepsTable.id }) + .returning({ id: this.tables.jobStepsActiveTable.id }) return result.length > 0 } @@ -1030,9 +1294,17 @@ export class PostgresBaseAdapter e * Internal method to get a job by its ID. Does not include step information. */ protected async _getJobById(jobId: string): Promise { - const jobsTable = this.tables.jobsTable + // Try active table first + const activeJob = await this._getJobFromTable(jobId, this.tables.jobsActiveTable) + if (activeJob) { + return activeJob + } + + // Then try archive table + return this._getJobFromTable(jobId, this.tables.jobsArchiveTable) + } - // Calculate duration as a SQL expression (finishedAt - startedAt in milliseconds) + private async _getJobFromTable(jobId: string, jobsTable: any): Promise { const durationMs = sql` CASE WHEN ${jobsTable.started_at} IS NOT NULL AND ${jobsTable.finished_at} IS NOT NULL @@ -1077,7 +1349,15 @@ export class PostgresBaseAdapter e protected async _getJobSteps(options: GetJobStepsOptions): Promise { const { jobId, search } = options - const jobStepsTable = this.tables.jobStepsTable + // Determine if job is in active or archive table + const jobInActive = await this.db + .select({ id: this.tables.jobsActiveTable.id }) + .from(this.tables.jobsActiveTable) + .where(eq(this.tables.jobsActiveTable.id, jobId)) + .limit(1) + + const isActive = jobInActive.length > 0 + const jobStepsTable = isActive ? 
this.tables.jobStepsActiveTable : this.tables.jobStepsArchiveTable const fuzzySearch = search?.trim() @@ -1129,7 +1409,7 @@ export class PostgresBaseAdapter e return undefined } - const jobsTable = this.tables.jobsTable + const jobsTable = this.tables.jobsActiveTable const fuzzySearch = filters.search?.trim() @@ -1203,92 +1483,232 @@ export class PostgresBaseAdapter e : []), ) } + /** + * Build WHERE clause for archive jobs (same logic as active but for archive table). + */ + protected _buildArchiveJobsWhereClause(filters: GetJobsOptions['filters']) { + if (!filters) { + return undefined + } + + const archiveTable = this.tables.jobsArchiveTable + + const fuzzySearch = filters.search?.trim() + + return and( + filters.status + ? inArray(archiveTable.status, Array.isArray(filters.status) ? filters.status : [filters.status]) + : undefined, + filters.actionName + ? inArray( + archiveTable.action_name, + Array.isArray(filters.actionName) ? filters.actionName : [filters.actionName], + ) + : undefined, + filters.groupKey && Array.isArray(filters.groupKey) + ? sql`j.group_key LIKE ANY(ARRAY[${sql.raw(filters.groupKey.map((key) => `'${key}'`).join(','))}]::text[])` + : undefined, + filters.groupKey && !Array.isArray(filters.groupKey) + ? ilike(archiveTable.group_key, `%${filters.groupKey}%`) + : undefined, + filters.clientId + ? inArray(archiveTable.client_id, Array.isArray(filters.clientId) ? filters.clientId : [filters.clientId]) + : undefined, + filters.description ? ilike(archiveTable.description, `%${filters.description}%`) : undefined, + filters.createdAt && Array.isArray(filters.createdAt) + ? between( + sql`date_trunc('second', ${archiveTable.created_at})`, + filters.createdAt[0]!.toISOString(), + filters.createdAt[1]!.toISOString(), + ) + : undefined, + filters.createdAt && !Array.isArray(filters.createdAt) + ? gte(sql`date_trunc('second', ${archiveTable.created_at})`, filters.createdAt.toISOString()) + : undefined, + filters.startedAt && Array.isArray(filters.startedAt) + ? between( + sql`date_trunc('second', ${archiveTable.started_at})`, + filters.startedAt[0]!.toISOString(), + filters.startedAt[1]!.toISOString(), + ) + : undefined, + filters.startedAt && !Array.isArray(filters.startedAt) + ? gte(sql`date_trunc('second', ${archiveTable.started_at})`, filters.startedAt.toISOString()) + : undefined, + filters.finishedAt && Array.isArray(filters.finishedAt) + ? between( + sql`date_trunc('second', ${archiveTable.finished_at})`, + filters.finishedAt[0]!.toISOString(), + filters.finishedAt[1]!.toISOString(), + ) + : undefined, + filters.finishedAt && !Array.isArray(filters.finishedAt) + ? gte(sql`date_trunc('second', ${archiveTable.finished_at})`, filters.finishedAt.toISOString()) + : undefined, + filters.updatedAfter + ? sql`date_trunc('milliseconds', ${archiveTable.updated_at}) > ${filters.updatedAfter.toISOString()}::timestamptz` + : undefined, + fuzzySearch + ? or( + ilike(archiveTable.action_name, `%${fuzzySearch}%`), + ilike(archiveTable.group_key, `%${fuzzySearch}%`), + ilike(archiveTable.description, `%${fuzzySearch}%`), + ilike(archiveTable.client_id, `%${fuzzySearch}%`), + sql`${archiveTable.id}::text ilike ${`%${fuzzySearch}%`}`, + sql`to_tsvector('english', ${archiveTable.input}::text) @@ plainto_tsquery('english', ${fuzzySearch})`, + sql`to_tsvector('english', ${archiveTable.output}::text) @@ plainto_tsquery('english', ${fuzzySearch})`, + ) + : undefined, + ...(filters.inputFilter && Object.keys(filters.inputFilter).length > 0 + ? 
this.#buildJsonbWhereConditions(filters.inputFilter, archiveTable.input)
+        : []),
+      ...(filters.outputFilter && Object.keys(filters.outputFilter).length > 0
+        ? this.#buildJsonbWhereConditions(filters.outputFilter, archiveTable.output)
+        : []),
+    )
+  }
+
   /**
    * Internal method to get jobs with pagination, filtering, and sorting.
    * Does not include step information or job output.
    */
   protected async _getJobs(options?: GetJobsOptions): Promise<GetJobsResult> {
-    const jobsTable = this.tables.jobsTable
     const page = options?.page ?? 1
     const pageSize = options?.pageSize ?? 10
     const filters = options?.filters ?? {}
-    const sortInput = options?.sort ?? { field: 'startedAt', order: 'desc' }
-    const sorts = Array.isArray(sortInput) ? sortInput : [sortInput]
-
-    const where = this._buildJobsWhereClause(filters)
+    // Determine which table(s) to query based on status filter
+    const activeStatuses = [JOB_STATUS_CREATED, JOB_STATUS_ACTIVE]
+    const archiveStatuses = [JOB_STATUS_COMPLETED, JOB_STATUS_FAILED, JOB_STATUS_CANCELLED]
+    const statusFilter = filters.status
+    const statuses = Array.isArray(statusFilter) ? statusFilter : statusFilter ? [statusFilter] : []
+
+    const queryActive = statuses.length === 0 || statuses.some((s) => (activeStatuses as string[]).includes(s))
+    const queryArchive = statuses.length === 0 || statuses.some((s) => (archiveStatuses as string[]).includes(s))
+
+    // Query active table
+    let activeJobs: any[] = []
+    let activeTotal = 0
+    if (queryActive) {
+      const jobsTable = this.tables.jobsActiveTable
+      const where = this._buildJobsWhereClause(filters)
+      activeTotal = await this.db.$count(jobsTable, where)
+
+      if (activeTotal > 0) {
+        const durationMs = sql`
+          CASE
+            WHEN ${jobsTable.started_at} IS NOT NULL AND ${jobsTable.finished_at} IS NOT NULL
+            THEN EXTRACT(EPOCH FROM (${jobsTable.finished_at} - ${jobsTable.started_at})) * 1000
+            ELSE NULL
+          END
+        `.as('duration_ms')
+
+        activeJobs = await this.db
+          .select({
+            id: jobsTable.id,
+            actionName: jobsTable.action_name,
+            groupKey: jobsTable.group_key,
+            description: jobsTable.description,
+            input: jobsTable.input,
+            output: jobsTable.output,
+            error: jobsTable.error,
+            status: jobsTable.status,
+            timeoutMs: jobsTable.timeout_ms,
+            expiresAt: jobsTable.expires_at,
+            startedAt: jobsTable.started_at,
+            finishedAt: jobsTable.finished_at,
+            createdAt: jobsTable.created_at,
+            updatedAt: jobsTable.updated_at,
+            concurrencyLimit: jobsTable.concurrency_limit,
+            concurrencyStepLimit: jobsTable.concurrency_step_limit,
+            clientId: jobsTable.client_id,
+            durationMs,
+          })
+          .from(jobsTable)
+          .where(where)
+          .orderBy(desc(jobsTable.created_at))
+          // Fetch the whole window up to the requested page; the page slice is
+          // taken after merging with the archive rows (see the note below).
+          .limit(page * pageSize)
+      }
+    }

-    // Get total count
-    const total = await this.db.$count(jobsTable, where)
-    if (!total) {
-      return {
-        jobs: [],
-        total: 0,
-        page,
-        pageSize,
-      }
-    }
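+    // Cross-table pagination: both branches over-fetch the first
+    // page * pageSize rows without an OFFSET, and the page window is applied
+    // only after the two result sets are merged and sorted. A per-table
+    // OFFSET would skip rows that belong to earlier pages of the other table.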
+    // Query archive table
+    let archiveJobs: any[] = []
+    let archiveTotal = 0
+    if (queryArchive) {
+      const archiveTable = this.tables.jobsArchiveTable
+      // Build where clause for archive (similar to active but using archive table)
+      const archiveWhere = this._buildArchiveJobsWhereClause(filters)
+      archiveTotal = await this.db.$count(archiveTable, archiveWhere)
+
+      if (archiveTotal > 0) {
+        const durationMs = sql`
+          CASE
+            WHEN ${archiveTable.started_at} IS NOT NULL AND ${archiveTable.finished_at} IS NOT NULL
+            THEN EXTRACT(EPOCH FROM (${archiveTable.finished_at} - ${archiveTable.started_at})) * 1000
+            ELSE NULL
+          END
+        `.as('duration_ms')
+
+        archiveJobs = await this.db
+          .select({
+            id: archiveTable.id,
+            actionName: archiveTable.action_name,
+            groupKey: archiveTable.group_key,
+            description: archiveTable.description,
+            input: archiveTable.input,
+            output: archiveTable.output,
+            error: archiveTable.error,
+            status: archiveTable.status,
+            timeoutMs: archiveTable.timeout_ms,
+            expiresAt: archiveTable.expires_at,
+            startedAt: archiveTable.started_at,
+            finishedAt: archiveTable.finished_at,
+            createdAt: archiveTable.created_at,
+            updatedAt: archiveTable.updated_at,
+            concurrencyLimit: archiveTable.concurrency_limit,
+            concurrencyStepLimit: archiveTable.concurrency_step_limit,
+            clientId: archiveTable.client_id,
+            durationMs,
+          })
+          .from(archiveTable)
+          .where(archiveWhere)
+          .orderBy(desc(archiveTable.created_at))
+          .limit(page * pageSize)
+      }
+    }

-    // Calculate duration as a SQL expression (finishedAt - startedAt in milliseconds)
-    const durationMs = sql`
-      CASE
-        WHEN ${jobsTable.started_at} IS NOT NULL AND ${jobsTable.finished_at} IS NOT NULL
-        THEN EXTRACT(EPOCH FROM (${jobsTable.finished_at} - ${jobsTable.started_at})) * 1000
-        ELSE NULL
-      END
-    `.as('duration_ms')
+    // Combine results
+    const allJobs = [...activeJobs, ...archiveJobs]
+    const total = activeTotal + archiveTotal

-    const sortFieldMap: Record = {
-      createdAt: jobsTable.created_at,
-      startedAt: jobsTable.started_at,
-      finishedAt: jobsTable.finished_at,
-      status: jobsTable.status,
-      actionName: jobsTable.action_name,
-      expiresAt: jobsTable.expires_at,
-      duration: durationMs,
-      description: jobsTable.description,
-    }
+    // Sort combined results
+    const sortInput = options?.sort ?? { field: 'startedAt', order: 'desc' }
+    const sorts = Array.isArray(sortInput) ? sortInput : [sortInput]

-    const jobs = await this.db
-      .select({
-        id: jobsTable.id,
-        actionName: jobsTable.action_name,
-        groupKey: jobsTable.group_key,
-        description: jobsTable.description,
-        input: jobsTable.input,
-        output: jobsTable.output,
-        error: jobsTable.error,
-        status: jobsTable.status,
-        timeoutMs: jobsTable.timeout_ms,
-        expiresAt: jobsTable.expires_at,
-        startedAt: jobsTable.started_at,
-        finishedAt: jobsTable.finished_at,
-        createdAt: jobsTable.created_at,
-        updatedAt: jobsTable.updated_at,
-        concurrencyLimit: jobsTable.concurrency_limit,
-        concurrencyStepLimit: jobsTable.concurrency_step_limit,
-        clientId: jobsTable.client_id,
-        durationMs,
-      })
-      .from(jobsTable)
-      .where(where)
-      .orderBy(
-        ...sorts
-          .filter((sortItem) => sortItem.field in sortFieldMap)
-          .map((sortItem) => {
-            const sortField = sortFieldMap[sortItem.field]
-            if (sortItem.order.toUpperCase() === 'ASC') {
-              return asc(sortField)
-            } else {
-              return desc(sortField)
-            }
-          }),
-      )
-      .limit(pageSize)
-      .offset((page - 1) * pageSize)
+    allJobs.sort((a, b) => {
+      for (const sort of sorts) {
+        // Rows expose the computed duration as `durationMs`; map the
+        // `duration` sort field accordingly (the old sortFieldMap did this).
+        const field = sort.field === 'duration' ? 'durationMs' : sort.field
+        const order = sort.order.toUpperCase() === 'ASC' ? 1 : -1
+        const aVal = a[field] ?? null
+        const bVal = b[field] ?? null
+
+        if (aVal === null && bVal === null) continue
+        if (aVal === null) return order
+        if (bVal === null) return -order
+
+        if (aVal < bVal) return -order
+        if (aVal > bVal) return order
+      }
+      return 0
+    })
+
+    // Apply pagination: slice the requested window out of the merged list
+    // rather than its head, since both sources were fetched without OFFSET.
+    // NOTE: both sources are pre-ordered by created_at, so deep pagination
+    // under custom sort fields is only approximate beyond the fetched window.
+    const paginatedJobs = allJobs.slice((page - 1) * pageSize, page * pageSize)

     return {
-      jobs,
+      jobs: paginatedJobs,
       total,
       page,
       pageSize,
@@ -1299,109 +1719,178 @@ export class PostgresBaseAdapter
   * Internal method to get a step by its ID with all information.
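+   * Checks the active table first, then falls back to the archive table,
+   * since a step's row moves to job_steps_archive when its job terminates.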
*/ protected async _getJobStepById(stepId: string): Promise { - const [step] = await this.db + // Try active table first + const [activeStep] = await this.db + .select({ + id: this.tables.jobStepsActiveTable.id, + jobId: this.tables.jobStepsActiveTable.job_id, + parentStepId: this.tables.jobStepsActiveTable.parent_step_id, + parallel: this.tables.jobStepsActiveTable.parallel, + name: this.tables.jobStepsActiveTable.name, + output: this.tables.jobStepsActiveTable.output, + status: this.tables.jobStepsActiveTable.status, + error: this.tables.jobStepsActiveTable.error, + startedAt: this.tables.jobStepsActiveTable.started_at, + finishedAt: this.tables.jobStepsActiveTable.finished_at, + timeoutMs: this.tables.jobStepsActiveTable.timeout_ms, + expiresAt: this.tables.jobStepsActiveTable.expires_at, + retriesLimit: this.tables.jobStepsActiveTable.retries_limit, + retriesCount: this.tables.jobStepsActiveTable.retries_count, + delayedMs: this.tables.jobStepsActiveTable.delayed_ms, + historyFailedAttempts: this.tables.jobStepsActiveTable.history_failed_attempts, + createdAt: this.tables.jobStepsActiveTable.created_at, + updatedAt: this.tables.jobStepsActiveTable.updated_at, + }) + .from(this.tables.jobStepsActiveTable) + .where(eq(this.tables.jobStepsActiveTable.id, stepId)) + .limit(1) + + if (activeStep) { + return activeStep + } + + // Try archive table + const [archiveStep] = await this.db .select({ - id: this.tables.jobStepsTable.id, - jobId: this.tables.jobStepsTable.job_id, - parentStepId: this.tables.jobStepsTable.parent_step_id, - parallel: this.tables.jobStepsTable.parallel, - name: this.tables.jobStepsTable.name, - output: this.tables.jobStepsTable.output, - status: this.tables.jobStepsTable.status, - error: this.tables.jobStepsTable.error, - startedAt: this.tables.jobStepsTable.started_at, - finishedAt: this.tables.jobStepsTable.finished_at, - timeoutMs: this.tables.jobStepsTable.timeout_ms, - expiresAt: this.tables.jobStepsTable.expires_at, - retriesLimit: this.tables.jobStepsTable.retries_limit, - retriesCount: this.tables.jobStepsTable.retries_count, - delayedMs: this.tables.jobStepsTable.delayed_ms, - historyFailedAttempts: this.tables.jobStepsTable.history_failed_attempts, - createdAt: this.tables.jobStepsTable.created_at, - updatedAt: this.tables.jobStepsTable.updated_at, + id: this.tables.jobStepsArchiveTable.id, + jobId: this.tables.jobStepsArchiveTable.job_id, + parentStepId: this.tables.jobStepsArchiveTable.parent_step_id, + parallel: this.tables.jobStepsArchiveTable.parallel, + name: this.tables.jobStepsArchiveTable.name, + output: this.tables.jobStepsArchiveTable.output, + status: this.tables.jobStepsArchiveTable.status, + error: this.tables.jobStepsArchiveTable.error, + startedAt: this.tables.jobStepsArchiveTable.started_at, + finishedAt: this.tables.jobStepsArchiveTable.finished_at, + timeoutMs: this.tables.jobStepsArchiveTable.timeout_ms, + expiresAt: this.tables.jobStepsArchiveTable.expires_at, + retriesLimit: this.tables.jobStepsArchiveTable.retries_limit, + retriesCount: this.tables.jobStepsArchiveTable.retries_count, + delayedMs: this.tables.jobStepsArchiveTable.delayed_ms, + historyFailedAttempts: this.tables.jobStepsArchiveTable.history_failed_attempts, + createdAt: this.tables.jobStepsArchiveTable.created_at, + updatedAt: this.tables.jobStepsArchiveTable.updated_at, }) - .from(this.tables.jobStepsTable) - .where(eq(this.tables.jobStepsTable.id, stepId)) + .from(this.tables.jobStepsArchiveTable) + .where(eq(this.tables.jobStepsArchiveTable.id, stepId)) .limit(1) - 
return step ?? null + return archiveStep ?? null } /** * Internal method to get job status and updatedAt timestamp. */ protected async _getJobStatus(jobId: string): Promise { - const [job] = await this.db + // Try active table first + const [activeJob] = await this.db .select({ - status: this.tables.jobsTable.status, - updatedAt: this.tables.jobsTable.updated_at, + status: this.tables.jobsActiveTable.status, + updatedAt: this.tables.jobsActiveTable.updated_at, }) - .from(this.tables.jobsTable) - .where(eq(this.tables.jobsTable.id, jobId)) + .from(this.tables.jobsActiveTable) + .where(eq(this.tables.jobsActiveTable.id, jobId)) .limit(1) - return job ?? null + if (activeJob) { + return activeJob + } + + // Try archive table + const [archiveJob] = await this.db + .select({ + status: this.tables.jobsArchiveTable.status, + updatedAt: this.tables.jobsArchiveTable.updated_at, + }) + .from(this.tables.jobsArchiveTable) + .where(eq(this.tables.jobsArchiveTable.id, jobId)) + .limit(1) + + return archiveJob ?? null } /** * Internal method to get job step status and updatedAt timestamp. */ protected async _getJobStepStatus(stepId: string): Promise { - const [step] = await this.db + // Try active table first + const [activeStep] = await this.db + .select({ + status: this.tables.jobStepsActiveTable.status, + updatedAt: this.tables.jobStepsActiveTable.updated_at, + }) + .from(this.tables.jobStepsActiveTable) + .where(eq(this.tables.jobStepsActiveTable.id, stepId)) + .limit(1) + + if (activeStep) { + return activeStep + } + + // Try archive table + const [archiveStep] = await this.db .select({ - status: this.tables.jobStepsTable.status, - updatedAt: this.tables.jobStepsTable.updated_at, + status: this.tables.jobStepsArchiveTable.status, + updatedAt: this.tables.jobStepsArchiveTable.updated_at, }) - .from(this.tables.jobStepsTable) - .where(eq(this.tables.jobStepsTable.id, stepId)) + .from(this.tables.jobStepsArchiveTable) + .where(eq(this.tables.jobStepsArchiveTable.id, stepId)) .limit(1) - return step ?? null + return archiveStep ?? null } /** * Internal method to get action statistics including counts and last job created date. 
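+   * Aggregates jobs_active and jobs_archive via UNION ALL so the per-action
+   * counts cover both live and terminated jobs.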
*/ protected async _getActions(): Promise { - const actionStats = this.db.$with('action_stats').as( - this.db - .select({ - name: this.tables.jobsTable.action_name, - last_job_created: sql`MAX(${this.tables.jobsTable.created_at})`.as('last_job_created'), - active: sql`COUNT(*) FILTER (WHERE ${this.tables.jobsTable.status} = ${JOB_STATUS_ACTIVE})`.as( - 'active', - ), - completed: sql`COUNT(*) FILTER (WHERE ${this.tables.jobsTable.status} = ${JOB_STATUS_COMPLETED})`.as( - 'completed', - ), - failed: sql`COUNT(*) FILTER (WHERE ${this.tables.jobsTable.status} = ${JOB_STATUS_FAILED})`.as( - 'failed', - ), - cancelled: sql`COUNT(*) FILTER (WHERE ${this.tables.jobsTable.status} = ${JOB_STATUS_CANCELLED})`.as( - 'cancelled', - ), - }) - .from(this.tables.jobsTable) - .groupBy(this.tables.jobsTable.action_name), + const schemaName = this.schema + const result = this._map( + await this.db.execute<{ + name: string + last_job_created: Date | null + active: number + completed: number + failed: number + cancelled: number + }>(sql` + WITH combined_jobs AS ( + SELECT action_name, status, created_at + FROM ${sql.identifier(schemaName)}.jobs_active + UNION ALL + SELECT action_name, status, created_at + FROM ${sql.identifier(schemaName)}.jobs_archive + ) + SELECT + action_name AS name, + MAX(created_at) AS last_job_created, + COUNT(*) FILTER (WHERE status = ${JOB_STATUS_ACTIVE})::int AS active, + COUNT(*) FILTER (WHERE status = ${JOB_STATUS_COMPLETED})::int AS completed, + COUNT(*) FILTER (WHERE status = ${JOB_STATUS_FAILED})::int AS failed, + COUNT(*) FILTER (WHERE status = ${JOB_STATUS_CANCELLED})::int AS cancelled + FROM combined_jobs + GROUP BY action_name + ORDER BY action_name + `), ) - const actions = await this.db - .with(actionStats) - .select({ - name: actionStats.name, - lastJobCreated: actionStats.last_job_created, - active: sql`${actionStats.active}::int`, - completed: sql`${actionStats.completed}::int`, - failed: sql`${actionStats.failed}::int`, - cancelled: sql`${actionStats.cancelled}::int`, - }) - .from(actionStats) - .orderBy(actionStats.name) - return { - actions: actions.map((action) => ({ - ...action, - lastJobCreated: action.lastJobCreated ?? null, + actions: ( + result as Array<{ + name: string + last_job_created: Date | null + active: number + completed: number + failed: number + cancelled: number + }> + ).map((action) => ({ + name: action.name, + lastJobCreated: action.last_job_created ?? null, + active: action.active, + completed: action.completed, + failed: action.failed, + cancelled: action.cancelled, })), } } @@ -1412,6 +1901,7 @@ export class PostgresBaseAdapter e /** * Internal method to insert multiple span records in a single batch. + * Routes spans to active or archive table based on job location. */ protected async _insertSpans(spans: InsertSpanOptions[]): Promise { if (spans.length === 0) { @@ -1447,7 +1937,6 @@ export class PostgresBaseAdapter e * For step queries, uses a recursive CTE to find all descendant spans. */ protected async _getSpans(options: GetSpansOptions): Promise { - const spansTable = this.tables.spansTable const filters = options.filters ?? 
{}

    // Build sort

@@ -1465,6 +1954,8 @@ export class PostgresBaseAdapter
      return this._getStepSpansRecursive(options.stepId, sortField, sortOrder, filters)
    }

+    const spansTable = this.tables.spansTable
+
    // Build WHERE clause for job queries
    const where = this._buildSpansWhereClause(options.jobId, undefined, filters)

@@ -1532,9 +2023,6 @@ export class PostgresBaseAdapter
  ): Promise<GetSpansResult> {
    const schemaName = this.schema

-    // Use a recursive CTE to find all descendant spans
-    // 1. Base case: find the span with step_id = stepId
-    // 2. Recursive case: find all spans where parent_span_id = span_id of a span we've already found
    const query = sql`
      WITH RECURSIVE span_tree AS (
        -- Base case: the span(s) for the step

@@ -1798,4 +2286,146 @@ export class PostgresBaseAdapter
  protected _map(result: any) {
    return result
  }
+
+  // ============================================================================
+  // Archive Methods
+  // ============================================================================
+
+  /**
+   * Parse olderThan option into a Date threshold.
+   * Supports: string (e.g. "7d", "1h"), Date, or number (timestamp ms).
+   */
+  private _parseOlderThan(olderThan: string | Date | number): Date {
+    if (olderThan instanceof Date) {
+      return olderThan
+    }
+
+    if (typeof olderThan === 'number') {
+      return new Date(olderThan)
+    }
+
+    // Parse duration string like "7d", "1h", "30m", "10s", "500ms"
+    const match = olderThan.match(/^(\d+)\s*(ms|d|h|m|s)$/i)
+    if (!match) {
+      throw new Error(
+        `Invalid olderThan format: ${olderThan}. Expected: "7d", "1h", "30m", "10s", "500ms", Date, or number`,
+      )
+    }
+
+    const value = parseInt(match[1]!, 10)
+    const unit = match[2]!.toLowerCase()
+    const now = Date.now()
+
+    const multipliers: Record<string, number> = {
+      d: 24 * 60 * 60 * 1000,
+      h: 60 * 60 * 1000,
+      m: 60 * 1000,
+      s: 1000,
+      ms: 1,
+    }
+
+    const ms = value * (multipliers[unit] ?? 0)
+    return new Date(now - ms)
+  }
+
+  protected async _pruneArchive(options: PruneArchiveOptions): Promise<number> {
+    const threshold = this._parseOlderThan(options.olderThan)
+    const batchSize = options.batchSize ?? 1000
+    const maxBatches = options.maxBatches ?? 100
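+    // Pruning is explicit and bounded: each iteration deletes at most
+    // batchSize jobs, and one call removes at most maxBatches * batchSize
+    // jobs in total (with these defaults, up to 100k). Call again to keep
+    // reclaiming older history.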
+    const schemaName = this.schema
+
+    let totalDeleted = 0
+
+    for (let batch = 0; batch < maxBatches; batch++) {
+      const result = this._map(
+        await this.db.execute<{ id: string }>(sql`
+          WITH ids_to_delete AS (
+            SELECT id FROM ${sql.identifier(schemaName)}.jobs_archive
+            WHERE finished_at < ${threshold.toISOString()}
+            LIMIT ${batchSize}
+          ),
+          deleted_steps AS (
+            -- job_steps_archive carries no FK to jobs_archive, so archived
+            -- steps must be deleted explicitly or they would be orphaned.
+            DELETE FROM ${sql.identifier(schemaName)}.job_steps_archive st
+            USING ids_to_delete d
+            WHERE st.job_id = d.id
+          ),
+          deleted_spans AS (
+            DELETE FROM ${sql.identifier(schemaName)}.spans s
+            USING ids_to_delete d
+            WHERE s.job_id = d.id
+          )
+          DELETE FROM ${sql.identifier(schemaName)}.jobs_archive j
+          USING ids_to_delete d
+          WHERE j.id = d.id
+          RETURNING j.id
+        `),
+      )
+
+      if (!result || result.length === 0) {
+        break
+      }
+
+      totalDeleted += result.length
+    }
+
+    // Clean up orphan spans (spans whose job no longer exists in active or archive)
+    if (totalDeleted > 0) {
+      await this.db.execute(sql`
+        DELETE FROM ${sql.identifier(schemaName)}.spans s
+        WHERE s.job_id IS NOT NULL
+          AND NOT EXISTS (
+            SELECT 1 FROM ${sql.identifier(schemaName)}.jobs_active ja WHERE ja.id = s.job_id
+          )
+          AND NOT EXISTS (
+            SELECT 1 FROM ${sql.identifier(schemaName)}.jobs_archive jar WHERE jar.id = s.job_id
+          )
+      `)
+    }
+
+    return totalDeleted
+  }
+
+  protected async _truncateArchive(): Promise<void> {
+    const schemaName = this.schema
+    // The archive tables deliberately have no FK constraints, so CASCADE on
+    // jobs_archive would not reach job_steps_archive; truncate both explicitly.
+    await this.db.execute(
+      sql`TRUNCATE TABLE ${sql.identifier(schemaName)}.jobs_archive, ${sql.identifier(schemaName)}.job_steps_archive`,
+    )
+    // Note: We do NOT truncate spans here because spans may belong to active jobs.
+    // Spans for archived jobs become orphans until cleaned up by prune operations.
+  }
+
+  protected async _getArchiveStats(): Promise {
+    const schemaName = this.schema
+
+    const [jobsResult, stepsResult, spansResult, oldestResult] = await Promise.all([
+      this.db
+        .execute<{ count: number }>(sql`
+          SELECT COUNT(*)::int as count FROM ${sql.identifier(schemaName)}.jobs_archive
+        `)
+        .then((r) => this._map(r)),
+      this.db
+        .execute<{ count: number }>(sql`
+          SELECT COUNT(*)::int as count FROM ${sql.identifier(schemaName)}.job_steps_archive
+        `)
+        .then((r) => this._map(r)),
+      // The spans table is shared by live and archived jobs, so this count
+      // includes spans for jobs that are still active.
+      this.db
+        .execute<{ count: number }>(sql`
+          SELECT COUNT(*)::int as count FROM ${sql.identifier(schemaName)}.spans
+        `)
+        .then((r) => this._map(r)),
+      this.db
+        .execute<{ finished_at: Date | null }>(sql`
+          SELECT finished_at FROM ${sql.identifier(schemaName)}.jobs_archive
+          ORDER BY finished_at ASC
+          LIMIT 1
+        `)
+        .then((r) => this._map(r)),
+    ])
+
+    const oldestDate = oldestResult[0]?.finished_at ? new Date(oldestResult[0].finished_at) : null
+
+    return {
+      jobsCount: Number(jobsResult[0]?.count ?? 0),
+      stepsCount: Number(stepsResult[0]?.count ?? 0),
+      spansCount: Number(spansResult[0]?.count ??
0), + oldestJobDate: oldestDate, + totalSizeBytes: null, + lastPrunedAt: this.lastPrunedAt, + } + } } diff --git a/packages/duron/src/adapters/postgres/base.ts.backup b/packages/duron/src/adapters/postgres/base.ts.backup new file mode 100644 index 0000000..7e0c905 --- /dev/null +++ b/packages/duron/src/adapters/postgres/base.ts.backup @@ -0,0 +1,1801 @@ +import { and, asc, between, desc, eq, gt, gte, ilike, inArray, isNull, ne, or, sql } from 'drizzle-orm' +import type { PgAsyncDatabase, PgColumn } from 'drizzle-orm/pg-core' + +import { + JOB_STATUS_ACTIVE, + JOB_STATUS_CANCELLED, + JOB_STATUS_COMPLETED, + JOB_STATUS_CREATED, + JOB_STATUS_FAILED, + STEP_STATUS_ACTIVE, + STEP_STATUS_CANCELLED, + STEP_STATUS_COMPLETED, + STEP_STATUS_FAILED, +} from '../../constants.js' +import { + Adapter, + type CancelJobOptions, + type CancelJobStepOptions, + type CompleteJobOptions, + type CompleteJobStepOptions, + type CreateJobOptions, + type CreateOrRecoverJobStepOptions, + type CreateOrRecoverJobStepResult, + type DelayJobStepOptions, + type DeleteJobOptions, + type DeleteJobsOptions, + type DeleteSpansOptions, + type FailJobOptions, + type FailJobStepOptions, + type FetchOptions, + type GetActionsResult, + type GetJobStepsOptions, + type GetJobStepsResult, + type GetJobsOptions, + type GetJobsResult, + type GetSpansOptions, + type GetSpansResult, + type InsertSpanOptions, + type Job, + type JobSort, + type JobStatusResult, + type JobStep, + type JobStepStatusResult, + type RecoverJobsOptions, + type RetryJobOptions, + type SpanSort, + type TimeTravelJobOptions, +} from '../adapter.js' +import createSchema from './schema.js' + +type Schema = ReturnType + +// Re-export types for backward compatibility +export type { Job, JobStep } from '../adapter.js' + +type DrizzleDatabase = PgAsyncDatabase + +export interface AdapterOptions { + connection: Connection + schema?: string + migrateOnStart?: boolean + migrationsFolder?: string +} + +export class PostgresBaseAdapter extends Adapter { + protected connection: Connection + protected db!: Database + protected tables: Schema + protected schema: string = 'duron' + protected migrateOnStart: boolean = true + + // ============================================================================ + // Constructor + // ============================================================================ + + /** + * Create a new PostgresAdapter instance. + * + * @param options - Configuration options for the PostgreSQL adapter + */ + constructor(options: AdapterOptions) { + super() + + this.connection = options.connection + this.schema = options.schema ?? 'duron' + this.migrateOnStart = options.migrateOnStart ?? true + + this.tables = createSchema(this.schema) + + this._initDb() + } + + /** + * Initialize the database connection and Drizzle instance. + */ + protected _initDb() { + throw new Error('Not implemented') + } + + // ============================================================================ + // Lifecycle Methods + // ============================================================================ + + /** + * Start the adapter. + * Runs migrations if enabled and sets up database listeners. 
+ * + * @returns Promise resolving to `true` if started successfully, `false` otherwise + */ + protected async _start() { + await this._listen(`ping-${this.id}`, async (payload: string) => { + const fromClientId = JSON.parse(payload).fromClientId + await this._notify(`pong-${fromClientId}`, { toClientId: this.id }) + }) + + await this._listen(`job-status-changed`, (payload: string) => { + if (this.listenerCount('job-status-changed') > 0) { + const { jobId, status, clientId } = JSON.parse(payload) + this.emit('job-status-changed', { jobId, status, clientId }) + } + }) + + await this._listen(`job-available`, (payload: string) => { + if (this.listenerCount('job-available') > 0) { + const { jobId } = JSON.parse(payload) + this.emit('job-available', { jobId }) + } + }) + } + + protected async _stop() { + // do nothing + } + + // ============================================================================ + // Job Methods + // ============================================================================ + + /** + * Internal method to create a new job in the database. + * + * @returns Promise resolving to the job ID, or `null` if creation failed + */ + protected async _createJob({ + queue, + groupKey, + input, + timeoutMs, + checksum, + concurrencyLimit, + concurrencyStepLimit, + description, + }: CreateJobOptions) { + const [result] = await this.db + .insert(this.tables.jobsTable) + .values({ + action_name: queue, + group_key: groupKey, + description: description ?? null, + checksum, + input, + status: JOB_STATUS_CREATED, + timeout_ms: timeoutMs, + concurrency_limit: concurrencyLimit, + concurrency_step_limit: concurrencyStepLimit, + }) + .returning({ id: this.tables.jobsTable.id }) + + if (!result) { + return null + } + + return result.id + } + + /** + * Internal method to mark a job as completed. + * + * @returns Promise resolving to `true` if completed, `false` otherwise + */ + protected async _completeJob({ jobId, output }: CompleteJobOptions) { + const result = await this.db + .update(this.tables.jobsTable) + .set({ + status: JOB_STATUS_COMPLETED, + output, + finished_at: sql`now()`, + updated_at: sql`now()`, + }) + .where( + and( + eq(this.tables.jobsTable.id, jobId), + eq(this.tables.jobsTable.status, JOB_STATUS_ACTIVE), + eq(this.tables.jobsTable.client_id, this.id), + gt(this.tables.jobsTable.expires_at, sql`now()`), + ), + ) + .returning({ id: this.tables.jobsTable.id }) + + return result.length > 0 + } + + /** + * Internal method to mark a job as failed. + * + * @returns Promise resolving to `true` if failed, `false` otherwise + */ + protected async _failJob({ jobId, error }: FailJobOptions) { + const result = await this.db + .update(this.tables.jobsTable) + .set({ + status: JOB_STATUS_FAILED, + error, + finished_at: sql`now()`, + updated_at: sql`now()`, + }) + .where( + and( + eq(this.tables.jobsTable.id, jobId), + eq(this.tables.jobsTable.status, JOB_STATUS_ACTIVE), + eq(this.tables.jobsTable.client_id, this.id), + ), + ) + .returning({ id: this.tables.jobsTable.id }) + + return result.length > 0 + } + + /** + * Internal method to cancel a job. 
+ * + * @returns Promise resolving to `true` if cancelled, `false` otherwise + */ + protected async _cancelJob({ jobId }: CancelJobOptions) { + const result = await this.db + .update(this.tables.jobsTable) + .set({ + status: JOB_STATUS_CANCELLED, + finished_at: sql`now()`, + updated_at: sql`now()`, + }) + .where( + and( + eq(this.tables.jobsTable.id, jobId), + or(eq(this.tables.jobsTable.status, JOB_STATUS_ACTIVE), eq(this.tables.jobsTable.status, JOB_STATUS_CREATED)), + ), + ) + .returning({ id: this.tables.jobsTable.id }) + + return result.length > 0 + } + + /** + * Internal method to retry a completed, cancelled, or failed job by creating a copy of it with status 'created' and cleared output/error. + * Uses SELECT FOR UPDATE to prevent concurrent retries from creating duplicate jobs. + * + * @returns Promise resolving to the job ID, or `null` if creation failed + */ + protected async _retryJob({ jobId }: RetryJobOptions): Promise { + // Use a single atomic query with FOR UPDATE lock to prevent race conditions + const result = this._map( + await this.db.execute<{ id: string }>(sql` + WITH locked_source AS ( + -- Lock the source job row to prevent concurrent retries + SELECT + j.action_name, + j.group_key, + j.description, + j.checksum, + j.input, + j.timeout_ms, + j.created_at, + j.concurrency_limit, + j.concurrency_step_limit + FROM ${this.tables.jobsTable} j + WHERE j.id = ${jobId} + AND j.status IN (${JOB_STATUS_COMPLETED}, ${JOB_STATUS_CANCELLED}, ${JOB_STATUS_FAILED}) + FOR UPDATE OF j SKIP LOCKED + ), + existing_retry AS ( + -- Check if a retry already exists (a newer job with same checksum, group_key, and input) + SELECT j.id + FROM ${this.tables.jobsTable} j + INNER JOIN locked_source ls + ON j.action_name = ls.action_name + AND j.group_key = ls.group_key + AND j.checksum = ls.checksum + AND j.input = ls.input + AND j.created_at > ls.created_at + WHERE j.status IN (${JOB_STATUS_CREATED}, ${JOB_STATUS_ACTIVE}) + LIMIT 1 + ), + inserted_retry AS ( + -- Insert the retry only if no existing retry was found + -- Get concurrency_limit from the latest job at insertion time to avoid stale values + INSERT INTO ${this.tables.jobsTable} ( + action_name, + group_key, + description, + checksum, + input, + status, + timeout_ms, + concurrency_limit, + concurrency_step_limit + ) + SELECT + ls.action_name, + ls.group_key, + ls.description, + ls.checksum, + ls.input, + ${JOB_STATUS_CREATED}, + ls.timeout_ms, + COALESCE( + ( + SELECT j.concurrency_limit + FROM ${this.tables.jobsTable} j + WHERE j.action_name = ls.action_name + AND j.group_key = ls.group_key + AND (j.expires_at IS NULL OR j.expires_at > now()) + ORDER BY j.created_at DESC, j.id DESC + LIMIT 1 + ), + ls.concurrency_limit + ), + ls.concurrency_step_limit + FROM locked_source ls + WHERE NOT EXISTS (SELECT 1 FROM existing_retry) + RETURNING id + ) + -- Return only the newly inserted retry ID (not existing retries) + SELECT id FROM inserted_retry + LIMIT 1 + `), + ) + + if (result.length === 0) { + return null + } + + return result[0]!.id + } + + /** + * Internal method to time travel a job to restart from a specific step. + * The job must be in completed, failed, or cancelled status. + * Resets the job and ancestor steps to active status, deletes subsequent steps, + * and preserves completed parallel siblings. + * + * Algorithm: + * 1. Validate job is in terminal state (completed/failed/cancelled) + * 2. Find the target step and all its ancestors (using parent_step_id) + * 3. 
Determine which steps to keep: + * - Steps completed BEFORE the target step (by created_at) + * - Branch siblings that are completed (independent) + * 4. Delete steps that should not be kept + * 5. Reset ancestor steps to active status (they need to re-run) + * 6. Reset the target step to active status + * 7. Reset job to created status + * + * @returns Promise resolving to `true` if time travel succeeded, `false` otherwise + */ + protected async _timeTravelJob({ jobId, stepId }: TimeTravelJobOptions): Promise { + const result = this._map( + await this.db.execute<{ success: boolean }>(sql` + WITH RECURSIVE + -- Lock and validate the job + locked_job AS ( + SELECT j.id + FROM ${this.tables.jobsTable} j + WHERE j.id = ${jobId} + AND j.status IN (${JOB_STATUS_COMPLETED}, ${JOB_STATUS_FAILED}, ${JOB_STATUS_CANCELLED}) + FOR UPDATE OF j + ), + -- Validate target step exists and belongs to job + target_step AS ( + SELECT s.id, s.parent_step_id, s.created_at + FROM ${this.tables.jobStepsTable} s + WHERE s.id = ${stepId} + AND s.job_id = ${jobId} + AND EXISTS (SELECT 1 FROM locked_job) + ), + -- Find all ancestor steps recursively (from target up to root) + ancestors AS ( + SELECT s.id, s.parent_step_id, 0 AS depth + FROM ${this.tables.jobStepsTable} s + WHERE s.id = (SELECT parent_step_id FROM target_step) + AND EXISTS (SELECT 1 FROM target_step) + UNION ALL + SELECT s.id, s.parent_step_id, a.depth + 1 + FROM ${this.tables.jobStepsTable} s + INNER JOIN ancestors a ON s.id = a.parent_step_id + ), + -- Steps to keep: completed steps created before target + completed parallel siblings of target and ancestors + their descendants + parallel_siblings AS ( + -- Completed parallel siblings of target step + SELECT s.id + FROM ${this.tables.jobStepsTable} s + CROSS JOIN target_step ts + WHERE s.job_id = ${jobId} + AND s.id != ts.id + AND s.branch = true + AND s.status = ${STEP_STATUS_COMPLETED} + AND ( + (s.parent_step_id IS NULL AND ts.parent_step_id IS NULL) + OR s.parent_step_id = ts.parent_step_id + ) + UNION + -- Completed parallel siblings of each ancestor + SELECT s.id + FROM ${this.tables.jobStepsTable} s + INNER JOIN ancestors a ON ( + (s.parent_step_id IS NULL AND a.parent_step_id IS NULL) + OR s.parent_step_id = a.parent_step_id + ) + WHERE s.job_id = ${jobId} + AND s.id NOT IN (SELECT id FROM ancestors) + AND s.branch = true + AND s.status = ${STEP_STATUS_COMPLETED} + ), + -- Find all descendants of parallel siblings (to keep their children too) + parallel_descendants AS ( + SELECT s.id + FROM ${this.tables.jobStepsTable} s + WHERE s.id IN (SELECT id FROM parallel_siblings) + UNION ALL + SELECT s.id + FROM ${this.tables.jobStepsTable} s + INNER JOIN parallel_descendants pd ON s.parent_step_id = pd.id + WHERE s.job_id = ${jobId} + ), + steps_to_keep AS ( + -- Steps created before target that are completed (non-ancestor, non-target) + SELECT s.id + FROM ${this.tables.jobStepsTable} s + CROSS JOIN target_step ts + WHERE s.job_id = ${jobId} + AND s.created_at < ts.created_at + AND s.status = ${STEP_STATUS_COMPLETED} + AND s.id NOT IN (SELECT id FROM ancestors) + AND s.id != ts.id + UNION + -- All parallel siblings and their descendants + SELECT id FROM parallel_descendants + ), + -- Calculate time offset: shift preserved steps to start from "now" + time_offset AS ( + SELECT + now() - MIN(s.started_at) AS offset_interval + FROM ${this.tables.jobStepsTable} s + WHERE s.id IN (SELECT id FROM steps_to_keep) + ), + -- Shift times of preserved steps to align with current time (only 
started_at/finished_at, NOT created_at to preserve ordering) + shift_preserved_times AS ( + UPDATE ${this.tables.jobStepsTable} + SET + started_at = started_at + (SELECT offset_interval FROM time_offset), + finished_at = CASE + WHEN finished_at IS NOT NULL + THEN finished_at + (SELECT offset_interval FROM time_offset) + ELSE NULL + END, + updated_at = now() + WHERE id IN (SELECT id FROM steps_to_keep) + AND (SELECT offset_interval FROM time_offset) IS NOT NULL + RETURNING id + ), + -- Delete steps that are not in the keep list and are not ancestors/target + deleted_steps AS ( + DELETE FROM ${this.tables.jobStepsTable} + WHERE job_id = ${jobId} + AND id NOT IN (SELECT id FROM steps_to_keep) + AND id NOT IN (SELECT id FROM ancestors) + AND id != (SELECT id FROM target_step) + RETURNING id + ), + -- Reset ancestor steps to active + reset_ancestors AS ( + UPDATE ${this.tables.jobStepsTable} + SET + status = ${STEP_STATUS_ACTIVE}, + output = NULL, + error = NULL, + finished_at = NULL, + started_at = now(), + expires_at = now() + (timeout_ms || ' milliseconds')::interval, + retries_count = 0, + delayed_ms = NULL, + history_failed_attempts = '{}'::jsonb, + updated_at = now() + WHERE id IN (SELECT id FROM ancestors) + RETURNING id + ), + -- Reset target step to active + reset_target AS ( + UPDATE ${this.tables.jobStepsTable} + SET + status = ${STEP_STATUS_ACTIVE}, + output = NULL, + error = NULL, + finished_at = NULL, + started_at = now(), + expires_at = now() + (timeout_ms || ' milliseconds')::interval, + retries_count = 0, + delayed_ms = NULL, + history_failed_attempts = '{}'::jsonb, + updated_at = now() + WHERE id = (SELECT id FROM target_step) + RETURNING id + ), + -- Reset job to created status + reset_job AS ( + UPDATE ${this.tables.jobsTable} + SET + status = ${JOB_STATUS_CREATED}, + output = NULL, + error = NULL, + started_at = NULL, + finished_at = NULL, + client_id = NULL, + expires_at = NULL, + updated_at = now() + WHERE id = ${jobId} + AND EXISTS (SELECT 1 FROM target_step) + RETURNING id + ) + SELECT EXISTS(SELECT 1 FROM reset_job) AS success + `), + ) + + return result.length > 0 && result[0]!.success === true + } + + /** + * Internal method to delete a job by its ID. + * Active jobs cannot be deleted. + * + * @returns Promise resolving to `true` if deleted, `false` otherwise + */ + protected async _deleteJob({ jobId }: DeleteJobOptions): Promise { + const result = await this.db + .delete(this.tables.jobsTable) + .where(and(eq(this.tables.jobsTable.id, jobId), ne(this.tables.jobsTable.status, JOB_STATUS_ACTIVE))) + .returning({ id: this.tables.jobsTable.id }) + + // Also delete associated steps + if (result.length > 0) { + await this.db.delete(this.tables.jobStepsTable).where(eq(this.tables.jobStepsTable.job_id, jobId)) + } + + return result.length > 0 + } + + /** + * Internal method to delete multiple jobs using the same filters as getJobs. + * Active jobs cannot be deleted and will be excluded from deletion. + * + * @returns Promise resolving to the number of jobs deleted + */ + protected async _deleteJobs(options?: DeleteJobsOptions): Promise { + const jobsTable = this.tables.jobsTable + const filters = options?.filters ?? {} + + const where = this._buildJobsWhereClause(filters) + + const result = await this.db.delete(jobsTable).where(where).returning({ id: jobsTable.id }) + + return result.length + } + + /** + * Internal method to fetch jobs from the database respecting concurrency limits per group. + * Uses the concurrency limit from the latest job created for each groupKey. 
+ * Uses advisory locks to ensure thread-safe job fetching. + * + * @returns Promise resolving to an array of fetched jobs + */ + protected async _fetch({ batch }: FetchOptions) { + const result = this._map( + await this.db.execute(sql` + WITH group_concurrency AS ( + -- Get the concurrency limit from the latest job for each group + SELECT DISTINCT ON (j.group_key, j.action_name) + j.group_key as group_key, + j.action_name as action_name, + j.concurrency_limit as concurrency_limit + FROM ${this.tables.jobsTable} j + WHERE j.group_key IS NOT NULL + AND (j.expires_at IS NULL OR j.expires_at > now()) + ORDER BY j.group_key, j.action_name, j.created_at DESC, j.id DESC + ), + eligible_groups AS ( + -- Find all groups with their active counts that are below their concurrency limit + SELECT + gc.group_key, + gc.action_name, + gc.concurrency_limit, + COUNT(*) FILTER (WHERE j.status = ${JOB_STATUS_ACTIVE}) as active_count + FROM group_concurrency gc + LEFT JOIN ${this.tables.jobsTable} j + ON j.group_key = gc.group_key + AND j.action_name = gc.action_name + AND (j.expires_at IS NULL OR j.expires_at > now()) + GROUP BY gc.group_key, gc.action_name, gc.concurrency_limit + HAVING COUNT(*) FILTER (WHERE j.status = ${JOB_STATUS_ACTIVE}) < gc.concurrency_limit + ), + candidate_jobs AS ( + -- Lock candidate jobs first (before applying window functions) + SELECT + j.id, + j.action_name, + j.group_key as job_group_key, + j.created_at + FROM ${this.tables.jobsTable} j + INNER JOIN eligible_groups eg + ON j.group_key = eg.group_key + AND j.action_name = eg.action_name + WHERE j.status = ${JOB_STATUS_CREATED} + FOR UPDATE OF j SKIP LOCKED + ), + ranked_jobs AS ( + -- Rank jobs within each group after locking + SELECT + cj.id, + cj.action_name, + cj.job_group_key, + cj.created_at, + ROW_NUMBER() OVER ( + PARTITION BY cj.job_group_key, cj.action_name + ORDER BY cj.created_at ASC, cj.id ASC + ) as job_rank + FROM candidate_jobs cj + ), + next_job AS ( + -- Select only jobs that fit within the concurrency limit per group + -- Ordered globally by created_at to respect job creation order + SELECT rj.id, rj.action_name, rj.job_group_key + FROM ranked_jobs rj + INNER JOIN eligible_groups eg + ON rj.job_group_key = eg.group_key + AND rj.action_name = eg.action_name + WHERE rj.job_rank <= (eg.concurrency_limit - eg.active_count) + ORDER BY rj.created_at ASC, rj.id ASC + LIMIT ${batch} + ), + verify_concurrency AS ( + -- Double-check concurrency limit after acquiring lock + SELECT + nj.id, + nj.action_name, + nj.job_group_key, + eg.concurrency_limit, + (SELECT COUNT(*) + FROM ${this.tables.jobsTable} + WHERE action_name = nj.action_name + AND group_key = nj.job_group_key + AND status = ${JOB_STATUS_ACTIVE}) as current_active + FROM next_job nj + INNER JOIN eligible_groups eg + ON nj.job_group_key = eg.group_key + AND nj.action_name = eg.action_name + ) + UPDATE ${this.tables.jobsTable} j + SET status = ${JOB_STATUS_ACTIVE}, + started_at = now(), + expires_at = now() + (timeout_ms || ' milliseconds')::interval, + client_id = ${this.id}, + updated_at = now() + FROM verify_concurrency vc + WHERE j.id = vc.id + AND vc.current_active < vc.concurrency_limit -- Final concurrency check using job's concurrency limit + RETURNING + j.id, + j.action_name as "actionName", + j.group_key as "groupKey", + j.description, + j.input, + j.output, + j.error, + j.status, + j.timeout_ms as "timeoutMs", + j.expires_at as "expiresAt", + j.started_at as "startedAt", + j.finished_at as "finishedAt", + j.created_at as "createdAt", + j.updated_at as 
"updatedAt", + j.concurrency_limit as "concurrencyLimit", + j.concurrency_step_limit as "concurrencyStepLimit" + `), + ) + + return result + } + + /** + * Internal method to recover stuck jobs (jobs that were active but the process that owned them is no longer running). + * In multi-process mode, pings other processes to check if they're alive before recovering their jobs. + * + * @returns Promise resolving to the number of jobs recovered + */ + protected async _recoverJobs(options: RecoverJobsOptions): Promise { + const { checksums, multiProcessMode = false, processTimeout = 5_000 } = options + + const unresponsiveClientIds: string[] = [this.id] + + if (multiProcessMode) { + const result = (await this.db + .selectDistinct({ + clientId: this.tables.jobsTable.client_id, + }) + .from(this.tables.jobsTable) + .where( + and(eq(this.tables.jobsTable.status, JOB_STATUS_ACTIVE), ne(this.tables.jobsTable.client_id, this.id)), + )) as unknown as { clientId: string }[] + + if (result.length > 0) { + const pongCount = new Set() + const { unlisten } = await this._listen(`pong-${this.id}`, (payload: string) => { + const toClientId = JSON.parse(payload).toClientId + pongCount.add(toClientId) + if (pongCount.size >= result.length) { + unlisten() + } + }) + + await Promise.all(result.map((row) => this._notify(`ping-${row.clientId}`, { fromClientId: this.id }))) + + let waitForSeconds = processTimeout / 1_000 + while (pongCount.size < result.length && waitForSeconds > 0) { + await new Promise((resolve) => setTimeout(resolve, 1000).unref?.()) + waitForSeconds-- + } + + unresponsiveClientIds.push(...result.filter((row) => !pongCount.has(row.clientId)).map((row) => row.clientId)) + } + } + + if (unresponsiveClientIds.length > 0) { + const result = this._map( + await this.db.execute<{ id: string }>(sql` + WITH locked_jobs AS ( + SELECT j.id + FROM ${this.tables.jobsTable} j + WHERE j.status = ${JOB_STATUS_ACTIVE} + AND j.client_id IN ${unresponsiveClientIds} + FOR UPDATE OF j SKIP LOCKED + ), + updated_jobs AS ( + UPDATE ${this.tables.jobsTable} j + SET status = ${JOB_STATUS_CREATED}, + started_at = NULL, + expires_at = NULL, + finished_at = NULL, + output = NULL, + error = NULL, + updated_at = now() + WHERE EXISTS (SELECT 1 FROM locked_jobs lj WHERE lj.id = j.id) + RETURNING id, checksum + ), + deleted_steps AS ( + DELETE FROM ${this.tables.jobStepsTable} s + WHERE EXISTS ( + SELECT 1 FROM updated_jobs uj + WHERE uj.id = s.job_id + AND uj.checksum NOT IN ${checksums} + ) + ) + SELECT id FROM updated_jobs + `), + ) + + return result.length + } + + return 0 + } + + // ============================================================================ + // Step Methods + // ============================================================================ + + /** + * Internal method to create or recover a job step by creating or resetting a step record in the database. 
+ * + * @returns Promise resolving to the step, or `null` if creation failed + */ + protected async _createOrRecoverJobStep({ + jobId, + name, + timeoutMs, + retriesLimit, + parentStepId, + parallel = false, + }: CreateOrRecoverJobStepOptions): Promise { + type StepResult = CreateOrRecoverJobStepResult + + const [result] = this._map( + await this.db.execute(sql` + WITH job_check AS ( + SELECT j.id + FROM ${this.tables.jobsTable} j + WHERE j.id = ${jobId} + AND j.status = ${JOB_STATUS_ACTIVE} + AND (j.expires_at IS NULL OR j.expires_at > now()) + ), + step_existed AS ( + SELECT EXISTS( + SELECT 1 FROM ${this.tables.jobStepsTable} s + WHERE s.job_id = ${jobId} + AND s.name = ${name} + AND s.parent_step_id IS NOT DISTINCT FROM ${parentStepId} + ) AS existed + ), + upserted_step AS ( + INSERT INTO ${this.tables.jobStepsTable} ( + job_id, + parent_step_id, + branch, + name, + timeout_ms, + retries_limit, + status, + started_at, + expires_at, + retries_count, + delayed_ms + ) + SELECT + ${jobId}, + ${parentStepId}, + ${parallel}, + ${name}, + ${timeoutMs}, + ${retriesLimit}, + ${STEP_STATUS_ACTIVE}, + now(), + now() + interval '${sql.raw(timeoutMs.toString())} milliseconds', + 0, + NULL + WHERE EXISTS (SELECT 1 FROM job_check) + ON CONFLICT (job_id, name, parent_step_id) DO UPDATE + SET + timeout_ms = ${timeoutMs}, + expires_at = now() + interval '${sql.raw(timeoutMs.toString())} milliseconds', + retries_count = 0, + retries_limit = ${retriesLimit}, + delayed_ms = NULL, + started_at = now(), + history_failed_attempts = '{}'::jsonb + WHERE ${this.tables.jobStepsTable}.status = ${STEP_STATUS_ACTIVE} + RETURNING + id, + status, + retries_limit AS "retriesLimit", + retries_count AS "retriesCount", + timeout_ms AS "timeoutMs", + error, + output + ), + final_upserted AS ( + SELECT + us.*, + CASE WHEN se.existed THEN false ELSE true END AS "isNew" + FROM upserted_step us + CROSS JOIN step_existed se + ), + existing_step AS ( + SELECT + s.id, + s.status, + s.retries_limit AS "retriesLimit", + s.retries_count AS "retriesCount", + s.timeout_ms AS "timeoutMs", + s.error, + s.output, + false AS "isNew" + FROM ${this.tables.jobStepsTable} s + INNER JOIN job_check jc ON s.job_id = jc.id + WHERE s.job_id = ${jobId} + AND s.name = ${name} + AND s.parent_step_id IS NOT DISTINCT FROM ${parentStepId} + AND NOT EXISTS (SELECT 1 FROM final_upserted) + ) + SELECT * FROM final_upserted + UNION ALL + SELECT * FROM existing_step + `), + ) + + if (!result) { + this.logger?.error({ jobId }, `[PostgresAdapter] Job ${jobId} is not active or has expired`) + return null + } + + return result + } + + /** + * Internal method to mark a job step as completed. + * + * @returns Promise resolving to `true` if completed, `false` otherwise + */ + protected async _completeJobStep({ stepId, output }: CompleteJobStepOptions) { + const result = await this.db + .update(this.tables.jobStepsTable) + .set({ + status: STEP_STATUS_COMPLETED, + output, + finished_at: sql`now()`, + updated_at: sql`now()`, + }) + .from(this.tables.jobsTable) + .where( + and( + eq(this.tables.jobStepsTable.job_id, this.tables.jobsTable.id), + eq(this.tables.jobStepsTable.id, stepId), + eq(this.tables.jobStepsTable.status, STEP_STATUS_ACTIVE), + eq(this.tables.jobsTable.status, JOB_STATUS_ACTIVE), + or(isNull(this.tables.jobsTable.expires_at), gt(this.tables.jobsTable.expires_at, sql`now()`)), + ), + ) + .returning({ id: this.tables.jobStepsTable.id }) + + return result.length > 0 + } + + /** + * Internal method to mark a job step as failed. 
+ * + * @returns Promise resolving to `true` if failed, `false` otherwise + */ + protected async _failJobStep({ stepId, error }: FailJobStepOptions) { + const result = await this.db + .update(this.tables.jobStepsTable) + .set({ + status: STEP_STATUS_FAILED, + error, + finished_at: sql`now()`, + updated_at: sql`now()`, + }) + .from(this.tables.jobsTable) + .where( + and( + eq(this.tables.jobStepsTable.job_id, this.tables.jobsTable.id), + eq(this.tables.jobStepsTable.id, stepId), + eq(this.tables.jobStepsTable.status, STEP_STATUS_ACTIVE), + eq(this.tables.jobsTable.status, JOB_STATUS_ACTIVE), + ), + ) + .returning({ id: this.tables.jobStepsTable.id }) + + return result.length > 0 + } + + /** + * Internal method to delay a job step. + * + * @returns Promise resolving to `true` if delayed, `false` otherwise + */ + protected async _delayJobStep({ stepId, delayMs, error }: DelayJobStepOptions) { + const jobStepsTable = this.tables.jobStepsTable + const jobsTable = this.tables.jobsTable + + const result = await this.db + .update(jobStepsTable) + .set({ + delayed_ms: delayMs, + retries_count: sql`${jobStepsTable.retries_count} + 1`, + expires_at: sql`now() + (${jobStepsTable.timeout_ms} || ' milliseconds')::interval + (${delayMs} || ' milliseconds')::interval`, + history_failed_attempts: sql`COALESCE(${jobStepsTable.history_failed_attempts}, '{}'::jsonb) || jsonb_build_object( + extract(epoch from now())::text, + jsonb_build_object( + 'failedAt', now(), + 'error', ${JSON.stringify(error)}::jsonb, + 'delayedMs', ${delayMs}::integer + ) + )`, + updated_at: sql`now()`, + }) + .from(jobsTable) + .where( + and( + eq(jobStepsTable.job_id, jobsTable.id), + eq(jobStepsTable.id, stepId), + eq(jobStepsTable.status, STEP_STATUS_ACTIVE), + eq(jobsTable.status, JOB_STATUS_ACTIVE), + ), + ) + .returning({ id: jobStepsTable.id }) + + return result.length > 0 + } + + /** + * Internal method to cancel a job step. + * + * @returns Promise resolving to `true` if cancelled, `false` otherwise + */ + protected async _cancelJobStep({ stepId }: CancelJobStepOptions) { + const result = await this.db + .update(this.tables.jobStepsTable) + .set({ + status: STEP_STATUS_CANCELLED, + finished_at: sql`now()`, + updated_at: sql`now()`, + }) + .from(this.tables.jobsTable) + .where( + and( + eq(this.tables.jobStepsTable.job_id, this.tables.jobsTable.id), + eq(this.tables.jobStepsTable.id, stepId), + eq(this.tables.jobStepsTable.status, STEP_STATUS_ACTIVE), + or( + eq(this.tables.jobsTable.status, JOB_STATUS_ACTIVE), + eq(this.tables.jobsTable.status, JOB_STATUS_CANCELLED), + ), + ), + ) + .returning({ id: this.tables.jobStepsTable.id }) + + return result.length > 0 + } + + // ============================================================================ + // Query Methods + // ============================================================================ + + /** + * Internal method to get a job by its ID. Does not include step information. 
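+   * `durationMs` is computed in SQL as
+   * `EXTRACT(EPOCH FROM (finished_at - started_at)) * 1000` and stays `NULL`
+   * until both timestamps are set, so callers never need to derive it
+   * client-side.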
+   */
+  protected async _getJobById(jobId: string): Promise<Job | null> {
+    const jobsTable = this.tables.jobsTable
+
+    // Calculate duration as a SQL expression (finishedAt - startedAt in milliseconds)
+    const durationMs = sql`
+      CASE
+        WHEN ${jobsTable.started_at} IS NOT NULL AND ${jobsTable.finished_at} IS NOT NULL
+        THEN EXTRACT(EPOCH FROM (${jobsTable.finished_at} - ${jobsTable.started_at})) * 1000
+        ELSE NULL
+      END
+    `.as('duration_ms')
+
+    const [job] = await this.db
+      .select({
+        id: jobsTable.id,
+        actionName: jobsTable.action_name,
+        groupKey: jobsTable.group_key,
+        description: jobsTable.description,
+        input: jobsTable.input,
+        output: jobsTable.output,
+        error: jobsTable.error,
+        status: jobsTable.status,
+        timeoutMs: jobsTable.timeout_ms,
+        expiresAt: jobsTable.expires_at,
+        startedAt: jobsTable.started_at,
+        finishedAt: jobsTable.finished_at,
+        createdAt: jobsTable.created_at,
+        updatedAt: jobsTable.updated_at,
+        concurrencyLimit: jobsTable.concurrency_limit,
+        concurrencyStepLimit: jobsTable.concurrency_step_limit,
+        clientId: jobsTable.client_id,
+        durationMs,
+      })
+      .from(jobsTable)
+      .where(eq(jobsTable.id, jobId))
+      .limit(1)
+
+    return job ?? null
+  }
+
+  /**
+   * Internal method to get all steps for a job with optional fuzzy search.
+   * Steps are always ordered by created_at ASC.
+   * Steps do not include output data.
+   */
+  protected async _getJobSteps(options: GetJobStepsOptions): Promise<GetJobStepsResult> {
+    const { jobId, search } = options
+
+    const jobStepsTable = this.tables.jobStepsTable
+
+    const fuzzySearch = search?.trim()
+
+    const where = and(
+      eq(jobStepsTable.job_id, jobId),
+      fuzzySearch && fuzzySearch.length > 0
+        ? or(
+            ilike(jobStepsTable.name, `%${fuzzySearch}%`),
+            sql`to_tsvector('english', ${jobStepsTable.output}::text) @@ plainto_tsquery('english', ${fuzzySearch})`,
+          )
+        : undefined,
+      options.updatedAfter
+        ? sql`date_trunc('milliseconds', ${jobStepsTable.updated_at}) > ${options.updatedAfter.toISOString()}::timestamptz`
+        : undefined,
+    )
+
+    const steps = await this.db
+      .select({
+        id: jobStepsTable.id,
+        jobId: jobStepsTable.job_id,
+        parentStepId: jobStepsTable.parent_step_id,
+        parallel: jobStepsTable.parallel,
+        name: jobStepsTable.name,
+        status: jobStepsTable.status,
+        error: jobStepsTable.error,
+        startedAt: jobStepsTable.started_at,
+        finishedAt: jobStepsTable.finished_at,
+        timeoutMs: jobStepsTable.timeout_ms,
+        expiresAt: jobStepsTable.expires_at,
+        retriesLimit: jobStepsTable.retries_limit,
+        retriesCount: jobStepsTable.retries_count,
+        delayedMs: jobStepsTable.delayed_ms,
+        historyFailedAttempts: jobStepsTable.history_failed_attempts,
+        createdAt: jobStepsTable.created_at,
+        updatedAt: jobStepsTable.updated_at,
+      })
+      .from(jobStepsTable)
+      .where(where)
+      .orderBy(asc(jobStepsTable.created_at))
+
+    return {
+      steps,
+      total: steps.length,
+    }
+  }
+
+  protected _buildJobsWhereClause(filters: GetJobsOptions['filters']) {
+    if (!filters) {
+      return undefined
+    }
+
+    const jobsTable = this.tables.jobsTable
+
+    const fuzzySearch = filters.search?.trim()
+
+    // Build WHERE clause parts using postgres template literals
+    return and(
+      filters.status
+        ? inArray(jobsTable.status, Array.isArray(filters.status) ? filters.status : [filters.status])
+        : undefined,
+      filters.actionName
+        ? inArray(jobsTable.action_name, Array.isArray(filters.actionName) ? filters.actionName : [filters.actionName])
+        : undefined,
+      filters.groupKey && Array.isArray(filters.groupKey)
+        ? // Bind the keys as a text[] parameter instead of splicing quoted strings
+          // into the SQL (avoids injection and quoting bugs for keys with quotes)
+          sql`${jobsTable.group_key} LIKE ANY(${filters.groupKey}::text[])`
+        : undefined,
+      filters.groupKey && !Array.isArray(filters.groupKey)
+        ? ilike(jobsTable.group_key, `%${filters.groupKey}%`)
+        : undefined,
+      filters.clientId
+        ? inArray(jobsTable.client_id, Array.isArray(filters.clientId) ? filters.clientId : [filters.clientId])
+        : undefined,
+      filters.description ? ilike(jobsTable.description, `%${filters.description}%`) : undefined,
+      filters.createdAt && Array.isArray(filters.createdAt)
+        ? between(
+            sql`date_trunc('second', ${jobsTable.created_at})`,
+            filters.createdAt[0]!.toISOString(),
+            filters.createdAt[1]!.toISOString(),
+          )
+        : undefined,
+      filters.createdAt && !Array.isArray(filters.createdAt)
+        ? gte(sql`date_trunc('second', ${jobsTable.created_at})`, filters.createdAt.toISOString())
+        : undefined,
+      filters.startedAt && Array.isArray(filters.startedAt)
+        ? between(
+            sql`date_trunc('second', ${jobsTable.started_at})`,
+            filters.startedAt[0]!.toISOString(),
+            filters.startedAt[1]!.toISOString(),
+          )
+        : undefined,
+      filters.startedAt && !Array.isArray(filters.startedAt)
+        ? gte(sql`date_trunc('second', ${jobsTable.started_at})`, filters.startedAt.toISOString())
+        : undefined,
+      filters.finishedAt && Array.isArray(filters.finishedAt)
+        ? between(
+            sql`date_trunc('second', ${jobsTable.finished_at})`,
+            filters.finishedAt[0]!.toISOString(),
+            filters.finishedAt[1]!.toISOString(),
+          )
+        : undefined,
+      filters.finishedAt && !Array.isArray(filters.finishedAt)
+        ? gte(sql`date_trunc('second', ${jobsTable.finished_at})`, filters.finishedAt.toISOString())
+        : undefined,
+      filters.updatedAfter
+        ? sql`date_trunc('milliseconds', ${jobsTable.updated_at}) > ${filters.updatedAfter.toISOString()}::timestamptz`
+        : undefined,
+      fuzzySearch && fuzzySearch.length > 0
+        ? or(
+            ilike(jobsTable.action_name, `%${fuzzySearch}%`),
+            ilike(jobsTable.group_key, `%${fuzzySearch}%`),
+            ilike(jobsTable.description, `%${fuzzySearch}%`),
+            ilike(jobsTable.client_id, `%${fuzzySearch}%`),
+            sql`${jobsTable.id}::text ilike ${`%${fuzzySearch}%`}`,
+            sql`to_tsvector('english', ${jobsTable.input}::text) @@ plainto_tsquery('english', ${fuzzySearch})`,
+            sql`to_tsvector('english', ${jobsTable.output}::text) @@ plainto_tsquery('english', ${fuzzySearch})`,
+          )
+        : undefined,
+      ...(filters.inputFilter && Object.keys(filters.inputFilter).length > 0
+        ? this.#buildJsonbWhereConditions(filters.inputFilter, jobsTable.input)
+        : []),
+      ...(filters.outputFilter && Object.keys(filters.outputFilter).length > 0
+        ? this.#buildJsonbWhereConditions(filters.outputFilter, jobsTable.output)
+        : []),
+    )
+  }
+
+  /**
+   * Internal method to get jobs with pagination, filtering, and sorting.
+   * Does not include step information or job output.
+   */
+  protected async _getJobs(options?: GetJobsOptions): Promise<GetJobsResult> {
+    const jobsTable = this.tables.jobsTable
+    const page = options?.page ?? 1
+    const pageSize = options?.pageSize ?? 10
+    const filters = options?.filters ?? {}
+
+    const sortInput = options?.sort ?? { field: 'startedAt', order: 'desc' }
+    const sorts = Array.isArray(sortInput) ?
sortInput : [sortInput] + + const where = this._buildJobsWhereClause(filters) + + // Get total count + const total = await this.db.$count(jobsTable, where) + if (!total) { + return { + jobs: [], + total: 0, + page, + pageSize, + } + } + + // Calculate duration as a SQL expression (finishedAt - startedAt in milliseconds) + const durationMs = sql` + CASE + WHEN ${jobsTable.started_at} IS NOT NULL AND ${jobsTable.finished_at} IS NOT NULL + THEN EXTRACT(EPOCH FROM (${jobsTable.finished_at} - ${jobsTable.started_at})) * 1000 + ELSE NULL + END + `.as('duration_ms') + + const sortFieldMap: Record = { + createdAt: jobsTable.created_at, + startedAt: jobsTable.started_at, + finishedAt: jobsTable.finished_at, + status: jobsTable.status, + actionName: jobsTable.action_name, + expiresAt: jobsTable.expires_at, + duration: durationMs, + description: jobsTable.description, + } + + const jobs = await this.db + .select({ + id: jobsTable.id, + actionName: jobsTable.action_name, + groupKey: jobsTable.group_key, + description: jobsTable.description, + input: jobsTable.input, + output: jobsTable.output, + error: jobsTable.error, + status: jobsTable.status, + timeoutMs: jobsTable.timeout_ms, + expiresAt: jobsTable.expires_at, + startedAt: jobsTable.started_at, + finishedAt: jobsTable.finished_at, + createdAt: jobsTable.created_at, + updatedAt: jobsTable.updated_at, + concurrencyLimit: jobsTable.concurrency_limit, + concurrencyStepLimit: jobsTable.concurrency_step_limit, + clientId: jobsTable.client_id, + durationMs, + }) + .from(jobsTable) + .where(where) + .orderBy( + ...sorts + .filter((sortItem) => sortItem.field in sortFieldMap) + .map((sortItem) => { + const sortField = sortFieldMap[sortItem.field] + if (sortItem.order.toUpperCase() === 'ASC') { + return asc(sortField) + } else { + return desc(sortField) + } + }), + ) + .limit(pageSize) + .offset((page - 1) * pageSize) + + return { + jobs, + total, + page, + pageSize, + } + } + + /** + * Internal method to get a step by its ID with all information. + */ + protected async _getJobStepById(stepId: string): Promise { + const [step] = await this.db + .select({ + id: this.tables.jobStepsTable.id, + jobId: this.tables.jobStepsTable.job_id, + parentStepId: this.tables.jobStepsTable.parent_step_id, + parallel: this.tables.jobStepsTable.parallel, + name: this.tables.jobStepsTable.name, + output: this.tables.jobStepsTable.output, + status: this.tables.jobStepsTable.status, + error: this.tables.jobStepsTable.error, + startedAt: this.tables.jobStepsTable.started_at, + finishedAt: this.tables.jobStepsTable.finished_at, + timeoutMs: this.tables.jobStepsTable.timeout_ms, + expiresAt: this.tables.jobStepsTable.expires_at, + retriesLimit: this.tables.jobStepsTable.retries_limit, + retriesCount: this.tables.jobStepsTable.retries_count, + delayedMs: this.tables.jobStepsTable.delayed_ms, + historyFailedAttempts: this.tables.jobStepsTable.history_failed_attempts, + createdAt: this.tables.jobStepsTable.created_at, + updatedAt: this.tables.jobStepsTable.updated_at, + }) + .from(this.tables.jobStepsTable) + .where(eq(this.tables.jobStepsTable.id, stepId)) + .limit(1) + + return step ?? null + } + + /** + * Internal method to get job status and updatedAt timestamp. + */ + protected async _getJobStatus(jobId: string): Promise { + const [job] = await this.db + .select({ + status: this.tables.jobsTable.status, + updatedAt: this.tables.jobsTable.updated_at, + }) + .from(this.tables.jobsTable) + .where(eq(this.tables.jobsTable.id, jobId)) + .limit(1) + + return job ?? 
null + } + + /** + * Internal method to get job step status and updatedAt timestamp. + */ + protected async _getJobStepStatus(stepId: string): Promise { + const [step] = await this.db + .select({ + status: this.tables.jobStepsTable.status, + updatedAt: this.tables.jobStepsTable.updated_at, + }) + .from(this.tables.jobStepsTable) + .where(eq(this.tables.jobStepsTable.id, stepId)) + .limit(1) + + return step ?? null + } + + /** + * Internal method to get action statistics including counts and last job created date. + */ + protected async _getActions(): Promise { + const actionStats = this.db.$with('action_stats').as( + this.db + .select({ + name: this.tables.jobsTable.action_name, + last_job_created: sql`MAX(${this.tables.jobsTable.created_at})`.as('last_job_created'), + active: sql`COUNT(*) FILTER (WHERE ${this.tables.jobsTable.status} = ${JOB_STATUS_ACTIVE})`.as( + 'active', + ), + completed: sql`COUNT(*) FILTER (WHERE ${this.tables.jobsTable.status} = ${JOB_STATUS_COMPLETED})`.as( + 'completed', + ), + failed: sql`COUNT(*) FILTER (WHERE ${this.tables.jobsTable.status} = ${JOB_STATUS_FAILED})`.as( + 'failed', + ), + cancelled: sql`COUNT(*) FILTER (WHERE ${this.tables.jobsTable.status} = ${JOB_STATUS_CANCELLED})`.as( + 'cancelled', + ), + }) + .from(this.tables.jobsTable) + .groupBy(this.tables.jobsTable.action_name), + ) + + const actions = await this.db + .with(actionStats) + .select({ + name: actionStats.name, + lastJobCreated: actionStats.last_job_created, + active: sql`${actionStats.active}::int`, + completed: sql`${actionStats.completed}::int`, + failed: sql`${actionStats.failed}::int`, + cancelled: sql`${actionStats.cancelled}::int`, + }) + .from(actionStats) + .orderBy(actionStats.name) + + return { + actions: actions.map((action) => ({ + ...action, + lastJobCreated: action.lastJobCreated ?? null, + })), + } + } + + // ============================================================================ + // Metrics Methods + // ============================================================================ + + /** + * Internal method to insert multiple span records in a single batch. + */ + protected async _insertSpans(spans: InsertSpanOptions[]): Promise { + if (spans.length === 0) { + return 0 + } + + const values = spans.map((s) => ({ + trace_id: s.traceId, + span_id: s.spanId, + parent_span_id: s.parentSpanId, + job_id: s.jobId, + step_id: s.stepId, + name: s.name, + kind: s.kind, + start_time_unix_nano: s.startTimeUnixNano, + end_time_unix_nano: s.endTimeUnixNano, + status_code: s.statusCode, + status_message: s.statusMessage, + attributes: s.attributes ?? {}, + events: s.events ?? [], + })) + + const result = await this.db + .insert(this.tables.spansTable) + .values(values) + .returning({ id: this.tables.spansTable.id }) + + return result.length + } + + /** + * Internal method to get spans for a job or step. + * For step queries, uses a recursive CTE to find all descendant spans. + */ + protected async _getSpans(options: GetSpansOptions): Promise { + const spansTable = this.tables.spansTable + const filters = options.filters ?? {} + + // Build sort + const sortInput = options.sort ?? { field: 'startTimeUnixNano', order: 'asc' } + const sortFieldMap: Record = { + name: 'name', + startTimeUnixNano: 'start_time_unix_nano', + endTimeUnixNano: 'end_time_unix_nano', + } + const sortField = sortFieldMap[sortInput.field] + const sortOrder = sortInput.order === 'asc' ? 
'ASC' : 'DESC' + + // For step queries, use a recursive CTE to get descendant spans + if (options.stepId) { + return this._getStepSpansRecursive(options.stepId, sortField, sortOrder, filters) + } + + // Build WHERE clause for job queries + const where = this._buildSpansWhereClause(options.jobId, undefined, filters) + + // Get total count + const total = await this.db.$count(spansTable, where) + if (!total) { + return { + spans: [], + total: 0, + } + } + + const sortFieldColumn = sortFieldMap[sortInput.field] + const orderByClause = + sortInput.order === 'asc' + ? asc(spansTable[sortFieldColumn as keyof typeof spansTable] as any) + : desc(spansTable[sortFieldColumn as keyof typeof spansTable] as any) + + const rows = await this.db + .select({ + id: spansTable.id, + traceId: spansTable.trace_id, + spanId: spansTable.span_id, + parentSpanId: spansTable.parent_span_id, + jobId: spansTable.job_id, + stepId: spansTable.step_id, + name: spansTable.name, + kind: spansTable.kind, + startTimeUnixNano: spansTable.start_time_unix_nano, + endTimeUnixNano: spansTable.end_time_unix_nano, + statusCode: spansTable.status_code, + statusMessage: spansTable.status_message, + attributes: spansTable.attributes, + events: spansTable.events, + }) + .from(spansTable) + .where(where) + .orderBy(orderByClause) + + // Cast kind and statusCode to proper types, convert BigInt to string for JSON serialization + const spans = rows.map((row) => ({ + ...row, + kind: row.kind as 0 | 1 | 2 | 3 | 4, + statusCode: row.statusCode as 0 | 1 | 2, + // Convert BigInt to string for JSON serialization + startTimeUnixNano: row.startTimeUnixNano?.toString() ?? null, + endTimeUnixNano: row.endTimeUnixNano?.toString() ?? null, + })) + + return { + spans, + total, + } + } + + /** + * Get spans for a step using a recursive CTE to traverse the span hierarchy. + * This returns the step's span and all its descendant spans (children, grandchildren, etc.) + */ + protected async _getStepSpansRecursive( + stepId: string, + sortField: string, + sortOrder: string, + _filters?: GetSpansOptions['filters'], + ): Promise { + const schemaName = this.schema + + // Use a recursive CTE to find all descendant spans + // 1. Base case: find the span with step_id = stepId + // 2. 
Recursive case: find all spans where parent_span_id = span_id of a span we've already found + const query = sql` + WITH RECURSIVE span_tree AS ( + -- Base case: the span(s) for the step + SELECT * FROM ${sql.identifier(schemaName)}.spans WHERE step_id = ${stepId}::uuid + UNION ALL + -- Recursive case: children of spans we've found + SELECT s.* FROM ${sql.identifier(schemaName)}.spans s + INNER JOIN span_tree st ON s.parent_span_id = st.span_id + ) + SELECT + id, + trace_id as "traceId", + span_id as "spanId", + parent_span_id as "parentSpanId", + job_id as "jobId", + step_id as "stepId", + name, + kind, + start_time_unix_nano as "startTimeUnixNano", + end_time_unix_nano as "endTimeUnixNano", + status_code as "statusCode", + status_message as "statusMessage", + attributes, + events + FROM span_tree + ORDER BY ${sql.identifier(sortField)} ${sql.raw(sortOrder)} + ` + + // Raw SQL returns numeric types as strings, so we type them as such + const rows = (await this.db.execute(query)) as unknown as Array<{ + id: string | number + traceId: string + spanId: string + parentSpanId: string | null + jobId: string | null + stepId: string | null + name: string + kind: string | number + startTimeUnixNano: string | bigint | null + endTimeUnixNano: string | bigint | null + statusCode: string | number + statusMessage: string | null + attributes: Record + events: Array<{ name: string; timeUnixNano: string; attributes?: Record }> + }> + + // Convert types: raw SQL returns numeric types as strings + const spans = rows.map((row) => ({ + ...row, + // Convert id to number (bigserial comes as string from raw SQL) + id: typeof row.id === 'string' ? Number.parseInt(row.id, 10) : row.id, + // Convert kind and statusCode to proper types + kind: (typeof row.kind === 'string' ? Number.parseInt(row.kind, 10) : row.kind) as 0 | 1 | 2 | 3 | 4, + statusCode: (typeof row.statusCode === 'string' ? Number.parseInt(row.statusCode, 10) : row.statusCode) as + | 0 + | 1 + | 2, + // Convert BigInt to string for JSON serialization + startTimeUnixNano: row.startTimeUnixNano?.toString() ?? null, + endTimeUnixNano: row.endTimeUnixNano?.toString() ?? null, + })) + + return { + spans, + total: spans.length, + } + } + + /** + * Internal method to delete all spans for a job. + */ + protected async _deleteSpans(options: DeleteSpansOptions): Promise { + const result = await this.db + .delete(this.tables.spansTable) + .where(eq(this.tables.spansTable.job_id, options.jobId)) + .returning({ id: this.tables.spansTable.id }) + + return result.length + } + + /** + * Build WHERE clause for spans queries (used for job queries only). + * When querying by jobId, we find all spans that share the same trace_id + * as spans with that job. This includes spans from external libraries that + * don't have the duron.job.id attribute but are part of the same trace. + * + * Note: Step queries are handled separately by _getStepSpansRecursive using + * a recursive CTE to traverse the span hierarchy. 
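+   *
+   * Conceptually, the jobId condition expands to
+   * `trace_id IN (SELECT trace_id FROM spans WHERE job_id = $1)` (illustrative
+   * SQL; the table is schema-qualified in practice), which is what lets spans
+   * created by external libraries in the same trace show up in job-level
+   * queries.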
+   */
+  protected _buildSpansWhereClause(jobId?: string, _stepId?: string, filters?: GetSpansOptions['filters']) {
+    const spansTable = this.tables.spansTable
+
+    // Build condition for finding spans by trace_id (includes external spans)
+    let traceCondition: ReturnType<typeof inArray> | undefined
+
+    if (jobId) {
+      // Find all spans that share a trace_id with any span that has this job_id
+      // This includes external spans (like from AI SDK) that don't have duron.job.id
+      traceCondition = inArray(
+        spansTable.trace_id,
+        this.db.select({ traceId: spansTable.trace_id }).from(spansTable).where(eq(spansTable.job_id, jobId)),
+      )
+    }
+
+    return and(
+      traceCondition,
+      filters?.name
+        ? Array.isArray(filters.name)
+          ? or(...filters.name.map((n) => ilike(spansTable.name, `%${n}%`)))
+          : ilike(spansTable.name, `%${filters.name}%`)
+        : undefined,
+      filters?.kind ? inArray(spansTable.kind, Array.isArray(filters.kind) ? filters.kind : [filters.kind]) : undefined,
+      filters?.statusCode
+        ? inArray(spansTable.status_code, Array.isArray(filters.statusCode) ? filters.statusCode : [filters.statusCode])
+        : undefined,
+      filters?.traceId ? eq(spansTable.trace_id, filters.traceId) : undefined,
+      ...(filters?.attributesFilter && Object.keys(filters.attributesFilter).length > 0
+        ? this.#buildJsonbWhereConditions(filters.attributesFilter, spansTable.attributes)
+        : []),
+    )
+  }
+
+  // ============================================================================
+  // Private Methods
+  // ============================================================================
+
+  /**
+   * Build WHERE conditions for JSONB filter using individual property checks.
+   * Each property becomes a separate condition using ->> operator and ILIKE for case-insensitive matching.
+   * Supports nested properties via dot notation and arrays.
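+   * Each dot-notation path part is bound as a parameter to `->` / `->>` (both
+   * operators accept a text parameter on the right-hand side), so keys are
+   * never spliced into the SQL string.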
+ * + * Example: + * { "email": "tincho@gmail", "address.name": "nicolas", "products": ["chicle"] } + * Generates: + * input ->> 'email' ILIKE '%tincho@gmail%' + * AND input ->> 'address' ->> 'name' ILIKE '%nicolas%' + * AND EXISTS (SELECT 1 FROM jsonb_array_elements_text(input -> 'products') AS elem WHERE LOWER(elem) ILIKE LOWER('%chicle%')) + * + * @param filter - Flat record with dot-notation keys (e.g., { "email": "test", "address.name": "value", "products": ["chicle"] }) + * @param jsonbColumn - The JSONB column name + * @returns Array of SQL conditions + */ + #buildJsonbWhereConditions(filter: Record, jsonbColumn: PgColumn): any[] { + const conditions: any[] = [] + + for (const [key, value] of Object.entries(filter)) { + const parts = key.split('.').filter((p) => p.length > 0) + if (parts.length === 0) { + continue + } + + // Build the JSONB path expression step by step + // For "address.name": input -> 'address' ->> 'name' (-> for intermediate, ->> for final) + // For "email": input ->> 'email' (->> for single level) + let jsonbPath = sql`${jsonbColumn}` + if (parts.length === 1) { + // Single level: use ->> directly + jsonbPath = sql`${jsonbPath} ->> ${parts[0]!}` + } else { + // Nested: use -> for intermediate steps, ->> for final step + for (let i = 0; i < parts.length - 1; i++) { + const part = parts[i] + if (part) { + jsonbPath = sql`${jsonbPath} -> ${part}` + } + } + const lastPart = parts[parts.length - 1] + if (lastPart) { + jsonbPath = sql`${jsonbPath} ->> ${lastPart}` + } + } + + // Handle array values - check if JSONB array contains at least one of the values + if (Array.isArray(value)) { + // Build condition: check if any element in the JSONB array matches any value in the filter array + const arrayValueConditions = value.map((arrayValue) => { + const arrayValueStr = String(arrayValue) + // Get the array from JSONB: input -> 'products' + let arrayPath = sql`${jsonbColumn}` + for (let i = 0; i < parts.length - 1; i++) { + const part = parts[i] + if (part) { + arrayPath = sql`${arrayPath} -> ${part}` + } + } + const lastPart = parts[parts.length - 1] + if (lastPart) { + arrayPath = sql`${arrayPath} -> ${lastPart}` + } + + // Check if the JSONB array contains the value (case-insensitive for strings) + if (typeof arrayValue === 'string') { + return sql`EXISTS ( + SELECT 1 + FROM jsonb_array_elements_text(${arrayPath}) AS elem + WHERE LOWER(elem) ILIKE LOWER(${`%${arrayValueStr}%`}) + )` + } else { + // For non-string values, use exact containment + return sql`${arrayPath} @> ${sql.raw(JSON.stringify([arrayValue]))}::jsonb` + } + }) + + // Combine array conditions with OR (at least one must match) + if (arrayValueConditions.length > 0) { + conditions.push( + arrayValueConditions.reduce((acc, condition, idx) => (idx === 0 ? condition : sql`${acc} OR ${condition}`)), + ) + } + } else if (typeof value === 'string') { + // String values: use ILIKE for case-insensitive partial matching + conditions.push(sql`COALESCE(${jsonbPath}, '') ILIKE ${`%${value}%`}`) + } else { + // Non-string, non-array values: use exact match + // Convert JSONB value to text for comparison + conditions.push(sql`${jsonbPath}::text = ${String(value)}`) + } + } + + return conditions + } + + // ============================================================================ + // Protected Methods + // ============================================================================ + + /** + * Send a PostgreSQL notification. 
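+   * The base implementation is a no-op; concrete adapters are expected to
+   * override this (together with `_listen`) with real NOTIFY/LISTEN plumbing,
+   * since `_recoverJobs` relies on it to ping peers on `ping-<peerId>` and
+   * await replies on `pong-<ownId>` in multi-process mode.
+   *
+   * A minimal override sketch (assuming a node-postgres style `this.pool`,
+   * which is not part of this base class):
+   *
+   * ```ts
+   * // inside a concrete adapter subclass that owns a node-postgres pool
+   * protected override async _notify(event: string, data: any): Promise<void> {
+   *   // pg_notify(channel, payload) delivers to every LISTENing connection
+   *   await this.pool.query('SELECT pg_notify($1, $2)', [event, JSON.stringify(data)])
+   * }
+   * ```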
+ * + * @param event - The event name + * @param data - The data to send + * @returns Promise resolving to `void` + */ + protected async _notify(_event: string, _data: any): Promise { + // do nothing + } + + /** + * Listen for PostgreSQL notifications. + * + * @param event - The event name to listen for + * @param callback - Callback function to handle notifications + * @returns Promise resolving to an object with an `unlisten` function + */ + protected async _listen(_event: string, _callback: (payload: string) => void): Promise<{ unlisten: () => void }> { + // do nothing + return { + unlisten: () => { + // do nothing + }, + } + } + + /** + * Map database query results to the expected format. + * Can be overridden by subclasses to handle different result formats. + * + * @param result - The raw database query result + * @returns The mapped result + */ + protected _map(result: any) { + return result + } +} diff --git a/packages/duron/src/adapters/postgres/schema.default.ts b/packages/duron/src/adapters/postgres/schema.default.ts index 1edd23f..eb094cd 100644 --- a/packages/duron/src/adapters/postgres/schema.default.ts +++ b/packages/duron/src/adapters/postgres/schema.default.ts @@ -1,5 +1,6 @@ import createSchema from './schema.js' -const { schema, jobsTable, jobStepsTable, spansTable } = createSchema('duron') +const { schema, jobsActiveTable, jobsArchiveTable, jobStepsActiveTable, jobStepsArchiveTable, spansTable } = + createSchema('duron') -export { schema, jobsTable, jobStepsTable, spansTable } +export { schema, jobsActiveTable, jobsArchiveTable, jobStepsActiveTable, jobStepsArchiveTable, spansTable } diff --git a/packages/duron/src/adapters/postgres/schema.ts b/packages/duron/src/adapters/postgres/schema.ts index 7a8fe07..2c6a623 100644 --- a/packages/duron/src/adapters/postgres/schema.ts +++ b/packages/duron/src/adapters/postgres/schema.ts @@ -20,8 +20,12 @@ import type { SerializableError } from '../../errors.js' export default function createSchema(schemaName: string) { const schema = pgSchema(schemaName) - const jobsTable = schema.table( - 'jobs', + // ============================================================================ + // Active Tables (Hot Path) + // ============================================================================ + + const jobsActiveTable = schema.table( + 'jobs_active', { id: uuid('id').primaryKey().defaultRandom(), action_name: text('action_name').notNull(), @@ -52,39 +56,38 @@ export default function createSchema(schemaName: string) { }, (table) => [ // Single column indexes - index('idx_jobs_action_name').on(table.action_name), - index('idx_jobs_status').on(table.status), - index('idx_jobs_group_key').on(table.group_key), - index('idx_jobs_description').on(table.description), - index('idx_jobs_started_at').on(table.started_at), - index('idx_jobs_finished_at').on(table.finished_at), - index('idx_jobs_expires_at').on(table.expires_at), - index('idx_jobs_client_id').on(table.client_id), - index('idx_jobs_checksum').on(table.checksum), - index('idx_jobs_concurrency_limit').on(table.concurrency_limit), - index('idx_jobs_concurrency_step_limit').on(table.concurrency_step_limit), + index('idx_jobs_active_action_name').on(table.action_name), + index('idx_jobs_active_status').on(table.status), + index('idx_jobs_active_group_key').on(table.group_key), + index('idx_jobs_active_description').on(table.description), + index('idx_jobs_active_started_at').on(table.started_at), + index('idx_jobs_active_expires_at').on(table.expires_at), + 
index('idx_jobs_active_client_id').on(table.client_id), + index('idx_jobs_active_checksum').on(table.checksum), + index('idx_jobs_active_concurrency_limit').on(table.concurrency_limit), + index('idx_jobs_active_concurrency_step_limit').on(table.concurrency_step_limit), // Composite indexes - index('idx_jobs_action_status').on(table.action_name, table.status), - index('idx_jobs_action_group').on(table.action_name, table.group_key), + index('idx_jobs_active_action_status').on(table.action_name, table.status), + index('idx_jobs_active_action_group').on(table.action_name, table.group_key), // GIN indexes for full-text search - index('idx_jobs_input_fts').using('gin', sql`to_tsvector('english', ${table.input}::text)`), - index('idx_jobs_output_fts').using('gin', sql`to_tsvector('english', ${table.output}::text)`), + index('idx_jobs_active_input_fts').using('gin', sql`to_tsvector('english', ${table.input}::text)`), + index('idx_jobs_active_output_fts').using('gin', sql`to_tsvector('english', ${table.output}::text)`), check( - 'jobs_status_check', + 'jobs_active_status_check', sql`${table.status} IN ${sql.raw(`(${JOB_STATUSES.map((s) => `'${s}'`).join(',')})`)}`, ), ], ) - const jobStepsTable = schema.table( - 'job_steps', + const jobStepsActiveTable = schema.table( + 'job_steps_active', { id: uuid('id').primaryKey().defaultRandom(), job_id: uuid('job_id') .notNull() - .references(() => jobsTable.id, { onDelete: 'cascade' }), + .references(() => jobsActiveTable.id, { onDelete: 'cascade' }), parent_step_id: uuid('parent_step_id'), - parallel: boolean('branch').notNull().default(false), // DB column is 'branch', TypeScript uses 'parallel' + parallel: boolean('branch').notNull().default(false), name: text('name').notNull(), status: text('status').$type().notNull().default(STEP_STATUS_ACTIVE), output: jsonb('output'), @@ -113,55 +116,41 @@ export default function createSchema(schemaName: string) { }, (table) => [ // Single column indexes - index('idx_job_steps_job_id').on(table.job_id), - index('idx_job_steps_status').on(table.status), - index('idx_job_steps_name').on(table.name), - index('idx_job_steps_expires_at').on(table.expires_at), - index('idx_job_steps_parent_step_id').on(table.parent_step_id), + index('idx_job_steps_active_job_id').on(table.job_id), + index('idx_job_steps_active_status').on(table.status), + index('idx_job_steps_active_name').on(table.name), + index('idx_job_steps_active_expires_at').on(table.expires_at), + index('idx_job_steps_active_parent_step_id').on(table.parent_step_id), // Composite indexes - index('idx_job_steps_job_status').on(table.job_id, table.status), - index('idx_job_steps_job_name').on(table.job_id, table.name), - index('idx_job_steps_output_fts').using('gin', sql`to_tsvector('english', ${table.output}::text)`), - // Unique constraint - step name is unique within a parent (name + parentStepId) - // nullsNotDistinct ensures NULL parent_step_id values are treated as equal for uniqueness - unique('unique_job_step_name_parent') + index('idx_job_steps_active_job_status').on(table.job_id, table.status), + index('idx_job_steps_active_job_name').on(table.job_id, table.name), + index('idx_job_steps_active_output_fts').using('gin', sql`to_tsvector('english', ${table.output}::text)`), + // Unique constraint + unique('unique_job_step_active_name_parent') .on(table.job_id, table.name, table.parent_step_id) .nullsNotDistinct(), check( - 'job_steps_status_check', + 'job_steps_active_status_check', sql`${table.status} IN ${sql.raw(`(${STEP_STATUSES.map((s) => 
`'${s}'`).join(',')})`)}`, ), ], ) - /** - * OpenTelemetry spans table. - * Stores span data exported by PostgresSpanExporter. - * - * SpanKind values: 0=INTERNAL, 1=SERVER, 2=CLIENT, 3=PRODUCER, 4=CONSUMER - * StatusCode values: 0=UNSET, 1=OK, 2=ERROR - */ const spansTable = schema.table( 'spans', { id: bigserial('id', { mode: 'number' }).primaryKey(), - // OpenTelemetry span identifiers - trace_id: text('trace_id').notNull(), // 32-char hex - span_id: text('span_id').notNull(), // 16-char hex - parent_span_id: text('parent_span_id'), // 16-char hex, null for root spans - // Duron-specific references (extracted from span attributes) - job_id: uuid('job_id').references(() => jobsTable.id, { onDelete: 'cascade' }), - step_id: uuid('step_id').references(() => jobStepsTable.id, { onDelete: 'cascade' }), - // Span metadata + trace_id: text('trace_id').notNull(), + span_id: text('span_id').notNull(), + parent_span_id: text('parent_span_id'), + job_id: uuid('job_id'), + step_id: uuid('step_id'), name: text('name').notNull(), - kind: integer('kind').notNull().default(0), // SpanKind enum - // Timing (stored as nanoseconds since epoch for precision) + kind: integer('kind').notNull().default(0), start_time_unix_nano: bigint('start_time_unix_nano', { mode: 'bigint' }).notNull(), end_time_unix_nano: bigint('end_time_unix_nano', { mode: 'bigint' }), - // Status - status_code: integer('status_code').notNull().default(0), // SpanStatusCode enum + status_code: integer('status_code').notNull().default(0), status_message: text('status_message'), - // Span data attributes: jsonb('attributes').$type>().notNull().default({}), events: jsonb('events') .$type }>>() @@ -180,7 +169,7 @@ export default function createSchema(schemaName: string) { // Composite indexes index('idx_spans_job_step').on(table.job_id, table.step_id), index('idx_spans_trace_parent').on(table.trace_id, table.parent_span_id), - // GIN indexes for JSONB querying + // GIN indexes index('idx_spans_attributes').using('gin', table.attributes), index('idx_spans_events').using('gin', table.events), // Constraints @@ -189,10 +178,96 @@ export default function createSchema(schemaName: string) { ], ) + // ============================================================================ + // Archive Tables (Terminated Work) + // ============================================================================ + + const jobsArchiveTable = schema.table( + 'jobs_archive', + { + id: uuid('id').primaryKey(), + action_name: text('action_name').notNull(), + group_key: text('group_key').notNull(), + description: text('description'), + status: text('status').$type().notNull(), + checksum: text('checksum').notNull(), + input: jsonb('input').notNull().default({}), + output: jsonb('output'), + error: jsonb('error').$type(), + timeout_ms: integer('timeout_ms').notNull(), + expires_at: timestamp('expires_at', { withTimezone: true }), + started_at: timestamp('started_at', { withTimezone: true }), + finished_at: timestamp('finished_at', { withTimezone: true }), + client_id: text('client_id'), + concurrency_limit: integer('concurrency_limit').notNull(), + concurrency_step_limit: integer('concurrency_step_limit').notNull(), + created_at: timestamp('created_at', { withTimezone: true }).notNull().defaultNow(), + updated_at: timestamp('updated_at', { withTimezone: true }).notNull().defaultNow(), + }, + (table) => [ + // Lookup indexes + index('idx_jobs_archive_group_key').on(table.group_key), + index('idx_jobs_archive_action_name').on(table.action_name), + 
index('idx_jobs_archive_finished_at').on(table.finished_at), + // Composite indexes + index('idx_jobs_archive_action_group').on(table.action_name, table.group_key), + // GIN indexes for full-text search (dashboard search) + index('idx_jobs_archive_input_fts').using('gin', sql`to_tsvector('english', ${table.input}::text)`), + index('idx_jobs_archive_output_fts').using('gin', sql`to_tsvector('english', ${table.output}::text)`), + check( + 'jobs_archive_status_check', + sql`${table.status} IN ${sql.raw(`(${JOB_STATUSES.map((s) => `'${s}'`).join(',')})`)}`, + ), + ], + ) + + const jobStepsArchiveTable = schema.table( + 'job_steps_archive', + { + id: uuid('id').primaryKey(), + job_id: uuid('job_id') + .notNull() + .references(() => jobsArchiveTable.id, { onDelete: 'cascade' }), + parent_step_id: uuid('parent_step_id'), + parallel: boolean('branch').notNull().default(false), + name: text('name').notNull(), + status: text('status').$type().notNull().default(STEP_STATUS_ACTIVE), + output: jsonb('output'), + error: jsonb('error').$type(), + started_at: timestamp('started_at', { withTimezone: true }).notNull().defaultNow(), + finished_at: timestamp('finished_at', { withTimezone: true }), + timeout_ms: integer('timeout_ms').notNull(), + expires_at: timestamp('expires_at', { withTimezone: true }), + retries_limit: integer('retries_limit').notNull().default(0), + retries_count: integer('retries_count').notNull().default(0), + delayed_ms: integer('delayed_ms'), + history_failed_attempts: jsonb('history_failed_attempts') + .$type>() + .notNull() + .default({}), + created_at: timestamp('created_at', { withTimezone: true }).notNull().defaultNow(), + updated_at: timestamp('updated_at', { withTimezone: true }).notNull().defaultNow(), + // Denormalized for easier time-based pruning + job_finished_at: timestamp('job_finished_at', { withTimezone: true }), + }, + (table) => [ + // Minimal indexes + index('idx_job_steps_archive_job_id').on(table.job_id), + index('idx_job_steps_archive_job_finished_at').on(table.job_finished_at), + index('idx_job_steps_archive_name').on(table.name), + check( + 'job_steps_archive_status_check', + sql`${table.status} IN ${sql.raw(`(${STEP_STATUSES.map((s) => `'${s}'`).join(',')})`)}`, + ), + ], + ) + return { schema, - jobsTable, - jobStepsTable, + jobsActiveTable, + jobsArchiveTable, + jobStepsActiveTable, + jobStepsArchiveTable, spansTable, } } diff --git a/packages/duron/src/adapters/schemas.ts b/packages/duron/src/adapters/schemas.ts index 98a36bb..13c1932 100644 --- a/packages/duron/src/adapters/schemas.ts +++ b/packages/duron/src/adapters/schemas.ts @@ -404,6 +404,25 @@ export const DeleteSpansOptionsSchema = z.object({ jobId: z.string(), }) +// ============================================================================ +// Archive Schemas +// ============================================================================ + +export const PruneArchiveOptionsSchema = z.object({ + olderThan: z.union([z.string(), z.date(), z.number()]), + batchSize: z.number().optional(), + maxBatches: z.number().optional(), +}) + +export const ArchiveStatsSchema = z.object({ + jobsCount: z.number(), + stepsCount: z.number(), + spansCount: z.number(), + oldestJobDate: z.date().nullable(), + totalSizeBytes: z.number().nullable(), + lastPrunedAt: z.date().nullable(), +}) + // ============================================================================ // Type Exports // ============================================================================ @@ -450,3 +469,5 @@ export type InsertSpanOptions = 
z.infer<typeof InsertSpanOptionsSchema>
 export type GetSpansOptions = z.infer<typeof GetSpansOptionsSchema>
 export type GetSpansResult = z.infer<typeof GetSpansResultSchema>
 export type DeleteSpansOptions = z.infer<typeof DeleteSpansOptionsSchema>
+export type PruneArchiveOptions = z.infer<typeof PruneArchiveOptionsSchema>
+export type ArchiveStats = z.infer<typeof ArchiveStatsSchema>
diff --git a/packages/duron/src/client.ts b/packages/duron/src/client.ts
index 64bf5b1..f7e9693 100644
--- a/packages/duron/src/client.ts
+++ b/packages/duron/src/client.ts
@@ -11,6 +11,7 @@ import type { Action, ConcurrencyHandlerContext } from './action.js'
 import { ActionManager } from './action-manager.js'
 import type {
   Adapter,
+  ArchiveStats,
   GetActionsResult,
   GetJobStepsOptions,
   GetJobStepsResult,
@@ -20,6 +21,7 @@ import type {
   GetSpansResult,
   Job,
   JobStep,
+  PruneArchiveOptions,
 } from './adapters/adapter.js'
 import type { JobStatusResult, JobStepStatusResult } from './adapters/schemas.js'
 import { JOB_STATUS_CANCELLED, JOB_STATUS_COMPLETED, JOB_STATUS_FAILED, type JobStatus } from './constants.js'
@@ -1072,6 +1074,42 @@ export class Client<
     })
   }
 
+  // ============================================================================
+  // Archive Methods
+  // ============================================================================
+
+  /**
+   * Get archive statistics including counts and oldest job date.
+   *
+   * @returns Promise resolving to archive statistics
+   */
+  async getArchiveStats(): Promise<ArchiveStats> {
+    await this.start()
+    return this.#database.getArchiveStats()
+  }
+
+  /**
+   * Prune archived jobs older than the specified threshold.
+   *
+   * @param options - Prune options including olderThan, batchSize, maxBatches
+   * @returns Promise resolving to number of deleted jobs
+   */
+  async pruneArchive(options: PruneArchiveOptions): Promise<number> {
+    await this.start()
+    return this.#database.pruneArchive(options)
+  }
+
+  /**
+   * Truncate all archive data (jobs, steps, spans).
+   * This is a destructive operation - use with caution.
+   *
+   * @returns Promise resolving when complete
+   */
+  async truncateArchive(): Promise<void> {
+    await this.start()
+    return this.#database.truncateArchive()
+  }
+
   // ============================================================================
   // Lifecycle Methods
   // ============================================================================
diff --git a/packages/duron/src/server.ts b/packages/duron/src/server.ts
index 780608b..3d5ce6e 100644
--- a/packages/duron/src/server.ts
+++ b/packages/duron/src/server.ts
@@ -668,6 +668,76 @@ export function createServer

({ client, prefix, login, spansEna auth: true, }, ) + .get( + '/archive/stats', + async () => { + return client.getArchiveStats() + }, + { + response: { + 200: z.object({ + jobsCount: z.number(), + stepsCount: z.number(), + spansCount: z.number(), + oldestJobDate: z.date().nullable(), + totalSizeBytes: z.number().nullable(), + lastPrunedAt: z.date().nullable(), + }), + 400: ErrorResponseSchema, + 500: ErrorResponseSchema, + 401: ErrorResponseSchema, + }, + auth: true, + }, + ) + .post( + '/archive/prune', + async ({ body }) => { + const deleted = await client.pruneArchive(body) + return { deletedJobs: deleted } + }, + { + body: z.object({ + olderThan: z.union([z.string(), z.coerce.date(), z.number()]), + batchSize: z.number().optional(), + maxBatches: z.number().optional(), + }), + response: { + 200: z.object({ + deletedJobs: z.number(), + }), + 400: ErrorResponseSchema, + 500: ErrorResponseSchema, + 401: ErrorResponseSchema, + }, + auth: true, + }, + ) + .post( + '/archive/truncate', + async ({ body }) => { + const { confirm } = body + if (!confirm) { + throw new Error('Confirmation required. Set confirm: true to truncate all archive data.') + } + await client.truncateArchive() + return { success: true } + }, + { + body: z.object({ + confirm: z.boolean(), + }), + response: { + 200: z.object({ + success: z.boolean(), + }), + 400: ErrorResponseSchema, + 500: ErrorResponseSchema, + 401: ErrorResponseSchema, + }, + auth: true, + }, + ) .get( '/config', async () => { diff --git a/packages/duron/src/step-manager.ts b/packages/duron/src/step-manager.ts index 3d06b84..bb0eaab 100644 --- a/packages/duron/src/step-manager.ts +++ b/packages/duron/src/step-manager.ts @@ -75,6 +75,10 @@ function injectParentSpan(ctx: Context, parentSpan: Span | null): Context { * to the current job/step trace hierarchy. */ function createContextAwareTracer(tracer: Tracer, parentSpan: Span | null): Tracer { + // Extract duron.job.id and duron.step.id from parent span attributes for propagation + const parentJobId = parentSpan ? (parentSpan as any).attributes?.['duron.job.id'] : undefined + const parentStepId = parentSpan ? (parentSpan as any).attributes?.['duron.step.id'] : undefined + return { startSpan(name: string, options?: SpanOptions, ctx?: Context): Span { // Always inject our parent span into the context, regardless of what context is passed. @@ -83,7 +87,13 @@ function createContextAwareTracer(tracer: Tracer, parentSpan: Span | null): Trac // would otherwise create orphan spans. const baseContext = ctx ?? context.active() const effectiveContext = injectParentSpan(baseContext, parentSpan) - return tracer.startSpan(name, options, effectiveContext) + // Propagate duron.job.id and duron.step.id so spans can be queried by job + const attributes = { + ...(parentJobId ? { 'duron.job.id': parentJobId } : {}), + ...(parentStepId ? { 'duron.step.id': parentStepId } : {}), + ...options?.attributes, + } + return tracer.startSpan(name, { ...options, attributes }, effectiveContext) }, // startActiveSpan has multiple overloads, we need to handle them all startActiveSpan unknown>( @@ -123,6 +133,10 @@ function createContextAwareTracer(tracer: Tracer, parentSpan: Span | null): Trac * Create a TelemetryContext that wraps an OTel span. */ function createTelemetryContext(span: Span | null, tracer: Tracer): TelemetryContext { + // Extract duron.job.id and duron.step.id from parent span attributes for propagation + const parentJobId = span ? (span as any).attributes?.['duron.job.id'] : undefined + const parentStepId = span ? 
(span as any).attributes?.['duron.step.id'] : undefined + return { getActiveSpan(): Span | undefined { return span ?? undefined @@ -133,8 +147,14 @@ function createTelemetryContext(span: Span | null, tracer: Tracer): TelemetryCon }, startSpan(name: string, options?: { attributes?: Record }): Span { // Create a child span linked to the current span (job or step) + // Propagate duron.job.id and duron.step.id from parent so spans can be queried by job + const attributes = { + ...(parentJobId ? { 'duron.job.id': parentJobId } : {}), + ...(parentStepId ? { 'duron.step.id': parentStepId } : {}), + ...options?.attributes, + } const parentContext = span ? trace.setSpan(context.active(), span) : context.active() - return tracer.startSpan(name, { attributes: options?.attributes }, parentContext) + return tracer.startSpan(name, { attributes }, parentContext) }, recordMetric(name: string, value: number, attributes?: Record): void { if (span) { diff --git a/packages/duron/test/archive.test.ts b/packages/duron/test/archive.test.ts new file mode 100644 index 0000000..6cdf493 --- /dev/null +++ b/packages/duron/test/archive.test.ts @@ -0,0 +1,321 @@ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test' + +import { + JOB_STATUS_ACTIVE, + JOB_STATUS_CANCELLED, + JOB_STATUS_COMPLETED, + JOB_STATUS_CREATED, + JOB_STATUS_FAILED, +} from '../src/constants.js' +import { type Adapter, type AdapterFactory, pgliteFactory, postgresFactory } from './adapters.js' +import { expectToBeDefined } from './asserts.js' + +function runArchiveTests(adapterFactory: AdapterFactory) { + describe(`Archive Tests with ${adapterFactory.name}`, () => { + let adapter: Adapter + let deleteDb: () => Promise + + beforeEach( + async () => { + const adapterInstance = await adapterFactory.create() + adapter = adapterInstance.adapter + deleteDb = adapterInstance.deleteDb + adapter.setId('test-adapter') + await adapter.start() + }, + { + timeout: 60_000, + }, + ) + + afterEach(async () => { + if (adapter) { + await adapter.stop() + } + if (deleteDb) { + await deleteDb() + } + }) + + it('should archive completed job and query it', async () => { + const jobId = await adapter.createJob({ + queue: 'test-action', + groupKey: 'test-group', + input: { value: 42 }, + timeoutMs: 10000, + checksum: 'abc123', + concurrencyLimit: 10, + concurrencyStepLimit: 10, + }) + expectToBeDefined(jobId) + + // Fetch to activate + const fetched = await adapter.fetch({ batch: 10 }) + expect(fetched.length).toBe(1) + expect(fetched[0]?.status).toBe(JOB_STATUS_ACTIVE) + + // Complete the job + const completed = await adapter.completeJob({ jobId, output: { result: 'done' } }) + expect(completed).toBe(true) + + // Should find in archive via getJobById + const job = await adapter.getJobById(jobId) + expectToBeDefined(job) + expect(job.status).toBe(JOB_STATUS_COMPLETED) + expect(job.output).toEqual({ result: 'done' }) + }) + + it('should archive failed job', async () => { + const jobId = await adapter.createJob({ + queue: 'test-action', + groupKey: 'test-group', + input: {}, + timeoutMs: 10000, + checksum: 'abc123', + concurrencyLimit: 10, + concurrencyStepLimit: 10, + }) + expectToBeDefined(jobId) + + const fetched = await adapter.fetch({ batch: 10 }) + expect(fetched.length).toBe(1) + + const failed = await adapter.failJob({ + jobId, + error: { name: 'Error', message: 'Test failure', stack: '' }, + }) + expect(failed).toBe(true) + + const job = await adapter.getJobById(jobId) + expectToBeDefined(job) + expect(job.status).toBe(JOB_STATUS_FAILED) + }) + + 
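+    // Unlike the complete/fail cases above, the cancellation test below never
+    // fetches the job first: cancelJob can archive a job straight from 'created'.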
it('should archive cancelled job', async () => { + const jobId = await adapter.createJob({ + queue: 'test-action', + groupKey: 'test-group', + input: {}, + timeoutMs: 10000, + checksum: 'abc123', + concurrencyLimit: 10, + concurrencyStepLimit: 10, + }) + expectToBeDefined(jobId) + + const cancelled = await adapter.cancelJob({ jobId }) + expect(cancelled).toBe(true) + + const job = await adapter.getJobById(jobId) + expectToBeDefined(job) + expect(job.status).toBe(JOB_STATUS_CANCELLED) + }) + + it('should get archive stats', async () => { + // Create and complete 2 jobs + for (let i = 0; i < 2; i++) { + const jobId = await adapter.createJob({ + queue: `test-action-${i}`, + groupKey: 'test-group', + input: {}, + timeoutMs: 10000, + checksum: `abc${i}`, + concurrencyLimit: 10, + concurrencyStepLimit: 10, + }) + expectToBeDefined(jobId) + + const fetched = await adapter.fetch({ batch: 10 }) + expect(fetched.length).toBeGreaterThan(0) + + await adapter.completeJob({ jobId, output: {} }) + } + + const stats = await adapter.getArchiveStats() + expect(stats.jobsCount).toBe(2) + expect(stats.oldestJobDate).not.toBeNull() + }) + + it('should prune old jobs', async () => { + const jobId = await adapter.createJob({ + queue: 'test-action', + groupKey: 'test-group', + input: {}, + timeoutMs: 10000, + checksum: 'abc123', + concurrencyLimit: 10, + concurrencyStepLimit: 10, + }) + expectToBeDefined(jobId) + + const fetched = await adapter.fetch({ batch: 10 }) + expect(fetched.length).toBeGreaterThan(0) + + await adapter.completeJob({ jobId, output: {} }) + + // Verify in archive + let stats = await adapter.getArchiveStats() + expect(stats.jobsCount).toBe(1) + + // Wait a tiny bit + await new Promise((resolve) => setTimeout(resolve, 50)) + + // Prune with old threshold + const deleted = await adapter.pruneArchive({ + olderThan: '1ms', + batchSize: 100, + maxBatches: 1, + }) + expect(deleted).toBe(1) + + stats = await adapter.getArchiveStats() + expect(stats.jobsCount).toBe(0) + }) + + it('should not prune recent jobs', async () => { + const jobId = await adapter.createJob({ + queue: 'test-action', + groupKey: 'test-group', + input: {}, + timeoutMs: 10000, + checksum: 'abc123', + concurrencyLimit: 10, + concurrencyStepLimit: 10, + }) + expectToBeDefined(jobId) + + const fetched = await adapter.fetch({ batch: 10 }) + expect(fetched.length).toBeGreaterThan(0) + + await adapter.completeJob({ jobId, output: {} }) + + const deleted = await adapter.pruneArchive({ + olderThan: '7d', + batchSize: 100, + maxBatches: 1, + }) + expect(deleted).toBe(0) + + const stats = await adapter.getArchiveStats() + expect(stats.jobsCount).toBe(1) + }) + + it('should truncate archive', async () => { + for (let i = 0; i < 3; i++) { + const jobId = await adapter.createJob({ + queue: `test-action-${i}`, + groupKey: 'test-group', + input: {}, + timeoutMs: 10000, + checksum: `abc${i}`, + concurrencyLimit: 10, + concurrencyStepLimit: 10, + }) + expectToBeDefined(jobId) + + const fetched = await adapter.fetch({ batch: 10 }) + expect(fetched.length).toBeGreaterThan(0) + + await adapter.completeJob({ jobId, output: {} }) + } + + let stats = await adapter.getArchiveStats() + expect(stats.jobsCount).toBe(3) + + await adapter.truncateArchive() + + stats = await adapter.getArchiveStats() + expect(stats.jobsCount).toBe(0) + expect(stats.stepsCount).toBe(0) + expect(stats.spansCount).toBe(0) + }) + + it('should query archived job by status filter', async () => { + const activeJobId = await adapter.createJob({ + queue: 'test-action', + groupKey: 
'test-group', + input: {}, + timeoutMs: 10000, + checksum: 'active', + concurrencyLimit: 10, + concurrencyStepLimit: 10, + }) + expectToBeDefined(activeJobId) + + const completedJobId = await adapter.createJob({ + queue: 'test-action', + groupKey: 'test-group', + input: {}, + timeoutMs: 10000, + checksum: 'completed', + concurrencyLimit: 10, + concurrencyStepLimit: 10, + }) + expectToBeDefined(completedJobId) + + const fetched = await adapter.fetch({ batch: 10 }) + expect(fetched.length).toBe(2) + + await adapter.completeJob({ jobId: completedJobId, output: {} }) + + const activeJobs = await adapter.getJobs({ + filters: { status: JOB_STATUS_ACTIVE }, + }) + expect(activeJobs.jobs.length).toBe(1) + expect(activeJobs.jobs[0]?.id).toBe(activeJobId) + + const completedJobs = await adapter.getJobs({ + filters: { status: JOB_STATUS_COMPLETED }, + }) + expect(completedJobs.jobs.length).toBe(1) + expect(completedJobs.jobs[0]?.id).toBe(completedJobId) + }) + + it('should restore archived job for time travel', async () => { + const jobId = await adapter.createJob({ + queue: 'test-action', + groupKey: 'test-group', + input: {}, + timeoutMs: 10000, + checksum: 'abc123', + concurrencyLimit: 10, + concurrencyStepLimit: 10, + }) + expectToBeDefined(jobId) + + const fetched = await adapter.fetch({ batch: 10 }) + expect(fetched.length).toBeGreaterThan(0) + + const step = await adapter.createOrRecoverJobStep({ + jobId, + name: 'test-step', + timeoutMs: 10000, + retriesLimit: 0, + }) + expectToBeDefined(step) + + await adapter.completeJobStep({ + stepId: step.id, + output: { done: true }, + }) + + await adapter.completeJob({ jobId, output: { result: 'done' } }) + + // Verify archived + let job = await adapter.getJobById(jobId) + expect(job?.status).toBe(JOB_STATUS_COMPLETED) + + // Time travel + const success = await adapter.timeTravelJob({ jobId, stepId: step.id }) + expect(success).toBe(true) + + // Should be restored + job = await adapter.getJobById(jobId) + expectToBeDefined(job) + expect(job.status).toBe(JOB_STATUS_CREATED) + }) + }) +} + +runArchiveTests(pgliteFactory) +runArchiveTests(postgresFactory) diff --git a/packages/duron/test/process-order.test.ts b/packages/duron/test/process-order.test.ts new file mode 100644 index 0000000..a6bf68a --- /dev/null +++ b/packages/duron/test/process-order.test.ts @@ -0,0 +1,387 @@ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test' + +import { z } from 'zod' + +import { defineAction } from '../src/action.js' +import { Client } from '../src/client.js' +import { JOB_STATUS_COMPLETED } from '../src/constants.js' +import { pgliteFactory } from './adapters.js' + +// Test version of processOrder without AI dependency +const testProcessOrder = defineAction()({ + name: 'processOrder', + input: z.object({ + orderId: z.string().min(1), + customerId: z.string().min(1), + items: z + .array( + z.object({ + productId: z.string(), + quantity: z.number().min(1), + price: z.number().min(0), + }), + ) + .min(1), + paymentMethod: z.enum(['credit_card', 'paypal', 'bank_transfer']).default('credit_card'), + shippingAddress: z.object({ + street: z.string(), + city: z.string(), + country: z.string(), + postalCode: z.string(), + }), + }), + output: z.object({ + orderId: z.string(), + status: z.enum(['completed', 'failed']), + transactionId: z.string().nullable(), + shipmentId: z.string().nullable(), + timeline: z.array( + z.object({ + step: z.string(), + status: z.enum(['success', 'failed']), + timestamp: z.string(), + details: z.string().optional(), + }), + ), + 
+
+// Test version of processOrder without AI dependency
+const testProcessOrder = defineAction()({
+  name: 'processOrder',
+  input: z.object({
+    orderId: z.string().min(1),
+    customerId: z.string().min(1),
+    items: z
+      .array(
+        z.object({
+          productId: z.string(),
+          quantity: z.number().min(1),
+          price: z.number().min(0),
+        }),
+      )
+      .min(1),
+    paymentMethod: z.enum(['credit_card', 'paypal', 'bank_transfer']).default('credit_card'),
+    shippingAddress: z.object({
+      street: z.string(),
+      city: z.string(),
+      country: z.string(),
+      postalCode: z.string(),
+    }),
+  }),
+  output: z.object({
+    orderId: z.string(),
+    status: z.enum(['completed', 'failed']),
+    transactionId: z.string().nullable(),
+    shipmentId: z.string().nullable(),
+    timeline: z.array(
+      z.object({
+        step: z.string(),
+        status: z.enum(['success', 'failed']),
+        timestamp: z.string(),
+        details: z.string().optional(),
+      }),
+    ),
+  }),
+  steps: {
+    concurrency: 10,
+    retry: {
+      limit: 1,
+    },
+  },
+  handler: async (ctx) => {
+    const { orderId, customerId, items, shippingAddress } = ctx.input
+    const timeline: Array<{
+      step: string
+      status: 'success' | 'failed'
+      timestamp: string
+      details?: string
+    }> = []
+    const totalAmount = items.reduce((sum, item) => sum + item.price * item.quantity, 0)
+
+    const addTimeline = (step: string, status: 'success' | 'failed', details?: string) => {
+      timeline.push({ step, status, timestamp: new Date().toISOString(), details })
+    }
+
+    // Step 1: Validate Order
+    const validation = await ctx.step('validate-order', async ({ step: nestedStep }) => {
+      const inventoryCheck = await nestedStep('check-inventory', async () => {
+        const allInStock = items.every((item) => item.quantity <= 10)
+        addTimeline('check-inventory', allInStock ? 'success' : 'failed', `Checked ${items.length} items`)
+        return { allInStock, checkedItems: items.length }
+      })
+
+      const customerVerification = await nestedStep('verify-customer', async () => {
+        await new Promise((resolve) => setTimeout(resolve, 50))
+        const isValid = customerId.length > 0
+        addTimeline('verify-customer', isValid ? 'success' : 'failed', `Customer: ${customerId}`)
+        return { isValid, customerId }
+      })
+
+      addTimeline(
+        'validate-order',
+        inventoryCheck.allInStock && customerVerification.isValid ? 'success' : 'failed',
+        `Inventory: ${inventoryCheck.allInStock}, Customer: ${customerVerification.isValid}`,
+      )
+
+      return {
+        isValid: inventoryCheck.allInStock && customerVerification.isValid,
+        inventoryCheck,
+        customerVerification,
+      }
+    })
+
+    if (!validation.isValid) {
+      return {
+        orderId,
+        status: 'failed' as const,
+        transactionId: null,
+        shipmentId: null,
+        timeline,
+      }
+    }
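+
+    // The payment step nests three levels deep (process-payment >
+    // authorize-payment > fraud-check) and sets an explicit expiry on the
+    // outermost step via { expire: 60_000 } (presumably milliseconds).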
+
+    // Step 2: Process Payment
+    const payment = await ctx.step(
+      'process-payment',
+      async ({ step: paymentStep }) => {
+        const authorization = await paymentStep('authorize-payment', async ({ step: authStep }) => {
+          const fraudCheck = await authStep('fraud-check', async () => {
+            await new Promise((resolve) => setTimeout(resolve, 50))
+            const isSafe = totalAmount < 10000
+            addTimeline('fraud-check', isSafe ? 'success' : 'failed', `Amount: $${totalAmount.toFixed(2)}`)
+            return { isSafe, riskScore: isSafe ? 0.1 : 0.9 }
+          })
+
+          if (!fraudCheck.isSafe) {
+            addTimeline('authorize-payment', 'failed', 'Fraud check failed')
+            return { authorized: false, authCode: null, fraudCheck }
+          }
+
+          await new Promise((resolve) => setTimeout(resolve, 50))
+          const authCode = `AUTH-${Date.now()}`
+          addTimeline('authorize-payment', 'success', `Auth code: ${authCode}`)
+          return { authorized: true, authCode, fraudCheck }
+        })
+
+        if (!authorization.authorized) {
+          addTimeline('process-payment', 'failed', 'Authorization failed')
+          return { success: false, transactionId: null, authorization }
+        }
+
+        const capture = await paymentStep('capture-payment', async () => {
+          await new Promise((resolve) => setTimeout(resolve, 50))
+          const transactionId = `TXN-${Date.now()}`
+          addTimeline('capture-payment', 'success', `Transaction: ${transactionId}`)
+          return { captured: true, transactionId }
+        })
+
+        addTimeline('process-payment', 'success', `Transaction ID: ${capture.transactionId}`)
+        return {
+          success: true,
+          transactionId: capture.transactionId,
+          authorization,
+        }
+      },
+      { expire: 60_000 },
+    )
+
+    if (!payment.success) {
+      return {
+        orderId,
+        status: 'failed' as const,
+        transactionId: null,
+        shipmentId: null,
+        timeline,
+      }
+    }
+
+    // Step 3: Fulfill Order
+    const fulfillment = await ctx.step('fulfill-order', async ({ step: fulfillStep }) => {
+      const reservation = await fulfillStep('reserve-inventory', async () => {
+        await new Promise((resolve) => setTimeout(resolve, 50))
+        const reservationId = `RES-${Date.now()}`
+        addTimeline('reserve-inventory', 'success', `Reserved ${items.length} items`)
+        return { reserved: true, reservationId }
+      })
+
+      const shipment = await fulfillStep('create-shipment', async () => {
+        await new Promise((resolve) => setTimeout(resolve, 50))
+        const shipmentId = `SHIP-${Date.now()}`
+        addTimeline('create-shipment', 'success', `Shipment to ${shippingAddress.city}`)
+        return { shipmentId, carrier: 'FastShip', estimatedDays: 3 }
+      })
+
+      addTimeline('fulfill-order', 'success', `Shipment: ${shipment.shipmentId}`)
+      return { reservation, shipment }
+    })
+
+    // Step 4: Send Notifications
+    await ctx.step('send-notifications', async ({ step: notifyStep }) => {
+      const [emailResult, smsResult] = await Promise.all([
+        notifyStep('email-confirmation', async () => {
+          await new Promise((resolve) => setTimeout(resolve, 50))
+          addTimeline('email-confirmation', 'success', `Sent to customer ${customerId}`)
+          return { sent: true, type: 'email' }
+        }),
+        notifyStep('sms-notification', async () => {
+          await new Promise((resolve) => setTimeout(resolve, 50))
+          addTimeline('sms-notification', 'success', 'Order confirmation SMS sent')
+          return { sent: true, type: 'sms' }
+        }),
+      ])
+
+      addTimeline('send-notifications', 'success', `Email: ${emailResult.sent}, SMS: ${smsResult.sent}`)
+      return { email: emailResult, sms: smsResult }
+    })
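+
+    // The three sibling steps below are each declared with { parallel: true }
+    // and awaited together via Promise.all, so their timeline entries may
+    // interleave; only presence (not ordering) is asserted across siblings.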
+
+    // Step 5: Post-Order Processing (Promise.all of steps)
+    await ctx.step('post-order-processing', async (ctx) => {
+      await Promise.all([
+        ctx.step(
+          'analytics-tracking',
+          async ({ step: analyticsStep }) => {
+            const purchase = await analyticsStep('track-purchase', async () => {
+              await new Promise((resolve) => setTimeout(resolve, 50))
+              addTimeline('track-purchase', 'success', `Tracked order ${orderId}`)
+              return { eventId: `EVT-${Date.now()}`, type: 'purchase' }
+            })
+
+            const recommendations = await analyticsStep('update-recommendations', async () => {
+              await new Promise((resolve) => setTimeout(resolve, 50))
+              addTimeline('update-recommendations', 'success', `Updated for ${items.length} products`)
+              return { updated: true, productsAnalyzed: items.length }
+            })
+
+            addTimeline('analytics-tracking', 'success', 'Analytics updated')
+            return { purchase, recommendations }
+          },
+          { parallel: true },
+        ),
+
+        ctx.step(
+          'loyalty-update',
+          async ({ step: loyaltyStep }) => {
+            const points = await loyaltyStep('calculate-points', async () => {
+              await new Promise((resolve) => setTimeout(resolve, 50))
+              const earnedPoints = Math.floor(totalAmount * 10)
+              addTimeline('calculate-points', 'success', `Earned ${earnedPoints} points`)
+              return { earnedPoints, multiplier: 1.0 }
+            })
+
+            const tier = await loyaltyStep('update-tier', async () => {
+              await new Promise((resolve) => setTimeout(resolve, 50))
+              const newTier = totalAmount > 500 ? 'gold' : totalAmount > 100 ? 'silver' : 'bronze'
+              addTimeline('update-tier', 'success', `Tier: ${newTier}`)
+              return { tier: newTier, upgraded: totalAmount > 500 }
+            })
+
+            addTimeline('loyalty-update', 'success', `${points.earnedPoints} points, tier: ${tier.tier}`)
+            return { points, tier }
+          },
+          { parallel: true },
+        ),
+
+        ctx.step(
+          'partner-sync',
+          async ({ step: syncStep }) => {
+            const supplier = await syncStep('sync-supplier', async () => {
+              await new Promise((resolve) => setTimeout(resolve, 50))
+              addTimeline('sync-supplier', 'success', 'Supplier inventory updated')
+              return { synced: true, supplierId: 'SUP-001' }
+            })
+
+            const warehouse = await syncStep('sync-warehouse', async () => {
+              await new Promise((resolve) => setTimeout(resolve, 50))
+              addTimeline('sync-warehouse', 'success', 'Warehouse notified for picking')
+              return { synced: true, warehouseId: 'WH-MAIN' }
+            })
+
+            addTimeline('partner-sync', 'success', 'All partners synced')
+            return { supplier, warehouse }
+          },
+          { parallel: true },
+        ),
+      ])
+
+      return { success: true }
+    })
+
+    return {
+      orderId,
+      status: 'completed' as const,
+      transactionId: payment.transactionId,
+      shipmentId: fulfillment.shipment.shipmentId,
+      timeline,
+    }
+  },
+})
+
+const actions = {
+  processOrder: testProcessOrder,
+}
+
+describe('processOrder Action', () => {
+  let client: Client
+
+  beforeEach(async () => {
+    const { adapter } = await pgliteFactory.create()
+    adapter.setId('test-adapter')
+    await adapter.start()
+
+    client = new Client({
+      id: 'test-client',
+      database: adapter,
+      actions,
+    })
+
+    await client.start()
+  })
+
+  afterEach(async () => {
+    if (client) {
+      await client.stop()
+    }
+  })
+
+  it('should process order successfully', async () => {
+    const result = await client.runActionAndWait('processOrder', {
+      orderId: 'ORD-123',
+      customerId: 'CUST-456',
+      items: [
+        { productId: 'PROD-1', quantity: 2, price: 29.99 },
+        { productId: 'PROD-2', quantity: 1, price: 49.99 },
+      ],
+      paymentMethod: 'credit_card',
+      shippingAddress: {
+        street: '123 Main St',
+        city: 'New York',
+        country: 'USA',
+        postalCode: '10001',
+      },
+    })
+
+    expect(result.status).toBe(JOB_STATUS_COMPLETED)
+    expect(result.output.status).toBe('completed')
+    expect(result.output.orderId).toBe('ORD-123')
+    expect(result.output.transactionId).not.toBeNull()
+    expect(result.output.shipmentId).not.toBeNull()
+    expect(result.output.timeline.length).toBeGreaterThan(0)
+  })
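+
+  // addTimeline runs inside each step closure, so child entries (e.g.
+  // check-inventory) land in the timeline before their parent's entry
+  // (validate-order); the assertions below only check presence.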
+
+  it('should have correct timeline entries', async () => {
+    const result = await client.runActionAndWait('processOrder', {
+      orderId: 'ORD-456',
+      customerId: 'CUST-789',
+      items: [{ productId: 'PROD-3', quantity: 1, price: 99.99 }],
+      paymentMethod: 'paypal',
+      shippingAddress: {
+        street: '456 Oak Ave',
+        city: 'Los Angeles',
+        country: 'USA',
+        postalCode: '90001',
+      },
+    })
+
+    expect(result.output.timeline).toBeDefined()
+    expect(result.output.timeline.length).toBeGreaterThanOrEqual(10)
+
+    const steps = result.output.timeline.map((t: { step: string }) => t.step)
+    expect(steps).toContain('check-inventory')
+    expect(steps).toContain('verify-customer')
+    expect(steps).toContain('validate-order')
+    expect(steps).toContain('fraud-check')
+    expect(steps).toContain('authorize-payment')
+    expect(steps).toContain('capture-payment')
+    expect(steps).toContain('process-payment')
+    expect(steps).toContain('reserve-inventory')
+    expect(steps).toContain('create-shipment')
+    expect(steps).toContain('fulfill-order')
+    expect(steps).toContain('email-confirmation')
+    expect(steps).toContain('sms-notification')
+    expect(steps).toContain('send-notifications')
+  })
+
+  it('should fail when inventory is not available', async () => {
+    const result = await client.runActionAndWait('processOrder', {
+      orderId: 'ORD-789',
+      customerId: 'CUST-999',
+      items: [
+        { productId: 'PROD-4', quantity: 20, price: 10 }, // quantity > 10 should fail
+      ],
+      paymentMethod: 'credit_card',
+      shippingAddress: {
+        street: '789 Pine Rd',
+        city: 'Chicago',
+        country: 'USA',
+        postalCode: '60001',
+      },
+    })
+
+    expect(result.status).toBe(JOB_STATUS_COMPLETED)
+    expect(result.output.status).toBe('failed')
+    expect(result.output.transactionId).toBeNull()
+    expect(result.output.shipmentId).toBeNull()
+  })
+})
diff --git a/packages/shared-actions/package.json b/packages/shared-actions/package.json
index f4f138f..1858bf9 100644
--- a/packages/shared-actions/package.json
+++ b/packages/shared-actions/package.json
@@ -8,7 +8,8 @@
     "zod": "^4.1.12"
   },
   "devDependencies": {
-    "@types/bun": "latest"
+    "@types/bun": "latest",
+    "duron": "workspace:*"
   },
   "peerDependencies": {
     "typescript": "^5"