diff --git a/charts/teachlink-backend/Chart.yaml b/charts/teachlink-backend/Chart.yaml new file mode 100644 index 00000000..dcf6ede0 --- /dev/null +++ b/charts/teachlink-backend/Chart.yaml @@ -0,0 +1,17 @@ +apiVersion: v2 +name: teachlink-backend +description: Helm chart for the TeachLink backend API service +type: application +version: 0.1.0 +appVersion: "1.0.0" + +keywords: + - teachlink + - backend + - nestjs + +maintainers: + - name: rinafcode + url: https://github.com/rinafcode + +dependencies: [] diff --git a/charts/teachlink-backend/templates/_helpers.tpl b/charts/teachlink-backend/templates/_helpers.tpl new file mode 100644 index 00000000..c6287902 --- /dev/null +++ b/charts/teachlink-backend/templates/_helpers.tpl @@ -0,0 +1,49 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "teachlink-backend.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +*/}} +{{- define "teachlink-backend.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "teachlink-backend.labels" -}} +helm.sh/chart: {{ include "teachlink-backend.chart" . }} +{{ include "teachlink-backend.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "teachlink-backend.selectorLabels" -}} +app.kubernetes.io/name: {{ include "teachlink-backend.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Chart label +*/}} +{{- define "teachlink-backend.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} diff --git a/charts/teachlink-backend/templates/prometheus-rules.yaml b/charts/teachlink-backend/templates/prometheus-rules.yaml new file mode 100644 index 00000000..bf6f6439 --- /dev/null +++ b/charts/teachlink-backend/templates/prometheus-rules.yaml @@ -0,0 +1,117 @@ +{{- if .Values.prometheusRule.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "teachlink-backend.fullname" . }}-alerts + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/name: {{ include "teachlink-backend.name" . }} + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + helm.sh/chart: {{ include "teachlink-backend.chart" . }} + {{- if .Values.prometheusRule.additionalLabels }} + {{- toYaml .Values.prometheusRule.additionalLabels | nindent 4 }} + {{- end }} +spec: + groups: + # ────────────────────────────────────────────────────────────────────────── + # Group: SLA — fires when the service breaches contractual error/latency SLAs + # ────────────────────────────────────────────────────────────────────────── + - name: teachlink.sla + interval: 30s + rules: + + # HighErrorRate + # Fires when the 5xx fraction of requests over a 5-minute window exceeds + # thresholds.errorRatePct (a fraction; default 0.01 = 1 %). Requires the "http_requests_total" + # counter with a "status_code" label emitted by the NestJS Prometheus interceptor. + - alert: HighErrorRate + expr: | + ( + sum(rate(http_requests_total{ + job="{{ .Values.prometheusRule.jobLabel }}", + status_code=~"5.." 
+ }[5m])) + / + sum(rate(http_requests_total{ + job="{{ .Values.prometheusRule.jobLabel }}" + }[5m])) + ) > {{ .Values.prometheusRule.thresholds.errorRatePct | default 0.01 }} + for: 5m + labels: + severity: critical + team: backend + service: teachlink-backend + annotations: + summary: "High 5xx error rate on TeachLink backend" + description: "The 5xx error rate has exceeded the SLA threshold of {{ .Values.prometheusRule.thresholds.errorRatePct | default 0.01 }} (as a fraction of all requests). See runbook for triage steps." + runbook_url: "{{ .Values.prometheusRule.runbookBaseUrl }}/RUNBOOKS.md#higherrorrate" + + # HighP99Latency + # Fires when the 99th-percentile request latency exceeds + # thresholds.p99LatencySeconds (default 1 s) for 10 consecutive minutes. Requires the + # "http_request_duration_seconds" histogram emitted by the NestJS metrics module. + - alert: HighP99Latency + expr: | + histogram_quantile( + 0.99, + sum by (le) ( + rate(http_request_duration_seconds_bucket{ + job="{{ .Values.prometheusRule.jobLabel }}" + }[5m]) + ) + ) > {{ .Values.prometheusRule.thresholds.p99LatencySeconds | default 1.0 }} + for: 10m + labels: + severity: warning + team: backend + service: teachlink-backend + annotations: + summary: "P99 request latency exceeds {{ .Values.prometheusRule.thresholds.p99LatencySeconds | default 1.0 }} s on TeachLink backend" + description: "P99 latency is above the {{ .Values.prometheusRule.thresholds.p99LatencySeconds | default 1.0 }} s SLA threshold for 10 minutes. See runbook for triage steps." 
+ runbook_url: "{{ .Values.prometheusRule.runbookBaseUrl }}/RUNBOOKS.md#highp99latency" + + # ────────────────────────────────────────────────────────────────────────── + # Group: queues — fires on job-queue saturation or dead-letter accumulation + # ────────────────────────────────────────────────────────────────────────── + - name: teachlink.queues + interval: 60s + rules: + + # QueueDepthHigh + # Fires when any Bull/BullMQ queue has more than thresholds.queueDepth + # (default 1 000) waiting jobs for 10 consecutive minutes. Requires the + # "bull_queue_waiting" gauge emitted by the @willsoto/nestjs-prometheus Bull metrics plugin. 
+ - alert: QueueDepthHigh + expr: | + bull_queue_waiting{ + job="{{ .Values.prometheusRule.jobLabel }}" + } > {{ .Values.prometheusRule.thresholds.queueDepth | default 1000 }} + for: 10m + labels: + severity: warning + team: backend + service: teachlink-backend + annotations: + summary: "Job queue depth exceeds {{ .Values.prometheusRule.thresholds.queueDepth | default 1000 }} on TeachLink backend" + description: "A Bull/BullMQ queue has more than {{ .Values.prometheusRule.thresholds.queueDepth | default 1000 }} waiting jobs for 10 minutes. See runbook for triage steps." + runbook_url: "{{ .Values.prometheusRule.runbookBaseUrl }}/RUNBOOKS.md#queuedepthhigh" + + # DLQDepthHigh + # Fires when the failed-job (dead-letter) count exceeds thresholds.dlqDepth + # (default 50) for 5 minutes, indicating jobs are repeatedly failing. Requires the "bull_queue_failed" gauge. + - alert: DLQDepthHigh + expr: | + bull_queue_failed{ + job="{{ .Values.prometheusRule.jobLabel }}" + } > {{ .Values.prometheusRule.thresholds.dlqDepth | default 50 }} + for: 5m + labels: + severity: critical + team: backend + service: teachlink-backend + annotations: + summary: "Dead-letter queue depth exceeds {{ .Values.prometheusRule.thresholds.dlqDepth | default 50 }} on TeachLink backend" + description: "More than {{ .Values.prometheusRule.thresholds.dlqDepth | default 50 }} jobs have moved to the failed DLQ and are not being retried. See runbook for triage steps." 
+ runbook_url: "{{ .Values.prometheusRule.runbookBaseUrl }}/RUNBOOKS.md#dlqdepthhigh" +{{- end }} diff --git a/charts/teachlink-backend/values.yaml b/charts/teachlink-backend/values.yaml new file mode 100644 index 00000000..a352b754 --- /dev/null +++ b/charts/teachlink-backend/values.yaml @@ -0,0 +1,159 @@ +# ───────────────────────────────────────────────────────────────────────────── +# TeachLink Backend – Helm chart default values +# Override any of these in values-staging.yaml / values-production.yaml +# ───────────────────────────────────────────────────────────────────────────── + +# -- Deployment metadata +replicaCount: 2 + +image: + repository: ghcr.io/rinafcode/teachlink-backend + pullPolicy: IfNotPresent + tag: "" # Defaults to Chart.appVersion when empty + +service: + type: ClusterIP + port: 3001 + +# ───────────────────────────────────────────────────────────────────────────── +# Prometheus alerting rules +# ───────────────────────────────────────────────────────────────────────────── +prometheusRule: + # Set to true to deploy the PrometheusRule CR. + # Requires kube-prometheus-stack (or prometheus-operator) to be installed. + enabled: true + + # Must match the ruleSelector labels configured on your Prometheus CR. + # With kube-prometheus-stack defaults this label is sufficient. + additionalLabels: + release: kube-prometheus-stack + + # The Prometheus `job` label value that the backend pods are scraped under. + # Matches the ServiceMonitor / PodMonitor `jobLabel` field. + jobLabel: teachlink-backend + + # Base URL prepended to runbook fragment links embedded in alert annotations. + # Point this at your Git-hosting URL so on-call engineers can click straight + # through from Alertmanager / PagerDuty / Slack to the runbook section. + runbookBaseUrl: "https://github.com/rinafcode/teachLink_backend/blob/main/docs" + + # SLA thresholds – override per environment if needed. 
+ thresholds: + errorRatePct: 0.01 # 1 % – fraction (not percentage) + p99LatencySeconds: 1.0 # 1 s + queueDepth: 1000 # waiting jobs + dlqDepth: 50 # failed jobs in DLQ + +# ───────────────────────────────────────────────────────────────────────────── +# Alertmanager – Slack / webhook routing +# +# This section configures the Alertmanager config secret that kube-prometheus- +# stack creates. Set alertmanager.enabled=true and fill in your Slack +# webhook URL (or supply it via a pre-existing Secret; see secretRef below). +# ───────────────────────────────────────────────────────────────────────────── +alertmanager: + enabled: true + + # ── Slack integration ────────────────────────────────────────────────────── + slack: + # Whether to enable Slack notifications. + enabled: true + + # Webhook URL for your Slack app. + # NEVER commit a real token here. Supply it at deploy-time: + # helm upgrade ... --set alertmanager.slack.webhookUrl="https://hooks.slack.com/..." + # or store it in a Kubernetes Secret and reference it via secretRef below. + webhookUrl: "" + + # Alternatively, reference a pre-existing Secret that contains the key + # `slack-webhook-url`. When set, webhookUrl above is ignored. + secretRef: + name: "" # e.g. teachlink-alertmanager-secrets + key: slack-webhook-url + + # Slack channels per severity level. + channels: + critical: "#teachlink-alerts-critical" + warning: "#teachlink-alerts-warning" + default: "#teachlink-alerts" + + # ── Generic webhook (PagerDuty, OpsGenie, custom endpoint, …) ───────────── + webhook: + enabled: false + url: "" + + # ── Alertmanager routing config (rendered into the kube-prometheus-stack + # alertmanager.config value). + # Adjust `group_wait`, `group_interval`, and `repeat_interval` to taste. + config: + global: + resolve_timeout: 5m + # slack_api_url is set dynamically from alertmanager.slack.webhookUrl + # by the chart helper; do not set it here. 
+ + route: + group_by: ['alertname', 'service', 'namespace'] + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + receiver: slack-default + + routes: + # Critical alerts → dedicated critical channel, short repeat. + - matchers: + - severity = "critical" + receiver: slack-critical + repeat_interval: 1h + + # Warning alerts → warning channel. + - matchers: + - severity = "warning" + receiver: slack-warning + repeat_interval: 4h + + receivers: + - name: slack-default + slack_configs: + - channel: "{{ .Values.alertmanager.slack.channels.default }}" + send_resolved: true + title: '[{{ "{{" }} .Status | toUpper {{ "}}" }}{{ "{{" }} if eq .Status "firing" {{ "}}" }}:{{ "{{" }} .Alerts.Firing | len {{ "}}" }}{{ "{{" }} end {{ "}}" }}] TeachLink Alert' + text: >- + {{ "{{" }} range .Alerts {{ "}}" }} + *Alert:* {{ "{{" }} .Annotations.summary {{ "}}" }} + *Severity:* {{ "{{" }} .Labels.severity {{ "}}" }} + *Description:* {{ "{{" }} .Annotations.description {{ "}}" }} + *Runbook:* {{ "{{" }} .Annotations.runbook_url {{ "}}" }} + {{ "{{" }} end {{ "}}" }} + + - name: slack-critical + slack_configs: + - channel: "{{ .Values.alertmanager.slack.channels.critical }}" + send_resolved: true + title: '🚨 [CRITICAL] TeachLink Alert' + text: >- + {{ "{{" }} range .Alerts {{ "}}" }} + *Alert:* {{ "{{" }} .Annotations.summary {{ "}}" }} + *Description:* {{ "{{" }} .Annotations.description {{ "}}" }} + *Runbook:* {{ "{{" }} .Annotations.runbook_url {{ "}}" }} + {{ "{{" }} end {{ "}}" }} + + - name: slack-warning + slack_configs: + - channel: "{{ .Values.alertmanager.slack.channels.warning }}" + send_resolved: true + title: '⚠️ [WARNING] TeachLink Alert' + text: >- + {{ "{{" }} range .Alerts {{ "}}" }} + *Alert:* {{ "{{" }} .Annotations.summary {{ "}}" }} + *Description:* {{ "{{" }} .Annotations.description {{ "}}" }} + *Runbook:* {{ "{{" }} .Annotations.runbook_url {{ "}}" }} + {{ "{{" }} end {{ "}}" }} + + inhibit_rules: + # Suppress warnings when a critical alert for the same 
service is already + # firing, to reduce noise. + - source_matchers: + - severity = "critical" + target_matchers: + - severity = "warning" + equal: ['service', 'namespace'] diff --git a/docs/API_VERSIONING_POLICY.md b/docs/API_VERSIONING_POLICY.md index 811d8679..32c7d137 100644 --- a/docs/API_VERSIONING_POLICY.md +++ b/docs/API_VERSIONING_POLICY.md @@ -15,6 +15,18 @@ This project uses URL-based API versioning to protect clients from breaking chan - Path-based versioning is the primary version selection mechanism - API clients should prefer explicit `/api/v1/...` paths when available +## Supported version numbers + +The middleware validates version strings against the pattern `/^v\d+$/` (the letter +`v` followed by one or more digits). Any other format is rejected with `400 Bad Request`. + +| Version | Status | Notes | +|---------|---------|-------------------------| +| `v1` | Active | Current stable version | +| `v2` | Planned | Reserved for future use | + +Examples of **invalid** version strings that are rejected: `vABC`, `v1.2`, `../v1`, `123`. + ## Compatibility layer The middleware rewrites legacy API requests from `/api/*` to `/api/v1/*`. diff --git a/docs/RUNBOOKS.md b/docs/RUNBOOKS.md new file mode 100644 index 00000000..a98d4394 --- /dev/null +++ b/docs/RUNBOOKS.md @@ -0,0 +1,338 @@ +# TeachLink Backend – Alert Runbooks + +This document provides on-call guidance for every Prometheus alert defined in +`charts/teachlink-backend/templates/prometheus-rules.yaml`. + +Each section follows the same structure: + +1. **What it means** – plain-English explanation of why the alert fired. +2. **Impact** – which users or features are affected. +3. **Triage steps** – ordered checklist to reproduce, scope, and diagnose. +4. **Remediation** – actions to resolve the incident. +5. **Escalation** – who to page if the steps above don't resolve it within SLA. 
+ +--- + +## Table of Contents + +- [HighErrorRate](#higherrorrate) +- [HighP99Latency](#highp99latency) +- [QueueDepthHigh](#queuedepthhigh) +- [DLQDepthHigh](#dlqdepthhigh) + +--- + +## HighErrorRate + +**Severity:** `critical` +**Alert expression:** +```promql +( + sum(rate(http_requests_total{job="teachlink-backend", status_code=~"5.."}[5m])) + / + sum(rate(http_requests_total{job="teachlink-backend"}[5m])) +) > 0.01 +``` +**Fires when:** More than 1 % of all HTTP requests have returned a 5xx response +code over a rolling 5-minute window, sustained for 5 minutes. + +### What it means + +A meaningful fraction of API calls are failing server-side. This can be caused +by unhandled exceptions, database timeouts, downstream service failures, +out-of-memory crashes, or a bad deployment. + +### Impact + +- End-users see errors when loading courses, submitting assignments, or making + payments. +- API consumers (mobile apps, third-party integrations) receive 5xx responses. + +### Triage steps + +1. **Confirm the alert is genuine** – open Grafana → TeachLink API dashboard → + "Error Rate" panel. Verify the rate exceeds 1 % and is not a transient blip. + +2. **Identify the failing endpoints:** + ```promql + topk(10, + sum by (path, method, status_code) (rate(http_requests_total{job="teachlink-backend", status_code=~"5.."}[5m])) + ) + ``` + +3. **Check pod logs for stack traces:** + ```bash + kubectl logs -n <namespace> -l app.kubernetes.io/name=teachlink-backend \ + --since=10m | grep -E "(ERROR|Exception|5[0-9][0-9])" + ``` + +4. **Check recent deployments:** + ```bash + kubectl rollout history deployment/teachlink-backend -n <namespace> + ``` + If a deployment was rolled out in the last 30 minutes, rollback is the + fastest mitigation (see Remediation below). + +5. **Check downstream dependencies** – database connectivity, Redis, external + payment APIs. Look for connection-refused or timeout errors in the logs. + +6. 
**Check pod restarts / OOMKilled events:** + ```bash + kubectl get pods -n <namespace> -l app.kubernetes.io/name=teachlink-backend + kubectl describe pod <pod-name> -n <namespace> | grep -A5 "Last State" + ``` + +### Remediation + +| Cause | Action | +|---|---| +| Bad deployment | `kubectl rollout undo deployment/teachlink-backend -n <namespace>` | +| Database down | Restore DB or failover to replica; check RDS/PG logs | +| Pod OOMKilled | Increase `resources.limits.memory` in values; redeploy | +| Downstream API down | Enable circuit-breaker flag or return cached fallback | +| Unhandled exception | Hot-fix the code path identified in logs, redeploy | + +### Escalation + +If error rate does not drop below 1 % within **15 minutes** of initial triage: +- Page the on-call backend engineer via PagerDuty. +- Notify `#teachlink-incidents` Slack channel with a brief status update. + +--- + +## HighP99Latency + +**Severity:** `warning` +**Alert expression:** +```promql +histogram_quantile( + 0.99, + sum by (le) ( + rate(http_request_duration_seconds_bucket{job="teachlink-backend"}[5m]) + ) +) > 1.0 +``` +**Fires when:** The 99th-percentile request latency exceeds 1 second for 10 +consecutive minutes. + +### What it means + +At least 1 % of requests are taking longer than 1 second. Common culprits are +slow database queries, N+1 query patterns, lock contention, CPU throttling, or +memory pressure causing GC pauses. + +### Impact + +- Users experience sluggish page loads and time-outs on slow connections. +- Background jobs that call the API may queue up, eventually triggering + `QueueDepthHigh`. + +### Triage steps + +1. **Identify the slow endpoints:** + ```promql + topk(10, + histogram_quantile(0.99, + sum by (path, method, le) (rate(http_request_duration_seconds_bucket{job="teachlink-backend"}[5m])) + ) + ) + ``` + +2. **Check database slow-query logs:** + - RDS Performance Insights → filter by `wait_event_type = Lock` or + `wait_event_type = IO`. 
+ - Look for queries taking > 500 ms. + +3. 
**Check CPU and memory utilisation:** + ```promql + rate(process_cpu_seconds_total{job="teachlink-backend"}[5m]) * 100 + process_resident_memory_bytes{job="teachlink-backend"} / 1024 / 1024 + ``` + +4. **Check for pod CPU throttling:** + ```bash + kubectl top pods -n <namespace> -l app.kubernetes.io/name=teachlink-backend + ``` + If pods are at or near CPU limit, throttling is the likely cause. + +5. **Enable query explain-analyse** on the suspected slow query in a staging + environment to confirm. + +### Remediation + +| Cause | Action | +|---|---| +| Slow DB query | Add index; rewrite query; cache result with Redis | +| CPU throttling | Increase `resources.limits.cpu`; add HPA scaling rule | +| N+1 queries | Apply DataLoader / eager-load relations in ORM | +| Memory pressure / GC | Increase memory limit; profile heap with `clinic.js` | +| External API slow | Add timeouts; cache responses; use background job | + +### Escalation + +If P99 latency remains above 1 s after **30 minutes**: +- Page backend engineer. +- If DB is implicated, page the DBA on-call. + +--- + +## QueueDepthHigh + +**Severity:** `warning` +**Alert expression:** +```promql +bull_queue_waiting{job="teachlink-backend"} > 1000 +``` +**Fires when:** Any Bull/BullMQ queue has more than 1 000 jobs waiting to be +processed for 10 consecutive minutes. + +### What it means + +Workers are not consuming jobs fast enough. This can mean workers have crashed, +processing is too slow, or a traffic spike has produced an unusual burst of jobs. + +### Impact + +- Delayed delivery of emails, push notifications, certificate generation, or + other async tasks. +- If the queue continues growing, Redis memory pressure will follow. + +### Triage steps + +1. **Identify which queue is backed up:** + ```promql + topk(5, max by (queue) (bull_queue_waiting{job="teachlink-backend"})) + ``` + +2. **Check the rate of job consumption vs. 
arrival:** + ```promql + sum by (queue) (rate(bull_queue_completed{job="teachlink-backend"}[5m])) + sum by (queue) (rate(bull_queue_added{job="teachlink-backend"}[5m])) + ``` + +3. **Check worker pod health:** + ```bash + kubectl get pods -n <namespace> -l app.kubernetes.io/name=teachlink-backend + kubectl logs <worker-pod> -n <namespace> --since=10m | grep -i "worker\|queue\|bull" + ``` + +4. **Check Redis health** (Bull backs onto Redis): + ```bash + kubectl exec -it <redis-pod> -n <namespace> -- redis-cli INFO memory + ``` + +5. **Check for a sudden spike in job arrivals** (e.g., a scheduled batch job or + user-triggered bulk operation). + +### Remediation + +| Cause | Action | +|---|---| +| Workers crashed | `kubectl rollout restart deployment/teachlink-backend -n <namespace>` | +| Too few workers | Scale out: `kubectl scale deployment/teachlink-backend -n <namespace> --replicas=N` | +| Slow job processing | Profile the job handler; optimise DB calls or external I/O | +| Redis OOM | Increase Redis memory limit or purge stale keys | +| Burst traffic | Enable rate-limiting at API layer to reduce job creation rate | + +### Escalation + +If queue depth does not decrease within **20 minutes**: +- Page backend engineer. +- If Redis is implicated, page infrastructure on-call. + +--- + +## DLQDepthHigh + +**Severity:** `critical` +**Alert expression:** +```promql +bull_queue_failed{job="teachlink-backend"} > 50 +``` +**Fires when:** More than 50 jobs have moved to the failed (dead-letter) state +within a queue, sustained for 5 minutes. + +### What it means + +Jobs are failing repeatedly and exhausting their retry budget. No further +automatic retries will occur for these jobs — the work is effectively lost until +an engineer intervenes. + +### Impact + +- Permanent failure of async tasks: emails unsent, certificates not issued, + webhooks not delivered, payments not reconciled. +- Data consistency issues if jobs were part of a saga or transactional workflow. 
+ +### Triage steps + +1. 
**Identify the failing queue and error:** + ```promql + topk(5, max by (queue) (bull_queue_failed{job="teachlink-backend"})) + ``` + +2. **Inspect failed job payloads via Bull Board** (if deployed) at + `https://<host>/admin/queues`, or directly via Redis: + ```bash + kubectl exec -it <redis-pod> -n <namespace> -- \ + redis-cli ZRANGE bull:<queue>:failed 0 4 + ``` + +3. **Read the failure reason from job metadata** — look for + `"failedReason"` in the JSON payload. + +4. **Check application logs** for the worker around the time failures spiked: + ```bash + kubectl logs -n <namespace> -l app.kubernetes.io/name=teachlink-backend \ + --since=30m | grep -i "failed\|error\|unhandled" + ``` + +5. **Reproduce** the failing job in a staging environment using the same payload + to confirm the fix before retrying production jobs. + +### Remediation + +| Cause | Action | +|---|---| +| Code bug in job handler | Fix bug, redeploy, then retry jobs from Bull Board | +| External dependency down | Wait for dependency to recover; then bulk-retry jobs | +| Invalid job payload | Patch payload schema validation; discard or correct jobs | +| Credentials expired | Rotate the affected secret; restart the worker | + +**Bulk retry via Bull Board:** +Navigate to `Admin → Queues → <queue-name> → Failed` and click +**Retry All Failed**. + +**Bulk retry via Redis CLI** (last resort): +```bash +# Move all failed jobs back to the waiting list +kubectl exec -it <redis-pod> -n <namespace> -- \ + redis-cli EVAL " + local failed = redis.call('zrange', KEYS[1], 0, -1) + for _, v in ipairs(failed) do + redis.call('lpush', KEYS[2], v) + end + redis.call('del', KEYS[1]) + return #failed + " 2 bull:<queue>:failed bull:<queue>:wait +``` + +### Escalation + +If the DLQ continues growing after the fix is deployed: +- Page backend engineer immediately — data loss may be occurring. +- Open a P1 incident and notify `#teachlink-incidents`. +- Document affected job IDs for potential manual reprocessing. 
+ +--- + +## Updating These Runbooks + +When a new alert is added to `prometheus-rules.yaml`: + +1. 
Add a matching `##` section to this file following the template above. +2. Set the `runbook_url` annotation in the alert to point at the new section: + ``` + runbook_url: "https://github.com/rinafcode/teachLink_backend/blob/main/docs/RUNBOOKS.md#" + ``` +3. Open a PR — runbook changes should be reviewed by the on-call rotation lead. diff --git a/src/app/(auth)/layout.tsx b/src/app/(auth)/layout.tsx new file mode 100644 index 00000000..51840533 --- /dev/null +++ b/src/app/(auth)/layout.tsx @@ -0,0 +1,23 @@ +import type { Metadata } from 'next'; + +export const metadata: Metadata = { + title: 'TeachLink - Sign In or Create an Account', + description: + 'Access your TeachLink account to continue learning offline. Sign in, sign up, or verify your email.', + openGraph: { + title: 'TeachLink - Sign In or Create an Account', + description: 'Access your TeachLink account to continue learning.', + type: 'website', + siteName: 'TeachLink', + }, + twitter: { + card: 'summary', + site: '@teachlink', + title: 'TeachLink - Sign In or Create an Account', + description: 'Access your TeachLink account to continue learning.', + }, +}; + +export default function AuthLayout({ children }: { children: React.ReactNode }) { + return <>{children}; +} diff --git a/src/app/__tests__/twitter-cards.test.ts b/src/app/__tests__/twitter-cards.test.ts new file mode 100644 index 00000000..277e40fb --- /dev/null +++ b/src/app/__tests__/twitter-cards.test.ts @@ -0,0 +1,76 @@ +import { describe, it, expect } from 'vitest'; +import { metadata as rootMetadata } from '@/app/layout'; +import { metadata as authMetadata } from '@/app/(auth)/layout'; +import { metadata as dashboardMetadata } from '@/app/dashboard/layout'; +import { metadata as profileMetadata } from '@/app/profile/layout'; + +describe('Twitter Cards metadata', () => { + describe('Root layout', () => { + it('exports a twitter card field', () => { + expect(rootMetadata.twitter).toBeDefined(); + }); + + it('uses summary_large_image card type', 
() => { + expect(rootMetadata.twitter?.card).toBe('summary_large_image'); + }); + + it('includes a twitter title', () => { + expect(rootMetadata.twitter?.title).toBeTruthy(); + }); + + it('includes a twitter description', () => { + expect(rootMetadata.twitter?.description).toBeTruthy(); + }); + + it('includes twitter site handle', () => { + expect(rootMetadata.twitter?.site).toBe('@teachlink'); + }); + + it('exports openGraph metadata', () => { + expect(rootMetadata.openGraph).toBeDefined(); + expect(rootMetadata.openGraph?.siteName).toBe('TeachLink'); + }); + }); + + describe('Auth layout', () => { + it('exports a twitter card field', () => { + expect(authMetadata.twitter).toBeDefined(); + }); + + it('uses summary card type', () => { + expect(authMetadata.twitter?.card).toBe('summary'); + }); + + it('includes a twitter title', () => { + expect(authMetadata.twitter?.title).toBeTruthy(); + }); + + it('includes a twitter description', () => { + expect(authMetadata.twitter?.description).toBeTruthy(); + }); + + it('includes twitter site handle', () => { + expect(authMetadata.twitter?.site).toBe('@teachlink'); + }); + }); + + describe('Dashboard layout', () => { + it('exports a twitter card field', () => { + expect(dashboardMetadata.twitter).toBeDefined(); + }); + + it('uses summary card type', () => { + expect(dashboardMetadata.twitter?.card).toBe('summary'); + }); + }); + + describe('Profile layout', () => { + it('exports a twitter card field', () => { + expect(profileMetadata.twitter).toBeDefined(); + }); + + it('uses summary card type', () => { + expect(profileMetadata.twitter?.card).toBe('summary'); + }); + }); +}); diff --git a/src/app/courses/[courseId]/page.tsx b/src/app/courses/[courseId]/page.tsx index 7545d78d..9b15f34a 100644 --- a/src/app/courses/[courseId]/page.tsx +++ b/src/app/courses/[courseId]/page.tsx @@ -12,6 +12,18 @@ export async function generateMetadata({ params }: CoursePageProps): Promise { 
expect(response.headers.get(API_VERSION_HEADER)).toBe('v1'); expect(response.headers.get(API_DEPRECATION_HEADER)).toBeNull(); }); -}); + + describe('valid version strings — should route correctly', () => { + it('accepts v1 and sets X-Api-Version header', () => { + const request = createMockRequest('/api/v1/posts'); + const response = middleware(request) as NextResponse; + expect(response.status).not.toBe(400); + expect(response.headers.get(API_VERSION_HEADER)).toBe('v1'); + }); + + it('accepts v2 and sets X-Api-Version header', () => { + const request = createMockRequest('/api/v2/posts'); + const response = middleware(request) as NextResponse; + expect(response.status).not.toBe(400); + expect(response.headers.get(API_VERSION_HEADER)).toBe('v2'); + }); + + it('accepts large version numbers like v10', () => { + const request = createMockRequest('/api/v10/posts'); + const response = middleware(request) as NextResponse; + expect(response.status).not.toBe(400); + expect(response.headers.get(API_VERSION_HEADER)).toBe('v10'); + }); + }); + + describe('malformed version strings — should return 400', () => { + it('rejects alphabetic version string (vABC)', () => { + const request = createMockRequest('/api/vABC/posts'); + const response = middleware(request) as NextResponse; + expect(response.status).toBe(400); + }); + + it('rejects path-traversal characters (/../)', () => { + const request = createMockRequest('/api/../v1/posts'); + const response = middleware(request) as NextResponse; + expect(response.status).toBe(400); + }); + + it('rejects empty version segment (/api/v/)', () => { + const request = createMockRequest('/api/v/posts'); + const response = middleware(request) as NextResponse; + expect(response.status).toBe(400); + }); + + it('rejects version with special characters (v1.2)', () => { + const request = createMockRequest('/api/v1.2/posts'); + const response = middleware(request) as NextResponse; + expect(response.status).toBe(400); + }); + + it('rejects version 
with injection attempt (v1;drop)', () => { + const request = createMockRequest('/api/v1;drop/posts'); + const response = middleware(request) as NextResponse; + expect(response.status).toBe(400); + }); + + it('rejects purely numeric version without v prefix (123)', () => { + const request = createMockRequest('/api/123/posts'); + const response = middleware(request) as NextResponse; + expect(response.status).toBe(400); + }); + }); +}); \ No newline at end of file