rinafcode · KingDavid9999 · Jun 26, 2026 · Jun 27, 2026 · Jun 27, 2026 · Jun 27, 2026
diff --git a/charts/teachlink-backend/Chart.yaml b/charts/teachlink-backend/Chart.yaml
@@ -0,0 +1,17 @@
+apiVersion: v2  # Helm 3 chart format
+name: teachlink-backend  # base name used by the name/fullname helpers unless nameOverride is set
+description: Helm chart for the TeachLink backend API service
+type: application
+version: 0.1.0  # chart version; joined with the chart name into the helm.sh/chart label
+appVersion: "1.0.0"  # emitted as app.kubernetes.io/version by the labels helper
+
+keywords:
+  - teachlink
+  - backend
+  - nestjs
+
+maintainers:
+  - name: rinafcode
+    url: https://github.com/rinafcode
+
+dependencies: []  # no subcharts; values.yaml expects the Prometheus operator to be installed separately
diff --git a/charts/teachlink-backend/templates/_helpers.tpl b/charts/teachlink-backend/templates/_helpers.tpl
@@ -0,0 +1,49 @@
+{{/*
+Short chart name: nameOverride when set, otherwise the chart name (63 chars max, no trailing "-").
+*/}}
+{{- define "teachlink-backend.name" -}}
+{{- .Values.nameOverride | default .Chart.Name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Fully qualified name: fullnameOverride, else the release name (joined with the chart name unless it already contains it).
+*/}}
+{{- define "teachlink-backend.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $base := .Values.nameOverride | default .Chart.Name }}
+{{- $full := printf "%s-%s" .Release.Name $base }}
+{{- if contains $base .Release.Name }}
+{{- $full = .Release.Name }}
+{{- end }}
+{{- $full | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+
+{{/*
+Common label set: chart id, selector labels, app version (when the chart declares one), managing service.
+*/}}
+{{- define "teachlink-backend.labels" -}}
+helm.sh/chart: {{ include "teachlink-backend.chart" . }}
+{{ include "teachlink-backend.selectorLabels" . }}
+{{- with .Chart.AppVersion }}
+app.kubernetes.io/version: {{ quote . }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+{{/*
+Selector labels: chart short name plus release instance; also embedded in the common labels helper.
+*/}}
+{{- define "teachlink-backend.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "teachlink-backend.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Chart label value: "<chart name>-<chart version>" with "+" mapped to "_", capped at 63 chars, no trailing "-".
+*/}}
+{{- define "teachlink-backend.chart" -}}
+{{- list .Chart.Name .Chart.Version | join "-" | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
diff --git a/charts/teachlink-backend/templates/prometheus-rules.yaml b/charts/teachlink-backend/templates/prometheus-rules.yaml
@@ -0,0 +1,117 @@
+{{- if .Values.prometheusRule.enabled }}
+{{- /* Rendered only when prometheusRule.enabled is true; requires the prometheus-operator PrometheusRule CRD. */ -}}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ include "teachlink-backend.fullname" . }}-alerts
+  namespace: {{ .Release.Namespace }}
+  labels:
+    # Common chart labels come from the shared helper so this resource cannot drift from _helpers.tpl.
+    {{- include "teachlink-backend.labels" . | nindent 4 }}
+    # Extra labels supplied through values (e.g. the label your Prometheus ruleSelector matches on).
+    {{- with .Values.prometheusRule.additionalLabels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+spec:
+  groups:
+    # ──────────────────────────────────────────────────────────────────────────
+    # Group: SLA — fires when the service breaches contractual error/latency SLAs
+    # ──────────────────────────────────────────────────────────────────────────
+    - name: teachlink.sla
+      interval: 30s
+      rules:
+
+        # HighErrorRate
+        # Fires when the 5xx share of all responses (5-minute rate) stays above
+        # thresholds.errorRatePct (a fraction, default 0.01) for 5 minutes. Requires the
+        # "http_requests_total" counter with a "status_code" label (NestJS Prometheus interceptor).
+        - alert: HighErrorRate
+          expr: |
+            (
+              sum(rate(http_requests_total{
+                job="{{ .Values.prometheusRule.jobLabel }}",
+                status_code=~"5.."
+              }[5m]))
+              /
+              sum(rate(http_requests_total{
+                job="{{ .Values.prometheusRule.jobLabel }}"
+              }[5m]))
+            ) > {{ .Values.prometheusRule.thresholds.errorRatePct | default 0.01 }}
+          for: 5m
+          labels:
+            severity: critical
+            team: backend
+            service: teachlink-backend
+          annotations:
+            summary: "High 5xx error rate on TeachLink backend"
+            description: "The 5xx error ratio has exceeded the configured SLA threshold of {{ .Values.prometheusRule.thresholds.errorRatePct | default 0.01 }} (fraction of all requests). See runbook for triage steps."
+            runbook_url: "{{ .Values.prometheusRule.runbookBaseUrl }}/RUNBOOKS.md#higherrorrate"
+
+        # HighP99Latency
+        # Fires when the 99th-percentile request latency stays above
+        # thresholds.p99LatencySeconds (default 1 s) for 10 consecutive minutes. Requires the
+        # "http_request_duration_seconds" histogram emitted by the NestJS metrics module.
+        - alert: HighP99Latency
+          expr: |
+            histogram_quantile(
+              0.99,
+              sum by (le) (
+                rate(http_request_duration_seconds_bucket{
+                  job="{{ .Values.prometheusRule.jobLabel }}"
+                }[5m])
+              )
+            ) > {{ .Values.prometheusRule.thresholds.p99LatencySeconds | default 1.0 }}
+          for: 10m
+          labels:
+            severity: warning
+            team: backend
+            service: teachlink-backend
+          annotations:
+            summary: "P99 request latency exceeds {{ .Values.prometheusRule.thresholds.p99LatencySeconds | default 1.0 }} s on TeachLink backend"
+            description: "P99 latency is above the {{ .Values.prometheusRule.thresholds.p99LatencySeconds | default 1.0 }} s SLA threshold for 10 minutes. See runbook for triage steps."
+            runbook_url: "{{ .Values.prometheusRule.runbookBaseUrl }}/RUNBOOKS.md#highp99latency"
+
+    # ──────────────────────────────────────────────────────────────────────────
+    # Group: queues — fires on job-queue saturation or dead-letter accumulation
+    # ──────────────────────────────────────────────────────────────────────────
+    - name: teachlink.queues
+      interval: 60s
+      rules:
+
+        # QueueDepthHigh
+        # Fires when any Bull/BullMQ queue holds more waiting jobs than
+        # thresholds.queueDepth (default 1000) for 10 consecutive minutes. Requires the
+        # "bull_queue_waiting" gauge (NOTE(review): confirm which exporter emits it).
+        - alert: QueueDepthHigh
+          expr: |
+            bull_queue_waiting{
+              job="{{ .Values.prometheusRule.jobLabel }}"
+            } > {{ .Values.prometheusRule.thresholds.queueDepth | default 1000 }}
+          for: 10m
+          labels:
+            severity: warning
+            team: backend
+            service: teachlink-backend
+          annotations:
+            summary: "Job queue depth exceeds {{ .Values.prometheusRule.thresholds.queueDepth | default 1000 }} on TeachLink backend"
+            description: "A Bull/BullMQ queue has more than {{ .Values.prometheusRule.thresholds.queueDepth | default 1000 }} waiting jobs for 10 minutes. See runbook for triage steps."
+            runbook_url: "{{ .Values.prometheusRule.runbookBaseUrl }}/RUNBOOKS.md#queuedepthhigh"
+
+        # DLQDepthHigh
+        # Fires when the failed-job (dead-letter) count stays above thresholds.dlqDepth
+        # (default 50) for 5 minutes. Requires the "bull_queue_failed" gauge.
+        - alert: DLQDepthHigh
+          expr: |
+            bull_queue_failed{
+              job="{{ .Values.prometheusRule.jobLabel }}"
+            } > {{ .Values.prometheusRule.thresholds.dlqDepth | default 50 }}
+          for: 5m
+          labels:
+            severity: critical
+            team: backend
+            service: teachlink-backend
+          annotations:
+            summary: "Dead-letter queue depth exceeds {{ .Values.prometheusRule.thresholds.dlqDepth | default 50 }} on TeachLink backend"
+            description: "More than {{ .Values.prometheusRule.thresholds.dlqDepth | default 50 }} jobs have moved to the failed DLQ and are not being retried. See runbook for triage steps."
+            runbook_url: "{{ .Values.prometheusRule.runbookBaseUrl }}/RUNBOOKS.md#dlqdepthhigh"
+{{- end }}
diff --git a/charts/teachlink-backend/values.yaml b/charts/teachlink-backend/values.yaml
@@ -0,0 +1,159 @@
+# ─────────────────────────────────────────────────────────────────────────────
+# TeachLink Backend – Helm chart default values
+# Override any of these in values-staging.yaml / values-production.yaml
+# ─────────────────────────────────────────────────────────────────────────────
+
+# -- Workload settings: replica count, container image and Service exposure
+replicaCount: 2  # NOTE(review): no workload template is visible in this change — confirm where this is consumed
+
+image:
+  repository: ghcr.io/rinafcode/teachlink-backend
+  pullPolicy: IfNotPresent
+  tag: ""          # Defaults to Chart.appVersion when empty
+
+service:
+  type: ClusterIP
+  port: 3001  # presumably the port the backend listens on — TODO confirm against the Service/Deployment templates
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Prometheus alerting rules
+# ─────────────────────────────────────────────────────────────────────────────
+prometheusRule:
+  # Gates templates/prometheus-rules.yaml; true (the default) deploys the PrometheusRule CR.
+  # Requires kube-prometheus-stack (or prometheus-operator) to be installed.
+  enabled: true
+
+  # Must match the ruleSelector labels configured on your Prometheus CR.
+  # With kube-prometheus-stack defaults this label is sufficient.
+  additionalLabels:
+    release: kube-prometheus-stack  # NOTE(review): presumably must equal your kube-prometheus-stack release name — confirm
+
+  # The Prometheus `job` label value that the backend pods are scraped under.
+  # Inserted verbatim into the job="..." matcher of every alert expression.
+  jobLabel: teachlink-backend
+
+  # Base URL prepended to runbook fragment links embedded in alert annotations.
+  # Point this at your Git-hosting URL so on-call engineers can click straight
+  # through from Alertmanager / PagerDuty / Slack to the runbook section.
+  runbookBaseUrl: "https://github.com/rinafcode/teachLink_backend/blob/main/docs"
+
+  # SLA thresholds – override per environment. The rules apply `default`, so 0 or an unset key falls back to the values below.
+  thresholds:
+    errorRatePct: 0.01        # 1 %   – fraction (not percentage), compared against the 5xx/total request ratio
+    p99LatencySeconds: 1.0    # 1 s
+    queueDepth: 1000          # waiting jobs
+    dlqDepth: 50              # failed jobs in DLQ
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Alertmanager – Slack / webhook routing
+#
+# This section configures the Alertmanager config secret that kube-prometheus-
+# stack creates.  alertmanager.enabled defaults to true; fill in your Slack
+# webhook URL (or supply it via a pre-existing Secret; see secretRef below).
+# ─────────────────────────────────────────────────────────────────────────────
+alertmanager:
+  enabled: true  # master switch for this section
+
+  # ── Slack integration ──────────────────────────────────────────────────────
+  slack:
+    # Whether to enable Slack notifications.
+    enabled: true  # NOTE(review): on by default while webhookUrl is empty — confirm the consuming template tolerates that
+
+    # Webhook URL for your Slack app.
+    # NEVER commit a real token here.  Supply it at deploy-time:
+    #   helm upgrade ... --set alertmanager.slack.webhookUrl="https://hooks.slack.com/..."
+    # or store it in a Kubernetes Secret and reference it via secretRef below.
+    webhookUrl: ""
+
+    # Alternatively, reference a pre-existing Secret that contains the key
+    # `slack-webhook-url`.  When set, webhookUrl above is ignored.
+    secretRef:
+      name: ""       # e.g. teachlink-alertmanager-secrets
+      key: slack-webhook-url
+
+    # Slack channels per severity level.
+    channels:
+      critical: "#teachlink-alerts-critical"  # referenced by the slack-critical receiver
+      warning: "#teachlink-alerts-warning"    # referenced by the slack-warning receiver
+      default: "#teachlink-alerts"            # referenced by the slack-default receiver
+
+  # ── Generic webhook (PagerDuty, OpsGenie, custom endpoint, …) ─────────────
+  webhook:
+    enabled: false
+    url: ""
+
+  # ── Alertmanager routing config (rendered into the kube-prometheus-stack
+  #    alertmanager.config value).
+  #    Adjust `group_wait`, `group_interval`, and `repeat_interval` to taste.
+  config:
+    global:
+      resolve_timeout: 5m
+      # slack_api_url is set dynamically from alertmanager.slack.webhookUrl
+      # by the chart helper; do not set it here.
+
+    route:
+      group_by: ['alertname', 'service', 'namespace']  # alerts sharing these label values are grouped together
+      group_wait: 30s
+      group_interval: 5m
+      repeat_interval: 4h
+      receiver: slack-default  # fallback receiver when neither child route below matches
+
+      routes:
+        # Critical alerts → dedicated critical channel, short repeat.
+        - matchers:
+            - severity = "critical"
+          receiver: slack-critical
+          repeat_interval: 1h
+
+        # Warning alerts → warning channel.
+        - matchers:
+            - severity = "warning"
+          receiver: slack-warning
+          repeat_interval: 4h  # same as the top-level repeat_interval, restated for clarity
+
+    receivers:  # one Slack receiver per route; the {{ "{{" }} … {{ "}}" }} escapes keep Alertmanager's own template syntax intact
+      - name: slack-default
+        slack_configs:
+          - channel: "{{ .Values.alertmanager.slack.channels.default }}"
+            send_resolved: true
+            title: '[{{ "{{" }} .Status | toUpper {{ "}}" }}{{ "{{" }} if eq .Status "firing" {{ "}}" }}:{{ "{{" }} .Alerts.Firing | len {{ "}}" }}{{ "{{" }} end {{ "}}" }}] TeachLink Alert'
+            text: |-  # literal block: a folded (>-) scalar would collapse every field onto one line
+              {{ "{{" }} range .Alerts {{ "}}" }}
+              *Alert:* {{ "{{" }} .Annotations.summary {{ "}}" }}
+              *Severity:* {{ "{{" }} .Labels.severity {{ "}}" }}
+              *Description:* {{ "{{" }} .Annotations.description {{ "}}" }}
+              *Runbook:* {{ "{{" }} .Annotations.runbook_url {{ "}}" }}
+              {{ "{{" }} end {{ "}}" }}
+
+      - name: slack-critical
+        slack_configs:
+          - channel: "{{ .Values.alertmanager.slack.channels.critical }}"
+            send_resolved: true
+            title: '🚨 [CRITICAL] TeachLink Alert'
+            text: |-  # literal block: a folded (>-) scalar would collapse every field onto one line
+              {{ "{{" }} range .Alerts {{ "}}" }}
+              *Alert:* {{ "{{" }} .Annotations.summary {{ "}}" }}
+              *Description:* {{ "{{" }} .Annotations.description {{ "}}" }}
+              *Runbook:* {{ "{{" }} .Annotations.runbook_url {{ "}}" }}
+              {{ "{{" }} end {{ "}}" }}
+
+      - name: slack-warning
+        slack_configs:
+          - channel: "{{ .Values.alertmanager.slack.channels.warning }}"
+            send_resolved: true
+            title: '⚠️ [WARNING] TeachLink Alert'
+            text: |-  # literal block: a folded (>-) scalar would collapse every field onto one line
+              {{ "{{" }} range .Alerts {{ "}}" }}
+              *Alert:* {{ "{{" }} .Annotations.summary {{ "}}" }}
+              *Description:* {{ "{{" }} .Annotations.description {{ "}}" }}
+              *Runbook:* {{ "{{" }} .Annotations.runbook_url {{ "}}" }}
+              {{ "{{" }} end {{ "}}" }}
+
+    inhibit_rules:
+      # Suppress warnings when a critical alert for the same service is already
+      # firing, to reduce noise.
+      - source_matchers:
+          - severity = "critical"   # alerts that do the muting
+        target_matchers:
+          - severity = "warning"    # alerts that get muted
+        equal: ['service', 'namespace']   # source and target must carry equal values for these labels
diff --git a/docs/API_VERSIONING_POLICY.md b/docs/API_VERSIONING_POLICY.md
@@ -15,6 +15,18 @@ This project uses URL-based API versioning to protect clients from breaking chan
 - Path-based versioning is the primary version selection mechanism
 - API clients should prefer explicit `/api/v1/...` paths when available
 
+## Supported version numbers
+
+The middleware validates version strings against the pattern `/^v\d+$/` (the letter
+`v` followed by one or more digits). Any other format is rejected with `400 Bad Request`.
+
+| Version | Status  | Notes                   |
+|---------|---------|-------------------------|
+| `v1`    | Active  | Current stable version  |
+| `v2`    | Planned | Reserved for future use |
+
+Examples of **invalid** version strings that are rejected: `vABC`, `v1.2`, `../v1`, `123`.
+
 ## Compatibility layer
 
 The middleware rewrites legacy API requests from `/api/*` to `/api/v1/*`.