Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions charts/teachlink-backend/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Chart metadata for the TeachLink backend Helm chart.
apiVersion: v2
name: teachlink-backend
description: Helm chart for the TeachLink backend API service
type: application
# Version of the chart itself.
version: 0.1.0
# Version of the packaged application; quoted so it is always read as a string.
appVersion: "1.0.0"

keywords:
  - teachlink
  - backend
  - nestjs

maintainers:
  # `url` is a field of the maintainer entry, so it is nested under the same
  # list item as `name` rather than sitting at the top level of the chart.
  - name: rinafcode
    url: https://github.com/rinafcode

# No sub-charts.
dependencies: []
49 changes: 49 additions & 0 deletions charts/teachlink-backend/templates/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{{/*
Chart name.
Resolves to .Values.nameOverride when that is set and to .Chart.Name otherwise.
The result is cut to 63 characters and a trailing "-" is removed.
*/}}
{{- define "teachlink-backend.name" -}}
{{- .Values.nameOverride | default .Chart.Name | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Fully qualified app name.
  1. .Values.fullnameOverride wins when it is set.
  2. Otherwise, if the release name already contains the chart name (or
     .Values.nameOverride), the release name is used on its own.
  3. Otherwise the result is "<release name>-<chart name>".
Every variant is cut to 63 characters and a trailing "-" is removed.
*/}}
{{- define "teachlink-backend.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $chartName := .Values.nameOverride | default .Chart.Name }}
{{- if not (contains $chartName .Release.Name) }}
{{- printf "%s-%s" .Release.Name $chartName | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}

{{/*
Common label set: the helm.sh/chart label, the selector labels, the app version
(emitted only when .Chart.AppVersion is non-empty) and the managed-by label
taken from .Release.Service.
*/}}
{{- define "teachlink-backend.labels" -}}
helm.sh/chart: {{ include "teachlink-backend.chart" . }}
{{ include "teachlink-backend.selectorLabels" . }}
{{- with .Chart.AppVersion }}
app.kubernetes.io/version: {{ quote . }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}

{{/*
Selector labels.
Emits app.kubernetes.io/name (from the "teachlink-backend.name" helper) and
app.kubernetes.io/instance (the release name). Also included verbatim by the
"teachlink-backend.labels" helper.
*/}}
{{- define "teachlink-backend.selectorLabels" -}}
app.kubernetes.io/name: {{ include "teachlink-backend.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

{{/*
Value of the helm.sh/chart label: "<chart name>-<chart version>" with every "+"
replaced by "_", cut to 63 characters, trailing "-" removed.
*/}}
{{- define "teachlink-backend.chart" -}}
{{- $nameAndVersion := printf "%s-%s" .Chart.Name .Chart.Version }}
{{- $nameAndVersion | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
117 changes: 117 additions & 0 deletions charts/teachlink-backend/templates/prometheus-rules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
{{- if .Values.prometheusRule.enabled }}
{{- $rule := .Values.prometheusRule }}
{{- $job := $rule.jobLabel }}
{{- $runbooks := $rule.runbookBaseUrl }}
{{- $errorRatio := $rule.thresholds.errorRatePct | default 0.01 }}
{{- $p99Seconds := $rule.thresholds.p99LatencySeconds | default 1.0 }}
{{- $queueDepth := $rule.thresholds.queueDepth | default 1000 }}
{{- $dlqDepth := $rule.thresholds.dlqDepth | default 50 }}
# PrometheusRule with the TeachLink backend SLA and queue alerts.
# Rendered only when prometheusRule.enabled is true. Thresholds, the scrape job
# name and the runbook base URL all come from the prometheusRule values; the
# same threshold values are used in the expressions and in the annotation text
# so the two cannot drift apart.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "teachlink-backend.fullname" . }}-alerts
  namespace: {{ .Release.Namespace }}
  labels:
    # Shared chart labels first, then any extra labels from values.
    {{- include "teachlink-backend.labels" . | nindent 4 }}
    {{- with $rule.additionalLabels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  groups:
    # ──────────────────────────────────────────────────────────────────────────
    # Group: SLA — fires when the service breaches contractual error/latency SLAs
    # ──────────────────────────────────────────────────────────────────────────
    - name: teachlink.sla
      interval: 30s
      rules:

        # HighErrorRate
        # Fires when the proportion of 5xx responses over a 5-minute window
        # exceeds thresholds.errorRatePct (a fraction; default 0.01 = 1 %).
        # Requires the "http_requests_total" counter with a "status_code"
        # label emitted by the NestJS Prometheus interceptor.
        - alert: HighErrorRate
          expr: |
            (
              sum(rate(http_requests_total{
                job="{{ $job }}",
                status_code=~"5.."
              }[5m]))
              /
              sum(rate(http_requests_total{
                job="{{ $job }}"
              }[5m]))
            ) > {{ $errorRatio }}
          for: 5m
          labels:
            severity: critical
            team: backend
            service: teachlink-backend
          annotations:
            summary: "High 5xx error rate on TeachLink backend"
            description: "The 5xx error ratio has been above the configured threshold of {{ $errorRatio }} (fraction of all requests) for 5 minutes. See runbook for triage steps."
            runbook_url: "{{ $runbooks }}/RUNBOOKS.md#higherrorrate"

        # HighP99Latency
        # Fires when the 99th-percentile request latency exceeds
        # thresholds.p99LatencySeconds (default 1 second) for 10 consecutive
        # minutes. Requires the "http_request_duration_seconds" histogram
        # emitted by the NestJS metrics module.
        - alert: HighP99Latency
          expr: |
            histogram_quantile(
              0.99,
              sum by (le) (
                rate(http_request_duration_seconds_bucket{
                  job="{{ $job }}"
                }[5m])
              )
            ) > {{ $p99Seconds }}
          for: 10m
          labels:
            severity: warning
            team: backend
            service: teachlink-backend
          annotations:
            summary: "P99 request latency exceeds {{ $p99Seconds }} s on TeachLink backend"
            description: "P99 latency has been above the {{ $p99Seconds }} s threshold for 10 minutes. See runbook for triage steps."
            runbook_url: "{{ $runbooks }}/RUNBOOKS.md#highp99latency"

    # ──────────────────────────────────────────────────────────────────────────
    # Group: queues — fires on job-queue saturation or dead-letter accumulation
    # ──────────────────────────────────────────────────────────────────────────
    - name: teachlink.queues
      interval: 60s
      rules:

        # QueueDepthHigh
        # Fires when any Bull/BullMQ queue has more than thresholds.queueDepth
        # (default 1000) waiting jobs for 10 consecutive minutes. Requires the
        # "bull_queue_waiting" gauge emitted by the @willsoto/nestjs-prometheus
        # Bull metrics plugin.
        - alert: QueueDepthHigh
          expr: |
            bull_queue_waiting{
              job="{{ $job }}"
            } > {{ $queueDepth }}
          for: 10m
          labels:
            severity: warning
            team: backend
            service: teachlink-backend
          annotations:
            summary: "Job queue depth exceeds {{ $queueDepth }} on TeachLink backend"
            description: "A Bull/BullMQ queue has had more than {{ $queueDepth }} waiting jobs for 10 minutes. See runbook for triage steps."
            runbook_url: "{{ $runbooks }}/RUNBOOKS.md#queuedepthhigh"

        # DLQDepthHigh
        # Fires when the failed-job count of a queue exceeds thresholds.dlqDepth
        # (default 50) for 5 minutes, indicating jobs are repeatedly failing.
        # Requires the "bull_queue_failed" gauge.
        - alert: DLQDepthHigh
          expr: |
            bull_queue_failed{
              job="{{ $job }}"
            } > {{ $dlqDepth }}
          for: 5m
          labels:
            severity: critical
            team: backend
            service: teachlink-backend
          annotations:
            summary: "Dead-letter queue depth is growing on TeachLink backend"
            description: "More than {{ $dlqDepth }} failed jobs have accumulated in a queue for 5 minutes. See runbook for triage steps."
            runbook_url: "{{ $runbooks }}/RUNBOOKS.md#dlqdepthhigh"
{{- end }}
159 changes: 159 additions & 0 deletions charts/teachlink-backend/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# ─────────────────────────────────────────────────────────────────────────────
# TeachLink Backend – Helm chart default values
# Override any of these in values-staging.yaml / values-production.yaml
# ─────────────────────────────────────────────────────────────────────────────

# -- Deployment metadata
replicaCount: 2

# Container image reference. `repository`, `pullPolicy` and `tag` are nested
# under `image` so templates can read them as .Values.image.*.
image:
  repository: ghcr.io/rinafcode/teachlink-backend
  pullPolicy: IfNotPresent
  tag: ""  # Defaults to Chart.appVersion when empty

# Service settings, read as .Values.service.*.
service:
  type: ClusterIP
  port: 3001

# ─────────────────────────────────────────────────────────────────────────────
# Prometheus alerting rules
# ─────────────────────────────────────────────────────────────────────────────
prometheusRule:
  # Set to true to deploy the PrometheusRule CR.
  # Requires kube-prometheus-stack (or prometheus-operator) to be installed.
  enabled: true

  # Must match the ruleSelector labels configured on your Prometheus CR.
  # With kube-prometheus-stack defaults this label is sufficient.
  additionalLabels:
    release: kube-prometheus-stack

  # The Prometheus `job` label value that the backend pods are scraped under.
  # Matches the ServiceMonitor / PodMonitor `jobLabel` field.
  jobLabel: teachlink-backend

  # Base URL prepended to runbook fragment links embedded in alert annotations.
  # Point this at your Git-hosting URL so on-call engineers can click straight
  # through from Alertmanager / PagerDuty / Slack to the runbook section.
  runbookBaseUrl: "https://github.com/rinafcode/teachLink_backend/blob/main/docs"

  # SLA thresholds – override per environment if needed.
  # Each value is compared directly against the alert expression result.
  thresholds:
    errorRatePct: 0.01       # 1 % – fraction (not percentage)
    p99LatencySeconds: 1.0   # 1 s
    queueDepth: 1000         # waiting jobs
    dlqDepth: 50             # failed jobs in DLQ

# ─────────────────────────────────────────────────────────────────────────────
# Alertmanager – Slack / webhook routing
#
# This section configures the Alertmanager config secret that kube-prometheus-
# stack creates. Set alertmanager.enabled=true and fill in your Slack
# webhook URL (or supply it via a pre-existing Secret; see secretRef below).
# ─────────────────────────────────────────────────────────────────────────────
alertmanager:
  enabled: true

  # ── Slack integration ──────────────────────────────────────────────────────
  slack:
    # Whether to enable Slack notifications.
    enabled: true

    # Webhook URL for your Slack app.
    # NEVER commit a real token here. Supply it at deploy-time:
    #   helm upgrade ... --set alertmanager.slack.webhookUrl="https://hooks.slack.com/..."
    # or store it in a Kubernetes Secret and reference it via secretRef below.
    webhookUrl: ""

    # Alternatively, reference a pre-existing Secret that contains the key
    # `slack-webhook-url`. When set, webhookUrl above is ignored.
    secretRef:
      name: ""  # e.g. teachlink-alertmanager-secrets
      key: slack-webhook-url

    # Slack channels per severity level.
    channels:
      critical: "#teachlink-alerts-critical"
      warning: "#teachlink-alerts-warning"
      default: "#teachlink-alerts"

  # ── Generic webhook (PagerDuty, OpsGenie, custom endpoint, …) ─────────────
  webhook:
    enabled: false
    url: ""

  # ── Alertmanager routing config (rendered into the kube-prometheus-stack
  #    alertmanager.config value).
  # Adjust `group_wait`, `group_interval`, and `repeat_interval` to taste.
  #
  # NOTE(review): the channel values and the escaped {{ "{{" }} … {{ "}}" }}
  # sequences below are Helm template syntax inside a values file. They are
  # only expanded if the consuming template passes this block through `tpl`;
  # confirm that it does, otherwise they reach Alertmanager verbatim.
  config:
    global:
      resolve_timeout: 5m
      # slack_api_url is set dynamically from alertmanager.slack.webhookUrl
      # by the chart helper; do not set it here.

    route:
      group_by: ['alertname', 'service', 'namespace']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      receiver: slack-default

      routes:
        # Critical alerts → dedicated critical channel, short repeat.
        - matchers:
            - 'severity = "critical"'
          receiver: slack-critical
          repeat_interval: 1h

        # Warning alerts → warning channel.
        - matchers:
            - 'severity = "warning"'
          receiver: slack-warning
          repeat_interval: 4h

    # The `text` bodies use a literal block scalar (|-) so that every field of
    # an alert stays on its own line in the Slack message; a folded scalar
    # would join these lines with spaces.
    receivers:
      - name: slack-default
        slack_configs:
          - channel: "{{ .Values.alertmanager.slack.channels.default }}"
            send_resolved: true
            title: '[{{ "{{" }} .Status | toUpper {{ "}}" }}{{ "{{" }} if eq .Status "firing" {{ "}}" }}:{{ "{{" }} .Alerts.Firing | len {{ "}}" }}{{ "{{" }} end {{ "}}" }}] TeachLink Alert'
            text: |-
              {{ "{{" }} range .Alerts {{ "}}" }}
              *Alert:* {{ "{{" }} .Annotations.summary {{ "}}" }}
              *Severity:* {{ "{{" }} .Labels.severity {{ "}}" }}
              *Description:* {{ "{{" }} .Annotations.description {{ "}}" }}
              *Runbook:* {{ "{{" }} .Annotations.runbook_url {{ "}}" }}
              {{ "{{" }} end {{ "}}" }}

      - name: slack-critical
        slack_configs:
          - channel: "{{ .Values.alertmanager.slack.channels.critical }}"
            send_resolved: true
            title: '🚨 [CRITICAL] TeachLink Alert'
            text: |-
              {{ "{{" }} range .Alerts {{ "}}" }}
              *Alert:* {{ "{{" }} .Annotations.summary {{ "}}" }}
              *Description:* {{ "{{" }} .Annotations.description {{ "}}" }}
              *Runbook:* {{ "{{" }} .Annotations.runbook_url {{ "}}" }}
              {{ "{{" }} end {{ "}}" }}

      - name: slack-warning
        slack_configs:
          - channel: "{{ .Values.alertmanager.slack.channels.warning }}"
            send_resolved: true
            title: '⚠️ [WARNING] TeachLink Alert'
            text: |-
              {{ "{{" }} range .Alerts {{ "}}" }}
              *Alert:* {{ "{{" }} .Annotations.summary {{ "}}" }}
              *Description:* {{ "{{" }} .Annotations.description {{ "}}" }}
              *Runbook:* {{ "{{" }} .Annotations.runbook_url {{ "}}" }}
              {{ "{{" }} end {{ "}}" }}

    inhibit_rules:
      # Suppress warnings when a critical alert for the same service is already
      # firing, to reduce noise.
      - source_matchers:
          - 'severity = "critical"'
        target_matchers:
          - 'severity = "warning"'
        equal: ['service', 'namespace']
12 changes: 12 additions & 0 deletions docs/API_VERSIONING_POLICY.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,18 @@ This project uses URL-based API versioning to protect clients from breaking chan
- Path-based versioning is the primary version selection mechanism
- API clients should prefer explicit `/api/v1/...` paths when available

## Supported version numbers

The middleware validates version strings against the pattern `/^v\d+$/` (the letter
`v` followed by one or more digits). Any other format is rejected with `400 Bad Request`.

| Version | Status | Notes |
|---------|---------|-------------------------|
| `v1` | Active | Current stable version |
| `v2` | Planned | Reserved for future use |

Examples of **invalid** version strings that are rejected: `vABC`, `v1.2`, `../v1`, `123`.

## Compatibility layer

The middleware rewrites legacy API requests from `/api/*` to `/api/v1/*`.
Expand Down
Loading