diff --git a/README.md b/README.md index 3c895a8..18bbeb9 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,5 @@ See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for detailed documentation. ## TODO -- Managed Databases: Deploy standalone databases (PostgreSQL, MySQL, Redis, etc.) with automated configuration - Notifications: Alert channels for deployment events and system alerts - Templates: Pre-configured templates for popular applications diff --git a/agent/internal/agent/backup.go b/agent/internal/agent/backup.go index 7469298..2397ee0 100644 --- a/agent/internal/agent/backup.go +++ b/agent/internal/agent/backup.go @@ -13,7 +13,6 @@ import ( "os" "path/filepath" "strings" - "time" "techulus/cloud-agent/internal/container" agenthttp "techulus/cloud-agent/internal/http" @@ -34,96 +33,6 @@ type StorageConfig struct { SecretKey string `json:"secretKey"` } -func detectDatabaseType(image string) string { - image = strings.ToLower(image) - switch { - case strings.Contains(image, "postgres"): - return "postgres" - case strings.Contains(image, "mysql"): - return "mysql" - case strings.Contains(image, "mariadb"): - return "mariadb" - case strings.Contains(image, "mongo"): - return "mongodb" - case strings.Contains(image, "redis"): - return "redis" - default: - return "" - } -} - -func getDatabaseBackupCommand(dbType string) []string { - switch dbType { - case "postgres": - return []string{"sh", "-c", "pg_dump -Fc --no-acl --no-owner -U ${POSTGRES_USER:-postgres} ${POSTGRES_DB:-postgres}"} - case "mysql": - return []string{"sh", "-c", "mysqldump -u root -p$MYSQL_ROOT_PASSWORD --all-databases --single-transaction"} - case "mariadb": - return []string{"sh", "-c", "mariadb-dump -u root -p$MARIADB_ROOT_PASSWORD --all-databases --single-transaction"} - case "mongodb": - return []string{"sh", "-c", "mongodump ${MONGO_INITDB_ROOT_USERNAME:+--username=$MONGO_INITDB_ROOT_USERNAME --password=$MONGO_INITDB_ROOT_PASSWORD --authenticationDatabase=admin} --archive --gzip"} - case "redis": - return []string{"redis-cli", "BGSAVE"} - default: - return nil - } -} - -func getDatabaseRestoreCommand(dbType string) []string { - switch dbType { - case "postgres": - return []string{"sh", "-c", "pg_restore -U ${POSTGRES_USER:-postgres} -d ${POSTGRES_DB:-postgres} --clean --if-exists"} - case "mysql": - return []string{"sh", "-c", "mysql -u root -p$MYSQL_ROOT_PASSWORD"} - case "mariadb": - return []string{"sh", "-c", "mariadb -u root -p$MARIADB_ROOT_PASSWORD"} - case "mongodb": - return []string{"sh", "-c", "mongorestore ${MONGO_INITDB_ROOT_USERNAME:+--username=$MONGO_INITDB_ROOT_USERNAME --password=$MONGO_INITDB_ROOT_PASSWORD --authenticationDatabase=admin} --archive --gzip"} - default: - return nil - } -} - -func getBackupFileExtension(dbType string) string { - switch dbType { - case "postgres": - return ".dump" - case "mysql", "mariadb": - return ".sql" - case "mongodb": - return ".archive.gz" - case "redis": - return ".rdb" - default: - return ".backup" - } -} - -var credentialErrors = map[string]string{ - "postgres": "PostgreSQL backup failed. Ensure POSTGRES_USER and POSTGRES_PASSWORD env vars are set, or that local trust authentication is enabled.", - "mysql": "MySQL backup failed. Ensure MYSQL_ROOT_PASSWORD env var is set in your container.", - "mariadb": "MariaDB backup failed. Ensure MARIADB_ROOT_PASSWORD or MYSQL_ROOT_PASSWORD env var is set.", - "mongodb": "MongoDB backup failed. Ensure MONGO_INITDB_ROOT_USERNAME and MONGO_INITDB_ROOT_PASSWORD env vars are set, or that auth is disabled.", - "redis": "Redis backup failed.", -} - -func isAuthError(output string) bool { - authPatterns := []string{ - "password authentication failed", - "Access denied", - "authentication failed", - "NOAUTH", - "auth failed", - } - outputLower := strings.ToLower(output) - for _, pattern := range authPatterns { - if strings.Contains(outputLower, strings.ToLower(pattern)) { - return true - } - } - return false -} - func (a *Agent) ProcessBackupVolume(item agenthttp.WorkQueueItem) error { var payload struct { BackupID string `json:"backupId"` @@ -132,18 +41,12 @@ func (a *Agent) ProcessBackupVolume(item agenthttp.WorkQueueItem) error { VolumeName string `json:"volumeName"` StoragePath string `json:"storagePath"` StorageConfig StorageConfig `json:"storageConfig"` - BackupType string `json:"backupType"` - ServiceImage string `json:"serviceImage"` } if err := json.Unmarshal([]byte(item.Payload), &payload); err != nil { return fmt.Errorf("failed to parse backup_volume payload: %w", err) } - if payload.BackupType == "database" { - return a.processDatabaseBackup(payload.BackupID, payload.ServiceID, payload.ContainerID, payload.ServiceImage, payload.StoragePath, payload.StorageConfig) - } - return a.processVolumeBackup(payload.BackupID, payload.ServiceID, payload.ContainerID, payload.VolumeName, payload.StoragePath, payload.StorageConfig) } @@ -158,6 +61,10 @@ func (a *Agent) processVolumeBackup(backupID, serviceID, containerID, volumeName volumePath := filepath.Join(a.DataDir, "volumes", serviceID, volumeName) log.Printf("[backup_volume] backing up volume %s from %s", volumeName, volumePath) + if !strings.HasSuffix(storagePath, ".tar.gz") { + return reportFailure(fmt.Errorf("unsupported backup archive path: %s", storagePath)) + } + if _, err := os.Stat(volumePath); os.IsNotExist(err) { return reportFailure(fmt.Errorf("volume path does not exist: %s", volumePath)) } @@ -169,26 +76,26 @@ func (a *Agent) processVolumeBackup(backupID, serviceID, containerID, volumeName } if running { - log.Printf("[backup_volume] pausing container %s", Truncate(containerID, 12)) - if err := container.Pause(containerID); err != nil { - return reportFailure(fmt.Errorf("failed to pause container: %w", err)) + log.Printf("[backup_volume] stopping container %s before backup", Truncate(containerID, 12)) + if err := container.Stop(containerID); err != nil { + return reportFailure(fmt.Errorf("failed to stop container: %w", err)) } defer func() { - log.Printf("[backup_volume] resuming container %s", Truncate(containerID, 12)) + log.Printf("[backup_volume] starting container %s after backup", Truncate(containerID, 12)) err := retry.WithBackoff(context.Background(), retry.UnpauseBackoff, func() (bool, error) { - if err := container.Unpause(containerID); err != nil { - log.Printf("[backup_volume] unpause attempt failed for container %s: %v", Truncate(containerID, 12), err) + if err := container.Start(containerID); err != nil { + log.Printf("[backup_volume] start attempt failed for container %s: %v", Truncate(containerID, 12), err) return false, err } return true, nil }) if err != nil { - log.Printf("[backup_volume] CRITICAL: failed to resume container %s: %v", Truncate(containerID, 12), err) + log.Printf("[backup_volume] CRITICAL: failed to start container %s after backup: %v", Truncate(containerID, 12), err) } }() } else { - log.Printf("[backup_volume] container %s not running; skipping pause", Truncate(containerID, 12)) + log.Printf("[backup_volume] container %s not running; skipping stop", Truncate(containerID, 12)) } } @@ -220,207 +127,6 @@ func (a *Agent) processVolumeBackup(backupID, serviceID, containerID, volumeName return nil } -func (a *Agent) processDatabaseBackup(backupID, serviceID, containerID, serviceImage, storagePath string, storageConfig StorageConfig) error { - reportFailure := func(err error) error { - if reportErr := a.Client.ReportBackupFailed(backupID, err.Error()); reportErr != nil { - log.Printf("[backup_database] warning: failed to report backup failure: %v", reportErr) - } - return err - } - - dbType := detectDatabaseType(serviceImage) - if dbType == "" { - log.Printf("[backup_database] unknown database type for image %s, falling back to volume backup", serviceImage) - return reportFailure(fmt.Errorf("database backup not supported for image: %s", serviceImage)) - } - - log.Printf("[backup_database] detected database type: %s for image %s", dbType, serviceImage) - - if containerID == "" { - return reportFailure(fmt.Errorf("containerId is required for database backup")) - } - - running, err := container.IsContainerRunning(containerID) - if err != nil { - return reportFailure(fmt.Errorf("failed to check container status: %w", err)) - } - if !running { - return reportFailure(fmt.Errorf("container %s is not running", containerID)) - } - - if dbType == "redis" { - return a.processRedisBackup(backupID, serviceID, containerID, storagePath, storageConfig) - } - - cmd := getDatabaseBackupCommand(dbType) - if cmd == nil { - return reportFailure(fmt.Errorf("no backup command for database type: %s", dbType)) - } - - log.Printf("[backup_database] executing backup command in container %s", Truncate(containerID, 12)) - - output, err := container.Exec(containerID, cmd) - if err != nil { - outputStr := string(output) - if isAuthError(outputStr) { - if errMsg, ok := credentialErrors[dbType]; ok { - return reportFailure(fmt.Errorf("%s\n\nOriginal error: %s", errMsg, outputStr)) - } - } - return reportFailure(fmt.Errorf("database backup failed: %s: %w", outputStr, err)) - } - - backupPath := filepath.Join(os.TempDir(), fmt.Sprintf("dbbackup-%s%s", backupID, getBackupFileExtension(dbType))) - defer os.Remove(backupPath) - - if err := os.WriteFile(backupPath, output, 0600); err != nil { - return reportFailure(fmt.Errorf("failed to write backup file: %w", err)) - } - - stat, err := os.Stat(backupPath) - if err != nil { - return reportFailure(fmt.Errorf("failed to stat backup file: %w", err)) - } - - checksum, err := calculateChecksum(backupPath) - if err != nil { - return reportFailure(fmt.Errorf("failed to calculate checksum: %w", err)) - } - - size := stat.Size() - - log.Printf("[backup_database] created backup: size=%d, checksum=%s", size, checksum) - - s3Client, err := createS3Client(storageConfig) - if err != nil { - return reportFailure(fmt.Errorf("failed to create S3 client: %w", err)) - } - - if err := uploadToS3(s3Client, storageConfig.Bucket, storagePath, backupPath); err != nil { - return reportFailure(fmt.Errorf("failed to upload to S3: %w", err)) - } - - log.Printf("[backup_database] uploaded to S3: %s/%s", storageConfig.Bucket, storagePath) - - if err := a.Client.ReportBackupComplete(backupID, size, checksum); err != nil { - return fmt.Errorf("failed to report backup complete: %w", err) - } - - return nil -} - -func (a *Agent) processRedisBackup(backupID, serviceID, containerID, storagePath string, storageConfig StorageConfig) error { - reportFailure := func(err error) error { - if reportErr := a.Client.ReportBackupFailed(backupID, err.Error()); reportErr != nil { - log.Printf("[backup_database] warning: failed to report backup failure: %v", reportErr) - } - return err - } - - log.Printf("[backup_database] getting Redis dump path from container %s", Truncate(containerID, 12)) - - rdbPath, err := getRedisRDBPath(containerID) - if err != nil { - return reportFailure(fmt.Errorf("failed to get Redis RDB path: %w", err)) - } - - lastSaveOutput, err := container.Exec(containerID, []string{"redis-cli", "LASTSAVE"}) - if err != nil { - return reportFailure(fmt.Errorf("failed to get LASTSAVE: %w", err)) - } - lastSaveBefore := strings.TrimSpace(string(lastSaveOutput)) - - log.Printf("[backup_database] triggering Redis BGSAVE in container %s", Truncate(containerID, 12)) - - output, err := container.Exec(containerID, []string{"redis-cli", "BGSAVE"}) - if err != nil { - return reportFailure(fmt.Errorf("redis BGSAVE failed: %s: %w", string(output), err)) - } - - log.Printf("[backup_database] waiting for BGSAVE to complete") - for i := 0; i < 60; i++ { - time.Sleep(time.Second) - lastSaveOutput, err := container.Exec(containerID, []string{"redis-cli", "LASTSAVE"}) - if err != nil { - continue - } - lastSaveAfter := strings.TrimSpace(string(lastSaveOutput)) - if lastSaveAfter != lastSaveBefore { - log.Printf("[backup_database] BGSAVE completed") - break - } - if i == 59 { - return reportFailure(fmt.Errorf("BGSAVE did not complete within 60 seconds")) - } - } - - log.Printf("[backup_database] copying %s from container", rdbPath) - - rdbOutput, err := container.Exec(containerID, []string{"cat", rdbPath}) - if err != nil { - return reportFailure(fmt.Errorf("failed to read Redis dump file: %s: %w", string(rdbOutput), err)) - } - - backupPath := filepath.Join(os.TempDir(), fmt.Sprintf("dbbackup-%s.rdb", backupID)) - defer os.Remove(backupPath) - - if err := os.WriteFile(backupPath, rdbOutput, 0600); err != nil { - return reportFailure(fmt.Errorf("failed to write backup file: %w", err)) - } - - stat, err := os.Stat(backupPath) - if err != nil { - return reportFailure(fmt.Errorf("failed to stat backup file: %w", err)) - } - - checksum, err := calculateChecksum(backupPath) - if err != nil { - return reportFailure(fmt.Errorf("failed to calculate checksum: %w", err)) - } - - s3Client, err := createS3Client(storageConfig) - if err != nil { - return reportFailure(fmt.Errorf("failed to create S3 client: %w", err)) - } - - if err := uploadToS3(s3Client, storageConfig.Bucket, storagePath, backupPath); err != nil { - return reportFailure(fmt.Errorf("failed to upload to S3: %w", err)) - } - - log.Printf("[backup_database] uploaded to S3: %s/%s", storageConfig.Bucket, storagePath) - - if err := a.Client.ReportBackupComplete(backupID, stat.Size(), checksum); err != nil { - return fmt.Errorf("failed to report backup complete: %w", err) - } - - return nil -} - -func getRedisRDBPath(containerID string) (string, error) { - dirOutput, err := container.Exec(containerID, []string{"redis-cli", "CONFIG", "GET", "dir"}) - if err != nil { - return "/data/dump.rdb", nil - } - dirParts := strings.Split(strings.TrimSpace(string(dirOutput)), "\n") - dir := "/data" - if len(dirParts) >= 2 { - dir = strings.TrimSpace(dirParts[1]) - } - - fileOutput, err := container.Exec(containerID, []string{"redis-cli", "CONFIG", "GET", "dbfilename"}) - if err != nil { - return filepath.Join(dir, "dump.rdb"), nil - } - fileParts := strings.Split(strings.TrimSpace(string(fileOutput)), "\n") - filename := "dump.rdb" - if len(fileParts) >= 2 { - filename = strings.TrimSpace(fileParts[1]) - } - - return filepath.Join(dir, filename), nil -} - - func (a *Agent) ProcessRestoreVolume(item agenthttp.WorkQueueItem) error { var payload struct { BackupID string `json:"backupId"` @@ -430,18 +136,12 @@ func (a *Agent) ProcessRestoreVolume(item agenthttp.WorkQueueItem) error { StoragePath string `json:"storagePath"` ExpectedChecksum string `json:"expectedChecksum"` StorageConfig StorageConfig `json:"storageConfig"` - BackupType string `json:"backupType"` - ServiceImage string `json:"serviceImage"` } if err := json.Unmarshal([]byte(item.Payload), &payload); err != nil { return fmt.Errorf("failed to parse restore_volume payload: %w", err) } - if payload.BackupType == "database" { - return a.processDatabaseRestore(payload.BackupID, payload.ServiceID, payload.ContainerID, payload.VolumeName, payload.ServiceImage, payload.StoragePath, payload.ExpectedChecksum, payload.StorageConfig) - } - return a.processVolumeRestore(payload.BackupID, payload.ServiceID, payload.ContainerID, payload.VolumeName, payload.StoragePath, payload.ExpectedChecksum, payload.StorageConfig) } @@ -459,6 +159,10 @@ func (a *Agent) processVolumeRestore(backupID, serviceID, containerID, volumeNam tarPath := filepath.Join(os.TempDir(), fmt.Sprintf("restore-%s.tar.gz", backupID)) defer os.Remove(tarPath) + if !strings.HasSuffix(storagePath, ".tar.gz") { + return reportFailure(fmt.Errorf("unsupported backup archive path: %s", storagePath)) + } + s3Client, err := createS3Client(storageConfig) if err != nil { return reportFailure(fmt.Errorf("failed to create S3 client: %w", err)) @@ -552,236 +256,6 @@ func (a *Agent) processVolumeRestore(backupID, serviceID, containerID, volumeNam return nil } -func (a *Agent) processDatabaseRestore(backupID, serviceID, containerID, volumeName, serviceImage, storagePath, expectedChecksum string, storageConfig StorageConfig) error { - dbType := detectDatabaseType(serviceImage) - if dbType == "" { - return fmt.Errorf("database restore not supported for image: %s", serviceImage) - } - - log.Printf("[restore_database] detected database type: %s for image %s", dbType, serviceImage) - - reportFailure := func(err error) error { - if reportErr := a.Client.ReportRestoreComplete(backupID, false, err.Error()); reportErr != nil { - log.Printf("[restore_database] warning: failed to report restore failure: %v", reportErr) - } - return err - } - - if dbType == "redis" && containerID == "" { - return a.processRedisRestoreToVolume(backupID, serviceID, volumeName, storagePath, expectedChecksum, storageConfig) - } - - if containerID == "" { - return reportFailure(fmt.Errorf("containerId is required for %s database restore", dbType)) - } - - var running bool - for i := 0; i < 30; i++ { - var err error - running, err = container.IsContainerRunning(containerID) - if err != nil { - log.Printf("[restore_database] failed to check container status (attempt %d): %v", i+1, err) - time.Sleep(time.Second) - continue - } - if running { - break - } - log.Printf("[restore_database] container %s not running yet, waiting... (attempt %d/30)", Truncate(containerID, 12), i+1) - time.Sleep(time.Second) - } - if !running { - return reportFailure(fmt.Errorf("container %s is not running after 30 seconds", containerID)) - } - - restorePath := filepath.Join(os.TempDir(), fmt.Sprintf("dbrestore-%s%s", backupID, getBackupFileExtension(dbType))) - defer os.Remove(restorePath) - - s3Client, err := createS3Client(storageConfig) - if err != nil { - return reportFailure(fmt.Errorf("failed to create S3 client: %w", err)) - } - - if err := downloadFromS3(s3Client, storageConfig.Bucket, storagePath, restorePath); err != nil { - return reportFailure(fmt.Errorf("failed to download from S3: %w", err)) - } - - log.Printf("[restore_database] downloaded from S3: %s/%s", storageConfig.Bucket, storagePath) - - checksum, err := calculateChecksum(restorePath) - if err != nil { - return reportFailure(fmt.Errorf("failed to calculate checksum: %w", err)) - } - - if checksum != expectedChecksum { - return reportFailure(fmt.Errorf("checksum mismatch: expected %s, got %s", expectedChecksum, checksum)) - } - - if dbType == "redis" { - return a.processRedisRestore(backupID, containerID, restorePath) - } - - log.Printf("[restore_database] copying backup file to container %s", Truncate(containerID, 12)) - - containerRestorePath := fmt.Sprintf("/tmp/restore%s", getBackupFileExtension(dbType)) - if err := copyFileToContainer(containerID, restorePath, containerRestorePath); err != nil { - return reportFailure(fmt.Errorf("failed to copy backup to container: %w", err)) - } - - defer container.Exec(containerID, []string{"rm", "-f", containerRestorePath}) - - log.Printf("[restore_database] executing restore command in container %s", Truncate(containerID, 12)) - - var restoreCmd []string - switch dbType { - case "postgres": - restoreCmd = []string{"sh", "-c", fmt.Sprintf("pg_restore -U ${POSTGRES_USER:-postgres} -d ${POSTGRES_DB:-postgres} --clean --if-exists %s", containerRestorePath)} - case "mysql": - restoreCmd = []string{"sh", "-c", fmt.Sprintf("mysql -u root -p$MYSQL_ROOT_PASSWORD < %s", containerRestorePath)} - case "mariadb": - restoreCmd = []string{"sh", "-c", fmt.Sprintf("mariadb -u root -p$MARIADB_ROOT_PASSWORD < %s", containerRestorePath)} - case "mongodb": - restoreCmd = []string{"sh", "-c", fmt.Sprintf("mongorestore ${MONGO_INITDB_ROOT_USERNAME:+--username=$MONGO_INITDB_ROOT_USERNAME --password=$MONGO_INITDB_ROOT_PASSWORD --authenticationDatabase=admin} --archive=%s --gzip", containerRestorePath)} - default: - return reportFailure(fmt.Errorf("unsupported database type for restore: %s", dbType)) - } - - output, err := container.Exec(containerID, restoreCmd) - if err != nil { - outputStr := string(output) - if isAuthError(outputStr) { - if errMsg, ok := credentialErrors[dbType]; ok { - return reportFailure(fmt.Errorf("%s\n\nOriginal error: %s", errMsg, outputStr)) - } - } - return reportFailure(fmt.Errorf("database restore failed: %s: %w", outputStr, err)) - } - - log.Printf("[restore_database] restored database successfully") - - if err := a.Client.ReportRestoreComplete(backupID, true, ""); err != nil { - log.Printf("[restore_database] warning: failed to report restore complete: %v", err) - } - - return nil -} - -func (a *Agent) processRedisRestore(backupID, containerID, restorePath string) error { - log.Printf("[restore_database] restoring Redis from %s", restorePath) - - reportFailure := func(err error) error { - if reportErr := a.Client.ReportRestoreComplete(backupID, false, err.Error()); reportErr != nil { - log.Printf("[restore_database] warning: failed to report restore failure: %v", reportErr) - } - return err - } - - rdbPath, err := getRedisRDBPath(containerID) - if err != nil { - return reportFailure(fmt.Errorf("failed to get Redis RDB path: %w", err)) - } - - log.Printf("[restore_database] stopping Redis container %s", Truncate(containerID, 12)) - if err := container.Stop(containerID); err != nil { - return reportFailure(fmt.Errorf("failed to stop container for restore: %w", err)) - } - - log.Printf("[restore_database] copying RDB file to %s", rdbPath) - if err := copyFileToContainer(containerID, restorePath, rdbPath); err != nil { - container.Start(containerID) - return reportFailure(fmt.Errorf("failed to copy RDB to container: %w", err)) - } - - log.Printf("[restore_database] starting Redis container %s", Truncate(containerID, 12)) - if err := container.Start(containerID); err != nil { - return reportFailure(fmt.Errorf("failed to start container after restore: %w", err)) - } - - log.Printf("[restore_database] restored Redis successfully") - - if err := a.Client.ReportRestoreComplete(backupID, true, ""); err != nil { - log.Printf("[restore_database] warning: failed to report restore complete: %v", err) - } - - return nil -} - -func (a *Agent) processRedisRestoreToVolume(backupID, serviceID, volumeName, storagePath, expectedChecksum string, storageConfig StorageConfig) error { - log.Printf("[restore_database] restoring Redis directly to volume path (no container)") - - reportFailure := func(err error) error { - if reportErr := a.Client.ReportRestoreComplete(backupID, false, err.Error()); reportErr != nil { - log.Printf("[restore_database] warning: failed to report restore failure: %v", reportErr) - } - return err - } - - if volumeName == "" { - volumeName = "data" - } - volumePath := filepath.Join(a.DataDir, "volumes", serviceID, volumeName) - rdbPath := filepath.Join(volumePath, "dump.rdb") - - restorePath := filepath.Join(os.TempDir(), fmt.Sprintf("redis-restore-%s.rdb", serviceID)) - defer os.Remove(restorePath) - - s3Client, err := createS3Client(storageConfig) - if err != nil { - return reportFailure(fmt.Errorf("failed to create S3 client: %w", err)) - } - - if err := downloadFromS3(s3Client, storageConfig.Bucket, storagePath, restorePath); err != nil { - return reportFailure(fmt.Errorf("failed to download from S3: %w", err)) - } - - checksum, err := calculateChecksum(restorePath) - if err != nil { - return reportFailure(fmt.Errorf("failed to calculate checksum: %w", err)) - } - if checksum != expectedChecksum { - return reportFailure(fmt.Errorf("checksum mismatch: expected %s, got %s", expectedChecksum, checksum)) - } - - if err := os.MkdirAll(volumePath, 0755); err != nil { - return reportFailure(fmt.Errorf("failed to create volume directory: %w", err)) - } - - if err := copyFile(restorePath, rdbPath); err != nil { - return reportFailure(fmt.Errorf("failed to copy RDB file: %w", err)) - } - - log.Printf("[restore_database] restored Redis RDB to %s", rdbPath) - - if err := a.Client.ReportRestoreComplete(backupID, true, ""); err != nil { - log.Printf("[restore_database] warning: failed to report restore complete: %v", err) - } - - return nil -} - -func copyFile(src, dst string) error { - in, err := os.Open(src) - if err != nil { - return err - } - defer in.Close() - - out, err := os.Create(dst) - if err != nil { - return err - } - defer out.Close() - - if _, err := io.Copy(out, in); err != nil { - return err - } - return out.Sync() -} - -func copyFileToContainer(containerID, srcPath, destPath string) error { - return container.CopyToContainer(containerID, srcPath, destPath) -} - func createS3Client(cfg StorageConfig) (*s3.Client, error) { awsCfg, err := config.LoadDefaultConfig(context.Background(), config.WithRegion(cfg.Region), @@ -990,7 +464,7 @@ func extractTarGz(archivePath, destPath string) error { } resolvedLink := filepath.Clean(linkTarget) if !strings.HasPrefix(resolvedLink, filepath.Clean(destPath)+string(os.PathSeparator)) && - resolvedLink != filepath.Clean(destPath) { + resolvedLink != filepath.Clean(destPath) { return fmt.Errorf("invalid symlink target: %s -> %s", header.Name, header.Linkname) } if err := os.MkdirAll(filepath.Dir(targetPath), 0755); err != nil { diff --git a/deployment/README.md b/deployment/README.md index 884fd1c..92f4e9d 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -62,13 +62,13 @@ WEB_REPLICAS=2 ``` Traefik discovers the replicated `web` containers through the Docker provider -and load balances requests for `${ROOT_DOMAIN}` across them. The startup schema -sync remains in the web container entrypoint, so keep in mind that simultaneous -replica starts may run `drizzle-kit push` concurrently during upgrades. +and load balances requests for `${ROOT_DOMAIN}` across them. Schema sync runs +once from the dedicated `migrate` service before the replicated `web` containers +start, so scaling `WEB_REPLICAS` does not run migrations from every replica. ## Database Migrations -Schema is synced automatically on container startup via `drizzle-kit push`. This approach auto-confirms non-destructive changes (adding tables, columns, indexes) but will **not** auto-apply destructive changes like dropping columns or tables — those require manual intervention. +Schema is synced automatically by the one-shot `migrate` service via `drizzle-kit push`. This approach auto-confirms non-destructive changes (adding tables, columns, indexes) but will **not** auto-apply destructive changes like dropping columns or tables — those require manual intervention. If schema sync fails, `web` startup is blocked; inspect the failure with `docker compose -f compose.production.yml logs migrate`. **Future plan:** Once the schema stabilizes, switch to `drizzle-kit generate` + `drizzle-orm migrate()` with pre-generated SQL migration files. This will eliminate the esbuild/drizzle-kit dependency from the production image. diff --git a/deployment/compose.postgres.yml b/deployment/compose.postgres.yml index 3d309c8..3c8d0dd 100644 --- a/deployment/compose.postgres.yml +++ b/deployment/compose.postgres.yml @@ -69,6 +69,28 @@ services: start_period: 30s restart: unless-stopped + migrate: + image: ghcr.io/techulus/cloud/web:tip + env_file: + - ./.env + environment: + - DATABASE_URL=${DATABASE_URL} + - BETTER_AUTH_URL=https://${ROOT_DOMAIN} + - APP_URL=https://${ROOT_DOMAIN} + - VICTORIA_LOGS_URL=https://${VL_USERNAME}:${VL_PASSWORD}@logs.${ROOT_DOMAIN} + - VICTORIA_LOGS_PRIVATE_URL=http://${VL_USERNAME}:${VL_PASSWORD}@victoria-logs:9428 + - REGISTRY_URL=registry:5000 + - REGISTRY_HOST=registry.${ROOT_DOMAIN} + - INNGEST_BASE_URL=http://inngest:8288 + - INNGEST_SIGNING_KEY=${INNGEST_SIGNING_KEY} + - INNGEST_EVENT_KEY=${INNGEST_EVENT_KEY} + - ALLOW_SIGNUP=${ALLOW_SIGNUP:-false} + depends_on: + postgres: + condition: service_healthy + command: ["sh", "-c", "echo y | npx drizzle-kit push"] + restart: on-failure + web: image: ghcr.io/techulus/cloud/web:tip scale: ${WEB_REPLICAS:-1} @@ -87,10 +109,12 @@ services: - INNGEST_EVENT_KEY=${INNGEST_EVENT_KEY} - ALLOW_SIGNUP=${ALLOW_SIGNUP:-false} depends_on: - - postgres - - victoria-logs - - registry - - inngest + migrate: + condition: service_completed_successfully + victoria-logs: + condition: service_started + registry: + condition: service_started labels: - "traefik.enable=true" - "traefik.http.routers.web.rule=Host(`${ROOT_DOMAIN}`)" @@ -168,6 +192,9 @@ services: - "start" - "--sdk-url" - "http://web:3000/api/inngest" + depends_on: + web: + condition: service_healthy healthcheck: test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8288/health"] interval: 30s diff --git a/deployment/compose.production.yml b/deployment/compose.production.yml index 7b18b68..e95a9b5 100644 --- a/deployment/compose.production.yml +++ b/deployment/compose.production.yml @@ -51,6 +51,25 @@ services: retries: 3 restart: unless-stopped + migrate: + image: ghcr.io/techulus/cloud/web:tip + env_file: + - ./.env + environment: + - DATABASE_URL=${DATABASE_URL} + - BETTER_AUTH_URL=https://${ROOT_DOMAIN} + - APP_URL=https://${ROOT_DOMAIN} + - VICTORIA_LOGS_URL=https://${VL_USERNAME}:${VL_PASSWORD}@logs.${ROOT_DOMAIN} + - VICTORIA_LOGS_PRIVATE_URL=http://${VL_USERNAME}:${VL_PASSWORD}@victoria-logs:9428 + - REGISTRY_URL=registry:5000 + - REGISTRY_HOST=registry.${ROOT_DOMAIN} + - INNGEST_BASE_URL=http://inngest:8288 + - INNGEST_SIGNING_KEY=${INNGEST_SIGNING_KEY} + - INNGEST_EVENT_KEY=${INNGEST_EVENT_KEY} + - ALLOW_SIGNUP=${ALLOW_SIGNUP:-false} + command: ["sh", "-c", "echo y | npx drizzle-kit push"] + restart: on-failure + web: image: ghcr.io/techulus/cloud/web:tip scale: ${WEB_REPLICAS:-1} @@ -69,9 +88,12 @@ services: - INNGEST_EVENT_KEY=${INNGEST_EVENT_KEY} - ALLOW_SIGNUP=${ALLOW_SIGNUP:-false} depends_on: - - victoria-logs - - registry - - inngest + migrate: + condition: service_completed_successfully + victoria-logs: + condition: service_started + registry: + condition: service_started labels: - "traefik.enable=true" - "traefik.http.routers.web.rule=Host(`${ROOT_DOMAIN}`)" @@ -149,6 +171,9 @@ services: - "start" - "--sdk-url" - "http://web:3000/api/inngest" + depends_on: + web: + condition: service_healthy healthcheck: test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8288/health"] interval: 30s diff --git a/docs/deployments/compose.mdx b/docs/deployments/compose.mdx index 44b881d..e3afabc 100644 --- a/docs/deployments/compose.mdx +++ b/docs/deployments/compose.mdx @@ -35,4 +35,4 @@ Each service in the compose file becomes a separate Techulus Cloud service withi If a service in the compose file defines volumes, it is automatically marked as stateful. Stateful services are limited to 1 replica and pinned to a single server. -Imported stateful services use single-server local storage. Techulus Cloud does not currently provide replicated volumes or automatic failover for these services. Avoid importing production databases unless you accept the single-node storage risk and have an external backup and recovery plan. +Imported stateful services use single-server local storage. Techulus Cloud does not currently provide replicated volumes or automatic failover for these services. Do not import production databases into Techulus Cloud local volumes; use an external managed or HA database instead. diff --git a/docs/infrastructure/backups.mdx b/docs/infrastructure/backups.mdx index 9400a99..8aa8e88 100644 --- a/docs/infrastructure/backups.mdx +++ b/docs/infrastructure/backups.mdx @@ -1,25 +1,17 @@ --- title: "Backups" -description: "Automated database backups to S3-compatible storage." +description: "Automated volume backups to S3-compatible storage." --- -Techulus Cloud can automatically back up databases running in your containers to S3-compatible storage. Backups are triggered on a schedule or manually from the web UI. +Techulus Cloud can automatically back up service volumes to S3-compatible storage. Backups are triggered on a schedule or manually from the web UI. > **Backups are not high availability:** Backups provide point-in-time disaster recovery only. If a server hosting a stateful service is lost, any writes after the last successful backup may be lost, and recovery requires restoring data before the service can run elsewhere. Backups do not provide replicated storage or automatic failover. -## Supported Databases +## How It Works -The agent detects the database type from the container image name and runs the appropriate dump command: +The agent stops the running container, archives the volume directory as a `.tar.gz` file, uploads the archive to S3-compatible storage, then starts the container again if it was running. -| Database | Dump Command | -| --- | --- | -| PostgreSQL | `pg_dump -Fc` | -| MySQL | `mysqldump` | -| MariaDB | `mysqldump` | -| MongoDB | `mongodump --archive --gzip` | -| Redis | `redis-cli BGSAVE` | - -Backups are compressed and uploaded as `.tar.gz` archives. +Backups are volume snapshots. Techulus Cloud does not run database-native dump tools, and running production databases on local single-node volumes is not recommended. ## Configuration diff --git a/docs/installation.mdx b/docs/installation.mdx index 7af57c2..f272e2b 100644 --- a/docs/installation.mdx +++ b/docs/installation.mdx @@ -100,8 +100,9 @@ use the common commands below when investigating a self-hosted service. When `WEB_REPLICAS` is greater than `1`, Traefik discovers the replicated web containers through Docker and load balances requests for `` across -them. Startup schema sync still runs from each web container, so simultaneous -replica starts may run `drizzle-kit push` concurrently during upgrades. +them. Schema sync runs once from the dedicated `migrate` service before the +replicated web containers start, so scaling `WEB_REPLICAS` does not run +migrations from every replica. ### GitHub Integration (Optional) @@ -143,7 +144,7 @@ Traefik handles TLS termination and automatic certificate renewal via Let's Encr ## Database Migrations -The schema is synced automatically on container startup via `drizzle-kit push`. Non-destructive changes (adding tables, columns, indexes) are applied automatically. Destructive changes like dropping columns require manual intervention. +The schema is synced automatically by the one-shot `migrate` service via `drizzle-kit push`. Non-destructive changes (adding tables, columns, indexes) are applied automatically. Destructive changes like dropping columns require manual intervention. If schema sync fails, `web` startup is blocked; inspect the failure with `docker compose logs migrate`. ## Common Commands diff --git a/docs/networking/service-discovery.mdx b/docs/networking/service-discovery.mdx index 062d447..c59afde 100644 --- a/docs/networking/service-discovery.mdx +++ b/docs/networking/service-discovery.mdx @@ -25,10 +25,10 @@ No manual configuration is needed. Services can reference each other by name imm ## Example -If you have a `postgres` service and a `web` service, the web service can connect to the database using: +If you have an `api` service and a `web` service, the web service can connect to the API using: ``` -postgres://user:pass@postgres.internal:5432/mydb +http://api.internal:3000 ``` -For production databases, prefer an external managed database or another HA database setup. Techulus Cloud stateful volumes are single-server local storage and do not currently provide replicated storage or automatic failover. +For production databases, use an external managed database or another HA database setup. Techulus Cloud stateful volumes are single-server local storage and do not currently provide replicated storage or automatic failover. diff --git a/docs/networking/tcp-udp-proxy.mdx b/docs/networking/tcp-udp-proxy.mdx index 4975291..4ebb501 100644 --- a/docs/networking/tcp-udp-proxy.mdx +++ b/docs/networking/tcp-udp-proxy.mdx @@ -5,7 +5,7 @@ description: "Expose non-HTTP services like game servers and custom protocols." Not every service speaks HTTP. Techulus Cloud supports exposing raw TCP and UDP ports through proxy nodes for services like game servers or custom protocols. -> **Database exposure warning:** Database ports should usually remain private on the WireGuard network. Public database access increases security risk, and Techulus Cloud does not currently provide HA storage or automatic failover for production databases. +> **Database exposure warning:** Do not expose database ports publicly through the TCP/UDP proxy. Public database access increases security risk, and Techulus Cloud does not provide HA storage or automatic failover for production databases. ## Configuration diff --git a/docs/services/configuration.mdx b/docs/services/configuration.mdx index 21ea234..17282c2 100644 --- a/docs/services/configuration.mdx +++ b/docs/services/configuration.mdx @@ -37,6 +37,8 @@ You can set CPU and memory limits per service: New services default to the Large preset: 2 CPU cores and 1024 MB memory. You can change the preset or choose No limit to let the container use whatever resources are available on the host. +Resource limits are runtime caps, not reserved capacity. Placement uses current server health and replica distribution, but it does not require the sum of configured limits to fit as guaranteed minimum allocation. + ## Health Checks Health checks verify that a container is ready to receive traffic. When configured, the platform waits for the health check to pass before routing traffic to a new deployment. diff --git a/docs/services/volumes.mdx b/docs/services/volumes.mdx index 474d437..49079f7 100644 --- a/docs/services/volumes.mdx +++ b/docs/services/volumes.mdx @@ -5,7 +5,7 @@ description: "Persistent storage for stateful services." Volumes provide persistent storage that survives container restarts and redeployments. -> **Stateful storage warning:** Volumes are stored on a single server's local filesystem. Techulus Cloud does not currently provide replicated volumes, automatic storage failover, or high availability for stateful services. If the server hosting a volume is lost, data can only be recovered from completed backups. We do not recommend running production databases on Techulus Cloud until HA storage and failover are implemented, unless you accept this risk and maintain an external recovery plan. +> **Stateful storage warning:** Volumes are stored on a single server's local filesystem. Techulus Cloud does not currently provide replicated volumes, automatic storage failover, or high availability for stateful services. If the server hosting a volume is lost, data can only be recovered from completed backups. Do not use Techulus Cloud local volumes for production databases. ## Adding Volumes @@ -14,7 +14,7 @@ Each volume has a name and a container path: | Field | Description | | --- | --- | | Name | Unique identifier for the volume | -| Container path | Where the volume is mounted inside the container (e.g., `/var/lib/postgresql/data`) | +| Container path | Where the volume is mounted inside the container (e.g., `/data`) | When you add a volume, the service automatically becomes **stateful**. Stateful services are locked to a single server and limited to 1 replica so the container always mounts the same local data path. When the last volume is removed, the service reverts to stateless. @@ -27,7 +27,7 @@ Volumes can be backed up to S3-compatible storage on a schedule or on demand. | Backup enabled | Toggle automatic backups | | Backup schedule | Cron expression for backup frequency | -Backups are compressed as `.tar.gz` archives and uploaded to the configured [backup storage](/infrastructure/backups). Each backup tracks its size, checksum, and completion status. +Backups stop the running container, compress the volume as a `.tar.gz` archive, upload it to the configured [backup storage](/infrastructure/backups), then start the container again if it was running. Each backup tracks its size, checksum, and completion status. Backup statuses: diff --git a/web/Dockerfile b/web/Dockerfile index 04db67d..282f5fc 100644 --- a/web/Dockerfile +++ b/web/Dockerfile @@ -30,4 +30,4 @@ COPY --from=builder /app/db ./db COPY --from=builder /app/drizzle.config.ts ./drizzle.config.ts COPY --from=drizzle /drizzle/node_modules ./node_modules EXPOSE 3000 -CMD ["sh", "-c", "echo y | npx drizzle-kit push && node server.js"] +CMD ["node", "server.js"] diff --git a/web/actions/backups.ts b/web/actions/backups.ts index b027807..00817e0 100644 --- a/web/actions/backups.ts +++ b/web/actions/backups.ts @@ -10,15 +10,10 @@ import { inngest } from "@/lib/inngest/client"; import { inngestEvents } from "@/lib/inngest/events"; import { deleteFromS3 } from "@/lib/s3"; -export async function createBackup( - serviceId: string, - volumeId: string, - backupTypeOverride?: "volume" | "database", -) { +export async function createBackup(serviceId: string, volumeId: string) { const result = await triggerBackup({ serviceId, volumeId, - backupTypeOverride, }); await inngest.send( diff --git a/web/actions/migrations.ts b/web/actions/migrations.ts index 10ea9bf..0a69e32 100644 --- a/web/actions/migrations.ts +++ b/web/actions/migrations.ts @@ -1,11 +1,10 @@ "use server"; import { and, eq } from "drizzle-orm"; +import { revalidatePath } from "next/cache"; import { db } from "@/db"; -import { services, serviceVolumes, deployments } from "@/db/schema"; import { getBackupStorageConfig } from "@/db/queries"; -import { detectDatabaseType } from "@/lib/database-utils"; -import { revalidatePath } from "next/cache"; +import { deployments, services, serviceVolumes } from "@/db/schema"; import { inngest } from "@/lib/inngest/client"; import { inngestEvents } from "@/lib/inngest/events"; @@ -74,9 +73,6 @@ export async function startMigration( throw new Error("Service is already running on the target server"); } - const dbType = detectDatabaseType(service.image); - const isDatabase = Boolean(dbType); - await db .update(services) .set({ @@ -95,7 +91,6 @@ export async function startMigration( sourceDeploymentId: deployment.id, sourceContainerId: deployment.containerId, volumes: volumes.map((v) => ({ id: v.id, name: v.name })), - isDatabase, }), ); diff --git a/web/components/service/details/backup-tab.tsx b/web/components/service/details/backup-tab.tsx index 4129728..f0ae417 100644 --- a/web/components/service/details/backup-tab.tsx +++ b/web/components/service/details/backup-tab.tsx @@ -1,10 +1,20 @@ "use client"; +import { + Archive, + CheckCircle, + Clock, + Download, + Loader2, + RefreshCcw, + Trash2, + XCircle, +} from "lucide-react"; import { memo, useState } from "react"; +import { toast } from "sonner"; import useSWR from "swr"; -import { formatDateTime } from "@/lib/date"; -import { Button } from "@/components/ui/button"; -import { Item, ItemContent, ItemMedia, ItemTitle } from "@/components/ui/item"; +import { createBackup, deleteBackup, restoreBackup } from "@/actions/backups"; +import { updateServiceBackupSettings } from "@/actions/projects"; import { AlertDialog, AlertDialogAction, @@ -15,25 +25,14 @@ import { AlertDialogHeader, AlertDialogTitle, } from "@/components/ui/alert-dialog"; +import { Button } from "@/components/ui/button"; +import { Item, ItemContent, ItemMedia, ItemTitle } from "@/components/ui/item"; import { NativeSelect, NativeSelectOption, } from "@/components/ui/native-select"; -import { - Archive, - Download, - Trash2, - RefreshCcw, - Clock, - CheckCircle, - XCircle, - Loader2, -} from "lucide-react"; -import { createBackup, restoreBackup, deleteBackup } from "@/actions/backups"; -import { toast } from "sonner"; -import { detectDatabaseType } from "@/lib/database-utils"; -import { updateServiceBackupSettings } from "@/actions/projects"; import type { ServiceWithDetails as Service } from "@/db/types"; +import { formatDateTime } from "@/lib/date"; import { fetcher } from "@/lib/fetcher"; type BackupItem = { @@ -91,8 +90,6 @@ export const BackupTab = memo(function BackupTab({ const [error, setError] = useState(null); const volumes = service.volumes || []; - const detectedDbType = detectDatabaseType(service.image); - const isDatabaseService = detectedDbType !== null; const hasChanges = backupEnabled !== (service.backupEnabled ?? false) || @@ -213,13 +210,6 @@ export const BackupTab = memo(function BackupTab({ Weekly - - {isDatabaseService && ( -

- Database detected ({detectedDbType}). Scheduled backups will - use native database tools for portable backups. -

- )} )} diff --git a/web/components/service/details/tcp-proxy-section.tsx b/web/components/service/details/tcp-proxy-section.tsx index f62eb1d..abd176f 100644 --- a/web/components/service/details/tcp-proxy-section.tsx +++ b/web/components/service/details/tcp-proxy-section.tsx @@ -1,12 +1,12 @@ "use client"; -import { useState, memo } from "react"; +import { Check, Copy, Lock, Network, Plus, X } from "lucide-react"; +import { memo, useState } from "react"; +import { updateServiceConfig } from "@/actions/projects"; import { Button } from "@/components/ui/button"; import { Input } from "@/components/ui/input"; -import { Switch } from "@/components/ui/switch"; import { Item, ItemContent, ItemMedia, ItemTitle } from "@/components/ui/item"; -import { Network, X, Plus, Copy, Check, Lock } from "lucide-react"; -import { updateServiceConfig } from "@/actions/projects"; +import { Switch } from "@/components/ui/switch"; import type { ServiceWithDetails as Service } from "@/db/types"; export const TCPProxySection = memo(function TCPProxySection({ @@ -108,8 +108,7 @@ export const TCPProxySection = memo(function TCPProxySection({

- Expose TCP/UDP ports directly through the proxy for databases and - other non-HTTP services. + Expose TCP/UDP ports directly through the proxy for non-HTTP services.

{tcpUdpPorts.length > 0 && ( @@ -208,14 +207,20 @@ export const TCPProxySection = memo(function TCPProxySection({
{protocol === "tcp" && ( -