From b067b6bc9ceda761e3e89192a96fe57f32d20946 Mon Sep 17 00:00:00 2001 From: zukwiz Date: Wed, 27 May 2026 19:07:30 +0200 Subject: [PATCH] cmd/seed: synthetic dataset for local development MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a deterministic ~6-month synthetic dataset (~9.8k rows, gentle sinusoid with occasional spikes and quiet days) for exercising the dashboard locally without needing real production exports. The generator deliberately spans every period (7d / 30d / 3m / 6m / 1y) so the chart UI has data to render at any range. Safety properties: - Refuses to run unless Config.Environment == "development". - INSERT … ON CONFLICT (id) DO NOTHING, so re-running is a no-op. - Steam IDs use a clearly-synthetic 76561198000000000 prefix. - Snowflake IDs encode the same created_at + sequence layout as the production generator, so synthetic rows sort chronologically alongside any real rows already in the DB. internal-docs/ and internal/devseed/fixtures/ are added to .gitignore to keep author scratch space and any future local CSV fixtures out of the public repo. Co-authored-by: Cursor --- .gitignore | 3 + README.md | 15 +++ cmd/seed/main.go | 61 ++++++++++++ internal/devseed/synthetic.go | 179 ++++++++++++++++++++++++++++++++++ 4 files changed, 258 insertions(+) create mode 100644 cmd/seed/main.go create mode 100644 internal/devseed/synthetic.go diff --git a/.gitignore b/.gitignore index 595dfa4..4e1454c 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ static/*.db-wal .secrets .env + +internal-docs/ +internal/devseed/fixtures/ diff --git a/README.md b/README.md index 1926b7f..3805cbb 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,21 @@ If you're looking to participate by contributing reversal reports (i.e. marketpl The server starts on port `80` by default (configurable via `HTTP_PORT`). +### Seeding local data + +Production ingests live data from contributing marketplaces. 
For local development, a deterministic synthetic dataset (~6 months, ~9.8k rows with realistic daily variance) can be loaded with:
+
+```bash
+go run ./cmd/seed
+```
+
+The seed:
+
+- Refuses to run unless `Environment` is `development`.
+- Uses `INSERT … ON CONFLICT (id) DO NOTHING`, so it's safe to re-run.
+- Generates a deterministic 6-month dataset so the dashboard at `/` has enough data to exercise every period (7d / 30d / 3m / 6m / 1y).
+- Uses a synthetic Steam ID prefix (`76561198000000000`) so generated IDs are clearly fake.
+
 ## Configuration
 
 Configuration is loaded from environment variables or a `config.json` file.
diff --git a/cmd/seed/main.go b/cmd/seed/main.go
new file mode 100644
index 0000000..e683514
--- /dev/null
+++ b/cmd/seed/main.go
@@ -0,0 +1,70 @@
+// Command seed loads a deterministic synthetic dataset into the local
+// Reverse Watch Postgres databases for local dashboard development.
+// It is NEVER intended to run in production and refuses to run unless
+// Config.Environment is "development".
+//
+//	go run ./cmd/seed
+//
+// The insert uses ON CONFLICT (id) DO NOTHING, so re-running is safe.
+package main
+
+import (
+	"fmt"
+	"os"
+	"time"
+
+	"reverse-watch/config"
+	"reverse-watch/domain/models"
+	"reverse-watch/domain/models/constants"
+	"reverse-watch/internal/devseed"
+	"reverse-watch/logging"
+	"reverse-watch/repository/factory"
+	"reverse-watch/secret"
+)
+
+// main reports any seeding failure on stderr and exits with status 1.
+func main() {
+	if err := run(); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+}
+
+// run performs the seed and returns the first fatal error. It exists so the
+// deferred factory Close still executes on failure: os.Exit, which main
+// calls, terminates the process without running deferred functions.
+func run() error {
+	logging.Initialize()
+	cfg := config.Load()
+
+	if cfg.Environment != constants.EnvironmentDevelopment {
+		return fmt.Errorf("refusing to seed: environment is %q (only %q is allowed)", cfg.Environment, constants.EnvironmentDevelopment)
+	}
+
+	// Required by factory bootstrap (e.g. admin API key seeding). The
+	// synthetic generator pre-populates its own IDs, so the snowflake
+	// generator does not actually run for them.
+	models.InitSnowflakeGenerator(0, 0)
+
+	keygen := secret.NewKeyGenerator(cfg.Environment)
+	f, err := factory.NewFactory(cfg, keygen)
+	if err != nil {
+		return fmt.Errorf("failed to create factory: %w", err)
+	}
+	defer func() {
+		if err := f.Close(); err != nil {
+			fmt.Fprintf(os.Stderr, "failed to close factory: %v\n", err)
+		}
+	}()
+
+	reversals := devseed.GenerateSynthetic(time.Now().UTC())
+	fmt.Printf("generated %d synthetic reversals (deterministic seed)\n", len(reversals))
+
+	inserted, err := devseed.InsertReversals(f.PublicDB(), reversals)
+	if err != nil {
+		return fmt.Errorf("failed to insert reversals: %w", err)
+	}
+	skipped := int64(len(reversals)) - inserted
+	fmt.Printf("seed complete: %d inserted, %d already present (skipped)\n", inserted, skipped)
+	return nil
+}
diff --git a/internal/devseed/synthetic.go b/internal/devseed/synthetic.go
new file mode 100644
index 0000000..8b93957
--- /dev/null
+++ b/internal/devseed/synthetic.go
@@ -0,0 +1,179 @@
+// Package devseed loads dev-only fixture data into the local Postgres
+// instance. It is intentionally not wired into the main binary — call it
+// from cmd/seed (or a test) when you need realistic data locally.
+package devseed
+
+import (
+	"math"
+	"math/rand"
+	"time"
+
+	"reverse-watch/domain/models"
+
+	"gorm.io/gorm"
+	"gorm.io/gorm/clause"
+)
+
+const (
+	syntheticRNGSeed      int64  = 42
+	syntheticDays                = 180
+	syntheticTargetTotal         = 9800
+	syntheticBaseSteamID  uint64 = 76561198000000000
+	syntheticBaseReporter uint   = 2_900_000
+)
+
+var syntheticMarketplaces = []struct {
+	slug   string
+	weight float64
+}{
+	{"csfloat", 0.80},
+	{"tradeit", 0.10},
+	{"skinport", 0.05},
+	{"swap.gg", 0.05},
+}
+
+// GenerateSynthetic returns a deterministic ~6-month dataset (~9,800 rows,
+// at least one per day, gentle sinusoid with occasional spikes / quiet
+// days).
Snowflake IDs are unique within the slice and won't collide with
+// real CSV-seeded IDs, so callers can pipe the result straight into
+// InsertReversals. Timestamps, and the IDs derived from them, are anchored
+// to now, so identical output is only guaranteed for an identical now.
+func GenerateSynthetic(now time.Time) []*models.Reversal {
+	rng := rand.New(rand.NewSource(syntheticRNGSeed))
+	nowMs := uint64(now.UnixMilli())
+	today := now.UTC().Truncate(24 * time.Hour) // UTC midnight of now's day, whatever zone now carries
+
+	counts := make([]int, syntheticDays)
+	for d := 0; d < syntheticDays; d++ {
+		base := 40.0 + 10.0*math.Sin(float64(d)/30.0)
+		var mult float64
+		switch r := rng.Float64(); {
+		case r < 0.05:
+			mult = 2.5 + rng.Float64()*2.5
+		case r < 0.15:
+			mult = 0.2 + rng.Float64()*0.3
+		default:
+			mult = 0.7 + rng.Float64()*0.6
+		}
+		counts[d] = int(math.Max(1, base*mult+rng.NormFloat64()*5)) // floor of 1 keeps every day populated
+	}
+
+	total := 0
+	for _, c := range counts {
+		total += c
+	}
+	if total > 0 {
+		scale := float64(syntheticTargetTotal) / float64(total) // rescale so the sum lands near the target
+		for d := range counts {
+			counts[d] = int(math.Max(1, math.Round(float64(counts[d])*scale)))
+		}
+	}
+
+	rows := make([]*models.Reversal, 0, syntheticTargetTotal+200)
+	var steamOffset uint64 = 1
+	var seq uint16
+
+	for d := 0; d < syntheticDays; d++ {
+		dayStart := today.AddDate(0, 0, -(syntheticDays-1-d))
+		for i := 0; i < counts[d]; i++ {
+			reversedAt := dayStart.Add(time.Duration(rng.Float64() * float64(24*time.Hour)))
+			if uint64(reversedAt.UnixMilli()) > nowMs {
+				reversedAt = now.Add(-1 * time.Minute) // never emit a reversal from the future
+			}
+			reportDelay := time.Duration(rng.Intn(8*60)+1) * time.Minute
+			createdAt := reversedAt.Add(reportDelay)
+			if uint64(createdAt.UnixMilli()) > nowMs {
+				createdAt = now // clamped rows carry a run-specific timestamp, hence a run-specific ID
+			}
+
+			srcRoll := rng.Float64()
+			var src models.Source
+			var related *models.SteamID
+			switch {
+			case srcRoll < 0.90:
+				src = models.SourceDirect
+			case srcRoll < 0.95:
+				src = models.SourceRelatedUser
+				relID := models.SteamID(syntheticBaseSteamID + (steamOffset+10_000)*97)
+				related = &relID
+			default:
+				src = models.SourceUserReport
+			}
+
+			var expunged *uint64
+			if rng.Float64() < 0.015 {
+				eAt := createdAt.Add(time.Duration(rng.Intn(24*60)+1) * time.Minute)
+				if uint64(eAt.UnixMilli()) < nowMs && uint64(eAt.UnixMilli()) > uint64(createdAt.UnixMilli()) {
+					ems := uint64(eAt.UnixMilli())
+					expunged = &ems
+				}
+			}
+
+			steamID := models.SteamID(syntheticBaseSteamID + steamOffset*73 + uint64(rng.Intn(50)))
+			steamOffset++
+
+			// Snowflake: created_at in the high bits, a 12-bit sequence in the low bits;
+			// mirrors domain/models/snowflake.go so IDs sort alongside production rows.
+			seq = (seq + 1) & 0x0FFF
+			sfTs := uint64(createdAt.UnixMilli()) - models.Epoch
+			sf := models.Snowflake((sfTs << 22) | uint64(seq))
+
+			reporter := syntheticBaseReporter + uint(steamOffset)
+			mp := pickMarketplace(rng)
+
+			rows = append(rows, &models.Reversal{
+				Model: models.Model{
+					ID:        sf,
+					CreatedAt: uint64(createdAt.UnixMilli()),
+					UpdatedAt: uint64(createdAt.UnixMilli()),
+				},
+				SteamID:            steamID,
+				MarketplaceSlug:    mp,
+				Source:             &src,
+				RelatedSteamID:     related,
+				ReversedAt:         uint64(reversedAt.UnixMilli()),
+				ReporterInternalID: &reporter,
+				ExpungedAt:         expunged,
+			})
+		}
+	}
+	return rows
+}
+
+// pickMarketplace draws one slug from syntheticMarketplaces by cumulative weight.
+func pickMarketplace(rng *rand.Rand) string {
+	r, cum := rng.Float64(), 0.0
+	for _, mp := range syntheticMarketplaces {
+		cum += mp.weight
+		if r < cum {
+			return mp.slug
+		}
+	}
+	return syntheticMarketplaces[0].slug // reached only when r is not below the summed weights
+}
+
+// insertChunkSize keeps each bulk insert under Postgres's 65,535
+// parameter-per-statement cap. At ~11 columns per Reversal, 1,000 rows
+// uses ~11k parameters.
+const insertChunkSize = 1000
+
+// InsertReversals bulk-inserts reversals in chunks with ON CONFLICT DO NOTHING,
+// so conflicting rows are skipped. Returns the number of rows actually inserted.
+func InsertReversals(db *gorm.DB, reversals []*models.Reversal) (int64, error) {
+	var total int64
+	// Consume the slice front-to-back; an empty input never enters the loop.
+	for rest := reversals; len(rest) > 0; {
+		n := insertChunkSize
+		if n > len(rest) {
+			n = len(rest) // final, shorter batch
+		}
+		// DoNothing: conflicting rows are skipped instead of failing the batch.
+		result := db.Clauses(clause.OnConflict{DoNothing: true}).Create(rest[:n])
+		if err := result.Error; err != nil {
+			return total, err
+		}
+		total += result.RowsAffected
+		rest = rest[n:]
+	}
+	return total, nil
+}