diff --git a/MC/utils/AODBcRewriter.C b/MC/utils/AODBcRewriter.C index c8b5cd6df..c6ff68879 100644 --- a/MC/utils/AODBcRewriter.C +++ b/MC/utils/AODBcRewriter.C @@ -132,6 +132,45 @@ static const char *collIndexBranch(TTree *t) { return nullptr; } +// ---------------------------------------------------------------------------- +// Paste-join relationships (authoritative; derived from AnalysisDataModel.h +// comments such as "Table joined to the collision table containing the MC +// index" and from the SOA EXTENDED_TABLE declarations for cov / extra tables). +// +// A paste-joined CHILD has NO row of its own — its row N corresponds to row N +// of its PARENT. If the parent is reordered or has rows dropped, the child +// must follow row-for-row to preserve the 1:1 alignment. Any index columns +// the child carries (e.g. fIndexMcCollisions in O2mccollisionlabel) are then +// remapped *value-wise* via the appropriate parent stage's permutation, but +// rows are NEVER added or dropped on the child's own initiative. +// +// Matching uses TString::BeginsWith on the child name; parent matching uses +// allPerms keys (so versioned names like O2collision_001 resolve via prefix). +// When several parent candidates are listed for the same child, the first one +// found in allPerms wins (this lets us prefer O2track_iu over O2track). +static const std::vector> kPasteJoins = { + // { paste-joined child prefix, parent prefix } + { "O2bcflag", "O2bc" }, // BCFlags joinable with BCs + { "O2mccollisionlabel", "O2collision" }, // McCollisionLabels -> Collisions + { "O2mctracklabel", "O2track_iu" }, // McTrackLabels -> Tracks (prefer _iu) + { "O2mctracklabel", "O2track" }, + { "O2mcfwdtracklabel", "O2fwdtrack" }, // McFwdTrackLabels -> FwdTracks + { "O2mcmfttracklabel", "O2mfttrack" }, // McMFTTrackLabels -> MFTTracks + { "O2mccalolabel", "O2calo" }, // McCaloLabels -> Calos + { "O2trackcov_iu", "O2track_iu" }, // TracksCovIU -> TracksIU (cov) + { "O2trackextra", "O2track_iu" }, // TracksExtra -> TracksIU + { "O2fwdtrackcov", "O2fwdtrack" }, // FwdTracksCov -> FwdTracks + // Note: O2mfttrackcov has its own fIndexMFTTracks column — it is NOT + // paste-joined and must NOT be listed here. +}; + +// True if the given tree name matches any registered paste-join child prefix. +static bool isPasteJoinChild(const std::string &tname) { + for (auto &kv : kPasteJoins) + if (TString(tname.c_str()).BeginsWith(kv.first.c_str())) return true; + return false; +} + // ============================================================================ // SECTION 2 — Generic ROOT branch I/O helpers // ============================================================================ @@ -691,6 +730,16 @@ stage2_MCCollIndexedTables(TDirectory *dirIn, TDirectory *dirOut, const char *idxBr = mcCollIndexBranch(src); if (!idxBr) continue; + // A paste-join child (e.g. O2mccollisionlabel) MUST follow its parent's + // row order and never drop rows on its own. Defer it to + // processPasteJoinTables, which will remap any of its own index columns + // (fIndexMcCollisions, ...) value-wise without touching the row count. + if (isPasteJoinChild(tname)) { + std::cout << " Stage2: deferring paste-join child " << tname + << " to paste-join handler\n"; + continue; + } + std::cout << " Stage2 [MCColl-indexed]: " << tname << "\n"; Long64_t nSrc = src->GetEntries(); @@ -760,20 +809,23 @@ stage2_MCCollIndexedTables(TDirectory *dirIn, TDirectory *dirOut, // SECTION 8 — Paste-join table handling // ============================================================================ // -// A paste-joined table has NO index column. Its row N corresponds to row N -// of its parent table. When the parent is reordered, the paste-join table -// must follow with the identical row permutation. +// A paste-joined CHILD has no row of its own — its row N corresponds to row N +// of its PARENT table. When the parent is reordered, the child must follow +// row-for-row to preserve the 1:1 alignment. Paste-join children may still +// carry their own index columns; those values are remapped in-place via the +// appropriate parent-stage permutation. // -// Known paste-join relationships in the AO2D data model -// (parent table prefix -> paste-joined table prefix): +// The list of paste-join pairs is in kPasteJoins (Section 1). // -// O2collision_* -> O2mccollisionlabel_* -// O2track_* -> O2mctracklabel_* -// O2trackiu_* -> O2mctracklabel_* (alternative track table) -// O2fwdtrack_* -> O2mcfwdtracklabel_* -// O2mfttrack_* -> O2mcmfttracklabel_* +// Index columns we know how to remap in a child: +// fIndexMcCollisions -> via mcCollPerm +// fIndexCollisions -> via collPerm +// fIndexMcParticles -> via mcParticlePerm +// fIndexArrayMcParticles -> via mcParticlePerm (VLA) // -// The PermMap from the parent stage is used directly as the row order. +// When the named parent is not in allPerms (e.g. tracks aren't reordered in +// this build), the child is processed with identity row order so the +// value-wise remaps still apply but the row order is unchanged. // Build the row order from a PermMap (srcRow -> outRow), inverted. static std::vector rowOrderFromPerm(const PermMap &perm) { @@ -790,34 +842,35 @@ static std::vector rowOrderFromPerm(const PermMap &perm) { return order; } -// The paste-join map: paste-joined table prefix -> parent table prefix -// We match by prefix (BeginsWith) because table names carry a numeric suffix. -static const std::vector> kPasteJoins = { - // { paste-joined prefix, parent prefix } - { "O2mccollisionlabel", "O2collision" }, - { "O2mctracklabel", "O2track" }, - { "O2mctracklabel", "O2trackiu" }, // same label table, alt parent - { "O2mcfwdtracklabel", "O2fwdtrack" }, - { "O2mcmfttracklabel", "O2mfttrack" }, -}; - -static void processPasteJoinTables( - TDirectory *dirIn, TDirectory *dirOut, +// Locate a permutation in allPerms whose key begins with a given prefix. +// Returns nullptr if none found. +static const PermMap *findPermByPrefix( const std::unordered_map &allPerms, - const std::unordered_set &alreadyWritten) { - - // Find the MC-particle permutation (produced by stage2 for O2mcparticle_*). - // Label tables (O2mctracklabel, O2mcfwdtracklabel, O2mcmfttracklabel, - // O2mccalolabel) carry fIndexMcParticles / fIndexArrayMcParticles that must - // be remapped via this permutation regardless of whether the label table's - // row order changes. - const PermMap *mcParticlePerm = nullptr; + const char *prefix, + std::string *foundName = nullptr) { for (auto &[name, perm] : allPerms) { - if (TString(name.c_str()).BeginsWith("O2mcparticle")) { - mcParticlePerm = &perm; - break; + if (TString(name.c_str()).BeginsWith(prefix)) { + if (foundName) *foundName = name; + return &perm; } } + return nullptr; +} + +static void processPasteJoinTables( + TDirectory *dirIn, TDirectory *dirOut, + const std::unordered_map &allPerms, + const std::unordered_set &alreadyWritten, + const PermMap *bcPermP = nullptr) { + + // Pre-locate the parent permutations that paste-join children may want to + // apply to their own index columns. Any of these may legitimately be null + // (e.g. mcParticlePerm absent if there is no O2mcparticle in this DF). + const PermMap *mcParticlePerm = findPermByPrefix(allPerms, "O2mcparticle"); + const PermMap *mcCollPermP = findPermByPrefix(allPerms, "O2mccollision_"); + const PermMap *collPermP = findPermByPrefix(allPerms, "O2collision_"); + // bcPermP is passed in from processDF (the BC table is the only stage + // whose permutation isn't already published in allPerms). TIter it(dirIn->GetListOfKeys()); while (TKey *key = static_cast(it())) { @@ -829,10 +882,14 @@ static void processPasteJoinTables( std::string tname = src->GetName(); if (alreadyWritten.count(tname)) continue; if (isBCTable(tname.c_str())) continue; - if (bcIndexBranch(src) || mcCollIndexBranch(src)) continue; - - // Build extra remaps for any fIndexMcParticles / fIndexArrayMcParticles - // branches in this table (label tables pointing into O2mcparticle). + // Stage-1 BC-indexed and Stage-2 MCColl-indexed non-paste-join tables are + // already in alreadyWritten. A paste-join child carrying its own MCColl + // index (e.g. O2mccollisionlabel) was deferred from stage2 and lands here. + if (bcIndexBranch(src)) continue; + if (mcCollIndexBranch(src) && !isPasteJoinChild(tname)) continue; + + // Build value-wise extra remaps for any index column this table carries + // that points into a table whose row order may have changed. std::vector extraRemaps; if (mcParticlePerm) { if (src->GetBranch("fIndexMcParticles")) @@ -840,19 +897,32 @@ static void processPasteJoinTables( if (src->GetBranch("fIndexArrayMcParticles")) extraRemaps.push_back({"fIndexArrayMcParticles", mcParticlePerm}); } + if (mcCollPermP && src->GetBranch("fIndexMcCollisions")) + extraRemaps.push_back({"fIndexMcCollisions", mcCollPermP}); + if (collPermP && src->GetBranch("fIndexCollisions")) + extraRemaps.push_back({"fIndexCollisions", collPermP}); + if (bcPermP) { + // BC-pointing indices that weren't already remapped in Stage 1. + // Stage-1 BC-indexed tables (with fIndexBCs / fIndexBC) are in + // alreadyWritten by now, so this only fires for tables that escaped + // Stage 1 — chiefly the O2ambiguous* family, which carries the SOA + // SLICE_INDEX_COLUMN(BC, bc) stored on disk as fIndexSliceBCs[2]/I. + // After BC dedup the slice endpoints would otherwise point past the + // compacted BC table; remapping through bcPerm fixes this. + if (src->GetBranch("fIndexSliceBCs")) + extraRemaps.push_back({"fIndexSliceBCs", bcPermP}); + if (src->GetBranch("fIndexBCs")) + extraRemaps.push_back({"fIndexBCs", bcPermP}); + if (src->GetBranch("fIndexBC")) + extraRemaps.push_back({"fIndexBC", bcPermP}); + } - // Check if this is a known paste-join table + // Find a paste-join parent for this table (kPasteJoins lookup). const PermMap *parentPerm = nullptr; std::string parentName; for (auto &[pastePrefix, parentPrefix] : kPasteJoins) { if (!TString(tname.c_str()).BeginsWith(pastePrefix.c_str())) continue; - for (auto &[pname, perm] : allPerms) { - if (TString(pname.c_str()).BeginsWith(parentPrefix.c_str())) { - parentPerm = &perm; - parentName = pname; - break; - } - } + parentPerm = findPermByPrefix(allPerms, parentPrefix.c_str(), &parentName); if (parentPerm) break; } @@ -870,10 +940,11 @@ static void processPasteJoinTables( } else { rewriteTable(src, dirOut, rowOrder, "", {}, extraRemaps); } - } else if (!extraRemaps.empty()) { - // Not paste-joined but has indices that need remapping (e.g. O2mccalolabel - // which is not in kPasteJoins but carries fIndexArrayMcParticles). - std::cout << " Remap-only: " << tname << "\n"; + } else if (!extraRemaps.empty() || isPasteJoinChild(tname)) { + // Parent wasn't reordered (or not present in this DF) — keep row order + // identical but still apply value-wise index remaps and follow the + // paste-join 1:1 invariant by going through the identity row order. + std::cout << " Identity-order remap: " << tname << "\n"; Long64_t n = src->GetEntries(); std::vector identity(n); std::iota(identity.begin(), identity.end(), 0LL); @@ -986,7 +1057,7 @@ static void processDF(TDirectory *dirIn, TDirectory *dirOut) { // ---- Paste-join tables + unrelated tables ---- std::cout << "-- Paste-join and unrelated tables --\n"; - processPasteJoinTables(dirIn, dirOut, stage1Perms, written); + processPasteJoinTables(dirIn, dirOut, stage1Perms, written, &s0.bcPerm); // ---- Non-tree objects (TMap metadata) ---- copyNonTreeObjects(dirIn, dirOut); @@ -1002,14 +1073,109 @@ static void processDF(TDirectory *dirIn, TDirectory *dirOut) { // 1. BC table is strictly monotonic in fGlobalBC. // 2. MC particle intra-table daughter/mother indices are in range and point // to particles belonging to the same MC collision. -// 3. fIndexMcParticles in label tables is in range. +// 3. Every paste-joined child table has the same row count as its parent +// (e.g. O2mccollisionlabel matches O2collision). +// 4. Every fIndex* value across the DF is in range w.r.t. its referent +// table (value -1 is always permitted as the "no link" sentinel). // -// Returns true if all checks pass. Prints a summary to stdout. +// Returns true if all checks pass. Prints [FAIL] lines for each violation. + +// Map from fIndex* branch name to the table-name prefix it refers to. The +// match on the referent side uses TString::BeginsWith so versioned suffixes +// (O2collision_001, O2bc_001, ...) are handled. Branches not in this list +// are skipped by the range check (this includes O2mcparticle's intra-table +// fIndexArray_Mothers / fIndexSlice_Daughters, which are checked separately +// with stricter semantics in the MC-particle block). +static const std::vector> kIndexBranchToTable = { + { "fIndexBCs", "O2bc_" }, + { "fIndexBC", "O2bc_" }, + { "fIndexSliceBCs", "O2bc_" }, + { "fIndexCollisions", "O2collision_" }, + { "fIndexCollision", "O2collision_" }, + { "fIndexMcCollisions", "O2mccollision_" }, + { "fIndexMcParticles", "O2mcparticle" }, + { "fIndexArrayMcParticles", "O2mcparticle" }, + { "fIndexTracks", "O2track_iu" }, + { "fIndexTracks_0", "O2track_iu" }, + { "fIndexTracks_1", "O2track_iu" }, + { "fIndexTracks_2", "O2track_iu" }, + { "fIndexTracks_Pos", "O2track_iu" }, + { "fIndexTracks_Neg", "O2track_iu" }, + { "fIndexTracks_ITS", "O2track_iu" }, + { "fIndexFwdTracks", "O2fwdtrack" }, + { "fIndexFwdTracks_MatchMCHTrack", "O2fwdtrack" }, + { "fIndexMFTTracks", "O2mfttrack" }, + { "fIndexV0s", "O2v0_" }, + { "fIndexCascades", "O2cascade_" }, + { "fIndexDecay3Bodys", "O2decay3body" }, +}; + +// Find a tree in d whose name begins with the given prefix. Returns the +// number of entries, or -1 if not found. +static Long64_t treeEntriesByPrefix(TDirectory *d, const char *prefix) { + TIter it(d->GetListOfKeys()); + TKey *k; + while ((k = (TKey*)it())) { + if (!TString(k->GetName()).BeginsWith(prefix)) continue; + TObject *obj = d->Get(k->GetName()); + if (!obj || !obj->InheritsFrom(TTree::Class())) continue; + return ((TTree*)obj)->GetEntries(); + } + return -1; +} + +// Generic in-range check for every fIndex* branch listed above. Reads the +// branch's leaf (scalar, fixed-array, or VLA), iterates all entries, and +// counts how many values are outside [-1, nReferent). +static Long64_t checkIndexRange(TTree *t, const char *branchName, + Long64_t nReferent) { + TBranch *br = t->GetBranch(branchName); + if (!br) return 0; + TLeaf *leaf = (TLeaf*)br->GetListOfLeaves()->At(0); + if (!leaf) return 0; + if (TString(leaf->GetTypeName()) != "Int_t") return 0; // only Int_t indices + + TLeaf *cntLeaf = leaf->GetLeafCount(); // VLA? + int fixedN = leaf->GetLen(); // 1 for scalar, >1 for fixed array + + // Allocate worst-case buffer. For a VLA we need a prescan to size it. + Long64_t maxLen = fixedN; + if (cntLeaf) { + // simple prescan + Int_t cnt = 0; + TBranch *cntBr = cntLeaf->GetBranch(); + cntBr->SetAddress(&cnt); + for (Long64_t i = 0; i < t->GetEntries(); ++i) { + cntBr->GetEntry(i); + if (cnt > maxLen) maxLen = cnt; + } + } + std::vector buf(std::max(1, maxLen), 0); + Int_t cnt = fixedN; + TBranch *cntBr = cntLeaf ? cntLeaf->GetBranch() : nullptr; + br->SetAddress(buf.data()); + if (cntBr) cntBr->SetAddress(&cnt); + + Long64_t bad = 0; + for (Long64_t i = 0; i < t->GetEntries(); ++i) { + br->GetEntry(i); + if (cntBr) cntBr->GetEntry(i); + int n = cntBr ? (int)cnt : fixedN; + for (int j = 0; j < n; ++j) { + Int_t v = buf[j]; + if (v < -1) { ++bad; continue; } + if (v >= (Int_t)nReferent) { ++bad; continue; } + } + } + br->ResetAddress(); + if (cntBr) cntBr->ResetAddress(); + return bad; +} static bool validateDF(TDirectory *d) { bool ok = true; - // ---- BC monotonicity ---- + // ---- discover key trees ---- TIter it(d->GetListOfKeys()); TKey *k; TTree *bcTree = nullptr; @@ -1084,6 +1250,47 @@ static bool validateDF(TDirectory *d) { mcpTree->SetBranchStatus("*", 1); } + // ---- Paste-join row-count parity ---- + // For every (child, parent) pair in kPasteJoins, if both are present in the + // DF their row counts must be identical. This catches the class of bugs + // where a child was sorted/dropped on its own index (e.g. a previous + // version dropped O2mccollisionlabel rows on MC-collision dedup while + // leaving O2collision_001 intact, producing an off-by-N mismatch). + for (auto &[childPrefix, parentPrefix] : kPasteJoins) { + Long64_t nChild = treeEntriesByPrefix(d, childPrefix.c_str()); + Long64_t nParent = treeEntriesByPrefix(d, parentPrefix.c_str()); + if (nChild < 0 || nParent < 0) continue; // pair not both present + if (nChild != nParent) { + std::cerr << " [FAIL] paste-join size mismatch: " << childPrefix << "*" + << " has " << nChild << " rows but parent " << parentPrefix << "*" + << " has " << nParent << "\n"; + ok = false; + } + } + + // ---- Generic fIndex* range check ---- + // For each table in the DF, scan all fIndex* branches and confirm every + // value lies in [-1, nReferent). This catches stale pointers across + // tables (cross-table index drift) which a per-DF-tree-only check misses. + TIter it2(d->GetListOfKeys()); + TKey *k2; + while ((k2 = (TKey*)it2())) { + TObject *obj = d->Get(k2->GetName()); + if (!obj || !obj->InheritsFrom(TTree::Class())) continue; + TTree *t = (TTree*)obj; + for (auto &[branchName, referentPrefix] : kIndexBranchToTable) { + if (!t->GetBranch(branchName.c_str())) continue; + Long64_t nRef = treeEntriesByPrefix(d, referentPrefix.c_str()); + if (nRef < 0) continue; // referent not in this DF; skip silently + Long64_t bad = checkIndexRange(t, branchName.c_str(), nRef); + if (bad > 0) { + std::cerr << " [FAIL] " << t->GetName() << "." << branchName + << ": " << bad << " value(s) out of range [-1, " << nRef << ")\n"; + ok = false; + } + } + } + return ok; } diff --git a/MC/utils/CLAUDE.md b/MC/utils/CLAUDE.md new file mode 100644 index 000000000..0ff309146 --- /dev/null +++ b/MC/utils/CLAUDE.md @@ -0,0 +1,281 @@ +# CLAUDE.md — AODBcRewriter Development Handoff + +## What this tool does + +`AODBcRewriter.C` is a ROOT macro that fixes structural integrity problems in +ALICE Run3 AO2D files after merging. AO2D files are ROOT files containing +`DF_*` subdirectories, each holding a set of TTrees that form a relational +schema (similar to a database). After merging two AO2D files with `hadd` or +similar tools, three problems can arise: + +1. **Non-monotonic `fGlobalBC`** in the BC table — the framework requires + strictly increasing values. +2. **Duplicate `fGlobalBC` entries** — the same bunch crossing represented by + multiple rows. +3. **Duplicate MCCollision entries** — the same MC event appearing twice + because it was present in both source files before merging. + +Run with: +```bash +root -l -b -q 'AODBcRewriter.C("AO2D.root","AO2D_rewritten.root")' +``` + +--- + +## AO2D data model (relevant subset) + +The tables form a dependency graph. Every stage of the tool processes one +level of this graph and produces a **PermMap** (`vector`, +`permMap[oldRow] = newRow`, -1 = row dropped) which the next stage consumes. + +``` +BCs (O2bc_*) [Stage 0] + │ fIndexBCs / fIndexBC + ├─► Collisions (O2collision_*) [Stage 1] + │ │ paste-join ──► McCollisionLabels (O2mccollisionlabel_*) + │ └─► Tracks (O2track_*, O2trackiu_*, ...) [Stage 1] + │ paste-join ─► McTrackLabels (O2mctracklabel_*) + │ paste-join ─► McFwdTrackLabels, McMFTTrackLabels + │ + └─► MCCollisions (O2mccollision_*) [Stage 1, deduped] + │ fIndexMcCollisions + ├─► HepMCXSections (O2hepmcxsection_*) [Stage 2] + ├─► HepMCPdfInfos (O2hepmcpdfinfo_*) [Stage 2] + └─► HepMCHeavyIons (O2hepmcheavyion_*) [Stage 2] +``` + +**Index joins** (`fIndexBCs`, `fIndexCollisions`, `fIndexMcCollisions`) are +explicit integer columns pointing to a row in another table by position. + +**Paste joins** are implicit: table row N of a paste-joined table corresponds +to row N of its parent table. These tables have *no index column*. They must +be reordered to match their parent whenever the parent is reordered. + +The known paste-join relationships are hardcoded in `kPasteJoins` (Section 1). +The list is authoritative — derived from `AnalysisDataModel.h` comments +("Table joined to the collision table containing the MC index", etc.) and +from the SOA `EXTENDED_TABLE` declarations for cov / extra tables: +``` +O2bcflag → parent: O2bc_* (BCFlags joinable with BCs) +O2mccollisionlabel → parent: O2collision_* (McCollisionLabels) +O2mctracklabel → parent: O2track_iu (or O2track) +O2mcfwdtracklabel → parent: O2fwdtrack +O2mcmfttracklabel → parent: O2mfttrack +O2mccalolabel → parent: O2calo (McCaloLabels) +O2trackcov_iu → parent: O2track_iu (TracksCovIU extension) +O2trackextra → parent: O2track_iu (TracksExtra extension) +O2fwdtrackcov → parent: O2fwdtrack (FwdTracksCov extension) +``` +NOT in this list (despite the suffix): `O2mfttrackcov` carries its own +`fIndexMFTTracks` and is **index-linked**, not paste-joined. + +A child may carry its own index columns (e.g. `O2mccollisionlabel` carries +`fIndexMcCollisions`). Those values are remapped *value-wise* through the +appropriate parent-stage permutation, but the child's row count and row +order strictly follow its paste-join parent. + +--- + +## Code structure (11 sections) + +| Section | Function(s) | Purpose | +|---------|-------------|---------| +| 1 | `PermMap`, `isBCTable`, `bcIndexBranch`, `mcCollIndexBranch`, `collIndexBranch`, `kPasteJoins`, `isPasteJoinChild` | Core types, name-probe helpers, and the authoritative paste-join list | +| 2 | `ScalarTag`, `tagOf`, `byteSize`, `readAsInt`, `writeAsInt`, `BranchDesc`, `describeBranches` | Generic ROOT branch I/O over raw byte buffers | +| 3 | `rewriteTable` | **Central engine**: writes any table in a given row order, remapping one nominated index column via a PermMap | +| 4 | `BCStage0Result`, `stage0_sortBCs` | Sort + deduplicate the BC table; produce `bcPerm` | +| 5 | `stage0_copyBCFlags` | Copy BC flags table following BC row selection | +| 6 | `MCCollKey`, `MCCollKeyHash`, `stage1_BCindexedTables` | Process all BC-indexed tables; deduplicate MCCollisions | +| 7 | `stage2_MCCollIndexedTables` | Process all MCCollision-indexed tables; drop rows whose parent was deduped | +| 8 | `rowOrderFromPerm`, `findPermByPrefix`, `processPasteJoinTables` | Reorder paste-joined tables to follow their parent (1:1 row count guaranteed); remap any of their own index columns value-wise; copy unrelated tables verbatim | +| 9 | `copyNonTreeObjects` | Copy TMap metadata and other non-TTree objects | +| 10 | `processDF` | Orchestrates all stages for one `DF_*` directory | +| 11 | `AODBcRewriter` | Top-level entry: opens files, iterates `DF_*` dirs, preserves compression | + +### `rewriteTable` — the central engine + +```cpp +PermMap rewriteTable(TTree *src, TDirectory *dirOut, + const vector &rowOrder, + const string &indexBranch, + const PermMap &parentPerm); +``` + +- `rowOrder`: which source rows to emit and in what sequence (may be a subset + for deduplication, or reordered for sorting) +- `indexBranch`: name of the one index column to remap (e.g. `"fIndexBCs"`), + or `""` for none +- `parentPerm`: the PermMap from the parent stage used to translate the old + index value to a new one +- Returns `srcToOut` PermMap: `srcToOut[srcRow] = outRow`, -1 if dropped + +The function handles both scalar branches and VLA (variable-length array) +branches generically. For VLAs it pre-scans the count branch to find the +maximum array length and allocates buffers accordingly. Input and output +branches share the same raw byte buffers; ROOT handles the VLA count +implicitly through the shared count buffer. + +--- + +## MCCollision deduplication + +Implemented in `stage1_BCindexedTables` when the current table begins with +`O2mccollision`. + +**Key**: `MCCollKey { Long64_t newBCrow; Float_t weight; }` using `fEventWeight`. + +**Important constraint**: deduplication is only enabled when `fEventWeight` is +present in the tree. If it is absent, all rows are kept (only reordered). This +is intentional: deduplicating on `newBCrow` alone would incorrectly collapse +distinct MC events that happen to share the same bunch crossing. + +When a MCCollision row is dropped (PermMap entry = -1), Stage 2 propagates +the drop: any `O2hepmcxsection_*` / `O2hepmcpdfinfo_*` / `O2hepmcheavyion_*` +row whose `fIndexMcCollisions` pointed to a dropped row is also dropped. + +--- + +## Known gaps / TODO items + +These were identified during the refactor but not yet implemented: + +### 1. `fIndexCollisions` inside `O2mccollision` is not remapped + +`O2mccollision` has both `fIndexBCs` (handled) and `fIndexCollisions` (linking +back to the reconstructed `O2collision` row). After Stage 1 reorders +`O2collision`, this second index in `O2mccollision` becomes stale. + +**Fix**: After `stage1_BCindexedTables` runs, find `collPerm` (the PermMap for +`O2collision_*`) in `stage1Perms`, then apply a second `rewriteTable` pass on +`O2mccollision_*` to remap `fIndexCollisions` via `collPerm`. The +`ExtraRemap` mechanism in `rewriteTable` already supports this pattern. + +### 2. Deduplication key could be strengthened + +The current `(newBCrow, fEventWeight)` key is a good heuristic. A more robust +key would additionally include `fImpactParameter` and/or `fGeneratorsID` if +those branches are present. Consider making the key construction a small +helper function that probes which fields are available and builds the strongest +possible key. + +### 3. `O2mccollision` has two potential parents for paste-join lookup + +In `processDF`, the MCColl PermMap is extracted by scanning `stage1Perms` for +a name beginning with `"O2mccollision"`. If the DF contains both +`O2mccollision_000` and `O2mccollision_001` (schema version coexistence), +only the first found is used. Add a warning and handle this explicitly if it +becomes relevant. + +### 4. Paste-join size-mismatch fallback is silent-ish + +When a paste-joined table has a different row count from its parent (schema +drift), the tool falls back to `CloneTree(-1, "fast")` and prints a warning. +This produces a structurally inconsistent output. Consider making this a hard +error, or implement a best-effort row-count reconciliation. + +### ~~5. No validation pass~~ (RESOLVED) + +`AODBcRewriterValidate(fname)` (Section 11) now validates BC monotonicity, +MC-particle intra-table index integrity, paste-join row-count parity, and +generic `fIndex*` range against the referent table. Call it after rewriting +to confirm output correctness. + +### ~~6. fIndexArray_Mothers / fIndexSlice_Daughters not remapped~~ (RESOLVED) + +This was the root cause of the O2Physics FATAL +`MC particle N has daughter with index M > MC particle table size`. +After Stage 2 reorders `O2mcparticle`, the intra-table mother/daughter indices +now get remapped via `ExtraRemap` in the same pass (Section 7). + +`fIndexMcParticles` in label tables (`O2mctracklabel`, `O2mcfwdtracklabel`, +`O2mcmfttracklabel`, `O2mccalolabel`) is also now remapped via the MC-particle +permutation in `processPasteJoinTables` (Section 8). + +### ~~8. fIndexSliceBCs in O2ambiguous* not remapped after BC dedup~~ (RESOLVED) + +`fIndexSliceBCs` is a SOA `SLICE_INDEX_COLUMN(BC, bc)` (header line 1029), +stored on disk as a fixed `[2]/I` `{first, last}` pair pointing into the BC +table. It appears in `O2ambiguoustrack`, `O2ambiguousmfttr`, +`O2ambiguousfwdtr` — none of which carry `fIndexBCs` and therefore none +were processed by Stage 1. After BC dedup the slice endpoints would then +point past the compacted table. + +**Fix**: `processPasteJoinTables` now also accepts the BC permutation +(passed explicitly from `processDF`) and applies it value-wise to any +`fIndexSliceBCs` / `fIndexBCs` / `fIndexBC` column it finds. Validated +against `example_AOD/AO2D_pre.root`: pre-fix the rewritten output had 7 +and 19 out-of-range slice endpoints in DF_3594457012003; post-fix the +validator reports zero. + +### ~~7. Paste-join row-count drift on MC-collision dedup~~ (RESOLVED) + +`O2mccollisionlabel` is paste-joined to `O2collision_*` (row N ↔ row N) but +also carries `fIndexMcCollisions`. The previous code routed it through Stage +2 (because of the MC-collision index), which sorted it by new MC-collision +position and *dropped* rows whose MC collision had been deduplicated. That +left `O2mccollisionlabel` shorter than `O2collision_*` by N rows — leading +to downstream "O2collision_001 is one larger than O2mccollisionlabel" crashes. + +**Fix**: `kPasteJoins` was extended to cover every joined pair from +`AnalysisDataModel.h`. Paste-join children are now *deferred* from Stage 2 +to `processPasteJoinTables`, where they take the parent's row order and have +their own index columns remapped value-wise. Rows that lose their MC label +on dedup now correctly produce `fIndexMcCollisions == -1`, and the row count +matches the parent collision table. + +The new validator catches the regression class as +`[FAIL] paste-join size mismatch: O2mccollisionlabel* has N rows but parent + O2collision* has M`. + +--- + +## Testing checklist + +When testing a new AO2D: + +1. Run `AODBcRewriterValidate("AO2D_rewritten.root")` (Section 11). + It checks BC monotonicity, MC-particle intra-table integrity, paste-join + row-count parity for every pair in `kPasteJoins`, and `fIndex*` value + ranges against the referent table. Failures appear as `[FAIL] ...` lines. +2. Check stdout from the rewrite run itself for any `[warn]` lines — these + indicate branches or tables that fell through to a fallback path. +3. If deduplication ran, verify the dropped count is as expected by comparing + the input DF MCCollision count vs. output. + +A standalone minimal validation script (kept here for reference; in practice +just call `AODBcRewriterValidate`): +```cpp +// validate.C +void validate(const char *fname) { + TFile *f = TFile::Open(fname); + TIter top(f->GetListOfKeys()); + while (TKey *k = (TKey*)top()) { + if (!TString(k->GetName()).BeginsWith("DF_")) continue; + TDirectory *d = (TDirectory*)f->Get(k->GetName()); + // check BC monotonicity + TTree *bc = (TTree*)d->Get("O2bc_001"); // adjust suffix + if (bc) { + ULong64_t gbc, prev = 0; bool ok = true; + bc->SetBranchAddress("fGlobalBC", &gbc); + for (Long64_t i = 0; i < bc->GetEntries(); ++i) { + bc->GetEntry(i); + if (i > 0 && gbc <= prev) { printf("BC non-monotonic at row %lld\n", i); ok=false; } + prev = gbc; + } + if (ok) printf("%s: BCs OK (%lld entries)\n", k->GetName(), bc->GetEntries()); + } + } +} +``` + +--- + +## Data model reference + +Full table schema: https://aliceo2group.github.io/analysis-framework/docs/datamodel/ao2dTables.html + +Source definitions: `AliceO2/Framework/Core/include/Framework/AnalysisDataModel.h` + +The upstream PR this work improves upon: https://github.com/AliceO2Group/O2DPG/pull/2317 + +Target file location in O2DPG: `MC/utils/AODBcRewriter.C` \ No newline at end of file