Skip to content

Commit 194cd7a

Browse files
committed
feat(scripts/diff-flat): match remaining moves by ancestor and leaf words
Add a fallback pass to `detectMoves` for features without `spec_url` or `mdn_url`. Score each candidate by common-ancestor depth (primary) plus inverse-frequency-weighted shared leaf tokens (secondary). Tokens that appear in more than half of the unmatched removed or added features are treated as scaffolding (e.g. `init`, `parameter`) and ignored. This catches renames such as `api.fetch.init_keepalive_parameter` → `api.fetch.options_parameter.keepalive`, which pass 1 misses because the feature has no spec or MDN URL to match on.
1 parent c872504 commit 194cd7a

1 file changed

Lines changed: 126 additions & 14 deletions

File tree

scripts/diff-flat.js

Lines changed: 126 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -263,11 +263,13 @@ const deepMerge = (target, source) => {
263263
};
264264

265265
/**
266-
* Collects URL fingerprints (spec_url and mdn_url) for each feature.
266+
* Collects URL fingerprints (spec_url and mdn_url) for each feature, and
267+
* includes features without URLs as empty entries so they're available to
268+
* the token-based fallback matcher.
267269
* @param {*} contents the merged data tree.
268-
* @returns {Map<string, Set<string>>} map from feature path to its set of URL keys.
270+
* @returns {Map<string, Set<string>>} map from feature path to URL set (possibly empty).
269271
*/
270-
const collectFeatureUrls = (contents) => {
272+
const collectFeatures = (contents) => {
271273
/** @type {Map<string, Set<string>>} */
272274
const features = new Map();
273275
for (const { path, compat } of walk(undefined, contents)) {
@@ -281,13 +283,28 @@ const collectFeatureUrls = (contents) => {
281283
if (compat.mdn_url) {
282284
urls.add(`mdn:${compat.mdn_url}`);
283285
}
284-
if (urls.size) {
285-
features.set(path, urls);
286-
}
286+
features.set(path, urls);
287287
}
288288
return features;
289289
};
290290

291+
/**
292+
* Tokenizes a feature path's leaf segment into lowercase words, splitting on
293+
* `_`, `.` and camelCase boundaries. Returns a Set so each word counts once
294+
* per feature.
295+
* @param {string} path the feature path.
296+
* @returns {Set<string>} the leaf tokens.
297+
*/
298+
const tokenizeLeaf = (path) => {
299+
const leaf = path.split('.').pop() ?? '';
300+
return new Set(
301+
leaf
302+
.split(/[_.]+|(?=[A-Z])/)
303+
.filter(Boolean)
304+
.map((w) => w.toLowerCase()),
305+
);
306+
};
307+
291308
/**
292309
* Reads the value at a dot-separated path within a tree.
293310
* @param {*} root the root object.
@@ -351,18 +368,20 @@ const projectMoves = (baseContents, moves) => {
351368
};
352369

353370
/**
354-
* Detects features that were moved (renamed) by matching shared spec_url/mdn_url
355-
* between features removed in base and features added in head. When multiple
356-
* candidates share a URL, prefers the candidate with the longest shared path
357-
* prefix (so `api.fetch.init_X` prefers `api.fetch.options_parameter.X` over
358-
* `api.Request.Request.options_parameter.X`).
371+
* Detects features that were moved (renamed) in two passes:
372+
* 1. Match by shared spec_url/mdn_url, with longest-shared-path-prefix as
373+
* tiebreaker when multiple candidates share a URL.
374+
* 2. For features still unmatched, match by common ancestor path plus
375+
* shared non-scaffold leaf words (`keepalive`, `signal`, etc.).
376+
* Scaffold tokens — those appearing in more than half of unmatched
377+
* removed or added features (e.g. `init`, `parameter`) — are ignored.
359378
* @param {*} baseContents the merged base data tree.
360379
* @param {*} headContents the merged head data tree.
361380
* @returns {Map<string, string>} map from removed path to added path.
362381
*/
363382
const detectMoves = (baseContents, headContents) => {
364-
const baseFeatures = collectFeatureUrls(baseContents);
365-
const headFeatures = collectFeatureUrls(headContents);
383+
const baseFeatures = collectFeatures(baseContents);
384+
const headFeatures = collectFeatures(headContents);
366385

367386
/** @type {Map<string, string[]>} */
368387
const addedByUrl = new Map();
@@ -379,8 +398,10 @@ const detectMoves = (baseContents, headContents) => {
379398

380399
/** @type {Map<string, string>} */
381400
const moves = new Map();
401+
/** @type {Set<string>} */
402+
const matchedDests = new Set();
382403
for (const [removedPath, urls] of baseFeatures) {
383-
if (headFeatures.has(removedPath)) {
404+
if (headFeatures.has(removedPath) || urls.size === 0) {
384405
continue;
385406
}
386407
/** @type {Set<string>} */
@@ -413,6 +434,97 @@ const detectMoves = (baseContents, headContents) => {
413434
}
414435
}
415436
moves.set(removedPath, best);
437+
matchedDests.add(best);
438+
}
439+
440+
// Pass 2: token + common-ancestor matching for the rest.
441+
const unmatchedRemoved = [...baseFeatures.keys()].filter(
442+
(p) => !headFeatures.has(p) && !moves.has(p),
443+
);
444+
const unmatchedAdded = [...headFeatures.keys()].filter(
445+
(p) => !baseFeatures.has(p) && !matchedDests.has(p),
446+
);
447+
if (unmatchedRemoved.length === 0 || unmatchedAdded.length === 0) {
448+
return moves;
449+
}
450+
451+
/** @type {Map<string, Set<string>>} */
452+
const removedTokens = new Map();
453+
/** @type {Map<string, Set<string>>} */
454+
const addedTokens = new Map();
455+
/** @type {Map<string, number>} */
456+
const removedFreq = new Map();
457+
/** @type {Map<string, number>} */
458+
const addedFreq = new Map();
459+
for (const p of unmatchedRemoved) {
460+
const tokens = tokenizeLeaf(p);
461+
removedTokens.set(p, tokens);
462+
for (const t of tokens) {
463+
removedFreq.set(t, (removedFreq.get(t) ?? 0) + 1);
464+
}
465+
}
466+
for (const p of unmatchedAdded) {
467+
const tokens = tokenizeLeaf(p);
468+
addedTokens.set(p, tokens);
469+
for (const t of tokens) {
470+
addedFreq.set(t, (addedFreq.get(t) ?? 0) + 1);
471+
}
472+
}
473+
/**
474+
* @param {string} token
475+
* @returns {boolean} true if the token is too common to be distinctive.
476+
*/
477+
const isScaffold = (token) =>
478+
(removedFreq.get(token) ?? 0) > unmatchedRemoved.length / 2 ||
479+
(addedFreq.get(token) ?? 0) > unmatchedAdded.length / 2;
480+
481+
for (const removedPath of unmatchedRemoved) {
482+
const rTokens = /** @type {Set<string>} */ (removedTokens.get(removedPath));
483+
const rParts = removedPath.split('.');
484+
let best = '';
485+
let bestScore = -1;
486+
487+
for (const addedPath of unmatchedAdded) {
488+
if (matchedDests.has(addedPath)) {
489+
continue;
490+
}
491+
const aTokens = /** @type {Set<string>} */ (addedTokens.get(addedPath));
492+
const aParts = addedPath.split('.');
493+
494+
let ancestor = 0;
495+
while (
496+
ancestor < rParts.length - 1 &&
497+
ancestor < aParts.length - 1 &&
498+
rParts[ancestor] === aParts[ancestor]
499+
) {
500+
ancestor++;
501+
}
502+
if (ancestor === 0) {
503+
continue;
504+
}
505+
506+
let tokenScore = 0;
507+
for (const t of rTokens) {
508+
if (aTokens.has(t) && !isScaffold(t)) {
509+
const freq = (removedFreq.get(t) ?? 0) + (addedFreq.get(t) ?? 0) || 1;
510+
tokenScore += 1 / freq;
511+
}
512+
}
513+
if (tokenScore === 0) {
514+
continue;
515+
}
516+
517+
const score = ancestor * 1000 + tokenScore;
518+
if (score > bestScore) {
519+
best = addedPath;
520+
bestScore = score;
521+
}
522+
}
523+
524+
if (best) {
525+
moves.set(removedPath, best);
526+
matchedDests.add(best);
527+
}
416528
}
417529

418530
return moves;

0 commit comments

Comments (0)