Skip to content

Commit b0067d7

Browse files
authored
fix: add spatial sort in getRawTextContent to ensure reading order (#422)
1 parent 48b50bf commit b0067d7

6 files changed

Lines changed: 765 additions & 1222 deletions

File tree

lib/pdf.js

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import PDFFont from "./pdffont.js";
2020
import PDFUnit from "./pdfunit.js";
2121
import PTIXmlParser from "./ptixmlinject.js";
2222
import { createScratchCanvas } from "./pdfcanvas.js";
23+
import { BASELINE_TOLERANCE_RATIO, sortBidiTexts } from "./pdftextsorter.js";
2324

2425
//start of helper classes
2526
class PDFPageParser {
@@ -333,41 +334,42 @@ export default class PDFJSClass extends EventEmitter {
333334

334335
this.rawTextContents.forEach((textContent, index) => {
335336
let prevText = null;
336-
337-
textContent.bidiTexts.forEach((textObj, idx) => {
338-
// Check if on same line
339-
// Use a tolerance relative to font size for better accuracy
340-
// Typical line spacing is 120% of font size, so 10-15% tolerance is reasonable
341-
const tolerance = prevText ? (prevText.fontSize || 12) * 0.15 : 2;
337+
338+
// Spatially sort bidiTexts into visual reading order (top-to-bottom, left-to-right)
339+
const bidiTexts = sortBidiTexts(textContent.bidiTexts);
340+
341+
bidiTexts.forEach((textObj) => {
342+
// Check if on same line using the same tolerance ratio as the sorter
343+
const tolerance = prevText ? (prevText.fontSize || 12) * BASELINE_TOLERANCE_RATIO : 2;
342344
const sameLine = prevText && Math.abs(textObj.y - prevText.y) <= tolerance;
343-
345+
344346
if (sameLine) {
345347
// spaceWidth is in unscaled coordinates (no textHScale, matching JSON w property)
346348
const { spaceWidth, startX, width, textHScale } = prevText;
347-
349+
348350
// Use actual calculated text width (from glyph widths)
349351
// width is in unscaled coordinates, but startX is in scaled coordinates
350352
// So we must apply textHScale to width before adding to startX
351353
// This matches canvas.js: current.x += x * textHScale (line 1267)
352354
const prevTextEndX = startX + (width * textHScale);
353-
355+
354356
// Calculate gap between end of previous text and start of current text
355357
// gap is in SCALED coordinates (both textObj.x and prevTextEndX are scaled)
356358
const gap = textObj.x - prevTextEndX;
357-
359+
358360
// Scale spaceWidth to match gap's coordinate system
359361
const scaledSpaceWidth = spaceWidth * textHScale;
360-
362+
361363
// Add spaces if gap is positive and significant (> 30% of scaled space width)
362364
// Also check that scaledSpaceWidth is valid to avoid division by zero
363365
if (scaledSpaceWidth > 0 && gap > scaledSpaceWidth * 0.3) {
364366
const numSpaces = Math.round(gap / scaledSpaceWidth);
365367
prevText.str += ' '.repeat(Math.max(1, numSpaces));
366368
}
367-
369+
368370
// Append current text
369371
prevText.str += textObj.str;
370-
372+
371373
// Update prevText to track current text for next iteration
372374
prevText.startX = textObj.x;
373375
prevText.width = textObj.width;
@@ -378,11 +380,11 @@ export default class PDFJSClass extends EventEmitter {
378380
if (prevText) {
379381
retVal += `${prevText.str}\r\n`;
380382
}
381-
383+
382384
// Initialize new text object with font metrics
383-
prevText = {
384-
str: textObj.str,
385-
y: textObj.y,
385+
prevText = {
386+
str: textObj.str,
387+
y: textObj.y,
386388
startX: textObj.x,
387389
width: textObj.width,
388390
spaceWidth: textObj.spaceWidth,
@@ -391,7 +393,7 @@ export default class PDFJSClass extends EventEmitter {
391393
};
392394
}
393395
});
394-
396+
395397
if (prevText) {
396398
retVal += prevText.str;
397399
}

lib/pdftextsorter.js

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/**
2+
* Ratio used to compute the Y-axis grouping tolerance for a text element.
3+
* A text element with fontSize 12pt yields tolerance 12 * 0.15 = 1.8pt.
4+
* This absorbs normal subscript/superscript baseline shifts while keeping
5+
* genuinely different lines separate.
6+
*/
7+
// Also imported by lib/pdf.js, whose same-line check multiplies the previous
// run's font size by this ratio, so the sorter and the merger share one value.
const BASELINE_TOLERANCE_RATIO = 0.15;
8+
9+
/**
 * Sort an array of bidiText objects into spatial reading order.
 *
 *  1. Group elements into horizontal lines by Y coordinate. An element joins
 *     a line when its Y lies within `fontSize * BASELINE_TOLERANCE_RATIO` of
 *     the Y of that line's first element (fontSize falls back to 12 when it
 *     is missing or 0). When several lines qualify, the element joins the
 *     one whose anchor Y is closest; on an exact tie the earlier line wins.
 *  2. Sort the lines by the Y of their first element, ascending (treated as
 *     top-to-bottom).
 *  3. Sort the elements of each line by X, ascending (left-to-right).
 *
 * The input array is never reordered. For non-empty input a new flat array
 * holding the same element objects is returned; nullish or empty input is
 * returned as-is.
 *
 * @param {Array<{str:string, x:number, y:number, width:number, spaceWidth:number, textHScale:number, fontSize?:number}>} bidiTexts
 * @returns {typeof bidiTexts}
 */
function sortBidiTexts(bidiTexts) {
    if (!bidiTexts || bidiTexts.length === 0) return bidiTexts;

    // Phase 1: bucket elements into line groups by Y.
    const lines = [];

    for (const textObj of bidiTexts) {
        const tolerance = (textObj.fontSize || 12) * BASELINE_TOLERANCE_RATIO;

        // Measure against each line's first element so the anchor stays put
        // as the line grows. Keep the closest qualifying anchor rather than
        // the first one found, so the grouping does not hinge on the order
        // in which the lines happened to be created.
        let nearestLine = null;
        let nearestDistance = Infinity;

        for (const line of lines) {
            const distance = Math.abs(textObj.y - line[0].y);
            if (distance <= tolerance && distance < nearestDistance) {
                nearestLine = line;
                nearestDistance = distance;
            }
        }

        if (nearestLine) {
            nearestLine.push(textObj);
        } else {
            lines.push([textObj]);
        }
    }

    // Phase 2: order lines by their anchor Y, ascending.
    lines.sort((a, b) => a[0].y - b[0].y);

    // Phase 3: order elements within each line by X, ascending.
    for (const line of lines) {
        line.sort((a, b) => a.x - b.x);
    }

    return lines.flat();
}
59+
60+
export { BASELINE_TOLERANCE_RATIO, sortBidiTexts };

0 commit comments

Comments
 (0)