@@ -20,6 +20,7 @@ import PDFFont from "./pdffont.js";
2020import PDFUnit from "./pdfunit.js" ;
2121import PTIXmlParser from "./ptixmlinject.js" ;
2222import { createScratchCanvas } from "./pdfcanvas.js" ;
23+ import { BASELINE_TOLERANCE_RATIO , sortBidiTexts } from "./pdftextsorter.js" ;
2324
2425//start of helper classes
2526class PDFPageParser {
@@ -333,41 +334,42 @@ export default class PDFJSClass extends EventEmitter {
333334
334335 this . rawTextContents . forEach ( ( textContent , index ) => {
335336 let prevText = null ;
336-
337- textContent . bidiTexts . forEach ( ( textObj , idx ) => {
338- // Check if on same line
339- // Use a tolerance relative to font size for better accuracy
340- // Typical line spacing is 120% of font size, so 10-15% tolerance is reasonable
341- const tolerance = prevText ? ( prevText . fontSize || 12 ) * 0.15 : 2 ;
337+
338+ // Spatially sort bidiTexts into visual reading order (top-to-bottom, left-to-right)
339+ const bidiTexts = sortBidiTexts ( textContent . bidiTexts ) ;
340+
341+ bidiTexts . forEach ( ( textObj ) => {
342+ // Check if on same line using the same tolerance ratio as the sorter
343+ const tolerance = prevText ? ( prevText . fontSize || 12 ) * BASELINE_TOLERANCE_RATIO : 2 ;
342344 const sameLine = prevText && Math . abs ( textObj . y - prevText . y ) <= tolerance ;
343-
345+
344346 if ( sameLine ) {
345347 // spaceWidth is in unscaled coordinates (no textHScale, matching JSON w property)
346348 const { spaceWidth, startX, width, textHScale } = prevText ;
347-
349+
348350 // Use actual calculated text width (from glyph widths)
349351 // width is in unscaled coordinates, but startX is in scaled coordinates
350352 // So we must apply textHScale to width before adding to startX
351353 // This matches canvas.js: current.x += x * textHScale (line 1267)
352354 const prevTextEndX = startX + ( width * textHScale ) ;
353-
355+
354356 // Calculate gap between end of previous text and start of current text
355357 // gap is in SCALED coordinates (both textObj.x and prevTextEndX are scaled)
356358 const gap = textObj . x - prevTextEndX ;
357-
359+
358360 // Scale spaceWidth to match gap's coordinate system
359361 const scaledSpaceWidth = spaceWidth * textHScale ;
360-
362+
361363 // Add spaces if gap is positive and significant (> 30% of scaled space width)
362364 // Also check that scaledSpaceWidth is valid to avoid division by zero
363365 if ( scaledSpaceWidth > 0 && gap > scaledSpaceWidth * 0.3 ) {
364366 const numSpaces = Math . round ( gap / scaledSpaceWidth ) ;
365367 prevText . str += ' ' . repeat ( Math . max ( 1 , numSpaces ) ) ;
366368 }
367-
369+
368370 // Append current text
369371 prevText . str += textObj . str ;
370-
372+
371373 // Update prevText to track current text for next iteration
372374 prevText . startX = textObj . x ;
373375 prevText . width = textObj . width ;
@@ -378,11 +380,11 @@ export default class PDFJSClass extends EventEmitter {
378380 if ( prevText ) {
379381 retVal += `${ prevText . str } \r\n` ;
380382 }
381-
383+
382384 // Initialize new text object with font metrics
383- prevText = {
384- str : textObj . str ,
385- y : textObj . y ,
385+ prevText = {
386+ str : textObj . str ,
387+ y : textObj . y ,
386388 startX : textObj . x ,
387389 width : textObj . width ,
388390 spaceWidth : textObj . spaceWidth ,
@@ -391,7 +393,7 @@ export default class PDFJSClass extends EventEmitter {
391393 } ;
392394 }
393395 } ) ;
394-
396+
395397 if ( prevText ) {
396398 retVal += prevText . str ;
397399 }
0 commit comments