maint: remove dependency on @xmldom/xmldom (#390)

modesty · web-flow · commit 2298a8663e7e · 2025-05-23T16:53:57.000-07:00
* maint: remove dependency on @xmldom/xmldom

* doc: update readme for zero dependency update
diff --git a/lib/ptixmlinject.js b/lib/ptixmlinject.js
@@ -1,29 +1,43 @@
 import fs from "fs";
-import { DOMParser } from "@xmldom/xmldom";
+import { DOMParser } from "./simpleXmlParser.js";
 
+/**
+ * XML Parser for PTI format
+ * @class
+ */
 export default class PTIXmlParser {
+    /** @type {string|null} */
     xmlData = null;
+	/** @type {Array<any>} */
 	ptiPageArray = [];
 
-	// constructor
+	/**
+	 * Create a new PTIXmlParser.
+	 * The instance starts with `xmlData` set to null and `ptiPageArray` set to
+	 * an empty array.
+	 */
 	constructor() {
         this.xmlData = null;
         this.ptiPageArray = [];
     }
 
+	/**
+	 * Parse an XML file
+	 * @param {string} filePath - The path to the XML file
+	 * @param {Function} callback - The callback function
+	 */
 	parseXml(filePath, callback) {
 		fs.readFile(filePath, 'utf8', (err, data) => {
 			if (err) {
                 callback(err);
 			}
 			else {
+				/** @type {string} */
 				this.xmlData = data;
 
 				var parser = new DOMParser();
 				var dom = parser.parseFromString(this.xmlData);
 				var root = dom.documentElement;
 
-				var xmlFields = root.getElementsByTagName("field");
+				var xmlFields = root ? root.getElementsByTagName("field") : [];
 				var fields = [];
 
 				for (var i = 0; i < xmlFields.length; i++) {
@@ -37,38 +51,46 @@ export default class PTIXmlParser {
 					var fontName = xmlFields[i].getAttribute('fontName');
 					var fontSize = xmlFields[i].getAttribute('fontSize');
 
+					/** @type {Record<string, any>} */
 					var item = {};
 
-					var rectLeft = parseInt(xPos) - 21; //was 23.5
-					var rectTop = parseInt(yPos) - 20;//was 23
-					var rectRight = parseInt(rectLeft) + parseInt(width) - 4;
-					var rectBottom = parseInt(rectTop) + parseInt(height) - 4;
+					var rectLeft = parseInt(xPos || '0') - 21; //was 23.5
+					var rectTop = parseInt(yPos || '0') - 20;//was 23
+					var rectRight = parseInt(String(rectLeft)) + parseInt(width || '0') - 4;
+					var rectBottom = parseInt(String(rectTop)) + parseInt(height || '0') - 4;
 
 					item.fieldType = "Tx";
 					if (type === "Boolean") {
 						item.fieldType="Btn";
 					}
 					else if (type === "SSN" ||  type === "Phone" || type === "zip") {
-						item.TName = type.toLowerCase();
+						item.TName = type ? type.toLowerCase() : '';
 					}
 					item.alternativeText = "";
-					item.fullName = id;
-					item.fontSize = fontSize;
-					item.fontName = fontName;
+					item.fullName = id || '';
+					item.fontSize = fontSize || '';
+					item.fontName = fontName || '';
 					item.subtype = "Widget";
 
 					item.rect = [rectLeft, rectTop, rectRight, rectBottom];
 
 					fields.push(item);
 
-					this.ptiPageArray[parseInt(page)]=fields;
+					if (page) {
+						this.ptiPageArray[parseInt(page)] = fields;
+					}
 				}
 
 			}
 			callback();
 		});
 	}
 
+	/**
+	 * Get fields for a specific page.
+	 * Returns whatever is stored in `ptiPageArray` at index `pageNum`; the
+	 * lookup is a plain array index with no bounds check, so an index that
+	 * nothing was stored at yields `undefined`.
+	 * @param {number} pageNum - The page number, used directly as the array index
+	 * @returns {Array<any>|undefined} The fields for the page
+	 */
 	getFields(pageNum) {
 		return this.ptiPageArray[pageNum];
 	}
diff --git a/lib/simpleXmlParser.js b/lib/simpleXmlParser.js
@@ -0,0 +1,190 @@
+// A simple XML parser to replace @xmldom/xmldom dependency
+// This implements just enough functionality to support the existing code
+
+/**
+ * A simple XML Element implementation
+ * @class
+ */
+class Element {
+  /**
+   * Create a new Element
+   * @param {string} nodeName - The name of the node/tag
+   */
+  constructor(nodeName) {
+    /** @type {string} */
+    this.nodeName = nodeName;
+    /** @type {Array<Element>} */
+    this.childNodes = [];
+    /** @type {Object.<string, string>} */
+    this.attributes = {};
+    /** @type {string} */
+    this.textContent = "";
+  }
+
+  /**
+   * Get attribute value by name
+   * @param {string} name - The attribute name
+   * @returns {string|null} The attribute value or null
+   */
+  getAttribute(name) {
+    return this.attributes[name] || null;
+  }
+
+  /**
+   * Get elements by tag name
+   * @param {string} tagName - The tag name to search for
+   * @returns {Array<Element>} The matching elements
+   */
+  getElementsByTagName(tagName) {
+    /** @type {Array<Element>} */
+    let results = [];
+
+    // Check if this element matches
+    if (this.nodeName === tagName) {
+      results.push(this);
+    }
+
+    // Check child elements recursively
+    for (const child of this.childNodes) {
+      if (child instanceof Element) {
+        if (tagName === "*" || child.nodeName === tagName) {
+          results.push(child);
+        }
+
+        // Add matching descendants
+        const childMatches = child.getElementsByTagName(tagName);
+        results = results.concat(childMatches);
+      }
+    }
+
+    return results;
+  }
+}
+
+/**
+ * A simple XML Document implementation
+ * @class
+ */
+class Document {
+  constructor() {
+    /** @type {Element|null} */
+    this.documentElement = null;
+  }
+}
+
+/**
+ * A minimal DOMParser implementation that supports the basic features needed
+ * @class
+ */
+class SimpleDOMParser {
+  /**
+   * Parse XML string into a Document
+   * @param {string} xmlString - The XML string to parse
+   * @returns {Document} The parsed document
+   */
+  parseFromString(xmlString) {
+    const doc = new Document();
+
+    // Remove XML declaration if present
+    xmlString = xmlString.replace(/<\?xml[^?]*\?>/, "").trim();
+
+    // Parse the document
+    doc.documentElement = this.parseElement(xmlString);
+
+    return doc;
+  }
+
+  /**
+   * Parse an XML element
+   * @param {string} xmlString - The XML string to parse
+   * @returns {Element|null} The parsed element or null
+   */
+  parseElement(xmlString) {
+    // Regular expressions for parsing XML
+    const startTagRegex = /<([^\s/>]+)([^>]*)>/;
+    const attributeRegex = /([^\s=]+)=(?:"([^"]*)"|'([^']*)')/g;
+
+    // Find the start tag
+    const startMatch = xmlString.match(startTagRegex);
+    if (!startMatch) {
+      return null;
+    }
+
+    const tagName = startMatch[1];
+    const attributeString = startMatch[2];
+
+    // Create the element
+    const element = new Element(tagName);
+
+    // Parse attributes
+    let attributeMatch;
+    while ((attributeMatch = attributeRegex.exec(attributeString)) !== null) {
+      const attrName = attributeMatch[1];
+      const attrValue = attributeMatch[2] || attributeMatch[3]; // Use whichever capture group matched
+      element.attributes[attrName] = attrValue;
+    }
+
+    // Find the content between start and end tags
+    const startTagEnd = startMatch[0].length;
+    const endTagSearch = new RegExp(`</${tagName}>`);
+    const endMatch = xmlString.slice(startTagEnd).search(endTagSearch);
+
+    if (endMatch === -1) {
+      // Self-closing or malformed tag
+      return element;
+    }
+
+    const contentString = xmlString.slice(startTagEnd, startTagEnd + endMatch);
+
+    // Parse child elements
+    let remainingContent = contentString.trim();
+    while (remainingContent.length > 0) {
+      // Check if there's a child element
+      if (remainingContent.startsWith("<") && !remainingContent.startsWith("</")) {
+        // Find the next child element
+        const childStartMatch = remainingContent.match(startTagRegex);
+        if (childStartMatch) {
+          const childTagName = childStartMatch[1];
+          const childEndTagSearch = new RegExp(`</${childTagName}>`);
+          const childEndIndex = remainingContent.search(childEndTagSearch);
+
+          if (childEndIndex !== -1) {
+            // Extract the complete child element string (including its end tag)
+            const childEndTagLength = childTagName.length + 3; // "</tag>"
+            const childXmlString = remainingContent.slice(0, childEndIndex + childEndTagLength);
+
+            // Parse the child element and add it to parent
+            const childElement = this.parseElement(childXmlString);
+            if (childElement) {
+              element.childNodes.push(childElement);
+            }
+
+            // Remove the processed child from remaining content
+            remainingContent = remainingContent.slice(childXmlString.length).trim();
+            continue;
+          }
+        }
+      }
+
+      // Handle text content
+      const nextTagIndex = remainingContent.indexOf("<");
+      if (nextTagIndex === -1) {
+        // The rest is all text
+        element.textContent += remainingContent.trim();
+        break;
+      } else if (nextTagIndex > 0) {
+        // There's some text before the next tag
+        element.textContent += remainingContent.slice(0, nextTagIndex).trim();
+        remainingContent = remainingContent.slice(nextTagIndex).trim();
+      } else {
+        // Can't parse further, just break
+        break;
+      }
+    }
+
+    return element;
+  }
+}
+
+// Export DOMParser as a class
+export { SimpleDOMParser as DOMParser };
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
 	"name": "pdf2json",
-	"version": "3.1.5",
+	"version": "3.1.6",
 	"description": "PDF file parser that converts PDF binaries to JSON and text, powered by porting a fork of PDF.JS to Node.js",
 	"keywords": [
 		"pdf",
@@ -68,10 +68,8 @@
 		"pdf2json": "./bin/pdf2json.js"
 	},
 	"dependencies": {
-		"@xmldom/xmldom": "^0.9.6"
 	},
 	"bundleDependencies": [
-		"@xmldom/xmldom"
 	],
 	"devDependencies": {
 		"@rollup/plugin-commonjs": "^28.0.2",
diff --git a/readme.md b/readme.md
@@ -8,15 +8,16 @@
 ![GitHub top language](https://img.shields.io/github/languages/top/modesty/pdf2json)
 ![GitHub last commit](https://img.shields.io/github/last-commit/modesty/pdf2json?color=red)
 
-pdf2json is a [node.js](http://nodejs.org/) module converts binary PDF to JSON and text. Built with [pdf.js](https://github.com/mozilla/pdf.js/), it extracts text content and interactive form elements for server-side processing and command-line use.
+pdf2json is a [node.js](http://nodejs.org/) module that converts binary PDF to JSON and text. Built with [pdf.js](https://github.com/mozilla/pdf.js/), it extracts text content and interactive form elements for server-side processing and command-line use.
 
 ## Features
 
 - **PDF text extraction**: extracts textual content of PDF documents into structured JSON.
 - **Form element handling**: parses interactive form fields within PDFs for flexible data capture.
 - **Server-side and command-line versatility**: Integrate with web services for remote PDF processing or use as a standalone command-line tool for local file conversion.
-- **Swift Performance**: fast performance with minimal depdendencies
+- **Swift Performance**: fast performance with zero dependencies (since v3.1.6)
 - **Community driven**: decade+ long community driven development ensures continuous improvement.
+- **Zero dependencies**: completely dependency-free since v3.1.6; pure JavaScript only.
 
 ## Install
 
diff --git a/rollup.config.js b/rollup.config.js
@@ -18,7 +18,6 @@ const external = [
 	"url",
 	"buffer",
 	"stream",
-	"@xmldom/xmldom",
 ];
 
 export default [
diff --git a/rollup/bundle-pdfjs-base.js b/rollup/bundle-pdfjs-base.js
@@ -64,7 +64,7 @@ const _baseCode = _pdfjsFiles.reduce(
 
 fs.writeFileSync(path.join(__dirname, "../lib/pdfjs-code.js"),
 	`
-  ${"import nodeUtil from 'util';import { Blob } from 'buffer';import { DOMParser } from '@xmldom/xmldom';import PDFAnno from './pdfanno.js';import Image from './pdfimage.js';import { createScratchCanvas } from './pdfcanvas.js';"}
+  ${"import nodeUtil from 'util';import { Blob } from 'buffer';import { DOMParser } from './simpleXmlParser.js';import PDFAnno from './pdfanno.js';import Image from './pdfimage.js';import { createScratchCanvas } from './pdfcanvas.js';"}
   ${"export const PDFJS = {};"}
   ${"const globalScope = { console };"}
   ${_baseCode}

Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,7 @@ const _baseCode = _pdfjsFiles.reduce(`
`64`	`64`
`65`	`65`	`fs.writeFileSync(path.join(__dirname, "../lib/pdfjs-code.js"),`
`66`	`66`	`
`67`		`- ${"import nodeUtil from 'util';import { Blob } from 'buffer';import { DOMParser } from '@xmldom/xmldom';import PDFAnno from './pdfanno.js';import Image from './pdfimage.js';import { createScratchCanvas } from './pdfcanvas.js';"}`
	`67`	`+ ${"import nodeUtil from 'util';import { Blob } from 'buffer';import { DOMParser } from './simpleXmlParser.js';import PDFAnno from './pdfanno.js';import Image from './pdfimage.js';import { createScratchCanvas } from './pdfcanvas.js';"}`
`68`	`68`	`${"export const PDFJS = {};"}`
`69`	`69`	`${"const globalScope = { console };"}`
`70`	`70`	`${_baseCode}`