Skip to content

Commit 2298a86

Browse files
authored
maint: remove dependency on @xmldom/xmldom (#390)
* maint: remove dependency on @xmldom/xmldom * doc: update readme for zero dependency update
1 parent e5e4236 commit 2298a86

6 files changed

Lines changed: 229 additions & 19 deletions

File tree

lib/ptixmlinject.js

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,43 @@
11
import fs from "fs";
2-
import { DOMParser } from "@xmldom/xmldom";
2+
import { DOMParser } from "./simpleXmlParser.js";
33

4+
/**
5+
* XML Parser for PTI format
6+
* @class
7+
*/
48
export default class PTIXmlParser {
9+
/** @type {string|null} */
510
xmlData = null;
11+
/** @type {Array<any>} */
612
ptiPageArray = [];
713

8-
// constructor
14+
/**
15+
* Create a new PTIXmlParser
16+
*/
917
constructor() {
1018
this.xmlData = null;
1119
this.ptiPageArray = [];
1220
}
1321

22+
/**
23+
* Parse an XML file
24+
* @param {string} filePath - The path to the XML file
25+
* @param {Function} callback - The callback function
26+
*/
1427
parseXml(filePath, callback) {
1528
fs.readFile(filePath, 'utf8', (err, data) => {
1629
if (err) {
1730
callback(err);
1831
}
1932
else {
33+
/** @type {string} */
2034
this.xmlData = data;
2135

2236
var parser = new DOMParser();
2337
var dom = parser.parseFromString(this.xmlData);
2438
var root = dom.documentElement;
2539

26-
var xmlFields = root.getElementsByTagName("field");
40+
var xmlFields = root ? root.getElementsByTagName("field") : [];
2741
var fields = [];
2842

2943
for (var i = 0; i < xmlFields.length; i++) {
@@ -37,38 +51,46 @@ export default class PTIXmlParser {
3751
var fontName = xmlFields[i].getAttribute('fontName');
3852
var fontSize = xmlFields[i].getAttribute('fontSize');
3953

54+
/** @type {Record<string, any>} */
4055
var item = {};
4156

42-
var rectLeft = parseInt(xPos) - 21; //was 23.5
43-
var rectTop = parseInt(yPos) - 20;//was 23
44-
var rectRight = parseInt(rectLeft) + parseInt(width) - 4;
45-
var rectBottom = parseInt(rectTop) + parseInt(height) - 4;
57+
var rectLeft = parseInt(xPos || '0') - 21; //was 23.5
58+
var rectTop = parseInt(yPos || '0') - 20;//was 23
59+
var rectRight = parseInt(String(rectLeft)) + parseInt(width || '0') - 4;
60+
var rectBottom = parseInt(String(rectTop)) + parseInt(height || '0') - 4;
4661

4762
item.fieldType = "Tx";
4863
if (type === "Boolean") {
4964
item.fieldType="Btn";
5065
}
5166
else if (type === "SSN" || type === "Phone" || type === "zip") {
52-
item.TName = type.toLowerCase();
67+
item.TName = type ? type.toLowerCase() : '';
5368
}
5469
item.alternativeText = "";
55-
item.fullName = id;
56-
item.fontSize = fontSize;
57-
item.fontName = fontName;
70+
item.fullName = id || '';
71+
item.fontSize = fontSize || '';
72+
item.fontName = fontName || '';
5873
item.subtype = "Widget";
5974

6075
item.rect = [rectLeft, rectTop, rectRight, rectBottom];
6176

6277
fields.push(item);
6378

64-
this.ptiPageArray[parseInt(page)]=fields;
79+
if (page) {
80+
this.ptiPageArray[parseInt(page)] = fields;
81+
}
6582
}
6683

6784
}
6885
callback();
6986
});
7087
}
7188

89+
/**
90+
* Get fields for a specific page
91+
* @param {number} pageNum - The page number
92+
* @returns {Array<any>|undefined} The fields for the page
93+
*/
7294
getFields(pageNum) {
7395
return this.ptiPageArray[pageNum];
7496
}

lib/simpleXmlParser.js

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
// A simple XML parser to replace @xmldom/xmldom dependency
2+
// This implements just enough functionality to support the existing code
3+
/**
 * A simple XML Element implementation.
 *
 * Holds a tag name, its attributes, its child elements and the text found
 * directly inside it. Only the small slice of the DOM Element API that the
 * rest of the library calls is provided.
 * @class
 */
class Element {
  /**
   * Create a new Element
   * @param {string} nodeName - The name of the node/tag
   */
  constructor(nodeName) {
    /** @type {string} */
    this.nodeName = nodeName;
    /** @type {Array<Element>} */
    this.childNodes = [];
    /** @type {Object.<string, string>} */
    this.attributes = {};
    /** @type {string} */
    this.textContent = "";
  }

  /**
   * Get attribute value by name
   * @param {string} name - The attribute name
   * @returns {string|null} The attribute value, or null when the attribute is
   *   not set. An attribute that is present but empty yields "".
   */
  getAttribute(name) {
    // Own-property check: an empty value must not read as "missing", and
    // inherited keys such as "toString" must not read as attributes.
    return Object.hasOwn(this.attributes, name) ? this.attributes[name] : null;
  }

  /**
   * Get descendant elements by tag name, in document order.
   *
   * The element the method is called on is never part of the result, and
   * each descendant appears exactly once.
   * @param {string} tagName - The tag name to search for, or "*" for all
   * @returns {Array<Element>} The matching elements
   */
  getElementsByTagName(tagName) {
    /** @type {Array<Element>} */
    const results = [];
    const matchAll = tagName === "*";

    /** @param {Element} node */
    const collect = (node) => {
      for (const child of node.childNodes) {
        if (!(child instanceof Element)) {
          continue;
        }
        if (matchAll || child.nodeName === tagName) {
          results.push(child);
        }
        collect(child);
      }
    };

    collect(this);
    return results;
  }
}
63+
/**
 * A simple XML Document implementation.
 *
 * Carries only the root element of a parsed document.
 * @class
 */
class Document {
  /**
   * Create an empty document with no root element.
   */
  constructor() {
    /**
     * Root element of the document; null until one is assigned.
     * @type {Element|null}
     */
    this.documentElement = null;
  }
}
74+
/**
 * A minimal DOMParser implementation that supports the basic features needed.
 *
 * The input is scanned once from left to right while a stack of currently
 * open elements is maintained. Handled: nested elements (including nested
 * elements with the same name), self-closing tags, single- or double-quoted
 * attributes (empty values, and values containing ">"), comments, processing
 * instructions / the XML declaration, DOCTYPE, CDATA sections, and the
 * predefined and numeric character references.
 *
 * The parser is lenient: malformed input never throws, it yields whatever
 * could be recognised.
 * @class
 */
class SimpleDOMParser {
  /** The five entities predefined by XML. */
  static #NAMED_ENTITIES = new Map([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
  ]);

  /**
   * Parse XML string into a Document
   * @param {string} xmlString - The XML string to parse
   * @returns {Document} The parsed document; its documentElement is null
   *   when the input contains no element
   */
  parseFromString(xmlString) {
    const doc = new Document();
    // The XML declaration, comments and DOCTYPE ahead of the root element
    // are skipped by the scanner itself.
    doc.documentElement = this.parseElement(xmlString);
    return doc;
  }

  /**
   * Parse the first XML element found in a string, with all its descendants.
   * Anything after that element's end tag is ignored. Text found directly
   * inside an element is trimmed chunk by chunk and appended to that
   * element's textContent.
   * @param {string} xmlString - The XML string to parse
   * @returns {Element|null} The parsed element or null
   */
  parseElement(xmlString) {
    const source = typeof xmlString === "string" ? xmlString : "";
    /** @type {Array<Element>} */
    const openElements = [];
    /** @type {Element|null} */
    let root = null;
    let pos = 0;

    while (pos < source.length) {
      const lt = source.indexOf("<", pos);
      const parent = openElements[openElements.length - 1];

      // Character data up to the next markup belongs to the innermost open
      // element; text outside the root element is dropped.
      if (parent) {
        const text = source.slice(pos, lt === -1 ? source.length : lt).trim();
        if (text) {
          parent.textContent += SimpleDOMParser.#decodeEntities(text);
        }
      }
      if (lt === -1) {
        break;
      }

      if (source.startsWith("<!--", lt)) {
        pos = SimpleDOMParser.#indexAfter(source, "-->", lt + 4);
      } else if (source.startsWith("<![CDATA[", lt)) {
        // CDATA content is literal: no trimming, no entity decoding.
        const close = source.indexOf("]]>", lt + 9);
        if (parent) {
          parent.textContent += source.slice(lt + 9, close === -1 ? source.length : close);
        }
        pos = close === -1 ? source.length : close + 3;
      } else if (source.startsWith("<?", lt)) {
        pos = SimpleDOMParser.#indexAfter(source, "?>", lt + 2);
      } else if (source.startsWith("<!", lt)) {
        pos = SimpleDOMParser.#skipDeclaration(source, lt + 2);
      } else if (source.startsWith("</", lt)) {
        const gt = source.indexOf(">", lt + 2);
        if (gt === -1) {
          break;
        }
        const name = source.slice(lt + 2, gt).trim();
        // Close the nearest open element with this name; unclosed elements
        // nested inside it are closed implicitly. An end tag that matches
        // nothing is ignored.
        let depth = openElements.length - 1;
        while (depth >= 0 && openElements[depth].nodeName !== name) {
          depth--;
        }
        if (depth === 0) {
          return root; // the root element is complete
        }
        if (depth > 0) {
          openElements.length = depth;
        }
        pos = gt + 1;
      } else {
        const gt = SimpleDOMParser.#findTagEnd(source, lt + 1);
        if (gt === -1) {
          break; // truncated start tag
        }
        pos = gt + 1;

        let tagBody = source.slice(lt + 1, gt);
        const selfClosing = tagBody.endsWith("/");
        if (selfClosing) {
          tagBody = tagBody.slice(0, -1);
        }
        const nameMatch = /^[^\s/>]+/.exec(tagBody);
        if (!nameMatch) {
          continue; // "<>" or "< x>": not a tag, skip it
        }

        const element = new Element(nameMatch[0]);
        const attributeSource = tagBody.slice(nameMatch[0].length);
        const attributeRegex = /([^\s=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
        for (const [, attrName, doubleQuoted, singleQuoted] of attributeSource.matchAll(attributeRegex)) {
          // `??` rather than `||`: an empty double-quoted value is "" and
          // must not fall through to the (undefined) single-quoted group.
          element.attributes[attrName] = SimpleDOMParser.#decodeEntities(doubleQuoted ?? singleQuoted);
        }

        if (parent) {
          parent.childNodes.push(element);
        } else {
          root = element;
        }

        if (!selfClosing) {
          openElements.push(element);
        } else if (!parent) {
          return root; // self-closing root element
        }
      }
    }

    // Input ended while elements were still open (or held no element at
    // all): return what was recognised.
    return root;
  }

  /**
   * Find the ">" that ends a start tag, ignoring any ">" inside quotes.
   * @param {string} source - The text being scanned
   * @param {number} from - Index just after the opening "<"
   * @returns {number} Index of the closing ">", or -1 if there is none
   */
  static #findTagEnd(source, from) {
    let quote = "";
    for (let i = from; i < source.length; i++) {
      const ch = source[i];
      if (quote) {
        if (ch === quote) {
          quote = "";
        }
      } else if (ch === '"' || ch === "'") {
        quote = ch;
      } else if (ch === ">") {
        return i;
      }
    }
    return -1;
  }

  /**
   * Index just past the next occurrence of a terminator.
   * @param {string} source - The text being scanned
   * @param {string} terminator - The closing sequence to look for
   * @param {number} from - Index to start searching at
   * @returns {number} Index after the terminator, or source.length if absent
   */
  static #indexAfter(source, terminator, from) {
    const at = source.indexOf(terminator, from);
    return at === -1 ? source.length : at + terminator.length;
  }

  /**
   * Skip a "<!...>" declaration such as DOCTYPE, including a bracketed
   * internal subset that may itself contain ">".
   * @param {string} source - The text being scanned
   * @param {number} from - Index just after "<!"
   * @returns {number} Index after the declaration, or source.length
   */
  static #skipDeclaration(source, from) {
    let bracketDepth = 0;
    for (let i = from; i < source.length; i++) {
      const ch = source[i];
      if (ch === "[") {
        bracketDepth++;
      } else if (ch === "]") {
        bracketDepth--;
      } else if (ch === ">" && bracketDepth <= 0) {
        return i + 1;
      }
    }
    return source.length;
  }

  /**
   * Replace predefined entities and numeric character references.
   * Unknown or out-of-range references are left untouched.
   * @param {string} text - Raw attribute value or character data
   * @returns {string} The decoded text
   */
  static #decodeEntities(text) {
    if (!text.includes("&")) {
      return text;
    }
    return text.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);/g, (entity, body) => {
      if (body[0] !== "#") {
        return SimpleDOMParser.#NAMED_ENTITIES.get(body) ?? entity;
      }
      const codePoint = body[1] === "x"
        ? Number.parseInt(body.slice(2), 16)
        : Number.parseInt(body.slice(1), 10);
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    });
  }
}
188+
189+
// Export DOMParser as a class
190+
export { SimpleDOMParser as DOMParser };

package.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "pdf2json",
3-
"version": "3.1.5",
3+
"version": "3.1.6",
44
"description": "PDF file parser that converts PDF binaries to JSON and text, powered by porting a fork of PDF.JS to Node.js",
55
"keywords": [
66
"pdf",
@@ -68,10 +68,8 @@
6868
"pdf2json": "./bin/pdf2json.js"
6969
},
7070
"dependencies": {
71-
"@xmldom/xmldom": "^0.9.6"
7271
},
7372
"bundleDependencies": [
74-
"@xmldom/xmldom"
7573
],
7674
"devDependencies": {
7775
"@rollup/plugin-commonjs": "^28.0.2",

readme.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,16 @@
88
![GitHub top language](https://img.shields.io/github/languages/top/modesty/pdf2json)
99
![GitHub last commit](https://img.shields.io/github/last-commit/modesty/pdf2json?color=red)
1010

11-
pdf2json is a [node.js](http://nodejs.org/) module converts binary PDF to JSON and text. Built with [pdf.js](https://github.com/mozilla/pdf.js/), it extracts text content and interactive form elements for server-side processing and command-line use.
11+
pdf2json is a [node.js](http://nodejs.org/) module that converts binary PDF to JSON and text. Built with [pdf.js](https://github.com/mozilla/pdf.js/), it extracts text content and interactive form elements for server-side processing and command-line use.
1212

1313
## Features
1414

1515
- **PDF text extraction**: extracts textual content of PDF documents into structured JSON.
1616
- **Form element handling**: parses interactive form fields within PDFs for flexible data capture.
1717
- **Server-side and command-line versatility**: Integrate with web services for remote PDF processing or use as a standalone command-line tool for local file conversion.
18-
- **Swift Performance**: fast performance with minimal depdendencies
18+
- **Swift Performance**: fast performance with zero dependencies (since v3.1.6)
1919
- **Community driven**: decade+ long community driven development ensures continuous improvement.
20+
- **Zero dependencies**: completely dependency-free since v3.1.6, only pure JavaScript code.
2021

2122
## Install
2223

rollup.config.js

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ const external = [
1818
"url",
1919
"buffer",
2020
"stream",
21-
"@xmldom/xmldom",
2221
];
2322

2423
export default [

rollup/bundle-pdfjs-base.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ const _baseCode = _pdfjsFiles.reduce(
6464

6565
fs.writeFileSync(path.join(__dirname, "../lib/pdfjs-code.js"),
6666
`
67-
${"import nodeUtil from 'util';import { Blob } from 'buffer';import { DOMParser } from '@xmldom/xmldom';import PDFAnno from './pdfanno.js';import Image from './pdfimage.js';import { createScratchCanvas } from './pdfcanvas.js';"}
67+
${"import nodeUtil from 'util';import { Blob } from 'buffer';import { DOMParser } from './simpleXmlParser.js';import PDFAnno from './pdfanno.js';import Image from './pdfimage.js';import { createScratchCanvas } from './pdfcanvas.js';"}
6868
${"export const PDFJS = {};"}
6969
${"const globalScope = { console };"}
7070
${_baseCode}

0 commit comments

Comments
 (0)