Skip to content

Commit dab556a

Browse files
feat: reading multiple pdf files with a single PDFParser object (#371)
* feat: reading multiple pdf files with a single PDFParser object
* Removed useless if statement
* Disposed the object by calling destroy method
* Added a new method for resetting the PDFJS object
* Removed removal of listeners when reading multiple pdf files with a single object
* Restoring minimum NodeJS version to 18.12.1 for semver compliance #381
* [EVO] Added parameter for using PDFParser as singleton in CLI #381
* [ROLL] Restored node and npm version changes
* [FIX] pdfParser always a singleton, even if not provided by parameter
* [FIX] Possible race condition

---------

Co-authored-by: Modesty Zhang <modestyz@hotmail.com>
1 parent 7d562af commit dab556a

8 files changed

Lines changed: 92 additions & 11 deletions

File tree

jest.config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"testMatch": ["**/test/_test_.cjs"],
2+
"testMatch": ["**/test/_test_*"],
33
"testEnvironment": "node",
44
"bail": false,
55
"testFailureExitCode": 1,

lib/pdf.js

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -189,8 +189,7 @@ export default class PDFJSClass extends EventEmitter {
189189
}
190190

191191
parsePDFData(arrayBuffer, password) {
192-
this.pdfDocument = null;
193-
192+
this.resetCurrentObject();
194193
const parameters = { password, data: arrayBuffer };
195194
PDFJS.getDocument(parameters).then(
196195
(pdfDocument) => this.load(pdfDocument, 1),
@@ -299,6 +298,7 @@ export default class PDFJSClass extends EventEmitter {
299298
};
300299

301300
this.pages.push(page);
301+
302302
this.emit("data", page);
303303

304304
if (this.needRawText) {
@@ -405,6 +405,14 @@ export default class PDFJSClass extends EventEmitter {
405405
return { Pages: this.pages };
406406
}
407407

408+
resetCurrentObject(){
409+
if (this.pdfDocument) this.pdfDocument.destroy();
410+
this.pdfDocument = null;
411+
412+
this.pages = [];
413+
this.rawTextContents = [];
414+
}
415+
408416
destroy() {
409417
this.removeAllListeners();
410418

pdfparser.js

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,15 @@ export default class PDFParser extends EventEmitter {
304304
return ParserStream.createContentStream(this.getMergedTextBlocksIfNeeded());
305305
}
306306

307+
/**
308+
* Destroys the current instance of PDFJS and sets a new one
309+
* @param {boolean} needRawText - Whether raw text is needed or not
310+
*/
311+
resetPDFJS(needRawText){
312+
this.#PDFJS.destroy();
313+
this.#PDFJS=new PDFJS(needRawText);
314+
}
315+
307316
/**
308317
* Destroy the PDFParser instance.
309318
*/

src/cli/p2jcli.ts

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ const PROCESS_RAW_TEXT_CONTENT = "c" in argv;
1818
const PROCESS_FIELDS_CONTENT = "t" in argv;
1919
const PROCESS_MERGE_BROKEN_TEXT_BLOCKS = "m" in argv;
2020
const PROCESS_WITH_STREAM = "r" in argv;
21+
const SINGLETON_PDF_PARSER= "si" in argv;
2122

2223
const INPUT_DIR_OR_FILE = argv.f;
2324

@@ -123,10 +124,13 @@ class PDFProcessor {
123124

124125
private parseOnePDFStream() {
125126
return new Promise((resolve, reject) => {
126-
this.pdfParser = new PDFParser(null, PROCESS_RAW_TEXT_CONTENT);
127-
this.pdfParser.on("pdfParser_dataError", (evtData: any) =>
128-
this.onPrimaryError(evtData.parserError, reject)
129-
);
127+
if((SINGLETON_PDF_PARSER && !this.pdfParser) || !SINGLETON_PDF_PARSER){
128+
//initialize the parser if the singleton parameter was not provided, or if the singleton parameter was provided and the parser is not initialized
129+
this.pdfParser = new PDFParser(null, PROCESS_RAW_TEXT_CONTENT);
130+
this.pdfParser.on("pdfParser_dataError", (evtData: any) =>
131+
this.onPrimaryError(evtData.parserError, reject)
132+
);
133+
}
130134

131135
const outputStream = fs.createWriteStream(this.outputPath);
132136
outputStream.on("finish", () => this.onPrimarySuccess(resolve, reject));
@@ -145,10 +149,13 @@ class PDFProcessor {
145149

146150
private parseOnePDF() {
147151
return new Promise((resolve, reject) => {
148-
this.pdfParser = new PDFParser(null, PROCESS_RAW_TEXT_CONTENT);
149-
this.pdfParser.on("pdfParser_dataError", (evtData: any) => {
150-
this.onPrimaryError(evtData.parserError, reject);
151-
});
152+
if((SINGLETON_PDF_PARSER && !this.pdfParser) || !SINGLETON_PDF_PARSER){
153+
//initialize the parser if the singleton parameter was not provided, or if the singleton parameter was provided and the parser is not initialized
154+
this.pdfParser = new PDFParser(null, PROCESS_RAW_TEXT_CONTENT);
155+
this.pdfParser.on("pdfParser_dataError", (evtData: any) =>
156+
this.onPrimaryError(evtData.parserError, reject)
157+
);
158+
}
152159

153160
this.pdfParser.on("pdfParser_dataReady", (evtData: any) => {
154161
fs.writeFile(this.outputPath, JSON.stringify(evtData), (err) => {

src/cli/p2jcliarg.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ export type Argv = {
2121
c?: string,
2222
m?: string,
2323
r?: string,
24+
si?: string,
2425
};
2526

2627
export class CLIArgParser {
@@ -199,4 +200,8 @@ export const yargs = new CLIArgParser(process.argv.slice(2))
199200
"r",
200201
"stream",
201202
"(optional) when specified, will process and parse with buffer/object transform stream rather than file system."
203+
).alias(
204+
"si",
205+
"singleton",
206+
"(optional) when specified, only an instance of PDFParser will be initialized."
202207
);

test/_test_testMultipleDataPDF.cjs

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
const assert = require("assert");
2+
const fs = require("fs");
3+
4+
const PDFParser = require("../dist/pdfparser.cjs");
5+
// we want to read two (or more) PDF files without recreating a reference to PDFParser
6+
describe("Multiple PDFs with same structure",()=>{
7+
test("Read different values",async ()=>{
8+
// the target PDFs for this test have only 3 values: Name, Surname and BirthDate.
9+
// you can find the PDFs in test/pdf/mpf
10+
let parser=new PDFParser();
11+
const firstPDFLocation=__dirname+"/pdf/mpf/testPDF.pdf";
12+
const secondPDFLocation=__dirname+"/pdf/mpf/testPDF2.pdf";
13+
const firstPDFBuffer=fs.readFileSync(firstPDFLocation);
14+
const secondPDFBuffer=fs.readFileSync(secondPDFLocation);
15+
//we need to check if buffers are indeed different, otherwise it's useless!
16+
expect(firstPDFBuffer).not.toBe(secondPDFBuffer);
17+
const firstData=await new Promise((resolve,reject)=>{
18+
parser.parseBuffer(firstPDFBuffer,5);
19+
parser.on("pdfParser_dataReady", (evtData) => {
20+
resolve(evtData);
21+
});
22+
23+
parser.on("pdfParser_dataError", (evtData) => {
24+
reject(evtData);
25+
});
26+
});
27+
const secondData=await new Promise((resolve,reject)=>{
28+
parser.parseBuffer(secondPDFBuffer,5);
29+
parser.on("pdfParser_dataReady", (evtData) => {
30+
resolve(evtData);
31+
});
32+
33+
parser.on("pdfParser_dataError", (evtData) => {
34+
reject(evtData);
35+
});
36+
});
37+
//first, make sure the files are read
38+
expect(firstData).toBeDefined();
39+
expect(firstData.Pages[0]).toBeDefined();
40+
expect(firstData.Pages[0].Fields).toBeDefined();
41+
expect(secondData).toBeDefined();
42+
expect(secondData.Pages[0]).toBeDefined();
43+
expect(secondData.Pages[0].Fields).toBeDefined();
44+
//then, we check if the files have the correct values
45+
expect(firstData.Pages[0].Fields[0].V).toBe("Mario");
46+
expect(firstData.Pages[0].Fields[1].V).toBe("Rossi");
47+
expect(firstData.Pages[0].Fields[2].V).toBe("01/01/1990");
48+
expect(secondData.Pages[0].Fields[0].V).toBe("Luigi");
49+
expect(secondData.Pages[0].Fields[1].V).toBe("Verdi");
50+
expect(secondData.Pages[0].Fields[2].V).toBe("01/01/1991");
51+
});
52+
});

test/pdf/mpf/testPDF.pdf

115 KB
Binary file not shown.

test/pdf/mpf/testPDF2.pdf

108 KB
Binary file not shown.

0 commit comments

Comments
 (0)