Skip to content

Commit b208e7f

Browse files
committed
merge PR#314 nql5161:docs/document-pdfparser
2 parents c53b3ee + aac18d0 commit b208e7f

1 file changed

Lines changed: 148 additions & 71 deletions

File tree

pdfparser.js

Lines changed: 148 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -7,104 +7,147 @@ import PDFJS from "./lib/pdf.js";
77
import {ParserStream} from "./lib/parserstream.js";
88
import {kColors, kFontFaces, kFontStyles} from "./lib/pdfconst.js";
99

10-
export default class PDFParser extends EventEmitter { // inherit from event emitter
11-
//public static
12-
static get colorDict() {return kColors; }
10+
/**
11+
* Class representing a PDF Parser.
12+
* @extends EventEmitter
13+
*/
14+
export default class PDFParser extends EventEmitter {
15+
/**
16+
* Static getter for the color dictionary.
17+
* @returns {object} Color dictionary
18+
*/
19+
static get colorDict() { return kColors; }
20+
21+
/**
22+
* Static getter for the font face dictionary.
23+
* @returns {object} Font face dictionary
24+
*/
1325
static get fontFaceDict() { return kFontFaces; }
26+
27+
/**
28+
* Static getter for the font style dictionary.
29+
* @returns {object} Font style dictionary
30+
*/
1431
static get fontStyleDict() { return kFontStyles; }
1532

16-
//private static
1733
static #maxBinBufferCount = 10;
1834
static #binBuffer = {};
1935

20-
//private
2136
#password = "";
22-
23-
#context = null; // service context object, only used in Web Service project; null in command line
24-
25-
#pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started
26-
#pdfFileMTime = null; // last time the current pdf was modified, used to recognize changes and ignore cache
27-
#data = null; //if file read success, data is PDF content; if failed, data is "err" object
28-
#PDFJS = null; //will be initialized in constructor
29-
#processFieldInfoXML = false;//disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
30-
31-
// constructor
37+
#context = null; // service context object, only used in Web Service project; null in command line
38+
#pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started
39+
#pdfFileMTime = null; // last time the current pdf was modified, used to recognize changes and ignore cache
40+
#data = null; //if file read success, data is PDF content; if failed, data is "err" object
41+
#PDFJS = null; //will be initialized in constructor
42+
#processFieldInfoXML = false; //disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
43+
44+
/**
45+
* PDFParser constructor.
46+
* @param {object} context - The context object (only used in Web Service project); null in command line
47+
* @param {boolean} needRawText - Whether raw text is needed or not
48+
* @param {string} password - The password for PDF file
49+
* @info Private methods accessible using the [funcName].call(this, ...) syntax
50+
*/
3251
constructor(context, needRawText, password) {
33-
//call constructor for super class
3452
super();
35-
36-
// private
37-
// service context object, only used in Web Service project; null in command line
3853
this.#context = context;
39-
40-
this.#pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started
41-
this.#pdfFileMTime = null; // last time the current pdf was modified, used to recognize changes and ignore cache
42-
this.#data = null; //if file read success, data is PDF content; if failed, data is "err" object
54+
this.#pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started
55+
this.#pdfFileMTime = null; // last time the current pdf was modified, used to recognize changes and ignore cache
56+
this.#data = null; //if file read success, data is PDF content; if failed, data is "err" object
4357
this.#processFieldInfoXML = false;//disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
4458

4559
this.#PDFJS = new PDFJS(needRawText);
4660
this.#password = password;
47-
}
48-
49-
//private methods, needs to invoked by [funcName].call(this, ...)
50-
#onPDFJSParseDataReady(data) {
51-
if (!data) { //v1.1.2: data===null means end of parsed data
52-
nodeUtil.p2jinfo("PDF parsing completed.");
53-
this.emit("pdfParser_dataReady", this.#data);
54-
}
55-
else {
56-
this.#data = {...this.#data, ...data};
57-
}
58-
}
61+
}
5962

60-
#onPDFJSParserDataError(err) {
61-
this.#data = null;
62-
this.emit("pdfParser_dataError", {"parserError": err});
63-
// this.emit("error", err);
64-
}
63+
/**
64+
* @private
65+
* @param {object} data - The parsed data
66+
*/
67+
#onPDFJSParseDataReady(data) {
68+
if (!data) {
69+
nodeUtil.p2jinfo("PDF parsing completed.");
70+
this.emit("pdfParser_dataReady", this.#data);
71+
}
72+
else {
73+
this.#data = { ...this.#data, ...data };
74+
}
75+
}
6576

66-
#startParsingPDF(buffer) {
67-
this.#data = {};
77+
/**
78+
* @private
79+
* @param {Error} err - The error object
80+
*/
81+
#onPDFJSParserDataError(err) {
82+
this.#data = null;
83+
this.emit("pdfParser_dataError", { "parserError": err });
84+
}
6885

69-
this.#PDFJS.on("pdfjs_parseDataReady", data => this.#onPDFJSParseDataReady(data));
70-
this.#PDFJS.on("pdfjs_parseDataError", err => this.#onPDFJSParserDataError(err));
86+
/**
87+
* @private
88+
* @param {Buffer} buffer - The PDF buffer
89+
*/
90+
#startParsingPDF(buffer) {
91+
this.#data = {};
92+
this.#PDFJS.on("pdfjs_parseDataReady", data => this.#onPDFJSParseDataReady(data));
93+
this.#PDFJS.on("pdfjs_parseDataError", err => this.#onPDFJSParserDataError(err));
7194

7295
//v1.3.0 the following Readable Stream-like events are replacement for the top two custom events
7396
this.#PDFJS.on("readable", meta => this.emit("readable", meta));
7497
this.#PDFJS.on("data", data => this.emit("data", data));
7598
this.#PDFJS.on("error", err => this.#onPDFJSParserDataError(err));
99+
100+
this.#PDFJS.parsePDFData(buffer || PDFParser.#binBuffer[this.binBufferKey], this.#password);
101+
}
76102

77-
this.#PDFJS.parsePDFData(buffer || PDFParser.#binBuffer[this.binBufferKey], this.#password);
78-
}
79-
80-
#processBinaryCache() {
103+
/**
104+
* @private
105+
* @returns {boolean}
106+
*/
107+
#processBinaryCache() {
81108
if (this.binBufferKey in PDFParser.#binBuffer) {
82109
this.#startParsingPDF();
83110
return true;
84111
}
112+
113+
const allKeys = Object.keys(PDFParser.#binBuffer);
114+
if (allKeys.length > PDFParser.#maxBinBufferCount) {
115+
const idx = this.id % PDFParser.#maxBinBufferCount;
116+
const key = allKeys[idx];
117+
PDFParser.#binBuffer[key] = null;
118+
delete PDFParser.#binBuffer[key];
85119

86-
const allKeys = Object.keys(PDFParser.#binBuffer);
87-
if (allKeys.length > PDFParser.#maxBinBufferCount) {
88-
const idx = this.id % PDFParser.#maxBinBufferCount;
89-
const key = allKeys[idx];
90-
PDFParser.#binBuffer[key] = null;
91-
delete PDFParser.#binBuffer[key];
92-
93-
nodeUtil.p2jinfo("re-cycled cache for " + key);
94-
}
120+
nodeUtil.p2jinfo("re-cycled cache for " + key);
121+
}
95122

96123
return false;
97124
}
98125

99-
//public getter
126+
/**
127+
* Getter for #data
128+
* @returns {object|null} Data
129+
*/
100130
get data() { return this.#data; }
131+
132+
/**
133+
* Getter for binBufferKey
134+
* @returns {string} The binBufferKey
135+
*/
101136
get binBufferKey() { return this.#pdfFilePath + this.#pdfFileMTime; }
102-
103-
//public APIs
137+
138+
/**
139+
* Creates a parser stream
140+
* @returns {ParserStream} A new parser stream
141+
*/
104142
createParserStream() {
105-
return new ParserStream(this, {objectMode: true, bufferSize: 64 * 1024});
143+
return new ParserStream(this, { objectMode: true, bufferSize: 64 * 1024 });
106144
}
107145

146+
/**
147+
* Asynchronously load a PDF from a file path.
148+
* @param {string} pdfFilePath - Path of the PDF file
149+
* @param {number} verbosity - Verbosity level
150+
*/
108151
async loadPDF(pdfFilePath, verbosity) {
109152
nodeUtil.verbosity(verbosity || 0);
110153
nodeUtil.p2jinfo("about to load PDF file " + pdfFilePath);
@@ -130,21 +173,55 @@ export default class PDFParser extends EventEmitter { // inherit from event emit
130173
}
131174
}
132175

133-
// Introduce a way to directly process buffers without the need to write it to a temporary file
176+
/**
177+
* Parse a PDF directly from an in-memory buffer, without first writing it to a temporary file.
178+
* @param {Buffer} pdfBuffer - PDF buffer
179+
* @param {number} verbosity - Verbosity level
180+
*/
134181
parseBuffer(pdfBuffer, verbosity) {
135182
nodeUtil.verbosity(verbosity || 0);
136183
this.#startParsingPDF(pdfBuffer);
137184
}
138185

139-
getRawTextContent() { return this.#PDFJS.getRawTextContent(); }
140-
getRawTextContentStream() { return ParserStream.createContentStream(this.getRawTextContent()); }
141-
142-
getAllFieldsTypes() { return this.#PDFJS.getAllFieldsTypes(); };
143-
getAllFieldsTypesStream() { return ParserStream.createContentStream(this.getAllFieldsTypes()); }
144-
145-
getMergedTextBlocksIfNeeded() { return this.#PDFJS.getMergedTextBlocksIfNeeded(); }
146-
getMergedTextBlocksStream() { return ParserStream.createContentStream(this.getMergedTextBlocksIfNeeded()) }
147-
186+
/**
187+
* Retrieve raw text content from PDF.
188+
* @returns {string} Raw text content
189+
*/
190+
getRawTextContent() { return this.#PDFJS.getRawTextContent(); }
191+
192+
/**
193+
* Retrieve raw text content stream.
194+
* @returns {Stream} Raw text content stream
195+
*/
196+
getRawTextContentStream() { return ParserStream.createContentStream(this.getRawTextContent()); }
197+
198+
/**
199+
* Retrieve all field types.
200+
* @returns {object[]} All field types
201+
*/
202+
getAllFieldsTypes() { return this.#PDFJS.getAllFieldsTypes(); }
203+
204+
/**
205+
* Retrieve all field types stream.
206+
* @returns {Stream} All field types stream
207+
*/
208+
getAllFieldsTypesStream() { return ParserStream.createContentStream(this.getAllFieldsTypes()); }
209+
210+
/**
211+
* Retrieve merged text blocks if needed.
212+
* @returns {object} Merged text blocks
213+
*/
214+
getMergedTextBlocksIfNeeded() { return this.#PDFJS.getMergedTextBlocksIfNeeded(); }
215+
216+
/**
217+
* Retrieve merged text blocks stream.
218+
* @returns {Stream} Merged text blocks stream
219+
*/
220+
getMergedTextBlocksStream() { return ParserStream.createContentStream(this.getMergedTextBlocksIfNeeded()) }
221+
222+
/**
223+
* Destroy the PDFParser instance.
224+
*/
148225
destroy() { // invoked with stream transform process
149226
super.removeAllListeners();
150227

0 commit comments

Comments
 (0)