Skip to content

Commit 7d9de79

Browse files
committed
docs: add jsdocs to pdfparser.js
1 parent fd9d127 commit 7d9de79

1 file changed

Lines changed: 152 additions & 72 deletions

File tree

pdfparser.js

Lines changed: 152 additions & 72 deletions
Original file line number | Diff line number | Diff line change
@@ -7,104 +7,149 @@ import PDFJS from "./lib/pdf.js";
77
import {ParserStream} from "./lib/parserstream.js";
88
import {kColors, kFontFaces, kFontStyles} from "./lib/pdfconst.js";
99

10-
export default class PDFParser extends EventEmitter { // inherit from event emitter
11-
//public static
12-
static get colorDict() {return kColors; }
10+
/**
11+
* Class representing a PDF Parser.
12+
* @extends EventEmitter
13+
*/
14+
export default class PDFParser extends EventEmitter {
15+
/**
16+
* Static getter for the color dictionary.
17+
* @returns {object} Color dictionary
18+
*/
19+
static get colorDict() { return kColors; }
20+
21+
/**
22+
* Static getter for the font face dictionary.
23+
* @returns {object} Font face dictionary
24+
*/
1325
static get fontFaceDict() { return kFontFaces; }
26+
27+
/**
28+
* Static getter for the font style dictionary.
29+
* @returns {object} Font style dictionary
30+
*/
1431
static get fontStyleDict() { return kFontStyles; }
1532

16-
//private static
1733
static #maxBinBufferCount = 10;
1834
static #binBuffer = {};
1935

20-
//private
2136
#password = "";
22-
23-
#context = null; // service context object, only used in Web Service project; null in command line
24-
25-
#pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started
26-
#pdfFileMTime = null; // last time the current pdf was modified, used to recognize changes and ignore cache
27-
#data = null; //if file read success, data is PDF content; if failed, data is "err" object
28-
#PDFJS = null; //will be initialized in constructor
29-
#processFieldInfoXML = false;//disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
30-
31-
// constructor
37+
#context = null; // service context object, only used in Web Service project; null in command line
38+
39+
#pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started
40+
#pdfFileMTime = null; // last time the current pdf was modified, used to recognize changes and ignore cache
41+
#data = null; //if file read success, data is PDF content; if failed, data is "err" object
42+
#PDFJS = null; //will be initialized in constructor
43+
#processFieldInfoXML = false; //disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
44+
45+
/**
46+
* PDFParser constructor.
47+
* @param {object} context - The context object (only used in Web Service project); null in command line
48+
* @param {boolean} needRawText - Whether raw text is needed or not
49+
* @param {string} password - The password for PDF file
50+
* @info Private methods are declared with # and invoked internally as this.#funcName(...)
51+
*/
3252
constructor(context, needRawText, password) {
33-
//call constructor for super class
3453
super();
35-
36-
// private
37-
// service context object, only used in Web Service project; null in command line
3854
this.#context = context;
39-
40-
this.#pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started
41-
this.#pdfFileMTime = null; // last time the current pdf was modified, used to recognize changes and ignore cache
42-
this.#data = null; //if file read success, data is PDF content; if failed, data is "err" object
55+
this.#pdfFilePath = null;
56+
this.#pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started
57+
this.#pdfFileMTime = null; // last time the current pdf was modified, used to recognize changes and ignore cache
58+
this.#data = null; //if file read success, data is PDF content; if failed, data is "err" object
4359
this.#processFieldInfoXML = false;//disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
4460

4561
this.#PDFJS = new PDFJS(needRawText);
4662
this.#password = password;
47-
}
48-
49-
//private methods, needs to invoked by [funcName].call(this, ...)
50-
#onPDFJSParseDataReady(data) {
51-
if (!data) { //v1.1.2: data===null means end of parsed data
52-
nodeUtil.p2jinfo("PDF parsing completed.");
53-
this.emit("pdfParser_dataReady", this.#data);
54-
}
55-
else {
56-
this.#data = {...this.#data, ...data};
57-
}
58-
}
63+
}
5964

60-
#onPDFJSParserDataError(err) {
61-
this.#data = null;
62-
this.emit("pdfParser_dataError", {"parserError": err});
63-
// this.emit("error", err);
64-
}
65+
/**
66+
* @private
67+
* @param {object|null} data - Chunk of parsed data to merge into #data; a falsy value signals that parsing has finished
68+
*/
69+
#onPDFJSParseDataReady(data) {
70+
if (!data) {
71+
nodeUtil.p2jinfo("PDF parsing completed.");
72+
this.emit("pdfParser_dataReady", this.#data);
73+
}
74+
else {
75+
this.#data = { ...this.#data, ...data };
76+
}
77+
}
6578

66-
#startParsingPDF(buffer) {
67-
this.#data = {};
79+
/**
80+
* @private
81+
* @param {Error} err - The error object
82+
*/
83+
#onPDFJSParserDataError(err) {
84+
this.#data = null;
85+
this.emit("pdfParser_dataError", { "parserError": err });
86+
}
6887

69-
this.#PDFJS.on("pdfjs_parseDataReady", data => this.#onPDFJSParseDataReady(data));
70-
this.#PDFJS.on("pdfjs_parseDataError", err => this.#onPDFJSParserDataError(err));
88+
/**
89+
* @private
90+
* @param {Buffer} [buffer] - The PDF buffer; when omitted, the cached buffer stored under binBufferKey is used
91+
*/
92+
#startParsingPDF(buffer) {
93+
this.#data = {};
94+
this.#PDFJS.on("pdfjs_parseDataReady", data => this.#onPDFJSParseDataReady(data));
95+
this.#PDFJS.on("pdfjs_parseDataError", err => this.#onPDFJSParserDataError(err));
7196

7297
//v1.3.0 the following Readable Stream-like events are replacement for the top two custom events
7398
this.#PDFJS.on("readable", meta => this.emit("readable", meta));
7499
this.#PDFJS.on("data", data => this.emit("data", data));
75100
this.#PDFJS.on("error", err => this.#onPDFJSParserDataError(err));
101+
102+
this.#PDFJS.parsePDFData(buffer || PDFParser.#binBuffer[this.binBufferKey], this.#password);
103+
}
76104

77-
this.#PDFJS.parsePDFData(buffer || PDFParser.#binBuffer[this.binBufferKey], this.#password);
78-
}
79-
80-
#processBinaryCache() {
105+
/**
106+
* @private
107+
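* @returns {boolean} true if a cached buffer was found for binBufferKey and parsing was started from it, false otherwise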
108+
*/
109+
#processBinaryCache() {
81110
if (this.binBufferKey in PDFParser.#binBuffer) {
82111
this.#startParsingPDF();
83112
return true;
84113
}
114+
115+
const allKeys = Object.keys(PDFParser.#binBuffer);
116+
if (allKeys.length > PDFParser.#maxBinBufferCount) {
117+
const idx = this.id % PDFParser.#maxBinBufferCount;
118+
const key = allKeys[idx];
119+
PDFParser.#binBuffer[key] = null;
120+
delete PDFParser.#binBuffer[key];
85121

86-
const allKeys = Object.keys(PDFParser.#binBuffer);
87-
if (allKeys.length > PDFParser.#maxBinBufferCount) {
88-
const idx = this.id % PDFParser.#maxBinBufferCount;
89-
const key = allKeys[idx];
90-
PDFParser.#binBuffer[key] = null;
91-
delete PDFParser.#binBuffer[key];
92-
93-
nodeUtil.p2jinfo("re-cycled cache for " + key);
94-
}
122+
nodeUtil.p2jinfo("re-cycled cache for " + key);
123+
}
95124

96125
return false;
97126
}
98127

99-
//public getter
128+
/**
129+
* Getter for #data
130+
* @returns {object|null} Data
131+
*/
100132
get data() { return this.#data; }
133+
134+
/**
135+
* Getter for binBufferKey
136+
* @returns {string} The binBufferKey
137+
*/
101138
get binBufferKey() { return this.#pdfFilePath + this.#pdfFileMTime; }
102-
103-
//public APIs
139+
140+
/**
141+
* Creates a parser stream
142+
* @returns {ParserStream} A new parser stream
143+
*/
104144
createParserStream() {
105-
return new ParserStream(this, {objectMode: true, bufferSize: 64 * 1024});
145+
return new ParserStream(this, { objectMode: true, bufferSize: 64 * 1024 });
106146
}
107147

148+
/**
149+
* Asynchronously load a PDF from a file path.
150+
* @param {string} pdfFilePath - Path of the PDF file
151+
* @param {number} verbosity - Verbosity level
152+
*/
108153
async loadPDF(pdfFilePath, verbosity) {
109154
nodeUtil.verbosity(verbosity || 0);
110155
nodeUtil.p2jinfo("about to load PDF file " + pdfFilePath);
@@ -130,20 +175,55 @@ export default class PDFParser extends EventEmitter { // inherit from event emit
130175
}
131176
}
132177

133-
// Introduce a way to directly process buffers without the need to write it to a temporary file
134-
parseBuffer(pdfBuffer) {
178+
/**
179+
* Parse PDF buffer.
180+
* @param {Buffer} pdfBuffer - PDF buffer
181+
* @param {number} verbosity - Verbosity level
182+
*/
183+
parseBuffer(pdfBuffer, verbosity) {
184+
nodeUtil.verbosity(verbosity || 0);
135185
this.#startParsingPDF(pdfBuffer);
136186
}
137187

138-
getRawTextContent() { return this.#PDFJS.getRawTextContent(); }
139-
getRawTextContentStream() { return ParserStream.createContentStream(this.getRawTextContent()); }
140-
141-
getAllFieldsTypes() { return this.#PDFJS.getAllFieldsTypes(); };
142-
getAllFieldsTypesStream() { return ParserStream.createContentStream(this.getAllFieldsTypes()); }
143-
144-
getMergedTextBlocksIfNeeded() { return this.#PDFJS.getMergedTextBlocksIfNeeded(); }
145-
getMergedTextBlocksStream() { return ParserStream.createContentStream(this.getMergedTextBlocksIfNeeded()) }
146-
188+
/**
189+
* Retrieve raw text content from PDF.
190+
* @returns {string} Raw text content
191+
*/
192+
getRawTextContent() { return this.#PDFJS.getRawTextContent(); }
193+
194+
/**
195+
* Retrieve raw text content stream.
196+
* @returns {Stream} Raw text content stream
197+
*/
198+
getRawTextContentStream() { return ParserStream.createContentStream(this.getRawTextContent()); }
199+
200+
/**
201+
* Retrieve all field types.
202+
* @returns {object[]} All field types
203+
*/
204+
getAllFieldsTypes() { return this.#PDFJS.getAllFieldsTypes(); }
205+
206+
/**
207+
* Retrieve all field types stream.
208+
* @returns {Stream} All field types stream
209+
*/
210+
getAllFieldsTypesStream() { return ParserStream.createContentStream(this.getAllFieldsTypes()); }
211+
212+
/**
213+
* Retrieve merged text blocks if needed.
214+
* @returns {object} Merged text blocks
215+
*/
216+
getMergedTextBlocksIfNeeded() { return this.#PDFJS.getMergedTextBlocksIfNeeded(); }
217+
218+
/**
219+
* Retrieve merged text blocks stream.
220+
* @returns {Stream} Merged text blocks stream
221+
*/
222+
getMergedTextBlocksStream() { return ParserStream.createContentStream(this.getMergedTextBlocksIfNeeded()) }
223+
224+
/**
225+
* Destroy the PDFParser instance.
226+
*/
147227
destroy() { // invoked with stream transform process
148228
super.removeAllListeners();
149229

0 commit comments

Comments
 (0)