@@ -7,104 +7,149 @@ import PDFJS from "./lib/pdf.js";
77import { ParserStream } from "./lib/parserstream.js" ;
88import { kColors , kFontFaces , kFontStyles } from "./lib/pdfconst.js" ;
99
10- export default class PDFParser extends EventEmitter { // inherit from event emitter
11- //public static
12- static get colorDict ( ) { return kColors ; }
10+ /**
11+ * Class representing a PDF Parser.
12+ * @extends EventEmitter
13+ */
14+ export default class PDFParser extends EventEmitter {
15+ /**
16+ * Static method to retrieve color dictionary.
17+ * @returns {object } Color dictionary
18+ */
19+ static get colorDict ( ) { return kColors ; }
20+
21+ /**
22+ * Static method to retrieve font face dictionary.
23+ * @returns {object } Font face dictionary
24+ */
1325 static get fontFaceDict ( ) { return kFontFaces ; }
26+
27+ /**
28+ * Static method to retrieve font style dictionary.
29+ * @returns {object } Font style dictionary
30+ */
1431 static get fontStyleDict ( ) { return kFontStyles ; }
1532
16- //private static
1733 static #maxBinBufferCount = 10 ;
1834 static #binBuffer = { } ;
1935
20- //private
2136 #password = "" ;
22-
23- #context = null ; // service context object, only used in Web Service project; null in command line
24-
25- #pdfFilePath = null ; //current PDF file to load and parse, null means loading/parsing not started
26- #pdfFileMTime = null ; // last time the current pdf was modified, used to recognize changes and ignore cache
27- #data = null ; //if file read success, data is PDF content; if failed, data is "err" object
28- #PDFJS = null ; //will be initialized in constructor
29- #processFieldInfoXML = false ; //disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
30-
31- // constructor
37+ #context = null ; // service context object, only used in Web Service project; null in command line #pdfFilePath = null;
38+ #pdfFileMTime = null ;
39+ #pdfFilePath = null ; //current PDF file to load and parse, null means loading/parsing not started #data = null;
40+ #pdfFileMTime = null ; // last time the current pdf was modified, used to recognize changes and ignore cache #PDFJS = null;
41+ #data = null ; //if file read success, data is PDF content; if failed, data is "err" object #processFieldInfoXML = false;
42+ #PDFJS = null ; //will be initialized in constructor
43+ #processFieldInfoXML = false ; //disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
44+
45+ /**
46+ * PDFParser constructor.
47+ * @param {object } context - The context object (only used in Web Service project); null in command line
48+ * @param {boolean } needRawText - Whether raw text is needed or not
49+ * @param {string } password - The password for PDF file
50+ * @info Private methods accessible using the [funcName].call(this, ...) syntax
51+ */
3252 constructor ( context , needRawText , password ) {
33- //call constructor for super class
3453 super ( ) ;
35-
36- // private
37- // service context object, only used in Web Service project; null in command line
3854 this . #context = context ;
39-
40- this . #pdfFilePath = null ; //current PDF file to load and parse, null means loading/parsing not started
41- this . #pdfFileMTime = null ; // last time the current pdf was modified, used to recognize changes and ignore cache
42- this . #data = null ; //if file read success, data is PDF content; if failed, data is "err" object
55+ this . #pdfFilePath = null ;
56+ this . #pdfFilePath = null ; //current PDF file to load and parse, null means loading/parsing not started this.#pdfFileMTime = null;
57+ this . #pdfFileMTime = null ; // last time the current pdf was modified, used to recognize changes and ignore cache this.#data = null;
58+ this . #data = null ; //if file read success, data is PDF content; if failed, data is "err" object this.#processFieldInfoXML = false;
4359 this . #processFieldInfoXML = false ; //disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
4460
4561 this . #PDFJS = new PDFJS ( needRawText ) ;
4662 this . #password = password ;
47- }
48-
49- //private methods, needs to invoked by [funcName].call(this, ...)
50- #onPDFJSParseDataReady( data ) {
51- if ( ! data ) { //v1.1.2: data===null means end of parsed data
52- nodeUtil . p2jinfo ( "PDF parsing completed." ) ;
53- this . emit ( "pdfParser_dataReady" , this . #data) ;
54- }
55- else {
56- this . #data = { ...this . #data, ...data } ;
57- }
58- }
63+ }
5964
60- #onPDFJSParserDataError( err ) {
61- this . #data = null ;
62- this . emit ( "pdfParser_dataError" , { "parserError" : err } ) ;
63- // this.emit("error", err);
64- }
65+ /**
66+ * @private
67+ * @param {object } data - The parsed data
68+ */
69+ #onPDFJSParseDataReady( data ) {
70+ if ( ! data ) {
71+ nodeUtil . p2jinfo ( "PDF parsing completed." ) ;
72+ this . emit ( "pdfParser_dataReady" , this . #data) ;
73+ }
74+ else {
75+ this . #data = { ...this . #data, ...data } ;
76+ }
77+ }
6578
66- #startParsingPDF( buffer ) {
67- this . #data = { } ;
79+ /**
80+ * @private
81+ * @param {Error } err - The error object
82+ */
83+ #onPDFJSParserDataError( err ) {
84+ this . #data = null ;
85+ this . emit ( "pdfParser_dataError" , { "parserError" : err } ) ;
86+ }
6887
69- this . #PDFJS. on ( "pdfjs_parseDataReady" , data => this . #onPDFJSParseDataReady( data ) ) ;
70- this . #PDFJS. on ( "pdfjs_parseDataError" , err => this . #onPDFJSParserDataError( err ) ) ;
88+ /**
89+ * @private
90+ * @param {Buffer } buffer - The PDF buffer
91+ */
92+ #startParsingPDF( buffer ) {
93+ this . #data = { } ;
94+ this . #PDFJS. on ( "pdfjs_parseDataReady" , data => this . #onPDFJSParseDataReady( data ) ) ;
95+ this . #PDFJS. on ( "pdfjs_parseDataError" , err => this . #onPDFJSParserDataError( err ) ) ;
7196
7297 //v1.3.0 the following Readable Stream-like events are replacement for the top two custom events
7398 this . #PDFJS. on ( "readable" , meta => this . emit ( "readable" , meta ) ) ;
7499 this . #PDFJS. on ( "data" , data => this . emit ( "data" , data ) ) ;
75100 this . #PDFJS. on ( "error" , err => this . #onPDFJSParserDataError( err ) ) ;
101+
102+ this . #PDFJS. parsePDFData ( buffer || PDFParser . #binBuffer[ this . binBufferKey ] , this . #password) ;
103+ }
76104
77- this . #PDFJS. parsePDFData ( buffer || PDFParser . #binBuffer[ this . binBufferKey ] , this . #password) ;
78- }
79-
80- #processBinaryCache( ) {
105+ /**
106+ * @private
107+ * @returns {boolean }
108+ */
109+ #processBinaryCache( ) {
81110 if ( this . binBufferKey in PDFParser . #binBuffer) {
82111 this . #startParsingPDF( ) ;
83112 return true ;
84113 }
114+
115+ const allKeys = Object . keys ( PDFParser . #binBuffer) ;
116+ if ( allKeys . length > PDFParser . #maxBinBufferCount) {
117+ const idx = this . id % PDFParser . #maxBinBufferCount;
118+ const key = allKeys [ idx ] ;
119+ PDFParser . #binBuffer[ key ] = null ;
120+ delete PDFParser . #binBuffer[ key ] ;
85121
86- const allKeys = Object . keys ( PDFParser . #binBuffer) ;
87- if ( allKeys . length > PDFParser . #maxBinBufferCount) {
88- const idx = this . id % PDFParser . #maxBinBufferCount;
89- const key = allKeys [ idx ] ;
90- PDFParser . #binBuffer[ key ] = null ;
91- delete PDFParser . #binBuffer[ key ] ;
92-
93- nodeUtil . p2jinfo ( "re-cycled cache for " + key ) ;
94- }
122+ nodeUtil . p2jinfo ( "re-cycled cache for " + key ) ;
123+ }
95124
96125 return false ;
97126 }
98127
99- //public getter
128+ /**
129+ * Getter for #data
130+ * @returns {object|null } Data
131+ */
100132 get data ( ) { return this . #data; }
133+
134+ /**
135+ * Getter for binBufferKey
136+ * @returns {string } The binBufferKey
137+ */
101138 get binBufferKey ( ) { return this . #pdfFilePath + this . #pdfFileMTime; }
102-
103- //public APIs
139+
140+ /**
141+ * Creates a parser stream
142+ * @returns {ParserStream } A new parser stream
143+ */
104144 createParserStream ( ) {
105- return new ParserStream ( this , { objectMode : true , bufferSize : 64 * 1024 } ) ;
145+ return new ParserStream ( this , { objectMode : true , bufferSize : 64 * 1024 } ) ;
106146 }
107147
148+ /**
149+ * Asynchronously load a PDF from a file path.
150+ * @param {string } pdfFilePath - Path of the PDF file
151+ * @param {number } verbosity - Verbosity level
152+ */
108153 async loadPDF ( pdfFilePath , verbosity ) {
109154 nodeUtil . verbosity ( verbosity || 0 ) ;
110155 nodeUtil . p2jinfo ( "about to load PDF file " + pdfFilePath ) ;
@@ -130,20 +175,55 @@ export default class PDFParser extends EventEmitter { // inherit from event emit
130175 }
131176 }
132177
133- // Introduce a way to directly process buffers without the need to write it to a temporary file
134- parseBuffer ( pdfBuffer ) {
178+ /**
179+ * Parse PDF buffer.
180+ * @param {Buffer } pdfBuffer - PDF buffer
181+ * @param {number } verbosity - Verbosity level
182+ */
183+ parseBuffer ( pdfBuffer , verbosity ) {
184+ nodeUtil . verbosity ( verbosity || 0 ) ;
135185 this . #startParsingPDF( pdfBuffer ) ;
136186 }
137187
138- getRawTextContent ( ) { return this . #PDFJS. getRawTextContent ( ) ; }
139- getRawTextContentStream ( ) { return ParserStream . createContentStream ( this . getRawTextContent ( ) ) ; }
140-
141- getAllFieldsTypes ( ) { return this . #PDFJS. getAllFieldsTypes ( ) ; } ;
142- getAllFieldsTypesStream ( ) { return ParserStream . createContentStream ( this . getAllFieldsTypes ( ) ) ; }
143-
144- getMergedTextBlocksIfNeeded ( ) { return this . #PDFJS. getMergedTextBlocksIfNeeded ( ) ; }
145- getMergedTextBlocksStream ( ) { return ParserStream . createContentStream ( this . getMergedTextBlocksIfNeeded ( ) ) }
146-
188+ /**
189+ * Retrieve raw text content from PDF.
190+ * @returns {string } Raw text content
191+ */
192+ getRawTextContent ( ) { return this . #PDFJS. getRawTextContent ( ) ; }
193+
194+ /**
195+ * Retrieve raw text content stream.
196+ * @returns {Stream } Raw text content stream
197+ */
198+ getRawTextContentStream ( ) { return ParserStream . createContentStream ( this . getRawTextContent ( ) ) ; }
199+
200+ /**
201+ * Retrieve all field types.
202+ * @returns {object[] } All field types
203+ */
204+ getAllFieldsTypes ( ) { return this . #PDFJS. getAllFieldsTypes ( ) ; }
205+
206+ /**
207+ * Retrieve all field types stream.
208+ * @returns {Stream } All field types stream
209+ */
210+ getAllFieldsTypesStream ( ) { return ParserStream . createContentStream ( this . getAllFieldsTypes ( ) ) ; }
211+
212+ /**
213+ * Retrieve merged text blocks if needed.
214+ * @returns {object } Merged text blocks
215+ */
216+ getMergedTextBlocksIfNeeded ( ) { return this . #PDFJS. getMergedTextBlocksIfNeeded ( ) ; }
217+
218+ /**
219+ * Retrieve merged text blocks stream.
220+ * @returns {Stream } Merged text blocks stream
221+ */
222+ getMergedTextBlocksStream ( ) { return ParserStream . createContentStream ( this . getMergedTextBlocksIfNeeded ( ) ) }
223+
224+ /**
225+ * Destroy the PDFParser instance.
226+ */
147227 destroy ( ) { // invoked with stream transform process
148228 super . removeAllListeners ( ) ;
149229
0 commit comments