@@ -7,104 +7,147 @@ import PDFJS from "./lib/pdf.js";
77import { ParserStream } from "./lib/parserstream.js" ;
88import { kColors , kFontFaces , kFontStyles } from "./lib/pdfconst.js" ;
99
10- export default class PDFParser extends EventEmitter { // inherit from event emitter
11- //public static
12- static get colorDict ( ) { return kColors ; }
10+ /**
11+ * Class representing a PDF Parser.
12+ * @extends EventEmitter
13+ */
14+ export default class PDFParser extends EventEmitter {
15+ /**
16+ * Static method to retrieve color dictionary.
17+ * @returns {object } Color dictionary
18+ */
19+ static get colorDict ( ) { return kColors ; }
20+
21+ /**
22+ * Static method to retrieve font face dictionary.
23+ * @returns {object } Font face dictionary
24+ */
1325 static get fontFaceDict ( ) { return kFontFaces ; }
26+
27+ /**
28+ * Static method to retrieve font style dictionary.
29+ * @returns {object } Font style dictionary
30+ */
1431 static get fontStyleDict ( ) { return kFontStyles ; }
1532
16- //private static
1733 static #maxBinBufferCount = 10 ;
1834 static #binBuffer = { } ;
1935
20- //private
2136 #password = "" ;
22-
23- #context = null ; // service context object, only used in Web Service project; null in command line
24-
25- #pdfFilePath = null ; //current PDF file to load and parse, null means loading/parsing not started
26- #pdfFileMTime = null ; // last time the current pdf was modified, used to recognize changes and ignore cache
27- #data = null ; //if file read success, data is PDF content; if failed, data is "err" object
28- #PDFJS = null ; //will be initialized in constructor
29- #processFieldInfoXML = false ; //disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
30-
31- // constructor
37+ #context = null ; // service context object, only used in Web Service project; null in command line
38+ #pdfFilePath = null ; // current PDF file to load and parse; null means loading/parsing has not started
39+ #pdfFileMTime = null ; // last time the current PDF was modified, used to recognize changes and ignore the cache
40+ #data = null ; // if the file read succeeds, data is the PDF content; if it fails, data is the "err" object
41+ #PDFJS = null ; //will be initialized in constructor
42+ #processFieldInfoXML = false ; //disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
43+
44+ /**
45+ * PDFParser constructor.
46+ * @param {object } context - The context object (only used in Web Service project); null in command line
47+ * @param {boolean } needRawText - Whether raw text is needed or not
48+ * @param {string } password - The password for PDF file
49+ * @info Private (#-prefixed) methods are invoked directly on the instance, e.g. this.#startParsingPDF(buffer)
50+ */
3251 constructor ( context , needRawText , password ) {
33- //call constructor for super class
3452 super ( ) ;
35-
36- // private
37- // service context object, only used in Web Service project; null in command line
3853 this . #context = context ;
39-
40- this . #pdfFilePath = null ; //current PDF file to load and parse, null means loading/parsing not started
41- this . #pdfFileMTime = null ; // last time the current pdf was modified, used to recognize changes and ignore cache
42- this . #data = null ; //if file read success, data is PDF content; if failed, data is "err" object
54+ this . #pdfFilePath = null ; // current PDF file to load and parse; null means loading/parsing has not started
55+ this . #pdfFileMTime = null ; // last time the current PDF was modified, used to recognize changes and ignore the cache
56+ this . #data = null ; // if the file read succeeds, data is the PDF content; if it fails, data is the "err" object
4357 this . #processFieldInfoXML = false ; //disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
4458
4559 this . #PDFJS = new PDFJS ( needRawText ) ;
4660 this . #password = password ;
47- }
48-
49- //private methods, needs to invoked by [funcName].call(this, ...)
50- #onPDFJSParseDataReady( data ) {
51- if ( ! data ) { //v1.1.2: data===null means end of parsed data
52- nodeUtil . p2jinfo ( "PDF parsing completed." ) ;
53- this . emit ( "pdfParser_dataReady" , this . #data) ;
54- }
55- else {
56- this . #data = { ...this . #data, ...data } ;
57- }
58- }
61+ }
5962
60- #onPDFJSParserDataError( err ) {
61- this . #data = null ;
62- this . emit ( "pdfParser_dataError" , { "parserError" : err } ) ;
63- // this.emit("error", err);
64- }
63+ /**
64+ * @private
65+ * @param {object } data - The parsed data
66+ */
67+ #onPDFJSParseDataReady( data ) {
68+ if ( ! data ) {
69+ nodeUtil . p2jinfo ( "PDF parsing completed." ) ;
70+ this . emit ( "pdfParser_dataReady" , this . #data) ;
71+ }
72+ else {
73+ this . #data = { ...this . #data, ...data } ;
74+ }
75+ }
6576
66- #startParsingPDF( buffer ) {
67- this . #data = { } ;
77+ /**
78+ * @private
79+ * @param {Error } err - The error object
80+ */
81+ #onPDFJSParserDataError( err ) {
82+ this . #data = null ;
83+ this . emit ( "pdfParser_dataError" , { "parserError" : err } ) ;
84+ }
6885
69- this . #PDFJS. on ( "pdfjs_parseDataReady" , data => this . #onPDFJSParseDataReady( data ) ) ;
70- this . #PDFJS. on ( "pdfjs_parseDataError" , err => this . #onPDFJSParserDataError( err ) ) ;
86+ /**
87+ * @private
88+ * @param {Buffer } buffer - The PDF buffer
89+ */
90+ #startParsingPDF( buffer ) {
91+ this . #data = { } ;
92+ this . #PDFJS. on ( "pdfjs_parseDataReady" , data => this . #onPDFJSParseDataReady( data ) ) ;
93+ this . #PDFJS. on ( "pdfjs_parseDataError" , err => this . #onPDFJSParserDataError( err ) ) ;
7194
7295 //v1.3.0 the following Readable Stream-like events are replacement for the top two custom events
7396 this . #PDFJS. on ( "readable" , meta => this . emit ( "readable" , meta ) ) ;
7497 this . #PDFJS. on ( "data" , data => this . emit ( "data" , data ) ) ;
7598 this . #PDFJS. on ( "error" , err => this . #onPDFJSParserDataError( err ) ) ;
99+
100+ this . #PDFJS. parsePDFData ( buffer || PDFParser . #binBuffer[ this . binBufferKey ] , this . #password) ;
101+ }
76102
77- this . #PDFJS. parsePDFData ( buffer || PDFParser . #binBuffer[ this . binBufferKey ] , this . #password) ;
78- }
79-
80- #processBinaryCache( ) {
103+ /**
104+ * @private
105+ * @returns {boolean }
106+ */
107+ #processBinaryCache( ) {
81108 if ( this . binBufferKey in PDFParser . #binBuffer) {
82109 this . #startParsingPDF( ) ;
83110 return true ;
84111 }
112+
113+ const allKeys = Object . keys ( PDFParser . #binBuffer) ;
114+ if ( allKeys . length > PDFParser . #maxBinBufferCount) {
115+ const idx = this . id % PDFParser . #maxBinBufferCount;
116+ const key = allKeys [ idx ] ;
117+ PDFParser . #binBuffer[ key ] = null ;
118+ delete PDFParser . #binBuffer[ key ] ;
85119
86- const allKeys = Object . keys ( PDFParser . #binBuffer) ;
87- if ( allKeys . length > PDFParser . #maxBinBufferCount) {
88- const idx = this . id % PDFParser . #maxBinBufferCount;
89- const key = allKeys [ idx ] ;
90- PDFParser . #binBuffer[ key ] = null ;
91- delete PDFParser . #binBuffer[ key ] ;
92-
93- nodeUtil . p2jinfo ( "re-cycled cache for " + key ) ;
94- }
120+ nodeUtil . p2jinfo ( "re-cycled cache for " + key ) ;
121+ }
95122
96123 return false ;
97124 }
98125
99- //public getter
126+ /**
127+ * Getter for #data
128+ * @returns {object|null } Data
129+ */
100130 get data ( ) { return this . #data; }
131+
132+ /**
133+ * Getter for binBufferKey
134+ * @returns {string } The binBufferKey
135+ */
101136 get binBufferKey ( ) { return this . #pdfFilePath + this . #pdfFileMTime; }
102-
103- //public APIs
137+
138+ /**
139+ * Creates a parser stream
140+ * @returns {ParserStream } A new parser stream
141+ */
104142 createParserStream ( ) {
105- return new ParserStream ( this , { objectMode : true , bufferSize : 64 * 1024 } ) ;
143+ return new ParserStream ( this , { objectMode : true , bufferSize : 64 * 1024 } ) ;
106144 }
107145
146+ /**
147+ * Asynchronously load a PDF from a file path.
148+ * @param {string } pdfFilePath - Path of the PDF file
149+ * @param {number } verbosity - Verbosity level
150+ */
108151 async loadPDF ( pdfFilePath , verbosity ) {
109152 nodeUtil . verbosity ( verbosity || 0 ) ;
110153 nodeUtil . p2jinfo ( "about to load PDF file " + pdfFilePath ) ;
@@ -130,21 +173,55 @@ export default class PDFParser extends EventEmitter { // inherit from event emit
130173 }
131174 }
132175
133- // Introduce a way to directly process buffers without the need to write it to a temporary file
176+ /**
177+ * Parse a PDF directly from an in-memory buffer, without needing to write it to a temporary file first.
178+ * @param {Buffer } pdfBuffer - PDF buffer
179+ * @param {number } verbosity - Verbosity level
180+ */
134181 parseBuffer ( pdfBuffer , verbosity ) {
135182 nodeUtil . verbosity ( verbosity || 0 ) ;
136183 this . #startParsingPDF( pdfBuffer ) ;
137184 }
138185
139- getRawTextContent ( ) { return this . #PDFJS. getRawTextContent ( ) ; }
140- getRawTextContentStream ( ) { return ParserStream . createContentStream ( this . getRawTextContent ( ) ) ; }
141-
142- getAllFieldsTypes ( ) { return this . #PDFJS. getAllFieldsTypes ( ) ; } ;
143- getAllFieldsTypesStream ( ) { return ParserStream . createContentStream ( this . getAllFieldsTypes ( ) ) ; }
144-
145- getMergedTextBlocksIfNeeded ( ) { return this . #PDFJS. getMergedTextBlocksIfNeeded ( ) ; }
146- getMergedTextBlocksStream ( ) { return ParserStream . createContentStream ( this . getMergedTextBlocksIfNeeded ( ) ) }
147-
186+ /**
187+ * Retrieve raw text content from PDF.
188+ * @returns {string } Raw text content
189+ */
190+ getRawTextContent ( ) { return this . #PDFJS. getRawTextContent ( ) ; }
191+
192+ /**
193+ * Retrieve raw text content stream.
194+ * @returns {Stream } Raw text content stream
195+ */
196+ getRawTextContentStream ( ) { return ParserStream . createContentStream ( this . getRawTextContent ( ) ) ; }
197+
198+ /**
199+ * Retrieve all field types.
200+ * @returns {object[] } All field types
201+ */
202+ getAllFieldsTypes ( ) { return this . #PDFJS. getAllFieldsTypes ( ) ; }
203+
204+ /**
205+ * Retrieve all field types stream.
206+ * @returns {Stream } All field types stream
207+ */
208+ getAllFieldsTypesStream ( ) { return ParserStream . createContentStream ( this . getAllFieldsTypes ( ) ) ; }
209+
210+ /**
211+ * Retrieve merged text blocks if needed.
212+ * @returns {object } Merged text blocks
213+ */
214+ getMergedTextBlocksIfNeeded ( ) { return this . #PDFJS. getMergedTextBlocksIfNeeded ( ) ; }
215+
216+ /**
217+ * Retrieve merged text blocks stream.
218+ * @returns {Stream } Merged text blocks stream
219+ */
220+ getMergedTextBlocksStream ( ) { return ParserStream . createContentStream ( this . getMergedTextBlocksIfNeeded ( ) ) }
221+
222+ /**
223+ * Destroy the PDFParser instance.
224+ */
148225 destroy ( ) { // invoked with stream transform process
149226 super . removeAllListeners ( ) ;
150227
0 commit comments