@@ -33,6 +33,7 @@ const { gfm } = require('turndown-plugin-gfm');
// Crawl targets: TCB (product 876) doc pages on Tencent Cloud.
const BASE_URL = 'https://cloud.tencent.com';
const COMMON_PARAMS_URL = 'https://cloud.tencent.com/document/api/876/34812';
const API_OVERVIEW_URL = 'https://cloud.tencent.com/document/api/876/34809';
const DEPS_INDEX_URL = 'https://cloud.tencent.com/document/api/876/34808';
const CONCURRENCY = 2; // keep concurrency low to avoid triggering rate limiting
const REQUEST_DELAY_MS = 500; // delay between requests (milliseconds)
const MAX_RETRIES = 3; // maximum retry attempts per request
@@ -164,10 +165,32 @@ function extractApiLinks(content: string): string[] {
164165 return links ;
165166}
166167
168+ // Extract cross-product API links from the deps index page (product/1003, product/583, api/436, etc.)
169+ function extractDepsLinks ( content : string ) : string [ ] {
170+ const links : string [ ] = [ ] ;
171+ const seen = new Set < string > ( ) ;
172+ // Match full URLs: https://cloud.tencent.com/document/(api|product)/{productId}/{docId}
173+ const re = / h t t p s : \/ \/ c l o u d \. t e n c e n t \. c o m ( \/ d o c u m e n t \/ ( a p i | p r o d u c t ) \/ ( \d + ) \/ ( \d + ) ) / g;
174+ let m : RegExpExecArray | null ;
175+ while ( ( m = re . exec ( content ) ) !== null ) {
176+ const productId = m [ 3 ] ;
177+ // Skip TCB (876) links — those are handled by the main crawler
178+ if ( productId === '876' ) continue ;
179+ const url = `${ BASE_URL } ${ m [ 1 ] } ` ;
180+ if ( ! seen . has ( url ) ) {
181+ seen . add ( url ) ;
182+ links . push ( url ) ;
183+ }
184+ }
185+ return links ;
186+ }
187+
167188function urlToFilename ( url : string , title : string ) : string {
168- const docId = url . match ( / \/ ( \d + ) $ / ) ?. [ 1 ] || 'unknown' ;
189+ // Extract product id + doc id for namespacing, e.g. "1003-71660" or "876-34812"
190+ const productMatch = url . match ( / \/ ( a p i | p r o d u c t ) \/ ( \d + ) \/ ( \d + ) / ) ;
191+ const prefix = productMatch ? `${ productMatch [ 2 ] } -${ productMatch [ 3 ] } ` : url . match ( / \/ ( \d + ) $ / ) ?. [ 1 ] || 'unknown' ;
169192 const cleanTitle = title . replace ( / [ ^ \w \u4e00 - \u9fa5 ] / g, '-' ) . replace ( / - + / g, '-' ) . replace ( / ^ - | - $ / g, '' ) . slice ( 0 , 50 ) ;
170- return `${ docId } -${ cleanTitle } .md` ;
193+ return `${ prefix } -${ cleanTitle } .md` ;
171194}
172195
173196async function main ( ) {
@@ -254,6 +277,48 @@ async function main() {
254277 results . push ( ...validResults ) ;
255278 console . log ( ` ⏱️ Fetched in ${ ( ( Date . now ( ) - startTime ) / 1000 ) . toFixed ( 1 ) } s` ) ;
256279
280+ // --- Deps docs ---
281+ console . log ( '\n📄 Fetching deps index page...' ) ;
282+ let depsLinks : string [ ] = [ ] ;
283+ try {
284+ const depsIndex = await fetchMarkdownWithRetry ( DEPS_INDEX_URL ) ;
285+ console . log ( ` ✅ ${ depsIndex . title } ` ) ;
286+ depsLinks = extractDepsLinks ( depsIndex . content ) ;
287+ console . log ( ` 📋 Found ${ depsLinks . length } deps API links` ) ;
288+ } catch ( err ) {
289+ console . error ( ` ❌ Failed to fetch deps index: ${ err } ` ) ;
290+ failedCount ++ ;
291+ }
292+
293+ if ( depsLinks . length > 0 ) {
294+ console . log ( `\n📄 Fetching ${ depsLinks . length } deps API docs...` ) ;
295+ const depsStartTime = Date . now ( ) ;
296+ let depsCompleted = 0 ;
297+
298+ const depsTasks = depsLinks . map ( ( url ) =>
299+ limit ( async ( ) => {
300+ try {
301+ const doc = await fetchMarkdownWithRetry ( url ) ;
302+ depsCompleted ++ ;
303+ console . log ( ` [${ depsCompleted } /${ depsLinks . length } ] ✅ ${ doc . title } ` ) ;
304+ return { success : true , result : { url, title : doc . title , content : doc . content , filename : urlToFilename ( url , doc . title ) } as CrawlResult } ;
305+ } catch ( err ) {
306+ depsCompleted ++ ;
307+ console . error ( ` [${ depsCompleted } /${ depsLinks . length } ] ❌ ${ url } : ${ err } ` ) ;
308+ return { success : false , result : null } ;
309+ }
310+ } )
311+ ) ;
312+
313+ const depsResults = await Promise . all ( depsTasks ) ;
314+ const validDepsResults = depsResults . filter ( ( r ) => r . success && r . result !== null ) . map ( ( r ) => r . result as CrawlResult ) ;
315+ const depsFailed = depsResults . filter ( ( r ) => ! r . success ) . length ;
316+ failedCount += depsFailed ;
317+
318+ results . push ( ...validDepsResults ) ;
319+ console . log ( ` ⏱️ Fetched in ${ ( ( Date . now ( ) - depsStartTime ) / 1000 ) . toFixed ( 1 ) } s` ) ;
320+ }
321+
257322 console . log ( `\n💾 Saving ${ results . length } documents...` ) ;
258323 results . forEach ( ( r ) => writeFileSync ( join ( outputDir , r . filename ) , r . content ) ) ;
259324
0 commit comments