Skip to content

Commit 734a514

Browse files
committed
fix: crawl depend api doc
1 parent eca11ae commit 734a514

2 files changed

Lines changed: 69 additions & 3 deletions

File tree

scripts/crawl-tcb-docs.ts

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ const { gfm } = require('turndown-plugin-gfm');
3333
const BASE_URL = 'https://cloud.tencent.com';
3434
const COMMON_PARAMS_URL = 'https://cloud.tencent.com/document/api/876/34812';
3535
const API_OVERVIEW_URL = 'https://cloud.tencent.com/document/api/876/34809';
36+
// Index page that main() fetches and passes to extractDepsLinks to discover cross-product API doc links.
const DEPS_INDEX_URL = 'https://cloud.tencent.com/document/api/876/34808';
3637
const CONCURRENCY = 2; // Keep concurrency low to avoid triggering rate limiting
3738
const REQUEST_DELAY_MS = 500; // Delay between requests (milliseconds)
3839
const MAX_RETRIES = 3; // Maximum number of retry attempts
@@ -164,10 +165,32 @@ function extractApiLinks(content: string): string[] {
164165
return links;
165166
}
166167

168+
/**
 * Extract cross-product API doc links from the deps index page content
 * (e.g. product/1003, product/583, api/436).
 *
 * Only absolute links to cloud.tencent.com of the form
 * `/document/(api|product)/{productId}/{docId}` are considered. Links whose
 * productId is 876 (TCB) are skipped because the main crawler handles them.
 *
 * @param content Text of the deps index page to scan for links.
 * @returns Unique URLs in order of first appearance, each rebuilt on BASE_URL
 *          (so `http://` links are normalised and dedupe with `https://` ones;
 *          anything after the doc id, such as a `#fragment`, is dropped).
 */
function extractDepsLinks(content: string): string[] {
  // Group 1: document path (re-prefixed with BASE_URL below); group 2: product id.
  const linkPattern = /https?:\/\/cloud\.tencent\.com(\/document\/(?:api|product)\/(\d+)\/\d+)/g;

  // A Set keeps insertion order, so it handles both dedup and ordering.
  const unique = new Set<string>();
  for (const match of content.matchAll(linkPattern)) {
    const [, path, productId] = match;
    // Skip TCB (876) links — those are handled by the main crawler.
    if (productId === '876') continue;
    unique.add(`${BASE_URL}${path}`);
  }
  return [...unique];
}
187+
167188
/**
 * Build the markdown filename for a crawled document.
 *
 * The name is namespaced by product id + doc id taken from the URL, e.g.
 * "1003-71660" or "876-34812". If the URL has no `/(api|product)/{id}/{id}`
 * segment, the trailing numeric path segment is used, else "unknown".
 *
 * @param url   Document URL.
 * @param title Document title; every character other than `\w` or
 *              U+4E00–U+9FA5 becomes "-", runs of "-" collapse, and the result
 *              is capped at 50 characters.
 * @returns `{prefix}-{cleanTitle}.md`, or `{prefix}.md` when nothing usable
 *          remains of the title.
 */
function urlToFilename(url: string, title: string): string {
  const productMatch = url.match(/\/(api|product)\/(\d+)\/(\d+)/);
  const prefix = productMatch
    ? `${productMatch[2]}-${productMatch[3]}`
    : (url.match(/\/(\d+)$/)?.[1] ?? 'unknown');

  const cleanTitle = title
    .replace(/[^\w\u4e00-\u9fa5]/g, '-')
    .replace(/-+/g, '-')
    .replace(/^-|-$/g, '')
    .slice(0, 50)
    // The 50-char cut can land right after a separator; strip it so the name
    // never ends in "-.md".
    .replace(/-$/, '');

  return cleanTitle ? `${prefix}-${cleanTitle}.md` : `${prefix}.md`;
}
172195

173196
async function main() {
@@ -254,6 +277,48 @@ async function main() {
254277
results.push(...validResults);
255278
console.log(` ⏱️ Fetched in ${((Date.now() - startTime) / 1000).toFixed(1)}s`);
256279

280+
// --- Deps docs ---
281+
console.log('\n📄 Fetching deps index page...');
282+
let depsLinks: string[] = [];
283+
try {
284+
const depsIndex = await fetchMarkdownWithRetry(DEPS_INDEX_URL);
285+
console.log(` ✅ ${depsIndex.title}`);
286+
depsLinks = extractDepsLinks(depsIndex.content);
287+
console.log(` 📋 Found ${depsLinks.length} deps API links`);
288+
} catch (err) {
289+
console.error(` ❌ Failed to fetch deps index: ${err}`);
290+
failedCount++;
291+
}
292+
293+
if (depsLinks.length > 0) {
294+
console.log(`\n📄 Fetching ${depsLinks.length} deps API docs...`);
295+
const depsStartTime = Date.now();
296+
let depsCompleted = 0;
297+
298+
const depsTasks = depsLinks.map((url) =>
299+
limit(async () => {
300+
try {
301+
const doc = await fetchMarkdownWithRetry(url);
302+
depsCompleted++;
303+
console.log(` [${depsCompleted}/${depsLinks.length}] ✅ ${doc.title}`);
304+
return { success: true, result: { url, title: doc.title, content: doc.content, filename: urlToFilename(url, doc.title) } as CrawlResult };
305+
} catch (err) {
306+
depsCompleted++;
307+
console.error(` [${depsCompleted}/${depsLinks.length}] ❌ ${url}: ${err}`);
308+
return { success: false, result: null };
309+
}
310+
})
311+
);
312+
313+
const depsResults = await Promise.all(depsTasks);
314+
const validDepsResults = depsResults.filter((r) => r.success && r.result !== null).map((r) => r.result as CrawlResult);
315+
const depsFailed = depsResults.filter((r) => !r.success).length;
316+
failedCount += depsFailed;
317+
318+
results.push(...validDepsResults);
319+
console.log(` ⏱️ Fetched in ${((Date.now() - depsStartTime) / 1000).toFixed(1)}s`);
320+
}
321+
257322
console.log(`\n💾 Saving ${results.length} documents...`);
258323
results.forEach((r) => writeFileSync(join(outputDir, r.filename), r.content));
259324

scripts/generate-actionlist.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,10 @@ async function extractActions(): Promise<string[]> {
2121
// 需要跳过的文件(非具体 API 文档)
2222
const skipFiles = ['README.md', 'API-概览', '公共参数'];
2323

24-
// 读取并过滤文件
24+
// 读取并过滤文件:只处理 TCB (876) 文档,跳过依赖产品文档
2525
const files = fs.readdirSync(REFERENCES_DIR)
2626
.filter(f => f.endsWith('.md'))
27+
.filter(f => f.startsWith('876-'))
2728
.filter(f => !skipFiles.some(skip => f.includes(skip)));
2829

2930
// 并行读取文件并提取 action

0 commit comments

Comments
 (0)