From 23588b52b70b6a4ad90850c27d42c3376fad597c Mon Sep 17 00:00:00 2001
From: resation
Date: Fri, 20 Feb 2026 20:39:41 +0300
Subject: [PATCH] fix: stabilize GPU OCR on 8GB and correct tokenizer wiring

---
 backends/cuda_backend.py | 71 +++++++++++++++++++++++++++++-----------
 docker-compose.gpu.yml   | 20 +++++++----
 web_service_gpu.py       | 12 +++----
 3 files changed, 71 insertions(+), 32 deletions(-)

diff --git a/backends/cuda_backend.py b/backends/cuda_backend.py
index 9ffd47b..f6759e0 100644
--- a/backends/cuda_backend.py
+++ b/backends/cuda_backend.py
@@ -5,9 +5,9 @@
 
 # DeepSeek-OCR-2 default parameters (from official config)
 DEFAULT_MODEL_PATH = "deepseek-ai/DeepSeek-OCR-2"
-DEFAULT_BASE_SIZE = 1024
-DEFAULT_IMAGE_SIZE = 768
-DEFAULT_CROP_MODE = True
+DEFAULT_BASE_SIZE = int(os.environ.get("OCR_BASE_SIZE", "1024"))
+DEFAULT_IMAGE_SIZE = int(os.environ.get("OCR_IMAGE_SIZE", "768"))
+DEFAULT_CROP_MODE = os.environ.get("OCR_CROP_MODE", "true").strip().lower() in ("1", "true", "yes", "on")
 
 class CUDABackend:
     def __init__(self, model_path: str = DEFAULT_MODEL_PATH):
@@ -72,22 +72,55 @@ def load_model(self, source: str = "huggingface", timeout: int = 300):
 
     def infer(self, prompt: str, image_path: str, **kwargs) -> str:
         """Run inference on CUDA"""
-        try:
-            result = self.model.infer(
-                tokenizer=self.tokenizer,
-                prompt=prompt,
-                image_file=image_path,
-                output_path='./output',
-                base_size=DEFAULT_BASE_SIZE,
-                image_size=DEFAULT_IMAGE_SIZE,
-                crop_mode=DEFAULT_CROP_MODE,
-                save_results=False,
-                eval_mode=True
-            )
-            return result if result else ""
-        except Exception as e:
-            print(f"❌ Inference failed: {e}")
-            raise
+        fallback_profiles = [
+            (DEFAULT_BASE_SIZE, DEFAULT_IMAGE_SIZE, DEFAULT_CROP_MODE),
+            (DEFAULT_BASE_SIZE, DEFAULT_IMAGE_SIZE, True),
+            (640, 448, True),
+            (576, 384, True),
+            (512, 320, True),
+        ]
+        seen = set()
+        last_error = None
+
+        for base_size, image_size, crop_mode in fallback_profiles:
+            profile = (base_size, image_size, crop_mode)
+            if profile in seen:
+                continue
+            seen.add(profile)
+
+            try:
+                print(
+                    f"🧪 Inference profile: base_size={base_size}, image_size={image_size}, crop_mode={crop_mode}"
+                )
+                result = self.model.infer(
+                    tokenizer=self.tokenizer,
+                    prompt=prompt,
+                    image_file=image_path,
+                    output_path='./output',
+                    base_size=base_size,
+                    image_size=image_size,
+                    crop_mode=crop_mode,
+                    save_results=False,
+                    eval_mode=True
+                )
+                return result if result else ""
+            except Exception as e:
+                last_error = e
+                msg = str(e)
+                # Upstream OCR-2 bug path: param_img is undefined when crop_mode=False.
+                if "param_img" in msg and not crop_mode:
+                    print(f"⚠️ Upstream param_img bug with crop_mode={crop_mode}, retrying with crop_mode=True")
+                    continue
+                if isinstance(e, torch.OutOfMemoryError) or "out of memory" in msg.lower():
+                    print(f"⚠️ CUDA OOM with profile {profile}, trying smaller profile...")
+                    if torch.cuda.is_available():
+                        torch.cuda.empty_cache()
+                    continue
+                print(f"❌ Inference failed: {e}")
+                raise
+
+        print(f"❌ Inference failed after all fallback profiles: {last_error}")
+        raise last_error
 
     @staticmethod
     def is_available() -> bool:
diff --git a/docker-compose.gpu.yml b/docker-compose.gpu.yml
index 864d689..9781158 100644
--- a/docker-compose.gpu.yml
+++ b/docker-compose.gpu.yml
@@ -16,6 +16,10 @@ services:
       - CUDA_VISIBLE_DEVICES=0
       - GPU_IDLE_TIMEOUT=${GPU_IDLE_TIMEOUT:-60}
       - PYTHONUNBUFFERED=1
+      - PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True,max_split_size_mb:128}
+      - OCR_BASE_SIZE=${OCR_BASE_SIZE:-768}
+      - OCR_IMAGE_SIZE=${OCR_IMAGE_SIZE:-448}
+      - OCR_CROP_MODE=${OCR_CROP_MODE:-true}
     volumes:
       - ./models:/root/.cache/huggingface
 
@@ -23,7 +27,14 @@
     shm_size: "8g"
 
     restart: unless-stopped
-
+
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:8001/health"]
       interval: 30s
@@ -33,10 +44,5 @@
 
     mem_limit: 32g
     memswap_limit: 32g
-
-    networks:
-      - ocr-network
-networks:
-  ocr-network:
-    driver: bridge
+    network_mode: bridge
 
diff --git a/web_service_gpu.py b/web_service_gpu.py
index 3999c41..f7f8e81 100644
--- a/web_service_gpu.py
+++ b/web_service_gpu.py
@@ -230,19 +230,19 @@ async def ocr_endpoint(
 
     # Step 1: lazy-load the model
     if gpu_manager:
-        model, processor = gpu_manager.get_model(load_func=load_model_func)
+        model, tokenizer = gpu_manager.get_model(load_func=load_model_func)
     else:
         # CPU mode
         from backends.cpu_backend import CPUBackend
         backend = CPUBackend()
         backend.load_model()
-        model, processor = backend.model, backend.processor
+        model, tokenizer = backend.model, backend.tokenizer
 
     # Step 2: run inference
     from backends.cuda_backend import CUDABackend
     backend = CUDABackend()
     backend.model = model
-    backend.processor = processor
+    backend.tokenizer = tokenizer
     text = backend.infer(prompt=prompt, image_path=tmp_file)
 
     # Step 3: unload immediately (critical!)
@@ -368,17 +368,17 @@ async def ocr_pdf_endpoint(
 
     # Get the model (loaded only once)
     if gpu_manager:
-        model, processor = gpu_manager.get_model(load_func=load_model_func)
+        model, tokenizer = gpu_manager.get_model(load_func=load_model_func)
     else:
         from backends.cpu_backend import CPUBackend
         backend = CPUBackend()
         backend.load_model()
-        model, processor = backend.model, backend.processor
+        model, tokenizer = backend.model, backend.tokenizer
 
     from backends.cuda_backend import CUDABackend
     backend = CUDABackend()
     backend.model = model
-    backend.processor = processor
+    backend.tokenizer = tokenizer
 
     # Build the prompt
     prompt = build_prompt(prompt_type, custom_prompt, find_term)
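
Reviewer note (not part of the patch): a minimal sketch of how the new fallback-profile
loop in CUDABackend.infer() can be exercised without a GPU. It assumes the patched
backends/cuda_backend.py is importable, that it has os and torch available at module
level (the new env-var defaults and OOM handling both rely on that), and that
CUDABackend() constructs without loading weights, as web_service_gpu.py already relies
on. The stub model, prompt string, and image path below are illustrative only.

    # sketch_fallback.py -- illustrative only, not part of this patch
    from backends.cuda_backend import CUDABackend

    class StubModel:
        """Simulates a model that runs out of memory for profiles above 640 px."""
        def infer(self, tokenizer, prompt, image_file, output_path,
                  base_size, image_size, crop_mode, save_results, eval_mode):
            if base_size > 640:
                # Message matches the "out of memory" substring check in the fallback loop.
                raise RuntimeError("CUDA out of memory")
            return f"ok at base_size={base_size}, image_size={image_size}"

    backend = CUDABackend()
    backend.model = StubModel()   # same manual wiring web_service_gpu.py performs
    backend.tokenizer = None      # the stub never touches the tokenizer
    print(backend.infer(prompt="<image>\nFree OCR.", image_path="sample.png"))
    # Expected: the default 1024/768 profile is skipped after the simulated OOM
    # and the loop settles on the 640/448 profile.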