"""Deployment helper for Qwen models on Vast.ai."""

import logging
import time
from typing import Dict


class ModelDeployer:
    """Helper class for deploying Qwen models.

    Builds vLLM OpenAI-compatible server commands from model configuration
    dicts and prints step-by-step deployment instructions to stdout.
    """

    def __init__(self):
        """Set up the ``ModelDeployer`` logger with a stream handler."""
        self.logger = logging.getLogger("ModelDeployer")
        self.logger.setLevel(logging.INFO)
        # The named logger is shared process-wide; only attach a handler the
        # first time so repeated instantiation does not duplicate output.
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                "%(asctime)s - %(levelname)s - %(message)s"
            )
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    def generate_deployment_command(self, model_config: Dict) -> str:
        """Generate the vLLM deployment command for one model.

        Args:
            model_config: Mapping with the required keys ``"name"``,
                ``"gpus"`` and ``"port"``. If it has a truthy
                ``"quantization"`` entry, a ``--quantization`` flag is
                appended.

        Returns:
            A shell command with one argument per line. Every line except
            the last ends with a backslash immediately followed by a newline,
            so the text can be pasted into a POSIX shell as a single command.

        Raises:
            KeyError: If ``name``, ``gpus`` or ``port`` is missing.
        """
        cmd_parts = [
            "python -m vllm.entrypoints.openai.api_server",
            f"--model {model_config['name']}",
            f"--tensor-parallel-size {model_config['gpus']}",
            f"--port {model_config['port']}",
            "--max-model-len 4096",
        ]

        if model_config.get("quantization"):
            cmd_parts.append(f"--quantization {model_config['quantization']}")

        # A backslash must be the last character on the line to act as a line
        # continuation. Joining with backslash-space on one line would make
        # the shell treat the space as part of the next argument.
        return " \\\n    ".join(cmd_parts)

    def print_deployment_instructions(self):
        """Print deployment instructions for both Qwen models to stdout.

        Reads ``ModelConfig.QWEN3_235B`` and ``ModelConfig.QWEN25_72B`` from
        ``pipeline.common_defs`` for the hourly cost, the deployment command
        and the health-check port.
        """
        # Imported at call time rather than module level.
        # NOTE(review): presumably to avoid an import cycle or to keep this
        # module importable without the pipeline package - confirm.
        from pipeline.common_defs import ModelConfig

        primary = ModelConfig.QWEN3_235B
        secondary = ModelConfig.QWEN25_72B
        heavy_rule = "=" * 80
        light_rule = "-" * 80

        print("\n" + heavy_rule)
        print("QWEN MODEL DEPLOYMENT INSTRUCTIONS")
        print(heavy_rule)

        print("\n1. RENT GPUS ON VAST.AI")
        print(light_rule)
        print("\nFor Qwen 3 235B (Primary):")
        print(" - Select: 4 × A100 80GB PCIe")
        print(" - Image: pytorch/pytorch:latest")
        print(f" - Cost: ${primary['cost_per_hour']}/hr")
        print("\nFor Qwen 2.5 72B (Secondary):")
        print(" - Select: 2 × A100 80GB PCIe")
        print(" - Image: pytorch/pytorch:latest")
        print(f" - Cost: ${secondary['cost_per_hour']}/hr")

        print("\n2. INSTALL DEPENDENCIES")
        print(light_rule)
        print("pip install vllm transformers accelerate")

        print("\n3. DEPLOY QWEN 3 235B (Primary)")
        print(light_rule)
        print(self.generate_deployment_command(primary))

        print("\n4. DEPLOY QWEN 2.5 72B (Secondary)")
        print(light_rule)
        print(self.generate_deployment_command(secondary))

        print("\n5. VERIFY DEPLOYMENT")
        print(light_rule)
        # Use the same port the deploy command was generated with, so the
        # health check cannot drift from the configured server port.
        print(f"curl http://localhost:{primary['port']}/health # Qwen 3")
        print(f"curl http://localhost:{secondary['port']}/health # Qwen 2.5")

        print("\n" + heavy_rule)


if __name__ == "__main__":
    deployer = ModelDeployer()
    deployer.print_deployment_instructions()