diff --git a/Cargo.lock b/Cargo.lock index 0dd5ba9..e13a7cb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7001,6 +7001,18 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "vllm" +version = "0.1.0" +dependencies = [ + "env_logger", + "harmony", + "harmony_cli", + "k8s-openapi", + "log", + "tokio", +] + [[package]] name = "wait-timeout" version = "0.2.1" diff --git a/Cargo.toml b/Cargo.toml index 8f524d5..996a0f7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ members = [ "adr/agent_discovery/mdns", "brocade", "harmony_agent", - "harmony_agent/deploy", "harmony_node_readiness", "harmony-k8s", + "harmony_agent/deploy", "harmony_node_readiness", "harmony-k8s", "examples/vllm", ] [workspace.package] diff --git a/examples/vllm/Cargo.toml b/examples/vllm/Cargo.toml new file mode 100644 index 0000000..cc61d1d --- /dev/null +++ b/examples/vllm/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "vllm" +edition = "2024" +version.workspace = true +readme.workspace = true +license.workspace = true +publish = false + +[dependencies] +harmony = { path = "../../harmony" } +harmony_cli = { path = "../../harmony_cli" } +k8s-openapi = { workspace = true } +tokio = { workspace = true } +log = { workspace = true } +env_logger = { workspace = true } diff --git a/examples/vllm/src/main.rs b/examples/vllm/src/main.rs new file mode 100644 index 0000000..5045d49 --- /dev/null +++ b/examples/vllm/src/main.rs @@ -0,0 +1,523 @@ +//! vLLM Deployment Example for Qwen3.5-27B-FP8 on NVIDIA RTX 5090 +//! +//! This example deploys vLLM serving Qwen3.5-27B with FP8 quantization, +//! optimized for single RTX 5090 (32GB VRAM) with tool calling support. +//! +//! # Architecture & Memory Constraints +//! +//! **Model Details:** +//! - Parameters: 27B (dense, not sparse/MoE) +//! - Quantization: FP8 (8-bit weights) +//! - Model size: ~27-28GB in memory +//! 
- Native context: 262,144 tokens (will NOT fit in 32GB VRAM) +//! +//! **VRAM Budget for RTX 5090 (32GB):** +//! - Model weights (FP8): ~27GB +//! - Framework overhead: ~1-2GB +//! - KV cache: ~2-3GB (for 16k context) +//! - CUDA context: ~500MB +//! - Temporary buffers: ~500MB +//! - **Total: ~31-33GB** (tight fit, leaves minimal headroom) +//! +//! # OpenShift/OKD Requirements +//! +//! **SCC (Security Context Constraint) Setup:** +//! +//! The official vLLM container runs as root and writes to `/root/.cache/huggingface`. +//! On OpenShift/OKD with the default restricted SCC, containers run as arbitrary UIDs +//! and cannot write to `/root`. For testing, grant the `anyuid` SCC: +//! +//! ```bash +//! # As cluster admin, grant anyuid SCC to the namespace's service account: +//! oc adm policy add-scc-to-user anyuid -z default -n vllm-qwen +//! ``` +//! +//! This allows pods in the `vllm-qwen` namespace to run as root (UID 0). +//! For production, consider building a custom vLLM image that runs as non-root. +//! +//! # Critical Configuration Notes +//! +//! 1. **GPU_MEMORY_UTILIZATION=1.0**: Maximum GPU memory allocation. +//! NEVER decrease this for dense models - CPU offloading destroys performance +//! (100-1000x slower) for models where every parameter is used during inference. +//! +//! 2. **MAX_MODEL_LEN=16384**: Conservative context length that fits in available VRAM. +//! Agentic workflows with long tool call histories will need careful context management. +//! +//! 3. **--language-model-only**: Skips loading the vision encoder, saving ~1-2GB VRAM. +//! Essential for fitting the model in 32GB VRAM. +//! +//! 4. **PVC Size**: 50Gi for HuggingFace cache. Qwen3.5-27B-FP8 is ~30GB. +//! +//! # Performance Expectations +//! +//! - Single token latency: ~50-100ms (no CPU offloading) +//! - With CPU offloading: ~5-50 seconds per token (unusable for real-time inference) +//! - Throughput: ~10-20 tokens/second (single stream, no batching) +//! +//! 
# Next Steps for Production +//! +//! To increase context length: +//! 1. Monitor GPU memory: `kubectl exec -it deployment/qwen3-5-27b -- nvidia-smi dmon -s u` +//! 2. If stable, increase MAX_MODEL_LEN (try 32768, then 65536) +//! 3. If OOM: revert to lower value +//! +//! For full 262k context, consider: +//! - Multi-GPU setup with tensor parallelism (--tensor-parallel-size 8) +//! - Or use a smaller model (Qwen3.5-7B-FP8) + +use std::collections::BTreeMap; + +use harmony::{ + inventory::Inventory, + modules::{ + k8s::resource::K8sResourceScore, + okd::{ + crd::route::{RoutePort, RouteSpec, RouteTargetReference, TLSConfig}, + route::OKDRouteScore, + }, + }, + score::Score, + topology::{K8sAnywhereTopology, TlsRouter}, +}; +use k8s_openapi::{ + api::{ + apps::v1::{Deployment, DeploymentSpec, DeploymentStrategy}, + core::v1::{ + Container, ContainerPort, EmptyDirVolumeSource, EnvVar, EnvVarSource, + HTTPGetAction, PersistentVolumeClaim, PersistentVolumeClaimSpec, + PersistentVolumeClaimVolumeSource, PodSpec, PodTemplateSpec, Probe, + ResourceRequirements, Secret, SecretKeySelector, SecretVolumeSource, Service, + ServicePort, ServiceSpec, Volume, VolumeMount, VolumeResourceRequirements, + }, + }, + apimachinery::pkg::{ + api::resource::Quantity, + apis::meta::v1::{LabelSelector, ObjectMeta}, + util::intstr::IntOrString, + }, + ByteString, +}; +use log::info; + +const NAMESPACE: &str = "vllm-qwen"; +const MODEL_NAME: &str = "Qwen/Qwen3.5-27B-FP8"; +const DEPLOYMENT_NAME: &str = "qwen3-5-27b"; +const SERVICE_NAME: &str = DEPLOYMENT_NAME; +const ROUTE_NAME: &str = DEPLOYMENT_NAME; +const PVC_NAME: &str = "huggingface-cache"; +const SECRET_NAME: &str = "hf-token-secret"; + +const VLLM_IMAGE: &str = "vllm/vllm-openai:latest"; +const SERVICE_PORT: u16 = 8000; +const TARGET_PORT: u16 = 8000; + +/// Maximum context length for the model (in tokens). 
+/// +/// **Impact on VRAM:** +/// - Qwen3.5-27B uses per-token KV cache storage for the context window +/// - Larger context = more KV cache memory required +/// - Approximate KV cache per token: ~32KB for FP8 (very rough estimate) +/// - 16k tokens ≈ 0.5-1GB KV cache +/// - 262k tokens ≈ 8-16GB KV cache (native context length - will NOT fit in 32GB VRAM) +/// +/// **Performance Impact:** +/// - Context length directly impacts memory for storing conversation history +/// - Agentic workflows with long tool call histories benefit from more context +/// - If context > available VRAM, vLLM will OOM and fail to start +/// +/// **Recommendations for RTX 5090 (32GB):** +/// - Start with 16384 (conservative, should work) +/// - If no OOM, try 32768 (better for agentic workflows) +/// - Monitor GPU memory with `nvidia-smi` during operation +const MAX_MODEL_LEN: i64 = 16384; + +/// Fraction of GPU memory to allocate for the model (0.0 to 1.0). +/// +/// **CRITICAL WARNING: This is a dense model!** +/// Qwen3.5-27B-FP8 is NOT a sparse/mixture-of-experts model. All 27B parameters +/// are active during inference. CPU offloading will DESTROY performance. +/// +/// **What this parameter controls:** +/// - Controls how much of GPU memory vLLM pre-allocates for: +/// 1. Model weights (~27GB for FP8 quantization) +/// 2. KV cache for context window +/// 3. Activation buffers for inference +/// 4. 
Runtime overhead
+///
+/// **VRAM Allocation Example:**
+/// - GPU: 32GB RTX 5090
+/// - GPU_MEMORY_UTILIZATION: 1.0
+/// - vLLM will try to use: 32GB * 1.0 = 32GB
+/// - Model weights: ~27-28GB
+/// - Remaining for KV cache + runtime: ~4-5GB
+///
+/// **If set too LOW (e.g., 0.7):**
+/// - vLLM restricts itself to 32GB * 0.7 = 22.4GB
+/// - Model weights alone need ~27GB
+/// - vLLM will OFFLOAD model weights to CPU memory
+/// - Performance: **100-1000x slower** (single token generation can take seconds instead of milliseconds)
+/// - This is catastrophic for a dense model where every layer needs all parameters
+///
+/// **If the GPU is shared with other processes:**
+/// - A setting near 1.0 lets vLLM try to allocate nearly all GPU memory
+/// - Risk: CUDA OOM if any other process needs GPU memory
+/// - Risk: KV cache allocation fails during inference
+/// - System instability
+///
+/// **Current Setting: 1.0**
+/// - Allocates the full 32GB to vLLM; the GPU must be dedicated to this pod
+/// - Needed headroom: model weights (~27-28GB) + KV cache (~2-3GB) + CUDA
+///   context and temporary buffers (~1GB) barely fit in 32GB
+/// - Any other process touching this GPU will cause a CUDA OOM
+///
+/// **How to tune:**
+/// 1. Start with 1.0 (current setting)
+/// 2. Monitor with `nvidia-smi dmon -s u` during operation
+/// 3. If OOM during inference: reduce MAX_MODEL_LEN first
+/// 4. If stable: try increasing MAX_MODEL_LEN before touching this value
+/// 5. Keep this at 1.0 only if you're certain no other GPU processes run
+///
+/// **NEVER decrease this for dense models!**
+/// If model doesn't fit, use a smaller model or quantization, not CPU offloading. 
+const GPU_MEMORY_UTILIZATION : f32 = 1.0; + +#[tokio::main] +async fn main() -> Result<(), Box> { + env_logger::init(); + + info!("Deploying vLLM with Qwen3.5-27B-FP8 model"); + info!("Configuration:"); + info!(" Model: {}", MODEL_NAME); + info!(" Max context length: {} tokens", MAX_MODEL_LEN); + info!(" GPU memory utilization: {}", GPU_MEMORY_UTILIZATION); + info!(" Language model only: true"); + info!(" Tool calling enabled: true"); + + let topology = K8sAnywhereTopology::from_env(); + let domain = topology + .get_internal_domain() + .await + .ok() + .flatten() + .unwrap_or_else(|| "cluster.local".to_string()); + + let host = format!("{}-{}.apps.{}", SERVICE_NAME, NAMESPACE, domain); + info!("Creating route with host: {}", host); + + let scores: Vec>> = vec![ + create_namespace(), + create_pvc(), + create_secret(), + create_deployment(), + create_service(), + create_route(&host), + ]; + + harmony_cli::run(Inventory::autoload(), topology, scores, None) + .await + .map_err(|e| format!("Failed to deploy: {}", e))?; + + info!("Successfully deployed vLLM with Qwen3.5-27B-FP8"); + info!("Access the API at: http://{}.apps.", SERVICE_NAME); + + Ok(()) +} + +fn create_namespace() -> Box> { + use k8s_openapi::api::core::v1::Namespace; + + let namespace = Namespace { + metadata: ObjectMeta { + name: Some(NAMESPACE.to_string()), + ..Default::default() + }, + spec: None, + status: None, + }; + + Box::new(K8sResourceScore::single(namespace, None)) +} + +fn create_pvc() -> Box> { + let pvc = PersistentVolumeClaim { + metadata: ObjectMeta { + name: Some(PVC_NAME.to_string()), + namespace: Some(NAMESPACE.to_string()), + ..Default::default() + }, + spec: Some(PersistentVolumeClaimSpec { + access_modes: Some(vec!["ReadWriteOnce".to_string()]), + resources: Some(VolumeResourceRequirements { + requests: Some(BTreeMap::from([( + "storage".to_string(), + Quantity("50Gi".to_string()), + )])), + limits: None, + }), + ..Default::default() + }), + status: None, + }; + + 
Box::new(K8sResourceScore::single( + pvc, + Some(NAMESPACE.to_string()), + )) +} + +fn create_secret() -> Box> { + let mut data = BTreeMap::new(); + data.insert( + "token".to_string(), + ByteString("".to_string().into_bytes()), + ); + + let secret = Secret { + metadata: ObjectMeta { + name: Some(SECRET_NAME.to_string()), + namespace: Some(NAMESPACE.to_string()), + ..Default::default() + }, + data: Some(data), + immutable: Some(false), + type_: Some("Opaque".to_string()), + string_data: None, + }; + + Box::new(K8sResourceScore::single( + secret, + Some(NAMESPACE.to_string()), + )) +} + +fn create_deployment() -> Box> { + let deployment = Deployment { + metadata: ObjectMeta { + name: Some(DEPLOYMENT_NAME.to_string()), + namespace: Some(NAMESPACE.to_string()), + labels: Some(BTreeMap::from([( + "app".to_string(), + DEPLOYMENT_NAME.to_string(), + )])), + ..Default::default() + }, + spec: Some(DeploymentSpec { + replicas: Some(1), + selector: LabelSelector { + match_labels: Some(BTreeMap::from([( + "app".to_string(), + DEPLOYMENT_NAME.to_string(), + )])), + ..Default::default() + }, + strategy: Some(DeploymentStrategy { + type_: Some("Recreate".to_string()), + ..Default::default() + }), + template: PodTemplateSpec { + metadata: Some(ObjectMeta { + labels: Some(BTreeMap::from([( + "app".to_string(), + DEPLOYMENT_NAME.to_string(), + )])), + ..Default::default() + }), + spec: Some(PodSpec { + node_selector: Some(BTreeMap::from([( + "nvidia.com/gpu.product".to_string(), + "NVIDIA-GeForce-RTX-5090".to_string(), + )])), + volumes: Some(vec![ + Volume { + name: "cache-volume".to_string(), + persistent_volume_claim: Some(PersistentVolumeClaimVolumeSource { + claim_name: PVC_NAME.to_string(), + read_only: Some(false), + }), + ..Default::default() + }, + Volume { + name: "shm".to_string(), + empty_dir: Some(EmptyDirVolumeSource { + medium: Some("Memory".to_string()), + size_limit: Some(Quantity("4Gi".to_string())), + }), + ..Default::default() + }, + Volume { + name: 
"hf-token".to_string(), + secret: Some(SecretVolumeSource { + secret_name: Some(SECRET_NAME.to_string()), + optional: Some(true), + ..Default::default() + }), + ..Default::default() + }, + ]), + containers: vec![Container { + name: DEPLOYMENT_NAME.to_string(), + image: Some(VLLM_IMAGE.to_string()), + command: Some(vec!["/bin/sh".to_string(), "-c".to_string()]), + args: Some(vec![build_vllm_command()]), + env: Some(vec![ + EnvVar { + name: "HF_TOKEN".to_string(), + value_from: Some(EnvVarSource { + secret_key_ref: Some(SecretKeySelector { + key: "token".to_string(), + name: SECRET_NAME.to_string(), + optional: Some(true), + }), + ..Default::default() + }), + value: None, + }, + EnvVar { + name: "VLLM_WORKER_MULTIPROC_METHOD".to_string(), + value: Some("spawn".to_string()), + value_from: None, + }, + ]), + ports: Some(vec![ContainerPort { + container_port: SERVICE_PORT as i32, + protocol: Some("TCP".to_string()), + ..Default::default() + }]), + resources: Some(ResourceRequirements { + limits: Some(BTreeMap::from([ + ("cpu".to_string(), Quantity("10".to_string())), + ("memory".to_string(), Quantity("30Gi".to_string())), + ("nvidia.com/gpu".to_string(), Quantity("1".to_string())), + ])), + requests: Some(BTreeMap::from([ + ("cpu".to_string(), Quantity("2".to_string())), + ("memory".to_string(), Quantity("10Gi".to_string())), + ("nvidia.com/gpu".to_string(), Quantity("1".to_string())), + ])), + claims: None, + }), + volume_mounts: Some(vec![ + VolumeMount { + name: "cache-volume".to_string(), + mount_path: "/root/.cache/huggingface".to_string(), + read_only: Some(false), + ..Default::default() + }, + VolumeMount { + name: "shm".to_string(), + mount_path: "/dev/shm".to_string(), + ..Default::default() + }, + VolumeMount { + name: "hf-token".to_string(), + mount_path: "/etc/secrets/hf-token".to_string(), + read_only: Some(true), + ..Default::default() + }, + ]), + liveness_probe: Some(Probe { + http_get: Some(HTTPGetAction { + path: Some("/health".to_string()), + port: 
IntOrString::Int(SERVICE_PORT as i32), + ..Default::default() + }), + initial_delay_seconds: Some(300), + period_seconds: Some(30), + ..Default::default() + }), + readiness_probe: Some(Probe { + http_get: Some(HTTPGetAction { + path: Some("/health".to_string()), + port: IntOrString::Int(SERVICE_PORT as i32), + ..Default::default() + }), + initial_delay_seconds: Some(120), + period_seconds: Some(10), + ..Default::default() + }), + ..Default::default() + }], + ..Default::default() + }), + }, + ..Default::default() + }), + status: None, + }; + + Box::new(K8sResourceScore::single( + deployment, + Some(NAMESPACE.to_string()), + )) +} + +fn build_vllm_command() -> String { + format!( + "vllm serve {} \ + --port {} \ + --max-model-len {} \ + --gpu-memory-utilization {} \ + --reasoning-parser qwen3 \ + --enable-auto-tool-choice \ + --tool-call-parser qwen3_coder \ + --language-model-only", + MODEL_NAME, SERVICE_PORT, MAX_MODEL_LEN, GPU_MEMORY_UTILIZATION + ) +} + +fn create_service() -> Box> { + let service = Service { + metadata: ObjectMeta { + name: Some(SERVICE_NAME.to_string()), + namespace: Some(NAMESPACE.to_string()), + ..Default::default() + }, + spec: Some(ServiceSpec { + ports: Some(vec![ServicePort { + name: Some("http".to_string()), + port: SERVICE_PORT as i32, + protocol: Some("TCP".to_string()), + target_port: Some(IntOrString::Int(TARGET_PORT as i32)), + ..Default::default() + }]), + selector: Some(BTreeMap::from([( + "app".to_string(), + DEPLOYMENT_NAME.to_string(), + )])), + type_: Some("ClusterIP".to_string()), + ..Default::default() + }), + status: None, + }; + + Box::new(K8sResourceScore::single( + service, + Some(NAMESPACE.to_string()), + )) +} + +fn create_route(host: &str) -> Box> { + let route_spec = RouteSpec { + to: RouteTargetReference { + kind: "Service".to_string(), + name: SERVICE_NAME.to_string(), + weight: Some(100), + }, + host: Some(host.to_string()), + port: Some(RoutePort { + target_port: SERVICE_PORT as u16, + }), + tls: Some(TLSConfig { 
+ termination: "edge".to_string(), + insecure_edge_termination_policy: Some("Redirect".to_string()), + ..Default::default() + }), + wildcard_policy: None, + ..Default::default() + }; + + Box::new(OKDRouteScore::new(ROUTE_NAME, NAMESPACE, route_spec)) +} diff --git a/harmony/src/modules/application/rust.rs b/harmony/src/modules/application/rust.rs index 7e3413b..c75253c 100644 --- a/harmony/src/modules/application/rust.rs +++ b/harmony/src/modules/application/rust.rs @@ -1,6 +1,5 @@ use std::fs::{self}; -use std::path::{Path, PathBuf}; -use std::process; +use std::path::PathBuf; use std::sync::Arc; use async_trait::async_trait; @@ -65,6 +64,7 @@ pub struct RustWebapp { /// /// This is the place to put the public host name if this is a public facing webapp. pub dns: String, + pub version: String, } impl Application for RustWebapp { @@ -465,6 +465,7 @@ impl RustWebapp { let app_name = &self.name; let service_port = self.service_port; + let version = &self.version; // Create Chart.yaml let chart_yaml = format!( r#" @@ -472,7 +473,7 @@ apiVersion: v2 name: {chart_name} description: A Helm chart for the {app_name} web application. type: application -version: 0.2.1 +version: {version} appVersion: "{image_tag}" "#, );