|
|
|
|
@@ -0,0 +1,523 @@
|
|
|
|
|
//! vLLM Deployment Example for Qwen3.5-27B-FP8 on NVIDIA RTX 5090
|
|
|
|
|
//!
|
|
|
|
|
//! This example deploys vLLM serving Qwen3.5-27B with FP8 quantization,
|
|
|
|
|
//! optimized for single RTX 5090 (32GB VRAM) with tool calling support.
|
|
|
|
|
//!
|
|
|
|
|
//! # Architecture & Memory Constraints
|
|
|
|
|
//!
|
|
|
|
|
//! **Model Details:**
|
|
|
|
|
//! - Parameters: 27B (dense, not sparse/MoE)
|
|
|
|
|
//! - Quantization: FP8 (8-bit weights)
|
|
|
|
|
//! - Model size: ~27-28GB in memory
|
|
|
|
|
//! - Native context: 262,144 tokens (will NOT fit in 32GB VRAM)
|
|
|
|
|
//!
|
|
|
|
|
//! **VRAM Budget for RTX 5090 (32GB):**
|
|
|
|
|
//! - Model weights (FP8): ~27GB
|
|
|
|
|
//! - Framework overhead: ~1-2GB
|
|
|
|
|
//! - KV cache: ~2-3GB (for 16k context)
|
|
|
|
|
//! - CUDA context: ~500MB
|
|
|
|
|
//! - Temporary buffers: ~500MB
|
|
|
|
|
//! - **Total: ~31-33GB** (tight fit, leaves minimal headroom)
|
|
|
|
|
//!
|
|
|
|
|
//! # OpenShift/OKD Requirements
|
|
|
|
|
//!
|
|
|
|
|
//! **SCC (Security Context Constraint) Setup:**
|
|
|
|
|
//!
|
|
|
|
|
//! The official vLLM container runs as root and writes to `/root/.cache/huggingface`.
|
|
|
|
|
//! On OpenShift/OKD with the default restricted SCC, containers run as arbitrary UIDs
|
|
|
|
|
//! and cannot write to `/root`. For testing, grant the `anyuid` SCC:
|
|
|
|
|
//!
|
|
|
|
|
//! ```bash
|
|
|
|
|
//! # As cluster admin, grant anyuid SCC to the namespace's service account:
|
|
|
|
|
//! oc adm policy add-scc-to-user anyuid -z default -n vllm-qwen
|
|
|
|
|
//! ```
|
|
|
|
|
//!
|
|
|
|
|
//! This allows pods in the `vllm-qwen` namespace to run as root (UID 0).
|
|
|
|
|
//! For production, consider building a custom vLLM image that runs as non-root.
|
|
|
|
|
//!
|
|
|
|
|
//! # Critical Configuration Notes
|
|
|
|
|
//!
|
|
|
|
|
//! 1. **GPU_MEMORY_UTILIZATION**: Fraction of GPU memory vLLM pre-allocates (see the constant's docs for the current value and rationale).
|
|
|
|
|
//! NEVER decrease this for dense models - CPU offloading destroys performance
|
|
|
|
|
//! (100-1000x slower) for models where every parameter is used during inference.
|
|
|
|
|
//!
|
|
|
|
|
//! 2. **MAX_MODEL_LEN=16384**: Conservative context length that fits in available VRAM.
|
|
|
|
|
//! Agentic workflows with long tool call histories will need careful context management.
|
|
|
|
|
//!
|
|
|
|
|
//! 3. **--language-model-only**: Skips loading the vision encoder, saving ~1-2GB VRAM.
|
|
|
|
|
//! Essential for fitting the model in 32GB VRAM.
|
|
|
|
|
//!
|
|
|
|
|
//! 4. **PVC Size**: 50Gi for HuggingFace cache. Qwen3.5-27B-FP8 is ~30GB.
|
|
|
|
|
//!
|
|
|
|
|
//! # Performance Expectations
|
|
|
|
|
//!
|
|
|
|
|
//! - Single token latency: ~50-100ms (no CPU offloading)
|
|
|
|
|
//! - With CPU offloading: ~5-50 seconds per token (unusable for real-time inference)
|
|
|
|
|
//! - Throughput: ~10-20 tokens/second (single stream, no batching)
|
|
|
|
|
//!
|
|
|
|
|
//! # Next Steps for Production
|
|
|
|
|
//!
|
|
|
|
|
//! To increase context length:
|
|
|
|
|
//! 1. Monitor GPU memory: `kubectl exec -it deployment/qwen3-5-27b -- nvidia-smi dmon -s u`
|
|
|
|
|
//! 2. If stable, increase MAX_MODEL_LEN (try 32768, then 65536)
|
|
|
|
|
//! 3. If OOM: revert to lower value
|
|
|
|
|
//!
|
|
|
|
|
//! For full 262k context, consider:
|
|
|
|
|
//! - Multi-GPU setup with tensor parallelism (--tensor-parallel-size 8)
|
|
|
|
|
//! - Or use a smaller model (Qwen3.5-7B-FP8)
|
|
|
|
|
|
|
|
|
|
use std::collections::BTreeMap;
|
|
|
|
|
|
|
|
|
|
use harmony::{
|
|
|
|
|
inventory::Inventory,
|
|
|
|
|
modules::{
|
|
|
|
|
k8s::resource::K8sResourceScore,
|
|
|
|
|
okd::{
|
|
|
|
|
crd::route::{RoutePort, RouteSpec, RouteTargetReference, TLSConfig},
|
|
|
|
|
route::OKDRouteScore,
|
|
|
|
|
},
|
|
|
|
|
},
|
|
|
|
|
score::Score,
|
|
|
|
|
topology::{K8sAnywhereTopology, TlsRouter},
|
|
|
|
|
};
|
|
|
|
|
use k8s_openapi::{
|
|
|
|
|
api::{
|
|
|
|
|
apps::v1::{Deployment, DeploymentSpec, DeploymentStrategy},
|
|
|
|
|
core::v1::{
|
|
|
|
|
Container, ContainerPort, EmptyDirVolumeSource, EnvVar, EnvVarSource,
|
|
|
|
|
HTTPGetAction, PersistentVolumeClaim, PersistentVolumeClaimSpec,
|
|
|
|
|
PersistentVolumeClaimVolumeSource, PodSpec, PodTemplateSpec, Probe,
|
|
|
|
|
ResourceRequirements, Secret, SecretKeySelector, SecretVolumeSource, Service,
|
|
|
|
|
ServicePort, ServiceSpec, Volume, VolumeMount, VolumeResourceRequirements,
|
|
|
|
|
},
|
|
|
|
|
},
|
|
|
|
|
apimachinery::pkg::{
|
|
|
|
|
api::resource::Quantity,
|
|
|
|
|
apis::meta::v1::{LabelSelector, ObjectMeta},
|
|
|
|
|
util::intstr::IntOrString,
|
|
|
|
|
},
|
|
|
|
|
ByteString,
|
|
|
|
|
};
|
|
|
|
|
use log::info;
|
|
|
|
|
|
|
|
|
|
/// Kubernetes namespace all resources below are created in.
const NAMESPACE: &str = "vllm-qwen";

/// HuggingFace model identifier passed verbatim to `vllm serve`.
const MODEL_NAME: &str = "Qwen/Qwen3.5-27B-FP8";

/// Name of the Deployment; reused as the Service and Route names so the
/// three resources are trivially correlated.
const DEPLOYMENT_NAME: &str = "qwen3-5-27b";
const SERVICE_NAME: &str = DEPLOYMENT_NAME;
const ROUTE_NAME: &str = DEPLOYMENT_NAME;

/// PVC backing the HuggingFace model cache (50Gi ReadWriteOnce; see `create_pvc`).
const PVC_NAME: &str = "huggingface-cache";

/// Secret holding the HuggingFace API token under the key `token` (see `create_secret`).
const SECRET_NAME: &str = "hf-token-secret";

/// Official vLLM OpenAI-compatible server image.
/// NOTE(review): `latest` is an unpinned tag — consider pinning a version or
/// digest for reproducible deployments.
const VLLM_IMAGE: &str = "vllm/vllm-openai:latest";

/// Port the vLLM HTTP server listens on and the Service exposes.
const SERVICE_PORT: u16 = 8000;

/// Container port the Service forwards to (identical to SERVICE_PORT here).
const TARGET_PORT: u16 = 8000;

/// Maximum context length for the model (in tokens).
///
/// **Impact on VRAM:**
/// - Qwen3.5-27B uses per-token KV cache storage for the context window
/// - Larger context = more KV cache memory required
/// - Approximate KV cache per token: ~32KB for FP8 (very rough estimate)
/// - 16k tokens ≈ 0.5-1GB KV cache
/// - 262k tokens ≈ 8-16GB KV cache (native context length - will NOT fit in 32GB VRAM)
///
/// **Performance Impact:**
/// - Context length directly impacts memory for storing conversation history
/// - Agentic workflows with long tool call histories benefit from more context
/// - If context > available VRAM, vLLM will OOM and fail to start
///
/// **Recommendations for RTX 5090 (32GB):**
/// - Start with 16384 (conservative, should work)
/// - If no OOM, try 32768 (better for agentic workflows)
/// - Monitor GPU memory with `nvidia-smi` during operation
const MAX_MODEL_LEN: i64 = 16384;
|
|
|
|
|
|
|
|
|
|
/// Fraction of GPU memory to allocate for the model (0.0 to 1.0), passed to
/// vLLM as `--gpu-memory-utilization`.
///
/// **CRITICAL WARNING: This is a dense model!**
/// Qwen3.5-27B-FP8 is NOT a sparse/mixture-of-experts model. All 27B parameters
/// are active during inference. CPU offloading will DESTROY performance.
///
/// **What this parameter controls:**
/// - Controls how much of GPU memory vLLM pre-allocates for:
///   1. Model weights (~27GB for FP8 quantization)
///   2. KV cache for context window
///   3. Activation buffers for inference
///   4. Runtime overhead
///
/// **VRAM Allocation Example:**
/// - GPU: 32GB RTX 5090
/// - GPU_MEMORY_UTILIZATION: 0.95
/// - vLLM will try to use: 32GB * 0.95 = 30.4GB
/// - Model weights: ~27-28GB
/// - Remaining for KV cache + runtime: ~2-3GB
///
/// **If set too LOW (e.g., 0.7):**
/// - vLLM restricts itself to 32GB * 0.7 = 22.4GB
/// - Model weights alone need ~27GB
/// - vLLM will OFFLOAD model weights to CPU memory
/// - Performance: **100-1000x slower** (single token generation can take seconds instead of milliseconds)
/// - This is catastrophic for a dense model where every layer needs all parameters
///
/// **If set too HIGH (e.g., 0.99 or 1.0):**
/// - vLLM tries to allocate nearly all GPU memory
/// - Risk: CUDA OOM if any other process needs GPU memory
/// - Risk: KV cache allocation fails during inference
/// - System instability
///
/// **Current Setting: 0.95**
/// (Bug fix: this was previously 1.0, which contradicted the budget above and
/// falls squarely in the "too HIGH" failure mode this doc warns about.)
/// - Leaves 5% buffer (1.6GB) for CUDA overhead, system processes
/// - Maximum allocation for model + KV cache: ~30.4GB
/// - Should leave enough headroom for:
///   - CUDA context: ~500MB
///   - Temporary buffers: ~500MB
///   - Safety margin: ~600MB
///
/// **How to tune:**
/// 1. Start with 0.95 (current setting)
/// 2. Monitor with `nvidia-smi dmon -s u` during operation
/// 3. If OOM during inference: reduce MAX_MODEL_LEN first
/// 4. If stable: try increasing MAX_MODEL_LEN before increasing this
/// 5. Only increase this if you're certain no other GPU processes run
///
/// **NEVER decrease this far enough to force CPU offloading for dense models!**
/// If the model doesn't fit, use a smaller model or quantization, not CPU offloading.
const GPU_MEMORY_UTILIZATION: f32 = 0.95;
|
|
|
|
|
|
|
|
|
|
#[tokio::main]
|
|
|
|
|
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
|
|
env_logger::init();
|
|
|
|
|
|
|
|
|
|
info!("Deploying vLLM with Qwen3.5-27B-FP8 model");
|
|
|
|
|
info!("Configuration:");
|
|
|
|
|
info!(" Model: {}", MODEL_NAME);
|
|
|
|
|
info!(" Max context length: {} tokens", MAX_MODEL_LEN);
|
|
|
|
|
info!(" GPU memory utilization: {}", GPU_MEMORY_UTILIZATION);
|
|
|
|
|
info!(" Language model only: true");
|
|
|
|
|
info!(" Tool calling enabled: true");
|
|
|
|
|
|
|
|
|
|
let topology = K8sAnywhereTopology::from_env();
|
|
|
|
|
let domain = topology
|
|
|
|
|
.get_internal_domain()
|
|
|
|
|
.await
|
|
|
|
|
.ok()
|
|
|
|
|
.flatten()
|
|
|
|
|
.unwrap_or_else(|| "cluster.local".to_string());
|
|
|
|
|
|
|
|
|
|
let host = format!("{}-{}.apps.{}", SERVICE_NAME, NAMESPACE, domain);
|
|
|
|
|
info!("Creating route with host: {}", host);
|
|
|
|
|
|
|
|
|
|
let scores: Vec<Box<dyn Score<K8sAnywhereTopology>>> = vec![
|
|
|
|
|
create_namespace(),
|
|
|
|
|
create_pvc(),
|
|
|
|
|
create_secret(),
|
|
|
|
|
create_deployment(),
|
|
|
|
|
create_service(),
|
|
|
|
|
create_route(&host),
|
|
|
|
|
];
|
|
|
|
|
|
|
|
|
|
harmony_cli::run(Inventory::autoload(), topology, scores, None)
|
|
|
|
|
.await
|
|
|
|
|
.map_err(|e| format!("Failed to deploy: {}", e))?;
|
|
|
|
|
|
|
|
|
|
info!("Successfully deployed vLLM with Qwen3.5-27B-FP8");
|
|
|
|
|
info!("Access the API at: http://{}.apps.<cluster-domain>", SERVICE_NAME);
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn create_namespace() -> Box<dyn Score<K8sAnywhereTopology>> {
|
|
|
|
|
use k8s_openapi::api::core::v1::Namespace;
|
|
|
|
|
|
|
|
|
|
let namespace = Namespace {
|
|
|
|
|
metadata: ObjectMeta {
|
|
|
|
|
name: Some(NAMESPACE.to_string()),
|
|
|
|
|
..Default::default()
|
|
|
|
|
},
|
|
|
|
|
spec: None,
|
|
|
|
|
status: None,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Box::new(K8sResourceScore::single(namespace, None))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn create_pvc() -> Box<dyn Score<K8sAnywhereTopology>> {
|
|
|
|
|
let pvc = PersistentVolumeClaim {
|
|
|
|
|
metadata: ObjectMeta {
|
|
|
|
|
name: Some(PVC_NAME.to_string()),
|
|
|
|
|
namespace: Some(NAMESPACE.to_string()),
|
|
|
|
|
..Default::default()
|
|
|
|
|
},
|
|
|
|
|
spec: Some(PersistentVolumeClaimSpec {
|
|
|
|
|
access_modes: Some(vec!["ReadWriteOnce".to_string()]),
|
|
|
|
|
resources: Some(VolumeResourceRequirements {
|
|
|
|
|
requests: Some(BTreeMap::from([(
|
|
|
|
|
"storage".to_string(),
|
|
|
|
|
Quantity("50Gi".to_string()),
|
|
|
|
|
)])),
|
|
|
|
|
limits: None,
|
|
|
|
|
}),
|
|
|
|
|
..Default::default()
|
|
|
|
|
}),
|
|
|
|
|
status: None,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Box::new(K8sResourceScore::single(
|
|
|
|
|
pvc,
|
|
|
|
|
Some(NAMESPACE.to_string()),
|
|
|
|
|
))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn create_secret() -> Box<dyn Score<K8sAnywhereTopology>> {
|
|
|
|
|
let mut data = BTreeMap::new();
|
|
|
|
|
data.insert(
|
|
|
|
|
"token".to_string(),
|
|
|
|
|
ByteString("".to_string().into_bytes()),
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
let secret = Secret {
|
|
|
|
|
metadata: ObjectMeta {
|
|
|
|
|
name: Some(SECRET_NAME.to_string()),
|
|
|
|
|
namespace: Some(NAMESPACE.to_string()),
|
|
|
|
|
..Default::default()
|
|
|
|
|
},
|
|
|
|
|
data: Some(data),
|
|
|
|
|
immutable: Some(false),
|
|
|
|
|
type_: Some("Opaque".to_string()),
|
|
|
|
|
string_data: None,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Box::new(K8sResourceScore::single(
|
|
|
|
|
secret,
|
|
|
|
|
Some(NAMESPACE.to_string()),
|
|
|
|
|
))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Builds a score that creates the vLLM Deployment.
///
/// Single replica pinned to an RTX 5090 node, with the HuggingFace cache PVC,
/// an in-memory /dev/shm volume, and an optional HF token secret mounted in.
/// The container runs `vllm serve` via `sh -c` (see `build_vllm_command`).
fn create_deployment() -> Box<dyn Score<K8sAnywhereTopology>> {
    let deployment = Deployment {
        metadata: ObjectMeta {
            name: Some(DEPLOYMENT_NAME.to_string()),
            namespace: Some(NAMESPACE.to_string()),
            labels: Some(BTreeMap::from([(
                "app".to_string(),
                DEPLOYMENT_NAME.to_string(),
            )])),
            ..Default::default()
        },
        spec: Some(DeploymentSpec {
            // One replica: the model needs a whole GPU; scaling out requires
            // additional GPU nodes.
            replicas: Some(1),
            selector: LabelSelector {
                match_labels: Some(BTreeMap::from([(
                    "app".to_string(),
                    DEPLOYMENT_NAME.to_string(),
                )])),
                ..Default::default()
            },
            // Recreate (not RollingUpdate): old and new pods cannot coexist on
            // a single GPU, so the old pod must die before the new one starts.
            strategy: Some(DeploymentStrategy {
                type_: Some("Recreate".to_string()),
                ..Default::default()
            }),
            template: PodTemplateSpec {
                metadata: Some(ObjectMeta {
                    labels: Some(BTreeMap::from([(
                        "app".to_string(),
                        DEPLOYMENT_NAME.to_string(),
                    )])),
                    ..Default::default()
                }),
                spec: Some(PodSpec {
                    // Pin scheduling to RTX 5090 nodes (label set by the
                    // NVIDIA GPU feature discovery operator).
                    node_selector: Some(BTreeMap::from([(
                        "nvidia.com/gpu.product".to_string(),
                        "NVIDIA-GeForce-RTX-5090".to_string(),
                    )])),
                    volumes: Some(vec![
                        // Persistent HuggingFace model cache (survives restarts,
                        // avoids re-downloading ~30GB of weights).
                        Volume {
                            name: "cache-volume".to_string(),
                            persistent_volume_claim: Some(PersistentVolumeClaimVolumeSource {
                                claim_name: PVC_NAME.to_string(),
                                read_only: Some(false),
                            }),
                            ..Default::default()
                        },
                        // tmpfs-backed /dev/shm; vLLM workers use shared memory
                        // for inter-process tensor transfer.
                        Volume {
                            name: "shm".to_string(),
                            empty_dir: Some(EmptyDirVolumeSource {
                                medium: Some("Memory".to_string()),
                                size_limit: Some(Quantity("4Gi".to_string())),
                            }),
                            ..Default::default()
                        },
                        // HF token secret; optional so deployment works for
                        // public models even when the secret is absent.
                        Volume {
                            name: "hf-token".to_string(),
                            secret: Some(SecretVolumeSource {
                                secret_name: Some(SECRET_NAME.to_string()),
                                optional: Some(true),
                                ..Default::default()
                            }),
                            ..Default::default()
                        },
                    ]),
                    containers: vec![Container {
                        name: DEPLOYMENT_NAME.to_string(),
                        image: Some(VLLM_IMAGE.to_string()),
                        // Run the assembled vllm command through a shell so the
                        // whole command line is a single string argument.
                        command: Some(vec!["/bin/sh".to_string(), "-c".to_string()]),
                        args: Some(vec![build_vllm_command()]),
                        env: Some(vec![
                            // HF_TOKEN sourced from the secret; optional so the
                            // pod still starts when the secret/key is missing.
                            EnvVar {
                                name: "HF_TOKEN".to_string(),
                                value_from: Some(EnvVarSource {
                                    secret_key_ref: Some(SecretKeySelector {
                                        key: "token".to_string(),
                                        name: SECRET_NAME.to_string(),
                                        optional: Some(true),
                                    }),
                                    ..Default::default()
                                }),
                                value: None,
                            },
                            // "spawn" avoids fork-related CUDA initialization
                            // issues in vLLM worker processes.
                            EnvVar {
                                name: "VLLM_WORKER_MULTIPROC_METHOD".to_string(),
                                value: Some("spawn".to_string()),
                                value_from: None,
                            },
                        ]),
                        ports: Some(vec![ContainerPort {
                            container_port: SERVICE_PORT as i32,
                            protocol: Some("TCP".to_string()),
                            ..Default::default()
                        }]),
                        resources: Some(ResourceRequirements {
                            // Limits: cap host RAM below model size so any CPU
                            // offloading fails fast instead of thrashing; exactly
                            // one GPU (GPUs cannot be fractionally requested).
                            limits: Some(BTreeMap::from([
                                ("cpu".to_string(), Quantity("10".to_string())),
                                ("memory".to_string(), Quantity("30Gi".to_string())),
                                ("nvidia.com/gpu".to_string(), Quantity("1".to_string())),
                            ])),
                            requests: Some(BTreeMap::from([
                                ("cpu".to_string(), Quantity("2".to_string())),
                                ("memory".to_string(), Quantity("10Gi".to_string())),
                                ("nvidia.com/gpu".to_string(), Quantity("1".to_string())),
                            ])),
                            claims: None,
                        }),
                        volume_mounts: Some(vec![
                            // Default HF cache path inside the official image
                            // (which runs as root — see module docs on SCC).
                            VolumeMount {
                                name: "cache-volume".to_string(),
                                mount_path: "/root/.cache/huggingface".to_string(),
                                read_only: Some(false),
                                ..Default::default()
                            },
                            VolumeMount {
                                name: "shm".to_string(),
                                mount_path: "/dev/shm".to_string(),
                                ..Default::default()
                            },
                            VolumeMount {
                                name: "hf-token".to_string(),
                                mount_path: "/etc/secrets/hf-token".to_string(),
                                read_only: Some(true),
                                ..Default::default()
                            },
                        ]),
                        // Liveness: generous 300s initial delay — first start
                        // must download and load ~30GB of weights.
                        liveness_probe: Some(Probe {
                            http_get: Some(HTTPGetAction {
                                path: Some("/health".to_string()),
                                port: IntOrString::Int(SERVICE_PORT as i32),
                                ..Default::default()
                            }),
                            initial_delay_seconds: Some(300),
                            period_seconds: Some(30),
                            ..Default::default()
                        }),
                        // Readiness: gates Service traffic until the model is
                        // loaded and the HTTP server answers /health.
                        readiness_probe: Some(Probe {
                            http_get: Some(HTTPGetAction {
                                path: Some("/health".to_string()),
                                port: IntOrString::Int(SERVICE_PORT as i32),
                                ..Default::default()
                            }),
                            initial_delay_seconds: Some(120),
                            period_seconds: Some(10),
                            ..Default::default()
                        }),
                        ..Default::default()
                    }],
                    ..Default::default()
                }),
            },
            ..Default::default()
        }),
        status: None,
    };

    Box::new(K8sResourceScore::single(
        deployment,
        Some(NAMESPACE.to_string()),
    ))
}
|
|
|
|
|
|
|
|
|
|
fn build_vllm_command() -> String {
|
|
|
|
|
format!(
|
|
|
|
|
"vllm serve {} \
|
|
|
|
|
--port {} \
|
|
|
|
|
--max-model-len {} \
|
|
|
|
|
--gpu-memory-utilization {} \
|
|
|
|
|
--reasoning-parser qwen3 \
|
|
|
|
|
--enable-auto-tool-choice \
|
|
|
|
|
--tool-call-parser qwen3_coder \
|
|
|
|
|
--language-model-only",
|
|
|
|
|
MODEL_NAME, SERVICE_PORT, MAX_MODEL_LEN, GPU_MEMORY_UTILIZATION
|
|
|
|
|
)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn create_service() -> Box<dyn Score<K8sAnywhereTopology>> {
|
|
|
|
|
let service = Service {
|
|
|
|
|
metadata: ObjectMeta {
|
|
|
|
|
name: Some(SERVICE_NAME.to_string()),
|
|
|
|
|
namespace: Some(NAMESPACE.to_string()),
|
|
|
|
|
..Default::default()
|
|
|
|
|
},
|
|
|
|
|
spec: Some(ServiceSpec {
|
|
|
|
|
ports: Some(vec![ServicePort {
|
|
|
|
|
name: Some("http".to_string()),
|
|
|
|
|
port: SERVICE_PORT as i32,
|
|
|
|
|
protocol: Some("TCP".to_string()),
|
|
|
|
|
target_port: Some(IntOrString::Int(TARGET_PORT as i32)),
|
|
|
|
|
..Default::default()
|
|
|
|
|
}]),
|
|
|
|
|
selector: Some(BTreeMap::from([(
|
|
|
|
|
"app".to_string(),
|
|
|
|
|
DEPLOYMENT_NAME.to_string(),
|
|
|
|
|
)])),
|
|
|
|
|
type_: Some("ClusterIP".to_string()),
|
|
|
|
|
..Default::default()
|
|
|
|
|
}),
|
|
|
|
|
status: None,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Box::new(K8sResourceScore::single(
|
|
|
|
|
service,
|
|
|
|
|
Some(NAMESPACE.to_string()),
|
|
|
|
|
))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn create_route(host: &str) -> Box<dyn Score<K8sAnywhereTopology>> {
|
|
|
|
|
let route_spec = RouteSpec {
|
|
|
|
|
to: RouteTargetReference {
|
|
|
|
|
kind: "Service".to_string(),
|
|
|
|
|
name: SERVICE_NAME.to_string(),
|
|
|
|
|
weight: Some(100),
|
|
|
|
|
},
|
|
|
|
|
host: Some(host.to_string()),
|
|
|
|
|
port: Some(RoutePort {
|
|
|
|
|
target_port: SERVICE_PORT as u16,
|
|
|
|
|
}),
|
|
|
|
|
tls: Some(TLSConfig {
|
|
|
|
|
termination: "edge".to_string(),
|
|
|
|
|
insecure_edge_termination_policy: Some("Redirect".to_string()),
|
|
|
|
|
..Default::default()
|
|
|
|
|
}),
|
|
|
|
|
wildcard_policy: None,
|
|
|
|
|
..Default::default()
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Box::new(OKDRouteScore::new(ROUTE_NAME, NAMESPACE, route_spec))
|
|
|
|
|
}
|