Compare commits

...

3 Commits

Author SHA1 Message Date
304490977c wip: vllm example
Some checks failed
Run Check Script / check (pull_request) Failing after 36s
2026-03-23 08:40:29 -04:00
8499f4d1b7 Merge pull request 'fix: small details were preventing to re-save frontends,backends and healthchecks in opnsense UI' (#248) from fix/load-balancer-xml into master
Some checks failed
Run Check Script / check (push) Has been cancelled
Compile and package harmony_composer / package_harmony_composer (push) Has been cancelled
Reviewed-on: #248
2026-03-17 14:38:35 +00:00
67c3265286 fix: small details were preventing to re-save frontends,backends and healthchecks in opnsense UI
All checks were successful
Run Check Script / check (pull_request) Successful in 2m12s
2026-03-13 10:31:17 -04:00
6 changed files with 580 additions and 8 deletions

12
Cargo.lock generated
View File

@@ -7001,6 +7001,18 @@ version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "vllm"
version = "0.1.0"
dependencies = [
"env_logger",
"harmony",
"harmony_cli",
"k8s-openapi",
"log",
"tokio",
]
[[package]]
name = "wait-timeout"
version = "0.2.1"

View File

@@ -18,7 +18,7 @@ members = [
"adr/agent_discovery/mdns",
"brocade",
"harmony_agent",
"harmony_agent/deploy", "harmony_node_readiness", "harmony-k8s",
"harmony_agent/deploy", "harmony_node_readiness", "harmony-k8s", "examples/vllm",
]
[workspace.package]

15
examples/vllm/Cargo.toml Normal file
View File

@@ -0,0 +1,15 @@
[package]
name = "vllm"
edition = "2024"
version.workspace = true
readme.workspace = true
license.workspace = true
publish = false
[dependencies]
harmony = { path = "../../harmony" }
harmony_cli = { path = "../../harmony_cli" }
k8s-openapi = { workspace = true }
tokio = { workspace = true }
log = { workspace = true }
env_logger = { workspace = true }

523
examples/vllm/src/main.rs Normal file
View File

@@ -0,0 +1,523 @@
//! vLLM Deployment Example for Qwen3.5-27B-FP8 on NVIDIA RTX 5090
//!
//! This example deploys vLLM serving Qwen3.5-27B with FP8 quantization,
//! optimized for single RTX 5090 (32GB VRAM) with tool calling support.
//!
//! # Architecture & Memory Constraints
//!
//! **Model Details:**
//! - Parameters: 27B (dense, not sparse/MoE)
//! - Quantization: FP8 (8-bit weights)
//! - Model size: ~27-28GB in memory
//! - Native context: 262,144 tokens (will NOT fit in 32GB VRAM)
//!
//! **VRAM Budget for RTX 5090 (32GB):**
//! - Model weights (FP8): ~27GB
//! - Framework overhead: ~1-2GB
//! - KV cache: ~2-3GB (for 16k context)
//! - CUDA context: ~500MB
//! - Temporary buffers: ~500MB
//! - **Total: ~31-33GB** (tight fit, leaves minimal headroom)
//!
//! # OpenShift/OKD Requirements
//!
//! **SCC (Security Context Constraint) Setup:**
//!
//! The official vLLM container runs as root and writes to `/root/.cache/huggingface`.
//! On OpenShift/OKD with the default restricted SCC, containers run as arbitrary UIDs
//! and cannot write to `/root`. For testing, grant the `anyuid` SCC:
//!
//! ```bash
//! # As cluster admin, grant anyuid SCC to the namespace's service account:
//! oc adm policy add-scc-to-user anyuid -z default -n vllm-qwen
//! ```
//!
//! This allows pods in the `vllm-qwen` namespace to run as root (UID 0).
//! For production, consider building a custom vLLM image that runs as non-root.
//!
//! # Critical Configuration Notes
//!
//! 1. **GPU_MEMORY_UTILIZATION=1.0**: Maximum GPU memory allocation.
//! NEVER decrease this for dense models - CPU offloading destroys performance
//! (100-1000x slower) for models where every parameter is used during inference.
//!
//! 2. **MAX_MODEL_LEN=16384**: Conservative context length that fits in available VRAM.
//! Agentic workflows with long tool call histories will need careful context management.
//!
//! 3. **--language-model-only**: Skips loading the vision encoder, saving ~1-2GB VRAM.
//! Essential for fitting the model in 32GB VRAM.
//!
//! 4. **PVC Size**: 50Gi for HuggingFace cache. Qwen3.5-27B-FP8 is ~30GB.
//!
//! # Performance Expectations
//!
//! - Single token latency: ~50-100ms (no CPU offloading)
//! - With CPU offloading: ~5-50 seconds per token (unusable for real-time inference)
//! - Throughput: ~10-20 tokens/second (single stream, no batching)
//!
//! # Next Steps for Production
//!
//! To increase context length:
//! 1. Monitor GPU memory: `kubectl exec -it deployment/qwen3-5-27b -- nvidia-smi dmon -s u`
//! 2. If stable, increase MAX_MODEL_LEN (try 32768, then 65536)
//! 3. If OOM: revert to lower value
//!
//! For full 262k context, consider:
//! - Multi-GPU setup with tensor parallelism (--tensor-parallel-size 8)
//! - Or use a smaller model (Qwen3.5-7B-FP8)
use std::collections::BTreeMap;
use harmony::{
inventory::Inventory,
modules::{
k8s::resource::K8sResourceScore,
okd::{
crd::route::{RoutePort, RouteSpec, RouteTargetReference, TLSConfig},
route::OKDRouteScore,
},
},
score::Score,
topology::{K8sAnywhereTopology, TlsRouter},
};
use k8s_openapi::{
api::{
apps::v1::{Deployment, DeploymentSpec, DeploymentStrategy},
core::v1::{
Container, ContainerPort, EmptyDirVolumeSource, EnvVar, EnvVarSource,
HTTPGetAction, PersistentVolumeClaim, PersistentVolumeClaimSpec,
PersistentVolumeClaimVolumeSource, PodSpec, PodTemplateSpec, Probe,
ResourceRequirements, Secret, SecretKeySelector, SecretVolumeSource, Service,
ServicePort, ServiceSpec, Volume, VolumeMount, VolumeResourceRequirements,
},
},
apimachinery::pkg::{
api::resource::Quantity,
apis::meta::v1::{LabelSelector, ObjectMeta},
util::intstr::IntOrString,
},
ByteString,
};
use log::info;
/// Kubernetes namespace that every resource in this example is created in.
const NAMESPACE: &str = "vllm-qwen";
/// HuggingFace model id served by vLLM (FP8-quantized 27B dense model).
const MODEL_NAME: &str = "Qwen/Qwen3.5-27B-FP8";
/// Base name shared by the Deployment, Service, and Route below.
const DEPLOYMENT_NAME: &str = "qwen3-5-27b";
const SERVICE_NAME: &str = DEPLOYMENT_NAME;
const ROUTE_NAME: &str = DEPLOYMENT_NAME;
/// PVC backing the HuggingFace download cache (50Gi, see `create_pvc`).
const PVC_NAME: &str = "huggingface-cache";
/// Secret holding the (optional) HuggingFace access token.
const SECRET_NAME: &str = "hf-token-secret";
/// Official vLLM OpenAI-compatible server image.
const VLLM_IMAGE: &str = "vllm/vllm-openai:latest";
/// Port the Service exposes and the container listens on.
const SERVICE_PORT: u16 = 8000;
/// Container port the Service forwards to (same as SERVICE_PORT here).
const TARGET_PORT: u16 = 8000;
/// Maximum context window, in tokens, passed to vLLM as `--max-model-len`.
///
/// KV-cache memory scales with context length (roughly ~32KB per token at
/// FP8, so 16k tokens ≈ 0.5-1GB while the model's native 262,144-token
/// context would need ~8-16GB and will NOT fit next to the ~27GB of FP8
/// weights on a 32GB RTX 5090). If the context does not fit in VRAM, vLLM
/// OOMs at startup.
///
/// Tuning for the RTX 5090:
/// - 16384 is the conservative starting point (current value).
/// - If stable, try 32768 — agentic workflows with long tool-call histories
///   benefit from the extra room.
/// - Watch GPU memory with `nvidia-smi` while the server runs.
const MAX_MODEL_LEN: i64 = 16384;
/// Fraction of GPU memory vLLM pre-allocates (0.0 to 1.0), passed as
/// `--gpu-memory-utilization`.
///
/// **CRITICAL WARNING: This is a dense model!**
/// Qwen3.5-27B-FP8 is NOT a sparse/mixture-of-experts model. All 27B
/// parameters are active during inference, so CPU offloading DESTROYS
/// performance (100-1000x slower — single-token generation takes seconds
/// instead of milliseconds).
///
/// **What this parameter controls** — the budget vLLM carves out for:
/// 1. Model weights (~27GB for FP8 quantization)
/// 2. KV cache for the context window
/// 3. Activation buffers for inference
/// 4. Runtime overhead
///
/// **If set too LOW (e.g., 0.7):**
/// - vLLM restricts itself to 32GB * 0.7 = 22.4GB
/// - Model weights alone need ~27GB, so vLLM offloads weights to CPU memory
/// - Catastrophic for a dense model where every layer needs all parameters
///
/// **Current setting: 1.0** — allocate the full 32GB. The ~27-28GB of
/// weights plus the 16k-token KV cache leave no room for a safety margin on
/// this card (an earlier draft of these docs described 0.95 with a ~1.6GB
/// buffer; the shipped value is 1.0, matching the module-level notes above).
/// This only works when nothing else touches the GPU: any other CUDA process
/// risks an OOM and the KV-cache allocation failing mid-inference.
///
/// **How to tune:**
/// 1. Monitor with `nvidia-smi dmon -s u` during operation.
/// 2. If OOM during inference: reduce MAX_MODEL_LEN first, not this value.
/// 3. **NEVER decrease this for dense models** — if the model doesn't fit,
///    use a smaller model or stronger quantization, not CPU offloading.
const GPU_MEMORY_UTILIZATION: f32 = 1.0;
/// Entry point: logs the effective configuration, resolves the cluster
/// domain, assembles all deployment scores, and hands them to `harmony_cli`.
///
/// # Errors
/// Returns an error (as a boxed `std::error::Error`) when the harmony
/// deployment run fails.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    env_logger::init();
    info!("Deploying vLLM with Qwen3.5-27B-FP8 model");
    info!("Configuration:");
    info!(" Model: {}", MODEL_NAME);
    info!(" Max context length: {} tokens", MAX_MODEL_LEN);
    info!(" GPU memory utilization: {}", GPU_MEMORY_UTILIZATION);
    info!(" Language model only: true");
    info!(" Tool calling enabled: true");
    let topology = K8sAnywhereTopology::from_env();
    // Resolve the cluster's internal domain; fall back to the conventional
    // default when the topology cannot provide one.
    let domain = topology
        .get_internal_domain()
        .await
        .ok()
        .flatten()
        .unwrap_or_else(|| "cluster.local".to_string());
    let host = format!("{}-{}.apps.{}", SERVICE_NAME, NAMESPACE, domain);
    info!("Creating route with host: {}", host);
    let scores: Vec<Box<dyn Score<K8sAnywhereTopology>>> = vec![
        create_namespace(),
        create_pvc(),
        create_secret(),
        create_deployment(),
        create_service(),
        create_route(&host),
    ];
    harmony_cli::run(Inventory::autoload(), topology, scores, None)
        .await
        .map_err(|e| format!("Failed to deploy: {}", e))?;
    info!("Successfully deployed vLLM with Qwen3.5-27B-FP8");
    // The route terminates TLS at the edge and redirects plain HTTP, so
    // advertise the https URL with the host we actually configured (the old
    // message printed "http://" and a literal "<cluster-domain>" placeholder).
    info!("Access the API at: https://{}", host);
    Ok(())
}
/// Builds the score that creates the example's namespace.
///
/// Namespaces are cluster-scoped, so no namespace argument is passed to the
/// resource score.
fn create_namespace() -> Box<dyn Score<K8sAnywhereTopology>> {
    use k8s_openapi::api::core::v1::Namespace;
    let metadata = ObjectMeta {
        name: Some(NAMESPACE.to_string()),
        ..Default::default()
    };
    let ns = Namespace {
        metadata,
        spec: None,
        status: None,
    };
    Box::new(K8sResourceScore::single(ns, None))
}
/// Builds the 50Gi ReadWriteOnce PersistentVolumeClaim that backs the
/// HuggingFace model cache (the FP8 model is ~30GB per the module docs).
fn create_pvc() -> Box<dyn Score<K8sAnywhereTopology>> {
    let storage_request =
        BTreeMap::from([("storage".to_string(), Quantity("50Gi".to_string()))]);
    let spec = PersistentVolumeClaimSpec {
        access_modes: Some(vec!["ReadWriteOnce".to_string()]),
        resources: Some(VolumeResourceRequirements {
            requests: Some(storage_request),
            limits: None,
        }),
        ..Default::default()
    };
    let pvc = PersistentVolumeClaim {
        metadata: ObjectMeta {
            name: Some(PVC_NAME.to_string()),
            namespace: Some(NAMESPACE.to_string()),
            ..Default::default()
        },
        spec: Some(spec),
        status: None,
    };
    Box::new(K8sResourceScore::single(pvc, Some(NAMESPACE.to_string())))
}
/// Builds the HuggingFace token Secret with an intentionally empty value.
///
/// The deployment mounts this secret with `optional: true`, so operators can
/// populate a real token out-of-band without the pod failing to start.
fn create_secret() -> Box<dyn Score<K8sAnywhereTopology>> {
    // Empty placeholder token ("".into_bytes() is an empty byte vector).
    let data = BTreeMap::from([("token".to_string(), ByteString(Vec::new()))]);
    let secret = Secret {
        metadata: ObjectMeta {
            name: Some(SECRET_NAME.to_string()),
            namespace: Some(NAMESPACE.to_string()),
            ..Default::default()
        },
        data: Some(data),
        immutable: Some(false),
        type_: Some("Opaque".to_string()),
        string_data: None,
    };
    Box::new(K8sResourceScore::single(secret, Some(NAMESPACE.to_string())))
}
/// Builds the vLLM Deployment: a single replica with the Recreate strategy,
/// pinned to RTX 5090 nodes, with the HuggingFace cache PVC, an in-memory
/// /dev/shm volume, and the optional HF token secret mounted.
fn create_deployment() -> Box<dyn Score<K8sAnywhereTopology>> {
    let deployment = Deployment {
        metadata: ObjectMeta {
            name: Some(DEPLOYMENT_NAME.to_string()),
            namespace: Some(NAMESPACE.to_string()),
            labels: Some(BTreeMap::from([(
                "app".to_string(),
                DEPLOYMENT_NAME.to_string(),
            )])),
            ..Default::default()
        },
        spec: Some(DeploymentSpec {
            // Single replica: the pod claims the node's whole GPU.
            replicas: Some(1),
            selector: LabelSelector {
                match_labels: Some(BTreeMap::from([(
                    "app".to_string(),
                    DEPLOYMENT_NAME.to_string(),
                )])),
                ..Default::default()
            },
            // Recreate (not RollingUpdate): the old pod must release the GPU
            // before a replacement can schedule — presumably why it was
            // chosen here; confirm if rolling updates are ever needed.
            strategy: Some(DeploymentStrategy {
                type_: Some("Recreate".to_string()),
                ..Default::default()
            }),
            template: PodTemplateSpec {
                metadata: Some(ObjectMeta {
                    labels: Some(BTreeMap::from([(
                        "app".to_string(),
                        DEPLOYMENT_NAME.to_string(),
                    )])),
                    ..Default::default()
                }),
                spec: Some(PodSpec {
                    // Pin to nodes advertising an RTX 5090 via the NVIDIA GPU
                    // feature-discovery product label.
                    node_selector: Some(BTreeMap::from([(
                        "nvidia.com/gpu.product".to_string(),
                        "NVIDIA-GeForce-RTX-5090".to_string(),
                    )])),
                    volumes: Some(vec![
                        // Persistent model-download cache (see create_pvc).
                        Volume {
                            name: "cache-volume".to_string(),
                            persistent_volume_claim: Some(PersistentVolumeClaimVolumeSource {
                                claim_name: PVC_NAME.to_string(),
                                read_only: Some(false),
                            }),
                            ..Default::default()
                        },
                        // RAM-backed /dev/shm, capped at 4Gi — assumed sized
                        // for vLLM's shared-memory usage; TODO confirm.
                        Volume {
                            name: "shm".to_string(),
                            empty_dir: Some(EmptyDirVolumeSource {
                                medium: Some("Memory".to_string()),
                                size_limit: Some(Quantity("4Gi".to_string())),
                            }),
                            ..Default::default()
                        },
                        // Optional: pod still starts when no token secret has
                        // been provisioned (the secret ships empty).
                        Volume {
                            name: "hf-token".to_string(),
                            secret: Some(SecretVolumeSource {
                                secret_name: Some(SECRET_NAME.to_string()),
                                optional: Some(true),
                                ..Default::default()
                            }),
                            ..Default::default()
                        },
                    ]),
                    containers: vec![Container {
                        name: DEPLOYMENT_NAME.to_string(),
                        image: Some(VLLM_IMAGE.to_string()),
                        // Run via `sh -c` so the full vllm invocation can be
                        // passed as one assembled string (build_vllm_command).
                        command: Some(vec!["/bin/sh".to_string(), "-c".to_string()]),
                        args: Some(vec![build_vllm_command()]),
                        env: Some(vec![
                            // HF_TOKEN from the optional secret's "token" key.
                            EnvVar {
                                name: "HF_TOKEN".to_string(),
                                value_from: Some(EnvVarSource {
                                    secret_key_ref: Some(SecretKeySelector {
                                        key: "token".to_string(),
                                        name: SECRET_NAME.to_string(),
                                        optional: Some(true),
                                    }),
                                    ..Default::default()
                                }),
                                value: None,
                            },
                            // NOTE(review): "spawn" presumably avoids CUDA
                            // issues with fork-based workers — confirm against
                            // vLLM docs.
                            EnvVar {
                                name: "VLLM_WORKER_MULTIPROC_METHOD".to_string(),
                                value: Some("spawn".to_string()),
                                value_from: None,
                            },
                        ]),
                        ports: Some(vec![ContainerPort {
                            container_port: SERVICE_PORT as i32,
                            protocol: Some("TCP".to_string()),
                            ..Default::default()
                        }]),
                        resources: Some(ResourceRequirements {
                            // Exactly one GPU requested and limited; CPU and
                            // memory get generous limits vs. modest requests.
                            limits: Some(BTreeMap::from([
                                ("cpu".to_string(), Quantity("10".to_string())),
                                ("memory".to_string(), Quantity("30Gi".to_string())),
                                ("nvidia.com/gpu".to_string(), Quantity("1".to_string())),
                            ])),
                            requests: Some(BTreeMap::from([
                                ("cpu".to_string(), Quantity("2".to_string())),
                                ("memory".to_string(), Quantity("10Gi".to_string())),
                                ("nvidia.com/gpu".to_string(), Quantity("1".to_string())),
                            ])),
                            claims: None,
                        }),
                        volume_mounts: Some(vec![
                            // /root path because the official image runs as
                            // root (requires anyuid SCC on OpenShift — see
                            // module docs).
                            VolumeMount {
                                name: "cache-volume".to_string(),
                                mount_path: "/root/.cache/huggingface".to_string(),
                                read_only: Some(false),
                                ..Default::default()
                            },
                            VolumeMount {
                                name: "shm".to_string(),
                                mount_path: "/dev/shm".to_string(),
                                ..Default::default()
                            },
                            VolumeMount {
                                name: "hf-token".to_string(),
                                mount_path: "/etc/secrets/hf-token".to_string(),
                                read_only: Some(true),
                                ..Default::default()
                            },
                        ]),
                        // Long initial delays (300s/120s): first start must
                        // download and load ~30GB of weights before /health
                        // responds.
                        liveness_probe: Some(Probe {
                            http_get: Some(HTTPGetAction {
                                path: Some("/health".to_string()),
                                port: IntOrString::Int(SERVICE_PORT as i32),
                                ..Default::default()
                            }),
                            initial_delay_seconds: Some(300),
                            period_seconds: Some(30),
                            ..Default::default()
                        }),
                        readiness_probe: Some(Probe {
                            http_get: Some(HTTPGetAction {
                                path: Some("/health".to_string()),
                                port: IntOrString::Int(SERVICE_PORT as i32),
                                ..Default::default()
                            }),
                            initial_delay_seconds: Some(120),
                            period_seconds: Some(10),
                            ..Default::default()
                        }),
                        ..Default::default()
                    }],
                    ..Default::default()
                }),
            },
            ..Default::default()
        }),
        status: None,
    };
    Box::new(K8sResourceScore::single(
        deployment,
        Some(NAMESPACE.to_string()),
    ))
}
/// Assembles the single shell command line that launches the vLLM
/// OpenAI-compatible server (executed via `sh -c` in the container).
fn build_vllm_command() -> String {
    let flags = [
        format!("--port {SERVICE_PORT}"),
        format!("--max-model-len {MAX_MODEL_LEN}"),
        format!("--gpu-memory-utilization {GPU_MEMORY_UTILIZATION}"),
        "--reasoning-parser qwen3".to_string(),
        "--enable-auto-tool-choice".to_string(),
        "--tool-call-parser qwen3_coder".to_string(),
        "--language-model-only".to_string(),
    ];
    format!("vllm serve {} {}", MODEL_NAME, flags.join(" "))
}
/// Builds the ClusterIP Service that fronts the vLLM pod, forwarding the
/// service port to the container's target port (both 8000 here).
fn create_service() -> Box<dyn Score<K8sAnywhereTopology>> {
    let http_port = ServicePort {
        name: Some("http".to_string()),
        port: SERVICE_PORT as i32,
        protocol: Some("TCP".to_string()),
        target_port: Some(IntOrString::Int(TARGET_PORT as i32)),
        ..Default::default()
    };
    let spec = ServiceSpec {
        ports: Some(vec![http_port]),
        selector: Some(BTreeMap::from([(
            "app".to_string(),
            DEPLOYMENT_NAME.to_string(),
        )])),
        type_: Some("ClusterIP".to_string()),
        ..Default::default()
    };
    let service = Service {
        metadata: ObjectMeta {
            name: Some(SERVICE_NAME.to_string()),
            namespace: Some(NAMESPACE.to_string()),
            ..Default::default()
        },
        spec: Some(spec),
        status: None,
    };
    Box::new(K8sResourceScore::single(service, Some(NAMESPACE.to_string())))
}
/// Builds the OKD Route that exposes the service externally over
/// edge-terminated TLS, redirecting plain HTTP to HTTPS.
///
/// # Arguments
/// * `host` - Fully qualified host name for the route (computed in `main`
///   from the service name, namespace, and cluster domain).
fn create_route(host: &str) -> Box<dyn Score<K8sAnywhereTopology>> {
    let route_spec = RouteSpec {
        to: RouteTargetReference {
            kind: "Service".to_string(),
            name: SERVICE_NAME.to_string(),
            weight: Some(100),
        },
        host: Some(host.to_string()),
        port: Some(RoutePort {
            // SERVICE_PORT is already u16 — the previous `as u16` was a no-op
            // cast (clippy: unnecessary_cast).
            target_port: SERVICE_PORT,
        }),
        tls: Some(TLSConfig {
            termination: "edge".to_string(),
            insecure_edge_termination_policy: Some("Redirect".to_string()),
            ..Default::default()
        }),
        wildcard_policy: None,
        ..Default::default()
    };
    Box::new(OKDRouteScore::new(ROUTE_NAME, NAMESPACE, route_spec))
}

View File

@@ -267,10 +267,16 @@ pub(crate) fn harmony_load_balancer_service_to_haproxy_xml(
SSL::Default => "".into(),
SSL::Other(other) => other.as_str().into(),
};
let path_without_query = path.split_once('?').map_or(path.as_str(), |(p, _)| p);
let (port, port_name) = match port {
Some(port) => (Some(port.to_string()), port.to_string()),
None => (None, "serverport".to_string()),
};
let haproxy_check = HAProxyHealthCheck {
name: format!("HTTP_{http_method}_{path}"),
name: format!("HTTP_{http_method}_{path_without_query}_{port_name}"),
uuid: Uuid::new_v4().to_string(),
http_method: http_method.to_string().into(),
http_method: http_method.to_string().to_lowercase().into(),
health_check_type: "http".to_string(),
http_uri: path.clone().into(),
interval: "2s".to_string(),
@@ -314,7 +320,10 @@ pub(crate) fn harmony_load_balancer_service_to_haproxy_xml(
let mut backend = HAProxyBackend {
uuid: Uuid::new_v4().to_string(),
enabled: 1,
name: format!("backend_{}", service.listening_port),
name: format!(
"backend_{}",
service.listening_port.to_string().replace(':', "_")
),
algorithm: "roundrobin".to_string(),
random_draws: Some(2),
stickiness_expire: "30m".to_string(),
@@ -346,10 +355,22 @@ pub(crate) fn harmony_load_balancer_service_to_haproxy_xml(
let frontend = Frontend {
uuid: uuid::Uuid::new_v4().to_string(),
enabled: 1,
name: format!("frontend_{}", service.listening_port),
name: format!(
"frontend_{}",
service.listening_port.to_string().replace(':', "_")
),
bind: service.listening_port.to_string(),
mode: "tcp".to_string(), // TODO do not depend on health check here
default_backend: Some(backend.uuid.clone()),
stickiness_expire: "30m".to_string().into(),
stickiness_size: "50k".to_string().into(),
stickiness_conn_rate_period: "10s".to_string().into(),
stickiness_sess_rate_period: "10s".to_string().into(),
stickiness_http_req_rate_period: "10s".to_string().into(),
stickiness_http_err_rate_period: "10s".to_string().into(),
stickiness_bytes_in_rate_period: "1m".to_string().into(),
stickiness_bytes_out_rate_period: "1m".to_string().into(),
ssl_hsts_max_age: 15768000,
..Default::default()
};
info!("HAPRoxy frontend and backend mode currently hardcoded to tcp");

View File

@@ -1,6 +1,5 @@
use std::fs::{self};
use std::path::{Path, PathBuf};
use std::process;
use std::path::PathBuf;
use std::sync::Arc;
use async_trait::async_trait;
@@ -65,6 +64,7 @@ pub struct RustWebapp {
///
/// This is the place to put the public host name if this is a public facing webapp.
pub dns: String,
pub version: String,
}
impl Application for RustWebapp {
@@ -465,6 +465,7 @@ impl RustWebapp {
let app_name = &self.name;
let service_port = self.service_port;
let version = &self.version;
// Create Chart.yaml
let chart_yaml = format!(
r#"
@@ -472,7 +473,7 @@ apiVersion: v2
name: {chart_name}
description: A Helm chart for the {app_name} web application.
type: application
version: 0.2.1
version: {version}
appVersion: "{image_tag}"
"#,
);