|
|
|
|
@@ -0,0 +1,523 @@
|
|
|
|
|
//! vLLM Deployment Example for Qwen3.5-27B-FP8 on NVIDIA RTX 5090
|
|
|
|
|
//!
|
|
|
|
|
//! This example deploys vLLM serving Qwen3.5-27B with FP8 quantization,
|
|
|
|
|
//! optimized for single RTX 5090 (32GB VRAM) with tool calling support.
|
|
|
|
|
//!
|
|
|
|
|
//! # Architecture & Memory Constraints
|
|
|
|
|
//!
|
|
|
|
|
//! **Model Details:**
|
|
|
|
|
//! - Parameters: 27B (dense, not sparse/MoE)
|
|
|
|
|
//! - Quantization: FP8 (8-bit weights)
|
|
|
|
|
//! - Model size: ~27-28GB in memory
|
|
|
|
|
//! - Native context: 262,144 tokens (will NOT fit in 32GB VRAM)
|
|
|
|
|
//!
|
|
|
|
|
//! **VRAM Budget for RTX 5090 (32GB):**
|
|
|
|
|
//! - Model weights (FP8): ~27GB
|
|
|
|
|
//! - Framework overhead: ~1-2GB
|
|
|
|
|
//! - KV cache: ~2-3GB (for 16k context)
|
|
|
|
|
//! - CUDA context: ~500MB
|
|
|
|
|
//! - Temporary buffers: ~500MB
|
|
|
|
|
//! - **Total: ~31-33GB** (tight fit, leaves minimal headroom)
|
|
|
|
|
//!
|
|
|
|
|
//! # OpenShift/OKD Requirements
|
|
|
|
|
//!
|
|
|
|
|
//! **SCC (Security Context Constraint) Setup:**
|
|
|
|
|
//!
|
|
|
|
|
//! The official vLLM container runs as root and writes to `/root/.cache/huggingface`.
|
|
|
|
|
//! On OpenShift/OKD with the default restricted SCC, containers run as arbitrary UIDs
|
|
|
|
|
//! and cannot write to `/root`. For testing, grant the `anyuid` SCC:
|
|
|
|
|
//!
|
|
|
|
|
//! ```bash
|
|
|
|
|
//! # As cluster admin, grant anyuid SCC to the namespace's service account:
|
|
|
|
|
//! oc adm policy add-scc-to-user anyuid -z default -n vllm-qwen
|
|
|
|
|
//! ```
|
|
|
|
|
//!
|
|
|
|
|
//! This allows pods in the `vllm-qwen` namespace to run as root (UID 0).
|
|
|
|
|
//! For production, consider building a custom vLLM image that runs as non-root.
|
|
|
|
|
//!
|
|
|
|
|
//! # Critical Configuration Notes
|
|
|
|
|
//!
|
|
|
|
|
//! 1. **GPU_MEMORY_UTILIZATION**: Fraction of GPU memory vLLM pre-allocates (see the constant's docs for the current value and rationale).
|
|
|
|
|
//! NEVER decrease this for dense models - CPU offloading destroys performance
|
|
|
|
|
//! (100-1000x slower) for models where every parameter is used during inference.
|
|
|
|
|
//!
|
|
|
|
|
//! 2. **MAX_MODEL_LEN=16384**: Conservative context length that fits in available VRAM.
|
|
|
|
|
//! Agentic workflows with long tool call histories will need careful context management.
|
|
|
|
|
//!
|
|
|
|
|
//! 3. **--language-model-only**: Skips loading the vision encoder, saving ~1-2GB VRAM.
|
|
|
|
|
//! Essential for fitting the model in 32GB VRAM.
|
|
|
|
|
//!
|
|
|
|
|
//! 4. **PVC Size**: 50Gi for HuggingFace cache. Qwen3.5-27B-FP8 is ~30GB.
|
|
|
|
|
//!
|
|
|
|
|
//! # Performance Expectations
|
|
|
|
|
//!
|
|
|
|
|
//! - Single token latency: ~50-100ms (no CPU offloading)
|
|
|
|
|
//! - With CPU offloading: ~5-50 seconds per token (unusable for real-time inference)
|
|
|
|
|
//! - Throughput: ~10-20 tokens/second (single stream, no batching)
|
|
|
|
|
//!
|
|
|
|
|
//! # Next Steps for Production
|
|
|
|
|
//!
|
|
|
|
|
//! To increase context length:
|
|
|
|
|
//! 1. Monitor GPU memory: `kubectl exec -it deployment/qwen3-5-27b -- nvidia-smi dmon -s u`
|
|
|
|
|
//! 2. If stable, increase MAX_MODEL_LEN (try 32768, then 65536)
|
|
|
|
|
//! 3. If OOM: revert to lower value
|
|
|
|
|
//!
|
|
|
|
|
//! For full 262k context, consider:
|
|
|
|
|
//! - Multi-GPU setup with tensor parallelism (--tensor-parallel-size 8)
|
|
|
|
|
//! - Or use a smaller model (Qwen3.5-7B-FP8)
|
|
|
|
|
|
|
|
|
|
use std::collections::BTreeMap;
|
|
|
|
|
|
|
|
|
|
use harmony::{
|
|
|
|
|
inventory::Inventory,
|
|
|
|
|
modules::{
|
|
|
|
|
k8s::resource::K8sResourceScore,
|
|
|
|
|
okd::{
|
|
|
|
|
crd::route::{RoutePort, RouteSpec, RouteTargetReference, TLSConfig},
|
|
|
|
|
route::OKDRouteScore,
|
|
|
|
|
},
|
|
|
|
|
},
|
|
|
|
|
score::Score,
|
|
|
|
|
topology::{K8sAnywhereTopology, TlsRouter},
|
|
|
|
|
};
|
|
|
|
|
use k8s_openapi::{
|
|
|
|
|
api::{
|
|
|
|
|
apps::v1::{Deployment, DeploymentSpec, DeploymentStrategy},
|
|
|
|
|
core::v1::{
|
|
|
|
|
Container, ContainerPort, EmptyDirVolumeSource, EnvVar, EnvVarSource,
|
|
|
|
|
HTTPGetAction, PersistentVolumeClaim, PersistentVolumeClaimSpec,
|
|
|
|
|
PersistentVolumeClaimVolumeSource, PodSpec, PodTemplateSpec, Probe,
|
|
|
|
|
ResourceRequirements, Secret, SecretKeySelector, SecretVolumeSource, Service,
|
|
|
|
|
ServicePort, ServiceSpec, Volume, VolumeMount, VolumeResourceRequirements,
|
|
|
|
|
},
|
|
|
|
|
},
|
|
|
|
|
apimachinery::pkg::{
|
|
|
|
|
api::resource::Quantity,
|
|
|
|
|
apis::meta::v1::{LabelSelector, ObjectMeta},
|
|
|
|
|
util::intstr::IntOrString,
|
|
|
|
|
},
|
|
|
|
|
ByteString,
|
|
|
|
|
};
|
|
|
|
|
use log::info;
|
|
|
|
|
|
|
|
|
|
/// Kubernetes namespace all resources below are created in.
const NAMESPACE: &str = "vllm-qwen";

/// HuggingFace model identifier passed verbatim to `vllm serve`.
const MODEL_NAME: &str = "Qwen/Qwen3.5-27B-FP8";

/// Name of the Deployment; reused as the Service and Route names so the
/// three resources are trivially correlated.
const DEPLOYMENT_NAME: &str = "qwen3-5-27b";
const SERVICE_NAME: &str = DEPLOYMENT_NAME;
const ROUTE_NAME: &str = DEPLOYMENT_NAME;

/// PVC backing the HuggingFace model cache (50Gi ReadWriteOnce; see `create_pvc`).
const PVC_NAME: &str = "huggingface-cache";

/// Secret holding the HuggingFace API token under the key `token` (see `create_secret`).
const SECRET_NAME: &str = "hf-token-secret";

/// Official vLLM OpenAI-compatible server image.
/// NOTE(review): `latest` is an unpinned tag — consider pinning a version or
/// digest for reproducible deployments.
const VLLM_IMAGE: &str = "vllm/vllm-openai:latest";

/// Port the vLLM HTTP server listens on and the Service exposes.
const SERVICE_PORT: u16 = 8000;

/// Container port the Service forwards to (identical to SERVICE_PORT here).
const TARGET_PORT: u16 = 8000;

/// Maximum context length for the model (in tokens).
///
/// **Impact on VRAM:**
/// - Qwen3.5-27B uses per-token KV cache storage for the context window
/// - Larger context = more KV cache memory required
/// - Approximate KV cache per token: ~32KB for FP8 (very rough estimate)
/// - 16k tokens ≈ 0.5-1GB KV cache
/// - 262k tokens ≈ 8-16GB KV cache (native context length - will NOT fit in 32GB VRAM)
///
/// **Performance Impact:**
/// - Context length directly impacts memory for storing conversation history
/// - Agentic workflows with long tool call histories benefit from more context
/// - If context > available VRAM, vLLM will OOM and fail to start
///
/// **Recommendations for RTX 5090 (32GB):**
/// - Start with 16384 (conservative, should work)
/// - If no OOM, try 32768 (better for agentic workflows)
/// - Monitor GPU memory with `nvidia-smi` during operation
const MAX_MODEL_LEN: i64 = 16384;
|
|
|
|
|
|
|
|
|
|
/// Fraction of GPU memory to allocate for the model (0.0 to 1.0), passed to
/// vLLM as `--gpu-memory-utilization`.
///
/// **CRITICAL WARNING: This is a dense model!**
/// Qwen3.5-27B-FP8 is NOT a sparse/mixture-of-experts model. All 27B parameters
/// are active during inference. CPU offloading will DESTROY performance.
///
/// **What this parameter controls:**
/// - Controls how much of GPU memory vLLM pre-allocates for:
///   1. Model weights (~27GB for FP8 quantization)
///   2. KV cache for context window
///   3. Activation buffers for inference
///   4. Runtime overhead
///
/// **VRAM Allocation Example:**
/// - GPU: 32GB RTX 5090
/// - GPU_MEMORY_UTILIZATION: 0.95
/// - vLLM will try to use: 32GB * 0.95 = 30.4GB
/// - Model weights: ~27-28GB
/// - Remaining for KV cache + runtime: ~2-3GB
///
/// **If set too LOW (e.g., 0.7):**
/// - vLLM restricts itself to 32GB * 0.7 = 22.4GB
/// - Model weights alone need ~27GB
/// - vLLM will OFFLOAD model weights to CPU memory
/// - Performance: **100-1000x slower** (single token generation can take seconds instead of milliseconds)
/// - This is catastrophic for a dense model where every layer needs all parameters
///
/// **If set too HIGH (e.g., 0.99 or 1.0):**
/// - vLLM tries to allocate nearly all GPU memory
/// - Risk: CUDA OOM if any other process needs GPU memory
/// - Risk: KV cache allocation fails during inference
/// - System instability
///
/// **Current Setting: 0.95**
/// (Bug fix: this was previously 1.0, which contradicted the budget above and
/// falls squarely in the "too HIGH" failure mode this doc warns about.)
/// - Leaves 5% buffer (1.6GB) for CUDA overhead, system processes
/// - Maximum allocation for model + KV cache: ~30.4GB
/// - Should leave enough headroom for:
///   - CUDA context: ~500MB
///   - Temporary buffers: ~500MB
///   - Safety margin: ~600MB
///
/// **How to tune:**
/// 1. Start with 0.95 (current setting)
/// 2. Monitor with `nvidia-smi dmon -s u` during operation
/// 3. If OOM during inference: reduce MAX_MODEL_LEN first
/// 4. If stable: try increasing MAX_MODEL_LEN before increasing this
/// 5. Only increase this if you're certain no other GPU processes run
///
/// **NEVER decrease this far enough to force CPU offloading for dense models!**
/// If the model doesn't fit, use a smaller model or quantization, not CPU offloading.
const GPU_MEMORY_UTILIZATION: f32 = 0.95;
|
|
|
|
|
|
|
|
|
|
#[tokio::main]
|
|
|
|
|
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
|
|
|
env_logger::init();
|
|
|
|
|
|
|
|
|
|
info!("Deploying vLLM with Qwen3.5-27B-FP8 model");
|
|
|
|
|
info!("Configuration:");
|
|
|
|
|
info!(" Model: {}", MODEL_NAME);
|
|
|
|
|
info!(" Max context length: {} tokens", MAX_MODEL_LEN);
|
|
|
|
|
info!(" GPU memory utilization: {}", GPU_MEMORY_UTILIZATION);
|
|
|
|
|
info!(" Language model only: true");
|
|
|
|
|
info!(" Tool calling enabled: true");
|
|
|
|
|
|
|
|
|
|
let topology = K8sAnywhereTopology::from_env();
|
|
|
|
|
let domain = topology
|
|
|
|
|
.get_internal_domain()
|
|
|
|
|
.await
|
|
|
|
|
.ok()
|
|
|
|
|
.flatten()
|
|
|
|
|
.unwrap_or_else(|| "cluster.local".to_string());
|
|
|
|
|
|
|
|
|
|
let host = format!("{}-{}.apps.{}", SERVICE_NAME, NAMESPACE, domain);
|
|
|
|
|
info!("Creating route with host: {}", host);
|
|
|
|
|
|
|
|
|
|
let scores: Vec<Box<dyn Score<K8sAnywhereTopology>>> = vec![
|
|
|
|
|
create_namespace(),
|
|
|
|
|
create_pvc(),
|
|
|
|
|
create_secret(),
|
|
|
|
|
create_deployment(),
|
|
|
|
|
create_service(),
|
|
|
|
|
create_route(&host),
|
|
|
|
|
];
|
|
|
|
|
|
|
|
|
|
harmony_cli::run(Inventory::autoload(), topology, scores, None)
|
|
|
|
|
.await
|
|
|
|
|
.map_err(|e| format!("Failed to deploy: {}", e))?;
|
|
|
|
|
|
|
|
|
|
info!("Successfully deployed vLLM with Qwen3.5-27B-FP8");
|
|
|
|
|
info!("Access the API at: http://{}.apps.<cluster-domain>", SERVICE_NAME);
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn create_namespace() -> Box<dyn Score<K8sAnywhereTopology>> {
|
|
|
|
|
use k8s_openapi::api::core::v1::Namespace;
|
|
|
|
|
|
|
|
|
|
let namespace = Namespace {
|
|
|
|
|
metadata: ObjectMeta {
|
|
|
|
|
name: Some(NAMESPACE.to_string()),
|
|
|
|
|
..Default::default()
|
|
|
|
|
},
|
|
|
|
|
spec: None,
|
|
|
|
|
status: None,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Box::new(K8sResourceScore::single(namespace, None))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn create_pvc() -> Box<dyn Score<K8sAnywhereTopology>> {
|
|
|
|
|
let pvc = PersistentVolumeClaim {
|
|
|
|
|
metadata: ObjectMeta {
|
|
|
|
|
name: Some(PVC_NAME.to_string()),
|
|
|
|
|
namespace: Some(NAMESPACE.to_string()),
|
|
|
|
|
..Default::default()
|
|
|
|
|
},
|
|
|
|
|
spec: Some(PersistentVolumeClaimSpec {
|
|
|
|
|
access_modes: Some(vec!["ReadWriteOnce".to_string()]),
|
|
|
|
|
resources: Some(VolumeResourceRequirements {
|
|
|
|
|
requests: Some(BTreeMap::from([(
|
|
|
|
|
"storage".to_string(),
|
|
|
|
|
Quantity("50Gi".to_string()),
|
|
|
|
|
)])),
|
|
|
|
|
limits: None,
|
|
|
|
|
}),
|
|
|
|
|
..Default::default()
|
|
|
|
|
}),
|
|
|
|
|
status: None,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Box::new(K8sResourceScore::single(
|
|
|
|
|
pvc,
|
|
|
|
|
Some(NAMESPACE.to_string()),
|
|
|
|
|
))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn create_secret() -> Box<dyn Score<K8sAnywhereTopology>> {
|
|
|
|
|
let mut data = BTreeMap::new();
|
|
|
|
|
data.insert(
|
|
|
|
|
"token".to_string(),
|
|
|
|
|
ByteString("".to_string().into_bytes()),
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
let secret = Secret {
|
|
|
|
|
metadata: ObjectMeta {
|
|
|
|
|
name: Some(SECRET_NAME.to_string()),
|
|
|
|
|
namespace: Some(NAMESPACE.to_string()),
|
|
|
|
|
..Default::default()
|
|
|
|
|
},
|
|
|
|
|
data: Some(data),
|
|
|
|
|
immutable: Some(false),
|
|
|
|
|
type_: Some("Opaque".to_string()),
|
|
|
|
|
string_data: None,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Box::new(K8sResourceScore::single(
|
|
|
|
|
secret,
|
|
|
|
|
Some(NAMESPACE.to_string()),
|
|
|
|
|
))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Builds a score that creates the vLLM Deployment.
///
/// Single replica pinned to an RTX 5090 node, with the HuggingFace cache PVC,
/// an in-memory /dev/shm volume, and an optional HF token secret mounted in.
/// The container runs `vllm serve` via `sh -c` (see `build_vllm_command`).
fn create_deployment() -> Box<dyn Score<K8sAnywhereTopology>> {
    let deployment = Deployment {
        metadata: ObjectMeta {
            name: Some(DEPLOYMENT_NAME.to_string()),
            namespace: Some(NAMESPACE.to_string()),
            labels: Some(BTreeMap::from([(
                "app".to_string(),
                DEPLOYMENT_NAME.to_string(),
            )])),
            ..Default::default()
        },
        spec: Some(DeploymentSpec {
            // One replica: the model needs a whole GPU; scaling out requires
            // additional GPU nodes.
            replicas: Some(1),
            selector: LabelSelector {
                match_labels: Some(BTreeMap::from([(
                    "app".to_string(),
                    DEPLOYMENT_NAME.to_string(),
                )])),
                ..Default::default()
            },
            // Recreate (not RollingUpdate): old and new pods cannot coexist on
            // a single GPU, so the old pod must die before the new one starts.
            strategy: Some(DeploymentStrategy {
                type_: Some("Recreate".to_string()),
                ..Default::default()
            }),
            template: PodTemplateSpec {
                metadata: Some(ObjectMeta {
                    labels: Some(BTreeMap::from([(
                        "app".to_string(),
                        DEPLOYMENT_NAME.to_string(),
                    )])),
                    ..Default::default()
                }),
                spec: Some(PodSpec {
                    // Pin scheduling to RTX 5090 nodes (label set by the
                    // NVIDIA GPU feature discovery operator).
                    node_selector: Some(BTreeMap::from([(
                        "nvidia.com/gpu.product".to_string(),
                        "NVIDIA-GeForce-RTX-5090".to_string(),
                    )])),
                    volumes: Some(vec![
                        // Persistent HuggingFace model cache (survives restarts,
                        // avoids re-downloading ~30GB of weights).
                        Volume {
                            name: "cache-volume".to_string(),
                            persistent_volume_claim: Some(PersistentVolumeClaimVolumeSource {
                                claim_name: PVC_NAME.to_string(),
                                read_only: Some(false),
                            }),
                            ..Default::default()
                        },
                        // tmpfs-backed /dev/shm; vLLM workers use shared memory
                        // for inter-process tensor transfer.
                        Volume {
                            name: "shm".to_string(),
                            empty_dir: Some(EmptyDirVolumeSource {
                                medium: Some("Memory".to_string()),
                                size_limit: Some(Quantity("4Gi".to_string())),
                            }),
                            ..Default::default()
                        },
                        // HF token secret; optional so deployment works for
                        // public models even when the secret is absent.
                        Volume {
                            name: "hf-token".to_string(),
                            secret: Some(SecretVolumeSource {
                                secret_name: Some(SECRET_NAME.to_string()),
                                optional: Some(true),
                                ..Default::default()
                            }),
                            ..Default::default()
                        },
                    ]),
                    containers: vec![Container {
                        name: DEPLOYMENT_NAME.to_string(),
                        image: Some(VLLM_IMAGE.to_string()),
                        // Run the assembled vllm command through a shell so the
                        // whole command line is a single string argument.
                        command: Some(vec!["/bin/sh".to_string(), "-c".to_string()]),
                        args: Some(vec![build_vllm_command()]),
                        env: Some(vec![
                            // HF_TOKEN sourced from the secret; optional so the
                            // pod still starts when the secret/key is missing.
                            EnvVar {
                                name: "HF_TOKEN".to_string(),
                                value_from: Some(EnvVarSource {
                                    secret_key_ref: Some(SecretKeySelector {
                                        key: "token".to_string(),
                                        name: SECRET_NAME.to_string(),
                                        optional: Some(true),
                                    }),
                                    ..Default::default()
                                }),
                                value: None,
                            },
                            // "spawn" avoids fork-related CUDA initialization
                            // issues in vLLM worker processes.
                            EnvVar {
                                name: "VLLM_WORKER_MULTIPROC_METHOD".to_string(),
                                value: Some("spawn".to_string()),
                                value_from: None,
                            },
                        ]),
                        ports: Some(vec![ContainerPort {
                            container_port: SERVICE_PORT as i32,
                            protocol: Some("TCP".to_string()),
                            ..Default::default()
                        }]),
                        resources: Some(ResourceRequirements {
                            // Limits: cap host RAM below model size so any CPU
                            // offloading fails fast instead of thrashing; exactly
                            // one GPU (GPUs cannot be fractionally requested).
                            limits: Some(BTreeMap::from([
                                ("cpu".to_string(), Quantity("10".to_string())),
                                ("memory".to_string(), Quantity("30Gi".to_string())),
                                ("nvidia.com/gpu".to_string(), Quantity("1".to_string())),
                            ])),
                            requests: Some(BTreeMap::from([
                                ("cpu".to_string(), Quantity("2".to_string())),
                                ("memory".to_string(), Quantity("10Gi".to_string())),
                                ("nvidia.com/gpu".to_string(), Quantity("1".to_string())),
                            ])),
                            claims: None,
                        }),
                        volume_mounts: Some(vec![
                            // Default HF cache path inside the official image
                            // (which runs as root — see module docs on SCC).
                            VolumeMount {
                                name: "cache-volume".to_string(),
                                mount_path: "/root/.cache/huggingface".to_string(),
                                read_only: Some(false),
                                ..Default::default()
                            },
                            VolumeMount {
                                name: "shm".to_string(),
                                mount_path: "/dev/shm".to_string(),
                                ..Default::default()
                            },
                            VolumeMount {
                                name: "hf-token".to_string(),
                                mount_path: "/etc/secrets/hf-token".to_string(),
                                read_only: Some(true),
                                ..Default::default()
                            },
                        ]),
                        // Liveness: generous 300s initial delay — first start
                        // must download and load ~30GB of weights.
                        liveness_probe: Some(Probe {
                            http_get: Some(HTTPGetAction {
                                path: Some("/health".to_string()),
                                port: IntOrString::Int(SERVICE_PORT as i32),
                                ..Default::default()
                            }),
                            initial_delay_seconds: Some(300),
                            period_seconds: Some(30),
                            ..Default::default()
                        }),
                        // Readiness: gates Service traffic until the model is
                        // loaded and the HTTP server answers /health.
                        readiness_probe: Some(Probe {
                            http_get: Some(HTTPGetAction {
                                path: Some("/health".to_string()),
                                port: IntOrString::Int(SERVICE_PORT as i32),
                                ..Default::default()
                            }),
                            initial_delay_seconds: Some(120),
                            period_seconds: Some(10),
                            ..Default::default()
                        }),
                        ..Default::default()
                    }],
                    ..Default::default()
                }),
            },
            ..Default::default()
        }),
        status: None,
    };

    Box::new(K8sResourceScore::single(
        deployment,
        Some(NAMESPACE.to_string()),
    ))
}
|
|
|
|
|
|
|
|
|
|
fn build_vllm_command() -> String {
|
|
|
|
|
format!(
|
|
|
|
|
"vllm serve {} \
|
|
|
|
|
--port {} \
|
|
|
|
|
--max-model-len {} \
|
|
|
|
|
--gpu-memory-utilization {} \
|
|
|
|
|
--reasoning-parser qwen3 \
|
|
|
|
|
--enable-auto-tool-choice \
|
|
|
|
|
--tool-call-parser qwen3_coder \
|
|
|
|
|
--language-model-only",
|
|
|
|
|
MODEL_NAME, SERVICE_PORT, MAX_MODEL_LEN, GPU_MEMORY_UTILIZATION
|
|
|
|
|
)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn create_service() -> Box<dyn Score<K8sAnywhereTopology>> {
|
|
|
|
|
let service = Service {
|
|
|
|
|
metadata: ObjectMeta {
|
|
|
|
|
name: Some(SERVICE_NAME.to_string()),
|
|
|
|
|
namespace: Some(NAMESPACE.to_string()),
|
|
|
|
|
..Default::default()
|
|
|
|
|
},
|
|
|
|
|
spec: Some(ServiceSpec {
|
|
|
|
|
ports: Some(vec![ServicePort {
|
|
|
|
|
name: Some("http".to_string()),
|
|
|
|
|
port: SERVICE_PORT as i32,
|
|
|
|
|
protocol: Some("TCP".to_string()),
|
|
|
|
|
target_port: Some(IntOrString::Int(TARGET_PORT as i32)),
|
|
|
|
|
..Default::default()
|
|
|
|
|
}]),
|
|
|
|
|
selector: Some(BTreeMap::from([(
|
|
|
|
|
"app".to_string(),
|
|
|
|
|
DEPLOYMENT_NAME.to_string(),
|
|
|
|
|
)])),
|
|
|
|
|
type_: Some("ClusterIP".to_string()),
|
|
|
|
|
..Default::default()
|
|
|
|
|
}),
|
|
|
|
|
status: None,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Box::new(K8sResourceScore::single(
|
|
|
|
|
service,
|
|
|
|
|
Some(NAMESPACE.to_string()),
|
|
|
|
|
))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn create_route(host: &str) -> Box<dyn Score<K8sAnywhereTopology>> {
|
|
|
|
|
let route_spec = RouteSpec {
|
|
|
|
|
to: RouteTargetReference {
|
|
|
|
|
kind: "Service".to_string(),
|
|
|
|
|
name: SERVICE_NAME.to_string(),
|
|
|
|
|
weight: Some(100),
|
|
|
|
|
},
|
|
|
|
|
host: Some(host.to_string()),
|
|
|
|
|
port: Some(RoutePort {
|
|
|
|
|
target_port: SERVICE_PORT as u16,
|
|
|
|
|
}),
|
|
|
|
|
tls: Some(TLSConfig {
|
|
|
|
|
termination: "edge".to_string(),
|
|
|
|
|
insecure_edge_termination_policy: Some("Redirect".to_string()),
|
|
|
|
|
..Default::default()
|
|
|
|
|
}),
|
|
|
|
|
wildcard_policy: None,
|
|
|
|
|
..Default::default()
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Box::new(OKDRouteScore::new(ROUTE_NAME, NAMESPACE, route_spec))
|
|
|
|
|
}
|