Compare commits
1 Commits
feat/zitad
...
32d0c2aa1e
| Author | SHA1 | Date | |
|---|---|---|---|
| 32d0c2aa1e |
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -2704,6 +2704,7 @@ dependencies = [
|
||||
"k8s-openapi",
|
||||
"kube",
|
||||
"log",
|
||||
"reqwest 0.12.23",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
|
||||
@@ -12,3 +12,4 @@ serde_json.workspace = true
|
||||
env_logger.workspace = true
|
||||
log.workspace = true
|
||||
tokio.workspace = true
|
||||
reqwest.workspace = true
|
||||
|
||||
@@ -18,15 +18,16 @@ This service provides the true source-of-truth with fast reaction time.
|
||||
|
||||
## Features & Roadmap
|
||||
|
||||
| Check | Description | Status | Configurable |
|
||||
|------------------------------------|--------------------------------------------------|---------------------|--------------|
|
||||
| **Node readiness (API)** | Queries `Node.status.conditions[Ready]` via Kubernetes API | **Implemented** | Yes |
|
||||
| Filesystem readonly | Detects read-only mounts via `/proc/mounts` | To be implemented | Yes |
|
||||
| Kubelet running | Local probe to kubelet `/healthz` (port 10248) | To be implemented | Yes |
|
||||
| CRI-O / container runtime health | Socket check + runtime status | To be implemented | Yes |
|
||||
| Disk / inode pressure | Threshold checks on key filesystems | To be implemented | Yes |
|
||||
| Network reachability | DNS resolution + gateway connectivity | To be implemented | Yes |
|
||||
| Custom NodeConditions | Reacts to extra conditions (NPD, etc.) | To be implemented | Yes |
|
||||
| Check | Description | Status | Check Name |
|
||||
|------------------------------------|--------------------------------------------------|---------------------|--------------------|
|
||||
| **Node readiness (API)** | Queries `Node.status.conditions[Ready]` via Kubernetes API | **Implemented** | `node_ready` |
|
||||
| **OKD Router health** | Probes OpenShift router healthz on port 1936 | **Implemented** | `okd_router_1936` |
|
||||
| Filesystem readonly | Detects read-only mounts via `/proc/mounts` | To be implemented | `filesystem_ro` |
|
||||
| Kubelet running | Local probe to kubelet `/healthz` (port 10248) | To be implemented | `kubelet` |
|
||||
| CRI-O / container runtime health | Socket check + runtime status | To be implemented | `container_runtime`|
|
||||
| Disk / inode pressure | Threshold checks on key filesystems | To be implemented | `disk_pressure` |
|
||||
| Network reachability | DNS resolution + gateway connectivity | To be implemented | `network` |
|
||||
| Custom NodeConditions | Reacts to extra conditions (NPD, etc.) | To be implemented | `custom_conditions`|
|
||||
|
||||
All checks are combined with logical **AND** — any failure results in 503.
|
||||
|
||||
@@ -82,20 +83,108 @@ kubectl apply -f deploy/daemonset.yaml
|
||||
|
||||
## Health Endpoint Examples
|
||||
|
||||
**Healthy node**
|
||||
### Query Parameter
|
||||
|
||||
Use the `check` query parameter to specify which checks to run. Multiple checks can be comma-separated.
|
||||
|
||||
| Request | Behavior |
|
||||
|--------------------------------------|---------------------------------------------|
|
||||
| `GET /health` | Runs `node_ready` (default) |
|
||||
| `GET /health?check=okd_router_1936` | Runs only OKD router check |
|
||||
| `GET /health?check=node_ready,okd_router_1936` | Runs both checks |
|
||||
|
||||
**Note:** When the `check` parameter is provided, only the specified checks run. You must explicitly include `node_ready` if you want it along with other checks.
|
||||
|
||||
### Response Format
|
||||
|
||||
Each check result includes:
|
||||
- `name`: The check identifier
|
||||
- `passed`: Boolean indicating success or failure
|
||||
- `reason`: (Optional) Failure reason if the check failed
|
||||
- `duration_ms`: Time taken to execute the check in milliseconds
|
||||
|
||||
**Healthy node (default check)**
|
||||
```http
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: application/json
|
||||
|
||||
```
|
||||
```
|
||||
|
||||
**Healthy node (multiple checks)**
|
||||
```http
|
||||
GET /health?check=node_ready,okd_router_1936
|
||||
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: application/json
|
||||
|
||||
```
|
||||
|
||||
**Unhealthy node (one check failed)**
|
||||
```http
|
||||
```http
|
||||
GET /health?check=node_ready,okd_router_1936
|
||||
|
||||
HTTP/1.1 503 Service Unavailable
|
||||
Content-Type: application/json
|
||||
|
||||
```
|
||||
|
||||
**Unhealthy node (default check)**
|
||||
```http
|
||||
HTTP/1.1 503 Service Unavailable
|
||||
Content-Type: application/json
|
||||
|
||||
```
|
||||
|
||||
## Configuration (via DaemonSet env vars)
|
||||
|
||||
```yaml
|
||||
env:
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.name
|
||||
- name: LISTEN_PORT
|
||||
value: "25001"
|
||||
```
|
||||
|
||||
Checks are selected via the `check` query parameter on the `/health` endpoint. See the usage examples above.
|
||||
|
||||
## Development
|
||||
|
||||
```bash
|
||||
# Run locally (set NODE_NAME env var)
|
||||
NODE_NAME=my-test-node cargo run
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
*Minimal, auditable, and built for production bare-metal Kubernetes environments.*
|
||||
|
||||
"name": "okd_router_1936",
|
||||
"passed": false,
|
||||
"reason": "Failed to connect to OKD router: connection refused",
|
||||
"duration_ms": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
**Unhealthy node (default check)**
|
||||
```http
|
||||
HTTP/1.1 503 Service Unavailable
|
||||
Content-Type: application/json
|
||||
|
||||
|
||||
{
|
||||
"status": "not-ready",
|
||||
"checks": [
|
||||
{
|
||||
"name": "node_ready",
|
||||
"passed": false,
|
||||
"reason": "KubeletNotReady",
|
||||
"duration_ms": 35
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Configuration (via DaemonSet env vars)
|
||||
@@ -108,11 +197,9 @@ env:
|
||||
fieldPath: metadata.name
|
||||
- name: LISTEN_PORT
|
||||
value: "25001"
|
||||
```
|
||||
|
||||
```
|
||||
|
||||
## Development
|
||||
Checks are selected via the `check` query parameter on the `/health` endpoint. See the usage examples above.
|
||||
|
||||
## Development
|
||||
|
||||
|
||||
@@ -1,10 +1,13 @@
|
||||
#!/bin/bash
|
||||
|
||||
# TODO
|
||||
# This is meant to be run on a machine with harmony development tools installed (cargo, etc)
|
||||
|
||||
DOCKER_TAG="${DOCKER_TAG:-dev}"
|
||||
|
||||
cargo build --release
|
||||
|
||||
cp ../target/release/harmony-node-readiness-endpoint .
|
||||
|
||||
docker build . -t hub.nationtech.io/harmony/harmony-node-readiness-endpoint:latest
|
||||
docker build . -t hub.nationtech.io/harmony/harmony-node-readiness-endpoint:${DOCKER_TAG}
|
||||
|
||||
|
||||
Binary file not shown.
@@ -1,21 +1,27 @@
|
||||
use actix_web::{App, HttpResponse, HttpServer, Responder, get};
|
||||
use actix_web::{App, HttpResponse, HttpServer, Responder, get, web};
|
||||
use k8s_openapi::api::core::v1::Node;
|
||||
use kube::{Api, Client, Config};
|
||||
use log::{error, info, warn};
|
||||
use kube::{Api, Client};
|
||||
use log::{debug, error, info, warn};
|
||||
use reqwest;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::env;
|
||||
use std::time::Instant;
|
||||
use tokio::task::JoinSet;
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct HealthStatus {
|
||||
status: String,
|
||||
checks: HealthChecks,
|
||||
checks: Vec<CheckResult>,
|
||||
total_duration_ms: u128,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct HealthChecks {
|
||||
node_ready: bool,
|
||||
struct CheckResult {
|
||||
name: String,
|
||||
passed: bool,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
reason: Option<String>,
|
||||
duration_ms: u128,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -24,7 +30,13 @@ struct HealthError {
|
||||
error: String,
|
||||
}
|
||||
|
||||
/// Check if the node's Ready condition is true
|
||||
#[derive(Deserialize)]
|
||||
struct HealthQuery {
|
||||
#[serde(rename = "check")]
|
||||
checks: Option<String>,
|
||||
}
|
||||
|
||||
/// Check if the node's Ready condition is true via Kubernetes API
|
||||
async fn check_node_ready(client: Client, node_name: &str) -> Result<(), String> {
|
||||
let nodes: Api<Node> = Api::all(client);
|
||||
|
||||
@@ -53,8 +65,71 @@ async fn check_node_ready(client: Client, node_name: &str) -> Result<(), String>
|
||||
Err("Ready condition not found".to_string())
|
||||
}
|
||||
|
||||
/// Check OKD router health endpoint on port 1936
|
||||
async fn check_okd_router_1936() -> Result<(), String> {
|
||||
debug!("Checking okd router 1936");
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(std::time::Duration::from_secs(5))
|
||||
.build()
|
||||
.map_err(|e| format!("Failed to build HTTP client: {}", e))?;
|
||||
|
||||
let response = client
|
||||
.get("http://127.0.0.1:1936/healthz/ready")
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("Failed to connect to OKD router: {}", e))?;
|
||||
|
||||
debug!("okd router 1936 response status {}", response.status());
|
||||
|
||||
if response.status().is_success() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(format!("OKD router returned status: {}", response.status()))
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse the comma-separated `check` query parameter into a list of check names.
///
/// Each name is trimmed of surrounding whitespace. Blank entries (produced by a
/// trailing comma, a doubled comma, or a whitespace-only value) are dropped so
/// they are never reported as an unknown check.
///
/// Returns the default `["node_ready"]` when the parameter is absent or when no
/// non-blank name remains after filtering.
fn parse_checks(checks_param: Option<&str>) -> Vec<String> {
    let names: Vec<String> = checks_param
        .unwrap_or_default()
        .split(',')
        .map(str::trim)
        .filter(|name| !name.is_empty())
        .map(String::from)
        .collect();

    if names.is_empty() {
        vec!["node_ready".to_string()]
    } else {
        names
    }
}
|
||||
|
||||
/// Run a single health check by name and return the result
|
||||
async fn run_check(check_name: &str, client: Option<Client>, node_name: &str) -> CheckResult {
|
||||
let start = Instant::now();
|
||||
|
||||
let result = match check_name {
|
||||
"node_ready" => match client {
|
||||
Some(c) => check_node_ready(c, node_name).await,
|
||||
None => Err("Kubernetes client not available".to_string()),
|
||||
},
|
||||
"okd_router_1936" => check_okd_router_1936().await,
|
||||
_ => Err(format!("Unknown check: {}", check_name)),
|
||||
};
|
||||
|
||||
let duration_ms = start.elapsed().as_millis();
|
||||
|
||||
match result {
|
||||
Ok(()) => CheckResult {
|
||||
name: check_name.to_string(),
|
||||
passed: true,
|
||||
reason: None,
|
||||
duration_ms,
|
||||
},
|
||||
Err(reason) => CheckResult {
|
||||
name: check_name.to_string(),
|
||||
passed: false,
|
||||
reason: Some(reason),
|
||||
duration_ms,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
#[get("/health")]
|
||||
async fn health() -> impl Responder {
|
||||
async fn health(query: web::Query<HealthQuery>) -> impl Responder {
|
||||
let node_name = match env::var("NODE_NAME") {
|
||||
Ok(name) => name,
|
||||
Err(_) => {
|
||||
@@ -66,9 +141,16 @@ async fn health() -> impl Responder {
|
||||
}
|
||||
};
|
||||
|
||||
// Initialize Kubernetes client using in-cluster config
|
||||
let client = match Client::try_default().await {
|
||||
Ok(c) => c,
|
||||
// Parse requested checks from query parameter
|
||||
let requested_checks = parse_checks(query.checks.as_deref());
|
||||
|
||||
// Check if node_ready check requires Kubernetes client
|
||||
let needs_k8s_client = requested_checks.contains(&"node_ready".to_string());
|
||||
|
||||
// Initialize Kubernetes client only if needed
|
||||
let k8s_client = if needs_k8s_client {
|
||||
match Client::try_default().await {
|
||||
Ok(c) => Some(c),
|
||||
Err(e) => {
|
||||
error!("Failed to create Kubernetes client: {}", e);
|
||||
return HttpResponse::InternalServerError().json(HealthError {
|
||||
@@ -76,45 +158,57 @@ async fn health() -> impl Responder {
|
||||
error: format!("Failed to create Kubernetes client: {}", e),
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
// Check if node readiness check is enabled
|
||||
let check_node_enabled = env::var("CHECK_NODE_READY")
|
||||
.unwrap_or_else(|_| "true".to_string())
|
||||
.to_lowercase()
|
||||
!= "false";
|
||||
|
||||
if check_node_enabled {
|
||||
match check_node_ready(client, &node_name).await {
|
||||
Ok(()) => {
|
||||
info!("Node '{}' is ready", node_name);
|
||||
HttpResponse::Ok().json(HealthStatus {
|
||||
status: "ready".to_string(),
|
||||
checks: HealthChecks {
|
||||
node_ready: true,
|
||||
reason: None,
|
||||
},
|
||||
})
|
||||
}
|
||||
Err(reason) => {
|
||||
warn!("Node '{}' is not ready: {}", node_name, reason);
|
||||
HttpResponse::ServiceUnavailable().json(HealthStatus {
|
||||
status: "not-ready".to_string(),
|
||||
checks: HealthChecks {
|
||||
node_ready: false,
|
||||
reason: Some(reason),
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Node check disabled, return healthy
|
||||
None
|
||||
};
|
||||
|
||||
// Run all requested checks in parallel
|
||||
let start = Instant::now();
|
||||
let mut join_set = JoinSet::new();
|
||||
debug!("Running checks {requested_checks:?}");
|
||||
|
||||
for check_name in requested_checks {
|
||||
let client = k8s_client.clone();
|
||||
let node_name = node_name.clone();
|
||||
join_set.spawn(async move { run_check(&check_name, client, &node_name).await });
|
||||
}
|
||||
let mut check_results = Vec::new();
|
||||
while let Some(result) = join_set.join_next().await {
|
||||
match result {
|
||||
Ok(check) => check_results.push(check),
|
||||
Err(e) => error!("Check task failed: {}", e),
|
||||
}
|
||||
}
|
||||
let total_duration_ms = start.elapsed().as_millis();
|
||||
|
||||
// Determine overall status
|
||||
let all_passed = check_results.iter().all(|c| c.passed);
|
||||
|
||||
if all_passed {
|
||||
info!(
|
||||
"All health checks passed for node '{}' in {}ms",
|
||||
node_name, total_duration_ms
|
||||
);
|
||||
HttpResponse::Ok().json(HealthStatus {
|
||||
status: "ready".to_string(),
|
||||
checks: HealthChecks {
|
||||
node_ready: true,
|
||||
reason: None,
|
||||
},
|
||||
checks: check_results,
|
||||
total_duration_ms,
|
||||
})
|
||||
} else {
|
||||
let failed_checks: Vec<&str> = check_results
|
||||
.iter()
|
||||
.filter(|c| !c.passed)
|
||||
.map(|c| c.name.as_str())
|
||||
.collect();
|
||||
warn!(
|
||||
"Health checks failed for node '{}' in {}ms: {:?}",
|
||||
node_name, total_duration_ms, failed_checks
|
||||
);
|
||||
HttpResponse::ServiceUnavailable().json(HealthStatus {
|
||||
status: "not-ready".to_string(),
|
||||
checks: check_results,
|
||||
total_duration_ms,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user