Compare commits

...

13 Commits

Author SHA1 Message Date
d0a1a73710 doc: fix example code to use ignore instead of no_run
All checks were successful
Run Check Script / check (pull_request) Successful in 1m43s
- `no_run` fails because it cannot be used at module level
- Use `ignore` to skip doc compilation while keeping example visible
2026-03-07 17:30:24 -05:00
bc2b328296 okd: include workers in load balancer backend pool + add tests and docs
Some checks failed
Run Check Script / check (pull_request) Failing after 24s
- Add nodes_to_backend_server() function to include both control plane and worker nodes
- Update public services (ports 80, 443) to use worker-inclusive backend pool
- Add comprehensive tests covering all backend configurations
- Add documentation with OKD reference link and usage examples
2026-03-07 17:15:24 -05:00
a93896707f okd: add worker nodes to load balancer backend pool
All checks were successful
Run Check Script / check (pull_request) Successful in 1m29s
Include both control plane and worker nodes in ports 80 and 443 backend pools
2026-03-07 16:46:47 -05:00
0e9b23a320 Merge branch 'feat/change-node-readiness-strategy'
Some checks failed
Run Check Script / check (push) Successful in 1m26s
Compile and package harmony_composer / package_harmony_composer (push) Failing after 2m11s
2026-03-07 16:35:14 -05:00
f532ba2b40 doc: Update node readiness readme and deployed port to 25001
All checks were successful
Run Check Script / check (pull_request) Successful in 1m27s
2026-03-07 16:33:28 -05:00
fafca31798 fix: formatting and check script
All checks were successful
Run Check Script / check (pull_request) Successful in 1m28s
2026-03-07 16:08:52 -05:00
5412c34957 Merge pull request 'fix: change vlan definition from MaybeString to RawXml' (#245) from feat/opnsense-config-xml-support-vlan into master
Some checks failed
Run Check Script / check (push) Successful in 1m47s
Compile and package harmony_composer / package_harmony_composer (push) Failing after 2m7s
Reviewed-on: #245
2026-03-07 20:59:28 +00:00
55de206523 fix: change vlan definition from MaybeString to RawXml
All checks were successful
Run Check Script / check (pull_request) Successful in 1m29s
2026-03-07 10:03:03 -05:00
64893a84f5 fix(node health endpoint): Setup sane timeouts for usage as a load balancer health check. The default k8s client timeout of 30 seconds caused haproxy health check to fail even though we still returned 200OK after 30 seconds
Some checks failed
Run Check Script / check (pull_request) Failing after 25s
2026-03-06 16:28:13 -05:00
f941672662 fix: Node readiness always fails open when kube api call fails on node status check
Some checks failed
Run Check Script / check (pull_request) Failing after 1m54s
2026-03-06 15:45:38 -05:00
5db1a31d33 ... 2026-03-06 15:24:33 -05:00
d7e5bf11d5 removing bad stuff I did this morning and trying to make it simple, and adding a couple tests 2026-03-06 14:41:08 -05:00
2b157ad7fd feat: add a background loop checking the node status every X seconds. If NotReady for Y seconds, kill the router pod if there's one 2026-03-06 11:57:39 -05:00
9 changed files with 425 additions and 145 deletions

6
Cargo.lock generated
View File

@@ -2779,6 +2779,7 @@ name = "harmony-node-readiness-endpoint"
version = "0.1.0"
dependencies = [
"actix-web",
"chrono",
"env_logger",
"k8s-openapi",
"kube",
@@ -2787,6 +2788,7 @@ dependencies = [
"serde",
"serde_json",
"tokio",
"tower",
]
[[package]]
@@ -6915,9 +6917,9 @@ checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801"
[[package]]
name = "tower"
version = "0.5.2"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4"
dependencies = [
"futures-core",
"futures-util",

View File

@@ -2,7 +2,6 @@
resolver = "2"
members = [
"private_repos/*",
"examples/*",
"harmony",
"harmony_types",
"harmony_macros",
@@ -20,6 +19,7 @@ members = [
"brocade",
"harmony_agent",
"harmony_agent/deploy", "harmony_node_readiness",
"examples/*",
]
[workspace.package]

View File

@@ -8,7 +8,7 @@ use crate::{
score::Score,
topology::{
BackendServer, HAClusterTopology, HealthCheck, HttpMethod, HttpStatusCode, LoadBalancer,
LoadBalancerService, SSL, Topology,
LoadBalancerService, LogicalHost, Router, SSL, Topology,
},
};
@@ -23,17 +23,45 @@ pub struct OKDLoadBalancerScore {
load_balancer_score: LoadBalancerScore,
}
/// OKD Load Balancer Score configuration
///
/// This module configures the load balancer for OKD (OpenShift Kubernetes Distribution)
/// bare metal installations.
///
/// # Backend Server Configuration
///
/// For ports 80 and 443 (ingress traffic), the load balancer includes both control plane
/// and worker nodes in the backend pool. This is consistent with OKD's requirement that
/// ingress traffic should be load balanced across all nodes that may run ingress router pods.
///
/// For ports 22623 (Ignition API) and 6443 (Kubernetes API), only control plane nodes
/// are included as backends, as these services are control plane specific.
///
/// # References
///
/// - [OKD Bare Metal Installation - External Load Balancer Configuration]
/// (<https://docs.okd.io/latest/installing/installing_bare_metal/ipi/ipi-install-installation-workflow.html#nw-osp-configuring-external-load-balancer_ipi-install-installation-workflow>)
///
/// # Example
///
/// ```ignore
/// use harmony::topology::HAClusterTopology;
/// use harmony::modules::okd::OKDLoadBalancerScore;
///
/// let topology: HAClusterTopology = /* get topology from your infrastructure */;
/// let score = OKDLoadBalancerScore::new(&topology);
/// ```
impl OKDLoadBalancerScore {
pub fn new(topology: &HAClusterTopology) -> Self {
let public_ip = topology.router.get_gateway();
let public_services = vec![
LoadBalancerService {
backend_servers: Self::control_plane_to_backend_server(topology, 80),
backend_servers: Self::nodes_to_backend_server(topology, 80),
listening_port: SocketAddr::new(public_ip, 80),
health_check: Some(HealthCheck::TCP(None)),
},
LoadBalancerService {
backend_servers: Self::control_plane_to_backend_server(topology, 443),
backend_servers: Self::nodes_to_backend_server(topology, 443),
listening_port: SocketAddr::new(public_ip, 443),
health_check: Some(HealthCheck::TCP(None)),
},
@@ -41,12 +69,12 @@ impl OKDLoadBalancerScore {
let private_services = vec![
LoadBalancerService {
backend_servers: Self::control_plane_to_backend_server(topology, 80),
backend_servers: Self::nodes_to_backend_server(topology, 80),
listening_port: SocketAddr::new(public_ip, 80),
health_check: Some(HealthCheck::TCP(None)),
},
LoadBalancerService {
backend_servers: Self::control_plane_to_backend_server(topology, 443),
backend_servers: Self::nodes_to_backend_server(topology, 443),
listening_port: SocketAddr::new(public_ip, 443),
health_check: Some(HealthCheck::TCP(None)),
},
@@ -74,6 +102,11 @@ impl OKDLoadBalancerScore {
}
}
/// Creates backend servers list for control plane nodes only
///
/// Use this for control plane-specific services like:
/// - Port 22623: Ignition API (machine configuration during bootstrap)
/// - Port 6443: Kubernetes API server
fn control_plane_to_backend_server(
topology: &HAClusterTopology,
port: u16,
@@ -87,6 +120,216 @@ impl OKDLoadBalancerScore {
})
.collect()
}
/// Creates backend servers list for all nodes (control plane + workers)
///
/// Use this for ingress traffic that should be distributed across all nodes:
/// - Port 80: HTTP ingress traffic
/// - Port 443: HTTPS ingress traffic
///
/// In OKD, ingress router pods can run on any node, so both control plane
/// and worker nodes should be included in the load balancer backend pool.
fn nodes_to_backend_server(topology: &HAClusterTopology, port: u16) -> Vec<BackendServer> {
    // Control plane hosts first, then workers — same ordering as the
    // original push loops, expressed as a single iterator pipeline.
    topology
        .control_plane
        .iter()
        .chain(topology.workers.iter())
        .map(|host| BackendServer {
            address: host.ip.to_string(),
            port,
        })
        .collect()
}
}
#[cfg(test)]
mod tests {
use std::sync::{Arc, OnceLock};
use super::*;
use crate::topology::DummyInfra;
use harmony_macros::ip;
use harmony_types::net::IpAddress;
// Builds a fixture topology with 3 control plane nodes (192.168.1.10-12)
// and 2 worker nodes (192.168.1.20-21). All infra dependencies are
// `DummyInfra`; the router is the local `DummyRouter` below so that
// `OKDLoadBalancerScore::new` can call `get_gateway()`.
fn create_test_topology() -> HAClusterTopology {
let router = Arc::new(DummyRouter {
gateway: ip!("192.168.1.1"),
});
HAClusterTopology {
domain_name: "test.example.com".to_string(),
router,
load_balancer: Arc::new(DummyInfra),
firewall: Arc::new(DummyInfra),
dhcp_server: Arc::new(DummyInfra),
tftp_server: Arc::new(DummyInfra),
http_server: Arc::new(DummyInfra),
dns_server: Arc::new(DummyInfra),
node_exporter: Arc::new(DummyInfra),
switch_client: Arc::new(DummyInfra),
bootstrap_host: LogicalHost {
ip: ip!("192.168.1.100"),
name: "bootstrap".to_string(),
},
control_plane: vec![
LogicalHost {
ip: ip!("192.168.1.10"),
name: "control-plane-0".to_string(),
},
LogicalHost {
ip: ip!("192.168.1.11"),
name: "control-plane-1".to_string(),
},
LogicalHost {
ip: ip!("192.168.1.12"),
name: "control-plane-2".to_string(),
},
],
workers: vec![
LogicalHost {
ip: ip!("192.168.1.20"),
name: "worker-0".to_string(),
},
LogicalHost {
ip: ip!("192.168.1.21"),
name: "worker-1".to_string(),
},
],
kubeconfig: None,
network_manager: OnceLock::new(),
}
}
// Minimal Router implementation returning a fixed gateway address.
struct DummyRouter {
gateway: IpAddress,
}
impl Router for DummyRouter {
fn get_gateway(&self) -> IpAddress {
self.gateway
}
// Returns a /24 around the gateway; the tests are IPv4-only, so an
// IPv6 gateway is treated as a fixture misconfiguration and panics.
fn get_cidr(&self) -> cidr::Ipv4Cidr {
let ipv4 = match self.gateway {
IpAddress::V4(ip) => ip,
IpAddress::V6(_) => panic!("IPv6 not supported"),
};
cidr::Ipv4Cidr::new(ipv4, 24).unwrap()
}
fn get_host(&self) -> LogicalHost {
LogicalHost {
ip: self.gateway,
name: "router".to_string(),
}
}
}
// The worker-inclusive helper must yield all 5 nodes (3 CP + 2 workers).
#[test]
fn test_nodes_to_backend_server_includes_control_plane_and_workers() {
let topology = create_test_topology();
let backend_servers = OKDLoadBalancerScore::nodes_to_backend_server(&topology, 80);
assert_eq!(backend_servers.len(), 5);
let addresses: Vec<&str> = backend_servers.iter().map(|s| s.address.as_str()).collect();
assert!(addresses.contains(&"192.168.1.10"));
assert!(addresses.contains(&"192.168.1.11"));
assert!(addresses.contains(&"192.168.1.12"));
assert!(addresses.contains(&"192.168.1.20"));
assert!(addresses.contains(&"192.168.1.21"));
}
// The control-plane-only helper must exclude the two worker addresses.
#[test]
fn test_control_plane_to_backend_server_only_includes_control_plane() {
let topology = create_test_topology();
let backend_servers = OKDLoadBalancerScore::control_plane_to_backend_server(&topology, 80);
assert_eq!(backend_servers.len(), 3);
let addresses: Vec<&str> = backend_servers.iter().map(|s| s.address.as_str()).collect();
assert!(addresses.contains(&"192.168.1.10"));
assert!(addresses.contains(&"192.168.1.11"));
assert!(addresses.contains(&"192.168.1.12"));
assert!(!addresses.contains(&"192.168.1.20"));
assert!(!addresses.contains(&"192.168.1.21"));
}
// Ingress ports 80/443 in the public service list must use the full
// 5-node (worker-inclusive) backend pool.
#[test]
fn test_public_services_include_all_nodes_on_port_80_and_443() {
let topology = create_test_topology();
let score = OKDLoadBalancerScore::new(&topology);
let public_service_80 = score
.load_balancer_score
.public_services
.iter()
.find(|s| s.listening_port.port() == 80)
.expect("Public service on port 80 not found");
let public_service_443 = score
.load_balancer_score
.public_services
.iter()
.find(|s| s.listening_port.port() == 443)
.expect("Public service on port 443 not found");
assert_eq!(public_service_80.backend_servers.len(), 5);
assert_eq!(public_service_443.backend_servers.len(), 5);
}
// Ignition API (22623) is control-plane specific: exactly 3 backends.
#[test]
fn test_private_service_port_22623_only_control_plane() {
let topology = create_test_topology();
let score = OKDLoadBalancerScore::new(&topology);
let private_service_22623 = score
.load_balancer_score
.private_services
.iter()
.find(|s| s.listening_port.port() == 22623)
.expect("Private service on port 22623 not found");
assert_eq!(private_service_22623.backend_servers.len(), 3);
}
// Kubernetes API (6443) is control-plane specific (3 backends) and is
// expected to carry an HTTP health check rather than a plain TCP one.
#[test]
fn test_private_service_port_6443_only_control_plane() {
let topology = create_test_topology();
let score = OKDLoadBalancerScore::new(&topology);
let private_service_6443 = score
.load_balancer_score
.private_services
.iter()
.find(|s| s.listening_port.port() == 6443)
.expect("Private service on port 6443 not found");
assert_eq!(private_service_6443.backend_servers.len(), 3);
assert!(
matches!(
private_service_6443.health_check,
Some(HealthCheck::HTTP(_, _, _, _))
),
"Expected HTTP health check for port 6443"
);
}
// Every backend entry must carry the port the helper was asked for.
#[test]
fn test_all_backend_servers_have_correct_port() {
let topology = create_test_topology();
let backend_servers = OKDLoadBalancerScore::nodes_to_backend_server(&topology, 443);
for server in backend_servers {
assert_eq!(server.port, 443);
}
}
}
impl<T: Topology + LoadBalancer> Score<T> for OKDLoadBalancerScore {

View File

@@ -13,3 +13,5 @@ env_logger.workspace = true
log.workspace = true
tokio.workspace = true
reqwest.workspace = true
chrono.workspace = true
tower = "0.5.3"

View File

@@ -4,10 +4,11 @@
Designed for **bare-metal Kubernetes clusters** with external load balancers (HAProxy, OPNsense, F5, etc.).
It exposes a simple, reliable HTTP endpoint (`/health`) on each node that returns:
Exposes a simple HTTP endpoint (`/health`) on each node:
- **200 OK** — node is healthy and ready to receive traffic
- **503 Service Unavailable** — node should be removed from the load balancer pool
- **500 Internal Server Error** — misconfiguration (e.g. `NODE_NAME` not set)
This project is **not dependent on Harmony**, but is commonly used as part of Harmony bare-metal Kubernetes deployments.
@@ -16,199 +17,181 @@ This project is **not dependent on Harmony**, but is commonly used as part of Ha
In bare-metal environments, external load balancers often rely on pod-level or router-level checks that can lag behind the authoritative Kubernetes `Node.status.conditions[Ready]`.
This service provides the true source-of-truth with fast reaction time.
## Features & Roadmap
## Available checks
| Check | Description | Status | Check Name |
|------------------------------------|--------------------------------------------------|---------------------|--------------------|
| **Node readiness (API)** | Queries `Node.status.conditions[Ready]` via Kubernetes API | **Implemented** | `node_ready` |
| **OKD Router health** | Probes OpenShift router healthz on port 1936 | **Implemented** | `okd_router_1936` |
| Filesystem readonly | Detects read-only mounts via `/proc/mounts` | To be implemented | `filesystem_ro` |
| Kubelet running | Local probe to kubelet `/healthz` (port 10248) | To be implemented | `kubelet` |
| CRI-O / container runtime health | Socket check + runtime status | To be implemented | `container_runtime`|
| Disk / inode pressure | Threshold checks on key filesystems | To be implemented | `disk_pressure` |
| Network reachability | DNS resolution + gateway connectivity | To be implemented | `network` |
| Custom NodeConditions | Reacts to extra conditions (NPD, etc.) | To be implemented | `custom_conditions`|
| Check name | Description | Status |
|--------------------|-------------------------------------------------------------|-------------------|
| `node_ready` | Queries `Node.status.conditions[Ready]` via Kubernetes API | Implemented |
| `okd_router_1936` | Probes OpenShift router `/healthz/ready` on port 1936 | Implemented |
| `filesystem_ro` | Detects read-only mounts via `/proc/mounts` | To be implemented |
| `kubelet` | Local probe to kubelet `/healthz` (port 10248) | To be implemented |
| `container_runtime`| Socket check + runtime status | To be implemented |
| `disk_pressure` | Threshold checks on key filesystems | To be implemented |
| `network` | DNS resolution + gateway connectivity | To be implemented |
| `custom_conditions`| Reacts to extra conditions (NPD, etc.) | To be implemented |
All checks are combined with logical **AND** — any failure results in 503.
All checks are combined with logical **AND** — any single failure results in 503.
## Behavior
### `node_ready` check — fail-open design
The `node_ready` check queries the Kubernetes API server to read `Node.status.conditions[Ready]`.
Because this service runs on the node it is checking, there are scenarios where the API server is temporarily
unreachable (e.g. during a control-plane restart). To avoid incorrectly draining a healthy node in such cases,
the check is **fail-open**: it passes (reports ready) whenever the Kubernetes API is unavailable.
| Situation | Result | HTTP status |
|------------------------------------------------------|-------------------|-------------|
| `Node.conditions[Ready] == True` | Pass | 200 |
| `Node.conditions[Ready] == False` | Fail | 503 |
| `Ready` condition absent | Fail | 503 |
| API server unreachable or timed out (1 s timeout) | Pass (assumes ready) | 200 |
| Kubernetes client initialization failed | Pass (assumes ready) | 200 |
| `NODE_NAME` env var not set | Hard error | 500 |
A warning is logged whenever the API is unavailable and the check falls back to assuming ready.
### `okd_router_1936` check
Sends `GET http://127.0.0.1:1936/healthz/ready` with a 5-second timeout.
Returns pass on any 2xx response, fail otherwise.
### Unknown check names
Requesting an unknown check name (e.g. `check=bogus`) results in that check returning `passed: false`
with reason `"Unknown check: bogus"`, and the overall response is 503.
## How it works
### Node Name Discovery
The service automatically discovers its own node name using the **Kubernetes Downward API**:
### Node name discovery
The service reads the `NODE_NAME` environment variable, which must be injected via the Kubernetes Downward API:
```yaml
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
fieldPath: spec.nodeName
```
### Kubernetes API Authentication
### Kubernetes API authentication
- Uses standard **in-cluster configuration** (no external credentials needed).
- The ServiceAccount token and CA certificate are automatically mounted by Kubernetes at `/var/run/secrets/kubernetes.io/serviceaccount/`.
- The application (via `kube-rs` or your Harmony higher-level client) calls the equivalent of `Config::incluster_config()`.
- Requires only minimal RBAC: `get` permission on the `nodes` resource (see `deploy/rbac.yaml`).
- Uses standard **in-cluster configuration** — no external credentials needed.
- The ServiceAccount token and CA certificate are automatically mounted at `/var/run/secrets/kubernetes.io/serviceaccount/`.
- Requires only minimal RBAC: `get` and `list` on the `nodes` resource (see `deploy/resources.yaml`).
- Connect and write timeouts are set to **1 second** to keep checks fast.
## Quick Start
## Deploy
All Kubernetes resources (Namespace, ServiceAccount, ClusterRole, ClusterRoleBinding, and an OpenShift SCC RoleBinding for `hostnetwork`) are in a single file.
### 1. Build and push
```bash
cargo build --release --bin harmony-node-readiness-endpoint
docker build -t your-registry/harmony-node-readiness-endpoint:v1.0.0 .
docker push your-registry/harmony-node-readiness-endpoint:v1.0.0
```
### 2. Deploy
```bash
kubectl apply -f deploy/namespace.yaml
kubectl apply -f deploy/rbac.yaml
kubectl apply -f deploy/resources.yaml
kubectl apply -f deploy/daemonset.yaml
```
(The DaemonSet uses `hostPort: 25001` by default so the endpoint is reachable directly on the node's IP.)
The DaemonSet uses `hostNetwork: true` and `hostPort: 25001`, so the endpoint is reachable directly on the node's IP at port 25001.
It tolerates all taints, ensuring it runs even on nodes marked unschedulable.
### 3. Configure your external load balancer
### Configure your external load balancer
**Example for HAProxy / OPNsense:**
- Check type: **HTTP**
- URI: `/health`
- Port: `25001` (configurable via `LISTEN_PORT`)
- Port: `25001` (configurable via `LISTEN_PORT` env var)
- Interval: 5–10 s
- Rise: 2
- Fall: 3
- Expect: `2xx`
## Health Endpoint Examples
## Endpoint usage
### Query Parameter
### Query parameter
Use the `check` query parameter to specify which checks to run. Multiple checks can be comma-separated.
Use the `check` query parameter to select which checks to run (comma-separated).
When omitted, only `node_ready` runs.
| Request | Behavior |
|--------------------------------------|---------------------------------------------|
| `GET /health` | Runs `node_ready` (default) |
| `GET /health?check=okd_router_1936` | Runs only OKD router check |
| `GET /health?check=node_ready,okd_router_1936` | Runs both checks |
| Request | Checks run |
|------------------------------------------------|-----------------------------------|
| `GET /health` | `node_ready` |
| `GET /health?check=okd_router_1936` | `okd_router_1936` only |
| `GET /health?check=node_ready,okd_router_1936` | `node_ready` and `okd_router_1936`|
**Note:** When the `check` parameter is provided, only the specified checks run. You must explicitly include `node_ready` if you want it along with other checks.
> **Note:** specifying `check=` replaces the default. Include `node_ready` explicitly if you need it alongside other checks.
### Response Format
Each check result includes:
- `name`: The check identifier
- `passed`: Boolean indicating success or failure
- `reason`: (Optional) Failure reason if the check failed
- `duration_ms`: Time taken to execute the check in milliseconds
**Healthy node (default check)**
```http
HTTP/1.1 200 OK
Content-Type: application/json
### Response format
```json
{
"status": "ready" | "not-ready",
"checks": [
{
GET /health?check=node_ready,okd_router_1936
"name": "<check-name>",
"passed": true | false,
"reason": "<failure reason, omitted on success>",
"duration_ms": 42
}
],
"total_duration_ms": 42
}
```
```http
**Healthy node (default)**
```http
HTTP/1.1 503 Service Unavailable
HTTP/1.1 200 OK
{
"status": "ready",
```http
HTTP/1.1 503 Service Unavailable
Content-Type: application/json
```
## Configuration (via DaemonSet env vars)
```yaml
env:
- name: NODE_NAME
valueFrom:
"checks": [{ "name": "node_ready", "passed": true, "duration_ms": 42 }],
"total_duration_ms": 42
}
```
value: "25001"
**Unhealthy node**
```http
Checks are selected via the `check` query parameter on the `/health` endpoint. See the usage examples above.
HTTP/1.1 503 Service Unavailable
## Development
{
"status": "not-ready",
"checks": [
```
---
*Minimal, auditable, and built for production bare-metal Kubernetes environments.*
"name": "okd_router_1936",
"passed": false,
"reason": "Failed to connect to OKD router: connection refused",
"duration_ms": 5
}
]
{ "name": "node_ready", "passed": false, "reason": "KubeletNotReady", "duration_ms": 35 }
],
"total_duration_ms": 35
}
```
**Unhealthy node (default check)**
**API server unreachable (fail-open)**
```http
HTTP/1.1 503 Service Unavailable
Content-Type: application/json
HTTP/1.1 200 OK
{
"status": "not-ready",
"checks": [
{
"name": "node_ready",
"passed": false,
"reason": "KubeletNotReady",
"duration_ms": 35
}
]
"status": "ready",
"checks": [{ "name": "node_ready", "passed": true, "duration_ms": 1001 }],
"total_duration_ms": 1001
}
```
*(A warning is logged: `Kubernetes API appears to be down … Assuming node is ready.`)*
## Configuration (via DaemonSet env vars)
## Configuration
```yaml
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: LISTEN_PORT
value: "25001"
```
Checks are selected via the `check` query parameter on the `/health` endpoint. See the usage examples above.
| Env var | Default | Description |
|---------------|----------|--------------------------------------|
| `NODE_NAME` | required | Node name, injected via Downward API |
| `LISTEN_PORT` | `25001` | TCP port the HTTP server binds to |
| `RUST_LOG` | — | Log level (e.g. `info`, `debug`) |
## Development
```bash
# Run locally (set NODE_NAME env var)
# Run locally
NODE_NAME=my-test-node cargo run
# Run tests
cargo test
```
---
*Minimal, auditable, and built for production bare-metal Kubernetes environments.*

0
harmony_node_readiness/build-docker.sh Normal file → Executable file
View File

View File

@@ -27,8 +27,8 @@ spec:
fieldRef:
fieldPath: spec.nodeName
ports:
- containerPort: 8080
hostPort: 8080
- containerPort: 25001
hostPort: 25001
name: health-port
resources:
requests:

View File

@@ -1,13 +1,16 @@
use actix_web::{App, HttpResponse, HttpServer, Responder, get, web};
use k8s_openapi::api::core::v1::Node;
use kube::{Api, Client};
use kube::{Api, Client, Config};
use log::{debug, error, info, warn};
use reqwest;
use serde::{Deserialize, Serialize};
use std::env;
use std::time::Instant;
use std::time::{Duration, Instant};
use tokio::task::JoinSet;
const K8S_CLIENT_TIMEOUT: Duration = Duration::from_secs(1);
#[derive(Serialize, Deserialize)]
struct HealthStatus {
status: String,
@@ -40,10 +43,16 @@ struct HealthQuery {
async fn check_node_ready(client: Client, node_name: &str) -> Result<(), String> {
let nodes: Api<Node> = Api::all(client);
let node = nodes
.get(node_name)
.await
.map_err(|e| format!("Failed to get node '{}': {}", node_name, e))?;
let node = match nodes.get(node_name).await {
Ok(n) => n,
Err(e) => {
warn!(
"Kubernetes API appears to be down, unreachable, or timed out for node '{}': {}. Assuming node is ready.",
node_name, e
);
return Ok(());
}
};
let conditions = node.status.and_then(|s| s.conditions).unwrap_or_default();
@@ -104,7 +113,13 @@ async fn run_check(check_name: &str, client: Option<Client>, node_name: &str) ->
let result = match check_name {
"node_ready" => match client {
Some(c) => check_node_ready(c, node_name).await,
None => Err("Kubernetes client not available".to_string()),
None => {
warn!(
"Kubernetes client not available for node '{}'. Assuming node is ready.",
node_name
);
Ok(())
}
},
"okd_router_1936" => check_okd_router_1936().await,
_ => Err(format!("Unknown check: {}", check_name)),
@@ -149,16 +164,30 @@ async fn health(query: web::Query<HealthQuery>) -> impl Responder {
// Initialize Kubernetes client only if needed
let k8s_client = if needs_k8s_client {
match Client::try_default().await {
Ok(c) => Some(c),
match Config::infer().await {
Ok(mut config) => {
config.write_timeout = Some(K8S_CLIENT_TIMEOUT);
config.connect_timeout = Some(K8S_CLIENT_TIMEOUT);
Some(Client::try_from(config).map_err(|e| e.to_string()))
}
Err(e) => {
error!("Failed to create Kubernetes client: {}", e);
return HttpResponse::InternalServerError().json(HealthError {
status: "error".to_string(),
error: format!("Failed to create Kubernetes client: {}", e),
});
warn!(
"Failed to infer Kubernetes config for node '{}': {}. Assuming node_ready is healthy.",
node_name, e
);
None
}
}
.and_then(|result| match result {
Ok(client) => Some(client),
Err(e) => {
warn!(
"Failed to create Kubernetes client for node '{}': {}. Assuming node_ready is healthy.",
node_name, e
);
None
}
})
} else {
None
};
@@ -226,7 +255,28 @@ async fn main() -> std::io::Result<()> {
info!("Starting harmony-node-readiness-endpoint on {}", bind_addr);
HttpServer::new(|| App::new().service(health))
.workers(3)
.bind(&bind_addr)?
.run()
.await
}
#[cfg(test)]
mod tests {
    use super::*;

    // When no `check` query value is supplied (or it is empty), the
    // service defaults to running only the `node_ready` check.
    #[test]
    fn parse_checks_defaults_to_node_ready() {
        assert_eq!(parse_checks(None), vec!["node_ready"]);
        assert_eq!(parse_checks(Some("")), vec!["node_ready"]);
    }

    // Comma-separated check names are split and surrounding whitespace
    // on each entry is trimmed.
    #[test]
    fn parse_checks_splits_and_trims_values() {
        assert_eq!(
            parse_checks(Some("node_ready, okd_router_1936 ")),
            vec!["node_ready", "okd_router_1936"]
        );
    }
}

View File

@@ -1540,7 +1540,7 @@ pub struct Dyndns {
pub struct Vlans {
#[yaserde(attribute = true)]
pub version: String,
pub vlan: MaybeString,
pub vlan: RawXml,
}
#[derive(Default, PartialEq, Debug, YaSerialize, YaDeserialize)]