Compare commits
1 Commits
a3158d637f
...
feat/impro
| Author | SHA1 | Date | |
|---|---|---|---|
| 063a4d4f5c |
15
Cargo.lock
generated
15
Cargo.lock
generated
@@ -2695,21 +2695,6 @@ dependencies = [
|
|||||||
"walkdir",
|
"walkdir",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "harmony-node-readiness-endpoint"
|
|
||||||
version = "0.1.0"
|
|
||||||
dependencies = [
|
|
||||||
"actix-web",
|
|
||||||
"env_logger",
|
|
||||||
"k8s-openapi",
|
|
||||||
"kube",
|
|
||||||
"log",
|
|
||||||
"reqwest 0.12.23",
|
|
||||||
"serde",
|
|
||||||
"serde_json",
|
|
||||||
"tokio",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "harmony_agent"
|
name = "harmony_agent"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ members = [
|
|||||||
"adr/agent_discovery/mdns",
|
"adr/agent_discovery/mdns",
|
||||||
"brocade",
|
"brocade",
|
||||||
"harmony_agent",
|
"harmony_agent",
|
||||||
"harmony_agent/deploy", "harmony_node_readiness",
|
"harmony_agent/deploy",
|
||||||
]
|
]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ async fn main() {
|
|||||||
role: HostRole::Worker,
|
role: HostRole::Worker,
|
||||||
number_desired_hosts: 3,
|
number_desired_hosts: 3,
|
||||||
discovery_strategy: HarmonyDiscoveryStrategy::SUBNET {
|
discovery_strategy: HarmonyDiscoveryStrategy::SUBNET {
|
||||||
cidr: cidrv4!("192.168.0.1/25"),
|
cidr: cidrv4!("192.168.2.0/24"),
|
||||||
port: 25000,
|
port: 25000,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
@@ -20,7 +20,7 @@ async fn main() {
|
|||||||
role: HostRole::ControlPlane,
|
role: HostRole::ControlPlane,
|
||||||
number_desired_hosts: 3,
|
number_desired_hosts: 3,
|
||||||
discovery_strategy: HarmonyDiscoveryStrategy::SUBNET {
|
discovery_strategy: HarmonyDiscoveryStrategy::SUBNET {
|
||||||
cidr: cidrv4!("192.168.0.1/25"),
|
cidr: cidrv4!("192.168.2.0/24"),
|
||||||
port: 25000,
|
port: 25000,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
@@ -28,7 +28,8 @@ async fn main() {
|
|||||||
harmony_cli::run(
|
harmony_cli::run(
|
||||||
Inventory::autoload(),
|
Inventory::autoload(),
|
||||||
LocalhostTopology::new(),
|
LocalhostTopology::new(),
|
||||||
vec![Box::new(discover_worker), Box::new(discover_control_plane)],
|
vec![Box::new(discover_worker)],
|
||||||
|
//vec![Box::new(discover_worker), Box::new(discover_control_plane)],
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ impl<T: Topology> Interpret<T> for DiscoverHostForRoleInterpret {
|
|||||||
topology: &T,
|
topology: &T,
|
||||||
) -> Result<Outcome, InterpretError> {
|
) -> Result<Outcome, InterpretError> {
|
||||||
info!(
|
info!(
|
||||||
"Launching discovery agent, make sure that your nodes are successfully PXE booted and running inventory agent. They should answer on `http://<node_ip>:8080/inventory`"
|
"Launching discovery agent, make sure that your nodes are successfully PXE booted and running inventory agent. They should answer on `http://<node_ip>:25000/inventory`"
|
||||||
);
|
);
|
||||||
LaunchDiscoverInventoryAgentScore {
|
LaunchDiscoverInventoryAgentScore {
|
||||||
discovery_timeout: None,
|
discovery_timeout: None,
|
||||||
@@ -58,6 +58,8 @@ impl<T: Topology> Interpret<T> for DiscoverHostForRoleInterpret {
|
|||||||
let host_repo = InventoryRepositoryFactory::build().await?;
|
let host_repo = InventoryRepositoryFactory::build().await?;
|
||||||
|
|
||||||
let mut assigned_hosts = 0;
|
let mut assigned_hosts = 0;
|
||||||
|
// let hosts_for_role = host_repo.get_hosts_for_role(&self.score.role);
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
let all_hosts = host_repo.get_all_hosts().await?;
|
let all_hosts = host_repo.get_all_hosts().await?;
|
||||||
|
|
||||||
|
|||||||
@@ -1,15 +0,0 @@
|
|||||||
[package]
|
|
||||||
name = "harmony-node-readiness-endpoint"
|
|
||||||
version = "0.1.0"
|
|
||||||
edition = "2024"
|
|
||||||
|
|
||||||
[dependencies]
|
|
||||||
actix-web = "4"
|
|
||||||
kube.workspace = true
|
|
||||||
k8s-openapi.workspace = true
|
|
||||||
serde.workspace = true
|
|
||||||
serde_json.workspace = true
|
|
||||||
env_logger.workspace = true
|
|
||||||
log.workspace = true
|
|
||||||
tokio.workspace = true
|
|
||||||
reqwest.workspace = true
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
FROM debian:13-slim
|
|
||||||
|
|
||||||
# RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
||||||
# ca-certificates \
|
|
||||||
# && rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
COPY harmony-node-readiness-endpoint /usr/local/bin/harmony-node-readiness-endpoint
|
|
||||||
|
|
||||||
ENV RUST_LOG=info
|
|
||||||
|
|
||||||
EXPOSE 25001
|
|
||||||
|
|
||||||
CMD ["harmony-node-readiness-endpoint"]
|
|
||||||
@@ -1,214 +0,0 @@
|
|||||||
# harmony-node-readiness-endpoint
|
|
||||||
|
|
||||||
**A lightweight, standalone Rust service for Kubernetes node health checking.**
|
|
||||||
|
|
||||||
Designed for **bare-metal Kubernetes clusters** with external load balancers (HAProxy, OPNsense, F5, etc.).
|
|
||||||
|
|
||||||
It exposes a simple, reliable HTTP endpoint (`/health`) on each node that returns:
|
|
||||||
|
|
||||||
- **200 OK** — node is healthy and ready to receive traffic
|
|
||||||
- **503 Service Unavailable** — node should be removed from the load balancer pool
|
|
||||||
|
|
||||||
This project is **not dependent on Harmony**, but is commonly used as part of Harmony bare-metal Kubernetes deployments.
|
|
||||||
|
|
||||||
## Why this project exists
|
|
||||||
|
|
||||||
In bare-metal environments, external load balancers often rely on pod-level or router-level checks that can lag behind the authoritative Kubernetes `Node.status.conditions[Ready]`.
|
|
||||||
This service provides the true source-of-truth with fast reaction time.
|
|
||||||
|
|
||||||
## Features & Roadmap
|
|
||||||
|
|
||||||
| Check | Description | Status | Check Name |
|
|
||||||
|------------------------------------|--------------------------------------------------|---------------------|--------------------|
|
|
||||||
| **Node readiness (API)** | Queries `Node.status.conditions[Ready]` via Kubernetes API | **Implemented** | `node_ready` |
|
|
||||||
| **OKD Router health** | Probes OpenShift router healthz on port 1936 | **Implemented** | `okd_router_1936` |
|
|
||||||
| Filesystem readonly | Detects read-only mounts via `/proc/mounts` | To be implemented | `filesystem_ro` |
|
|
||||||
| Kubelet running | Local probe to kubelet `/healthz` (port 10248) | To be implemented | `kubelet` |
|
|
||||||
| CRI-O / container runtime health | Socket check + runtime status | To be implemented | `container_runtime`|
|
|
||||||
| Disk / inode pressure | Threshold checks on key filesystems | To be implemented | `disk_pressure` |
|
|
||||||
| Network reachability | DNS resolution + gateway connectivity | To be implemented | `network` |
|
|
||||||
| Custom NodeConditions | Reacts to extra conditions (NPD, etc.) | To be implemented | `custom_conditions`|
|
|
||||||
|
|
||||||
All checks are combined with logical **AND** — any failure results in 503.
|
|
||||||
|
|
||||||
## How it works
|
|
||||||
|
|
||||||
### Node Name Discovery
|
|
||||||
The service automatically discovers its own node name using the **Kubernetes Downward API**:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
env:
|
|
||||||
- name: NODE_NAME
|
|
||||||
valueFrom:
|
|
||||||
fieldRef:
|
|
||||||
fieldPath: spec.nodeName
|
|
||||||
```
|
|
||||||
|
|
||||||
### Kubernetes API Authentication
|
|
||||||
|
|
||||||
- Uses standard **in-cluster configuration** (no external credentials needed).
|
|
||||||
- The ServiceAccount token and CA certificate are automatically mounted by Kubernetes at `/var/run/secrets/kubernetes.io/serviceaccount/`.
|
|
||||||
- The application (via `kube-rs` or your Harmony higher-level client) calls the equivalent of `Config::incluster_config()`.
|
|
||||||
- Requires only minimal RBAC: `get` permission on the `nodes` resource (see `deploy/rbac.yaml`).
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
|
|
||||||
### 1. Build and push
|
|
||||||
```bash
|
|
||||||
cargo build --release --bin harmony-node-readiness-endpoint
|
|
||||||
|
|
||||||
docker build -t your-registry/harmony-node-readiness-endpoint:v1.0.0 .
|
|
||||||
docker push your-registry/harmony-node-readiness-endpoint:v1.0.0
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Deploy
|
|
||||||
```bash
|
|
||||||
kubectl apply -f deploy/namespace.yaml
|
|
||||||
kubectl apply -f deploy/rbac.yaml
|
|
||||||
kubectl apply -f deploy/daemonset.yaml
|
|
||||||
```
|
|
||||||
|
|
||||||
(The DaemonSet uses `hostPort: 25001` by default so the endpoint is reachable directly on the node's IP.)
|
|
||||||
|
|
||||||
### 3. Configure your external load balancer
|
|
||||||
|
|
||||||
**Example for HAProxy / OPNsense:**
|
|
||||||
- Check type: **HTTP**
|
|
||||||
- URI: `/health`
|
|
||||||
- Port: `25001` (configurable via `LISTEN_PORT`)
|
|
||||||
- Interval: 5–10 s
|
|
||||||
- Rise: 2
|
|
||||||
- Fall: 3
|
|
||||||
- Expect: `2xx`
|
|
||||||
|
|
||||||
## Health Endpoint Examples
|
|
||||||
|
|
||||||
### Query Parameter
|
|
||||||
|
|
||||||
Use the `check` query parameter to specify which checks to run. Multiple checks can be comma-separated.
|
|
||||||
|
|
||||||
| Request | Behavior |
|
|
||||||
|--------------------------------------|---------------------------------------------|
|
|
||||||
| `GET /health` | Runs `node_ready` (default) |
|
|
||||||
| `GET /health?check=okd_router_1936` | Runs only OKD router check |
|
|
||||||
| `GET /health?check=node_ready,okd_router_1936` | Runs both checks |
|
|
||||||
|
|
||||||
**Note:** When the `check` parameter is provided, only the specified checks run. You must explicitly include `node_ready` if you want it along with other checks.
|
|
||||||
|
|
||||||
### Response Format
|
|
||||||
|
|
||||||
Each check result includes:
|
|
||||||
- `name`: The check identifier
|
|
||||||
- `passed`: Boolean indicating success or failure
|
|
||||||
- `reason`: (Optional) Failure reason if the check failed
|
|
||||||
- `duration_ms`: Time taken to execute the check in milliseconds
|
|
||||||
|
|
||||||
**Healthy node (default check)**
|
|
||||||
```http
|
|
||||||
HTTP/1.1 200 OK
|
|
||||||
Content-Type: application/json
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
**Healthy node (multiple checks)**
|
|
||||||
```http
|
|
||||||
GET /health?check=node_ready,okd_router_1936
|
|
||||||
|
|
||||||
HTTP/1.1 200 OK
|
|
||||||
Content-Type: application/json
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
**Unhealthy node (one check failed)**
|
|
||||||
```http
|
|
||||||
GET /health?check=node_ready,okd_router_1936
|
|
||||||
|
|
||||||
HTTP/1.1 503 Service Unavailable
|
|
||||||
Content-Type: application/json
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
**Unhealthy node (default check)**
|
|
||||||
```http
|
|
||||||
HTTP/1.1 503 Service Unavailable
|
|
||||||
Content-Type: application/json
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
## Configuration (via DaemonSet env vars)
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
env:
|
|
||||||
- name: NODE_NAME
|
|
||||||
valueFrom:
|
|
||||||
fieldRef:
|
|
||||||
fieldPath: spec.nodeName
|
|
||||||
- name: LISTEN_PORT
|
|
||||||
value: "25001"
|
|
||||||
```
|
|
||||||
|
|
||||||
Checks are selected via the `check` query parameter on the `/health` endpoint. See the usage examples above.
|
|
||||||
|
|
||||||
## Development
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Run locally (set NODE_NAME env var)
|
|
||||||
NODE_NAME=my-test-node cargo run
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
*Minimal, auditable, and built for production bare-metal Kubernetes environments.*
|
|
||||||
|
|
||||||
"name": "okd_router_1936",
|
|
||||||
"passed": false,
|
|
||||||
"reason": "Failed to connect to OKD router: connection refused",
|
|
||||||
"duration_ms": 5
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Unhealthy node (default check)**
|
|
||||||
```http
|
|
||||||
HTTP/1.1 503 Service Unavailable
|
|
||||||
Content-Type: application/json
|
|
||||||
|
|
||||||
{
|
|
||||||
"status": "not-ready",
|
|
||||||
"checks": [
|
|
||||||
{
|
|
||||||
"name": "node_ready",
|
|
||||||
"passed": false,
|
|
||||||
"reason": "KubeletNotReady",
|
|
||||||
"duration_ms": 35
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Configuration (via DaemonSet env vars)
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
env:
|
|
||||||
- name: NODE_NAME
|
|
||||||
valueFrom:
|
|
||||||
fieldRef:
|
|
||||||
fieldPath: spec.nodeName
|
|
||||||
- name: LISTEN_PORT
|
|
||||||
value: "25001"
|
|
||||||
```
|
|
||||||
|
|
||||||
Checks are selected via the `check` query parameter on the `/health` endpoint. See the usage examples above.
|
|
||||||
|
|
||||||
## Development
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Run locally (set NODE_NAME env var)
|
|
||||||
NODE_NAME=my-test-node cargo run
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
*Minimal, auditable, and built for production bare-metal Kubernetes environments.*
|
|
||||||
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
#!/bin/bash
# Builds the release binary, copies it next to the Dockerfile and builds the
# container image tagged with DOCKER_TAG (defaults to "dev").

# TODO
# This is meant to be run on a machine with harmony development tools installed (cargo, etc)

# Stop at the first failing step so a failed `cargo build` can never result in
# an image being built from a stale or missing binary.
set -euo pipefail

DOCKER_TAG="${DOCKER_TAG:-dev}"

cargo build --release

cp ../target/release/harmony-node-readiness-endpoint .

# Quoted so that an unusual DOCKER_TAG value cannot be word-split by the shell.
docker build . -t "hub.nationtech.io/harmony/harmony-node-readiness-endpoint:${DOCKER_TAG}"
|
|
||||||
|
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
# DaemonSet running the node readiness endpoint on every node.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-healthcheck
  namespace: harmony-node-healthcheck
spec:
  selector:
    matchLabels:
      app: node-healthcheck
  template:
    metadata:
      labels:
        app: node-healthcheck
    spec:
      serviceAccountName: node-healthcheck-sa
      hostNetwork: true
      # This ensures the pod runs even if the node is already "unschedulable"
      # so it can report the status correctly.
      tolerations:
        - operator: Exists
      containers:
        - name: checker
          image: hub.nationtech.io/harmony/harmony-node-readiness-endpoint:latest
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            # Made explicit so the declared ports below and the port the
            # binary binds to cannot drift apart (the binary defaults to 25001).
            - name: LISTEN_PORT
              value: "25001"
          ports:
            # Must match LISTEN_PORT: with hostNetwork the process binds
            # directly on the node, so declaring 8080 here would advertise
            # (and reserve) a port nothing is listening on.
            - containerPort: 25001
              hostPort: 25001
              name: health-port
          resources:
            requests:
              cpu: 10m
              memory: 50Mi
|
|
||||||
@@ -1,64 +0,0 @@
|
|||||||
apiVersion: v1
|
|
||||||
kind: Namespace
|
|
||||||
metadata:
|
|
||||||
name: harmony-node-healthcheck
|
|
||||||
labels:
|
|
||||||
name: harmony-node-healthcheck
|
|
||||||
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ServiceAccount
|
|
||||||
metadata:
|
|
||||||
name: node-healthcheck-sa
|
|
||||||
namespace: harmony-node-healthcheck
|
|
||||||
|
|
||||||
---
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: ClusterRole
|
|
||||||
metadata:
|
|
||||||
name: node-healthcheck-role
|
|
||||||
rules:
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources: ["nodes"]
|
|
||||||
verbs: ["get", "list"]
|
|
||||||
|
|
||||||
---
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: Role
|
|
||||||
metadata:
|
|
||||||
name: allow-hostnetwork-scc
|
|
||||||
namespace: harmony-node-healthcheck
|
|
||||||
rules:
|
|
||||||
- apiGroups: ["security.openshift.io"]
|
|
||||||
resources: ["securitycontextconstraints"]
|
|
||||||
resourceNames: ["hostnetwork"]
|
|
||||||
verbs: ["use"]
|
|
||||||
|
|
||||||
---
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: RoleBinding
|
|
||||||
metadata:
|
|
||||||
name: node-status-querier-scc-binding
|
|
||||||
namespace: harmony-node-healthcheck
|
|
||||||
subjects:
|
|
||||||
- kind: ServiceAccount
|
|
||||||
name: node-healthcheck-sa
|
|
||||||
namespace: harmony-node-healthcheck
|
|
||||||
roleRef:
|
|
||||||
kind: Role
|
|
||||||
name: allow-hostnetwork-scc
|
|
||||||
apiGroup: rbac.authorization.k8s.io
|
|
||||||
|
|
||||||
---
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: ClusterRoleBinding
|
|
||||||
metadata:
|
|
||||||
name: read-nodes-binding
|
|
||||||
subjects:
|
|
||||||
- kind: ServiceAccount
|
|
||||||
name: node-healthcheck-sa
|
|
||||||
namespace: harmony-node-healthcheck
|
|
||||||
roleRef:
|
|
||||||
kind: ClusterRole
|
|
||||||
name: node-healthcheck-role
|
|
||||||
apiGroup: rbac.authorization.k8s.io
|
|
||||||
Binary file not shown.
@@ -1,232 +0,0 @@
|
|||||||
use actix_web::{App, HttpResponse, HttpServer, Responder, get, web};
|
|
||||||
use k8s_openapi::api::core::v1::Node;
|
|
||||||
use kube::{Api, Client};
|
|
||||||
use log::{debug, error, info, warn};
|
|
||||||
use reqwest;
|
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
use std::env;
|
|
||||||
use std::time::Instant;
|
|
||||||
use tokio::task::JoinSet;
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
|
||||||
struct HealthStatus {
|
|
||||||
status: String,
|
|
||||||
checks: Vec<CheckResult>,
|
|
||||||
total_duration_ms: u128,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
|
||||||
struct CheckResult {
|
|
||||||
name: String,
|
|
||||||
passed: bool,
|
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
|
||||||
reason: Option<String>,
|
|
||||||
duration_ms: u128,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize)]
|
|
||||||
struct HealthError {
|
|
||||||
status: String,
|
|
||||||
error: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
|
||||||
struct HealthQuery {
|
|
||||||
#[serde(rename = "check")]
|
|
||||||
checks: Option<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Check if the node's Ready condition is true via Kubernetes API
|
|
||||||
async fn check_node_ready(client: Client, node_name: &str) -> Result<(), String> {
|
|
||||||
let nodes: Api<Node> = Api::all(client);
|
|
||||||
|
|
||||||
let node = nodes
|
|
||||||
.get(node_name)
|
|
||||||
.await
|
|
||||||
.map_err(|e| format!("Failed to get node '{}': {}", node_name, e))?;
|
|
||||||
|
|
||||||
let conditions = node.status.and_then(|s| s.conditions).unwrap_or_default();
|
|
||||||
|
|
||||||
for condition in conditions {
|
|
||||||
if condition.type_ == "Ready" {
|
|
||||||
let is_ready = condition.status == "True";
|
|
||||||
let reason = condition
|
|
||||||
.reason
|
|
||||||
.clone()
|
|
||||||
.unwrap_or_else(|| "Unknown".to_string());
|
|
||||||
|
|
||||||
if !is_ready {
|
|
||||||
return Err(reason);
|
|
||||||
}
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Err("Ready condition not found".to_string())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Check OKD router health endpoint on port 1936
|
|
||||||
async fn check_okd_router_1936() -> Result<(), String> {
|
|
||||||
debug!("Checking okd router 1936");
|
|
||||||
let client = reqwest::Client::builder()
|
|
||||||
.timeout(std::time::Duration::from_secs(5))
|
|
||||||
.build()
|
|
||||||
.map_err(|e| format!("Failed to build HTTP client: {}", e))?;
|
|
||||||
|
|
||||||
let response = client
|
|
||||||
.get("http://127.0.0.1:1936/healthz/ready")
|
|
||||||
.send()
|
|
||||||
.await
|
|
||||||
.map_err(|e| format!("Failed to connect to OKD router: {}", e))?;
|
|
||||||
|
|
||||||
debug!("okd router 1936 response status {}", response.status());
|
|
||||||
|
|
||||||
if response.status().is_success() {
|
|
||||||
Ok(())
|
|
||||||
} else {
|
|
||||||
Err(format!("OKD router returned status: {}", response.status()))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Parses the comma-separated `check` query parameter into check names.
///
/// Each name is trimmed and blank segments are dropped, so a trailing comma
/// or stray whitespace (`"node_ready,"`, `" , "`) does not produce an empty
/// check name that would later fail as an unknown check.
///
/// Returns `["node_ready"]` when the parameter is absent, empty, or contains
/// only blank segments; otherwise the names in the order they were given.
fn parse_checks(checks_param: Option<&str>) -> Vec<String> {
    let requested: Vec<String> = checks_param
        .unwrap_or_default()
        .split(',')
        .map(str::trim)
        .filter(|name| !name.is_empty())
        .map(String::from)
        .collect();

    if requested.is_empty() {
        vec!["node_ready".to_string()]
    } else {
        requested
    }
}
|
|
||||||
|
|
||||||
/// Run a single health check by name and return the result
|
|
||||||
async fn run_check(check_name: &str, client: Option<Client>, node_name: &str) -> CheckResult {
|
|
||||||
let start = Instant::now();
|
|
||||||
|
|
||||||
let result = match check_name {
|
|
||||||
"node_ready" => match client {
|
|
||||||
Some(c) => check_node_ready(c, node_name).await,
|
|
||||||
None => Err("Kubernetes client not available".to_string()),
|
|
||||||
},
|
|
||||||
"okd_router_1936" => check_okd_router_1936().await,
|
|
||||||
_ => Err(format!("Unknown check: {}", check_name)),
|
|
||||||
};
|
|
||||||
|
|
||||||
let duration_ms = start.elapsed().as_millis();
|
|
||||||
|
|
||||||
match result {
|
|
||||||
Ok(()) => CheckResult {
|
|
||||||
name: check_name.to_string(),
|
|
||||||
passed: true,
|
|
||||||
reason: None,
|
|
||||||
duration_ms,
|
|
||||||
},
|
|
||||||
Err(reason) => CheckResult {
|
|
||||||
name: check_name.to_string(),
|
|
||||||
passed: false,
|
|
||||||
reason: Some(reason),
|
|
||||||
duration_ms,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[get("/health")]
|
|
||||||
async fn health(query: web::Query<HealthQuery>) -> impl Responder {
|
|
||||||
let node_name = match env::var("NODE_NAME") {
|
|
||||||
Ok(name) => name,
|
|
||||||
Err(_) => {
|
|
||||||
error!("NODE_NAME environment variable not set");
|
|
||||||
return HttpResponse::InternalServerError().json(HealthError {
|
|
||||||
status: "error".to_string(),
|
|
||||||
error: "NODE_NAME environment variable not set".to_string(),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Parse requested checks from query parameter
|
|
||||||
let requested_checks = parse_checks(query.checks.as_deref());
|
|
||||||
|
|
||||||
// Check if node_ready check requires Kubernetes client
|
|
||||||
let needs_k8s_client = requested_checks.contains(&"node_ready".to_string());
|
|
||||||
|
|
||||||
// Initialize Kubernetes client only if needed
|
|
||||||
let k8s_client = if needs_k8s_client {
|
|
||||||
match Client::try_default().await {
|
|
||||||
Ok(c) => Some(c),
|
|
||||||
Err(e) => {
|
|
||||||
error!("Failed to create Kubernetes client: {}", e);
|
|
||||||
return HttpResponse::InternalServerError().json(HealthError {
|
|
||||||
status: "error".to_string(),
|
|
||||||
error: format!("Failed to create Kubernetes client: {}", e),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
};
|
|
||||||
|
|
||||||
// Run all requested checks in parallel
|
|
||||||
let start = Instant::now();
|
|
||||||
let mut join_set = JoinSet::new();
|
|
||||||
debug!("Running checks {requested_checks:?}");
|
|
||||||
|
|
||||||
for check_name in requested_checks {
|
|
||||||
let client = k8s_client.clone();
|
|
||||||
let node_name = node_name.clone();
|
|
||||||
join_set.spawn(async move { run_check(&check_name, client, &node_name).await });
|
|
||||||
}
|
|
||||||
let mut check_results = Vec::new();
|
|
||||||
while let Some(result) = join_set.join_next().await {
|
|
||||||
match result {
|
|
||||||
Ok(check) => check_results.push(check),
|
|
||||||
Err(e) => error!("Check task failed: {}", e),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
let total_duration_ms = start.elapsed().as_millis();
|
|
||||||
|
|
||||||
// Determine overall status
|
|
||||||
let all_passed = check_results.iter().all(|c| c.passed);
|
|
||||||
|
|
||||||
if all_passed {
|
|
||||||
info!(
|
|
||||||
"All health checks passed for node '{}' in {}ms",
|
|
||||||
node_name, total_duration_ms
|
|
||||||
);
|
|
||||||
HttpResponse::Ok().json(HealthStatus {
|
|
||||||
status: "ready".to_string(),
|
|
||||||
checks: check_results,
|
|
||||||
total_duration_ms,
|
|
||||||
})
|
|
||||||
} else {
|
|
||||||
let failed_checks: Vec<&str> = check_results
|
|
||||||
.iter()
|
|
||||||
.filter(|c| !c.passed)
|
|
||||||
.map(|c| c.name.as_str())
|
|
||||||
.collect();
|
|
||||||
warn!(
|
|
||||||
"Health checks failed for node '{}' in {}ms: {:?}",
|
|
||||||
node_name, total_duration_ms, failed_checks
|
|
||||||
);
|
|
||||||
HttpResponse::ServiceUnavailable().json(HealthStatus {
|
|
||||||
status: "not-ready".to_string(),
|
|
||||||
checks: check_results,
|
|
||||||
total_duration_ms,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[actix_web::main]
|
|
||||||
async fn main() -> std::io::Result<()> {
|
|
||||||
env_logger::init();
|
|
||||||
|
|
||||||
let port = env::var("LISTEN_PORT").unwrap_or_else(|_| "25001".to_string());
|
|
||||||
let port = port
|
|
||||||
.parse::<u16>()
|
|
||||||
.unwrap_or_else(|_| panic!("Invalid port number: {}", port));
|
|
||||||
let bind_addr = format!("0.0.0.0:{}", port);
|
|
||||||
|
|
||||||
info!("Starting harmony-node-readiness-endpoint on {}", bind_addr);
|
|
||||||
|
|
||||||
HttpServer::new(|| App::new().service(health))
|
|
||||||
.bind(&bind_addr)?
|
|
||||||
.run()
|
|
||||||
.await
|
|
||||||
}
|
|
||||||
Reference in New Issue
Block a user