Compare commits
8 Commits
feat/ceph-
...
fix/ceph_r
| Author | SHA1 | Date | |
|---|---|---|---|
| fb0fc50eac | |||
| 63ae213c35 | |||
| 3567d0f420 | |||
| 3ca31179d0 | |||
| a9fe4ab267 | |||
| 65cc9befeb | |||
| d456a1f9ee | |||
| 3867d436ad |
9
Cargo.lock
generated
9
Cargo.lock
generated
@@ -4255,6 +4255,15 @@ version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
||||
|
||||
[[package]]
|
||||
name = "remove_rook_osd"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"harmony",
|
||||
"harmony_cli",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "reqwest"
|
||||
version = "0.11.27"
|
||||
|
||||
11
examples/remove_rook_osd/Cargo.toml
Normal file
11
examples/remove_rook_osd/Cargo.toml
Normal file
@@ -0,0 +1,11 @@
|
||||
[package]
|
||||
name = "example-remove-rook-osd"
|
||||
edition = "2024"
|
||||
version.workspace = true
|
||||
readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
harmony = { version = "0.1.0", path = "../../harmony" }
|
||||
harmony_cli = { version = "0.1.0", path = "../../harmony_cli" }
|
||||
tokio.workspace = true
|
||||
18
examples/remove_rook_osd/src/main.rs
Normal file
18
examples/remove_rook_osd/src/main.rs
Normal file
@@ -0,0 +1,18 @@
|
||||
use harmony::{
|
||||
inventory::Inventory, modules::storage::ceph::ceph_remove_osd_score::CephRemoveOsd,
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let ceph_score = CephRemoveOsd {
|
||||
osd_deployment_name: "rook-ceph-osd-2".to_string(),
|
||||
rook_ceph_namespace: "rook-ceph".to_string(),
|
||||
};
|
||||
|
||||
let topology = K8sAnywhereTopology::from_env();
|
||||
let inventory = Inventory::autoload();
|
||||
harmony_cli::run(inventory, topology, vec![Box::new(ceph_score)], None)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
11
examples/validate_ceph_cluster_health/Cargo.toml
Normal file
11
examples/validate_ceph_cluster_health/Cargo.toml
Normal file
@@ -0,0 +1,11 @@
|
||||
[package]
|
||||
name = "example_validate_ceph_cluster_health"
|
||||
edition = "2024"
|
||||
version.workspace = true
|
||||
readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
harmony = { version = "0.1.0", path = "../../harmony" }
|
||||
harmony_cli = { version = "0.1.0", path = "../../harmony_cli" }
|
||||
tokio.workspace = true
|
||||
18
examples/validate_ceph_cluster_health/src/main.rs
Normal file
18
examples/validate_ceph_cluster_health/src/main.rs
Normal file
@@ -0,0 +1,18 @@
|
||||
use harmony::{
|
||||
inventory::Inventory,
|
||||
modules::storage::ceph::ceph_validate_health_score::CephVerifyClusterHealth,
|
||||
topology::K8sAnywhereTopology,
|
||||
};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let ceph_health_score = CephVerifyClusterHealth {
|
||||
rook_ceph_namespace: "rook-ceph".to_string(),
|
||||
};
|
||||
|
||||
let topology = K8sAnywhereTopology::from_env();
|
||||
let inventory = Inventory::autoload();
|
||||
harmony_cli::run(inventory, topology, vec![Box::new(ceph_health_score)], None)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
@@ -32,6 +32,8 @@ pub enum InterpretName {
|
||||
Lamp,
|
||||
ApplicationMonitoring,
|
||||
K8sPrometheusCrdAlerting,
|
||||
CephRemoveOsd,
|
||||
CephClusterHealth,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for InterpretName {
|
||||
@@ -58,6 +60,8 @@ impl std::fmt::Display for InterpretName {
|
||||
InterpretName::Lamp => f.write_str("LAMP"),
|
||||
InterpretName::ApplicationMonitoring => f.write_str("ApplicationMonitoring"),
|
||||
InterpretName::K8sPrometheusCrdAlerting => f.write_str("K8sPrometheusCrdAlerting"),
|
||||
InterpretName::CephRemoveOsd => f.write_str("CephRemoveOsd"),
|
||||
InterpretName::CephClusterHealth => f.write_str("CephClusterHealth"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@ use kube::{
|
||||
api::{ApiResource, GroupVersionKind},
|
||||
runtime::wait::await_condition,
|
||||
};
|
||||
use log::{debug, error, trace};
|
||||
use log::{debug, error, info, trace};
|
||||
use serde::{Serialize, de::DeserializeOwned};
|
||||
use serde_json::json;
|
||||
use similar::TextDiff;
|
||||
@@ -59,10 +59,13 @@ impl K8sClient {
|
||||
namespace: Option<&str>,
|
||||
) -> Result<Option<Deployment>, Error> {
|
||||
let deps: Api<Deployment> = if let Some(ns) = namespace {
|
||||
debug!("getting namespaced deployment");
|
||||
Api::namespaced(self.client.clone(), ns)
|
||||
} else {
|
||||
debug!("getting default namespace deployment");
|
||||
Api::default_namespaced(self.client.clone())
|
||||
};
|
||||
debug!("getting deployment {} in ns {}", name, namespace.unwrap());
|
||||
Ok(deps.get_opt(name).await?)
|
||||
}
|
||||
|
||||
@@ -93,7 +96,7 @@ impl K8sClient {
|
||||
}
|
||||
});
|
||||
let pp = PatchParams::default();
|
||||
let scale = Patch::Apply(&patch);
|
||||
let scale = Patch::Merge(&patch);
|
||||
deployments.patch_scale(name, &pp, &scale).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
use std::{
|
||||
process::Command,
|
||||
sync::Arc,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use log::{info, warn};
|
||||
use log::{debug, warn};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::time::sleep;
|
||||
|
||||
@@ -19,8 +18,8 @@ use crate::{
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct CephRemoveOsd {
|
||||
osd_deployment_name: String,
|
||||
rook_ceph_namespace: String,
|
||||
pub osd_deployment_name: String,
|
||||
pub rook_ceph_namespace: String,
|
||||
}
|
||||
|
||||
impl<T: Topology + K8sclient> Score<T> for CephRemoveOsd {
|
||||
@@ -54,18 +53,17 @@ impl<T: Topology + K8sclient> Interpret<T> for CephRemoveOsdInterpret {
|
||||
self.verify_deployment_scaled(client.clone()).await?;
|
||||
self.delete_deployment(client.clone()).await?;
|
||||
self.verify_deployment_deleted(client.clone()).await?;
|
||||
let osd_id_full = self.get_ceph_osd_id().unwrap();
|
||||
self.purge_ceph_osd(client.clone(), &osd_id_full).await?;
|
||||
self.verify_ceph_osd_removal(client.clone(), &osd_id_full)
|
||||
.await?;
|
||||
self.purge_ceph_osd(client.clone()).await?;
|
||||
self.verify_ceph_osd_removal(client.clone()).await?;
|
||||
|
||||
let osd_id_full = self.get_ceph_osd_id().unwrap();
|
||||
Ok(Outcome::success(format!(
|
||||
"Successfully removed OSD {} from rook-ceph cluster by deleting deployment {}",
|
||||
osd_id_full, self.score.osd_deployment_name
|
||||
)))
|
||||
}
|
||||
fn get_name(&self) -> InterpretName {
|
||||
todo!()
|
||||
InterpretName::CephRemoveOsd
|
||||
}
|
||||
|
||||
fn get_version(&self) -> Version {
|
||||
@@ -82,7 +80,7 @@ impl<T: Topology + K8sclient> Interpret<T> for CephRemoveOsdInterpret {
|
||||
}
|
||||
|
||||
impl CephRemoveOsdInterpret {
|
||||
pub fn get_ceph_osd_id(&self) -> Result<String, InterpretError> {
|
||||
pub fn get_ceph_osd_id_numeric(&self) -> Result<String, InterpretError> {
|
||||
let osd_id_numeric = self
|
||||
.score
|
||||
.osd_deployment_name
|
||||
@@ -94,9 +92,14 @@ impl CephRemoveOsdInterpret {
|
||||
self.score.osd_deployment_name
|
||||
))
|
||||
})?;
|
||||
Ok(osd_id_numeric.to_string())
|
||||
}
|
||||
|
||||
pub fn get_ceph_osd_id(&self) -> Result<String, InterpretError> {
|
||||
let osd_id_numeric = self.get_ceph_osd_id_numeric().unwrap();
|
||||
let osd_id_full = format!("osd.{}", osd_id_numeric);
|
||||
|
||||
info!(
|
||||
debug!(
|
||||
"Targeting Ceph OSD: {} (parsed from deployment {})",
|
||||
osd_id_full, self.score.osd_deployment_name
|
||||
);
|
||||
@@ -108,6 +111,7 @@ impl CephRemoveOsdInterpret {
|
||||
&self,
|
||||
client: Arc<K8sClient>,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
debug!("verifying toolbox exists");
|
||||
let toolbox_dep = "rook-ceph-tools".to_string();
|
||||
|
||||
match client
|
||||
@@ -149,7 +153,7 @@ impl CephRemoveOsdInterpret {
|
||||
&self,
|
||||
client: Arc<K8sClient>,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
info!(
|
||||
debug!(
|
||||
"Scaling down OSD deployment: {}",
|
||||
self.score.osd_deployment_name
|
||||
);
|
||||
@@ -172,7 +176,7 @@ impl CephRemoveOsdInterpret {
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
let (timeout, interval, start) = self.build_timer();
|
||||
|
||||
info!("Waiting for OSD deployment to scale down to 0 replicas");
|
||||
debug!("Waiting for OSD deployment to scale down to 0 replicas");
|
||||
loop {
|
||||
let dep = client
|
||||
.get_deployment(
|
||||
@@ -180,11 +184,9 @@ impl CephRemoveOsdInterpret {
|
||||
Some(&self.score.rook_ceph_namespace),
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let Some(deployment) = dep {
|
||||
if let Some(status) = deployment.status {
|
||||
if status.replicas.unwrap_or(1) == 0 && status.ready_replicas.unwrap_or(1) == 0
|
||||
{
|
||||
if status.replicas == None && status.ready_replicas == None {
|
||||
return Ok(Outcome::success(
|
||||
"Deployment successfully scaled down.".to_string(),
|
||||
));
|
||||
@@ -212,7 +214,7 @@ impl CephRemoveOsdInterpret {
|
||||
&self,
|
||||
client: Arc<K8sClient>,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
info!(
|
||||
debug!(
|
||||
"Deleting OSD deployment: {}",
|
||||
self.score.osd_deployment_name
|
||||
);
|
||||
@@ -234,7 +236,7 @@ impl CephRemoveOsdInterpret {
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
let (timeout, interval, start) = self.build_timer();
|
||||
|
||||
info!("Waiting for OSD deployment to scale down to 0 replicas");
|
||||
debug!("Verifying OSD deployment deleted");
|
||||
loop {
|
||||
let dep = client
|
||||
.get_deployment(
|
||||
@@ -244,7 +246,7 @@ impl CephRemoveOsdInterpret {
|
||||
.await?;
|
||||
|
||||
if dep.is_none() {
|
||||
info!(
|
||||
debug!(
|
||||
"Deployment {} successfully deleted.",
|
||||
self.score.osd_deployment_name
|
||||
);
|
||||
@@ -276,12 +278,10 @@ impl CephRemoveOsdInterpret {
|
||||
Ok(tree)
|
||||
}
|
||||
|
||||
pub async fn purge_ceph_osd(
|
||||
&self,
|
||||
client: Arc<K8sClient>,
|
||||
osd_id_full: &str,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
info!(
|
||||
pub async fn purge_ceph_osd(&self, client: Arc<K8sClient>) -> Result<Outcome, InterpretError> {
|
||||
let osd_id_numeric = self.get_ceph_osd_id_numeric().unwrap();
|
||||
let osd_id_full = self.get_ceph_osd_id().unwrap();
|
||||
debug!(
|
||||
"Purging OSD {} from Ceph cluster and removing its auth key",
|
||||
osd_id_full
|
||||
);
|
||||
@@ -291,8 +291,9 @@ impl CephRemoveOsdInterpret {
|
||||
"app".to_string(),
|
||||
Some(&self.score.rook_ceph_namespace),
|
||||
vec![
|
||||
format!("ceph osd purge {osd_id_full} --yes-i-really-mean-it").as_str(),
|
||||
format!("ceph auth del osd.{osd_id_full}").as_str(),
|
||||
"sh",
|
||||
"-c",
|
||||
format!("ceph osd purge {osd_id_numeric} --yes-i-really-mean-it && ceph auth del {osd_id_full}").as_str(),
|
||||
],
|
||||
)
|
||||
.await?;
|
||||
@@ -305,10 +306,10 @@ impl CephRemoveOsdInterpret {
|
||||
pub async fn verify_ceph_osd_removal(
|
||||
&self,
|
||||
client: Arc<K8sClient>,
|
||||
osd_id_full: &str,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
let (timeout, interval, start) = self.build_timer();
|
||||
info!(
|
||||
let osd_id_full = self.get_ceph_osd_id().unwrap();
|
||||
debug!(
|
||||
"Verifying OSD {} has been removed from the Ceph tree...",
|
||||
osd_id_full
|
||||
);
|
||||
@@ -318,7 +319,7 @@ impl CephRemoveOsdInterpret {
|
||||
"rook-ceph-tools".to_string(),
|
||||
"app".to_string(),
|
||||
Some(&self.score.rook_ceph_namespace),
|
||||
vec!["ceph osd tree -f json"],
|
||||
vec!["sh", "-c", "ceph osd tree -f json"],
|
||||
)
|
||||
.await?;
|
||||
let tree =
|
||||
136
harmony/src/modules/storage/ceph/ceph_validate_health_score.rs
Normal file
136
harmony/src/modules/storage/ceph/ceph_validate_health_score.rs
Normal file
@@ -0,0 +1,136 @@
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use log::debug;
|
||||
use serde::Serialize;
|
||||
use tokio::time::Instant;
|
||||
|
||||
use crate::{
|
||||
data::{Id, Version},
|
||||
interpret::{Interpret, InterpretError, InterpretName, InterpretStatus, Outcome},
|
||||
inventory::Inventory,
|
||||
score::Score,
|
||||
topology::{K8sclient, Topology, k8s::K8sClient},
|
||||
};
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
pub struct CephVerifyClusterHealth {
|
||||
pub rook_ceph_namespace: String,
|
||||
}
|
||||
|
||||
impl<T: Topology + K8sclient> Score<T> for CephVerifyClusterHealth {
|
||||
fn name(&self) -> String {
|
||||
format!("CephValidateClusterHealth")
|
||||
}
|
||||
|
||||
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
|
||||
Box::new(CephVerifyClusterHealthInterpret {
|
||||
score: self.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct CephVerifyClusterHealthInterpret {
|
||||
score: CephVerifyClusterHealth,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl<T: Topology + K8sclient> Interpret<T> for CephVerifyClusterHealthInterpret {
|
||||
async fn execute(
|
||||
&self,
|
||||
_inventory: &Inventory,
|
||||
topology: &T,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
let client = topology.k8s_client().await.unwrap();
|
||||
self.verify_ceph_toolbox_exists(client.clone()).await?;
|
||||
self.validate_ceph_cluster_health(client.clone()).await?;
|
||||
Ok(Outcome::success("Ceph cluster healthy".to_string()))
|
||||
}
|
||||
|
||||
fn get_name(&self) -> InterpretName {
|
||||
InterpretName::CephClusterHealth
|
||||
}
|
||||
|
||||
fn get_version(&self) -> Version {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_status(&self) -> InterpretStatus {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_children(&self) -> Vec<Id> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
impl CephVerifyClusterHealthInterpret {
|
||||
pub async fn verify_ceph_toolbox_exists(
|
||||
&self,
|
||||
client: Arc<K8sClient>,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
let toolbox_dep = "rook-ceph-tools".to_string();
|
||||
|
||||
match client
|
||||
.get_deployment(&toolbox_dep, Some(&self.score.rook_ceph_namespace))
|
||||
.await
|
||||
{
|
||||
Ok(Some(deployment)) => {
|
||||
if let Some(status) = deployment.status {
|
||||
let ready_count = status.ready_replicas.unwrap_or(0);
|
||||
if ready_count >= 1 {
|
||||
return Ok(Outcome::success(format!(
|
||||
"'{}' is ready with {} replica(s).",
|
||||
&toolbox_dep, ready_count
|
||||
)));
|
||||
} else {
|
||||
return Err(InterpretError::new(
|
||||
"ceph-tool-box not ready in cluster".to_string(),
|
||||
));
|
||||
}
|
||||
} else {
|
||||
Err(InterpretError::new(format!(
|
||||
"failed to get deployment status {}",
|
||||
&toolbox_dep
|
||||
)))
|
||||
}
|
||||
}
|
||||
Ok(None) => Err(InterpretError::new(format!(
|
||||
"Deployment '{}' not found in namespace '{}'.",
|
||||
&toolbox_dep, self.score.rook_ceph_namespace
|
||||
))),
|
||||
Err(e) => Err(InterpretError::new(format!(
|
||||
"Failed to query for deployment '{}': {}",
|
||||
&toolbox_dep, e
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn validate_ceph_cluster_health(
|
||||
&self,
|
||||
client: Arc<K8sClient>,
|
||||
) -> Result<Outcome, InterpretError> {
|
||||
debug!("Verifying ceph cluster is in healthy state");
|
||||
|
||||
let health = client
|
||||
.exec_app_capture_output(
|
||||
"rook-ceph-tools".to_string(),
|
||||
"app".to_string(),
|
||||
Some(&self.score.rook_ceph_namespace),
|
||||
vec!["sh", "-c", "ceph health"],
|
||||
)
|
||||
.await?;
|
||||
|
||||
if health.contains("HEALTH_OK") {
|
||||
return Ok(Outcome::success(
|
||||
"Ceph Cluster in healthy state".to_string(),
|
||||
));
|
||||
} else {
|
||||
Err(InterpretError::new(format!(
|
||||
"Ceph cluster unhealthy {}",
|
||||
health
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1 +1,2 @@
|
||||
pub mod ceph_osd_replacement_score;
|
||||
pub mod ceph_remove_osd_score;
|
||||
pub mod ceph_validate_health_score;
|
||||
|
||||
Reference in New Issue
Block a user