From 3867d436ad42366a3f87e286ac4b07108c6f30f4 Mon Sep 17 00:00:00 2001 From: Willem Date: Mon, 25 Aug 2025 11:17:14 -0400 Subject: [PATCH] fix: remove ceph osd deletes and purges osd from ceph osd tree\ k8s returns None rather than zero when checking deployment for replicas exec_app requires commands 's' and '-c' to run correctly --- Cargo.lock | 10 +++ harmony/src/domain/interpret/mod.rs | 2 + harmony/src/domain/topology/k8s.rs | 11 +++- .../ceph/ceph_osd_replacement_score.rs | 62 ++++++++++--------- 4 files changed, 53 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b1295be..cb921e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1700,6 +1700,16 @@ dependencies = [ "url", ] +[[package]] +name = "example_remove_rook_osd" +version = "0.1.0" +dependencies = [ + "harmony", + "harmony_cli", + "harmony_tui", + "tokio", +] + [[package]] name = "eyre" version = "0.6.12" diff --git a/harmony/src/domain/interpret/mod.rs b/harmony/src/domain/interpret/mod.rs index cfbf2b5..c1dcead 100644 --- a/harmony/src/domain/interpret/mod.rs +++ b/harmony/src/domain/interpret/mod.rs @@ -32,6 +32,7 @@ pub enum InterpretName { Lamp, ApplicationMonitoring, K8sPrometheusCrdAlerting, + CephRemoveOsd, } impl std::fmt::Display for InterpretName { @@ -58,6 +59,7 @@ impl std::fmt::Display for InterpretName { InterpretName::Lamp => f.write_str("LAMP"), InterpretName::ApplicationMonitoring => f.write_str("ApplicationMonitoring"), InterpretName::K8sPrometheusCrdAlerting => f.write_str("K8sPrometheusCrdAlerting"), + InterpretName::CephRemoveOsd => f.write_str("CephRemoveOsd"), } } } diff --git a/harmony/src/domain/topology/k8s.rs b/harmony/src/domain/topology/k8s.rs index 1887c33..01f6a6e 100644 --- a/harmony/src/domain/topology/k8s.rs +++ b/harmony/src/domain/topology/k8s.rs @@ -15,7 +15,7 @@ use kube::{ api::{ApiResource, GroupVersionKind}, runtime::wait::await_condition, }; -use log::{debug, error, trace}; +use log::{debug, error, info, trace}; use serde::{Serialize, de::DeserializeOwned}; use serde_json::json; use similar::TextDiff; @@ -59,10 +59,17 @@ impl K8sClient { namespace: Option<&str>, ) -> Result, Error> { let deps: Api = if let Some(ns) = namespace { + debug!("getting namespaced deployment"); Api::namespaced(self.client.clone(), ns) } else { + debug!("getting default namespace deployment"); Api::default_namespaced(self.client.clone()) }; + debug!( + "getting deployment {} in ns {}", + name, + namespace.unwrap() + ); Ok(deps.get_opt(name).await?) } @@ -93,7 +100,7 @@ impl K8sClient { } }); let pp = PatchParams::default(); - let scale = Patch::Apply(&patch); + let scale = Patch::Merge(&patch); deployments.patch_scale(name, &pp, &scale).await?; Ok(()) } diff --git a/harmony/src/modules/storage/ceph/ceph_osd_replacement_score.rs b/harmony/src/modules/storage/ceph/ceph_osd_replacement_score.rs index a4b0cb0..8b8acc7 100644 --- a/harmony/src/modules/storage/ceph/ceph_osd_replacement_score.rs +++ b/harmony/src/modules/storage/ceph/ceph_osd_replacement_score.rs @@ -1,11 +1,10 @@ use std::{ - process::Command, sync::Arc, time::{Duration, Instant}, }; use async_trait::async_trait; -use log::{info, warn}; +use log::{debug, warn}; use serde::{Deserialize, Serialize}; use tokio::time::sleep; @@ -19,8 +18,8 @@ use crate::{ #[derive(Debug, Clone, Serialize)] pub struct CephRemoveOsd { - osd_deployment_name: String, - rook_ceph_namespace: String, + pub osd_deployment_name: String, + pub rook_ceph_namespace: String, } impl Score for CephRemoveOsd { @@ -54,18 +53,17 @@ impl Interpret for CephRemoveOsdInterpret { self.verify_deployment_scaled(client.clone()).await?; self.delete_deployment(client.clone()).await?; self.verify_deployment_deleted(client.clone()).await?; - let osd_id_full = self.get_ceph_osd_id().unwrap(); - self.purge_ceph_osd(client.clone(), &osd_id_full).await?; - self.verify_ceph_osd_removal(client.clone(), &osd_id_full) - .await?; + self.purge_ceph_osd(client.clone()).await?; + self.verify_ceph_osd_removal(client.clone()).await?; + let osd_id_full = self.get_ceph_osd_id().unwrap(); Ok(Outcome::success(format!( "Successfully removed OSD {} from rook-ceph cluster by deleting deployment {}", osd_id_full, self.score.osd_deployment_name ))) } fn get_name(&self) -> InterpretName { - todo!() + InterpretName::CephRemoveOsd } fn get_version(&self) -> Version { @@ -82,7 +80,7 @@ impl Interpret for CephRemoveOsdInterpret { } impl CephRemoveOsdInterpret { - pub fn get_ceph_osd_id(&self) -> Result { + pub fn get_ceph_osd_id_numeric(&self) -> Result { let osd_id_numeric = self .score .osd_deployment_name @@ -94,9 +92,14 @@ impl CephRemoveOsdInterpret { self.score.osd_deployment_name )) })?; + Ok(osd_id_numeric.to_string()) + } + + pub fn get_ceph_osd_id(&self) -> Result { + let osd_id_numeric = self.get_ceph_osd_id_numeric().unwrap(); let osd_id_full = format!("osd.{}", osd_id_numeric); - info!( + debug!( "Targeting Ceph OSD: {} (parsed from deployment {})", osd_id_full, self.score.osd_deployment_name ); @@ -108,6 +111,7 @@ impl CephRemoveOsdInterpret { &self, client: Arc, ) -> Result { + debug!("verifying toolbox exists"); let toolbox_dep = "rook-ceph-tools".to_string(); match client @@ -149,7 +153,7 @@ impl CephRemoveOsdInterpret { &self, client: Arc, ) -> Result { - info!( + debug!( "Scaling down OSD deployment: {}", self.score.osd_deployment_name ); @@ -172,7 +176,7 @@ impl CephRemoveOsdInterpret { ) -> Result { let (timeout, interval, start) = self.build_timer(); - info!("Waiting for OSD deployment to scale down to 0 replicas"); + debug!("Waiting for OSD deployment to scale down to 0 replicas"); loop { let dep = client .get_deployment( @@ -180,11 +184,9 @@ impl CephRemoveOsdInterpret { Some(&self.score.rook_ceph_namespace), ) .await?; - if let Some(deployment) = dep { if let Some(status) = deployment.status { - if status.replicas.unwrap_or(1) == 0 && status.ready_replicas.unwrap_or(1) == 0 - { + if status.replicas == None && status.ready_replicas == None { return Ok(Outcome::success( "Deployment successfully scaled down.".to_string(), )); @@ -212,7 +214,7 @@ impl CephRemoveOsdInterpret { &self, client: Arc, ) -> Result { - info!( + debug!( "Deleting OSD deployment: {}", self.score.osd_deployment_name ); @@ -234,7 +236,7 @@ impl CephRemoveOsdInterpret { ) -> Result { let (timeout, interval, start) = self.build_timer(); - info!("Waiting for OSD deployment to scale down to 0 replicas"); + debug!("Verifying OSD deployment deleted"); loop { let dep = client .get_deployment( @@ -244,7 +246,7 @@ impl CephRemoveOsdInterpret { .await?; if dep.is_none() { - info!( + debug!( "Deployment {} successfully deleted.", self.score.osd_deployment_name ); @@ -276,12 +278,10 @@ impl CephRemoveOsdInterpret { Ok(tree) } - pub async fn purge_ceph_osd( - &self, - client: Arc, - osd_id_full: &str, - ) -> Result { - info!( + pub async fn purge_ceph_osd(&self, client: Arc) -> Result { + let osd_id_numeric = self.get_ceph_osd_id_numeric().unwrap(); + let osd_id_full = self.get_ceph_osd_id().unwrap(); + debug!( "Purging OSD {} from Ceph cluster and removing its auth key", osd_id_full ); @@ -291,8 +291,10 @@ impl CephRemoveOsdInterpret { "app".to_string(), Some(&self.score.rook_ceph_namespace), vec![ - format!("ceph osd purge {osd_id_full} --yes-i-really-mean-it").as_str(), - format!("ceph auth del osd.{osd_id_full}").as_str(), + "sh", + "-c", + format!("ceph osd purge {osd_id_numeric} --yes-i-really-mean-it").as_str(), + format!("ceph auth del {osd_id_full}").as_str(), ], ) .await?; @@ -305,10 +307,10 @@ impl CephRemoveOsdInterpret { pub async fn verify_ceph_osd_removal( &self, client: Arc, - osd_id_full: &str, ) -> Result { let (timeout, interval, start) = self.build_timer(); - info!( + let osd_id_full = self.get_ceph_osd_id().unwrap(); + debug!( "Verifying OSD {} has been removed from the Ceph tree...", osd_id_full ); @@ -318,7 +320,7 @@ impl CephRemoveOsdInterpret { "rook-ceph-tools".to_string(), "app".to_string(), Some(&self.score.rook_ceph_namespace), - vec!["ceph osd tree -f json"], + vec!["sh", "-c", "ceph osd tree -f json"], ) .await?; let tree =