Compare commits

...

15 Commits

Author SHA1 Message Date
5c628b37b7 wip: added alert receiver and alert rules, which are built and added to the YAML before deploying Prometheus; added a few dashboards to Grafana. Trying to fix the prometheus-server ClusterRole/Role/ServiceAccount so that it can discover targets and Kubernetes objects in a namespaced release where it does not have access to a ClusterRole 2025-07-09 15:09:38 -04:00
31661aaaf1 fix: prometheus deploys as namespaced resource without prometheus-server clusterrole and clusterrolebinding 2025-07-07 14:33:09 -04:00
2c208df143 fix: deploys by default in the application name namespace 2025-07-07 13:24:21 -04:00
1a6d72dc17 fix: uncommented example
2025-07-04 16:30:13 -04:00
df9e21807e fix: git conflict
2025-07-04 16:22:39 -04:00
b1bf4fd4d5 fix: cargo fmt
2025-07-04 16:14:47 -04:00
f702ecd8c9 fix: deploys a lighter weight prometheus and grafana which is limited to their respective namespaces 2025-07-04 16:13:41 -04:00
a19b52e690 fix: properly append YAML in correct places in argoapplication (#80)
Co-authored-by: tahahawa <tahahawa@gmail.com>
Reviewed-on: #80
2025-07-04 15:32:02 +00:00
b73f2e76d0 Merge pull request 'refact: Make RustWebappScore generic, it is now Application score and takes an application and list of features to attach to the application' (#81) from refact/application into master
Reviewed-on: #81
Reviewed-by: wjro <wrolleman@nationtech.io>
2025-07-04 14:31:38 +00:00
b4534c6ee0 refact: Make RustWebappScore generic, it is now Application score and takes an application and list of features to attach to the application
2025-07-04 10:27:16 -04:00
6149249a6c feat: create Argo interpret and kube client apply_yaml to install Argo Applications. Very messy implementation though, must be refactored/improved
2025-07-04 09:49:43 -04:00
d9935e20cb Merge pull request 'feat: harmony now defaults to using local k3d cluster. Also created OCICompliant: Application trait to make building images cleaner' (#76) from feat/oci into master
Reviewed-on: #76
2025-07-03 19:37:46 +00:00
7b0f3b79b1 Merge remote-tracking branch 'origin/master' into feat/oci
2025-07-03 15:36:52 -04:00
e6612245a5 Merge pull request 'feat/cd/localdeploymentdemo' (#79) from feat/cd/localdeploymentdemo into feat/oci
Reviewed-on: #79
2025-07-03 19:31:45 +00:00
b4f5b91a57 feat: WIP argocd_score (#78)
Co-authored-by: tahahawa <tahahawa@gmail.com>
Reviewed-on: #78
Reviewed-by: johnride <jg@nationtech.io>
Co-authored-by: Taha Hawa <taha@taha.dev>
Co-committed-by: Taha Hawa <taha@taha.dev>
2025-07-03 19:30:00 +00:00
38 changed files with 2374 additions and 91 deletions

View File

@@ -3,8 +3,18 @@ use std::{path::PathBuf, sync::Arc};
use harmony::{
inventory::Inventory,
maestro::Maestro,
modules::application::{
RustWebFramework, RustWebapp, RustWebappScore, features::ContinuousDelivery,
modules::{
application::{
ApplicationScore, RustWebFramework, RustWebapp,
features::{ContinuousDelivery, PrometheusMonitoring},
},
monitoring::{
alert_channel::discord_alert_channel::DiscordWebhook,
alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
},
prometheus::alerts::k8s::{
pod::pod_in_failed_state, pvc::high_pvc_fill_rate_over_two_days,
},
},
topology::{K8sAnywhereTopology, Url},
};
@@ -12,18 +22,34 @@ use harmony::{
#[tokio::main]
async fn main() {
env_logger::init();
let application = RustWebapp {
let application = Arc::new(RustWebapp {
name: "harmony-example-rust-webapp".to_string(),
domain: Url::Url(url::Url::parse("https://rustapp.harmony.example.com").unwrap()),
project_root: PathBuf::from("./examples/rust/webapp"),
framework: Some(RustWebFramework::Leptos),
};
// TODO RustWebappScore should simply take a RustWebApp as config
let app = RustWebappScore {
name: "Example Rust Webapp".to_string(),
domain: Url::Url(url::Url::parse("https://rustapp.harmony.example.com").unwrap()),
features: vec![Box::new(ContinuousDelivery {
application: Arc::new(application.clone()),
})],
});
let pod_failed = pod_in_failed_state();
let pod_failed_2 = pod_in_failed_state();
let pod_failed_3 = pod_in_failed_state();
let additional_rules = AlertManagerRuleGroup::new("pod-alerts", vec![pod_failed]);
let additional_rules_2 = AlertManagerRuleGroup::new("pod-alerts-2", vec![pod_failed_2, pod_failed_3]);
let app = ApplicationScore {
features: vec![
//Box::new(ContinuousDelivery {
// application: application.clone(),
//}),
Box::new(PrometheusMonitoring {
application: application.clone(),
alert_receivers: vec![Box::new(DiscordWebhook {
name: "dummy-discord".to_string(),
url: Url::Url(url::Url::parse("https://discord.doesnt.exist.com").unwrap()),
})],
alert_rules: vec![Box::new(additional_rules), Box::new(additional_rules_2)],
}),
// TODO add monitoring, backups, multisite ha, etc
],
application,
};

View File

@@ -13,7 +13,7 @@ reqwest = { version = "0.11", features = ["blocking", "json"] }
russh = "0.45.0"
rust-ipmi = "0.1.1"
semver = "1.0.23"
serde = { version = "1.0.209", features = ["derive"] }
serde = { version = "1.0.209", features = ["derive", "rc"] }
serde_json = "1.0.127"
tokio.workspace = true
derive-new.workspace = true

View File

@@ -22,6 +22,7 @@ pub enum InterpretName {
K3dInstallation,
TenantInterpret,
Application,
ArgoCD,
}
impl std::fmt::Display for InterpretName {
@@ -39,6 +40,7 @@ impl std::fmt::Display for InterpretName {
InterpretName::K3dInstallation => f.write_str("K3dInstallation"),
InterpretName::TenantInterpret => f.write_str("Tenant"),
InterpretName::Application => f.write_str("Application"),
InterpretName::ArgoCD => f.write_str("ArgoCD"),
}
}
}

View File

@@ -0,0 +1,59 @@
////////////////////
/// Working idea
///
///
trait ScoreWithDep<T> {
fn create_interpret(&self) -> Box<dyn Interpret<T>>;
fn name(&self) -> String;
fn get_dependencies(&self) -> Vec<TypeId>; // Force T to impl Installer<TypeId> or something
// like that
}
struct PrometheusAlertScore;
impl <T> ScoreWithDep<T> for PrometheusAlertScore {
fn create_interpret(&self) -> Box<dyn Interpret<T>> {
todo!()
}
fn name(&self) -> String {
todo!()
}
fn get_dependencies(&self) -> Vec<TypeId> {
// We have to find a way to constrain this here so at compile time we are only allowed to return
// TypeId for types which can be installed by T
//
// This means, for example that T must implement HelmCommand if the impl <T: HelmCommand> Installable<T> for
// KubePrometheus calls for HelmCommand.
vec![TypeId::of::<KubePrometheus>()]
}
}
trait Installable{}
struct KubePrometheus;
impl Installable for KubePrometheus;
struct Maestro<T> {
topology: T
}
impl <T>Maestro<T> {
fn execute_store(&self, score: ScoreWithDep<T>) {
score.get_dependencies().iter().for_each(|dep| {
self.topology.ensure_dependency_ready(dep);
});
}
}
struct TopologyWithDep {
}
impl TopologyWithDep {
fn ensure_dependency_ready(&self, type_id: TypeId) -> Result<(), String> {
self.installer
}
}
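
One possible shape for the compile-time constraint the comment above asks for, sketched with hypothetical names (Dependency, a generic Installable<T>) that are not part of this change: wrap the TypeId in a handle that can only be constructed from types the topology T knows how to install, so get_dependencies() cannot return an unrelated TypeId.

use std::any::TypeId;
use std::marker::PhantomData;

// Hypothetical: types installable on a given topology T.
trait Installable<T> {}

// Hypothetical: a dependency handle tied to the topology type.
struct Dependency<T> {
    type_id: TypeId,
    _topology: PhantomData<T>,
}

impl<T> Dependency<T> {
    // D must implement Installable<T>, so the constraint is checked here.
    fn of<D: Installable<T> + 'static>() -> Self {
        Dependency { type_id: TypeId::of::<D>(), _topology: PhantomData }
    }
}

// get_dependencies() would then return Vec<Dependency<T>> instead of Vec<TypeId>:
// vec![Dependency::of::<KubePrometheus>()]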

View File

@@ -4,8 +4,6 @@ use k8s_openapi::{
ClusterResourceScope, NamespaceResourceScope,
api::{apps::v1::Deployment, core::v1::Pod},
};
use kube::runtime::conditions;
use kube::runtime::wait::await_condition;
use kube::{
Client, Config, Error, Resource,
api::{Api, AttachParams, ListParams, Patch, PatchParams, ResourceExt},
@@ -13,6 +11,11 @@ use kube::{
core::ErrorResponse,
runtime::reflector::Lookup,
};
use kube::{api::DynamicObject, runtime::conditions};
use kube::{
api::{ApiResource, GroupVersionKind},
runtime::wait::await_condition,
};
use log::{debug, error, trace};
use serde::de::DeserializeOwned;
use similar::{DiffableStr, TextDiff};
@@ -239,6 +242,54 @@ impl K8sClient {
Ok(result)
}
pub async fn apply_yaml_many(
&self,
yaml: &Vec<serde_yaml::Value>,
ns: Option<&str>,
) -> Result<(), Error> {
for y in yaml.iter() {
self.apply_yaml(y, ns).await?;
}
Ok(())
}
pub async fn apply_yaml(
&self,
yaml: &serde_yaml::Value,
ns: Option<&str>,
) -> Result<(), Error> {
let obj: DynamicObject = serde_yaml::from_value(yaml.clone()).expect("TODO do not unwrap");
let name = obj.metadata.name.as_ref().expect("YAML must have a name");
let namespace = obj
.metadata
.namespace
.as_ref()
.expect("YAML must have a namespace");
// 4. Define the API resource type using the GVK from the object.
// The plural name 'applications' is taken from your CRD definition.
error!("This only supports argocd application harcoded, very rrrong");
let gvk = GroupVersionKind::gvk("argoproj.io", "v1alpha1", "Application");
let api_resource = ApiResource::from_gvk_with_plural(&gvk, "applications");
// 5. Create a dynamic API client for this resource type.
let api: Api<DynamicObject> =
Api::namespaced_with(self.client.clone(), namespace, &api_resource);
// 6. Apply the object to the cluster using Server-Side Apply.
// This will create the resource if it doesn't exist, or update it if it does.
println!(
"Applying Argo Application '{}' in namespace '{}'...",
name, namespace
);
let patch_params = PatchParams::apply("harmony"); // Use a unique field manager name
let result = api.patch(name, &patch_params, &Patch::Apply(&obj)).await?;
println!("Successfully applied '{}'.", result.name_any());
Ok(())
}
pub(crate) async fn from_kubeconfig(path: &str) -> Option<K8sClient> {
let k = match Kubeconfig::read_from(path) {
Ok(k) => k,
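
A minimal usage sketch of the new helpers, assuming a K8sClient named `client` is already available and `manifests` is a multi-document YAML string: each document is parsed to a serde_yaml::Value and applied in order. As the error! above notes, apply_yaml currently only handles the hardcoded argoproj.io/v1alpha1 Application GVK.

use serde::Deserialize;

// `client` and `manifests` are assumed to exist in the surrounding code.
let docs: Vec<serde_yaml::Value> = serde_yaml::Deserializer::from_str(manifests)
    .map(|doc| serde_yaml::Value::deserialize(doc).expect("invalid YAML document"))
    .collect();
client.apply_yaml_many(&docs, Some("argocd")).await?;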

View File

@@ -246,7 +246,7 @@ pub struct K8sAnywhereConfig {
///
/// default: true
pub use_local_k3d: bool,
harmony_profile: String,
pub harmony_profile: String,
}
impl K8sAnywhereConfig {

View File

@@ -1,3 +1,5 @@
use std::any::Any;
use async_trait::async_trait;
use log::debug;
@@ -9,7 +11,7 @@ use crate::{
};
#[async_trait]
pub trait AlertSender: Send + Sync + std::fmt::Debug {
pub trait AlertSender: Any + Send + Sync + std::fmt::Debug {
fn name(&self) -> String;
}

View File

@@ -3,7 +3,6 @@ use serde::Serialize;
use crate::topology::Topology;
use super::Application;
/// An ApplicationFeature provided by harmony, such as Backups, Monitoring, MultisiteAvailability,
/// ContinuousIntegration, ContinuousDelivery
#[async_trait]

View File

@@ -0,0 +1,357 @@
use std::{backtrace, collections::HashMap};
use k8s_openapi::{Metadata, NamespaceResourceScope, Resource};
use log::debug;
use serde::Serialize;
use serde_yaml::Value;
use url::Url;
use crate::modules::application::features::CDApplicationConfig;
#[derive(Clone, Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Helm {
pub pass_credentials: Option<bool>,
pub parameters: Vec<Value>,
pub file_parameters: Vec<Value>,
pub release_name: Option<String>,
pub value_files: Vec<String>,
pub ignore_missing_value_files: Option<bool>,
pub values: Option<String>,
pub values_object: Option<Value>,
pub skip_crds: Option<bool>,
pub skip_schema_validation: Option<bool>,
pub version: Option<String>,
pub kube_version: Option<String>,
pub api_versions: Vec<String>,
pub namespace: Option<String>,
}
#[derive(Clone, Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Source {
pub repo_url: Url,
pub target_revision: Option<String>,
pub chart: String,
pub helm: Helm,
}
#[derive(Clone, Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Automated {
pub prune: bool,
pub self_heal: bool,
pub allow_empty: bool,
}
#[derive(Clone, Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Backoff {
pub duration: String,
pub factor: u32,
pub max_duration: String,
}
#[derive(Clone, Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Retry {
pub limit: u32,
pub backoff: Backoff,
}
#[derive(Clone, Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct SyncPolicy {
pub automated: Automated,
pub sync_options: Vec<String>,
pub retry: Retry,
}
#[derive(Clone, Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct ArgoApplication {
pub name: String,
pub namespace: Option<String>,
pub project: String,
pub source: Source,
pub sync_policy: SyncPolicy,
pub revision_history_limit: u32,
}
impl Default for ArgoApplication {
fn default() -> Self {
Self {
name: Default::default(),
namespace: Default::default(),
project: Default::default(),
source: Source {
repo_url: Url::parse("http://asdf").expect("Couldn't parse to URL"),
target_revision: None,
chart: "".to_string(),
helm: Helm {
pass_credentials: None,
parameters: vec![],
file_parameters: vec![],
release_name: None,
value_files: vec![],
ignore_missing_value_files: None,
values: None,
values_object: None,
skip_crds: None,
skip_schema_validation: None,
version: None,
kube_version: None,
api_versions: vec![],
namespace: None,
},
},
sync_policy: SyncPolicy {
automated: Automated {
prune: false,
self_heal: false,
allow_empty: false,
},
sync_options: vec![],
retry: Retry {
limit: 5,
backoff: Backoff {
duration: "5s".to_string(),
factor: 2,
max_duration: "3m".to_string(),
},
},
},
revision_history_limit: 10,
}
}
}
impl From<CDApplicationConfig> for ArgoApplication {
fn from(value: CDApplicationConfig) -> Self {
Self {
name: value.name,
namespace: Some(value.namespace),
project: "default".to_string(),
source: Source {
repo_url: Url::parse(value.helm_chart_repo_url.to_string().as_str())
.expect("couldn't convert to URL"),
target_revision: None,
chart: value.helm_chart_name,
helm: Helm {
pass_credentials: None,
parameters: vec![],
file_parameters: vec![],
release_name: None,
value_files: vec![],
ignore_missing_value_files: None,
values: None,
values_object: Some(value.values_overrides),
skip_crds: None,
skip_schema_validation: None,
version: None,
kube_version: None,
api_versions: vec![],
namespace: None,
},
},
sync_policy: SyncPolicy {
automated: Automated {
prune: false,
self_heal: false,
allow_empty: true,
},
sync_options: vec![],
retry: Retry {
limit: 5,
backoff: Backoff {
duration: "5s".to_string(),
factor: 2,
max_duration: "3m".to_string(),
},
},
},
..Self::default()
}
}
}
impl ArgoApplication {
pub fn to_yaml(&self) -> serde_yaml::Value {
let name = &self.name;
let namespace = if let Some(ns) = self.namespace.as_ref() {
&ns
} else {
"argocd"
};
let project = &self.project;
let source = &self.source;
let yaml_str = format!(
r#"
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: {name}
# You'll usually want to add your resources to the argocd namespace.
namespace: {namespace}
spec:
# The project the application belongs to.
project: {project}
# Destination cluster and namespace to deploy the application
destination:
# cluster API URL
server: https://kubernetes.default.svc
# or cluster name
# name: in-cluster
# The namespace will only be set for namespace-scoped resources that have not set a value for .metadata.namespace
namespace: {namespace}
"#
);
let mut yaml_value: Value =
serde_yaml::from_str(yaml_str.as_str()).expect("couldn't parse string to YAML");
let mut spec = yaml_value
.get_mut("spec")
.expect("couldn't get spec from yaml")
.as_mapping_mut()
.expect("couldn't unwrap spec as mutable mapping");
let source =
serde_yaml::to_value(&self.source).expect("couldn't serialize source to value");
let sync_policy = serde_yaml::to_value(&self.sync_policy)
.expect("couldn't serialize sync_policy to value");
let revision_history_limit = serde_yaml::to_value(&self.revision_history_limit)
.expect("couldn't serialize revision_history_limit to value");
spec.insert(
serde_yaml::to_value("source").expect("string to value failed"),
source,
);
spec.insert(
serde_yaml::to_value("syncPolicy").expect("string to value failed"),
sync_policy,
);
spec.insert(
serde_yaml::to_value("revisionHistoryLimit")
.expect("couldn't convert str to yaml value"),
revision_history_limit,
);
debug!("spec: {}", serde_yaml::to_string(spec).unwrap());
debug!(
"entire yaml_value: {}",
serde_yaml::to_string(&yaml_value).unwrap()
);
yaml_value
}
}
#[cfg(test)]
mod tests {
use url::Url;
use crate::modules::application::features::{
ArgoApplication, Automated, Backoff, Helm, Retry, Source, SyncPolicy,
};
#[test]
fn test_argo_application_to_yaml_happy_path() {
let app = ArgoApplication {
name: "test".to_string(),
namespace: Some("test-ns".to_string()),
project: "test-project".to_string(),
source: Source {
repo_url: Url::parse("http://test").unwrap(),
target_revision: None,
chart: "test-chart".to_string(),
helm: Helm {
pass_credentials: None,
parameters: vec![],
file_parameters: vec![],
release_name: Some("test-release-neame".to_string()),
value_files: vec![],
ignore_missing_value_files: None,
values: None,
values_object: None,
skip_crds: None,
skip_schema_validation: None,
version: None,
kube_version: None,
api_versions: vec![],
namespace: None,
},
},
sync_policy: SyncPolicy {
automated: Automated {
prune: false,
self_heal: false,
allow_empty: false,
},
sync_options: vec![],
retry: Retry {
limit: 5,
backoff: Backoff {
duration: "5s".to_string(),
factor: 2,
max_duration: "3m".to_string(),
},
},
},
revision_history_limit: 10,
};
let expected_yaml_output = r#"apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: test
namespace: test-ns
spec:
project: test-project
destination:
server: https://kubernetes.default.svc
namespace: test-ns
source:
repoUrl: http://test/
targetRevision: null
chart: test-chart
helm:
passCredentials: null
parameters: []
fileParameters: []
releaseName: test-release-neame
valueFiles: []
ignoreMissingValueFiles: null
values: null
valuesObject: null
skipCrds: null
skipSchemaValidation: null
version: null
kubeVersion: null
apiVersions: []
namespace: null
syncPolicy:
automated:
prune: false
selfHeal: false
allowEmpty: false
syncOptions: []
retry:
limit: 5
backoff:
duration: 5s
factor: 2
maxDuration: 3m
revisionHistoryLimit: 10"#;
assert_eq!(
expected_yaml_output.trim(),
serde_yaml::to_string(&app.clone().to_yaml())
.unwrap()
.trim()
);
}
}
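
Hedged sketch of the intended wiring (placeholder chart and app names; the real call site is in the ContinuousDelivery feature further down, and `k8s_client` is assumed to come from the topology): convert a CDApplicationConfig into an ArgoApplication, render it with to_yaml(), and apply it with the new K8sClient::apply_yaml.

let argo_app = ArgoApplication::from(CDApplicationConfig {
    version: Version::from("0.1.0").unwrap(),
    helm_chart_repo_url: Url::Url(url::Url::parse("oci://registry.example.com/charts/my-app-chart").unwrap()),
    helm_chart_name: "my-app-chart".to_string(),
    values_overrides: serde_yaml::Value::Null,
    name: "my-app".to_string(),
    namespace: "my-app-staging".to_string(),
});
k8s_client.apply_yaml(&argo_app.to_yaml(), None).await?;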

View File

@@ -2,7 +2,7 @@ use std::{io::Write, process::Command, sync::Arc};
use async_trait::async_trait;
use log::{error, info};
use serde_json::Value;
use serde_yaml::Value;
use tempfile::NamedTempFile;
use crate::{
@@ -10,11 +10,14 @@ use crate::{
data::Version,
inventory::Inventory,
modules::{
application::{Application, ApplicationFeature, HelmPackage, OCICompliant},
application::{
Application, ApplicationFeature, HelmPackage, OCICompliant,
features::{ArgoApplication, ArgoHelmScore},
},
helm::chart::HelmChartScore,
},
score::Score,
topology::{DeploymentTarget, HelmCommand, MultiTargetTopology, Topology, Url},
topology::{DeploymentTarget, HelmCommand, K8sclient, MultiTargetTopology, Topology, Url},
};
/// ContinuousDelivery in Harmony provides this functionality :
@@ -139,7 +142,7 @@ impl<A: OCICompliant + HelmPackage> ContinuousDelivery<A> {
#[async_trait]
impl<
A: OCICompliant + HelmPackage + Clone + 'static,
T: Topology + HelmCommand + MultiTargetTopology + 'static,
T: Topology + HelmCommand + MultiTargetTopology + K8sclient + 'static,
> ApplicationFeature<T> for ContinuousDelivery<A>
{
async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
@@ -150,12 +153,16 @@ impl<
"TODO reverse helm chart packaging and docker image build. I put helm package first for faster iterations"
);
// TODO Write CI/CD workflow files
// we can autodetect the CI type using the remote url (default to github action for github
// url, etc..)
// Or ask for it when unknown
let helm_chart = self.application.build_push_helm_package(&image).await?;
info!("Pushed new helm chart {helm_chart}");
// let image = self.application.build_push_oci_image().await?;
// info!("Pushed new docker image {image}");
error!("uncomment above");
let image = self.application.build_push_oci_image().await?;
info!("Pushed new docker image {image}");
info!("Installing ContinuousDelivery feature");
// TODO this is a temporary hack for demo purposes, the deployment target should be driven
@@ -178,29 +185,33 @@ impl<
}
target => {
info!("Deploying to target {target:?}");
let cd_server = HelmChartScore {
namespace: todo!(
"ArgoCD Helm chart with proper understanding of Tenant, see how Will did it for Monitoring for now"
),
release_name: todo!("argocd helm chart whatever"),
chart_name: todo!(),
chart_version: todo!(),
values_overrides: todo!(),
values_yaml: todo!(),
create_namespace: todo!(),
install_only: todo!(),
repository: todo!(),
let score = ArgoHelmScore {
namespace: "harmonydemo-staging".to_string(),
openshift: true,
domain: "argo.harmonydemo.apps.st.mcd".to_string(),
argo_apps: vec![ArgoApplication::from(CDApplicationConfig {
// helm pull oci://hub.nationtech.io/harmony/harmony-example-rust-webapp-chart/harmony-example-rust-webapp-chart --version 0.1.0
version: Version::from("0.1.0").unwrap(),
helm_chart_repo_url: Url::Url(url::Url::parse("oci://hub.nationtech.io/harmony/harmony-example-rust-webapp-chart/harmony-example-rust-webapp-chart").unwrap()),
helm_chart_name: "harmony-example-rust-webapp-chart".to_string(),
values_overrides: Value::Null,
name: "harmony-demo-rust-webapp".to_string(),
namespace: "harmonydemo-staging".to_string(),
})],
};
let interpret = cd_server.create_interpret();
interpret.execute(&Inventory::empty(), topology);
score
.create_interpret()
.execute(&Inventory::empty(), topology)
.await
.unwrap();
}
};
todo!("1. Create ArgoCD score that installs argo using helm chart, see if Taha's already done it
- [X] Package app (docker image, helm chart)
- [X] Push to registry
- [ ] Push only if staging or prod
- [ ] Deploy to local k3d when target is local
- [X] Push only if staging or prod
- [X] Deploy to local k3d when target is local
- [ ] Poke Argo
- [ ] Ensure app is up")
}
@@ -212,9 +223,12 @@ impl<
/// For now this is entirely bound to K8s / ArgoCD, will have to be revisited when we support
/// more CD systems
pub struct CDApplicationConfig {
version: Version,
helm_chart_url: Url,
values_overrides: Value,
pub version: Version,
pub helm_chart_repo_url: Url,
pub helm_chart_name: String,
pub values_overrides: Value,
pub name: String,
pub namespace: String,
}
pub trait ContinuousDeliveryApplication {

View File

@@ -2,7 +2,7 @@ use async_trait::async_trait;
use log::info;
use crate::{
modules::application::{Application, ApplicationFeature},
modules::application::ApplicationFeature,
topology::{K8sclient, Topology},
};

File diff suppressed because it is too large.

View File

@@ -6,3 +6,9 @@ pub use monitoring::*;
mod continuous_delivery;
pub use continuous_delivery::*;
mod helm_argocd_score;
pub use helm_argocd_score::*;
mod argo_types;
pub use argo_types::*;

View File

@@ -1,19 +1,53 @@
use std::sync::Arc;
use async_trait::async_trait;
use log::info;
use crate::{
modules::application::{Application, ApplicationFeature},
topology::{HelmCommand, Topology},
inventory::Inventory,
modules::{
application::{Application, ApplicationFeature},
monitoring::{
application_monitoring::k8s_application_monitoring_score::ApplicationPrometheusMonitoringScore,
kube_prometheus::types::{NamespaceSelector, ServiceMonitor}, prometheus::prometheus::Prometheus,
},
},
score::Score,
topology::{oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender}, tenant::TenantManager, HelmCommand, K8sclient, Topology},
};
#[derive(Debug, Default, Clone)]
pub struct Monitoring {}
#[derive(Debug, Clone)]
pub struct PrometheusMonitoring {
pub application: Arc<dyn Application>,
pub alert_receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
pub alert_rules: Vec<Box<dyn AlertRule<Prometheus>>>,
}
#[async_trait]
impl<T: Topology + HelmCommand + 'static> ApplicationFeature<T> for Monitoring {
async fn ensure_installed(&self, _topology: &T) -> Result<(), String> {
impl<T: Topology + HelmCommand + 'static + TenantManager> ApplicationFeature<T> for PrometheusMonitoring {
async fn ensure_installed(&self, topology: &T) -> Result<(), String> {
info!("Ensuring monitoring is available for application");
todo!("create and execute k8s prometheus score, depends on Will's work")
let ns = self.application.name();
let mut service_monitor = ServiceMonitor::default();
service_monitor.name = ns.clone();
service_monitor.namespace = ns.clone();
service_monitor.namespace_selector = Some(NamespaceSelector {
any: true,
match_names: vec![ns.clone()],
});
let alerting_score = ApplicationPrometheusMonitoringScore {
namespace: ns,
receivers: self.alert_receivers.clone(),
rules: self.alert_rules.clone(),
service_monitors: vec![service_monitor],
};
alerting_score
.create_interpret()
.execute(&Inventory::empty(), topology)
.await
.unwrap();
Ok(())
}
fn name(&self) -> String {
"Monitoring".to_string()

View File

@@ -23,13 +23,13 @@ pub trait Application: std::fmt::Debug + Send + Sync {
}
#[derive(Debug)]
pub struct ApplicationInterpret<T: Topology + std::fmt::Debug> {
pub struct ApplicationInterpret<A: Application, T: Topology + std::fmt::Debug> {
features: Vec<Box<dyn ApplicationFeature<T>>>,
application: Arc<Box<dyn Application>>,
application: Arc<A>,
}
#[async_trait]
impl<T: Topology + std::fmt::Debug> Interpret<T> for ApplicationInterpret<T> {
impl<A: Application, T: Topology + std::fmt::Debug> Interpret<T> for ApplicationInterpret<A, T> {
async fn execute(
&self,
_inventory: &Inventory,

View File

@@ -19,23 +19,30 @@ use crate::{
use super::{Application, ApplicationFeature, ApplicationInterpret, HelmPackage, OCICompliant};
#[derive(Debug, Serialize, Clone)]
pub struct RustWebappScore<T: Topology + Clone + Serialize> {
pub name: String,
pub domain: Url,
pub struct ApplicationScore<A: Application + Serialize, T: Topology + Clone + Serialize>
where
Arc<A>: Serialize + Clone,
{
pub features: Vec<Box<dyn ApplicationFeature<T>>>,
pub application: RustWebapp,
pub application: Arc<A>,
}
impl<T: Topology + std::fmt::Debug + Clone + Serialize + 'static> Score<T> for RustWebappScore<T> {
impl<
A: Application + Serialize + Clone + 'static,
T: Topology + std::fmt::Debug + Clone + Serialize + 'static,
> Score<T> for ApplicationScore<A, T>
where
Arc<A>: Serialize,
{
fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
Box::new(ApplicationInterpret {
features: self.features.clone(),
application: Arc::new(Box::new(self.application.clone())),
application: self.application.clone(),
})
}
fn name(&self) -> String {
format!("{}-RustWebapp", self.name)
format!("Application: {}", self.application.name())
}
}
@@ -47,6 +54,7 @@ pub enum RustWebFramework {
#[derive(Debug, Clone, Serialize)]
pub struct RustWebapp {
pub name: String,
pub domain: Url,
/// The path to the root of the Rust project to be containerized.
pub project_root: PathBuf,
pub framework: Option<RustWebFramework>,

View File

@@ -220,7 +220,6 @@ impl<T: Topology + HelmCommand> Interpret<T> for HelmChartInterpret {
yaml_path,
Some(&helm_options),
);
let status = match res {
Ok(status) => status,
Err(err) => return Err(InterpretError::new(err.to_string())),

View File

@@ -1,14 +1,22 @@
use std::any::Any;
use async_trait::async_trait;
use serde::Serialize;
use serde_yaml::{Mapping, Value};
use crate::{
interpret::{InterpretError, Outcome},
modules::monitoring::kube_prometheus::{
prometheus::{Prometheus, PrometheusReceiver},
types::{AlertChannelConfig, AlertManagerChannelConfig},
modules::monitoring::{
kube_prometheus::{
prometheus::{KubePrometheus, KubePrometheusReceiver},
types::{AlertChannelConfig, AlertManagerChannelConfig},
},
prometheus::prometheus::{Prometheus, PrometheusReceiver},
},
topology::{
Url,
oberservability::monitoring::{AlertReceiver, AlertSender},
},
topology::{Url, oberservability::monitoring::AlertReceiver},
};
#[derive(Debug, Clone, Serialize)]
@@ -37,6 +45,26 @@ impl PrometheusReceiver for DiscordWebhook {
}
}
#[async_trait]
impl AlertReceiver<KubePrometheus> for DiscordWebhook {
async fn install(&self, sender: &KubePrometheus) -> Result<Outcome, InterpretError> {
sender.install_receiver(self).await
}
fn clone_box(&self) -> Box<dyn AlertReceiver<KubePrometheus>> {
Box::new(self.clone())
}
}
#[async_trait]
impl KubePrometheusReceiver for DiscordWebhook {
fn name(&self) -> String {
self.name.clone()
}
async fn configure_receiver(&self) -> AlertManagerChannelConfig {
self.get_config().await
}
}
#[async_trait]
impl AlertChannelConfig for DiscordWebhook {
async fn get_config(&self) -> AlertManagerChannelConfig {

View File

@@ -4,9 +4,12 @@ use serde_yaml::{Mapping, Value};
use crate::{
interpret::{InterpretError, Outcome},
modules::monitoring::kube_prometheus::{
prometheus::{Prometheus, PrometheusReceiver},
types::{AlertChannelConfig, AlertManagerChannelConfig},
modules::monitoring::{
kube_prometheus::{
prometheus::{KubePrometheus, KubePrometheusReceiver},
types::{AlertChannelConfig, AlertManagerChannelConfig},
},
prometheus::prometheus::{Prometheus, PrometheusReceiver},
},
topology::{Url, oberservability::monitoring::AlertReceiver},
};
@@ -36,6 +39,25 @@ impl PrometheusReceiver for WebhookReceiver {
self.get_config().await
}
}
#[async_trait]
impl AlertReceiver<KubePrometheus> for WebhookReceiver {
async fn install(&self, sender: &KubePrometheus) -> Result<Outcome, InterpretError> {
sender.install_receiver(self).await
}
fn clone_box(&self) -> Box<dyn AlertReceiver<KubePrometheus>> {
Box::new(self.clone())
}
}
#[async_trait]
impl KubePrometheusReceiver for WebhookReceiver {
fn name(&self) -> String {
self.name.clone()
}
async fn configure_receiver(&self) -> AlertManagerChannelConfig {
self.get_config().await
}
}
#[async_trait]
impl AlertChannelConfig for WebhookReceiver {

View File

@@ -5,13 +5,26 @@ use serde::Serialize;
use crate::{
interpret::{InterpretError, Outcome},
modules::monitoring::kube_prometheus::{
prometheus::{Prometheus, PrometheusRule},
types::{AlertGroup, AlertManagerAdditionalPromRules},
modules::monitoring::{
kube_prometheus::{
prometheus::{KubePrometheus, KubePrometheusRule},
types::{AlertGroup, AlertManagerAdditionalPromRules},
},
prometheus::prometheus::{Prometheus, PrometheusRule},
},
topology::oberservability::monitoring::AlertRule,
};
#[async_trait]
impl AlertRule<KubePrometheus> for AlertManagerRuleGroup {
async fn install(&self, sender: &KubePrometheus) -> Result<Outcome, InterpretError> {
sender.install_rule(&self).await
}
fn clone_box(&self) -> Box<dyn AlertRule<KubePrometheus>> {
Box::new(self.clone())
}
}
#[async_trait]
impl AlertRule<Prometheus> for AlertManagerRuleGroup {
async fn install(&self, sender: &Prometheus) -> Result<Outcome, InterpretError> {
@@ -41,6 +54,25 @@ impl PrometheusRule for AlertManagerRuleGroup {
}
}
}
#[async_trait]
impl KubePrometheusRule for AlertManagerRuleGroup {
fn name(&self) -> String {
self.name.clone()
}
async fn configure_rule(&self) -> AlertManagerAdditionalPromRules {
let mut additional_prom_rules = BTreeMap::new();
additional_prom_rules.insert(
self.name.clone(),
AlertGroup {
groups: vec![self.clone()],
},
);
AlertManagerAdditionalPromRules {
rules: additional_prom_rules,
}
}
}
impl AlertManagerRuleGroup {
pub fn new(name: &str, rules: Vec<PrometheusAlertRule>) -> AlertManagerRuleGroup {

View File

@@ -0,0 +1,45 @@
use std::sync::{Arc, Mutex};
use log::debug;
use serde::Serialize;
use crate::{
modules::monitoring::{
kube_prometheus::types::ServiceMonitor,
prometheus::{prometheus::Prometheus, prometheus_config::HelmPrometheusConfig},
},
score::Score,
topology::{
oberservability::monitoring::{AlertReceiver, AlertRule, AlertingInterpret}, tenant::TenantManager, HelmCommand, K8sclient, Topology
},
};
#[derive(Clone, Debug, Serialize)]
pub struct ApplicationPrometheusMonitoringScore {
pub namespace: String,
pub receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
pub rules: Vec<Box<dyn AlertRule<Prometheus>>>,
pub service_monitors: Vec<ServiceMonitor>,
}
impl<T: Topology + HelmCommand + TenantManager> Score<T> for ApplicationPrometheusMonitoringScore {
fn create_interpret(&self) -> Box<dyn crate::interpret::Interpret<T>> {
let config = Arc::new(Mutex::new(HelmPrometheusConfig::new()));
config
.try_lock()
.expect("couldn't lock config")
.additional_service_monitors = self.service_monitors.clone();
let ns = self.namespace.clone();
config.try_lock().expect("couldn't lock config").namespace = Some(ns.clone());
debug!("set namespace to {}", ns);
Box::new(AlertingInterpret {
sender: Prometheus { config },
receivers: self.receivers.clone(),
rules: self.rules.clone(),
})
}
fn name(&self) -> String {
"ApplicationPrometheusMonitoringScore".to_string()
}
}
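
The PrometheusMonitoring application feature shown earlier builds this score from an Application, but it can also be constructed directly; a hedged sketch with placeholder names, assuming `topology` implements Topology + HelmCommand + TenantManager:

let mut service_monitor = ServiceMonitor::default();
service_monitor.name = "my-app".to_string();
service_monitor.namespace = "my-app".to_string();

let score = ApplicationPrometheusMonitoringScore {
    namespace: "my-app".to_string(),
    receivers: vec![Box::new(DiscordWebhook {
        name: "ops-discord".to_string(),
        url: Url::Url(url::Url::parse("https://discord.example.com/webhook").unwrap()),
    })],
    rules: vec![Box::new(AlertManagerRuleGroup::new(
        "pod-alerts",
        vec![pod_in_failed_state()],
    ))],
    service_monitors: vec![service_monitor],
};
score
    .create_interpret()
    .execute(&Inventory::empty(), &topology)
    .await?;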

View File

@@ -0,0 +1 @@
pub mod k8s_application_monitoring_score;

View File

@@ -0,0 +1,64 @@
use std::str::FromStr;
use non_blank_string_rs::NonBlankString;
use crate::modules::helm::chart::HelmChartScore;
pub fn grafana_helm_chart_score(ns: &str) -> HelmChartScore {
let values = format!(
r#"
rbac:
namespaced: true
datasources:
datasources.yaml:
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus-server.{ns}.svc.cluster.local
isDefault: true
dashboardProviders:
dashboardproviders.yaml:
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: ''
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
editable: true
options:
path: /var/lib/grafana/dashboards/default
dashboards:
default:
compute-usage:
url: https://grafana.com/api/dashboards/315/revisions/1/download
pod-health:
url: https://grafana.com/api/dashboards/15758/revisions/1/download
namespace-resources:
url: https://grafana.com/api/dashboards/9809/revisions/1/download
namespace-resources-vs-quotas:
url: https://grafana.com/api/dashboards/17044/revisions/1/download
persistent-volume-usage:
url: https://grafana.com/api/dashboards/7685/revisions/1/download
"#,
ns = ns
);
HelmChartScore {
namespace: Some(NonBlankString::from_str(ns).unwrap()),
release_name: NonBlankString::from_str("grafana").unwrap(),
chart_name: NonBlankString::from_str("oci://ghcr.io/grafana/helm-charts/grafana").unwrap(),
chart_version: None,
values_overrides: None,
values_yaml: Some(values),
create_namespace: true,
install_only: false,
repository: None,
}
}
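
Since the helper just returns a HelmChartScore, it can be executed like any other score; a minimal sketch, assuming `topology` provides HelmCommand:

grafana_helm_chart_score("my-app")
    .create_interpret()
    .execute(&Inventory::empty(), &topology)
    .await?;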

View File

@@ -0,0 +1 @@
pub mod helm_grafana;

View File

@@ -0,0 +1 @@
pub mod helm;

View File

@@ -68,6 +68,9 @@ pub fn kube_prometheus_helm_chart_score(
let mut values = format!(
r#"
global:
rbac:
create: false
prometheus:
enabled: {prometheus}
prometheusSpec:
@@ -242,7 +245,7 @@ prometheus-node-exporter:
cpu: 200m
memory: 250Mi
prometheusOperator:
enabled: {prometheus_operator}
enabled: false
resources:
requests:
cpu: 100m

View File

@@ -2,7 +2,7 @@ use std::sync::{Arc, Mutex};
use serde::Serialize;
use super::{helm::config::KubePrometheusConfig, prometheus::Prometheus};
use super::{helm::config::KubePrometheusConfig, prometheus::KubePrometheus};
use crate::{
modules::monitoring::kube_prometheus::types::ServiceMonitor,
score::Score,
@@ -15,8 +15,8 @@ use crate::{
#[derive(Clone, Debug, Serialize)]
pub struct HelmPrometheusAlertingScore {
pub receivers: Vec<Box<dyn AlertReceiver<Prometheus>>>,
pub rules: Vec<Box<dyn AlertRule<Prometheus>>>,
pub receivers: Vec<Box<dyn AlertReceiver<KubePrometheus>>>,
pub rules: Vec<Box<dyn AlertRule<KubePrometheus>>>,
pub service_monitors: Vec<ServiceMonitor>,
}
@@ -28,7 +28,7 @@ impl<T: Topology + HelmCommand + TenantManager> Score<T> for HelmPrometheusAlert
.expect("couldn't lock config")
.additional_service_monitors = self.service_monitors.clone();
Box::new(AlertingInterpret {
sender: Prometheus::new(),
sender: KubePrometheus::new(),
receivers: self.receivers.clone(),
rules: self.rules.clone(),
})

View File

@@ -10,7 +10,7 @@ use crate::{
modules::monitoring::alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
score,
topology::{
HelmCommand, K8sAnywhereTopology, Topology,
HelmCommand, Topology,
installable::Installable,
oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender},
tenant::TenantManager,
@@ -27,14 +27,14 @@ use super::{
};
#[async_trait]
impl AlertSender for Prometheus {
impl AlertSender for KubePrometheus {
fn name(&self) -> String {
"HelmKubePrometheus".to_string()
}
}
#[async_trait]
impl<T: Topology + HelmCommand + TenantManager> Installable<T> for Prometheus {
impl<T: Topology + HelmCommand + TenantManager> Installable<T> for KubePrometheus {
async fn configure(&self, _inventory: &Inventory, topology: &T) -> Result<(), InterpretError> {
self.configure_with_topology(topology).await;
Ok(())
@@ -51,11 +51,11 @@ impl<T: Topology + HelmCommand + TenantManager> Installable<T> for Prometheus {
}
#[derive(Debug)]
pub struct Prometheus {
pub struct KubePrometheus {
pub config: Arc<Mutex<KubePrometheusConfig>>,
}
impl Prometheus {
impl KubePrometheus {
pub fn new() -> Self {
Self {
config: Arc::new(Mutex::new(KubePrometheusConfig::new())),
@@ -75,7 +75,7 @@ impl Prometheus {
pub async fn install_receiver(
&self,
prometheus_receiver: &dyn PrometheusReceiver,
prometheus_receiver: &dyn KubePrometheusReceiver,
) -> Result<Outcome, InterpretError> {
let prom_receiver = prometheus_receiver.configure_receiver().await;
debug!(
@@ -120,12 +120,12 @@ impl Prometheus {
}
#[async_trait]
pub trait PrometheusReceiver: Send + Sync + std::fmt::Debug {
pub trait KubePrometheusReceiver: Send + Sync + std::fmt::Debug {
fn name(&self) -> String;
async fn configure_receiver(&self) -> AlertManagerChannelConfig;
}
impl Serialize for Box<dyn AlertReceiver<Prometheus>> {
impl Serialize for Box<dyn AlertReceiver<KubePrometheus>> {
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
@@ -134,19 +134,19 @@ impl Serialize for Box<dyn AlertReceiver<Prometheus>> {
}
}
impl Clone for Box<dyn AlertReceiver<Prometheus>> {
impl Clone for Box<dyn AlertReceiver<KubePrometheus>> {
fn clone(&self) -> Self {
self.clone_box()
}
}
#[async_trait]
pub trait PrometheusRule: Send + Sync + std::fmt::Debug {
pub trait KubePrometheusRule: Send + Sync + std::fmt::Debug {
fn name(&self) -> String;
async fn configure_rule(&self) -> AlertManagerAdditionalPromRules;
}
impl Serialize for Box<dyn AlertRule<Prometheus>> {
impl Serialize for Box<dyn AlertRule<KubePrometheus>> {
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
@@ -155,7 +155,7 @@ impl Serialize for Box<dyn AlertRule<Prometheus>> {
}
}
impl Clone for Box<dyn AlertRule<Prometheus>> {
impl Clone for Box<dyn AlertRule<KubePrometheus>> {
fn clone(&self) -> Self {
self.clone_box()
}

View File

@@ -211,8 +211,10 @@ pub struct Selector {
pub struct ServiceMonitor {
pub name: String,
pub namespace: String,
// # Additional labels to set used for the ServiceMonitorSelector. Together with standard labels from the chart
pub additional_labels: Option<Mapping>,
pub additional_labels: Option<HashMap<String, String>>,
// # Service label for use in assembling a job name of the form <label value>-<port>
// # If no label is specified, the service name is used.
@@ -240,7 +242,7 @@ pub struct ServiceMonitor {
// any: bool,
// # Explicit list of namespace names to select
// matchNames: Vec,
pub namespace_selector: Option<Mapping>,
pub namespace_selector: Option<NamespaceSelector>,
// # Endpoints of the selected service to be monitored
pub endpoints: Vec<ServiceMonitorEndpoint>,
@@ -250,10 +252,18 @@ pub struct ServiceMonitor {
pub fallback_scrape_protocol: Option<String>,
}
#[derive(Debug, Serialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct NamespaceSelector {
pub any: bool,
pub match_names: Vec<String>,
}
impl Default for ServiceMonitor {
fn default() -> Self {
Self {
name: Default::default(),
namespace: Default::default(),
additional_labels: Default::default(),
job_label: Default::default(),
target_labels: Default::default(),

View File

@@ -1,4 +1,7 @@
pub mod alert_channel;
pub mod alert_rule;
pub mod application_monitoring;
pub mod grafana;
pub mod kube_prometheus;
pub mod ntfy;
pub mod prometheus;

View File

@@ -0,0 +1,2 @@
pub mod prometheus_helm;
pub mod types;

View File

@@ -0,0 +1,155 @@
use std::collections::BTreeMap;
use std::str::FromStr;
use std::sync::{Arc, Mutex};
use log::debug;
use non_blank_string_rs::NonBlankString;
use serde_yaml::{Mapping, Value};
use crate::modules::helm::chart::HelmChartScore;
use crate::modules::monitoring::kube_prometheus::types::{
AlertGroup, AlertManager, AlertManagerConfig, AlertManagerRoute, AlertManagerSpec,
ConfigReloader, Limits, Requests, Resources,
};
use crate::modules::monitoring::prometheus::helm::types::{
AlertFile, EnabledConfig, KsmRbacConfig, KubeStateMetricsConfig, LabelSelector, Monitor,
Prometheus, PrometheusHelmValues, RbacConfig, ServerConfig, ServerRbacConfig,
};
use crate::modules::monitoring::prometheus::prometheus_config::HelmPrometheusConfig;
pub fn prometheus_helm_chart_score(config: Arc<Mutex<HelmPrometheusConfig>>) -> HelmChartScore {
let config = config.lock().unwrap();
let ns = config.namespace.clone().unwrap();
let rbac_config = RbacConfig { create: false };
let ksm_config = KubeStateMetricsConfig {
enabled: true,
rbac: KsmRbacConfig {
use_cluster_role: false,
},
prometheus: Prometheus {
monitor: Monitor { enabled: true },
},
};
let mut selector_labels = BTreeMap::new();
selector_labels.insert("kubernetes.io/metadata.name".to_string(), ns.clone());
let mut kube_state_metrics_labels = BTreeMap::new();
kube_state_metrics_labels.insert(
"app.kubernetes.io/name".to_string(),
"kube-state-metrics".to_string(),
);
let selector = LabelSelector {
match_labels: selector_labels,
};
let server_config = ServerConfig {
namespaces: vec![ns.clone()],
use_existing_cluster_role_name: false,
};
let mut null_receiver = Mapping::new();
null_receiver.insert(
Value::String("receiver".to_string()),
Value::String("default-receiver".to_string()),
);
null_receiver.insert(
Value::String("matchers".to_string()),
Value::Sequence(vec![Value::String("alertname!=Watchdog".to_string())]),
);
null_receiver.insert(Value::String("continue".to_string()), Value::Bool(true));
let mut alert_manager_channel_config = AlertManagerConfig {
global: Mapping::new(),
route: AlertManagerRoute {
routes: vec![Value::Mapping(null_receiver)],
},
receivers: vec![serde_yaml::from_str("name: 'default-receiver'").unwrap()],
};
for receiver in config.alert_receiver_configs.iter() {
if let Some(global) = receiver.channel_global_config.clone() {
alert_manager_channel_config
.global
.insert(global.0, global.1);
}
alert_manager_channel_config
.route
.routes
.push(receiver.channel_route.clone());
alert_manager_channel_config
.receivers
.push(receiver.channel_receiver.clone());
}
let alert_manager_values = AlertManager {
enabled: config.alert_manager,
config: alert_manager_channel_config,
alertmanager_spec: AlertManagerSpec {
resources: Resources {
limits: Limits {
memory: "100Mi".to_string(),
cpu: "100m".to_string(),
},
requests: Requests {
memory: "100Mi".to_string(),
cpu: "100m".to_string(),
},
},
},
init_config_reloader: ConfigReloader {
resources: Resources {
limits: Limits {
memory: "100Mi".to_string(),
cpu: "100m".to_string(),
},
requests: Requests {
memory: "100Mi".to_string(),
cpu: "100m".to_string(),
},
},
},
};
let mut result: BTreeMap<String, AlertFile> = BTreeMap::new();
for rule in config.alert_rules.clone().iter() {
for (name, group) in &rule.rules {
result
.entry("alerting_rules.yml".to_string())
.and_modify(|e| e.groups.extend(group.groups.clone()))
.or_insert(AlertFile {
groups: group.groups.clone(),
});
}
}
let final_values = PrometheusHelmValues {
rbac: rbac_config,
kube_state_metrics: ksm_config,
server: server_config,
alertmanager: alert_manager_values,
server_files: result,
additional_service_monitors: config.additional_service_monitors.clone(),
prometheus_node_exporter: EnabledConfig { enabled: false },
prometheus_pushgateway: EnabledConfig { enabled: false },
};
let values_yaml =
serde_yaml::to_string(&final_values).expect("Failed to serialize final Helm values");
debug!("full values.yaml: \n{}", values_yaml);
HelmChartScore {
namespace: Some(NonBlankString::from_str(&ns).unwrap()),
release_name: NonBlankString::from_str("prometheus").unwrap(),
chart_name: NonBlankString::from_str(
"oci://ghcr.io/prometheus-community/charts/prometheus",
)
.unwrap(),
chart_version: None,
values_overrides: None,
values_yaml: Some(values_yaml),
create_namespace: true,
install_only: true,
repository: None,
}
}
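
The chart is rendered entirely from the shared HelmPrometheusConfig, which Prometheus::install_receiver / install_rule mutate before installation; a minimal direct-usage sketch (placeholder namespace, `topology` assumed to provide HelmCommand). Note that the namespace must be set, since the function unwraps it.

use std::sync::{Arc, Mutex};

let config = Arc::new(Mutex::new(HelmPrometheusConfig::new()));
{
    let mut cfg = config.lock().unwrap();
    cfg.namespace = Some("my-app".to_string());
}
prometheus_helm_chart_score(config.clone())
    .create_interpret()
    .execute(&Inventory::empty(), &topology)
    .await?;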

View File

@@ -0,0 +1,94 @@
use std::collections::BTreeMap;
use serde::Serialize;
use crate::modules::monitoring::{alert_rule::prometheus_alert_rule::AlertManagerRuleGroup, kube_prometheus::types::{
AlertGroup, AlertManager, AlertManagerAdditionalPromRules, AlertManagerValues, ServiceMonitor
}};
#[derive(Debug, Clone, Serialize)]
pub struct RuleFilesConfig {
#[serde(rename = "ruleFiles")]
pub files: BTreeMap<String, AlertGroup>,
}
#[derive(Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct PrometheusHelmValues {
pub rbac: RbacConfig,
#[serde(rename = "kube-state-metrics")]
pub kube_state_metrics: KubeStateMetricsConfig,
pub server: ServerConfig,
pub alertmanager: AlertManager, // You already have this
#[serde(rename = "serverFiles")]
pub server_files: BTreeMap<String, AlertFile>, // You already have this
pub additional_service_monitors: Vec<ServiceMonitor>, // You already have this
#[serde(rename = "prometheus-node-exporter")]
pub prometheus_node_exporter: EnabledConfig,
#[serde(rename = "prometheus-pushgateway")]
pub prometheus_pushgateway: EnabledConfig,
}
#[derive(Serialize, Debug, Clone)]
pub struct AlertFile {
pub groups: Vec<AlertManagerRuleGroup>,
}
#[derive(Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct RbacConfig {
pub create: bool,
}
#[derive(Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct KubeStateMetricsConfig {
pub enabled: bool,
pub rbac: KsmRbacConfig,
pub prometheus: Prometheus,
}
#[derive(Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct Prometheus {
pub monitor: Monitor
}
#[derive(Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct Monitor{
pub enabled: bool
}
#[derive(Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct KsmRbacConfig {
pub use_cluster_role: bool,
}
#[derive(Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct ServerConfig {
pub namespaces: Vec<String>,
pub use_existing_cluster_role_name: bool,
}
#[derive(Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct ServerRbacConfig {
pub create: bool,
pub use_cluster_role: bool,
pub namespaced: bool,
}
#[derive(Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct LabelSelector {
#[serde(rename = "matchLabels")]
pub match_labels: BTreeMap<String, String>,
}
#[derive(Serialize, Debug)]
pub struct EnabledConfig {
pub enabled: bool,
}

View File

@@ -0,0 +1,3 @@
pub mod helm;
pub mod prometheus;
pub mod prometheus_config;

View File

@@ -0,0 +1,189 @@
use std::sync::{Arc, Mutex};
use async_trait::async_trait;
use log::{debug, error};
use serde::Serialize;
use crate::{
interpret::{InterpretError, Outcome},
inventory::Inventory,
modules::monitoring::{
alert_rule::prometheus_alert_rule::AlertManagerRuleGroup,
grafana::helm::helm_grafana::grafana_helm_chart_score,
kube_prometheus::types::{AlertManagerAdditionalPromRules, AlertManagerChannelConfig},
},
score::Score,
topology::{
HelmCommand, K8sclient, Topology,
installable::Installable,
oberservability::monitoring::{AlertReceiver, AlertRule, AlertSender},
tenant::TenantManager,
},
};
use super::{
helm::prometheus_helm::prometheus_helm_chart_score, prometheus_config::HelmPrometheusConfig,
};
#[derive(Debug)]
pub struct Prometheus {
pub config: Arc<Mutex<HelmPrometheusConfig>>,
}
#[async_trait]
impl AlertSender for Prometheus {
fn name(&self) -> String {
"Prometheus".to_string()
}
}
impl Prometheus {
pub fn new() -> Self {
Self {
config: Arc::new(Mutex::new(HelmPrometheusConfig::new())),
}
}
pub async fn configure_with_topology<T: TenantManager>(&self, topology: &T) {
if let Some(cfg) = topology.get_tenant_config().await {
debug!("Overriding namespace with tenant config: {}", cfg.name);
self.config.lock().unwrap().namespace = Some(cfg.name.clone());
} else {
debug!("No tenant config found; keeping existing namespace.");
}
error!("This must be refactored, see comments in pr #74");
}
pub async fn install_receiver(
&self,
prometheus_receiver: &dyn PrometheusReceiver,
) -> Result<Outcome, InterpretError> {
let prom_receiver = prometheus_receiver.configure_receiver().await;
debug!(
"adding alert receiver to prometheus config: {:#?}",
&prom_receiver
);
let mut config = self.config.lock().unwrap();
config.alert_receiver_configs.push(prom_receiver);
let prom_receiver_name = prometheus_receiver.name();
debug!("installed alert receiver {}", &prom_receiver_name);
Ok(Outcome::success(format!(
"Sucessfully installed receiver {}",
prom_receiver_name
)))
}
pub async fn install_rule(
&self,
prometheus_rule: &AlertManagerRuleGroup,
) -> Result<Outcome, InterpretError> {
let prometheus_rule = prometheus_rule.configure_rule().await;
let mut config = self.config.lock().unwrap();
config.alert_rules.push(prometheus_rule.clone());
Ok(Outcome::success(format!(
"Successfully installed alert rule: {:#?},",
prometheus_rule
)))
}
pub async fn install_prometheus<T: Topology + HelmCommand + Send + Sync>(
&self,
inventory: &Inventory,
topology: &T,
) -> Result<Outcome, InterpretError> {
prometheus_helm_chart_score(self.config.clone())
.create_interpret()
.execute(inventory, topology)
.await
}
pub async fn install_grafana<T: Topology + HelmCommand + Send + Sync>(
&self,
inventory: &Inventory,
topology: &T,
) -> Result<Outcome, InterpretError> {
let namespace = {
let config = self.config.lock().unwrap();
config.namespace.clone()
};
if let Some(ns) = namespace.as_deref() {
grafana_helm_chart_score(ns)
.create_interpret()
.execute(inventory, topology)
.await
} else {
Err(InterpretError::new(format!(
"could not install grafana, missing namespace",
)))
}
}
}
#[async_trait]
impl<T: Topology + HelmCommand + TenantManager> Installable<T> for Prometheus {
async fn configure(&self, _inventory: &Inventory, topology: &T) -> Result<(), InterpretError> {
self.configure_with_topology(topology).await;
Ok(())
}
async fn ensure_installed(
&self,
inventory: &Inventory,
topology: &T,
) -> Result<(), InterpretError> {
self.install_prometheus(inventory, topology).await?;
let install_grafana = {
let config = self.config.lock().unwrap();
config.grafana
};
if install_grafana {
self.install_grafana(inventory, topology).await?;
}
Ok(())
}
}
#[async_trait]
pub trait PrometheusReceiver: Send + Sync + std::fmt::Debug {
fn name(&self) -> String;
async fn configure_receiver(&self) -> AlertManagerChannelConfig;
}
impl Serialize for Box<dyn AlertReceiver<Prometheus>> {
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
todo!()
}
}
impl Clone for Box<dyn AlertReceiver<Prometheus>> {
fn clone(&self) -> Self {
self.clone_box()
}
}
#[async_trait]
pub trait PrometheusRule: Send + Sync + std::fmt::Debug {
fn name(&self) -> String;
async fn configure_rule(&self) -> AlertManagerAdditionalPromRules;
}
impl Serialize for Box<dyn AlertRule<Prometheus>> {
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
todo!()
}
}
impl Clone for Box<dyn AlertRule<Prometheus>> {
fn clone(&self) -> Self {
self.clone_box()
}
}

View File

@@ -0,0 +1,32 @@
use crate::modules::monitoring::kube_prometheus::types::{
AlertManagerAdditionalPromRules, AlertManagerChannelConfig, ServiceMonitor,
};
#[derive(Debug)]
pub struct HelmPrometheusConfig {
pub namespace: Option<String>,
pub alert_manager: bool,
pub node_exporter: bool,
pub kube_state_metrics: bool,
pub grafana: bool,
pub prometheus_pushgateway: bool,
pub alert_receiver_configs: Vec<AlertManagerChannelConfig>,
pub alert_rules: Vec<AlertManagerAdditionalPromRules>,
pub additional_service_monitors: Vec<ServiceMonitor>,
}
impl HelmPrometheusConfig {
pub fn new() -> Self {
Self {
namespace: None,
alert_manager: true,
node_exporter: false,
kube_state_metrics: false,
grafana: true,
prometheus_pushgateway: false,
alert_receiver_configs: vec![],
alert_rules: vec![],
additional_service_monitors: vec![],
}
}
}

View File

@@ -1 +1,2 @@
pub mod pvc;
pub mod pod;

View File

@@ -0,0 +1,38 @@
use crate::modules::monitoring::alert_rule::prometheus_alert_rule::PrometheusAlertRule;
pub fn pod_in_failed_state() -> PrometheusAlertRule {
PrometheusAlertRule::new(
"PodInFailedState",
// This expression checks for any pod where the status phase is 'Failed' and the value is 1 (true).
"kube_pod_status_phase{phase=\"Failed\"} == 1",
)
.for_duration("1m") // Fire if the pod is in this state for 1 minute.
.label("severity", "critical") // A failed pod is a critical issue.
.annotation(
"summary",
"Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has failed.",
)
.annotation(
"description",
"The pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has entered the 'Failed' state. This is a terminal error and the pod will not be automatically restarted. Please check the pod logs to diagnose the issue.",
)
}
pub fn pod_restarting_frequently() -> PrometheusAlertRule {
PrometheusAlertRule::new(
"PodRestartingFrequently",
// This expression calculates the increase in the restart count over the last 30 minutes.
// Alert if a container has restarted more than 5 times.
"increase(kube_pod_container_status_restarts_total[30m]) > 5",
)
.for_duration("15m") // The condition must persist for 15 minutes to avoid alerts for minor flaps.
.label("severity", "critical") // A crash-looping pod is effectively down.
.annotation(
"summary",
"Container {{ $labels.container }} in pod {{ $labels.pod }} is restarting frequently.",
)
.annotation(
"description",
"The container '{{ $labels.container }}' in pod '{{ $labels.pod }}' (namespace '{{ $labels.namespace }}') has restarted more than 5 times in the last 30 minutes. The pod is likely in a CrashLoopBackOff state.",
)
}
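
The same builder works for ad-hoc rules, which are then grouped with AlertManagerRuleGroup exactly like the helpers above; a hedged sketch (rule name and PromQL expression are illustrative, not part of this change):

let pod_not_ready = PrometheusAlertRule::new(
    "PodNotReady",
    // kube-state-metrics exposes readiness as kube_pod_status_ready{condition="false"}.
    "sum by (namespace, pod) (kube_pod_status_ready{condition=\"false\"}) > 0",
)
.for_duration("10m")
.label("severity", "warning")
.annotation(
    "summary",
    "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has been NotReady for 10 minutes.",
);

let custom_group = AlertManagerRuleGroup::new("custom-pod-alerts", vec![pod_not_ready]);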