diff --git a/Cargo.lock b/Cargo.lock index 7d9cdcf..5c45111 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -690,6 +690,41 @@ dependencies = [ "tokio", ] +[[package]] +name = "brocade-snmp-server" +version = "0.1.0" +dependencies = [ + "base64 0.22.1", + "brocade", + "env_logger", + "harmony", + "harmony_cli", + "harmony_macros", + "harmony_secret", + "harmony_types", + "log", + "serde", + "tokio", + "url", +] + +[[package]] +name = "brocade-switch" +version = "0.1.0" +dependencies = [ + "async-trait", + "brocade", + "env_logger", + "harmony", + "harmony_cli", + "harmony_macros", + "harmony_types", + "log", + "serde", + "tokio", + "url", +] + [[package]] name = "brotli" version = "8.0.2" @@ -1835,6 +1870,21 @@ dependencies = [ "url", ] +[[package]] +name = "example-operatorhub-catalogsource" +version = "0.1.0" +dependencies = [ + "cidr", + "env_logger", + "harmony", + "harmony_cli", + "harmony_macros", + "harmony_types", + "log", + "tokio", + "url", +] + [[package]] name = "example-opnsense" version = "0.1.0" @@ -1853,6 +1903,25 @@ dependencies = [ "url", ] +[[package]] +name = "example-opnsense-node-exporter" +version = "0.1.0" +dependencies = [ + "async-trait", + "cidr", + "env_logger", + "harmony", + "harmony_cli", + "harmony_macros", + "harmony_secret", + "harmony_secret_derive", + "harmony_types", + "log", + "serde", + "tokio", + "url", +] + [[package]] name = "example-pxe" version = "0.1.0" @@ -2479,6 +2548,19 @@ dependencies = [ "tokio", ] +[[package]] +name = "harmony_inventory_builder" +version = "0.1.0" +dependencies = [ + "cidr", + "harmony", + "harmony_cli", + "harmony_macros", + "harmony_types", + "tokio", + "url", +] + [[package]] name = "harmony_macros" version = "0.1.0" @@ -2544,6 +2626,7 @@ dependencies = [ name = "harmony_types" version = "0.1.0" dependencies = [ + "log", "rand 0.9.2", "serde", "url", diff --git a/adr/015-higher-order-topologies.md b/adr/015-higher-order-topologies.md new file mode 100644 index 0000000..41c3172 --- /dev/null +++ 
b/adr/015-higher-order-topologies.md @@ -0,0 +1,114 @@ +# Architecture Decision Record: Higher-Order Topologies + +**Initial Author:** Jean-Gabriel Gill-Couture +**Initial Date:** 2025-12-08 +**Last Updated Date:** 2025-12-08 + +## Status + +Implemented + +## Context + +Harmony models infrastructure as **Topologies** (deployment targets like `K8sAnywhereTopology`, `LinuxHostTopology`) implementing **Capabilities** (tech traits like `PostgreSQL`, `Docker`). + +**Higher-Order Topologies** (e.g., `FailoverTopology`) compose/orchestrate capabilities *across* multiple underlying topologies (e.g., primary+replica `T`). + +Naive design requires manual `impl Capability for HigherOrderTopology` *per T per capability*, causing: +- **Impl explosion**: N topologies × M capabilities = N×M boilerplate. +- **ISP violation**: Topologies forced to impl unrelated capabilities. +- **Maintenance hell**: New topology needs impls for *all* orchestrated capabilities; new capability needs impls for *all* topologies/higher-order. +- **Barrier to extension**: Users can't easily add topologies without todos/panics. + +This makes scaling Harmony impractical as ecosystem grows. + +## Decision + +Use **blanket trait impls** on higher-order topologies to *automatically* derive orchestration: + +````rust +/// Higher-Order Topology: Orchestrates capabilities across sub-topologies. +pub struct FailoverTopology { + /// Primary sub-topology. + primary: T, + /// Replica sub-topology. + replica: T, +} + +/// Automatically provides PostgreSQL failover for *any* `T: PostgreSQL`. +/// Delegates to primary for queries; orchestrates deploy across both. +#[async_trait] +impl PostgreSQL for FailoverTopology { + async fn deploy(&self, config: &PostgreSQLConfig) -> Result { + // Deploy primary; extract certs/endpoint; + // deploy replica with pg_basebackup + TLS passthrough. + // (Full impl logged/elaborated.) + } + + // Delegate queries to primary. 
+ async fn get_replication_certs(&self, cluster_name: &str) -> Result { + self.primary.get_replication_certs(cluster_name).await + } + // ... +} + +/// Similarly for other capabilities. +#[async_trait] +impl Docker for FailoverTopology { + // Failover Docker orchestration. +} +```` + +**Key properties:** +- **Auto-derivation**: `Failover` gets `PostgreSQL` iff `K8sAnywhere: PostgreSQL`. +- **No boilerplate**: One blanket impl per capability *per higher-order type*. + +## Rationale + +- **Composition via generics**: Rust trait solver auto-selects impls; zero runtime cost. +- **Compile-time safety**: Missing `T: Capability` → compile error (no panics). +- **Scalable**: O(capabilities) impls per higher-order; new `T` auto-works. +- **ISP-respecting**: Capabilities only surface if sub-topology provides. +- **Centralized logic**: Orchestration (e.g., cert propagation) in one place. + +**Example usage:** +````rust +// ✅ Works: K8sAnywhere: PostgreSQL → Failover provides failover PG +let pg_failover: FailoverTopology = ...; +pg_failover.deploy_pg(config).await; + +// ✅ Works: LinuxHost: Docker → Failover provides failover Docker +let docker_failover: FailoverTopology = ...; +docker_failover.deploy_docker(...).await; + +// ❌ Compile fail: K8sAnywhere !: Docker +let invalid: FailoverTopology; +invalid.deploy_docker(...); // `T: Docker` bound unsatisfied +```` + +## Consequences + +**Pros:** +- **Extensible**: New topology `AWSTopology: PostgreSQL` → instant `Failover: PostgreSQL`. +- **Lean**: No useless impls (e.g., no `K8sAnywhere: Docker`). +- **Observable**: Logs trace every step. + +**Cons:** +- **Monomorphization**: Generics generate code per T (mitigated: few Ts). +- **Delegation opacity**: Relies on rustdoc/logs for internals. + +## Alternatives considered + +| Approach | Pros | Cons | +|----------|------|------| +| **Manual per-T impls**
`impl PG for Failover {..}`
`impl PG for Failover {..}` | Explicit control | N×M explosion; violates ISP; hard to extend. | +| **Dynamic trait objects**
`Box` | Runtime flex | Perf hit; type erasure; error-prone dispatch. | +| **Mega-topology trait**
All-in-one `OrchestratedTopology` | Simple wiring | Monolithic; poor composition. | +| **Registry dispatch**
Runtime capability lookup | Decoupled | Complex; no compile safety; perf/debug overhead. | + +**Selected**: Blanket impls leverage Rust generics for safe, zero-cost composition. + +## Additional Notes + +- Applies to `MultisiteTopology`, `ShardedTopology`, etc. +- `FailoverTopology` in `failover.rs` is first implementation. diff --git a/adr/015-higher-order-topologies/example.rs b/adr/015-higher-order-topologies/example.rs new file mode 100644 index 0000000..8c8911d --- /dev/null +++ b/adr/015-higher-order-topologies/example.rs @@ -0,0 +1,153 @@ +//! Example of Higher-Order Topologies in Harmony. +//! Demonstrates how `FailoverTopology` automatically provides failover for *any* capability +//! supported by a sub-topology `T` via blanket trait impls. +//! +//! Key insight: No manual impls per T or capability -- scales effortlessly. +//! Users can: +//! - Write new `Topology` (impl capabilities on a struct). +//! - Compose with `FailoverTopology` (gets capabilities if T has them). +//! - Compile fails if capability missing (safety). + +use async_trait::async_trait; +use tokio; + +/// Capability trait: Deploy and manage PostgreSQL. +#[async_trait] +pub trait PostgreSQL { + async fn deploy(&self, config: &PostgreSQLConfig) -> Result; + async fn get_replication_certs(&self, cluster_name: &str) -> Result; +} + +/// Capability trait: Deploy Docker. +#[async_trait] +pub trait Docker { + async fn deploy_docker(&self) -> Result; +} + +/// Configuration for PostgreSQL deployments. +#[derive(Clone)] +pub struct PostgreSQLConfig; + +/// Replication certificates. +#[derive(Clone)] +pub struct ReplicationCerts; + +/// Concrete topology: Kubernetes Anywhere (supports PostgreSQL). +#[derive(Clone)] +pub struct K8sAnywhereTopology; + +#[async_trait] +impl PostgreSQL for K8sAnywhereTopology { + async fn deploy(&self, _config: &PostgreSQLConfig) -> Result { + // Real impl: Use k8s helm chart, operator, etc. 
+ Ok("K8sAnywhere PostgreSQL deployed".to_string()) + } + + async fn get_replication_certs(&self, _cluster_name: &str) -> Result { + Ok(ReplicationCerts) + } +} + +/// Concrete topology: Linux Host (supports Docker). +#[derive(Clone)] +pub struct LinuxHostTopology; + +#[async_trait] +impl Docker for LinuxHostTopology { + async fn deploy_docker(&self) -> Result { + // Real impl: Install/configure Docker on host. + Ok("LinuxHost Docker deployed".to_string()) + } +} + +/// Higher-Order Topology: Composes multiple sub-topologies (primary + replica). +/// Automatically derives *all* capabilities of `T` with failover orchestration. +/// +/// - If `T: PostgreSQL`, then `FailoverTopology: PostgreSQL` (blanket impl). +/// - Same for `Docker`, etc. No boilerplate! +/// - Compile-time safe: Missing `T: Capability` → error. +#[derive(Clone)] +pub struct FailoverTopology { + /// Primary sub-topology. + pub primary: T, + /// Replica sub-topology. + pub replica: T, +} + +/// Blanket impl: Failover PostgreSQL if T provides PostgreSQL. +/// Delegates reads to primary; deploys to both. +#[async_trait] +impl PostgreSQL for FailoverTopology { + async fn deploy(&self, config: &PostgreSQLConfig) -> Result { + // Orchestrate: Deploy primary first, then replica (e.g., via pg_basebackup). + let primary_result = self.primary.deploy(config).await?; + let replica_result = self.replica.deploy(config).await?; + Ok(format!("Failover PG deployed: {} | {}", primary_result, replica_result)) + } + + async fn get_replication_certs(&self, cluster_name: &str) -> Result { + // Delegate to primary (replica follows). + self.primary.get_replication_certs(cluster_name).await + } +} + +/// Blanket impl: Failover Docker if T provides Docker. +#[async_trait] +impl Docker for FailoverTopology { + async fn deploy_docker(&self) -> Result { + // Orchestrate across primary + replica. 
+ let primary_result = self.primary.deploy_docker().await?; + let replica_result = self.replica.deploy_docker().await?; + Ok(format!("Failover Docker deployed: {} | {}", primary_result, replica_result)) + } +} + +#[tokio::main] +async fn main() { + let config = PostgreSQLConfig; + + println!("=== ✅ PostgreSQL Failover (K8sAnywhere supports PG) ==="); + let pg_failover = FailoverTopology { + primary: K8sAnywhereTopology, + replica: K8sAnywhereTopology, + }; + let result = pg_failover.deploy(&config).await.unwrap(); + println!("Result: {}", result); + + println!("\n=== ✅ Docker Failover (LinuxHost supports Docker) ==="); + let docker_failover = FailoverTopology { + primary: LinuxHostTopology, + replica: LinuxHostTopology, + }; + let result = docker_failover.deploy_docker().await.unwrap(); + println!("Result: {}", result); + + println!("\n=== ❌ Would fail to compile (K8sAnywhere !: Docker) ==="); + // let invalid = FailoverTopology { + // primary: K8sAnywhereTopology, + // replica: K8sAnywhereTopology, + // }; + // invalid.deploy_docker().await.unwrap(); // Error: `K8sAnywhereTopology: Docker` not satisfied! + // Very clear error message : + // error[E0599]: the method `deploy_docker` exists for struct `FailoverTopology`, but its trait bounds were not satisfied + // --> src/main.rs:90:9 + // | + // 4 | pub struct FailoverTopology { + // | ------------------------------ method `deploy_docker` not found for this struct because it doesn't satisfy `FailoverTopology: Docker` + // ... + // 37 | struct K8sAnywhereTopology; + // | -------------------------- doesn't satisfy `K8sAnywhereTopology: Docker` + // ... 
+ // 90 | invalid.deploy_docker(); // `T: Docker` bound unsatisfied + // | ^^^^^^^^^^^^^ method cannot be called on `FailoverTopology` due to unsatisfied trait bounds + // | + // note: trait bound `K8sAnywhereTopology: Docker` was not satisfied + // --> src/main.rs:61:9 + // | + // 61 | impl Docker for FailoverTopology { + // | ^^^^^^ ------ ------------------- + // | | + // | unsatisfied trait bound introduced here + // note: the trait `Docker` must be implemented +} + diff --git a/adr/016-Harmony-Agent-And-Global-Mesh-For-Decentralized-Workload-Management.md b/adr/016-Harmony-Agent-And-Global-Mesh-For-Decentralized-Workload-Management.md new file mode 100644 index 0000000..5c99aec --- /dev/null +++ b/adr/016-Harmony-Agent-And-Global-Mesh-For-Decentralized-Workload-Management.md @@ -0,0 +1,90 @@ +# Architecture Decision Record: Global Orchestration Mesh & The Harmony Agent + +**Status:** Proposed +**Date:** 2025-12-19 + +## Context + +Harmony is designed to enable a truly decentralized infrastructure where independent clusters—owned by different organizations or running on diverse hardware—can collaborate reliably. This vision combines the decentralization of Web3 with the performance and capabilities of Web2. + +Currently, Harmony operates as a stateless CLI tool, invoked manually or via CI runners. While effective for deployment, this model presents a critical limitation: **a CLI cannot react to real-time events.** + +To achieve automated failover and dynamic workload management, we need a system that is "always on." Relying on manual intervention or scheduled CI jobs to recover from a cluster failure creates unacceptable latency and prevents us from scaling to thousands of nodes. + +Furthermore, we face a challenge in serving diverse workloads: +* **Financial workloads** require absolute consistency (CP - Consistency/Partition Tolerance). +* **AI/Inference workloads** require maximum availability (AP - Availability/Partition Tolerance). 
+ +There are many more use cases, but those are the two extremes. + +We need a unified architecture that automates cluster coordination and supports both consistency models without requiring a complete re-architecture in the future. + +## Decision + +We propose a fundamental architectural evolution. It has been clear since the start of Harmony that it would be necessary to transition Harmony from a purely ephemeral CLI tool to a system that includes a persistent **Harmony Agent**. This Agent will connect to a **Global Orchestration Mesh** based on a strongly consistent protocol. + +The proposal consists of four key pillars: + +### 1. The Harmony Agent (New Component) +We will develop a long-running process (Daemon/Agent) to be deployed alongside workloads. +* **Shift from CLI:** Unlike the CLI, which applies configuration and exits, the Agent maintains a persistent connection to the mesh. +* **Responsibility:** It actively monitors cluster health, participates in consensus, and executes lifecycle commands (start/stop/fence) instantly when the mesh dictates a state change. + +### 2. The Technology: NATS JetStream +We will utilize **NATS JetStream** as the underlying transport and consensus layer for the Agent and the Mesh. +* **Why not raw Raft?** Implementing a raw Raft library requires building and maintaining the transport layer, log compaction, snapshotting, and peer discovery manually. NATS JetStream provides a battle-tested, distributed log and Key-Value store (based on Raft) out of the box, along with a high-performance pub/sub system for event propagation. +* **Role:** It will act as the "source of truth" for the cluster state. + +### 3. Strong Consistency at the Mesh Layer +The mesh will operate with **Strong Consistency** by default. +* All critical cluster state changes (topology updates, lease acquisitions, leadership elections) will require consensus among the Agents. 
+* This ensures that in the event of a network partition, we have a mathematical guarantee of which side holds the valid state, preventing data corruption. + +### 4. Public UX: The `FailoverStrategy` Abstraction +To keep the user experience stable and simple, we will expose the complexity of the mesh through a high-level configuration API, tentatively called `FailoverStrategy`. + +The user defines the *intent* in their config, and the Harmony Agent automates the *execution*: + +* **`FailoverStrategy::AbsoluteConsistency`**: + * *Use Case:* Banking, Transactional DBs. + * *Behavior:* If the mesh detects a partition, the Agent on the minority side immediately halts workloads. No split-brain is ever allowed. +* **`FailoverStrategy::SplitBrainAllowed`**: + * *Use Case:* LLM Inference, Stateless Web Servers. + * *Behavior:* If a partition occurs, the Agent keeps workloads running to maximize uptime. State is reconciled when connectivity returns. + +## Rationale + +**The Necessity of an Agent** +You cannot automate what you do not monitor. Moving to an Agent-based model is the only way to achieve sub-second reaction times to infrastructure failures. It transforms Harmony from a deployment tool into a self-healing platform. + +**Scaling & Decentralization** +To allow independent clusters to collaborate, they need a shared language. A strongly consistent mesh allows Cluster A (Organization X) and Cluster B (Organization Y) to agree on workload placement without a central authority. + +**Why Strong Consistency First?** +It is technically feasible to relax a strongly consistent system to allow for "Split Brain" behavior (AP) when the user requests it. However, it is nearly impossible to take an eventually consistent system and force it to be strongly consistent (CP) later. By starting with strict constraints, we cover the hardest use cases (Finance) immediately. 
+ +**Future Topologies** +While our immediate need is `FailoverTopology` (Multi-site), this architecture supports any future topology logic: +* **`CostTopology`**: Agents negotiate to route workloads to the cluster with the cheapest spot instances. +* **`HorizontalTopology`**: Spreading a single workload across 100 clusters for massive scale. +* **`GeoTopology`**: Ensuring data stays within specific legal jurisdictions. + +The mesh provides the *capability* (consensus and messaging); the topology provides the *logic*. + +## Consequences + +**Positive** +* **Automation:** Eliminates manual failover, enabling massive scale. +* **Reliability:** Guarantees data safety for critical workloads by default. +* **Flexibility:** A single codebase serves both high-frequency trading and AI inference. +* **Stability:** The public API remains abstract, allowing us to optimize the mesh internals without breaking user code. + +**Negative** +* **Deployment Complexity:** Users must now deploy and maintain a running service (the Agent) rather than just downloading a binary. +* **Engineering Complexity:** Integrating NATS JetStream and handling distributed state machines is significantly more complex than the current CLI logic. + +## Implementation Plan (Short Term) +1. **Agent Bootstrap:** Create the initial scaffold for the Harmony Agent (daemon). +2. **Mesh Integration:** Prototype NATS JetStream embedding within the Agent. +3. **Strategy Implementation:** Add `FailoverStrategy` to the configuration schema and implement the logic in the Agent to read and act on it. +4. **Migration:** Transition the current manual failover scripts into event-driven logic handled by the Agent. 
diff --git a/brocade/examples/main.rs b/brocade/examples/main.rs index 34dec21..47d4a63 100644 --- a/brocade/examples/main.rs +++ b/brocade/examples/main.rs @@ -1,6 +1,6 @@ use std::net::{IpAddr, Ipv4Addr}; -use brocade::BrocadeOptions; +use brocade::{BrocadeOptions, ssh}; use harmony_secret::{Secret, SecretManager}; use harmony_types::switch::PortLocation; use serde::{Deserialize, Serialize}; @@ -16,23 +16,28 @@ async fn main() { env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init(); // let ip = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 250)); // old brocade @ ianlet - let ip = IpAddr::V4(Ipv4Addr::new(192, 168, 55, 101)); // brocade @ sto1 + let ip = IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)); // brocade @ sto1 // let ip = IpAddr::V4(Ipv4Addr::new(192, 168, 4, 11)); // brocade @ st let switch_addresses = vec![ip]; - let config = SecretManager::get_or_prompt::() - .await - .unwrap(); + // let config = SecretManager::get_or_prompt::() + // .await + // .unwrap(); let brocade = brocade::init( &switch_addresses, - 22, - &config.username, - &config.password, - Some(BrocadeOptions { + // &config.username, + // &config.password, + "admin", + "password", + BrocadeOptions { dry_run: true, + ssh: ssh::SshOptions { + port: 2222, + ..Default::default() + }, ..Default::default() - }), + }, ) .await .expect("Brocade client failed to connect"); @@ -54,6 +59,7 @@ async fn main() { } println!("--------------"); + todo!(); let channel_name = "1"; brocade.clear_port_channel(channel_name).await.unwrap(); diff --git a/brocade/src/fast_iron.rs b/brocade/src/fast_iron.rs index 5a3474e..cd425dc 100644 --- a/brocade/src/fast_iron.rs +++ b/brocade/src/fast_iron.rs @@ -1,7 +1,8 @@ use super::BrocadeClient; use crate::{ BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo, MacAddressEntry, - PortChannelId, PortOperatingMode, parse_brocade_mac_address, shell::BrocadeShell, + PortChannelId, PortOperatingMode, SecurityLevel, parse_brocade_mac_address, + 
shell::BrocadeShell, }; use async_trait::async_trait; @@ -140,7 +141,7 @@ impl BrocadeClient for FastIronClient { async fn configure_interfaces( &self, - _interfaces: Vec<(String, PortOperatingMode)>, + _interfaces: &Vec<(String, PortOperatingMode)>, ) -> Result<(), Error> { todo!() } @@ -209,4 +210,20 @@ impl BrocadeClient for FastIronClient { info!("[Brocade] Port-channel '{channel_name}' cleared."); Ok(()) } + + async fn enable_snmp(&self, user_name: &str, auth: &str, des: &str) -> Result<(), Error> { + let commands = vec![ + "configure terminal".into(), + "snmp-server view ALL 1 included".into(), + "snmp-server group public v3 priv read ALL".into(), + format!( + "snmp-server user {user_name} groupname public auth md5 auth-password {auth} priv des priv-password {des}" + ), + "exit".into(), + ]; + self.shell + .run_commands(commands, ExecutionMode::Regular) + .await?; + Ok(()) + } } diff --git a/brocade/src/lib.rs b/brocade/src/lib.rs index 57b464a..05f4928 100644 --- a/brocade/src/lib.rs +++ b/brocade/src/lib.rs @@ -14,11 +14,12 @@ use async_trait::async_trait; use harmony_types::net::MacAddress; use harmony_types::switch::{PortDeclaration, PortLocation}; use regex::Regex; +use serde::Serialize; mod fast_iron; mod network_operating_system; mod shell; -mod ssh; +pub mod ssh; #[derive(Default, Clone, Debug)] pub struct BrocadeOptions { @@ -31,6 +32,7 @@ pub struct BrocadeOptions { pub struct TimeoutConfig { pub shell_ready: Duration, pub command_execution: Duration, + pub command_output: Duration, pub cleanup: Duration, pub message_wait: Duration, } @@ -40,6 +42,7 @@ impl Default for TimeoutConfig { Self { shell_ready: Duration::from_secs(10), command_execution: Duration::from_secs(60), // Commands like `deploy` (for a LAG) can take a while + command_output: Duration::from_secs(5), // Delay to start logging "waiting for command output" cleanup: Duration::from_secs(10), message_wait: Duration::from_millis(500), } @@ -116,7 +119,7 @@ impl fmt::Display for 
InterfaceType { } /// Defines the primary configuration mode of a switch interface, representing mutually exclusive roles. -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize)] pub enum PortOperatingMode { /// The interface is explicitly configured for Brocade fabric roles (ISL or Trunk enabled). Fabric, @@ -139,12 +142,11 @@ pub enum InterfaceStatus { pub async fn init( ip_addresses: &[IpAddr], - port: u16, username: &str, password: &str, - options: Option, + options: BrocadeOptions, ) -> Result, Error> { - let shell = BrocadeShell::init(ip_addresses, port, username, password, options).await?; + let shell = BrocadeShell::init(ip_addresses, username, password, options).await?; let version_info = shell .with_session(ExecutionMode::Regular, |session| { @@ -206,7 +208,7 @@ pub trait BrocadeClient: std::fmt::Debug { /// Configures a set of interfaces to be operated with a specified mode (access ports, ISL, etc.). async fn configure_interfaces( &self, - interfaces: Vec<(String, PortOperatingMode)>, + interfaces: &Vec<(String, PortOperatingMode)>, ) -> Result<(), Error>; /// Scans the existing configuration to find the next available (unused) @@ -235,6 +237,15 @@ pub trait BrocadeClient: std::fmt::Debug { ports: &[PortLocation], ) -> Result<(), Error>; + /// Enables Simple Network Management Protocol (SNMP) server for switch + /// + /// # Parameters + /// + /// * `user_name`: The user name for the snmp server + /// * `auth`: The password for authentication process for verifying the identity of a device + /// * `des`: The Data Encryption Standard algorithm key + async fn enable_snmp(&self, user_name: &str, auth: &str, des: &str) -> Result<(), Error>; + /// Removes all configuration associated with the specified Port-Channel name. 
/// /// This operation should be idempotent; attempting to clear a non-existent @@ -298,6 +309,11 @@ fn parse_brocade_mac_address(value: &str) -> Result { Ok(MacAddress(bytes)) } +#[derive(Debug)] +pub enum SecurityLevel { + AuthPriv(String), +} + #[derive(Debug)] pub enum Error { NetworkError(String), diff --git a/brocade/src/network_operating_system.rs b/brocade/src/network_operating_system.rs index 0ee4a88..994dbee 100644 --- a/brocade/src/network_operating_system.rs +++ b/brocade/src/network_operating_system.rs @@ -3,11 +3,12 @@ use std::str::FromStr; use async_trait::async_trait; use harmony_types::switch::{PortDeclaration, PortLocation}; use log::{debug, info}; +use regex::Regex; use crate::{ BrocadeClient, BrocadeInfo, Error, ExecutionMode, InterSwitchLink, InterfaceInfo, InterfaceStatus, InterfaceType, MacAddressEntry, PortChannelId, PortOperatingMode, - parse_brocade_mac_address, shell::BrocadeShell, + SecurityLevel, parse_brocade_mac_address, shell::BrocadeShell, }; #[derive(Debug)] @@ -103,13 +104,37 @@ impl NetworkOperatingSystemClient { }; Some(Ok(InterfaceInfo { - name: format!("{} {}", interface_type, port_location), + name: format!("{interface_type} {port_location}"), port_location, interface_type, operating_mode, status, })) } + + fn map_configure_interfaces_error(&self, err: Error) -> Error { + debug!("[Brocade] {err}"); + + if let Error::CommandError(message) = &err { + if message.contains("switchport") + && message.contains("Cannot configure aggregator member") + { + let re = Regex::new(r"\(conf-if-([a-zA-Z]+)-([\d/]+)\)#").unwrap(); + + if let Some(caps) = re.captures(message) { + let interface_type = &caps[1]; + let port_location = &caps[2]; + let interface = format!("{interface_type} {port_location}"); + + return Error::CommandError(format!( + "Cannot configure interface '{interface}', it is a member of a port-channel (LAG)" + )); + } + } + } + + err + } } #[async_trait] @@ -162,7 +187,7 @@ impl BrocadeClient for NetworkOperatingSystemClient 
{ async fn configure_interfaces( &self, - interfaces: Vec<(String, PortOperatingMode)>, + interfaces: &Vec<(String, PortOperatingMode)>, ) -> Result<(), Error> { info!("[Brocade] Configuring {} interface(s)...", interfaces.len()); @@ -179,9 +204,12 @@ impl BrocadeClient for NetworkOperatingSystemClient { PortOperatingMode::Trunk => { commands.push("switchport".into()); commands.push("switchport mode trunk".into()); - commands.push("no spanning-tree shutdown".into()); + commands.push("switchport trunk allowed vlan all".into()); + commands.push("no switchport trunk tag native-vlan".into()); + commands.push("spanning-tree shutdown".into()); commands.push("no fabric isl enable".into()); commands.push("no fabric trunk enable".into()); + commands.push("no shutdown".into()); } PortOperatingMode::Access => { commands.push("switchport".into()); @@ -197,11 +225,10 @@ impl BrocadeClient for NetworkOperatingSystemClient { commands.push("exit".into()); } - commands.push("write memory".into()); - self.shell .run_commands(commands, ExecutionMode::Regular) - .await?; + .await + .map_err(|err| self.map_configure_interfaces_error(err))?; info!("[Brocade] Interfaces configured."); @@ -213,7 +240,7 @@ impl BrocadeClient for NetworkOperatingSystemClient { let output = self .shell - .run_command("show port-channel", ExecutionMode::Regular) + .run_command("show port-channel summary", ExecutionMode::Regular) .await?; let used_ids: Vec = output @@ -248,7 +275,12 @@ impl BrocadeClient for NetworkOperatingSystemClient { ports: &[PortLocation], ) -> Result<(), Error> { info!( - "[Brocade] Configuring port-channel '{channel_name} {channel_id}' with ports: {ports:?}" + "[Brocade] Configuring port-channel '{channel_id} {channel_name}' with ports: {}", + ports + .iter() + .map(|p| format!("{p}")) + .collect::>() + .join(", ") ); let interfaces = self.get_interfaces().await?; @@ -276,8 +308,6 @@ impl BrocadeClient for NetworkOperatingSystemClient { commands.push("exit".into()); } - 
commands.push("write memory".into()); - self.shell .run_commands(commands, ExecutionMode::Regular) .await?; @@ -294,7 +324,6 @@ impl BrocadeClient for NetworkOperatingSystemClient { "configure terminal".into(), format!("no interface port-channel {}", channel_name), "exit".into(), - "write memory".into(), ]; self.shell @@ -304,4 +333,20 @@ impl BrocadeClient for NetworkOperatingSystemClient { info!("[Brocade] Port-channel '{channel_name}' cleared."); Ok(()) } + + async fn enable_snmp(&self, user_name: &str, auth: &str, des: &str) -> Result<(), Error> { + let commands = vec![ + "configure terminal".into(), + "snmp-server view ALL 1 included".into(), + "snmp-server group public v3 priv read ALL".into(), + format!( + "snmp-server user {user_name} groupname public auth md5 auth-password {auth} priv des priv-password {des}" + ), + "exit".into(), + ]; + self.shell + .run_commands(commands, ExecutionMode::Regular) + .await?; + Ok(()) + } } diff --git a/brocade/src/shell.rs b/brocade/src/shell.rs index 28eceb8..f72c31b 100644 --- a/brocade/src/shell.rs +++ b/brocade/src/shell.rs @@ -16,7 +16,6 @@ use tokio::time::timeout; #[derive(Debug)] pub struct BrocadeShell { ip: IpAddr, - port: u16, username: String, password: String, options: BrocadeOptions, @@ -27,33 +26,31 @@ pub struct BrocadeShell { impl BrocadeShell { pub async fn init( ip_addresses: &[IpAddr], - port: u16, username: &str, password: &str, - options: Option, + options: BrocadeOptions, ) -> Result { let ip = ip_addresses .first() .ok_or_else(|| Error::ConfigurationError("No IP addresses provided".to_string()))?; - let base_options = options.unwrap_or_default(); - let options = ssh::try_init_client(username, password, ip, base_options).await?; + let brocade_ssh_client_options = + ssh::try_init_client(username, password, ip, options).await?; Ok(Self { ip: *ip, - port, username: username.to_string(), password: password.to_string(), before_all_commands: vec![], after_all_commands: vec![], - options, + options: 
brocade_ssh_client_options, }) } pub async fn open_session(&self, mode: ExecutionMode) -> Result { BrocadeSession::open( self.ip, - self.port, + self.options.ssh.port, &self.username, &self.password, self.options.clone(), @@ -211,7 +208,7 @@ impl BrocadeSession { let mut output = Vec::new(); let start = Instant::now(); let read_timeout = Duration::from_millis(500); - let log_interval = Duration::from_secs(3); + let log_interval = Duration::from_secs(5); let mut last_log = Instant::now(); loop { @@ -221,7 +218,9 @@ impl BrocadeSession { )); } - if start.elapsed() > Duration::from_secs(5) && last_log.elapsed() > log_interval { + if start.elapsed() > self.options.timeouts.command_output + && last_log.elapsed() > log_interval + { info!("[Brocade] Waiting for command output..."); last_log = Instant::now(); } @@ -276,7 +275,7 @@ impl BrocadeSession { let output_lower = output.to_lowercase(); if ERROR_PATTERNS.iter().any(|&p| output_lower.contains(p)) { return Err(Error::CommandError(format!( - "Command '{command}' failed: {}", + "Command error: {}", output.trim() ))); } diff --git a/brocade/src/ssh.rs b/brocade/src/ssh.rs index 08ff96f..cb804c7 100644 --- a/brocade/src/ssh.rs +++ b/brocade/src/ssh.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::sync::Arc; use async_trait::async_trait; +use log::debug; use russh::client::Handler; use russh::kex::DH_G1_SHA1; use russh::kex::ECDH_SHA2_NISTP256; @@ -10,29 +11,43 @@ use russh_keys::key::SSH_RSA; use super::BrocadeOptions; use super::Error; -#[derive(Default, Clone, Debug)] +#[derive(Clone, Debug)] pub struct SshOptions { pub preferred_algorithms: russh::Preferred, + pub port: u16, +} + +impl Default for SshOptions { + fn default() -> Self { + Self { + preferred_algorithms: Default::default(), + port: 22, + } + } } impl SshOptions { - fn ecdhsa_sha2_nistp256() -> Self { + fn ecdhsa_sha2_nistp256(port: u16) -> Self { Self { preferred_algorithms: russh::Preferred { kex: Cow::Borrowed(&[ECDH_SHA2_NISTP256]), key: 
Cow::Borrowed(&[SSH_RSA]), ..Default::default() }, + port, + ..Default::default() } } - fn legacy() -> Self { + fn legacy(port: u16) -> Self { Self { preferred_algorithms: russh::Preferred { kex: Cow::Borrowed(&[DH_G1_SHA1]), key: Cow::Borrowed(&[SSH_RSA]), ..Default::default() }, + port, + ..Default::default() } } } @@ -57,18 +72,21 @@ pub async fn try_init_client( ip: &std::net::IpAddr, base_options: BrocadeOptions, ) -> Result { + let mut default = SshOptions::default(); + default.port = base_options.ssh.port; let ssh_options = vec![ - SshOptions::default(), - SshOptions::ecdhsa_sha2_nistp256(), - SshOptions::legacy(), + default, + SshOptions::ecdhsa_sha2_nistp256(base_options.ssh.port), + SshOptions::legacy(base_options.ssh.port), ]; for ssh in ssh_options { let opts = BrocadeOptions { - ssh, + ssh: ssh.clone(), ..base_options.clone() }; - let client = create_client(*ip, 22, username, password, &opts).await; + debug!("Creating client {ip}:{} {username}", ssh.port); + let client = create_client(*ip, ssh.port, username, password, &opts).await; match client { Ok(_) => { diff --git a/data/pxe/okd/http_files/harmony_inventory_agent b/data/pxe/okd/http_files/harmony_inventory_agent index 1d802f7..47c7aaa 100755 --- a/data/pxe/okd/http_files/harmony_inventory_agent +++ b/data/pxe/okd/http_files/harmony_inventory_agent @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5244fa8968fe15c2415de6cc487e6112f8aedd9989951e018f9bdb536b1016d2 -size 8139216 +oid sha256:78b2cf5b2faa1a6b637c0d6ba3d37f427ee9f1b087e8605b95acce83dc417aa1 +size 8187248 diff --git a/docs/doc-clone-and-restore-coreos.md b/docs/doc-clone-and-restore-coreos.md new file mode 100644 index 0000000..5d7ad3e --- /dev/null +++ b/docs/doc-clone-and-restore-coreos.md @@ -0,0 +1,133 @@ +## Working procedure to clone and restore CoreOS disk from OKD Cluster + +### **Step 1 - take a backup** +``` +sudo dd if=/dev/old of=/dev/backup status=progress +``` + +### **Step 2 - clone beginning of old disk 
to new** +``` +sudo dd if=/dev/old of=/dev/new status=progress count=1000 bs=1M +``` + +### **Step 3 - verify and modify disk partitions** +list disk partitions +``` +sgdisk -p /dev/new +``` +if new disk is smaller than old disk and there is space on the xfs partition of the old disk, modify partitions of new disk +``` +gdisk /dev/new +``` +inside of gdisk commands +``` +-v -> verify table +-p -> print table +-d -> select partition to delete partition +-n -> recreate partition with same partition number as deleted partition +``` +For end sector, either specify the new end or just press Enter for maximum available +When asked about partition type, enter the same type code (it will show the old one) +``` +p -> to verify +w -> to write +``` +make xfs file system for new partition +``` +sudo mkfs.xfs -f /dev/new4 +``` + +### **Step 4 - copy old PARTUUID ** + +**careful here** +get the old PARTUUID: +``` +sgdisk -i /dev/old_disk # Note the "Partition unique GUID" +``` +get labels +``` +sgdisk -p /dev/old_disk # Shows partition names in the table + +blkid /dev/old_disk* # Shows PARTUUIDs and labels for all partitions +``` +set it on new disk +``` +sgdisk -u <partnum>:<old_partuuid> /dev/sdc +``` +partition name: +``` +sgdisk -c <partnum>:"<old_partlabel>" /dev/sdc +``` +verify all: +``` +lsblk -o NAME,SIZE,PARTUUID,PARTLABEL /dev/old_disk +``` + +### **Step 5 - Mount disks and copy files from old to new disk** + +mount files before copy: + +``` +mkdir -p /mnt/new +mkdir -p /mnt/old +mount /dev/old4 /mnt/old +mount /dev/new4 /mnt/new +``` +copy: + +with -n flag can run as dry-run +``` +rsync -aAXHvn --numeric-ids /source/ /destination/ +``` + +``` +rsync -aAXHv --numeric-ids /source/ /destination/ +``` + +### **Step 6 - Set correct UUID for new partition 4** +to set uuid with xfs_admin you must unmount first + +unmount old devices +``` +umount /mnt/new +umount /mnt/old +``` + +to set correct uuid for partition 4 +``` +blkid /dev/old4 +``` +``` +xfs_admin -U <old_uuid> /dev/new_partition +``` +to set labels +get it +``` +sgdisk -i
4 /dev/sda | grep "Partition name" +``` +set it +``` +sgdisk -c 4:"<old_partlabel>" /dev/sdc + +or + +(check existing with xfs_admin -l /dev/old_partition) +Use xfs_admin -L