diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml
index 3cdedc427..badba6a8a 100644
--- a/simplyblock_cli/cli-reference.yaml
+++ b/simplyblock_cli/cli-reference.yaml
@@ -1486,6 +1486,66 @@ commands:
             help: "Target cluster pool ID or name"
             dest: target_pool
             type: str
+      - name: collect-logs
+        help: Collect simplyblock container logs for a given time window.
+        arguments:
+          - name: "--start-time"
+            help: "Start of the collection window (UTC assumed if no timezone given). Formats: \"2024-01-15T10:00:00\" or \"2024-01-15 10:00:00\""
+            dest: start_time
+            type: str
+          - name: "--duration-minutes"
+            help: "Duration in minutes."
+            dest: duration_minutes
+            type: int
+            default: 60
+          - name: "--output-dir"
+            help: "Directory to write the output tarball (default: current directory)."
+            dest: output_dir
+            type: str
+            default: "."
+          - name: "--use-opensearch"
+            help: "Query OpenSearch directly via scroll API instead of the Graylog REST API.
+              Useful for very large result sets or when Graylog is unreachable."
+            dest: use_opensearch
+            type: bool
+            default: false
+            action: store_true
+          - name: "--cluster-id"
+            help: "Target a specific cluster UUID (default: first cluster returned by sbctl)."
+            dest: cluster_id
+            type: str
+          - name: "--mgmt-ip"
+            help: "Override the management-node IP used to reach Graylog / OpenSearch."
+            dest: mgmt_ip
+            type: str
+          - name: "--monitoring-secret"
+            help: "Graylog / OpenSearch password to use instead of the cluster secret.
+              When provided this takes precedence over the cluster secret."
+            dest: monitoring_secret
+            type: str
+          - name: "--namespace"
+            help: "Kubernetes namespace to collect CSI / storage-node DS pod logs from (default: simplyblock).
+              Pass an empty string to skip kubectl collection."
+            dest: namespace
+            type: str
+            default: simplyblock
+          - name: "--diagnose"
+            help: "Print a diagnostic report from OpenSearch (indices, field names,
+              sample documents, container names present in the time window) and exit without collecting logs.
+              Use this when collections return 0 to understand the actual data layout. Implies --use-opensearch."
+            dest: diagnose
+            type: bool
+            action: store_true
+          - name: "--mode"
+            help: "Deployment mode: 'docker' (default) uses Docker Swarm service names
+              for control-plane log collection; 'kubernetes' uses Kubernetes container
+              names and skips Graylog-based SPDK log collection (kubectl is used instead)."
+            dest: mode
+            type: str
+            choices:
+              - docker
+              - kubernetes
+            default: docker
   - name: "volume"
     help: "Logical Volume Commands"
     aliases:
diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py
index 912d38b2d..0dfe633cc 100755
--- a/simplyblock_cli/cli.py
+++ b/simplyblock_cli/cli.py
@@ -380,6 +380,7 @@ def init_cluster(self):
         self.init_cluster__set(subparser)
         self.init_cluster__change_name(subparser)
         self.init_cluster__add_replication(subparser)
+        self.init_cluster__collect_logs(subparser)
 
 
     def init_cluster__create(self, subparser):
@@ -578,6 +579,19 @@ def init_cluster__add_replication(self, subparser):
         argument = subcommand.add_argument('--timeout', help='Snapshot replication network timeout', type=int, default=3600, dest='timeout')
         argument = subcommand.add_argument('--target-pool', help='Target cluster pool ID or name', type=str, dest='target_pool')
 
+    def init_cluster__collect_logs(self, subparser):
+        subcommand = self.add_sub_command(subparser, 'collect-logs', 'Collect simplyblock container logs for a given time window.')
+        argument = subcommand.add_argument('--start-time', help='Start of the collection window (UTC assumed if no timezone given). Formats: "2024-01-15T10:00:00" or "2024-01-15 10:00:00"', type=str, dest='start_time')
+        argument = subcommand.add_argument('--duration-minutes', help='Duration in minutes.', type=int, default=60, dest='duration_minutes')
+        argument = subcommand.add_argument('--output-dir', help='Directory to write the output tarball (default: current directory).', type=str, default='.', dest='output_dir')
+        argument = subcommand.add_argument('--use-opensearch', help='Query OpenSearch directly via scroll API instead of the Graylog REST API. Useful for very large result sets or when Graylog is unreachable.', default=False, dest='use_opensearch', action='store_true')
+        argument = subcommand.add_argument('--cluster-id', help='Target a specific cluster UUID (default: first cluster returned by sbctl).', type=str, dest='cluster_id')
+        argument = subcommand.add_argument('--mgmt-ip', help='Override the management-node IP used to reach Graylog / OpenSearch.', type=str, dest='mgmt_ip')
+        argument = subcommand.add_argument('--monitoring-secret', help='Graylog / OpenSearch password to use instead of the cluster secret. When provided this takes precedence over the cluster secret.', type=str, dest='monitoring_secret')
+        argument = subcommand.add_argument('--namespace', help='Kubernetes namespace to collect CSI / storage-node DS pod logs from (default: simplyblock). Pass an empty string to skip kubectl collection.', type=str, default='simplyblock', dest='namespace')
+        argument = subcommand.add_argument('--diagnose', help='Print a diagnostic report from OpenSearch (indices, field names, sample documents, container names present in the time window) and exit without collecting logs. Use this when collections return 0 to understand the actual data layout. Implies --use-opensearch.', dest='diagnose', action='store_true')
+        argument = subcommand.add_argument('--mode', help='Deployment mode: \'docker\' (default) uses Docker Swarm service names for control-plane log collection; \'kubernetes\' uses Kubernetes container names and skips Graylog-based SPDK log collection (kubectl is used instead).', type=str, default='docker', dest='mode', choices=['docker','kubernetes',])
+
     def init_volume(self):
         subparser = self.add_command('volume', 'Logical Volume Commands', aliases=['lvol',])
 
@@ -1293,6 +1307,8 @@ def run(self):
                 ret = self.cluster__change_name(sub_command, args)
             elif sub_command in ['add-replication']:
                 ret = self.cluster__add_replication(sub_command, args)
+            elif sub_command in ['collect-logs']:
+                ret = self.cluster__collect_logs(sub_command, args)
             else:
                 self.parser.print_help()
 
diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py
index a918c3f1f..b406b6dda 100755
--- a/simplyblock_cli/clibase.py
+++ b/simplyblock_cli/clibase.py
@@ -16,6 +16,7 @@
 from simplyblock_core.controllers import health_controller
 from simplyblock_core.models.pool import Pool
 from simplyblock_core.models.cluster import Cluster
+from simplyblock_core.scripts.collect_logs import collect_logs
 
 
 def range_type(min, max):
@@ -534,6 +535,23 @@ def cluster__complete_expand(self, sub_command, args):
     def cluster__add_replication(self, sub_command, args):
         return cluster_ops.add_replication(args.cluster_id, args.target_cluster_id, args.timeout, args.target_pool)
 
+    def cluster__collect_logs(self, sub_command, args):
+        """Run `cluster collect-logs`: hand the parsed options to collect_logs() and return its result."""
+        # Pass by keyword: collect_logs() takes ten parameters, so a positional
+        # call would silently mis-route values if its signature were reordered.
+        return collect_logs(
+            start_time=args.start_time,
+            duration_minutes=args.duration_minutes,
+            output_dir=args.output_dir,
+            use_opensearch=args.use_opensearch,
+            cluster_id=args.cluster_id,
+            mgmt_ip=args.mgmt_ip,
+            monitoring_secret=args.monitoring_secret,
+            namespace=args.namespace,
+            diagnose=args.diagnose,
+            mode=args.mode,
+        )
+
     def volume__add(self, sub_command, args):
         import json as _json
         name = args.name
diff --git a/simplyblock_core/scripts/collect_logs.py b/simplyblock_core/scripts/collect_logs.py
index fd00bf007..4432c74f5 100755
---
a/simplyblock_core/scripts/collect_logs.py
+++ b/simplyblock_core/scripts/collect_logs.py
@@ -904,10 +904,51 @@ def main():
 
     if args.diagnose:
         args.use_opensearch = True
+
+    collect_logs(args.start_time, args.duration_minutes, args.output_dir, args.use_opensearch, args.cluster_id,
+                 args.mgmt_ip, args.monitoring_secret, args.namespace, args.diagnose, args.mode)
+
+
+def collect_logs(start_time, duration_minutes, output_dir, use_opensearch, cluster_id, mgmt_ip, monitoring_secret,
+                 namespace, diagnose, mode):
+    """Collect simplyblock logs for a time window and bundle them into a tarball.
+
+    Shared by main() and the `cluster collect-logs` CLI sub-command.
+
+    Args:
+        start_time: Window start as an ISO-8601 string ("T" or space separator),
+            read as UTC when it has no offset. Falsy means duration_minutes before now.
+        duration_minutes: Window length in minutes; falsy falls back to 60.
+        output_dir: Directory that receives the tarball (created if missing).
+        use_opensearch: Query OpenSearch directly instead of the Graylog REST API.
+        cluster_id: Cluster UUID; falsy means look it up with sbctl.
+        mgmt_ip: Management-node IP; falsy means use the first control-plane node.
+        monitoring_secret: Graylog password that replaces the cluster secret.
+        namespace: Kubernetes namespace for pod-log collection; falsy skips it.
+        diagnose: Print the OpenSearch diagnostic report and exit instead of
+            collecting logs. Implies use_opensearch.
+        mode: Deployment mode, "docker" or "kubernetes".
+
+    Returns:
+        True once the tarball has been written. An invalid start_time and the
+        diagnose path end the process through sys.exit() instead of returning.
+    """
+    # Honour "--diagnose implies --use-opensearch" here so that every caller,
+    # not only main(), gets the documented behaviour.
+    if diagnose:
+        use_opensearch = True
 
     # ── 1. Parse time range ──────────────────────────────────────────────────
+    if not duration_minutes:
+        duration_minutes = 60
+
+    if not start_time:
+        # Build the default in UTC: naive values are tagged as UTC below, so a
+        # local-time now() would shift the window by the host's UTC offset.
+        start_time = (datetime.now(timezone.utc) - timedelta(minutes=duration_minutes)).isoformat()
+
     try:
-        start_dt = datetime.fromisoformat(args.start_time.replace(" ", "T"))
+        start_dt = datetime.fromisoformat(start_time.replace(" ", "T"))
     except ValueError as exc:
         print(f"ERROR: invalid start_time – {exc}", file=sys.stderr)
         sys.exit(1)
@@ -915,21 +956,21 @@ def main():
     if start_dt.tzinfo is None:
         start_dt = start_dt.replace(tzinfo=timezone.utc)
 
-    end_dt = start_dt + timedelta(minutes=args.duration_minutes)
+    end_dt = start_dt + timedelta(minutes=duration_minutes)
     from_iso = start_dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")
     to_iso = end_dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")
 
     print("=" * 64)
     print(" Simplyblock Log Collector")
     print("=" * 64)
-    print(f" Window : {from_iso} → {to_iso} ({args.duration_minutes} min)")
-    print(f" Deploy : {args.mode}")
-    print(f" Mode : {'OpenSearch (direct)' if args.use_opensearch else 'Graylog REST API'}")
+    print(f" Window : {from_iso} → {to_iso} ({duration_minutes} min)")
+    print(f" Deploy : {mode}")
+    print(f" Mode : {'OpenSearch (direct)' if use_opensearch else 'Graylog REST API'}")
 
     # ── 2. Cluster UUID + secret ─────────────────────────────────────────────
     print("\n[1] Retrieving cluster info …")
 
-    cluster_uuid = args.cluster_id
+    cluster_uuid = cluster_id
     if not cluster_uuid:
         clusters = sbctl_json("cluster", "list")
         if not clusters:
@@ -948,8 +989,7 @@ def main():
 
     # ── 3. Management-node IP ────────────────────────────────────────────────
     print("\n[2] Resolving management node …")
-    if args.mgmt_ip:
-        mgmt_ip = args.mgmt_ip
+    if mgmt_ip:
         print(f" Using provided IP : {mgmt_ip}")
     else:
         cp_nodes = sbctl_json("control-plane", "list")
@@ -959,7 +999,7 @@ def main():
         mgmt_ip = cp_nodes[0]["IP"]
         print(f" Management IP : {mgmt_ip} ({len(cp_nodes)} node(s) total)")
 
-    if args.mode == "kubernetes":
+    if mode == "kubernetes":
         graylog_base = f"http://{mgmt_ip}:9000/api"
         opensearch_base = f"http://{mgmt_ip}:9200"
     else:
@@ -977,8 +1017,8 @@ def main():
 
     # ── 5. HTTP sessions ─────────────────────────────────────────────────────
 
-    graylog_password = args.monitoring_secret if args.monitoring_secret else cluster_secret
-    if args.monitoring_secret:
+    graylog_password = monitoring_secret if monitoring_secret else cluster_secret
+    if monitoring_secret:
         print(" Using provided --monitoring-secret for Graylog auth.")
 
     gl_session = requests.Session()
@@ -988,7 +1028,7 @@ def main():
     os_session = requests.Session()
 
     # Verify Graylog reachability (informational only)
-    if not args.use_opensearch:
+    if not use_opensearch:
         print(f"\n[4] Checking Graylog at {graylog_base} …")
         try:
             r = gl_session.get(f"{graylog_base}/system", timeout=10)
@@ -1012,15 +1052,15 @@ def main():
             print(f" WARN: {exc}.")
 
     # --diagnose: print full report and exit
-    if args.diagnose:
+    if diagnose:
         opensearch_diagnose(os_session, opensearch_base, from_iso, to_iso)
         sys.exit(0)
 
     # ── 6. Prepare temp workspace ────────────────────────────────────────────
 
     ts_str = start_dt.strftime("%Y%m%d_%H%M%S")
-    bundle_name = f"sb_logs_{ts_str}_{args.duration_minutes}m"
-    output_dir = Path(args.output_dir).resolve()
+    bundle_name = f"sb_logs_{ts_str}_{duration_minutes}m"
+    output_dir = Path(output_dir).resolve()
     output_dir.mkdir(parents=True, exist_ok=True)
     tarball_path = output_dir / f"{bundle_name}.tar.gz"
 
@@ -1031,7 +1071,7 @@ def main():
         os_session=os_session,
         graylog_base=graylog_base,
         opensearch_base=opensearch_base,
-        use_opensearch=args.use_opensearch,
+        use_opensearch=use_opensearch,
         from_iso=from_iso,
         to_iso=to_iso,
         probe_cache=probe_cache,
@@ -1045,14 +1085,14 @@ def main():
 
     cp_services = (
         CONTROL_PLANE_SERVICES_KUBERNETES
-        if args.mode == "kubernetes"
+        if mode == "kubernetes"
         else CONTROL_PLANE_SERVICES_DOCKER
     )
-    print(f"\n[5] Collecting control-plane logs ({len(cp_services)} services, mode={args.mode}) …")
+    print(f"\n[5] Collecting control-plane logs ({len(cp_services)} services, mode={mode}) …")
     cp_dir = log_root / "control_plane"
     cp_dir.mkdir()
 
-    gl_cname_field = "kubernetes_container_name" if args.mode == "kubernetes" else "container_name"
+    gl_cname_field = "kubernetes_container_name" if mode == "kubernetes" else "container_name"
 
     total_cp_lines = 0
     for svc in cp_services:
@@ -1075,7 +1115,7 @@ def main():
 
     # Docker mode: collect SPDK/SNodeAPI logs from Graylog/OpenSearch.
     # Kubernetes mode: SPDK logs are captured via kubectl in step 9.
-    if args.mode == "docker":
+    if mode == "docker":
         print("\n[6] Collecting storage-node logs (docker) …")
         sn_root = log_root / "storage_nodes"
         sn_root.mkdir()
@@ -1168,7 +1208,7 @@ def main():
 
     # ── 9. Kubernetes pod logs (CSI node + storage-node DS) ──────────────
 
-    k8s_ns = args.namespace
+    k8s_ns = namespace
     if k8s_ns:
         print(f"\n[7] Collecting Kubernetes pod logs (namespace: {k8s_ns}) …")
         k8s_dir = log_root / "k8s_pods"
@@ -1299,11 +1339,11 @@ def save_sbctl(label, cmd_args, out_name, use_json=False):
         "collected_at": datetime.now(timezone.utc).isoformat(),
         "window_from": from_iso,
         "window_to": to_iso,
-        "duration_minutes": args.duration_minutes,
+        "duration_minutes": duration_minutes,
         "cluster_uuid": cluster_uuid,
         "mgmt_ip": mgmt_ip,
-        "deploy_mode": args.mode,
-        "log_source": "opensearch-direct" if args.use_opensearch else "graylog-api",
+        "deploy_mode": mode,
+        "log_source": "opensearch-direct" if use_opensearch else "graylog-api",
         "storage_nodes": [
             {
                 "hostname": n.get("Hostname"),
@@ -1329,6 +1369,7 @@ def save_sbctl(label, cmd_args, out_name, use_json=False):
     print(f" Tarball : {tarball_path}")
     print(f" Size : {size_mb:.2f} MB")
     print(f"{'=' * 64}\n")
+    return True
 
 
 if __name__ == "__main__":