JohT
diff --git a/‎domains/anomaly-detection/summary/anomalyDetectionSummary.sh‎
Lines changed: 27 additions & 41 deletions b/‎domains/anomaly-detection/summary/anomalyDetectionSummary.sh‎
Lines changed: 27 additions & 41 deletions
diff --git a/‎domains/anomaly-detection/summary/report.template.md‎
Lines changed: 229 additions & 41 deletions b/‎domains/anomaly-detection/summary/report.template.md‎
Lines changed: 229 additions & 41 deletions
diff --git a/‎domains/anomaly-detection/summary/report_deep_dive.template.md‎
Lines changed: 3 additions & 1 deletion b/‎domains/anomaly-detection/summary/report_deep_dive.template.md‎
Lines changed: 3 additions & 1 deletion
@@ -32,41 +32,6 @@ MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-"${SCRIPTS_DIR}/markdown"}
 # Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher" and "execute_cypher_summarized"
 source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
 
-# Appends a Markdown table to an existing file and
-# removes redundant header + separator rows.
-#
-# Usage:
-#   cat newTable.md | append_table myMarkdownFile.md
-#
-#   append_table myMarkdownFile.md <<'EOF'
-#   | Name | Score | Archetype |
-#   | ---  | ---   | ---       |
-#   | Bar  | 0.9   | Something |
-#   EOF
-#
-# Behavior:
-#   - Keeps the first header row and its following separator row.
-#   - Removes all subsequent duplicate header + separator pairs.
-#   - Leaves all data rows untouched.
-append_to_markdown_table() {
-  local file="$1"
-
-  # Append stdin to the target file
-  cat >> "${file}"
-  
-  # Clean up duplicate headers (header row + --- row)
-  awk '!seen[$0]++ || NR <= 2' "${file}" > "${file}.tmp" && mv "${file}.tmp" "${file}"
-}
-
-# Run the anomaly detection main report generation.
-anomaly_detection_report_first_section() {
-    local report_markdown_includes_directory="${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}"
-    mkdir -p "${report_markdown_includes_directory}"
-    
-    execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesPerAbstractionLayer.cypher" --output-markdown-table > "${report_markdown_includes_directory}/AnomaliesPerAbstractionLayer.md"
-    execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesInTotal.cypher" --output-markdown-table > "${report_markdown_includes_directory}/AnomaliesInTotal.md"
-}
-
 # Aggregates all results in a Markdown report.
 #
 # Required Parameters:
@@ -167,13 +132,38 @@ anomaly_detection_report() {
     # fi
 }
 
+# Prints the YAML "front matter" metadata head of the anomaly detection report to stdout.
+#
+# Written fields:
+#   title         - fixed report title
+#   generated     - current date formatted as YYYY-MM-DD
+#   model_version - result of "git describe --tags --always", or "unknown" if Git can't provide one
+#   dataset       - name of the current working directory (without its parent path)
+#   authors       - fixed author entry
+#
+# Usage:
+#   anomaly_detection_front_matter_metadata_head > "frontMatter.md"
+anomaly_detection_front_matter_metadata_head() {
+    local current_date
+    current_date="$(date +'%Y-%m-%d')"
+
+    # "git describe --tags" fails outside of a Git repository and when no tag is reachable.
+    # "--always" falls back to the abbreviated commit hash, and the final fallback keeps
+    # the front matter valid (and the script running under "errexit") when Git can't answer at all.
+    # NOTE(review): this describes the repository that contains the current working directory.
+    # Confirm that this is the intended source of the "model_version".
+    local nearest_tag
+    nearest_tag="$(git describe --tags --always 2>/dev/null)" || nearest_tag="unknown"
+    
+    # Strip everything up to and including the last slash to get the directory name.
+    local analysis_directory
+    analysis_directory="${PWD##*/}"
+
+    echo "---"
+    echo "title: \"Anomaly Detection Report\""
+    echo "generated: \"${current_date}\""
+    echo "model_version: \"${nearest_tag}\""
+    echo "dataset: \"${analysis_directory}\""
+    echo "authors: [\"JohT/code-graph-analysis-pipeline\"]"
+    echo "---"
+}
+
 # Finalize the anomaly detection report by taking the main template, applying includes and appending all deep dive reports
 anomaly_detection_finalize_report() {
-    echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Finalizing anomaly detection Markdown report..."
+    echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Assembling main anomaly detection Markdown report..."
 
-    report_include_directory="${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}"
+    local report_include_directory="${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}"
     mkdir -p "${report_include_directory}"
 
+    execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesPerAbstractionLayer.cypher" --output-markdown-table > "${report_include_directory}/AnomaliesPerAbstractionLayer.md"
+    execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesInTotal.cypher" --output-markdown-table > "${report_include_directory}/AnomaliesInTotal.md"
+
+    # Write "front matter" metadata section
+    anomaly_detection_front_matter_metadata_head > "${report_include_directory}/AnomalyDetectionReportFrontMatter.md"
+
     # Concatenate all deep dive reports as Markdown include
     rm -rf "${report_include_directory}/AnomalyDetectionDeepDive.md"
     for markdown_file in $(find . -type f -name 'report_deep_dive_*.md' | sort); do
@@ -202,10 +192,6 @@ ALGORITHM_NODE="projection_node_label"
 ALGORITHM_LANGUAGE="projection_language"
 REPORT_NUMBER="report_number"
 
-# -- Overview Report for all code type -------------------------------
-
-anomaly_detection_report_first_section
-
 # -- Detail Reports for each code type -------------------------------
 
 anomaly_detection_report "${REPORT_NUMBER}=1" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_LANGUAGE}=Java"
 
@@ -1,10 +1,23 @@
+<!-- include:AnomalyDetectionReportFrontMatter.md -->
+
 # 📊 Anomaly Detection Report
 
 ## 1. Executive Overview
 
 This report analyzes structural and dependency anomalies across multiple abstraction levels of the codebase.
 The goal is to detect potential **software quality, design, and architecture issues** using graph-based features, anomaly detection (Isolation Forest), and SHAP explainability.
 
+**📚 Table of Contents**
+
+1. [Executive Overview](#1-executive-overview)
+1. [Deep Dives by Abstraction Level](#2-deep-dives-by-abstraction-level)
+1. [Plot Interpretation Guide](#3-plot-interpretation-guide)
+1. [Taxonomy of Anomaly Archetypes](#4-taxonomy-of-anomaly-archetypes)
+1. [Recommendations](#5-recommendations)
+1. [Appendix](#6-appendix)
+
+---
+
 ### 1.1 Anomalies in total
 
 <!-- include:AnomaliesInTotal.md -->
@@ -15,45 +28,220 @@ The goal is to detect potential **software quality, design, and architecture iss
 
 ## 2. Deep Dives by Abstraction Level
 
+Each abstraction level includes anomaly statistics, SHAP feature importance, archetype distribution, and example anomalies.
+
 <!-- include:AnomalyDetectionDeepDive.md -->
-## 3. Taxonomy of Anomaly Archetypes
-
-| Archetype | Feature Profile | Risk for Architecture |
-|-----------|----------------|------------------------|
-| **Hub** | High degree, low clustering coefficient | Central dependency, fragile hotspot |
-| **Bottleneck** | High betweenness, low redundancy | Single point of failure, slows evolution |
-| **Outlier** | High cluster distance, small cluster size | Misfit component, unusual dependency pattern |
-| **Authority** | High PageRank but low articleRank | Over-relied utility with few reverse connections |
-| **Bridge** | Embedding-driven anomaly, cross-cluster | Connects unrelated domains, risky coupling |
-
----
-
-## 4. Recommendations
-
-* **Refactor hubs:** Break down god classes/utilities into smaller abstractions.
-* **Mitigate bottlenecks:** Add redundancy or alternative paths.
-* **Investigate outliers:** Validate if they are justified exceptions or design flaws.
-* **Enforce cohesion:** Raise clustering coefficient via better modular boundaries.
-* **Stabilize authorities:** Encapsulate widely used but locally weak components, reduce over-generalization, and ensure stable APIs.  
-* **Clarify bridges:** Validate whether cross-cluster connectors are intentional (adapters/facades) or accidental; refactor or relocate responsibilities to preserve modularity.
-
----
-
-## 5. Appendix
-
-* **Methodology:** Isolation Forest, Random Forest proxy, SHAP explanations.
-* **Embedding generation:** Fast Random Projection, PCA (20–35 dims, \~0.9 target variance).
-* **Clustering:** HDBSCAN tuned against Leiden communities (golden reference, AMI optimization).
-* **Optimization:** Hyperparameter optimization for both Isolation Forest and Random Forest proxy with their F1 score
-* **Feature set:**
-  * Degree (in/out)
-  * PageRank
-  * ArticleRank
-  * Page-to-Article Rank Difference
-  * Betweenness centrality
-  * Local Clustering Coefficient
-  * Cluster Approximate Outlier Score ( = 1.0 - Cluster Probability)
-  * Cluster Radius Average
-  * Cluster Distance to Medoid
-  * Cluster Size
-  * Node Embedding
+
+## 3. Plot Interpretation Guide
+
+> **Purpose:** Understand each plot type’s diagnostic value.  
+> **Applies to:** All abstraction levels.
+
+| Plot Type | Best For | Adds | Why It Matters |
+| --- | --- | --- | --- |
+| **Anomalies Plot** | Seeing distribution of anomalies in clusters | Context of clusters & outliers | Reveals isolation or cluster-based anomalies |
+| **SHAP Summary** | Global feature importance | Feature impact direction | Shows what drives anomalies overall |
+| **Local SHAP Force** | Explaining a single anomaly | Feature contribution breakdown | Useful for debugging individual outliers |
+| **Dependence Plot** | Understanding feature influence | Interaction visualization | Reveals nonlinear feature effects |
+| **Cluster Metrics** | Cluster characteristics | Radius, cohesion, noise | Identifies weakly defined or noisy clusters |
+
+### 3.1 Detailed Plot Reference
+
+> **Purpose:** Provide a direct mapping between all plots and their analytical meaning.  
+> **Scope:** Applies to plots for *Java Type*, *Java Package*, and similar abstraction levels.  
+> **Format:** Each entry includes `Best for`, `Adds`, and `Why`, matching the in-report descriptions.
+
+---
+
+### 📘 Main Plots
+
+| Plot | Description | Best For | Adds | Why |
+|------|--------------|----------|------|-----|
+| **Anomalies** | 2D visualization of all code units showing clusters and anomalies. | Understanding the overall distribution of anomalies in relation to clusters. | Context of clusters and outliers. | Reveals whether anomalies are isolated or cluster-based, guiding investigation. |
+| **Global Feature Importance (SHAP Summary)** | Mean absolute SHAP values ranking global feature impact. | Global understanding of which features drive anomalies. | Direction of impact (color shows feature value). | Explains which metrics consistently influence anomaly detection. |
+| **Feature Dependence (Top Important Features)** | Shows how specific feature values affect anomaly score; colored by interacting feature. | Understanding how one feature affects anomaly scores. | Color shows feature interaction or threshold effect. | Helps identify nonlinear relationships and feature interactions. |
+
+---
+
+### 📙 Local Explanation Plots
+
+| Plot | Description | Best For | Adds | Why |
+|------|--------------|----------|------|-----|
+| **Local SHAP Force Plots (Top Anomalies 1–6)** | Visualizes per-feature contributions to each anomaly’s score relative to baseline. | Explaining *why a specific data point* is anomalous. | Visual breakdown of how each feature contributes to anomaly score. | Enables debugging of individual anomalies through transparent explanation. |
+
+---
+
+### 📗 Cluster-Level Diagnostic Plots
+
+| Plot | Description | Best For | Adds | Why |
+|------|--------------|----------|------|-----|
+| **Clusters – Overall** | Shows all clusters since they all fit into one plot. | Gaining a holistic view of cluster characteristics in the dataset. | An overall summary of how all clusters are distributed and their key metrics. | Understanding the general structure and properties of clusters can help identify patterns and potential anomalies in the data. |
+| **Clusters – Largest Average Radius** | Ranks clusters by mean distance of members from their centroid. | Getting an overview of clusters that are more dispersed. | Identifies clusters with internal variability. | Large average radius suggests less cohesion and potential outliers. |
+| **Clusters – Largest Max Radius** | Shows clusters with the farthest outlying member. | Identifying clusters that have members farthest from cluster center. | Highlights clusters containing extreme outliers. | Indicates clusters that may contain hidden anomalies. |
+| **Clusters – Largest Size** | Displays cluster membership counts. | Understanding which clusters contain the most code units. | Provides sense of frequency of code structures. | Large clusters may represent common design patterns; small clusters are specialized. |
+| **Cluster Probabilities** | Distribution of HDBSCAN membership probabilities. | Detecting code units that don’t strongly belong to any cluster. | Measures how well-defined clusters are. | Highlights noisy or weakly defined clusters. |
+
+---
+
+### 📒 Cluster Noise & Bridge Diagnostics
+
+| Plot | Description | Best For | Adds | Why |
+|------|--------------|----------|------|-----|
+| **Cluster Noise – Highly Central and Popular** | Central nodes that don’t fit any cluster. | Detecting code units that are highly connected but anomalous. | Reveals influential but misfit nodes. | Such nodes may be key but unstable integration points. |
+| **Cluster Noise – Poorly Integrated Bridges** | Nodes connecting clusters but weakly integrated. | Detecting code units that bridge modules unusually. | Identifies cross-cutting or leaking dependencies. | May reveal architectural boundary violations. |
+| **Cluster Noise – Role Inverted Bridges** | Bridges with reversed structural roles compared to expected topology. | Detecting code units connecting clusters in unexpected ways. | Highlights anomalous coupling roles. | Indicates architectural inversion or misuse of interfaces. |
+
+---
+
+### 📙 Feature Distribution & Relationship Plots
+
+| Plot | Description | Best For | Adds | Why |
+|------|--------------|----------|------|-----|
+| **Betweenness Centrality Distribution** | Histogram of betweenness values. | Identifying code units that act as structural bridges. | Insight into flow of dependency control. | Detects potential bottlenecks or single points of failure. |
+| **Clustering Coefficient Distribution** | Histogram of local clustering coefficients. | Identifying modularity and local cohesion. | Insight into how tightly code units cluster. | Reveals how cohesive or isolated different regions of the graph are. |
+| **PageRank – ArticleRank Difference Distribution** | Distribution of `PageRank - ArticleRank`. | Identifying influential nodes beyond local connectivity. | Shows imbalance between influence and popularity. | Highlights components with disproportionate architectural impact. |
+| **Clustering Coefficient vs PageRank** | Scatterplot comparing local clustering to global influence. | Identifying relationships between cohesion and centrality. | Visualizes trade-offs between modularity and reach. | Helps spot code units that are both locally and globally critical. |
+
+---
+
+### 📔 Summary Categories
+
+| Category | Included Plots | Typical Usage |
+|-----------|----------------|----------------|
+| **Main Diagnostic** | Anomalies, Global SHAP, Feature Dependence | High-level anomaly review |
+| **Local Explanation** | Local SHAP Force Plots | Case-by-case anomaly debugging |
+| **Cluster Diagnostics** | Cluster Radius / Size / Probability | Assess cluster cohesion and outliers |
+| **Cluster Noise Analysis** | Cluster Noise (3 types) | Identify special structural anomalies |
+| **Feature Distributions** | Betweenness, Clustering, Rank Difference | Assess feature-based structure patterns |
+| **Feature Relationships** | Clustering vs PageRank | Evaluate global vs local influence balance |
+
+---
+
+### 💡 Reading Guidance
+
+- **Color Conventions:**  
+  Red = anomalous, Green = typical, Light grey = noise, Pale colors = clusters.  
+- **Scales:**  
+  SHAP values are normalized (mean absolute); graph metrics standardized by z-score.  
+- **How to Use:**  
+  1. Start with *Main Diagnostic* plots to identify anomalies and drivers.  
+  2. Use *Local SHAP* for detailed case analysis.  
+  3. Check *Cluster Diagnostics* and *Noise Plots* to verify grouping quality.  
+  4. Use *Feature Distributions* to contextualize metrics.  
+  5. Cross-reference *Feature Relationships* for architectural interpretation.
+
+---
+
+### 📄 Structured Form (YAML Summary)
+
+The following YAML provides the same plot-to-purpose mapping in machine-readable form:
+
+```yaml
+plots:
+  main:
+    - name: Anomalies
+      purpose: Distribution of anomalies and clusters
+    - name: Global Feature Importance (SHAP)
+      purpose: Global feature ranking
+    - name: Feature Dependence
+      purpose: Feature–score relationship
+  local:
+    - name: Local SHAP Force Plots
+      purpose: Local explanations for top anomalies
+  cluster:
+    - name: Clusters Largest Average Radius
+      purpose: Identify dispersed clusters
+    - name: Clusters Largest Max Radius
+      purpose: Identify extreme outlier clusters
+    - name: Clusters Largest Size
+      purpose: Identify dominant cluster types
+    - name: Cluster Probabilities
+      purpose: Assess cluster definition strength
+  cluster_noise:
+    - name: Cluster Noise – Highly Central and Popular
+      purpose: Central anomalies without cluster fit
+    - name: Cluster Noise – Poorly Integrated Bridges
+      purpose: Weakly integrated bridges
+    - name: Cluster Noise – Role Inverted Bridges
+      purpose: Inverted bridge roles
+  feature_distributions:
+    - name: Betweenness Centrality Distribution
+      purpose: Bridge and bottleneck detection
+    - name: Clustering Coefficient Distribution
+      purpose: Cohesion and modularity measurement
+    - name: PageRank – ArticleRank Difference Distribution
+      purpose: Influence vs popularity analysis
+  feature_relationships:
+    - name: Clustering Coefficient vs PageRank
+      purpose: Local vs global influence comparison
+```
+
+## 4. Taxonomy of Anomaly Archetypes
+
+| Archetype | Feature Profile | Architectural Risk |
+|-----------|-----------------|--------------------|
+| **Hub** | High degree, low clustering coefficient | Central dependency; fragile hotspot |
+| **Bottleneck** | High betweenness, low redundancy | Single point of failure; slows evolution |
+| **Outlier** | High cluster distance, small cluster size | Misfit or irregular dependency pattern |
+| **Authority** | High PageRank, low ArticleRank | Over-relied utility; low local stability |
+| **Bridge** | Cross-cluster connection | Risky coupling; weak modular boundaries |
+
+**Structured form (for LLM parsing):**
+
+```yaml
+archetypes:
+  - name: Hub
+    profile: High degree, low clustering coefficient
+    risk: Central dependency, fragile hotspot
+  - name: Bottleneck
+    profile: High betweenness, low redundancy
+    risk: Single point of failure
+  - name: Outlier
+    profile: High cluster distance, small cluster size
+    risk: Misfit component
+  - name: Authority
+    profile: High PageRank, low ArticleRank
+    risk: Over-relied utility
+  - name: Bridge
+    profile: Cross-cluster connector
+    risk: Risky coupling
+```
+
+---
+
+## 5. Recommendations
+
+* **Refactor hubs:** Decompose large or over-connected utilities.
+* **Mitigate bottlenecks:** Introduce redundancy or alternative communication paths.
+* **Investigate outliers:** Determine if anomalies are justified exceptions.
+* **Raise cohesion:** Increase local clustering by improving modular boundaries.
+* **Stabilize authorities:** Encapsulate frequently used but fragile components.
+* **Validate bridges:** Confirm cross-cluster connectors are intentional and safe.
+
+---
+
+## 6. Appendix
+
+### 6.1 Methodology Overview
+
+1. Build dependency graph (types, packages, artifacts).
+1. Compute graph metrics: degree, PageRank, betweenness, clustering coefficient, etc.
+1. Generate embeddings via Fast Random Projection.
+1. Reduce embeddings with PCA (retain 90% variance).
+1. Train Isolation Forest for anomaly detection.
+1. Explain results using SHAP (via Random Forest proxy).
+1. Cluster anomalies via HDBSCAN, tuned with Leiden reference communities (AMI score).
+1. Optimize the hyperparameters of both the Isolation Forest and the Random Forest proxy using their F1 scores.
+
+### 6.2 Feature Set
+
+* Degree (in/out)
+* PageRank
+* ArticleRank
+* Page-to-Article Rank Difference
+* Betweenness Centrality
+* Local Clustering Coefficient
+* Cluster Outlier Score (1.0 - cluster probability)
+* Cluster Radius (avg, max)
+* Cluster Size
+* Node Embedding (PCA 20–35 dims)
@@ -18,7 +18,9 @@
 
 <!-- include:DeepDiveTopAnomalies.md|report_no_data_info.template.md -->
 
-#### Plots
+#### Visualizations
+
+See the [Plot Interpretation Guide](#3-plot-interpretation-guide) for details on how to read the plots.
 
 <!-- include:report_deep_dive_anomaly_plots.md|empty.md -->