Skip to content

Commit 93e4788

Browse files
committed
Restructure anomaly detection Markdown report
For improved readability also for LLMs.
1 parent f68e362 commit 93e4788

8 files changed

+269
-199
lines changed

domains/anomaly-detection/summary/anomalyDetectionSummary.sh

Lines changed: 27 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -32,41 +32,6 @@ MARKDOWN_SCRIPTS_DIR=${MARKDOWN_SCRIPTS_DIR:-"${SCRIPTS_DIR}/markdown"}
3232
# Define functions to execute a cypher query from within a given file (first and only argument) like "execute_cypher" and "execute_cypher_summarized"
3333
source "${SCRIPTS_DIR}/executeQueryFunctions.sh"
3434

35-
# Appends a Markdown table (read from stdin) to an existing file and
# removes redundant header + separator rows.
#
# Usage:
#   cat newTable.md | append_to_markdown_table myMarkdownFile.md
#
#   append_to_markdown_table myMarkdownFile.md <<'EOF'
#   | Name | Score | Archetype |
#   | --- | --- | --- |
#   | Bar | 0.9 | Something |
#   EOF
#
# Arguments:
#   $1 - Markdown file the table is appended to (created if it does not exist).
#
# Behavior:
#   - Keeps the first header row and its following separator row.
#   - Removes all subsequent rows that repeat that header or separator row.
#   - Leaves all data rows untouched, including data rows that occur more than once.
#
# Returns:
#   Non-zero if appending to or rewriting the file fails.
append_to_markdown_table() {
    local file="$1"
    local temporary_file="${file}.tmp"

    # Append stdin to the target file
    cat >> "${file}" || return

    # Clean up duplicate headers (header row + --- row).
    # Only rows identical to line 1 (header) or line 2 (separator) are dropped.
    # Deduplicating every line would also delete identical data rows.
    if awk '
        NR == 1 { header = $0 }
        NR == 2 { separator = $0 }
        NR <= 2 || ($0 != header && $0 != separator)
    ' "${file}" > "${temporary_file}"; then
        mv "${temporary_file}" "${file}"
    else
        # Do not leave a partial temporary file behind and keep the original file as it is.
        rm -f "${temporary_file}"
        return 1
    fi
}
60-
61-
# Creates the overview tables for the first section of the anomaly detection main report.
#
# Runs each summary Cypher query with "--output-markdown-table" and writes its output
# into a Markdown file of the same name inside the Markdown includes directory of the report.
#
# Reads the variables FULL_REPORT_DIRECTORY, MARKDOWN_INCLUDES_DIRECTORY and ANOMALY_DETECTION_SUMMARY_DIR.
anomaly_detection_report_first_section() {
    local includes_directory="${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}"
    local query_name

    mkdir -p "${includes_directory}"

    # Query file name (without extension) and resulting Markdown file name are the same.
    for query_name in "AnomaliesPerAbstractionLayer" "AnomaliesInTotal"; do
        execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/${query_name}.cypher" --output-markdown-table > "${includes_directory}/${query_name}.md"
    done
}
69-
7035
# Aggregates all results in a Markdown report.
7136
#
7237
# Required Parameters:
@@ -167,13 +132,38 @@ anomaly_detection_report() {
167132
# fi
168133
}
169134

135+
# Prints the "front matter" metadata head (delimited by "---" lines) of the anomaly detection report to stdout.
#
# Fields:
#   title         - fixed report title
#   generated     - current date (YYYY-MM-DD)
#   model_version - output of "git describe --tags --always", or "unknown" if git fails
#   dataset       - name of the current working directory
#   authors       - fixed value
anomaly_detection_front_matter_metadata_head() {
    local current_date
    current_date="$(date +'%Y-%m-%d')"

    # "--always" falls back to the abbreviated commit hash when no tag is reachable.
    # If git fails altogether (e.g. not inside a work tree), a placeholder is used
    # instead of an empty value so that a missing tag doesn't abort report generation.
    local nearest_tag
    nearest_tag="$(git describe --tags --always 2>/dev/null)" || nearest_tag="unknown"

    # Last path segment of the current working directory
    local analysis_directory
    analysis_directory="${PWD##*/}"

    echo "---"
    echo "title: \"Anomaly Detection Report\""
    echo "generated: \"${current_date}\""
    echo "model_version: \"${nearest_tag}\""
    echo "dataset: \"${analysis_directory}\""
    echo "authors: [\"JohT/code-graph-analysis-pipeline\"]"
    echo "---"
}
153+
170154
# Finalize the anomaly detection report by taking the main template, applying includes and appending all deep dive reports
171155
anomaly_detection_finalize_report() {
172-
echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Finalizing anomaly detection Markdown report..."
156+
echo "anomalyDetectionSummary: $(date +'%Y-%m-%dT%H:%M:%S%z') Assembling main anomaly detection Markdown report..."
173157

174-
report_include_directory="${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}"
158+
local report_include_directory="${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}"
175159
mkdir -p "${report_include_directory}"
176160

161+
execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesPerAbstractionLayer.cypher" --output-markdown-table > "${report_include_directory}/AnomaliesPerAbstractionLayer.md"
162+
execute_cypher "${ANOMALY_DETECTION_SUMMARY_DIR}/AnomaliesInTotal.cypher" --output-markdown-table > "${report_include_directory}/AnomaliesInTotal.md"
163+
164+
# Write "front matter" metadata section
165+
anomaly_detection_front_matter_metadata_head > "${report_include_directory}/AnomalyDetectionReportFrontMatter.md"
166+
177167
# Concatenate all deep dive reports as Markdown include
178168
rm -rf "${report_include_directory}/AnomalyDetectionDeepDive.md"
179169
for markdown_file in $(find . -type f -name 'report_deep_dive_*.md' | sort); do
@@ -202,10 +192,6 @@ ALGORITHM_NODE="projection_node_label"
202192
ALGORITHM_LANGUAGE="projection_language"
203193
REPORT_NUMBER="report_number"
204194

205-
# -- Overview Report for all code types ------------------------------
206-
207-
anomaly_detection_report_first_section
208-
209195
# -- Detail Reports for each code type -------------------------------
210196

211197
anomaly_detection_report "${REPORT_NUMBER}=1" "${ALGORITHM_NODE}=Artifact" "${ALGORITHM_LANGUAGE}=Java"
Lines changed: 229 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,23 @@
1+
<!-- include:AnomalyDetectionReportFrontMatter.md -->
2+
13
# 📊 Anomaly Detection Report
24

35
## 1. Executive Overview
46

57
This report analyzes structural and dependency anomalies across multiple abstraction levels of the codebase.
68
The goal is to detect potential **software quality, design, and architecture issues** using graph-based features, anomaly detection (Isolation Forest), and SHAP explainability.
79

10+
## 📚 Table of Contents
11+
12+
1. [Executive Overview](#1-executive-overview)
13+
1. [Deep Dives by Abstraction Level](#2-deep-dives-by-abstraction-level)
14+
1. [Plot Interpretation Guide](#3-plot-interpretation-guide)
15+
1. [Taxonomy of Anomaly Archetypes](#4-taxonomy-of-anomaly-archetypes)
16+
1. [Recommendations](#5-recommendations)
17+
1. [Appendix](#6-appendix)
18+
19+
---
20+
821
### 1.1 Anomalies in total
922

1023
<!-- include:AnomaliesInTotal.md -->
@@ -15,45 +28,220 @@ The goal is to detect potential **software quality, design, and architecture iss
1528

1629
## 2. Deep Dives by Abstraction Level
1730

31+
Each abstraction level includes anomaly statistics, SHAP feature importance, archetype distribution, and example anomalies.
32+
1833
<!-- include:AnomalyDetectionDeepDive.md -->
19-
## 3. Taxonomy of Anomaly Archetypes
20-
21-
| Archetype | Feature Profile | Risk for Architecture |
22-
|-----------|----------------|------------------------|
23-
| **Hub** | High degree, low clustering coefficient | Central dependency, fragile hotspot |
24-
| **Bottleneck** | High betweenness, low redundancy | Single point of failure, slows evolution |
25-
| **Outlier** | High cluster distance, small cluster size | Misfit component, unusual dependency pattern |
26-
| **Authority** | High PageRank but low articleRank | Over-relied utility with few reverse connections |
27-
| **Bridge** | Embedding-driven anomaly, cross-cluster | Connects unrelated domains, risky coupling |
28-
29-
---
30-
31-
## 4. Recommendations
32-
33-
* **Refactor hubs:** Break down god classes/utilities into smaller abstractions.
34-
* **Mitigate bottlenecks:** Add redundancy or alternative paths.
35-
* **Investigate outliers:** Validate if they are justified exceptions or design flaws.
36-
* **Enforce cohesion:** Raise clustering coefficient via better modular boundaries.
37-
* **Stabilize authorities:** Encapsulate widely used but locally weak components, reduce over-generalization, and ensure stable APIs.
38-
* **Clarify bridges:** Validate whether cross-cluster connectors are intentional (adapters/facades) or accidental; refactor or relocate responsibilities to preserve modularity.
39-
40-
---
41-
42-
## 5. Appendix
43-
44-
* **Methodology:** Isolation Forest, Random Forest proxy, SHAP explanations.
45-
* **Embedding generation:** Fast Random Projection, PCA (20–35 dims, \~0.9 target variance).
46-
* **Clustering:** HDBSCAN tuned against Leiden communities (golden reference, AMI optimization).
47-
* **Optimization:** Hyperparameter optimization for both Isolation Forest and Random Forest proxy with their F1 score
48-
* **Feature set:**
49-
* Degree (in/out)
50-
* PageRank
51-
* ArticleRank
52-
* Page-to-Article Rank Difference
53-
* Betweenness centrality
54-
* Local Clustering Coefficient
55-
* Cluster Approximate Outlier Score ( = 1.0 - Cluster Probability)
56-
* Cluster Radius Average
57-
* Cluster Distance to Medoid
58-
* Cluster Size
59-
* Node Embedding
34+
35+
## 3. Plot Interpretation Guide
36+
37+
> **Purpose:** Understand each plot type’s diagnostic value.
38+
> **Applies to:** All abstraction levels.
39+
40+
| Plot Type | Best For | Adds | Why It Matters |
41+
| --- | --- | --- | --- |
42+
| **Anomalies Plot** | Seeing distribution of anomalies in clusters | Context of clusters & outliers | Reveals isolation or cluster-based anomalies |
43+
| **SHAP Summary** | Global feature importance | Feature impact direction | Shows what drives anomalies overall |
44+
| **Local SHAP Force** | Explaining a single anomaly | Feature contribution breakdown | Useful for debugging individual outliers |
45+
| **Dependence Plot** | Understanding feature influence | Interaction visualization | Reveals nonlinear feature effects |
46+
| **Cluster Metrics** | Cluster characteristics | Radius, cohesion, noise | Identifies weakly defined or noisy clusters |
47+
48+
## 3. Plot Interpretation Guide (Detailed Reference)
49+
50+
> **Purpose:** Provide a direct mapping between all plots and their analytical meaning.
51+
> **Scope:** Applies to plots for *Java Type*, *Java Package*, and similar abstraction levels.
52+
> **Format:** Each entry includes `Best for`, `Adds`, and `Why`, matching the in-report descriptions.
53+
54+
---
55+
56+
### 📘 Main Plots
57+
58+
| Plot | Description | Best For | Adds | Why |
59+
|------|--------------|----------|------|-----|
60+
| **Anomalies** | 2D visualization of all code units showing clusters and anomalies. | Understanding the overall distribution of anomalies in relation to clusters. | Context of clusters and outliers. | Reveals whether anomalies are isolated or cluster-based, guiding investigation. |
61+
| **Global Feature Importance (SHAP Summary)** | Mean absolute SHAP values ranking global feature impact. | Global understanding of which features drive anomalies. | Direction of impact (color shows feature value). | Explains which metrics consistently influence anomaly detection. |
62+
| **Feature Dependence (Top Important Features)** | Shows how specific feature values affect anomaly score; colored by interacting feature. | Understanding how one feature affects anomaly scores. | Color shows feature interaction or threshold effect. | Helps identify nonlinear relationships and feature interactions. |
63+
64+
---
65+
66+
### 📙 Local Explanation Plots
67+
68+
| Plot | Description | Best For | Adds | Why |
69+
|------|--------------|----------|------|-----|
70+
| **Local SHAP Force Plots (Top Anomalies 1–6)** | Visualizes per-feature contributions to each anomaly’s score relative to baseline. | Explaining *why a specific data point* is anomalous. | Visual breakdown of how each feature contributes to anomaly score. | Enables debugging of individual anomalies through transparent explanation. |
71+
72+
---
73+
74+
### 📗 Cluster-Level Diagnostic Plots
75+
76+
| Plot | Description | Best For | Adds | Why |
77+
|------|--------------|----------|------|-----|
78+
| **Clusters – Overall** | Shows all clusters since they all fit into one plot. | Gaining a holistic view of cluster characteristics in the dataset. | An overall summary of how all clusters are distributed and their key metrics. | Understanding the general structure and properties of clusters can help identify patterns and potential anomalies in the data. |
79+
| **Clusters – Largest Average Radius** | Ranks clusters by mean distance of members from their centroid. | Getting an overview of clusters that are more dispersed. | Identifies clusters with internal variability. | Large average radius suggests less cohesion and potential outliers. |
80+
| **Clusters – Largest Max Radius** | Shows clusters with the farthest outlying member. | Identifying clusters that have members farthest from cluster center. | Highlights clusters containing extreme outliers. | Indicates clusters that may contain hidden anomalies. |
81+
| **Clusters – Largest Size** | Displays cluster membership counts. | Understanding which clusters contain the most code units. | Provides sense of frequency of code structures. | Large clusters may represent common design patterns; small clusters are specialized. |
82+
| **Cluster Probabilities** | Distribution of HDBSCAN membership probabilities. | Detecting code units that don’t strongly belong to any cluster. | Measures how well-defined clusters are. | Highlights noisy or weakly defined clusters. |
83+
84+
---
85+
86+
### 📒 Cluster Noise & Bridge Diagnostics
87+
88+
| Plot | Description | Best For | Adds | Why |
89+
|------|--------------|----------|------|-----|
90+
| **Cluster Noise – Highly Central and Popular** | Central nodes that don’t fit any cluster. | Detecting code units that are highly connected but anomalous. | Reveals influential but misfit nodes. | Such nodes may be key but unstable integration points. |
91+
| **Cluster Noise – Poorly Integrated Bridges** | Nodes connecting clusters but weakly integrated. | Detecting code units that bridge modules unusually. | Identifies cross-cutting or leaking dependencies. | May reveal architectural boundary violations. |
92+
| **Cluster Noise – Role Inverted Bridges** | Bridges with reversed structural roles compared to expected topology. | Detecting code units connecting clusters in unexpected ways. | Highlights anomalous coupling roles. | Indicates architectural inversion or misuse of interfaces. |
93+
94+
---
95+
96+
### 📙 Feature Distribution & Relationship Plots
97+
98+
| Plot | Description | Best For | Adds | Why |
99+
|------|--------------|----------|------|-----|
100+
| **Betweenness Centrality Distribution** | Histogram of betweenness values. | Identifying code units that act as structural bridges. | Insight into flow of dependency control. | Detects potential bottlenecks or single points of failure. |
101+
| **Clustering Coefficient Distribution** | Histogram of local clustering coefficients. | Identifying modularity and local cohesion. | Insight into how tightly code units cluster. | Reveals how cohesive or isolated different regions of the graph are. |
102+
| **PageRank – ArticleRank Difference Distribution** | Distribution of `PageRank - ArticleRank`. | Identifying influential nodes beyond local connectivity. | Shows imbalance between influence and popularity. | Highlights components with disproportionate architectural impact. |
103+
| **Clustering Coefficient vs PageRank** | Scatterplot comparing local clustering to global influence. | Identifying relationships between cohesion and centrality. | Visualizes trade-offs between modularity and reach. | Helps spot code units that are both locally and globally critical. |
104+
105+
---
106+
107+
### 📔 Summary Categories
108+
109+
| Category | Included Plots | Typical Usage |
110+
|-----------|----------------|----------------|
111+
| **Main Diagnostic** | Anomalies, Global SHAP, Feature Dependence | High-level anomaly review |
112+
| **Local Explanation** | Local SHAP Force Plots | Case-by-case anomaly debugging |
113+
| **Cluster Diagnostics** | Cluster Radius / Size / Probability | Assess cluster cohesion and outliers |
114+
| **Cluster Noise Analysis** | Cluster Noise (3 types) | Identify special structural anomalies |
115+
| **Feature Distributions** | Betweenness, Clustering, Rank Difference | Assess feature-based structure patterns |
116+
| **Feature Relationships** | Clustering vs PageRank | Evaluate global vs local influence balance |
117+
118+
---
119+
120+
### 💡 Reading Guidance
121+
122+
- **Color Conventions:**
123+
Red = anomalous, Green = typical, Light grey = noise, Pale colors = clusters.
124+
- **Scales:**
125+
SHAP values are normalized (mean absolute); graph metrics standardized by z-score.
126+
- **How to Use:**
127+
1. Start with *Main Diagnostic* plots to identify anomalies and drivers.
128+
2. Use *Local SHAP* for detailed case analysis.
129+
3. Check *Cluster Diagnostics* and *Noise Plots* to verify grouping quality.
130+
4. Use *Feature Distributions* to contextualize metrics.
131+
5. Cross-reference *Feature Relationships* for architectural interpretation.
132+
133+
---
134+
135+
### 📄 Structured Form (YAML Summary)
136+
137+
Machine-readable mapping of the plots described above:
138+
139+
```yaml
140+
plots:
141+
main:
142+
- name: Anomalies
143+
purpose: Distribution of anomalies and clusters
144+
- name: Global Feature Importance (SHAP)
145+
purpose: Global feature ranking
146+
- name: Feature Dependence
147+
purpose: Feature–score relationship
148+
local:
149+
- name: Local SHAP Force Plots
150+
purpose: Local explanations for top anomalies
151+
cluster:
152+
- name: Clusters Largest Average Radius
153+
purpose: Identify dispersed clusters
154+
- name: Clusters Largest Max Radius
155+
purpose: Identify extreme outlier clusters
156+
- name: Clusters Largest Size
157+
purpose: Identify dominant cluster types
158+
- name: Cluster Probabilities
159+
purpose: Assess cluster definition strength
160+
cluster_noise:
161+
- name: Cluster Noise – Highly Central and Popular
162+
purpose: Central anomalies without cluster fit
163+
- name: Cluster Noise – Poorly Integrated Bridges
164+
purpose: Weakly integrated bridges
165+
- name: Cluster Noise – Role Inverted Bridges
166+
purpose: Inverted bridge roles
167+
feature_distributions:
168+
- name: Betweenness Centrality Distribution
169+
purpose: Bridge and bottleneck detection
170+
- name: Clustering Coefficient Distribution
171+
purpose: Cohesion and modularity measurement
172+
- name: PageRank – ArticleRank Difference Distribution
173+
purpose: Influence vs popularity analysis
174+
feature_relationships:
175+
- name: Clustering Coefficient vs PageRank
176+
purpose: Local vs global influence comparison
177+
```
178+
179+
## 4. Taxonomy of Anomaly Archetypes
180+
181+
| Archetype | Feature Profile | Architectural Risk |
182+
|-----------|-----------------|--------------------|
183+
| **Hub** | High degree, low clustering coefficient | Central dependency; fragile hotspot |
184+
| **Bottleneck** | High betweenness, low redundancy | Single point of failure; slows evolution |
185+
| **Outlier** | High cluster distance, small cluster size | Misfit or irregular dependency pattern |
186+
| **Authority** | High PageRank, low ArticleRank | Over-relied utility; low local stability |
187+
| **Bridge** | Cross-cluster connection | Risky coupling; weak modular boundaries |
188+
189+
**Structured form (for LLM parsing):**
190+
191+
```yaml
192+
archetypes:
193+
- name: Hub
194+
profile: High degree, low clustering coefficient
195+
risk: Central dependency, fragile hotspot
196+
- name: Bottleneck
197+
profile: High betweenness, low redundancy
198+
risk: Single point of failure
199+
- name: Outlier
200+
profile: High cluster distance, small cluster size
201+
risk: Misfit component
202+
- name: Authority
203+
profile: High PageRank, low ArticleRank
204+
risk: Over-relied utility
205+
- name: Bridge
206+
profile: Cross-cluster connector
207+
risk: Risky coupling
208+
```
209+
210+
---
211+
212+
## 5. Recommendations
213+
214+
* **Refactor hubs:** Decompose large or over-connected utilities.
215+
* **Mitigate bottlenecks:** Introduce redundancy or alternative communication paths.
216+
* **Investigate outliers:** Determine if anomalies are justified exceptions.
217+
* **Raise cohesion:** Increase local clustering by improving modular boundaries.
218+
* **Stabilize authorities:** Encapsulate frequently used but fragile components.
219+
* **Validate bridges:** Confirm cross-cluster connectors are intentional and safe.
220+
221+
---
222+
223+
## 6. Appendix
224+
225+
### 6.1 Methodology Overview
226+
227+
1. Build dependency graph (types, packages, artifacts).
228+
1. Compute graph metrics: degree, PageRank, betweenness, clustering coefficient, etc.
229+
1. Generate embeddings via Fast Random Projection.
230+
1. Reduce embeddings with PCA (retain 90% variance).
231+
1. Train Isolation Forest for anomaly detection.
232+
1. Explain results using SHAP (via Random Forest proxy).
233+
1. Cluster anomalies via HDBSCAN, tuned with Leiden reference communities (AMI score).
234+
1. Optimize hyperparameters for both the Isolation Forest and the Random Forest proxy using their F1 scores.
235+
236+
### 6.2 Feature Set
237+
238+
* Degree (in/out)
239+
* PageRank
240+
* ArticleRank
241+
* Page-to-Article Rank Difference
242+
* Betweenness Centrality
243+
* Local Clustering Coefficient
244+
* Cluster Outlier Score (1.0 - cluster probability)
245+
* Cluster Radius (avg, max)
246+
* Cluster Size
247+
* Node Embedding (PCA 20–35 dims)

domains/anomaly-detection/summary/report_deep_dive.template.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818

1919
<!-- include:DeepDiveTopAnomalies.md|report_no_data_info.template.md -->
2020

21-
#### Plots
21+
#### Visualizations
22+
23+
See [Plot Interpretation Guide](#3-plot-interpretation-guide) on how to read the plots in detail.
2224

2325
<!-- include:report_deep_dive_anomaly_plots.md|empty.md -->
2426

0 commit comments

Comments
 (0)