From 6afd8ec8078e8f6f0b093ec324081ea6be399c47 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sun, 2 Nov 2025 21:44:41 +0100 Subject: [PATCH 1/2] Add Treemap visualization to anomaly detection --- .../anomalyDetectionPython.sh | 75 +- .../AnomalyDetectionTreeMapExploration.ipynb | 814 ++++++++++++++++++ .../queries/AnomalyDetectionFiles.cypher | 34 + .../treemapVisualizations.py | 645 ++++++++++++++ 4 files changed, 1561 insertions(+), 7 deletions(-) create mode 100644 domains/anomaly-detection/explore/AnomalyDetectionTreeMapExploration.ipynb create mode 100644 domains/anomaly-detection/queries/AnomalyDetectionFiles.cypher create mode 100755 domains/anomaly-detection/treemapVisualizations.py diff --git a/domains/anomaly-detection/anomalyDetectionPython.sh b/domains/anomaly-detection/anomalyDetectionPython.sh index 80c3c1109..f6f661814 100755 --- a/domains/anomaly-detection/anomalyDetectionPython.sh +++ b/domains/anomaly-detection/anomalyDetectionPython.sh @@ -15,12 +15,14 @@ set -o errexit -o pipefail # Overrideable Constants (defaults also defined in sub scripts) REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"} +MARKDOWN_INCLUDES_DIRECTORY=${MARKDOWN_INCLUDES_DIRECTORY:-"includes"} # Subdirectory that contains Markdown files to be included by the Markdown template for the report. + ## Get this "scripts/reports" directory if not already set # Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. # CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. # This way non-standard tools like readlink aren't needed. ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)} -echo "anomalyDetectionPipeline: ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR}" +echo "anomalyDetectionPython: ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR}" # Get the "scripts" directory by taking the path of this script and going one directory up. SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/../../scripts"} # Repository directory containing the shell scripts # Get the "cypher" query directory for gathering features. @@ -49,7 +51,7 @@ while [[ $# -gt 0 ]]; do verboseMode="--verbose" ;; *) - echo -e "${COLOR_ERROR}anomalyDetectionPipeline: Error: Unknown option: ${key}${COLOR_DEFAULT}" >&2 + echo -e "${COLOR_ERROR}anomalyDetectionPython: Error: Unknown option: ${key}${COLOR_DEFAULT}" >&2 usage ;; esac @@ -72,10 +74,10 @@ is_sufficient_data_available() { query_result=$( execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionNodeCount.cypher" "${@}" ) node_count=$(get_csv_column_value "${query_result}" "node_count") if [ "${node_count}" -lt 15 ]; then - echo "anomalyDetectionPipeline: Warning: Skipping anomaly detection. Only ${node_count} ${language} ${nodeLabel} nodes. At least 15 required." + echo "anomalyDetectionPython: Warning: Skipping anomaly detection. Only ${node_count} ${language} ${nodeLabel} nodes. At least 15 required." false else - echo "anomalyDetectionPipeline: Info: Running anomaly detection with ${node_count} ${language} ${nodeLabel} nodes." + echo "anomalyDetectionPython: Info: Running anomaly detection with ${node_count} ${language} ${nodeLabel} nodes." 
true fi } @@ -92,7 +94,7 @@ is_sufficient_data_available() { anomaly_detection_features() { local nodeLabel nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" ) - echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Collecting features for ${nodeLabel} nodes..." + echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Collecting features for ${nodeLabel} nodes..." # Determine the Betweenness centrality (with the directed graph projection) if not already done execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Betweenness-Exists.cypher" \ @@ -127,7 +129,7 @@ anomaly_detection_using_python() { local language language=$( extractQueryParameter "projection_language" "${@}" ) - echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Python scripts for ${language} ${nodeLabel} nodes..." + echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Python scripts for ${language} ${nodeLabel} nodes..." # Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...) local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}" @@ -152,6 +154,8 @@ anomaly_detection_using_python() { # Required Parameters: # - projection_node_label=... # Label of the nodes that will be used for the projection. Example: "Package" +# - projection_language=... +# Name of the associated programming language. Examples: "Java", "Typescript" anomaly_detection_labels() { local nodeLabel nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" ) @@ -177,12 +181,63 @@ anomaly_detection_labels() { # Label of the nodes that will be used for the projection. Example: "Package" # - projection_weight_property=... # Name of the node property that contains the dependency weight. Example: "weight" +# - projection_language=... +# Name of the associated programming language. Examples: "Java", "Typescript" anomaly_detection_python_reports() { time anomaly_detection_features "${@}" anomaly_detection_using_python "${@}" time anomaly_detection_labels "${@}" } +# Creates the markdown file (to be included in the main summary) +# that contains the references to all treemap charts. +anomaly_detection_treemap_charts_markdown_reference() { + + echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Starting treemap charts markdown reference generation..." 
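+    # The resulting Markdown file is written into the "includes" subdirectory (MARKDOWN_INCLUDES_DIRECTORY above)
+    # so that the Markdown template of the report can include it and embed all generated treemap SVG charts.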
+
+    local detail_report_include_directory="${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}"
+    mkdir -p "${detail_report_include_directory}"
+
+    local markdown_reference_file_name="TreemapChartsReference.md"
+    local markdown_reference_file="${detail_report_include_directory}/${markdown_reference_file_name}"
+
+    # Write the Markdown section title for the references
+    {
+        echo "#### Treemap Charts"
+    } > "${markdown_reference_file}"
+
+    # Find all treemap chart SVG files and add them to the Markdown reference file
+    find "${FULL_REPORT_DIRECTORY}" -type f -name "*Treemap*.svg" | sort | while read -r chart_file; do
+        chart_filename=$(basename -- "${chart_file}")
+        chart_filename_without_extension="${chart_filename%.*}" # Remove file extension
+        {
+            echo ""
+            echo "![${chart_filename_without_extension}](./${chart_filename})"
+        } >> "${markdown_reference_file}"
+    done
+
+    # Add a horizontal rule at the end
+    {
+        echo ""
+        echo "---"
+    } >> "${markdown_reference_file}"
+
+    echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Finished treemap charts markdown reference generation."
+}
+
+# Visualize results with treemap charts.
+#
+# Required Parameters:
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
+anomaly_detection_treemap_charts() {
+    local language
+    language=$( extractQueryParameter "projection_language" "${@}" )
+
+    echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Visualizing ${language} results..."
+    time "${ANOMALY_DETECTION_SCRIPT_DIR}/treemapVisualizations.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode}
+}
+
 # Create report directory
 REPORT_NAME="anomaly-detection"
 FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
@@ -229,6 +284,7 @@ if is_sufficient_data_available "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=we
   if createUndirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection"; then
     createDirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection-directed"
     anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${ALGORITHM_LANGUAGE}=Java" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
+    anomaly_detection_treemap_charts "${ALGORITHM_LANGUAGE}=Java"
   fi
 fi
@@ -238,12 +294,17 @@ if is_sufficient_data_available "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=
   if createUndirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" "${PROJECTION_LANGUAGE}=Typescript"; then
     createDirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding-directed" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" "${PROJECTION_LANGUAGE}=Typescript"
     anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${ALGORITHM_LANGUAGE}=Typescript" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
+    anomaly_detection_treemap_charts "${ALGORITHM_LANGUAGE}=Typescript"
   fi
 fi
+
+# -- Markdown summary ---------------------------
+
+anomaly_detection_treemap_charts_markdown_reference
+
 # ---------------------------------------------------------------
 # Clean-up after report generation. Empty reports will be deleted.
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}" -echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished." \ No newline at end of file +echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished." \ No newline at end of file diff --git a/domains/anomaly-detection/explore/AnomalyDetectionTreeMapExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionTreeMapExploration.ipynb new file mode 100644 index 000000000..79f073eae --- /dev/null +++ b/domains/anomaly-detection/explore/AnomalyDetectionTreeMapExploration.ipynb @@ -0,0 +1,814 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "2f0eabc4", + "metadata": {}, + "source": [ + "# Anomaly Detection - TreeMap Exploration\n", + "\n", + "This notebook demonstrates how to visualize anomalies with Treemap charts for static code analysis data using jQAssistant and Neo4j. \n", + "\n", + "
\n", + "\n", + "### References\n", + "- [jqassistant](https://jqassistant.org)\n", + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)\n", + "- [Plotly Treemap Chart](https://plotly.com/python/treemaps/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4191f259", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import typing\n", + "from typing import List, Tuple\n", + "\n", + "from IPython.display import display\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from plotly import graph_objects as plotly_graph_objects\n", + "import plotly.colors as plotly_colors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0676813", + "metadata": {}, + "outputs": [], + "source": [ + "#The following cell uses the build-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n", + "#This is especially needed for PDF export of tables with multiple columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebac1bb9", + "metadata": {}, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9bc2241", + "metadata": {}, + "outputs": [], + "source": [ + "# Pandas DataFrame Display Configuration\n", + "pd.set_option('display.max_colwidth', 500)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8ef41ff", + "metadata": {}, + "outputs": [], + "source": [ + "from sys import version as python_version\n", + "print('Python version: {}'.format(python_version))\n", + "\n", + "from numpy import __version__ as numpy_version\n", + "print('numpy version: {}'.format(numpy_version))\n", + "\n", + "from pandas import __version__ as pandas_version\n", + "print('pandas version: {}'.format(pandas_version))\n", + "\n", + "from neo4j import __version__ as neo4j_version\n", + "print('neo4j version: {}'.format(neo4j_version))\n", + "\n", + "from plotly import version as plotly_version\n", + "print('plotly version: {}'.format(plotly_version))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c5dab37", + "metadata": {}, + "outputs": [], + "source": [ + "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell \n", + "# before starting jupyter notebook to provide the password for the user \"neo4j\". 
\n", + "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", + "from neo4j import GraphDatabase, Driver\n", + "\n", + "def get_graph_database_driver() -> Driver:\n", + " driver = GraphDatabase.driver(\n", + " uri=\"bolt://localhost:7687\",\n", + " auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")) # type: ignore\n", + " )\n", + " driver.verify_connectivity()\n", + " return driver" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1db254b", + "metadata": {}, + "outputs": [], + "source": [ + "def query_cypher_to_data_frame(query: typing.LiteralString, parameters: typing.Optional[typing.Dict[str, typing.Any]] = None):\n", + " records, summary, keys = driver.execute_query(query, parameters_=parameters)\n", + " return pd.DataFrame([record.values() for record in records], columns=keys)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cf0993d", + "metadata": {}, + "outputs": [], + "source": [ + "# Base settings for Plotly Treemap\n", + "\n", + "plotly_main_layout_base_settings = {\n", + " \"margin\": {\"t\": 50, \"l\": 15, \"r\": 15, \"b\": 15},\n", + "}\n", + "plotly_treemap_layout_base_settings = dict(\n", + " **plotly_main_layout_base_settings\n", + ")\n", + "plotly_bar_layout_base_settings = dict(\n", + " **plotly_main_layout_base_settings\n", + ")\n", + "plotly_treemap_figure_show_settings = {\n", + " \"renderer\": None,\n", + " \"width\": 1080,\n", + " \"height\": 1080,\n", + "}\n", + "\n", + "plotly_treemap_marker_base_style = {\n", + " \"cornerradius\": 5,\n", + "}\n", + "\n", + "# Hot_r, ice_r, Viridis_r, speed_r, haline_r, thermal_r, Plasma_r, solar_r, Electric_r, Blackbody_r, deep_r, Turbo_r, amp, Reds, Blackbody_r, RdGy_r, RdBu_r\n", + "plotly_treemap_marker_base_color_scale = dict(\n", + " **plotly_treemap_marker_base_style,\n", + " colorscale='Hot_r',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30a0b4e6", + "metadata": {}, + "outputs": [], + "source": [ + "def create_treemap_settings(data_frame: pd.DataFrame, element_path_column: str = 'elementPath', element_name_column: str = \"elementName\") -> plotly_graph_objects.Treemap:\n", + " \"\"\"\n", + " Creates a Plotly Treemap with the given settings and data frame.\n", + " data_frame : pd.DataFrame : The input data frame\n", + " return :plotly_graph_objects.Treemap : The prepared Plotly Treemap\n", + " \"\"\"\n", + " return plotly_graph_objects.Treemap(\n", + " labels=data_frame[element_name_column],\n", + " parents=data_frame['directoryParentPath'],\n", + " ids=data_frame[element_path_column],\n", + " customdata=data_frame[['fileCount', 'absoluteAnomalyScore', 'normalizedAuthorityRank', 'normalizedBottleneckRank', 'normalizedBridgeRank', 'normalizedHubRank', 'normalizedOutlierRank', 'elementPath']],\n", + " hovertemplate='%{label}
<br>Highlighted anomalies: %{customdata[0]}<br>Anomaly Score: %{customdata[1]:.4f}<br>Authority: %{customdata[2]}, Bottleneck: %{customdata[3]}, Bridge: %{customdata[4]}, Hub: %{customdata[5]}, Outlier: %{customdata[6]}<br>
Path: %{customdata[7]}',\n", + " maxdepth=-1,\n", + " root_color=\"lightgrey\",\n", + " marker=dict(**plotly_treemap_marker_base_style),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc84a742", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_last_path_file_extension(file_path_elements: list) -> list:\n", + " \"\"\"\n", + " Removes the file extension of the last element of the file path so that only the file name remains.\n", + " file_path_elements : list : The list of file path elements where the last one contains the file name with extension\n", + " return : list : The list of the directories + the file name without extension as last element.\n", + " \"\"\"\n", + " if not file_path_elements:\n", + " return ['']\n", + " if len(file_path_elements) == 1:\n", + " return [os.path.splitext(file_path_elements[0])[0]]\n", + " return file_path_elements[:-1] + [os.path.splitext(file_path_elements[-1])[0]]\n", + "\n", + "def join_path_elements(file_path_elements: list) -> list:\n", + " \"\"\"\n", + " Joins the file path elements (and removes the file extension).\n", + " file_path_elements : list : The list of levels to convert\n", + " return : list : The list of directories\n", + " \"\"\"\n", + " prepared_path_elements = remove_last_path_file_extension(file_path_elements)\n", + " return ['/'.join(prepared_path_elements[:i+1]) for i in range(len(prepared_path_elements))]\n", + "\n", + "def add_element_path_column(input_dataframe: pd.DataFrame, file_path_column: str, element_path_column: str = 'elementPath'):\n", + " \"\"\"\n", + " Adds a directory column to the input DataFrame based on the file path column.\n", + " input_dataframe : pd.DataFrame : The input DataFrame\n", + " file_path_column : str : The name of the file path column\n", + " directory_column : str : The name of the directory column to be added\n", + " return : pd.DataFrame : The DataFrame with added directory column\n", + " \"\"\"\n", + " if element_path_column in input_dataframe.columns:\n", + " return input_dataframe # Column already exists\n", + " \n", + " input_dataframe.insert(0, element_path_column, input_dataframe[file_path_column].str.split('/').apply(join_path_elements))\n", + " input_dataframe = input_dataframe.explode(element_path_column)\n", + " return input_dataframe\n", + "\n", + "def add_element_name_column(input_dataframe: pd.DataFrame, element_path_column: str = 'elementPath', element_name_column: str = 'elementName'):\n", + " \"\"\"\n", + " Adds a directory name column to the input DataFrame based on the directory column.\n", + " input_dataframe : pd.DataFrame : The input DataFrame\n", + " directory_column : str : The name of the directory column\n", + " directory_name_column : str : The name of the directory name column to be added\n", + " return : pd.DataFrame : The DataFrame with added directory name column\n", + " \"\"\"\n", + " if element_name_column in input_dataframe.columns:\n", + " return input_dataframe # Column already exists\n", + " \n", + " splitted_directories = input_dataframe[element_path_column].str.rsplit('/', n=1)\n", + " input_dataframe.insert(1, element_name_column, splitted_directories.apply(lambda x: (x[-1])))\n", + " return input_dataframe\n", + "\n", + "def add_parent_directory_column(input_dataframe: pd.DataFrame, element_path_column: str = 'elementPath', directory_parent_column: str = 'directoryParentPath'):\n", + " \"\"\"\n", + " Adds a directory parent column to the input DataFrame based on the directory column.\n", + " input_dataframe : 
pd.DataFrame : The input DataFrame\n", + " directory_column : str : The name of the directory column\n", + " directory_parent_column : str : The name of the directory parent column to be added\n", + " return : pd.DataFrame : The DataFrame with added directory parent column\n", + " \"\"\"\n", + " if directory_parent_column in input_dataframe.columns:\n", + " return input_dataframe # Column already exists\n", + " \n", + " # Remove last path element from directory_column to get the directory_parent_column\n", + " splitted_directories = input_dataframe[element_path_column].str.rsplit('/', n=1)\n", + " input_dataframe.insert(1, directory_parent_column, splitted_directories.apply(lambda x: (x[0])))\n", + " \n", + " # Clear parent (set to empty string) when it equal to the directory\n", + " input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[element_path_column], directory_parent_column] = ''\n", + " return input_dataframe\n", + "\n", + "def count_unique_aggregated_values(values: pd.Series):\n", + " \"\"\"\n", + " Return the number of unique values from an array of array of strings.\n", + " Meant to be used as an aggregation function for dataframe grouping.\n", + " values : Series : The pandas Series of values\n", + " return : int : The number of files\n", + " \"\"\"\n", + " return len(np.unique(np.concatenate(values.to_list())))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04166d63", + "metadata": {}, + "outputs": [], + "source": [ + "Archetypes = typing.Literal[\"Authority\", \"Bottleneck\", \"Bridge\", \"Hub\", \"Outlier\"]\n", + "archetype_names: List[Archetypes] = [\"Authority\", \"Bottleneck\", \"Bridge\", \"Hub\", \"Outlier\"]\n", + "\n", + "def get_archetype_column_name(archetype: Archetypes) -> str:\n", + " \"\"\"\n", + " Returns the column name for the given archetype.\n", + " archetype : Archetypes : The archetype name\n", + " return : str : The column name for the given archetype\n", + " \"\"\"\n", + " return f\"normalized{archetype}Rank\"\n", + "\n", + "def get_archetype_index(archetype: Archetypes) -> int:\n", + " \"\"\"\n", + " Returns the index of the given archetype.\n", + " archetype : Archetypes : The archetype name\n", + " return : int : The index of the given archetype\n", + " \"\"\"\n", + " return archetype_names.index(archetype)\n", + "\n", + "archetype_columns = [get_archetype_column_name(name) for name in archetype_names]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e4caf24", + "metadata": {}, + "outputs": [], + "source": [ + "def query_data() -> pd.DataFrame:\n", + " query: typing.LiteralString = \"\"\"\n", + " MATCH (anomalyScoreStats:File&!Directory&!Archive)\n", + " WHERE anomalyScoreStats.anomalyScore < 0\n", + " ORDER BY anomalyScoreStats.anomalyScore ASCENDING\n", + " LIMIT 150 // n largest negative anomaly scores as threshold\n", + " WITH collect(anomalyScoreStats.anomalyScore)[-1] AS anomalyScoreThreshold\n", + " MATCH (anomalyRankStats:File&!Directory&!Archive)\n", + " WITH anomalyScoreThreshold\n", + " ,max(anomalyRankStats.anomalyAuthorityRank) AS maxAnomalyAuthorityRank\n", + " ,max(anomalyRankStats.anomalyBottleneckRank) AS maxAnomalyBottleneckRank\n", + " ,max(anomalyRankStats.anomalyBridgeRank) AS maxAnomalyBridgeRank\n", + " ,max(anomalyRankStats.anomalyHubRank) AS maxAnomalyHubRank\n", + " ,max(anomalyRankStats.anomalyOutlierRank) AS maxAnomalyOutlierRank\n", + " MATCH (anomalous:File&!Directory&!Archive)\n", + " WHERE (anomalous.anomalyScore < anomalyScoreThreshold\n", 
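+    "    // ...or is explicitly flagged with at least one anomaly archetype rank:\n",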
+ " OR anomalous.anomalyHubRank IS NOT NULL\n", + " OR anomalous.anomalyAuthorityRank IS NOT NULL\n", + " OR anomalous.anomalyBottleneckRank IS NOT NULL\n", + " OR anomalous.anomalyOutlierRank IS NOT NULL\n", + " OR anomalous.anomalyBridgeRank IS NOT NULL)\n", + " OPTIONAL MATCH (project:Artifact|Project)-[:CONTAINS]->(anomalous)\n", + " WITH *\n", + " ,coalesce(project.name + '/', '') AS projectName\n", + " ,coalesce(anomalous.fileName, anomalous.relativePath) AS fileName\n", + " RETURN replace(projectName + fileName, '//', '/') AS filePath\n", + " ,CASE WHEN anomalous.anomalyScore < 0 THEN abs(anomalous.anomalyScore) ELSE 0 END AS absoluteAnomalyScore\n", + " ,coalesce(toFloat(anomalous.anomalyAuthorityRank) / maxAnomalyAuthorityRank, 0) AS normalizedAuthorityRank\n", + " ,coalesce(toFloat(anomalous.anomalyBottleneckRank) / maxAnomalyBottleneckRank, 0) AS normalizedBottleneckRank\n", + " ,coalesce(toFloat(anomalous.anomalyBridgeRank) / maxAnomalyBridgeRank, 0) AS normalizedBridgeRank\n", + " ,coalesce(toFloat(anomalous.anomalyHubRank / maxAnomalyHubRank), 0) AS normalizedHubRank\n", + " ,coalesce(toFloat(anomalous.anomalyOutlierRank) / maxAnomalyOutlierRank, 0) AS normalizedOutlierRank\n", + " ORDER BY filePath ASCENDING\n", + " \"\"\"\n", + " return query_cypher_to_data_frame(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01e51a6a", + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_data_for_treemap(data: pd.DataFrame, debug: bool = False) -> pd.DataFrame:\n", + " if debug:\n", + " display(\"1. query result ---------------------\")\n", + " display(data)\n", + "\n", + " # 3. Add multiple rows for each file path containing all its directories paths in the new column 'elementPath'\n", + " data = add_element_path_column(data, 'filePath', 'elementPath')\n", + "\n", + " if debug:\n", + " display(\"3. added elementPath --------------\")\n", + " display(data)\n", + "\n", + " # Group the files by their directory and count the number of files of each directory (across all levels).\n", + " common_named_aggregation = dict(\n", + " absoluteAnomalyScore=pd.NamedAgg(column=\"absoluteAnomalyScore\", aggfunc=\"mean\"),\n", + " normalizedAuthorityRank=pd.NamedAgg(column=\"normalizedAuthorityRank\", aggfunc=\"max\"),\n", + " normalizedBottleneckRank=pd.NamedAgg(column=\"normalizedBottleneckRank\", aggfunc=\"max\"),\n", + " normalizedBridgeRank=pd.NamedAgg(column=\"normalizedBridgeRank\", aggfunc=\"max\"),\n", + " normalizedHubRank=pd.NamedAgg(column=\"normalizedHubRank\", aggfunc=\"max\"),\n", + " normalizedOutlierRank=pd.NamedAgg(column=\"normalizedOutlierRank\", aggfunc=\"max\"),\n", + " )\n", + "\n", + " data = data.groupby(['elementPath']).aggregate(\n", + " filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n", + " firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n", + " maxAnomalyScore=pd.NamedAgg(column=\"absoluteAnomalyScore\", aggfunc=\"max\"),\n", + " **common_named_aggregation\n", + " )\n", + "\n", + " # Sort the grouped and aggregated entries by the name of the directory ascending and the anomaly score descending.\n", + " # The author with the most commits will then be listed first for each directory.\n", + " data = data.sort_values(by=['elementPath', 'absoluteAnomalyScore'], ascending=[True, False])\n", + " data = data.reset_index()\n", + "\n", + " if debug:\n", + " display(\"4. 
grouped by elementPath --------------\")\n",
+    "        display(data)\n",
+    "\n",
+    "    # Group the entries again now only by their directory path to get the aggregated number of anomalies and ranks.\n",
+    "    data = data.groupby('elementPath').aggregate(\n",
+    "        fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=count_unique_aggregated_values),\n",
+    "        firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
+    "        maxAnomalyScore=pd.NamedAgg(column=\"maxAnomalyScore\", aggfunc=\"max\"),\n",
+    "        **common_named_aggregation\n",
+    "    )\n",
+    "    data = data.reset_index()\n",
+    "\n",
+    "    if debug:\n",
+    "        display(\"4. grouped by directory path --------------\")\n",
+    "        display(data)\n",
+    "\n",
+    "    # Add the name of the directory (last '/' separated element) and the parent directory path to the table.\n",
+    "    data = add_element_name_column(data, 'elementPath', 'elementName')\n",
+    "    data = add_parent_directory_column(data, 'elementPath', 'directoryParentPath')\n",
+    "\n",
+    "    if debug:\n",
+    "        display(\"5. added directory and parent name --------------\")\n",
+    "        display(data)\n",
+    "\n",
+    "    # Finally, group by all columns except for the directory name, parent and path (first 3 columns) and pick the longest (max) directory path in case there are multiple.\n",
+    "    all_column_names_except_for_the_directory_path = data.columns.to_list()[3:]\n",
+    "    data = data.groupby(all_column_names_except_for_the_directory_path).aggregate(\n",
+    "        elementName=pd.NamedAgg(column=\"elementName\", aggfunc=lambda names: '/'.join(names)),\n",
+    "        directoryParentPath=pd.NamedAgg(column=\"directoryParentPath\", aggfunc=\"first\"),\n",
+    "        elementPath=pd.NamedAgg(column=\"elementPath\", aggfunc=\"last\"),\n",
+    "    )\n",
+    "\n",
+    "    # Reorder the column positions so that the directory path is again the first column.\n",
+    "    all_column_names_with_the_directory_path_first = ['elementPath', 'directoryParentPath', 'elementName'] + all_column_names_except_for_the_directory_path\n",
+    "    data = data.reset_index()[all_column_names_with_the_directory_path_first]\n",
+    "\n",
+    "    if debug:\n",
+    "        display(\"6. final grouping --------------\")\n",
+    "        display(data)\n",
+    "        display(\"Statistics --------------\")\n",
+    "        display(data.describe())\n",
+    "\n",
+    "    return data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0c68aa20",
+   "metadata": {},
+   "source": [
+    "## 1. 
Anomalies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5222a25", + "metadata": {}, + "outputs": [], + "source": [ + "driver = get_graph_database_driver()\n", + "anomaly_file_paths = query_data()\n", + "anomaly_file_paths = prepare_data_for_treemap(anomaly_file_paths)\n", + "display(anomaly_file_paths)" + ] + }, + { + "cell_type": "markdown", + "id": "daa3a949", + "metadata": {}, + "source": [ + "### 1.1 Average anomaly score per file directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0cd2237", + "metadata": {}, + "outputs": [], + "source": [ + "data_to_display = anomaly_file_paths.copy()\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_settings(data_to_display),\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_color_scale,\n", + " colors=data_to_display['absoluteAnomalyScore'], \n", + " colorbar={\"title\": \"score\"},\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings, # type: ignore\n", + " title='Average anomaly score per directory',\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "24341675", + "metadata": {}, + "source": [ + "### 1.2 Overview of all anomaly archetypes per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b773269c", + "metadata": {}, + "outputs": [], + "source": [ + "def mutual_exclusive_ranks(data: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " Modifies the input data frame to ensure that only one archetype rank is non-zero per row.\n", + " The archetype with the highest normalized rank is retained, and others are set to zero.\n", + " data : pd.DataFrame : The input data frame\n", + " return : pd.DataFrame : The modified data frame with mutual exclusive ranks\n", + " \"\"\"\n", + " modified_data = data.copy()\n", + " \n", + " for dataframe_index, row in modified_data.iterrows():\n", + " index = typing.cast(int, dataframe_index)\n", + " max_rank_value = 0\n", + " max_rank_column = None\n", + " \n", + " for column in archetype_columns:\n", + " if row[column] > max_rank_value:\n", + " max_rank_value = row[column]\n", + " max_rank_column = column\n", + " \n", + " for column in archetype_columns:\n", + " if column != max_rank_column:\n", + " modified_data.at[index, column] = 0\n", + " \n", + " return modified_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ef8e76a", + "metadata": {}, + "outputs": [], + "source": [ + "def interpolate_color(low: Tuple[int, int, int], high: Tuple[int, int, int], normalized_value: float) -> str:\n", + " \"\"\"Linear interpolation between two RGB tuples, returns rgba string.\"\"\"\n", + " \n", + " def linear_interpolation_of_color_component(color_component: int) -> int:\n", + " return int(low[color_component] + (high[color_component] - low[color_component]) * normalized_value)\n", + " \n", + " red = linear_interpolation_of_color_component(0)\n", + " green = linear_interpolation_of_color_component(1)\n", + " blue = linear_interpolation_of_color_component(2)\n", + " return f\"rgb({red},{green},{blue})\"\n", + "\n", + "\n", + "def get_rank_color(rank: float, low: Tuple[int, int, int], high: Tuple[int, int, int]) -> str:\n", + " \"\"\"Return transparent if rank == 0, else interpolate between low and high.\"\"\"\n", + " if rank <= 0:\n", + " return \"rgb(255,255,255)\"\n", + " return interpolate_color(low, high, rank)\n", + "\n", + "\n", + "def 
combine_rank_colors(\n",
+    "    dataframe: pd.DataFrame,\n",
+    "    rank_columns: List[str],\n",
+    "    color_pairs: List[Tuple[Tuple[int, int, int], Tuple[int, int, int]]],\n",
+    ") -> List[str]:\n",
+    "    \"\"\"Combine multiple ranks, using the first nonzero value's color.\"\"\"\n",
+    "    combined: List[str] = []\n",
+    "    for _, row in dataframe.iterrows():\n",
+    "        color = \"rgb(255,255,255)\"\n",
+    "        for rank_col, (low, high) in zip(rank_columns, color_pairs):\n",
+    "            rank = row[rank_col]\n",
+    "            if rank > 0:\n",
+    "                color = get_rank_color(rank, low, high)\n",
+    "                break\n",
+    "        combined.append(color)\n",
+    "    return combined\n",
+    "\n",
+    "\n",
+    "def get_rank_color_for_archetype(dataframe: pd.DataFrame, archetype: Archetypes) -> List[str]:\n",
+    "    \"\"\"Get combined rank colors for a specific archetype.\"\"\"\n",
+    "    archetype_column_name = get_archetype_column_name(archetype)\n",
+    "    coloring_pair = get_coloring_pairs()[archetype_names.index(archetype)]\n",
+    "    return combine_rank_colors(dataframe, [archetype_column_name], [coloring_pair])\n",
+    "\n",
+    "\n",
+    "def get_coloring_pairs() -> List[Tuple[Tuple[int, int, int], Tuple[int, int, int]]]:\n",
+    "    \"\"\"Define the coloring scheme for each archetype.\"\"\"\n",
+    "    assert len(archetype_names) == 5, \"Expected exactly 5 archetypes.\"\n",
+    "    return [\n",
+    "        ((222, 235, 247), (33, 113, 181)),  # Authority, blue shades\n",
+    "        ((254, 230, 206), (217, 72, 1)),    # Bottleneck, orange shades\n",
+    "        ((239, 237, 245), (106, 81, 163)),  # Bridge, purple shades\n",
+    "        ((254, 224, 210), (165, 15, 21)),   # Hub, red shades\n",
+    "        ((240, 240, 240), (82, 82, 82)),    # Outlier, grey shades\n",
+    "    ]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1eb5f75",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_to_display = mutual_exclusive_ranks(anomaly_file_paths)\n",
+    "# Optionally only keep rows where at least one archetype rank is greater than zero\n",
+    "data_to_display = data_to_display[data_to_display[archetype_columns].sum(axis=1) > 0]\n",
+    "\n",
+    "coloring_pairs = get_coloring_pairs()\n",
+    "combined_colors = combine_rank_colors(data_to_display, archetype_columns, coloring_pairs)\n",
+    "\n",
+    "figure = plotly_graph_objects.Figure()\n",
+    "\n",
+    "figure.add_trace(plotly_graph_objects.Treemap(\n",
+    "    create_treemap_settings(data_to_display),\n",
+    "    marker=dict(\n",
+    "        **plotly_treemap_marker_base_style,\n",
+    "        line=dict(width=1, color=\"black\"),\n",
+    "        showscale=False,\n",
+    "        colors=combined_colors,\n",
+    "    ),\n",
+    "    name=\"Anomalies\",\n",
+    "    opacity=0.8\n",
+    "))\n",
+    "\n",
+    "# Add dummy scatter traces for the legend\n",
+    "for name, (low, high) in zip(archetype_names, coloring_pairs):\n",
+    "    bright_color = interpolate_color(low, high, 0.4)  # light tone for the legend filling\n",
+    "    dark_color = interpolate_color(low, high, 1.0)  # darkest tone for the legend outline\n",
+    "    figure.add_trace(plotly_graph_objects.Scatter(\n",
+    "        x=[None],\n",
+    "        y=[None],\n",
+    "        mode=\"markers\",\n",
+    "        marker=dict(size=12, color=bright_color, line=dict(width=2, color=dark_color)),\n",
+    "        name=name,\n",
+    "        legendgroup=name,\n",
+    "        showlegend=True,\n",
+    "    ))\n",
+    "\n",
+    "figure.update_layout(\n",
+    "    **plotly_treemap_layout_base_settings,  # type: ignore\n",
+    "    title='Overview of all anomaly archetypes per directory',\n",
+    "    legend=dict(\n",
+    "        orientation=\"h\",  # horizontal legend (use \"v\" for vertical)\n",
+    "        yanchor=\"bottom\",\n",
+    "        y=-0.12,\n",
+    "        xanchor=\"center\",\n",
+    "        x=0.5\n",
+    "    )\n",
+    ")\n",
+    
"figure.update_xaxes(visible=False)\n", + "figure.update_yaxes(visible=False)\n", + "\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "379d568c", + "metadata": {}, + "source": [ + "### 1.3a Archetype - Authority per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af0a16b6", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_single_archetype_treemap(archetype: Archetypes, data: pd.DataFrame):\n", + " \"\"\"\n", + " Plots a treemap for the given archetype using the provided data.\n", + " archetype : Archetypes : The archetype to plot\n", + " data : pd.DataFrame : The input data frame\n", + " \"\"\"\n", + " data_to_display = data.copy()\n", + " data_to_display = data_to_display[data_to_display[archetype_columns].sum(axis=1) > 0]\n", + "\n", + " archetype_column_name = get_archetype_column_name(archetype)\n", + " combined_colors = get_rank_color_for_archetype(data_to_display, archetype)\n", + "\n", + " figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_settings(data_to_display),\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_style,\n", + " colors=combined_colors,\n", + " line=dict(width=1, color=\"black\"),\n", + " colorbar={\"title\": \"rank\"},\n", + " ),\n", + " ))\n", + " figure.update_layout(\n", + " **plotly_treemap_layout_base_settings, # type: ignore\n", + " title=f'Archetype \"{archetype}\" per directory',\n", + " )\n", + " figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a497cd8", + "metadata": {}, + "outputs": [], + "source": [ + "plot_single_archetype_treemap(\"Authority\", data_to_display)" + ] + }, + { + "cell_type": "markdown", + "id": "4c4eb401", + "metadata": {}, + "source": [ + "### 1.3b Archetype - Bottleneck per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37656bfe", + "metadata": {}, + "outputs": [], + "source": [ + "plot_single_archetype_treemap(\"Bottleneck\", anomaly_file_paths)" + ] + }, + { + "cell_type": "markdown", + "id": "06010d6d", + "metadata": {}, + "source": [ + "### 1.3c Archetype - Bridge per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b0d3b99", + "metadata": {}, + "outputs": [], + "source": [ + "plot_single_archetype_treemap(\"Bridge\", anomaly_file_paths)" + ] + }, + { + "cell_type": "markdown", + "id": "b375f191", + "metadata": {}, + "source": [ + "### 1.3d Archetype - Hub per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3ee93ed", + "metadata": {}, + "outputs": [], + "source": [ + "plot_single_archetype_treemap(\"Hub\", anomaly_file_paths)" + ] + }, + { + "cell_type": "markdown", + "id": "772d80b4", + "metadata": {}, + "source": [ + "### 1.3e Archetype - Outlier per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e9ac193", + "metadata": {}, + "outputs": [], + "source": [ + "plot_single_archetype_treemap(\"Outlier\", anomaly_file_paths)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "JohT" + } + ], + "code_graph_analysis_pipeline_data_validation": "ValidateAlwaysFalse", + "kernelspec": { + "display_name": "codegraph", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.12.9" + }, + "title": "Anomaly Detection - Manual Exploration" + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/domains/anomaly-detection/queries/AnomalyDetectionFiles.cypher b/domains/anomaly-detection/queries/AnomalyDetectionFiles.cypher new file mode 100644 index 000000000..e80a5b282 --- /dev/null +++ b/domains/anomaly-detection/queries/AnomalyDetectionFiles.cypher @@ -0,0 +1,34 @@ +// List anomalous files + +MATCH (anomalyScoreStats:File&!Directory&!Archive) +WHERE anomalyScoreStats.anomalyScore < 0 +ORDER BY anomalyScoreStats.anomalyScore ASCENDING +LIMIT 150 // n largest negative anomaly scores as threshold + WITH collect(anomalyScoreStats.anomalyScore)[-1] AS anomalyScoreThreshold +MATCH (anomalyRankStats:File&!Directory&!Archive) + WITH anomalyScoreThreshold + ,max(anomalyRankStats.anomalyAuthorityRank) AS maxAnomalyAuthorityRank + ,max(anomalyRankStats.anomalyBottleneckRank) AS maxAnomalyBottleneckRank + ,max(anomalyRankStats.anomalyBridgeRank) AS maxAnomalyBridgeRank + ,max(anomalyRankStats.anomalyHubRank) AS maxAnomalyHubRank + ,max(anomalyRankStats.anomalyOutlierRank) AS maxAnomalyOutlierRank +MATCH (anomalous:File&!Directory&!Archive) +WHERE (anomalous.anomalyScore < anomalyScoreThreshold + OR anomalous.anomalyHubRank IS NOT NULL + OR anomalous.anomalyAuthorityRank IS NOT NULL + OR anomalous.anomalyBottleneckRank IS NOT NULL + OR anomalous.anomalyOutlierRank IS NOT NULL + OR anomalous.anomalyBridgeRank IS NOT NULL) +OPTIONAL MATCH (project:Artifact|Project)-[:CONTAINS]->(anomalous) + WITH * + ,coalesce(project.name + '/', '') AS projectName + ,coalesce(anomalous.fileName, anomalous.relativePath) AS fileName +RETURN replace(projectName + fileName, '//', '/') AS filePath + ,CASE WHEN anomalous.anomalyScore < 0 THEN abs(anomalous.anomalyScore) ELSE 0 END AS absoluteAnomalyScore + ,coalesce(toFloat(anomalous.anomalyAuthorityRank) / maxAnomalyAuthorityRank, 0) AS normalizedAuthorityRank + ,coalesce(toFloat(anomalous.anomalyBottleneckRank) / maxAnomalyBottleneckRank, 0) AS normalizedBottleneckRank + ,coalesce(toFloat(anomalous.anomalyBridgeRank) / maxAnomalyBridgeRank, 0) AS normalizedBridgeRank + ,coalesce(toFloat(anomalous.anomalyHubRank / maxAnomalyHubRank), 0) AS normalizedHubRank + ,coalesce(toFloat(anomalous.anomalyOutlierRank) / maxAnomalyOutlierRank, 0) AS normalizedOutlierRank +ORDER BY filePath ASCENDING +LIMIT 200 \ No newline at end of file diff --git a/domains/anomaly-detection/treemapVisualizations.py b/domains/anomaly-detection/treemapVisualizations.py new file mode 100755 index 000000000..ce4680d6c --- /dev/null +++ b/domains/anomaly-detection/treemapVisualizations.py @@ -0,0 +1,645 @@ +#!/usr/bin/env python + +# This Python script uses Plotly Treemap Charts (https://plotly.com/python/treemaps) to visualize anomaly detection results. 
+
+from typing import Any, Dict, List, Tuple, Literal, LiteralString, Optional, cast
+
+import os
+import sys
+import argparse
+import pprint
+import logging
+
+import pandas as pd
+import numpy as np
+
+from neo4j import GraphDatabase, Driver
+
+from plotly import graph_objects as plotly_graph_objects
+
+
+class Parameters:
+    required_parameters_ = ["projection_language"]
+
+    def __init__(self, input_parameters: Dict[str, str], report_directory: str = "", verbose: bool = False):
+        self.query_parameters_ = input_parameters.copy()  # copy to protect the parameters from external modification
+        self.report_directory = report_directory
+        self.verbose_ = verbose
+
+    def __repr__(self):
+        pretty_dict = pprint.pformat(self.query_parameters_, indent=4)
+        return f"Parameters: verbose={self.verbose_}, report_directory={self.report_directory}, query_parameters:\n{pretty_dict}"
+
+    @staticmethod
+    def log_dependency_versions_() -> None:
+        print('---------------------------------------')
+
+        print('Python version: {}'.format(sys.version))
+
+        from numpy import __version__ as numpy_version
+        print('numpy version: {}'.format(numpy_version))
+
+        from pandas import __version__ as pandas_version
+        print('pandas version: {}'.format(pandas_version))
+
+        from neo4j import __version__ as neo4j_version
+        print('neo4j version: {}'.format(neo4j_version))
+
+        from plotly import version as plotly_version
+        print('plotly version: {}'.format(plotly_version))
+
+        print('---------------------------------------')
+
+    @classmethod
+    def from_input_parameters(cls, input_parameters: Dict[str, str], report_directory: str = "", verbose: bool = False):
+        """
+        Creates a Parameters instance from a dictionary of input parameters.
+        The dictionary must contain the following keys:
+        - "projection_language": The name of the programming language of the projection.
+        """
+        missing_parameters = [parameter for parameter in cls.required_parameters_ if parameter not in input_parameters]
+        if missing_parameters:
+            raise ValueError("Missing parameters:", missing_parameters)
+        created_parameters = cls(input_parameters, report_directory, verbose)
+        if created_parameters.is_verbose():
+            print(created_parameters)
+            cls.log_dependency_versions_()
+        return created_parameters
+
+    def __is_code_language_available(self) -> bool:
+        return "projection_language" in self.query_parameters_
+
+    def __get_projection_language(self) -> str:
+        return self.query_parameters_["projection_language"] if self.__is_code_language_available() else ""
+
+    def get_title_prefix(self) -> str:
+        if self.__is_code_language_available():
+            return self.__get_projection_language()
+        return ""
+
+    def get_report_directory(self) -> str:
+        return self.report_directory
+
+    def is_verbose(self) -> bool:
+        return self.verbose_
+
+
+def parse_input_parameters() -> Parameters:
+    # Convert list of "key=value" strings to a dictionary
+    def parse_key_value_list(param_list: List[str]) -> Dict[str, str]:
+        param_dict = {}
+        for item in param_list:
+            if '=' in item:
+                key, value = item.split('=', 1)
+                param_dict[key] = value
+        return param_dict
+
+    parser = argparse.ArgumentParser(
+        description="Visualize anomalies detected in code units (Java packages, types,...) 
and their dependencies based on how structurally similar they are within a software system.") + parser.add_argument('--verbose', action='store_true', help='Enable verbose mode to log all details') + parser.add_argument('--report_directory', type=str, default="", help='Path to the report directory') + parser.add_argument('query_parameters', nargs='*', type=str, help='List of key=value Cypher query parameters') + parser.set_defaults(verbose=False) + args = parser.parse_args() + return Parameters.from_input_parameters(parse_key_value_list(args.query_parameters), args.report_directory, args.verbose) + + +def get_file_path(name: str, parameters: Parameters, extension: str = 'svg') -> str: + name = parameters.get_report_directory() + '/' + name.replace(' ', '_') + '.' + extension + if parameters.is_verbose(): + print(f"treemapVisualizations: Saving file {name}") + return name + + +def get_graph_database_driver() -> Driver: + driver = GraphDatabase.driver( + uri="bolt://localhost:7687", + auth=("neo4j", os.environ.get("NEO4J_INITIAL_PASSWORD")) + ) + driver.verify_connectivity() + return driver + + +def query_cypher_to_data_frame(query: LiteralString, parameters: Optional[Dict[str, Any]] = None): + records, _, keys = driver.execute_query(query, parameters_=parameters) + return pd.DataFrame([record.values() for record in records], columns=keys) + + +# ---------------------------------------- +# Base settings for image rendering +# ---------------------------------------- + + +image_rendering_settings = { + "format": "svg", + "width": 1920, + "height": 1080, +} + +# ---------------------------------------- +# Base settings for Plotly Treemap +# ---------------------------------------- + +plotly_main_layout_base_settings = { + "margin": {"t": 60, "l": 15, "r": 15, "b": 20}, +} +plotly_treemap_layout_base_settings = dict( + **plotly_main_layout_base_settings +) +plotly_bar_layout_base_settings = dict( + **plotly_main_layout_base_settings +) +plotly_treemap_marker_base_style = { + "cornerradius": 5 +} + +plotly_treemap_marker_base_color_scale = dict( + **plotly_treemap_marker_base_style, + colorscale='Hot_r', +) + + +# ---------------------------------------- +# Base functions for Treemap chart visualization +# ---------------------------------------- + +# Ignore kaleido logging noise when writing images +logging.getLogger("kaleido").setLevel(logging.WARNING) + +def get_plotly_figure_write_image_settings(name: str, path: str): + """ + Returns the settings for the plotly figure write_image method + :param name: Name of the figure + :return: Dictionary with settings for the write_image method + """ + return { + "file": path + "/" + name + "." + image_rendering_settings['format'], + "format": image_rendering_settings['format'], + "width": image_rendering_settings['width'], + "height": image_rendering_settings['height'], + } + + +def create_treemap_settings(data_frame: pd.DataFrame, element_path_column: str = 'elementPath', element_name_column: str = "elementName") -> plotly_graph_objects.Treemap: + """ + Creates a Plotly Treemap with the given settings and data frame. 
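+    The directory hierarchy comes from the 'ids'/'parents' pairing below: each row's element path
+    refers to its parent directory path, which Plotly uses to nest the treemap tiles.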
+ data_frame : pd.DataFrame : The input data frame + return :plotly_graph_objects.Treemap : The prepared Plotly Treemap + """ + return plotly_graph_objects.Treemap( + labels=data_frame[element_name_column], + parents=data_frame['directoryParentPath'], + ids=data_frame[element_path_column], + customdata=data_frame[['fileCount', 'absoluteAnomalyScore', 'normalizedAuthorityRank', 'normalizedBottleneckRank', 'normalizedBridgeRank', 'normalizedHubRank', 'normalizedOutlierRank', 'elementPath']], + hovertemplate='%{label}
<br>Highlighted anomalies: %{customdata[0]}<br>Anomaly Score: %{customdata[1]:.4f}<br>Authority: %{customdata[2]}, Bottleneck: %{customdata[3]}, Bridge: %{customdata[4]}, Hub: %{customdata[5]}, Outlier: %{customdata[6]}<br>
Path: %{customdata[7]}', + maxdepth=-1, + root_color="lightgrey", + marker=dict(**plotly_treemap_marker_base_style), + ) + + +# ---------------------------------------- +# Base functions to prepare data for Treemap chart visualization +# ---------------------------------------- + + +def remove_last_path_file_extension(file_path_elements: list) -> list: + """ + Removes the file extension of the last element of the file path so that only the file name remains. + file_path_elements : list : The list of file path elements where the last one contains the file name with extension + return : list : The list of the directories + the file name without extension as last element. + """ + if not file_path_elements: + return [''] + if len(file_path_elements) == 1: + return [os.path.splitext(file_path_elements[0])[0]] + return file_path_elements[:-1] + [os.path.splitext(file_path_elements[-1])[0]] + + +def join_path_elements(file_path_elements: list) -> list: + """ + Joins the file path elements (and removes the file extension). + file_path_elements : list : The list of levels to convert + return : list : The list of directories + """ + prepared_path_elements = remove_last_path_file_extension(file_path_elements) + return ['/'.join(prepared_path_elements[:i+1]) for i in range(len(prepared_path_elements))] + + +def add_element_path_column(input_dataframe: pd.DataFrame, file_path_column: str, element_path_column: str = 'elementPath'): + """ + Adds a directory column to the input DataFrame based on the file path column. + input_dataframe : pd.DataFrame : The input DataFrame + file_path_column : str : The name of the file path column + directory_column : str : The name of the directory column to be added + return : pd.DataFrame : The DataFrame with added directory column + """ + if element_path_column in input_dataframe.columns: + return input_dataframe # Column already exists + + input_dataframe.insert(0, element_path_column, input_dataframe[file_path_column].str.split('/').apply(join_path_elements)) + input_dataframe = input_dataframe.explode(element_path_column) + return input_dataframe + + +def add_element_name_column(input_dataframe: pd.DataFrame, element_path_column: str = 'elementPath', element_name_column: str = 'elementName'): + """ + Adds a directory name column to the input DataFrame based on the directory column. + input_dataframe : pd.DataFrame : The input DataFrame + directory_column : str : The name of the directory column + directory_name_column : str : The name of the directory name column to be added + return : pd.DataFrame : The DataFrame with added directory name column + """ + if element_name_column in input_dataframe.columns: + return input_dataframe # Column already exists + + splitted_directories = input_dataframe[element_path_column].str.rsplit('/', n=1) + input_dataframe.insert(1, element_name_column, splitted_directories.apply(lambda x: (x[-1]))) + return input_dataframe + + +def add_parent_directory_column(input_dataframe: pd.DataFrame, element_path_column: str = 'elementPath', directory_parent_column: str = 'directoryParentPath'): + """ + Adds a directory parent column to the input DataFrame based on the directory column. 
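+    Example (illustrative): for the element path 'src/main/App' the parent is 'src/main'; top-level paths get an empty string.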
+    input_dataframe : pd.DataFrame : The input DataFrame
+    element_path_column : str : The name of the element path column
+    directory_parent_column : str : The name of the directory parent column to be added
+    return : pd.DataFrame : The DataFrame with added directory parent column
+    """
+    if directory_parent_column in input_dataframe.columns:
+        return input_dataframe  # Column already exists
+
+    # Remove the last path element from the element path column to get the parent directory path
+    splitted_directories = input_dataframe[element_path_column].str.rsplit('/', n=1)
+    input_dataframe.insert(1, directory_parent_column, splitted_directories.apply(lambda x: (x[0])))
+
+    # Clear the parent (set it to an empty string) when it equals the directory itself
+    input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[element_path_column], directory_parent_column] = ''
+    return input_dataframe
+
+
+def count_unique_aggregated_values(values: pd.Series):
+    """
+    Return the number of unique values from an array of arrays of strings.
+    Meant to be used as an aggregation function for dataframe grouping.
+    values : Series : The pandas Series of values
+    return : int : The number of unique values
+    """
+    return len(np.unique(np.concatenate(values.to_list())))
+
+
+def prepare_data_for_treemap(data: pd.DataFrame, debug: bool = False) -> pd.DataFrame:
+    if debug:
+        print("1. query result ---------------------")
+        print(data)
+
+    # 2. Add multiple rows for each file path containing all its directory paths in the new column 'elementPath'
+    data = add_element_path_column(data, 'filePath', 'elementPath')
+
+    if debug:
+        print("2. added elementPath --------------")
+        print(data)
+
+    # Group the files by their directory and count the number of files of each directory (across all levels).
+    common_named_aggregation = {
+        "absoluteAnomalyScore": pd.NamedAgg(column="absoluteAnomalyScore", aggfunc="mean"),
+        "normalizedAuthorityRank": pd.NamedAgg(column="normalizedAuthorityRank", aggfunc="max"),
+        "normalizedBottleneckRank": pd.NamedAgg(column="normalizedBottleneckRank", aggfunc="max"),
+        "normalizedBridgeRank": pd.NamedAgg(column="normalizedBridgeRank", aggfunc="max"),
+        "normalizedHubRank": pd.NamedAgg(column="normalizedHubRank", aggfunc="max"),
+        "normalizedOutlierRank": pd.NamedAgg(column="normalizedOutlierRank", aggfunc="max"),
+    }
+
+    data = data.groupby(['elementPath']).aggregate(
+        filePaths=pd.NamedAgg(column="filePath", aggfunc=np.unique),
+        firstFile=pd.NamedAgg(column="filePath", aggfunc="first"),
+        maxAnomalyScore=pd.NamedAgg(column="absoluteAnomalyScore", aggfunc="max"),
+        **common_named_aggregation
+    )
+
+    # Sort the grouped and aggregated entries by the name of the directory ascending and the anomaly score descending.
+    # The entry with the highest anomaly score will then be listed first for each directory.
+    data = data.sort_values(by=['elementPath', 'absoluteAnomalyScore'], ascending=[True, False])
+    data = data.reset_index()
+
+    if debug:
+        print("3. grouped by elementPath --------------")
+        print(data)
+
+    # Group the entries again now only by their directory path to get the aggregated number of anomalies and ranks.
+    data = data.groupby('elementPath').aggregate(
+        fileCount=pd.NamedAgg(column="filePaths", aggfunc=count_unique_aggregated_values),
+        firstFile=pd.NamedAgg(column="firstFile", aggfunc="first"),
+        maxAnomalyScore=pd.NamedAgg(column="maxAnomalyScore", aggfunc="max"),
+        **common_named_aggregation
+    )
+    data = data.reset_index()
+
+    if debug:
+        print("4. 
grouped by directory path --------------")
+        print(data)
+
+    # Add the name of the directory (last '/' separated element) and the parent directory path to the table.
+    data = add_element_name_column(data, 'elementPath', 'elementName')
+    data = add_parent_directory_column(data, 'elementPath', 'directoryParentPath')
+
+    if debug:
+        print("5. added directory and parent name --------------")
+        print(data)
+
+    # Finally, group by all columns except for the directory name, parent and path (first 3 columns) and pick the longest (max) directory path in case there are multiple.
+    all_column_names_except_for_the_directory_path = data.columns.to_list()[3:]
+    data = data.groupby(all_column_names_except_for_the_directory_path).aggregate(
+        elementName=pd.NamedAgg(column="elementName", aggfunc=lambda names: '/'.join(names)),
+        directoryParentPath=pd.NamedAgg(column="directoryParentPath", aggfunc="first"),
+        elementPath=pd.NamedAgg(column="elementPath", aggfunc="last"),
+    )
+
+    # Reorder the column positions so that the directory path is again the first column.
+    all_column_names_with_the_directory_path_first = ['elementPath', 'directoryParentPath', 'elementName'] + all_column_names_except_for_the_directory_path
+    data = data.reset_index()[all_column_names_with_the_directory_path_first]
+
+    if debug:
+        print("6. final grouping --------------")
+        print(data)
+        print("Statistics --------------")
+        print(data.describe())
+
+    return data
+
+
+def mutual_exclusive_ranks(data: pd.DataFrame) -> pd.DataFrame:
+    """
+    Modifies the input data frame to ensure that only one archetype rank is non-zero per row.
+    The archetype with the highest normalized rank is retained; the others are set to zero.
+    data : pd.DataFrame : The input data frame
+    return : pd.DataFrame : The modified data frame with mutually exclusive ranks
+    """
+    modified_data = data.copy()
+
+    for dataframe_index, row in modified_data.iterrows():
+        index = cast(int, dataframe_index)
+        max_rank_value = 0
+        max_rank_column = None
+
+        for column in archetype_columns:
+            if row[column] > max_rank_value:
+                max_rank_value = row[column]
+                max_rank_column = column
+
+        for column in archetype_columns:
+            if column != max_rank_column:
+                modified_data.at[index, column] = 0
+
+    return modified_data
+
+
+# ----------------------------------------
+# Archetypes
+# ----------------------------------------
+
+Archetypes = Literal["Authority", "Bottleneck", "Bridge", "Hub", "Outlier"]
+archetype_names: List[Archetypes] = ["Authority", "Bottleneck", "Bridge", "Hub", "Outlier"]
+
+def get_archetype_column_name(archetype: Archetypes) -> str:
+    """
+    Returns the column name for the given archetype.
+    archetype : Archetypes : The archetype name
+    return : str : The column name for the given archetype
+    """
+    return f"normalized{archetype}Rank"
+
+def get_archetype_index(archetype: Archetypes) -> int:
+    """
+    Returns the index of the given archetype.
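+    (The same index position selects the matching color pair from get_coloring_pairs() below.)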
+    archetype : Archetypes : The archetype name
+    return : int : The index of the given archetype
+    """
+    return archetype_names.index(archetype)
+
+archetype_columns = [get_archetype_column_name(name) for name in archetype_names]
+
+
+# ----------------------------------------
+# Archetype Coloring
+# ----------------------------------------
+
+Color = Tuple[int, int, int]  # RGB (red, green, blue) color tuple
+ColorPair = Tuple[Color, Color]  # Low and high color pair
+
+def interpolate_color(low: Color, high: Color, normalized_value: float) -> str:
+    """Linear interpolation between two RGB tuples; returns an rgb string."""
+
+    def linear_interpolation_of_color_component(color_component: int) -> int:
+        return int(low[color_component] + (high[color_component] - low[color_component]) * normalized_value)
+
+    red = linear_interpolation_of_color_component(0)
+    green = linear_interpolation_of_color_component(1)
+    blue = linear_interpolation_of_color_component(2)
+    return f"rgb({red},{green},{blue})"
+
+
+def get_rank_color(rank: float, low: Color, high: Color) -> str:
+    """Return white if the rank is zero (or negative), else interpolate between low and high."""
+    if rank <= 0:
+        return "rgb(255,255,255)"
+    return interpolate_color(low, high, rank)
+
+
+def combine_rank_colors(
+    dataframe: pd.DataFrame,
+    rank_columns: List[str],
+    color_pairs: List[ColorPair],
+) -> List[str]:
+    """Combine multiple ranks, using the color of the first nonzero rank value."""
+    combined: List[str] = []
+    for _, row in dataframe.iterrows():
+        color = "rgb(255,255,255)"
+        for rank_col, (low, high) in zip(rank_columns, color_pairs):
+            rank = row[rank_col]
+            if rank > 0:
+                color = get_rank_color(rank, low, high)
+                break
+        combined.append(color)
+    return combined
+
+
+def get_rank_color_for_archetype(dataframe: pd.DataFrame, archetype: Archetypes) -> List[str]:
+    """Get combined rank colors for a specific archetype."""
+    archetype_column_name = get_archetype_column_name(archetype)
+    coloring_pair = get_coloring_pairs()[get_archetype_index(archetype)]
+    return combine_rank_colors(dataframe, [archetype_column_name], [coloring_pair])
+
+
+def get_coloring_pairs() -> List[ColorPair]:
+    """Define the coloring scheme (low and high color) for each archetype."""
+    assert len(archetype_names) == 5, "Expected exactly 5 archetypes."
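+    # Each pair provides the (low, high) RGB endpoints for interpolate_color:
+    # ranks near zero map to the light tone, a rank of 1.0 maps to the dark tone.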
+    return [
+        ((222, 235, 247), (33, 113, 181)),  # Authority: blue shades
+        ((254, 230, 206), (217, 72, 1)),    # Bottleneck: orange shades
+        ((239, 237, 245), (106, 81, 163)),  # Bridge: purple shades
+        ((254, 224, 210), (165, 15, 21)),   # Hub: red shades
+        ((240, 240, 240), (82, 82, 82)),    # Outlier: gray shades
+    ]
+
+# ----------------------------------------
+# Data query
+# ----------------------------------------
+
+def query_data() -> pd.DataFrame:
+    query: LiteralString = """
+        MATCH (anomalyScoreStats:File&!Directory&!Archive)
+        WHERE anomalyScoreStats.anomalyScore < 0
+        ORDER BY anomalyScoreStats.anomalyScore ASCENDING
+        LIMIT 150 // collect the 150 most negative anomaly scores; the last one serves as the threshold
+        WITH collect(anomalyScoreStats.anomalyScore)[-1] AS anomalyScoreThreshold
+        MATCH (anomalyRankStats:File&!Directory&!Archive)
+        WITH anomalyScoreThreshold
+            ,max(anomalyRankStats.anomalyAuthorityRank) AS maxAnomalyAuthorityRank
+            ,max(anomalyRankStats.anomalyBottleneckRank) AS maxAnomalyBottleneckRank
+            ,max(anomalyRankStats.anomalyBridgeRank) AS maxAnomalyBridgeRank
+            ,max(anomalyRankStats.anomalyHubRank) AS maxAnomalyHubRank
+            ,max(anomalyRankStats.anomalyOutlierRank) AS maxAnomalyOutlierRank
+        MATCH (anomalous:File&!Directory&!Archive)
+        WHERE (anomalous.anomalyScore < anomalyScoreThreshold
+            OR anomalous.anomalyHubRank IS NOT NULL
+            OR anomalous.anomalyAuthorityRank IS NOT NULL
+            OR anomalous.anomalyBottleneckRank IS NOT NULL
+            OR anomalous.anomalyOutlierRank IS NOT NULL
+            OR anomalous.anomalyBridgeRank IS NOT NULL)
+        OPTIONAL MATCH (project:Artifact|Project)-[:CONTAINS]->(anomalous)
+        WITH *
+            ,coalesce(project.name + '/', '') AS projectName
+            ,coalesce(anomalous.fileName, anomalous.relativePath) AS fileName
+        RETURN replace(projectName + fileName, '//', '/') AS filePath
+            ,CASE WHEN anomalous.anomalyScore < 0 THEN abs(anomalous.anomalyScore) ELSE 0 END AS absoluteAnomalyScore
+            ,coalesce(toFloat(anomalous.anomalyAuthorityRank) / maxAnomalyAuthorityRank, 0) AS normalizedAuthorityRank
+            ,coalesce(toFloat(anomalous.anomalyBottleneckRank) / maxAnomalyBottleneckRank, 0) AS normalizedBottleneckRank
+            ,coalesce(toFloat(anomalous.anomalyBridgeRank) / maxAnomalyBridgeRank, 0) AS normalizedBridgeRank
+            ,coalesce(toFloat(anomalous.anomalyHubRank) / maxAnomalyHubRank, 0) AS normalizedHubRank
+            ,coalesce(toFloat(anomalous.anomalyOutlierRank) / maxAnomalyOutlierRank, 0) AS normalizedOutlierRank
+        ORDER BY filePath ASCENDING
+    """
+    return query_cypher_to_data_frame(query)
+
+
+# ------------------------------------------------------------------------------------------------------------
+# MAIN
+# ------------------------------------------------------------------------------------------------------------
+
+
+parameters = parse_input_parameters()
+title_prefix = parameters.get_title_prefix()
+driver = get_graph_database_driver()
+
+print(f"treemapVisualizations: Querying {title_prefix} data for treemap visualization...")
+anomaly_file_paths = query_data()
+
+print(f"treemapVisualizations: Preparing {title_prefix} data for treemap visualization...")
+anomaly_file_paths = prepare_data_for_treemap(anomaly_file_paths)
+
+# --- Visualizing Anomaly Scores
+
+print(f"treemapVisualizations: Creating {title_prefix} anomaly scores treemap visualization...")
+figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
+    create_treemap_settings(anomaly_file_paths),
+    marker=dict(
+        **plotly_treemap_marker_base_color_scale,
+        colors=anomaly_file_paths['absoluteAnomalyScore'],
+        colorbar={"title": "score"},
+    ),
+))
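+# Each treemap tile represents a directory. Its color encodes the mean absolute
+# anomaly score of its files, as aggregated in prepare_data_for_treemap above.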
+figure.update_layout( + **plotly_treemap_layout_base_settings, # type: ignore + title=f'Average {title_prefix} anomaly score per directory', +) +figure.write_image(**get_plotly_figure_write_image_settings(f"{title_prefix}Treemap1AverageAnomalyScorePerDirectory", parameters.get_report_directory())) + +# --- Visualizing Archetypes + +print(f"treemapVisualizations: Creating {title_prefix} archetypes overview treemap visualization...") +mutual_exclusive_archetype_ranks_data = mutual_exclusive_ranks(anomaly_file_paths) + +coloring_pairs = get_coloring_pairs() +combined_colors = combine_rank_colors(mutual_exclusive_archetype_ranks_data, archetype_columns, coloring_pairs) + +figure = plotly_graph_objects.Figure() + +figure.add_trace(plotly_graph_objects.Treemap( + create_treemap_settings(mutual_exclusive_archetype_ranks_data), + marker=dict( + **plotly_treemap_marker_base_style, + line={"width": 1, "color": "black"}, + showscale=False, + colors=combined_colors, + ), + name="Anomalies", + opacity=0.8 +)) + +# Add dummy scatter traces for legend +for name, (low, high) in zip(archetype_names, coloring_pairs): + bright_color = interpolate_color(low, high, 0.4) # light tone for legend filling + dark_color = interpolate_color(low, high, 1.0) # darkest tone for legend outline + figure.add_trace(plotly_graph_objects.Scatter( + x=[None], + y=[None], + mode="markers", + marker={"size": 12, "color": bright_color, "line": {"width": 2, "color": dark_color}}, + name=name, + legendgroup=name, + showlegend=True, + )) + +figure.update_layout( + **plotly_treemap_layout_base_settings, # type: ignore + title=f'Overview of all {title_prefix} anomaly archetypes per directory', + legend={ + "orientation": "h", # horizontal legend + "yanchor": "bottom", + "y": -0.12, + "xanchor": "center", + "x": 0.5 + } +) +figure.update_xaxes(visible=False) +figure.update_yaxes(visible=False) +figure.write_image(**get_plotly_figure_write_image_settings(f"{title_prefix}Treemap2ArchetypesOverviewPerDirectory", parameters.get_report_directory())) + +# --- Visualizing Archetypes individually + +def plot_single_archetype_treemap(archetype: Archetypes, title_prefix: str, file_index: int, data: pd.DataFrame): + """ + Plots a treemap for the given archetype using the provided data. 
+    archetype : Archetypes : The archetype to plot
+    title_prefix : str : The prefix used for the chart title and the report file name
+    file_index : int : The sequential number used in the report file name
+    data : pd.DataFrame : The input data frame
+    """
+    print(f"treemapVisualizations: Creating {title_prefix} archetype '{archetype}' treemap visualization...")
+    data_to_display = data.copy()
+    data_to_display = data_to_display[data_to_display[archetype_columns].sum(axis=1) > 0]
+
+    combined_colors = get_rank_color_for_archetype(data_to_display, archetype)
+
+    figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
+        create_treemap_settings(data_to_display),
+        marker=dict(
+            **plotly_treemap_marker_base_style,
+            colors=combined_colors,
+            line={"width": 1, "color": "black"},
+            colorbar={"title": "rank"},
+        ),
+    ))
+    figure.update_layout(
+        **plotly_treemap_layout_base_settings,  # type: ignore
+        title=f'{title_prefix} Archetype "{archetype}" per directory',
+    )
+    figure.write_image(**get_plotly_figure_write_image_settings(f"{title_prefix}Treemap{file_index}Archetype{archetype}PerDirectory", parameters.get_report_directory()))
+
+plot_single_archetype_treemap("Authority", title_prefix, 3, anomaly_file_paths)
+plot_single_archetype_treemap("Bottleneck", title_prefix, 4, anomaly_file_paths)
+plot_single_archetype_treemap("Bridge", title_prefix, 5, anomaly_file_paths)
+plot_single_archetype_treemap("Hub", title_prefix, 6, anomaly_file_paths)
+plot_single_archetype_treemap("Outlier", title_prefix, 7, anomaly_file_paths)
+
+driver.close()
+print("treemapVisualizations: Successfully created treemap visualizations.")
\ No newline at end of file

From a44c6459053b94317ebbe5369b6a3249751250d3 Mon Sep 17 00:00:00 2001
From: JohT <7671054+JohT@users.noreply.github.com>
Date: Fri, 7 Nov 2025 21:04:33 +0100
Subject: [PATCH 2/2] Add Treemap charts to anomaly detection summary

---
 domains/anomaly-detection/summary/anomalyDetectionSummary.sh  | 4 ++++
 domains/anomaly-detection/summary/report.template.md          | 4 ++++
 .../summary/report_no_anomaly_detection_treemaps.template.md  | 1 +
 3 files changed, 9 insertions(+)
 create mode 100644 domains/anomaly-detection/summary/report_no_anomaly_detection_treemaps.template.md

diff --git a/domains/anomaly-detection/summary/anomalyDetectionSummary.sh b/domains/anomaly-detection/summary/anomalyDetectionSummary.sh
index a7659df03..6335c8b9f 100755
--- a/domains/anomaly-detection/summary/anomalyDetectionSummary.sh
+++ b/domains/anomaly-detection/summary/anomalyDetectionSummary.sh
@@ -177,6 +177,10 @@ anomaly_detection_finalize_report() {
     # Remove empty Markdown includes
     source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${report_include_directory}"
 
+    # Collect static Markdown includes (after the cleanup, so that the one-line template is not removed as an empty include)
+    cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report_no_anomaly_detection_treemaps.template.md" "${report_include_directory}/report_no_anomaly_detection_treemaps.md"
+
+    # Assemble the final report by applying the includes to the main template
     cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report.template.md" "${FULL_REPORT_DIRECTORY}/report.template.md"
     cat "${FULL_REPORT_DIRECTORY}/report.template.md" | "${MARKDOWN_SCRIPTS_DIR}/embedMarkdownIncludes.sh" "${report_include_directory}" > "${FULL_REPORT_DIRECTORY}/anomaly_detection_report.md"
 
diff --git a/domains/anomaly-detection/summary/report.template.md b/domains/anomaly-detection/summary/report.template.md
index 0f846235c..59275eb46 100644
--- a/domains/anomaly-detection/summary/report.template.md
+++ b/domains/anomaly-detection/summary/report.template.md
@@ -26,6 +26,10 @@ The goal is to detect potential **software quality, design, and architecture iss
 
+
+### 1.3 Overview Charts
+
+
+
 ## 2. Deep Dives by Abstraction Level
 
 Each abstraction level includes anomaly statistics, SHAP feature importance, archetype distribution, and example anomalies.
diff --git a/domains/anomaly-detection/summary/report_no_anomaly_detection_treemaps.template.md b/domains/anomaly-detection/summary/report_no_anomaly_detection_treemaps.template.md
new file mode 100644
index 000000000..26df86f3a
--- /dev/null
+++ b/domains/anomaly-detection/summary/report_no_anomaly_detection_treemaps.template.md
@@ -0,0 +1 @@
+⚠️ _No anomaly detection treemap charts due to missing data._