From 6afd8ec8078e8f6f0b093ec324081ea6be399c47 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Sun, 2 Nov 2025 21:44:41 +0100 Subject: [PATCH 1/2] Add Treemap visualization to anomaly detection --- .../anomalyDetectionPython.sh | 75 +- .../AnomalyDetectionTreeMapExploration.ipynb | 814 ++++++++++++++++++ .../queries/AnomalyDetectionFiles.cypher | 34 + .../treemapVisualizations.py | 645 ++++++++++++++ 4 files changed, 1561 insertions(+), 7 deletions(-) create mode 100644 domains/anomaly-detection/explore/AnomalyDetectionTreeMapExploration.ipynb create mode 100644 domains/anomaly-detection/queries/AnomalyDetectionFiles.cypher create mode 100755 domains/anomaly-detection/treemapVisualizations.py diff --git a/domains/anomaly-detection/anomalyDetectionPython.sh b/domains/anomaly-detection/anomalyDetectionPython.sh index 80c3c1109..f6f661814 100755 --- a/domains/anomaly-detection/anomalyDetectionPython.sh +++ b/domains/anomaly-detection/anomalyDetectionPython.sh @@ -15,12 +15,14 @@ set -o errexit -o pipefail # Overrideable Constants (defaults also defined in sub scripts) REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"} +MARKDOWN_INCLUDES_DIRECTORY=${MARKDOWN_INCLUDES_DIRECTORY:-"includes"} # Subdirectory that contains Markdown files to be included by the Markdown template for the report. + ## Get this "scripts/reports" directory if not already set # Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution. # CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes. # This way non-standard tools like readlink aren't needed. ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)} -echo "anomalyDetectionPipeline: ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR}" +echo "anomalyDetectionPython: ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR}" # Get the "scripts" directory by taking the path of this script and going one directory up. SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/../../scripts"} # Repository directory containing the shell scripts # Get the "cypher" query directory for gathering features. @@ -49,7 +51,7 @@ while [[ $# -gt 0 ]]; do verboseMode="--verbose" ;; *) - echo -e "${COLOR_ERROR}anomalyDetectionPipeline: Error: Unknown option: ${key}${COLOR_DEFAULT}" >&2 + echo -e "${COLOR_ERROR}anomalyDetectionPython: Error: Unknown option: ${key}${COLOR_DEFAULT}" >&2 usage ;; esac @@ -72,10 +74,10 @@ is_sufficient_data_available() { query_result=$( execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionNodeCount.cypher" "${@}" ) node_count=$(get_csv_column_value "${query_result}" "node_count") if [ "${node_count}" -lt 15 ]; then - echo "anomalyDetectionPipeline: Warning: Skipping anomaly detection. Only ${node_count} ${language} ${nodeLabel} nodes. At least 15 required." + echo "anomalyDetectionPython: Warning: Skipping anomaly detection. Only ${node_count} ${language} ${nodeLabel} nodes. At least 15 required." false else - echo "anomalyDetectionPipeline: Info: Running anomaly detection with ${node_count} ${language} ${nodeLabel} nodes." + echo "anomalyDetectionPython: Info: Running anomaly detection with ${node_count} ${language} ${nodeLabel} nodes." 
true fi } @@ -92,7 +94,7 @@ is_sufficient_data_available() { anomaly_detection_features() { local nodeLabel nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" ) - echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Collecting features for ${nodeLabel} nodes..." + echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Collecting features for ${nodeLabel} nodes..." # Determine the Betweenness centrality (with the directed graph projection) if not already done execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Betweenness-Exists.cypher" \ @@ -127,7 +129,7 @@ anomaly_detection_using_python() { local language language=$( extractQueryParameter "projection_language" "${@}" ) - echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Python scripts for ${language} ${nodeLabel} nodes..." + echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Python scripts for ${language} ${nodeLabel} nodes..." # Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...) local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}" @@ -152,6 +154,8 @@ anomaly_detection_using_python() { # Required Parameters: # - projection_node_label=... # Label of the nodes that will be used for the projection. Example: "Package" +# - projection_language=... +# Name of the associated programming language. Examples: "Java", "Typescript" anomaly_detection_labels() { local nodeLabel nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" ) @@ -177,12 +181,63 @@ anomaly_detection_labels() { # Label of the nodes that will be used for the projection. Example: "Package" # - projection_weight_property=... # Name of the node property that contains the dependency weight. Example: "weight" +# - projection_language=... +# Name of the associated programming language. Examples: "Java", "Typescript" anomaly_detection_python_reports() { time anomaly_detection_features "${@}" anomaly_detection_using_python "${@}" time anomaly_detection_labels "${@}" } +# Creates the markdown file (to be included in the main summary) +# that contains the references to all treemap charts. +anomaly_detection_treemap_charts_markdown_reference() { + + echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Starting treemap charts markdown reference generation..." 
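+    # The resulting Markdown file is written into the "includes" subdirectory (MARKDOWN_INCLUDES_DIRECTORY above)
+    # so that the Markdown template of the report can include it and embed all generated treemap SVG charts.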
+
+    local detail_report_include_directory="${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}"
+    mkdir -p "${detail_report_include_directory}"
+
+    local markdown_reference_file_name="TreemapChartsReference.md"
+    local markdown_reference_file="${detail_report_include_directory}/${markdown_reference_file_name}"
+
+    # Write the Markdown section title for the references
+    {
+        echo "#### Treemap Charts"
+    } > "${markdown_reference_file}"
+
+    # Find all treemap chart SVG files and add them to the Markdown reference file
+    find "${FULL_REPORT_DIRECTORY}" -type f -name "*Treemap*.svg" | sort | while read -r chart_file; do
+        chart_filename=$(basename -- "${chart_file}")
+        chart_filename_without_extension="${chart_filename%.*}" # Remove file extension
+        {
+            echo ""
+            echo "![${chart_filename_without_extension}](./${chart_filename})"
+        } >> "${markdown_reference_file}"
+    done
+
+    # Add a horizontal rule at the end
+    {
+        echo ""
+        echo "---"
+    } >> "${markdown_reference_file}"
+
+    echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Finished treemap charts markdown reference generation."
+}
+
+# Visualize results with treemap charts.
+#
+# Required Parameters:
+# - projection_language=...
+#   Name of the associated programming language. Examples: "Java", "Typescript"
+anomaly_detection_treemap_charts() {
+    local language
+    language=$( extractQueryParameter "projection_language" "${@}" )
+
+    echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Visualizing ${language} results..."
+    time "${ANOMALY_DETECTION_SCRIPT_DIR}/treemapVisualizations.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode}
+}
+
 # Create report directory
 REPORT_NAME="anomaly-detection"
 FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
@@ -229,6 +284,7 @@ if is_sufficient_data_available "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=we
   if createUndirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection"; then
     createDirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection-directed"
     anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${ALGORITHM_LANGUAGE}=Java" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
+    anomaly_detection_treemap_charts "${ALGORITHM_LANGUAGE}=Java"
   fi
 fi
@@ -238,12 +294,17 @@ if is_sufficient_data_available "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=
   if createUndirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" "${PROJECTION_LANGUAGE}=Typescript"; then
     createDirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding-directed" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" "${PROJECTION_LANGUAGE}=Typescript"
     anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${ALGORITHM_LANGUAGE}=Typescript" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
+    anomaly_detection_treemap_charts "${ALGORITHM_LANGUAGE}=Typescript"
   fi
 fi
+
+# -- Markdown summary ---------------------------
+
+anomaly_detection_treemap_charts_markdown_reference
+
 # ---------------------------------------------------------------
 # Clean-up after report generation. Empty reports will be deleted.
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}" -echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished." \ No newline at end of file +echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished." \ No newline at end of file diff --git a/domains/anomaly-detection/explore/AnomalyDetectionTreeMapExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionTreeMapExploration.ipynb new file mode 100644 index 000000000..79f073eae --- /dev/null +++ b/domains/anomaly-detection/explore/AnomalyDetectionTreeMapExploration.ipynb @@ -0,0 +1,814 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "2f0eabc4", + "metadata": {}, + "source": [ + "# Anomaly Detection - TreeMap Exploration\n", + "\n", + "This notebook demonstrates how to visualize anomalies with Treemap charts for static code analysis data using jQAssistant and Neo4j. \n", + "\n", + "
\n", + "\n", + "### References\n", + "- [jqassistant](https://jqassistant.org)\n", + "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)\n", + "- [Plotly Treemap Chart](https://plotly.com/python/treemaps/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4191f259", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import typing\n", + "from typing import List, Tuple\n", + "\n", + "from IPython.display import display\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from plotly import graph_objects as plotly_graph_objects\n", + "import plotly.colors as plotly_colors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0676813", + "metadata": {}, + "outputs": [], + "source": [ + "#The following cell uses the build-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n", + "#This is especially needed for PDF export of tables with multiple columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebac1bb9", + "metadata": {}, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9bc2241", + "metadata": {}, + "outputs": [], + "source": [ + "# Pandas DataFrame Display Configuration\n", + "pd.set_option('display.max_colwidth', 500)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8ef41ff", + "metadata": {}, + "outputs": [], + "source": [ + "from sys import version as python_version\n", + "print('Python version: {}'.format(python_version))\n", + "\n", + "from numpy import __version__ as numpy_version\n", + "print('numpy version: {}'.format(numpy_version))\n", + "\n", + "from pandas import __version__ as pandas_version\n", + "print('pandas version: {}'.format(pandas_version))\n", + "\n", + "from neo4j import __version__ as neo4j_version\n", + "print('neo4j version: {}'.format(neo4j_version))\n", + "\n", + "from plotly import version as plotly_version\n", + "print('plotly version: {}'.format(plotly_version))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c5dab37", + "metadata": {}, + "outputs": [], + "source": [ + "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell \n", + "# before starting jupyter notebook to provide the password for the user \"neo4j\". 
\n", + "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", + "from neo4j import GraphDatabase, Driver\n", + "\n", + "def get_graph_database_driver() -> Driver:\n", + " driver = GraphDatabase.driver(\n", + " uri=\"bolt://localhost:7687\",\n", + " auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")) # type: ignore\n", + " )\n", + " driver.verify_connectivity()\n", + " return driver" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1db254b", + "metadata": {}, + "outputs": [], + "source": [ + "def query_cypher_to_data_frame(query: typing.LiteralString, parameters: typing.Optional[typing.Dict[str, typing.Any]] = None):\n", + " records, summary, keys = driver.execute_query(query, parameters_=parameters)\n", + " return pd.DataFrame([record.values() for record in records], columns=keys)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cf0993d", + "metadata": {}, + "outputs": [], + "source": [ + "# Base settings for Plotly Treemap\n", + "\n", + "plotly_main_layout_base_settings = {\n", + " \"margin\": {\"t\": 50, \"l\": 15, \"r\": 15, \"b\": 15},\n", + "}\n", + "plotly_treemap_layout_base_settings = dict(\n", + " **plotly_main_layout_base_settings\n", + ")\n", + "plotly_bar_layout_base_settings = dict(\n", + " **plotly_main_layout_base_settings\n", + ")\n", + "plotly_treemap_figure_show_settings = {\n", + " \"renderer\": None,\n", + " \"width\": 1080,\n", + " \"height\": 1080,\n", + "}\n", + "\n", + "plotly_treemap_marker_base_style = {\n", + " \"cornerradius\": 5,\n", + "}\n", + "\n", + "# Hot_r, ice_r, Viridis_r, speed_r, haline_r, thermal_r, Plasma_r, solar_r, Electric_r, Blackbody_r, deep_r, Turbo_r, amp, Reds, Blackbody_r, RdGy_r, RdBu_r\n", + "plotly_treemap_marker_base_color_scale = dict(\n", + " **plotly_treemap_marker_base_style,\n", + " colorscale='Hot_r',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30a0b4e6", + "metadata": {}, + "outputs": [], + "source": [ + "def create_treemap_settings(data_frame: pd.DataFrame, element_path_column: str = 'elementPath', element_name_column: str = \"elementName\") -> plotly_graph_objects.Treemap:\n", + " \"\"\"\n", + " Creates a Plotly Treemap with the given settings and data frame.\n", + " data_frame : pd.DataFrame : The input data frame\n", + " return :plotly_graph_objects.Treemap : The prepared Plotly Treemap\n", + " \"\"\"\n", + " return plotly_graph_objects.Treemap(\n", + " labels=data_frame[element_name_column],\n", + " parents=data_frame['directoryParentPath'],\n", + " ids=data_frame[element_path_column],\n", + " customdata=data_frame[['fileCount', 'absoluteAnomalyScore', 'normalizedAuthorityRank', 'normalizedBottleneckRank', 'normalizedBridgeRank', 'normalizedHubRank', 'normalizedOutlierRank', 'elementPath']],\n", + " hovertemplate='%{label}
<br>Highlighted anomalies: %{customdata[0]}<br>Anomaly Score: %{customdata[1]:.4f}<br>Authority: %{customdata[2]}, Bottleneck: %{customdata[3]}, Bridge: %{customdata[4]}, Hub: %{customdata[5]}, Outlier: %{customdata[6]}<br>
Path: %{customdata[7]}',\n", + " maxdepth=-1,\n", + " root_color=\"lightgrey\",\n", + " marker=dict(**plotly_treemap_marker_base_style),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc84a742", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_last_path_file_extension(file_path_elements: list) -> list:\n", + " \"\"\"\n", + " Removes the file extension of the last element of the file path so that only the file name remains.\n", + " file_path_elements : list : The list of file path elements where the last one contains the file name with extension\n", + " return : list : The list of the directories + the file name without extension as last element.\n", + " \"\"\"\n", + " if not file_path_elements:\n", + " return ['']\n", + " if len(file_path_elements) == 1:\n", + " return [os.path.splitext(file_path_elements[0])[0]]\n", + " return file_path_elements[:-1] + [os.path.splitext(file_path_elements[-1])[0]]\n", + "\n", + "def join_path_elements(file_path_elements: list) -> list:\n", + " \"\"\"\n", + " Joins the file path elements (and removes the file extension).\n", + " file_path_elements : list : The list of levels to convert\n", + " return : list : The list of directories\n", + " \"\"\"\n", + " prepared_path_elements = remove_last_path_file_extension(file_path_elements)\n", + " return ['/'.join(prepared_path_elements[:i+1]) for i in range(len(prepared_path_elements))]\n", + "\n", + "def add_element_path_column(input_dataframe: pd.DataFrame, file_path_column: str, element_path_column: str = 'elementPath'):\n", + " \"\"\"\n", + " Adds a directory column to the input DataFrame based on the file path column.\n", + " input_dataframe : pd.DataFrame : The input DataFrame\n", + " file_path_column : str : The name of the file path column\n", + " directory_column : str : The name of the directory column to be added\n", + " return : pd.DataFrame : The DataFrame with added directory column\n", + " \"\"\"\n", + " if element_path_column in input_dataframe.columns:\n", + " return input_dataframe # Column already exists\n", + " \n", + " input_dataframe.insert(0, element_path_column, input_dataframe[file_path_column].str.split('/').apply(join_path_elements))\n", + " input_dataframe = input_dataframe.explode(element_path_column)\n", + " return input_dataframe\n", + "\n", + "def add_element_name_column(input_dataframe: pd.DataFrame, element_path_column: str = 'elementPath', element_name_column: str = 'elementName'):\n", + " \"\"\"\n", + " Adds a directory name column to the input DataFrame based on the directory column.\n", + " input_dataframe : pd.DataFrame : The input DataFrame\n", + " directory_column : str : The name of the directory column\n", + " directory_name_column : str : The name of the directory name column to be added\n", + " return : pd.DataFrame : The DataFrame with added directory name column\n", + " \"\"\"\n", + " if element_name_column in input_dataframe.columns:\n", + " return input_dataframe # Column already exists\n", + " \n", + " splitted_directories = input_dataframe[element_path_column].str.rsplit('/', n=1)\n", + " input_dataframe.insert(1, element_name_column, splitted_directories.apply(lambda x: (x[-1])))\n", + " return input_dataframe\n", + "\n", + "def add_parent_directory_column(input_dataframe: pd.DataFrame, element_path_column: str = 'elementPath', directory_parent_column: str = 'directoryParentPath'):\n", + " \"\"\"\n", + " Adds a directory parent column to the input DataFrame based on the directory column.\n", + " input_dataframe : 
pd.DataFrame : The input DataFrame\n", + " directory_column : str : The name of the directory column\n", + " directory_parent_column : str : The name of the directory parent column to be added\n", + " return : pd.DataFrame : The DataFrame with added directory parent column\n", + " \"\"\"\n", + " if directory_parent_column in input_dataframe.columns:\n", + " return input_dataframe # Column already exists\n", + " \n", + " # Remove last path element from directory_column to get the directory_parent_column\n", + " splitted_directories = input_dataframe[element_path_column].str.rsplit('/', n=1)\n", + " input_dataframe.insert(1, directory_parent_column, splitted_directories.apply(lambda x: (x[0])))\n", + " \n", + " # Clear parent (set to empty string) when it equal to the directory\n", + " input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[element_path_column], directory_parent_column] = ''\n", + " return input_dataframe\n", + "\n", + "def count_unique_aggregated_values(values: pd.Series):\n", + " \"\"\"\n", + " Return the number of unique values from an array of array of strings.\n", + " Meant to be used as an aggregation function for dataframe grouping.\n", + " values : Series : The pandas Series of values\n", + " return : int : The number of files\n", + " \"\"\"\n", + " return len(np.unique(np.concatenate(values.to_list())))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04166d63", + "metadata": {}, + "outputs": [], + "source": [ + "Archetypes = typing.Literal[\"Authority\", \"Bottleneck\", \"Bridge\", \"Hub\", \"Outlier\"]\n", + "archetype_names: List[Archetypes] = [\"Authority\", \"Bottleneck\", \"Bridge\", \"Hub\", \"Outlier\"]\n", + "\n", + "def get_archetype_column_name(archetype: Archetypes) -> str:\n", + " \"\"\"\n", + " Returns the column name for the given archetype.\n", + " archetype : Archetypes : The archetype name\n", + " return : str : The column name for the given archetype\n", + " \"\"\"\n", + " return f\"normalized{archetype}Rank\"\n", + "\n", + "def get_archetype_index(archetype: Archetypes) -> int:\n", + " \"\"\"\n", + " Returns the index of the given archetype.\n", + " archetype : Archetypes : The archetype name\n", + " return : int : The index of the given archetype\n", + " \"\"\"\n", + " return archetype_names.index(archetype)\n", + "\n", + "archetype_columns = [get_archetype_column_name(name) for name in archetype_names]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e4caf24", + "metadata": {}, + "outputs": [], + "source": [ + "def query_data() -> pd.DataFrame:\n", + " query: typing.LiteralString = \"\"\"\n", + " MATCH (anomalyScoreStats:File&!Directory&!Archive)\n", + " WHERE anomalyScoreStats.anomalyScore < 0\n", + " ORDER BY anomalyScoreStats.anomalyScore ASCENDING\n", + " LIMIT 150 // n largest negative anomaly scores as threshold\n", + " WITH collect(anomalyScoreStats.anomalyScore)[-1] AS anomalyScoreThreshold\n", + " MATCH (anomalyRankStats:File&!Directory&!Archive)\n", + " WITH anomalyScoreThreshold\n", + " ,max(anomalyRankStats.anomalyAuthorityRank) AS maxAnomalyAuthorityRank\n", + " ,max(anomalyRankStats.anomalyBottleneckRank) AS maxAnomalyBottleneckRank\n", + " ,max(anomalyRankStats.anomalyBridgeRank) AS maxAnomalyBridgeRank\n", + " ,max(anomalyRankStats.anomalyHubRank) AS maxAnomalyHubRank\n", + " ,max(anomalyRankStats.anomalyOutlierRank) AS maxAnomalyOutlierRank\n", + " MATCH (anomalous:File&!Directory&!Archive)\n", + " WHERE (anomalous.anomalyScore < anomalyScoreThreshold\n", 
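+    "    // ...or is explicitly flagged with at least one anomaly archetype rank:\n",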
+ " OR anomalous.anomalyHubRank IS NOT NULL\n", + " OR anomalous.anomalyAuthorityRank IS NOT NULL\n", + " OR anomalous.anomalyBottleneckRank IS NOT NULL\n", + " OR anomalous.anomalyOutlierRank IS NOT NULL\n", + " OR anomalous.anomalyBridgeRank IS NOT NULL)\n", + " OPTIONAL MATCH (project:Artifact|Project)-[:CONTAINS]->(anomalous)\n", + " WITH *\n", + " ,coalesce(project.name + '/', '') AS projectName\n", + " ,coalesce(anomalous.fileName, anomalous.relativePath) AS fileName\n", + " RETURN replace(projectName + fileName, '//', '/') AS filePath\n", + " ,CASE WHEN anomalous.anomalyScore < 0 THEN abs(anomalous.anomalyScore) ELSE 0 END AS absoluteAnomalyScore\n", + " ,coalesce(toFloat(anomalous.anomalyAuthorityRank) / maxAnomalyAuthorityRank, 0) AS normalizedAuthorityRank\n", + " ,coalesce(toFloat(anomalous.anomalyBottleneckRank) / maxAnomalyBottleneckRank, 0) AS normalizedBottleneckRank\n", + " ,coalesce(toFloat(anomalous.anomalyBridgeRank) / maxAnomalyBridgeRank, 0) AS normalizedBridgeRank\n", + " ,coalesce(toFloat(anomalous.anomalyHubRank / maxAnomalyHubRank), 0) AS normalizedHubRank\n", + " ,coalesce(toFloat(anomalous.anomalyOutlierRank) / maxAnomalyOutlierRank, 0) AS normalizedOutlierRank\n", + " ORDER BY filePath ASCENDING\n", + " \"\"\"\n", + " return query_cypher_to_data_frame(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01e51a6a", + "metadata": {}, + "outputs": [], + "source": [ + "def prepare_data_for_treemap(data: pd.DataFrame, debug: bool = False) -> pd.DataFrame:\n", + " if debug:\n", + " display(\"1. query result ---------------------\")\n", + " display(data)\n", + "\n", + " # 3. Add multiple rows for each file path containing all its directories paths in the new column 'elementPath'\n", + " data = add_element_path_column(data, 'filePath', 'elementPath')\n", + "\n", + " if debug:\n", + " display(\"3. added elementPath --------------\")\n", + " display(data)\n", + "\n", + " # Group the files by their directory and count the number of files of each directory (across all levels).\n", + " common_named_aggregation = dict(\n", + " absoluteAnomalyScore=pd.NamedAgg(column=\"absoluteAnomalyScore\", aggfunc=\"mean\"),\n", + " normalizedAuthorityRank=pd.NamedAgg(column=\"normalizedAuthorityRank\", aggfunc=\"max\"),\n", + " normalizedBottleneckRank=pd.NamedAgg(column=\"normalizedBottleneckRank\", aggfunc=\"max\"),\n", + " normalizedBridgeRank=pd.NamedAgg(column=\"normalizedBridgeRank\", aggfunc=\"max\"),\n", + " normalizedHubRank=pd.NamedAgg(column=\"normalizedHubRank\", aggfunc=\"max\"),\n", + " normalizedOutlierRank=pd.NamedAgg(column=\"normalizedOutlierRank\", aggfunc=\"max\"),\n", + " )\n", + "\n", + " data = data.groupby(['elementPath']).aggregate(\n", + " filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n", + " firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n", + " maxAnomalyScore=pd.NamedAgg(column=\"absoluteAnomalyScore\", aggfunc=\"max\"),\n", + " **common_named_aggregation\n", + " )\n", + "\n", + " # Sort the grouped and aggregated entries by the name of the directory ascending and the anomaly score descending.\n", + " # The author with the most commits will then be listed first for each directory.\n", + " data = data.sort_values(by=['elementPath', 'absoluteAnomalyScore'], ascending=[True, False])\n", + " data = data.reset_index()\n", + "\n", + " if debug:\n", + " display(\"4. 
grouped by elementPath --------------\")\n",
+    "        display(data)\n",
+    "\n",
+    "    # Group the entries again now only by their directory path to get the aggregated number of anomalies and ranks.\n",
+    "    data = data.groupby('elementPath').aggregate(\n",
+    "        fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=count_unique_aggregated_values),\n",
+    "        firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
+    "        maxAnomalyScore=pd.NamedAgg(column=\"maxAnomalyScore\", aggfunc=\"max\"),\n",
+    "        **common_named_aggregation\n",
+    "    )\n",
+    "    data = data.reset_index()\n",
+    "\n",
+    "    if debug:\n",
+    "        display(\"4. grouped by directory path --------------\")\n",
+    "        display(data)\n",
+    "\n",
+    "    # Add the name of the directory (last '/' separated element) and the parent directory path to the table.\n",
+    "    data = add_element_name_column(data, 'elementPath', 'elementName')\n",
+    "    data = add_parent_directory_column(data, 'elementPath', 'directoryParentPath')\n",
+    "\n",
+    "    if debug:\n",
+    "        display(\"5. added directory and parent name --------------\")\n",
+    "        display(data)\n",
+    "\n",
+    "    # Finally, group by all columns except for the directory name, parent and path (first 3 columns) and pick the longest (max) directory path in case there are multiple.\n",
+    "    all_column_names_except_for_the_directory_path = data.columns.to_list()[3:]\n",
+    "    data = data.groupby(all_column_names_except_for_the_directory_path).aggregate(\n",
+    "        elementName=pd.NamedAgg(column=\"elementName\", aggfunc=lambda names: '/'.join(names)),\n",
+    "        directoryParentPath=pd.NamedAgg(column=\"directoryParentPath\", aggfunc=\"first\"),\n",
+    "        elementPath=pd.NamedAgg(column=\"elementPath\", aggfunc=\"last\"),\n",
+    "    )\n",
+    "\n",
+    "    # Reorder the column positions so that the directory path is again the first column.\n",
+    "    all_column_names_with_the_directory_path_first = ['elementPath', 'directoryParentPath', 'elementName'] + all_column_names_except_for_the_directory_path\n",
+    "    data = data.reset_index()[all_column_names_with_the_directory_path_first]\n",
+    "\n",
+    "    if debug:\n",
+    "        display(\"6. final grouping --------------\")\n",
+    "        display(data)\n",
+    "        display(\"Statistics --------------\")\n",
+    "        display(data.describe())\n",
+    "\n",
+    "    return data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0c68aa20",
+   "metadata": {},
+   "source": [
+    "## 1. 
Anomalies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5222a25", + "metadata": {}, + "outputs": [], + "source": [ + "driver = get_graph_database_driver()\n", + "anomaly_file_paths = query_data()\n", + "anomaly_file_paths = prepare_data_for_treemap(anomaly_file_paths)\n", + "display(anomaly_file_paths)" + ] + }, + { + "cell_type": "markdown", + "id": "daa3a949", + "metadata": {}, + "source": [ + "### 1.1 Average anomaly score per file directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0cd2237", + "metadata": {}, + "outputs": [], + "source": [ + "data_to_display = anomaly_file_paths.copy()\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_settings(data_to_display),\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_color_scale,\n", + " colors=data_to_display['absoluteAnomalyScore'], \n", + " colorbar={\"title\": \"score\"},\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings, # type: ignore\n", + " title='Average anomaly score per directory',\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "24341675", + "metadata": {}, + "source": [ + "### 1.2 Overview of all anomaly archetypes per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b773269c", + "metadata": {}, + "outputs": [], + "source": [ + "def mutual_exclusive_ranks(data: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " Modifies the input data frame to ensure that only one archetype rank is non-zero per row.\n", + " The archetype with the highest normalized rank is retained, and others are set to zero.\n", + " data : pd.DataFrame : The input data frame\n", + " return : pd.DataFrame : The modified data frame with mutual exclusive ranks\n", + " \"\"\"\n", + " modified_data = data.copy()\n", + " \n", + " for dataframe_index, row in modified_data.iterrows():\n", + " index = typing.cast(int, dataframe_index)\n", + " max_rank_value = 0\n", + " max_rank_column = None\n", + " \n", + " for column in archetype_columns:\n", + " if row[column] > max_rank_value:\n", + " max_rank_value = row[column]\n", + " max_rank_column = column\n", + " \n", + " for column in archetype_columns:\n", + " if column != max_rank_column:\n", + " modified_data.at[index, column] = 0\n", + " \n", + " return modified_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ef8e76a", + "metadata": {}, + "outputs": [], + "source": [ + "def interpolate_color(low: Tuple[int, int, int], high: Tuple[int, int, int], normalized_value: float) -> str:\n", + " \"\"\"Linear interpolation between two RGB tuples, returns rgba string.\"\"\"\n", + " \n", + " def linear_interpolation_of_color_component(color_component: int) -> int:\n", + " return int(low[color_component] + (high[color_component] - low[color_component]) * normalized_value)\n", + " \n", + " red = linear_interpolation_of_color_component(0)\n", + " green = linear_interpolation_of_color_component(1)\n", + " blue = linear_interpolation_of_color_component(2)\n", + " return f\"rgb({red},{green},{blue})\"\n", + "\n", + "\n", + "def get_rank_color(rank: float, low: Tuple[int, int, int], high: Tuple[int, int, int]) -> str:\n", + " \"\"\"Return transparent if rank == 0, else interpolate between low and high.\"\"\"\n", + " if rank <= 0:\n", + " return \"rgb(255,255,255)\"\n", + " return interpolate_color(low, high, rank)\n", + "\n", + "\n", + "def 
combine_rank_colors(\n",
+    "    dataframe: pd.DataFrame,\n",
+    "    rank_columns: List[str],\n",
+    "    color_pairs: List[Tuple[Tuple[int, int, int], Tuple[int, int, int]]],\n",
+    ") -> List[str]:\n",
+    "    \"\"\"Combine multiple ranks, using the first nonzero value's color.\"\"\"\n",
+    "    combined: List[str] = []\n",
+    "    for _, row in dataframe.iterrows():\n",
+    "        color = \"rgb(255,255,255)\"\n",
+    "        for rank_col, (low, high) in zip(rank_columns, color_pairs):\n",
+    "            rank = row[rank_col]\n",
+    "            if rank > 0:\n",
+    "                color = get_rank_color(rank, low, high)\n",
+    "                break\n",
+    "        combined.append(color)\n",
+    "    return combined\n",
+    "\n",
+    "\n",
+    "def get_rank_color_for_archetype(dataframe: pd.DataFrame, archetype: Archetypes) -> List[str]:\n",
+    "    \"\"\"Get combined rank colors for a specific archetype.\"\"\"\n",
+    "    archetype_column_name = get_archetype_column_name(archetype)\n",
+    "    coloring_pair = get_coloring_pairs()[archetype_names.index(archetype)]\n",
+    "    return combine_rank_colors(dataframe, [archetype_column_name], [coloring_pair])\n",
+    "\n",
+    "\n",
+    "def get_coloring_pairs() -> List[Tuple[Tuple[int, int, int], Tuple[int, int, int]]]:\n",
+    "    \"\"\"Define the coloring scheme for each archetype.\"\"\"\n",
+    "    assert len(archetype_names) == 5, \"Expected exactly 5 archetypes.\"\n",
+    "    return [\n",
+    "        ((222, 235, 247), (33, 113, 181)),  # Authority, blue shades\n",
+    "        ((254, 230, 206), (217, 72, 1)),    # Bottleneck, orange shades\n",
+    "        ((239, 237, 245), (106, 81, 163)),  # Bridge, purple shades\n",
+    "        ((254, 224, 210), (165, 15, 21)),   # Hub, red shades\n",
+    "        ((240, 240, 240), (82, 82, 82)),    # Outlier, grey shades\n",
+    "    ]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1eb5f75",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data_to_display = mutual_exclusive_ranks(anomaly_file_paths)\n",
+    "# Optionally only keep rows where at least one archetype rank is greater than zero\n",
+    "data_to_display = data_to_display[data_to_display[archetype_columns].sum(axis=1) > 0]\n",
+    "\n",
+    "coloring_pairs = get_coloring_pairs()\n",
+    "combined_colors = combine_rank_colors(data_to_display, archetype_columns, coloring_pairs)\n",
+    "\n",
+    "figure = plotly_graph_objects.Figure()\n",
+    "\n",
+    "figure.add_trace(plotly_graph_objects.Treemap(\n",
+    "    create_treemap_settings(data_to_display),\n",
+    "    marker=dict(\n",
+    "        **plotly_treemap_marker_base_style,\n",
+    "        line=dict(width=1, color=\"black\"),\n",
+    "        showscale=False,\n",
+    "        colors=combined_colors,\n",
+    "    ),\n",
+    "    name=\"Anomalies\",\n",
+    "    opacity=0.8\n",
+    "))\n",
+    "\n",
+    "# Add dummy scatter traces for the legend\n",
+    "for name, (low, high) in zip(archetype_names, coloring_pairs):\n",
+    "    bright_color = interpolate_color(low, high, 0.4)  # light tone for the legend filling\n",
+    "    dark_color = interpolate_color(low, high, 1.0)  # darkest tone for the legend outline\n",
+    "    figure.add_trace(plotly_graph_objects.Scatter(\n",
+    "        x=[None],\n",
+    "        y=[None],\n",
+    "        mode=\"markers\",\n",
+    "        marker=dict(size=12, color=bright_color, line=dict(width=2, color=dark_color)),\n",
+    "        name=name,\n",
+    "        legendgroup=name,\n",
+    "        showlegend=True,\n",
+    "    ))\n",
+    "\n",
+    "figure.update_layout(\n",
+    "    **plotly_treemap_layout_base_settings,  # type: ignore\n",
+    "    title='Overview of all anomaly archetypes per directory',\n",
+    "    legend=dict(\n",
+    "        orientation=\"h\",  # horizontal legend (use \"v\" for vertical)\n",
+    "        yanchor=\"bottom\",\n",
+    "        y=-0.12,\n",
+    "        xanchor=\"center\",\n",
+    "        x=0.5\n",
+    "    )\n",
+    ")\n",
+    
"figure.update_xaxes(visible=False)\n", + "figure.update_yaxes(visible=False)\n", + "\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "markdown", + "id": "379d568c", + "metadata": {}, + "source": [ + "### 1.3a Archetype - Authority per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af0a16b6", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_single_archetype_treemap(archetype: Archetypes, data: pd.DataFrame):\n", + " \"\"\"\n", + " Plots a treemap for the given archetype using the provided data.\n", + " archetype : Archetypes : The archetype to plot\n", + " data : pd.DataFrame : The input data frame\n", + " \"\"\"\n", + " data_to_display = data.copy()\n", + " data_to_display = data_to_display[data_to_display[archetype_columns].sum(axis=1) > 0]\n", + "\n", + " archetype_column_name = get_archetype_column_name(archetype)\n", + " combined_colors = get_rank_color_for_archetype(data_to_display, archetype)\n", + "\n", + " figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_settings(data_to_display),\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_style,\n", + " colors=combined_colors,\n", + " line=dict(width=1, color=\"black\"),\n", + " colorbar={\"title\": \"rank\"},\n", + " ),\n", + " ))\n", + " figure.update_layout(\n", + " **plotly_treemap_layout_base_settings, # type: ignore\n", + " title=f'Archetype \"{archetype}\" per directory',\n", + " )\n", + " figure.show(**plotly_treemap_figure_show_settings)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a497cd8", + "metadata": {}, + "outputs": [], + "source": [ + "plot_single_archetype_treemap(\"Authority\", data_to_display)" + ] + }, + { + "cell_type": "markdown", + "id": "4c4eb401", + "metadata": {}, + "source": [ + "### 1.3b Archetype - Bottleneck per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37656bfe", + "metadata": {}, + "outputs": [], + "source": [ + "plot_single_archetype_treemap(\"Bottleneck\", anomaly_file_paths)" + ] + }, + { + "cell_type": "markdown", + "id": "06010d6d", + "metadata": {}, + "source": [ + "### 1.3c Archetype - Bridge per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b0d3b99", + "metadata": {}, + "outputs": [], + "source": [ + "plot_single_archetype_treemap(\"Bridge\", anomaly_file_paths)" + ] + }, + { + "cell_type": "markdown", + "id": "b375f191", + "metadata": {}, + "source": [ + "### 1.3d Archetype - Hub per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3ee93ed", + "metadata": {}, + "outputs": [], + "source": [ + "plot_single_archetype_treemap(\"Hub\", anomaly_file_paths)" + ] + }, + { + "cell_type": "markdown", + "id": "772d80b4", + "metadata": {}, + "source": [ + "### 1.3e Archetype - Outlier per directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e9ac193", + "metadata": {}, + "outputs": [], + "source": [ + "plot_single_archetype_treemap(\"Outlier\", anomaly_file_paths)" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "JohT" + } + ], + "code_graph_analysis_pipeline_data_validation": "ValidateAlwaysFalse", + "kernelspec": { + "display_name": "codegraph", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.12.9" + }, + "title": "Anomaly Detection - Manual Exploration" + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/domains/anomaly-detection/queries/AnomalyDetectionFiles.cypher b/domains/anomaly-detection/queries/AnomalyDetectionFiles.cypher new file mode 100644 index 000000000..e80a5b282 --- /dev/null +++ b/domains/anomaly-detection/queries/AnomalyDetectionFiles.cypher @@ -0,0 +1,34 @@ +// List anomalous files + +MATCH (anomalyScoreStats:File&!Directory&!Archive) +WHERE anomalyScoreStats.anomalyScore < 0 +ORDER BY anomalyScoreStats.anomalyScore ASCENDING +LIMIT 150 // n largest negative anomaly scores as threshold + WITH collect(anomalyScoreStats.anomalyScore)[-1] AS anomalyScoreThreshold +MATCH (anomalyRankStats:File&!Directory&!Archive) + WITH anomalyScoreThreshold + ,max(anomalyRankStats.anomalyAuthorityRank) AS maxAnomalyAuthorityRank + ,max(anomalyRankStats.anomalyBottleneckRank) AS maxAnomalyBottleneckRank + ,max(anomalyRankStats.anomalyBridgeRank) AS maxAnomalyBridgeRank + ,max(anomalyRankStats.anomalyHubRank) AS maxAnomalyHubRank + ,max(anomalyRankStats.anomalyOutlierRank) AS maxAnomalyOutlierRank +MATCH (anomalous:File&!Directory&!Archive) +WHERE (anomalous.anomalyScore < anomalyScoreThreshold + OR anomalous.anomalyHubRank IS NOT NULL + OR anomalous.anomalyAuthorityRank IS NOT NULL + OR anomalous.anomalyBottleneckRank IS NOT NULL + OR anomalous.anomalyOutlierRank IS NOT NULL + OR anomalous.anomalyBridgeRank IS NOT NULL) +OPTIONAL MATCH (project:Artifact|Project)-[:CONTAINS]->(anomalous) + WITH * + ,coalesce(project.name + '/', '') AS projectName + ,coalesce(anomalous.fileName, anomalous.relativePath) AS fileName +RETURN replace(projectName + fileName, '//', '/') AS filePath + ,CASE WHEN anomalous.anomalyScore < 0 THEN abs(anomalous.anomalyScore) ELSE 0 END AS absoluteAnomalyScore + ,coalesce(toFloat(anomalous.anomalyAuthorityRank) / maxAnomalyAuthorityRank, 0) AS normalizedAuthorityRank + ,coalesce(toFloat(anomalous.anomalyBottleneckRank) / maxAnomalyBottleneckRank, 0) AS normalizedBottleneckRank + ,coalesce(toFloat(anomalous.anomalyBridgeRank) / maxAnomalyBridgeRank, 0) AS normalizedBridgeRank + ,coalesce(toFloat(anomalous.anomalyHubRank / maxAnomalyHubRank), 0) AS normalizedHubRank + ,coalesce(toFloat(anomalous.anomalyOutlierRank) / maxAnomalyOutlierRank, 0) AS normalizedOutlierRank +ORDER BY filePath ASCENDING +LIMIT 200 \ No newline at end of file diff --git a/domains/anomaly-detection/treemapVisualizations.py b/domains/anomaly-detection/treemapVisualizations.py new file mode 100755 index 000000000..ce4680d6c --- /dev/null +++ b/domains/anomaly-detection/treemapVisualizations.py @@ -0,0 +1,645 @@ +#!/usr/bin/env python + +# This Python script uses Plotly Treemap Charts (https://plotly.com/python/treemaps) to visualize anomaly detection results. 
+
+from typing import Any, Dict, List, Tuple, Literal, LiteralString, Optional, cast
+
+import os
+import sys
+import argparse
+import pprint
+import logging
+
+import pandas as pd
+import numpy as np
+
+from neo4j import GraphDatabase, Driver
+
+from plotly import graph_objects as plotly_graph_objects
+
+
+class Parameters:
+    required_parameters_ = ["projection_language"]
+
+    def __init__(self, input_parameters: Dict[str, str], report_directory: str = "", verbose: bool = False):
+        self.query_parameters_ = input_parameters.copy()  # copy to protect the parameters from external modification
+        self.report_directory = report_directory
+        self.verbose_ = verbose
+
+    def __repr__(self):
+        pretty_dict = pprint.pformat(self.query_parameters_, indent=4)
+        return f"Parameters: verbose={self.verbose_}, report_directory={self.report_directory}, query_parameters:\n{pretty_dict}"
+
+    @staticmethod
+    def log_dependency_versions_() -> None:
+        print('---------------------------------------')
+
+        print('Python version: {}'.format(sys.version))
+
+        from numpy import __version__ as numpy_version
+        print('numpy version: {}'.format(numpy_version))
+
+        from pandas import __version__ as pandas_version
+        print('pandas version: {}'.format(pandas_version))
+
+        from neo4j import __version__ as neo4j_version
+        print('neo4j version: {}'.format(neo4j_version))
+
+        from plotly import version as plotly_version
+        print('plotly version: {}'.format(plotly_version))
+
+        print('---------------------------------------')
+
+    @classmethod
+    def from_input_parameters(cls, input_parameters: Dict[str, str], report_directory: str = "", verbose: bool = False):
+        """
+        Creates a Parameters instance from a dictionary of input parameters.
+        The dictionary must contain the following keys:
+        - "projection_language": The name of the programming language of the projection.
+        """
+        missing_parameters = [parameter for parameter in cls.required_parameters_ if parameter not in input_parameters]
+        if missing_parameters:
+            raise ValueError("Missing parameters:", missing_parameters)
+        created_parameters = cls(input_parameters, report_directory, verbose)
+        if created_parameters.is_verbose():
+            print(created_parameters)
+            cls.log_dependency_versions_()
+        return created_parameters
+
+    def __is_code_language_available(self) -> bool:
+        return "projection_language" in self.query_parameters_
+
+    def __get_projection_language(self) -> str:
+        return self.query_parameters_["projection_language"] if self.__is_code_language_available() else ""
+
+    def get_title_prefix(self) -> str:
+        if self.__is_code_language_available():
+            return self.__get_projection_language()
+        return ""
+
+    def get_report_directory(self) -> str:
+        return self.report_directory
+
+    def is_verbose(self) -> bool:
+        return self.verbose_
+
+
+def parse_input_parameters() -> Parameters:
+    # Convert list of "key=value" strings to a dictionary
+    def parse_key_value_list(param_list: List[str]) -> Dict[str, str]:
+        param_dict = {}
+        for item in param_list:
+            if '=' in item:
+                key, value = item.split('=', 1)
+                param_dict[key] = value
+        return param_dict
+
+    parser = argparse.ArgumentParser(
+        description="Visualize anomalies detected in code units (Java packages, types,...) 
and their dependencies based on how structurally similar they are within a software system.") + parser.add_argument('--verbose', action='store_true', help='Enable verbose mode to log all details') + parser.add_argument('--report_directory', type=str, default="", help='Path to the report directory') + parser.add_argument('query_parameters', nargs='*', type=str, help='List of key=value Cypher query parameters') + parser.set_defaults(verbose=False) + args = parser.parse_args() + return Parameters.from_input_parameters(parse_key_value_list(args.query_parameters), args.report_directory, args.verbose) + + +def get_file_path(name: str, parameters: Parameters, extension: str = 'svg') -> str: + name = parameters.get_report_directory() + '/' + name.replace(' ', '_') + '.' + extension + if parameters.is_verbose(): + print(f"treemapVisualizations: Saving file {name}") + return name + + +def get_graph_database_driver() -> Driver: + driver = GraphDatabase.driver( + uri="bolt://localhost:7687", + auth=("neo4j", os.environ.get("NEO4J_INITIAL_PASSWORD")) + ) + driver.verify_connectivity() + return driver + + +def query_cypher_to_data_frame(query: LiteralString, parameters: Optional[Dict[str, Any]] = None): + records, _, keys = driver.execute_query(query, parameters_=parameters) + return pd.DataFrame([record.values() for record in records], columns=keys) + + +# ---------------------------------------- +# Base settings for image rendering +# ---------------------------------------- + + +image_rendering_settings = { + "format": "svg", + "width": 1920, + "height": 1080, +} + +# ---------------------------------------- +# Base settings for Plotly Treemap +# ---------------------------------------- + +plotly_main_layout_base_settings = { + "margin": {"t": 60, "l": 15, "r": 15, "b": 20}, +} +plotly_treemap_layout_base_settings = dict( + **plotly_main_layout_base_settings +) +plotly_bar_layout_base_settings = dict( + **plotly_main_layout_base_settings +) +plotly_treemap_marker_base_style = { + "cornerradius": 5 +} + +plotly_treemap_marker_base_color_scale = dict( + **plotly_treemap_marker_base_style, + colorscale='Hot_r', +) + + +# ---------------------------------------- +# Base functions for Treemap chart visualization +# ---------------------------------------- + +# Ignore kaleido logging noise when writing images +logging.getLogger("kaleido").setLevel(logging.WARNING) + +def get_plotly_figure_write_image_settings(name: str, path: str): + """ + Returns the settings for the plotly figure write_image method + :param name: Name of the figure + :return: Dictionary with settings for the write_image method + """ + return { + "file": path + "/" + name + "." + image_rendering_settings['format'], + "format": image_rendering_settings['format'], + "width": image_rendering_settings['width'], + "height": image_rendering_settings['height'], + } + + +def create_treemap_settings(data_frame: pd.DataFrame, element_path_column: str = 'elementPath', element_name_column: str = "elementName") -> plotly_graph_objects.Treemap: + """ + Creates a Plotly Treemap with the given settings and data frame. 
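+    The directory hierarchy comes from the 'ids'/'parents' pairing below: each row's element path
+    refers to its parent directory path, which Plotly uses to nest the treemap tiles.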
+ data_frame : pd.DataFrame : The input data frame + return :plotly_graph_objects.Treemap : The prepared Plotly Treemap + """ + return plotly_graph_objects.Treemap( + labels=data_frame[element_name_column], + parents=data_frame['directoryParentPath'], + ids=data_frame[element_path_column], + customdata=data_frame[['fileCount', 'absoluteAnomalyScore', 'normalizedAuthorityRank', 'normalizedBottleneckRank', 'normalizedBridgeRank', 'normalizedHubRank', 'normalizedOutlierRank', 'elementPath']], + hovertemplate='%{label}
<br>Highlighted anomalies: %{customdata[0]}<br>Anomaly Score: %{customdata[1]:.4f}<br>Authority: %{customdata[2]}, Bottleneck: %{customdata[3]}, Bridge: %{customdata[4]}, Hub: %{customdata[5]}, Outlier: %{customdata[6]}<br>
Path: %{customdata[7]}', + maxdepth=-1, + root_color="lightgrey", + marker=dict(**plotly_treemap_marker_base_style), + ) + + +# ---------------------------------------- +# Base functions to prepare data for Treemap chart visualization +# ---------------------------------------- + + +def remove_last_path_file_extension(file_path_elements: list) -> list: + """ + Removes the file extension of the last element of the file path so that only the file name remains. + file_path_elements : list : The list of file path elements where the last one contains the file name with extension + return : list : The list of the directories + the file name without extension as last element. + """ + if not file_path_elements: + return [''] + if len(file_path_elements) == 1: + return [os.path.splitext(file_path_elements[0])[0]] + return file_path_elements[:-1] + [os.path.splitext(file_path_elements[-1])[0]] + + +def join_path_elements(file_path_elements: list) -> list: + """ + Joins the file path elements (and removes the file extension). + file_path_elements : list : The list of levels to convert + return : list : The list of directories + """ + prepared_path_elements = remove_last_path_file_extension(file_path_elements) + return ['/'.join(prepared_path_elements[:i+1]) for i in range(len(prepared_path_elements))] + + +def add_element_path_column(input_dataframe: pd.DataFrame, file_path_column: str, element_path_column: str = 'elementPath'): + """ + Adds a directory column to the input DataFrame based on the file path column. + input_dataframe : pd.DataFrame : The input DataFrame + file_path_column : str : The name of the file path column + directory_column : str : The name of the directory column to be added + return : pd.DataFrame : The DataFrame with added directory column + """ + if element_path_column in input_dataframe.columns: + return input_dataframe # Column already exists + + input_dataframe.insert(0, element_path_column, input_dataframe[file_path_column].str.split('/').apply(join_path_elements)) + input_dataframe = input_dataframe.explode(element_path_column) + return input_dataframe + + +def add_element_name_column(input_dataframe: pd.DataFrame, element_path_column: str = 'elementPath', element_name_column: str = 'elementName'): + """ + Adds a directory name column to the input DataFrame based on the directory column. + input_dataframe : pd.DataFrame : The input DataFrame + directory_column : str : The name of the directory column + directory_name_column : str : The name of the directory name column to be added + return : pd.DataFrame : The DataFrame with added directory name column + """ + if element_name_column in input_dataframe.columns: + return input_dataframe # Column already exists + + splitted_directories = input_dataframe[element_path_column].str.rsplit('/', n=1) + input_dataframe.insert(1, element_name_column, splitted_directories.apply(lambda x: (x[-1]))) + return input_dataframe + + +def add_parent_directory_column(input_dataframe: pd.DataFrame, element_path_column: str = 'elementPath', directory_parent_column: str = 'directoryParentPath'): + """ + Adds a directory parent column to the input DataFrame based on the directory column. 
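+    Example (illustrative): for the element path 'src/main/App' the parent is 'src/main'; top-level paths get an empty string.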
+    input_dataframe : pd.DataFrame : The input DataFrame
+    element_path_column : str : The name of the element path column
+    directory_parent_column : str : The name of the directory parent column to be added
+    return : pd.DataFrame : The DataFrame with added directory parent column
+    """
+    if directory_parent_column in input_dataframe.columns:
+        return input_dataframe  # Column already exists
+
+    # Remove the last path element from the element path column to get the parent directory path
+    splitted_directories = input_dataframe[element_path_column].str.rsplit('/', n=1)
+    input_dataframe.insert(1, directory_parent_column, splitted_directories.apply(lambda x: (x[0])))
+
+    # Clear the parent (set it to an empty string) when it equals the directory itself
+    input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[element_path_column], directory_parent_column] = ''
+    return input_dataframe
+
+
+def count_unique_aggregated_values(values: pd.Series):
+    """
+    Return the number of unique values from an array of arrays of strings.
+    Meant to be used as an aggregation function for dataframe grouping.
+    values : Series : The pandas Series of values
+    return : int : The number of unique values
+    """
+    return len(np.unique(np.concatenate(values.to_list())))
+
+
+def prepare_data_for_treemap(data: pd.DataFrame, debug: bool = False) -> pd.DataFrame:
+    if debug:
+        print("1. query result ---------------------")
+        print(data)
+
+    # 2. Add multiple rows for each file path containing all its directory paths in the new column 'elementPath'
+    data = add_element_path_column(data, 'filePath', 'elementPath')
+
+    if debug:
+        print("2. added elementPath --------------")
+        print(data)
+
+    # Group the files by their directory and count the number of files of each directory (across all levels).
+    common_named_aggregation = {
+        "absoluteAnomalyScore": pd.NamedAgg(column="absoluteAnomalyScore", aggfunc="mean"),
+        "normalizedAuthorityRank": pd.NamedAgg(column="normalizedAuthorityRank", aggfunc="max"),
+        "normalizedBottleneckRank": pd.NamedAgg(column="normalizedBottleneckRank", aggfunc="max"),
+        "normalizedBridgeRank": pd.NamedAgg(column="normalizedBridgeRank", aggfunc="max"),
+        "normalizedHubRank": pd.NamedAgg(column="normalizedHubRank", aggfunc="max"),
+        "normalizedOutlierRank": pd.NamedAgg(column="normalizedOutlierRank", aggfunc="max"),
+    }
+
+    data = data.groupby(['elementPath']).aggregate(
+        filePaths=pd.NamedAgg(column="filePath", aggfunc=np.unique),
+        firstFile=pd.NamedAgg(column="filePath", aggfunc="first"),
+        maxAnomalyScore=pd.NamedAgg(column="absoluteAnomalyScore", aggfunc="max"),
+        **common_named_aggregation
+    )
+
+    # Sort the grouped and aggregated entries by the name of the directory ascending and the anomaly score descending.
+    # The entry with the highest anomaly score will then be listed first for each directory.
+    data = data.sort_values(by=['elementPath', 'absoluteAnomalyScore'], ascending=[True, False])
+    data = data.reset_index()
+
+    if debug:
+        print("3. grouped by elementPath --------------")
+        print(data)
+
+    # Group the entries again now only by their directory path to get the aggregated number of anomalies and ranks.
+    data = data.groupby('elementPath').aggregate(
+        fileCount=pd.NamedAgg(column="filePaths", aggfunc=count_unique_aggregated_values),
+        firstFile=pd.NamedAgg(column="firstFile", aggfunc="first"),
+        maxAnomalyScore=pd.NamedAgg(column="maxAnomalyScore", aggfunc="max"),
+        **common_named_aggregation
+    )
+    data = data.reset_index()
+
+    if debug:
+        print("4. 
grouped by directory path --------------")
+        print(data)
+
+    # Add the name of the directory (last '/' separated element) and the parent directory path to the table.
+    data = add_element_name_column(data, 'elementPath', 'elementName')
+    data = add_parent_directory_column(data, 'elementPath', 'directoryParentPath')
+
+    if debug:
+        print("5. added directory and parent name --------------")
+        print(data)
+
+    # Finally, group by all columns except for the directory name, parent and path (first 3 columns) and pick the longest (max) directory path in case there are multiple.
+    all_column_names_except_for_the_directory_path = data.columns.to_list()[3:]
+    data = data.groupby(all_column_names_except_for_the_directory_path).aggregate(
+        elementName=pd.NamedAgg(column="elementName", aggfunc=lambda names: '/'.join(names)),
+        directoryParentPath=pd.NamedAgg(column="directoryParentPath", aggfunc="first"),
+        elementPath=pd.NamedAgg(column="elementPath", aggfunc="last"),
+    )
+
+    # Reorder the column positions so that the directory path is again the first column.
+    all_column_names_with_the_directory_path_first = ['elementPath', 'directoryParentPath', 'elementName'] + all_column_names_except_for_the_directory_path
+    data = data.reset_index()[all_column_names_with_the_directory_path_first]
+
+    if debug:
+        print("6. final grouping --------------")
+        print(data)
+        print("Statistics --------------")
+        print(data.describe())
+
+    return data
+
+
+def mutual_exclusive_ranks(data: pd.DataFrame) -> pd.DataFrame:
+    """
+    Modifies the input data frame to ensure that only one archetype rank is non-zero per row.
+    The archetype with the highest normalized rank is retained; the others are set to zero.
+    data : pd.DataFrame : The input data frame
+    return : pd.DataFrame : The modified data frame with mutually exclusive ranks
+    """
+    modified_data = data.copy()
+
+    for dataframe_index, row in modified_data.iterrows():
+        index = cast(int, dataframe_index)
+        max_rank_value = 0
+        max_rank_column = None
+
+        for column in archetype_columns:
+            if row[column] > max_rank_value:
+                max_rank_value = row[column]
+                max_rank_column = column
+
+        for column in archetype_columns:
+            if column != max_rank_column:
+                modified_data.at[index, column] = 0
+
+    return modified_data
+
+
+# ----------------------------------------
+# Archetypes
+# ----------------------------------------
+
+Archetypes = Literal["Authority", "Bottleneck", "Bridge", "Hub", "Outlier"]
+archetype_names: List[Archetypes] = ["Authority", "Bottleneck", "Bridge", "Hub", "Outlier"]
+
+def get_archetype_column_name(archetype: Archetypes) -> str:
+    """
+    Returns the column name for the given archetype.
+    archetype : Archetypes : The archetype name
+    return : str : The column name for the given archetype
+    """
+    return f"normalized{archetype}Rank"
+
+def get_archetype_index(archetype: Archetypes) -> int:
+    """
+    Returns the index of the given archetype.
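+    (The same index position selects the matching color pair from get_coloring_pairs() below.)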
+    archetype : Archetypes : The archetype name
+    return : int : The index of the given archetype
+    """
+    return archetype_names.index(archetype)
+
+archetype_columns = [get_archetype_column_name(name) for name in archetype_names]
+
+
+# ----------------------------------------
+# Archetype Coloring
+# ----------------------------------------
+
+Color = Tuple[int, int, int]  # RGB (red, green, blue) color tuple
+ColorPair = Tuple[Color, Color]  # Low and high color pair
+
+def interpolate_color(low: Color, high: Color, normalized_value: float) -> str:
+    """Linear interpolation between two RGB tuples; returns an rgb string."""
+
+    def linear_interpolation_of_color_component(color_component: int) -> int:
+        return int(low[color_component] + (high[color_component] - low[color_component]) * normalized_value)
+
+    red = linear_interpolation_of_color_component(0)
+    green = linear_interpolation_of_color_component(1)
+    blue = linear_interpolation_of_color_component(2)
+    return f"rgb({red},{green},{blue})"
+
+
+def get_rank_color(rank: float, low: Color, high: Color) -> str:
+    """Return white if the rank is zero (or negative), else interpolate between low and high."""
+    if rank <= 0:
+        return "rgb(255,255,255)"
+    return interpolate_color(low, high, rank)
+
+
+def combine_rank_colors(
+    dataframe: pd.DataFrame,
+    rank_columns: List[str],
+    color_pairs: List[ColorPair],
+) -> List[str]:
+    """Combine multiple ranks, using the color of the first nonzero rank value."""
+    combined: List[str] = []
+    for _, row in dataframe.iterrows():
+        color = "rgb(255,255,255)"
+        for rank_col, (low, high) in zip(rank_columns, color_pairs):
+            rank = row[rank_col]
+            if rank > 0:
+                color = get_rank_color(rank, low, high)
+                break
+        combined.append(color)
+    return combined
+
+
+def get_rank_color_for_archetype(dataframe: pd.DataFrame, archetype: Archetypes) -> List[str]:
+    """Get combined rank colors for a specific archetype."""
+    archetype_column_name = get_archetype_column_name(archetype)
+    coloring_pair = get_coloring_pairs()[get_archetype_index(archetype)]
+    return combine_rank_colors(dataframe, [archetype_column_name], [coloring_pair])
+
+
+def get_coloring_pairs() -> List[ColorPair]:
+    """Define the coloring scheme (low and high color) for each archetype."""
+    assert len(archetype_names) == 5, "Expected exactly 5 archetypes."
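+    # Each pair provides the (low, high) RGB endpoints for interpolate_color:
+    # ranks near zero map to the light tone, a rank of 1.0 maps to the dark tone.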
+    return [
+        ((222, 235, 247), (33, 113, 181)),  # Authority: blue shades
+        ((254, 230, 206), (217, 72, 1)),    # Bottleneck: orange shades
+        ((239, 237, 245), (106, 81, 163)),  # Bridge: purple shades
+        ((254, 224, 210), (165, 15, 21)),   # Hub: red shades
+        ((240, 240, 240), (82, 82, 82)),    # Outlier: gray shades
+    ]
+
+# ----------------------------------------
+# Data query
+# ----------------------------------------
+
+def query_data() -> pd.DataFrame:
+    query: LiteralString = """
+        MATCH (anomalyScoreStats:File&!Directory&!Archive)
+        WHERE anomalyScoreStats.anomalyScore < 0
+        ORDER BY anomalyScoreStats.anomalyScore ASCENDING
+        LIMIT 150 // collect the 150 most negative anomaly scores; the last one serves as the threshold
+        WITH collect(anomalyScoreStats.anomalyScore)[-1] AS anomalyScoreThreshold
+        MATCH (anomalyRankStats:File&!Directory&!Archive)
+        WITH anomalyScoreThreshold
+            ,max(anomalyRankStats.anomalyAuthorityRank) AS maxAnomalyAuthorityRank
+            ,max(anomalyRankStats.anomalyBottleneckRank) AS maxAnomalyBottleneckRank
+            ,max(anomalyRankStats.anomalyBridgeRank) AS maxAnomalyBridgeRank
+            ,max(anomalyRankStats.anomalyHubRank) AS maxAnomalyHubRank
+            ,max(anomalyRankStats.anomalyOutlierRank) AS maxAnomalyOutlierRank
+        MATCH (anomalous:File&!Directory&!Archive)
+        WHERE (anomalous.anomalyScore < anomalyScoreThreshold
+            OR anomalous.anomalyHubRank IS NOT NULL
+            OR anomalous.anomalyAuthorityRank IS NOT NULL
+            OR anomalous.anomalyBottleneckRank IS NOT NULL
+            OR anomalous.anomalyOutlierRank IS NOT NULL
+            OR anomalous.anomalyBridgeRank IS NOT NULL)
+        OPTIONAL MATCH (project:Artifact|Project)-[:CONTAINS]->(anomalous)
+        WITH *
+            ,coalesce(project.name + '/', '') AS projectName
+            ,coalesce(anomalous.fileName, anomalous.relativePath) AS fileName
+        RETURN replace(projectName + fileName, '//', '/') AS filePath
+            ,CASE WHEN anomalous.anomalyScore < 0 THEN abs(anomalous.anomalyScore) ELSE 0 END AS absoluteAnomalyScore
+            ,coalesce(toFloat(anomalous.anomalyAuthorityRank) / maxAnomalyAuthorityRank, 0) AS normalizedAuthorityRank
+            ,coalesce(toFloat(anomalous.anomalyBottleneckRank) / maxAnomalyBottleneckRank, 0) AS normalizedBottleneckRank
+            ,coalesce(toFloat(anomalous.anomalyBridgeRank) / maxAnomalyBridgeRank, 0) AS normalizedBridgeRank
+            ,coalesce(toFloat(anomalous.anomalyHubRank) / maxAnomalyHubRank, 0) AS normalizedHubRank
+            ,coalesce(toFloat(anomalous.anomalyOutlierRank) / maxAnomalyOutlierRank, 0) AS normalizedOutlierRank
+        ORDER BY filePath ASCENDING
+    """
+    return query_cypher_to_data_frame(query)
+
+
+# ------------------------------------------------------------------------------------------------------------
+# MAIN
+# ------------------------------------------------------------------------------------------------------------
+
+
+parameters = parse_input_parameters()
+title_prefix = parameters.get_title_prefix()
+driver = get_graph_database_driver()
+
+print(f"treemapVisualizations: Querying {title_prefix} data for treemap visualization...")
+anomaly_file_paths = query_data()
+
+print(f"treemapVisualizations: Preparing {title_prefix} data for treemap visualization...")
+anomaly_file_paths = prepare_data_for_treemap(anomaly_file_paths)
+
+# --- Visualizing Anomaly Scores
+
+print(f"treemapVisualizations: Creating {title_prefix} anomaly scores treemap visualization...")
+figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
+    create_treemap_settings(anomaly_file_paths),
+    marker=dict(
+        **plotly_treemap_marker_base_color_scale,
+        colors=anomaly_file_paths['absoluteAnomalyScore'],
+        colorbar={"title": "score"},
+    ),
+))
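+# Each treemap tile represents a directory. Its color encodes the mean absolute
+# anomaly score of its files, as aggregated in prepare_data_for_treemap above.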
+figure.update_layout( + **plotly_treemap_layout_base_settings, # type: ignore + title=f'Average {title_prefix} anomaly score per directory', +) +figure.write_image(**get_plotly_figure_write_image_settings(f"{title_prefix}Treemap1AverageAnomalyScorePerDirectory", parameters.get_report_directory())) + +# --- Visualizing Archetypes + +print(f"treemapVisualizations: Creating {title_prefix} archetypes overview treemap visualization...") +mutual_exclusive_archetype_ranks_data = mutual_exclusive_ranks(anomaly_file_paths) + +coloring_pairs = get_coloring_pairs() +combined_colors = combine_rank_colors(mutual_exclusive_archetype_ranks_data, archetype_columns, coloring_pairs) + +figure = plotly_graph_objects.Figure() + +figure.add_trace(plotly_graph_objects.Treemap( + create_treemap_settings(mutual_exclusive_archetype_ranks_data), + marker=dict( + **plotly_treemap_marker_base_style, + line={"width": 1, "color": "black"}, + showscale=False, + colors=combined_colors, + ), + name="Anomalies", + opacity=0.8 +)) + +# Add dummy scatter traces for legend +for name, (low, high) in zip(archetype_names, coloring_pairs): + bright_color = interpolate_color(low, high, 0.4) # light tone for legend filling + dark_color = interpolate_color(low, high, 1.0) # darkest tone for legend outline + figure.add_trace(plotly_graph_objects.Scatter( + x=[None], + y=[None], + mode="markers", + marker={"size": 12, "color": bright_color, "line": {"width": 2, "color": dark_color}}, + name=name, + legendgroup=name, + showlegend=True, + )) + +figure.update_layout( + **plotly_treemap_layout_base_settings, # type: ignore + title=f'Overview of all {title_prefix} anomaly archetypes per directory', + legend={ + "orientation": "h", # horizontal legend + "yanchor": "bottom", + "y": -0.12, + "xanchor": "center", + "x": 0.5 + } +) +figure.update_xaxes(visible=False) +figure.update_yaxes(visible=False) +figure.write_image(**get_plotly_figure_write_image_settings(f"{title_prefix}Treemap2ArchetypesOverviewPerDirectory", parameters.get_report_directory())) + +# --- Visualizing Archetypes individually + +def plot_single_archetype_treemap(archetype: Archetypes, title_prefix: str, file_index: int, data: pd.DataFrame): + """ + Plots a treemap for the given archetype using the provided data. 
+    archetype : Archetypes : The archetype to plot
+    title_prefix : str : The prefix used for the chart title and the report file name
+    file_index : int : The sequential number used in the report file name
+    data : pd.DataFrame : The input data frame
+    """
+    print(f"treemapVisualizations: Creating {title_prefix} archetype '{archetype}' treemap visualization...")
+    data_to_display = data.copy()
+    data_to_display = data_to_display[data_to_display[archetype_columns].sum(axis=1) > 0]
+
+    combined_colors = get_rank_color_for_archetype(data_to_display, archetype)
+
+    figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
+        create_treemap_settings(data_to_display),
+        marker=dict(
+            **plotly_treemap_marker_base_style,
+            colors=combined_colors,
+            line={"width": 1, "color": "black"},
+            colorbar={"title": "rank"},
+        ),
+    ))
+    figure.update_layout(
+        **plotly_treemap_layout_base_settings,  # type: ignore
+        title=f'{title_prefix} Archetype "{archetype}" per directory',
+    )
+    figure.write_image(**get_plotly_figure_write_image_settings(f"{title_prefix}Treemap{file_index}Archetype{archetype}PerDirectory", parameters.get_report_directory()))
+
+plot_single_archetype_treemap("Authority", title_prefix, 3, anomaly_file_paths)
+plot_single_archetype_treemap("Bottleneck", title_prefix, 4, anomaly_file_paths)
+plot_single_archetype_treemap("Bridge", title_prefix, 5, anomaly_file_paths)
+plot_single_archetype_treemap("Hub", title_prefix, 6, anomaly_file_paths)
+plot_single_archetype_treemap("Outlier", title_prefix, 7, anomaly_file_paths)
+
+driver.close()
+print("treemapVisualizations: Successfully created treemap visualizations.")
\ No newline at end of file

From a44c6459053b94317ebbe5369b6a3249751250d3 Mon Sep 17 00:00:00 2001
From: JohT <7671054+JohT@users.noreply.github.com>
Date: Fri, 7 Nov 2025 21:04:33 +0100
Subject: [PATCH 2/2] Add Treemap charts to anomaly detection summary

---
 domains/anomaly-detection/summary/anomalyDetectionSummary.sh  | 4 ++++
 domains/anomaly-detection/summary/report.template.md          | 4 ++++
 .../summary/report_no_anomaly_detection_treemaps.template.md  | 1 +
 3 files changed, 9 insertions(+)
 create mode 100644 domains/anomaly-detection/summary/report_no_anomaly_detection_treemaps.template.md

diff --git a/domains/anomaly-detection/summary/anomalyDetectionSummary.sh b/domains/anomaly-detection/summary/anomalyDetectionSummary.sh
index a7659df03..6335c8b9f 100755
--- a/domains/anomaly-detection/summary/anomalyDetectionSummary.sh
+++ b/domains/anomaly-detection/summary/anomalyDetectionSummary.sh
@@ -177,6 +177,10 @@ anomaly_detection_finalize_report() {
     # Remove empty Markdown includes
     source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${report_include_directory}"
 
+    # Collect static Markdown includes (after the cleanup, so that the one-line template is not removed as an empty include)
+    cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report_no_anomaly_detection_treemaps.template.md" "${report_include_directory}/report_no_anomaly_detection_treemaps.md"
+
+    # Assemble the final report by applying the includes to the main template
     cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report.template.md" "${FULL_REPORT_DIRECTORY}/report.template.md"
     cat "${FULL_REPORT_DIRECTORY}/report.template.md" | "${MARKDOWN_SCRIPTS_DIR}/embedMarkdownIncludes.sh" "${report_include_directory}" > "${FULL_REPORT_DIRECTORY}/anomaly_detection_report.md"
 
diff --git a/domains/anomaly-detection/summary/report.template.md b/domains/anomaly-detection/summary/report.template.md
index 0f846235c..59275eb46 100644
--- a/domains/anomaly-detection/summary/report.template.md
+++ b/domains/anomaly-detection/summary/report.template.md
@@ -26,6 +26,10 @@ The goal is to detect potential **software quality, design, and architecture iss
 
+
+### 1.3 Overview Charts
+
+
+
 ## 2. Deep Dives by Abstraction Level
 
 Each abstraction level includes anomaly statistics, SHAP feature importance, archetype distribution, and example anomalies.
diff --git a/domains/anomaly-detection/summary/report_no_anomaly_detection_treemaps.template.md b/domains/anomaly-detection/summary/report_no_anomaly_detection_treemaps.template.md
new file mode 100644
index 000000000..26df86f3a
--- /dev/null
+++ b/domains/anomaly-detection/summary/report_no_anomaly_detection_treemaps.template.md
@@ -0,0 +1 @@
+⚠️ _No anomaly detection treemap charts due to missing data._