diff --git a/domains/anomaly-detection/anomalyDetectionPython.sh b/domains/anomaly-detection/anomalyDetectionPython.sh
index 80c3c1109..f6f661814 100755
--- a/domains/anomaly-detection/anomalyDetectionPython.sh
+++ b/domains/anomaly-detection/anomalyDetectionPython.sh
@@ -15,12 +15,14 @@ set -o errexit -o pipefail
# Overrideable Constants (defaults also defined in sub scripts)
REPORTS_DIRECTORY=${REPORTS_DIRECTORY:-"reports"}
+MARKDOWN_INCLUDES_DIRECTORY=${MARKDOWN_INCLUDES_DIRECTORY:-"includes"} # Subdirectory that contains Markdown files to be included by the Markdown template for the report.
+
## Get this "scripts/reports" directory if not already set
# Even if $BASH_SOURCE is made for Bourne-like shells it is also supported by others and therefore here the preferred solution.
# CDPATH reduces the scope of the cd command to potentially prevent unintended directory changes.
# This way non-standard tools like readlink aren't needed.
ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR:-$(CDPATH=. cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd -P)}
-echo "anomalyDetectionPipeline: ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR}"
+echo "anomalyDetectionPython: ANOMALY_DETECTION_SCRIPT_DIR=${ANOMALY_DETECTION_SCRIPT_DIR}"
# Get the "scripts" directory by taking the path of this script and going one directory up.
SCRIPTS_DIR=${SCRIPTS_DIR:-"${ANOMALY_DETECTION_SCRIPT_DIR}/../../scripts"} # Repository directory containing the shell scripts
# Get the "cypher" query directory for gathering features.
@@ -49,7 +51,7 @@ while [[ $# -gt 0 ]]; do
verboseMode="--verbose"
;;
*)
- echo -e "${COLOR_ERROR}anomalyDetectionPipeline: Error: Unknown option: ${key}${COLOR_DEFAULT}" >&2
+ echo -e "${COLOR_ERROR}anomalyDetectionPython: Error: Unknown option: ${key}${COLOR_DEFAULT}" >&2
usage
;;
esac
@@ -72,10 +74,10 @@ is_sufficient_data_available() {
query_result=$( execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionNodeCount.cypher" "${@}" )
node_count=$(get_csv_column_value "${query_result}" "node_count")
if [ "${node_count}" -lt 15 ]; then
- echo "anomalyDetectionPipeline: Warning: Skipping anomaly detection. Only ${node_count} ${language} ${nodeLabel} nodes. At least 15 required."
+ echo "anomalyDetectionPython: Warning: Skipping anomaly detection. Only ${node_count} ${language} ${nodeLabel} nodes. At least 15 required."
false
else
- echo "anomalyDetectionPipeline: Info: Running anomaly detection with ${node_count} ${language} ${nodeLabel} nodes."
+ echo "anomalyDetectionPython: Info: Running anomaly detection with ${node_count} ${language} ${nodeLabel} nodes."
true
fi
}
@@ -92,7 +94,7 @@ is_sufficient_data_available() {
anomaly_detection_features() {
local nodeLabel
nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
- echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Collecting features for ${nodeLabel} nodes..."
+ echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Collecting features for ${nodeLabel} nodes..."
# Determine the Betweenness centrality (with the directed graph projection) if not already done
execute_cypher_queries_until_results "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeature-Betweenness-Exists.cypher" \
@@ -127,7 +129,7 @@ anomaly_detection_using_python() {
local language
language=$( extractQueryParameter "projection_language" "${@}" )
- echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Python scripts for ${language} ${nodeLabel} nodes..."
+ echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Python scripts for ${language} ${nodeLabel} nodes..."
# Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...)
local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
@@ -152,6 +154,8 @@ anomaly_detection_using_python() {
# Required Parameters:
# - projection_node_label=...
# Label of the nodes that will be used for the projection. Example: "Package"
+# - projection_language=...
+# Name of the associated programming language. Examples: "Java", "Typescript"
anomaly_detection_labels() {
local nodeLabel
nodeLabel=$( extractQueryParameter "projection_node_label" "${@}" )
@@ -177,12 +181,63 @@ anomaly_detection_labels() {
# Label of the nodes that will be used for the projection. Example: "Package"
# - projection_weight_property=...
# Name of the node property that contains the dependency weight. Example: "weight"
+# - projection_language=...
+# Name of the associated programming language. Examples: "Java", "Typescript"
anomaly_detection_python_reports() {
time anomaly_detection_features "${@}"
anomaly_detection_using_python "${@}"
time anomaly_detection_labels "${@}"
}
+# Creates the markdown file (to be included in the main summary)
+# that contains the references to all treemap charts.
+anomaly_detection_treemap_charts_markdown_reference() {
+
+ echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Starting treemap charts markdown reference generation..."
+
+ local detail_report_include_directory="${FULL_REPORT_DIRECTORY}/${MARKDOWN_INCLUDES_DIRECTORY}"
+ mkdir -p "${detail_report_include_directory}"
+
+ local markdown_reference_file_name="TreemapChartsReference.md"
+ local markdown_reference_file="${detail_report_include_directory}/${markdown_reference_file_name}"
+
+ # Write markdown references section title
+ {
+ echo "#### Treemap Charts"
+ } > "${markdown_reference_file}"
+
+ # Find all treemap chart SVG files and add them to the markdown reference file
+ find "${FULL_REPORT_DIRECTORY}" -type f -name "*Treemap*.svg" | sort | while read -r chart_file; do
+ chart_filename=$(basename -- "${chart_file}")
+ chart_filename_without_extension="${chart_filename%.*}" # Remove file extension
+        chart_file_relative_path="${chart_file#"${FULL_REPORT_DIRECTORY}/"}" # Chart path relative to the final report location
+        {
+            echo ""
+            echo "![${chart_filename_without_extension}](./${chart_file_relative_path})"
+        } >> "${markdown_reference_file}"
+ done
+
+ # Add a horizontal rule at the end
+ {
+ echo ""
+ echo "---"
+ } >> "${markdown_reference_file}"
+
+ echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Finished treemap charts markdown reference generation..."
+}
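+
+# Sketch of the generated include file content (illustrative; actual chart file names depend on the analyzed data):
+#
+#   #### Treemap Charts
+#
+#   ![Java_Type_TreemapAnomalyScore](./Java_Type_TreemapAnomalyScore.svg)
+#
+#   ---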
+
+# Visualize results with treemap charts.
+#
+# Required Parameters:
+# - projection_language=...
+# Name of the associated programming language. Examples: "Java", "Typescript"
+anomaly_detection_treemap_charts() {
+ local language
+ language=$( extractQueryParameter "projection_language" "${@}" )
+
+ echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Visualizing ${language} results..."
+ time "${ANOMALY_DETECTION_SCRIPT_DIR}/treemapVisualizations.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode}
+}
+
# Create report directory
REPORT_NAME="anomaly-detection"
FULL_REPORT_DIRECTORY="${REPORTS_DIRECTORY}/${REPORT_NAME}"
@@ -229,6 +284,7 @@ if is_sufficient_data_available "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=we
if createUndirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection"; then
createDirectedJavaTypeDependencyProjection "${PROJECTION_NAME}=type-anomaly-detection-directed"
anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=type-anomaly-detection" "${ALGORITHM_NODE}=Type" "${ALGORITHM_WEIGHT}=weight" "${ALGORITHM_LANGUAGE}=Java" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
+ anomaly_detection_treemap_charts "${ALGORITHM_LANGUAGE}=Java"
fi
fi
@@ -238,12 +294,17 @@ if is_sufficient_data_available "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=
if createUndirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" "${PROJECTION_LANGUAGE}=Typescript"; then
createDirectedDependencyProjection "${PROJECTION_NAME}=typescript-module-embedding-directed" "${PROJECTION_NODE}=Module" "${PROJECTION_WEIGHT}=lowCouplingElement25PercentWeight" "${PROJECTION_LANGUAGE}=Typescript"
anomaly_detection_python_reports "${ALGORITHM_PROJECTION}=typescript-module-embedding" "${ALGORITHM_NODE}=Module" "${ALGORITHM_WEIGHT}=lowCouplingElement25PercentWeight" "${ALGORITHM_LANGUAGE}=Typescript" "${COMMUNITY_PROPERTY}" "${EMBEDDING_PROPERTY}"
+        anomaly_detection_treemap_charts "${ALGORITHM_LANGUAGE}=Typescript"
fi
fi
+# -- Markdown summary ---------------------------
+
+anomaly_detection_treemap_charts_markdown_reference
+
# ---------------------------------------------------------------
# Clean-up after report generation. Empty reports will be deleted.
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}"
-echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished."
\ No newline at end of file
+echo "anomalyDetectionPython: $(date +'%Y-%m-%dT%H:%M:%S%z') Successfully finished."
\ No newline at end of file
diff --git a/domains/anomaly-detection/explore/AnomalyDetectionTreeMapExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionTreeMapExploration.ipynb
new file mode 100644
index 000000000..79f073eae
--- /dev/null
+++ b/domains/anomaly-detection/explore/AnomalyDetectionTreeMapExploration.ipynb
@@ -0,0 +1,814 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "2f0eabc4",
+ "metadata": {},
+ "source": [
+ "# Anomaly Detection - TreeMap Exploration\n",
+ "\n",
+ "This notebook demonstrates how to visualize anomalies with Treemap charts for static code analysis data using jQAssistant and Neo4j. \n",
+ "\n",
+    "\n",
+ "\n",
+ "### References\n",
+ "- [jqassistant](https://jqassistant.org)\n",
+ "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)\n",
+ "- [Plotly Treemap Chart](https://plotly.com/python/treemaps/)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4191f259",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import typing\n",
+ "from typing import List, Tuple\n",
+ "\n",
+ "from IPython.display import display\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "from plotly import graph_objects as plotly_graph_objects\n",
+ "import plotly.colors as plotly_colors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a0676813",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# The following cell uses the built-in %%html \"magic\" to override the CSS style for tables to a much smaller size.\n",
+    "# This is especially needed for PDF export of tables with multiple columns."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ebac1bb9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%html\n",
+    "<style>\n",
+    "table { font-size: 0.8em; }\n",
+    "</style>"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c9bc2241",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pandas DataFrame Display Configuration\n",
+ "pd.set_option('display.max_colwidth', 500)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f8ef41ff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sys import version as python_version\n",
+ "print('Python version: {}'.format(python_version))\n",
+ "\n",
+ "from numpy import __version__ as numpy_version\n",
+ "print('numpy version: {}'.format(numpy_version))\n",
+ "\n",
+ "from pandas import __version__ as pandas_version\n",
+ "print('pandas version: {}'.format(pandas_version))\n",
+ "\n",
+ "from neo4j import __version__ as neo4j_version\n",
+ "print('neo4j version: {}'.format(neo4j_version))\n",
+ "\n",
+ "from plotly import version as plotly_version\n",
+ "print('plotly version: {}'.format(plotly_version))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1c5dab37",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell\n",
+    "# before starting Jupyter Notebook to provide the password for the user \"neo4j\".\n",
+    "# It is not recommended to hardcode the password into the notebook for security reasons.\n",
+ "from neo4j import GraphDatabase, Driver\n",
+ "\n",
+ "def get_graph_database_driver() -> Driver:\n",
+ " driver = GraphDatabase.driver(\n",
+ " uri=\"bolt://localhost:7687\",\n",
+ " auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")) # type: ignore\n",
+ " )\n",
+ " driver.verify_connectivity()\n",
+ " return driver"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c1db254b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def query_cypher_to_data_frame(query: typing.LiteralString, parameters: typing.Optional[typing.Dict[str, typing.Any]] = None):\n",
+    "    records, _, keys = driver.execute_query(query, parameters_=parameters)\n",
+ " return pd.DataFrame([record.values() for record in records], columns=keys)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7cf0993d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Base settings for Plotly Treemap\n",
+ "\n",
+ "plotly_main_layout_base_settings = {\n",
+ " \"margin\": {\"t\": 50, \"l\": 15, \"r\": 15, \"b\": 15},\n",
+ "}\n",
+ "plotly_treemap_layout_base_settings = dict(\n",
+ " **plotly_main_layout_base_settings\n",
+ ")\n",
+ "plotly_bar_layout_base_settings = dict(\n",
+ " **plotly_main_layout_base_settings\n",
+ ")\n",
+ "plotly_treemap_figure_show_settings = {\n",
+ " \"renderer\": None,\n",
+ " \"width\": 1080,\n",
+ " \"height\": 1080,\n",
+ "}\n",
+ "\n",
+ "plotly_treemap_marker_base_style = {\n",
+ " \"cornerradius\": 5,\n",
+ "}\n",
+ "\n",
+ "# Hot_r, ice_r, Viridis_r, speed_r, haline_r, thermal_r, Plasma_r, solar_r, Electric_r, Blackbody_r, deep_r, Turbo_r, amp, Reds, Blackbody_r, RdGy_r, RdBu_r\n",
+ "plotly_treemap_marker_base_color_scale = dict(\n",
+ " **plotly_treemap_marker_base_style,\n",
+ " colorscale='Hot_r',\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "30a0b4e6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def create_treemap_settings(data_frame: pd.DataFrame, element_path_column: str = 'elementPath', element_name_column: str = \"elementName\") -> plotly_graph_objects.Treemap:\n",
+ " \"\"\"\n",
+ " Creates a Plotly Treemap with the given settings and data frame.\n",
+ " data_frame : pd.DataFrame : The input data frame\n",
+ " return :plotly_graph_objects.Treemap : The prepared Plotly Treemap\n",
+ " \"\"\"\n",
+ " return plotly_graph_objects.Treemap(\n",
+ " labels=data_frame[element_name_column],\n",
+ " parents=data_frame['directoryParentPath'],\n",
+ " ids=data_frame[element_path_column],\n",
+ " customdata=data_frame[['fileCount', 'absoluteAnomalyScore', 'normalizedAuthorityRank', 'normalizedBottleneckRank', 'normalizedBridgeRank', 'normalizedHubRank', 'normalizedOutlierRank', 'elementPath']],\n",
+    "        hovertemplate='%{label}<br>Highlighted anomalies: %{customdata[0]}<br>Anomaly Score: %{customdata[1]:.4f}<br>Authority: %{customdata[2]}, Bottleneck: %{customdata[3]}, Bridge: %{customdata[4]}, Hub: %{customdata[5]}, Outlier: %{customdata[6]}<br>Path: %{customdata[7]}',\n",
+ " maxdepth=-1,\n",
+ " root_color=\"lightgrey\",\n",
+ " marker=dict(**plotly_treemap_marker_base_style),\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fc84a742",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def remove_last_path_file_extension(file_path_elements: list) -> list:\n",
+ " \"\"\"\n",
+ " Removes the file extension of the last element of the file path so that only the file name remains.\n",
+ " file_path_elements : list : The list of file path elements where the last one contains the file name with extension\n",
+ " return : list : The list of the directories + the file name without extension as last element.\n",
+ " \"\"\"\n",
+ " if not file_path_elements:\n",
+ " return ['']\n",
+ " if len(file_path_elements) == 1:\n",
+ " return [os.path.splitext(file_path_elements[0])[0]]\n",
+ " return file_path_elements[:-1] + [os.path.splitext(file_path_elements[-1])[0]]\n",
+ "\n",
+ "def join_path_elements(file_path_elements: list) -> list:\n",
+ " \"\"\"\n",
+ " Joins the file path elements (and removes the file extension).\n",
+ " file_path_elements : list : The list of levels to convert\n",
+ " return : list : The list of directories\n",
+ " \"\"\"\n",
+ " prepared_path_elements = remove_last_path_file_extension(file_path_elements)\n",
+ " return ['/'.join(prepared_path_elements[:i+1]) for i in range(len(prepared_path_elements))]\n",
+ "\n",
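+    "# Worked example (illustrative, not part of the original notebook):\n",
+    "# join_path_elements(['org', 'example', 'Main.java'])\n",
+    "# returns ['org', 'org/example', 'org/example/Main'].\n",
+    "\n",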
+ "def add_element_path_column(input_dataframe: pd.DataFrame, file_path_column: str, element_path_column: str = 'elementPath'):\n",
+ " \"\"\"\n",
+    "    Adds an element path column to the input DataFrame based on the file path column.\n",
+    "    input_dataframe : pd.DataFrame : The input DataFrame\n",
+    "    file_path_column : str : The name of the file path column\n",
+    "    element_path_column : str : The name of the element path column to be added\n",
+    "    return : pd.DataFrame : The DataFrame with the added element path column\n",
+ " \"\"\"\n",
+ " if element_path_column in input_dataframe.columns:\n",
+ " return input_dataframe # Column already exists\n",
+ " \n",
+ " input_dataframe.insert(0, element_path_column, input_dataframe[file_path_column].str.split('/').apply(join_path_elements))\n",
+ " input_dataframe = input_dataframe.explode(element_path_column)\n",
+ " return input_dataframe\n",
+ "\n",
+ "def add_element_name_column(input_dataframe: pd.DataFrame, element_path_column: str = 'elementPath', element_name_column: str = 'elementName'):\n",
+ " \"\"\"\n",
+    "    Adds an element name column to the input DataFrame based on the element path column.\n",
+    "    input_dataframe : pd.DataFrame : The input DataFrame\n",
+    "    element_path_column : str : The name of the element path column\n",
+    "    element_name_column : str : The name of the element name column to be added\n",
+    "    return : pd.DataFrame : The DataFrame with the added element name column\n",
+ " \"\"\"\n",
+ " if element_name_column in input_dataframe.columns:\n",
+ " return input_dataframe # Column already exists\n",
+ " \n",
+ " splitted_directories = input_dataframe[element_path_column].str.rsplit('/', n=1)\n",
+ " input_dataframe.insert(1, element_name_column, splitted_directories.apply(lambda x: (x[-1])))\n",
+ " return input_dataframe\n",
+ "\n",
+ "def add_parent_directory_column(input_dataframe: pd.DataFrame, element_path_column: str = 'elementPath', directory_parent_column: str = 'directoryParentPath'):\n",
+ " \"\"\"\n",
+    "    Adds a directory parent column to the input DataFrame based on the element path column.\n",
+    "    input_dataframe : pd.DataFrame : The input DataFrame\n",
+    "    element_path_column : str : The name of the element path column\n",
+    "    directory_parent_column : str : The name of the directory parent column to be added\n",
+    "    return : pd.DataFrame : The DataFrame with the added directory parent column\n",
+ " \"\"\"\n",
+ " if directory_parent_column in input_dataframe.columns:\n",
+ " return input_dataframe # Column already exists\n",
+ " \n",
+ " # Remove last path element from directory_column to get the directory_parent_column\n",
+ " splitted_directories = input_dataframe[element_path_column].str.rsplit('/', n=1)\n",
+ " input_dataframe.insert(1, directory_parent_column, splitted_directories.apply(lambda x: (x[0])))\n",
+ " \n",
+    "    # Clear the parent (set it to an empty string) when it is equal to the directory itself\n",
+ " input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[element_path_column], directory_parent_column] = ''\n",
+ " return input_dataframe\n",
+ "\n",
+ "def count_unique_aggregated_values(values: pd.Series):\n",
+ " \"\"\"\n",
+ " Return the number of unique values from an array of array of strings.\n",
+ " Meant to be used as an aggregation function for dataframe grouping.\n",
+ " values : Series : The pandas Series of values\n",
+ " return : int : The number of files\n",
+ " \"\"\"\n",
+ " return len(np.unique(np.concatenate(values.to_list())))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "04166d63",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "Archetypes = typing.Literal[\"Authority\", \"Bottleneck\", \"Bridge\", \"Hub\", \"Outlier\"]\n",
+ "archetype_names: List[Archetypes] = [\"Authority\", \"Bottleneck\", \"Bridge\", \"Hub\", \"Outlier\"]\n",
+ "\n",
+ "def get_archetype_column_name(archetype: Archetypes) -> str:\n",
+ " \"\"\"\n",
+ " Returns the column name for the given archetype.\n",
+ " archetype : Archetypes : The archetype name\n",
+ " return : str : The column name for the given archetype\n",
+ " \"\"\"\n",
+ " return f\"normalized{archetype}Rank\"\n",
+ "\n",
+ "def get_archetype_index(archetype: Archetypes) -> int:\n",
+ " \"\"\"\n",
+ " Returns the index of the given archetype.\n",
+ " archetype : Archetypes : The archetype name\n",
+ " return : int : The index of the given archetype\n",
+ " \"\"\"\n",
+ " return archetype_names.index(archetype)\n",
+ "\n",
+ "archetype_columns = [get_archetype_column_name(name) for name in archetype_names]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1e4caf24",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def query_data() -> pd.DataFrame:\n",
+ " query: typing.LiteralString = \"\"\"\n",
+ " MATCH (anomalyScoreStats:File&!Directory&!Archive)\n",
+ " WHERE anomalyScoreStats.anomalyScore < 0\n",
+ " ORDER BY anomalyScoreStats.anomalyScore ASCENDING\n",
+ " LIMIT 150 // n largest negative anomaly scores as threshold\n",
+ " WITH collect(anomalyScoreStats.anomalyScore)[-1] AS anomalyScoreThreshold\n",
+ " MATCH (anomalyRankStats:File&!Directory&!Archive)\n",
+ " WITH anomalyScoreThreshold\n",
+ " ,max(anomalyRankStats.anomalyAuthorityRank) AS maxAnomalyAuthorityRank\n",
+ " ,max(anomalyRankStats.anomalyBottleneckRank) AS maxAnomalyBottleneckRank\n",
+ " ,max(anomalyRankStats.anomalyBridgeRank) AS maxAnomalyBridgeRank\n",
+ " ,max(anomalyRankStats.anomalyHubRank) AS maxAnomalyHubRank\n",
+ " ,max(anomalyRankStats.anomalyOutlierRank) AS maxAnomalyOutlierRank\n",
+ " MATCH (anomalous:File&!Directory&!Archive)\n",
+ " WHERE (anomalous.anomalyScore < anomalyScoreThreshold\n",
+ " OR anomalous.anomalyHubRank IS NOT NULL\n",
+ " OR anomalous.anomalyAuthorityRank IS NOT NULL\n",
+ " OR anomalous.anomalyBottleneckRank IS NOT NULL\n",
+ " OR anomalous.anomalyOutlierRank IS NOT NULL\n",
+ " OR anomalous.anomalyBridgeRank IS NOT NULL)\n",
+ " OPTIONAL MATCH (project:Artifact|Project)-[:CONTAINS]->(anomalous)\n",
+ " WITH *\n",
+ " ,coalesce(project.name + '/', '') AS projectName\n",
+ " ,coalesce(anomalous.fileName, anomalous.relativePath) AS fileName\n",
+ " RETURN replace(projectName + fileName, '//', '/') AS filePath\n",
+ " ,CASE WHEN anomalous.anomalyScore < 0 THEN abs(anomalous.anomalyScore) ELSE 0 END AS absoluteAnomalyScore\n",
+ " ,coalesce(toFloat(anomalous.anomalyAuthorityRank) / maxAnomalyAuthorityRank, 0) AS normalizedAuthorityRank\n",
+ " ,coalesce(toFloat(anomalous.anomalyBottleneckRank) / maxAnomalyBottleneckRank, 0) AS normalizedBottleneckRank\n",
+ " ,coalesce(toFloat(anomalous.anomalyBridgeRank) / maxAnomalyBridgeRank, 0) AS normalizedBridgeRank\n",
+    "        ,coalesce(toFloat(anomalous.anomalyHubRank) / maxAnomalyHubRank, 0) AS normalizedHubRank\n",
+ " ,coalesce(toFloat(anomalous.anomalyOutlierRank) / maxAnomalyOutlierRank, 0) AS normalizedOutlierRank\n",
+ " ORDER BY filePath ASCENDING\n",
+ " \"\"\"\n",
+ " return query_cypher_to_data_frame(query)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "01e51a6a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def prepare_data_for_treemap(data: pd.DataFrame, debug: bool = False) -> pd.DataFrame:\n",
+ " if debug:\n",
+ " display(\"1. query result ---------------------\")\n",
+ " display(data)\n",
+ "\n",
+ " # 3. Add multiple rows for each file path containing all its directories paths in the new column 'elementPath'\n",
+ " data = add_element_path_column(data, 'filePath', 'elementPath')\n",
+ "\n",
+ " if debug:\n",
+ " display(\"3. added elementPath --------------\")\n",
+ " display(data)\n",
+ "\n",
+ " # Group the files by their directory and count the number of files of each directory (across all levels).\n",
+ " common_named_aggregation = dict(\n",
+ " absoluteAnomalyScore=pd.NamedAgg(column=\"absoluteAnomalyScore\", aggfunc=\"mean\"),\n",
+ " normalizedAuthorityRank=pd.NamedAgg(column=\"normalizedAuthorityRank\", aggfunc=\"max\"),\n",
+ " normalizedBottleneckRank=pd.NamedAgg(column=\"normalizedBottleneckRank\", aggfunc=\"max\"),\n",
+ " normalizedBridgeRank=pd.NamedAgg(column=\"normalizedBridgeRank\", aggfunc=\"max\"),\n",
+ " normalizedHubRank=pd.NamedAgg(column=\"normalizedHubRank\", aggfunc=\"max\"),\n",
+ " normalizedOutlierRank=pd.NamedAgg(column=\"normalizedOutlierRank\", aggfunc=\"max\"),\n",
+ " )\n",
+ "\n",
+ " data = data.groupby(['elementPath']).aggregate(\n",
+ " filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
+ " firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n",
+ " maxAnomalyScore=pd.NamedAgg(column=\"absoluteAnomalyScore\", aggfunc=\"max\"),\n",
+ " **common_named_aggregation\n",
+ " )\n",
+ "\n",
+ " # Sort the grouped and aggregated entries by the name of the directory ascending and the anomaly score descending.\n",
+    "    # Sort the grouped and aggregated entries by the directory path ascending and the anomaly score descending.\n",
+    "    # The entry with the highest anomaly score is then listed first for each directory.\n",
+ " data = data.reset_index()\n",
+ "\n",
+ " if debug:\n",
+ " display(\"4. grouped by elementPath --------------\")\n",
+ " display(data)\n",
+ "\n",
+ " # Group the entries again now only by their directory path to get the aggregated number of anomalies and ranks.\n",
+ " data = data.groupby('elementPath').aggregate(\n",
+ " fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=count_unique_aggregated_values),\n",
+ " firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
+ " maxAnomalyScore=pd.NamedAgg(column=\"maxAnomalyScore\", aggfunc=\"max\"),\n",
+ " **common_named_aggregation\n",
+ " )\n",
+ " data = data.reset_index()\n",
+ "\n",
+ " if debug:\n",
+ " display(\"5. grouped by directory path --------------\")\n",
+ " display(data)\n",
+ "\n",
+ " # Add the name of the directory (last '/' separated element) and the parent directory path to the table.\n",
+ " data = add_element_name_column(data, 'elementPath', 'elementName')\n",
+ " data = add_parent_directory_column(data, 'elementPath', 'directoryParentPath')\n",
+ "\n",
+ " if debug:\n",
+ " display(\"6. added directory and parent name --------------\")\n",
+ " display(data)\n",
+ "\n",
+ " # Group finally by all columns except for the directory name, parent and path (first 3 columns) and pick the longest (max) directory path in case there are multiple.\n",
+ " all_column_names_except_for_the_directory_path = data.columns.to_list()[3:]\n",
+ " data = data.groupby(all_column_names_except_for_the_directory_path).aggregate(\n",
+ " elementName=pd.NamedAgg(column=\"elementName\", aggfunc=lambda names: '/'.join(names)),\n",
+ " directoryParentPath=pd.NamedAgg(column=\"directoryParentPath\", aggfunc=\"first\"),\n",
+ " elementPath=pd.NamedAgg(column=\"elementPath\", aggfunc=\"last\"),\n",
+ " )\n",
+ "\n",
+ " # Reorder the column positions so that the directory path is again the first column. \n",
+ " all_column_names_with_the_directory_path_first = ['elementPath', 'directoryParentPath', 'elementName'] + all_column_names_except_for_the_directory_path\n",
+ " data = data.reset_index()[all_column_names_with_the_directory_path_first]\n",
+ "\n",
+ " if debug:\n",
+ " display(\"7. final grouping --------------\")\n",
+ " display(data)\n",
+ " display(\"Statistics --------------\")\n",
+ " data.describe()\n",
+ " \n",
+ " return data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0c68aa20",
+ "metadata": {},
+ "source": [
+ "## 1. Anomalies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a5222a25",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "driver = get_graph_database_driver()\n",
+ "anomaly_file_paths = query_data()\n",
+ "anomaly_file_paths = prepare_data_for_treemap(anomaly_file_paths)\n",
+ "display(anomaly_file_paths)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "daa3a949",
+ "metadata": {},
+ "source": [
+ "### 1.1 Average anomaly score per file directory"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b0cd2237",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_to_display = anomaly_file_paths.copy()\n",
+ "\n",
+ "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
+ " create_treemap_settings(data_to_display),\n",
+ " marker=dict(\n",
+ " **plotly_treemap_marker_base_color_scale,\n",
+ " colors=data_to_display['absoluteAnomalyScore'], \n",
+ " colorbar={\"title\": \"score\"},\n",
+ " ),\n",
+ "))\n",
+ "figure.update_layout(\n",
+ " **plotly_treemap_layout_base_settings, # type: ignore\n",
+ " title='Average anomaly score per directory',\n",
+ ")\n",
+ "figure.show(**plotly_treemap_figure_show_settings)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "24341675",
+ "metadata": {},
+ "source": [
+ "### 1.2 Overview of all anomaly archetypes per directory"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b773269c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def mutual_exclusive_ranks(data: pd.DataFrame) -> pd.DataFrame:\n",
+ " \"\"\"\n",
+    "    Returns a copy of the input data frame in which only one archetype rank is non-zero per row.\n",
+    "    The archetype with the highest normalized rank is retained; all others are set to zero.\n",
+    "    data : pd.DataFrame : The input data frame\n",
+    "    return : pd.DataFrame : The copied data frame with mutually exclusive ranks\n",
+ " \"\"\"\n",
+ " modified_data = data.copy()\n",
+ " \n",
+ " for dataframe_index, row in modified_data.iterrows():\n",
+ " index = typing.cast(int, dataframe_index)\n",
+ " max_rank_value = 0\n",
+ " max_rank_column = None\n",
+ " \n",
+ " for column in archetype_columns:\n",
+ " if row[column] > max_rank_value:\n",
+ " max_rank_value = row[column]\n",
+ " max_rank_column = column\n",
+ " \n",
+ " for column in archetype_columns:\n",
+ " if column != max_rank_column:\n",
+ " modified_data.at[index, column] = 0\n",
+ " \n",
+ " return modified_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ef8e76a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def interpolate_color(low: Tuple[int, int, int], high: Tuple[int, int, int], normalized_value: float) -> str:\n",
+    "    \"\"\"Linear interpolation between two RGB tuples, returns an 'rgb(red,green,blue)' string.\"\"\"\n",
+ " \n",
+ " def linear_interpolation_of_color_component(color_component: int) -> int:\n",
+ " return int(low[color_component] + (high[color_component] - low[color_component]) * normalized_value)\n",
+ " \n",
+ " red = linear_interpolation_of_color_component(0)\n",
+ " green = linear_interpolation_of_color_component(1)\n",
+ " blue = linear_interpolation_of_color_component(2)\n",
+ " return f\"rgb({red},{green},{blue})\"\n",
+ "\n",
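+    "# Worked example (illustrative): interpolate_color((222, 235, 247), (33, 113, 181), 0.5)\n",
+    "# returns 'rgb(127,174,214)'.\n",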
+ "\n",
+ "def get_rank_color(rank: float, low: Tuple[int, int, int], high: Tuple[int, int, int]) -> str:\n",
+    "    \"\"\"Return white if rank <= 0, else interpolate between low and high.\"\"\"\n",
+ " if rank <= 0:\n",
+ " return \"rgb(255,255,255)\"\n",
+ " return interpolate_color(low, high, rank)\n",
+ "\n",
+ "\n",
+ "def combine_rank_colors(\n",
+ " dataframe: pd.DataFrame,\n",
+ " rank_columns: List[str],\n",
+ " color_pairs: List[Tuple[Tuple[int, int, int], Tuple[int, int, int]]],\n",
+ ") -> List[str]:\n",
+ " \"\"\"Combine multiple ranks, using the first nonzero value's color.\"\"\"\n",
+ " combined: List[str] = []\n",
+ " for _, row in dataframe.iterrows():\n",
+ " color = \"rgb(255,255,255)\"\n",
+ " for rank_col, (low, high) in zip(rank_columns, color_pairs):\n",
+ " rank = row[rank_col]\n",
+ " if rank > 0:\n",
+ " color = get_rank_color(rank, low, high)\n",
+ " break\n",
+ " combined.append(color)\n",
+ " return combined\n",
+ "\n",
+ "\n",
+ "def get_rank_color_for_archetype(dataframe: pd.DataFrame, archetype: Archetypes) -> List[str]:\n",
+ " \"\"\"Get combined rank colors for a specific archetype.\"\"\"\n",
+ " archetype_column_name = get_archetype_column_name(archetype)\n",
+ " coloring_pair = get_coloring_pairs()[archetype_names.index(archetype)]\n",
+ " return combine_rank_colors(dataframe, [archetype_column_name], [coloring_pair])\n",
+ "\n",
+ "\n",
+ "def get_coloring_pairs() -> List[Tuple[Tuple[int, int, int], Tuple[int, int, int]]]:\n",
+ " \"\"\"Define the coloring scheme for each archetype.\"\"\"\n",
+ " assert len(archetype_names) == 5, \"Expected exactly 5 archetypes.\"\n",
+ " return [\n",
+    "        ((222, 235, 247), (33, 113, 181)),  # Authority, blue shades\n",
+    "        ((254, 230, 206), (217, 72, 1)),    # Bottleneck, orange shades\n",
+    "        ((239, 237, 245), (106, 81, 163)),  # Bridge, purple shades\n",
+    "        ((254, 224, 210), (165, 15, 21)),   # Hub, red shades\n",
+    "        ((240, 240, 240), (82, 82, 82)),    # Outlier, grey shades\n",
+ " ]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a1eb5f75",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_to_display = mutual_exclusive_ranks(anomaly_file_paths)\n",
+    "# Keep only rows where at least one archetype rank is greater than zero\n",
+ "data_to_display = data_to_display[data_to_display[archetype_columns].sum(axis=1) > 0]\n",
+ "\n",
+ "coloring_pairs = get_coloring_pairs()\n",
+ "combined_colors = combine_rank_colors(data_to_display, archetype_columns, coloring_pairs)\n",
+ "\n",
+ "figure = plotly_graph_objects.Figure()\n",
+ "\n",
+ "figure.add_trace(plotly_graph_objects.Treemap(\n",
+ " create_treemap_settings(data_to_display),\n",
+ " marker=dict(\n",
+ " **plotly_treemap_marker_base_style,\n",
+ " line=dict(width=1, color=\"black\"),\n",
+ " showscale=False,\n",
+ " colors=combined_colors,\n",
+ " ),\n",
+ " name=\"Anomalies\",\n",
+ " opacity=0.8\n",
+ "))\n",
+ "\n",
+ "# Add dummy scatter traces for legend\n",
+ "for name, (low, high) in zip(archetype_names, coloring_pairs):\n",
+ " bright_color = interpolate_color(low, high, 0.4) # light tone for legend filling\n",
+ " dark_color = interpolate_color(low, high, 1.0) # darkest tone for legend outline\n",
+ " figure.add_trace(plotly_graph_objects.Scatter(\n",
+ " x=[None],\n",
+ " y=[None],\n",
+ " mode=\"markers\",\n",
+ " marker=dict(size=12, color=bright_color, line=dict(width=2, color=dark_color)),\n",
+ " name=name,\n",
+ " legendgroup=name,\n",
+ " showlegend=True,\n",
+ " ))\n",
+ "\n",
+ "figure.update_layout(\n",
+ " **plotly_treemap_layout_base_settings, # type: ignore\n",
+ " title='Overview of all anomaly archetypes per directory',\n",
+ " legend=dict(\n",
+ " orientation=\"h\", # horizontal legend (use \"v\" for vertical)\n",
+ " yanchor=\"bottom\",\n",
+ " y=-0.12,\n",
+ " xanchor=\"center\",\n",
+ " x=0.5\n",
+ " )\n",
+ ")\n",
+ "figure.update_xaxes(visible=False)\n",
+ "figure.update_yaxes(visible=False)\n",
+ "\n",
+ "figure.show(**plotly_treemap_figure_show_settings)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "379d568c",
+ "metadata": {},
+ "source": [
+ "### 1.3a Archetype - Authority per directory"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "af0a16b6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def plot_single_archetype_treemap(archetype: Archetypes, data: pd.DataFrame):\n",
+ " \"\"\"\n",
+ " Plots a treemap for the given archetype using the provided data.\n",
+ " archetype : Archetypes : The archetype to plot\n",
+ " data : pd.DataFrame : The input data frame\n",
+ " \"\"\"\n",
+ " data_to_display = data.copy()\n",
+ " data_to_display = data_to_display[data_to_display[archetype_columns].sum(axis=1) > 0]\n",
+ "\n",
+    "    combined_colors = get_rank_color_for_archetype(data_to_display, archetype)\n",
+ "\n",
+ " figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n",
+ " create_treemap_settings(data_to_display),\n",
+ " marker=dict(\n",
+ " **plotly_treemap_marker_base_style,\n",
+ " colors=combined_colors,\n",
+ " line=dict(width=1, color=\"black\"),\n",
+ " colorbar={\"title\": \"rank\"},\n",
+ " ),\n",
+ " ))\n",
+ " figure.update_layout(\n",
+ " **plotly_treemap_layout_base_settings, # type: ignore\n",
+ " title=f'Archetype \"{archetype}\" per directory',\n",
+ " )\n",
+ " figure.show(**plotly_treemap_figure_show_settings)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6a497cd8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "plot_single_archetype_treemap(\"Authority\", anomaly_file_paths)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4c4eb401",
+ "metadata": {},
+ "source": [
+ "### 1.3b Archetype - Bottleneck per directory"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "37656bfe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plot_single_archetype_treemap(\"Bottleneck\", anomaly_file_paths)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "06010d6d",
+ "metadata": {},
+ "source": [
+ "### 1.3c Archetype - Bridge per directory"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2b0d3b99",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plot_single_archetype_treemap(\"Bridge\", anomaly_file_paths)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b375f191",
+ "metadata": {},
+ "source": [
+ "### 1.3d Archetype - Hub per directory"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a3ee93ed",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plot_single_archetype_treemap(\"Hub\", anomaly_file_paths)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "772d80b4",
+ "metadata": {},
+ "source": [
+ "### 1.3e Archetype - Outlier per directory"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1e9ac193",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plot_single_archetype_treemap(\"Outlier\", anomaly_file_paths)"
+ ]
+ }
+ ],
+ "metadata": {
+ "authors": [
+ {
+ "name": "JohT"
+ }
+ ],
+ "code_graph_analysis_pipeline_data_validation": "ValidateAlwaysFalse",
+ "kernelspec": {
+ "display_name": "codegraph",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.9"
+ },
+    "title": "Anomaly Detection - TreeMap Exploration"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/domains/anomaly-detection/queries/AnomalyDetectionFiles.cypher b/domains/anomaly-detection/queries/AnomalyDetectionFiles.cypher
new file mode 100644
index 000000000..e80a5b282
--- /dev/null
+++ b/domains/anomaly-detection/queries/AnomalyDetectionFiles.cypher
@@ -0,0 +1,34 @@
+// List anomalous files
+
+MATCH (anomalyScoreStats:File&!Directory&!Archive)
+WHERE anomalyScoreStats.anomalyScore < 0
+ORDER BY anomalyScoreStats.anomalyScore ASCENDING
+LIMIT 150 // n largest negative anomaly scores as threshold
+ WITH collect(anomalyScoreStats.anomalyScore)[-1] AS anomalyScoreThreshold
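+// collect(...)[-1] takes the last of the up to 150 most negative anomaly scores, i.e. the one closest to zero.
+// It serves as the cut-off below which a score is treated as anomalous in the following MATCH.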
+MATCH (anomalyRankStats:File&!Directory&!Archive)
+ WITH anomalyScoreThreshold
+ ,max(anomalyRankStats.anomalyAuthorityRank) AS maxAnomalyAuthorityRank
+ ,max(anomalyRankStats.anomalyBottleneckRank) AS maxAnomalyBottleneckRank
+ ,max(anomalyRankStats.anomalyBridgeRank) AS maxAnomalyBridgeRank
+ ,max(anomalyRankStats.anomalyHubRank) AS maxAnomalyHubRank
+ ,max(anomalyRankStats.anomalyOutlierRank) AS maxAnomalyOutlierRank
+MATCH (anomalous:File&!Directory&!Archive)
+WHERE (anomalous.anomalyScore < anomalyScoreThreshold
+ OR anomalous.anomalyHubRank IS NOT NULL
+ OR anomalous.anomalyAuthorityRank IS NOT NULL
+ OR anomalous.anomalyBottleneckRank IS NOT NULL
+ OR anomalous.anomalyOutlierRank IS NOT NULL
+ OR anomalous.anomalyBridgeRank IS NOT NULL)
+OPTIONAL MATCH (project:Artifact|Project)-[:CONTAINS]->(anomalous)
+ WITH *
+ ,coalesce(project.name + '/', '') AS projectName
+ ,coalesce(anomalous.fileName, anomalous.relativePath) AS fileName
+RETURN replace(projectName + fileName, '//', '/') AS filePath
+ ,CASE WHEN anomalous.anomalyScore < 0 THEN abs(anomalous.anomalyScore) ELSE 0 END AS absoluteAnomalyScore
+ ,coalesce(toFloat(anomalous.anomalyAuthorityRank) / maxAnomalyAuthorityRank, 0) AS normalizedAuthorityRank
+ ,coalesce(toFloat(anomalous.anomalyBottleneckRank) / maxAnomalyBottleneckRank, 0) AS normalizedBottleneckRank
+ ,coalesce(toFloat(anomalous.anomalyBridgeRank) / maxAnomalyBridgeRank, 0) AS normalizedBridgeRank
+    ,coalesce(toFloat(anomalous.anomalyHubRank) / maxAnomalyHubRank, 0) AS normalizedHubRank
+ ,coalesce(toFloat(anomalous.anomalyOutlierRank) / maxAnomalyOutlierRank, 0) AS normalizedOutlierRank
+ORDER BY filePath ASCENDING
+LIMIT 200
\ No newline at end of file
diff --git a/domains/anomaly-detection/summary/anomalyDetectionSummary.sh b/domains/anomaly-detection/summary/anomalyDetectionSummary.sh
index a7659df03..6335c8b9f 100755
--- a/domains/anomaly-detection/summary/anomalyDetectionSummary.sh
+++ b/domains/anomaly-detection/summary/anomalyDetectionSummary.sh
@@ -177,6 +177,10 @@ anomaly_detection_finalize_report() {
# Remove empty Markdown includes
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${report_include_directory}"
+    # Collect static Markdown includes (copied after the cleanup so that the one-line file doesn't get removed)
+ cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report_no_anomaly_detection_treemaps.template.md" "${report_include_directory}/report_no_anomaly_detection_treemaps.md"
+
+ # Assemble final report by applying includes to the main template
cp -f "${ANOMALY_DETECTION_SUMMARY_DIR}/report.template.md" "${FULL_REPORT_DIRECTORY}/report.template.md"
cat "${FULL_REPORT_DIRECTORY}/report.template.md" | "${MARKDOWN_SCRIPTS_DIR}/embedMarkdownIncludes.sh" "${report_include_directory}" > "${FULL_REPORT_DIRECTORY}/anomaly_detection_report.md"
diff --git a/domains/anomaly-detection/summary/report.template.md b/domains/anomaly-detection/summary/report.template.md
index 0f846235c..59275eb46 100644
--- a/domains/anomaly-detection/summary/report.template.md
+++ b/domains/anomaly-detection/summary/report.template.md
@@ -26,6 +26,10 @@ The goal is to detect potential **software quality, design, and architecture iss
+### 1.3 Overview Charts
+
+
+
## 2. Deep Dives by Abstraction Level
Each abstraction level includes anomaly statistics, SHAP feature importance, archetype distribution, and example anomalies.
diff --git a/domains/anomaly-detection/summary/report_no_anomaly_detection_treemaps.template.md b/domains/anomaly-detection/summary/report_no_anomaly_detection_treemaps.template.md
new file mode 100644
index 000000000..26df86f3a
--- /dev/null
+++ b/domains/anomaly-detection/summary/report_no_anomaly_detection_treemaps.template.md
@@ -0,0 +1 @@
+⚠️ _No anomaly detection treemap charts due to missing data._
\ No newline at end of file
diff --git a/domains/anomaly-detection/treemapVisualizations.py b/domains/anomaly-detection/treemapVisualizations.py
new file mode 100755
index 000000000..ce4680d6c
--- /dev/null
+++ b/domains/anomaly-detection/treemapVisualizations.py
@@ -0,0 +1,645 @@
+#!/usr/bin/env python
+
+# This Python script uses Plotly Treemap Charts (https://plotly.com/python/treemaps) to visualize anomaly detection results.
+
+from typing import Any, Dict, List, Tuple, Literal, LiteralString, Optional, cast
+
+import os
+import sys
+import argparse
+import pprint
+import logging
+
+import pandas as pd
+import numpy as np
+
+from neo4j import GraphDatabase, Driver
+
+from plotly import graph_objects as plotly_graph_objects
+
+
+class Parameters:
+ required_parameters_ = ["projection_language"]
+
+ def __init__(self, input_parameters: Dict[str, str], report_directory: str = "", verbose: bool = False):
+        self.query_parameters_ = input_parameters.copy()  # copy to avoid accidental modification by the caller
+ self.report_directory = report_directory
+ self.verbose_ = verbose
+
+ def __repr__(self):
+ pretty_dict = pprint.pformat(self.query_parameters_, indent=4)
+ return f"Parameters: verbose={self.verbose_}, report_directory={self.report_directory}, query_parameters:\n{pretty_dict}"
+
+ @staticmethod
+ def log_dependency_versions_() -> None:
+ print('---------------------------------------')
+
+ print('Python version: {}'.format(sys.version))
+
+ from numpy import __version__ as numpy_version
+ print('numpy version: {}'.format(numpy_version))
+
+ from pandas import __version__ as pandas_version
+ print('pandas version: {}'.format(pandas_version))
+
+ from neo4j import __version__ as neo4j_version
+ print('neo4j version: {}'.format(neo4j_version))
+
+ from plotly import version as plotly_version
+ print('plotly version: {}'.format(plotly_version))
+
+ print('---------------------------------------')
+
+ @classmethod
+ def from_input_parameters(cls, input_parameters: Dict[str, str], report_directory: str = "", verbose: bool = False):
+ """
+ Creates a Parameters instance from a dictionary of input parameters.
+        The dictionary must contain the following keys:
+        - "projection_language": The name of the programming language of the projection.
+ """
+ missing_parameters = [parameter for parameter in cls.required_parameters_ if parameter not in input_parameters]
+ if missing_parameters:
+            raise ValueError(f"Missing parameters: {missing_parameters}")
+ created_parameters = cls(input_parameters, report_directory, verbose)
+ if created_parameters.is_verbose():
+ print(created_parameters)
+ cls.log_dependency_versions_()
+ return created_parameters
+
+ def __is_code_language_available(self) -> bool:
+ return "projection_language" in self.query_parameters_
+
+ def __get_projection_language(self) -> str:
+ return self.query_parameters_["projection_language"] if self.__is_code_language_available() else ""
+
+ def get_title_prefix(self) -> str:
+ if self.__is_code_language_available():
+ return self.__get_projection_language()
+ return ""
+
+ def get_report_directory(self) -> str:
+ return self.report_directory
+
+ def is_verbose(self) -> bool:
+ return self.verbose_
+
+
+def parse_input_parameters() -> Parameters:
+ # Convert list of "key=value" strings to a dictionary
+ def parse_key_value_list(param_list: List[str]) -> Dict[str, str]:
+ param_dict = {}
+ for item in param_list:
+ if '=' in item:
+ key, value = item.split('=', 1)
+ param_dict[key] = value
+ return param_dict
+
+ parser = argparse.ArgumentParser(
+        description="Visualizes anomaly detection results for code units (Java packages, types,...) and their dependencies with Plotly treemap charts.")
+ parser.add_argument('--verbose', action='store_true', help='Enable verbose mode to log all details')
+ parser.add_argument('--report_directory', type=str, default="", help='Path to the report directory')
+ parser.add_argument('query_parameters', nargs='*', type=str, help='List of key=value Cypher query parameters')
+ parser.set_defaults(verbose=False)
+ args = parser.parse_args()
+ return Parameters.from_input_parameters(parse_key_value_list(args.query_parameters), args.report_directory, args.verbose)
+
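+# Example invocation (hypothetical values; anomalyDetectionPython.sh passes equivalent arguments):
+#   ./treemapVisualizations.py --report_directory reports/anomaly-detection "projection_language=Java"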
+
+def get_file_path(name: str, parameters: Parameters, extension: str = 'svg') -> str:
+ name = parameters.get_report_directory() + '/' + name.replace(' ', '_') + '.' + extension
+ if parameters.is_verbose():
+ print(f"treemapVisualizations: Saving file {name}")
+ return name
+
+
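+# Note: The Neo4j password is taken from the environment variable NEO4J_INITIAL_PASSWORD.
+# As in the exploration notebook, it is intentionally not hardcoded for security reasons.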
+def get_graph_database_driver() -> Driver:
+ driver = GraphDatabase.driver(
+ uri="bolt://localhost:7687",
+ auth=("neo4j", os.environ.get("NEO4J_INITIAL_PASSWORD"))
+ )
+ driver.verify_connectivity()
+ return driver
+
+
+def query_cypher_to_data_frame(query: LiteralString, parameters: Optional[Dict[str, Any]] = None):
+ records, _, keys = driver.execute_query(query, parameters_=parameters)
+ return pd.DataFrame([record.values() for record in records], columns=keys)
+
+
+# ----------------------------------------
+# Base settings for image rendering
+# ----------------------------------------
+
+
+image_rendering_settings = {
+ "format": "svg",
+ "width": 1920,
+ "height": 1080,
+}
+
+# ----------------------------------------
+# Base settings for Plotly Treemap
+# ----------------------------------------
+
+plotly_main_layout_base_settings = {
+ "margin": {"t": 60, "l": 15, "r": 15, "b": 20},
+}
+plotly_treemap_layout_base_settings = dict(
+ **plotly_main_layout_base_settings
+)
+plotly_bar_layout_base_settings = dict(
+ **plotly_main_layout_base_settings
+)
+plotly_treemap_marker_base_style = {
+ "cornerradius": 5
+}
+
+plotly_treemap_marker_base_color_scale = dict(
+ **plotly_treemap_marker_base_style,
+ colorscale='Hot_r',
+)
+
+
+# ----------------------------------------
+# Base functions for Treemap chart visualization
+# ----------------------------------------
+
+# Ignore kaleido logging noise when writing images
+logging.getLogger("kaleido").setLevel(logging.WARNING)
+
+def get_plotly_figure_write_image_settings(name: str, path: str):
+ """
+ Returns the settings for the plotly figure write_image method
+    :param name: Name of the figure (used as the file name)
+    :param path: Directory path the image file is written to
+    :return: Dictionary with settings for the write_image method
+ """
+ return {
+ "file": path + "/" + name + "." + image_rendering_settings['format'],
+ "format": image_rendering_settings['format'],
+ "width": image_rendering_settings['width'],
+ "height": image_rendering_settings['height'],
+ }
+
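+# For example (illustrative chart name), get_plotly_figure_write_image_settings("Java_Type_Treemap", "reports/anomaly-detection")
+# returns {"file": "reports/anomaly-detection/Java_Type_Treemap.svg", "format": "svg", "width": 1920, "height": 1080}.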
+
+def create_treemap_settings(data_frame: pd.DataFrame, element_path_column: str = 'elementPath', element_name_column: str = "elementName") -> plotly_graph_objects.Treemap:
+ """
+ Creates a Plotly Treemap with the given settings and data frame.
+ data_frame : pd.DataFrame : The input data frame
+ return :plotly_graph_objects.Treemap : The prepared Plotly Treemap
+ """
+ return plotly_graph_objects.Treemap(
+ labels=data_frame[element_name_column],
+ parents=data_frame['directoryParentPath'],
+ ids=data_frame[element_path_column],
+ customdata=data_frame[['fileCount', 'absoluteAnomalyScore', 'normalizedAuthorityRank', 'normalizedBottleneckRank', 'normalizedBridgeRank', 'normalizedHubRank', 'normalizedOutlierRank', 'elementPath']],
+        hovertemplate='%{label}<br>Highlighted anomalies: %{customdata[0]}<br>Anomaly Score: %{customdata[1]:.4f}<br>Authority: %{customdata[2]}, Bottleneck: %{customdata[3]}, Bridge: %{customdata[4]}, Hub: %{customdata[5]}, Outlier: %{customdata[6]}<br>Path: %{customdata[7]}',
+ maxdepth=-1,
+ root_color="lightgrey",
+ marker=dict(**plotly_treemap_marker_base_style),
+ )
+
+
+# ----------------------------------------
+# Base functions to prepare data for Treemap chart visualization
+# ----------------------------------------
+
+
+def remove_last_path_file_extension(file_path_elements: list) -> list:
+ """
+ Removes the file extension of the last element of the file path so that only the file name remains.
+ file_path_elements : list : The list of file path elements where the last one contains the file name with extension
+ return : list : The list of the directories + the file name without extension as last element.
+ """
+ if not file_path_elements:
+ return ['']
+ if len(file_path_elements) == 1:
+ return [os.path.splitext(file_path_elements[0])[0]]
+ return file_path_elements[:-1] + [os.path.splitext(file_path_elements[-1])[0]]
+
+
+def join_path_elements(file_path_elements: list) -> list:
+ """
+ Joins the file path elements (and removes the file extension).
+ file_path_elements : list : The list of levels to convert
+ return : list : The list of directories
+ """
+ prepared_path_elements = remove_last_path_file_extension(file_path_elements)
+ return ['/'.join(prepared_path_elements[:i+1]) for i in range(len(prepared_path_elements))]
+
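+# Worked example (illustrative): join_path_elements(['org', 'example', 'Main.java'])
+# returns ['org', 'org/example', 'org/example/Main'].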
+
+def add_element_path_column(input_dataframe: pd.DataFrame, file_path_column: str, element_path_column: str = 'elementPath'):
+ """
+    Adds an element path column to the input DataFrame based on the file path column.
+    input_dataframe : pd.DataFrame : The input DataFrame
+    file_path_column : str : The name of the file path column
+    element_path_column : str : The name of the element path column to be added
+    return : pd.DataFrame : The DataFrame with the added element path column
+ """
+ if element_path_column in input_dataframe.columns:
+ return input_dataframe # Column already exists
+
+ input_dataframe.insert(0, element_path_column, input_dataframe[file_path_column].str.split('/').apply(join_path_elements))
+ input_dataframe = input_dataframe.explode(element_path_column)
+ return input_dataframe
+
+
+def add_element_name_column(input_dataframe: pd.DataFrame, element_path_column: str = 'elementPath', element_name_column: str = 'elementName'):
+ """
+    Adds an element name column to the input DataFrame based on the element path column.
+    input_dataframe : pd.DataFrame : The input DataFrame
+    element_path_column : str : The name of the element path column
+    element_name_column : str : The name of the element name column to be added
+    return : pd.DataFrame : The DataFrame with the added element name column
+ """
+ if element_name_column in input_dataframe.columns:
+ return input_dataframe # Column already exists
+
+ splitted_directories = input_dataframe[element_path_column].str.rsplit('/', n=1)
+ input_dataframe.insert(1, element_name_column, splitted_directories.apply(lambda x: (x[-1])))
+ return input_dataframe
+
+
+def add_parent_directory_column(input_dataframe: pd.DataFrame, element_path_column: str = 'elementPath', directory_parent_column: str = 'directoryParentPath'):
+ """
+    Adds a directory parent column to the input DataFrame based on the element path column.
+    input_dataframe : pd.DataFrame : The input DataFrame
+    element_path_column : str : The name of the element path column
+    directory_parent_column : str : The name of the directory parent column to be added
+    return : pd.DataFrame : The DataFrame with the added directory parent column
+ """
+ if directory_parent_column in input_dataframe.columns:
+ return input_dataframe # Column already exists
+
+ # Remove last path element from directory_column to get the directory_parent_column
+ splitted_directories = input_dataframe[element_path_column].str.rsplit('/', n=1)
+ input_dataframe.insert(1, directory_parent_column, splitted_directories.apply(lambda x: (x[0])))
+
+    # Clear the parent (set it to an empty string) when it is equal to the directory itself
+ input_dataframe.loc[input_dataframe[directory_parent_column] == input_dataframe[element_path_column], directory_parent_column] = ''
+ return input_dataframe
+
+
+def count_unique_aggregated_values(values: pd.Series):
+ """
+ Return the number of unique values from an array of array of strings.
+ Meant to be used as an aggregation function for dataframe grouping.
+ values : Series : The pandas Series of values
+ return : int : The number of files
+ """
+ return len(np.unique(np.concatenate(values.to_list())))
+
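+# Worked example (illustrative): for a Series containing the arrays ['a/b.py', 'a/c.py'] and ['a/b.py'],
+# the concatenation holds three paths of which two are unique, so the function returns 2.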
+
+def prepare_data_for_treemap(data: pd.DataFrame, debug: bool = False) -> pd.DataFrame:
+ if debug:
+ print("1. query result ---------------------")
+ print(data)
+
+ # 3. Add multiple rows for each file path containing all its directories paths in the new column 'elementPath'
+ data = add_element_path_column(data, 'filePath', 'elementPath')
+
+ if debug:
+ print("3. added elementPath --------------")
+ print(data)
+
+ # Group the files by their directory and count the number of files of each directory (across all levels).
+ common_named_aggregation = {
+ "absoluteAnomalyScore": pd.NamedAgg(column="absoluteAnomalyScore", aggfunc="mean"),
+ "normalizedAuthorityRank": pd.NamedAgg(column="normalizedAuthorityRank", aggfunc="max"),
+ "normalizedBottleneckRank": pd.NamedAgg(column="normalizedBottleneckRank", aggfunc="max"),
+ "normalizedBridgeRank": pd.NamedAgg(column="normalizedBridgeRank", aggfunc="max"),
+ "normalizedHubRank": pd.NamedAgg(column="normalizedHubRank", aggfunc="max"),
+ "normalizedOutlierRank": pd.NamedAgg(column="normalizedOutlierRank", aggfunc="max"),
+ }
+
+ data = data.groupby(['elementPath']).aggregate(
+ filePaths=pd.NamedAgg(column="filePath", aggfunc=np.unique),
+ firstFile=pd.NamedAgg(column="filePath", aggfunc="first"),
+ maxAnomalyScore=pd.NamedAgg(column="absoluteAnomalyScore", aggfunc="max"),
+ **common_named_aggregation
+ )
+
+    # Sort the grouped and aggregated entries by the directory path ascending and the anomaly score descending.
+    # The entry with the highest anomaly score is then listed first for each directory.
+ data = data.sort_values(by=['elementPath', 'absoluteAnomalyScore'], ascending=[True, False])
+ data = data.reset_index()
+
+ if debug:
+ print("4. grouped by elementPath --------------")
+ print(data)
+
+ # Group the entries again now only by their directory path to get the aggregated number of anomalies and ranks.
+ data = data.groupby('elementPath').aggregate(
+ fileCount=pd.NamedAgg(column="filePaths", aggfunc=count_unique_aggregated_values),
+ firstFile=pd.NamedAgg(column="firstFile", aggfunc="first"),
+ maxAnomalyScore=pd.NamedAgg(column="maxAnomalyScore", aggfunc="max"),
+ **common_named_aggregation
+ )
+ data = data.reset_index()
+
+ if debug:
+ print("5. grouped by directory path --------------")
+ print(data)
+
+ # Add the name of the directory (last '/' separated element) and the parent directory path to the table.
+ data = add_element_name_column(data, 'elementPath', 'elementName')
+ data = add_parent_directory_column(data, 'elementPath', 'directoryParentPath')
+
+ if debug:
+ print("6. added directory and parent name --------------")
+ print(data)
+
+ # Finally, group by all columns except for the directory name, parent and path (the first 3 columns). This merges chains of directories with identical aggregates: their names are joined with '/' and the deepest (last) directory path is kept.
+ all_column_names_except_for_the_directory_path = data.columns.to_list()[3:]
+ data = data.groupby(all_column_names_except_for_the_directory_path).aggregate(
+ elementName=pd.NamedAgg(column="elementName", aggfunc=lambda names: '/'.join(names)),
+ directoryParentPath=pd.NamedAgg(column="directoryParentPath", aggfunc="first"),
+ elementPath=pd.NamedAgg(column="elementPath", aggfunc="last"),
+ )
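+
+ # Illustrative example (assumed data): if 'org' and 'org/neo4j' carry identical aggregates
+ # (e.g. the same fileCount and scores), they collapse into one row with
+ # elementName 'org/neo4j' and elementPath 'org/neo4j', keeping the treemap free of redundant nesting.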
+
+ # Reorder the column positions so that the directory path is again the first column.
+ all_column_names_with_the_directory_path_first = ['elementPath', 'directoryParentPath', 'elementName'] + all_column_names_except_for_the_directory_path
+ data = data.reset_index()[all_column_names_with_the_directory_path_first]
+
+ if debug:
+ print("7. final grouping --------------")
+ print(data)
+ print("Statistics --------------")
+ print(data.describe())
+
+ return data
+
+
+def mutual_exclusive_ranks(data: pd.DataFrame) -> pd.DataFrame:
+ """
+ Modifies the input data frame to ensure that only one archetype rank is non-zero per row.
+ The archetype with the highest normalized rank is retained, and others are set to zero.
+ data : pd.DataFrame : The input data frame
+ return : pd.DataFrame : The modified data frame with mutual exclusive ranks
+ """
+ modified_data = data.copy()
+
+ for dataframe_index, row in modified_data.iterrows():
+ index = cast(int, dataframe_index)
+ max_rank_value = 0
+ max_rank_column = None
+
+ for column in archetype_columns:
+ if row[column] > max_rank_value:
+ max_rank_value = row[column]
+ max_rank_column = column
+
+ for column in archetype_columns:
+ if column != max_rank_column:
+ modified_data.at[index, column] = 0
+
+ return modified_data
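+
+# A vectorized pandas alternative (sketch, not used above) could replace the row loop,
+# e.g. by masking every column that is not the row-wise maximum:
+#   ranks = data[archetype_columns]
+#   data[archetype_columns] = ranks.where(ranks.eq(ranks.max(axis=1), axis=0), 0)
+# Note that unlike the loop, which keeps only the first maximum, this would keep all tied maxima.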
+
+
+# ----------------------------------------
+# Archetypes
+# ----------------------------------------
+
+Archetypes = Literal["Authority", "Bottleneck", "Bridge", "Hub", "Outlier"]
+archetype_names: List[Archetypes] = ["Authority", "Bottleneck", "Bridge", "Hub", "Outlier"]
+
+def get_archetype_column_name(archetype: Archetypes) -> str:
+ """
+ Returns the column name for the given archetype.
+ archetype : Archetypes : The archetype name
+ return : str : The column name for the given archetype
+ """
+ return f"normalized{archetype}Rank"
+
+def get_archetype_index(archetype: Archetypes) -> int:
+ """
+ Returns the index of the given archetype.
+ archetype : Archetypes : The archetype name
+ return : int : The index of the given archetype
+ """
+ return archetype_names.index(archetype)
+
+archetype_columns = [get_archetype_column_name(name) for name in archetype_names]
+
+
+# ----------------------------------------
+# Archetype Coloring
+# ----------------------------------------
+
+Color = Tuple[int, int, int] # RGB (red, green, blue) color tuple
+ColorPair = Tuple[Color, Color] # Low and high color pair
+
+def interpolate_color(low: Color, high: Color, normalized_value: float) -> str:
+ """Linear interpolation between two RGB tuples, returns rgba string."""
+
+ def linear_interpolation_of_color_component(color_component: int) -> int:
+ return int(low[color_component] + (high[color_component] - low[color_component]) * normalized_value)
+
+ red = linear_interpolation_of_color_component(0)
+ green = linear_interpolation_of_color_component(1)
+ blue = linear_interpolation_of_color_component(2)
+ return f"rgb({red},{green},{blue})"
+
+
+def get_rank_color(rank: float, low: Color, high: Color) -> str:
+ """Return transparent if rank == 0, else interpolate between low and high."""
+ if rank <= 0:
+ return "rgb(255,255,255)"
+ return interpolate_color(low, high, rank)
+
+
+def combine_rank_colors(
+ dataframe: pd.DataFrame,
+ rank_columns: List[str],
+ color_pairs: List[ColorPair],
+) -> List[str]:
+ """Combine multiple ranks, using the first nonzero value's color."""
+ combined: List[str] = []
+ for _, row in dataframe.iterrows():
+ color = "rgb(255,255,255)"
+ for rank_col, (low, high) in zip(rank_columns, color_pairs):
+ rank = row[rank_col]
+ if rank > 0:
+ color = get_rank_color(rank, low, high)
+ break
+ combined.append(color)
+ return combined
+
+
+def get_rank_color_for_archetype(dataframe: pd.DataFrame, archetype: Archetypes) -> List[str]:
+ """Get combined rank colors for a specific archetype."""
+ archetype_column_name = get_archetype_column_name(archetype)
+ coloring_pair = get_coloring_pairs()[get_archetype_index(archetype)]
+ return combine_rank_colors(dataframe, [archetype_column_name], [coloring_pair])
+
+
+def get_coloring_pairs() -> List[ColorPair]:
+ """Define the coloring scheme for each archetype."""
+ assert len(archetype_names) == 5, "Expected exactly 5 archetypes."
+ return [
+ ((222, 235, 247), (33, 113, 181)), # Authority, blue shades
+ ((254, 230, 206), (217, 72, 1)), # Bottleneck, orange shades
+ ((239, 237, 245), (106, 81, 163)), # Bridge, purple shades
+ ((254, 224, 210), (165, 15, 21)), # Hub, red shades
+ ((240, 240, 240), (82, 82, 82)), # Outlier, gray shades
+ ]
+
+# ----------------------------------------
+# Data query
+# ----------------------------------------
+
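+# The query below works in three stages:
+# 1. Derive an anomaly score threshold from the 150 most negative anomaly scores.
+# 2. Collect the maximum of each archetype rank for normalization.
+# 3. Return every file whose score falls below that threshold or that has at least
+#    one archetype rank, with all ranks normalized to the range [0, 1].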
+def query_data() -> pd.DataFrame:
+ query: LiteralString = """
+ MATCH (anomalyScoreStats:File&!Directory&!Archive)
+ WHERE anomalyScoreStats.anomalyScore < 0
+ ORDER BY anomalyScoreStats.anomalyScore ASCENDING
+ LIMIT 150 // the 150 most negative anomaly scores define the threshold
+ WITH collect(anomalyScoreStats.anomalyScore)[-1] AS anomalyScoreThreshold
+ MATCH (anomalyRankStats:File&!Directory&!Archive)
+ WITH anomalyScoreThreshold
+ ,max(anomalyRankStats.anomalyAuthorityRank) AS maxAnomalyAuthorityRank
+ ,max(anomalyRankStats.anomalyBottleneckRank) AS maxAnomalyBottleneckRank
+ ,max(anomalyRankStats.anomalyBridgeRank) AS maxAnomalyBridgeRank
+ ,max(anomalyRankStats.anomalyHubRank) AS maxAnomalyHubRank
+ ,max(anomalyRankStats.anomalyOutlierRank) AS maxAnomalyOutlierRank
+ MATCH (anomalous:File&!Directory&!Archive)
+ WHERE (anomalous.anomalyScore < anomalyScoreThreshold
+ OR anomalous.anomalyHubRank IS NOT NULL
+ OR anomalous.anomalyAuthorityRank IS NOT NULL
+ OR anomalous.anomalyBottleneckRank IS NOT NULL
+ OR anomalous.anomalyOutlierRank IS NOT NULL
+ OR anomalous.anomalyBridgeRank IS NOT NULL)
+ OPTIONAL MATCH (project:Artifact|Project)-[:CONTAINS]->(anomalous)
+ WITH *
+ ,coalesce(project.name + '/', '') AS projectName
+ ,coalesce(anomalous.fileName, anomalous.relativePath) AS fileName
+ RETURN replace(projectName + fileName, '//', '/') AS filePath
+ ,CASE WHEN anomalous.anomalyScore < 0 THEN abs(anomalous.anomalyScore) ELSE 0 END AS absoluteAnomalyScore
+ ,coalesce(toFloat(anomalous.anomalyAuthorityRank) / maxAnomalyAuthorityRank, 0) AS normalizedAuthorityRank
+ ,coalesce(toFloat(anomalous.anomalyBottleneckRank) / maxAnomalyBottleneckRank, 0) AS normalizedBottleneckRank
+ ,coalesce(toFloat(anomalous.anomalyBridgeRank) / maxAnomalyBridgeRank, 0) AS normalizedBridgeRank
+ ,coalesce(toFloat(anomalous.anomalyHubRank) / maxAnomalyHubRank, 0) AS normalizedHubRank
+ ,coalesce(toFloat(anomalous.anomalyOutlierRank) / maxAnomalyOutlierRank, 0) AS normalizedOutlierRank
+ ORDER BY filePath ASCENDING
+ """
+ return query_cypher_to_data_frame(query)
+
+
+# ------------------------------------------------------------------------------------------------------------
+# MAIN
+# ------------------------------------------------------------------------------------------------------------
+
+
+parameters = parse_input_parameters()
+title_prefix = parameters.get_title_prefix()
+driver = get_graph_database_driver()
+
+print(f"treemapVisualizations: Querying {title_prefix} data for treemap visualization...")
+anomaly_file_paths = query_data()
+
+print(f"treemapVisualizations: Preparing {title_prefix} data for treemap visualization...")
+anomaly_file_paths = prepare_data_for_treemap(anomaly_file_paths)
+
+# --- Visualizing Anomaly Scores
+
+print(f"treemapVisualizations: Creating {title_prefix} anomaly scores treemap visualization...")
+figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
+ create_treemap_settings(anomaly_file_paths),
+ marker=dict(
+ **plotly_treemap_marker_base_color_scale,
+ colors=anomaly_file_paths['absoluteAnomalyScore'],
+ colorbar={"title": "score"},
+ ),
+))
+figure.update_layout(
+ **plotly_treemap_layout_base_settings, # type: ignore
+ title=f'Average {title_prefix} anomaly score per directory',
+)
+figure.write_image(**get_plotly_figure_write_image_settings(f"{title_prefix}Treemap1AverageAnomalyScorePerDirectory", parameters.get_report_directory()))
+
+# --- Visualizing Archetypes
+
+print(f"treemapVisualizations: Creating {title_prefix} archetypes overview treemap visualization...")
+mutual_exclusive_archetype_ranks_data = mutual_exclusive_ranks(anomaly_file_paths)
+
+coloring_pairs = get_coloring_pairs()
+combined_colors = combine_rank_colors(mutual_exclusive_archetype_ranks_data, archetype_columns, coloring_pairs)
+
+figure = plotly_graph_objects.Figure()
+
+figure.add_trace(plotly_graph_objects.Treemap(
+ create_treemap_settings(mutual_exclusive_archetype_ranks_data),
+ marker=dict(
+ **plotly_treemap_marker_base_style,
+ line={"width": 1, "color": "black"},
+ showscale=False,
+ colors=combined_colors,
+ ),
+ name="Anomalies",
+ opacity=0.8
+))
+
+# Add dummy scatter traces for the legend: Treemap traces don't provide legend entries
+# of their own, so invisible scatter markers carry the archetype colors instead.
+for name, (low, high) in zip(archetype_names, coloring_pairs):
+ bright_color = interpolate_color(low, high, 0.4) # light tone for legend filling
+ dark_color = interpolate_color(low, high, 1.0) # darkest tone for legend outline
+ figure.add_trace(plotly_graph_objects.Scatter(
+ x=[None],
+ y=[None],
+ mode="markers",
+ marker={"size": 12, "color": bright_color, "line": {"width": 2, "color": dark_color}},
+ name=name,
+ legendgroup=name,
+ showlegend=True,
+ ))
+
+figure.update_layout(
+ **plotly_treemap_layout_base_settings, # type: ignore
+ title=f'Overview of all {title_prefix} anomaly archetypes per directory',
+ legend={
+ "orientation": "h", # horizontal legend
+ "yanchor": "bottom",
+ "y": -0.12,
+ "xanchor": "center",
+ "x": 0.5
+ }
+)
+figure.update_xaxes(visible=False)
+figure.update_yaxes(visible=False)
+figure.write_image(**get_plotly_figure_write_image_settings(f"{title_prefix}Treemap2ArchetypesOverviewPerDirectory", parameters.get_report_directory()))
+
+# --- Visualizing Archetypes individually
+
+def plot_single_archetype_treemap(archetype: Archetypes, title_prefix: str, file_index: int, data: pd.DataFrame):
+ """
+ Plots a treemap for the given archetype using the provided data.
+ archetype : Archetypes : The archetype to plot
+ data : pd.DataFrame : The input data frame
+ """
+ print(f"treemapVisualizations: Creating {title_prefix} archetype '{archetype}' treemap visualization...")
+ data_to_display = data.copy()
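+ # Keep every row that has at least one non-zero archetype rank (presumably so the
+ # directory structure stays comparable across the individual archetype plots);
+ # the coloring below then highlights only the selected archetype.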
+ data_to_display = data_to_display[data_to_display[archetype_columns].sum(axis=1) > 0]
+
+ combined_colors = get_rank_color_for_archetype(data_to_display, archetype)
+
+ figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(
+ create_treemap_settings(data_to_display),
+ marker=dict(
+ **plotly_treemap_marker_base_style,
+ colors=combined_colors,
+ line={"width": 1, "color": "black"},
+ colorbar={"title": "rank"},
+ ),
+ ))
+ figure.update_layout(
+ **plotly_treemap_layout_base_settings, # type: ignore
+ title=f'{title_prefix} Archetype "{archetype}" per directory',
+ )
+ figure.write_image(**get_plotly_figure_write_image_settings(f"{title_prefix}Treemap{file_index}Archetype{archetype}PerDirectory", parameters.get_report_directory()))
+
+plot_single_archetype_treemap("Authority", title_prefix, 3, anomaly_file_paths)
+plot_single_archetype_treemap("Bottleneck", title_prefix, 4, anomaly_file_paths)
+plot_single_archetype_treemap("Bridge", title_prefix, 5, anomaly_file_paths)
+plot_single_archetype_treemap("Hub", title_prefix, 6, anomaly_file_paths)
+plot_single_archetype_treemap("Outlier", title_prefix, 7, anomaly_file_paths)
+
+driver.close()
+print("treemapVisualizations: Successfully created treemap visualizations.")
\ No newline at end of file