Skip to content

Commit 540fd71

Browse files
committed
Add Hierarchical Density-Based Spatial Clustering (HDBSCAN) Community Detection
1 parent fd0e7b4 commit 540fd71

6 files changed

+165
-4
lines changed
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Estimate
// Calls the estimate variant of the HDBSCAN write procedure on the "<projection>-cleaned" graph
// and returns the yielded memory figures.

CALL gds.hdbscan.write.estimate(
  $dependencies_projection + '-cleaned',
  {
    nodeProperty: $dependencies_projection_node_embeddings_property,
    writeProperty: $dependencies_projection_write_property,
    samples: 3
  }
)
YIELD requiredMemory, nodeCount, relationshipCount,
      bytesMin, bytesMax,
      heapPercentageMin, heapPercentageMax,
      treeView, mapView
RETURN requiredMemory, nodeCount, relationshipCount,
       bytesMin, bytesMax,
       heapPercentageMin, heapPercentageMax,
       treeView
       // mapView is yielded but deliberately not returned:
       //,mapView //doesn't work on Windows with git bash jq version jq-1.7-dirty
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Statistics
// Calls the stats variant of HDBSCAN on the "<projection>-cleaned" graph and returns
// the yielded cluster counts and timings.

CALL gds.hdbscan.stats(
  $dependencies_projection + '-cleaned',
  {
    nodeProperty: $dependencies_projection_node_embeddings_property,
    samples: 3
  }
)
YIELD nodeCount
     ,numberOfClusters
     ,numberOfNoisePoints
     ,preProcessingMillis
     ,computeMillis
     ,postProcessingMillis
RETURN nodeCount
      ,numberOfClusters
      ,numberOfNoisePoints
      ,preProcessingMillis
      ,computeMillis
      ,postProcessingMillis
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Mutate
// Calls the mutate variant of HDBSCAN on the "<projection>-cleaned" graph, using
// $dependencies_projection_write_property as the mutateProperty.

CALL gds.hdbscan.mutate(
  $dependencies_projection + '-cleaned',
  {
    nodeProperty: $dependencies_projection_node_embeddings_property,
    mutateProperty: $dependencies_projection_write_property,
    samples: 3
  }
)
YIELD nodeCount
     ,numberOfClusters
     ,numberOfNoisePoints
     ,nodePropertiesWritten
     ,preProcessingMillis
     ,computeMillis
     ,postProcessingMillis
     ,mutateMillis
RETURN nodeCount
      ,numberOfClusters
      ,numberOfNoisePoints
      ,nodePropertiesWritten
      ,preProcessingMillis
      ,computeMillis
      ,postProcessingMillis
      ,mutateMillis
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Stream
// Streams the HDBSCAN label of every node and groups the nodes by that label.
// Per label it returns the number of distinct member nodes and their distinct names,
// where a name is the first non-null value of fqn, fileName and name.

CALL gds.hdbscan.stream(
  $dependencies_projection + '-cleaned',
  {
    nodeProperty: $dependencies_projection_node_embeddings_property,
    samples: 3
  }
)
YIELD nodeId, label
 WITH label
     ,gds.util.asNode(nodeId) AS clusterNode
RETURN count(DISTINCT clusterNode) AS memberCount
      ,label
      ,collect(DISTINCT coalesce(clusterNode.fqn, clusterNode.fileName, clusterNode.name)) AS memberNames
 ORDER BY memberCount DESC, label ASC
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - write node property e.g. communityHdbscanLabel
// Calls the write variant of HDBSCAN on the "<projection>-cleaned" graph, using
// $dependencies_projection_write_property as the writeProperty.

CALL gds.hdbscan.write(
  $dependencies_projection + '-cleaned',
  {
    nodeProperty: $dependencies_projection_node_embeddings_property,
    writeProperty: $dependencies_projection_write_property,
    // NOTE(review): the original remark here was cut off mid-sentence
    // ("Samples = 3 turned out to be needed for"). The reason for this value
    // is not recorded in this file - TODO complete the explanation.
    samples: 3
  }
)
YIELD nodeCount, numberOfClusters, numberOfNoisePoints,
      preProcessingMillis, computeMillis, writeMillis, postProcessingMillis,
      nodePropertiesWritten
RETURN nodeCount, numberOfClusters, numberOfNoisePoints,
       preProcessingMillis, computeMillis, writeMillis, postProcessingMillis,
       nodePropertiesWritten

scripts/reports/CommunityCsv.sh

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,70 @@ detectCommunitiesWithKCoreDecomposition() {
242242
calculateCommunityMetrics "${@}" "${writePropertyName}"
243243
}
244244

245+
# Node Embeddings using Fast Random Projection (preparation step for HDBSCAN)
#
# Executes the Fast Random Projection "mutate" query with the fixed query parameters
# "dependencies_projection_write_property=embeddingsFastRandomProjection" and
# "dependencies_projection_embedding_dimension=2" appended to the given parameters.
# NOTE(review): detectCommunitiesWithHDBSCAN documents "embeddingsFastRandomProjection" as the example
# value of "dependencies_projection_node_embeddings_property". Presumably both names need to match - confirm.
#
# Required Parameters:
# - dependencies_projection=...
#   Name prefix for the in-memory projection name for dependencies. Example: "package"
# - dependencies_projection_node=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - dependencies_projection_weight_property=...
#   Name of the node property that contains the dependency weight. Example: "weight"
nodeEmbeddingsWithFastRandomProjectionForHDBSCAN() {
    local NODE_EMBEDDINGS_CYPHER_DIR="${CYPHER_DIR}/Node_Embeddings"
    local mutatePropertyName="dependencies_projection_write_property=embeddingsFastRandomProjection"
    local embeddingsDimension="dependencies_projection_embedding_dimension=2"

    # Statistics (currently disabled)
    # execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1a_Fast_Random_Projection_Estimate.cypher" "${@}" "${mutatePropertyName}" "${embeddingsDimension}"
    # execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1b_Fast_Random_Projection_Statistics.cypher" "${@}" "${embeddingsDimension}"

    # Run the algorithm and write the result into the in-memory projection ("mutate").
    # Every argument is quoted so that each "key=value" pair is passed as exactly one word,
    # independent of the current IFS and of pathname expansion.
    execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1c_Fast_Random_Projection_Mutate.cypher" "${@}" "${mutatePropertyName}" "${embeddingsDimension}"
}
266+
267+
# Community Detection using Hierarchical Density-Based Spatial Clustering (HDBSCAN) Algorithm
#
# Runs the HDBSCAN estimate, statistics and mutate queries, streams the mutated property
# into "<node label>_Communities_HDBSCAN.csv" inside the report directory, updates the
# graph (property and label queries) and finally calls calculateCommunityMetrics.
#
# Required Parameters:
# - dependencies_projection=...
#   Name prefix for the in-memory projection name for dependencies. Example: "package"
# - dependencies_projection_node=...
#   Label of the nodes that will be used for the projection. Example: "Package"
# - dependencies_projection_weight_property=...
#   Name of the node property that contains the dependency weight. Example: "weight"
# - dependencies_projection_node_embeddings_property=...
#   Name of the node property that contains node embeddings. Example: "embeddingsFastRandomProjection"
#
# Special Requirements:
# - This algorithm needs a node property with an array of floats to compute clusters.
#   One possible way is to use node embeddings for that (like FastRP).
detectCommunitiesWithHDBSCAN() {
    local communityQueries="${CYPHER_DIR}/Community_Detection"
    local projectionQueries="${CYPHER_DIR}/Dependencies_Projection"

    local propertyParameter="dependencies_projection_write_property=communityHdbscanLabel"
    local labelParameter="dependencies_projection_write_label=HDBSCAN"

    # Statistics: memory estimation first, then the algorithm statistics
    execute_cypher "${communityQueries}/Community_Detection_11a_HDBSCAN_Estimate.cypher" "${@}" "${propertyParameter}"
    execute_cypher "${communityQueries}/Community_Detection_11b_HDBSCAN_Statistics.cypher" "${@}"

    # Run the algorithm and write the result into the in-memory projection ("mutate")
    execute_cypher "${communityQueries}/Community_Detection_11c_HDBSCAN_Mutate.cypher" "${@}" "${propertyParameter}"

    # Stream to CSV. Declaration and assignment are kept separate so that "local"
    # does not hide the exit status of the command substitution.
    local projectedNodeLabel
    projectedNodeLabel=$( extractQueryParameter "dependencies_projection_node" "${@}")
    local csvReportFile="${FULL_REPORT_DIRECTORY}/${projectedNodeLabel}_Communities_HDBSCAN.csv"
    execute_cypher "${projectionQueries}/Dependencies_8_Stream_Mutated_Grouped.cypher" "${@}" "${propertyParameter}" > "${csvReportFile}"

    # Update Graph (node properties and labels) using the already mutated property projection
    execute_cypher "${projectionQueries}/Dependencies_9_Write_Mutated.cypher" "${@}" "${propertyParameter}"
    local labelQuery
    for labelQuery in "Dependencies_10_Delete_Label" "Dependencies_11_Add_Label"; do
        execute_cypher "${projectionQueries}/${labelQuery}.cypher" "${@}" "${propertyParameter}" "${labelParameter}"
    done

    calculateCommunityMetrics "${@}" "${propertyParameter}"
}
308+
245309
# Community Detection using the Approximate Maximum k-cut Algorithm
246310
#
247311
# Required Parameters:
@@ -402,6 +466,13 @@ detectCommunities() {
402466
time detectCommunitiesWithKCoreDecomposition "${@}"
403467
time detectCommunitiesWithApproximateMaximumKCut "${@}"
404468
time calculateLocalClusteringCoefficient "${@}"
469+
470+
# TODO Hard-wire built-in dependencies_projection_node_embeddings_property
471+
nodeEmbeddingsProperty=$( extractQueryParameter "dependencies_projection_node_embeddings_property" "${@}")
472+
if [ -n "${nodeEmbeddingsProperty}" ]; then
473+
time nodeEmbeddingsWithFastRandomProjectionForHDBSCAN "${@}"
474+
time detectCommunitiesWithHDBSCAN "${@}"
475+
fi
405476
compareCommunityDetectionResults "${@}"
406477
listAllResults "${@}"
407478
}
@@ -415,7 +486,7 @@ ARTIFACT_GAMMA="dependencies_leiden_gamma=1.11" # default = 1.00
415486
ARTIFACT_KCUT="dependencies_maxkcut=5" # default = 2
416487

417488
if createUndirectedDependencyProjection "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}"; then
418-
detectCommunities "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}" "${ARTIFACT_GAMMA}" "${ARTIFACT_KCUT}"
489+
detectCommunities "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}" "${ARTIFACT_GAMMA}" "${ARTIFACT_KCUT}" # "${ARTIFACT_NODE_EMBEDDINGS}"
419490
writeLeidenModularity "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}"
420491
fi
421492

@@ -426,9 +497,10 @@ PACKAGE_NODE="dependencies_projection_node=Package"
426497
PACKAGE_WEIGHT="dependencies_projection_weight_property=weight25PercentInterfaces"
427498
PACKAGE_GAMMA="dependencies_leiden_gamma=1.14" # default = 1.00
428499
PACKAGE_KCUT="dependencies_maxkcut=20" # default = 2
500+
PACKAGE_NODE_EMBEDDINGS="dependencies_projection_node_embeddings_property=embeddingsFastRandomProjection" # default = none
429501

430502
if createUndirectedDependencyProjection "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"; then
431-
detectCommunities "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}" "${PACKAGE_GAMMA}" "${PACKAGE_KCUT}"
503+
detectCommunities "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}" "${PACKAGE_GAMMA}" "${PACKAGE_KCUT}" "${PACKAGE_NODE_EMBEDDINGS}"
432504
writeLeidenModularity "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"
433505

434506
# Package Community Detection - Special CSV Queries after update
@@ -444,8 +516,7 @@ TYPE_GAMMA="dependencies_leiden_gamma=5.00" # default = 1.00
444516
TYPE_KCUT="dependencies_maxkcut=100" # default = 2
445517

446518
if createUndirectedJavaTypeDependencyProjection "${TYPE_PROJECTION}"; then
447-
detectCommunities "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}" "${TYPE_GAMMA}" "${TYPE_KCUT}"
448-
519+
detectCommunities "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}" "${TYPE_GAMMA}" "${TYPE_KCUT}" "${TYPE_NODE_EMBEDDINGS}"
449520
# Type Community Detection - Special CSV Queries after update
450521
execute_cypher "${CYPHER_DIR}/Community_Detection/Which_type_community_spans_several_artifacts_and_how_are_the_types_distributed.cypher" > "${FULL_REPORT_DIRECTORY}/Type_Communities_Leiden_That_Span_Multiple_Artifacts.csv"
451522
execute_cypher "${CYPHER_DIR}/Community_Detection/Type_communities_with_few_members_in_foreign_packages.cypher" > "${FULL_REPORT_DIRECTORY}/Type_communities_with_few_members_in_foreign_packages.csv"

0 commit comments

Comments
 (0)