diff --git a/Documentation/Metrics/arangodb_monitoring_tasks_existing.yaml b/Documentation/Metrics/arangodb_monitoring_tasks_existing.yaml new file mode 100644 index 000000000000..9ac8a8c79336 --- /dev/null +++ b/Documentation/Metrics/arangodb_monitoring_tasks_existing.yaml @@ -0,0 +1,14 @@ +name: arangodb_monitoring_tasks_existing +introducedIn: "3.12.5" +help: | + Number of currently existing monitoring tasks. +unit: number +type: gauge +category: Statistics +complexity: advanced +exposedBy: + - dbserver + - coordinator + - agent +description: | + Number of currently existing monitoring tasks. diff --git a/Documentation/Metrics/arangodb_monitoring_tasks_existing_thread_registries.yaml b/Documentation/Metrics/arangodb_monitoring_tasks_existing_thread_registries.yaml new file mode 100644 index 000000000000..4dd8fbbbe37c --- /dev/null +++ b/Documentation/Metrics/arangodb_monitoring_tasks_existing_thread_registries.yaml @@ -0,0 +1,14 @@ +name: arangodb_monitoring_tasks_existing_thread_registries +introducedIn: "3.12.5" +help: | + Number of threads that started currently existing monitoring tasks. +unit: number +type: gauge +category: Statistics +complexity: advanced +exposedBy: + - dbserver + - coordinator + - agent +description: | + Number of threads that started currently existing monitoring tasks. The thread itself does not need to exist any more. This number also includes still running threads that have started monitoring tasks that do not exist any more. diff --git a/Documentation/Metrics/arangodb_monitoring_tasks_ready_for_deletion.yaml b/Documentation/Metrics/arangodb_monitoring_tasks_ready_for_deletion.yaml new file mode 100644 index 000000000000..cb237dd4b322 --- /dev/null +++ b/Documentation/Metrics/arangodb_monitoring_tasks_ready_for_deletion.yaml @@ -0,0 +1,14 @@ +name: arangodb_monitoring_tasks_ready_for_deletion +introducedIn: "3.12.5" +help: | + Number of currently existing monitoring tasks that wait for their garbage collection. 
+unit: number +type: gauge +category: Statistics +complexity: advanced +exposedBy: + - dbserver + - coordinator + - agent +description: | + Number of currently existing monitoring tasks that wait for their garbage collection. diff --git a/Documentation/Metrics/arangodb_monitoring_tasks_thread_registries_total.yaml b/Documentation/Metrics/arangodb_monitoring_tasks_thread_registries_total.yaml new file mode 100644 index 000000000000..f2adb899b7d2 --- /dev/null +++ b/Documentation/Metrics/arangodb_monitoring_tasks_thread_registries_total.yaml @@ -0,0 +1,14 @@ +name: arangodb_monitoring_tasks_thread_registries_total +introducedIn: "3.12.5" +help: | + Total number of threads that started monitoring tasks since database creation. +unit: number +type: counter +category: Statistics +complexity: advanced +exposedBy: + - dbserver + - coordinator + - agent +description: | + Total number of threads that started monitoring tasks since database creation. diff --git a/Documentation/Metrics/arangodb_monitoring_tasks_total.yaml b/Documentation/Metrics/arangodb_monitoring_tasks_total.yaml new file mode 100644 index 000000000000..ea94ba8e2288 --- /dev/null +++ b/Documentation/Metrics/arangodb_monitoring_tasks_total.yaml @@ -0,0 +1,14 @@ +name: arangodb_monitoring_tasks_total +introducedIn: "3.12.5" +help: | + Total number of created monitoring tasks since database creation. +unit: number +type: counter +category: Statistics +complexity: advanced +exposedBy: + - dbserver + - coordinator + - agent +description: | + Total number of created monitoring tasks since database creation. 
diff --git a/arangod/AsyncRegistryServer/PrettyPrinter/.gdbinit b/arangod/AsyncRegistryServer/PrettyPrinter/.gdbinit deleted file mode 100644 index 87b4c3f4befd..000000000000 --- a/arangod/AsyncRegistryServer/PrettyPrinter/.gdbinit +++ /dev/null @@ -1,8 +0,0 @@ -python -import sys -sys.path.insert(0, './arangod/AsyncRegistryServer/PrettyPrinter/src/') -end - -source ./arangod/AsyncRegistryServer/PrettyPrinter/src/asyncregistry/gdb_printer.py - -echo "asyncregistry pretty-printer loaded\n" diff --git a/arangod/AsyncRegistryServer/Stacktrace/CMakeLists.txt b/arangod/AsyncRegistryServer/Stacktrace/CMakeLists.txt deleted file mode 100644 index 485f0388e42f..000000000000 --- a/arangod/AsyncRegistryServer/Stacktrace/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -add_library(arango_async_registry_stacktrace INTERFACE - depth_first.h - forest.h) -target_include_directories(arango_async_registry_stacktrace - INTERFACE - ${PROJECT_SOURCE_DIR}/arangod) diff --git a/arangod/CMakeLists.txt b/arangod/CMakeLists.txt index 5c79ea9b5cf5..9bf5a4fac4ed 100644 --- a/arangod/CMakeLists.txt +++ b/arangod/CMakeLists.txt @@ -103,7 +103,6 @@ get_target_property(IRESEARCH_INCLUDE include(arangoserver.cmake) add_subdirectory(Agency) add_subdirectory(Aql) -add_subdirectory(AsyncRegistryServer) add_subdirectory(Cache) add_subdirectory(Cluster) add_subdirectory(ClusterEngine) @@ -118,6 +117,7 @@ add_subdirectory(RestHandler) add_subdirectory(RestServer) add_subdirectory(RocksDBEngine) add_subdirectory(StorageEngine) +add_subdirectory(SystemMonitor) add_subdirectory(Utils) if (USE_V8) add_subdirectory(V8Server) diff --git a/arangod/Cluster/Maintenance.cpp b/arangod/Cluster/Maintenance.cpp index f03a4b60959a..68378610ed64 100644 --- a/arangod/Cluster/Maintenance.cpp +++ b/arangod/Cluster/Maintenance.cpp @@ -34,7 +34,7 @@ #include "Cluster/ClusterFeature.h" #include "Cluster/ClusterInfo.h" #include "Cluster/FollowerInfo.h" -#include "Cluster/ResignShardLeadership.h" +#include 
"Cluster/MaintenanceActions/ResignShardLeadership.h" #include "Indexes/Index.h" #include "Inspection/VPack.h" #include "IResearch/IResearchCommon.h" diff --git a/arangod/Cluster/Action.cpp b/arangod/Cluster/MaintenanceActions/Action.cpp similarity index 91% rename from arangod/Cluster/Action.cpp rename to arangod/Cluster/MaintenanceActions/Action.cpp index b274a622592a..b82024a417e7 100644 --- a/arangod/Cluster/Action.cpp +++ b/arangod/Cluster/MaintenanceActions/Action.cpp @@ -25,18 +25,18 @@ #include "Action.h" #include "Basics/Exceptions.h" -#include "Cluster/CreateCollection.h" -#include "Cluster/CreateDatabase.h" -#include "Cluster/DropCollection.h" -#include "Cluster/DropDatabase.h" -#include "Cluster/DropIndex.h" -#include "Cluster/EnsureIndex.h" +#include "Cluster/MaintenanceActions/CreateCollection.h" +#include "Cluster/MaintenanceActions/CreateDatabase.h" +#include "Cluster/MaintenanceActions/DropCollection.h" +#include "Cluster/MaintenanceActions/DropDatabase.h" +#include "Cluster/MaintenanceActions/DropIndex.h" +#include "Cluster/MaintenanceActions/EnsureIndex.h" #include "Cluster/MaintenanceStrings.h" -#include "Cluster/ResignShardLeadership.h" -#include "Cluster/SynchronizeShard.h" -#include "Cluster/TakeoverShardLeadership.h" -#include "Cluster/UpdateCollection.h" -#include "Cluster/UpdateReplicatedLogAction.h" +#include "Cluster/MaintenanceActions/ResignShardLeadership.h" +#include "Cluster/MaintenanceActions/SynchronizeShard.h" +#include "Cluster/MaintenanceActions/TakeoverShardLeadership.h" +#include "Cluster/MaintenanceActions/UpdateCollection.h" +#include "Cluster/MaintenanceActions/UpdateReplicatedLogAction.h" using namespace arangodb; using namespace arangodb::maintenance; diff --git a/arangod/Cluster/Action.h b/arangod/Cluster/MaintenanceActions/Action.h similarity index 100% rename from arangod/Cluster/Action.h rename to arangod/Cluster/MaintenanceActions/Action.h diff --git a/arangod/Cluster/ActionBase.cpp 
b/arangod/Cluster/MaintenanceActions/ActionBase.cpp similarity index 99% rename from arangod/Cluster/ActionBase.cpp rename to arangod/Cluster/MaintenanceActions/ActionBase.cpp index ce8018d1e995..11ce40b8b8eb 100644 --- a/arangod/Cluster/ActionBase.cpp +++ b/arangod/Cluster/MaintenanceActions/ActionBase.cpp @@ -22,7 +22,7 @@ /// @author Matthew Von-Maszewski //////////////////////////////////////////////////////////////////////////////// -#include "Cluster/ActionBase.h" +#include "Cluster/MaintenanceActions/ActionBase.h" #include "ApplicationFeatures/ApplicationServer.h" #include "Basics/TimeString.h" diff --git a/arangod/Cluster/ActionBase.h b/arangod/Cluster/MaintenanceActions/ActionBase.h similarity index 100% rename from arangod/Cluster/ActionBase.h rename to arangod/Cluster/MaintenanceActions/ActionBase.h diff --git a/arangod/Cluster/ActionDescription.cpp b/arangod/Cluster/MaintenanceActions/ActionDescription.cpp similarity index 100% rename from arangod/Cluster/ActionDescription.cpp rename to arangod/Cluster/MaintenanceActions/ActionDescription.cpp diff --git a/arangod/Cluster/ActionDescription.h b/arangod/Cluster/MaintenanceActions/ActionDescription.h similarity index 100% rename from arangod/Cluster/ActionDescription.h rename to arangod/Cluster/MaintenanceActions/ActionDescription.h diff --git a/arangod/Cluster/CreateCollection.cpp b/arangod/Cluster/MaintenanceActions/CreateCollection.cpp similarity index 98% rename from arangod/Cluster/CreateCollection.cpp rename to arangod/Cluster/MaintenanceActions/CreateCollection.cpp index 0fd7fcde263a..e5a4b7fd6f66 100644 --- a/arangod/Cluster/CreateCollection.cpp +++ b/arangod/Cluster/MaintenanceActions/CreateCollection.cpp @@ -44,6 +44,7 @@ #include "Replication2/ReplicatedState/ReplicatedState.h" #include "Replication2/StateMachines/Document/DocumentFollowerState.h" #include "Replication2/StateMachines/Document/DocumentLeaderState.h" +#include "TaskMonitoring/task.h" #include #include @@ -112,6 +113,11 @@ bool 
CreateCollection::first() { auto const& leader = _description.get(THE_LEADER); auto const& props = properties(); + // Add task monitoring + auto task = task_monitoring::Task{"CreateCollection for DB: '" + database + + "', Collection: '" + collection + + "', Shard: '" + shard + "'"}; + std::string from; _description.get("from", from); diff --git a/arangod/Cluster/CreateCollection.h b/arangod/Cluster/MaintenanceActions/CreateCollection.h similarity index 100% rename from arangod/Cluster/CreateCollection.h rename to arangod/Cluster/MaintenanceActions/CreateCollection.h diff --git a/arangod/Cluster/CreateDatabase.cpp b/arangod/Cluster/MaintenanceActions/CreateDatabase.cpp similarity index 96% rename from arangod/Cluster/CreateDatabase.cpp rename to arangod/Cluster/MaintenanceActions/CreateDatabase.cpp index 8a944aef491c..bc9a2599611a 100644 --- a/arangod/Cluster/CreateDatabase.cpp +++ b/arangod/Cluster/MaintenanceActions/CreateDatabase.cpp @@ -37,6 +37,7 @@ #include "Utils/DatabaseGuard.h" #include "Utils/OperationOptions.h" #include "VocBase/Methods/Databases.h" +#include "TaskMonitoring/task.h" using namespace arangodb; using namespace arangodb::application_features; @@ -69,6 +70,10 @@ bool CreateDatabase::first() { VPackSlice users; auto database = _description.get(DATABASE); + // Add task monitoring + auto task = + task_monitoring::Task{"CreateDatabase for DB: '" + database + "'"}; + LOG_TOPIC("953b1", DEBUG, Logger::MAINTENANCE) << "CreateDatabase: creating database " << database; diff --git a/arangod/Cluster/CreateDatabase.h b/arangod/Cluster/MaintenanceActions/CreateDatabase.h similarity index 100% rename from arangod/Cluster/CreateDatabase.h rename to arangod/Cluster/MaintenanceActions/CreateDatabase.h diff --git a/arangod/Cluster/DropCollection.cpp b/arangod/Cluster/MaintenanceActions/DropCollection.cpp similarity index 96% rename from arangod/Cluster/DropCollection.cpp rename to arangod/Cluster/MaintenanceActions/DropCollection.cpp index 
aa71020576b4..3c4578559629 100644 --- a/arangod/Cluster/DropCollection.cpp +++ b/arangod/Cluster/MaintenanceActions/DropCollection.cpp @@ -40,6 +40,7 @@ #include "VocBase/Methods/Collections.h" #include "VocBase/Methods/Databases.h" #include "VocBase/vocbase.h" +#include "TaskMonitoring/task.h" using namespace arangodb; using namespace arangodb::application_features; @@ -69,6 +70,10 @@ bool DropCollection::first() { auto const& database = getDatabase(); auto const& shard = getShard(); + // Add task monitoring + auto task = task_monitoring::Task{"DropCollection for DB: '" + database + + "', Shard: '" + shard + "'"}; + LOG_TOPIC("a2961", DEBUG, Logger::MAINTENANCE) << "DropCollection: dropping local shard '" << database << "/" << shard; diff --git a/arangod/Cluster/DropCollection.h b/arangod/Cluster/MaintenanceActions/DropCollection.h similarity index 100% rename from arangod/Cluster/DropCollection.h rename to arangod/Cluster/MaintenanceActions/DropCollection.h diff --git a/arangod/Cluster/DropDatabase.cpp b/arangod/Cluster/MaintenanceActions/DropDatabase.cpp similarity index 95% rename from arangod/Cluster/DropDatabase.cpp rename to arangod/Cluster/MaintenanceActions/DropDatabase.cpp index ce2ff1ba7131..ee69075fef4a 100644 --- a/arangod/Cluster/DropDatabase.cpp +++ b/arangod/Cluster/MaintenanceActions/DropDatabase.cpp @@ -36,6 +36,7 @@ #include "Utils/ExecContext.h" #include "Utils/OperationOptions.h" #include "VocBase/Methods/Databases.h" +#include "TaskMonitoring/task.h" using namespace arangodb::application_features; using namespace arangodb::methods; @@ -69,6 +70,9 @@ bool DropDatabase::first() { LOG_TOPIC("22779", DEBUG, Logger::MAINTENANCE) << "DropDatabase: dropping " << database; + // Add task monitoring + auto task = task_monitoring::Task{"DropDatabase for DB: '" + database + "'"}; + try { auto& df = _feature.server().getFeature(); DatabaseGuard guard(df, StaticStrings::SystemDatabase); diff --git a/arangod/Cluster/DropDatabase.h 
b/arangod/Cluster/MaintenanceActions/DropDatabase.h similarity index 100% rename from arangod/Cluster/DropDatabase.h rename to arangod/Cluster/MaintenanceActions/DropDatabase.h diff --git a/arangod/Cluster/DropIndex.cpp b/arangod/Cluster/MaintenanceActions/DropIndex.cpp similarity index 96% rename from arangod/Cluster/DropIndex.cpp rename to arangod/Cluster/MaintenanceActions/DropIndex.cpp index a70ac036ca95..32965875e84d 100644 --- a/arangod/Cluster/DropIndex.cpp +++ b/arangod/Cluster/MaintenanceActions/DropIndex.cpp @@ -39,6 +39,7 @@ #include "VocBase/Methods/Collections.h" #include "VocBase/Methods/Databases.h" #include "VocBase/Methods/Indexes.h" +#include "TaskMonitoring/task.h" using namespace arangodb::application_features; using namespace arangodb::maintenance; @@ -90,6 +91,11 @@ bool DropIndex::first() { auto const& shard = _description.get(SHARD); auto const& id = _description.get(INDEX); + // Add task monitoring + auto task = + task_monitoring::Task{"DropIndex for DB: '" + database + "', Shard: '" + + shard + "', Index: '" + id + "'"}; + VPackBuilder index; index.add(VPackValue(_description.get(INDEX))); diff --git a/arangod/Cluster/DropIndex.h b/arangod/Cluster/MaintenanceActions/DropIndex.h similarity index 100% rename from arangod/Cluster/DropIndex.h rename to arangod/Cluster/MaintenanceActions/DropIndex.h diff --git a/arangod/Cluster/EnsureIndex.cpp b/arangod/Cluster/MaintenanceActions/EnsureIndex.cpp similarity index 97% rename from arangod/Cluster/EnsureIndex.cpp rename to arangod/Cluster/MaintenanceActions/EnsureIndex.cpp index 43007082df6d..2b6c55c0b6b6 100644 --- a/arangod/Cluster/EnsureIndex.cpp +++ b/arangod/Cluster/MaintenanceActions/EnsureIndex.cpp @@ -40,6 +40,7 @@ #include "Utils/DatabaseGuard.h" #include "VocBase/LogicalCollection.h" #include "VocBase/Methods/Databases.h" +#include "TaskMonitoring/task.h" using namespace arangodb; using namespace arangodb::application_features; @@ -114,6 +115,11 @@ bool EnsureIndex::first() { auto const& 
shard = _description.get(SHARD); auto const& id = properties().get(ID).copyString(); + // Add task monitoring + auto task = task_monitoring::Task{"EnsureIndex for DB: '" + database + + "', Collection: '" + collection + + "', Shard: '" + shard + "'"}; + try { // now try to guard the database auto& df = _feature.server().getFeature(); DatabaseGuard guard(df, database); diff --git a/arangod/Cluster/EnsureIndex.h b/arangod/Cluster/MaintenanceActions/EnsureIndex.h similarity index 100% rename from arangod/Cluster/EnsureIndex.h rename to arangod/Cluster/MaintenanceActions/EnsureIndex.h diff --git a/arangod/Cluster/ResignShardLeadership.cpp b/arangod/Cluster/MaintenanceActions/ResignShardLeadership.cpp similarity index 96% rename from arangod/Cluster/ResignShardLeadership.cpp rename to arangod/Cluster/MaintenanceActions/ResignShardLeadership.cpp index c390f071167c..df48fc6d9526 100644 --- a/arangod/Cluster/ResignShardLeadership.cpp +++ b/arangod/Cluster/MaintenanceActions/ResignShardLeadership.cpp @@ -41,6 +41,7 @@ #include "VocBase/LogicalCollection.h" #include "VocBase/Methods/Collections.h" #include "VocBase/Methods/Databases.h" +#include "TaskMonitoring/task.h" #include #include @@ -80,6 +81,11 @@ bool ResignShardLeadership::first() { std::string const& database = getDatabase(); std::string const& collection = getShard(); + // Add task monitoring + auto task = + task_monitoring::Task{"ResignShardLeadership for DB: '" + database + + "', Shard: '" + collection + "'"}; + LOG_TOPIC("14f43", DEBUG, Logger::MAINTENANCE) << "trying to withdraw as leader of shard '" << database << "/" << collection; diff --git a/arangod/Cluster/ResignShardLeadership.h b/arangod/Cluster/MaintenanceActions/ResignShardLeadership.h similarity index 100% rename from arangod/Cluster/ResignShardLeadership.h rename to arangod/Cluster/MaintenanceActions/ResignShardLeadership.h diff --git a/arangod/Cluster/SynchronizeShard.cpp b/arangod/Cluster/MaintenanceActions/SynchronizeShard.cpp similarity index 
99% rename from arangod/Cluster/SynchronizeShard.cpp rename to arangod/Cluster/MaintenanceActions/SynchronizeShard.cpp index 51641892c66c..0ba01f8a55da 100644 --- a/arangod/Cluster/SynchronizeShard.cpp +++ b/arangod/Cluster/MaintenanceActions/SynchronizeShard.cpp @@ -33,7 +33,7 @@ #include "Basics/TimeString.h" #include "Basics/VelocyPackHelper.h" #include "Basics/debugging.h" -#include "Cluster/ActionDescription.h" +#include "Cluster/MaintenanceActions/ActionDescription.h" #include "Cluster/AgencyCache.h" #include "Cluster/ClusterFeature.h" #include "Cluster/ClusterInfo.h" @@ -41,7 +41,7 @@ #include "Cluster/FollowerInfo.h" #include "Cluster/Maintenance.h" #include "Cluster/MaintenanceFeature.h" -#include "Cluster/ResignShardLeadership.h" +#include "Cluster/MaintenanceActions/ResignShardLeadership.h" #include "Cluster/ReplicationTimeoutFeature.h" #include "Cluster/ServerState.h" #include "GeneralServer/AuthenticationFeature.h" @@ -66,6 +66,7 @@ #include "VocBase/LogicalCollection.h" #include "VocBase/Methods/Collections.h" #include "VocBase/Methods/Databases.h" +#include "TaskMonitoring/task.h" #include #include diff --git a/arangod/Cluster/SynchronizeShard.h b/arangod/Cluster/MaintenanceActions/SynchronizeShard.h similarity index 97% rename from arangod/Cluster/SynchronizeShard.h rename to arangod/Cluster/MaintenanceActions/SynchronizeShard.h index b430981427c6..8cd2ba0ba267 100644 --- a/arangod/Cluster/SynchronizeShard.h +++ b/arangod/Cluster/MaintenanceActions/SynchronizeShard.h @@ -25,8 +25,8 @@ #pragma once #include "Basics/ResultT.h" -#include "Cluster/ActionBase.h" -#include "Cluster/ActionDescription.h" +#include "Cluster/MaintenanceActions/ActionBase.h" +#include "Cluster/MaintenanceActions/ActionDescription.h" #include "Replication/utilities.h" #include "VocBase/voc-types.h" diff --git a/arangod/Cluster/TakeoverShardLeadership.cpp b/arangod/Cluster/MaintenanceActions/TakeoverShardLeadership.cpp similarity index 97% rename from 
arangod/Cluster/TakeoverShardLeadership.cpp rename to arangod/Cluster/MaintenanceActions/TakeoverShardLeadership.cpp index 183a117ec6c0..ccbb78e23d9e 100644 --- a/arangod/Cluster/TakeoverShardLeadership.cpp +++ b/arangod/Cluster/MaintenanceActions/TakeoverShardLeadership.cpp @@ -51,6 +51,7 @@ #include "VocBase/LogicalCollection.h" #include "VocBase/Methods/Collections.h" #include "VocBase/Methods/Databases.h" +#include "TaskMonitoring/task.h" #include #include @@ -267,6 +268,11 @@ bool TakeoverShardLeadership::first() { uint64_t planIndex = basics::StringUtils::uint64(planRaftIndex); Result res; + // Add task monitoring + auto task = task_monitoring::Task{"TakeoverShardLeadership for DB: '" + + database + "', Collection: '" + collection + + "', Shard: '" + shard + "'"}; + try { auto& df = _feature.server().getFeature(); DatabaseGuard guard(df, database); diff --git a/arangod/Cluster/TakeoverShardLeadership.h b/arangod/Cluster/MaintenanceActions/TakeoverShardLeadership.h similarity index 100% rename from arangod/Cluster/TakeoverShardLeadership.h rename to arangod/Cluster/MaintenanceActions/TakeoverShardLeadership.h diff --git a/arangod/Cluster/UpdateCollection.cpp b/arangod/Cluster/MaintenanceActions/UpdateCollection.cpp similarity index 96% rename from arangod/Cluster/UpdateCollection.cpp rename to arangod/Cluster/MaintenanceActions/UpdateCollection.cpp index 9e22e43d8dbd..6fb71deb4030 100644 --- a/arangod/Cluster/UpdateCollection.cpp +++ b/arangod/Cluster/MaintenanceActions/UpdateCollection.cpp @@ -43,6 +43,7 @@ #include "VocBase/LogicalCollection.h" #include "VocBase/Methods/Collections.h" #include "VocBase/Methods/Databases.h" +#include "TaskMonitoring/task.h" using namespace arangodb; using namespace arangodb::application_features; @@ -89,6 +90,11 @@ bool UpdateCollection::first() { auto const& props = properties(); Result res; + // Add task monitoring + auto task = task_monitoring::Task{"UpdateCollection for DB: '" + database + + "', Collection: '" + 
collection + + "', Shard: '" + shard + "'"}; + std::string from; _description.get("from", from); diff --git a/arangod/Cluster/UpdateCollection.h b/arangod/Cluster/MaintenanceActions/UpdateCollection.h similarity index 100% rename from arangod/Cluster/UpdateCollection.h rename to arangod/Cluster/MaintenanceActions/UpdateCollection.h diff --git a/arangod/Cluster/UpdateReplicatedLogAction.cpp b/arangod/Cluster/MaintenanceActions/UpdateReplicatedLogAction.cpp similarity index 95% rename from arangod/Cluster/UpdateReplicatedLogAction.cpp rename to arangod/Cluster/MaintenanceActions/UpdateReplicatedLogAction.cpp index 97cf44728e29..c73622f54591 100644 --- a/arangod/Cluster/UpdateReplicatedLogAction.cpp +++ b/arangod/Cluster/MaintenanceActions/UpdateReplicatedLogAction.cpp @@ -41,6 +41,7 @@ #include "UpdateReplicatedLogAction.h" #include "Utils/DatabaseGuard.h" #include "VocBase/vocbase.h" +#include "TaskMonitoring/task.h" using namespace arangodb; using namespace arangodb::basics; @@ -63,6 +64,11 @@ bool arangodb::maintenance::UpdateReplicatedLogAction::first() { auto const& database = _description.get(DATABASE); auto& df = _feature.server().getFeature(); + // Add task monitoring + auto task = + task_monitoring::Task{"UpdateReplicatedLogAction for DB: '" + database + + "', LogId: '" + std::to_string(logId.id()) + "'"}; + auto result = basics::catchToResult([&] { DatabaseGuard guard(df, database); diff --git a/arangod/Cluster/UpdateReplicatedLogAction.h b/arangod/Cluster/MaintenanceActions/UpdateReplicatedLogAction.h similarity index 95% rename from arangod/Cluster/UpdateReplicatedLogAction.h rename to arangod/Cluster/MaintenanceActions/UpdateReplicatedLogAction.h index 035504c83054..1f9346a65d9e 100644 --- a/arangod/Cluster/UpdateReplicatedLogAction.h +++ b/arangod/Cluster/MaintenanceActions/UpdateReplicatedLogAction.h @@ -23,7 +23,7 @@ #pragma once -#include "Cluster/ActionBase.h" +#include "Cluster/MaintenanceActions/ActionBase.h" namespace arangodb::maintenance { diff 
--git a/arangod/Cluster/MaintenanceFeature.cpp b/arangod/Cluster/MaintenanceFeature.cpp index 0597462db89a..c43759f2d905 100644 --- a/arangod/Cluster/MaintenanceFeature.cpp +++ b/arangod/Cluster/MaintenanceFeature.cpp @@ -42,12 +42,12 @@ #include "Basics/TimeString.h" #include "Basics/WriteLocker.h" #include "Basics/system-functions.h" -#include "Cluster/Action.h" -#include "Cluster/ActionDescription.h" +#include "Cluster/MaintenanceActions/Action.h" +#include "Cluster/MaintenanceActions/ActionDescription.h" #include "Cluster/AgencyCache.h" #include "Cluster/ClusterFeature.h" #include "Cluster/ClusterInfo.h" -#include "Cluster/CreateDatabase.h" +#include "Cluster/MaintenanceActions/CreateDatabase.h" #include "Cluster/MaintenanceWorker.h" #include "Cluster/ServerState.h" #include "Logger/LogMacros.h" diff --git a/arangod/Cluster/MaintenanceFeature.h b/arangod/Cluster/MaintenanceFeature.h index a8ba7ce0e23e..51a4a214bed8 100644 --- a/arangod/Cluster/MaintenanceFeature.h +++ b/arangod/Cluster/MaintenanceFeature.h @@ -26,7 +26,7 @@ #include "Basics/ConditionVariable.h" #include "Basics/Result.h" -#include "Cluster/Action.h" +#include "Cluster/MaintenanceActions/Action.h" #include "Cluster/MaintenanceWorker.h" #include "Cluster/Utils/ShardID.h" #include "ProgramOptions/ProgramOptions.h" diff --git a/arangod/Cluster/MaintenanceRestHandler.h b/arangod/Cluster/MaintenanceRestHandler.h index 5d13893f068c..b622c93e5edf 100644 --- a/arangod/Cluster/MaintenanceRestHandler.h +++ b/arangod/Cluster/MaintenanceRestHandler.h @@ -24,7 +24,7 @@ #pragma once -#include "Cluster/Action.h" +#include "Cluster/MaintenanceActions/Action.h" #include "RestHandler/RestBaseHandler.h" namespace arangodb { diff --git a/arangod/Cluster/MaintenanceWorker.h b/arangod/Cluster/MaintenanceWorker.h index 1a6bb3e197c1..ffb84a2f831d 100644 --- a/arangod/Cluster/MaintenanceWorker.h +++ b/arangod/Cluster/MaintenanceWorker.h @@ -25,7 +25,7 @@ #pragma once #include "Basics/Thread.h" -#include 
"Cluster/Action.h" +#include "Cluster/MaintenanceActions/Action.h" namespace arangodb { diff --git a/arangod/GeneralServer/GeneralServerFeature.cpp b/arangod/GeneralServer/GeneralServerFeature.cpp index c73c02125d55..7dcd2e3fac30 100644 --- a/arangod/GeneralServer/GeneralServerFeature.cpp +++ b/arangod/GeneralServer/GeneralServerFeature.cpp @@ -30,7 +30,6 @@ #include "Agency/RestAgencyPrivHandler.h" #include "ApplicationFeatures/HttpEndpointProvider.h" #include "Aql/RestAqlHandler.h" -#include "AsyncRegistryServer/RestHandler.h" #include "Basics/StringUtils.h" #include "Basics/application-exit.h" #include "Basics/debugging.h" @@ -129,6 +128,8 @@ #include "Scheduler/SchedulerFeature.h" #include "StorageEngine/EngineSelectorFeature.h" #include "StorageEngine/StorageEngine.h" +#include "SystemMonitor/AsyncRegistry/RestHandler.h" +#include "SystemMonitor/TaskMonitoring/RestHandler.h" #ifdef USE_V8 #include "V8Server/V8DealerFeature.h" #endif @@ -843,6 +844,10 @@ void GeneralServerFeature::defineRemainingHandlers( "/_admin/async-registry", RestHandlerCreator::createNoData); + f.addPrefixHandler( + "/_admin/task-monitoring", + RestHandlerCreator::createNoData); + f.addPrefixHandler( "/_admin/cluster", RestHandlerCreator::createNoData); diff --git a/arangod/Replication2/StateMachines/Document/MaintenanceActionExecutor.cpp b/arangod/Replication2/StateMachines/Document/MaintenanceActionExecutor.cpp index 0ef784a30f93..314e7535738c 100644 --- a/arangod/Replication2/StateMachines/Document/MaintenanceActionExecutor.cpp +++ b/arangod/Replication2/StateMachines/Document/MaintenanceActionExecutor.cpp @@ -23,9 +23,9 @@ #include "Replication2/StateMachines/Document/MaintenanceActionExecutor.h" -#include "Cluster/ActionDescription.h" -#include "Cluster/CreateCollection.h" -#include "Cluster/EnsureIndex.h" +#include "Cluster/MaintenanceActions/ActionDescription.h" +#include "Cluster/MaintenanceActions/CreateCollection.h" +#include "Cluster/MaintenanceActions/EnsureIndex.h" #include 
"Cluster/Maintenance.h" #include "Logger/LogMacros.h" #include "VocBase/Methods/Collections.h" @@ -172,4 +172,4 @@ auto MaintenanceActionExecutor::addDirty() noexcept -> Result { } return res; } -} // namespace arangodb::replication2::replicated_state::document \ No newline at end of file +} // namespace arangodb::replication2::replicated_state::document diff --git a/arangod/RestHandler/RestCollectionHandler.cpp b/arangod/RestHandler/RestCollectionHandler.cpp index 704ab3c89bb3..26efab253491 100644 --- a/arangod/RestHandler/RestCollectionHandler.cpp +++ b/arangod/RestHandler/RestCollectionHandler.cpp @@ -25,7 +25,7 @@ #include "Async/async.h" #include "ApplicationFeatures/ApplicationServer.h" -#include "Cluster/ActionDescription.h" +#include "Cluster/MaintenanceActions/ActionDescription.h" #include "Cluster/ClusterFeature.h" #include "Cluster/ClusterInfo.h" #include "Cluster/ClusterMethods.h" diff --git a/arangod/RestHandler/RestDatabaseHandler.cpp b/arangod/RestHandler/RestDatabaseHandler.cpp index e478ee42ed1e..d812baa374a4 100644 --- a/arangod/RestHandler/RestDatabaseHandler.cpp +++ b/arangod/RestHandler/RestDatabaseHandler.cpp @@ -29,6 +29,7 @@ #include "Cluster/ClusterFeature.h" #include "Cluster/ClusterInfo.h" #include "Cluster/ServerState.h" +#include "TaskMonitoring/task.h" #include "Utils/Events.h" #include "VocBase/Methods/Databases.h" @@ -48,10 +49,13 @@ RestStatus RestDatabaseHandler::execute() { // extract the request type rest::RequestType const type = _request->requestType(); if (type == rest::RequestType::GET) { + auto task = task_monitoring::Task{"Request: List Databases"}; return getDatabases(); } else if (type == rest::RequestType::POST) { + auto task = task_monitoring::Task{"Request: Create Database"}; return createDatabase(); } else if (type == rest::RequestType::DELETE_REQ) { + auto task = task_monitoring::Task{"Request: Delete Database"}; return deleteDatabase(); } else { generateError(rest::ResponseCode::METHOD_NOT_ALLOWED, diff --git 
a/arangod/RestHandler/RestReplicationHandler.cpp b/arangod/RestHandler/RestReplicationHandler.cpp index 95d6777bec81..67da2043d7b8 100644 --- a/arangod/RestHandler/RestReplicationHandler.cpp +++ b/arangod/RestHandler/RestReplicationHandler.cpp @@ -42,7 +42,7 @@ #include "Cluster/CollectionInfoCurrent.h" #include "Cluster/FollowerInfo.h" #include "Cluster/RebootTracker.h" -#include "Cluster/ResignShardLeadership.h" +#include "Cluster/MaintenanceActions/ResignShardLeadership.h" #include "Cluster/ServerState.h" #include "Containers/HashSet.h" #include "Containers/MerkleTree.h" diff --git a/arangod/RestServer/arangod.cpp b/arangod/RestServer/arangod.cpp index 334c1ddbc7b0..2d8a81f3f697 100644 --- a/arangod/RestServer/arangod.cpp +++ b/arangod/RestServer/arangod.cpp @@ -84,6 +84,9 @@ static int runServer(int argc, char** argv, ArangoGlobalContext& context) { [](auto& server, TypeTag) { return std::make_unique(server); }, + [](auto& server, TypeTag) { + return std::make_unique(server); + }, #ifdef TRI_HAVE_GETRLIMIT [](auto& server, TypeTag) { return std::make_unique( diff --git a/arangod/RestServer/arangod.h b/arangod/RestServer/arangod.h index 7d7c88f0ade6..7b11b01af6b8 100644 --- a/arangod/RestServer/arangod.h +++ b/arangod/RestServer/arangod.h @@ -61,6 +61,11 @@ namespace async_registry { class Feature; +} +namespace task_monitoring { + +class Feature; + } class BootstrapFeature; class BumpFileDescriptorsFeature; @@ -206,6 +211,7 @@ using ArangodFeaturesList = TypeList< ApiRecordingFeature, AqlFeature, async_registry::Feature, + task_monitoring::Feature, AuthenticationFeature, BootstrapFeature, #ifdef TRI_HAVE_GETRLIMIT diff --git a/arangod/RestServer/arangod_includes.h b/arangod/RestServer/arangod_includes.h index 17d6fe636713..5435143d15a5 100644 --- a/arangod/RestServer/arangod_includes.h +++ b/arangod/RestServer/arangod_includes.h @@ -51,7 +51,8 @@ #include "Aql/AqlFunctionFeature.h" #include "Aql/OptimizerRulesFeature.h" #include "Aql/QueryInfoLoggerFeature.h" 
-#include "AsyncRegistryServer/Feature.h" +#include "SystemMonitor/AsyncRegistry/Feature.h" +#include "SystemMonitor/TaskMonitoring/Feature.h" #include "Basics/ArangoGlobalContext.h" #include "Basics/FileUtils.h" #include "Basics/directories.h" diff --git a/arangod/AsyncRegistryServer/CMakeLists.txt b/arangod/SystemMonitor/AsyncRegistry/CMakeLists.txt similarity index 64% rename from arangod/AsyncRegistryServer/CMakeLists.txt rename to arangod/SystemMonitor/AsyncRegistry/CMakeLists.txt index acccd6c5e2b1..62b9e163798d 100644 --- a/arangod/AsyncRegistryServer/CMakeLists.txt +++ b/arangod/SystemMonitor/AsyncRegistry/CMakeLists.txt @@ -3,6 +3,4 @@ target_sources(arangoserver PRIVATE Metrics.cpp RestHandler.cpp) target_link_libraries(arangoserver - arango_async_registry_stacktrace) - -add_subdirectory(Stacktrace) + arango_forest) diff --git a/arangod/AsyncRegistryServer/Feature.cpp b/arangod/SystemMonitor/AsyncRegistry/Feature.cpp similarity index 100% rename from arangod/AsyncRegistryServer/Feature.cpp rename to arangod/SystemMonitor/AsyncRegistry/Feature.cpp diff --git a/arangod/AsyncRegistryServer/Feature.h b/arangod/SystemMonitor/AsyncRegistry/Feature.h similarity index 98% rename from arangod/AsyncRegistryServer/Feature.h rename to arangod/SystemMonitor/AsyncRegistry/Feature.h index 6b055729239a..09ef595cbb3f 100644 --- a/arangod/AsyncRegistryServer/Feature.h +++ b/arangod/SystemMonitor/AsyncRegistry/Feature.h @@ -23,7 +23,7 @@ #pragma once #include "Async/Registry/registry_variable.h" -#include "AsyncRegistryServer/Metrics.h" +#include "SystemMonitor/AsyncRegistry/Metrics.h" #include "Basics/FutureSharedLock.h" #include "RestServer/arangod.h" #include "Scheduler/SchedulerFeature.h" diff --git a/arangod/AsyncRegistryServer/Metrics.cpp b/arangod/SystemMonitor/AsyncRegistry/Metrics.cpp similarity index 97% rename from arangod/AsyncRegistryServer/Metrics.cpp rename to arangod/SystemMonitor/AsyncRegistry/Metrics.cpp index d31bc040e007..266b2273e2c0 100644 --- 
a/arangod/AsyncRegistryServer/Metrics.cpp +++ b/arangod/SystemMonitor/AsyncRegistry/Metrics.cpp @@ -25,6 +25,8 @@ #include "Metrics/Counter.h" #include "Metrics/Gauge.h" +using namespace arangodb::async_registry; + auto RegistryMetrics::increment_total_nodes() -> void { promises_total->count(); } diff --git a/arangod/AsyncRegistryServer/Metrics.h b/arangod/SystemMonitor/AsyncRegistry/Metrics.h similarity index 97% rename from arangod/AsyncRegistryServer/Metrics.h rename to arangod/SystemMonitor/AsyncRegistry/Metrics.h index c434bb1635fe..6aeafedc7fba 100644 --- a/arangod/AsyncRegistryServer/Metrics.h +++ b/arangod/SystemMonitor/AsyncRegistry/Metrics.h @@ -25,6 +25,8 @@ #include "Containers/Concurrent/metrics.h" #include "Metrics/Fwd.h" +namespace arangodb::async_registry { + struct RegistryMetrics : arangodb::containers::Metrics { RegistryMetrics( std::shared_ptr promises_total, @@ -60,3 +62,5 @@ struct RegistryMetrics : arangodb::containers::Metrics { std::shared_ptr> existing_thread_registries = nullptr; }; + +} // namespace arangodb::async_registry diff --git a/arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/.gdbinit b/arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/.gdbinit new file mode 100644 index 000000000000..ec45b6912fda --- /dev/null +++ b/arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/.gdbinit @@ -0,0 +1,8 @@ +python +import sys +sys.path.insert(0, './arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/') +end + +source ./arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/asyncregistry/gdb_printer.py + +echo "asyncregistry pretty-printer loaded\n" diff --git a/arangod/AsyncRegistryServer/PrettyPrinter/README.md b/arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/README.md similarity index 100% rename from arangod/AsyncRegistryServer/PrettyPrinter/README.md rename to arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/README.md diff --git a/arangod/AsyncRegistryServer/PrettyPrinter/__init__.py 
b/arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/__init__.py similarity index 100% rename from arangod/AsyncRegistryServer/PrettyPrinter/__init__.py rename to arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/__init__.py diff --git a/arangod/AsyncRegistryServer/PrettyPrinter/src/asyncregistry/__init__.py b/arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/asyncregistry/__init__.py similarity index 100% rename from arangod/AsyncRegistryServer/PrettyPrinter/src/asyncregistry/__init__.py rename to arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/asyncregistry/__init__.py diff --git a/arangod/AsyncRegistryServer/PrettyPrinter/src/asyncregistry/gdb_data.py b/arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/asyncregistry/gdb_data.py similarity index 100% rename from arangod/AsyncRegistryServer/PrettyPrinter/src/asyncregistry/gdb_data.py rename to arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/asyncregistry/gdb_data.py diff --git a/arangod/AsyncRegistryServer/PrettyPrinter/src/asyncregistry/gdb_forest.py b/arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/asyncregistry/gdb_forest.py similarity index 100% rename from arangod/AsyncRegistryServer/PrettyPrinter/src/asyncregistry/gdb_forest.py rename to arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/asyncregistry/gdb_forest.py diff --git a/arangod/AsyncRegistryServer/PrettyPrinter/src/asyncregistry/gdb_printer.py b/arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/asyncregistry/gdb_printer.py similarity index 100% rename from arangod/AsyncRegistryServer/PrettyPrinter/src/asyncregistry/gdb_printer.py rename to arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/asyncregistry/gdb_printer.py diff --git a/arangod/AsyncRegistryServer/PrettyPrinter/src/asyncregistry/stacktrace.py b/arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/asyncregistry/stacktrace.py similarity index 100% rename from arangod/AsyncRegistryServer/PrettyPrinter/src/asyncregistry/stacktrace.py rename to 
arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/asyncregistry/stacktrace.py diff --git a/arangod/AsyncRegistryServer/PrettyPrinter/src/pretty-printer.py b/arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/pretty-printer.py similarity index 100% rename from arangod/AsyncRegistryServer/PrettyPrinter/src/pretty-printer.py rename to arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/pretty-printer.py diff --git a/arangod/AsyncRegistryServer/PrettyPrinter/src/tests/__init__.py b/arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/tests/__init__.py similarity index 100% rename from arangod/AsyncRegistryServer/PrettyPrinter/src/tests/__init__.py rename to arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/tests/__init__.py diff --git a/arangod/AsyncRegistryServer/PrettyPrinter/src/tests/test_forest.py b/arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/tests/test_forest.py similarity index 100% rename from arangod/AsyncRegistryServer/PrettyPrinter/src/tests/test_forest.py rename to arangod/SystemMonitor/AsyncRegistry/PrettyPrinter/src/tests/test_forest.py diff --git a/arangod/AsyncRegistryServer/RestHandler.cpp b/arangod/SystemMonitor/AsyncRegistry/RestHandler.cpp similarity index 98% rename from arangod/AsyncRegistryServer/RestHandler.cpp rename to arangod/SystemMonitor/AsyncRegistry/RestHandler.cpp index ab40713a3959..2f9f6bffe034 100644 --- a/arangod/AsyncRegistryServer/RestHandler.cpp +++ b/arangod/SystemMonitor/AsyncRegistry/RestHandler.cpp @@ -25,8 +25,8 @@ #include #include "Async/Registry/promise.h" -#include "AsyncRegistryServer/Stacktrace/depth_first.h" -#include "AsyncRegistryServer/Stacktrace/forest.h" +#include "Containers/Forest/depth_first.h" +#include "Containers/Forest/forest.h" #include "ApplicationFeatures/ApplicationServer.h" #include "Async/Registry/promise.h" #include "Async/Registry/registry_variable.h" @@ -42,7 +42,14 @@ using namespace arangodb; using namespace arangodb::async_registry; +using namespace arangodb::containers; 
+RestHandler::RestHandler(ArangodServer& server, GeneralRequest* request, + GeneralResponse* response) + : RestVocbaseBaseHandler(server, request, response), + _feature(server.getFeature()) {} + +namespace { struct Entry { TreeHierarchy hierarchy; PromiseSnapshot data; @@ -52,13 +59,6 @@ auto inspect(Inspector& f, Entry& x) { return f.object(x).fields(f.field("hierarchy", x.hierarchy), f.field("data", x.data)); } - -RestHandler::RestHandler(ArangodServer& server, GeneralRequest* request, - GeneralResponse* response) - : RestVocbaseBaseHandler(server, request, response), - _feature(server.getFeature()) {} - -namespace { /** Creates a forest of all promises in the async registry diff --git a/arangod/AsyncRegistryServer/RestHandler.h b/arangod/SystemMonitor/AsyncRegistry/RestHandler.h similarity index 96% rename from arangod/AsyncRegistryServer/RestHandler.h rename to arangod/SystemMonitor/AsyncRegistry/RestHandler.h index c16e2e44f2a5..e6759316bc3d 100644 --- a/arangod/AsyncRegistryServer/RestHandler.h +++ b/arangod/SystemMonitor/AsyncRegistry/RestHandler.h @@ -22,7 +22,7 @@ //////////////////////////////////////////////////////////////////////////////// #pragma once -#include "AsyncRegistryServer/Feature.h" +#include "SystemMonitor/AsyncRegistry/Feature.h" #include "RestHandler/RestVocbaseBaseHandler.h" namespace arangodb::async_registry { diff --git a/arangod/SystemMonitor/CMakeLists.txt b/arangod/SystemMonitor/CMakeLists.txt new file mode 100644 index 000000000000..1a0517cee88a --- /dev/null +++ b/arangod/SystemMonitor/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(AsyncRegistry) +add_subdirectory(TaskMonitoring) diff --git a/arangod/SystemMonitor/TaskMonitoring/CMakeLists.txt b/arangod/SystemMonitor/TaskMonitoring/CMakeLists.txt new file mode 100644 index 000000000000..1e244be9a727 --- /dev/null +++ b/arangod/SystemMonitor/TaskMonitoring/CMakeLists.txt @@ -0,0 +1,4 @@ +target_sources(arangoserver PRIVATE + Feature.cpp + Metrics.cpp + RestHandler.cpp) diff --git 
a/arangod/SystemMonitor/TaskMonitoring/Feature.cpp b/arangod/SystemMonitor/TaskMonitoring/Feature.cpp new file mode 100644 index 000000000000..a2b461325fa1 --- /dev/null +++ b/arangod/SystemMonitor/TaskMonitoring/Feature.cpp @@ -0,0 +1,118 @@ +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2014-2024 ArangoDB GmbH, Cologne, Germany +/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany +/// +/// Licensed under the Business Source License 1.1 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// https://github.com/arangodb/arangodb/blob/devel/LICENSE +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. 
+/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Julia Volmer +//////////////////////////////////////////////////////////////////////////////// +#include "Feature.h" + +#include "Basics/FutureSharedLock.h" +#include "Metrics/CounterBuilder.h" +#include "Metrics/GaugeBuilder.h" +#include "Metrics/MetricsFeature.h" +#include "ProgramOptions/Parameters.h" + +using namespace arangodb::task_monitoring; + +/* Declarations of the five task monitoring metrics (two counters, three gauges) that create_metrics() registers below. */ +DECLARE_COUNTER( + arangodb_monitoring_tasks_total, + "Total number of created monitoring tasks since database creation"); + +DECLARE_GAUGE(arangodb_monitoring_tasks_existing, std::uint64_t, + "Number of currently existing monitoring tasks"); + +DECLARE_GAUGE(arangodb_monitoring_tasks_ready_for_deletion, std::uint64_t, + "Number of currently existing monitoring tasks that wait " + "for their garbage collection"); + +DECLARE_COUNTER(arangodb_monitoring_tasks_thread_registries_total, + "Total number of threads that started monitoring tasks " + "since database creation"); + +DECLARE_GAUGE( + arangodb_monitoring_tasks_existing_thread_registries, std::uint64_t, + "Number of threads that started currently existing monitoring tasks"); + +/* Constructs the feature and binds _async_mutex to _schedulerWrapper. NOTE(review): the two startsAfter() calls presumably declare which features must start first; their template arguments are not visible in this view - confirm. */ +Feature::Feature(Server& server) + : ArangodFeature{server, *this}, _async_mutex{_schedulerWrapper} { + startsAfter(); + startsAfter(); +} + +/* Registers the five metrics declared above with metrics_feature (via addShared) and bundles the returned handles into one shared metrics object. */ +auto Feature::create_metrics(arangodb::metrics::MetricsFeature& metrics_feature) + -> std::shared_ptr { + return std::make_shared( + metrics_feature.addShared(arangodb_monitoring_tasks_total{}), + metrics_feature.addShared(arangodb_monitoring_tasks_existing{}), + metrics_feature.addShared(arangodb_monitoring_tasks_ready_for_deletion{}), + metrics_feature.addShared( + arangodb_monitoring_tasks_thread_registries_total{}), + metrics_feature.addShared( + arangodb_monitoring_tasks_existing_thread_registries{})); +} +/* Returns the future produced by _async_mutex.asyncLockExclusive(). */ +auto Feature::asyncLock() + -> futures::Future::LockGuard> { + return _async_mutex.asyncLockExclusive(); +} + +/* Worker thread that wakes up every gc_timeout seconds to call run_external_cleanup() until it is destroyed. */ +struct Feature::CleanupThread { +
CleanupThread(size_t gc_timeout) + : _thread([gc_timeout, this](std::stop_token stoken) { + while (not stoken.stop_requested()) { + std::unique_lock guard(_mutex); + /* Sleep for gc_timeout seconds unless a stop is requested. The predicate is checked under _mutex, so a stop request made by the destructor cannot be missed and spurious wakeups do not restart the timer. */ + bool const stopped = + _cv.wait_for(guard, std::chrono::seconds{gc_timeout}, + [&stoken] { return stoken.stop_requested(); }); + if (not stopped) { + /* Timeout elapsed: garbage collect the same registry that start() attaches the metrics to (previously this cleaned up async_registry::registry instead). */ + registry.run_external_cleanup(); + } + } + }) {} + + ~CleanupThread() { + { + /* Request the stop while holding _mutex so that the worker either sees it in its predicate or is already waiting and gets notified. */ + std::lock_guard guard(_mutex); + _thread.request_stop(); + } + _cv.notify_one(); + } + + std::mutex _mutex; + std::condition_variable _cv; + std::jthread _thread; +}; + +/* Creates the metrics, attaches them to the registry and starts the cleanup thread with the configured timeout. */ +void Feature::start() { + metrics = create_metrics( + server().template getFeature()); + registry.set_metrics(metrics); + _cleanupThread = std::make_shared(_options.gc_timeout); +} + +/* Releases the cleanup thread; its destructor requests the stop. */ +void Feature::stop() { _cleanupThread.reset(); } + +/* Registers the "task-registry" option section and its cleanup timeout option. */ +void Feature::collectOptions(std::shared_ptr options) { + options->addSection("task-registry", "Options for the task-registry"); + + options + ->addOption("--task-registry.cleanup-timeout", + "Timeout in seconds between task-registry garbage collection " + "swipes.", + new options::SizeTParameter(&_options.gc_timeout, /*base*/ 1, + /*minValue*/ 1)) + .setLongDescription( + R"(Each thread that is involved in the task-registry needs to garbage collect its finished tasks regularly. This option controls how often this is done in seconds.
This can possibly be performance relevant because each involved thread acquires a lock.)"); +} + +/* Detaches the metrics from the registry when the feature is destroyed. */ +Feature::~Feature() { registry.set_metrics(nullptr); } diff --git a/arangod/SystemMonitor/TaskMonitoring/Feature.h b/arangod/SystemMonitor/TaskMonitoring/Feature.h new file mode 100644 index 000000000000..6f37f9d2151b --- /dev/null +++ b/arangod/SystemMonitor/TaskMonitoring/Feature.h @@ -0,0 +1,80 @@ +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2014-2024 ArangoDB GmbH, Cologne, Germany +/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany +/// +/// Licensed under the Business Source License 1.1 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// https://github.com/arangodb/arangodb/blob/devel/LICENSE +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License.
+/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Julia Volmer +//////////////////////////////////////////////////////////////////////////////// +#pragma once + +#include "TaskMonitoring/task_registry_variable.h" +#include "SystemMonitor/TaskMonitoring/Metrics.h" +#include "Basics/FutureSharedLock.h" +#include "RestServer/arangod.h" +#include "Scheduler/SchedulerFeature.h" + +namespace arangodb::task_monitoring { + +/* Application feature for task monitoring: holds the registry metrics, the garbage-collection timeout option and the cleanup thread. */ +class Feature final : public ArangodFeature { + private: + static auto create_metrics(arangodb::metrics::MetricsFeature& metrics_feature) + -> std::shared_ptr; + /* Adapter that forwards queue()/queueDelayed() to SchedulerFeature::SCHEDULER on the CLUSTER_INTERNAL lane; an instance of it is handed to _async_mutex in the constructor. */ + struct SchedulerWrapper { + using WorkHandle = Scheduler::WorkHandle; + template + void queue(F&& fn) { + SchedulerFeature::SCHEDULER->queue(RequestLane::CLUSTER_INTERNAL, + std::forward(fn)); + } + template + WorkHandle queueDelayed(F&& fn, std::chrono::milliseconds timeout) { + /* NOTE(review): the label below names a RocksDB lock timeout, which looks copied from elsewhere - confirm the intended label for this feature. */ + return SchedulerFeature::SCHEDULER->queueDelayed( + "rocksdb-meta-collection-lock-timeout", RequestLane::CLUSTER_INTERNAL, + timeout, std::forward(fn)); + } + }; + + public: + /* NOTE(review): "Coroutines" does not describe task monitoring and may collide with another feature's name - confirm the intended feature name. */ + static constexpr std::string_view name() { return "Coroutines"; } + auto asyncLock() -> futures::Future< + futures::FutureSharedLock::LockGuard>; + + Feature(Server& server); + + ~Feature(); + + void start() override final; + void stop() override final; + void collectOptions(std::shared_ptr) override final; + + private: + /* gc_timeout is passed to the cleanup thread on start(); it defaults to 1. */ + struct Options { + size_t gc_timeout{1}; + }; + Options _options; + + std::shared_ptr metrics; + + struct CleanupThread; + std::shared_ptr _cleanupThread; + + /* _schedulerWrapper is declared before _async_mutex because the constructor initializes _async_mutex from it. */ + SchedulerWrapper _schedulerWrapper; + futures::FutureSharedLock _async_mutex; +}; + +} // namespace arangodb::task_monitoring diff --git a/arangod/SystemMonitor/TaskMonitoring/Metrics.cpp b/arangod/SystemMonitor/TaskMonitoring/Metrics.cpp new file mode 100644 index 000000000000..49ee9ee883ea --- /dev/null +++ b/arangod/SystemMonitor/TaskMonitoring/Metrics.cpp @@ -0,0 +1,51 @@
+//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2014-2024 ArangoDB GmbH, Cologne, Germany +/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany +/// +/// Licensed under the Business Source License 1.1 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// https://github.com/arangodb/arangodb/blob/devel/LICENSE +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Julia Volmer +//////////////////////////////////////////////////////////////////////////////// +#include "Metrics.h" + +#include "Metrics/Counter.h" +#include "Metrics/Gauge.h" + +using namespace arangodb::task_monitoring; + +/* The "node" callbacks below update the task metrics; the "list" callbacks update the thread-registry metrics. Counters are bumped with count(), gauges with fetch_add(1)/fetch_sub(1). */ +/* Counts one more task in tasks_total. */ +auto RegistryMetrics::increment_total_nodes() -> void { tasks_total->count(); } +/* Adds one to the existing_tasks gauge. */ +auto RegistryMetrics::increment_registered_nodes() -> void { + existing_tasks->fetch_add(1); +} +/* Subtracts one from the existing_tasks gauge. */ +auto RegistryMetrics::decrement_registered_nodes() -> void { + existing_tasks->fetch_sub(1); +} +/* Adds one to the ready_for_deletion_tasks gauge. */ +auto RegistryMetrics::increment_ready_for_deletion_nodes() -> void { + ready_for_deletion_tasks->fetch_add(1); +} +/* Subtracts one from the ready_for_deletion_tasks gauge. */ +auto RegistryMetrics::decrement_ready_for_deletion_nodes() -> void { + ready_for_deletion_tasks->fetch_sub(1); +} +/* Counts one more thread registry in thread_registries_total. */ +auto RegistryMetrics::increment_total_lists() -> void { + thread_registries_total->count(); +} +/* Adds one to the existing_thread_registries gauge. */ +auto RegistryMetrics::increment_existing_lists() -> void { + existing_thread_registries->fetch_add(1); +} +/* Subtracts one from the existing_thread_registries gauge. */ +auto RegistryMetrics::decrement_existing_lists() -> void { + existing_thread_registries->fetch_sub(1); +} diff --git a/arangod/SystemMonitor/TaskMonitoring/Metrics.h
b/arangod/SystemMonitor/TaskMonitoring/Metrics.h new file mode 100644 index 000000000000..35086ed33b74 --- /dev/null +++ b/arangod/SystemMonitor/TaskMonitoring/Metrics.h @@ -0,0 +1,65 @@ +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2014-2024 ArangoDB GmbH, Cologne, Germany +/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany +/// +/// Licensed under the Business Source License 1.1 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// https://github.com/arangodb/arangodb/blob/devel/LICENSE +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Julia Volmer +//////////////////////////////////////////////////////////////////////////////// +#pragma once + +#include "Containers/Concurrent/metrics.h" +#include "Metrics/Fwd.h" + +namespace arangodb::task_monitoring { + +struct RegistryMetrics : arangodb::containers::Metrics { + RegistryMetrics( + std::shared_ptr tasks_total, + std::shared_ptr> existing_tasks, + std::shared_ptr> + ready_for_deletion_tasks, + std::shared_ptr thread_registries_total, + std::shared_ptr> + existing_thread_registries) + : tasks_total{tasks_total}, + existing_tasks{existing_tasks}, + ready_for_deletion_tasks{ready_for_deletion_tasks}, + thread_registries_total{thread_registries_total}, + existing_thread_registries{existing_thread_registries} {} + ~RegistryMetrics() = default; + auto increment_total_nodes() -> void override; + auto increment_registered_nodes() -> void override; + auto decrement_registered_nodes() -> void override; 
+ auto increment_ready_for_deletion_nodes() -> void override; + auto decrement_ready_for_deletion_nodes() -> void override; + auto increment_total_lists() -> void override; + auto increment_existing_lists() -> void override; + auto decrement_existing_lists() -> void override; + + private: + std::shared_ptr tasks_total = nullptr; + std::shared_ptr> existing_tasks = + nullptr; + std::shared_ptr> + ready_for_deletion_tasks = nullptr; + std::shared_ptr thread_registries_total = nullptr; + std::shared_ptr> + existing_thread_registries = nullptr; +}; + +} // namespace arangodb::task_monitoring diff --git a/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/README.md b/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/README.md new file mode 100644 index 000000000000..8fc397a4e2fc --- /dev/null +++ b/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/README.md @@ -0,0 +1,35 @@ +# Pretty Printer for ArangoDB's Task Monitoring Output + +This Python package provides a pretty-printer for the hierarchical task monitoring JSON output produced by ArangoDB. + +The pretty-printer groups tasks by their top-level task (hierarchy 0, no parent) and by their state. The output is grouped and ordered as follows: +1. Running tasks +2. Finished tasks +3. Deleted tasks (optional, see below) + +Each group displays the task hierarchy as an ASCII tree for improved readability. + +## Usage + +To pretty-print a monitoring output JSON file: + +```sh +cat monitoring_output.json | ./src/pretty_printer.py [--show-deleted] +``` + +- By default, **Deleted** tasks are hidden. +- Use the `--show-deleted` flag to include Deleted tasks in the output. + +## Run tests + +Inside the `src` folder, run the unit tests via: + +```sh +python3 -m unittest discover +``` + +## Project Structure + +- `src/pretty_printer.py`: Main script for pretty-printing the monitoring output. +- `src/taskmonitoring/`: Python package with core logic for parsing and formatting the task monitoring data. +- `src/tests/`: Unit tests for the pretty printer.
\ No newline at end of file diff --git a/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/src/pretty_printer.py b/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/src/pretty_printer.py new file mode 100755 index 000000000000..cc27c8f4376f --- /dev/null +++ b/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/src/pretty_printer.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 + +"""Task Monitoring Pretty Printer + +This script pretty-prints the hierarchical task monitoring JSON output from ArangoDB. +It groups tasks by top-level task and state, and displays them as ASCII trees. + +Usage: cat monitoring_output.json | ./pretty_printer.py [--show-deleted] +""" + +import sys +import json +import argparse +from taskmonitoring.tasktree import TaskTree + +def main(): + """Read the monitoring JSON from stdin, build a TaskTree from its "task_stacktraces" entry and print it; --show-deleted also prints Deleted tasks.""" + parser = argparse.ArgumentParser(description="Pretty print ArangoDB task monitoring output.") + parser.add_argument('--show-deleted', action='store_true', help='Show Deleted tasks (default: hide)') + args = parser.parse_args() + + string = sys.stdin.read() + data = json.loads(string)["task_stacktraces"] + tree = TaskTree.from_json(data) + tree.pretty_print(show_deleted=args.show_deleted) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/src/taskmonitoring/__init__.py b/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/src/taskmonitoring/__init__.py new file mode 100644 index 000000000000..0519ecba6ea9 --- /dev/null +++ b/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/src/taskmonitoring/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/src/taskmonitoring/tasktree.py b/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/src/taskmonitoring/tasktree.py new file mode 100644 index 000000000000..264b2b1bb7e8 --- /dev/null +++ b/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/src/taskmonitoring/tasktree.py @@ -0,0 +1,112 @@ +import collections +from typing import List, Dict, Any, Optional, Tuple +
+class TaskNode: + """One task from the monitoring output, plus the child nodes attached to it.""" + def __init__(self, data: dict, hierarchy: int): + """Copy id, name, state, thread and source_location from data; parent_id is data["parent"]["id"] or None when the parent entry is empty.""" + self.id = data["id"] + self.name = data["name"] + self.state = data["state"] + self.parent_id = data["parent"].get("id") if data["parent"] else None + self.thread = data["thread"] + self.source_location = data["source_location"] + self.hierarchy = hierarchy + self.children: List['TaskNode'] = [] + + def add_child(self, child: 'TaskNode'): + """Append child to this node's children.""" + self.children.append(child) + + def group_key(self) -> Tuple: + """Return the key under which this node is merged with equal-looking nodes when printing.""" + # For 'Running' tasks, do not group (return unique key) + if self.state == "Running": + return (id(self),) + # For non-'Running', group by name, state, thread name (not ID), and source location + return ( + self.name, + self.state, + self.thread["name"], + self.source_location["file_name"], + self.source_location["line"], + self.source_location["function_name"] + ) + + def __str__(self): + """Format the node on one line; only Running tasks include the thread's LWPID.""" + if self.state == "Running": + return f"{self.name} [{self.state}] (thread: {self.thread['name']}:{self.thread['LWPID']}) @ {self.source_location['function_name']} ({self.source_location['file_name']}:{self.source_location['line']})" + else: + return f"{self.name} [{self.state}] (thread: {self.thread['name']}) @ {self.source_location['function_name']} ({self.source_location['file_name']}:{self.source_location['line']})" + +class TaskTree: + """Forest of TaskNode roots built from the monitoring output.""" + def __init__(self, roots: List[TaskNode]): + self.roots = roots + + @staticmethod + def from_json(task_stacktraces: List[List[Dict[str, Any]]]) -> 'TaskTree': + """Build the forest from a list of stacks whose entries carry "data" and "hierarchy". + + A node becomes a root when it has no parent id or its parent id is not among the parsed nodes. + If several entries share an id, the id lookup keeps only the last one, so children referring to that id are attached to the last such node. + """ + # Flatten all tasks and build id->node mapping + nodes = {} + all_nodes = [] + for stack in task_stacktraces: + for entry in stack: + node = TaskNode(entry["data"], entry["hierarchy"]) + nodes[node.id] = node + all_nodes.append(node) + # Build hierarchy + roots = [] + for node in all_nodes: + if node.parent_id and node.parent_id in nodes: + nodes[node.parent_id].add_child(node) + else: + roots.append(node) + return TaskTree(roots) + + def pretty_print(self, show_deleted: bool = False): + state_order = ["Running",
"Finished", "Deleted"] + grouped = collections.defaultdict(list) + for node in self.roots: + grouped[node.state].append(node) + for state in state_order: + if state == "Deleted" and not show_deleted: + continue + if grouped[state]: + print(f"=== {state} Tasks ===") + if state == "Running": + for node in reversed(grouped[state]): + self._print_grouped_nodes([node], top_level=True, force_no_group=True) + else: + self._print_grouped_nodes(list(reversed(grouped[state])), top_level=True) + print() + + def _print_grouped_nodes(self, nodes: List[TaskNode], prefix: str = "", is_last: bool = True, top_level: bool = False, force_no_group: bool = False): + if force_no_group: + # Post-order: print children first + for idx, node in enumerate(reversed(nodes)): + count = 1 + if node.children: + self._print_grouped_nodes(list(reversed(node.children)), prefix + (" " if (is_last and idx == len(nodes) - 1) else "│ "), True, top_level=False, force_no_group=force_no_group) + if top_level: + count_str = f"{count:3d} x" + print(f"{count_str} {str(node)}") + else: + connector = "└─ " if (is_last and idx == len(nodes) - 1) else "├─ " + print(prefix + connector + str(node)) + return + # Group nodes by their group_key + group_map = collections.defaultdict(list) + for node in nodes: + group_map[node.group_key()].append(node) + group_items = list(group_map.items()) + for idx, (key, group) in enumerate(reversed(group_items)): + node = group[0] + count = len(group) + # Post-order: print children first + all_children = [] + for n in group: + all_children.extend(n.children) + if all_children: + self._print_grouped_nodes(list(reversed(all_children)), prefix + (" " if (is_last and idx == len(group_items) - 1) else "│ "), True, top_level=False) + if top_level: + count_str = f"{count:3d} x" if count < 1000 else f"{count} x" + print(f"{count_str} {str(node)}") + else: + connector = "└─ " if (is_last and idx == len(group_items) - 1) else "├─ " + count_str = f" [x{count}]" if count > 1 else "" + 
print(prefix + connector + str(node) + count_str) \ No newline at end of file diff --git a/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/src/tests/__init__.py b/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/src/tests/__init__.py new file mode 100644 index 000000000000..0519ecba6ea9 --- /dev/null +++ b/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/src/tests/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/src/tests/test_tasktree.py b/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/src/tests/test_tasktree.py new file mode 100644 index 000000000000..80c1dfba96b5 --- /dev/null +++ b/arangod/SystemMonitor/TaskMonitoring/PrettyPrinter/src/tests/test_tasktree.py @@ -0,0 +1,617 @@ +import unittest +import io +import sys +import json +from taskmonitoring.tasktree import TaskTree + +SAMPLE_JSON = { + "task_stacktraces": [ + [ + { + "hierarchy": 0, + "data": { + "id": "root1", + "name": "Top Task 1", + "state": "Running", + "parent": {}, + "thread": {"LWPID": 1, "name": "main"}, + "source_location": {"file_name": "file.cpp", "line": 10, "function_name": "funcA"} + } + }, + { + "hierarchy": 1, + "data": { + "id": "child1", + "name": "Child Task 1", + "state": "Running", + "parent": {"id": "root1"}, + "thread": {"LWPID": 2, "name": "worker"}, + "source_location": {"file_name": "file.cpp", "line": 20, "function_name": "funcB"} + } + } + ], + [ + { + "hierarchy": 0, + "data": { + "id": "root2", + "name": "Top Task 2", + "state": "Deleted", + "parent": {}, + "thread": {"LWPID": 3, "name": "main"}, + "source_location": {"file_name": "file2.cpp", "line": 30, "function_name": "funcC"} + } + } + ] + ] +} + +SAMPLE_GROUP_JSON = { + "task_stacktraces": [ + [ + { + "hierarchy": 0, + "data": { + "id": "root1", + "name": "Top Task", + "state": "Running", + "parent": {}, + "thread": {"LWPID": 1, "name": "main"}, + "source_location": {"file_name": "file.cpp", "line": 10, "function_name": "funcA"} + } + 
}, + { + "hierarchy": 0, + "data": { + "id": "root2", + "name": "Top Task", + "state": "Running", + "parent": {}, + "thread": {"LWPID": 1, "name": "main"}, + "source_location": {"file_name": "file.cpp", "line": 10, "function_name": "funcA"} + } + }, + { + "hierarchy": 1, + "data": { + "id": "child1", + "name": "Child Task", + "state": "Running", + "parent": {"id": "root1"}, + "thread": {"LWPID": 2, "name": "worker"}, + "source_location": {"file_name": "file.cpp", "line": 20, "function_name": "funcB"} + } + }, + { + "hierarchy": 1, + "data": { + "id": "child2", + "name": "Child Task", + "state": "Running", + "parent": {"id": "root2"}, + "thread": {"LWPID": 2, "name": "worker"}, + "source_location": {"file_name": "file.cpp", "line": 20, "function_name": "funcB"} + } + } + ] + ] +} + +SAMPLE_GROUP_DIFF_STATE_JSON = { + "task_stacktraces": [ + [ + { + "hierarchy": 0, + "data": { + "id": "root1", + "name": "Top Task", + "state": "Running", + "parent": {}, + "thread": {"LWPID": 1, "name": "main"}, + "source_location": {"file_name": "file.cpp", "line": 10, "function_name": "funcA"} + } + }, + { + "hierarchy": 0, + "data": { + "id": "root2", + "name": "Top Task", + "state": "Deleted", + "parent": {}, + "thread": {"LWPID": 1, "name": "main"}, + "source_location": {"file_name": "file.cpp", "line": 10, "function_name": "funcA"} + } + } + ] + ] +} + +SAMPLE_GROUP_NESTED_JSON = { + "task_stacktraces": [ + [ + { + "hierarchy": 0, + "data": { + "id": "root1", + "name": "Top Task", + "state": "Running", + "parent": {}, + "thread": {"LWPID": 1, "name": "main"}, + "source_location": {"file_name": "file.cpp", "line": 10, "function_name": "funcA"} + } + }, + { + "hierarchy": 1, + "data": { + "id": "child1", + "name": "Child Task", + "state": "Running", + "parent": {"id": "root1"}, + "thread": {"LWPID": 2, "name": "worker"}, + "source_location": {"file_name": "file.cpp", "line": 20, "function_name": "funcB"} + } + }, + { + "hierarchy": 1, + "data": { + "id": "child2", + "name": "Child 
Task", + "state": "Running", + "parent": {"id": "root1"}, + "thread": {"LWPID": 2, "name": "worker"}, + "source_location": {"file_name": "file.cpp", "line": 20, "function_name": "funcB"} + } + } + ] + ] +} + +SAMPLE_REUSED_ID_SIBLINGS = { + "task_stacktraces": [ + [ + { # Parent + "hierarchy": 0, + "data": { + "id": "parent", + "name": "Parent", + "state": "Running", + "parent": {}, + "thread": {"LWPID": 1, "name": "main"}, + "source_location": {"file_name": "file.cpp", "line": 1, "function_name": "parentFunc"} + } + }, + { # Child1 (id reused) + "hierarchy": 1, + "data": { + "id": "child", + "name": "Child", + "state": "Running", + "parent": {"id": "parent"}, + "thread": {"LWPID": 2, "name": "worker"}, + "source_location": {"file_name": "file.cpp", "line": 2, "function_name": "childFunc"} + } + }, + { # Child2 (id reused) + "hierarchy": 1, + "data": { + "id": "child", + "name": "Child", + "state": "Running", + "parent": {"id": "parent"}, + "thread": {"LWPID": 3, "name": "worker"}, + "source_location": {"file_name": "file.cpp", "line": 3, "function_name": "childFunc2"} + } + } + ] + ] +} + +SAMPLE_REUSED_ID_COUSINS = { + "task_stacktraces": [ + [ + { # Grandparent + "hierarchy": 0, + "data": { + "id": "grandparent", + "name": "Grandparent", + "state": "Running", + "parent": {}, + "thread": {"LWPID": 1, "name": "main"}, + "source_location": {"file_name": "file.cpp", "line": 1, "function_name": "grandparentFunc"} + } + }, + { # Parent1 + "hierarchy": 1, + "data": { + "id": "parent1", + "name": "Parent1", + "state": "Running", + "parent": {"id": "grandparent"}, + "thread": {"LWPID": 2, "name": "worker"}, + "source_location": {"file_name": "file.cpp", "line": 2, "function_name": "parentFunc1"} + } + }, + { # Parent2 + "hierarchy": 1, + "data": { + "id": "parent2", + "name": "Parent2", + "state": "Running", + "parent": {"id": "grandparent"}, + "thread": {"LWPID": 3, "name": "worker"}, + "source_location": {"file_name": "file.cpp", "line": 3, "function_name": 
"parentFunc2"} + } + }, + { # Cousin1 (id reused) + "hierarchy": 2, + "data": { + "id": "cousin", + "name": "Cousin", + "state": "Running", + "parent": {"id": "parent1"}, + "thread": {"LWPID": 4, "name": "worker"}, + "source_location": {"file_name": "file.cpp", "line": 4, "function_name": "cousinFunc1"} + } + }, + { # Cousin2 (id reused) + "hierarchy": 2, + "data": { + "id": "cousin", + "name": "Cousin", + "state": "Running", + "parent": {"id": "parent2"}, + "thread": {"LWPID": 5, "name": "worker"}, + "source_location": {"file_name": "file.cpp", "line": 5, "function_name": "cousinFunc2"} + } + } + ] + ] +} + +SAMPLE_REUSED_ID_SEPARATE_TREES = { + "task_stacktraces": [ + [ + { # Root1 + "hierarchy": 0, + "data": { + "id": "root", + "name": "Root1", + "state": "Running", + "parent": {}, + "thread": {"LWPID": 1, "name": "main"}, + "source_location": {"file_name": "file.cpp", "line": 1, "function_name": "rootFunc1"} + } + }, + { # Root2 (id reused) + "hierarchy": 0, + "data": { + "id": "root", + "name": "Root2", + "state": "Running", + "parent": {}, + "thread": {"LWPID": 2, "name": "main"}, + "source_location": {"file_name": "file.cpp", "line": 2, "function_name": "rootFunc2"} + } + } + ] + ] +} + +SAMPLE_RUNNING_NOT_GROUPED = { + "task_stacktraces": [ + [ + { + "hierarchy": 0, + "data": { + "id": "root1", + "name": "Top Task", + "state": "Running", + "parent": {}, + "thread": {"LWPID": 1, "name": "main"}, + "source_location": {"file_name": "file.cpp", "line": 10, "function_name": "funcA"} + } + }, + { + "hierarchy": 0, + "data": { + "id": "root2", + "name": "Top Task", + "state": "Running", + "parent": {}, + "thread": {"LWPID": 2, "name": "main"}, + "source_location": {"file_name": "file.cpp", "line": 10, "function_name": "funcA"} + } + } + ] + ] +} + +SAMPLE_NON_RUNNING_GROUP_THREAD_NAME = { + "task_stacktraces": [ + [ + { + "hierarchy": 0, + "data": { + "id": "root1", + "name": "Top Task", + "state": "Deleted", + "parent": {}, + "thread": {"LWPID": 1, "name": 
"main"}, + "source_location": {"file_name": "file.cpp", "line": 10, "function_name": "funcA"} + } + }, + { + "hierarchy": 0, + "data": { + "id": "root2", + "name": "Top Task", + "state": "Deleted", + "parent": {}, + "thread": {"LWPID": 2, "name": "main"}, + "source_location": {"file_name": "file.cpp", "line": 10, "function_name": "funcA"} + } + } + ] + ] +} + +SAMPLE_NON_RUNNING_NOT_GROUP_THREAD_NAME = { + "task_stacktraces": [ + [ + { + "hierarchy": 0, + "data": { + "id": "root1", + "name": "Top Task", + "state": "Deleted", + "parent": {}, + "thread": {"LWPID": 1, "name": "main1"}, + "source_location": {"file_name": "file.cpp", "line": 10, "function_name": "funcA"} + } + }, + { + "hierarchy": 0, + "data": { + "id": "root2", + "name": "Top Task", + "state": "Deleted", + "parent": {}, + "thread": {"LWPID": 2, "name": "main2"}, + "source_location": {"file_name": "file.cpp", "line": 10, "function_name": "funcA"} + } + } + ] + ] +} + +class TestTaskTree(unittest.TestCase): + def test_hierarchy_and_grouping(self): + tree = TaskTree.from_json(SAMPLE_JSON["task_stacktraces"]) + self.assertEqual(len(tree.roots), 2) + running = [n for n in tree.roots if n.state == "Running"] + deleted = [n for n in tree.roots if n.state == "Deleted"] + self.assertEqual(len(running), 1) + self.assertEqual(len(deleted), 1) + self.assertEqual(running[0].name, "Top Task 1") + self.assertEqual(deleted[0].name, "Top Task 2") + self.assertEqual(len(running[0].children), 1) + self.assertEqual(running[0].children[0].name, "Child Task 1") + + def test_pretty_print_output(self): + tree = TaskTree.from_json(SAMPLE_JSON["task_stacktraces"]) + captured = io.StringIO() + sys_stdout = sys.stdout + sys.stdout = captured + try: + tree.pretty_print(show_deleted=True) + finally: + sys.stdout = sys_stdout + output = captured.getvalue() + self.assertIn("=== Running Tasks ===", output) + self.assertIn("Top Task 1 [Running]", output) + self.assertIn("Child Task 1 [Running]", output) + self.assertIn("=== Deleted 
Tasks ===", output) + self.assertIn("Top Task 2 [Deleted]", output) + + def test_grouping_identical_tasks(self): + tree = TaskTree.from_json(SAMPLE_GROUP_JSON["task_stacktraces"]) + captured = io.StringIO() + sys_stdout = sys.stdout + sys.stdout = captured + try: + tree.pretty_print() + finally: + sys.stdout = sys_stdout + output = captured.getvalue() + # Running tasks are not grouped, so expect two separate lines + self.assertEqual(output.count("Top Task [Running] (thread: main:1) @ funcA (file.cpp:10)"), 2) + self.assertNotIn("[x2]", output) + self.assertNotIn(" 2 x", output) + + def test_grouping_different_states(self): + tree = TaskTree.from_json(SAMPLE_GROUP_DIFF_STATE_JSON["task_stacktraces"]) + captured = io.StringIO() + sys_stdout = sys.stdout + sys.stdout = captured + try: + tree.pretty_print(show_deleted=True) + finally: + sys.stdout = sys_stdout + output = captured.getvalue() + # Running task: thread ID is printed, not grouped + self.assertIn("Top Task [Running] (thread: main:1) @ funcA (file.cpp:10)", output) + # Deleted task: thread ID is NOT printed, not grouped + self.assertIn("Top Task [Deleted] (thread: main) @ funcA (file.cpp:10)", output) + self.assertNotIn("[x2]", output) + self.assertNotIn(" 2 x", output) + + def test_grouping_nested(self): + tree = TaskTree.from_json(SAMPLE_GROUP_NESTED_JSON["task_stacktraces"]) + captured = io.StringIO() + sys_stdout = sys.stdout + sys.stdout = captured + try: + tree.pretty_print() + finally: + sys.stdout = sys_stdout + output = captured.getvalue() + # Running children are not grouped, so expect two lines + self.assertEqual(output.count("Child Task [Running] (thread: worker:2) @ funcB (file.cpp:20)"), 2) + self.assertNotIn("[x2]", output) + self.assertNotIn(" 2 x", output) + # Check reverse order: deepest child first, then parent, then root + idx_child2 = output.find("Child Task [Running] (thread: worker:2) @ funcB (file.cpp:20)") + idx_root = output.find("Top Task [Running] (thread: main:1) @ funcA 
(file.cpp:10)") + if not (idx_child2 < idx_root): + print("\nDEBUG OUTPUT (test_grouping_nested):\n" + output) + self.assertTrue(idx_child2 < idx_root, "Deepest child should appear before root in output") + + def test_reverse_ordering_deep_stack(self): + # Simulate a deep stack + deep_stack = { + "task_stacktraces": [[ + {"hierarchy": 0, "data": {"id": "root", "name": "Root", "state": "Running", "parent": {}, "thread": {"LWPID": 1, "name": "main"}, "source_location": {"file_name": "file.cpp", "line": 1, "function_name": "rootFunc"}}}, + {"hierarchy": 1, "data": {"id": "mid", "name": "Mid", "state": "Running", "parent": {"id": "root"}, "thread": {"LWPID": 1, "name": "main"}, "source_location": {"file_name": "file.cpp", "line": 2, "function_name": "midFunc"}}}, + {"hierarchy": 2, "data": {"id": "leaf", "name": "Leaf", "state": "Running", "parent": {"id": "mid"}, "thread": {"LWPID": 1, "name": "main"}, "source_location": {"file_name": "file.cpp", "line": 3, "function_name": "leafFunc"}}} + ]] + } + tree = TaskTree.from_json(deep_stack["task_stacktraces"]) + captured = io.StringIO() + sys_stdout = sys.stdout + sys.stdout = captured + try: + tree.pretty_print() + finally: + sys.stdout = sys_stdout + output = captured.getvalue() + idx_leaf = output.find("Leaf [Running] (thread: main:1) @ leafFunc (file.cpp:3)") + idx_mid = output.find("Mid [Running] (thread: main:1) @ midFunc (file.cpp:2)") + idx_root = output.find("Root [Running] (thread: main:1) @ rootFunc (file.cpp:1)") + if not (idx_leaf < idx_mid < idx_root): + print("\nDEBUG OUTPUT (test_reverse_ordering_deep_stack):\n" + output) + self.assertTrue(idx_leaf < idx_mid < idx_root, "Order should be leaf, then mid, then root") + + def test_reused_id_siblings(self): + tree = TaskTree.from_json(SAMPLE_REUSED_ID_SIBLINGS["task_stacktraces"]) + captured = io.StringIO() + sys_stdout = sys.stdout + sys.stdout = captured + try: + tree.pretty_print() + finally: + sys.stdout = sys_stdout + output = captured.getvalue() + # Both 
children should appear under the parent, even though they have the same id + self.assertIn("Parent [Running]", output) + self.assertIn("Child [Running] (thread: worker:2)", output) + self.assertIn("Child [Running] (thread: worker:3)", output) + + def test_reused_id_cousins(self): + tree = TaskTree.from_json(SAMPLE_REUSED_ID_COUSINS["task_stacktraces"]) + captured = io.StringIO() + sys_stdout = sys.stdout + sys.stdout = captured + try: + tree.pretty_print() + finally: + sys.stdout = sys_stdout + output = captured.getvalue() + # Both cousins should appear under their respective parents, even though they have the same id + self.assertIn("Parent1 [Running]", output) + self.assertIn("Parent2 [Running]", output) + self.assertIn("Cousin [Running] (thread: worker:4)", output) + self.assertIn("Cousin [Running] (thread: worker:5)", output) + + def test_reused_id_separate_trees(self): + tree = TaskTree.from_json(SAMPLE_REUSED_ID_SEPARATE_TREES["task_stacktraces"]) + captured = io.StringIO() + sys_stdout = sys.stdout + sys.stdout = captured + try: + tree.pretty_print() + finally: + sys.stdout = sys_stdout + output = captured.getvalue() + # Both roots should appear, even though they have the same id + self.assertIn("Root1 [Running] (thread: main:1)", output) + self.assertIn("Root2 [Running] (thread: main:2)", output) + + def test_running_not_grouped(self): + tree = TaskTree.from_json(SAMPLE_RUNNING_NOT_GROUPED["task_stacktraces"]) + captured = io.StringIO() + sys_stdout = sys.stdout + sys.stdout = captured + try: + tree.pretty_print() + finally: + sys.stdout = sys_stdout + output = captured.getvalue() + # Both should appear, not grouped, and thread ID is printed + self.assertIn("Top Task [Running] (thread: main:1)", output) + self.assertIn("Top Task [Running] (thread: main:2)", output) + self.assertNotIn("[x2]", output) + self.assertNotIn(" 2 x", output) + + def test_deleted_tasks_hidden_by_default(self): + tree = 
TaskTree.from_json(SAMPLE_NON_RUNNING_GROUP_THREAD_NAME["task_stacktraces"]) + captured = io.StringIO() + sys_stdout = sys.stdout + sys.stdout = captured + try: + tree.pretty_print(show_deleted=False) + finally: + sys.stdout = sys_stdout + output = captured.getvalue() + # Deleted tasks should not be printed + self.assertNotIn("Deleted Tasks", output) + self.assertNotIn("Top Task [Deleted]", output) + + def test_deleted_tasks_shown_with_flag(self): + tree = TaskTree.from_json(SAMPLE_NON_RUNNING_GROUP_THREAD_NAME["task_stacktraces"]) + captured = io.StringIO() + sys_stdout = sys.stdout + sys.stdout = captured + try: + tree.pretty_print(show_deleted=True) + finally: + sys.stdout = sys_stdout + output = captured.getvalue() + # Deleted tasks should be printed + self.assertIn("=== Deleted Tasks ===", output) + self.assertIn("Top Task [Deleted]", output) + + def test_non_running_group_by_thread_name(self): + tree = TaskTree.from_json(SAMPLE_NON_RUNNING_GROUP_THREAD_NAME["task_stacktraces"]) + captured = io.StringIO() + sys_stdout = sys.stdout + sys.stdout = captured + try: + tree.pretty_print(show_deleted=True) + finally: + sys.stdout = sys_stdout + output = captured.getvalue() + # Should be grouped, thread ID not printed + self.assertIn(" 2 x Top Task [Deleted] (thread: main) @ funcA (file.cpp:10)", output) + self.assertNotIn(":1)", output) + self.assertNotIn(":2)", output) + + def test_non_running_not_group_if_thread_name_differs(self): + tree = TaskTree.from_json(SAMPLE_NON_RUNNING_NOT_GROUP_THREAD_NAME["task_stacktraces"]) + captured = io.StringIO() + sys_stdout = sys.stdout + sys.stdout = captured + try: + tree.pretty_print(show_deleted=True) + finally: + sys.stdout = sys_stdout + output = captured.getvalue() + # Should not be grouped, thread ID not printed + self.assertIn("Top Task [Deleted] (thread: main1) @ funcA (file.cpp:10)", output) + self.assertIn("Top Task [Deleted] (thread: main2) @ funcA (file.cpp:10)", output) + self.assertNotIn("[x2]", output) + 
self.assertNotIn(" 2 x", output) + self.assertNotIn(":1)", output) + self.assertNotIn(":2)", output) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/arangod/SystemMonitor/TaskMonitoring/RestHandler.cpp b/arangod/SystemMonitor/TaskMonitoring/RestHandler.cpp new file mode 100644 index 000000000000..8715f24b09a2 --- /dev/null +++ b/arangod/SystemMonitor/TaskMonitoring/RestHandler.cpp @@ -0,0 +1,195 @@ +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2014-2024 ArangoDB GmbH, Cologne, Germany +/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany +/// +/// Licensed under the Business Source License 1.1 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// https://github.com/arangodb/arangodb/blob/devel/LICENSE +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. 
+/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Julia Volmer +//////////////////////////////////////////////////////////////////////////////// +#include "RestHandler.h" +#include +#include + +#include "Containers/Forest/depth_first.h" +#include "Containers/Forest/forest.h" +#include "ApplicationFeatures/ApplicationServer.h" +#include "TaskMonitoring/task.h" +#include "TaskMonitoring/task_registry_variable.h" +#include "Cluster/ClusterFeature.h" +#include "Cluster/ClusterInfo.h" +#include "Cluster/ServerState.h" +#include "Inspection/VPack.h" +#include "Network/ConnectionPool.h" +#include "Network/Methods.h" +#include "Network/NetworkFeature.h" +#include "Network/RequestOptions.h" +#include "Rest/CommonDefines.h" + +using namespace arangodb; +using namespace arangodb::task_monitoring; +using namespace arangodb::containers; + +RestHandler::RestHandler(ArangodServer& server, GeneralRequest* request, + GeneralResponse* response) + : RestVocbaseBaseHandler(server, request, response), + _feature(server.getFeature()) {} + +namespace { +struct Entry { + TreeHierarchy hierarchy; + TaskSnapshot data; +}; +template +auto inspect(Inspector& f, Entry& x) { + return f.object(x).fields(f.field("hierarchy", x.hierarchy), + f.field("data", x.data)); +} +/** + Creates a forest of all current tasks + + An edge between two tasks means that the lower hierarchy tasks started the + larger hierarchy task. + **/ +auto all_undeleted_promises() -> ForestWithRoots { + auto forest = Forest{}; + std::vector roots; + registry.for_node([&](TaskSnapshot task) { + // if (promise.state != State::Deleted) { + std::visit(overloaded{ + [&](TaskIdWrapper parent) { + forest.insert(task.id, parent.id, task); + }, + [&](RootTask root) { + forest.insert(task.id, nullptr, task); + roots.emplace_back(task.id); + }, + }, + task.parent); + // } + }); + return ForestWithRoots{forest, roots}; +} + +/** + Converts a forest of tasks into a list of stacktraces inside a + velocypack. 
+ + The list of stacktraces include one stacktrace per tree in the forest. To + create one stacktrace, it uses a depth first search to traverse the forest in + post order, such that tasks with the highest hierarchy in a tree are given + first and the root task is given last. + **/ +auto getStacktraceData(IndexedForestWithRoots const& promises) + -> VPackBuilder { + VPackBuilder builder; + builder.openObject(); + builder.add(VPackValue("task_stacktraces")); + builder.openArray(); + for (auto const& root : promises.roots()) { + builder.openArray(); + auto dfs = DFS_PostOrder{promises, root}; + do { + auto next = dfs.next(); + if (next == std::nullopt) { + break; + } + auto [id, hierarchy] = next.value(); + auto data = promises.node(id); + if (data != std::nullopt) { + auto entry = Entry{.hierarchy = hierarchy, .data = data.value()}; + velocypack::serialize(builder, entry); + } + } while (true); + builder.close(); + } + builder.close(); + builder.close(); + return builder; +} +} // namespace + +auto RestHandler::executeAsync() -> futures::Future { + if (!ExecContext::current().isSuperuser()) { + generateError(rest::ResponseCode::FORBIDDEN, TRI_ERROR_HTTP_FORBIDDEN, + "you need super user rights for log operations"); + } + + if (_request->requestType() != rest::RequestType::GET) { + generateError(rest::ResponseCode::METHOD_NOT_ALLOWED, + TRI_ERROR_HTTP_METHOD_NOT_ALLOWED); + co_return; + } + + // forwarding + bool foundServerIdParameter; + std::string const& serverId = + _request->value("serverId", foundServerIdParameter); + + if (ServerState::instance()->isCoordinator() && foundServerIdParameter) { + if (serverId != ServerState::instance()->getId()) { + // not ourselves! 
- need to pass through the request + auto& ci = server().getFeature().clusterInfo(); + + bool found = false; + for (auto const& srv : ci.getServers()) { + // validate if server id exists + if (srv.first == serverId) { + found = true; + break; + } + } + + if (!found) { + generateError(rest::ResponseCode::NOT_FOUND, + TRI_ERROR_HTTP_BAD_PARAMETER, + "unknown serverId supplied."); + co_return; + } + + NetworkFeature const& nf = server().getFeature(); + network::ConnectionPool* pool = nf.pool(); + if (pool == nullptr) { + THROW_ARANGO_EXCEPTION(TRI_ERROR_SHUTTING_DOWN); + } + network::RequestOptions options; + options.timeout = network::Timeout(30.0); + options.database = _request->databaseName(); + options.parameters = _request->parameters(); + + auto f = network::sendRequestRetry( + pool, "server:" + serverId, fuerte::RestVerb::Get, + _request->requestPath(), VPackBuffer{}, options); + co_await std::move(f).thenValue( + [self = std::dynamic_pointer_cast(shared_from_this())]( + network::Response const& r) { + if (r.fail()) { + self->generateError(r.combinedResult()); + } else { + self->generateResult(rest::ResponseCode::OK, r.slice()); + } + }); + co_return; + } + } + + auto lock_guard = co_await _feature.asyncLock(); + + // do actual work + auto promises = all_undeleted_promises().index_by_awaitee(); + generateResult(rest::ResponseCode::OK, getStacktraceData(promises).slice()); + co_return; +} diff --git a/arangod/SystemMonitor/TaskMonitoring/RestHandler.h b/arangod/SystemMonitor/TaskMonitoring/RestHandler.h new file mode 100644 index 000000000000..bc3a4c298090 --- /dev/null +++ b/arangod/SystemMonitor/TaskMonitoring/RestHandler.h @@ -0,0 +1,42 @@ +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2014-2024 ArangoDB GmbH, Cologne, Germany +/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany +/// +/// Licensed under the Business Source License 1.1 (the "License"); +/// you may not use this file 
except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// https://github.com/arangodb/arangodb/blob/devel/LICENSE +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Julia Volmer +//////////////////////////////////////////////////////////////////////////////// +#pragma once + +#include "SystemMonitor/TaskMonitoring/Feature.h" +#include "RestHandler/RestVocbaseBaseHandler.h" + +namespace arangodb::task_monitoring { + +class RestHandler : public arangodb::RestVocbaseBaseHandler { + public: + RestHandler(ArangodServer&, GeneralRequest*, GeneralResponse*); + + public: + char const* name() const override final { return "TaskRegistryRestHandler"; } + RequestLane lane() const override final { return RequestLane::CLUSTER_ADMIN; } + futures::Future executeAsync() override; + + Feature& _feature; +}; + +} // namespace arangodb::task_monitoring diff --git a/arangod/VocBase/Methods/Databases.cpp b/arangod/VocBase/Methods/Databases.cpp index 43661bef56a4..f62d9fb4bc89 100644 --- a/arangod/VocBase/Methods/Databases.cpp +++ b/arangod/VocBase/Methods/Databases.cpp @@ -69,12 +69,15 @@ #include #include +#include "TaskMonitoring/task.h" + using namespace arangodb; using namespace arangodb::methods; using namespace arangodb::velocypack; std::vector Databases::list(ArangodServer& server, std::string const& user) { + auto task = task_monitoring::Task{"Collect List of Databases"}; if (!server.hasFeature()) { return std::vector(); } @@ -95,6 +98,8 @@ std::vector Databases::list(ArangodServer& server, } Result Databases::info(TRI_vocbase_t* vocbase, velocypack::Builder& result) { + auto task = 
task_monitoring::Task{"Collect Database information for " + + vocbase->name()}; if (ServerState::instance()->isCoordinator()) { auto& cache = vocbase->server().getFeature().agencyCache(); auto [acb, idx] = cache.read(std::vector{ @@ -186,6 +191,9 @@ Result Databases::grantCurrentUser(CreateDatabaseInfo const& info, // Create database on cluster; Result Databases::createCoordinator(CreateDatabaseInfo const& info) { + auto task = task_monitoring::Task{"Create Database " + info.getName() + + " on Coordinator"}; + // TODO: Add status strings to task for phases. TRI_ASSERT(ServerState::instance()->isCoordinator()); DatabaseFeature& databaseFeature = @@ -361,6 +369,7 @@ Result Databases::createOther(CreateDatabaseInfo const& info) { Result Databases::create(ArangodServer& server, ExecContext const& exec, std::string const& dbName, velocypack::Slice users, velocypack::Slice options) { + auto task = task_monitoring::Task{"Create Database: " + dbName}; Result res = basics::catchToResult([&]() { Result res; @@ -506,6 +515,7 @@ ErrorCode dropDBCoordinator(DatabaseFeature& df, std::string const& dbName) { Result Databases::drop(ExecContext const& exec, TRI_vocbase_t* systemVocbase, std::string const& dbName) { + auto task = task_monitoring::Task{"Drop Database: " + dbName}; TRI_ASSERT(systemVocbase->isSystem()); if (exec.systemAuthLevel() != auth::Level::RW) { events::DropDatabase(dbName, Result(TRI_ERROR_FORBIDDEN), exec); diff --git a/arangod/VocBase/Methods/UpgradeTasks.cpp b/arangod/VocBase/Methods/UpgradeTasks.cpp index 1c0fb2e47f7c..65b52c9783b3 100644 --- a/arangod/VocBase/Methods/UpgradeTasks.cpp +++ b/arangod/VocBase/Methods/UpgradeTasks.cpp @@ -55,6 +55,7 @@ #include "VocBase/vocbase.h" #include +#include "TaskMonitoring/include/TaskMonitoring/task.h" using namespace arangodb; using namespace arangodb::methods; @@ -391,6 +392,8 @@ Result createSystemStatisticsIndices( Result createSystemCollectionsIndices( TRI_vocbase_t& vocbase, std::vector>& collections) { + auto 
task = task_monitoring::Task{"Create System Collections Indices for " + + vocbase.name()}; Result res; if (vocbase.isSystem()) { res = ::createIndex(StaticStrings::UsersCollection, @@ -439,6 +442,8 @@ Result createSystemCollectionsIndices( Result UpgradeTasks::createSystemCollectionsAndIndices( TRI_vocbase_t& vocbase, velocypack::Slice slice) { + auto task = + task_monitoring::Task{"Create System Collections for " + vocbase.name()}; // after the call to ::createSystemCollections this vector should contain // a LogicalCollection for *every* (required) system collection. std::vector> presentSystemCollections; @@ -478,6 +483,8 @@ Result UpgradeTasks::createSystemCollectionsAndIndices( Result UpgradeTasks::createStatisticsCollectionsAndIndices( TRI_vocbase_t& vocbase, velocypack::Slice slice) { + auto task = task_monitoring::Task{"Create Statistics Collections for " + + vocbase.name()}; // This vector should after the call to ::createSystemCollections contain // a LogicalCollection for *every* (required) system collection. 
std::vector> presentSystemCollections; @@ -507,6 +514,8 @@ Result UpgradeTasks::createStatisticsCollectionsAndIndices( //////////////////////////////////////////////////////////////////////////////// Result UpgradeTasks::dropLegacyAnalyzersCollection( TRI_vocbase_t& vocbase, velocypack::Slice /*upgradeParams*/) { + auto task = + task_monitoring::Task{"Drop Legacy Analyzers for " + vocbase.name()}; // drop legacy collection if upgrading the system vocbase and collection found #ifdef ARANGODB_ENABLE_MAINTAINER_MODE if (!vocbase.server().hasFeature()) { @@ -541,6 +550,7 @@ Result UpgradeTasks::dropLegacyAnalyzersCollection( Result UpgradeTasks::addDefaultUserOther(TRI_vocbase_t& vocbase, velocypack::Slice params) { + auto task = task_monitoring::Task{"Add Default User for " + vocbase.name()}; TRI_ASSERT(!vocbase.isSystem()); TRI_ASSERT(params.isObject()); @@ -598,6 +608,8 @@ Result UpgradeTasks::addDefaultUserOther(TRI_vocbase_t& vocbase, Result UpgradeTasks::renameReplicationApplierStateFiles( TRI_vocbase_t& vocbase, velocypack::Slice slice) { + auto task = task_monitoring::Task{"Rename Replication Applier Files " + + vocbase.name()}; std::string const path = vocbase.engine().databasePath(); std::string const source = arangodb::basics::FileUtils::buildFilename( @@ -634,6 +646,8 @@ Result UpgradeTasks::renameReplicationApplierStateFiles( Result UpgradeTasks::dropPregelQueriesCollection( TRI_vocbase_t& vocbase, velocypack::Slice /*upgradeParams*/) { + auto task = task_monitoring::Task{"Drop Pregel Queries Collection for " + + vocbase.name()}; std::shared_ptr col; auto res = arangodb::methods::Collections::lookup(vocbase, "_pregel_queries", col); diff --git a/arangod/arangoserver.cmake b/arangod/arangoserver.cmake index 56254ef10bc7..a6256ddd037f 100644 --- a/arangod/arangoserver.cmake +++ b/arangod/arangoserver.cmake @@ -6,9 +6,9 @@ add_library(arangoserver STATIC Auth/TokenCache.cpp Auth/User.cpp Auth/UserManager.cpp - Cluster/Action.cpp - Cluster/ActionBase.cpp - 
Cluster/ActionDescription.cpp + Cluster/MaintenanceActions/Action.cpp + Cluster/MaintenanceActions/ActionBase.cpp + Cluster/MaintenanceActions/ActionDescription.cpp Cluster/AgencyCache.cpp Cluster/AgencyCallback.cpp Cluster/AgencyCallbackRegistry.cpp @@ -21,13 +21,13 @@ add_library(arangoserver STATIC Cluster/ClusterTypes.cpp Cluster/ClusterUpgradeFeature.cpp Cluster/CollectionInfoCurrent.cpp - Cluster/CreateCollection.cpp - Cluster/CreateDatabase.cpp + Cluster/MaintenanceActions/CreateCollection.cpp + Cluster/MaintenanceActions/CreateDatabase.cpp Cluster/DBServerAgencySync.cpp - Cluster/DropCollection.cpp - Cluster/DropDatabase.cpp - Cluster/DropIndex.cpp - Cluster/EnsureIndex.cpp + Cluster/MaintenanceActions/DropCollection.cpp + Cluster/MaintenanceActions/DropDatabase.cpp + Cluster/MaintenanceActions/DropIndex.cpp + Cluster/MaintenanceActions/EnsureIndex.cpp Cluster/FollowerInfo.cpp Cluster/HeartbeatThread.cpp Cluster/Maintenance.cpp @@ -36,15 +36,15 @@ add_library(arangoserver STATIC Cluster/MaintenanceWorker.cpp Cluster/RebootTracker.cpp Cluster/ReplicationTimeoutFeature.cpp - Cluster/ResignShardLeadership.cpp + Cluster/MaintenanceActions/ResignShardLeadership.cpp Cluster/RestAgencyCallbacksHandler.cpp Cluster/RestClusterHandler.cpp Cluster/ServerDefaults.cpp Cluster/ServerState.cpp - Cluster/SynchronizeShard.cpp - Cluster/TakeoverShardLeadership.cpp - Cluster/UpdateCollection.cpp - Cluster/UpdateReplicatedLogAction.cpp + Cluster/MaintenanceActions/SynchronizeShard.cpp + Cluster/MaintenanceActions/TakeoverShardLeadership.cpp + Cluster/MaintenanceActions/UpdateCollection.cpp + Cluster/MaintenanceActions/UpdateReplicatedLogAction.cpp FeaturePhases/AgencyFeaturePhase.cpp FeaturePhases/AqlFeaturePhase.cpp FeaturePhases/BasicFeaturePhaseServer.cpp diff --git a/lib/Async/CMakeLists.txt b/lib/Async/CMakeLists.txt index 865fc6bbb965..47b325d4e202 100644 --- a/lib/Async/CMakeLists.txt +++ b/lib/Async/CMakeLists.txt @@ -1,15 +1,18 @@ +add_library(arango_async_interface 
INTERFACE) +target_include_directories(arango_async_interface INTERFACE + ${PROJECT_SOURCE_DIR}/lib + include) +target_link_libraries(arango_async_interface INTERFACE + arango_task_registry) + add_library(arango_async INTERFACE) -target_include_directories(arango_async - INTERFACE - include -) +target_include_directories(arango_async INTERFACE + include) -target_link_libraries(arango_async - INTERFACE +target_link_libraries(arango_async INTERFACE arango_async_registry arango_async_interface arango_basic_utils ) -add_subdirectory(include) add_subdirectory(Registry) diff --git a/lib/Async/Registry/promise.h b/lib/Async/Registry/promise.h index 2b40c8426a5d..ddec2fff4761 100644 --- a/lib/Async/Registry/promise.h +++ b/lib/Async/Registry/promise.h @@ -31,6 +31,7 @@ #include #include "Containers/Concurrent/ThreadOwnedList.h" #include "Containers/Concurrent/thread.h" +#include "Containers/Concurrent/source_location.h" #include "fmt/format.h" #include "fmt/std.h" @@ -46,34 +47,6 @@ overloaded(Ts...) 
-> overloaded; namespace arangodb::async_registry { -struct SourceLocationSnapshot { - std::string_view file_name; - std::string_view function_name; - std::uint_least32_t line; - bool operator==(SourceLocationSnapshot const&) const = default; - static auto from(std::source_location loc) -> SourceLocationSnapshot { - return SourceLocationSnapshot{.file_name = loc.file_name(), - .function_name = loc.function_name(), - .line = loc.line()}; - } -}; -template -auto inspect(Inspector& f, SourceLocationSnapshot& x) { - return f.object(x).fields(f.field("file_name", x.file_name), - f.field("line", x.line), - f.field("function_name", x.function_name)); -} -struct SourceLocation { - auto snapshot() -> SourceLocationSnapshot { - return SourceLocationSnapshot{.file_name = file_name, - .function_name = function_name, - .line = line.load()}; - } - const std::string_view file_name; - const std::string_view function_name; - std::atomic line; -}; - enum class State { Running = 0, Suspended, Resolved, Deleted }; template auto inspect(Inspector& f, State& x) { @@ -130,7 +103,7 @@ auto inspect(Inspector& f, Requester& x) { struct PromiseSnapshot { void* id; basics::ThreadId thread; - SourceLocationSnapshot source_location; + basics::SourceLocationSnapshot source_location; Requester requester; State state; bool operator==(PromiseSnapshot const&) const = default; @@ -144,6 +117,9 @@ auto inspect(Inspector& f, PromiseSnapshot& x) { f.field("state", x.state)); } +/** + Promise in the registry + */ struct Promise { using Snapshot = PromiseSnapshot; Promise(Requester requester, std::source_location location); @@ -162,8 +138,7 @@ struct Promise { } basics::ThreadId thread; - - SourceLocation source_location; + basics::VariableSourceLocation source_location; std::atomic requester; std::atomic state = State::Running; }; @@ -174,6 +149,14 @@ struct Promise { */ auto get_current_coroutine() noexcept -> Requester*; +/** + Wrapper promise for easier usage in the code + + This is a wrapper around 
the promise: On construction, it creates a promise + and registers it in the global register. On destruction, it marks the promise + for deletion in the register. Therefore it has a shorter lifetime than the + promise itself. + */ struct AddToAsyncRegistry { AddToAsyncRegistry() = default; AddToAsyncRegistry(std::source_location loc); diff --git a/lib/Async/include/Async/async.h b/lib/Async/include/Async/async.h index 324b8955324a..bebb084dd1a9 100644 --- a/lib/Async/include/Async/async.h +++ b/lib/Async/include/Async/async.h @@ -1,10 +1,10 @@ #pragma once -#include "Async/Registry/promise.h" -#include "Async/Registry/registry_variable.h" +#include "Async/context.h" #include "Async/coro-utils.h" #include "Async/expected.h" -#include "Utils/ExecContext.h" +#include "Async/Registry/promise.h" +#include "Async/Registry/registry_variable.h" #include "Inspection/Format.h" #include @@ -26,9 +26,7 @@ struct async_promise_base : async_registry::AddToAsyncRegistry { using promise_type = async_promise; async_promise_base(std::source_location loc) - : async_registry::AddToAsyncRegistry{std::move(loc)}, - _callerExecContext{ExecContext::currentAsShared()}, - _requester{*async_registry::get_current_coroutine()} { + : async_registry::AddToAsyncRegistry{std::move(loc)}, _context{} { *async_registry::get_current_coroutine() = {id()}; } @@ -41,8 +39,7 @@ struct async_promise_base : async_registry::AddToAsyncRegistry { bool await_ready() noexcept { return false; } std::coroutine_handle<> await_suspend( std::coroutine_handle<> self) noexcept { - ExecContext::set(_promise->_callerExecContext); - *async_registry::get_current_coroutine() = _promise->_requester; + _promise->_context.set(); auto addr = _promise->_continuation.exchange(self.address(), std::memory_order_acq_rel); if (addr == nullptr) { @@ -70,24 +67,21 @@ struct async_promise_base : async_registry::AddToAsyncRegistry { bool await_ready() { return inner_awaitable.await_ready(); } auto await_suspend(std::coroutine_handle<> 
handle) { outer_promise->update_state(async_registry::State::Suspended); - ExecContext::set(outer_promise->_callerExecContext); - *async_registry::get_current_coroutine() = outer_promise->_requester; + outer_promise->_context.set(); return inner_awaitable.await_suspend(handle); } auto await_resume() { auto old_state = outer_promise->update_state(async_registry::State::Running); if (old_state.value() == async_registry::State::Suspended) { - outer_promise->_callerExecContext = ExecContext::currentAsShared(); - outer_promise->_requester = *async_registry::get_current_coroutine(); + outer_promise->_context = Context{}; } - *async_registry::get_current_coroutine() = {outer_promise->id()}; - ExecContext::set(_myExecContext); + myContext.set(); return inner_awaitable.await_resume(); } async_promise_base* outer_promise; inner_awaitable_type inner_awaitable; - std::shared_ptr _myExecContext; + Context myContext; }; // update promises in registry @@ -99,12 +93,11 @@ struct async_promise_base : async_registry::AddToAsyncRegistry { return awaitable{.outer_promise = this, .inner_awaitable = get_awaitable_object( std::forward(co_awaited_expression)), - ._myExecContext = ExecContext::currentAsShared()}; + .myContext = Context{}}; } void unhandled_exception() { _value.set_exception(std::current_exception()); - *async_registry::get_current_coroutine() = _requester; - ExecContext::set(_callerExecContext); + _context.set(); } auto get_return_object() { return async{std::coroutine_handle::from_promise( @@ -113,8 +106,7 @@ struct async_promise_base : async_registry::AddToAsyncRegistry { std::atomic _continuation = nullptr; expected _value; - std::shared_ptr _callerExecContext; - async_registry::Requester _requester; + Context _context; }; template diff --git a/lib/Async/include/Async/context.h b/lib/Async/include/Async/context.h new file mode 100644 index 000000000000..a34219c87f63 --- /dev/null +++ b/lib/Async/include/Async/context.h @@ -0,0 +1,48 @@ 
+////////////////////////////////////////////////////////////////////////////////
+/// DISCLAIMER
+///
+/// Copyright 2014-2024 ArangoDB GmbH, Cologne, Germany
+/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
+///
+/// Licensed under the Business Source License 1.1 (the "License");
+/// you may not use this file except in compliance with the License.
+/// You may obtain a copy of the License at
+///
+///     https://github.com/arangodb/arangodb/blob/devel/LICENSE
+///
+/// Unless required by applicable law or agreed to in writing, software
+/// distributed under the License is distributed on an "AS IS" BASIS,
+/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+/// See the License for the specific language governing permissions and
+/// limitations under the License.
+///
+/// Copyright holder is ArangoDB GmbH, Cologne, Germany
+///
+/// @author Julia Volmer
+////////////////////////////////////////////////////////////////////////////////
+#pragma once
+
+#include "Async/Registry/promise.h"
+#include "TaskMonitoring/task.h"
+#include "Utils/ExecContext.h"
+
+namespace arangodb {
+
+/**
+   Snapshot of the "current" execution state
+
+   Bundles the three pieces of ambient state that have to travel together:
+   the current ExecContext, the current async registry requester and the
+   current monitored task. Constructing a Context reads all three from the
+   respective "current" accessors; set() writes the stored values back.
+
+   NOTE(review): the accessors look like per-thread state - confirm with
+   their definitions.
+ */
+struct Context {
+  // value returned by ExecContext::currentAsShared() at construction
+  std::shared_ptr _execContext;
+  // value read through async_registry::get_current_coroutine() at
+  // construction
+  async_registry::Requester _requester;
+  // value read through task_monitoring::get_current_task() at construction;
+  // plain pointer, nothing in this struct keeps the task alive
+  // NOTE(review): confirm that the task outlives every Context that holds it
+  task_monitoring::Task* _task;
+
+  /**
+     Captures the state that is current at the point of construction
+   */
+  Context()
+      : _execContext{ExecContext::currentAsShared()},
+        _requester{*async_registry::get_current_coroutine()},
+        _task{*task_monitoring::get_current_task()} {}
+
+  /**
+     Makes the captured state the current one again
+
+     Overwrites the current ExecContext, requester and task with the stored
+     values. The Context itself is left unchanged, so set() can be called
+     repeatedly.
+   */
+  auto set() -> void {
+    ExecContext::set(_execContext);
+    *async_registry::get_current_coroutine() = _requester;
+    *task_monitoring::get_current_task() = _task;
+  }
+};
+
+}  // namespace arangodb
diff --git a/lib/Async/include/CMakeLists.txt b/lib/Async/include/CMakeLists.txt
deleted file mode 100644
index 04d3d52d4838..000000000000
--- a/lib/Async/include/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-add_library(arango_async_interface INTERFACE)
-target_include_directories(arango_async_interface INTERFACE .)
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index dd04f5f1488c..b01d92b4f20c 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -330,6 +330,7 @@ target_link_libraries(arango_lightweight absl::flat_hash_set absl::synchronization arango_async + arango_task_registry_global PRIVATE date_interface fmt arango_assertions @@ -451,3 +452,4 @@ add_subdirectory(CrashHandler) add_subdirectory(Assertions) add_subdirectory(Inspection) add_subdirectory(BuildId) +add_subdirectory(TaskMonitoring) diff --git a/lib/Containers/CMakeLists.txt b/lib/Containers/CMakeLists.txt index 3f37c693cf96..b6eefb6434f8 100644 --- a/lib/Containers/CMakeLists.txt +++ b/lib/Containers/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(Concurrent) +add_subdirectory(Forest) diff --git a/lib/Containers/Concurrent/ThreadOwnedList.h b/lib/Containers/Concurrent/ThreadOwnedList.h index 7c9721e8e8af..95df13e18b05 100644 --- a/lib/Containers/Concurrent/ThreadOwnedList.h +++ b/lib/Containers/Concurrent/ThreadOwnedList.h @@ -165,6 +165,8 @@ struct ThreadOwnedList Can be called from any thread. The node needs to be part of the list, crashes otherwise. + Caller needs to make sure that this is not called twice: otherwise there + will be a double free. */ auto mark_for_deletion(Node* node) noexcept -> void { // makes sure that node is really in this list diff --git a/lib/Containers/Concurrent/source_location.h b/lib/Containers/Concurrent/source_location.h new file mode 100644 index 000000000000..87f8fd28e01d --- /dev/null +++ b/lib/Containers/Concurrent/source_location.h @@ -0,0 +1,61 @@ +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2014-2024 ArangoDB GmbH, Cologne, Germany +/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany +/// +/// Licensed under the Business Source License 1.1 (the "License"); +/// you may not use this file except in compliance with the License. 
+/// You may obtain a copy of the License at
+///
+///     https://github.com/arangodb/arangodb/blob/devel/LICENSE
+///
+/// Unless required by applicable law or agreed to in writing, software
+/// distributed under the License is distributed on an "AS IS" BASIS,
+/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+/// See the License for the specific language governing permissions and
+/// limitations under the License.
+///
+/// Copyright holder is ArangoDB GmbH, Cologne, Germany
+///
+/// @author Julia Volmer
+////////////////////////////////////////////////////////////////////////////////
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <source_location>
+#include <string_view>
+
+namespace arangodb::basics {
+
+/**
+   Plain, comparable copy of a source location
+
+   Holds views on the file and function name plus the line number. The views
+   do not own their characters.
+ */
+struct SourceLocationSnapshot {
+  std::string_view file_name;
+  std::string_view function_name;
+  std::uint_least32_t line;
+  bool operator==(SourceLocationSnapshot const&) const = default;
+
+  /**
+     Builds a snapshot from a std::source_location
+
+     Nothing is copied: the views refer to the character arrays that
+     loc.file_name() and loc.function_name() return.
+   */
+  static auto from(std::source_location loc) -> SourceLocationSnapshot {
+    return SourceLocationSnapshot{.file_name = loc.file_name(),
+                                  .function_name = loc.function_name(),
+                                  .line = loc.line()};
+  }
+};
+/**
+   Inspection of a snapshot as an object with the fields file_name, line and
+   function_name (in this order)
+ */
+template<typename Inspector>
+auto inspect(Inspector& f, SourceLocationSnapshot& x) {
+  return f.object(x).fields(f.field("file_name", x.file_name),
+                            f.field("line", x.line),
+                            f.field("function_name", x.function_name));
+}
+
+/**
+   Source location whose line can change after construction
+
+   File and function name are fixed; the line is atomic so that it can be
+   updated while other code takes a snapshot.
+ */
+struct VariableSourceLocation {
+  /**
+     Returns a copy of the current location
+
+     Only reads members (the line via an atomic load), therefore it is const
+     and can be called on a const object.
+   */
+  auto snapshot() const -> SourceLocationSnapshot {
+    return SourceLocationSnapshot{.file_name = file_name,
+                                  .function_name = function_name,
+                                  .line = line.load()};
+  }
+  const std::string_view file_name;
+  const std::string_view function_name;
+  std::atomic<std::uint_least32_t> line;
+};
+
+}  // namespace arangodb::basics
diff --git a/lib/Containers/Forest/CMakeLists.txt b/lib/Containers/Forest/CMakeLists.txt
new file mode 100644
index 000000000000..a973816911a7
--- /dev/null
+++ b/lib/Containers/Forest/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_library(arango_forest INTERFACE
+  depth_first.h
+  forest.h)
+target_include_directories(arango_forest INTERFACE
+  
${PROJECT_SOURCE_DIR}/lib)
diff --git a/arangod/AsyncRegistryServer/Stacktrace/depth_first.h b/lib/Containers/Forest/depth_first.h
similarity index 91%
rename from arangod/AsyncRegistryServer/Stacktrace/depth_first.h
rename to lib/Containers/Forest/depth_first.h
index b333fa69d46e..3aca3b71aa62 100644
--- a/arangod/AsyncRegistryServer/Stacktrace/depth_first.h
+++ b/lib/Containers/Forest/depth_first.h
@@ -27,7 +27,7 @@
 #include
 #include
 
-namespace arangodb::async_registry {
+namespace arangodb::containers {
 
 using Id = void*;
 using TreeHierarchy = size_t;
@@ -68,9 +68,9 @@ struct DFS_PostOrder {
     return next();
   }
 
   Forest const& _forest;
-  const Id _start;
-  std::stack> _stack;
+  const Id _start = nullptr;
+  std::stack> _stack = {};
 };
 
-} // namespace arangodb::async_registry
+} // namespace arangodb::containers
diff --git a/arangod/AsyncRegistryServer/Stacktrace/forest.h b/lib/Containers/Forest/forest.h
similarity index 91%
rename from arangod/AsyncRegistryServer/Stacktrace/forest.h
rename to lib/Containers/Forest/forest.h
index 2293379f633d..04830d501226 100644
--- a/arangod/AsyncRegistryServer/Stacktrace/forest.h
+++ b/lib/Containers/Forest/forest.h
@@ -26,7 +26,7 @@
 #include
 #include
 
-namespace arangodb::async_registry {
+namespace arangodb::containers {
 
 using Id = void*;
 
@@ -68,11 +68,11 @@ struct Forest {
 
   bool operator==(Forest const&) const = default;
 
-  std::vector _parent; // has one entry for each node
-  std::vector _node; // has one entry for each node
-  std::unordered_map
-      _position; // at which position of the vectors _waiter and _data to find
-                 // entries for Id
+  std::vector _parent = {};  // has one entry for each node
+  std::vector _node = {};    // has one entry for each node
+  // at which position of the vectors _parent and _node to find
+  // entries for Id
+  std::unordered_map _position = {};
 };
 
 /**
@@ -119,4 +119,4 @@ struct IndexedForestWithRoots : IndexedForest {
   std::vector _roots;
 };
 
-} // namespace 
arangodb::async_registry +} // namespace arangodb::containers diff --git a/lib/Futures/include/Futures/coro-helper.h b/lib/Futures/include/Futures/coro-helper.h index 0a9b93e2e2a9..e6ac22a3e630 100644 --- a/lib/Futures/include/Futures/coro-helper.h +++ b/lib/Futures/include/Futures/coro-helper.h @@ -32,83 +32,226 @@ namespace std_coro = std; #endif #include +#include "Async/context.h" #include "Basics/Exceptions.h" #include "Basics/Result.h" #include "Promise.h" #include "Try.h" #include "Utils/ExecContext.h" -/// This file contains helper classes and tools for coroutines. We use -/// coroutines for asynchronous operations. Every function, method or -/// closure which contains at least one of the keywords -/// - co_await -/// - co_yield -/// - co_return -/// is a coroutine and is thus compiled differently by the C++ compiler -/// than normal. Essentially, the compiler creates a state machine for -/// each such functions. All instances of co_await and co_yield are potential -/// suspension points. The code before and after a co_await/co_yield can -/// be executed by different threads! -/// The return type of a coroutine plays a very special role. For us, it -/// will usually be `Future` for some type T. The code in this file -/// uses this fact and essentially implements the magic for coroutines -/// by providing a few helper classes. See below for details. - -/// See below at (*) for an explanation why we need this class -/// `FutureAwaitable` here! +/// This file contains helper classes and tools for coroutines. 
-namespace arangodb::futures { template -class Future; +struct future_promise; + +/** + Promise type for a future coroutine + + The type holds two pieces of data: + - first an `arangodb::futures::Promise` (not to be confused with the + promise_type!), and + - second an `arangodb::futures::Try` + After all, we want that the coroutine "returns" an empty `Future` + when it suspends, and it is supposed to set the return value (or + exception) via the corresponding `Promise` object to trigger + potential callbacks which are attached to the Future. + */ +template +struct future_promise_base { + using promise_type = future_promise; + + future_promise_base(std::source_location loc) + : promise{std::move(loc)}, context{} { + *arangodb::async_registry::get_current_coroutine() = {promise.id()}; + } + ~future_promise_base() {} + + auto initial_suspend() noexcept { + promise.update_state(arangodb::async_registry::State::Running); + return std_coro::suspend_never{}; + } + auto final_suspend() noexcept { + // TODO use symmetric transfer here + struct awaitable { + bool await_ready() noexcept { return false; } + bool await_suspend(std::coroutine_handle self) noexcept { + _promise->context.set(); + // we have to destroy the coroutine frame before + // we resolve the promise + _promise->promise.setTry(std::move(_promise->result)); + return false; + } + void await_resume() noexcept {} + + promise_type* _promise; + }; + + return awaitable{static_cast(this)}; + } + + template + auto await_transform( + U&& co_awaited_expression, + std::source_location loc = std::source_location::current()) noexcept { + using inner_awaitable_type = decltype(arangodb::get_awaitable_object( + std::forward(co_awaited_expression))); + + struct awaitable { + bool await_ready() { return inner_awaitable.await_ready(); } + auto await_suspend(std::coroutine_handle<> handle) { + outer_promise->promise.update_state( + arangodb::async_registry::State::Suspended); + outer_promise->context.set(); + return 
inner_awaitable.await_suspend(handle); + } + auto await_resume() { + auto old_state = outer_promise->promise.update_state( + arangodb::async_registry::State::Running); + if (old_state.has_value() && + old_state.value() == arangodb::async_registry::State::Suspended) { + outer_promise->context = arangodb::Context{}; + } + myContext.set(); + return inner_awaitable.await_resume(); + } + + promise_type* outer_promise; + inner_awaitable_type inner_awaitable; + arangodb::Context myContext; + }; + + // update promises in registry + if constexpr (arangodb::CanUpdateRequester) { + co_awaited_expression.update_requester({promise.id()}); + } + promise.update_source_location(std::move(loc)); + + return awaitable{.outer_promise = static_cast(this), + .inner_awaitable = arangodb::get_awaitable_object( + std::forward(co_awaited_expression)), + .myContext = arangodb::Context{}}; + } + + auto get_return_object() -> arangodb::futures::Future { + return promise.getFuture(); + } + + auto unhandled_exception() noexcept { + result.set_exception(std::current_exception()); + context.set(); + } + + arangodb::futures::Promise promise; + arangodb::futures::Try result; + arangodb::Context context; +}; template -struct FutureAwaitable { - [[nodiscard]] auto await_ready() const noexcept -> bool { return false; } - bool await_suspend(std_coro::coroutine_handle<> coro) noexcept { - // returning false resumes `coro` - _execContext = ExecContext::currentAsShared(); - std::move(_future).thenFinal( - [coro, this](futures::Try&& result) mutable noexcept { - _result = std::move(result); - if (_counter.fetch_sub(1) == 1) { - ExecContextScope exec(_execContext); - coro.resume(); - } - }); - return _counter.fetch_sub(1) != 1; +struct future_promise : future_promise_base { + future_promise(std::source_location loc = std::source_location::current()) + : future_promise_base(std::move(loc)) {} + auto return_value( + T const& t, + std::source_location loc = std::source_location:: + current()) 
noexcept(std::is_nothrow_copy_constructible_v) { + static_assert(std::is_copy_constructible_v); + future_promise_base::promise.update_state( + arangodb::async_registry::State::Resolved); + future_promise_base::promise.update_source_location(std::move(loc)); + future_promise_base::result.emplace(t); } - auto await_resume() -> T { return std::move(_result.value().get()); } - explicit FutureAwaitable(Future fut) : _future(std::move(fut)) {} - private: - std::atomic_uint8_t _counter{2}; - Future _future; - std::optional> _result; - std::shared_ptr _execContext; + auto return_value( + T&& t, std::source_location loc = std::source_location:: + current()) noexcept(std::is_nothrow_move_constructible_v) { + static_assert(std::is_move_constructible_v); + future_promise_base::promise.update_state( + arangodb::async_registry::State::Resolved); + future_promise_base::promise.update_source_location(std::move(loc)); + future_promise_base::result.emplace(std::move(t)); + } }; -/// See below at (*) for an explanation why we need this operator co_await -/// here! 
+template<> +struct future_promise + : future_promise_base { + future_promise(std::source_location loc = std::source_location::current()) + : future_promise_base(std::move(loc)) {} + auto return_void( + std::source_location loc = std::source_location::current()) noexcept { + promise.update_state(arangodb::async_registry::State::Resolved); + promise.update_source_location(std::move(loc)); + result.emplace(); + } +}; +/** + With this definition, Future can be used as a coroutine + */ +template +struct std_coro::coroutine_traits, Args...> { + using promise_type = future_promise; +}; + +/** + With this definition, Future can be used as a + coroutine + */ +template +struct std_coro::coroutine_traits< + arangodb::futures::Future, Args...> { + using promise_type = future_promise; +}; + +namespace arangodb::futures { + +/** + Be able to call co_await on a future + */ template auto operator co_await(Future&& f) noexcept { - return FutureAwaitable{std::move(f)}; + struct FutureAwaitable { + [[nodiscard]] auto await_ready() const noexcept -> bool { return false; } + bool await_suspend(std_coro::coroutine_handle<> coro) noexcept { + std::move(_future).thenFinal( + [coro, this](futures::Try&& result) mutable noexcept { + _result = std::move(result); + if (_counter.fetch_sub(1) == 1) { + coro.resume(); + } + }); + // returning false resumes `coro` + return _counter.fetch_sub(1) != 1; + } + auto await_resume() -> T { return std::move(_result.value().get()); } + explicit FutureAwaitable(Future fut) : _future(std::move(fut)) {} + + private: + std::atomic_uint8_t _counter{2}; + Future _future; + std::optional> _result; + }; + + return FutureAwaitable{std::move(f)}; } +/** + Be able to call co_await on some transformation of a future + + Transformations are defined below + */ template struct FutureTransformAwaitable : F { [[nodiscard]] auto await_ready() const noexcept -> bool { return false; } bool await_suspend(std_coro::coroutine_handle<> coro) noexcept { - // returning false 
resumes `coro` - _execContext = ExecContext::currentAsShared(); std::move(_future).thenFinal( [coro, this](futures::Try&& result) noexcept { _result = F::operator()(std::move(result)); if (_counter.fetch_sub(1) == 1) { - ExecContextScope exec(_execContext); coro.resume(); } }); + // returning false resumes `coro` return _counter.fetch_sub(1) != 1; } using ResultType = std::invoke_result_t&&>; @@ -123,7 +266,6 @@ struct FutureTransformAwaitable : F { std::atomic_uint8_t _counter{2}; Future _future; std::optional _result; - std::shared_ptr _execContext; }; template @@ -154,269 +296,4 @@ auto asResult(Future>&& f) noexcept { return basics::catchToResult([&] { return res.get(); }); }}; } - } // namespace arangodb::futures - -/// For every coroutine, there must be a so-called `promise_type`, which -/// is a helper class providing a few methods to configure the behaviour -/// of the coroutine. This can either be a member type called `promise_type` -/// of the return type of the coroutine, or, as in our case, it is determined -/// using the `std_coro::coroutine_traits` template with template parameters -/// using the return type (see -/// https://en.cppreference.com/w/cpp/language/coroutines -/// under "Promise") and then some. Since our return type for coroutines -/// is `arangodb::futures::Future`, we specialize this template here -/// to configure our coroutines (for an explanation see below the class): - -template -struct std_coro::coroutine_traits, Args...> { - struct promise_type { - // For some reason, non-maintainer compilation fails with a linker error - // if these are missing or defaulted. 
- promise_type(std::source_location loc = std::source_location::current()) - : promise{std::move(loc)}, - requester{*arangodb::async_registry::get_current_coroutine()} { - *arangodb::async_registry::get_current_coroutine() = {promise.id()}; - } - ~promise_type() {} - - arangodb::futures::Promise promise; - arangodb::futures::Try result; - arangodb::async_registry::Requester requester; - - auto initial_suspend() noexcept { - promise.update_state(arangodb::async_registry::State::Running); - return std_coro::suspend_never{}; - } - auto final_suspend() noexcept { - // TODO use symmetric transfer here - struct awaitable { - bool await_ready() noexcept { return false; } - bool await_suspend(std::coroutine_handle self) noexcept { - *arangodb::async_registry::get_current_coroutine() = - _promise->requester; - // we have to destroy the coroutine frame before - // we resolve the promise - _promise->promise.setTry(std::move(_promise->result)); - return false; - } - void await_resume() noexcept {} - - promise_type* _promise; - }; - - return awaitable{this}; - } - - auto get_return_object() -> arangodb::futures::Future { - return promise.getFuture(); - } - - auto return_value( - T const& t, - std::source_location loc = std::source_location:: - current()) noexcept(std::is_nothrow_copy_constructible_v) { - static_assert(std::is_copy_constructible_v); - promise.update_state(arangodb::async_registry::State::Resolved); - promise.update_source_location(std::move(loc)); - result.emplace(t); - } - - auto return_value( - T&& t, - std::source_location loc = std::source_location:: - current()) noexcept(std::is_nothrow_move_constructible_v) { - static_assert(std::is_move_constructible_v); - promise.update_state(arangodb::async_registry::State::Resolved); - promise.update_source_location(std::move(loc)); - result.emplace(std::move(t)); - } - - auto unhandled_exception() noexcept { - result.set_exception(std::current_exception()); - *arangodb::async_registry::get_current_coroutine() = 
requester; - } - - template - auto await_transform( - U&& co_awaited_expression, - std::source_location loc = std::source_location::current()) noexcept { - using inner_awaitable_type = decltype(arangodb::get_awaitable_object( - std::forward(co_awaited_expression))); - - struct awaitable { - bool await_ready() { return inner_awaitable.await_ready(); } - auto await_suspend(std::coroutine_handle<> handle) { - *arangodb::async_registry::get_current_coroutine() = - outer_promise->requester; - outer_promise->promise.update_state( - arangodb::async_registry::State::Suspended); - return inner_awaitable.await_suspend(handle); - } - auto await_resume() { - auto old_state = outer_promise->promise.update_state( - arangodb::async_registry::State::Running); - if (old_state.has_value() && - old_state.value() == arangodb::async_registry::State::Suspended) { - outer_promise->requester = - *arangodb::async_registry::get_current_coroutine(); - } - *arangodb::async_registry::get_current_coroutine() = { - outer_promise->promise.id()}; - return inner_awaitable.await_resume(); - } - - promise_type* outer_promise; - inner_awaitable_type inner_awaitable; - }; - - // update promises in registry - if constexpr (arangodb::CanUpdateRequester) { - co_awaited_expression.update_requester({promise.id()}); - } - promise.update_source_location(std::move(loc)); - - return awaitable{this, arangodb::get_awaitable_object( - std::forward(co_awaited_expression))}; - } - }; -}; - -/// (*) Explanation for the details: -/// The `promise_type` holds two pieces of data: -/// - first an `arangodb::futures::Promise` (not to be confused with the -/// promise_type!), and -/// - second an `arangodb::futures::Try` -/// After all, we want that the coroutine "returns" an empty `Future` -/// when it suspends, and it is supposed to set the return value (or -/// exception) via the corresponding `Promise` object to trigger -/// potential callbacks which are attached to the Future. -/// So how does this all work? 
-/// When the coroutine is first called an object of type `promise_type` -/// is contructed, which constructs its member `promise` of type -/// `Promise`. Then, early in the life of the coroutine, the method -/// `get_return_object` is called, which builds an object of type -/// `Future` from the `promise` member, so that it is associated with -/// the `promise` member. This is what will be returned when the coroutine -/// is first suspended. -/// Since `initial_suspend` returns `std_coro::suspend_never{}` no -/// suspension happens before the first code of the coroutine is run. -/// When the coroutine reaches a `co_await`, the expression behind it is -/// first evaluated. It is then the "awaitable" object (unless there is -/// a method `await_transform` in the current coroutines promise object, -/// which we haven't). In most cases, this will be another `Future` -/// which is returned from another coroutine. -/// The "awaitable" is now transformed to an "awaiter". This is done by -/// means of an `operator co_await` defined earlier in this file. It -/// essentially wraps our `Future` into a `FutureAwaitable` class -/// also defined above. -/// The C++ coroutine framework will then cal methods on the "awaiter" -/// for events to unfold: First it calls `await_ready` to see if we have -/// to suspend after all. We always return `false` there. -/// Then it calls `await_suspend` to suspend and later `await_resume` to -/// resume. The `FutureAwaitable` class essentially attaches a closure -/// to the `Future` which resumes the coroutine. - -/// The following is the version for return type `Future`, -/// corresponding to coroutines which return nothing. The differences -/// are purely technical (`return_void` instead of `return_value`, -/// basically). 
- -template -struct std_coro::coroutine_traits< - arangodb::futures::Future, Args...> { - struct promise_type { - arangodb::futures::Promise promise; - arangodb::futures::Try result; - arangodb::async_registry::Requester requester; - - promise_type(std::source_location loc = std::source_location::current()) - : promise{std::move(loc)}, - requester{*arangodb::async_registry::get_current_coroutine()} { - *arangodb::async_registry::get_current_coroutine() = {promise.id()}; - } - auto initial_suspend() noexcept { - promise.update_state(arangodb::async_registry::State::Running); - return std_coro::suspend_never{}; - } - auto final_suspend() noexcept { - // TODO use symmetric transfer here - struct awaitable { - bool await_ready() noexcept { return false; } - bool await_suspend(std::coroutine_handle self) noexcept { - *arangodb::async_registry::get_current_coroutine() = - _promise->requester; - // we have to destroy the coroutine frame before - // we resolve the promise - _promise->promise.setTry(std::move(_promise->result)); - return false; - } - void await_resume() noexcept {} - - promise_type* _promise; - }; - - return awaitable{this}; - } - - auto get_return_object() - -> arangodb::futures::Future { - return promise.getFuture(); - } - - auto return_void( - std::source_location loc = std::source_location::current()) noexcept { - promise.update_state(arangodb::async_registry::State::Resolved); - promise.update_source_location(std::move(loc)); - result.emplace(); - } - - auto unhandled_exception() noexcept { - result.set_exception(std::current_exception()); - *arangodb::async_registry::get_current_coroutine() = requester; - } - - template - auto await_transform( - U&& co_awaited_expression, - std::source_location loc = std::source_location::current()) noexcept { - using inner_awaitable_type = decltype(arangodb::get_awaitable_object( - std::forward(co_awaited_expression))); - - struct awaitable { - bool await_ready() { return inner_awaitable.await_ready(); } - auto 
await_suspend(std::coroutine_handle<> handle) { - *arangodb::async_registry::get_current_coroutine() = - outer_promise->requester; - outer_promise->promise.update_state( - arangodb::async_registry::State::Suspended); - return inner_awaitable.await_suspend(handle); - } - auto await_resume() { - auto old_state = outer_promise->promise.update_state( - arangodb::async_registry::State::Running); - if (old_state.has_value() && - old_state.value() == arangodb::async_registry::State::Suspended) { - outer_promise->requester = - *arangodb::async_registry::get_current_coroutine(); - } - *arangodb::async_registry::get_current_coroutine() = { - outer_promise->promise.id()}; - return inner_awaitable.await_resume(); - } - - promise_type* outer_promise; - inner_awaitable_type inner_awaitable; - }; - - // update promises in registry - if constexpr (arangodb::CanUpdateRequester) { - co_awaited_expression.update_requester({promise.id()}); - } - promise.update_source_location(std::move(loc)); - - return awaitable{this, arangodb::get_awaitable_object( - std::forward(co_awaited_expression))}; - } - }; -}; diff --git a/lib/TaskMonitoring/CMakeLists.txt b/lib/TaskMonitoring/CMakeLists.txt new file mode 100644 index 000000000000..fe33944774dd --- /dev/null +++ b/lib/TaskMonitoring/CMakeLists.txt @@ -0,0 +1,9 @@ +add_library(arango_task_registry STATIC + task.cpp) +target_include_directories(arango_task_registry PUBLIC + include) +target_link_libraries(arango_task_registry PUBLIC fmt arango) + +add_library(arango_task_registry_global STATIC + task_registry_variable.cpp) +target_link_libraries(arango_task_registry_global PUBLIC arango_task_registry) diff --git a/lib/TaskMonitoring/include/TaskMonitoring/shared_reference.h b/lib/TaskMonitoring/include/TaskMonitoring/shared_reference.h new file mode 100644 index 000000000000..623de77329ee --- /dev/null +++ b/lib/TaskMonitoring/include/TaskMonitoring/shared_reference.h @@ -0,0 +1,112 @@ 
+////////////////////////////////////////////////////////////////////////////////
+/// DISCLAIMER
+///
+/// Copyright 2014-2024 ArangoDB GmbH, Cologne, Germany
+/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
+///
+/// Licensed under the Business Source License 1.1 (the "License");
+/// you may not use this file except in compliance with the License.
+/// You may obtain a copy of the License at
+///
+///     https://github.com/arangodb/arangodb/blob/devel/LICENSE
+///
+/// Unless required by applicable law or agreed to in writing, software
+/// distributed under the License is distributed on an "AS IS" BASIS,
+/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+/// See the License for the specific language governing permissions and
+/// limitations under the License.
+///
+/// Copyright holder is ArangoDB GmbH, Cologne, Germany
+///
+/// @author Julia Volmer
+////////////////////////////////////////////////////////////////////////////////
+#pragma once
+
+#include <atomic>
+#include <cstdlib>
+#include <functional>
+#include <utility>
+
+/**
+   Reference counting wrapper for a resource
+
+   Destroys itself and calls a custom cleanup function on the resource when the
+   reference count decrements to zero.
+
+   The count starts at zero and the object is only ever released inside
+   decrement(): a Shared that is created but never incremented and
+   decremented again is leaked. Use SharedReference to manage the count.
+ */
+template<typename T>
+struct Shared {
+  /**
+     Allocates a new wrapper around resource with a reference count of zero
+
+     Aborts the process if resource is a nullptr. cleanup is called with the
+     resource when the count drops to zero.
+   */
+  static auto create(T* resource, std::function<void(T*)> cleanup) -> Shared* {
+    if (resource == nullptr) {
+      std::abort();
+    }
+    return new Shared{resource, std::move(cleanup)};
+  }
+  auto get_ref() const -> T& { return *_resource; }
+  auto get() const -> T* { return _resource; }
+  auto increment() -> void { _count.fetch_add(1, std::memory_order_acq_rel); }
+  /**
+     Drops one reference
+
+     The call that takes the count from one to zero runs the cleanup function
+     and deletes this object: the object must not be touched afterwards.
+   */
+  auto decrement() -> void {
+    auto old = _count.fetch_sub(1, std::memory_order_acq_rel);
+    if (old == 1) {
+      _cleanup(_resource);
+      delete this;
+    }
+  }
+  /**
+     Current reference count (can be outdated as soon as it is returned)
+   */
+  auto ref_count() const -> size_t {
+    // a load must not use release ordering, acquire is the matching order
+    return _count.load(std::memory_order_acquire);
+  }
+
+ private:
+  T* _resource;
+  std::function<void(T*)> _cleanup;
+  std::atomic<size_t> _count = 0;
+  Shared(T* node, std::function<void(T*)> cleanup)
+      : _resource{node}, _cleanup{std::move(cleanup)} {}
+};
+
+/**
+   Shared reference to a resource
+
+   Increases reference counter on construction and decreases it on destruction.
+
+   A reference that was moved from is empty: it can only be destroyed or
+   assigned to. Dereferencing it or asking for its ref_count is not allowed.
+ */
+template<typename T>
+struct SharedReference {
+  SharedReference(SharedReference const& other)
+      : _shared_node{other._shared_node} {
+    // other can be an empty (moved-from) reference: nothing to count then
+    if (_shared_node != nullptr) {
+      _shared_node->increment();
+    }
+  }
+  auto operator=(SharedReference const& other) -> SharedReference& {
+    if (_shared_node != other._shared_node) {
+      auto* previous = _shared_node;
+      _shared_node = other._shared_node;
+      if (_shared_node != nullptr) {
+        _shared_node->increment();
+      }
+      // release the old node last: its cleanup could destroy other
+      if (previous != nullptr) {
+        previous->decrement();
+      }
+    }
+    return *this;
+  }
+  SharedReference(SharedReference&& other) noexcept
+      : _shared_node{other._shared_node} {
+    other._shared_node = nullptr;
+  }
+  auto operator=(SharedReference&& other) -> SharedReference& {
+    // a self-move must not drop the only reference
+    if (this != &other) {
+      auto* previous = _shared_node;
+      _shared_node = other._shared_node;
+      other._shared_node = nullptr;
+      if (previous != nullptr) {
+        previous->decrement();
+      }
+    }
+    return *this;
+  }
+  ~SharedReference() {
+    if (_shared_node) {
+      _shared_node->decrement();
+    }
+  }
+  /**
+     Creates an additional reference to an existing wrapper
+
+     Aborts the process if node is a nullptr.
+   */
+  static auto create(Shared<T>* node) -> SharedReference {
+    if (node == nullptr) {
+      std::abort();
+    }
+    return SharedReference{node};
+  }
+  /**
+     Wraps resource into a new Shared and returns the first reference to it
+   */
+  static auto create(T* resource, std::function<void(T*)> cleanup)
+      -> SharedReference {
+    return SharedReference{Shared<T>::create(resource, std::move(cleanup))};
+  }
+  auto operator*() const -> T& { return _shared_node->get_ref(); }
+  auto operator->() const -> T* { return _shared_node->get(); }
+  auto get() const -> T* { return _shared_node->get(); }
+  auto ref_count() const -> size_t { return _shared_node->ref_count(); }
+
+ private:
+  Shared<T>* _shared_node;
+  SharedReference(Shared<T>* node) : _shared_node{node} {
+    _shared_node->increment();
+  }
+};
diff --git a/lib/TaskMonitoring/include/TaskMonitoring/task.h b/lib/TaskMonitoring/include/TaskMonitoring/task.h
new file mode 100644
index 000000000000..7c642f07a616
--- /dev/null
+++ b/lib/TaskMonitoring/include/TaskMonitoring/task.h
@@ -0,0 +1,173 @@
+////////////////////////////////////////////////////////////////////////////////
+/// DISCLAIMER
+///
+/// Copyright 2014-2024 ArangoDB GmbH, Cologne, Germany
+/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany
+///
+/// Licensed under the Business Source License 1.1 (the "License");
+/// you may not use this file except in compliance with the License.
+/// You may obtain a copy of the License at
+///
+///     https://github.com/arangodb/arangodb/blob/devel/LICENSE
+///
+/// Unless required by applicable law or agreed to in writing, software
+/// distributed under the License is distributed on an "AS IS" BASIS,
+/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+/// See the License for the specific language governing permissions and
+/// limitations under the License.
+/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Julia Volmer +//////////////////////////////////////////////////////////////////////////////// +#pragma once + +#include "Containers/Concurrent/ThreadOwnedList.h" +#include "Containers/Concurrent/source_location.h" +#include "Containers/Concurrent/thread.h" +#include "Inspection/Types.h" +#include "fmt/format.h" +#include "shared_reference.h" + +#include +#include +#include +#include +#include + +namespace arangodb::task_monitoring { + +struct RootTask { + bool operator==(RootTask const&) const = default; +}; +template +auto inspect(Inspector& f, RootTask& x) { + return f.object(x).fields(); +} + +struct TaskIdWrapper { + void* id; + bool operator==(TaskIdWrapper const&) const = default; +}; +template +auto inspect(Inspector& f, TaskIdWrapper& x) { + return f.object(x).fields(f.field("id", fmt::format("{}", x.id))); +} + +struct ParentTaskSnapshot : std::variant {}; +template +auto inspect(Inspector& f, ParentTaskSnapshot& x) { + return f.variant(x).unqualified().alternatives( + inspection::inlineType(), + inspection::inlineType()); +} + +enum class State { Created = 0, Running, Finished, Deleted }; +template +auto inspect(Inspector& f, State& x) { + return f.enumeration(x).values(State::Created, "Created", State::Running, + "Running", State::Finished, "Finished", + State::Deleted, "Deleted"); +} + +struct TaskSnapshot { + std::string name; + State state; + void* id; + ParentTaskSnapshot parent; + std::optional thread; + basics::SourceLocationSnapshot source_location; + bool operator==(TaskSnapshot const&) const = default; + auto update_state(State new_state) -> TaskSnapshot& { + state = new_state; + return *this; + } +}; +template +auto inspect(Inspector& f, TaskSnapshot& x) { + return f.object(x).fields(f.field("id", fmt::format("{}", x.id)), + f.field("name", x.name), f.field("state", x.state), + f.field("parent", x.parent), + f.field("thread", x.thread), + f.field("source_location", 
x.source_location)); +} +void PrintTo(const TaskSnapshot& task, std::ostream* os); + +struct Node; +using NodeReference = SharedReference; +struct ParentTask : std::variant {}; + +struct Task; + +/** + The task object inside the registry + */ +struct TaskInRegistry { + using Snapshot = TaskSnapshot; + auto id() -> void* { return this; } + auto snapshot() -> TaskSnapshot; + auto set_to_deleted() -> void { + state.store(State::Deleted, std::memory_order_release); + } + static auto root(std::string name, std::source_location loc) + -> TaskInRegistry { + return TaskInRegistry{.name = std::move(name), + .state = State::Running, + .parent = ParentTask{RootTask{}}, + .running_thread = basics::ThreadId::current(), + .source_location = std::move(loc)}; + } + static auto child(std::string name, NodeReference parent, + std::source_location loc) -> TaskInRegistry { + return TaskInRegistry{.name = std::move(name), + .state = State::Running, + .parent = ParentTask{std::move(parent)}, + .running_thread = basics::ThreadId::current(), + .source_location = std::move(loc)}; + } + + std::string const name; + std::atomic state; + std::atomic isDeleted = false; + ParentTask parent; + std::optional + running_thread; // probably also has to be atomic because + // it changes for scheduled tasks + std::source_location const source_location; + // possibly interesting other properties: + // std::chrono::time_point creation = std:; +}; + +/** + Use inheritance to circumvent problems with unsatisfied constraints for Node + */ +struct Node : public containers::ThreadOwnedList::Node {}; + +struct ChildTask; +/** + This is a scope for an active task. + + It adds an entry to the task registry on construction and sets its + state to finished on destruction. 
+ */ +struct Task { + friend ChildTask; + Task(Task&& other) = delete; + Task& operator=(Task&& other) = delete; + Task(Task const&) = delete; + Task& operator=(Task const&) = delete; + + Task(std::string name, + std::source_location loc = std::source_location::current()); + ~Task(); + + auto id() -> void*; + + private: + Task* parent; + NodeReference _node_in_registry; +}; + +auto get_current_task() -> Task**; + +} // namespace arangodb::task_monitoring diff --git a/lib/TaskMonitoring/include/TaskMonitoring/task_registry_variable.h b/lib/TaskMonitoring/include/TaskMonitoring/task_registry_variable.h new file mode 100644 index 000000000000..7f252726d7f3 --- /dev/null +++ b/lib/TaskMonitoring/include/TaskMonitoring/task_registry_variable.h @@ -0,0 +1,63 @@ +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2014-2024 ArangoDB GmbH, Cologne, Germany +/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany +/// +/// Licensed under the Business Source License 1.1 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// https://github.com/arangodb/arangodb/blob/devel/LICENSE +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. 
+/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Julia Volmer +//////////////////////////////////////////////////////////////////////////////// +#pragma once + +#include "Containers/Concurrent/ListOfNonOwnedLists.h" +#include "Containers/Concurrent/metrics.h" +#include "Containers/Concurrent/ThreadOwnedList.h" +#include "TaskMonitoring/task.h" + +namespace arangodb::task_monitoring { + +using ThreadRegistry = containers::ThreadOwnedList; +struct Registry : public containers::ListOfNonOwnedLists { + // all thread registries that are added to this registry will use these + // metrics + std::shared_ptr metrics; + // metrics-feature is only available after startup, therefore we need to + // update the metrics after construction + // thread registries that are added to the registry before setting the metrics + // properly are not accounted for in the metrics + auto set_metrics(std::shared_ptr new_metrics) -> void { + auto guard = std::lock_guard(_mutex); + metrics = new_metrics; + } +}; + +/** + Global variable that holds all active tasks. + + Includes a list of thread owned lists, one for each initialized + thread. + */ +extern Registry registry; + +/** + Get thread registry of all active tasks on current thread. + + Creates the thread registry when called for the first time and adds it to the + global registry. 
+ */ +auto get_thread_registry() noexcept -> ThreadRegistry&; + +} // namespace arangodb::task_monitoring diff --git a/lib/TaskMonitoring/task.cpp b/lib/TaskMonitoring/task.cpp new file mode 100644 index 000000000000..41de984de286 --- /dev/null +++ b/lib/TaskMonitoring/task.cpp @@ -0,0 +1,132 @@ +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2014-2024 ArangoDB GmbH, Cologne, Germany +/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany +/// +/// Licensed under the Business Source License 1.1 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// https://github.com/arangodb/arangodb/blob/devel/LICENSE +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Julia Volmer +//////////////////////////////////////////////////////////////////////////////// +#include "TaskMonitoring/task.h" + +#include "Assertions/ProdAssert.h" +#include "Containers/Concurrent/source_location.h" +#include "Containers/Concurrent/thread.h" +#include "Inspection/Format.h" +#include "TaskMonitoring/task_registry_variable.h" +#include +#include +#include +#include +#include + +// helper type for the visitor +namespace { +template +struct overloaded : Ts... { + using Ts::operator()...; +}; +template +overloaded(Ts...) 
-> overloaded; +} // namespace + +using namespace arangodb; +using namespace arangodb::task_monitoring; + +void arangodb::task_monitoring::PrintTo(const TaskSnapshot& task, + std::ostream* os) { + *os << task.id << "| " << task.name << " - " << inspection::json(task.parent); + // inspection::json(task); +} + +auto TaskInRegistry::snapshot() -> TaskSnapshot { + return TaskSnapshot{ + .name = name, + .state = state, + .id = id(), + .parent = std::visit( + overloaded{ + [&](RootTask const& root) { return ParentTaskSnapshot{root}; }, + [&](NodeReference const& parent) { + return ParentTaskSnapshot{TaskIdWrapper{parent->data.id()}}; + }}, + parent), + .thread = running_thread, + .source_location = basics::SourceLocationSnapshot{ + .file_name = source_location.file_name(), + .function_name = source_location.function_name(), + .line = source_location.line()}}; +} + +namespace { +auto mark_finished_nodes_for_deletion(Node* node) { + auto current_node = node; + while (true) { + auto specific_node = + reinterpret_cast::Node*>( + current_node); + + // make sure that we don't mark a node twice for deletion + auto expected = false; + if (not specific_node->data.isDeleted.compare_exchange_strong( + expected, true, std::memory_order_acq_rel)) { + break; + } + + auto& parent = specific_node->data.parent; + if (not std::holds_alternative(parent)) { + specific_node->list->mark_for_deletion(specific_node); + break; + } + auto& parent_ref = std::get(parent); + if (parent_ref.ref_count() != 1) { + specific_node->list->mark_for_deletion(specific_node); + break; + } + // node is last reference to parent, therefore it can be marked for deletion + current_node = parent_ref.get(); + + specific_node->list->mark_for_deletion(specific_node); + } +} +} // namespace + +Task::Task(std::string name, std::source_location loc) + : _node_in_registry{NodeReference::create( + reinterpret_cast(get_thread_registry().add([&]() { + if (auto current = *get_current_task(); current != nullptr) { + return 
TaskInRegistry::child( + std::move(name), current->_node_in_registry, std::move(loc)); + } + return TaskInRegistry::root(std::move(name), std::move(loc)); + })), + mark_finished_nodes_for_deletion)} { + parent = *get_current_task(); + *get_current_task() = this; +} + +Task::~Task() { + _node_in_registry->data.state.store(State::Finished, + std::memory_order_relaxed); + *get_current_task() = parent; +} + +auto Task::id() -> void* { return _node_in_registry->data.id(); } + +auto arangodb::task_monitoring::get_current_task() -> Task** { + static thread_local Task* current = nullptr; + return ¤t; +} diff --git a/lib/TaskMonitoring/task_registry_variable.cpp b/lib/TaskMonitoring/task_registry_variable.cpp new file mode 100644 index 000000000000..8b1e02fe2031 --- /dev/null +++ b/lib/TaskMonitoring/task_registry_variable.cpp @@ -0,0 +1,41 @@ +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2014-2024 ArangoDB GmbH, Cologne, Germany +/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany +/// +/// Licensed under the Business Source License 1.1 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// https://github.com/arangodb/arangodb/blob/devel/LICENSE +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. 
+/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Julia Volmer +//////////////////////////////////////////////////////////////////////////////// +#include "TaskMonitoring/task_registry_variable.h" + +namespace arangodb::task_monitoring { + +Registry registry; + +auto get_thread_registry() noexcept -> ThreadRegistry& { + struct ThreadRegistryGuard { + ThreadRegistryGuard() : _registry{ThreadRegistry::make(registry.metrics)} { + registry.add(_registry); + } + + std::shared_ptr _registry; + }; + static thread_local auto registry_guard = ThreadRegistryGuard{}; + return *registry_guard._registry; +} + +} // namespace arangodb::task_monitoring diff --git a/tests/Async/Registry/RegistryTest.cpp b/tests/Async/Registry/RegistryTest.cpp index b75bde7d366d..b9a7258d34df 100644 --- a/tests/Async/Registry/RegistryTest.cpp +++ b/tests/Async/Registry/RegistryTest.cpp @@ -40,11 +40,11 @@ auto promises_in_registry() -> std::vector { } struct MyPromise : public AddToAsyncRegistry { - SourceLocationSnapshot source_location; + basics::SourceLocationSnapshot source_location; basics::ThreadId thread; MyPromise(std::source_location loc = std::source_location::current()) : AddToAsyncRegistry{loc}, - source_location{SourceLocationSnapshot::from(std::move(loc))}, + source_location{basics::SourceLocationSnapshot::from(std::move(loc))}, thread{basics::ThreadId::current()} {} auto snapshot(State state = State::Running) -> PromiseSnapshot { return PromiseSnapshot{.id = id(), diff --git a/tests/AsyncRegistryServer/CMakeLists.txt b/tests/AsyncRegistryServer/CMakeLists.txt deleted file mode 100644 index f84891c944f2..000000000000 --- a/tests/AsyncRegistryServer/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -add_library(arango_tests_async_registry_server OBJECT - StacktraceTest.cpp) -target_link_libraries(arango_tests_async_registry_server PRIVATE - arango_async_registry_stacktrace - gtest) - -add_executable(arangodbtests_async_registry_server EXCLUDE_FROM_ALL) 
-target_link_libraries(arangodbtests_async_registry_server - arango_tests_async_registry_server - gtest_main) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e1163282254b..2877eeacf193 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -327,10 +327,11 @@ endif() target_link_libraries(arangodbtests arango - arango_tests_async_registry_server + arango_tests_forest arango_tests_basics arango_tests_replication2 arango_tests_replication2_pure + arango_tests_task_registry arango_tests_graph arango_tests_futures arango_tests_zkd @@ -419,7 +420,7 @@ endforeach() add_subdirectory(Actor) add_subdirectory(Async) -add_subdirectory(AsyncRegistryServer) add_subdirectory(Containers) add_subdirectory(sepp) add_subdirectory(VocBase/Properties) +add_subdirectory(TaskMonitoring) diff --git a/tests/Containers/CMakeLists.txt b/tests/Containers/CMakeLists.txt index 3f37c693cf96..2e209eb443d0 100644 --- a/tests/Containers/CMakeLists.txt +++ b/tests/Containers/CMakeLists.txt @@ -1 +1,12 @@ add_subdirectory(Concurrent) + +add_library(arango_tests_forest OBJECT + ForestTest.cpp) +target_link_libraries(arango_tests_forest PRIVATE + arango_forest + gtest) + +add_executable(arangodbtests_forest EXCLUDE_FROM_ALL) +target_link_libraries(arangodbtests_forest + arango_tests_forest + gtest_main) diff --git a/tests/AsyncRegistryServer/StacktraceTest.cpp b/tests/Containers/ForestTest.cpp similarity index 92% rename from tests/AsyncRegistryServer/StacktraceTest.cpp rename to tests/Containers/ForestTest.cpp index 6b324db89b7c..d1d1043fa23f 100644 --- a/tests/AsyncRegistryServer/StacktraceTest.cpp +++ b/tests/Containers/ForestTest.cpp @@ -20,14 +20,14 @@ /// /// @author Julia Volmer //////////////////////////////////////////////////////////////////////////////// -#include "AsyncRegistryServer/Stacktrace/forest.h" -#include "AsyncRegistryServer/Stacktrace/depth_first.h" +#include "Containers/Forest/forest.h" +#include "Containers/Forest/depth_first.h" #include -using namespace 
arangodb::async_registry; +using namespace arangodb::containers; -TEST(AsyncRegistryStacktraceTest, insert_nodes_into_forest) { +TEST(ForestTest, insert_nodes_into_forest) { Forest forest; forest.insert((void*)32, (void*)1, "first"); @@ -45,7 +45,7 @@ TEST(AsyncRegistryStacktraceTest, insert_nodes_into_forest) { ASSERT_EQ(forest.node((void*)1), std::nullopt); } -TEST(AsyncRegistryStacktraceTest, index_forest) { +TEST(ForestTest, index_forest) { Forest forest; forest.insert((void*)1, (void*)2, "first"); forest.insert((void*)2, (void*)4, "second"); @@ -66,7 +66,7 @@ TEST(AsyncRegistryStacktraceTest, index_forest) { ASSERT_EQ(forest, (Forest{{}, {}, {}})); } -TEST(AsyncRegistryStacktraceTest, executes_post_ordered_depth_first) { +TEST(ForestTest, executes_post_ordered_depth_first) { Forest forest; forest.insert((void*)1, (void*)0, "root"); forest.insert((void*)2, (void*)1, "node"); diff --git a/tests/Futures/CMakeLists.txt b/tests/Futures/CMakeLists.txt index bd2607d16f19..63c2c7247dbb 100644 --- a/tests/Futures/CMakeLists.txt +++ b/tests/Futures/CMakeLists.txt @@ -5,6 +5,7 @@ add_library(arango_tests_futures OBJECT TryTest.cpp) target_link_libraries(arango_tests_futures PRIVATE + arango arango_futures gtest velocypack diff --git a/tests/Futures/FutureCoroutineTest.cpp b/tests/Futures/FutureCoroutineTest.cpp index 43db4f01b49a..78e842ecdc2e 100644 --- a/tests/Futures/FutureCoroutineTest.cpp +++ b/tests/Futures/FutureCoroutineTest.cpp @@ -10,6 +10,7 @@ #include +using namespace arangodb; using namespace arangodb::futures; namespace { @@ -107,7 +108,7 @@ auto expect_all_promises_in_state(arangodb::async_registry::State state, } // namespace template -struct FutureCoroutineTest : ::testing::Test { +struct FutureTest : ::testing::Test { void SetUp() override { arangodb::async_registry::get_thread_registry().garbage_collect(); EXPECT_TRUE(std::holds_alternative( @@ -123,9 +124,9 @@ struct FutureCoroutineTest : ::testing::Test { }; using MyTypes = ::testing::Types; 
-TYPED_TEST_SUITE(FutureCoroutineTest, MyTypes); +TYPED_TEST_SUITE(FutureTest, MyTypes); -TYPED_TEST(FutureCoroutineTest, promises_in_async_registry_know_their_state) { +TYPED_TEST(FutureTest, promises_in_async_registry_know_their_state) { { auto coro = [&]() -> Future { co_await this->wait; @@ -162,7 +163,7 @@ auto find_promise_by_name(std::string_view name) } // namespace TYPED_TEST( - FutureCoroutineTest, + FutureTest, promises_in_async_registry_know_their_requester_with_nested_coroutines) { using TestType = decltype(this); struct Functions { @@ -231,7 +232,7 @@ TYPED_TEST( this->wait.await(); } -TYPED_TEST(FutureCoroutineTest, +TYPED_TEST(FutureTest, promises_in_async_registry_know_their_requester_with_move) { using TestType = decltype(this); struct Functions { @@ -279,3 +280,73 @@ TYPED_TEST(FutureCoroutineTest, this->wait.resume(); this->wait.await(); } + +struct ExecContext_Waiting : public arangodb::ExecContext { + ExecContext_Waiting() + : arangodb::ExecContext(arangodb::ExecContext::ConstructorToken{}, + arangodb::ExecContext::Type::Default, "Waiting", + "", arangodb::auth::Level::RW, + arangodb::auth::Level::NONE, true) {} +}; +struct ExecContext_Calling : public arangodb::ExecContext { + ExecContext_Calling() + : arangodb::ExecContext(arangodb::ExecContext::ConstructorToken{}, + arangodb::ExecContext::Type::Default, "Calling", + "", arangodb::auth::Level::RW, + arangodb::auth::Level::NONE, true) {} +}; +struct ExecContext_Begin : public arangodb::ExecContext { + ExecContext_Begin() + : arangodb::ExecContext(arangodb::ExecContext::ConstructorToken{}, + arangodb::ExecContext::Type::Default, "Begin", "", + arangodb::auth::Level::RW, + arangodb::auth::Level::NONE, true) {} +}; +struct ExecContext_End : public arangodb::ExecContext { + ExecContext_End() + : arangodb::ExecContext(arangodb::ExecContext::ConstructorToken{}, + arangodb::ExecContext::Type::Default, "End", "", + arangodb::auth::Level::RW, + arangodb::auth::Level::NONE, true) {} +}; 
+TYPED_TEST(FutureTest, execution_context_is_local_to_coroutine) { + ExecContextScope exec(std::make_shared()); + EXPECT_EQ(ExecContext::current().user(), "Begin"); + + auto waiting_coro = [&]() -> Future { + EXPECT_EQ(ExecContext::current().user(), "Begin"); + ExecContextScope exec(std::make_shared()); + EXPECT_EQ(ExecContext::current().user(), "Waiting"); + co_await this->wait; + EXPECT_EQ(ExecContext::current().user(), "Waiting"); + co_return; + }(); + EXPECT_EQ(ExecContext::current().user(), "Begin"); + + auto trivial_coro = []() -> Future { + EXPECT_EQ(ExecContext::current().user(), "Begin"); + co_return; + }(); + + auto calling_coro = [&]() -> Future { + EXPECT_EQ(ExecContext::current().user(), "Begin"); + ExecContextScope exec(std::make_shared()); + EXPECT_EQ(ExecContext::current().user(), "Calling"); + co_await std::move(waiting_coro); + EXPECT_EQ(ExecContext::current().user(), "Calling"); + co_await std::move(trivial_coro); + EXPECT_EQ(ExecContext::current().user(), "Calling"); + co_return; + }; + EXPECT_EQ(ExecContext::current().user(), "Begin"); + + std::ignore = calling_coro(); + EXPECT_EQ(ExecContext::current().user(), "Begin"); + + ExecContextScope new_exec(std::make_shared()); + EXPECT_EQ(ExecContext::current().user(), "End"); + + this->wait.resume(); + this->wait.await(); + EXPECT_EQ(ExecContext::current().user(), "End"); +} diff --git a/tests/Maintenance/MaintenanceFeatureTest.cpp b/tests/Maintenance/MaintenanceFeatureTest.cpp index 1b258c9fa3d6..95241ee1e700 100644 --- a/tests/Maintenance/MaintenanceFeatureTest.cpp +++ b/tests/Maintenance/MaintenanceFeatureTest.cpp @@ -31,7 +31,7 @@ #include "ApplicationFeatures/GreetingsFeaturePhase.h" #include "Basics/Result.h" #include "Basics/ScopeGuard.h" -#include "Cluster/Action.h" +#include "Cluster/MaintenanceActions/Action.h" #include "Cluster/ClusterFeature.h" #include "Cluster/Maintenance.h" #include "Cluster/MaintenanceFeature.h" diff --git a/tests/Maintenance/MaintenanceTest.cpp 
b/tests/Maintenance/MaintenanceTest.cpp index 796f1271e1a6..48c62e0c9dda 100644 --- a/tests/Maintenance/MaintenanceTest.cpp +++ b/tests/Maintenance/MaintenanceTest.cpp @@ -35,7 +35,7 @@ #include "Basics/StaticStrings.h" #include "Cluster/Maintenance.h" #include "Cluster/MaintenanceFeature.h" -#include "Cluster/ResignShardLeadership.h" +#include "Cluster/MaintenanceActions/ResignShardLeadership.h" #include "Metrics/MetricsFeature.h" #include "Mocks/Servers.h" #include "Mocks/StorageEngineMock.h" diff --git a/tests/Mocks/Servers.cpp b/tests/Mocks/Servers.cpp index 1537ddf02223..2aec90020011 100644 --- a/tests/Mocks/Servers.cpp +++ b/tests/Mocks/Servers.cpp @@ -40,13 +40,13 @@ #include "Basics/StringUtils.h" #include "Basics/TimeString.h" #include "Basics/files.h" -#include "Cluster/ActionDescription.h" +#include "Cluster/MaintenanceActions/ActionDescription.h" #include "Cluster/AgencyCache.h" #include "Cluster/ClusterFeature.h" #include "Cluster/ClusterInfo.h" -#include "Cluster/CreateCollection.h" -#include "Cluster/CreateDatabase.h" -#include "Cluster/DropDatabase.h" +#include "Cluster/MaintenanceActions/CreateCollection.h" +#include "Cluster/MaintenanceActions/CreateDatabase.h" +#include "Cluster/MaintenanceActions/DropDatabase.h" #include "Cluster/Maintenance.h" #include "ClusterEngine/ClusterEngine.h" #include "FeaturePhases/AqlFeaturePhase.h" diff --git a/tests/TaskMonitoring/CMakeLists.txt b/tests/TaskMonitoring/CMakeLists.txt new file mode 100644 index 000000000000..1b7fe3e86702 --- /dev/null +++ b/tests/TaskMonitoring/CMakeLists.txt @@ -0,0 +1,12 @@ +add_library(arango_tests_task_registry OBJECT + TaskRegistryTest.cpp) +target_link_libraries(arango_tests_task_registry PRIVATE + arango_task_registry_global + gtest +) + +add_executable(arangodbtests_task_registry EXCLUDE_FROM_ALL) +target_link_libraries(arangodbtests_task_registry + arango_tests_task_registry + gtest_main) + diff --git a/tests/TaskMonitoring/TaskRegistryTest.cpp 
b/tests/TaskMonitoring/TaskRegistryTest.cpp new file mode 100644 index 000000000000..3897ecc3622a --- /dev/null +++ b/tests/TaskMonitoring/TaskRegistryTest.cpp @@ -0,0 +1,512 @@ +//////////////////////////////////////////////////////////////////////////////// +/// DISCLAIMER +/// +/// Copyright 2014-2024 ArangoDB GmbH, Cologne, Germany +/// Copyright 2004-2014 triAGENS GmbH, Cologne, Germany +/// +/// Licensed under the Business Source License 1.1 (the "License"); +/// you may not use this file except in compliance with the License. +/// You may obtain a copy of the License at +/// +/// https://github.com/arangodb/arangodb/blob/devel/LICENSE +/// +/// Unless required by applicable law or agreed to in writing, software +/// distributed under the License is distributed on an "AS IS" BASIS, +/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +/// See the License for the specific language governing permissions and +/// limitations under the License. +/// +/// Copyright holder is ArangoDB GmbH, Cologne, Germany +/// +/// @author Julia Volmer +//////////////////////////////////////////////////////////////////////////////// + +#include "Async/async.h" +#include "Containers/Concurrent/thread.h" +#include "TaskMonitoring/task.h" +#include "TaskMonitoring/task_registry_variable.h" +#include "Inspection/JsonPrintInspector.h" +#include + +#include +#include + +using namespace arangodb; +using namespace arangodb::task_monitoring; + +namespace { +auto get_all_tasks() -> std::vector { + std::vector tasks; + registry.for_node( + [&](TaskSnapshot task) { tasks.emplace_back(std::move(task)); }); + return tasks; +} + +struct MyTask : public Task { + basics::SourceLocationSnapshot source_location; + MyTask(std::string name, + std::source_location loc = std::source_location::current()) + : Task{std::move(name), loc}, + source_location{basics::SourceLocationSnapshot::from(std::move(loc))} {} +}; + +} // namespace + +struct TaskRegistryTest : ::testing::Test { + 
void TearDown() override { + // garbage collection has to run at most twice in order to clean everything + // up on the current thread: + // - when a child task scope is deleted, the child's task-in-registry is + // marked for deletion + // - at this point its parent task scope can still exist, therefore it is + // not marked for deletion inside the child task scope destructor + // - when then the parent task scope is deleted, the parent's + // task-in-registry is still referenced by the child's task-in-registry + // (which is not yet deleted), therefore it is not yet marked for deletion + + // the first gc run destroys the child's task-in-registry + // which destroys the last reference to the parent's task-in-registry, which + // is therefore marked for deletion (together with all remaining + // task-in-registries higher up in the hierarchy that are not referenced by + // any other tasks) + get_thread_registry().garbage_collect(); + // the second gc run destroys the parent's task-in-registry (and possibly + // other marked for deletion items) + get_thread_registry().garbage_collect(); + EXPECT_EQ(get_all_tasks().size(), 0); + } +}; + +TEST_F(TaskRegistryTest, a_base_task_creates_a_root_task) { + auto task = MyTask{"test task"}; + + EXPECT_EQ(get_all_tasks(), (std::vector{(TaskSnapshot{ + .name = "test task", + .state = State::Running, + .id = task.id(), + .parent = {RootTask{}}, + .thread = basics::ThreadId::current(), + .source_location = task.source_location})})); +} + +TEST_F(TaskRegistryTest, creates_a_child_task) { + auto parent_task = MyTask{"parent task"}; + auto child_task = MyTask{"child task"}; + + EXPECT_EQ( + get_all_tasks(), + (std::vector{ + (TaskSnapshot{.name = "child task", + .state = State::Running, + .id = child_task.id(), + .parent = {TaskIdWrapper{parent_task.id()}}, + .thread = basics::ThreadId::current(), + .source_location = child_task.source_location}), + (TaskSnapshot{.name = "parent task", + .state = State::Running, + .id = 
parent_task.id(), + .parent = {RootTask{}}, + .thread = basics::ThreadId::current(), + .source_location = parent_task.source_location})})); +} + +TEST_F(TaskRegistryTest, creates_a_child_task_hierarchy) { + auto parent_task = MyTask{"parent task"}; + auto child_task = MyTask{"child task"}; + auto child_of_child_task = MyTask{"child of child task"}; + auto child_of_child_of_child_task = MyTask{"child of child of child task"}; + + EXPECT_EQ( + get_all_tasks(), + (std::vector{ + (TaskSnapshot{ + .name = "child of child of child task", + .state = State::Running, + .id = child_of_child_of_child_task.id(), + .parent = {TaskIdWrapper{child_of_child_task.id()}}, + .thread = basics::ThreadId::current(), + .source_location = child_of_child_of_child_task.source_location}), + (TaskSnapshot{ + .name = "child of child task", + .state = State::Running, + .id = child_of_child_task.id(), + .parent = {TaskIdWrapper{child_task.id()}}, + .thread = basics::ThreadId::current(), + .source_location = child_of_child_task.source_location}), + (TaskSnapshot{.name = "child task", + .state = State::Running, + .id = child_task.id(), + .parent = {TaskIdWrapper{parent_task.id()}}, + .thread = basics::ThreadId::current(), + .source_location = child_task.source_location}), + (TaskSnapshot{.name = "parent task", + .state = State::Running, + .id = parent_task.id(), + .parent = {RootTask{}}, + .thread = basics::ThreadId::current(), + .source_location = parent_task.source_location})})); +} + +TEST_F(TaskRegistryTest, uses_correct_parent_task) { + auto parent_task = MyTask{"parent task"}; + { + auto first_child_task = MyTask{"first child task"}; + + EXPECT_EQ( + get_all_tasks(), + (std::vector{ + (TaskSnapshot{.name = "first child task", + .state = State::Running, + .id = first_child_task.id(), + .parent = {TaskIdWrapper{parent_task.id()}}, + .thread = basics::ThreadId::current(), + .source_location = first_child_task.source_location}), + (TaskSnapshot{.name = "parent task", + .state = State::Running, + 
.id = parent_task.id(), + .parent = {RootTask{}}, + .thread = basics::ThreadId::current(), + .source_location = parent_task.source_location})})); + } + get_thread_registry().garbage_collect(); + + auto second_child_task = MyTask{"second child task"}; + + EXPECT_EQ( + get_all_tasks(), + (std::vector{ + (TaskSnapshot{.name = "second child task", + .state = State::Running, + .id = second_child_task.id(), + .parent = {TaskIdWrapper{parent_task.id()}}, + .thread = basics::ThreadId::current(), + .source_location = second_child_task.source_location}), + (TaskSnapshot{.name = "parent task", + .state = State::Running, + .id = parent_task.id(), + .parent = {RootTask{}}, + .thread = basics::ThreadId::current(), + .source_location = parent_task.source_location})})); +} + +struct WaitSlot { + void resume() { + ready = true; + _continuation.resume(); + } + + std::coroutine_handle<> _continuation; + + bool await_ready() { return ready; } + void await_resume() {} + void await_suspend(std::coroutine_handle<> continuation) { + _continuation = continuation; + } + + bool ready = false; +}; + +TEST_F(TaskRegistryTest, a_base_task_lives_as_long_as_its_child) { + WaitSlot wait; + TaskSnapshot parent_task_snapshot; + TaskSnapshot child_task_snapshot; + { + auto parent_task = MyTask{"parent task"}; + + auto tasks_in_registry = get_all_tasks(); + EXPECT_EQ(tasks_in_registry.size(), 1); + EXPECT_EQ(tasks_in_registry[0], + (TaskSnapshot{.name = "parent task", + .state = State::Running, + .id = parent_task.id(), + .parent = {RootTask{}}, + .thread = basics::ThreadId::current(), + .source_location = parent_task.source_location})); + parent_task_snapshot = tasks_in_registry[0]; + + std::ignore = [&wait, parent_task_snapshot, + &child_task_snapshot]() -> async { + auto child_task = MyTask{"child task"}; + + auto tasks_in_registry = get_all_tasks(); + EXPECT_EQ(tasks_in_registry.size(), 2); + EXPECT_EQ( + tasks_in_registry[0], + (TaskSnapshot{.name = "child task", + .state = State::Running, + .id 
= child_task.id(), + .parent = {TaskIdWrapper{parent_task_snapshot.id}}, + .thread = basics::ThreadId::current(), + .source_location = child_task.source_location})); + child_task_snapshot = tasks_in_registry[0]; + EXPECT_EQ(tasks_in_registry[1], parent_task_snapshot); + co_await wait; + co_return; + }(); + } + + // both task-in-registries still exist: + // child lives in suspended coroutine and references parent + // although parent scope is deleted + get_thread_registry().garbage_collect(); // does not do anything + EXPECT_EQ(get_all_tasks(), + (std::vector{ + child_task_snapshot, + parent_task_snapshot.update_state(State::Finished)})); + + // resume coroutine, mark child for deletion at end of coroutine and mark + // parent for deletion at end of scope + wait.resume(); +} + +TEST_F(TaskRegistryTest, create_another_task_after_child_suspended) { + WaitSlot wait; + TaskSnapshot parent_task_snapshot; + TaskSnapshot child_task_snapshot; + { + auto parent_task = MyTask{"parent task"}; + + auto tasks_in_registry = get_all_tasks(); + EXPECT_EQ(tasks_in_registry.size(), 1); + EXPECT_EQ(tasks_in_registry[0], + (TaskSnapshot{.name = "parent task", + .state = State::Running, + .id = parent_task.id(), + .parent = {RootTask{}}, + .thread = basics::ThreadId::current(), + .source_location = parent_task.source_location})); + parent_task_snapshot = tasks_in_registry[0]; + + std::ignore = [&wait, parent_task_snapshot, + &child_task_snapshot]() -> async { + auto child_task = MyTask{"child task"}; + + auto tasks_in_registry = get_all_tasks(); + EXPECT_EQ(tasks_in_registry.size(), 2); + EXPECT_EQ( + tasks_in_registry[0], + (TaskSnapshot{.name = "child task", + .state = State::Running, + .id = child_task.id(), + .parent = {TaskIdWrapper{parent_task_snapshot.id}}, + .thread = basics::ThreadId::current(), + .source_location = child_task.source_location})); + child_task_snapshot = tasks_in_registry[0]; + EXPECT_EQ(tasks_in_registry[1], parent_task_snapshot); + co_await wait; + co_return; 
+ }(); + + auto some_other_task = MyTask{"some other task"}; + + EXPECT_EQ( + get_all_tasks(), + (std::vector{ + (TaskSnapshot{.name = "some other task", + .state = State::Running, + .id = some_other_task.id(), + .parent = {TaskIdWrapper{parent_task.id()}}, + .thread = basics::ThreadId::current(), + .source_location = some_other_task.source_location}), + child_task_snapshot, parent_task_snapshot})); + } + + auto another_task = MyTask{"another task"}; + + get_thread_registry().garbage_collect(); // deletes some_other_task + EXPECT_EQ(get_all_tasks(), + (std::vector{ + (TaskSnapshot{.name = "another task", + .state = State::Running, + .id = another_task.id(), + .parent = {RootTask{}}, + .thread = basics::ThreadId::current(), + .source_location = another_task.source_location}), + child_task_snapshot, + parent_task_snapshot.update_state(State::Finished)})); + + // resume coroutine, mark child for deletion at end of coroutine and mark + // parent for deletion at end of scope + wait.resume(); +} + +TEST_F(TaskRegistryTest, hierarchy_with_different_scopes) { + WaitSlot wait; + TaskSnapshot parent_task_snapshot; + TaskSnapshot child_task_snapshot; + TaskSnapshot child_of_child_task_snapshot; + { + auto parent_task = MyTask{"parent task"}; + auto tasks_in_registry = get_all_tasks(); + EXPECT_EQ(tasks_in_registry.size(), 1); + EXPECT_EQ(tasks_in_registry[0], + (TaskSnapshot{.name = "parent task", + .state = State::Running, + .id = parent_task.id(), + .parent = {RootTask{}}, + .thread = basics::ThreadId::current(), + .source_location = parent_task.source_location})); + parent_task_snapshot = tasks_in_registry[0]; + + std::ignore = [&wait, parent_task_snapshot, &child_task_snapshot, + &child_of_child_task_snapshot]() -> async { + auto child_task = MyTask{"child task"}; + + auto tasks_in_registry = get_all_tasks(); + EXPECT_EQ(tasks_in_registry.size(), 2); + EXPECT_EQ( + tasks_in_registry[0], + (TaskSnapshot{.name = "child task", + .state = State::Running, + .id = 
child_task.id(), + .parent = {TaskIdWrapper{parent_task_snapshot.id}}, + .thread = basics::ThreadId::current(), + .source_location = child_task.source_location})); + child_task_snapshot = tasks_in_registry[0]; + EXPECT_EQ(tasks_in_registry[1], parent_task_snapshot); + + co_await [&wait, parent_task_snapshot, child_task_snapshot, + &child_of_child_task_snapshot]() -> async { + auto child_of_child_task = MyTask{"child of child task"}; + + auto tasks_in_registry = get_all_tasks(); + EXPECT_EQ(tasks_in_registry.size(), 3); + EXPECT_EQ(tasks_in_registry[0], + (TaskSnapshot{ + .name = "child of child task", + .state = State::Running, + .id = child_of_child_task.id(), + .parent = {TaskIdWrapper{child_task_snapshot.id}}, + .thread = basics::ThreadId::current(), + .source_location = child_of_child_task.source_location})); + child_of_child_task_snapshot = tasks_in_registry[0]; + EXPECT_EQ(tasks_in_registry[1], child_task_snapshot); + EXPECT_EQ(tasks_in_registry[2], parent_task_snapshot); + + co_await wait; + co_return; + }(); + + co_return; + }(); + } + + // all three task-in-registries still exist: + // children live in suspended coroutines and reference their parents + get_thread_registry().garbage_collect(); // does not do anything + EXPECT_EQ(get_all_tasks(), + (std::vector{ + child_of_child_task_snapshot, child_task_snapshot, + parent_task_snapshot.update_state(State::Finished)})); + + // resume coroutine, mark child of child for deletion, child and parent are + // marked for deletion when child of child is destroyed + wait.resume(); +} + +TEST_F(TaskRegistryTest, + a_base_task_lives_as_long_as_its_longest_living_child) { + WaitSlot first_wait; + WaitSlot second_wait; + TaskSnapshot parent_task_snapshot; + TaskSnapshot first_child_task_snapshot; + TaskSnapshot second_child_task_snapshot; + TaskSnapshot child_of_second_child_task_snapshot; + { + auto parent_task = MyTask{"parent task"}; + auto tasks_in_registry = get_all_tasks(); + EXPECT_EQ(tasks_in_registry.size(), 1); + 
EXPECT_EQ(tasks_in_registry[0], + (TaskSnapshot{.name = "parent task", + .state = State::Running, + .id = parent_task.id(), + .parent = {RootTask{}}, + .thread = basics::ThreadId::current(), + .source_location = parent_task.source_location})); + parent_task_snapshot = tasks_in_registry[0]; + + std::ignore = [&first_wait, parent_task_snapshot, + &first_child_task_snapshot]() -> async { + auto child_task = MyTask{"first child task"}; + + auto tasks_in_registry = get_all_tasks(); + EXPECT_EQ(tasks_in_registry.size(), 2); + EXPECT_EQ( + tasks_in_registry[0], + (TaskSnapshot{.name = "first child task", + .state = State::Running, + .id = child_task.id(), + .parent = {TaskIdWrapper{parent_task_snapshot.id}}, + .thread = basics::ThreadId::current(), + .source_location = child_task.source_location})); + first_child_task_snapshot = tasks_in_registry[0]; + EXPECT_EQ(tasks_in_registry[1], parent_task_snapshot); + + co_await first_wait; + co_return; + }(); + + auto second_child_task = MyTask{"second child task"}; + tasks_in_registry = get_all_tasks(); + EXPECT_EQ(tasks_in_registry.size(), 3); + EXPECT_EQ( + tasks_in_registry[0], + (TaskSnapshot{.name = "second child task", + .state = State::Running, + .id = second_child_task.id(), + .parent = {TaskIdWrapper{parent_task_snapshot.id}}, + .thread = basics::ThreadId::current(), + .source_location = second_child_task.source_location})); + EXPECT_EQ(tasks_in_registry[1], first_child_task_snapshot); + EXPECT_EQ(tasks_in_registry[2], parent_task_snapshot); + second_child_task_snapshot = tasks_in_registry[0]; + + std::ignore = [&second_wait, parent_task_snapshot, + first_child_task_snapshot, second_child_task_snapshot, + &child_of_second_child_task_snapshot]() -> async { + auto child_of_child_task = MyTask{"child of second child task"}; + + auto tasks_in_registry = get_all_tasks(); + EXPECT_EQ(tasks_in_registry.size(), 4); + EXPECT_EQ(tasks_in_registry[0], + (TaskSnapshot{ + .name = "child of second child task", + .state = 
State::Running, + .id = child_of_child_task.id(), + .parent = {TaskIdWrapper{second_child_task_snapshot.id}}, + .thread = basics::ThreadId::current(), + .source_location = child_of_child_task.source_location})); + child_of_second_child_task_snapshot = tasks_in_registry[0]; + EXPECT_EQ(tasks_in_registry[1], second_child_task_snapshot); + EXPECT_EQ(tasks_in_registry[2], first_child_task_snapshot); + EXPECT_EQ(tasks_in_registry[3], parent_task_snapshot); + + co_await second_wait; + co_return; + }(); + } + + // all four task-in-registries still exist: + // children live in suspended coroutines and reference their parents + get_thread_registry().garbage_collect(); // does not do anything + EXPECT_EQ(get_all_tasks(), + (std::vector{ + child_of_second_child_task_snapshot, + second_child_task_snapshot.update_state(State::Finished), + first_child_task_snapshot, + parent_task_snapshot.update_state(State::Finished)})); + + // marks child of second child and second child for deletion, parent is still + // referenced by first child + second_wait.resume(); + get_thread_registry().garbage_collect(); + EXPECT_EQ(get_all_tasks(), + (std::vector{ + first_child_task_snapshot, + parent_task_snapshot.update_state(State::Finished)})); + + // marks first child and parent for deletion + first_wait.resume(); + get_thread_registry().garbage_collect(); + EXPECT_EQ(get_all_tasks().size(), 0); +}