diff --git a/.gitignore b/.gitignore
index 003cb9a1743cb674846ffadc7d850480227e07bf..fc8082d8b494800577d5951d3d104a872ac17a43 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ docs/html
 /build*
 /.vs
 /CMakeSettings.json
+.vscode/*
diff --git a/.vscode/launch.json b/.vscode/launch.json
index c86bce66099f93f2a10bf28bb18c8a0b3be623df..da7ac6001e96c9e5fb6c91597bc3db3c5eb0abcd 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -5,12 +5,16 @@
     "version": "0.2.0",
     "configurations": [
         {
-            "name": "g++ - Debug Node",
+            "name": "(gdb) Launch",
             "type": "cppdbg",
             "request": "launch",
+            "program": "${command:cmake.launchTargetPath}",
             "args": [],
             "stopAtEntry": false,
             "cwd": "${workspaceFolder}/build",
+            "environment": [
+                {"name" : "ASAN_OPTIONS", "value" : "abort_on_error=1,protect_shadow_gap=0"}
+            ],
             "externalConsole": false,
             "MIMode": "gdb",
             "setupCommands": [
@@ -18,15 +22,44 @@
                     "description": "Enable pretty-printing for gdb",
                     "text": "-enable-pretty-printing",
                     "ignoreFailures": true
+                },
+                {
+                    "description": "Set Disassembly Flavor to Intel",
+                    "text": "-gdb-set disassembly-flavor intel",
+                    "ignoreFailures": true
                 }
             ],
-            "miDebuggerPath": "/usr/bin/gdb",
             "sourceFileMap": {
                 "${workspaceFolder}": {	
                     "editorPath": "${workspaceFolder}",
                     "useForBreakpoints": "true"
                 }
-            }            
+            }
+        },
+        {
+            "name": "(gdb) Attach",
+            "type": "cppdbg",
+            "request": "attach",
+            "program": "${command:cmake.launchTargetPath}",
+            "MIMode": "gdb",
+            "setupCommands": [
+                {
+                    "description": "Enable pretty-printing for gdb",
+                    "text": "-enable-pretty-printing",
+                    "ignoreFailures": true
+                },
+                {
+                    "description": "Set Disassembly Flavor to Intel",
+                    "text": "-gdb-set disassembly-flavor intel",
+                    "ignoreFailures": true
+                }
+            ],
+            "sourceFileMap": {
+                "${workspaceFolder}": {
+                    "editorPath": "${workspaceFolder}",
+                    "useForBreakpoints": "true"
+                }
+            }
         }
     ]
 }
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 713f547df23addb0c74a2d6fe5b32910b176613b..c02bff1583ead57cd94650c2d90a4118d5c1ee91 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -66,7 +66,18 @@
         "valarray": "cpp",
         "variant": "cpp",
         "any": "cpp",
-        "complex": "cpp"
+        "complex": "cpp",
+        "csignal": "cpp",
+        "compare": "cpp",
+        "concepts": "cpp",
+        "numbers": "cpp",
+        "ranges": "cpp",
+        "span": "cpp",
+        "stop_token": "cpp",
+        "typeindex": "cpp",
+        "semaphore": "cpp",
+        "*.ipp": "cpp"
     },
-    "cmake.cmakePath": "cmake"
+    "cmake.cmakePath": "cmake",
+    "cmake.configureOnOpen": true
 }
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c7656ec7d0d907f108fc257702f320c557a2c2ee..5b5605d9bbbf9fc16fb36082af27eb4ff1465726 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,7 @@ cmake_minimum_required (VERSION 3.18.0)
 include (CheckIncludeFile)
 include (CheckIncludeFileCXX)
 include (CheckFunctionExists)
+include (FetchContent)
 
 if (NOT FTL_VERSION)
 	set(FTL_VERSION 0.0.1)
@@ -18,6 +19,8 @@ option(WITH_GNUTLS "Enable TLS support" ON)
 option(USE_CPPCHECK "Apply cppcheck during build" ON)
 option(BUILD_TESTS "Compile all unit and integration tests" ON)
 option(BUILD_EXAMPLES "Compile the examples" ON)
+option(ENABLE_PROFILER "Enable builtin performance profiling" OFF)
+option(DEBUG_LOCKS "Enable lock profiling (requires ENABLE_PROFILER)" OFF)
 
 if (NOT WIN32)
 	option(WITH_PYTHON "Enable python support" ON)
@@ -98,6 +101,7 @@ if (USE_CPPCHECK)
 	endif()
 endif()
 
+include(ftl_tracy)
 include(git_version)
 include(ftl_paths)
 
@@ -177,10 +181,12 @@ target_include_directories(beyond-common PUBLIC
 	$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
 	$<INSTALL_INTERFACE:include>)
 
+target_link_libraries(beyond-common PUBLIC Tracy)
+
 add_library(beyond-protocol STATIC
 	$<TARGET_OBJECTS:beyond-common>
-    src/peer.cpp
-    src/universe.cpp
+	src/peer.cpp
+	src/universe.cpp
 	src/socket/socket.cpp
 	src/protocol/connection.cpp
 	src/protocol/factory.cpp
@@ -207,7 +213,7 @@ add_library(beyond-protocol STATIC
 target_include_directories(beyond-protocol PUBLIC
 	$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
 	$<INSTALL_INTERFACE:include>)
-target_link_libraries(beyond-protocol Threads::Threads ${UUID_LIBRARIES})
+target_link_libraries(beyond-protocol Threads::Threads ${UUID_LIBRARIES} Tracy)
 
 if (WITH_GNUTLS)
 	target_link_libraries(beyond-protocol GnuTLS::GnuTLS)
@@ -224,5 +230,5 @@ if (BUILD_TESTS)
 endif()
 
 if (BUILD_EXAMPLES)
-    add_subdirectory(examples)
+	add_subdirectory(examples)
 endif()
diff --git a/cmake/ftl_tracy.cmake b/cmake/ftl_tracy.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..473c8d30ecda60789d9f9354a654665443ef61c5
--- /dev/null
+++ b/cmake/ftl_tracy.cmake
@@ -0,0 +1,37 @@
+# Optional Tracy profiler integration.
+#
+# Always defines a target named `Tracy` so callers can link it unconditionally:
+#   ENABLE_PROFILER=ON  -> alias of the fetched TracyClient library
+#   ENABLE_PROFILER=OFF -> empty INTERFACE library
+# The including file must have run include(FetchContent) beforehand.
+
+if (ENABLE_PROFILER)
+    set(TRACY_CXX_OPTIONS "-DTRACY_ENABLE -DTRACY_DELAYED_INIT -DTRACY_VERBOSE -DNOMINMAX")
+
+    if (DEBUG_LOCKS)
+        set(TRACY_CXX_OPTIONS "${TRACY_CXX_OPTIONS} -DDEBUG_LOCKS")
+        message(STATUS "Lock profiling enabled")
+    endif()
+
+    # Appended to the global C++ flags (not to a single target), and done
+    # before the Tracy sources are added below.
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TRACY_CXX_OPTIONS}")
+
+    # GIT_SHALLOW only supports branch and tag names (a commit hash is not
+    # allowed), so reference the release tag. v0.9.1 corresponds to commit
+    # 897aec5b062664d2485f4f9a213715d2e527e0ca.
+    FetchContent_Declare(tracy
+        GIT_REPOSITORY https://github.com/wolfpld/tracy.git
+        GIT_TAG v0.9.1
+        GIT_SHALLOW TRUE
+        GIT_PROGRESS TRUE
+    )
+    FetchContent_MakeAvailable(tracy)
+
+    add_library(Tracy ALIAS TracyClient)
+
+    message(STATUS "Profiling (Tracy) enabled")
+
+else()
+    add_library(Tracy INTERFACE)
+endif()
diff --git a/include/ftl/profiler.hpp b/include/ftl/profiler.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b6afbbaa912f10c576313860542bc00b818d659
--- /dev/null
+++ b/include/ftl/profiler.hpp
@@ -0,0 +1,40 @@
+/**
+ * @file profiler.hpp
+ * @copyright Copyright (c) 2020-2022 University of Turku, MIT License
+ * @author Nicolas Pope, Sebastian Hahta
+ */
+
+#pragma once
+
+#ifdef TRACY_ENABLE
+
+#include <tracy/Tracy.hpp>
+
+#define FTL_PROFILE_SCOPE(LABEL) ZoneScopedN(LABEL)
+
+// NOTE: Tracy expects Label to be a pointer to same address (this should be the case
+//       with GCC and MSVC with string pooling). If not, label has to be defined
+//       separately before use (static const char* ...) and exported if necessary
+
+/** Mark (secondary) frame start and stop. Each FTL_PROFILE_FRAME_BEGIN MUST be matched
+ * with FTL_PROFILE_FRAME_END */
+#define FTL_PROFILE_FRAME_BEGIN(LABEL) FrameMarkStart(#LABEL)
+#define FTL_PROFILE_FRAME_END(LABEL) FrameMarkEnd(#LABEL)
+
+/** Mark end of primary frame (main rendering/capture loop etc, if applicable) */
+#define FTL_PROFILE_PRIMARY_FRAME_END() FrameMark
+
+/// deprecated
+#define FTL_Profile(LABEL, LIMIT) FTL_PROFILE_SCOPE(LABEL)
+
+#else
+
+#define FTL_PROFILE_FRAME_BEGIN(LABEL) {}
+#define FTL_PROFILE_FRAME_END(LABEL) {}
+#define FTL_PROFILE_PRIMARY_FRAME_END() {}
+#define FTL_PROFILE_SCOPE(LABEL) {}
+
+/// deprecated
+#define FTL_Profile(LABEL, LIMIT) {}
+
+#endif
diff --git a/include/ftl/threads.hpp b/include/ftl/threads.hpp
index 0d62a76ab0d616dfdb060da269121f64cf032203..92f9bda15f3143d215baeead0dca1b3536e1ad61 100644
--- a/include/ftl/threads.hpp
+++ b/include/ftl/threads.hpp
@@ -12,29 +12,57 @@
 
 #define POOL_SIZE 10
 
-// #define DEBUG_MUTEX
-#define MUTEX_TIMEOUT 2
+/// consider using DECLARE_MUTEX(name) which allows (optional) profiling
+#define MUTEX std::mutex
+/// consider using DECLARE_RECURSIVE_MUTEX(name) which allows (optional) profiling
+#define RECURSIVE_MUTEX std::recursive_mutex
+/// consider using DECLARE_SHARED_MUTEX(name) which allows (optional) profiling
+#define SHARED_MUTEX std::shared_mutex
+
+#if defined(TRACY_ENABLE)
 
-#if defined DEBUG_MUTEX
-#include <ftl/lib/loguru.hpp>
-#include <chrono>
 #include <type_traits>
+#include <tracy/Tracy.hpp>
 
-#define MUTEX std::timed_mutex
-#define RECURSIVE_MUTEX std::recursive_timed_mutex
-#define SHARED_MUTEX std::shared_timed_mutex
+#define DECLARE_MUTEX(varname) TracyLockable(MUTEX, varname)
+#define DECLARE_RECURSIVE_MUTEX(varname) TracyLockable(RECURSIVE_MUTEX, varname)
+#define DECLARE_SHARED_MUTEX(varname) TracySharedLockable(SHARED_MUTEX, varname)
 
-#define UNIQUE_LOCK(M, L) std::unique_lock<std::remove_reference<decltype(M)>::type> L(M, std::chrono::milliseconds(MUTEX_TIMEOUT)); while (!L) { LOG(ERROR) << "Mutex timeout"; L.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT)); };
-#define SHARED_LOCK(M, L) std::shared_lock<std::remove_reference<decltype(M)>::type> L(M, std::chrono::milliseconds(MUTEX_TIMEOUT)); while (!L) { LOG(ERROR) << "Mutex timeout"; L.try_lock_for(std::chrono::milliseconds(MUTEX_TIMEOUT)); };
+#define DECLARE_MUTEX_N(varname, name) TracyLockableN(MUTEX, varname, name)
+#define DECLARE_RECURSIVE_MUTEX_N(varname, name) TracyLockableN(RECURSIVE_MUTEX, varname, name)
+#define DECLARE_SHARED_MUTEX_N(varname, name) TracySharedLockableN(SHARED_MUTEX, varname, name)
+
+/// mark lock acquired (mutex M); both spellings exist in both build modes
+#define MARK_LOCK_AQUIRED(M) LockMark(M)
+#define MARK_LOCK(M) LockMark(M)
+// TODO: should be automatic, but requires mutexes to be declared with DECLARE_..._MUTEX macros
+
+#define T_UNIQUE_LOCK(M) std::unique_lock<std::remove_reference<decltype(M)>::type>
+#define UNIQUE_LOCK(M, L) std::unique_lock<std::remove_reference<decltype(M)>::type> L(M)
+#define SHARED_LOCK(M, L) std::shared_lock<std::remove_reference<decltype(M)>::type> L(M)
 
 #else
-#define MUTEX std::mutex
-#define RECURSIVE_MUTEX std::recursive_mutex
-#define SHARED_MUTEX std::shared_mutex
 
-#define UNIQUE_LOCK(M, L) std::unique_lock<std::remove_reference<decltype(M)>::type> L(M);
-#define SHARED_LOCK(M, L) std::shared_lock<std::remove_reference<decltype(M)>::type> L(M);
-#endif  // DEBUG_MUTEX
+/// mutex with optional profiling (and debugging) when built with TRACY_ENABLE.
+#define DECLARE_MUTEX(varname) MUTEX varname
+/// recursive mutex with optional profiling (and debugging) when built with TRACY_ENABLE
+#define DECLARE_RECURSIVE_MUTEX(varname) RECURSIVE_MUTEX varname
+/// shared mutex with optional profiling (and debugging) when built with TRACY_ENABLE
+#define DECLARE_SHARED_MUTEX(varname) SHARED_MUTEX varname
+
+#define DECLARE_MUTEX_N(varname, name) MUTEX varname
+#define DECLARE_RECURSIVE_MUTEX_N(varname, name) RECURSIVE_MUTEX varname
+#define DECLARE_SHARED_MUTEX_N(varname, name) SHARED_MUTEX varname
+
+/// mark lock acquired (mutex M); no-op without TRACY_ENABLE, both spellings exist in both build modes
+#define MARK_LOCK(M) {}
+#define MARK_LOCK_AQUIRED(M) {}
+
+#define T_UNIQUE_LOCK(M) std::unique_lock<std::remove_reference<decltype(M)>::type>
+#define UNIQUE_LOCK(M, L) std::unique_lock<std::remove_reference<decltype(M)>::type> L(M)
+#define SHARED_LOCK(M, L) std::shared_lock<std::remove_reference<decltype(M)>::type> L(M)
+
+#endif  // TRACY_ENABLE
 
 #define SHARED_LOCK_TYPE(M) std::shared_lock<M>
 
diff --git a/src/ctpl_stl.cpp b/src/ctpl_stl.cpp
index 947560af1898aeec977e707f367959e75aebf701..7d4e8f98cc50d59ef6858510fce2c8f4d7602c89 100644
--- a/src/ctpl_stl.cpp
+++ b/src/ctpl_stl.cpp
@@ -5,9 +5,20 @@
 
 #include <ftl/lib/ctpl_stl.hpp>
 
+#ifdef TRACY_ENABLE
+#include <tracy/Tracy.hpp>
+#endif
+
 void ctpl::thread_pool::set_thread(int i) {
     std::shared_ptr<std::atomic<bool>> flag(this->flags[i]);  // a copy of the shared ptr to the flag
     auto f = [this, i, flag/* a copy of the shared ptr to the flag */]() {
+        #ifdef TRACY_ENABLE  // same test as the include guard above; also valid when the macro has no value
+        {
+            const auto thread_name = "thread_pool/" + std::to_string(i);
+            tracy::SetThreadName(thread_name.c_str());
+        }
+        #endif  // TRACY_ENABLE
+
         std::atomic<bool> & _flag = *flag;
         std::function<void(int id)> * _f;
         bool isPop = this->q.pop(_f);
diff --git a/src/peer.hpp b/src/peer.hpp
index de3fd9da769996dbfa7d3e22a1ea3dc30d2fa3ba..4e7d742e5b794a2e3493cda57a084d35ab3198e2 100644
--- a/src/peer.hpp
+++ b/src/peer.hpp
@@ -260,9 +260,8 @@ class Peer {
 
     // Send buffers
     msgpack::vrefbuffer send_buf_;
-    RECURSIVE_MUTEX send_mtx_;
-
-    RECURSIVE_MUTEX cb_mtx_;
+    DECLARE_RECURSIVE_MUTEX(send_mtx_);
+    DECLARE_RECURSIVE_MUTEX(cb_mtx_);
 
     const bool outgoing_;
     unsigned int local_id_;
diff --git a/src/protocol/connection.cpp b/src/protocol/connection.cpp
index 52136952157bba13a680178ba62310ba4e3ea0bc..e1528da309484190402a926a5aadc954a38fbd24 100644
--- a/src/protocol/connection.cpp
+++ b/src/protocol/connection.cpp
@@ -111,7 +111,7 @@ ssize_t SocketConnection::writev(const struct iovec *iov, int iovcnt) {
         writev_calls++;
 
         if (sent < 0) {
-            DLOG(ERROR) << "writev(): " << sock_.get_error_string();
+            LOG(2) << "writev(): " << sock_.get_error_string();
             if (sock_.is_fatal()) {
                 return sent;
             }
@@ -121,7 +121,7 @@ ssize_t SocketConnection::writev(const struct iovec *iov, int iovcnt) {
         sent_total += sent;
     }
 
-    DLOG(2) << "message required " << writev_calls << " writev() calls";
+    LOG(2) << "message required " << writev_calls << " writev() calls";
 
     /*if (can_increase_sock_buffer_) {
         auto send_buf_size = sock_.get_send_buffer_size();
diff --git a/src/universe.cpp b/src/universe.cpp
index 21f6563303883ee68df53b9920947f8af095c761..57af06a1e0721d24abe2c55ac86a73addc0e3c7f 100644
--- a/src/universe.cpp
+++ b/src/universe.cpp
@@ -75,6 +75,7 @@ struct NetImplDetail {
 
 // TODO(Seb): move to ServerSocket and ClientSocket
 // Defaults, should be changed in config
+
 #define TCP_SEND_BUFFER_SIZE    (32*1024*1024)
 #define TCP_RECEIVE_BUFFER_SIZE (32*1024*1024)  // Perhaps try 24K?
 #define WS_SEND_BUFFER_SIZE     (32*1024*1024)