diff --git a/lib/proxysql_utils.cpp b/lib/proxysql_utils.cpp index 062998414..deb3bd109 100644 --- a/lib/proxysql_utils.cpp +++ b/lib/proxysql_utils.cpp @@ -17,6 +17,9 @@ #include #include #include +#include +#include +#include using std::function; using std::string; @@ -488,16 +491,146 @@ std::string get_checksum_from_hash(uint64_t hash) { return string { &s_buf.front() }; } +/** + * @brief Closes all open file descriptors except stdin (0), stdout (1), stderr (2), and a specified exclusion list + * + * This function is typically called after fork() in the child process before exec() to ensure that + * the child process does not inherit unintended file descriptors from the parent. This is crucial + * for security, resource management, and preventing deadlocks in daemonized processes. + * + * **Security Implications:** + * - Prevents child processes from accessing sensitive file descriptors (database connections, sockets, log files) + * - Prevents information leaks through unintended descriptor inheritance + * - Required for secure privilege separation and sandboxing + * + * **Resource Management:** + * - Prevents resource exhaustion by closing descriptors not needed by the child + * - Allows the parent to close descriptors without affecting the child + * - Ensures proper cleanup in daemon/service contexts + * + * **Deadlock Prevention:** + * - In multi-threaded programs, if fork() is called while other threads hold locks or resources, + * the child process inherits a "frozen" state where only the forking thread exists but the + * locked resources remain held. Closing file descriptors prevents potential deadlocks + * when those descriptors are associated with pipes or sockets that may block on I/O. + * + * **Implementation Details:** + * + * The function uses three strategies, tried in order: + * + * 1. **Primary Method (Linux 5.9+, empty excludeFDs only):** close_range() syscall + * - Uses syscall(__NR_close_range, 3, ~0U, 0) to atomically close all fds >= 3 + * - Runtime detection: first call checks if syscall is available + * - This method is O(1) and the most efficient + * - ONLY used when excludeFDs is empty (otherwise would close excluded fds) + * + * 2. **Secondary Method:** Iterate through /proc/self/fd + * - Uses opendir("/proc/self/fd") to get a directory stream of open file descriptors + * - Uses dirfd() to get the directory's own fd and skips closing it (prevents self-referential closure bug) + * - Reads each entry and uses atoi() to convert to fd (no heap allocation) + * - Closes all descriptors > 2 (stdin/stdout/stderr) that are not in the exclusion list + * - This method is O(n) where n is the number of open file descriptors + * + * 3. **Fallback Method:** Iterate through rlimit + * - If /proc/self/fd is not available (e.g., on non-Linux systems or chroot environments), + * falls back to getrlimit(RLIMIT_NOFILE) + * - Iterates from 3 to rlim_cur-1, attempting to close each descriptor + * - Ignores EBADF errors for descriptors that aren't actually open + * - This method is O(rlim_cur) which can be much slower if rlim_max is large (e.g., 1048576) + * + * **Important Design Considerations for fork() Safety:** + * + * 1. **Heap Allocation Avoidance:** This function avoids heap allocations in the /proc/self/fd path: + * - Uses atoi() instead of std::stol(std::string(...)) + * - Uses simple loops instead of std::find() which may allocate + * - This is critical in multi-threaded programs after fork() where malloc locks may be held + * + * 2. **Self-Referential Directory FD Closure Prevention:** + * - The opendir() call creates a fd for the directory stream + * - We use dirfd() to get this fd and explicitly skip closing it + * - This prevents undefined behavior from closing the fd while iterating + * + * **Thread Safety Considerations:** + * - This function IS safe to call in the child process between fork() and execve() + * - By avoiding heap allocations (using atoi() and simple loops), it prevents deadlocks + * on malloc locks that may be held by other threads in the parent at fork time + * - For optimal safety, call with an empty excludeFDs initializer list: close_all_non_term_fd({}) + * + * **Parameters:** + * @param excludeFDs A vector of file descriptor numbers to keep open (in addition to 0, 1, 2) + * For example, if you have pipes for communication with the parent process, + * pass those pipe fds in this vector to preserve them. + * When non-empty, close_range() optimization cannot be used. + * + * **Example Usage:** + * @code + * // In forked child before exec - close everything except stdin/stdout/stderr + * close_all_non_term_fd({}); + * + * // Keep specific communication pipes open (works with /proc and rlimit methods) + * close_all_non_term_fd({read_pipe, write_pipe}); + * + * // After fork() in a daemon that keeps log files open + * close_all_non_term_fd({log_fd, status_pipe_fd}); + * @endcode + * + * **Portability:** + * - /proc/self/fd is Linux-specific. FreeBSD has /dev/fd, macOS has /dev/fd as well + * - The fallback method using getrlimit() is more portable but less efficient + * - close_range() is Linux 5.9+, with runtime detection for backward compatibility + * + * **See Also:** + * - close_range(2) - Linux 5.9+ syscall for efficient bulk closing + * - posix_spawn(3) - Alternative to fork+exec that handles fd closing atomically + * - fdwalk() - Solaris/IllumOS function for similar purposes + * + * @param excludeFDs Vector of file descriptors to preserve (in addition to 0, 1, 2) + * @return void + */ void close_all_non_term_fd(std::vector excludeFDs) { + // Try close_range() syscall first (Linux 5.9+) - most efficient and safe + // We use syscall directly with runtime detection to avoid hard dependency on kernel version + // close_range() can ONLY be used when excludeFDs is empty, because it closes all fds >= 3 +#ifdef __NR_close_range + if (excludeFDs.empty()) { + static int close_range_available = -1; // -1 = unknown, 0 = not available, 1 = available + if (close_range_available == -1) { + // First call: check if close_range is available + long ret = syscall(__NR_close_range, 3, ~0U, 0); + close_range_available = (ret == 0 || errno != ENOSYS) ? 1 : 0; + } + if (close_range_available == 1) { + // close_range is available, use it to close all fds >= 3 + syscall(__NR_close_range, 3, ~0U, 0); + return; + } + } +#endif + + // Fallback: iterate through /proc/self/fd DIR *d; struct dirent *dir; d = opendir("/proc/self/fd"); if (d) { + int dir_fd = dirfd(d); // Get the fd of the directory stream we're reading while ((dir = readdir(d)) != NULL) { if (strlen(dir->d_name) && dir->d_name[0] != '.') { - int fd = std::stol(std::string(dir->d_name)); + // Use atoi() instead of std::stol(std::string(...)) to avoid heap allocation + // This is critical in fork() context where malloc locks may be held + int fd = atoi(dir->d_name); if (fd > 2) { - if (std::find(excludeFDs.begin(), excludeFDs.end(), fd) == excludeFDs.end()) { + // Skip the directory fd itself to avoid closing what we're reading from + if (fd == dir_fd) + continue; + // Check if fd is in exclusion list + bool exclude = false; + for (size_t i = 0; i < excludeFDs.size(); i++) { + if (excludeFDs[i] == fd) { + exclude = true; + break; + } + } + if (!exclude) { close(fd); } } @@ -505,11 +638,20 @@ void close_all_non_term_fd(std::vector excludeFDs) { } closedir(d); } else { + // Final fallback: iterate through rlimit struct rlimit nlimit; int rc = getrlimit(RLIMIT_NOFILE, &nlimit); if (rc == 0) { for (unsigned int fd = 3; fd < nlimit.rlim_cur; fd++) { - if (std::find(excludeFDs.begin(), excludeFDs.end(), fd) == excludeFDs.end()) { + // Check if fd is in exclusion list + bool exclude = false; + for (size_t i = 0; i < excludeFDs.size(); i++) { + if (excludeFDs[i] == (int)fd) { + exclude = true; + break; + } + } + if (!exclude) { close(fd); } }