ovis-hpc / ovis

OVIS/LDMS High Performance Computing monitoring, analysis, and visualization project.

Home Page: https://github.com/ovis-hpc/ovis-wiki/wiki


File descriptor leak when the aggregator and the sampler are using different auth types

johnstile opened this issue

In this case the aggregator ldmsd is configured to use munge authentication when connecting to the sampler ldmsd, but the sampler ldmsd is configured to use ovis authentication.

The aggregator ldmsd connects to the sampler ldmsd using munge authentication, which is rejected, but the sampler ldmsd never closes the file descriptor completely.

The socket in the thread gets shutdown(), but the thread then hangs forever in epoll_wait().

The aggregator then reconnects, creating another thread which will never die and holds a file descriptor open. Lather, rinse, repeat, explode.
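
As an illustration of that end state, here is a minimal standalone C program (not OVIS code, just a sketch) showing what the strace below ends up in: the descriptor is shut down and deleted from the epoll set but never close()d, and a thread calling epoll_wait() with a -1 timeout on an interest set that is now empty simply blocks forever.

/* Minimal standalone illustration (not OVIS code) of the state described
 * above: the fd is shut down and removed from the epoll set but never
 * close()d, and epoll_wait() on the now-empty set blocks forever. */
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/socket.h>

int main(void)
{
        int sv[2];
        socketpair(AF_UNIX, SOCK_STREAM, 0, sv); /* stand-in for the accepted connection */

        int efd = epoll_create1(0);
        struct epoll_event ev = { .events = EPOLLIN, .data.fd = sv[0] };
        epoll_ctl(efd, EPOLL_CTL_ADD, sv[0], &ev);

        /* What the trace below shows: shutdown + EPOLL_CTL_DEL, but no close(). */
        shutdown(sv[0], SHUT_RDWR);
        epoll_ctl(efd, EPOLL_CTL_DEL, sv[0], NULL);

        printf("fd %d is still open; /proc/self/fd still lists it\n", sv[0]);

        /* With an empty interest set and a -1 timeout this never returns;
         * it is the same place io_thread_proc() is parked in the gdb
         * backtrace further down. */
        struct epoll_event out[4];
        epoll_wait(efd, out, 4, -1);
        return 0;
}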

Tracing the threads leads to an epoll_wait(88, <unfinished ...>

fgrep 93276 92705.strace
92720 clone(child_stack=0x7f65227fbfb0, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tid=[93276], tls=0x7f65227fc700, child_tidptr=0x7f65227fc9d0) = 93276
92720 openat(AT_FDCWD, "/proc/self/task/93276/comm", O_RDWR <unfinished ...>
93276 set_robust_list(0x7f65227fc9e0, 24) = 0
93276 rt_sigprocmask(SIG_SETMASK, ~[RTMIN RT_1],  <unfinished ...>
93276 <... rt_sigprocmask resumed>NULL, 8) = 0
93276 epoll_wait(88,  <unfinished ...>
93276 <... epoll_wait resumed>[{events=EPOLLIN, data={u32=3026429024, u64=140074794853472}}], 4096, -1) = 1
93276 read(87, "\0\1\0\0\0\0\0.A\v\0\0\0\0\0\0\0\0\0\0", 20) = 20
93276 read(87, "\1\3\1\0SOCKET\0\0\0\0\0\n\4\2\0\0munge\0", 26) = 26
93276 mmap(NULL, 134217728, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0) = 0x7f650c000000
93276 munmap(0x7f6510000000, 67108864)  = 0
93276 mprotect(0x7f650c000000, 135168, PROT_READ|PROT_WRITE) = 0
93276 sendto(87, "\0\t\0\0\0\0\0&%\0\0\0\0\0\0\0\0\0\0\0\0\0\0\16Auth mis"..., 38, MSG_NOSIGNAL, NULL, 0) = 38
93276 read(87, 0x7f65b463a610, 20)      = -1 EAGAIN (Resource temporarily unavailable)
93276 epoll_wait(88, [{events=EPOLLIN, data={u32=3026429024, u64=140074794853472}}], 4096, -1) = 1
93276 read(87, "", 20)                  = 0
93276 shutdown(87, SHUT_RDWR)           = 0
93276 epoll_wait(88, [{events=EPOLLIN|EPOLLHUP, data={u32=3026429024, u64=140074794853472}}], 4096, -1) = 1
93276 read(87, "", 20)                  = 0
93276 shutdown(87, SHUT_RDWR)           = -1 ENOTCONN (Transport endpoint is not connected)
93276 getsockopt(87, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
93276 epoll_ctl(88, EPOLL_CTL_DEL, 87, 0x7f65b463a53c) = 0
93276 epoll_wait(88,  <unfinished ...>

gdb shows this as:

(gdb) thread 131
[Switching to thread 131 (Thread 0x7f63a0ff9700 (LWP 95833))]
#0  0x00007f65d33e688f in epoll_wait () from /lib64/libc.so.6
(gdb) bt full
#0  0x00007f65d33e688f in epoll_wait () from /lib64/libc.so.6
No symbol table info available.
#1  0x00007f65d11749f9 in io_thread_proc (arg=0x7f65b55557e0) at zap_sock.c:1403
        __cancel_buf = {__cancel_jmp_buf = {{__cancel_jmp_buf = {0, -3677973586579537781, 140075276319934, 140075276319935, 0, 0, 3617084831187878027, 3620494620879896715}, __mask_was_saved = 0}}, __pad = {
            0x7f63a0ff8f10, 0x0, 0x0, 0x0}}
        __cancel_routine = 0x7f65d11748f4 <io_thread_cleanup>
        __cancel_arg = 0x7f65b55557e0
        __not_first_call = <optimized out>
        thr = 0x7f65b55557e0
        rc = <optimized out>
        n = <optimized out>
        i = <optimized out>
        sigset = {__val = {18446744067267100671, 18446744073709551615 <repeats 15 times>}}
        sep = <optimized out>
        __PRETTY_FUNCTION__ = "io_thread_proc"
#2  0x00007f65d32ab6ea in start_thread () from /lib64/libpthread.so.0
No symbol table info available.
#3  0x00007f65d33e649f in clone () from /lib64/libc.so.6
No symbol table info available.
(gdb)

This is the code of the thread. The call through the function pointer at line 1412 is, I suspect, how it goes off and does the auth checking. Note also that the loop only exits when epoll_wait() fails with something other than EINTR, so once the endpoint's fd has been removed from the epoll set the thread just blocks at the epoll_wait() on line 1403 forever and never reaches the cleanup:

1387 static void *io_thread_proc(void *arg)
1388 {
1389         /* Zap thread will not handle any signal */
1390         z_sock_io_thread_t thr = arg;
1391         int rc, n, i;
1392         sigset_t sigset;
1393         struct z_sock_ep *sep;
1394
1395         pthread_cleanup_push(io_thread_cleanup, arg);
1396
1397         sigfillset(&sigset);
1398         rc = sigprocmask(SIG_SETMASK, &sigset, NULL);
1399         assert(rc == 0 && "pthread_sigmask error");
1400
1401         while (1) {
1402                 zap_thrstat_wait_start(thr->zap_io_thread.stat);
1403                 n = epoll_wait(thr->efd, thr->ev, ZAP_SOCK_EV_SIZE, -1);
1404                 zap_thrstat_wait_end(thr->zap_io_thread.stat);
1405                 if (n < 0) {
1406                         if (errno == EINTR)
1407                                 continue; /* EINTR is OK */
1408                         break;
1409                 }
1410                 for (i = 0; i < n; i++) {
1411                         sep = thr->ev[i].data.ptr;
1412                         sep->ev_fn(&thr->ev[i]);
1413                 }
1414         }
1415
1416         pthread_cleanup_pop(1);
1417         return NULL;
1418 }

aggregator ldmsd:

/opt/ovis-ldms/sbin/ldmsd -F -c /aggregator.conf -m 65M -a ovis -x sock:60006 -P 4 -v INFO

aggregator conf: /aggregator.conf

auth_add name=auth_cluster_munge plugin=munge  socket=/run/munge_cluster/munge.socket
updtr_add name=aggr-0 interval=10000000 offset=100000
prdcr_add name=nid0000001 host=nid0000001 type=active xprt=sock port=6002 interval=2000000 auth=auth_cluster_munge
prdcr_subscribe stream=foo regex=.*
prdcr_start_regex regex=.*
updtr_prdcr_add name=aggr-0 regex=.*
updtr_start name=aggr-0

sampler ldmsd:

/opt/ovis-ldms/sbin/ldmsd -x sock:6002 -c /sampler.conf -a ovis -v INFO -m 512K -r /opt/ovis-ldms/var/run/ldmsd/sampler.pid

sampler conf: sampler.conf

load name=vmstat
config name=vmstat producer=nid0000001 instance=nid0000001/vmstat component_id=0000001 schema=vmstat job_set=nid0000001/slurm
start name=vmstat interval=1000000 offset=0
load name=hello_sampler
config name=hello_sampler producer=nid0000001 instance=nid0000001/hello_sampler stream=foo component_id=1
start name=hello_sampler interval=1000000 offset=0

When the sampler ldmsd is configured to use munge, everything works as expected.

Nice catch @johnstile. We'll fix this error path.

@nichamon , @narategithub have either of you had a chance to look at this yet?

@tom95858 I'm looking into this right now.

The issue is easily reproduced with one sampler daemon and ldms_ls using any mismatched authentication methods (e.g. ldmsd -a munge + ldms_ls -a none, or ldmsd -a ovis + ldms_ls -a munge). The problem seems to be in the zap_reject mechanism. After the peer (ldms_ls in my case) gets the reject message, it shuts down the socket, but somehow the ldmsd epoll does not generate an EPOLLHUP event. I'm digging into this ...
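
For anyone reproducing this, a quick way to watch the leak is to count the sampler's open descriptors after each rejected connection, e.g. with ls /proc/<pid>/fd | wc -l, or with a small throwaway helper like the one below (not part of OVIS/LDMS; pass the ldmsd PID as the only argument):

/* fdcount.c: print the number of open file descriptors of a process.
 * Hypothetical diagnostic helper, not part of OVIS/LDMS. */
#include <dirent.h>
#include <stdio.h>

int main(int argc, char **argv)
{
        if (argc != 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }

        char path[64];
        snprintf(path, sizeof(path), "/proc/%s/fd", argv[1]);

        DIR *d = opendir(path);
        if (!d) {
                perror(path);
                return 1;
        }

        int n = 0;
        struct dirent *ent;
        while ((ent = readdir(d)) != NULL)
                if (ent->d_name[0] != '.') /* skip "." and ".." */
                        n++;
        closedir(d);

        printf("%s: %d open descriptors\n", path, n);
        return 0;
}

On an affected build the count climbs by one with every rejected connection attempt and never comes back down; with matching auth it stays flat.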

@narategithub maybe where the fd is removed from the epoll set?

@tom95858 I think so too. I enabled DEBUG_ZAP_SOCK and am looking into it.

@tom95858 I think I know why. When ldmsd receives a connection request, the thread of the passive endpoint is the one processing the connection request. The new endpoint is removed from the passive thread and should later be assigned a new thread to process its events. However, the new thread assignment is done in the accept() code path. This leaves rejected endpoints without any thread processing them.

I'm fixing this in zap_sock(). I think this affects zap_ugni too since it uses the same logic. I'll also check zap_rdma just to be sure.
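
To make the control-flow gap described above concrete, here is a highly simplified sketch; the struct and function names are made up for illustration, and this is not the actual zap_sock code or the fix in the pull requests referenced below:

#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <sys/socket.h>

/* Stand-in for struct z_sock_ep: only the pieces this sketch needs. */
struct endpoint {
        int sock;        /* fd of the newly accepted connection */
        int passive_efd; /* epoll set of the passive endpoint's io thread */
};

/* Runs on the passive endpoint's io thread once a connection request
 * (and its authentication data) has been processed. */
static void handle_conn_request(struct endpoint *ep, int auth_ok)
{
        /* Either way, the passive thread stops watching the new endpoint. */
        epoll_ctl(ep->passive_efd, EPOLL_CTL_DEL, ep->sock, NULL);

        if (auth_ok) {
                /* Accept path: an io thread is assigned here; it adds
                 * ep->sock to its own epoll set and services the endpoint
                 * from then on (omitted). */
                return;
        }

        /* Reject path: no io thread is ever assigned, so nothing will
         * notice the peer closing the connection.  The endpoint must be
         * torn down right here; a missing close() is exactly the leaked
         * descriptor reported in this issue. */
        shutdown(ep->sock, SHUT_RDWR);
        close(ep->sock);
}

int main(void)
{
        int sv[2];
        socketpair(AF_UNIX, SOCK_STREAM, 0, sv);

        struct endpoint ep = { .sock = sv[0], .passive_efd = epoll_create1(0) };
        struct epoll_event ev = { .events = EPOLLIN, .data.fd = ep.sock };
        epoll_ctl(ep.passive_efd, EPOLL_CTL_ADD, ep.sock, &ev);

        handle_conn_request(&ep, 0 /* auth mismatch -> reject */);
        printf("rejected endpoint torn down, fd %d closed\n", sv[0]);
        return 0;
}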

Thank you @johnstile for submitting the issue.

@tom95858 Here's some updates:

  • Issue confirmed on sock on 4.4.1, but for a different reason than the one stated above. See fix in pull request: #1329.
  • I think ugni on 4.4.1 does NOT have this issue (by code inspection). We should test this when the test machine is up.
  • rdma on 4.4.1 does NOT have this issue. I hand-tested with -DDEBUG to see that the endpoint is destroyed properly.
  • Issue confirmed on sock on OVIS-4. See fix in pull-request: #1327.
  • I think ugni on OVIS-4 will have the leak issue. See draft-fix in pull-request: #1328. (will convert to READY when we can test it on the test machine with ugni transport).
  • rdma on OVIS-4 does NOT have this issue. I hand-tested with -DDEBUG to see that the endpoint is destroyed properly.

@johnstile is this resolved?