Overlord cannot respond to requests when Kafka is abnormal
soullkk opened this issue · comments
soullkk commented
The Overlord cannot respond to requests when the Kafka cluster is in an abnormal state
Affected Version
28.0.1
Description
Please include as much detailed information about the problem as possible.
- Cluster size
  3-node cluster
- Configurations in use
  cluster configuration
- Steps to reproduce the problem
  Stop one or two Kafka nodes in the Kafka cluster, then stop or start a large number of supervisors.
- The error message or stack traces encountered. Providing more context, such as nearby log messages or even entire logs, can be helpful.
The Overlord experienced a large number of connection timeouts. A thread dump shows an HTTP request thread blocked while waiting for an object monitor:
"qtp1009260571-132" #132 daemon prio=5 os_prio=0 cpu=110.66ms elapsed=14488.50s tid=0x000055cf4f39c800 nid=0x1a96a7 waiting for monitor entry [0x00007f5e36c12000]
java.lang.Thread.State: BLOCKED (on object monitor)
at org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor.stop(SeekableStreamSupervisor.java:933)
- waiting to lock <0x00000000eb35fae0> (a java.lang.Object)
at org.apache.druid.indexing.overlord.supervisor.SupervisorManager.possiblyStopAndRemoveSupervisorInternal(SupervisorManager.java:268)
at org.apache.druid.indexing.overlord.supervisor.SupervisorManager.stopAndRemoveSupervisor(SupervisorManager.java:105)
- locked <0x00000000f56ddbd8> (a java.lang.Object)
at org.apache.druid.indexing.overlord.supervisor.SupervisorResource.lambda$terminate$9(SupervisorResource.java:348)
at org.apache.druid.indexing.overlord.supervisor.SupervisorResource$$Lambda$731/574536258.apply(Unknown Source)
at org.apache.druid.indexing.overlord.supervisor.SupervisorResource.asLeaderWithSupervisorManager(SupervisorResource.java:476)
at org.apache.druid.indexing.overlord.supervisor.SupervisorResource.terminate(SupervisorResource.java:346)
at org.apache.druid.indexing.overlord.supervisor.SupervisorResource.shutdown(SupervisorResource.java:337)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.sun.jersey.spi.container.JavaMethodInvokerFactory$1.invoke(JavaMethodInvokerFactory.java:60)
at com.sun.jersey.server.impl.model.method.dispatch.AbstractResourceMethodDispatchProvider$ResponseOutInvoker._dispatch(AbstractResourceMethodDispatchProvider.java:205)
at com.sun.jersey.server.impl.model.method.dispatch.ResourceJavaMethodDispatcher.dispatch(ResourceJavaMethodDispatcher.java:75)
at com.sun.jersey.server.impl.uri.rules.HttpMethodRule.accept(HttpMethodRule.java:302)
at com.sun.jersey.server.impl.uri.rules.RightHandPathRule.accept(RightHandPathRule.java:147)
at com.sun.jersey.server.impl.uri.rules.ResourceClassRule.accept(ResourceClassRule.java:108)
at com.sun.jersey.server.impl.uri.rules.RightHandPathRule.accept(RightHandPathRule.java:147)
at com.sun.jersey.server.impl.uri.rules.RootResourceClassesRule.accept(RootResourceClassesRule.java:84)
at com.sun.jersey.server.impl.application.WebApplicationImpl._handleRequest(WebApplicationImpl.java:1542)
at com.sun.jersey.server.impl.application.WebApplicationImpl._handleRequest(WebApplicationImpl.java:1473)
at com.sun.jersey.server.impl.application.WebApplicationImpl.handleRequest(WebApplicationImpl.java:1419)
at com.sun.jersey.server.impl.application.WebApplicationImpl.handleRequest(WebApplicationImpl.java:1409)
at com.sun.jersey.spi.container.servlet.WebComponent.service(WebComponent.java:409)
at com.sun.jersey.spi.container.servlet.ServletContainer.service(ServletContainer.java:558)
at com.sun.jersey.spi.container.servlet.ServletContainer.service(ServletContainer.java:733)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:790)
at com.google.inject.servlet.ServletDefinition.doServiceImpl(ServletDefinition.java:286)
at com.google.inject.servlet.ServletDefinition.doService(ServletDefinition.java:276)
at com.google.inject.servlet.ServletDefinition.service(ServletDefinition.java:181)
at com.google.inject.servlet.ManagedServletPipeline.service(ManagedServletPipeline.java:91)
at com.google.inject.servlet.FilterChainInvocation.doFilter(FilterChainInvocation.java:85)
at com.google.inject.servlet.ManagedFilterPipeline.dispatch(ManagedFilterPipeline.java:120)
at com.google.inject.servlet.GuiceFilter.doFilter(GuiceFilter.java:135)
at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
at org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626)
at org.apache.druid.server.http.RedirectFilter.doFilter(RedirectFilter.java:73)
at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
at org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626)
at org.apache.druid.server.security.PreResponseAuthorizationCheckFilter.doFilter(PreResponseAuthorizationCheckFilter.java:82)
at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
at org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626)
at org.apache.druid.server.initialization.jetty.StandardResponseHeaderFilterHolder$StandardResponseHeaderFilter.doFilter(StandardResponseHeaderFilterHolder.java:161)
at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
at org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626)
at org.apache.druid.server.security.AllowHttpMethodsResourceFilter.doFilter(AllowHttpMethodsResourceFilter.java:78)
at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
at org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626)
at org.apache.druid.server.security.AllowOptionsResourceFilter.doFilter(AllowOptionsResourceFilter.java:75)
at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
at org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626)
at org.apache.druid.server.security.AllowAllAuthenticator$1.doFilter(AllowAllAuthenticator.java:84)
at org.apache.druid.server.security.AuthenticationWrappingFilter.doFilter(AuthenticationWrappingFilter.java:59)
at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
at org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626)
at org.apache.druid.server.security.SecuritySanityCheckFilter.doFilter(SecuritySanityCheckFilter.java:77)
at org.eclipse.jetty.servlet.FilterHolder.doFilter(FilterHolder.java:193)
at org.eclipse.jetty.servlet.ServletHandler$Chain.doFilter(ServletHandler.java:1626)
at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:552)
at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233)
at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1624)
at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233)
at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1440)
at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188)
at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:505)
at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1594)
at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186)
at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1355)
at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
at org.eclipse.jetty.server.handler.gzip.GzipHandler.handle(GzipHandler.java:772)
at org.eclipse.jetty.server.handler.HandlerList.handle(HandlerList.java:59)
at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
at org.eclipse.jetty.server.Server.handle(Server.java:516)
at org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:487)
at org.eclipse.jetty.server.HttpChannel$$Lambda$349/207742670.dispatch(Unknown Source)
at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:732)
at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:479)
at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:277)
at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)
at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)
at org.eclipse.jetty.io.ssl.SslConnection$DecryptedEndPoint.onFillable(SslConnection.java:555)
at org.eclipse.jetty.io.ssl.SslConnection.onFillable(SslConnection.java:410)
at org.eclipse.jetty.io.ssl.SslConnection$2.succeeded(SslConnection.java:164)
at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)
at org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:338)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:315)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:131)
at org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:409)
at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883)
at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034)
at java.lang.Thread.run(Thread.java:750)
行 627: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 756: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 886: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 982: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 1100: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 1196: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 1291: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 1410: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 1505: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 1622: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 1721: - locked <0x00000000f56ddbd8> (a java.lang.Object)
行 1842: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 1938: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 2034: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 2151: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 2283: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 2423: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 2519: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 2626: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 2733: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 2839: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 2959: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 3054: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 3172: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 3300: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 3406: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
行 3502: - waiting to lock <0x00000000f56ddbd8> (a java.lang.Object)
- Any debugging that you have already done
This problem occurs when there are three nodes in a Kafka cluster, but one or two of them are unavailable.
overlord.log contains a large number of timeout errors such as the following:
org.apache.druid.indexing.seekablestream.common.StreamException: org.apache.kafka.common.errors.TimeoutException: Timeout of 60000ms expired before the position for partition ODAEDATASET._DEFAULT.xxxxxx._DEFAULT-0 could be determined
at org.apache.druid.indexing.kafka.KafkaRecordSupplier.wrapExceptions(KafkaRecordSupplier.java:328) ~[?:?]
at org.apache.druid.indexing.kafka.KafkaRecordSupplier.getPosition(KafkaRecordSupplier.java:182) ~[?:?]
at org.apache.druid.indexing.kafka.KafkaRecordSupplier.getEarliestSequenceNumber(KafkaRecordSupplier.java:164) ~[?:?]
at org.apache.druid.indexing.kafka.KafkaRecordSupplier.isOffsetAvailable(KafkaRecordSupplier.java:174) ~[?:?]
at org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor.checkOffsetAvailability(SeekableStreamSupervisor.java:4114) ~[druid-indexing-service-24.0.1-htrunk17.jar:?]
at org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor.getOffsetFromStorageForPartition(SeekableStreamSupervisor.java:3570) ~[druid-indexing-service-24.0.1-htrunk17.jar:?]
at org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor.generateStartingSequencesForPartitionGroup(SeekableStreamSupervisor.java:3548) ~[druid-indexing-service-24.0.1-htrunk17.jar:?]
at org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor.createNewTasks(SeekableStreamSupervisor.java:3357) ~[druid-indexing-service-24.0.1-htrunk17.jar:?]
at org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor.runInternal(SeekableStreamSupervisor.java:1490) ~[druid-indexing-service-24.0.1-htrunk17.jar:?]
at org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor$RunNotice.handle(SeekableStreamSupervisor.java:403) ~[druid-indexing-service-24.0.1-htrunk17.jar:?]
at org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor.lambda$tryInit$3(SeekableStreamSupervisor.java:1049) ~[druid-indexing-service-24.0.1-htrunk17.jar:?]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[?:1.8.0_382]
at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_382]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_382]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_382]
at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_382]