The Overlord and Coordinator crash with an OOM error when a user accesses the Druid UI. On further investigation, we identified that this is caused by threads running for a long time while trying to retrieve task details from the database.
While investigating, we came across a few issues in the forum about /tasks memory usage and Overlord crashes that have already been fixed but have not yet been officially released. Can someone share the release timeline we can expect for these fixes?
#12404
#12318
Exception :
"qtp919063521-102" #102 daemon prio=5 os_prio=0 tid=0x00007f2b1851d800 nid=0x16f runnable [0x00007f2a729e4000]
java.lang.Thread.State: RUNNABLE
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
at java.net.SocketInputStream.read(SocketInputStream.java:171)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at org.postgresql.core.VisibleBufferedInputStream.read(VisibleBufferedInputStream.java:248)
at org.postgresql.core.PGStream.receive(PGStream.java:529)
at org.postgresql.core.PGStream.receiveTupleV3(PGStream.java:489)
at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2236)
at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:312)
- locked <0x00000002ea36cb30> (a org.postgresql.core.v3.QueryExecutorImpl)
at org.postgresql.jdbc.PgStatement.executeInternal(PgStatement.java:448)
at org.postgresql.jdbc.PgStatement.execute(PgStatement.java:369)
at org.postgresql.jdbc.PgPreparedStatement.executeWithFlags(PgPreparedStatement.java:153)
at org.postgresql.jdbc.PgPreparedStatement.execute(PgPreparedStatement.java:142)
at org.apache.commons.dbcp2.DelegatingPreparedStatement.execute(DelegatingPreparedStatement.java:198)
at org.apache.commons.dbcp2.DelegatingPreparedStatement.execute(DelegatingPreparedStatement.java:198)
at org.skife.jdbi.v2.SQLStatement.internalExecute(SQLStatement.java:1328)
at org.skife.jdbi.v2.Query.fold(Query.java:173)
at org.skife.jdbi.v2.Query.list(Query.java:82)
at org.skife.jdbi.v2.Query.list(Query.java:75)
at org.apache.druid.metadata.SQLMetadataStorageActionHandler.lambda$getCompletedTaskInfo$3(SQLMetadataStorageActionHandler.java:286)
at org.apache.druid.metadata.SQLMetadataStorageActionHandler$$Lambda$735/1498702729.withHandle(Unknown Source)
at org.skife.jdbi.v2.DBI.withHandle(DBI.java:281)
at org.apache.druid.metadata.SQLMetadataConnector.lambda$retryWithHandle$0(SQLMetadataConnector.java:138)
at org.apache.druid.metadata.SQLMetadataConnector$$Lambda$89/958817283.perform(Unknown Source)
at org.apache.druid.java.util.common.RetryUtils.retry(RetryUtils.java:129)
at org.apache.druid.java.util.common.RetryUtils.retry(RetryUtils.java:81)
at org.apache.druid.java.util.common.RetryUtils.retry(RetryUtils.java:163)
at org.apache.druid.java.util.common.RetryUtils.retry(RetryUtils.java:153)
at org.apache.druid.metadata.SQLMetadataConnector.retryWithHandle(SQLMetadataConnector.java:138)
at org.apache.druid.metadata.SQLMetadataConnector.retryWithHandle(SQLMetadataConnector.java:148)
at org.apache.druid.metadata.SQLMetadataStorageActionHandler.getCompletedTaskInfo(SQLMetadataStorageActionHandler.java:278)
at org.apache.druid.indexing.overlord.MetadataTaskStorage.getRecentlyCreatedAlreadyFinishedTaskInfo(MetadataTaskStorage.java:230)
at org.apache.druid.indexing.overlord.TaskStorageQueryAdapter.getCompletedTaskInfoByCreatedTimeDuration(TaskStorageQueryAdapter.java:87)
at org.apache.druid.indexing.overlord.http.OverlordResource.getTasks(OverlordResource.java:636)
The Overlord and Coordinator crash with an OOM error when a user accesses the Druid UI. On further investigation, we identified that this is caused by threads running for a long time while trying to retrieve task details from the database.
While investigating, we came across a few issues in the forum about /tasks memory usage and Overlord crashes that have already been fixed but have not yet been officially released. Can someone share the release timeline we can expect for these fixes?
#12404
#12318
Exception :
"qtp919063521-102" #102 daemon prio=5 os_prio=0 tid=0x00007f2b1851d800 nid=0x16f runnable [0x00007f2a729e4000]
java.lang.Thread.State: RUNNABLE
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
at java.net.SocketInputStream.read(SocketInputStream.java:171)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at org.postgresql.core.VisibleBufferedInputStream.read(VisibleBufferedInputStream.java:248)
at org.postgresql.core.PGStream.receive(PGStream.java:529)
at org.postgresql.core.PGStream.receiveTupleV3(PGStream.java:489)
at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2236)
at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:312)
at org.postgresql.jdbc.PgStatement.executeInternal(PgStatement.java:448)
at org.postgresql.jdbc.PgStatement.execute(PgStatement.java:369)
at org.postgresql.jdbc.PgPreparedStatement.executeWithFlags(PgPreparedStatement.java:153)
at org.postgresql.jdbc.PgPreparedStatement.execute(PgPreparedStatement.java:142)
at org.apache.commons.dbcp2.DelegatingPreparedStatement.execute(DelegatingPreparedStatement.java:198)
at org.apache.commons.dbcp2.DelegatingPreparedStatement.execute(DelegatingPreparedStatement.java:198)
at org.skife.jdbi.v2.SQLStatement.internalExecute(SQLStatement.java:1328)
at org.skife.jdbi.v2.Query.fold(Query.java:173)
at org.skife.jdbi.v2.Query.list(Query.java:82)
at org.skife.jdbi.v2.Query.list(Query.java:75)
at org.apache.druid.metadata.SQLMetadataStorageActionHandler.lambda$getCompletedTaskInfo$3(SQLMetadataStorageActionHandler.java:286)
at org.apache.druid.metadata.SQLMetadataStorageActionHandler$$Lambda$735/1498702729.withHandle(Unknown Source)
at org.skife.jdbi.v2.DBI.withHandle(DBI.java:281)
at org.apache.druid.metadata.SQLMetadataConnector.lambda$retryWithHandle$0(SQLMetadataConnector.java:138)
at org.apache.druid.metadata.SQLMetadataConnector$$Lambda$89/958817283.perform(Unknown Source)
at org.apache.druid.java.util.common.RetryUtils.retry(RetryUtils.java:129)
at org.apache.druid.java.util.common.RetryUtils.retry(RetryUtils.java:81)
at org.apache.druid.java.util.common.RetryUtils.retry(RetryUtils.java:163)
at org.apache.druid.java.util.common.RetryUtils.retry(RetryUtils.java:153)
at org.apache.druid.metadata.SQLMetadataConnector.retryWithHandle(SQLMetadataConnector.java:138)
at org.apache.druid.metadata.SQLMetadataConnector.retryWithHandle(SQLMetadataConnector.java:148)
at org.apache.druid.metadata.SQLMetadataStorageActionHandler.getCompletedTaskInfo(SQLMetadataStorageActionHandler.java:278)
at org.apache.druid.indexing.overlord.MetadataTaskStorage.getRecentlyCreatedAlreadyFinishedTaskInfo(MetadataTaskStorage.java:230)
at org.apache.druid.indexing.overlord.TaskStorageQueryAdapter.getCompletedTaskInfoByCreatedTimeDuration(TaskStorageQueryAdapter.java:87)
at org.apache.druid.indexing.overlord.http.OverlordResource.getTasks(OverlordResource.java:636)