NN fails because of NPE in the replication monitor

Description

2020-03-23 23:11:08,720 ERROR io.hops.transaction.handler.RequestHandler: GET_NEXT_QUOTA_BATCH Tx Failed. total tx time TotalRetryCount(5) RemainingRetries(4) TX Stats: ms, Total Time: 1585001468426ms
io.hops.exception.TransientStorageException: java.sql.SQLException: Got temporary error 1217 'Out of operation records in local data manager (increase MaxNoOfLocalOperations)' from NDBCLUSTER
at io.hops.metadata.ndb.mysqlserver.HopsSQLExceptionHelper.wrap(HopsSQLExceptionHelper.java:33)
at io.hops.metadata.ndb.dalimpl.hdfs.QuotaUpdateClusterj.findLimited(QuotaUpdateClusterj.java:160)
at org.apache.hadoop.hdfs.server.namenode.QuotaUpdateManager$3.performTask(QuotaUpdateManager.java:186)
at io.hops.transaction.handler.LightWeightRequestHandler.execute(LightWeightRequestHandler.java:56)
at io.hops.transaction.handler.RequestHandler.handle(RequestHandler.java:68)
at io.hops.transaction.handler.RequestHandler.handle(RequestHandler.java:63)
at org.apache.hadoop.hdfs.server.namenode.QuotaUpdateManager.processNextUpdateBatch(QuotaUpdateManager.java:190)
at org.apache.hadoop.hdfs.server.namenode.QuotaUpdateManager.access$400(QuotaUpdateManager.java:51)
at org.apache.hadoop.hdfs.server.namenode.QuotaUpdateManager$QuotaUpdateMonitor.run(QuotaUpdateManager.java:125)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.sql.SQLException: Got temporary error 1217 'Out of operation records in local data manager (increase MaxNoOfLocalOperations)' from NDBCLUSTER
at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:1073)
at com.mysql.jdbc.MysqlIO.checkErrorPacket(MysqlIO.java:3609)
at com.mysql.jdbc.MysqlIO.nextRowFast(MysqlIO.java:1578)
at com.mysql.jdbc.MysqlIO.nextRow(MysqlIO.java:1434)
at com.mysql.jdbc.MysqlIO.readSingleRowSet(MysqlIO.java:2925)
at com.mysql.jdbc.MysqlIO.getResultSet(MysqlIO.java:477)
at com.mysql.jdbc.MysqlIO.readResultsForQueryOrUpdate(MysqlIO.java:2631)
at com.mysql.jdbc.MysqlIO.readAllResults(MysqlIO.java:1800)
at com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2221)
at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2624)
at com.mysql.jdbc.PreparedStatement.executeInternal(PreparedStatement.java:2127)
at com.mysql.jdbc.PreparedStatement.executeQuery(PreparedStatement.java:2293)
at com.zaxxer.hikari.proxy.PreparedStatementProxy.executeQuery(PreparedStatementProxy.java:44)
at io.hops.metadata.ndb.dalimpl.hdfs.QuotaUpdateClusterj.findLimited(QuotaUpdateClusterj.java:142)
... 8 more
2020-03-23 23:11:09,619 ERROR io.hops.leaderElection.LeaderElection: LE Status: id 92 LeaderElection: Update Tx took very long time to update: 1274, time_perid is 1000
2020-03-23 23:11:09,906 ERROR io.hops.transaction.handler.RequestHandler: ADD_INV_BLOCKS Tx Failed. total tx time TotalRetryCount(5) RemainingRetries(3) TX Stats: ms, Total Time: 1585001466817ms
io.hops.exception.TransientStorageException: com.mysql.clusterj.ClusterJDatastoreException: Error in NdbJTie: returnCode -1, code 1,217, mysqlCode -1, status 1, classification 7, message Out of operation records in local data manager (increase MaxNoOfLocalOperations) .
at io.hops.metadata.ndb.wrapper.HopsExceptionHelper.wrap(HopsExceptionHelper.java:31)
at io.hops.metadata.ndb.wrapper.HopsSession.savePersistentAll(HopsSession.java:195)
at io.hops.metadata.ndb.dalimpl.hdfs.InvalidatedBlockClusterj.prepare(InvalidatedBlockClusterj.java:267)
at org.apache.hadoop.hdfs.server.blockmanagement.InvalidateBlocks$4.performTask(InvalidateBlocks.java:268)
at io.hops.transaction.handler.LightWeightRequestHandler.execute(LightWeightRequestHandler.java:56)
at io.hops.transaction.handler.RequestHandler.handle(RequestHandler.java:68)
at io.hops.transaction.handler.RequestHandler.handle(RequestHandler.java:63)
at org.apache.hadoop.hdfs.server.blockmanagement.InvalidateBlocks.add(InvalidateBlocks.java:271)
at org.apache.hadoop.hdfs.server.blockmanagement.BlockManager.addToInvalidates(BlockManager.java:5501)
at org.apache.hadoop.hdfs.server.blockmanagement.BlockManager.processReport(BlockManager.java:2545)
at org.apache.hadoop.hdfs.server.blockmanagement.BlockManager.processReport(BlockManager.java:2217)
at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.blockReport(NameNodeRpcServer.java:1086)
at org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolServerSideTranslatorPB.blockReport(DatanodeProtocolServerSideTranslatorPB.java:148)
at org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos$DatanodeProtocolService$2.callBlockingMethod(DatanodeProtocolProtos.java:35607)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:447)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:996)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:850)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:793)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1929)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2786)
Caused by: com.mysql.clusterj.ClusterJDatastoreException: Error in NdbJTie: returnCode -1, code 1,217, mysqlCode -1, status 1, classification 7, message Out of operation records in local data manager (increase MaxNoOfLocalOperations) .
at com.mysql.clusterj.tie.Utility.throwError(Utility.java:1333)
at com.mysql.clusterj.tie.ClusterTransactionImpl.handleError(ClusterTransactionImpl.java:622)
at com.mysql.clusterj.tie.ClusterTransactionImpl.executeCommit(ClusterTransactionImpl.java:183)
at com.mysql.clusterj.tie.ClusterTransactionImpl.executeCommit(ClusterTransactionImpl.java:147)
at com.mysql.clusterj.core.SessionImpl.internalCommit(SessionImpl.java:917)
at com.mysql.clusterj.core.SessionImpl$4.end(SessionImpl.java:1164)
at com.mysql.clusterj.core.SessionImpl.endAutoTransaction(SessionImpl.java:972)
at com.mysql.clusterj.core.SessionImpl.savePersistentAll(SessionImpl.java:816)
at io.hops.metadata.ndb.wrapper.HopsSession.savePersistentAll(HopsSession.java:193)
... 20 more
2020-03-23 23:11:14,564 ERROR org.apache.hadoop.hdfs.server.blockmanagement.BlockManager: ReplicationMonitor thread received Runtime exception.
java.lang.NullPointerException
at io.hops.transaction.handler.TransactionalRequestHandler.execute(TransactionalRequestHandler.java:160)
at io.hops.transaction.handler.HopsTransactionalRequestHandler.execute(HopsTransactionalRequestHandler.java:50)
at io.hops.transaction.handler.RequestHandler.handle(RequestHandler.java:68)
at org.apache.hadoop.hdfs.server.blockmanagement.BlockManager.rescanPostponedMisreplicatedBlocks(BlockManager.java:2366)
at org.apache.hadoop.hdfs.server.blockmanagement.BlockManager$ReplicationMonitor.run(BlockManager.java:5063)
at java.lang.Thread.run(Thread.java:748)
2020-03-23 23:11:14,566 WARN io.hops.transaction.handler.RequestHandler: GET_PENDING_UNCACHED TX Failed. TX Time: 914 ms, RetryCount: 0, TX Stats – Setup: 0ms, AcquireLocks: -1ms, InMemoryProcessing: -1ms, CommitTime: -1ms. Locks: . io.hops.exception.TransientStorageException: com.mysql.clusterj.ClusterJDatastoreException: Error in NdbJTie: returnCode -1, code 1,217, mysqlCode -1, status 1, classification 7, message Out of operation records in local data manager (increase MaxNoOfLocalOperations) .
io.hops.exception.TransientStorageException: com.mysql.clusterj.ClusterJDatastoreException: Error in NdbJTie: returnCode -1, code 1,217, mysqlCode -1, status 1, classification 7, message Out of operation records in local data manager (increase MaxNoOfLocalOperations) .
at io.hops.metadata.ndb.wrapper.HopsExceptionHelper.wrap(HopsExceptionHelper.java:31)
at io.hops.metadata.ndb.wrapper.HopsQuery.getResultList(HopsQuery.java:48)
at io.hops.metadata.ndb.dalimpl.hdfs.CachedBlockClusterJ.findCachedBlockByDatanodeId(CachedBlockClusterJ.java:160)
at io.hops.transaction.context.CachedBlockContext.findByDatanodeId(CachedBlockContext.java:185)
at io.hops.transaction.context.CachedBlockContext.findList(CachedBlockContext.java:73)
at io.hops.transaction.context.TransactionContext.findList(TransactionContext.java:150)
at io.hops.transaction.EntityManager.findList(EntityManager.java:93)
at io.hops.transaction.lock.Lock.acquireLockList(Lock.java:119)
at io.hops.transaction.lock.CachedBlockLock.acquire(CachedBlockLock.java:43)
at io.hops.transaction.lock.HdfsTransactionalLockAcquirer.acquire(HdfsTransactionalLockAcquirer.java:32)
at io.hops.transaction.handler.TransactionalRequestHandler.execute(TransactionalRequestHandler.java:88)
at io.hops.transaction.handler.HopsTransactionalRequestHandler.execute(HopsTransactionalRequestHandler.java:50)
at io.hops.transaction.handler.RequestHandler.handle(RequestHandler.java:68)
at io.hops.transaction.handler.RequestHandler.handle(RequestHandler.java:63)
at org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.getPendingUncachedTX(DatanodeDescriptor.java:181)
at org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager.handleHeartbeat(DatanodeManager.java:1348)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.handleHeartbeat(FSNamesystem.java:4117)
at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.sendHeartbeat(NameNodeRpcServer.java:1061)
at org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolServerSideTranslatorPB.sendHeartbeat(DatanodeProtocolServerSideTranslatorPB.java:107)
at org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos$DatanodeProtocolService$2.callBlockingMethod(DatanodeProtocolProtos.java:35605)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:447)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:996)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:850)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:793)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1929)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2786)
Caused by: com.mysql.clusterj.ClusterJDatastoreException: Error in NdbJTie: returnCode -1, code 1,217, mysqlCode -1, status 1, classification 7, message Out of operation records in local data manager (increase MaxNoOfLocalOperations) .
at com.mysql.clusterj.tie.Utility.throwError(Utility.java:1333)
at com.mysql.clusterj.tie.Utility.throwError(Utility.java:1319)
at com.mysql.clusterj.tie.NdbRecordScanResultDataImpl.next(NdbRecordScanResultDataImpl.java:160)
at com.mysql.clusterj.core.query.QueryDomainTypeImpl.getResultList(QueryDomainTypeImpl.java:183)
at com.mysql.clusterj.core.query.QueryImpl.getResultList(QueryImpl.java:146)
at io.hops.metadata.ndb.wrapper.HopsQuery.getResultList(HopsQuery.java:46)
... 26 more

Assignee

Salman Niazi

Reporter

Salman Niazi

Labels

None

Priority

Medium
Configure