-
Notifications
You must be signed in to change notification settings - Fork 4.1k
Description
我们用 braft + brpc 来做强一致性解决方案。目前在压力大且持续时间长时会出现死锁,通过 gdb 直接 bt 看到的线程栈主要分为两类(分别对应下面贴出的前两段栈)。
第 1 类栈在等第 2 类栈持有的锁;而第 2 类栈在执行过程中已经拿到了第 1 类所需要的锁,随后在试图往 _rq 中 push 时,由于 _rq 已满而 push 失败,于是 usleep 后重试。而 _rq 里的 task 被 pop 出来执行时,很可能也是 log_manager 的相关操作,同样需要拿同一把锁。这样就形成了循环等待,导致死锁。
我想到的解决思路有:
1. 是否有可能在 _rq 满时不再 usleep 等待入队,而是直接执行该任务?
2. 是否有可能通过某种方法,在 queue 满时通知用户进行处理?
```text
#0 0x00007f9edd212334 in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f9edd20d5d8 in _L_lock_854 () from /lib64/libpthread.so.0
#2 0x00007f9edd20d4a7 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x0000000000df637b in pthread_mutex_lock_impl (__mutex=0x7f89f75ff4e8) at xxx/deps/incubator-brpc/src/bthread/mutex.cpp:555
#4 pthread_mutex_lock (__mutex=0x7f89f75ff4e8) at xxx/deps/incubator-brpc/src/bthread/mutex.cpp:813
#5 0x0000000000d14ca0 in lock (this=0x7f89f75ff4b0, index=14541) at /usr/include/butil/synchronization/lock.h:69
#6 lock (this=0x7f89f75ff4b0, index=14541) at /usr/include/c++/4.9.2/mutex:474
#7 unique_lock (this=0x7f89f75ff4b0, index=14541) at /usr/include/c++/4.9.2/mutex:406
#8 braft::LogManager::get_term (this=0x7f89f75ff4b0, index=14541) at xxx/src/braft/log_manager.cpp:787
#9 0x0000000000d2ad2d in braft::Replicator::_fill_common_fields (this=0x7f86cc92c460, request=0x7f86a8f9d360, prev_log_index=14541, is_heartbeat=Unhandled dwarf expression opcode 0xf3
) at xxx/src/braft/replicator.cpp:496
#10 0x0000000000d3063c in braft::Replicator::_send_entries (this=0x7f86cc92c460) at xxx/src/braft/replicator.cpp:611
#11 0x0000000000d326bf in braft::Replicator::_on_rpc_returned (id=Unhandled dwarf expression opcode 0xf3
) at xxx/src/braft/replicator.cpp:489
#12 0x0000000000d33b36 in brpc::internal::FunctionClosure5<unsigned long, brpc::Controller*, braft::AppendEntriesRequest*, braft::AppendEntriesResponse*, long>::Run (this=0x7f86d16ad950) at /usr/include/brpc/callback.h:339
#13 0x0000000000e03ef8 in brpc::Controller::EndRPC (this=0x7f86d16ad5d0, info=Unhandled dwarf expression opcode 0xf3
) at xxx/deps/incubator-brpc/src/brpc/controller.cpp:893
#14 0x0000000000e05bc4 in brpc::Controller::OnVersionedRPCReturned (this=0x7f86d16ad5d0, info=..., new_bthread=Unhandled dwarf expression opcode 0xf3
) at xxx/deps/incubator-brpc/src/brpc/controller.cpp:676
#15 0x0000000000e6de9a in OnResponse (msg_base=0x7f897544b140) at xxx/deps/incubator-brpc/src/brpc/details/controller_private_accessor.h:48
#16 brpc::policy::ProcessRpcResponse (msg_base=0x7f897544b140) at xxx/deps/incubator-brpc/src/brpc/policy/baidu_rpc_protocol.cpp:618
#17 0x0000000000e5b9fa in brpc::ProcessInputMessage (void_arg=Unhandled dwarf expression opcode 0xf3
) at xxx/deps/incubator-brpc/src/brpc/input_messenger.cpp:136
#18 0x0000000000decee1 in bthread::TaskGroup::task_runner (skip_remained=Unhandled dwarf expression opcode 0xf3
) at xxx/deps/incubator-brpc/src/bthread/task_group.cpp:297
#19 0x0000000000e3d5b1 in bthread_make_fcontext ()
#20 0x0000000000000000 in ?? ()
```
```text
#0 0x00007f9edb9c1cbd in nanosleep () from /lib64/libc.so.6
#1 0x00007f9edb9f6f14 in usleep () from /lib64/libc.so.6
#2 0x0000000000deb891 in bthread::TaskGroup::ready_to_run_remote (this=0x7f8b6c0008c0, tid=142391050791690, nosignal=Unhandled dwarf expression opcode 0xf3
) at xxx/deps/incubator-brpc/src/bthread/task_group.cpp:675
#3 0x0000000000dee52a in bthread::TaskGroup::start_background (this=Unhandled dwarf expression opcode 0xf3
) at xxx/deps/incubator-brpc/src/bthread/task_group.cpp:448
#4 0x0000000000df4d6c in start_from_non_worker (tid=0x7f8aed7e9800, attr=0x7f89f7582988, fn=0xddcbd0 <bthread::ExecutionQueueBase::_execute_tasks(void*)>, arg=0x7f8511942ef0)
at xxx/deps/incubator-brpc/src/bthread/bthread.cpp:146
#5 bthread_start_background (tid=0x7f8aed7e9800, attr=0x7f89f7582988, fn=0xddcbd0 <bthread::ExecutionQueueBase::_execute_tasks(void*)>, arg=0x7f8511942ef0)
at xxx/deps/incubator-brpc/src/bthread/bthread.cpp:194
#6 0x0000000000dddb97 in bthread::ExecutionQueueBase::start_execute (this=0x7f89f75828d0, node=0x7f8511942ef0) at xxx/deps/incubator-brpc/src/bthread/execution_queue.cpp:115
#7 0x0000000000d1a898 in execute (id=<value optimized out>, task=@0x7f8aed7e98d8, options=0x0, handle=0x0) at /usr/include/bthread/execution_queue_inl.h:318
#8 bthread::execution_queue_execute<braft::LogManager::StableClosure*> (id=<value optimized out>, task=@0x7f8aed7e98d8, options=0x0, handle=0x0) at /usr/include/bthread/execution_queue_inl.h:363
#9 0x0000000000d16251 in execution_queue_execute<braft::LogManager::StableClosure*> (this=0x7f89f75ff4b0, entries=0x7f8aed7e9a70, done=0x7f84d06daa00) at /usr/include/bthread/execution_queue_inl.h:352
#10 execution_queue_execute<braft::LogManager::StableClosure*> (this=0x7f89f75ff4b0, entries=0x7f8aed7e9a70, done=0x7f84d06daa00) at /usr/include/bthread/execution_queue_inl.h:345
#11 braft::LogManager::append_entries (this=0x7f89f75ff4b0, entries=0x7f8aed7e9a70, done=0x7f84d06daa00) at xxx/src/braft/log_manager.cpp:485
#12 0x0000000000cf3821 in braft::NodeImpl::apply (this=0x7f89f75feb50, tasks=Unhandled dwarf expression opcode 0xf3
) at xxx/src/braft/node.cpp:1959
#13 0x0000000000cf3b1e in braft::NodeImpl::execute_applying_tasks (meta=0x7f89f75feb50, iter=...) at xxx/src/braft/node.cpp:724
#14 0x0000000000dda82d in bthread::ExecutionQueueBase::_execute (this=0x7f89f75827d0, head=0x7f8572168a90, high_priority=Unhandled dwarf expression opcode 0xf3
) at xxx/deps/incubator-brpc/src/bthread/execution_queue.cpp:273
#15 0x0000000000dddc08 in bthread::ExecutionQueueBase::start_execute (this=0x7f89f75827d0, node=0x7f8572168a90) at xxx/deps/incubator-brpc/src/bthread/execution_queue.cpp:95
#16 0x0000000000c3ba4c in cellar::raft::BucketStateMachine::AsyncApply (this=0x3efba20, op_type=Unhandled dwarf expression opcode 0xf3
) at bucket_state_machine.cpp:87
...
...
```
通过 gdb_bthread_stack.py 拿到的 bthread 栈,绝大部分(约 99%)都是:
```text
#0 0x0000000000dea8a8 in jump_stack (pg=0x0, next_meta=0x5a7f77d934dc6) at xxx/deps/incubator-brpc/src/bthread/stack_inl.h:133
#1 bthread::TaskGroup::sched_to (pg=0x0, next_meta=0x5a7f77d934dc6) at xxx/deps/incubator-brpc/src/bthread/task_group.cpp:605
#2 0x0000000000deadce in sched_to (pg=0x7f894ebeba08) at xxx/deps/incubator-brpc/src/bthread/task_group_inl.h:80
#3 bthread::TaskGroup::sched (pg=0x7f894ebeba08) at xxx/deps/incubator-brpc/src/bthread/task_group.cpp:563
#4 0x0000000000de1cc7 in bthread::butex_wait (arg=0x7f87bceb60c0, expected_value=1, abstime=0x0) at xxx/deps/incubator-brpc/src/bthread/butex.cpp:660
#5 0x0000000000de93a0 in bthread::CountdownEvent::wait (this=0x7f894ebebb48) at xxx/deps/incubator-brpc/src/bthread/countdown_event.cpp:65
#6 0x0000000000d13991 in wait (this=Unhandled dwarf expression opcode 0xf3
) at xxx/src/braft/log_manager.cpp:164
#7 braft::LogManager::last_log_id (this=Unhandled dwarf expression opcode 0xf3
) at xxx/src/braft/log_manager.cpp:201
#8 0x0000000000cf1d7a in braft::NodeImpl::handle_pre_vote_request (this=0x7f89d42a3c30, request=0x7f87bc3c30a0, response=0x7f87bd29ef20) at xxx/src/braft/node.cpp:2027
#9 0x0000000000d7d892 in braft::RaftServiceImpl::pre_vote (this=Unhandled dwarf expression opcode 0xf3
) at xxx/src/braft/raft_service.cpp:62
#10 0x0000000000d67c6b in braft::RaftService::CallMethod (this=Unhandled dwarf expression opcode 0xf3
) at xxx/bld/braft/raft.pb.cc:5130
#11 0x0000000000e70cab in brpc::policy::ProcessRpcRequest (msg_base=Unhandled dwarf expression opcode 0xf3
) at xxx/deps/incubator-brpc/src/brpc/policy/baidu_rpc_protocol.cpp:499
#12 0x0000000000e5b9fa in brpc::ProcessInputMessage (void_arg=Unhandled dwarf expression opcode 0xf3
) at xxx/deps/incubator-brpc/src/brpc/input_messenger.cpp:136
#13 0x0000000000decee1 in bthread::TaskGroup::task_runner (skip_remained=Unhandled dwarf expression opcode 0xf3
) at xxx/deps/incubator-brpc/src/bthread/task_group.cpp:297
#14 0x0000000000e3d5b1 in bthread_make_fcontext ()
#15 0x0000000000000000 in ?? ()
```