mirror of
https://github.com/bitcoin/bitcoin.git
synced 2025-02-10 10:52:31 -05:00
Merge bitcoin/bitcoin#25880: p2p: Make stalling timeout adaptive during IBD
39b93649c4
test: add functional test for IBD stalling logic (Martin Zumsande)0565951f34
p2p: Make block stalling timeout adaptive (Martin Zumsande) Pull request description: During IBD, there is the following stalling mechanism if we can't proceed with assigning blocks from a 1024 lookahead window because all of these blocks are either already downloaded or in-flight: We'll mark the peer from which we expect the current block that would allow us to advance our tip (and thereby move the 1024 window ahead) as a possible staller. We then give this peer 2 more seconds to deliver a block (`BLOCK_STALLING_TIMEOUT`) and if it doesn't, disconnect it and assign the critical block we need to another peer. Now the problem is that this second peer is immediately marked as a potential staller using the same mechanism and given 2 seconds as well - if our own connection is so slow that it simply takes us more than 2 seconds to download this block, that peer will also be disconnected (and so on...), leading to repeated disconnections and no progress in IBD. This has been described in #9213, and I have observed this when doing IBD on slower connections or with Tor - sometimes there would be several minutes without progress, where all we did was disconnect peers and find new ones. The `2s` stalling timeout was introduced in #4468, when blocks weren't full and before Segwit increased the maximum possible physical size of blocks - so I think it made a lot of sense back then. But it would be good to revisit this timeout now. This PR makes the timout adaptive (idea by sipa): If we disconnect a peer for stalling, we now double the timeout for the next peer (up to a maximum of 64s). If we connect a block, we half it again up to the old value of 2 seconds. That way, peers that are comparatively slower will still get disconnected, but long phases of disconnecting all peers shouldn't happen anymore. Fixes #9213 ACKs for top commit: achow101: ACK39b93649c4
RandyMcMillan: Strong Concept ACK39b93649c4
vasild: ACK39b93649c4
naumenkogs: ACK39b93649c4
Tree-SHA512: 85bc57093b2fb1d28d7409ed8df5a91543909405907bc129de7c6285d0810dd79bc05219e4d5aefcb55c85512b0ad5bed43a4114a17e46c35b9a3f9a983d5754
This commit is contained in:
commit
835212cd1d
3 changed files with 193 additions and 4 deletions
|
@ -113,8 +113,11 @@ static constexpr auto GETDATA_TX_INTERVAL{60s};
|
||||||
static const unsigned int MAX_GETDATA_SZ = 1000;
|
static const unsigned int MAX_GETDATA_SZ = 1000;
|
||||||
/** Number of blocks that can be requested at any given time from a single peer. */
|
/** Number of blocks that can be requested at any given time from a single peer. */
|
||||||
static const int MAX_BLOCKS_IN_TRANSIT_PER_PEER = 16;
|
static const int MAX_BLOCKS_IN_TRANSIT_PER_PEER = 16;
|
||||||
/** Time during which a peer must stall block download progress before being disconnected. */
|
/** Default time during which a peer must stall block download progress before being disconnected.
|
||||||
static constexpr auto BLOCK_STALLING_TIMEOUT{2s};
|
* the actual timeout is increased temporarily if peers are disconnected for hitting the timeout */
|
||||||
|
static constexpr auto BLOCK_STALLING_TIMEOUT_DEFAULT{2s};
|
||||||
|
/** Maximum timeout for stalling block download. */
|
||||||
|
static constexpr auto BLOCK_STALLING_TIMEOUT_MAX{64s};
|
||||||
/** Number of headers sent in one getheaders result. We rely on the assumption that if a peer sends
|
/** Number of headers sent in one getheaders result. We rely on the assumption that if a peer sends
|
||||||
* less than this number, we reached its tip. Changing this value is a protocol upgrade. */
|
* less than this number, we reached its tip. Changing this value is a protocol upgrade. */
|
||||||
static const unsigned int MAX_HEADERS_RESULTS = 2000;
|
static const unsigned int MAX_HEADERS_RESULTS = 2000;
|
||||||
|
@ -774,6 +777,9 @@ private:
|
||||||
/** Number of preferable block download peers. */
|
/** Number of preferable block download peers. */
|
||||||
int m_num_preferred_download_peers GUARDED_BY(cs_main){0};
|
int m_num_preferred_download_peers GUARDED_BY(cs_main){0};
|
||||||
|
|
||||||
|
/** Stalling timeout for blocks in IBD */
|
||||||
|
std::atomic<std::chrono::seconds> m_block_stalling_timeout{BLOCK_STALLING_TIMEOUT_DEFAULT};
|
||||||
|
|
||||||
bool AlreadyHaveTx(const GenTxid& gtxid)
|
bool AlreadyHaveTx(const GenTxid& gtxid)
|
||||||
EXCLUSIVE_LOCKS_REQUIRED(cs_main, !m_recent_confirmed_transactions_mutex);
|
EXCLUSIVE_LOCKS_REQUIRED(cs_main, !m_recent_confirmed_transactions_mutex);
|
||||||
|
|
||||||
|
@ -1812,7 +1818,8 @@ void PeerManagerImpl::StartScheduledTasks(CScheduler& scheduler)
|
||||||
/**
|
/**
|
||||||
* Evict orphan txn pool entries based on a newly connected
|
* Evict orphan txn pool entries based on a newly connected
|
||||||
* block, remember the recently confirmed transactions, and delete tracked
|
* block, remember the recently confirmed transactions, and delete tracked
|
||||||
* announcements for them. Also save the time of the last tip update.
|
* announcements for them. Also save the time of the last tip update and
|
||||||
|
* possibly reduce dynamic block stalling timeout.
|
||||||
*/
|
*/
|
||||||
void PeerManagerImpl::BlockConnected(const std::shared_ptr<const CBlock>& pblock, const CBlockIndex* pindex)
|
void PeerManagerImpl::BlockConnected(const std::shared_ptr<const CBlock>& pblock, const CBlockIndex* pindex)
|
||||||
{
|
{
|
||||||
|
@ -1835,6 +1842,16 @@ void PeerManagerImpl::BlockConnected(const std::shared_ptr<const CBlock>& pblock
|
||||||
m_txrequest.ForgetTxHash(ptx->GetWitnessHash());
|
m_txrequest.ForgetTxHash(ptx->GetWitnessHash());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// In case the dynamic timeout was doubled once or more, reduce it slowly back to its default value
|
||||||
|
auto stalling_timeout = m_block_stalling_timeout.load();
|
||||||
|
Assume(stalling_timeout >= BLOCK_STALLING_TIMEOUT_DEFAULT);
|
||||||
|
if (stalling_timeout != BLOCK_STALLING_TIMEOUT_DEFAULT) {
|
||||||
|
const auto new_timeout = std::max(std::chrono::duration_cast<std::chrono::seconds>(stalling_timeout * 0.85), BLOCK_STALLING_TIMEOUT_DEFAULT);
|
||||||
|
if (m_block_stalling_timeout.compare_exchange_strong(stalling_timeout, new_timeout)) {
|
||||||
|
LogPrint(BCLog::NET, "Decreased stalling timeout to %d seconds\n", new_timeout.count());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void PeerManagerImpl::BlockDisconnected(const std::shared_ptr<const CBlock> &block, const CBlockIndex* pindex)
|
void PeerManagerImpl::BlockDisconnected(const std::shared_ptr<const CBlock> &block, const CBlockIndex* pindex)
|
||||||
|
@ -5713,12 +5730,19 @@ bool PeerManagerImpl::SendMessages(CNode* pto)
|
||||||
m_connman.PushMessage(pto, msgMaker.Make(NetMsgType::INV, vInv));
|
m_connman.PushMessage(pto, msgMaker.Make(NetMsgType::INV, vInv));
|
||||||
|
|
||||||
// Detect whether we're stalling
|
// Detect whether we're stalling
|
||||||
if (state.m_stalling_since.count() && state.m_stalling_since < current_time - BLOCK_STALLING_TIMEOUT) {
|
auto stalling_timeout = m_block_stalling_timeout.load();
|
||||||
|
if (state.m_stalling_since.count() && state.m_stalling_since < current_time - stalling_timeout) {
|
||||||
// Stalling only triggers when the block download window cannot move. During normal steady state,
|
// Stalling only triggers when the block download window cannot move. During normal steady state,
|
||||||
// the download window should be much larger than the to-be-downloaded set of blocks, so disconnection
|
// the download window should be much larger than the to-be-downloaded set of blocks, so disconnection
|
||||||
// should only happen during initial block download.
|
// should only happen during initial block download.
|
||||||
LogPrintf("Peer=%d is stalling block download, disconnecting\n", pto->GetId());
|
LogPrintf("Peer=%d is stalling block download, disconnecting\n", pto->GetId());
|
||||||
pto->fDisconnect = true;
|
pto->fDisconnect = true;
|
||||||
|
// Increase timeout for the next peer so that we don't disconnect multiple peers if our own
|
||||||
|
// bandwidth is insufficient.
|
||||||
|
const auto new_timeout = std::min(2 * stalling_timeout, BLOCK_STALLING_TIMEOUT_MAX);
|
||||||
|
if (stalling_timeout != new_timeout && m_block_stalling_timeout.compare_exchange_strong(stalling_timeout, new_timeout)) {
|
||||||
|
LogPrint(BCLog::NET, "Increased stalling timeout temporarily to %d seconds\n", m_block_stalling_timeout.load().count());
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
// In case there is a block that has been in flight from this peer for block_interval * (1 + 0.5 * N)
|
// In case there is a block that has been in flight from this peer for block_interval * (1 + 0.5 * N)
|
||||||
|
|
164
test/functional/p2p_ibd_stalling.py
Executable file
164
test/functional/p2p_ibd_stalling.py
Executable file
|
@ -0,0 +1,164 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright (c) 2022- The Bitcoin Core developers
|
||||||
|
# Distributed under the MIT software license, see the accompanying
|
||||||
|
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
|
||||||
|
"""
|
||||||
|
Test stalling logic during IBD
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
|
||||||
|
from test_framework.blocktools import (
|
||||||
|
create_block,
|
||||||
|
create_coinbase
|
||||||
|
)
|
||||||
|
from test_framework.messages import (
|
||||||
|
MSG_BLOCK,
|
||||||
|
MSG_TYPE_MASK,
|
||||||
|
)
|
||||||
|
from test_framework.p2p import (
|
||||||
|
CBlockHeader,
|
||||||
|
msg_block,
|
||||||
|
msg_headers,
|
||||||
|
P2PDataStore,
|
||||||
|
)
|
||||||
|
from test_framework.test_framework import BitcoinTestFramework
|
||||||
|
from test_framework.util import (
|
||||||
|
assert_equal,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class P2PStaller(P2PDataStore):
|
||||||
|
def __init__(self, stall_block):
|
||||||
|
self.stall_block = stall_block
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
def on_getdata(self, message):
|
||||||
|
for inv in message.inv:
|
||||||
|
self.getdata_requests.append(inv.hash)
|
||||||
|
if (inv.type & MSG_TYPE_MASK) == MSG_BLOCK:
|
||||||
|
if (inv.hash != self.stall_block):
|
||||||
|
self.send_message(msg_block(self.block_store[inv.hash]))
|
||||||
|
|
||||||
|
def on_getheaders(self, message):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class P2PIBDStallingTest(BitcoinTestFramework):
|
||||||
|
def set_test_params(self):
|
||||||
|
self.setup_clean_chain = True
|
||||||
|
self.num_nodes = 1
|
||||||
|
|
||||||
|
def run_test(self):
|
||||||
|
NUM_BLOCKS = 1025
|
||||||
|
NUM_PEERS = 4
|
||||||
|
node = self.nodes[0]
|
||||||
|
tip = int(node.getbestblockhash(), 16)
|
||||||
|
blocks = []
|
||||||
|
height = 1
|
||||||
|
block_time = node.getblock(node.getbestblockhash())['time'] + 1
|
||||||
|
self.log.info("Prepare blocks without sending them to the node")
|
||||||
|
block_dict = {}
|
||||||
|
for _ in range(NUM_BLOCKS):
|
||||||
|
blocks.append(create_block(tip, create_coinbase(height), block_time))
|
||||||
|
blocks[-1].solve()
|
||||||
|
tip = blocks[-1].sha256
|
||||||
|
block_time += 1
|
||||||
|
height += 1
|
||||||
|
block_dict[blocks[-1].sha256] = blocks[-1]
|
||||||
|
stall_block = blocks[0].sha256
|
||||||
|
|
||||||
|
headers_message = msg_headers()
|
||||||
|
headers_message.headers = [CBlockHeader(b) for b in blocks[:NUM_BLOCKS-1]]
|
||||||
|
peers = []
|
||||||
|
|
||||||
|
self.log.info("Check that a staller does not get disconnected if the 1024 block lookahead buffer is filled")
|
||||||
|
for id in range(NUM_PEERS):
|
||||||
|
peers.append(node.add_outbound_p2p_connection(P2PStaller(stall_block), p2p_idx=id, connection_type="outbound-full-relay"))
|
||||||
|
peers[-1].block_store = block_dict
|
||||||
|
peers[-1].send_message(headers_message)
|
||||||
|
|
||||||
|
# Need to wait until 1023 blocks are received - the magic total bytes number is a workaround in lack of an rpc
|
||||||
|
# returning the number of downloaded (but not connected) blocks.
|
||||||
|
self.wait_until(lambda: self.total_bytes_recv_for_blocks() == 172761)
|
||||||
|
|
||||||
|
self.all_sync_send_with_ping(peers)
|
||||||
|
# If there was a peer marked for stalling, it would get disconnected
|
||||||
|
self.mocktime = int(time.time()) + 3
|
||||||
|
node.setmocktime(self.mocktime)
|
||||||
|
self.all_sync_send_with_ping(peers)
|
||||||
|
assert_equal(node.num_test_p2p_connections(), NUM_PEERS)
|
||||||
|
|
||||||
|
self.log.info("Check that increasing the window beyond 1024 blocks triggers stalling logic")
|
||||||
|
headers_message.headers = [CBlockHeader(b) for b in blocks]
|
||||||
|
with node.assert_debug_log(expected_msgs=['Stall started']):
|
||||||
|
for p in peers:
|
||||||
|
p.send_message(headers_message)
|
||||||
|
self.all_sync_send_with_ping(peers)
|
||||||
|
|
||||||
|
self.log.info("Check that the stalling peer is disconnected after 2 seconds")
|
||||||
|
self.mocktime += 3
|
||||||
|
node.setmocktime(self.mocktime)
|
||||||
|
peers[0].wait_for_disconnect()
|
||||||
|
assert_equal(node.num_test_p2p_connections(), NUM_PEERS - 1)
|
||||||
|
self.wait_until(lambda: self.is_block_requested(peers, stall_block))
|
||||||
|
# Make sure that SendMessages() is invoked, which assigns the missing block
|
||||||
|
# to another peer and starts the stalling logic for them
|
||||||
|
self.all_sync_send_with_ping(peers)
|
||||||
|
|
||||||
|
self.log.info("Check that the stalling timeout gets doubled to 4 seconds for the next staller")
|
||||||
|
# No disconnect after just 3 seconds
|
||||||
|
self.mocktime += 3
|
||||||
|
node.setmocktime(self.mocktime)
|
||||||
|
self.all_sync_send_with_ping(peers)
|
||||||
|
assert_equal(node.num_test_p2p_connections(), NUM_PEERS - 1)
|
||||||
|
|
||||||
|
self.mocktime += 2
|
||||||
|
node.setmocktime(self.mocktime)
|
||||||
|
self.wait_until(lambda: node.num_test_p2p_connections() == NUM_PEERS - 2)
|
||||||
|
self.wait_until(lambda: self.is_block_requested(peers, stall_block))
|
||||||
|
self.all_sync_send_with_ping(peers)
|
||||||
|
|
||||||
|
self.log.info("Check that the stalling timeout gets doubled to 8 seconds for the next staller")
|
||||||
|
# No disconnect after just 7 seconds
|
||||||
|
self.mocktime += 7
|
||||||
|
node.setmocktime(self.mocktime)
|
||||||
|
self.all_sync_send_with_ping(peers)
|
||||||
|
assert_equal(node.num_test_p2p_connections(), NUM_PEERS - 2)
|
||||||
|
|
||||||
|
self.mocktime += 2
|
||||||
|
node.setmocktime(self.mocktime)
|
||||||
|
self.wait_until(lambda: node.num_test_p2p_connections() == NUM_PEERS - 3)
|
||||||
|
self.wait_until(lambda: self.is_block_requested(peers, stall_block))
|
||||||
|
self.all_sync_send_with_ping(peers)
|
||||||
|
|
||||||
|
self.log.info("Provide the withheld block and check that stalling timeout gets reduced back to 2 seconds")
|
||||||
|
with node.assert_debug_log(expected_msgs=['Decreased stalling timeout to 2 seconds']):
|
||||||
|
for p in peers:
|
||||||
|
if p.is_connected and (stall_block in p.getdata_requests):
|
||||||
|
p.send_message(msg_block(block_dict[stall_block]))
|
||||||
|
|
||||||
|
self.log.info("Check that all outstanding blocks get connected")
|
||||||
|
self.wait_until(lambda: node.getblockcount() == NUM_BLOCKS)
|
||||||
|
|
||||||
|
def total_bytes_recv_for_blocks(self):
|
||||||
|
total = 0
|
||||||
|
for info in self.nodes[0].getpeerinfo():
|
||||||
|
if ("block" in info["bytesrecv_per_msg"].keys()):
|
||||||
|
total += info["bytesrecv_per_msg"]["block"]
|
||||||
|
return total
|
||||||
|
|
||||||
|
def all_sync_send_with_ping(self, peers):
|
||||||
|
for p in peers:
|
||||||
|
if p.is_connected:
|
||||||
|
p.sync_send_with_ping()
|
||||||
|
|
||||||
|
def is_block_requested(self, peers, hash):
|
||||||
|
for p in peers:
|
||||||
|
if p.is_connected and (hash in p.getdata_requests):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
P2PIBDStallingTest().main()
|
|
@ -254,6 +254,7 @@ BASE_SCRIPTS = [
|
||||||
'wallet_importprunedfunds.py --descriptors',
|
'wallet_importprunedfunds.py --descriptors',
|
||||||
'p2p_leak_tx.py',
|
'p2p_leak_tx.py',
|
||||||
'p2p_eviction.py',
|
'p2p_eviction.py',
|
||||||
|
'p2p_ibd_stalling.py',
|
||||||
'wallet_signmessagewithaddress.py',
|
'wallet_signmessagewithaddress.py',
|
||||||
'rpc_signmessagewithprivkey.py',
|
'rpc_signmessagewithprivkey.py',
|
||||||
'rpc_generate.py',
|
'rpc_generate.py',
|
||||||
|
|
Loading…
Add table
Reference in a new issue