Skip to content

Commit

Permalink
raftstore: fix a bug where snapshot file is not deleted in time (#16977)
Browse files Browse the repository at this point in the history
close #16976

The issue is that if a leader fails to send a snapshot, it doesn't delete the 
snapshot file which may accumulate and waste disk space. The idle snapshot 
files will eventually be GCed, but that might take hours. The commit ensures 
that the snapshot file is also deleted on the error path. A new integration 
test was added which would fail without the fix.

Signed-off-by: Bisheng Huang <hbisheng@gmail.com>

Co-authored-by: lucasliang <nkcs_lykx@hotmail.com>
Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com>
  • Loading branch information
3 people committed May 16, 2024
1 parent 2a865bb commit 3c2cbcf
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 2 deletions.
8 changes: 6 additions & 2 deletions src/server/snap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,9 @@ pub fn send_snap(

#[cfg(feature = "failpoints")]
{
fail::fail_point!("snap_send_error", |_| {
Err(Error::Other(box_err!("snap_send_error")))
});
let should_delay = (|| {
fail::fail_point!("snap_send_timer_delay", |_| { true });
false
Expand Down Expand Up @@ -247,10 +250,11 @@ pub fn send_snap(
send_timer.observe_duration();
drop(deregister);
drop(client);

fail_point!("snapshot_delete_after_send");
mgr.delete_snapshot(&key, &chunks.snap, true);
match recv_result {
Ok(_) => {
fail_point!("snapshot_delete_after_send");
mgr.delete_snapshot(&key, &chunks.snap, true);
let cost = UnixSecs::now().into_inner().saturating_sub(snap_start);
let send_duration_sec = timer.saturating_elapsed().as_secs();
// it should ignore if the duration of snapshot is less than 1s to decrease the
Expand Down
33 changes: 33 additions & 0 deletions tests/failpoints/cases/test_snap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1047,3 +1047,36 @@ fn test_send_snapshot_timeout() {
fail::remove("snap_send_timer_delay");
fail::remove("snap_send_duration_timeout");
}

// Test that snapshots that failed to be sent are cleaned up.
#[test]
fn test_snapshot_cleanup_on_send_error() {
let mut cluster = new_server_cluster(0, 3);
let pd_client = cluster.pd_client.clone();
pd_client.disable_default_operator();

let r = cluster.run_conf_change();

pd_client.must_add_peer(r, new_peer(2, 2));
cluster.must_put(b"k1", b"v1");

// Simulate an error on the gprc stream so that the leader fails to send
// the first snapshot to peer 3.
fail::cfg("snap_send_error", "return").unwrap();
pd_client.must_add_peer(r, new_peer(3, 3));

// Snapshot files are identified by a snap key which consists of region,
// term, and applied index. By performing a new put here, we advance the
// applied index on the leader and make it generate a new snapshot file.
cluster.must_put(b"k2", b"v2");

// After removing the failpoint, the new snapshot will be sent to peer 3,
// but we want to make sure that the old snapshot gets cleaned up as well.
fail::remove("snap_send_error");
cluster.must_put(b"k3", b"v3");

must_get_equal(&cluster.get_engine(3), b"k3", b"v3");

// Check that all snapshots on the leader have been deleted.
must_empty_dir(cluster.get_snap_dir(1));
}

0 comments on commit 3c2cbcf

Please sign in to comment.