Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -875,7 +875,7 @@ See [DESIGN.md](DESIGN.md#guest-agent) for details.
| `RUST_LOG` | `warn` | Logging level (quiet by default; use `info` or `debug` for verbose) |
| `FCVM_NO_SNAPSHOT` | unset | Set to `1` to disable automatic snapshot creation (same as `--no-snapshot` flag) |
| `FCVM_NO_WRITEBACK_CACHE` | unset | Set to `1` to disable FUSE writeback cache (see below) |
| `FCVM_NO_XATTR_FASTPATH` | unset | Set to `1` to disable security.capability xattr fast path |


### FUSE Writeback Cache

Expand Down
65 changes: 29 additions & 36 deletions fc-agent/src/fuse/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,29 +74,32 @@ fn get_trace_rate() -> u64 {
0
}

/// Set FCVM_FUSE_MAX_WRITE from kernel boot param if not already set.
/// This propagates the max_write limit to the fuse-pipe client which reads from env.
/// Used to limit write sizes in nested VMs to avoid vsock data corruption.
fn propagate_max_write_from_cmdline() {
// Skip if already set in environment
if std::env::var("FCVM_FUSE_MAX_WRITE").is_ok() {
return;
/// Get max_write from FCVM_FUSE_MAX_WRITE env var or kernel boot param (0 = unbounded).
/// Checks (in order):
/// 1. FCVM_FUSE_MAX_WRITE environment variable
/// 2. fuse_max_write=N kernel boot parameter (from /proc/cmdline)
/// 3. Default: 0 (unbounded)
fn get_max_write() -> u32 {
// First check environment variable
if let Some(n) = std::env::var("FCVM_FUSE_MAX_WRITE")
.ok()
.and_then(|s| s.parse().ok())
{
return n;
}

// Check kernel command line for fuse_max_write=N
// Then check kernel command line
if let Ok(cmdline) = std::fs::read_to_string("/proc/cmdline") {
for part in cmdline.split_whitespace() {
if let Some(value) = part.strip_prefix("fuse_max_write=") {
// Set as environment variable for fuse-pipe to pick up
std::env::set_var("FCVM_FUSE_MAX_WRITE", value);
eprintln!(
"[fc-agent] set FCVM_FUSE_MAX_WRITE={} from kernel cmdline",
value
);
return;
if let Ok(n) = value.parse() {
return n;
}
}
}
}

0
}

/// Set FCVM_NO_WRITEBACK_CACHE from kernel boot param if not already set.
Expand Down Expand Up @@ -132,31 +135,21 @@ fn propagate_no_writeback_cache_from_cmdline() {
/// * `mount_point` - The path where the filesystem will be mounted
pub fn mount_vsock(port: u32, mount_point: &str) -> anyhow::Result<()> {
// Propagate config from kernel cmdline to env vars for fuse-pipe
propagate_max_write_from_cmdline();
propagate_no_writeback_cache_from_cmdline();

let num_readers = get_num_readers();
let trace_rate = get_trace_rate();
let max_write = get_max_write();
eprintln!(
"[fc-agent] mounting FUSE volume at {} via vsock port {} ({} readers, trace_rate={})",
mount_point, port, num_readers, trace_rate
);
fuse_pipe::mount_vsock_with_options(HOST_CID, port, mount_point, num_readers, trace_rate)
}

/// Mount a FUSE filesystem with multiple reader threads.
///
/// Same as `mount_vsock` but creates multiple FUSE reader threads for
/// better parallel performance.
#[allow(dead_code)]
pub fn mount_vsock_with_readers(
port: u32,
mount_point: &str,
num_readers: usize,
) -> anyhow::Result<()> {
eprintln!(
"[fc-agent] mounting FUSE volume at {} via vsock port {} ({} readers)",
mount_point, port, num_readers
"[fc-agent] mounting FUSE volume at {} via vsock port {} ({} readers, trace_rate={}, max_write={})",
mount_point, port, num_readers, trace_rate, max_write
);
fuse_pipe::mount_vsock_with_readers(HOST_CID, port, mount_point, num_readers)
fuse_pipe::mount_vsock_with_options(
HOST_CID,
port,
mount_point,
num_readers,
trace_rate,
max_write,
)
}
50 changes: 15 additions & 35 deletions fuse-pipe/src/client/fuse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,23 +57,31 @@ pub struct FuseClient {
init_callback: Mutex<Option<InitCallback>>,
/// Shared flag set by destroy() to signal clean shutdown to reader threads
destroyed: Arc<AtomicBool>,
/// Maximum write size (0 = unbounded). Passed explicitly to avoid env var races.
max_write: u32,
}

impl FuseClient {
/// Create a new client using shared multiplexer.
pub fn new(mux: Arc<Multiplexer>) -> Self {
Self::with_destroyed_flag(mux, Arc::new(AtomicBool::new(false)))
Self::with_options(mux, Arc::new(AtomicBool::new(false)), 0)
}

/// Create a new client with a shared destroyed flag.
///
/// The destroyed flag is set by `destroy()` when the filesystem is unmounted.
/// Reader threads can check this flag to distinguish clean shutdown from errors.
pub fn with_destroyed_flag(mux: Arc<Multiplexer>, destroyed: Arc<AtomicBool>) -> Self {
Self::with_options(mux, destroyed, 0)
}

/// Create a new client with a shared destroyed flag and max_write limit.
pub fn with_options(mux: Arc<Multiplexer>, destroyed: Arc<AtomicBool>, max_write: u32) -> Self {
Self {
mux,
init_callback: Mutex::new(None),
destroyed,
max_write,
}
}

Expand All @@ -87,6 +95,7 @@ impl FuseClient {
mux,
init_callback: Mutex::new(Some(callback)),
destroyed,
max_write: 0,
}
}

Expand Down Expand Up @@ -204,12 +213,6 @@ fn protocol_file_type_to_fuser(ft: u8) -> FileType {
}
}

/// Default max_write size for FUSE operations (0 = unbounded, use kernel default).
///
/// For nested virtualization (L2 VMs), set FCVM_FUSE_MAX_WRITE=32768 to avoid
/// vsock data loss due to cache coherency issues in double Stage 2 translation.
const DEFAULT_FUSE_MAX_WRITE: u32 = 0;

impl Filesystem for FuseClient {
fn init(&mut self, _req: &Request, config: &mut fuser::KernelConfig) -> Result<(), io::Error> {
// Enable writeback cache for better write performance (kernel batches writes).
Expand Down Expand Up @@ -252,11 +255,8 @@ impl Filesystem for FuseClient {
}

// Limit max_write to avoid vsock data loss under nested virtualization.
// Override with FCVM_FUSE_MAX_WRITE env var (0 = unbounded).
let max_write = std::env::var("FCVM_FUSE_MAX_WRITE")
.ok()
.and_then(|v| v.parse::<u32>().ok())
.unwrap_or(DEFAULT_FUSE_MAX_WRITE);
// Passed explicitly via mount_vsock_with_options to avoid env var races.
let max_write = self.max_write;

if max_write > 0 {
if let Err(max) = config.set_max_write(max_write) {
Expand Down Expand Up @@ -605,7 +605,7 @@ impl Filesystem for FuseClient {
});

match response {
VolumeResponse::Written { size } => reply.written(size),
VolumeResponse::Written { size } => reply.written(size as u32),
VolumeResponse::Error { errno } => reply.error(Errno::from_i32(errno)),
_ => reply.error(Errno::EIO),
}
Expand Down Expand Up @@ -990,26 +990,6 @@ impl Filesystem for FuseClient {
}

fn getxattr(&self, req: &Request, ino: INodeNo, name: &OsStr, size: u32, reply: ReplyXattr) {
// Fast path: The kernel calls getxattr("security.capability") on every write
// to check if file capabilities need to be cleared. This is extremely common
// and almost always returns ENODATA (no capabilities set). Short-circuit this
// to avoid the expensive server round-trip (~32µs savings per write).
//
// This is safe because:
// 1. If capabilities ARE set, they're preserved (we'd need setxattr to clear)
// 2. The kernel's capability check is advisory - it clears caps on successful write
// 3. Container workloads rarely use file capabilities
//
// Can be disabled via FCVM_NO_XATTR_FASTPATH=1 for debugging.
if std::env::var("FCVM_NO_XATTR_FASTPATH").is_err() {
if let Some(name_str) = name.to_str() {
if name_str == "security.capability" {
reply.error(Errno::ENODATA);
return;
}
}
}

let response = self.send_request_sync(VolumeRequest::Getxattr {
ino: ino.into(),
name: name.to_string_lossy().to_string(),
Expand Down Expand Up @@ -1198,7 +1178,7 @@ impl Filesystem for FuseClient {
});

match response {
VolumeResponse::Written { size } => reply.written(size),
VolumeResponse::Written { size } => reply.written(size as u32),
VolumeResponse::Error { errno } => reply.error(Errno::from_i32(errno)),
_ => reply.error(Errno::EIO),
}
Expand Down Expand Up @@ -1241,7 +1221,7 @@ impl Filesystem for FuseClient {
);

match response {
VolumeResponse::Written { size } => reply.written(size),
VolumeResponse::Written { size } => reply.written(size as u32),
VolumeResponse::Error { errno } => reply.error(Errno::from_i32(errno)),
_ => reply.error(Errno::EIO),
}
Expand Down
8 changes: 5 additions & 3 deletions fuse-pipe/src/client/mount.rs
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ fn mount_internal<P: AsRef<Path>>(
/// ```
#[cfg(target_os = "linux")]
pub fn mount_vsock<P: AsRef<Path>>(cid: u32, port: u32, mount_point: P) -> anyhow::Result<()> {
mount_vsock_with_options(cid, port, mount_point, 1, 0)
mount_vsock_with_options(cid, port, mount_point, 1, 0, 0)
}

/// Mount a FUSE filesystem via vsock with multiple reader threads.
Expand All @@ -435,7 +435,7 @@ pub fn mount_vsock_with_readers<P: AsRef<Path>>(
mount_point: P,
num_readers: usize,
) -> anyhow::Result<()> {
mount_vsock_with_options(cid, port, mount_point, num_readers, 0)
mount_vsock_with_options(cid, port, mount_point, num_readers, 0, 0)
}

/// Mount a FUSE filesystem via vsock with full configuration.
Expand All @@ -447,13 +447,15 @@ pub fn mount_vsock_with_readers<P: AsRef<Path>>(
/// * `mount_point` - Directory where the FUSE filesystem will be mounted
/// * `num_readers` - Number of FUSE reader threads (1-8 recommended)
/// * `trace_rate` - Trace every Nth request (0 = disabled)
/// * `max_write` - Maximum write size in bytes (0 = unbounded, use kernel default)
#[cfg(target_os = "linux")]
pub fn mount_vsock_with_options<P: AsRef<Path>>(
cid: u32,
port: u32,
mount_point: P,
num_readers: usize,
trace_rate: u64,
max_write: u32,
) -> anyhow::Result<()> {
info!(target: "fuse-pipe::client", cid, port, num_readers, "connecting via vsock");

Expand Down Expand Up @@ -516,7 +518,7 @@ pub fn mount_vsock_with_options<P: AsRef<Path>>(
let mut session = None;
let mut last_error = None;
for attempt in 0..=SESSION_NEW_MAX_RETRIES {
let fs = FuseClient::with_destroyed_flag(Arc::clone(&mux), Arc::clone(&destroyed));
let fs = FuseClient::with_options(Arc::clone(&mux), Arc::clone(&destroyed), max_write);
match fuser::Session::new(fs, mount_point.as_ref(), &config) {
Ok(s) => {
if attempt > 0 {
Expand Down
2 changes: 1 addition & 1 deletion fuse-pipe/src/protocol/response.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ pub enum VolumeResponse {
Data { data: Vec<u8> },

/// Number of bytes written.
Written { size: u32 },
Written { size: u64 },

/// File opened response.
Opened { fh: u64, flags: u32 },
Expand Down
29 changes: 16 additions & 13 deletions fuse-pipe/src/server/passthrough.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,15 +210,18 @@ impl FilesystemHandler for PassthroughFs {
*g.borrow_mut() = groups_opt;
});

// Dispatch to the default handler
let result = self.handle_request(request);

// Clear thread-local
CURRENT_GROUPS.with(|g| {
*g.borrow_mut() = None;
});
// Use a Drop guard to ensure thread-local is cleared even on panic
struct GroupsGuard;
impl Drop for GroupsGuard {
fn drop(&mut self) {
CURRENT_GROUPS.with(|g| {
*g.borrow_mut() = None;
});
}
}
let _guard = GroupsGuard;

result
self.handle_request(request)
}

fn lookup(&self, parent: u64, name: &str, uid: u32, gid: u32, pid: u32) -> VolumeResponse {
Expand Down Expand Up @@ -656,7 +659,7 @@ impl FilesystemHandler for PassthroughFs {
) {
Ok(n) => {
tracing::debug!(target: "passthrough", fh, written = n, "write succeeded");
VolumeResponse::Written { size: n as u32 }
VolumeResponse::Written { size: n as u64 }
}
Err(e) => {
tracing::debug!(target: "passthrough", fh, error = ?e, "write failed");
Expand Down Expand Up @@ -1149,7 +1152,7 @@ impl FilesystemHandler for PassthroughFs {
) {
Ok(n) => {
tracing::debug!(target: "passthrough", copied = n, "copy_file_range succeeded");
VolumeResponse::Written { size: n as u32 }
VolumeResponse::Written { size: n as u64 }
}
Err(e) => {
tracing::debug!(target: "passthrough", error = ?e, "copy_file_range failed");
Expand Down Expand Up @@ -1190,7 +1193,7 @@ impl FilesystemHandler for PassthroughFs {
) {
Ok(n) => {
tracing::debug!(target: "passthrough", cloned = n, "remap_file_range succeeded");
VolumeResponse::Written { size: n as u32 }
VolumeResponse::Written { size: n as u64 }
}
Err(e) => {
tracing::debug!(target: "passthrough", error = ?e, "remap_file_range failed");
Expand Down Expand Up @@ -1607,7 +1610,7 @@ mod tests {
// For whole-file clone (len=0), we return the file size on success
assert_eq!(
size,
test_data.len() as u32,
test_data.len() as u64,
"FICLONE should return file size for whole file (len=0)"
);

Expand Down Expand Up @@ -1726,7 +1729,7 @@ mod tests {
match resp {
VolumeResponse::Written { size } => {
eprintln!("FICLONERANGE succeeded, size={}", size);
assert_eq!(size, block_size as u32, "should clone requested size");
assert_eq!(size, block_size as u64, "should clone requested size");

// Verify: first block of dest should equal second block of source
let resp = fs.read(dst_ino, dst_fh, 0, block_size as u32, uid, gid, 0);
Expand Down
10 changes: 6 additions & 4 deletions src/commands/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -552,12 +552,14 @@ pub async fn restore_from_snapshot(
}

// Verify TAP device was created successfully
let verify_cmd = format!("ip link show {} >/dev/null 2>&1", tap_device);
let verify_output = tokio::process::Command::new(&nsenter_prefix[0])
.args(&nsenter_prefix[1..])
.arg("bash")
.arg("-c")
.arg(&verify_cmd)
.arg("ip")
.arg("link")
.arg("show")
.arg(&tap_device)
.stdout(std::process::Stdio::null())
.stderr(std::process::Stdio::null())
.status()
.await
.context("verifying TAP device")?;
Expand Down
9 changes: 8 additions & 1 deletion src/commands/podman.rs
Original file line number Diff line number Diff line change
Expand Up @@ -479,12 +479,19 @@ async fn create_disk_from_dir(
);

// Create sparse file
tokio::process::Command::new("truncate")
let truncate_status = tokio::process::Command::new("truncate")
.args(["-s", &image_size.to_string(), output_path.to_str().unwrap()])
.status()
.await
.context("creating sparse file")?;

if !truncate_status.success() {
bail!(
"truncate failed with exit code: {:?}",
truncate_status.code()
);
}

// Format as ext4
let mkfs = tokio::process::Command::new("mkfs.ext4")
.args(["-q", "-F", output_path.to_str().unwrap()])
Expand Down
Loading
Loading