diff --git a/.github/actions/setup-build-env/action.yml b/.github/actions/setup-build-env/action.yml index 7e2ec4f71..812dfc490 100644 --- a/.github/actions/setup-build-env/action.yml +++ b/.github/actions/setup-build-env/action.yml @@ -9,6 +9,9 @@ runs: rustup update stable rustup default stable rustup component add rustfmt clippy + rustup target add x86_64-unknown-freebsd + rustup toolchain install nightly + rustup component add rust-src --toolchain nightly - name: Cache Cargo dependencies uses: actions/cache@v4 diff --git a/.github/workflows/cross-compilation.yml b/.github/workflows/cross-compilation.yml index 40273f800..0fe86dcfe 100644 --- a/.github/workflows/cross-compilation.yml +++ b/.github/workflows/cross-compilation.yml @@ -35,7 +35,7 @@ jobs: - name: Install cross-compilation dependencies run: | sudo apt-get update - sudo apt-get install -y --no-install-recommends lld + sudo apt-get install -y --no-install-recommends clang lld - name: Build FreeBSD init on Linux run: make BUILD_BSD_INIT=1 -- init/init-freebsd @@ -52,7 +52,7 @@ jobs: - name: Install cross-compilation dependencies run: | sudo apt-get update - sudo apt-get install -y --no-install-recommends lld + sudo apt-get install -y --no-install-recommends clang lld - name: Build FreeBSD init on Linux aarch64 run: make BUILD_BSD_INIT=1 -- init/init-freebsd diff --git a/Cargo.lock b/Cargo.lock index c066dd4eb..5e5cf6822 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -563,6 +563,10 @@ dependencies = [ "serde_core", ] +[[package]] +name = "init-blob" +version = "0.1.0" + [[package]] name = "iocuddle" version = "0.1.1" @@ -745,6 +749,17 @@ dependencies = [ "log", ] +[[package]] +name = "krun-init" +version = "0.1.0-1.18.0" +dependencies = [ + "anyhow", + "libc", + "nix 0.30.1", + "serde", + "serde_json", +] + [[package]] name = "krun-input" version = "0.1.0" @@ -885,6 +900,7 @@ version = "1.18.0" dependencies = [ "crossbeam-channel", "env_logger", + "init-blob", "krun-aws-nitro", "krun-devices", 
"krun-display", diff --git a/Cargo.toml b/Cargo.toml index 83db53c57..d55fceb5e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,8 @@ [workspace] members = [ + "init", "src/libkrun", + "src/init-blob", "src/input", "src/display", "src/utils", diff --git a/Makefile b/Makefile index 17ec2f2d3..159356aab 100644 --- a/Makefile +++ b/Makefile @@ -20,8 +20,6 @@ AWS_NITRO_INIT_SRC = \ AWS_NITRO_INIT_LD_FLAGS = -larchive -lnsm -INIT_SRC = init/init.c - ifeq ($(SEV),1) VARIANT = -sev FEATURE_FLAGS := --features amd-sev @@ -130,6 +128,7 @@ else endif # Cross-compile on macOS with the LLVM linker (brew install lld) CC_BSD=$(CLANG) -target $(ARCH)-unknown-freebsd -fuse-ld=lld -stdlib=libc++ -Wl,-strip-debug --sysroot $(SYSROOT_BSD) + CARGO_BSD_RUSTFLAGS = -C linker=$(CLANG) -C link-arg=-target -C link-arg=$(ARCH)-unknown-freebsd -C link-arg=-fuse-ld=lld -C link-arg=-stdlib=libc++ -C link-arg=--sysroot=$(abspath $(SYSROOT_BSD)) else ifeq ($(OS),Linux) # Linux -> FreeBSD cross-compilation ifeq ($(SYSROOT_BSD),) @@ -140,16 +139,34 @@ else endif # Cross-compile on Linux with clang CC_BSD=$(CLANG) -target $(ARCH)-unknown-freebsd -fuse-ld=lld -Wl,-strip-debug --sysroot $(SYSROOT_BSD) + CARGO_BSD_RUSTFLAGS = -C linker=$(CLANG) -C link-arg=-target -C link-arg=$(ARCH)-unknown-freebsd -C link-arg=-fuse-ld=lld -C link-arg=--sysroot=$(abspath $(SYSROOT_BSD)) else # Build on FreeBSD host CC_BSD=$(CC) SYSROOT_BSD_TARGET = + CARGO_BSD_RUSTFLAGS = +endif + +FREEBSD_RUST_TARGET = $(subst arm64,aarch64,$(ARCH))-unknown-freebsd + +# aarch64-unknown-freebsd is Tier 3: no prebuilt std, requires nightly + build-std. 
+ifeq ($(FREEBSD_RUST_TARGET),aarch64-unknown-freebsd) + CARGO_BSD_TOOLCHAIN = +nightly + CARGO_BSD_EXTRA_FLAGS = -Z build-std +else + CARGO_BSD_TOOLCHAIN = + CARGO_BSD_EXTRA_FLAGS = endif ifeq ($(BUILD_BSD_INIT),1) INIT_BINARY_BSD = init/init-freebsd -$(INIT_BINARY_BSD): $(INIT_SRC) $(SYSROOT_BSD_TARGET) - $(CC_BSD) -std=c23 -O2 -static -Wall -o $@ $(INIT_SRC) -lutil +$(INIT_BINARY_BSD): $(shell find init/src -name '*.rs') init/Cargo.toml $(SYSROOT_BSD_TARGET) + RUSTFLAGS="$(CARGO_BSD_RUSTFLAGS)" \ + cargo $(CARGO_BSD_TOOLCHAIN) build --release \ + $(CARGO_BSD_EXTRA_FLAGS) \ + --manifest-path init/Cargo.toml \ + --target $(FREEBSD_RUST_TARGET) + cp target/$(FREEBSD_RUST_TARGET)/release/krun-init $@ endif # Sysroot preparation rules for cross-compilation on macOS @@ -272,7 +289,13 @@ TEST_FLAGS ?= EXTRA_LIBPATH_Linux = EXTRA_LIBPATH_Darwin = /opt/homebrew/opt/libkrunfw/lib:/opt/homebrew/opt/llvm/lib +# Extra cargo features for the test runner (passed via KRUN_TEST_FEATURES). +TEST_FEATURE_FLAGS := +ifeq ($(BLK),1) + TEST_FEATURE_FLAGS += blk +endif + # On macOS, SIP strips DYLD_LIBRARY_PATH when executing scripts via a shebang, # so we pass the path via LIBKRUN_LIB_PATH and let run.sh set the real variable. 
test: test-prefix - cd tests; RUST_LOG=trace LIBKRUN_LIB_PATH="$$(realpath ../test-prefix/$(LIBDIR_$(OS))/):$(EXTRA_LIBPATH_$(OS))" PKG_CONFIG_PATH="$$(realpath ../test-prefix/$(LIBDIR_$(OS))/pkgconfig/)" ./run.sh test --test-case "$(TEST)" $(TEST_FLAGS) + cd tests; RUST_LOG=trace KRUN_TEST_FEATURES="$(TEST_FEATURE_FLAGS)" LIBKRUN_LIB_PATH="$$(realpath ../test-prefix/$(LIBDIR_$(OS))/):$(EXTRA_LIBPATH_$(OS))" PKG_CONFIG_PATH="$$(realpath ../test-prefix/$(LIBDIR_$(OS))/pkgconfig/)" ./run.sh test --test-case "$(TEST)" $(TEST_FLAGS) diff --git a/include/libkrun.h b/include/libkrun.h index 87d5e1fa1..c5a64351f 100644 --- a/include/libkrun.h +++ b/include/libkrun.h @@ -6,8 +6,8 @@ extern "C" { #endif #include -#include #include +#include #include /** @@ -27,7 +27,6 @@ extern "C" { */ int32_t krun_set_log_level(uint32_t level); - #define KRUN_LOG_TARGET_DEFAULT -1 #define KRUN_LOG_LEVEL_OFF 0 @@ -47,23 +46,27 @@ int32_t krun_set_log_level(uint32_t level); * Initializes logging for the library. * * Arguments: - * "target_fd" - File descriptor to write log to. Note that using a file descriptor pointing to a regular file on - * filesystem might slow down the VM. - * Use KRUN_LOG_TARGET_DEFAULT to use the default target for log output (stderr). + * "target_fd" - File descriptor to write log to. Note that using a file + * descriptor pointing to a regular file on filesystem might slow down the VM. + * Use KRUN_LOG_TARGET_DEFAULT to use the default target for log + * output (stderr). * - * "level" - Level is an integer specifying the level of verbosity, higher number means more verbose log. - * The log levels are described by the constants: KRUN_LOG_LEVEL_{OFF, ERROR, WARN, INFO, DEBUG, TRACE} + * "level" - Level is an integer specifying the level of verbosity, higher + * number means more verbose log. 
The log levels are described by the constants: + * KRUN_LOG_LEVEL_{OFF, ERROR, WARN, INFO, DEBUG, TRACE} * - * "style" - Enable/disable usage of terminal escape sequences (to display colors) - * One of: KRUN_LOG_STYLE_{AUTO, ALWAYS, NEVER}. + * "style" - Enable/disable usage of terminal escape sequences (to display + * colors). One of: KRUN_LOG_STYLE_{AUTO, ALWAYS, NEVER}. * * "options" - Bitmask of logging options, use 0 for default options. - * KRUN_LOG_OPTION_NO_ENV to disallow environment variables to override these settings. + * KRUN_LOG_OPTION_NO_ENV to disallow environment variables to + * override these settings. * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_init_log(int target_fd, uint32_t level, uint32_t style, uint32_t options); +int32_t krun_init_log(int target_fd, uint32_t level, uint32_t style, + uint32_t options); /** * Creates a configuration context. @@ -95,23 +98,27 @@ int32_t krun_free_ctx(uint32_t ctx_id); * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_set_vm_config(uint32_t ctx_id, uint8_t num_vcpus, uint32_t ram_mib); +int32_t krun_set_vm_config(uint32_t ctx_id, uint8_t num_vcpus, + uint32_t ram_mib); /** - * The virtiofs tag used for the root filesystem. Can be used with krun_add_virtiofs* - * for more control over root filesystem parameters (e.g. read-only, DAX window size). + * The virtiofs tag used for the root filesystem. Can be used with + * krun_add_virtiofs* for more control over root filesystem parameters (e.g. + * read-only, DAX window size). */ #define KRUN_FS_ROOT_TAG "/dev/root" /** - * Sets the path to be use as root for the microVM. Not available in libkrun-SEV. + * Sets the path to be used as root for the microVM. Not available in + * libkrun-SEV. * * For more control over the root filesystem (e.g. read-only, DAX window size), * use krun_add_virtiofs3() with KRUN_FS_ROOT_TAG instead. * * Arguments: * "ctx_id" - the configuration context ID. 
- * "root_path" - a null-terminated string representing the path to be used as root. + * "root_path" - a null-terminated string representing the path to be used as + * root. * * Returns: * Zero on success or a negative error number on failure. @@ -121,13 +128,13 @@ int32_t krun_set_root(uint32_t ctx_id, const char *root_path); /** * DEPRECATED. Use krun_add_disk instead. * - * Sets the path to the disk image that contains the file-system to be used as root for the microVM. - * The only supported image format is "raw". + * Sets the path to the disk image that contains the file-system to be used as + * root for the microVM. The only supported image format is "raw". * * Arguments: * "ctx_id" - the configuration context ID. - * "disk_path" - a null-terminated string representing the path leading to the disk image that - * contains the root file-system. + * "disk_path" - a null-terminated string representing the path leading to the + * disk image that contains the root file-system. * * Returns: * Zero on success or a negative error number on failure. @@ -142,8 +149,8 @@ int32_t krun_set_root_disk(uint32_t ctx_id, const char *disk_path); * * Arguments: * "ctx_id" - the configuration context ID. - * "disk_path" - a null-terminated string representing the path leading to the disk image that - * contains the root file-system. + * "disk_path" - a null-terminated string representing the path leading to the + * disk image that contains the root file-system. * * Returns: * Zero on success or a negative error number on failure. @@ -151,27 +158,29 @@ int32_t krun_set_root_disk(uint32_t ctx_id, const char *disk_path); int32_t krun_set_data_disk(uint32_t ctx_id, const char *disk_path); /** - * Adds a disk image to be used as a general partition for the microVM. The only supported image - * format is "raw". + * Adds a disk image to be used as a general partition for the microVM. The only + * supported image format is "raw". 
* * This API is mutually exclusive with the deprecated krun_set_root_disk and * krun_set_data_disk methods and must not be used together. * - * This function deliberately only handles images in the Raw format, because it doesn't allow - * specifying an image format, and probing an image's format is dangerous. For more information, - * see the security note on `krun_add_disk2`, which allows opening non-Raw images. + * This function deliberately only handles images in the Raw format, because it + * doesn't allow specifying an image format, and probing an image's format is + * dangerous. For more information, see the security note on `krun_add_disk2`, + * which allows opening non-Raw images. * * Arguments: * "ctx_id" - the configuration context ID. * "block_id" - a null-terminated string representing the partition. - * "disk_path" - a null-terminated string representing the path leading to the disk image. - * "read_only" - whether the mount should be read-only. Required if the caller does not have - * write permissions (for disk images in /usr/share). + * "disk_path" - a null-terminated string representing the path leading to the + * disk image. "read_only" - whether the mount should be read-only. Required if + * the caller does not have write permissions (for disk images in /usr/share). * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_disk(uint32_t ctx_id, const char *block_id, const char *disk_path, bool read_only); +int32_t krun_add_disk(uint32_t ctx_id, const char *block_id, + const char *disk_path, bool read_only); /* Supported disk image formats */ #define KRUN_DISK_FORMAT_RAW 0 @@ -180,73 +189,77 @@ int32_t krun_add_disk(uint32_t ctx_id, const char *block_id, const char *disk_pa #define KRUN_DISK_FORMAT_VMDK 2 /** - * Adds a disk image to be used as a general partition for the microVM. The supported - * image formats are: "raw" and "qcow2". + * Adds a disk image to be used as a general partition for the microVM. 
The + * supported image formats are: "raw" and "qcow2". * * This API is mutually exclusive with the deprecated krun_set_root_disk and * krun_set_data_disk methods and must not be used together. * * SECURITY NOTE: - * Non-Raw images can reference other files, which libkrun will automatically open, and to which the - * guest will have access. Libkrun should therefore never be asked to open an image in a non-Raw - * format when it doesn't come from a fully trustworthy source. - * - * Consequently, probing an image's format is quite dangerous and to be avoided if at all possible, - * which is why libkrun provides no facilities for doing so. If it's not clear what format an image - * has, it may also not be clear whether it can be trusted to not reference files to which the guest - * shouldn't have access. - * - * If probing absolutely can't be avoided, it must only be done on images that are fully trusted, i.e. - * before a potentially untrusted guest had write access to it. Specifically, consider that a guest has - * full access to all of a Raw image, and can therefore turn it into a file in an arbitrary format, for - * example, into a Qcow2 image, referencing and granting a malicious guest access to arbitrary files. - * To hand a Raw image to an untrusted and potentially malicious guest, and then to re-probe it after - * the guest was able to write to it (when it can no longer be trusted), would therefore be a severe - * security vulnerability. - * - * Therefore, after having probed a yet fully trusted image once, the result must be remembered so the - * image will from then on always be opened in the format that was detected originally. When adhering - * to this, a guest can write anything they want to a Raw image, it's always going to be opened as a - * Raw image, preventing the security vulnerability outlined above. 
- * - * However, if at all possible, the image format should be explicitly selected based on knowledge - * obtained separately from the pure image data, for example by the user. + * Non-Raw images can reference other files, which libkrun will automatically + * open, and to which the guest will have access. Libkrun should therefore never + * be asked to open an image in a non-Raw format when it doesn't come from a + * fully trustworthy source. + * + * Consequently, probing an image's format is quite dangerous and to be avoided + * if at all possible, which is why libkrun provides no facilities for doing so. + * If it's not clear what format an image has, it may also not be clear whether + * it can be trusted to not reference files to which the guest shouldn't have + * access. + * + * If probing absolutely can't be avoided, it must only be done on images that + * are fully trusted, i.e. before a potentially untrusted guest had write access + * to it. Specifically, consider that a guest has full access to all of a Raw + * image, and can therefore turn it into a file in an arbitrary format, for + * example, into a Qcow2 image, referencing and granting a malicious guest + * access to arbitrary files. To hand a Raw image to an untrusted and + * potentially malicious guest, and then to re-probe it after the guest was able + * to write to it (when it can no longer be trusted), would therefore be a + * severe security vulnerability. + * + * Therefore, after having probed a yet fully trusted image once, the result + * must be remembered so the image will from then on always be opened in the + * format that was detected originally. When adhering to this, a guest can write + * anything they want to a Raw image, it's always going to be opened as a Raw + * image, preventing the security vulnerability outlined above. 
+ * + * However, if at all possible, the image format should be explicitly selected + * based on knowledge obtained separately from the pure image data, for example + * by the user. * * Arguments: * "ctx_id" - the configuration context ID. * "block_id" - a null-terminated string representing the partition. - * "disk_path" - a null-terminated string representing the path leading to the disk image. - * "disk_format" - the disk image format (i.e. KRUN_DISK_FORMAT_{RAW, QCOW2}) - * "read_only" - whether the mount should be read-only. Required if the caller does not have - * write permissions (for disk images in /usr/share). + * "disk_path" - a null-terminated string representing the path leading to + * the disk image. "disk_format" - the disk image format (i.e. + * KRUN_DISK_FORMAT_{RAW, QCOW2}) "read_only" - whether the mount should be + * read-only. Required if the caller does not have write permissions (for disk + * images in /usr/share). * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_disk2(uint32_t ctx_id, - const char *block_id, - const char *disk_path, - uint32_t disk_format, +int32_t krun_add_disk2(uint32_t ctx_id, const char *block_id, + const char *disk_path, uint32_t disk_format, bool read_only); - /* Supported sync modes */ /** * Ignore VIRTIO_BLK_F_FLUSH. - * WARNING: may lead to loss of data - */ + * WARNING: may lead to loss of data + */ #define KRUN_SYNC_NONE 0 /** - * Honor VIRTIO_BLK_F_FLUSH requests, but relax strict hardware syncing on macOS. - * This is the recommended mode. + * Honor VIRTIO_BLK_F_FLUSH requests, but relax strict hardware syncing on + * macOS. This is the recommended mode. * * On macOS this flushes the OS buffers, but does not ask the drive to flush - * its buffered data, which significantly improves performance. + * its buffered data, which significantly improves performance. * On Linux this is the same as full sync. 
*/ #define KRUN_SYNC_RELAXED 1 -/** +/** * Honor VIRTIO_BLK_F_FLUSH, strictly flushing buffers to physical disk. */ #define KRUN_SYNC_FULL 2 @@ -263,93 +276,91 @@ int32_t krun_add_disk2(uint32_t ctx_id, * Arguments: * "ctx_id" - the configuration context ID. * "block_id" - a null-terminated string representing the partition. - * "disk_path" - a null-terminated string representing the path leading to the disk image. - * "disk_format" - the disk image format (i.e. KRUN_DISK_FORMAT_{RAW, QCOW2}) - * "read_only" - whether the mount should be read-only. Required if the caller does not have - * write permissions (for disk images in /usr/share). - * "direct_io" - whether to bypass the host caches. - * "sync_mode" - whether to enable VIRTIO_BLK_F_FLUSH. On macOS, an additional relaxed sync - * mode is available, which is enabled by default, and will not ask the drive - * to flush its buffered data. + * "disk_path" - a null-terminated string representing the path leading to + * the disk image. "disk_format" - the disk image format (i.e. + * KRUN_DISK_FORMAT_{RAW, QCOW2}) "read_only" - whether the mount should be + * read-only. Required if the caller does not have write permissions (for disk + * images in /usr/share). "direct_io" - whether to bypass the host caches. + * "sync_mode" - whether to enable VIRTIO_BLK_F_FLUSH. On macOS, an + * additional relaxed sync mode is available, which is enabled by default, and + * will not ask the drive to flush its buffered data. * * Returns: * Zero on success or a negative error number on failure. */ - int32_t krun_add_disk3(uint32_t ctx_id, - const char *block_id, - const char *disk_path, - uint32_t disk_format, - bool read_only, - bool direct_io, - uint32_t sync_mode); +int32_t krun_add_disk3(uint32_t ctx_id, const char *block_id, + const char *disk_path, uint32_t disk_format, + bool read_only, bool direct_io, uint32_t sync_mode); /** * NO LONGER SUPPORTED. DO NOT USE. * - * Configures the mapped volumes for the microVM. 
Only supported on macOS, on Linux use - * user_namespaces and bind-mounts instead. Not available in libkrun-SEV. + * Configures the mapped volumes for the microVM. Only supported on macOS, on + * Linux use user_namespaces and bind-mounts instead. Not available in + * libkrun-SEV. * * Arguments: * "ctx_id" - the configuration context ID. - * "mapped_volumes" - an array of string pointers with format "host_path:guest_path" representing - * the volumes to be mapped inside the microVM + * "mapped_volumes" - an array of string pointers with format + * "host_path:guest_path" representing the volumes to be mapped inside the + * microVM * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_set_mapped_volumes(uint32_t ctx_id, const char *const mapped_volumes[]); +int32_t krun_set_mapped_volumes(uint32_t ctx_id, + const char *const mapped_volumes[]); /** - * Adds an independent virtio-fs device pointing to a host's directory with a tag. + * Adds an independent virtio-fs device pointing to a host's directory with a + * tag. * * Arguments: * "ctx_id" - the configuration context ID. * "c_tag" - tag to identify the filesystem in the guest. - * "c_path" - full path to the directory in the host to be exposed to the guest. + * "c_path" - full path to the directory in the host to be exposed to + * the guest. * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_virtiofs(uint32_t ctx_id, - const char *c_tag, +int32_t krun_add_virtiofs(uint32_t ctx_id, const char *c_tag, const char *c_path); /** - * Adds an independent virtio-fs device pointing to a host's directory with a tag. This - * variant allows specifying the size of the DAX window. + * Adds an independent virtio-fs device pointing to a host's directory with a + * tag. This variant allows specifying the size of the DAX window. * * Arguments: * "ctx_id" - the configuration context ID. * "c_tag" - tag to identify the filesystem in the guest. 
- * "c_path" - full path to the directory in the host to be exposed to the guest. - * "shm_size" - size of the DAX SHM window in bytes. + * "c_path" - full path to the directory in the host to be exposed to + * the guest. "shm_size" - size of the DAX SHM window in bytes. * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_virtiofs2(uint32_t ctx_id, - const char *c_tag, - const char *c_path, - uint64_t shm_size); +int32_t krun_add_virtiofs2(uint32_t ctx_id, const char *c_tag, + const char *c_path, uint64_t shm_size); /** - * Adds an independent virtio-fs device pointing to a host's directory with a tag. This - * variant allows specifying the size of the DAX window and a read-only flag. + * Adds an independent virtio-fs device pointing to a host's directory with a + * tag. This variant allows specifying the size of the DAX window and a + * read-only flag. * * Arguments: * "ctx_id" - the configuration context ID. * "c_tag" - tag to identify the filesystem in the guest. - * "c_path" - full path to the directory in the host to be exposed to the guest. - * "shm_size" - size of the DAX SHM window in bytes. - * "read_only" - if true, the filesystem will be exposed as read-only to the guest. + * "c_path" - full path to the directory in the host to be exposed to + * the guest. "shm_size" - size of the DAX SHM window in bytes. + * "read_only" - if true, the filesystem will be exposed as read-only to + * the guest. * * Returns: * Zero on success or a negative error number on failure. 
*/ -int32_t krun_add_virtiofs3(uint32_t ctx_id, - const char *c_tag, - const char *c_path, - uint64_t shm_size, +int32_t krun_add_virtiofs3(uint32_t ctx_id, const char *c_tag, + const char *c_path, uint64_t shm_size, bool read_only); /* Send the VFKIT magic after establishing the connection, @@ -358,8 +369,8 @@ int32_t krun_add_virtiofs3(uint32_t ctx_id, #define NET_FLAG_DHCP_CLIENT (1 << 1) /* TSI (Transparent Socket Impersonation) feature flags for vsock */ -#define KRUN_TSI_HIJACK_INET (1 << 0) -#define KRUN_TSI_HIJACK_UNIX (1 << 1) +#define KRUN_TSI_HIJACK_INET (1 << 0) +#define KRUN_TSI_HIJACK_UNIX (1 << 1) /* Taken from uapi/linux/virtio_net.h */ #define NET_FEATURE_CSUM 1 << 0 @@ -371,10 +382,11 @@ int32_t krun_add_virtiofs3(uint32_t ctx_id, #define NET_FEATURE_HOST_TSO6 1 << 12 #define NET_FEATURE_HOST_UFO 1 << 14 -/* These are the features enabled by krun_set_passt_fd and krun_set_gvproxy_path. */ -#define COMPAT_NET_FEATURES NET_FEATURE_CSUM | NET_FEATURE_GUEST_CSUM | \ - NET_FEATURE_GUEST_TSO4 | NET_FEATURE_GUEST_UFO | \ - NET_FEATURE_HOST_TSO4 | NET_FEATURE_HOST_UFO +/* These are the features enabled by krun_set_passt_fd and + * krun_set_gvproxy_path. */ +#define COMPAT_NET_FEATURES \ + NET_FEATURE_CSUM | NET_FEATURE_GUEST_CSUM | NET_FEATURE_GUEST_TSO4 | \ + NET_FEATURE_GUEST_UFO | NET_FEATURE_HOST_TSO4 | NET_FEATURE_HOST_UFO /** * Adds an independent virtio-net device connected to a * unixstream-based userspace network proxy, such as passt or @@ -411,11 +423,8 @@ int32_t krun_add_virtiofs3(uint32_t ctx_id, * Returns: * Zero on success or a negative error number on failure. 
*/ -int32_t krun_add_net_unixstream(uint32_t ctx_id, - const char *c_path, - int fd, - uint8_t *const c_mac, - uint32_t features, +int32_t krun_add_net_unixstream(uint32_t ctx_id, const char *c_path, int fd, + uint8_t *const c_mac, uint32_t features, uint32_t flags); /** @@ -455,11 +464,8 @@ int32_t krun_add_net_unixstream(uint32_t ctx_id, * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_net_unixgram(uint32_t ctx_id, - const char *c_path, - int fd, - uint8_t *const c_mac, - uint32_t features, +int32_t krun_add_net_unixgram(uint32_t ctx_id, const char *c_path, int fd, + uint8_t *const c_mac, uint32_t features, uint32_t flags); /** @@ -486,10 +492,8 @@ int32_t krun_add_net_unixgram(uint32_t ctx_id, * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_net_tap(uint32_t ctx_id, - char *c_tap_name, - uint8_t *const c_mac, - uint32_t features, +int32_t krun_add_net_tap(uint32_t ctx_id, char *c_tap_name, + uint8_t *const c_mac, uint32_t features, uint32_t flags); /** @@ -556,17 +560,18 @@ int32_t krun_set_net_mac(uint32_t ctx_id, uint8_t *const c_mac); * -ENOTSUP when passt networking is used * * Notes: - * Passing NULL (or not calling this function) as "port_map" has a different meaning than - * passing an empty array. The first one will instruct libkrun to attempt to expose all - * listening ports in the guest to the host, while the second means that no port from - * the guest will be exposed to host. + * Passing NULL (or not calling this function) as "port_map" has a different + * meaning than passing an empty array. The first one will instruct libkrun to + * attempt to expose all listening ports in the guest to the host, while the + * second means that no port from the guest will be exposed to host. * - * Exposed ports will only become accessible by their "host_port" in the guest too. 
This - * means that for a map such as "8080:80", applications running inside the guest will also - * need to access the service through the "8080" port. + * Exposed ports will only become accessible by their "host_port" in the guest + * too. This means that for a map such as "8080:80", applications running inside + * the guest will also need to access the service through the "8080" port. * - * If past networking mode is used (krun_set_passt_fd was called), port mapping is not supported - * as an API of libkrun (but you can still do port mapping using command line arguments of passt) + * If passt networking mode is used (krun_set_passt_fd was called), port mapping + * is not supported as an API of libkrun (but you can still do port mapping + * using command line arguments of passt) */ int32_t krun_set_port_map(uint32_t ctx_id, const char *const port_map[]); @@ -606,17 +611,18 @@ int32_t krun_set_gpu_options(uint32_t ctx_id, uint32_t virgl_flags); * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_set_gpu_options2(uint32_t ctx_id, - uint32_t virgl_flags, +int32_t krun_set_gpu_options2(uint32_t ctx_id, uint32_t virgl_flags, uint64_t shm_size); -/* Maximum number of displays. Same as VIRTIO_GPU_MAX_SCANOUTS defined in the virtio-gpu spec */ +/* Maximum number of displays. Same as VIRTIO_GPU_MAX_SCANOUTS defined in the + * virtio-gpu spec */ #define KRUN_MAX_DISPLAYS 16 /** * Configure a display output for the VM. * - * Note that to have display output a display backend must also be set (see krun_set_display_backend). + * Note that to have display output a display backend must also be set (see + * krun_set_display_backend). * * Arguments: * "ctx_id" - the configuration context ID. @@ -624,17 +630,19 @@ int32_t krun_set_gpu_options2(uint32_t ctx_id, * "height" - the height of the window/display * * Returns: - * The id of the display (0 to KRUN_MAX_DISPLAYS - 1) on success or a negative error number on failure. 
+ * The id of the display (0 to KRUN_MAX_DISPLAYS - 1) on success or a negative + * error number on failure. */ int32_t krun_add_display(uint32_t ctx_id, uint32_t width, uint32_t height); /** * Configure a custom EDID blob for a display * - * This replaces the generated EDID with a custom one. Configuring an EDID blob makes all display parameters except - * width and height ignored. + * This replaces the generated EDID with a custom one. Configuring an EDID blob + * makes all display parameters except width and height ignored. * - * Note that libkrun doesn't do any checks if the EDID matches the width/height specified in krun_add_display(). + * Note that libkrun doesn't do any checks if the EDID matches the width/height + * specified in krun_add_display(). * * Arguments: * "ctx_id" - the configuration context ID. @@ -645,7 +653,8 @@ int32_t krun_add_display(uint32_t ctx_id, uint32_t width, uint32_t height); * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_display_set_edid(uint32_t ctx_id, uint32_t display_id, const uint8_t* edid_blob, size_t blob_size); +int32_t krun_display_set_edid(uint32_t ctx_id, uint32_t display_id, + const uint8_t *edid_blob, size_t blob_size); /** * Configure DPI of the display reported to the guest @@ -660,12 +669,14 @@ int32_t krun_display_set_edid(uint32_t ctx_id, uint32_t display_id, const uint8_ * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_display_set_dpi(uint32_t ctx_id, uint32_t display_id, uint32_t dpi); +int32_t krun_display_set_dpi(uint32_t ctx_id, uint32_t display_id, + uint32_t dpi); /** * Configure physical size of the display reported to the guest * - * This overrides the physical size of the display set by krun_set_display_physical_size() + * This overrides the physical size of the display set by + * krun_set_display_physical_size() * * Arguments: * "ctx_id" - the configuration context ID. 
@@ -676,7 +687,8 @@ int32_t krun_display_set_dpi(uint32_t ctx_id, uint32_t display_id, uint32_t dpi) * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_display_set_physical_size(uint32_t ctx_id, uint32_t display_id, uint16_t width_mm, uint16_t height_mm); +int32_t krun_display_set_physical_size(uint32_t ctx_id, uint32_t display_id, + uint16_t width_mm, uint16_t height_mm); /** * Configure refresh rate for a display @@ -690,10 +702,12 @@ int32_t krun_display_set_physical_size(uint32_t ctx_id, uint32_t display_id, uin * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_display_set_refresh_rate(uint32_t ctx_id, uint32_t display_id, uint32_t refresh_rate); +int32_t krun_display_set_refresh_rate(uint32_t ctx_id, uint32_t display_id, + uint32_t refresh_rate); /** - * Configures a krun_display_backend struct to be used for display output. (see libkrun_display.h) + * Configures a krun_display_backend struct to be used for display output. (see + * libkrun_display.h) * * Arguments: * "ctx_id" - the configuration context ID @@ -703,8 +717,8 @@ int32_t krun_display_set_refresh_rate(uint32_t ctx_id, uint32_t display_id, uint * Returns: * Zero on success or a negative error number (errno) on failure. */ -int32_t krun_set_display_backend(uint32_t ctx_id, const void *display_backend, size_t backend_size); - +int32_t krun_set_display_backend(uint32_t ctx_id, const void *display_backend, + size_t backend_size); /** * Adds an input device with separate config and events objects. @@ -719,13 +733,16 @@ int32_t krun_set_display_backend(uint32_t ctx_id, const void *display_backend, s * Returns: * Zero on success or a negative error code otherwise. 
*/ -int krun_add_input_device(uint32_t ctx_id, const void *config_backend, size_t config_backend_size, - const void *events_backend, size_t events_backend_size); +int krun_add_input_device(uint32_t ctx_id, const void *config_backend, + size_t config_backend_size, + const void *events_backend, + size_t events_backend_size); /** * Creates a passthrough input device from a host /dev/input/* file descriptor. - * The device configuration will be automatically queried from the host device using ioctls. - * + * The device configuration will be automatically queried from the host device + * using ioctls. + * * Arguments: * "ctx_id" - The krun context * "input_fd" - File descriptor to a /dev/input/* device on the host @@ -740,7 +757,8 @@ int krun_add_input_device_fd(uint32_t ctx_id, int input_fd); * * Arguments: * "ctx_id" - the configuration context ID. - * "enable" - boolean indicating whether virtio-snd should be enabled or disabled. + * "enable" - boolean indicating whether virtio-snd should be enabled or + * disabled. * * Returns: * Zero on success or a negative error number on failure. @@ -762,7 +780,8 @@ int32_t krun_set_snd_device(uint32_t ctx_id, bool enable); /** * Vhost-user console device default queue configuration. * Console device uses 4 queues for multiport support: - * receiveq (idx 0), transmitq (idx 1), control receiveq (idx 2), control transmitq (idx 3). + * receiveq (idx 0), transmitq (idx 1), control receiveq (idx 2), control + * transmitq (idx 3). */ #define KRUN_VHOST_USER_CONSOLE_NUM_QUEUES 4 #define KRUN_VHOST_USER_CONSOLE_QUEUE_SIZES ((uint16_t[]){128, 128, 64, 64}) @@ -790,7 +809,8 @@ int32_t krun_set_snd_device(uint32_t ctx_id, bool enable); /** * Vhost-user sound device default queue configuration. - * Sound device uses 4 queues: control (idx 0), event (idx 1), TX/playback (idx 2), RX/capture (idx 3). + * Sound device uses 4 queues: control (idx 0), event (idx 1), TX/playback (idx + * 2), RX/capture (idx 3). 
*/ #define KRUN_VHOST_USER_SND_NUM_QUEUES 4 #define KRUN_VHOST_USER_SND_QUEUE_SIZES ((uint16_t[]){64, 64, 64, 64}) @@ -820,36 +840,38 @@ int32_t krun_set_snd_device(uint32_t ctx_id, bool enable); * * Arguments: * "ctx_id" - the configuration context ID. - * "device_type" - type of vhost-user device (e.g., KRUN_VHOST_USER_DEVICE_RNG). - * "socket_path" - path to the vhost-user Unix domain socket (e.g., "/tmp/vhost-rng.sock"). - * "name" - device name for logging/debugging (e.g., "vhost-rng", "vhost-snd"). - * NULL = auto-generate from device_type ("vhost-user-4", "vhost-user-25", etc.) - * "num_queues" - number of virtqueues. - * 0 = auto-detect from backend (requires backend MQ support). - * >0 = explicit queue count. - * Or use device-specific constants like KRUN_VHOST_USER_RNG_NUM_QUEUES. - * "queue_sizes" - array of queue sizes for each queue. - * NULL = use default size (256) for all queues. - * When num_queues=0 (auto-detect): array must be 0-terminated (sentinel). - * When num_queues>0 (explicit): array must have exactly num_queues elements. - * Use device-specific constants like KRUN_VHOST_USER_RNG_QUEUE_SIZES for defaults. + * "device_type" - type of vhost-user device (e.g., + * KRUN_VHOST_USER_DEVICE_RNG). "socket_path" - path to the vhost-user Unix + * domain socket (e.g., "/tmp/vhost-rng.sock"). "name" - device name for + * logging/debugging (e.g., "vhost-rng", "vhost-snd"). NULL = auto-generate from + * device_type ("vhost-user-4", "vhost-user-25", etc.) "num_queues" - number + * of virtqueues. 0 = auto-detect from backend (requires backend MQ support). >0 + * = explicit queue count. Or use device-specific constants like + * KRUN_VHOST_USER_RNG_NUM_QUEUES. "queue_sizes" - array of queue sizes for + * each queue. NULL = use default size (256) for all queues. When num_queues=0 + * (auto-detect): array must be 0-terminated (sentinel). When num_queues>0 + * (explicit): array must have exactly num_queues elements. 
Use device-specific + * constants like KRUN_VHOST_USER_RNG_QUEUE_SIZES for defaults. * * Examples: * // Auto-detect queue count, use default size (256) - * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_RNG, "/tmp/rng.sock", NULL, 0, NULL); + * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_RNG, "/tmp/rng.sock", + * NULL, 0, NULL); * * // Auto-detect queue count, use custom size (512) for all queues * uint16_t custom_size[] = {512, 0}; // 0 = sentinel terminator - * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_RNG, "/tmp/rng.sock", NULL, 0, custom_size); + * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_RNG, "/tmp/rng.sock", + * NULL, 0, custom_size); * * // Explicit defaults using #define constants - * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_RNG, "/tmp/rng.sock", "vhost-rng", - * KRUN_VHOST_USER_RNG_NUM_QUEUES, + * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_RNG, "/tmp/rng.sock", + * "vhost-rng", KRUN_VHOST_USER_RNG_NUM_QUEUES, * KRUN_VHOST_USER_RNG_QUEUE_SIZES); * * // Explicit queue count with custom sizes * uint16_t sizes[] = {256, 512}; - * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_SND, "/tmp/snd.sock", "vhost-snd", 2, sizes); + * krun_add_vhost_user_device(ctx, KRUN_VHOST_USER_DEVICE_SND, "/tmp/snd.sock", + * "vhost-snd", 2, sizes); * * Returns: * Zero on success or a negative error number on failure. @@ -857,19 +879,19 @@ int32_t krun_set_snd_device(uint32_t ctx_id, bool enable); * -ENOENT - Context doesn't exist * -ENOTSUP - vhost-user support not compiled in */ -int32_t krun_add_vhost_user_device(uint32_t ctx_id, - uint32_t device_type, - const char *socket_path, - const char *name, +int32_t krun_add_vhost_user_device(uint32_t ctx_id, uint32_t device_type, + const char *socket_path, const char *name, uint16_t num_queues, const uint16_t *queue_sizes); /** - * Configures a map of rlimits to be set in the guest before starting the isolated binary. 
+ * Configures a map of rlimits to be set in the guest before starting the + * isolated binary. * * Arguments: * "ctx_id" - the configuration context ID. - * "rlimits" - an array of string pointers with format "RESOURCE=RLIM_CUR:RLIM_MAX". + * "rlimits" - an array of string pointers with format + * "RESOURCE=RLIM_CUR:RLIM_MAX". * * Returns: * Zero on success or a negative error number on failure. @@ -881,53 +903,55 @@ int32_t krun_set_rlimits(uint32_t ctx_id, const char *const rlimits[]); * * Arguments: * "ctx_id" - the configuration context ID. - * "oem_strings" - an array of string pointers. Must be terminated with an additional NULL pointer. + * "oem_strings" - an array of string pointers. Must be terminated with an + * additional NULL pointer. * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_set_smbios_oem_strings(uint32_t ctx_id, const char *const oem_strings[]); +int32_t krun_set_smbios_oem_strings(uint32_t ctx_id, + const char *const oem_strings[]); /** * Sets the working directory for the executable to be run inside the microVM. * * Arguments: * "ctx_id" - the configuration context ID. - * "workdir_path" - the path to the working directory, relative to the root configured with - * "krun_set_root". + * "workdir_path" - the path to the working directory, relative to the root + * configured with "krun_set_root". * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_set_workdir(uint32_t ctx_id, - const char *workdir_path); +int32_t krun_set_workdir(uint32_t ctx_id, const char *workdir_path); /** - * Sets the path to the executable to be run inside the microVM, the arguments to be passed to the - * executable, and the environment variables to be configured in the context of the executable. + * Sets the path to the executable to be run inside the microVM, the arguments + * to be passed to the executable, and the environment variables to be + * configured in the context of the executable. 
* * Arguments: * "ctx_id" - the configuration context ID. - * "exec_path" - the path to the executable, relative to the root configured with "krun_set_root". - * "argv" - an array of string pointers to be passed as arguments. - * "envp" - an array of string pointers to be injected as environment variables into the - * context of the executable. If NULL, it will auto-generate an array collecting the - * the variables currently present in the environment. + * "exec_path" - the path to the executable, relative to the root configured + * with "krun_set_root". + * "argv" - an array of string pointers to be passed as arguments. + * "envp" - an array of string pointers to be injected as environment + * variables into the context of the executable. If NULL, it will auto-generate + * an array collecting the variables currently present in the environment. * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_set_exec(uint32_t ctx_id, - const char *exec_path, - const char *const argv[], - const char *const envp[]); +int32_t krun_set_exec(uint32_t ctx_id, const char *exec_path, + const char *const argv[], const char *const envp[]); /** * Sets the path to the firmware to be loaded into the microVM. * * Arguments: * "ctx_id" - the configuration context ID. - * "firmware_path" - the path to the firmware, relative to the host's filesystem. + * "firmware_path" - the path to the firmware, relative to the host's + * filesystem. * * * Returns: @@ -948,16 +972,14 @@ int32_t krun_set_firmware(uint32_t ctx_id, const char *firmware_path); * "ctx_id" - the configuration context ID. * "kernel_path" - the path to the kernel, relative to the host's filesystem. * "kernel_format" - the kernel format. - * "initramfs" - the path to the initramfs, relative to the host's filesystem. - * "cmdline" - the kernel command line. + * "initramfs" - the path to the initramfs, relative to the host's filesystem. + * "cmdline" - the kernel command line.
* * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_set_kernel(uint32_t ctx_id, - const char *kernel_path, - uint32_t kernel_format, - const char *initramfs, +int32_t krun_set_kernel(uint32_t ctx_id, const char *kernel_path, + uint32_t kernel_format, const char *initramfs, const char *cmdline); /** @@ -965,9 +987,9 @@ int32_t krun_set_kernel(uint32_t ctx_id, * * Arguments: * "ctx_id" - the configuration context ID. - * "envp" - an array of string pointers to be injected as environment variables into the - * context of the executable. If NULL, it will auto-generate an array collecting the - * the variables currently present in the environment. + * "envp" - an array of string pointers to be injected as environment + * variables into the context of the executable. If NULL, it will auto-generate + * an array collecting the variables currently present in the environment. * * Returns: * Zero on success or a negative error number on failure. @@ -975,11 +997,13 @@ int32_t krun_set_kernel(uint32_t ctx_id, int32_t krun_set_env(uint32_t ctx_id, const char *const envp[]); /** - * Sets the file path to the TEE configuration file. Only available in libkrun-sev. + * Sets the file path to the TEE configuration file. Only available in + * libkrun-sev. * * Arguments: * "ctx_id" - the configuration context ID. - * "filepath" - a null-terminated string representing file path to the TEE config file. + * "filepath" - a null-terminated string representing file path to the TEE + * config file. * * Returns: * Zero on success or a negative error number on failure. @@ -995,8 +1019,7 @@ int32_t krun_set_tee_config_file(uint32_t ctx_id, const char *filepath); * "ctx_id" - the configuration context ID. * "filepath" - a null-terminated string representing the path of the UNIX * socket in the host.
*/ -int32_t krun_add_vsock_port(uint32_t ctx_id, - uint32_t port, +int32_t krun_add_vsock_port(uint32_t ctx_id, uint32_t port, const char *c_filepath); /** @@ -1007,12 +1030,11 @@ int32_t krun_add_vsock_port(uint32_t ctx_id, * "port" - a vsock port that the guest will connect to for IPC. * "filepath" - a null-terminated string representing the path of the UNIX * socket in the host. - * "listen" - true if guest expects connections to be initiated from host side + * "listen" - true if guest expects connections to be initiated from host + * side */ -int32_t krun_add_vsock_port2(uint32_t ctx_id, - uint32_t port, - const char *c_filepath, - bool listen); +int32_t krun_add_vsock_port2(uint32_t ctx_id, uint32_t port, + const char *c_filepath, bool listen); /** * Add a vsock device with specified TSI features. @@ -1026,8 +1048,8 @@ int32_t krun_add_vsock_port2(uint32_t ctx_id, * * Arguments: * "ctx_id" - the configuration context ID. - * "tsi_features" - bitmask of TSI features (KRUN_TSI_HIJACK_INET, KRUN_TSI_HIJACK_UNIX) - * Use 0 to add vsock without any TSI hijacking. + * "tsi_features" - bitmask of TSI features (KRUN_TSI_HIJACK_INET, + * KRUN_TSI_HIJACK_UNIX) Use 0 to add vsock without any TSI hijacking. * * Returns: * Zero on success or a negative error number on failure. @@ -1035,8 +1057,8 @@ int32_t krun_add_vsock_port2(uint32_t ctx_id, int32_t krun_add_vsock(uint32_t ctx_id, uint32_t tsi_features); /** - * Returns the eventfd file descriptor to signal the guest to shut down orderly. This must be - * called before starting the microVM with "krun_start_event". + * Returns the eventfd file descriptor to signal the guest to shut down orderly. + * This must be called before starting the microVM with "krun_start_event". * * Arguments: * "ctx_id" - the configuration context ID. 
@@ -1047,18 +1069,19 @@ int32_t krun_add_vsock(uint32_t ctx_id, uint32_t tsi_features); int32_t krun_get_shutdown_eventfd(uint32_t ctx_id); /** - * Configures the console device to ignore stdin and write the output to "c_filepath". + * Configures the console device to ignore stdin and write the output to + * "c_filepath". * * Arguments: * "ctx_id" - the configuration context ID. - * "filepath" - a null-terminated string representing the path of the file to write the - * console output. + * "filepath" - a null-terminated string representing the path of the file to + * write the console output. * * Notes: - * This API only applies to the implicitly created console. If the implicit console is - * disabled via `krun_disable_implicit_console` the operation is a NOOP. Additionally, - * this API does not have any effect on consoles created via the `krun_add_*_console_default` - * APIs. + * This API only applies to the implicitly created console. If the implicit + * console is disabled via `krun_disable_implicit_console` the operation is a + * NOOP. Additionally, this API does not have any effect on consoles created via + * the `krun_add_*_console_default` APIs. */ int32_t krun_set_console_output(uint32_t ctx_id, const char *c_filepath); @@ -1102,9 +1125,9 @@ int32_t krun_setgid(uint32_t ctx_id, gid_t gid); * "enabled" - true to enable Nested Virtualization in the microVM. * * Returns: - * Zero on success or a negative error number on failure. Success doesn't imply that - * Nested Virtualization is supported on the system, only that it's going to be requested - * when the microVM is created after calling "krun_start_enter". + * Zero on success or a negative error number on failure. Success doesn't imply + * that Nested Virtualization is supported on the system, only that it's going + * to be requested when the microVM is created after calling "krun_start_enter". 
*/ int32_t krun_set_nested_virt(uint32_t ctx_id, bool enabled); @@ -1150,12 +1173,14 @@ int32_t krun_has_feature(uint64_t feature); * Get the maximum number of vCPUs supported by the hypervisor. * * Returns: - * The maximum number of vCPUs that can be created, or a negative error number on failure. + * The maximum number of vCPUs that can be created, or a negative error number + * on failure. */ int32_t krun_get_max_vcpus(void); /** - * Specify whether to split IRQCHIP responsibilities between the host and the guest. + * Specify whether to split IRQCHIP responsibilities between the host and the + * guest. * * Arguments: * "ctx_id" - the configuration context ID. @@ -1163,9 +1188,16 @@ int32_t krun_get_max_vcpus(void); * * Returns: * Zero on success or a negative error number on failure. -*/ + */ int32_t krun_split_irqchip(uint32_t ctx_id, bool enable); +/* + * NOTE: Implicit resource creation is a legacy convenience. The 2.0 API + * (see https://github.com/containers/libkrun/issues/634) will not create + * any implicit resources. Callers should start using the + * krun_disable_implicit_* functions now to ease migration. + */ + /* * Do not create an implicit console device in the guest. By using this API, * libkrun will create zero console devices on behalf of the user. Any @@ -1180,6 +1212,79 @@ int32_t krun_split_irqchip(uint32_t ctx_id, bool enable); */ int32_t krun_disable_implicit_console(uint32_t ctx_id); +/** + * Do not inject the default init binary (/init.krun) into the root + * filesystem. Must be called before krun_set_root(). + * + * Arguments: + * "ctx_id" - the configuration context ID. + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_disable_implicit_init(uint32_t ctx_id); + +/** + * Get a pointer to the built-in default init binary. + * + * This is the same binary that libkrun injects as /init.krun by default. 
+ * Callers that use krun_disable_implicit_init() can use this to inject the + * init binary themselves (e.g. via krun_fs_add_overlay_file with custom + * settings). + * + * The returned pointer is valid for the lifetime of the process (static data). + * + * Arguments: + * "data_out" - receives a pointer to the init binary bytes. + * "len_out" - receives the length in bytes. + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_get_default_init(const uint8_t **data_out, size_t *len_out); + +/** + * Add a virtual overlay file to a virtiofs device. + * + * The file will appear in the root directory of the specified virtiofs + * mount and is backed entirely by host memory (no host file). The data + * pointer is NOT copied — the caller must keep the memory valid for the + * full VM lifetime. + * + * Arguments: + * "ctx_id" - the configuration context ID. + * "fs_tag" - tag of the virtiofs device (e.g. "/dev/root"). + * "filename" - name of the file in the root directory. + * "data" - pointer to the file content. + * "data_len" - length of the file content in bytes. + * "mode" - file mode bits (e.g. 0100644 for a regular file). + * "one_shot" - if true, the file can only be looked up once. + * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_fs_add_overlay_file(uint32_t ctx_id, const char *fs_tag, + const char *filename, const uint8_t *data, + size_t data_len, uint32_t mode, bool one_shot); + +/** + * Add a virtual overlay directory to a virtiofs device. + * + * The directory will appear in the root directory of the specified virtiofs + * mount. It is empty and read-only, useful as a mount point. + * + * Arguments: + * "ctx_id" - the configuration context ID. + * "fs_tag" - tag of the virtiofs device (e.g. "/dev/root"). + * "dirname" - name of the directory in the root directory. + * "mode" - directory mode bits (e.g. 040755). 
+ * + * Returns: + * Zero on success or a negative error number on failure. + */ +int32_t krun_fs_add_overlay_dir(uint32_t ctx_id, const char *fs_tag, + const char *dirname, uint32_t mode); + /** * Disable the implicit vsock device. * @@ -1209,18 +1314,21 @@ int32_t krun_set_kernel_console(uint32_t ctx_id, const char *console_id); /* * Adds a virtio-console device to the guest. * - * The function can be called multiple times for adding multiple virtio-console devices. - * In the guest, the consoles will appear in the same order as they are added (that is, - * the first added console will be "hvc0", the second "hvc1", ...). However, if the - * implicit console is not disabled via `krun_disable_implicit_console`, the first - * console created with the function will occupy the "hvc1" ID. - * - * This function attaches a multi port virtio-console to the guest. If the input, output and error - * file descriptors are TTYs, the device will be created with just a single console port (`err_fd` - * is ignored in this case, because error output just goes to the TTY). For each of the non-TTY file - * descriptors an additional non-console port is created ("krun-stdin"/"krun-stdout"/"krun-stderr"). - * The libkrun init process in the guest detects the existence of the additional ports and redirects - * the stdin/stdout/stderr of the application in the guest appropriately. + * The function can be called multiple times for adding multiple virtio-console + * devices. In the guest, the consoles will appear in the same order as they are + * added (that is, the first added console will be "hvc0", the second "hvc1", + * ...). However, if the implicit console is not disabled via + * `krun_disable_implicit_console`, the first console created with the function + * will occupy the "hvc1" ID. + * + * This function attaches a multi port virtio-console to the guest. 
If the + * input, output and error file descriptors are TTYs, the device will be created + * with just a single console port (`err_fd` is ignored in this case, because + * error output just goes to the TTY). For each of the non-TTY file descriptors + * an additional non-console port is created + * ("krun-stdin"/"krun-stdout"/"krun-stderr"). The libkrun init process in the + * guest detects the existence of the additional ports and redirects the + * stdin/stdout/stderr of the application in the guest appropriately. * * Arguments: * "ctx_id" - the configuration context ID. @@ -1231,19 +1339,18 @@ int32_t krun_set_kernel_console(uint32_t ctx_id, const char *console_id); * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_virtio_console_default(uint32_t ctx_id, - int input_fd, - int output_fd, - int err_fd); +int32_t krun_add_virtio_console_default(uint32_t ctx_id, int input_fd, + int output_fd, int err_fd); /* * Adds a legacy serial device to the guest. * * The function can be called multiple times for adding multiple serial devices. - * In the guest, the consoles will appear in the same order as they are added (that is, - * the first added console will be "ttyS0", the second "ttyS1", ...). However, if the - * implicit console is not disabled via `krun_disable_implicit_console` on aarch64 or macOS, - * the first console created with the function will occupy the "ttyS1" ID. + * In the guest, the consoles will appear in the same order as they are added + * (that is, the first added console will be "ttyS0", the second "ttyS1", ...). + * However, if the implicit console is not disabled via + * `krun_disable_implicit_console` on aarch64 or macOS, the first console + * created with the function will occupy the "ttyS1" ID. * * Arguments: * "ctx_id" - the configuration context ID. @@ -1253,22 +1360,25 @@ int32_t krun_add_virtio_console_default(uint32_t ctx_id, * Returns: * Zero on success or a negative error number on failure. 
*/ -int32_t krun_add_serial_console_default(uint32_t ctx_id, - int input_fd, - int output_fd); +int32_t krun_add_serial_console_default(uint32_t ctx_id, int input_fd, + int output_fd); /* - * Adds a multi-port virtio-console device to the guest with explicitly configured ports. - * - * This function creates a new virtio-console device that can have multiple ports added to it - * via krun_add_console_port_tty() and krun_add_console_port_inout(). Unlike krun_add_virtio_console_default(), - * this does not do any automatic detections to configure ports based on the file descriptors. - * - * The function can be called multiple times for adding multiple virtio-console devices. - * Each device appears in the guest with port 0 accessible as /dev/hvcN (hvc0, hvc1, etc.) in the order - * devices are added. If the implicit console is not disabled via `krun_disable_implicit_console`, - * the first explicitly added device will occupy the "hvc1" ID. Additional ports within each device - * (port 1, 2, ...) appear as /dev/vportNpM character devices. + * Adds a multi-port virtio-console device to the guest with explicitly + * configured ports. + * + * This function creates a new virtio-console device that can have multiple + * ports added to it via krun_add_console_port_tty() and + * krun_add_console_port_inout(). Unlike krun_add_virtio_console_default(), this + * does not do any automatic detections to configure ports based on the file + * descriptors. + * + * The function can be called multiple times for adding multiple virtio-console + * devices. Each device appears in the guest with port 0 accessible as /dev/hvcN + * (hvc0, hvc1, etc.) in the order devices are added. If the implicit console is + * not disabled via `krun_disable_implicit_console`, the first explicitly added + * device will occupy the "hvc1" ID. Additional ports within each device (port + * 1, 2, ...) appear as /dev/vportNpM character devices. * * Arguments: * "ctx_id" - the configuration context ID. 
@@ -1281,45 +1391,47 @@ int32_t krun_add_virtio_console_multiport(uint32_t ctx_id); /* * Adds a TTY port to a multi-port virtio-console device. * - * The TTY file descriptor is used for both input and output. This port will be marked with the - * VIRTIO_CONSOLE_CONSOLE_PORT flag, enabling console-specific features notably window resize. + * The TTY file descriptor is used for both input and output. This port will be + * marked with the VIRTIO_CONSOLE_CONSOLE_PORT flag, enabling console-specific + * features, notably window resize. * * Arguments: * "ctx_id" - the configuration context ID - * "console_id" - the console ID returned by krun_add_virtio_console_multiport() - * "name" - the name of the port for identifying the port in the guest, can be empty ("") - * "tty_fd" - file descriptor for the TTY to use for both input, output, and determining terminal size + * "console_id" - the console ID returned by krun_add_virtio_console_multiport() + * "name" - the name of the port for identifying the port in the guest, can be + * empty ("") + * "tty_fd" - file descriptor for the TTY to use for both input, output, and + * determining terminal size * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_console_port_tty(uint32_t ctx_id, - uint32_t console_id, - const char *name, - int tty_fd); +int32_t krun_add_console_port_tty(uint32_t ctx_id, uint32_t console_id, + const char *name, int tty_fd); /* - * Adds a generic I/O port to a multi-port virtio-console device, suitable for arbitrary bidirectional - * data streams that don't require terminal functionality. + * Adds a generic I/O port to a multi-port virtio-console device, suitable for + * arbitrary bidirectional data streams that don't require terminal + * functionality. * - * This port will NOT be marked with the VIRTIO_CONSOLE_CONSOLE_PORT flag, meaning it won't support - * console-specific features like window resize signals.
+ * This port will NOT be marked with the VIRTIO_CONSOLE_CONSOLE_PORT flag, + * meaning it won't support console-specific features like window resize + * signals. * * Arguments: * "ctx_id" - the configuration context ID - * "console_id" - the console ID returned by krun_add_virtio_console_multiport() - * "name" - the name of the port for identifying the port in the guest, can be empty ("") - * "input_fd" - file descriptor to use for input (host writes, guest reads) - * "output_fd" - file descriptor to use for output (guest writes, host reads) + * "console_id" - the console ID returned by krun_add_virtio_console_multiport() + * "name" - the name of the port for identifying the port in the guest, can be + * empty ("") + * "input_fd" - file descriptor to use for input (host writes, guest reads) + * "output_fd" - file descriptor to use for output (guest writes, host reads) * * Returns: * Zero on success or a negative error number on failure. */ -int32_t krun_add_console_port_inout(uint32_t ctx_id, - uint32_t console_id, - const char *name, - int input_fd, - int output_fd); +int32_t krun_add_console_port_inout(uint32_t ctx_id, uint32_t console_id, + const char *name, int input_fd, + int output_fd); /** * Configure block device to be used as root filesystem. @@ -1327,23 +1439,29 @@ int32_t krun_add_console_port_inout(uint32_t ctx_id, * Arguments: * "ctx_id" - the configuration context ID. * "device" - a null-terminated string specifying the root device - * (e.g. "/dev/vda1", must refer to a previously configured block device) - * "fstype" - a null-terminated string specifying the filesystem type (e.g. "ext4", can be set to "auto" or NULL) - * "options" - a null-terminated string with a comma-separated list of mount options (can be NULL) + * (e.g. "/dev/vda1", must refer to a previously configured block + * device) + * "fstype" - a null-terminated string specifying the filesystem type (e.g.
"ext4", can be set to "auto" or NULL) + * "options" - a null-terminated string with a comma-separated list of mount options (can be NULL) * * Notes: - * This function can be used if you want a root filesystem backed by a block device instead of a virtiofs path. - * Because libkrun uses its own built-in init process (implemented as a virtual file in the virtiofs driver), - * you'd normally have to copy the executable into every filesystem image (or partition) you intend to boot from. - * This is obviously difficult to maintain, so instead we can create a dummy virtiofs root behind the scenes, - * execute init from it as usual and then switch to the actual root configured by this function. + * This function can be used if you want a root filesystem backed by a block + * device instead of a virtiofs path. Because libkrun uses its own built-in init + * process (implemented as a virtual file in the virtiofs driver), you'd + * normally have to copy the executable into every filesystem image (or + * partition) you intend to boot from. This is obviously difficult to maintain, + * so instead we can create a dummy virtiofs root behind the scenes, execute + * init from it as usual and then switch to the actual root configured by this + * function. */ -int32_t krun_set_root_disk_remount(uint32_t ctx_id, const char *device, const char *fstype, const char *options); +int32_t krun_set_root_disk_remount(uint32_t ctx_id, const char *device, + const char *fstype, const char *options); /** - * Starts and enters the microVM with the configured parameters. The VMM will attempt to take over - * stdin/stdout to manage them on behalf of the process running inside the isolated environment, - * simulating that the latter has direct control of the terminal. + * Starts and enters the microVM with the configured parameters.
The VMM will + * attempt to take over stdin/stdout to manage them on behalf of the process + * running inside the isolated environment, simulating that the latter has + * direct control of the terminal. * * This function consumes the configuration pointed by the context ID. * @@ -1351,15 +1469,17 @@ int32_t krun_set_root_disk_remount(uint32_t ctx_id, const char *device, const ch * "ctx_id" - the configuration context ID. * * Notes: - * This function only returns if an error happens before starting the microVM. Otherwise, the - * VMM assumes it has full control of the process, and will call to exit() with the workload's exit - * code once the microVM shuts down. If an error occurred before running the workload the process - * will exit() with an error exit code. + * This function only returns if an error happens before starting the microVM. + * Otherwise, the VMM assumes it has full control of the process, and will call + * to exit() with the workload's exit code once the microVM shuts down. If an + * error occurred before running the workload the process will exit() with an + * error exit code. * * Error exit codes: * 125 - "init" cannot set up the environment inside the microVM. - * 126 - "init" can find the executable to be run inside the microVM but cannot execute it. - * 127 - "init" cannot find the executable to be run inside the microVM. + * 126 - "init" can find the executable to be run inside the microVM but + * cannot execute it. + * 127 - "init" cannot find the executable to be run inside the microVM. * * Returns: * -EINVAL - The VMM has detected an error in the microVM configuration.
diff --git a/init/Cargo.toml b/init/Cargo.toml new file mode 100644 index 000000000..d71e23bf0 --- /dev/null +++ b/init/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "krun-init" +version = "0.1.0-1.18.0" +edition = "2021" +description = "PID-1 init binary for libkrun guest VMs" +license = "Apache-2.0" +repository = "https://github.com/containers/libkrun" + +[[bin]] +name = "krun-init" +path = "src/main.rs" + +[features] +amd-sev = [] +tdx = [] +timesync = [] + +[dependencies] +anyhow = "1" +libc = "0.2" +nix = { version = "0.30", features = ["fs", "hostname", "ioctl", "mount", "process", "reboot", "resource", "signal", "socket", "term", "uio"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" diff --git a/init/dhcp.c b/init/dhcp.c deleted file mode 100644 index e852bded8..000000000 --- a/init/dhcp.c +++ /dev/null @@ -1,620 +0,0 @@ -/* - * DHCP Client Implementation - * - * Standalone DHCP client for configuring IPv4 network interfaces. - * Translated from Rust implementation in muvm/src/guest/net.rs - */ - -#include "dhcp.h" - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DHCP_BUFFER_SIZE 576 -#define DHCP_MSG_OFFER 2 -#define DHCP_MSG_ACK 5 - -/* Helper function to send netlink message */ -static int nl_send(int sock, struct nlmsghdr *nlh) -{ - struct sockaddr_nl sa = { - .nl_family = AF_NETLINK, - }; - - struct iovec iov = { - .iov_base = nlh, - .iov_len = nlh->nlmsg_len, - }; - - struct msghdr msg = { - .msg_name = &sa, - .msg_namelen = sizeof(sa), - .msg_iov = &iov, - .msg_iovlen = 1, - }; - - return sendmsg(sock, &msg, 0); -} - -/* Helper function to receive netlink response */ -static int nl_recv(int sock, char *buf, size_t len) -{ - struct sockaddr_nl sa; - struct iovec iov = { - .iov_base = buf, - .iov_len = len, - }; - - struct msghdr msg = { - .msg_name = &sa, - .msg_namelen = sizeof(sa), - .msg_iov = &iov, - 
.msg_iovlen = 1, - }; - - return recvmsg(sock, &msg, 0); -} - -/* Add routing attribute to netlink message */ -static void add_rtattr(struct nlmsghdr *nlh, int type, const void *data, - int len) -{ - int rtalen = RTA_SPACE(len); - struct rtattr *rta = - (struct rtattr *)(((char *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len)); - rta->rta_type = type; - rta->rta_len = RTA_LENGTH(len); - memcpy(RTA_DATA(rta), data, len); - nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + rtalen; -} - -/* Set MTU */ -static int set_mtu(int nl_sock, int iface_index, unsigned int mtu) -{ - char buf[4096]; - struct nlmsghdr *nlh; - struct nlmsgerr *err; - struct ifinfomsg *ifi; - - memset(buf, 0, sizeof(buf)); - nlh = (struct nlmsghdr *)buf; - nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); - nlh->nlmsg_type = RTM_NEWLINK; - nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; - nlh->nlmsg_seq = 1; - nlh->nlmsg_pid = getpid(); - - ifi = (struct ifinfomsg *)NLMSG_DATA(nlh); - ifi->ifi_family = AF_UNSPEC; - ifi->ifi_type = ARPHRD_ETHER; - ifi->ifi_index = iface_index; - - add_rtattr(nlh, IFLA_MTU, &mtu, sizeof(mtu)); - - if (nl_send(nl_sock, nlh) < 0) { - perror("nl_send failed for set_mtu"); - return -1; - } - - /* Receive ACK */ - int len = nl_recv(nl_sock, buf, sizeof(buf)); - if (len < (int)NLMSG_LENGTH(sizeof(struct nlmsgerr))) { - perror("nl_recv failed for set_mtu"); - return -1; - } - - if (nlh->nlmsg_type != NLMSG_ERROR) { - printf("netlink didn't return a valid answer for set_mtu\n"); - return -1; - } - - err = (struct nlmsgerr *)NLMSG_DATA(nlh); - if (err->error != 0) { - printf("netlink returned an error for set_mtu: %d\n", err->error); - return -1; - } - - return 0; -} - -/* Add or delete IPv4 route */ -static int mod_route4(int nl_sock, int iface_index, int cmd, struct in_addr gw) -{ - char buf[4096]; - struct nlmsghdr *nlh; - struct nlmsgerr *err; - struct rtmsg *rtm; - struct in_addr dst = {.s_addr = INADDR_ANY}; - - memset(buf, 0, sizeof(buf)); - nlh = (struct nlmsghdr *)buf; - 
nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); - nlh->nlmsg_type = cmd; - nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK; - nlh->nlmsg_seq = 1; - nlh->nlmsg_pid = getpid(); - - rtm = (struct rtmsg *)NLMSG_DATA(nlh); - rtm->rtm_family = AF_INET; - rtm->rtm_dst_len = 0; - rtm->rtm_src_len = 0; - rtm->rtm_tos = 0; - rtm->rtm_table = RT_TABLE_MAIN; - rtm->rtm_protocol = RTPROT_BOOT; - rtm->rtm_scope = RT_SCOPE_UNIVERSE; - rtm->rtm_type = RTN_UNICAST; - rtm->rtm_flags = 0; - - add_rtattr(nlh, RTA_OIF, &iface_index, sizeof(iface_index)); - add_rtattr(nlh, RTA_DST, &dst, sizeof(dst)); - add_rtattr(nlh, RTA_GATEWAY, &gw, sizeof(gw)); - - if (nl_send(nl_sock, nlh) < 0) { - perror("nl_send failed for mod_route4"); - return -1; - } - - /* Receive ACK */ - int len = nl_recv(nl_sock, buf, sizeof(buf)); - if (len < (int)NLMSG_LENGTH(sizeof(struct nlmsgerr))) { - perror("nl_recv failed for mod_route4"); - return -1; - } - - if (nlh->nlmsg_type != NLMSG_ERROR) { - printf("netlink didn't return a valid answer for mod_route4\n"); - return -1; - } - - err = (struct nlmsgerr *)NLMSG_DATA(nlh); - if (err->error != 0) { - printf("netlink returned an error for mod_route4: %d\n", err->error); - return -1; - } - - return 0; -} - -/* Add or delete IPv4 address */ -static int mod_addr4(int nl_sock, int iface_index, int cmd, struct in_addr addr, - unsigned char prefix_len) -{ - char buf[4096]; - struct nlmsghdr *nlh; - struct nlmsgerr *err; - struct ifaddrmsg *ifa; - - memset(buf, 0, sizeof(buf)); - nlh = (struct nlmsghdr *)buf; - nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)); - nlh->nlmsg_type = cmd; - nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK; - nlh->nlmsg_seq = 1; - nlh->nlmsg_pid = getpid(); - - ifa = (struct ifaddrmsg *)NLMSG_DATA(nlh); - ifa->ifa_family = AF_INET; - ifa->ifa_prefixlen = prefix_len; - ifa->ifa_flags = 0; - ifa->ifa_scope = RT_SCOPE_UNIVERSE; - ifa->ifa_index = iface_index; - - add_rtattr(nlh, IFA_LOCAL, &addr, sizeof(addr)); 
- add_rtattr(nlh, IFA_ADDRESS, &addr, sizeof(addr)); - - if (nl_send(nl_sock, nlh) < 0) { - perror("nl_send failed for mod_addr4"); - return -1; - } - - /* Receive ACK */ - int len = nl_recv(nl_sock, buf, sizeof(buf)); - if (len < (int)NLMSG_LENGTH(sizeof(struct nlmsgerr))) { - perror("nl_recv failed for mod_addr4"); - return -1; - } - - if (nlh->nlmsg_type != NLMSG_ERROR) { - printf("netlink didn't return a valid answer for mod_addr4\n"); - return -1; - } - - err = (struct nlmsgerr *)NLMSG_DATA(nlh); - if (err->error != 0) { - printf("netlink returned an error for mod_addr4: %d\n", err->error); - return -1; - } - - return 0; -} - -/* Count leading ones in a 32-bit value */ -static unsigned char count_leading_ones(uint32_t val) -{ - unsigned char count = 0; - for (int i = 31; i >= 0; i--) { - if (val & (1U << i)) { - count++; - } else { - break; - } - } - return count; -} - -/* Return the DHCP message type (option 53) from a response, or 0 */ -static unsigned char get_dhcp_msg_type(const unsigned char *response, - ssize_t len) -{ - /* Walk DHCP options (TLV chain starting after the magic cookie) */ - size_t p = 240; - while (p < (size_t)len) { - unsigned char opt = response[p]; - - if (opt == 0xff) /* end */ - break; - if (opt == 0) { /* padding */ - p++; - continue; - } - - if (p + 1 >= (size_t)len) - break; - - unsigned char opt_len = response[p + 1]; - p += 2; - - if (p + opt_len > (size_t)len) - break; - if (opt == 53 && opt_len >= 1) /* Message Type */ - return response[p]; - - p += opt_len; - } - return 0; -} - -/* Parse a DHCP ACK and configure the interface. Returns 0 or -1 on error. 
*/ -static int handle_dhcp_ack(int nl_sock, int iface_index, - const unsigned char *response, ssize_t len) -{ - /* Need at least 240 bytes (DHCP header + magic cookie) + 1 for options */ - if (len < 241) { - printf("DHCPACK too short (%zd bytes)\n", len); - return -1; - } - - /* Parse DHCP response */ - struct in_addr addr; - /* yiaddr is at offset 16-19 in network byte order */ - memcpy(&addr.s_addr, &response[16], sizeof(addr.s_addr)); - - if (addr.s_addr == INADDR_ANY) { - printf("DHCPACK has no address (yiaddr is 0.0.0.0)\n"); - return -1; - } - - struct in_addr netmask = {.s_addr = INADDR_ANY}; - struct in_addr router = {.s_addr = INADDR_ANY}; - /* Clamp MTU to passt's limit */ - uint16_t mtu = 65520; - - FILE *resolv = fopen("/etc/resolv.conf", "w"); - if (!resolv) { - perror("Failed to open /etc/resolv.conf"); - } - - /* Parse DHCP options (start at offset 240 after magic cookie) */ - size_t p = 240; - while (p < (size_t)len) { - unsigned char opt = response[p]; - - if (opt == 0xff) { - /* Option 255: End (of options) */ - break; - } - - if (opt == 0) { /* Padding */ - p++; - continue; - } - - if (p + 1 >= (size_t)len) - break; - - unsigned char opt_len = response[p + 1]; - p += 2; /* Length doesn't include code and length field itself */ - - if (p + opt_len > (size_t)len) { - /* Malformed packet, option length exceeds packet boundary */ - break; - } - - if (opt == 1 && opt_len >= 4) { - /* Option 1: Subnet Mask */ - memcpy(&netmask.s_addr, &response[p], sizeof(netmask.s_addr)); - } else if (opt == 3 && opt_len >= 4) { - /* Option 3: Router */ - memcpy(&router.s_addr, &response[p], sizeof(router.s_addr)); - } else if (opt == 6 && opt_len >= 4) { - /* Option 6: Domain Name Server */ - if (resolv) { - for (int dns_p = p; dns_p + 4 <= p + opt_len; dns_p += 4) { - fprintf(resolv, "nameserver %d.%d.%d.%d\n", response[dns_p], - response[dns_p + 1], response[dns_p + 2], - response[dns_p + 3]); - } - } - } else if (opt == 26 && opt_len >= 2) { - /* Option 26: 
Interface MTU */ - mtu = (response[p] << 8) | response[p + 1]; - - /* We don't know yet if IPv6 is available: don't go below 1280 B - */ - if (mtu < 1280) - mtu = 1280; - if (mtu > 65520) - mtu = 65520; - } - - p += opt_len; - } - - if (resolv) { - fclose(resolv); - } - - /* Calculate prefix length from netmask */ - unsigned char prefix_len = count_leading_ones(ntohl(netmask.s_addr)); - - if (mod_addr4(nl_sock, iface_index, RTM_NEWADDR, addr, prefix_len) != 0) { - printf("couldn't add the address provided by the DHCP server\n"); - return -1; - } - if (mod_route4(nl_sock, iface_index, RTM_NEWROUTE, router) != 0) { - printf("couldn't add the default route provided by the DHCP server\n"); - return -1; - } - set_mtu(nl_sock, iface_index, mtu); - return 0; -} - -/* Send DISCOVER with Rapid Commit, process ACK, configure address and route */ -int do_dhcp(const char *iface) -{ - struct sockaddr_in bind_addr, dest_addr; - struct dhcp_packet request = {0}; - unsigned char response[DHCP_BUFFER_SIZE]; - struct timeval timeout; - int iface_index; - int broadcast = 1; - int nl_sock = -1; - int sock = -1; - int ret = -1; - - iface_index = if_nametoindex(iface); - if (iface_index == 0) { - perror("Failed to find index for network interface"); - return ret; - } - - nl_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); - if (nl_sock < 0) { - perror("Failed to create netlink socket"); - return ret; - } - - struct sockaddr_nl sa = { - .nl_family = AF_NETLINK, - .nl_pid = getpid(), - .nl_groups = 0, - }; - - if (bind(nl_sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - perror("Failed to bind netlink socket"); - goto cleanup; - } - - /* Send request (DHCPDISCOVER) */ - sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); - if (sock < 0) { - perror("socket failed"); - goto cleanup; - } - - /* Allow broadcast */ - if (setsockopt(sock, SOL_SOCKET, SO_BROADCAST, &broadcast, - sizeof(broadcast)) < 0) { - perror("setsockopt SO_BROADCAST failed"); - goto cleanup; - } - - if (setsockopt(sock, 
SOL_SOCKET, SO_BINDTODEVICE, iface, - strlen(iface) + 1) < 0) { - perror("setsockopt SO_BINDTODEVICE failed"); - goto cleanup; - } - - /* Bind to port 68 (DHCP client) */ - memset(&bind_addr, 0, sizeof(bind_addr)); - bind_addr.sin_family = AF_INET; - bind_addr.sin_port = htons(68); - bind_addr.sin_addr.s_addr = INADDR_ANY; - - if (bind(sock, (struct sockaddr *)&bind_addr, sizeof(bind_addr)) < 0) { - perror("bind failed"); - goto cleanup; - } - - request.op = 1; /* BOOTREQUEST */ - request.htype = 1; /* Hardware address type: Ethernet */ - request.hlen = 6; /* Hardware address length */ - request.hops = 0; /* DHCP relay Hops */ - request.xid = - htonl(getpid()); /* Transaction ID: use PID for some randomness */ - request.secs = - 0; /* Seconds elapsed since beginning of acquisition or renewal */ - request.flags = htons(0x8000); /* DHCP message flags: Broadcast */ - request.ciaddr = 0; /* Client IP address (not set yet) */ - request.yiaddr = 0; /* 'your' IP address (server will fill) */ - request.siaddr = 0; /* Server IP address (not set) */ - request.giaddr = 0; /* Relay agent IP address (not set) */ - request.magic = htonl(0x63825363); /* Magic cookie */ - - /* Populate chaddr with the interface's MAC address */ - struct ifreq mac_ifr; - memset(&mac_ifr, 0, sizeof(mac_ifr)); - strncpy(mac_ifr.ifr_name, iface, IFNAMSIZ); - - if (ioctl(sock, SIOCGIFHWADDR, &mac_ifr) < 0) { - perror("ioctl(SIOCGIFHWADDR) failed"); - goto cleanup; - } - memcpy(request.chaddr, mac_ifr.ifr_hwaddr.sa_data, 6); - - /* Build DHCP options */ - int opt_offset = 0; - - /* Option 53: DHCP Message Type = DISCOVER (1) */ - request.options[opt_offset++] = 53; - request.options[opt_offset++] = 1; - request.options[opt_offset++] = 1; - - /* Option 80: Rapid Commit (RFC 4039) */ - request.options[opt_offset++] = 80; - request.options[opt_offset++] = 0; - - /* Option 255: End of options */ - request.options[opt_offset++] = 0xff; - - /* Remaining bytes are padding (up to 300 bytes) */ - - /* Send DHCP 
DISCOVER */ - memset(&dest_addr, 0, sizeof(dest_addr)); - dest_addr.sin_family = AF_INET; - dest_addr.sin_port = htons(67); - dest_addr.sin_addr.s_addr = INADDR_BROADCAST; - - if (sendto(sock, &request, sizeof(request), 0, - (struct sockaddr *)&dest_addr, sizeof(dest_addr)) < 0) { - perror("sendto failed"); - goto cleanup; - } - - /* Keep IPv6-only fast: set receive timeout to 100ms */ - timeout.tv_sec = 0; - timeout.tv_usec = 100000; - if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &timeout, sizeof(timeout)) < - 0) { - perror("setsockopt SO_RCVTIMEO failed"); - goto cleanup; - } - - /* Get response: DHCPACK (Rapid Commit) or DHCPOFFER */ - struct sockaddr_in from_addr; - socklen_t from_len = sizeof(from_addr); - ssize_t len = recvfrom(sock, response, sizeof(response), 0, - (struct sockaddr *)&from_addr, &from_len); - - if (len <= 0) - goto done; /* No DHCP response — not an error, VM may be IPv6-only */ - - unsigned char msg_type = get_dhcp_msg_type(response, len); - - if (msg_type == DHCP_MSG_ACK) { - /* Rapid Commit — server sent ACK directly */ - close(sock); - sock = -1; - if (handle_dhcp_ack(nl_sock, iface_index, response, len) != 0) - goto cleanup; - } else if (msg_type == DHCP_MSG_OFFER) { - /* - * DHCPOFFER — complete the 4-way handshake by sending DHCPREQUEST - * and waiting for DHCPACK. Servers without Rapid Commit (e.g. - * gvproxy) require this. 
- */ - struct in_addr offered_addr; - memcpy(&offered_addr.s_addr, &response[16], - sizeof(offered_addr.s_addr)); - - /* Build DHCPREQUEST */ - memset(request.options, 0, sizeof(request.options)); - opt_offset = 0; - - /* Option 53: DHCP Message Type = REQUEST (3) */ - request.options[opt_offset++] = 53; - request.options[opt_offset++] = 1; - request.options[opt_offset++] = 3; - - /* Option 50: Requested IP Address */ - request.options[opt_offset++] = 50; - request.options[opt_offset++] = 4; - memcpy(&request.options[opt_offset], &offered_addr.s_addr, 4); - opt_offset += 4; - - /* Option 54: Server Identifier (from_addr) */ - request.options[opt_offset++] = 54; - request.options[opt_offset++] = 4; - memcpy(&request.options[opt_offset], &from_addr.sin_addr.s_addr, 4); - opt_offset += 4; - - /* Option 255: End */ - request.options[opt_offset++] = 0xff; - - if (sendto(sock, &request, sizeof(request), 0, - (struct sockaddr *)&dest_addr, sizeof(dest_addr)) < 0) { - perror("sendto DHCPREQUEST failed"); - goto cleanup; - } - - from_len = sizeof(from_addr); - len = recvfrom(sock, response, sizeof(response), 0, - (struct sockaddr *)&from_addr, &from_len); - - close(sock); - sock = -1; - - if (len <= 0) { - printf("no DHCPACK received\n"); - goto cleanup; - } - - if (get_dhcp_msg_type(response, len) != DHCP_MSG_ACK) { - printf("expected DHCPACK but got message type %d\n", - get_dhcp_msg_type(response, len)); - goto cleanup; - } - - if (handle_dhcp_ack(nl_sock, iface_index, response, len) != 0) - goto cleanup; - } else { - printf("unexpected DHCP message type %d\n", msg_type); - goto cleanup; - } - -done: - ret = 0; -cleanup: - if (sock >= 0) { - close(sock); - } - if (nl_sock >= 0) { - close(nl_sock); - } - return ret; -} diff --git a/init/dhcp.h b/init/dhcp.h deleted file mode 100644 index 39e20ead7..000000000 --- a/init/dhcp.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * DHCP Client Implementation - * - * Standalone DHCP client for configuring IPv4 network interfaces. 
- * Translated from Rust implementation in muvm/src/guest/net.rs - */ - -#ifndef DHCP_H -#define DHCP_H - -#include - -/* BOOTP vendor-specific area size (64) - magic cookie (4) */ -#define DHCP_OPTIONS_SIZE 60 - -/* DHCP packet structure (RFC 2131) */ -struct dhcp_packet { - uint8_t op; /* Message op code / message type (1 = BOOTREQUEST) */ - uint8_t htype; /* Hardware address type (1 = Ethernet) */ - uint8_t hlen; /* Hardware address length (6 for Ethernet) */ - uint8_t hops; /* Client sets to zero */ - uint32_t xid; /* Transaction ID */ - uint16_t secs; /* Seconds elapsed since client began address acquisition */ - uint16_t flags; /* Flags (0x8000 = Broadcast) */ - uint32_t ciaddr; /* Client IP address */ - uint32_t yiaddr; /* 'your' (client) IP address */ - uint32_t siaddr; /* IP address of next server to use in bootstrap */ - uint32_t giaddr; /* Relay agent IP address */ - uint8_t chaddr[16]; /* Client hardware address */ - uint8_t sname[64]; /* Optional server host name */ - uint8_t file[128]; /* Boot file name */ - uint32_t magic; /* Magic cookie (0x63825363) */ - uint8_t options[DHCP_OPTIONS_SIZE]; /* Options field */ -} __attribute__((packed)); - -/* - * Perform DHCP discovery and configuration for a network interface - * - * This function: - * 1. Binds a UDP socket to the interface using SO_BINDTODEVICE - * 2. Sends a DHCP DISCOVER message with Rapid Commit option - * 3. Waits up to 100ms for a response: - * - If DHCPACK (Rapid Commit): applies configuration directly - * - If DHCPOFFER: sends DHCPREQUEST and waits for DHCPACK - * - If no response: returns success (VM may be IPv6-only) - * 4. Parses the ACK and configures: - * - IPv4 address with appropriate prefix length - * - Default gateway route - * - DNS servers (overwriting /etc/resolv.conf) - * - Interface MTU - * - * Parameters: - * iface - The name of the network interface to be configured. 
- * - * Returns: - * 0 on success (whether or not DHCP response was received) - * -1 on error - */ -int do_dhcp(const char *iface); - -#endif /* DHCP_H */ diff --git a/init/init.c b/init/init.c deleted file mode 100644 index 4b205db69..000000000 --- a/init/init.c +++ /dev/null @@ -1,1581 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#if __FreeBSD__ -#include -#include -#include -#else -#include -#endif -#include -#include -#include - -#if __linux__ -#include -#include -#include -#endif - -#include "dhcp.h" -#include "jsmn.h" - -#ifdef SEV -#include "tee/snp_attest.h" -#endif - -#define KRUN_EXIT_CODE_IOCTL 0x7602 -#define KRUN_REMOVE_ROOT_DIR_IOCTL 0x7603 - -#define KRUN_MAGIC "KRUN" -#define KRUN_FOOTER_LEN 12 -#define CMDLINE_SECRET_PATH "/sfs/secrets/coco/cmdline" -#define CONFIG_FILE_PATH "/.krun_config.json" -#define MAX_PASS_SIZE 512 -#define MAX_TOKENS 16384 - -static int jsoneq(const char *, jsmntok_t *, const char *); - -#ifdef SEV -static char *sev_get_luks_passphrase(int *); -static char *snp_get_luks_passphrase(char *, char *, char *, int *); -#endif - -char DEFAULT_KRUN_INIT[] = "/bin/sh"; - -#if __FreeBSD__ -static char *get_kenv(const char *name) -{ - static char kenv_value[KENV_MVALLEN + 1]; - if (kenv(KENV_GET, name, kenv_value, KENV_MVALLEN + 1) < 0) { - return NULL; - } - return kenv_value; -} - -#define getenv get_kenv - -#define _PATH_CONSOLE "/dev/console" -#define _PATH_DEVNULL "/dev/null" -#define _PATH_INITLOG "/init.log" -/* - * Start a session and allocate a controlling terminal. - * Only called by children of init after forking. - */ -static void open_console(void) -{ - int fd; - - /* - * Try to open /dev/console. Open the device with O_NONBLOCK to - * prevent potential blocking on a carrier. 
- */ - revoke(_PATH_CONSOLE); - if ((fd = open(_PATH_CONSOLE, O_RDWR | O_NONBLOCK)) != -1) { - (void)fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) & ~O_NONBLOCK); - if (login_tty(fd) == 0) - return; - close(fd); - } - - /* No luck. Log output to file if possible. */ - if ((fd = open(_PATH_DEVNULL, O_RDWR)) == -1) { - _exit(1); - } - if (fd != STDIN_FILENO) { - dup2(fd, STDIN_FILENO); - close(fd); - } - fd = open(_PATH_INITLOG, O_WRONLY | O_APPEND | O_CREAT, 0644); - if (fd == -1) - dup2(STDIN_FILENO, STDOUT_FILENO); - else if (fd != STDOUT_FILENO) { - dup2(fd, STDOUT_FILENO); - close(fd); - } - dup2(STDOUT_FILENO, STDERR_FILENO); -} - -#define KRUN_CONFIG_ISO_DEV "/dev/iso9660/KRUN_CONFIG" -#define ISO_CONFIG_FILE_PATH "/mnt/krun_config.json" - -bool config_file_from_iso(const char **path) -{ - const char *iov_args[] = {"fstype", "cd9660", "fspath", - "/mnt", "from", KRUN_CONFIG_ISO_DEV}; - - const int iovlen = sizeof(iov_args) / sizeof(iov_args[0]); - struct iovec iov[iovlen]; - int i; - - struct stat st; - // mkdir can fail with read-only fs error, - // so we rather check if /mnt exists first - if (stat("/mnt", &st) != 0) { - if (errno != ENOENT) { - perror("stat(/mnt)"); - exit(-1); - } - if (mkdir("/mnt", 0755) < 0) { - perror("mkdir(/mnt)"); - exit(-1); - } - } - - for (i = 0; i < iovlen; i++) { - iov[i].iov_base = (void *)iov_args[i]; - iov[i].iov_len = strlen(iov_args[i]) + 1; - } - - if (nmount(iov, iovlen, MNT_RDONLY) < 0) { - *path = NULL; - return false; - } - *path = ISO_CONFIG_FILE_PATH; - return true; -} - -int unmount_config_iso() -{ - return unmount("/mnt", 0); -} -#endif - -static void set_rlimits(const char *rlimits) -{ - unsigned long long int lim_id, lim_cur, lim_max; - struct rlimit rlim; - char *item = (char *)rlimits; - - while (1) { - lim_id = lim_cur = lim_max = ULLONG_MAX; - - lim_id = strtoull(item, &item, 10); - if (lim_id == ULLONG_MAX) { - printf("Invalid rlimit ID\n"); - break; - } - - item++; - lim_cur = strtoull(item, &item, 10); - item++; 
- lim_max = strtoull(item, &item, 10); - - rlim.rlim_cur = lim_cur; - rlim.rlim_max = lim_max; - if (setrlimit(lim_id, &rlim) != 0) { - printf("Error setting rlimit for ID=%lld\n", lim_id); - } - - if (*item != '\0') { - item++; - } else { - break; - } - } -} - -#ifdef SEV -/* - * The LUKS passphrase is obtained from a KBS attestation server, complete an - * SNP attestation to get the passphrase. - */ -static char *get_luks_passphrase(int *pass_len) -{ - int fd, ret, num_tokens, wid_found, url_found, tee_found, tee_data_found; - uint64_t dev_size, tc_size; - char wid[256], url[256], *tc_json, *tok_start, *tok_end; - char footer[KRUN_FOOTER_LEN], tee[256], tee_data[256], *return_str; - jsmn_parser parser; - jsmntok_t *tokens; - size_t tok_size; - - return_str = NULL; - - /* - * If a user registered the TEE config data disk with - * krun_set_data_disk(), it would appear as /dev/vdb in the guest. - * Mount this device and read the config. - */ - if (mkdir("/dev", 0755) < 0 && errno != EEXIST) { - perror("mkdir(/dev)"); - goto finish; - } - - if (mount("devtmpfs", "/dev", "devtmpfs", MS_RELATIME, NULL) < 0 && - errno != EBUSY) { - perror("mount(devtmpfs)"); - - goto rmdir_dev; - } - - fd = open("/dev/vda", O_RDONLY); - if (fd < 0) { - perror("open(/dev/vda)"); - - goto umount_dev; - } - - ret = ioctl(fd, BLKGETSIZE64, &dev_size); - if (ret != 0) { - perror("ioctl(BLKGETSIZE64)"); - - goto close_dev; - } - - if (lseek(fd, dev_size - KRUN_FOOTER_LEN, SEEK_SET) == -1) { - perror("lseek(END - KRUN_FOOTER_LEN)"); - - goto close_dev; - } - - ret = read(fd, &footer[0], KRUN_FOOTER_LEN); - if (ret != KRUN_FOOTER_LEN) { - perror("read(KRUN_FOOTER_LEN)"); - - goto close_dev; - } - - if (memcmp(&footer[0], KRUN_MAGIC, 4) != 0) { - printf("Couldn't find KRUN footer signature, falling back to SEV\n"); - return_str = sev_get_luks_passphrase(pass_len); - - goto close_dev; - } - - tc_size = *(uint64_t *)&footer[4]; - - if (lseek(fd, dev_size - tc_size - KRUN_FOOTER_LEN, SEEK_SET) == 
-1) { - perror("lseek(END - tc_size - KRUN_FOOTER_LEN)"); - - goto close_dev; - } - - tc_json = malloc(tc_size + 1); - if (tc_json == NULL) { - perror("malloc(tc_size)"); - - goto close_dev; - } - - ret = read(fd, tc_json, tc_size); - if (ret != tc_size) { - perror("read(tc_size)"); - - goto free_mem; - } - tc_json[tc_size] = '\0'; - - /* - * Parse the TEE config's workload_id and attestation_url field. - */ - jsmn_init(&parser); - - tokens = (jsmntok_t *)malloc(sizeof(jsmntok_t) * MAX_TOKENS); - if (tokens == NULL) { - perror("malloc(jsmntok_t)"); - - goto free_mem; - } - - num_tokens = - jsmn_parse(&parser, tc_json, strlen(tc_json), tokens, MAX_TOKENS); - if (num_tokens < 0) { - printf("Unable to allocate JSON tokens\n"); - - goto free_mem; - } else if (num_tokens < 1 || tokens[0].type != JSMN_OBJECT) { - printf("Unable to find object in TEE configuration file\n"); - - goto free_mem; - } - - wid_found = url_found = tee_found = tee_data_found = 0; - - for (int i = 1; i < num_tokens - 1; ++i) { - tok_start = tc_json + tokens[i + 1].start; - tok_end = tc_json + tokens[i + 1].end; - tok_size = tok_end - tok_start; - if (!jsoneq(tc_json, &tokens[i], "workload_id")) { - strncpy(wid, tok_start, tok_size); - wid_found = 1; - } else if (!jsoneq(tc_json, &tokens[i], "attestation_url")) { - strncpy(url, tok_start, tok_size); - url_found = 1; - } else if (!jsoneq(tc_json, &tokens[i], "tee")) { - strncpy(tee, tok_start, tok_size); - tee_found = 1; - } else if (!jsoneq(tc_json, &tokens[i], "tee_data")) { - strncpy(tee_data, tok_start, tok_size); - tee_data_found = 1; - } - } - - if (!wid_found) { - printf("Unable to find attestation workload ID\n"); - - goto free_mem; - } else if (!url_found) { - printf("Unable to find attestation server URL\n"); - - goto free_mem; - } else if (!tee_found) { - printf("Unable to find TEE generation server URL\n"); - - goto free_mem; - } - - if (strcmp(tee, "snp") == 0) { - if (tee_data_found == 0) { - printf("Unable to find SNP generation\n"); 
- goto free_mem; - } - - return_str = snp_get_luks_passphrase(url, wid, tee_data, pass_len); - } else if (strcmp(tee, "sev") == 0) { - return_str = sev_get_luks_passphrase(pass_len); - } - -free_mem: - free(tc_json); - -close_dev: - close(fd); - -umount_dev: - umount("/dev"); - -rmdir_dev: - rmdir("/dev"); - -finish: - return return_str; -} - -static char *snp_get_luks_passphrase(char *url, char *wid, char *tee_data, - int *pass_len) -{ - char *pass; - - pass = (char *)malloc(MAX_PASS_SIZE); - if (pass == NULL) { - return NULL; - } - - if (snp_attest(pass, url, wid, tee_data) == 0) { - *pass_len = strlen(pass); - return pass; - } - - free(pass); - - return NULL; -} - -static char *sev_get_luks_passphrase(int *pass_len) -{ - char *pass = NULL; - int len; - int fd; - - pass = getenv("KRUN_PASS"); - if (pass) { - *pass_len = strnlen(pass, MAX_PASS_SIZE); - return pass; - } - if (mkdir("/sfs", 0755) < 0 && errno != EEXIST) { - perror("mkdir(/sfs)"); - return NULL; - } - - if (mount("securityfs", "/sfs", "securityfs", - MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) { - perror("mount(/sfs)"); - goto cleanup_dir; - } - - fd = open(CMDLINE_SECRET_PATH, O_RDONLY); - if (fd < 0) { - goto cleanup_sfs; - } - - pass = malloc(MAX_PASS_SIZE); - if (!pass) { - goto cleanup_fd; - } - - if ((len = read(fd, pass, MAX_PASS_SIZE)) < 0) { - free(pass); - pass = NULL; - } else { - *pass_len = len; - unlink(CMDLINE_SECRET_PATH); - } - -cleanup_fd: - close(fd); -cleanup_sfs: - umount("/sfs"); -cleanup_dir: - rmdir("/sfs"); - - return pass; -} - -static int chroot_luks() -{ - char *pass; - int pass_len; - int pid; - int pipefd[2]; - int wstatus; - - pass = get_luks_passphrase(&pass_len); - if (!pass) { - printf("Couldn't find LUKS passphrase\n"); - return -1; - } - - printf("Unlocking LUKS root filesystem\n"); - - if (mount("proc", "/proc", "proc", - MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) { - perror("mount(/proc)"); - return -1; - } - - pipe(pipefd); - - pid 
= fork(); - if (pid == 0) { - close(pipefd[1]); - dup2(pipefd[0], 0); - close(pipefd[0]); - - if (execl("/sbin/cryptsetup", "cryptsetup", "open", "/dev/vda", - "luksroot", "-", NULL) < 0) { - perror("execl"); - return -1; - } - } else { - write(pipefd[1], pass, strnlen(pass, pass_len)); - close(pipefd[1]); - waitpid(pid, &wstatus, 0); - } - - memset(pass, 0, pass_len); - - printf("Mounting LUKS root filesystem\n"); - - if (mount("/dev/mapper/luksroot", "/luksroot", "ext4", 0, NULL) < 0) { - perror("mount(/luksroot)"); - return -1; - } - - chdir("/luksroot"); - - if (mount(".", "/", NULL, MS_MOVE, NULL)) { - perror("remount root"); - return -1; - } - chroot("."); - - return 0; -} -#endif - -static int mount_filesystems() -{ -#if __linux__ - char *const DIRS_LEVEL1[] = {"/dev", "/proc", "/sys"}; - char *const DIRS_LEVEL2[] = {"/dev/pts", "/dev/shm"}; - int i; - - for (i = 0; i < 3; ++i) { - if (mkdir(DIRS_LEVEL1[i], 0755) < 0 && errno != EEXIST) { - printf("Error creating directory (%s)\n", DIRS_LEVEL1[i]); - return -1; - } - } - - if (mount("devtmpfs", "/dev", "devtmpfs", MS_RELATIME, NULL) < 0 && - errno != EBUSY) { - perror("mount(/dev)"); - return -1; - } - - if (mount("proc", "/proc", "proc", - MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0 && - errno != EBUSY) { - perror("mount(/proc)"); - return -1; - } - - if (mount("sysfs", "/sys", "sysfs", - MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0 && - errno != EBUSY) { - perror("mount(/sys)"); - return -1; - } - - if (mount("cgroup2", "/sys/fs/cgroup", "cgroup2", - MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0 && - errno != EBUSY) { - perror("mount(/sys/fs/cgroup)"); - return -1; - } - - for (i = 0; i < 2; ++i) { - if (mkdir(DIRS_LEVEL2[i], 0755) < 0 && errno != EEXIST) { - printf("Error creating directory (%s)\n", DIRS_LEVEL2[i]); - return -1; - } - } - - if (mount("devpts", "/dev/pts", "devpts", - MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0 && - errno != EBUSY) { - 
perror("mount(/dev/pts)"); - return -1; - } - - if (mount("tmpfs", "/dev/shm", "tmpfs", MS_NOEXEC | MS_NOSUID | MS_RELATIME, - NULL) < 0 && - errno != EBUSY) { - perror("mount(/dev/shm)"); - return -1; - } - - /* May fail if already exists and that's fine. */ - symlink("/proc/self/fd", "/dev/fd"); -#endif - return 0; -} - -/* - * hexToDigit, Utf32toUtf8 and parts of unescape_string are taken from libyajl: - * - * Copyright (c) 2007-2014, Lloyd Hilaiel - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ -static void hexToDigit(unsigned int *val, const unsigned char *hex) -{ - unsigned int i; - for (i = 0; i < 4; i++) { - unsigned char c = hex[i]; - if (c >= 'A') - c = (c & ~0x20) - 7; - c -= '0'; - *val = (*val << 4) | c; - } -} - -static void Utf32toUtf8(unsigned int codepoint, char *utf8Buf) -{ - if (codepoint < 0x80) { - utf8Buf[0] = (char)codepoint; - utf8Buf[1] = 0; - } else if (codepoint < 0x0800) { - utf8Buf[0] = (char)((codepoint >> 6) | 0xC0); - utf8Buf[1] = (char)((codepoint & 0x3F) | 0x80); - utf8Buf[2] = 0; - } else if (codepoint < 0x10000) { - utf8Buf[0] = (char)((codepoint >> 12) | 0xE0); - utf8Buf[1] = (char)(((codepoint >> 6) & 0x3F) | 0x80); - utf8Buf[2] = (char)((codepoint & 0x3F) | 0x80); - utf8Buf[3] = 0; - } else if (codepoint < 0x200000) { - utf8Buf[0] = (char)((codepoint >> 18) | 0xF0); - utf8Buf[1] = (char)(((codepoint >> 12) & 0x3F) | 0x80); - utf8Buf[2] = (char)(((codepoint >> 6) & 0x3F) | 0x80); - utf8Buf[3] = (char)((codepoint & 0x3F) | 0x80); - utf8Buf[4] = 0; - } else { - utf8Buf[0] = '?'; - utf8Buf[1] = 0; - } -} - -/* Do not worry about invalid JSON, it was already parsed by jsmn. 
*/ -static void unescape_string(char *string, int len) -{ - unsigned char *val = (unsigned char *)string; - unsigned char *end; - int i = 0; - - end = val + len; - while (val < end) { - if (*val != '\\') { - string[i++] = *val++; - continue; - } - switch (*++val) { - case 'n': - string[i++] = '\n'; - break; - case 't': - string[i++] = '\t'; - break; - case 'r': - string[i++] = '\r'; - break; - case 'b': - string[i++] = '\b'; - break; - case 'f': - string[i++] = '\f'; - break; - case '\\': - string[i++] = '\\'; - break; - case '\"': - string[i++] = '\"'; - break; - case '/': - string[i++] = '/'; - break; - case 'u': { - const char *unescaped = "?"; - char utf8Buf[5]; - unsigned int codepoint = 0; - hexToDigit(&codepoint, val++); - val += 3; - /* check if this is a surrogate */ - if ((codepoint & 0xFC00) == 0xD800) { - val++; - if (val[0] == '\\' && val[1] == 'u') { - unsigned int surrogate = 0; - hexToDigit(&surrogate, val + 2); - codepoint = (((codepoint & 0x3F) << 10) | - ((((codepoint >> 6) & 0xF) + 1) << 16) | - (surrogate & 0x3FF)); - val += 5; - } else { - unescaped = "?"; - break; - } - } - - Utf32toUtf8(codepoint, utf8Buf); - unescaped = utf8Buf; - - if (codepoint == 0) { - memcpy(&string[i++], unescaped, 1); - continue; - } - memcpy(&string[i], unescaped, (unsigned int)strlen(unescaped)); - break; - } - } - } - string[i] = '\0'; -} - -static void config_parse_env(char *data, jsmntok_t *token) -{ - jsmntok_t *tenv; - char *env, *env_val; - int len; - int i; - - for (i = 0; i < token->size; i++) { - tenv = &token[i + 1]; - - env = data + tenv->start; - len = tenv->end - tenv->start; - - unescape_string(env, len); - - env_val = strstr(env, "="); - if (!env_val) { - continue; - } - - env[len] = '\0'; - *env_val = '\0'; - env_val++; - - if ((strcmp(env, "HOME") == 0) || (strcmp(env, "TERM") == 0)) { - setenv(env, env_val, 1); - } else { - setenv(env, env_val, 0); - } - } -} - -static char **config_parse_args(char *data, jsmntok_t *token) -{ - jsmntok_t *targ; - 
char *arg, *value; - char **argv; - int len; - int i; - const int n_args = token->size; - - argv = malloc((n_args + 1) * sizeof(char *)); - if (!argv) { - perror("malloc(config_parse_args)"); - return NULL; - } - - for (i = 0; i < n_args; i++) { - targ = &token[i + 1]; - - value = data + targ->start; - len = targ->end - targ->start; - - arg = malloc(len + 1); - if (!arg) { - perror("malloc(config_parse_args arg)"); - while (--i >= 0) - free(argv[i]); - free(argv); - return NULL; - } - memcpy(arg, value, len); - arg[len] = '\0'; - - unescape_string(arg, len); - - argv[i] = arg; - } - - if (i == 0) { - free(argv); - argv = NULL; - } else { - argv[i] = NULL; - } - - return argv; -} - -static char *config_parse_string(char *data, jsmntok_t *token) -{ - char *string; - char *val; - int len; - - val = data + token->start; - len = token->end - token->start; - if (!len) { - return NULL; - } - - string = malloc(len + 1); - - if (!string) { - return NULL; - } - memcpy(string, val, len); - string[len] = '\0'; - - unescape_string(string, len); - - return string; -} - -static int jsoneq(const char *json, jsmntok_t *tok, const char *s) -{ - if (tok->type == JSMN_STRING && (int)strlen(s) == tok->end - tok->start && - strncasecmp(json + tok->start, s, tok->end - tok->start) == 0) { - return 0; - } - return -1; -} - -char **concat_entrypoint_argv(char **entrypoint, char **config_argv) -{ - char **argv; - int i, j; - int n_args = 0; - - for (i = 0; entrypoint[i]; i++) - n_args++; - for (j = 0; config_argv[j]; j++) - n_args++; - - argv = malloc((n_args + 1) * sizeof(char *)); - if (!argv) { - perror("malloc(concat_entrypoint_argv)"); - return NULL; - } - - for (i = 0; entrypoint[i]; i++) { - argv[i] = entrypoint[i]; - } - - for (j = 0; config_argv[j]; i++, j++) { - argv[i] = config_argv[j]; - } - - argv[i] = NULL; - - return argv; -} - -static unsigned int config_parse_skip(jsmntok_t *token) -{ - unsigned int n = 1; - - for (int i = 0; i < token->size; i++) { - n += 
config_parse_skip(&token[n]); - } - - return n; -} - -static bool is_mount_point(const char *path) -{ - /* - * Beware that Podman arranges tmpfs auto-mounts. This means stat/lstat - * cannot be used to check the mount status as it would cause mounting the - * host tmpfs. Let's look at /proc/mounts instead. - */ - FILE *mounts; - char line[1024]; - char mount_point[512]; - bool found = false; - - mounts = fopen("/proc/mounts", "r"); - if (!mounts) { - perror("fopen(/proc/mounts)"); - return false; - } - - while (fgets(line, sizeof(line), mounts)) { - /* - * This doesn't handle escape sequences for spaces and tabs in paths. - * Not an issue currently as we don't mount any such paths, but could be - * improved in future. - */ - if (sscanf(line, "%*s %511s %*s %*s %*d %*d", mount_point) == 1) { - if (strcmp(mount_point, path) == 0) { - found = true; - break; - } - } - } - - fclose(mounts); - return found; -} - -static char *config_parse_mounts(char *data, jsmntok_t *token) -{ - jsmntok_t *tmount, *tdestination, *ttype, *tsource; - unsigned int i, j; - unsigned int t = 0; - - if (token[t++].type != JSMN_ARRAY) { - printf("Mounts not an array\n"); - return NULL; - } - - for (i = 0; i < token->size; i++) { - tmount = &token[t++]; - if (tmount->type != JSMN_OBJECT) { - printf("Unexpected mounts contents\n"); - return NULL; - } - - tdestination = ttype = tsource = NULL; - for (j = 0; j < tmount->size; j++) { - if (jsoneq(data, &token[t], "destination") == 0) { - tdestination = &token[t + 1]; - t += 2; - } else if (jsoneq(data, &token[t], "type") == 0) { - ttype = &token[t + 1]; - t += 2; - } else if (jsoneq(data, &token[t], "source") == 0) { - tsource = &token[t + 1]; - t += 2; - } else { - t += config_parse_skip(&token[t]); - } - } - - if (tdestination && ttype && tsource && - jsoneq(data, ttype, "tmpfs") == 0 && - jsoneq(data, tsource, "tmpfs") == 0) { - char *path = config_parse_string(data, tdestination); - if (path) { - if (!is_mount_point(path)) { - return path; - } - 
free(path); - } - } - } - - return NULL; -} - -static int config_parse_file(char ***argv, char **workdir, char **tmpfs, - const char *config_file) -{ - jsmn_parser parser; - jsmntok_t *tokens; - struct stat stat; - char *data; - off_t data_len; - char **config_argv; - char **entrypoint; - int parsed_env, parsed_workdir, parsed_args, parsed_entrypoint, - parsed_tmpfs; - int num_tokens; - int ret = -1; - int fd; - int i; - - fd = open(config_file, O_RDONLY); - if (fd < 0) { - return ret; - } - - if (fstat(fd, &stat) != 0) { - perror("Couldn't stat config file"); - goto cleanup_fd; - } - - data_len = stat.st_size; - data = malloc(data_len); - if (!data) { - perror("Couldn't allocate memory"); - goto cleanup_fd; - } - - if (read(fd, data, data_len) < 0) { - perror("Error reading config file"); - goto cleanup_data; - } - - tokens = malloc(MAX_TOKENS * sizeof(jsmntok_t)); - if (!tokens) { - perror("Couldn't allocate memory"); - goto cleanup_data; - } - - jsmn_init(&parser); - num_tokens = jsmn_parse(&parser, data, data_len, tokens, MAX_TOKENS); - if (num_tokens < 0) { - printf("Error parsing config file\n"); - goto cleanup_tokens; - } - - if (num_tokens < 1 || tokens[0].type != JSMN_OBJECT) { - printf("Couldn't find object in config file\n"); - goto cleanup_tokens; - } - - config_argv = NULL; - entrypoint = NULL; - parsed_env = parsed_workdir = parsed_args = parsed_entrypoint = - parsed_tmpfs = 0; - - for (i = 1; - i < num_tokens && (!parsed_env || !parsed_args || !parsed_workdir || - !parsed_entrypoint || !parsed_tmpfs); - i++) { - if (!parsed_env && jsoneq(data, &tokens[i], "Env") == 0 && - (i + 1) < num_tokens && tokens[i + 1].type == JSMN_ARRAY) { - config_parse_env(data, &tokens[i + 1]); - parsed_env = 1; - } - - if (!parsed_args && jsoneq(data, &tokens[i], "args") == 0 && - (i + 1) < num_tokens) { - config_argv = config_parse_args(data, &tokens[i + 1]); - parsed_args = 1; - } - - if (!parsed_args && jsoneq(data, &tokens[i], "Cmd") == 0 && - (i + 1) < num_tokens) { 
- config_argv = config_parse_args(data, &tokens[i + 1]); - parsed_args = 1; - } - - if (!parsed_workdir && jsoneq(data, &tokens[i], "WorkingDir") == 0 && - (i + 1) < num_tokens) { - *workdir = config_parse_string(data, &tokens[i + 1]); - parsed_workdir = 1; - } - - if (!parsed_workdir && jsoneq(data, &tokens[i], "Cwd") == 0 && - (i + 1) < num_tokens) { - *workdir = config_parse_string(data, &tokens[i + 1]); - parsed_workdir = 1; - } - - if (!parsed_entrypoint && jsoneq(data, &tokens[i], "Entrypoint") == 0 && - (i + 1) < num_tokens) { - entrypoint = config_parse_args(data, &tokens[i + 1]); - parsed_entrypoint = 1; - } - - if (!parsed_tmpfs && jsoneq(data, &tokens[i], "mounts") == 0 && - (i + 1) < num_tokens && - (*tmpfs = config_parse_mounts(data, &tokens[i + 1]))) { - parsed_tmpfs = 1; - } - } - - if (config_argv && entrypoint) { - *argv = concat_entrypoint_argv(entrypoint, config_argv); - } else { - *argv = config_argv; - } - - ret = 0; - -cleanup_tokens: - free(tokens); -cleanup_data: - free(data); -cleanup_fd: - close(fd); - - return ret; -} - -#ifdef __TIMESYNC__ - -#define TSYNC_PORT 123 -#define BUFSIZE 8 -#define NANOS_IN_SECOND 1000000000 -/* Set clock if delta is bigger than 100ms */ -#define DELTA_SYNC 100000000 - -void clock_worker() -{ - int sockfd, n; - struct sockaddr_vm serveraddr; - char buf[BUFSIZE]; - struct timespec gtime; - struct timespec htime; - uint64_t gtime_ns; - uint64_t htime_ns; - - sockfd = socket(AF_VSOCK, SOCK_DGRAM, 0); - if (sockfd < 0) { - perror("Couldn't create timesync socket"); - return; - } - - bzero((char *)&serveraddr, sizeof(serveraddr)); - serveraddr.svm_family = AF_VSOCK; - serveraddr.svm_port = TSYNC_PORT; - serveraddr.svm_cid = 3; - - bzero(buf, BUFSIZE); - - n = bind(sockfd, (struct sockaddr *)&serveraddr, sizeof(serveraddr)); - if (n < 0) { - printf("Couldn't bind timesync socket\n"); - return; - } - - while (1) { - n = recv(sockfd, buf, BUFSIZE, 0); - if (n < 0) { - perror("Error in timesync recv"); - return; - } 
else if (n != 8) { - printf("Ignoring bogus timesync packet\n"); - continue; - } - - htime_ns = *(uint64_t *)&buf[0]; - clock_gettime(CLOCK_REALTIME, >ime); - gtime_ns = gtime.tv_sec * NANOS_IN_SECOND; - gtime_ns += gtime.tv_nsec; - - if (llabs(htime_ns - gtime_ns) > DELTA_SYNC) { - htime.tv_sec = htime_ns / NANOS_IN_SECOND; - htime.tv_nsec = htime_ns % NANOS_IN_SECOND; - clock_settime(CLOCK_REALTIME, &htime); - } - } -} -#endif - -int reopen_fd(int fd, char *path, int flags) -{ - int newfd = open(path, flags); - if (newfd < 0) { - printf("Failed to open '%s': %s\n", path, strerror(errno)); - return -1; - } - - close(fd); - if (dup2(newfd, fd) < 0) { - perror("dup2"); - close(newfd); - return -1; - } - close(newfd); - return 0; -} - -int setup_redirects() -{ - DIR *ports_dir = opendir("/sys/class/virtio-ports"); - if (ports_dir == NULL) { - printf("Unable to open ports directory!\n"); - return -4; - } - - char path[2048]; - char name_buf[1024]; - - struct dirent *entry = NULL; - while ((entry = readdir(ports_dir))) { - char *port_identifier = entry->d_name; - int result_len = - snprintf(path, sizeof(path), "/sys/class/virtio-ports/%s/name", - port_identifier); - - // result was truncated - if (result_len > sizeof(name_buf) - 1) { - printf("Path buffer too small"); - return -1; - } - - FILE *port_name_file = fopen(path, "r"); - if (port_name_file == NULL) { - continue; - } - - char *port_name = fgets(name_buf, sizeof(name_buf), port_name_file); - fclose(port_name_file); - - if (port_name != NULL && strcmp(port_name, "krun-stdin\n") == 0) { - // if previous snprintf didn't fail, this one cannot fail either - snprintf(path, sizeof(path), "/dev/%s", port_identifier); - reopen_fd(STDIN_FILENO, path, O_RDONLY); - } else if (port_name != NULL && - strcmp(port_name, "krun-stdout\n") == 0) { - snprintf(path, sizeof(path), "/dev/%s", port_identifier); - reopen_fd(STDOUT_FILENO, path, O_WRONLY); - } else if (port_name != NULL && - strcmp(port_name, "krun-stderr\n") == 0) { - 
snprintf(path, sizeof(path), "/dev/%s", port_identifier); - reopen_fd(STDERR_FILENO, path, O_WRONLY); - } - } - - closedir(ports_dir); - return 0; -} - -int is_virtiofs(const char *path) -{ - struct statfs fs; - - if (statfs(path, &fs) != 0) { - perror("statfs"); - return -1; - } - - // virtiofs magic number: 0x65735546 - return (fs.f_type == 0x65735546) ? 1 : 0; -} - -void set_exit_code(int code) -{ - int fd; - int ret; - int virtiofs_check; - - // Only use the ioctl if virtiofs is used for root filesystem - virtiofs_check = is_virtiofs("/"); - if (virtiofs_check < 0) { - printf("Warning: Could not determine filesystem type for root\n"); - } - - if (virtiofs_check == 0) { - // Root filesystem is not virtiofs, skip the ioctl - return; - } - - fd = open("/", O_RDONLY); - if (fd < 0) { - perror("Couldn't open root filesystem to report exit code"); - return; - } - - ret = ioctl(fd, KRUN_EXIT_CODE_IOCTL, code); - if (ret < 0) { - perror("Error using the ioctl to set the exit code"); - } - - close(fd); -} - -#if __linux__ -int try_mount(const char *source, const char *target, const char *fstype, - unsigned long mountflags, const void *data) -{ - FILE *f; - char line[129]; - int mount_status = -1; - - if (fstype) { - return mount(source, target, fstype, mountflags, data); - } - - f = fopen("/proc/filesystems", "r"); - if (f == NULL) { - perror("fopen(/proc/filesystems)"); - return -1; - } - while (fgets(line, sizeof(line), f)) { - char fstype[sizeof(line)]; - if (!strncmp(line, "nodev", 5)) { - continue; - } - if (sscanf(line, "%128s\n", fstype) != 1) { - continue; - } - - mount_status = mount(source, target, fstype, mountflags, data); - if (mount_status == 0) { - break; - } - } - fclose(f); - - return mount_status; -} -#endif - -char *clone_str(const char *str) -{ - if (str == NULL) { - return NULL; - } - return strdup(str); -} - -int main(int argc, char **argv) -{ - struct ifreq ifr; - int sockfd; - int status; - int saved_errno; - bool init_pid1 = false; - char 
localhost[] = "localhost\0"; - char *hostname; - char *krun_home; - char *krun_term; - char *krun_init; -#if __linux__ - char *krun_dhcp; - int fd; - char *krun_root; - char *krun_root_fstype; - char *krun_root_options; -#endif - char *env_init_pid1; - char *config_workdir, *env_workdir; - char *config_tmpfs; - char *rlimits; - char **config_argv, **exec_argv; - const char *config_file; -#if __FreeBSD__ - bool config_file_mounted = false; - - open_console(); -#endif - -#ifdef TDX - if (mkdir("/tmp", 0755) < 0 && errno != EEXIST) { - perror("mkdir(/tmp)"); - exit(-1); - } - if (mkdir("/tmp/vda", 0755) < 0 && errno != EEXIST) { - perror("mkdir(/tmp/vda)"); - exit(-1); - } - if (mount("/dev/vda", "/tmp/vda", "ext4", MS_RELATIME, NULL) < 0) { - perror("mount(/dev/vda)"); - exit(-1); - } - chdir("/tmp/vda"); - if (mount(".", "/", NULL, MS_MOVE, NULL) < 0) { - perror("remount root"); - exit(-1); - } - chroot("."); - -#endif - -#ifdef SEV - if (chroot_luks() < 0) { - printf("Couldn't switch to LUKS volume, bailing out\n"); - exit(-1); - } -#endif - if (mount_filesystems() < 0) { - printf("Couldn't mount filesystems, bailing out\n"); - exit(-2); - } - -#if __linux__ - krun_root = clone_str(getenv("KRUN_BLOCK_ROOT_DEVICE")); - if (krun_root) { - if (mkdir("/newroot", 0755) < 0 && errno != EEXIST) { - perror("mkdir(/newroot)"); - exit(-1); - } - - krun_root_fstype = clone_str(getenv("KRUN_BLOCK_ROOT_FSTYPE")); - krun_root_options = clone_str(getenv("KRUN_BLOCK_ROOT_OPTIONS")); - - if (try_mount(krun_root, "/newroot", krun_root_fstype, 0, - krun_root_options) < 0) { - perror("mount KRUN_BLOCK_ROOT_DEVICE"); - exit(-1); - } - free(krun_root); - free(krun_root_fstype); - free(krun_root_options); - - chdir("/newroot"); - - fd = open("/", O_RDONLY); - if (fd < 0) { - perror("Couldn't open temporary root directory for removing"); - exit(-1); - } - if (ioctl(fd, KRUN_REMOVE_ROOT_DIR_IOCTL) < 0) { - perror("Error removing temporary root directory"); - } - close(fd); - - if 
(mount(".", "/", NULL, MS_MOVE, NULL) < 0) { - perror("remount root"); - exit(-1); - } - chroot("."); - - // we must mount filesystems again after chrooting - if (mount_filesystems() < 0) { - printf("Couldn't mount filesystems, bailing out\n"); - exit(-2); - } - } - - if (mount(NULL, "/", NULL, MS_REC | MS_SHARED, NULL) < 0) { - perror("Couldn't set shared propagation on the root mount"); - exit(-1); - } -#endif - - setsid(); - ioctl(0, TIOCSCTTY, 1); - -#if __FreeBSD__ - setlogin("root"); -#endif - - sockfd = socket(AF_INET, SOCK_DGRAM, 0); - if (sockfd >= 0) { - memset(&ifr, 0, sizeof ifr); - strncpy(ifr.ifr_name, "lo", IFNAMSIZ); - ifr.ifr_flags |= IFF_UP; - ioctl(sockfd, SIOCSIFFLAGS, &ifr); - -#if __linux__ - krun_dhcp = getenv("KRUN_DHCP"); - if (krun_dhcp && strcmp(krun_dhcp, "1") == 0) { - memset(&ifr, 0, sizeof ifr); - strncpy(ifr.ifr_name, "eth0", IFNAMSIZ); - if (ioctl(sockfd, SIOCGIFFLAGS, &ifr) == 0) { - /* eth0 exists, bring it up first */ - ifr.ifr_flags |= IFF_UP; - ioctl(sockfd, SIOCSIFFLAGS, &ifr); - - /* Configure eth0 with DHCP */ - if (do_dhcp("eth0") != 0) { - printf("Warning: DHCP configuration for eth0 failed\n"); - } - } - } -#endif - - close(sockfd); - } - - config_argv = NULL; - config_workdir = NULL; - config_tmpfs = NULL; - - config_file = getenv("KRUN_CONFIG"); - -#if __FreeBSD__ - if (!config_file) { - config_file_mounted = config_file_from_iso(&config_file); - } -#endif - - if (!config_file) { - config_file = CONFIG_FILE_PATH; - } - - config_parse_file(&config_argv, &config_workdir, &config_tmpfs, - config_file); - -#if __FreeBSD__ - if (config_file_mounted) { - unmount_config_iso(); - } -#endif - -#if __linux__ - if (config_tmpfs) { - /* TODO: Honour mount flags from the config file. Most notably, - * tmpcopyup is set by Podman by default, requesting copying the files - * present in the original directory, e.g. from the image. 
*/ - if (mount("tmpfs", config_tmpfs, "tmpfs", - MS_NOEXEC | MS_NOSUID | MS_NODEV | MS_RELATIME, NULL) < 0) { - perror("mount for tmpfs"); - exit(-1); - } - } -#endif - - krun_home = getenv("KRUN_HOME"); - if (krun_home) { - setenv("HOME", krun_home, 1); - } - - krun_term = getenv("KRUN_TERM"); - if (krun_term) { - setenv("TERM", krun_term, 1); - } - - hostname = getenv("HOSTNAME"); - if (hostname) { - sethostname(hostname, strlen(hostname)); - } else { - sethostname(&localhost[0], strlen(localhost)); - } - - rlimits = getenv("KRUN_RLIMITS"); - if (rlimits) { - set_rlimits(rlimits); - } - - env_workdir = getenv("KRUN_WORKDIR"); - if (env_workdir) { - chdir(env_workdir); - } else if (config_workdir) { - chdir(config_workdir); - } - - exec_argv = argv; - krun_init = getenv("KRUN_INIT"); - if (krun_init) { - exec_argv[0] = clone_str(krun_init); - } else if (config_argv) { - exec_argv = config_argv; - } else { - exec_argv[0] = &DEFAULT_KRUN_INIT[0]; - } - - env_init_pid1 = getenv("KRUN_INIT_PID1"); - if (env_init_pid1 && *env_init_pid1 == '1') { - init_pid1 = true; - } - -#ifdef __TIMESYNC__ - if (fork() == 0) { - clock_worker(); - _exit(1); - } -#endif - - if (init_pid1) { - goto exec_init; - } - - // We need to fork ourselves, because pid 1 cannot doesn't receive SIGINT - // signal - int child = fork(); - if (child < 0) { - perror("fork"); - set_exit_code(125); - exit(125); - } - if (child == 0) { // child - exec_init: -#if __FreeBSD__ - open_console(); -#else - if (setup_redirects() < 0) { - exit(125); - } -#endif - if (execvp(exec_argv[0], exec_argv) < 0) { - saved_errno = errno; - printf("Couldn't execute '%s' inside the vm: %s\n", exec_argv[0], - strerror(errno)); - // Use the same exit code as chroot and podman do. - if (saved_errno == ENOENT) { - exit(127); - } else { - exit(126); - } - } - } else { // parent - // Wait until the workload's entrypoint has exited, ignoring any other - // children. 
- while (waitpid(-1, &status, 0) != child) { - // Not the first child, ignore it. - }; - - // The workload's entrypoint has exited, record its exit code and exit - // ourselves. - if (WIFEXITED(status)) { - set_exit_code(WEXITSTATUS(status)); - } else if (WIFSIGNALED(status)) { - set_exit_code(WTERMSIG(status) + 128); - } - - sync(); -#if __linux__ - reboot(LINUX_REBOOT_CMD_RESTART); -#endif - } - - return 0; -} diff --git a/init/jsmn.h b/init/jsmn.h deleted file mode 100644 index 30d37a24a..000000000 --- a/init/jsmn.h +++ /dev/null @@ -1,494 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2010 Serge Zaitsev - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef JSMN_H -#define JSMN_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define JSMN_API static - -/** - * JSON type identifier. 
Basic types are: - * o Object - * o Array - * o String - * o Other primitive: number, boolean (true/false) or null - */ -typedef enum { - JSMN_UNDEFINED = 0, - JSMN_OBJECT = 1 << 0, - JSMN_ARRAY = 1 << 1, - JSMN_STRING = 1 << 2, - JSMN_PRIMITIVE = 1 << 3 -} jsmntype_t; - -enum jsmnerr { - /* Not enough tokens were provided */ - JSMN_ERROR_NOMEM = -1, - /* Invalid character inside JSON string */ - JSMN_ERROR_INVAL = -2, - /* The string is not a full JSON packet, more bytes expected */ - JSMN_ERROR_PART = -3 -}; - -/** - * JSON token description. - * type type (object, array, string etc.) - * start start position in JSON data string - * end end position in JSON data string - */ -typedef struct jsmntok { - jsmntype_t type; - int start; - int end; - int size; -#ifdef JSMN_PARENT_LINKS - int parent; -#endif -} jsmntok_t; - -/** - * JSON parser. Contains an array of token blocks available. Also stores - * the string being parsed now and current position in that string. - */ -typedef struct jsmn_parser { - unsigned int pos; /* offset in the JSON string */ - unsigned int toknext; /* next token to allocate */ - int toksuper; /* superior token node, e.g. parent object or array */ -} jsmn_parser; - -/** - * Create JSON parser over an array of tokens - */ -JSMN_API void jsmn_init(jsmn_parser *parser); - -/** - * Run JSON parser. It parses a JSON data string into and array of tokens, each - * describing - * a single JSON object. - */ -JSMN_API int jsmn_parse(jsmn_parser *parser, const char *js, const size_t len, - jsmntok_t *tokens, const unsigned int num_tokens); - -#ifndef JSMN_HEADER -/** - * Allocates a fresh unused token from the token pool. 
- */ -static jsmntok_t *jsmn_alloc_token(jsmn_parser *parser, jsmntok_t *tokens, - const size_t num_tokens) -{ - jsmntok_t *tok; - if (parser->toknext >= num_tokens) { - return NULL; - } - tok = &tokens[parser->toknext++]; - tok->start = tok->end = -1; - tok->size = 0; -#ifdef JSMN_PARENT_LINKS - tok->parent = -1; -#endif - return tok; -} - -/** - * Fills token type and boundaries. - */ -static void jsmn_fill_token(jsmntok_t *token, const jsmntype_t type, - const int start, const int end) -{ - token->type = type; - token->start = start; - token->end = end; - token->size = 0; -} - -/** - * Fills next available token with JSON primitive. - */ -static int jsmn_parse_primitive(jsmn_parser *parser, const char *js, - const size_t len, jsmntok_t *tokens, - const size_t num_tokens) -{ - jsmntok_t *token; - int start; - - start = parser->pos; - - for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) { - switch (js[parser->pos]) { -#ifndef JSMN_STRICT - /* In strict mode primitive must be followed by "," or "}" or "]" */ - case ':': -#endif - case '\t': - case '\r': - case '\n': - case ' ': - case ',': - case ']': - case '}': - goto found; - default: - /* to quiet a warning from gcc*/ - break; - } - /* libkrun: Let's be permissive with non-ASCII bytes - if (js[parser->pos] < 32 || js[parser->pos] >= 127) { - parser->pos = start; - return JSMN_ERROR_INVAL; - } - */ - } -#ifdef JSMN_STRICT - /* In strict mode primitive must be followed by a comma/object/array */ - parser->pos = start; - return JSMN_ERROR_PART; -#endif - -found: - if (tokens == NULL) { - parser->pos--; - return 0; - } - token = jsmn_alloc_token(parser, tokens, num_tokens); - if (token == NULL) { - parser->pos = start; - return JSMN_ERROR_NOMEM; - } - jsmn_fill_token(token, JSMN_PRIMITIVE, start, parser->pos); -#ifdef JSMN_PARENT_LINKS - token->parent = parser->toksuper; -#endif - parser->pos--; - return 0; -} - -/** - * Fills next token with JSON string. 
- */ -static int jsmn_parse_string(jsmn_parser *parser, char *js, const size_t len, - jsmntok_t *tokens, const size_t num_tokens) -{ - jsmntok_t *token; - - int start = parser->pos; - - /* Skip starting quote */ - parser->pos++; - - for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) { - char c = js[parser->pos]; - - /* Quote: end of string */ - if (c == '\"') { - if (tokens == NULL) { - return 0; - } - token = jsmn_alloc_token(parser, tokens, num_tokens); - if (token == NULL) { - parser->pos = start; - return JSMN_ERROR_NOMEM; - } - jsmn_fill_token(token, JSMN_STRING, start + 1, parser->pos); -#ifdef JSMN_PARENT_LINKS - token->parent = parser->toksuper; -#endif - return 0; - } - - /* Backslash: Quoted symbol expected */ - if (c == '\\' && parser->pos + 1 < len) { - int i; - parser->pos++; - switch (js[parser->pos]) { - /* Allowed escaped symbols */ - case '\"': - case '/': - case '\\': - case 'b': - case 'f': - case 'r': - case 'n': - case 't': - break; - /* Allows escaped symbol \uXXXX */ - case 'u': { - char unicode[5]; - long ascii; - - parser->pos++; - for (i = 0; - i < 4 && parser->pos < len && js[parser->pos] != '\0'; - i++) { - /* If it isn't a hex character we have an error */ - if (!((js[parser->pos] >= 48 && - js[parser->pos] <= 57) || /* 0-9 */ - (js[parser->pos] >= 65 && - js[parser->pos] <= 70) || /* A-F */ - (js[parser->pos] >= 97 && - js[parser->pos] <= 102))) { /* a-f */ - parser->pos = start; - return JSMN_ERROR_INVAL; - } - unicode[i] = js[parser->pos]; - parser->pos++; - } - - unicode[4] = '\0'; - ascii = strtol(&unicode[0], NULL, 16); - if (ascii < 0 || ascii > 127) { - /* This unicode char doesn't translate directly to ASCII */ - parser->pos = start; - return JSMN_ERROR_INVAL; - } - - parser->pos--; - js[parser->pos] = (char)ascii; - break; - } - /* Unexpected symbol */ - default: - parser->pos = start; - return JSMN_ERROR_INVAL; - } - } - } - parser->pos = start; - return JSMN_ERROR_PART; -} - -/** - * Parse JSON string and 
fill tokens. - */ -JSMN_API int jsmn_parse(jsmn_parser *parser, const char *js, const size_t len, - jsmntok_t *tokens, const unsigned int num_tokens) -{ - int r; - int i; - jsmntok_t *token; - int count = parser->toknext; - - for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) { - char c; - jsmntype_t type; - - c = js[parser->pos]; - switch (c) { - case '{': - case '[': - count++; - if (tokens == NULL) { - break; - } - token = jsmn_alloc_token(parser, tokens, num_tokens); - if (token == NULL) { - return JSMN_ERROR_NOMEM; - } - if (parser->toksuper != -1) { - jsmntok_t *t = &tokens[parser->toksuper]; -#ifdef JSMN_STRICT - /* In strict mode an object or array can't become a key */ - if (t->type == JSMN_OBJECT) { - return JSMN_ERROR_INVAL; - } -#endif - t->size++; -#ifdef JSMN_PARENT_LINKS - token->parent = parser->toksuper; -#endif - } - token->type = (c == '{' ? JSMN_OBJECT : JSMN_ARRAY); - token->start = parser->pos; - parser->toksuper = parser->toknext - 1; - break; - case '}': - case ']': - if (tokens == NULL) { - break; - } - type = (c == '}' ? 
JSMN_OBJECT : JSMN_ARRAY); -#ifdef JSMN_PARENT_LINKS - if (parser->toknext < 1) { - return JSMN_ERROR_INVAL; - } - token = &tokens[parser->toknext - 1]; - for (;;) { - if (token->start != -1 && token->end == -1) { - if (token->type != type) { - return JSMN_ERROR_INVAL; - } - token->end = parser->pos + 1; - parser->toksuper = token->parent; - break; - } - if (token->parent == -1) { - if (token->type != type || parser->toksuper == -1) { - return JSMN_ERROR_INVAL; - } - break; - } - token = &tokens[token->parent]; - } -#else - for (i = parser->toknext - 1; i >= 0; i--) { - token = &tokens[i]; - if (token->start != -1 && token->end == -1) { - if (token->type != type) { - return JSMN_ERROR_INVAL; - } - parser->toksuper = -1; - token->end = parser->pos + 1; - break; - } - } - /* Error if unmatched closing bracket */ - if (i == -1) { - return JSMN_ERROR_INVAL; - } - for (; i >= 0; i--) { - token = &tokens[i]; - if (token->start != -1 && token->end == -1) { - parser->toksuper = i; - break; - } - } -#endif - break; - case '\"': - r = jsmn_parse_string(parser, (char *)js, len, tokens, num_tokens); - if (r < 0) { - return r; - } - count++; - if (parser->toksuper != -1 && tokens != NULL) { - tokens[parser->toksuper].size++; - } - break; - case '\t': - case '\r': - case '\n': - case ' ': - break; - case ':': - parser->toksuper = parser->toknext - 1; - break; - case ',': - if (tokens != NULL && parser->toksuper != -1 && - tokens[parser->toksuper].type != JSMN_ARRAY && - tokens[parser->toksuper].type != JSMN_OBJECT) { -#ifdef JSMN_PARENT_LINKS - parser->toksuper = tokens[parser->toksuper].parent; -#else - for (i = parser->toknext - 1; i >= 0; i--) { - if (tokens[i].type == JSMN_ARRAY || - tokens[i].type == JSMN_OBJECT) { - if (tokens[i].start != -1 && tokens[i].end == -1) { - parser->toksuper = i; - break; - } - } - } -#endif - } - break; -#ifdef JSMN_STRICT - /* In strict mode primitives are: numbers and booleans */ - case '-': - case '0': - case '1': - case '2': - case '3': - 
case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - case 't': - case 'f': - case 'n': - /* And they must not be keys of the object */ - if (tokens != NULL && parser->toksuper != -1) { - const jsmntok_t *t = &tokens[parser->toksuper]; - if (t->type == JSMN_OBJECT || - (t->type == JSMN_STRING && t->size != 0)) { - return JSMN_ERROR_INVAL; - } - } -#else - /* In non-strict mode every unquoted value is a primitive */ - default: -#endif - r = jsmn_parse_primitive(parser, js, len, tokens, num_tokens); - if (r < 0) { - return r; - } - count++; - if (parser->toksuper != -1 && tokens != NULL) { - tokens[parser->toksuper].size++; - } - break; - -#ifdef JSMN_STRICT - /* Unexpected char in strict mode */ - default: - return JSMN_ERROR_INVAL; -#endif - } - } - - if (tokens != NULL) { - for (i = parser->toknext - 1; i >= 0; i--) { - /* Unmatched opened object or array */ - if (tokens[i].start != -1 && tokens[i].end == -1) { - return JSMN_ERROR_PART; - } - } - } - - return count; -} - -/** - * Creates a new parser based over a given buffer with an array of tokens - * available. - */ -JSMN_API void jsmn_init(jsmn_parser *parser) -{ - parser->pos = 0; - parser->toknext = 0; - parser->toksuper = -1; -} - -#endif /* JSMN_HEADER */ - -#ifdef __cplusplus -} -#endif - -#endif /* JSMN_H */ diff --git a/init/src/config.rs b/init/src/config.rs new file mode 100644 index 000000000..93ecad89b --- /dev/null +++ b/init/src/config.rs @@ -0,0 +1,91 @@ +use anyhow::{Context, Result}; +use serde::Deserialize; +use std::env; +use std::fs; + +const CONFIG_FILE_PATH: &str = "/.krun_config.json"; + +// The krun OCI runtime passes a full OCI runtime-spec config.json as the +// config file. The fields we care about live inside "process". 
+#[derive(Deserialize, Default)] +struct ProcessConfig { + args: Option>, + env: Option>, + cwd: Option, +} + +#[cfg(target_os = "linux")] +#[derive(Deserialize, Default)] +struct Mount { + #[serde(rename = "destination")] + destination: Option, + #[serde(rename = "type")] + mount_type: Option, + #[serde(rename = "source")] + source: Option, +} + +#[derive(Deserialize, Default)] +struct RawConfig { + process: Option, + #[cfg(target_os = "linux")] + mounts: Option>, +} + +#[derive(Default)] +pub struct Config { + pub argv: Option>, + pub workdir: Option, + #[cfg(target_os = "linux")] + pub tmpfs: Option, +} + +pub fn load(#[cfg(target_os = "linux")] is_mount_point: impl Fn(&str) -> bool) -> Config { + let path = env::var("KRUN_CONFIG").unwrap_or_else(|_| CONFIG_FILE_PATH.to_string()); + + let Ok(raw) = parse_file(&path) else { + return Config::default(); + }; + + let process = raw.process.unwrap_or_default(); + + // Apply environment variables from the process config. + for entry in process.env.unwrap_or_default() { + let Some((key, val)) = entry.split_once('=') else { + continue; + }; + let overwrite = matches!(key, "HOME" | "TERM"); + if env::var(key).is_err() || overwrite { + // SAFETY: single-threaded at this point. + unsafe { env::set_var(key, val) }; + } + } + + let argv = process.args.filter(|v| !v.is_empty()); + let workdir = process.cwd; + + // Find the first tmpfs mount whose destination is not already mounted. 
+ #[cfg(target_os = "linux")] + let tmpfs = raw.mounts.unwrap_or_default().into_iter().find_map(|m| { + let dest = m.destination?; + let ty = m.mount_type.as_deref().unwrap_or(""); + let src = m.source.as_deref().unwrap_or(""); + if ty == "tmpfs" && src == "tmpfs" && !is_mount_point(&dest) { + Some(dest) + } else { + None + } + }); + + Config { + argv, + workdir, + #[cfg(target_os = "linux")] + tmpfs, + } +} + +fn parse_file(path: &str) -> Result { + let data = fs::read(path).with_context(|| format!("read {path}"))?; + serde_json::from_slice(&data).with_context(|| format!("parse {path}")) +} diff --git a/init/src/dhcp.rs b/init/src/dhcp.rs new file mode 100644 index 000000000..701788e2c --- /dev/null +++ b/init/src/dhcp.rs @@ -0,0 +1,621 @@ +use anyhow::{bail, Context}; +use std::io::Error as IoError; +use std::mem; +use std::slice; + +const DHCP_BUFFER_SIZE: usize = 576; +const DHCP_OPTIONS_SIZE: usize = 60; +const DHCP_OPTIONS_OFFSET: usize = 240; +const DHCP_OPTIONS_END: u8 = 0xff; +const DHCP_MSG_OFFER: u8 = 2; +const DHCP_MSG_ACK: u8 = 5; + +#[repr(C, packed)] +struct DhcpPacket { + op: u8, + htype: u8, + hlen: u8, + hops: u8, + xid: u32, + secs: u16, + flags: u16, + ciaddr: u32, + yiaddr: u32, + siaddr: u32, + giaddr: u32, + chaddr: [u8; 16], + sname: [u8; 64], + file: [u8; 128], + magic: u32, + options: [u8; DHCP_OPTIONS_SIZE], +} + +impl DhcpPacket { + fn zeroed() -> Self { + // SAFETY: DhcpPacket is plain-old-data with no padding invariants. + unsafe { mem::zeroed() } + } + + fn as_bytes(&self) -> &[u8] { + // SAFETY: packed repr, no padding; reading as bytes is valid. 
+ unsafe { slice::from_raw_parts(self as *const _ as *const u8, mem::size_of::()) } + } +} + +struct DhcpOptionsWriter<'a> { + buf: &'a mut [u8], + pos: usize, +} + +impl<'a> DhcpOptionsWriter<'a> { + fn new(buf: &'a mut [u8]) -> Self { + Self { buf, pos: 0 } + } + + fn push(&mut self, code: u8, data: &[u8]) { + self.buf[self.pos] = code; + self.buf[self.pos + 1] = data.len() as u8; + self.buf[self.pos + 2..self.pos + 2 + data.len()].copy_from_slice(data); + self.pos += 2 + data.len(); + } + + fn finish(self) { + self.buf[self.pos] = DHCP_OPTIONS_END; + } +} + +struct DhcpOptions<'a>(&'a [u8]); + +impl<'a> Iterator for DhcpOptions<'a> { + type Item = (u8, &'a [u8]); + + fn next(&mut self) -> Option { + loop { + let opt = *self.0.first()?; + if opt == DHCP_OPTIONS_END { + self.0 = &[]; + return None; + } + self.0 = &self.0[1..]; + if opt == 0 { + continue; + } + let len = *self.0.first()? as usize; + self.0 = &self.0[1..]; + let data = self.0.get(..len)?; + self.0 = &self.0[len..]; + return Some((opt, data)); + } + } +} + +// libc doesn't expose ifinfomsg, ifaddrmsg, or rtmsg — define them locally. + +#[repr(C)] +struct IfInfoMsg { + ifi_family: u8, + _pad: u8, + ifi_type: u16, + ifi_index: i32, + ifi_flags: u32, + ifi_change: u32, +} + +#[repr(C)] +struct IfAddrMsg { + ifa_family: u8, + ifa_prefixlen: u8, + ifa_flags: u8, + ifa_scope: u8, + ifa_index: u32, +} + +#[repr(C)] +struct RtMsg { + rtm_family: u8, + rtm_dst_len: u8, + rtm_src_len: u8, + rtm_tos: u8, + rtm_table: u8, + rtm_protocol: u8, + rtm_scope: u8, + rtm_type: u8, + rtm_flags: u32, +} + +unsafe fn struct_as_bytes(v: &T) -> &[u8] { + slice::from_raw_parts(v as *const T as *const u8, mem::size_of::()) +} + +fn nl_send(sock: libc::c_int, buf: &[u8]) -> anyhow::Result<()> { + // Use mem::zeroed() so the opaque nl_pad field is correctly initialised. 
+ let mut sa: libc::sockaddr_nl = unsafe { mem::zeroed() }; + sa.nl_family = libc::AF_NETLINK as libc::sa_family_t; + + let iov = libc::iovec { + iov_base: buf.as_ptr() as *mut _, + iov_len: buf.len(), + }; + // Use zeroed() rather than a struct literal: musl's msghdr has private + // padding fields (__pad1, __pad2) that cannot be named in a literal. + let mut msg: libc::msghdr = unsafe { mem::zeroed() }; + msg.msg_name = &sa as *const _ as *mut _; + msg.msg_namelen = mem::size_of_val(&sa) as u32; + msg.msg_iov = &iov as *const _ as *mut _; + msg.msg_iovlen = 1; + let ret = unsafe { libc::sendmsg(sock, &msg, 0) }; + if ret < 0 { + bail!("nl_send: {}", IoError::last_os_error()); + } + Ok(()) +} + +fn nl_recv(sock: libc::c_int, buf: &mut [u8]) -> anyhow::Result { + let mut sa: libc::sockaddr_nl = unsafe { mem::zeroed() }; + let iov = libc::iovec { + iov_base: buf.as_mut_ptr() as *mut _, + iov_len: buf.len(), + }; + let mut msg: libc::msghdr = unsafe { mem::zeroed() }; + msg.msg_name = &mut sa as *mut _ as *mut _; + msg.msg_namelen = mem::size_of_val(&sa) as u32; + msg.msg_iov = &iov as *const _ as *mut _; + msg.msg_iovlen = 1; + let ret = unsafe { libc::recvmsg(sock, &mut msg, 0) }; + if ret < 0 { + bail!("nl_recv: {}", IoError::last_os_error()); + } + Ok(ret as usize) +} + +fn add_rtattr(buf: &mut [u8], msg_len: &mut usize, rta_type: u16, data: &[u8]) { + let rta_len = (4 + data.len()) as u16; + let aligned_start = (*msg_len + 3) & !3; + let end = aligned_start + ((rta_len as usize + 3) & !3); + assert!(end <= buf.len(), "netlink buffer too small"); + + buf[aligned_start..aligned_start + 2].copy_from_slice(&rta_len.to_ne_bytes()); + buf[aligned_start + 2..aligned_start + 4].copy_from_slice(&rta_type.to_ne_bytes()); + buf[aligned_start + 4..aligned_start + 4 + data.len()].copy_from_slice(data); + buf[aligned_start + 4 + data.len()..end].fill(0); + + *msg_len = end; + buf[0..4].copy_from_slice(&(*msg_len as u32).to_ne_bytes()); +} + +fn nl_check_ack(buf: &[u8], 
recv_len: usize, op: &str) -> anyhow::Result<()> { + let min = mem::size_of::() + mem::size_of::(); + if recv_len < min { + bail!("{op}: netlink response too short"); + } + let nlh = unsafe { &*(buf.as_ptr() as *const libc::nlmsghdr) }; + if nlh.nlmsg_type != libc::NLMSG_ERROR as u16 { + bail!( + "{op}: expected NLMSG_ERROR ACK, got type {}", + nlh.nlmsg_type + ); + } + let err_offset = mem::size_of::(); + let err = i32::from_ne_bytes(buf[err_offset..err_offset + 4].try_into().unwrap()); + if err != 0 { + bail!("{op}: netlink error {err}"); + } + Ok(()) +} + +fn nl_hdr(buf: &mut [u8], msg_len: usize, nlmsg_type: u16, flags: u16) { + let nlh = libc::nlmsghdr { + nlmsg_len: msg_len as u32, + nlmsg_type, + nlmsg_flags: flags, + nlmsg_seq: 1, + nlmsg_pid: unsafe { libc::getpid() } as u32, + }; + buf[..mem::size_of_val(&nlh)].copy_from_slice(unsafe { struct_as_bytes(&nlh) }); +} + +fn set_mtu(nl_sock: libc::c_int, iface_index: i32, mtu: u32) -> anyhow::Result<()> { + let mut buf = [0u8; 4096]; + let base = mem::size_of::() + mem::size_of::(); + let mut msg_len = base; + + nl_hdr( + &mut buf, + base, + libc::RTM_NEWLINK, + (libc::NLM_F_REQUEST | libc::NLM_F_ACK) as u16, + ); + + let ifi = IfInfoMsg { + ifi_family: libc::AF_UNSPEC as u8, + _pad: 0, + ifi_type: libc::ARPHRD_ETHER, + ifi_index: iface_index, + ifi_flags: 0, + ifi_change: 0, + }; + let ifi_off = mem::size_of::(); + buf[ifi_off..ifi_off + mem::size_of_val(&ifi)] + .copy_from_slice(unsafe { struct_as_bytes(&ifi) }); + + add_rtattr(&mut buf, &mut msg_len, libc::IFLA_MTU, &mtu.to_ne_bytes()); + + nl_send(nl_sock, &buf[..msg_len])?; + let recv_len = nl_recv(nl_sock, &mut buf)?; + nl_check_ack(&buf, recv_len, "set_mtu") +} + +fn mod_addr4( + nl_sock: libc::c_int, + iface_index: i32, + cmd: u16, + addr: u32, + prefix_len: u8, +) -> anyhow::Result<()> { + let mut buf = [0u8; 4096]; + let base = mem::size_of::() + mem::size_of::(); + let mut msg_len = base; + + nl_hdr( + &mut buf, + base, + cmd, + (libc::NLM_F_REQUEST 
| libc::NLM_F_CREATE | libc::NLM_F_ACK) as u16, + ); + + let ifa = IfAddrMsg { + ifa_family: libc::AF_INET as u8, + ifa_prefixlen: prefix_len, + ifa_flags: 0, + ifa_scope: libc::RT_SCOPE_UNIVERSE, + ifa_index: iface_index as u32, + }; + let ifa_off = mem::size_of::(); + buf[ifa_off..ifa_off + mem::size_of_val(&ifa)] + .copy_from_slice(unsafe { struct_as_bytes(&ifa) }); + + let addr_bytes = addr.to_ne_bytes(); + add_rtattr(&mut buf, &mut msg_len, libc::IFA_LOCAL, &addr_bytes); + add_rtattr(&mut buf, &mut msg_len, libc::IFA_ADDRESS, &addr_bytes); + + nl_send(nl_sock, &buf[..msg_len])?; + let recv_len = nl_recv(nl_sock, &mut buf)?; + nl_check_ack(&buf, recv_len, "mod_addr4") +} + +fn mod_route4( + nl_sock: libc::c_int, + iface_index: i32, + cmd: u16, + gateway: u32, +) -> anyhow::Result<()> { + let mut buf = [0u8; 4096]; + let base = mem::size_of::() + mem::size_of::(); + let mut msg_len = base; + + nl_hdr( + &mut buf, + base, + cmd, + (libc::NLM_F_REQUEST | libc::NLM_F_CREATE | libc::NLM_F_ACK) as u16, + ); + + let rtm = RtMsg { + rtm_family: libc::AF_INET as u8, + rtm_dst_len: 0, + rtm_src_len: 0, + rtm_tos: 0, + rtm_table: libc::RT_TABLE_MAIN, + rtm_protocol: libc::RTPROT_BOOT, + rtm_scope: libc::RT_SCOPE_UNIVERSE, + rtm_type: libc::RTN_UNICAST, + rtm_flags: 0, + }; + let rtm_off = mem::size_of::(); + buf[rtm_off..rtm_off + mem::size_of_val(&rtm)] + .copy_from_slice(unsafe { struct_as_bytes(&rtm) }); + + add_rtattr( + &mut buf, + &mut msg_len, + libc::RTA_OIF, + &(iface_index as u32).to_ne_bytes(), + ); + add_rtattr(&mut buf, &mut msg_len, libc::RTA_DST, &0u32.to_ne_bytes()); + add_rtattr( + &mut buf, + &mut msg_len, + libc::RTA_GATEWAY, + &gateway.to_ne_bytes(), + ); + + nl_send(nl_sock, &buf[..msg_len])?; + let recv_len = nl_recv(nl_sock, &mut buf)?; + nl_check_ack(&buf, recv_len, "mod_route4") +} + +fn dhcp_msg_type(response: &[u8]) -> u8 { + DhcpOptions(response.get(DHCP_OPTIONS_OFFSET..).unwrap_or(&[])) + .find(|&(code, _)| code == 53) + .and_then(|(_, data)| 
data.first().copied()) + .unwrap_or(0) +} + +fn handle_dhcp_ack(nl_sock: libc::c_int, iface_index: i32, response: &[u8]) -> anyhow::Result<()> { + if response.len() < DHCP_OPTIONS_OFFSET + 1 { + bail!("DHCPACK too short ({} bytes)", response.len()); + } + + let addr = u32::from_ne_bytes(response[16..20].try_into().unwrap()); + if addr == 0 { + bail!("DHCPACK: yiaddr is 0.0.0.0"); + } + + let mut netmask: u32 = 0; + let mut router: u32 = 0; + let mut mtu: u16 = 65520; + let mut resolv_conf = String::new(); + + for (opt, data) in DhcpOptions(response.get(DHCP_OPTIONS_OFFSET..).unwrap_or(&[])) { + match opt { + 1 if data.len() >= 4 => { + netmask = u32::from_ne_bytes(data[..4].try_into().unwrap()); + } + 3 if data.len() >= 4 => { + router = u32::from_ne_bytes(data[..4].try_into().unwrap()); + } + 6 => { + for chunk in data.chunks_exact(4) { + resolv_conf.push_str(&format!( + "nameserver {}.{}.{}.{}\n", + chunk[0], chunk[1], chunk[2], chunk[3] + )); + } + } + 26 if data.len() >= 2 => { + mtu = u16::from_be_bytes(data[..2].try_into().unwrap()).clamp(1280, 65520); + } + _ => {} + } + } + + if !resolv_conf.is_empty() { + if let Err(e) = std::fs::write("/etc/resolv.conf", &resolv_conf) { + eprintln!("Warning: couldn't write /etc/resolv.conf: {e}"); + } + } + + let prefix_len = u32::from_be(netmask).leading_ones() as u8; + + mod_addr4(nl_sock, iface_index, libc::RTM_NEWADDR, addr, prefix_len) + .context("add address from DHCP")?; + mod_route4(nl_sock, iface_index, libc::RTM_NEWROUTE, router) + .context("add default route from DHCP")?; + let _ = set_mtu(nl_sock, iface_index, mtu as u32); + + Ok(()) +} + +fn scopeguard(f: F) -> impl Drop { + struct Guard(Option); + impl Drop for Guard { + fn drop(&mut self) { + if let Some(f) = self.0.take() { + f(); + } + } + } + Guard(Some(f)) +} + +pub fn do_dhcp(iface: &str) -> anyhow::Result<()> { + let iface_cstr = std::ffi::CString::new(iface).unwrap(); + + let iface_index = unsafe { libc::if_nametoindex(iface_cstr.as_ptr()) } as i32; 
+ if iface_index == 0 { + bail!("if_nametoindex({iface}): {}", IoError::last_os_error()); + } + + let nl_sock = unsafe { libc::socket(libc::AF_NETLINK, libc::SOCK_RAW, libc::NETLINK_ROUTE) }; + if nl_sock < 0 { + bail!("socket(AF_NETLINK): {}", IoError::last_os_error()); + } + let _nl_guard = scopeguard(move || unsafe { + libc::close(nl_sock); + }); + + let mut nl_sa: libc::sockaddr_nl = unsafe { mem::zeroed() }; + nl_sa.nl_family = libc::AF_NETLINK as libc::sa_family_t; + nl_sa.nl_pid = unsafe { libc::getpid() } as u32; + if unsafe { + libc::bind( + nl_sock, + &nl_sa as *const _ as *const libc::sockaddr, + mem::size_of_val(&nl_sa) as u32, + ) + } < 0 + { + bail!("bind(netlink): {}", IoError::last_os_error()); + } + + let sock = unsafe { libc::socket(libc::AF_INET, libc::SOCK_DGRAM, libc::IPPROTO_UDP) }; + if sock < 0 { + bail!("socket(AF_INET): {}", IoError::last_os_error()); + } + let _sock_guard = scopeguard(move || unsafe { + libc::close(sock); + }); + + let bcast: libc::c_int = 1; + unsafe { + libc::setsockopt( + sock, + libc::SOL_SOCKET, + libc::SO_BROADCAST, + &bcast as *const _ as *const _, + mem::size_of_val(&bcast) as u32, + ); + libc::setsockopt( + sock, + libc::SOL_SOCKET, + libc::SO_BINDTODEVICE, + iface_cstr.as_ptr() as *const _, + (iface.len() + 1) as u32, + ); + } + + let mut bind_addr: libc::sockaddr_in = unsafe { mem::zeroed() }; + bind_addr.sin_family = libc::AF_INET as libc::sa_family_t; + bind_addr.sin_port = 68u16.to_be(); + if unsafe { + libc::bind( + sock, + &bind_addr as *const _ as *const libc::sockaddr, + mem::size_of_val(&bind_addr) as u32, + ) + } < 0 + { + bail!("bind(UDP 68): {}", IoError::last_os_error()); + } + + let mut pkt = DhcpPacket::zeroed(); + pkt.op = 1; + pkt.htype = 1; + pkt.hlen = 6; + pkt.xid = (unsafe { libc::getpid() } as u32).to_be(); + pkt.flags = 0x8000u16.to_be(); + pkt.magic = 0x63825363u32.to_be(); + + let mut mac_ifr: libc::ifreq = unsafe { mem::zeroed() }; + let name_bytes = iface.as_bytes(); + unsafe { + 
std::ptr::copy_nonoverlapping( + name_bytes.as_ptr() as *const libc::c_char, + mac_ifr.ifr_name.as_mut_ptr(), + name_bytes.len().min(libc::IFNAMSIZ - 1), + ); + } + if unsafe { libc::ioctl(sock, libc::SIOCGIFHWADDR as _, &mut mac_ifr) } < 0 { + bail!("ioctl(SIOCGIFHWADDR): {}", IoError::last_os_error()); + } + let sa_data = unsafe { mac_ifr.ifr_ifru.ifru_hwaddr.sa_data }; + for (dst, src) in pkt.chaddr.iter_mut().zip(sa_data.iter().take(6)) { + // We need to allow the unnecessary cast, because this will cause clippy to fail on aarch64 + // without it + #[allow(clippy::unnecessary_cast)] + { + *dst = *src as u8; + } + } + + let mut opts = DhcpOptionsWriter::new(&mut pkt.options); + opts.push(53, &[1]); // Discover + opts.push(80, &[]); // Rapid Commit + opts.finish(); + + let mut dest: libc::sockaddr_in = unsafe { mem::zeroed() }; + dest.sin_family = libc::AF_INET as libc::sa_family_t; + dest.sin_port = 67u16.to_be(); + dest.sin_addr.s_addr = libc::INADDR_BROADCAST; + + let mut timeout: libc::timeval = unsafe { mem::zeroed() }; + timeout.tv_usec = 100_000; + unsafe { + libc::setsockopt( + sock, + libc::SOL_SOCKET, + libc::SO_RCVTIMEO, + &timeout as *const _ as *const _, + mem::size_of_val(&timeout) as u32, + ); + } + + let pkt_bytes = pkt.as_bytes(); + if unsafe { + libc::sendto( + sock, + pkt_bytes.as_ptr() as *const _, + pkt_bytes.len(), + 0, + &dest as *const _ as *const libc::sockaddr, + mem::size_of_val(&dest) as u32, + ) + } < 0 + { + bail!("sendto(DISCOVER): {}", IoError::last_os_error()); + } + + let mut response = [0u8; DHCP_BUFFER_SIZE]; + let mut from: libc::sockaddr_in = unsafe { mem::zeroed() }; + let mut from_len = mem::size_of_val(&from) as u32; + let recv_len = unsafe { + libc::recvfrom( + sock, + response.as_mut_ptr() as *mut _, + response.len(), + 0, + &mut from as *mut _ as *mut libc::sockaddr, + &mut from_len, + ) + }; + + if recv_len <= 0 { + return Ok(()); // no response — VM may be IPv6-only + } + let recv_len = recv_len as usize; + let 
msg_type = dhcp_msg_type(&response[..recv_len]); + + if msg_type == DHCP_MSG_ACK { + handle_dhcp_ack(nl_sock, iface_index, &response[..recv_len])?; + } else if msg_type == DHCP_MSG_OFFER { + let offered_addr = u32::from_ne_bytes(response[16..20].try_into().unwrap()); + let server_addr = from.sin_addr.s_addr; + + pkt.options = [0; DHCP_OPTIONS_SIZE]; + let mut opts = DhcpOptionsWriter::new(&mut pkt.options); + opts.push(53, &[3]); // Request + opts.push(50, &offered_addr.to_ne_bytes()); // Requested IP + opts.push(54, &server_addr.to_ne_bytes()); // Server ID + opts.finish(); + + let pkt_bytes = pkt.as_bytes(); + if unsafe { + libc::sendto( + sock, + pkt_bytes.as_ptr() as *const _, + pkt_bytes.len(), + 0, + &dest as *const _ as *const libc::sockaddr, + mem::size_of_val(&dest) as u32, + ) + } < 0 + { + bail!("sendto(REQUEST): {}", IoError::last_os_error()); + } + + from_len = mem::size_of_val(&from) as u32; + let recv_len2 = unsafe { + libc::recvfrom( + sock, + response.as_mut_ptr() as *mut _, + response.len(), + 0, + &mut from as *mut _ as *mut libc::sockaddr, + &mut from_len, + ) + }; + if recv_len2 <= 0 { + bail!("no DHCPACK received"); + } + let recv_len2 = recv_len2 as usize; + let ack_type = dhcp_msg_type(&response[..recv_len2]); + if ack_type != DHCP_MSG_ACK { + bail!("expected DHCPACK, got type {ack_type}"); + } + handle_dhcp_ack(nl_sock, iface_index, &response[..recv_len2])?; + } else { + bail!("unexpected DHCP message type {msg_type}"); + } + + Ok(()) +} diff --git a/init/src/env.rs b/init/src/env.rs new file mode 100644 index 000000000..2f13acc49 --- /dev/null +++ b/init/src/env.rs @@ -0,0 +1,101 @@ +use std::env; +#[cfg(target_os = "linux")] +use std::ffi::CString; +#[cfg(target_os = "linux")] +use std::mem; +#[cfg(target_os = "linux")] +use std::ptr; + +#[cfg(target_os = "linux")] +pub fn setup_network(iface: &str) { + let sock = unsafe { libc::socket(libc::AF_INET, libc::SOCK_DGRAM, 0) }; + if sock < 0 { + return; + } + let mut ifr: libc::ifreq = unsafe 
{ mem::zeroed() }; + let lo = b"lo\0"; + unsafe { + ptr::copy_nonoverlapping( + lo.as_ptr() as *const libc::c_char, + ifr.ifr_name.as_mut_ptr(), + lo.len(), + ); + ifr.ifr_ifru.ifru_flags |= libc::IFF_UP as libc::c_short; + libc::ioctl(sock, libc::SIOCSIFFLAGS as _, &ifr); + } + + #[cfg(target_os = "linux")] + setup_dhcp(iface, sock); + + unsafe { libc::close(sock) }; +} + +#[cfg(not(target_os = "linux"))] +pub fn setup_network() {} + +#[cfg(target_os = "linux")] +fn setup_dhcp(iface: &str, sock: i32) { + if std::env::var("KRUN_DHCP").as_deref() != Ok("1") { + return; + } + + let mut ifr: libc::ifreq = unsafe { mem::zeroed() }; + let name = CString::new(iface).unwrap(); + unsafe { + ptr::copy_nonoverlapping( + name.as_ptr(), + ifr.ifr_name.as_mut_ptr(), + name.as_bytes_with_nul().len().min(libc::IFNAMSIZ), + ); + } + let exists = unsafe { libc::ioctl(sock, libc::SIOCGIFFLAGS as _, &mut ifr) } == 0; + if exists { + unsafe { + ifr.ifr_ifru.ifru_flags |= libc::IFF_UP as libc::c_short; + libc::ioctl(sock, libc::SIOCSIFFLAGS as _, &ifr); + } + if let Err(e) = crate::dhcp::do_dhcp(iface) { + eprintln!("Warning: DHCP configuration for {iface} failed: {e}"); + } + } +} + +pub fn apply_hostname() { + let hostname = env::var("HOSTNAME").unwrap_or_else(|_| "localhost".into()); + let _ = nix::unistd::sethostname(&hostname); +} + +pub fn apply_env() { + if let Ok(home) = env::var("KRUN_HOME") { + unsafe { env::set_var("HOME", home) }; + } + if let Ok(term) = env::var("KRUN_TERM") { + unsafe { env::set_var("TERM", term) }; + } +} + +pub fn apply_rlimits() { + let Ok(rlimits) = env::var("KRUN_RLIMITS") else { + return; + }; + for item in rlimits.split(',') { + let Some((id_s, rest)) = item.split_once('=') else { + continue; + }; + let Some((cur_s, max_s)) = rest.split_once(':') else { + continue; + }; + let (Ok(id), Ok(cur), Ok(max)) = ( + id_s.parse::(), + cur_s.parse::(), + max_s.parse::(), + ) else { + continue; + }; + let rlim = libc::rlimit { + rlim_cur: cur, + rlim_max: 
max, + }; + unsafe { libc::setrlimit(id as _, &rlim) }; + } +} diff --git a/init/src/exec.rs b/init/src/exec.rs new file mode 100644 index 000000000..577f41308 --- /dev/null +++ b/init/src/exec.rs @@ -0,0 +1,125 @@ +#[cfg(target_os = "linux")] +use nix::fcntl::{self, OFlag}; +#[cfg(target_os = "linux")] +use nix::sys::reboot::{self, RebootMode}; +#[cfg(target_os = "linux")] +use nix::sys::stat::Mode; +use nix::sys::wait::{self, WaitStatus}; +use nix::unistd::{self, ForkResult}; +use std::env; +use std::ffi::CString; +#[cfg(target_os = "linux")] +use std::fs; +#[cfg(target_os = "linux")] +use std::path::Path; +use std::process; + +#[cfg(target_os = "linux")] +use nix::sys::statfs::{self, FsType}; +#[cfg(target_os = "linux")] +use std::os::fd::AsRawFd; + +#[cfg(target_os = "linux")] +const KRUN_EXIT_CODE_IOCTL: libc::c_ulong = 0x7602; +#[cfg(target_os = "linux")] +// 0x6573_5546 fits in i32, so the cast to FsType's inner c_long is safe on +// both 32-bit (c_long = i32) and 64-bit (c_long = i64) targets. +const VIRTIOFS_MAGIC: libc::c_long = 0x6573_5546; + +#[cfg(target_os = "linux")] +pub fn setup_redirects() { + let Ok(ports_dir) = fs::read_dir("/sys/class/virtio-ports") else { + return; + }; + for entry in ports_dir.flatten() { + let name_path = entry.path().join("name"); + let Ok(port_name) = fs::read_to_string(&name_path) else { + continue; + }; + let (fd, flags) = match port_name.trim_end_matches('\n') { + "krun-stdin" => (libc::STDIN_FILENO, libc::O_RDONLY), + "krun-stdout" => (libc::STDOUT_FILENO, libc::O_WRONLY), + "krun-stderr" => (libc::STDERR_FILENO, libc::O_WRONLY), + _ => continue, + }; + let dev = CString::new(format!("/dev/{}", entry.file_name().to_string_lossy())).unwrap(); + let new_fd = unsafe { libc::open(dev.as_ptr(), flags) }; + if new_fd >= 0 && new_fd != fd { + // new_fd != fd: dup it onto the target and close the spare. 
+ unsafe { + libc::dup2(new_fd, fd); + libc::close(new_fd); + } + } + // new_fd == fd: device opened directly onto the target fd (happens when + // the target was already closed); it is already in the right place. + // new_fd < 0: open failed; leave the existing fd untouched. + } +} + +#[cfg(target_os = "linux")] +pub fn set_exit_code(code: i32) { + let Ok(fs) = statfs::statfs(Path::new("/")) else { + return; + }; + if fs.filesystem_type() != FsType(VIRTIOFS_MAGIC as _) { + return; + } + if let Ok(fd) = fcntl::open(Path::new("/"), OFlag::O_RDONLY, Mode::empty()) { + unsafe { libc::ioctl(fd.as_raw_fd(), KRUN_EXIT_CODE_IOCTL as _, code) }; + } +} + +#[cfg(not(target_os = "linux"))] +pub fn set_exit_code(_code: i32) {} + +pub fn run_workload(argv: &[String]) -> ! { + if env::var("KRUN_INIT_PID1") == Ok("1".to_owned()) { + exec_workload(argv); + } + + match unsafe { unistd::fork() } { + Err(_) => { + set_exit_code(125); + process::exit(125); + } + Ok(ForkResult::Child) => exec_workload(argv), + Ok(ForkResult::Parent { child }) => { + let code = loop { + match wait::waitpid(None, None) { + Ok(WaitStatus::Exited(pid, c)) if pid == child => break c, + Ok(WaitStatus::Signaled(pid, sig, _)) if pid == child => { + break sig as i32 + 128; + } + _ => continue, + } + }; + set_exit_code(code); + unistd::sync(); + #[cfg(target_os = "linux")] + let _ = reboot::reboot(RebootMode::RB_AUTOBOOT); + process::exit(code) + } + } +} + +fn exec_workload(argv: &[String]) -> ! 
{ + #[cfg(target_os = "linux")] + setup_redirects(); + #[cfg(target_os = "freebsd")] + crate::freebsd::open_console(); + + let c_argv: Vec = argv + .iter() + .map(|s| CString::new(s.as_str()).unwrap()) + .collect(); + + let Err(e) = unistd::execvp(&c_argv[0], &c_argv); + let code = if e == nix::errno::Errno::ENOENT { + 127 + } else { + 126 + }; + eprintln!("Couldn't execute '{}': {e}", argv[0]); + process::exit(code); +} diff --git a/init/src/freebsd.rs b/init/src/freebsd.rs new file mode 100644 index 000000000..9e310ea86 --- /dev/null +++ b/init/src/freebsd.rs @@ -0,0 +1,168 @@ +use std::ffi::CString; + +extern "C" { + fn revoke(path: *const libc::c_char) -> libc::c_int; +} + +const KENV_MVALLEN: usize = 128; +const ISO_DEV: &str = "/dev/iso9660/KRUN_CONFIG"; +const ISO_MOUNT: &str = "/mnt"; +pub const ISO_CONFIG_PATH: &str = "/mnt/krun_config.json"; + +const KENV_VARS: &[&str] = &[ + "HOSTNAME", + "KRUN_CONFIG", + "KRUN_HOME", + "KRUN_INIT", + "KRUN_INIT_PID1", + "KRUN_RLIMITS", + "KRUN_TERM", + "KRUN_WORKDIR", +]; + +fn kenv_get(name: &str) -> Option { + let c_name = CString::new(name).ok()?; + let mut buf = vec![0u8; KENV_MVALLEN + 1]; + let ret = unsafe { + libc::kenv( + libc::KENV_GET, + c_name.as_ptr(), + buf.as_mut_ptr() as *mut libc::c_char, + (KENV_MVALLEN + 1) as i32, + ) + }; + if ret < 0 { + return None; + } + let s = unsafe { std::ffi::CStr::from_ptr(buf.as_ptr() as *const libc::c_char) }; + Some(s.to_string_lossy().into_owned()) +} + +/// Populate the process environment from the FreeBSD kernel environment. +/// +/// On FreeBSD, init runs before the process environment is set up, so +/// variables like KRUN_INIT must be read from kenv(2) rather than getenv(3). +pub fn populate_env_from_kenv() { + for &var in KENV_VARS { + if let Some(val) = kenv_get(var) { + unsafe { std::env::set_var(var, val) }; + } + } +} + +/// Open /dev/console and make it the controlling terminal. 
+///
+/// Replicates login_tty(3) inline to avoid a libutil dependency:
+/// revoke any existing opens, open the device, create a new session,
+/// set the controlling terminal via TIOCSCTTY, then dup2 into stdio.
+/// Falls back to /dev/null + /init.log if the console cannot be opened.
+pub fn open_console() {
+    let console = b"/dev/console\0";
+    // SAFETY: `console` is a NUL-terminated byte string literal.
+    unsafe { revoke(console.as_ptr() as *const libc::c_char) };
+
+    // SAFETY: same NUL-terminated path as above.
+    let fd = unsafe {
+        libc::open(
+            console.as_ptr() as *const libc::c_char,
+            libc::O_RDWR | libc::O_NONBLOCK,
+        )
+    };
+
+    if fd < 0 {
+        fallback_console();
+        return;
+    }
+
+    // Clear O_NONBLOCK again now that the open has completed. Skip the update
+    // when F_GETFL fails: its -1 result would otherwise be written back as a
+    // mask with almost every status flag set.
+    let flags = unsafe { libc::fcntl(fd, libc::F_GETFL) };
+    if flags >= 0 {
+        unsafe { libc::fcntl(fd, libc::F_SETFL, flags & !libc::O_NONBLOCK) };
+    }
+
+    // SAFETY: `fd` is a descriptor this function opened and still owns.
+    // Return values are deliberately ignored; each step is best-effort.
+    unsafe {
+        libc::setsid();
+        libc::ioctl(fd, libc::TIOCSCTTY, 0);
+        libc::dup2(fd, libc::STDIN_FILENO);
+        libc::dup2(fd, libc::STDOUT_FILENO);
+        libc::dup2(fd, libc::STDERR_FILENO);
+        // Only close the original when it is not itself one of the stdio
+        // descriptors.
+        if fd > libc::STDERR_FILENO {
+            libc::close(fd);
+        }
+    }
+}
+
+/// Stdio fallback used when /dev/console cannot be opened.
+///
+/// stdin is pointed at /dev/null; stdout and stderr are appended to
+/// /init.log, or duplicated from stdin when the log cannot be opened.
+/// Every failure is ignored.
+fn fallback_console() {
+    let null = b"/dev/null\0";
+    let log = b"/init.log\0";
+
+    let null_fd = unsafe { libc::open(null.as_ptr().cast(), libc::O_RDWR) };
+    // If open() already returned descriptor 0 there is nothing to move.
+    if null_fd >= 0 && null_fd != libc::STDIN_FILENO {
+        unsafe {
+            libc::dup2(null_fd, libc::STDIN_FILENO);
+            libc::close(null_fd);
+        }
+    }
+
+    let log_fd = unsafe {
+        libc::open(
+            log.as_ptr().cast(),
+            libc::O_WRONLY | libc::O_APPEND | libc::O_CREAT,
+            0o644u32,
+        )
+    };
+    // Without a log file, stdout/stderr share whatever stdin refers to.
+    let out_fd = if log_fd >= 0 {
+        log_fd
+    } else {
+        libc::STDIN_FILENO
+    };
+    unsafe {
+        libc::dup2(out_fd, libc::STDOUT_FILENO);
+        libc::dup2(libc::STDOUT_FILENO, libc::STDERR_FILENO);
+        if log_fd >= 0 && log_fd != libc::STDOUT_FILENO {
+            libc::close(log_fd);
+        }
+    }
+}
+
+/// Mount the KRUN_CONFIG ISO image at /mnt via nmount(2).
+/// Returns true on success.
+pub fn mount_config_iso() -> bool {
+    // The mount point may already exist; a failure here surfaces through
+    // nmount(2) below.
+    let _ = std::fs::create_dir_all(ISO_MOUNT);
+
+    let mount_point = CString::new(ISO_MOUNT).unwrap();
+    let device = CString::new(ISO_DEV).unwrap();
+
+    // Options are handed to nmount(2) as alternating name/value buffers,
+    // each including its NUL terminator.
+    let options: [&[u8]; 6] = [
+        b"fstype\0",
+        b"cd9660\0",
+        b"fspath\0",
+        mount_point.as_bytes_with_nul(),
+        b"from\0",
+        device.as_bytes_with_nul(),
+    ];
+    let mut iov = options.map(|bytes| libc::iovec {
+        iov_base: bytes.as_ptr() as *mut _,
+        iov_len: bytes.len(),
+    });
+
+    // SAFETY: every iovec points at a buffer that outlives the call and
+    // `iov_len` matches that buffer's length.
+    let rc = unsafe { libc::nmount(iov.as_mut_ptr(), iov.len() as u32, libc::MNT_RDONLY) };
+    rc == 0
+}
+
+/// Unmount whatever is mounted at `ISO_MOUNT`; the result of unmount(2) is
+/// ignored.
+pub fn unmount_config_iso() {
+    let mount_point = CString::new(ISO_MOUNT).unwrap();
+    // SAFETY: `mount_point` is a valid NUL-terminated path.
+    unsafe { libc::unmount(mount_point.as_ptr(), 0) };
+}
diff --git a/init/src/fs.rs b/init/src/fs.rs
new file mode 100644
index 000000000..0a365141c
--- /dev/null
+++ b/init/src/fs.rs
@@ -0,0 +1,189 @@
+use anyhow::{bail, Context};
+use nix::errno::Errno;
+use nix::mount::{self, MsFlags};
+use nix::unistd;
+use std::env;
+use std::fs::{self, File};
+use std::io::{BufRead, BufReader};
+use std::os::unix::fs as unix_fs;
+
+const KRUN_REMOVE_ROOT_DIR_IOCTL: libc::c_ulong = 0x7603;
+
+/// Mount, treating EBUSY (already mounted) as success.
+fn mount_once(
+    src: Option<&str>,
+    target: &str,
+    fstype: Option<&str>,
+    flags: MsFlags,
+) -> anyhow::Result<()> {
+    match mount::mount(src, target, fstype, flags, None::<&str>) {
+        Ok(()) | Err(Errno::EBUSY) => Ok(()),
+        failure => failure.with_context(|| format!("mount {target}")),
+    }
+}
+
+/// Create and mount the standard pseudo-filesystems.
+///
+/// In order: `/dev` (devtmpfs), `/proc`, `/sys`, `/sys/fs/cgroup` (cgroup2),
+/// `/dev/pts` (devpts) and `/dev/shm` (tmpfs), followed by a best-effort
+/// `/dev/fd` -> `/proc/self/fd` symlink. The first directory creation or
+/// mount that fails aborts the sequence; "already mounted" is not a failure.
+pub fn mount_filesystems() -> anyhow::Result<()> {
+    let restricted = MsFlags::MS_NOEXEC | MsFlags::MS_NOSUID | MsFlags::MS_RELATIME;
+    let restricted_nodev = MsFlags::MS_NODEV | restricted;
+
+    for dir in ["/dev", "/proc", "/sys"] {
+        fs::create_dir_all(dir).with_context(|| format!("create {dir}"))?;
+    }
+
+    // (filesystem type, mount point, flags); the source name passed to
+    // mount(2) is the filesystem type itself.
+    let top_level = [
+        ("devtmpfs", "/dev", MsFlags::MS_RELATIME),
+        ("proc", "/proc", restricted_nodev),
+        ("sysfs", "/sys", restricted_nodev),
+        ("cgroup2", "/sys/fs/cgroup", restricted_nodev),
+    ];
+    for (fstype, target, flags) in top_level {
+        mount_once(Some(fstype), target, Some(fstype), flags)?;
+    }
+
+    // These live inside /dev, so they can only be created once devtmpfs is
+    // mounted.
+    for dir in ["/dev/pts", "/dev/shm"] {
+        fs::create_dir_all(dir).with_context(|| format!("create {dir}"))?;
+    }
+    for (fstype, target) in [("devpts", "/dev/pts"), ("tmpfs", "/dev/shm")] {
+        mount_once(Some(fstype), target, Some(fstype), restricted)?;
+    }
+
+    // Best-effort; may already exist.
+    let _ = unix_fs::symlink("/proc/self/fd", "/dev/fd");
+
+    Ok(())
+}
+
+/// Returns true if path is listed as a mount point in /proc/mounts.
+///
+/// Uses /proc/mounts instead of stat() because Podman arranges tmpfs
+/// auto-mounts that would be triggered by a stat call.
+pub fn is_mount_point(path: &str) -> bool {
+    let Ok(mounts) = File::open("/proc/mounts") else {
+        return false;
+    };
+    // Each entry is whitespace-separated; the second field is the mount
+    // point. Reading stops at the first line that cannot be decoded.
+    BufReader::new(mounts)
+        .lines()
+        .map_while(Result::ok)
+        .any(|entry| entry.split_whitespace().nth(1) == Some(path))
+}
+
+/// Mount a fresh tmpfs at `path` with noexec, nosuid, nodev and relatime.
+pub fn mount_tmpfs(path: &str) -> anyhow::Result<()> {
+    let flags =
+        MsFlags::MS_NOEXEC | MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_RELATIME;
+    let outcome = mount::mount(Some("tmpfs"), path, Some("tmpfs"), flags, None::<&str>);
+    outcome.with_context(|| format!("mount tmpfs at {path}"))
+}
+
+/// Mount /dev/vda as ext4, then pivot root into it.
+#[cfg(any(feature = "amd-sev", feature = "tdx"))]
+pub fn mount_tee_block_device() -> anyhow::Result<()> {
+    let staging = "/tmp/vda";
+
+    fs::create_dir_all(staging).context("create /tmp/vda")?;
+    mount_once(Some("/dev/vda"), staging, Some("ext4"), MsFlags::MS_RELATIME)?;
+    unistd::chdir(staging).context("chdir /tmp/vda")?;
+
+    // Move the mounted device over "/" and make it the root directory.
+    mount_once(Some("."), "/", None, MsFlags::MS_MOVE)?;
+    unistd::chroot(".").context("chroot .")?;
+    Ok(())
+}
+
+/// Mount source onto target, trying each non-virtual filesystem listed in
+/// /proc/filesystems when fstype is None.
+///
+/// When `fstype` is given, exactly one mount attempt is made and its result
+/// is returned. No filesystem-specific mount data is passed.
+pub fn try_mount(
+    source: &str,
+    target: &str,
+    fstype: Option<&str>,
+    flags: MsFlags,
+) -> anyhow::Result<()> {
+    try_mount_with_data(source, target, fstype, flags, None)
+}
+
+/// Same as [`try_mount`], additionally passing `data` (filesystem-specific
+/// mount options) to every mount(2) attempt.
+///
+/// # Errors
+///
+/// With an explicit `fstype`, the error of that single attempt. Otherwise an
+/// error when /proc/filesystems cannot be opened or when no listed
+/// filesystem accepts `source`.
+fn try_mount_with_data(
+    source: &str,
+    target: &str,
+    fstype: Option<&str>,
+    flags: MsFlags,
+    data: Option<&str>,
+) -> anyhow::Result<()> {
+    // An explicit type is authoritative: return right away so a successful
+    // mount is not followed by the probing loop below.
+    if let Some(fs) = fstype {
+        return mount::mount(Some(source), target, Some(fs), flags, data)
+            .with_context(|| format!("mount {source} -> {target} as {fs}"));
+    }
+
+    let f = File::open("/proc/filesystems").context("open /proc/filesystems")?;
+    for line in BufReader::new(f).lines().map_while(Result::ok) {
+        // "nodev" entries are not backed by a block device.
+        if line.starts_with("nodev") {
+            continue;
+        }
+        let fs = line.trim();
+        if fs.is_empty() {
+            continue;
+        }
+        if mount::mount(Some(source), target, Some(fs), flags, data).is_ok() {
+            return Ok(());
+        }
+    }
+    bail!("no supported filesystem found for {source}")
+}
+
+/// Handle KRUN_BLOCK_ROOT_DEVICE: mount the block device at /newroot,
+/// ask the virtiofs device to remove the temporary root, then pivot.
+///
+/// KRUN_BLOCK_ROOT_FSTYPE selects the filesystem type (probed when unset)
+/// and KRUN_BLOCK_ROOT_OPTIONS is passed to mount(2) as mount data.
+pub fn mount_block_root_device() -> anyhow::Result<()> {
+    let Some(krun_root) = env::var_os("KRUN_BLOCK_ROOT_DEVICE") else {
+        return Ok(());
+    };
+    let krun_root = krun_root.to_string_lossy().into_owned();
+
+    fs::create_dir_all("/newroot").context("create /newroot")?;
+
+    let fstype = env::var("KRUN_BLOCK_ROOT_FSTYPE").ok();
+    let options = env::var("KRUN_BLOCK_ROOT_OPTIONS").ok();
+
+    try_mount_with_data(
+        &krun_root,
+        "/newroot",
+        fstype.as_deref(),
+        MsFlags::empty(),
+        options.as_deref(),
+    )?;
+
+    unistd::chdir("/newroot").context("chdir /newroot")?;
+
+    // Ask the virtiofs device to tear down the temporary root directory.
+    let fd = unsafe { libc::open(c"/".as_ptr().cast(), libc::O_RDONLY) };
+    if fd >= 0 {
+        unsafe { libc::ioctl(fd, KRUN_REMOVE_ROOT_DIR_IOCTL as _) };
+        unsafe { libc::close(fd) };
+    }
+
+    mount::mount(Some("."), "/", None::<&str>, MsFlags::MS_MOVE, None::<&str>)
+        .context("pivot root MS_MOVE")?;
+
+    unistd::chroot(".").context("chroot after block root pivot")?;
+
+    // Re-mount standard filesystems now that we're in the new root.
+    mount_filesystems()?;
+
+    drop(options);
+    Ok(())
+}
+
+/// Mark the root mount, recursively, as shared (`MS_REC | MS_SHARED`).
+pub fn mount_shared_root() -> anyhow::Result<()> {
+    mount::mount(
+        None::<&str>,
+        "/",
+        None::<&str>,
+        MsFlags::MS_REC | MsFlags::MS_SHARED,
+        None::<&str>,
+    )
+    .context("set MS_SHARED on root mount")
+}
diff --git a/init/src/main.rs b/init/src/main.rs
new file mode 100644
index 000000000..8aaf25392
--- /dev/null
+++ b/init/src/main.rs
@@ -0,0 +1,94 @@
+mod config;
+#[cfg(target_os = "linux")]
+mod dhcp;
+mod env;
+mod exec;
+#[cfg(target_os = "freebsd")]
+mod freebsd;
+#[cfg(target_os = "linux")]
+mod fs;
+#[cfg(feature = "timesync")]
+mod timesync;
+
+/// Entry point of the guest init process.
+///
+/// Prepares the console and kernel environment (FreeBSD) or the standard
+/// mounts (Linux), starts a new session, sets up networking, loads the
+/// configuration, applies environment, hostname, rlimits and working
+/// directory, and finally hands control to `exec::run_workload`, which does
+/// not return. An `Err` is only returned when one of the mount steps fails.
+fn main() -> anyhow::Result<()> {
+    #[cfg(target_os = "freebsd")]
+    freebsd::open_console();
+
+    #[cfg(target_os = "freebsd")]
+    freebsd::populate_env_from_kenv();
+
+    #[cfg(any(feature = "amd-sev", feature = "tdx"))]
+    fs::mount_tee_block_device()?;
+
+    #[cfg(target_os = "linux")]
+    {
+        fs::mount_filesystems()?;
+        fs::mount_block_root_device()?;
+        fs::mount_shared_root()?;
+    }
+
+    // Best-effort: become session leader and claim fd 0 as the controlling
+    // terminal; both results are ignored.
+    unsafe {
+        libc::setsid();
+        libc::ioctl(0, libc::TIOCSCTTY as _, 1i32);
+    }
+
+    #[cfg(target_os = "freebsd")]
+    unsafe {
+        libc::setlogin(b"root\0".as_ptr().cast())
+    };
+
+    env::setup_network(
+        #[cfg(target_os = "linux")]
+        "eth0",
+    );
+
+    // The config ISO is only mounted when KRUN_CONFIG is not set.
+    #[cfg(target_os = "freebsd")]
+    let iso_mounted = std::env::var("KRUN_CONFIG").is_err() && freebsd::mount_config_iso();
+
+    #[cfg(target_os = "linux")]
+    let cfg = config::load(fs::is_mount_point);
+    #[cfg(not(target_os = "linux"))]
+    let cfg = config::load();
+
+    #[cfg(target_os = "freebsd")]
+    if iso_mounted {
+        freebsd::unmount_config_iso();
+    }
+
+    #[cfg(target_os = "linux")]
+    if let Some(ref path) = cfg.tmpfs {
+        fs::mount_tmpfs(path)?;
+    }
+
+    env::apply_env();
+    env::apply_hostname();
+    env::apply_rlimits();
+
+    // KRUN_WORKDIR takes precedence over the configured working directory;
+    // a failing chdir is ignored.
+    if let Some(ref workdir) = std::env::var("KRUN_WORKDIR").ok().or(cfg.workdir) {
+        let _ = nix::unistd::chdir(workdir.as_str());
+    }
+
+    // The kernel places everything after `--` in the cmdline as this
+    // process's argv[1..]. The C init built exec_argv by replacing argv[0]
+    // with KRUN_INIT (or /bin/sh) and keeping argv[1..] in every branch.
+    let proc_args: Vec<String> = std::env::args().collect();
+
+    let argv: Vec<String> = if let Ok(init) = std::env::var("KRUN_INIT") {
+        // KRUN_INIT holds the binary; kernel cmdline args are the arguments.
+        // `skip(1)` instead of `[1..]` so an empty argument vector cannot
+        // panic.
+        let mut v = vec![init];
+        v.extend(proc_args.iter().skip(1).cloned());
+        v
+    } else if let Some(v) = cfg.argv {
+        v
+    } else if proc_args.len() > 1 {
+        // No KRUN_INIT and no config: treat proc_args[1..] as the command.
+        proc_args.into_iter().skip(1).collect()
+    } else {
+        vec!["/bin/sh".to_string()]
+    };
+
+    #[cfg(feature = "timesync")]
+    timesync::run();
+
+    exec::run_workload(&argv);
+}
diff --git a/init/src/timesync.rs b/init/src/timesync.rs
new file mode 100644
index 000000000..80243efa4
--- /dev/null
+++ b/init/src/timesync.rs
@@ -0,0 +1,57 @@
+use std::mem;
+
+const TSYNC_PORT: u32 = 123;
+const NANOS_IN_SECOND: u64 = 1_000_000_000;
+const DELTA_SYNC: u64 = 100_000_000; // 100ms — don't bother adjusting for smaller drifts
+
+pub fn run() {
+    let sock = unsafe { libc::socket(libc::AF_VSOCK, libc::SOCK_DGRAM, 0) };
+    if sock < 0 {
+        return;
+    }
+
+    let mut addr: libc::sockaddr_vm = unsafe { mem::zeroed() };
+    addr.svm_family = libc::AF_VSOCK as _;
+    addr.svm_port = TSYNC_PORT;
+    addr.svm_cid = libc::VMADDR_CID_ANY;
+
+    if unsafe {
+        libc::bind(
+            sock,
+            &addr as *const _ as *const libc::sockaddr,
+            mem::size_of_val(&addr) as _,
+        )
+    } < 0
+    {
+        unsafe { libc::close(sock) };
+        return;
+    }
+
+    std::thread::Builder::new()
+        .name("timesync".into())
+        .spawn(move || loop {
+            let mut buf = [0u8; 8];
+            let n = unsafe { libc::recv(sock, buf.as_mut_ptr() as *mut _, buf.len(), 0) };
+            if n < 0 {
+                break;
+            }
+            if n != 8 {
+                continue;
+            }
+
+            let host_ns = u64::from_le_bytes(buf);
+
+            let mut guest_ts: libc::timespec = unsafe { mem::zeroed() };
+            unsafe { libc::clock_gettime(libc::CLOCK_REALTIME, &mut guest_ts) };
+            let guest_ns = guest_ts.tv_sec as u64 *
NANOS_IN_SECOND + guest_ts.tv_nsec as u64; + + if host_ns.abs_diff(guest_ns) > DELTA_SYNC { + let host_ts = libc::timespec { + tv_sec: (host_ns / NANOS_IN_SECOND) as libc::time_t, + tv_nsec: (host_ns % NANOS_IN_SECOND) as libc::c_long, + }; + unsafe { libc::clock_settime(libc::CLOCK_REALTIME, &host_ts) }; + } + }) + .unwrap(); +} diff --git a/init/tee/kbs/kbs.h b/init/tee/kbs/kbs.h deleted file mode 100644 index 9549a76a4..000000000 --- a/init/tee/kbs/kbs.h +++ /dev/null @@ -1,53 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#ifndef _KBS -#define _KBS - -#include -#include -#include - -#include "../snp_attest.h" - -/* - * Identifiers for all possible TEE architectures. - */ -enum tee { - TEE_SEV, - TEE_SGX, - TEE_SNP, - TEE_TDX, -}; - -/* - * The type of KBS operation to be performed. - */ -enum curl_post_type { - KBS_CURL_REQ, - KBS_CURL_ATTEST, - KBS_CURL_GET_KEY, -}; - -// kbs_util.c -char *tee_str(int); -char *find_cookie(char *, char *); -int read_cookie_val(char *, char *); -int json_parse_str(char *, char *, char *); - -// kbs_types.c -int kbs_request_marshal(char *, int, char *); -int kbs_challenge(CURL *, char *, char *, char *); -int kbs_attest(CURL *, char *, struct snp_report *, BIGNUM *, BIGNUM *, char *); -int kbs_get_key(CURL *, char *, char *, EVP_PKEY *, char *); - -// kbs_curl.c -int kbs_curl_post(CURL *, char *, char *, char *, int); -int kbs_curl_get(CURL *, char *, char *, char *, int); - -// kbs_crypto.c -int kbs_tee_pubkey_create(EVP_PKEY **, BIGNUM **, BIGNUM **); -int kbs_nonce_pubkey_hash(char *, EVP_PKEY *, unsigned char **, unsigned int *); -void BN_b64(BIGNUM *, char *); -int rsa_pkey_decrypt(EVP_PKEY *, char *, char **); - -#endif /* _KBS */ diff --git a/init/tee/kbs/kbs_crypto.c b/init/tee/kbs/kbs_crypto.c deleted file mode 100644 index 516fa2c5e..000000000 --- a/init/tee/kbs/kbs_crypto.c +++ /dev/null @@ -1,332 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include -#include - -#include -#include -#include -#include 
-#include -#include - -#include "kbs.h" - -/* - * Create an OpenSSL TEE public/private key pair. - */ -int kbs_tee_pubkey_create(EVP_PKEY **pkey, BIGNUM **n, BIGNUM **e) -{ - int ret, rc; - EVP_PKEY_CTX *ctx; - - rc = -1; - ctx = NULL; - - /* - * The public/private key pair will use an RSA algorithm. Generate the - * keys' context. - */ - ctx = EVP_PKEY_CTX_new_id(EVP_PKEY_RSA, NULL); - if (ctx == NULL) { - printf("ERROR: creating TEE public key context\n"); - - return rc; - } - - ret = EVP_PKEY_keygen_init(ctx); - if (ret < 1) { - printf("ERROR: initializing TEE public key generation\n"); - - goto ctx_free; - } - - /* - * Set key generation bits to 2048 and generate the key pair. - */ - ret = EVP_PKEY_CTX_set_rsa_keygen_bits(ctx, 2048); - if (ret < 1) { - printf("ERROR: setting RSA keygen bits\n"); - - goto ctx_free; - } - - *pkey = NULL; - ret = EVP_PKEY_keygen(ctx, pkey); - if (ret < 1) { - printf("ERROR: generating RSA key\n"); - - goto ctx_free; - } - - /* - * Get the modulus and exponents of the key pair. - */ - ret = EVP_PKEY_get_bn_param(*pkey, OSSL_PKEY_PARAM_RSA_N, n); - if (ret < 0 || n == NULL) { - printf("ERROR: getting public key modulus\n"); - - goto ctx_free; - } - - ret = EVP_PKEY_get_bn_param(*pkey, OSSL_PKEY_PARAM_RSA_E, e); - if (ret < 0 || e == NULL) { - printf("ERROR: getting public key exponent\n"); - - goto ctx_free; - } - - rc = 0; - -ctx_free: - EVP_PKEY_CTX_free(ctx); - - return rc; -} - -/* - * Create a SHA512 hash of the nonce and TEE public key to send to the - * attestation server. - */ -int kbs_nonce_pubkey_hash(char *nonce, EVP_PKEY *pkey, unsigned char **hash, - unsigned int *size) -{ - int rc; - EVP_MD_CTX *md_ctx; - BIGNUM *n, *e; - char n_b64[512], e_b64[512]; - - rc = -1; - - /* - * Initialize an MD context and initialize the SHA512 digest. 
- */ - md_ctx = EVP_MD_CTX_new(); - if (md_ctx == NULL) { - printf("ERROR: generating SHA512 context\n"); - - return rc; - } - - if (EVP_DigestInit_ex(md_ctx, EVP_sha512(), NULL) < 1) { - printf("ERROR: initializing SHA512 hash\n"); - - goto md_ctx_free; - } - - /* - * Update the digest with the data from the nonce. - */ - if (EVP_DigestUpdate(md_ctx, (void *)nonce, strlen(nonce)) < 1) { - printf("ERROR: updating SHA512 digest with nonce\n"); - - goto md_ctx_free; - } - - /* - * Update the digest with the data from the TEE public key. - * - * To do this, we will write the base64 encoding of the TEE public - * key's modulus and exponent. - */ - n = e = NULL; - if (EVP_PKEY_get_bn_param(pkey, OSSL_PKEY_PARAM_RSA_N, &n) == 0) { - printf("ERROR: unable to retrieve public key modulus\n"); - - goto md_ctx_free; - } - - if (EVP_PKEY_get_bn_param(pkey, OSSL_PKEY_PARAM_RSA_E, &e) == 0) { - printf("ERROR: unable to retrieve public key exponent\n"); - - goto md_ctx_free; - } - - /* - * base64-encode the modulus and exponents, and hash the base64 strings - * into the SHA512 digest. - */ - BN_b64(n, n_b64); - BN_b64(e, e_b64); - - if (EVP_DigestUpdate(md_ctx, (void *)n_b64, strlen(n_b64)) < 1) { - printf("ERROR: updating SHA512 digest with public key N\n"); - - goto md_ctx_free; - } - - if (EVP_DigestUpdate(md_ctx, (void *)e_b64, strlen(e_b64)) < 1) { - printf("ERROR: updating SHA512 digest with public key E\n"); - - goto md_ctx_free; - } - - /* - * Allocate the memory to hold the SHA512 hash, and write the SHA512 - * hash to the "hash" byte array. 
- */ - *hash = (unsigned char *)OPENSSL_malloc(EVP_MD_size(EVP_sha512())); - if (*hash == NULL) { - printf("ERROR: allocating memory for SHA512 hash\n"); - - goto md_ctx_free; - } - - if (EVP_DigestFinal_ex(md_ctx, *hash, size) < 1) { - printf("ERROR: finalizing the SHA512 hash\n"); - - goto hash_free; - } - - rc = 0; - - goto md_ctx_free; - -hash_free: - OPENSSL_free((void *)*hash); - -md_ctx_free: - EVP_MD_CTX_free(md_ctx); - - return rc; -} - -/* - * Using a given RSA public/private key pair, decrypt an encrypted and hex - * encoded string of text. Store the plaintext of the encrypted text into a - * buffer and point "plain_ptr" to said buffer. - */ -int rsa_pkey_decrypt(EVP_PKEY *pkey, char *enc, char **plain_ptr) -{ - int rc; - EVP_PKEY_CTX *ctx; - char enc_bin[4096], *plain; - size_t enc_bin_len, secret_plain_len = 4096; - - rc = -1; - - /* - * Decode the hex-encoded string to its byte format. - */ - if (OPENSSL_hexstr2buf_ex((unsigned char *)enc_bin, 4096, &enc_bin_len, enc, - '\0') != 1) { - printf("Error converting hex to buf\n"); - - return rc; - } - - /* - * Initialize the public key decryption context. - */ - ctx = EVP_PKEY_CTX_new(pkey, NULL); - if (ctx == NULL) { - printf("ERROR: creation of pkey context for decryption\n"); - - return rc; - } - - if (EVP_PKEY_decrypt_init(ctx) <= 0) { - printf("ERROR: creation of decryption context for pkey\n"); - - goto ctx_free; - } - - if (EVP_PKEY_CTX_set_rsa_padding(ctx, RSA_PKCS1_PADDING) <= 0) { - printf("Error setting RSA padding\n"); - - goto ctx_free; - } - - /* - * To first get the length that the plain secret buffer should be, call - * EVP_PKEY_decrypt() with a NULL output buffer argument. Then, - * "secret_plain_len" will contain the proper amount of bytes to - * allocate for the output buffer. 
- */ - rc = EVP_PKEY_decrypt(ctx, NULL, &secret_plain_len, - (unsigned char *)enc_bin, enc_bin_len); - if (rc <= 0) { - printf("ERROR: finding plaintext passphrase length: %d\n", rc); - - goto ctx_free; - } - - /* - * Allocate the output buffer using "secret_plain_len". - */ - plain = OPENSSL_malloc(secret_plain_len); - if (plain == NULL) - goto ctx_free; - - /* - * Decrypt the string using the OpenSSL RSA public key. - */ - rc = EVP_PKEY_decrypt(ctx, (unsigned char *)plain, &secret_plain_len, - (unsigned char *)enc_bin, enc_bin_len); - if (rc <= 0) { - printf("ERROR: decrypting RSA-encrypted passphrase: %d\n", rc); - OPENSSL_free(plain); - - goto ctx_free; - } - plain[secret_plain_len] = '\0'; - - /* - * Set the "plain_ptr" arg to the plaintext passphrase". - */ - *plain_ptr = plain; - - rc = 0; - -ctx_free: - EVP_PKEY_CTX_free(ctx); - - return rc; -} - -/* - * base64-encode the contents of an OpenSSL BIGNUM. - */ -void BN_b64(BIGNUM *bn, char *str) -{ - BIO *bio; - BIO *b64; - char *bn_bin; - char *bn_b64; - int bn_binlen; - int bn_b64len; - - /* - * Encode the BIGNUM contents to binary. - */ - bn_binlen = BN_num_bytes(bn); - bn_bin = malloc(bn_binlen); - BN_bn2bin(bn, (unsigned char *)bn_bin); - - /* - * Write the binary-encoded string to to a base64-configured OpenSSL - * BIO. - */ - b64 = BIO_new(BIO_f_base64()); - BIO_set_flags(b64, BIO_FLAGS_BASE64_NO_NL); - bio = BIO_new(BIO_s_mem()); - BIO_push(b64, bio); - BIO_write(b64, bn_bin, bn_binlen); - BIO_flush(b64); - - /* - * Retrieve the base64-encoded contents of the BIO, null-terminate the - * string, and copy those contents to the output string. - */ - bn_b64len = BIO_get_mem_data(b64, &bn_b64); - bn_b64[bn_b64len] = '\0'; - - strcpy(str, bn_b64); - - /* - * Cleanup OpenSSL data structures. 
- */ - BIO_free(b64); - BIO_free(bio); - free(bn_bin); -} diff --git a/init/tee/kbs/kbs_curl.c b/init/tee/kbs/kbs_curl.c deleted file mode 100644 index eb639db72..000000000 --- a/init/tee/kbs/kbs_curl.c +++ /dev/null @@ -1,232 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include -#include -#include - -#include "kbs.h" - -#define KBS_CURL_ERR(x) \ - printf("%s: %s\n", __func__, x); \ - return -1; - -static CURLcode kbs_curl_set_headers(CURL *, char *); -size_t cwrite(void *, size_t, size_t, void *); - -/* - * Complete a cURL POST request. POST the "in" string and retrieve the contents - * of the POST request "out" string. - * - * Depending on the type of request, some extra headers may need to be set. - * For example, on a KBS REQUEST, no session ID has been retrieved from the - * attestation server so far. Yet, during a KBS_ATTEST request, a session ID - * has been given from the server and must be added to the headers. - */ -int kbs_curl_post(CURL *curl, char *url, char *in, char *out, int type) -{ - CURLcode code; - struct curl_slist *cks; - char full_url[256], *session_id_label, session_id[256]; - - /* - * Neither the input or output strings should be invalid/NULL. - */ - if (!in) { - KBS_CURL_ERR("Input argument NULL"); - } - - if (!out) { - KBS_CURL_ERR("Output argument NULL"); - } - - if (curl_easy_setopt(curl, CURLOPT_POST, 1L) != CURLE_OK) { - KBS_CURL_ERR("CURLOPT_POST"); - } - - if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, cwrite) != CURLE_OK) { - KBS_CURL_ERR("CURLOPT_WRITEFUNCTION"); - } - - /* - * If the operation being completed is a KBS REQUEST, then this is the - * initial request to the attestation server, and there is no session - * ID to make note of. Otherwise, the session ID has been established - * and must be parsed from the cURL cookies data. 
- */ - cks = NULL; - if (type == KBS_CURL_REQ) { - sprintf(full_url, "%s/kbs/v0/auth", url); - - if (curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "") != CURLE_OK) { - KBS_CURL_ERR("CURLOPT_COOKIEFILE"); - } - - if (kbs_curl_set_headers(curl, NULL) != CURLE_OK) { - KBS_CURL_ERR("CURLOPT_HTTPHEADER"); - } - } else { - sprintf(full_url, "%s/kbs/v0/attest", url); - - if (curl_easy_getinfo(curl, CURLINFO_COOKIELIST, &cks) != CURLE_OK) { - KBS_CURL_ERR("CURLOPT_COOKIELIST"); - } - - session_id_label = NULL; - while (cks) { - session_id_label = find_cookie(cks->data, "session_id"); - - if (session_id_label) - break; - cks = cks->next; - } - - if (session_id_label == NULL) { - KBS_CURL_ERR("No session_id cookie found"); - } - - if (read_cookie_val(session_id_label, session_id) < 0) { - KBS_CURL_ERR("No session_id value for cookie"); - } - - if (kbs_curl_set_headers(curl, (char *)session_id) != CURLE_OK) { - KBS_CURL_ERR("CURLOPT_HTTPHEADER"); - } - } - - if (curl_easy_setopt(curl, CURLOPT_URL, full_url) != CURLE_OK) { - KBS_CURL_ERR("CURLOPT_URL"); - } - - /* - * This is a cURL POST request that will write data to the "out" - * argument. "out" is expected to have been allocated beforehand and - * able to hold the full response from the attestation server. - */ - if (curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, (long)strlen(in)) != - CURLE_OK) { - KBS_CURL_ERR("CURLOPT_POSTFIELDSIZE"); - } - - if (curl_easy_setopt(curl, CURLOPT_POSTFIELDS, in) != CURLE_OK) { - KBS_CURL_ERR("CURLOPT_POSTFIELDS"); - } - - if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, out) != CURLE_OK) { - KBS_CURL_ERR("CURLOPT_WRITEDATA"); - } - - code = curl_easy_perform(curl); - if (code != CURLE_OK && code != CURLE_WRITE_ERROR) { - KBS_CURL_ERR("CURL_EASY_PERFORM"); - } - - return 0; -} - -/* - * A cURL GET request. No input is given, and we are simply retrieving data - * from the KBS attestation server. 
- */ -int kbs_curl_get(CURL *curl, char *url, char *wid, char *out, int type) -{ - CURLcode code; - char full_url[100], *session_id_label, session_id[100]; - struct curl_slist *cookies; - - if (type != KBS_CURL_GET_KEY) { - KBS_CURL_ERR("Invalid KBS operation"); - } - - code = curl_easy_getinfo(curl, CURLINFO_COOKIELIST, &cookies); - if (code != CURLE_OK) { - KBS_CURL_ERR("Cannot retrieve cURL cookies"); - } - - /* - * This API is used by kbs_get_key(), therefore we are expected to have - * a valid session ID by this point. Parse the cURL cookies data to find - * this session ID. - */ - while (cookies != NULL) { - session_id_label = find_cookie(cookies->data, "session_id"); - if (session_id_label) - break; - - cookies = cookies->next; - } - - if (session_id_label == NULL) { - KBS_CURL_ERR("Couldn't find cookie labeled\n"); - } - - /* - * Read the session ID and include it in the cURL headers. - */ - if (read_cookie_val(session_id_label, session_id) < 0) { - KBS_CURL_ERR("Couldn't read cookie value\n"); - } - - if (kbs_curl_set_headers(curl, (char *)session_id) != CURLE_OK) { - KBS_CURL_ERR("CURLOPT_HTTPHEADER"); - } - - /* - * The location of the KBS key is located at - * $ATTESTATION_URL/kbs/v0/key/$WORKLOAD_ID. - */ - sprintf(full_url, "%s/kbs/v0/key/%s", url, wid); - - if (curl_easy_setopt(curl, CURLOPT_URL, full_url) != CURLE_OK) { - KBS_CURL_ERR("CURLOPT_URL"); - } - - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, cwrite); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, out); - - code = curl_easy_perform(curl); - if (code != CURLE_OK && code != CURLE_WRITE_ERROR) { - KBS_CURL_ERR("CURL_EASY_PERFORM"); - } - - return 0; -} - -/* - * Set the cURL headers. If the session args is not NULL, that indicates that - * the session ID has been retrieved from attestation server before, and that - * session ID should be included in the headers. 
- */ -static CURLcode kbs_curl_set_headers(CURL *curl, char *session) -{ - struct curl_slist *slist; - char session_buf[512]; - - slist = NULL; - slist = curl_slist_append(slist, "Accept: application/json"); - slist = curl_slist_append(slist, - "Content-Type: application/json; charset=utf-8"); - - /* - * Add the session ID cookie if the session ID exists. - */ - if (session) { - sprintf(session_buf, "Cookie: session_id=%s", session); - curl_slist_append(slist, session_buf); - } - - /* - * Set the headers. - */ - return curl_easy_setopt(curl, CURLOPT_HTTPHEADER, slist); -} - -/* - * Simple strcpy() for attestation server responses. Required by a cURL - * operation that writes data. - */ -size_t cwrite(void *data, size_t size, size_t nmemb, void *userp) -{ - strcpy((char *)userp, (char *)data); - - return size; -} diff --git a/init/tee/kbs/kbs_types.c b/init/tee/kbs/kbs_types.c deleted file mode 100644 index b9f512d01..000000000 --- a/init/tee/kbs/kbs_types.c +++ /dev/null @@ -1,247 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include -#include -#include - -#include -#include -#include -#include - -#include "kbs.h" - -#include "../snp_attest.h" - -static void kbs_attestation_marshal(struct snp_report *, char *, BIGNUM *, - BIGNUM *, char *); -static void kbs_attestation_marshal_tee_pubkey(char *, BIGNUM *, BIGNUM *); - -/* - * Given a TEE architecture and workload ID, write the JSON string of the - * KBS REQUEST. - */ -int kbs_request_marshal(char *json_request, int tee, char *workload_id) -{ - char *teestr; - - /* - * Retrieve the KBS string equivalent of the TEE enum value. - */ - teestr = tee_str(tee); - if (teestr == NULL) - return -1; - - /* - * Build the KBS REQUEST JSON string. - */ - sprintf(json_request, - "{\"extra-params\":\"{\\\"workload_id\\\":\\\"%s\\\"}\",\"tee\":\"%" - "s\",\"version\":\"0.0.0\"}", - workload_id, teestr); - - return 0; -} - -/* - * Peform a KBS CHALLENGE. - * - * "json_request" is the JSON string of the KBS REQUEST. 
- * "nonce" is the output argument to be retrieved from the attestation server. - */ -int kbs_challenge(CURL *curl, char *url, char *json_request, char *nonce) -{ - int ret, rc; - char *nonce_json; - - rc = -1; - - nonce_json = (char *)malloc(0x2000); - if (nonce_json == NULL) { - printf("ERROR: unable to allocate JSON nonce buffer\n"); - - return rc; - } - - ret = kbs_curl_post(curl, url, (void *)json_request, (void *)nonce_json, - KBS_CURL_REQ); - if (ret < 0) { - printf("ERROR: could not complete KBS challenge\n"); - - goto out; - } - - /* - * Parse the JSON response from the KBS server to retrieve the nonce. - */ - if (json_parse_str(nonce, "nonce", nonce_json) < 0) { - printf("ERROR: unable to parse nonce from server response\n"); - - goto out; - } - - rc = 0; - -out: - free(nonce_json); - - return rc; -} - -/* - * Send all required materials (attestation report, certificate chain, etc..) - * to the attestation server for attestation. - */ -int kbs_attest(CURL *curl, char *url, struct snp_report *report, BIGNUM *mod, - BIGNUM *exp, char *gen) -{ - int rc; - char *json, errmsg[200]; - - rc = -1; - json = (char *)malloc(0x1000); - if (json == NULL) { - printf("ERROR: unable to allocate JSON buffer\n"); - - return rc; - } - - /* - * Marshal the kbs_types Attestation JSON struct with the given - * attestation report and certificate chain. - */ - kbs_attestation_marshal(report, json, mod, exp, gen); - - /* - * Ensure the error messaging string is empty, because we will - * eventually read this string as indicator of a cURL attestation - * server error. - */ - strcpy(errmsg, ""); - - if (kbs_curl_post(curl, url, json, errmsg, KBS_CURL_ATTEST) < 0) { - printf("ERROR: could not complete KBS attestation\n"); - - rc = -1; - goto out; - } - - /* - * If there is no error message, it can be assumed that the attestation - * was completed successfully. 
- */ - if (strcmp(errmsg, "") != 0) { - rc = -1; - printf("ATTESTATION ERROR: %s\n", errmsg); - - goto out; - } - - rc = 0; - -out: - free((void *)json); - - return rc; -} - -/* - * Retrieve the secret from the KBS attestation server. - */ -int kbs_get_key(CURL *curl, char *url, char *wid, EVP_PKEY *pkey, char *pass) -{ - int end_idx; - char json[4096]; - char encrypted[4096], *plain; - - /* - * The key is represented as a JSON byte list, copy this JSON list - * string to "json". - */ - if (kbs_curl_get(curl, url, wid, json, KBS_CURL_GET_KEY) < 0) { - printf("ERROR: could not complete KBS passphrase retrieval\n"); - - return -1; - } - - end_idx = strlen(json) - 2; - - memcpy(encrypted, json + 1, end_idx); - encrypted[end_idx] = '\0'; - - if (rsa_pkey_decrypt(pkey, encrypted, &plain) < 0) { - printf("ERROR: could not decrypt passphrase from KBS server\n"); - - return -1; - } - - strcpy(pass, plain); - - OPENSSL_free(plain); - - return 0; -} - -/* - * Marshal a JSON string of the kbs_types Attestation struct from the given - * attestation report and certificate data. - */ -static void kbs_attestation_marshal(struct snp_report *report, char *json, - BIGNUM *mod, BIGNUM *exp, char *gen) -{ - char buf[4096], *report_hexstr; - size_t report_hexstr_len; - - report_hexstr = (char *)malloc(0x1000); - if (report_hexstr == NULL) - return; - - sprintf(buf, "{"); - strcpy(json, buf); - - kbs_attestation_marshal_tee_pubkey(json, mod, exp); - - sprintf(buf, "\"tee-evidence\":\"{"); - strcat(json, buf); - - sprintf(buf, "\\\"gen\\\":\\\"%s\\\",", gen); - strcat(json, buf); - - OPENSSL_buf2hexstr_ex(report_hexstr, 0x1000, &report_hexstr_len, - (unsigned char *)report, sizeof(*report), '\0'); - report_hexstr[report_hexstr_len] = '\0'; - sprintf(buf, "\\\"report\\\":\\\"%s\\\",", report_hexstr); - strcat(json, buf); - - strcat(json, "\\\"cert_chain\\\":\\\"[]\\\"}"); - - strcat(json, "\"}"); -} - -/* - * Marshal a JSON string of the KBS TEE public key. 
- */ -static void kbs_attestation_marshal_tee_pubkey(char *json, BIGNUM *mod, - BIGNUM *exp) -{ - char mod_b64[512], exp_b64[512]; - char buf[1024]; - - if (mod == NULL || exp == NULL) - return; - - BN_b64(mod, mod_b64); - BN_b64(exp, exp_b64); - - sprintf(buf, "\"tee-pubkey\":{"); - strcat(json, buf); - - sprintf(buf, "\"alg\":\"RSA\","); - strcat(json, buf); - - sprintf(buf, "\"k-mod\":\"%s\",", mod_b64); - strcat(json, buf); - - sprintf(buf, "\"k-exp\":\"%s\"},", exp_b64); - strcat(json, buf); -} diff --git a/init/tee/kbs/kbs_util.c b/init/tee/kbs/kbs_util.c deleted file mode 100644 index 6c0b7c878..000000000 --- a/init/tee/kbs/kbs_util.c +++ /dev/null @@ -1,165 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include -#include - -#include "../../jsmn.h" -#include "kbs.h" - -#define MAX_TOKENS 16384 - -static int label_find(char *, char *); - -/* - * Return the string identifier of the inputted TEE architecture. - */ -char *tee_str(int tee) -{ - switch (tee) { - case TEE_SEV: - return "sev"; - case TEE_SGX: - return "sgx"; - case TEE_SNP: - return "snp"; - case TEE_TDX: - return "tdx"; - - /* - * No other TEE architecture is supported. - */ - default: - printf("ERROR: tee_str(): Invalid input\n"); - return NULL; - } -} - -/* - * Parse a given string of cURL cookie data and find the label indicated by the - * "label" argument. This function is essentially a search of a substring - * within a given string. - */ -char *find_cookie(char *cookie_data, char *label) -{ - char *cookie_ptr; - size_t label_len, cookie_len; - - label_len = strlen(label); - cookie_len = strlen(cookie_data); - - cookie_ptr = cookie_data; - for (int i = 0; i < (cookie_len - label_len); i++, cookie_ptr++) { - if (strncmp(cookie_ptr, label, label_len) == 0) - return cookie_ptr; - } - - return NULL; -} - -/* - * From a label in a cURL cookie string, parse its associated value. 
- */ -int read_cookie_val(char *label, char *buf) -{ - char *ptr; - int ws; - - ws = 0; - ptr = label; - for (ptr = label; *ptr != '\0'; ptr++) { - if (*ptr == ' ' || *ptr == '\t') - ws = 1; - else if (ws == 1) { - strcpy(buf, ptr); - - return 0; - } - } - - return -1; -} - -/* - * Given a JSON string and a "label", parse the string associated with that - * label and write the contents to "out". - */ -int json_parse_str(char *out, char *label, char *json) -{ - int ntokens, eq, rc; - jsmn_parser parser; - jsmntok_t *tokens, *curr, *next; - char *val; - int len; - - rc = -1; - - tokens = (jsmntok_t *)malloc(MAX_TOKENS * sizeof(jsmntok_t)); - if (tokens == NULL) { - printf("ERROR: unable to allocate JSON string\n"); - - return rc; - } - - jsmn_init(&parser); - - ntokens = jsmn_parse(&parser, json, strlen(json), tokens, MAX_TOKENS); - if (ntokens <= 0) { - printf("ERROR: unable to find any tokens in KBS challenge\n"); - - goto out; - } - - /* - * Traverse each token of the JSON string. - */ - for (int i = 0; i < ntokens - 1; i++) { - curr = &tokens[i]; - next = &tokens[i + 1]; - - /* - * Only interested in reading a string. - */ - if (curr->type != JSMN_STRING) - continue; - - /* - * Compare the current token with the label being searched for. - */ - eq = label_find(label, json + curr->start); - if (eq && next->type == JSMN_STRING) { - /* - * Found the string associated with the label, calculate - * its beginning and ending indexes within the JSON - * string and copy the contents over to "out". 
- */ - val = json + next->start; - len = next->end - next->start; - - memcpy((void *)out, (void *)val, len); - rc = 0; - - goto out; - } - } - -out: - free((void *)tokens); - - return rc; -} - -static int label_find(char *label, char *str) -{ - size_t label_sz; - - label_sz = strlen(label); - - for (int i = 0; i < label_sz; i++) { - if (label[i] != str[i]) - return 0; - if (label[i] != '\0') - continue; - } - - return 1; -} diff --git a/init/tee/snp_attest.c b/init/tee/snp_attest.c deleted file mode 100644 index 8303705ff..000000000 --- a/init/tee/snp_attest.c +++ /dev/null @@ -1,220 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include -#include -#include - -#include "kbs/kbs.h" -#include "snp_attest.h" - -#define NONCE_MAX 1024 -#define JSON_MAX 1024 -#define GEN_MAX 32 - -static int snp_get_report(const uint8_t *, size_t, struct snp_report *); -static int SNP_ATTEST_ERR(char *); -static void json_fmt(char *); - -int snp_attest(char *pass, char *url, char *wid, char *tee_data) -{ - CURL *curl; - char nonce[NONCE_MAX], json[JSON_MAX], gen[GEN_MAX]; - struct snp_report report; - EVP_PKEY *pkey; - BIGNUM *n, *e; - unsigned int hash_size; - uint8_t *hash; - - if (kbs_request_marshal(json, TEE_SNP, wid) < 0) - return SNP_ATTEST_ERR("Unable to marshal KBS REQUEST"); - - curl = curl_easy_init(); - if (curl == NULL) - return SNP_ATTEST_ERR("Unable to initialize cURL instance"); - - if (kbs_challenge(curl, url, json, nonce) < 0) - return SNP_ATTEST_ERR("Unable to retrieve nonce from server"); - - json_fmt(tee_data); - if (json_parse_str(gen, "gen", tee_data) < 0) - return SNP_ATTEST_ERR("Unable to retrieve SNP generation"); - - n = e = NULL; - if (kbs_tee_pubkey_create(&pkey, &n, &e) < 0) - return SNP_ATTEST_ERR("Unable to create TEE public key"); - - if (kbs_nonce_pubkey_hash(nonce, pkey, &hash, &hash_size) < 0) - return SNP_ATTEST_ERR("Unable to hash nonce and 
public key"); - - if (snp_get_report(hash, hash_size, &report) != EXIT_SUCCESS) - return SNP_ATTEST_ERR("Unable to retrieve attestation report"); - - if (kbs_attest(curl, url, &report, n, e, gen) < 0) - return SNP_ATTEST_ERR("Unable to complete KBS ATTESTATION"); - - curl_easy_reset(curl); - - if (kbs_get_key(curl, url, wid, pkey, pass) < 0) - return SNP_ATTEST_ERR("Unable to retrieve passphrase"); - - return 0; -} - -/* - * A function for the SNP_GET_REPORT ioctl. - * - * SNP_GET_REPORT fills both the attestation report and the certificate - * data. - */ -static int snp_get_report(const uint8_t *data, size_t data_sz, - struct snp_report *report) -{ - int rc = EXIT_FAILURE; - int fd = -1; - struct snp_report_req req; - struct snp_report_resp resp; - struct snp_guest_request_ioctl guest_req; - struct msg_report_resp *report_resp = (struct msg_report_resp *)&resp.data; - - /* - * The kernel will attempt to fill the report, certs, and certs_size, - * Therefore, none of these values can be NULL. - */ - if (report == NULL) { - printf("report is NULL\n"); - rc = EINVAL; - - goto out; - } - - /* - * We will be filling the user_data field of the request with "data". - * Ensure that the data is valid and can fit in the user_data field. - */ - if (data && (data_sz > sizeof(req.user_data) || data_sz == 0)) { - rc = EINVAL; - - goto out; - } - - /* - * Initialize data structures. - */ - memset(&req, 0, sizeof(req)); - - /* - * Copy the data into user_data if it exists. - */ - if (data) - memcpy(&req.user_data, data, data_sz); - - memset(&resp, 0, sizeof(resp)); - - memset(&guest_req, 0, sizeof(guest_req)); - guest_req.msg_version = 1; - guest_req.req_data = (__u64)&req; - guest_req.resp_data = (__u64)&resp; - - /* - * Open the SEV guest device. - */ - errno = 0; - fd = open(SEV_GUEST_DEV, O_RDWR); - if (fd == -1) { - rc = errno; - perror("open"); - - goto out; - } - - /* - * Retrieve the SNP attestation report. 
- */ - errno = 0; - rc = ioctl(fd, SNP_GET_REPORT, &guest_req); - if (rc == -1) { - rc = errno; - perror("ioctl"); - fprintf(stderr, "errno is %u\n", errno); - fprintf(stderr, "firmware error %#llx\n", guest_req.fw_err); - fprintf(stderr, "report error %x\n", report_resp->status); - - goto out_close; - } - - /* - * Ensure that the report was successfully generated. - */ - if (report_resp->status != 0) { - fprintf(stderr, "firmware error %x\n", report_resp->status); - rc = report_resp->status; - - goto out_close; - } else if (report_resp->report_size > sizeof(*report)) { - fprintf(stderr, "report size is %u bytes (expected %lu)!\n", - report_resp->report_size, sizeof(*report)); - rc = EFBIG; - - goto out_close; - } - - /* - * Copy the report + certs data. - */ - memcpy(report, &report_resp->report, report_resp->report_size); - rc = EXIT_SUCCESS; - -out_close: - if (fd > 0) { - close(fd); - fd = -1; - } -out: - return rc; -} - -static int SNP_ATTEST_ERR(char *errmsg) -{ - printf("SNP ATTEST ERROR: %s\n", errmsg); - - return -1; -} - -/* - * String format an unformatted JSON string: - * - * For example, this string: - * "{\"test\":\"123\"}" - * - * Would become: - * "{"test":"123"}" - */ -static void json_fmt(char *str) -{ - char cpy[strlen(str)]; - size_t sz, cpy_idx; - - sz = strlen(str); - cpy_idx = 0; - - for (int i = 0; i < sz; i++) { - if (str[i] != '\\') - cpy[cpy_idx++] = str[i]; - } - cpy[cpy_idx] = '\0'; - - strcpy(str, cpy); -} diff --git a/init/tee/snp_attest.h b/init/tee/snp_attest.h deleted file mode 100644 index d923d9b76..000000000 --- a/init/tee/snp_attest.h +++ /dev/null @@ -1,107 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#ifndef _SNP_ATTEST -#define _SNP_ATTEST - -#include - -#include - -#define SEV_GUEST_DEV "/dev/sev-guest" - -/* - * Cryptographic signature (should be signed by the VCEK). 
- */ -struct signature { - uint8_t r[72]; - uint8_t s[72]; - uint8_t reserved[512 - 144]; -}; - -/* - * Structure containing the security version numbers of each component in the - * Trusted Computing Base (TCB) of the SNP firmware. - */ -union tcb_version { - struct { - uint8_t boot_loader; - uint8_t tee; - uint8_t reserved[4]; - uint8_t snp; - uint8_t microcode; - }; - uint64_t raw; -}; - -/* - * An array of certificates. Consult the AMD SEV GHCB document to understand how - * this table should be built and parsed. - */ -struct cert_table { - struct cert_table_entry { - uuid_t guid; - uint32_t offset; - uint32_t len; - } *entry; -}; - -/* - * SNP attestation report structure. Based off of the attestation report - * structure described in firmware version 1.52. - */ -struct snp_report { - uint32_t version; - uint32_t guest_svn; - uint64_t policy; - uint8_t family_id[16]; - uint8_t image_id[16]; - uint32_t vmpl; - uint32_t signature_algo; - union tcb_version current_tcb; - - /* - * TODO: Change to a "struct platform_info". - */ - uint64_t platform_info; - - uint32_t author_key_en : 1; - uint32_t _reserved_0 : 31; - uint32_t _reserved_1; - uint8_t report_data[64]; - uint8_t measurement[48]; - uint8_t host_data[32]; - uint8_t id_key_digest[48]; - uint8_t author_key_digest[48]; - uint8_t report_id[32]; - uint8_t report_id_ma[32]; - union tcb_version reported_tcb; - uint8_t _reserved_2[24]; - uint8_t chip_id[64]; - union tcb_version committed_tcb; - uint8_t current_build; - uint8_t current_minor; - uint8_t current_major; - uint8_t _reserved_3; - uint8_t committed_build; - uint8_t committed_minor; - uint8_t committed_major; - uint8_t _reserved_4; - union tcb_version launch_tcb; - uint8_t _reserved_5[168]; - struct signature signature; -}; - -/* - * Response from the SNP_GET_EXT_REPORT ioctl. 
- */ -struct msg_report_resp { - uint32_t status; - uint32_t report_size; - uint8_t reserved[0x20 - 0x8]; - struct snp_report report; -}; - -// snp_attest.c -int snp_attest(char *, char *, char *, char *); - -#endif /* _SNP_ATTEST */ diff --git a/src/devices/Cargo.toml b/src/devices/Cargo.toml index ab6ecfe2a..6b492fbf3 100644 --- a/src/devices/Cargo.toml +++ b/src/devices/Cargo.toml @@ -3,7 +3,7 @@ name = "krun-devices" version = "0.1.0-1.18.0" authors = ["The libkrun Authors"] edition = "2021" -build = "build.rs" + description = "Virtual device emulation for libkrun" license = "Apache-2.0" repository = "https://github.com/containers/libkrun" @@ -20,6 +20,7 @@ input = ["zerocopy", "krun_input"] virgl_resource_map2 = [] aws-nitro = [] test_utils = [] +timesync = [] vhost-user = ["vhost", "vmm-sys-util"] [dependencies] diff --git a/src/devices/build.rs b/src/devices/build.rs deleted file mode 100644 index 49a4346d2..000000000 --- a/src/devices/build.rs +++ /dev/null @@ -1,68 +0,0 @@ -use std::ffi::OsStr; -use std::path::PathBuf; -use std::process::Command; - -fn build_default_init() -> PathBuf { - let manifest_dir = PathBuf::from(std::env::var_os("CARGO_MANIFEST_DIR").unwrap()); - let libkrun_root = manifest_dir.join("../.."); - let init_src = libkrun_root.join("init/init.c"); - let dhcp_src = libkrun_root.join("init/dhcp.c"); - - let out_dir = PathBuf::from(std::env::var_os("OUT_DIR").unwrap()); - let init_bin = out_dir.join("init"); - - println!("cargo:rerun-if-env-changed=CC_LINUX"); - println!("cargo:rerun-if-env-changed=CC"); - println!("cargo:rerun-if-env-changed=TIMESYNC"); - println!("cargo:rerun-if-changed={}", init_src.display()); - println!("cargo:rerun-if-changed={}", dhcp_src.display()); - println!( - "cargo:rerun-if-changed={}", - libkrun_root.join("init/jsmn.h").display() - ); - println!( - "cargo:rerun-if-changed={}", - libkrun_root.join("init/dhcp.h").display() - ); - - let mut init_cc_flags = vec!["-O2", "-static", "-Wall"]; - if 
std::env::var_os("TIMESYNC").as_deref() == Some(OsStr::new("1")) { - init_cc_flags.push("-D__TIMESYNC__"); - } - - let cc_value = std::env::var("CC_LINUX") - .or_else(|_| std::env::var("CC")) - .unwrap_or_else(|_| "cc".to_string()); - let mut cc_parts = cc_value.split_ascii_whitespace(); - let cc = cc_parts.next().expect("CC_LINUX/CC must not be empty"); - let status = Command::new(cc) - .args(cc_parts) - .args(&init_cc_flags) - .arg("-o") - .arg(&init_bin) - .arg(&init_src) - .arg(&dhcp_src) - .status() - .unwrap_or_else(|e| panic!("failed to execute {cc}: {e}")); - - if !status.success() { - panic!("failed to compile init/init.c: {status}"); - } - init_bin -} - -fn main() { - let init_binary_path = std::env::var_os("KRUN_INIT_BINARY_PATH") - .map(PathBuf::from) - .unwrap_or_else(|| { - let init_path = build_default_init(); - // SAFETY: The build script is single threaded. - unsafe { std::env::set_var("KRUN_INIT_BINARY_PATH", &init_path) }; - init_path - }); - println!( - "cargo:rustc-env=KRUN_INIT_BINARY_PATH={}", - init_binary_path.display() - ); - println!("cargo:rerun-if-env-changed=KRUN_INIT_BINARY_PATH"); -} diff --git a/src/devices/src/virtio/fs/augment_fs.rs b/src/devices/src/virtio/fs/augment_fs.rs new file mode 100644 index 000000000..351f7bf98 --- /dev/null +++ b/src/devices/src/virtio/fs/augment_fs.rs @@ -0,0 +1,740 @@ +// Virtual inode overlay for virtiofs. +// +// `AugmentFs` wraps an inner `FileSystem` implementation and intercepts +// FUSE operations for virtual inodes — synthetic read-only files that exist +// only in memory. All other operations are delegated to the inner filesystem. +// +// Virtual inodes are injected into the root directory (parent = ROOT_ID) and +// are currently only accessible via lookup (they do not appear in readdir). +// +// One-shot files can only be looked up once — the name is removed from the +// directory on first lookup so subsequent lookups return ENOENT. 
+ +#[cfg(target_os = "macos")] +use crossbeam_channel::Sender; +use std::collections::HashMap; +use std::ffi::CStr; +use std::ffi::CString; +use std::io; +use std::sync::atomic::{AtomicI32, Ordering}; +use std::sync::Arc; +use std::sync::RwLock; +use std::time::Duration; + +#[cfg(target_os = "macos")] +use utils::worker_message::WorkerMessage; + +use super::filesystem::{ + Context, DirEntry, Entry, Extensions, FileSystem, FsOptions, GetxattrReply, ListxattrReply, + OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, +}; +use super::fuse; +use super::inode_alloc::InodeAllocator; +use super::virtual_inode::{VirtualEntry, VirtualInode}; +use crate::virtio::bindings; + +type Inode = u64; +type Handle = u64; + +/// Sentinel handle returned for virtual file opens. The inner filesystem's +/// handle allocator starts at 1 so this never collides. +const VIRTUAL_HANDLE: Handle = 0; + +/// Virtual entries never change; use a large cache timeout. +const VIRTUAL_TIMEOUT: Duration = Duration::from_secs(86400); + +// Use Linux errno values, not host values. The guest always runs Linux +// and the FUSE server passes error codes through without translation. +const LINUX_ENOENT: i32 = 2; +const LINUX_EACCES: i32 = 13; +const LINUX_EEXIST: i32 = 17; +const LINUX_EXDEV: i32 = 18; +const LINUX_EINVAL: i32 = 22; +const LINUX_EPERM: i32 = 1; +const LINUX_ENOSYS: i32 = 38; +const LINUX_ENODATA: i32 = 61; +const LINUX_ENXIO: i32 = 6; + +fn eperm() -> io::Error { + io::Error::from_raw_os_error(LINUX_EPERM) +} + +/// Overlay that injects virtual inodes into an inner `FileSystem`. +pub struct AugmentFs { + inner: T, + /// Maps (parent_inode, name) → child inode number. One-shot entries + /// are removed on first lookup so the file can only be opened once. + name_to_inode: RwLock>, + /// Maps virtual inode number → (mode, inode data). One-shot entries are + /// removed from this map on release. + inodes: RwLock>, +} + +impl> AugmentFs { + /// Create a new overlay. 
+ /// + /// `entries` are registered as virtual inodes in the root directory. + /// Inode numbers are obtained from `inode_alloc`, the same allocator + /// used by the inner filesystem. + pub fn new(inner: T, inode_alloc: &InodeAllocator, entries: Vec) -> Self { + let mut name_to_inode = HashMap::new(); + let mut inodes = HashMap::new(); + + Self::register_entries( + fuse::ROOT_ID, + entries, + inode_alloc, + &mut name_to_inode, + &mut inodes, + ); + + Self { + inner, + name_to_inode: RwLock::new(name_to_inode), + inodes: RwLock::new(inodes), + } + } + + fn register_entries( + parent: Inode, + entries: Vec, + inode_alloc: &InodeAllocator, + name_to_inode: &mut HashMap<(Inode, CString), Inode>, + inodes: &mut HashMap, + ) { + for entry in entries { + let ino = inode_alloc.next(); + name_to_inode.insert((parent, entry.name), ino); + + // Recurse into directory children before moving the inode. + if let VirtualInode::Dir { children } = entry.inode { + Self::register_entries(ino, children, inode_alloc, name_to_inode, inodes); + inodes.insert( + ino, + ( + entry.mode, + VirtualInode::Dir { + children: Vec::new(), + }, + ), + ); + } else { + inodes.insert(ino, (entry.mode, entry.inode)); + } + } + } + + fn is_virtual(&self, inode: Inode) -> bool { + self.inodes.read().unwrap().contains_key(&inode) + } +} + +impl> FileSystem for AugmentFs { + type Inode = Inode; + type Handle = Handle; + + fn init(&self, capable: FsOptions) -> io::Result { + self.inner.init(capable) + } + + fn destroy(&self) { + self.inner.destroy() + } + + fn lookup(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result { + let key = (parent, CString::from(name)); + let inode = self.name_to_inode.read().unwrap().get(&key).copied(); + if let Some(inode) = inode { + let inodes = self.inodes.read().unwrap(); + if let Some((mode, vnode)) = inodes.get(&inode) { + let one_shot = vnode.is_one_shot(); + let st = vnode.stat(inode, *mode); + let entry_timeout = if one_shot { + Duration::ZERO + } else { + 
VIRTUAL_TIMEOUT + }; + + if one_shot { + drop(inodes); + self.name_to_inode.write().unwrap().remove(&key); + } + + return Ok(Entry { + inode, + generation: 0, + attr: st, + attr_flags: 0, + attr_timeout: VIRTUAL_TIMEOUT, + entry_timeout, + }); + } + } + self.inner.lookup(ctx, parent, name) + } + + fn forget(&self, ctx: Context, inode: Inode, count: u64) { + if !self.is_virtual(inode) { + self.inner.forget(ctx, inode, count) + } + } + + fn batch_forget(&self, ctx: Context, requests: Vec<(Inode, u64)>) { + let real: Vec<_> = requests + .into_iter() + .filter(|(ino, _)| !self.is_virtual(*ino)) + .collect(); + if !real.is_empty() { + self.inner.batch_forget(ctx, real); + } + } + + fn getattr( + &self, + ctx: Context, + inode: Inode, + handle: Option, + ) -> io::Result<(bindings::stat64, Duration)> { + { + let inodes = self.inodes.read().unwrap(); + if let Some((mode, vnode)) = inodes.get(&inode) { + let st = vnode.stat(inode, *mode); + return Ok((st, VIRTUAL_TIMEOUT)); + } + } + self.inner.getattr(ctx, inode, handle) + } + + fn setattr( + &self, + ctx: Context, + inode: Inode, + attr: bindings::stat64, + handle: Option, + valid: SetattrValid, + ) -> io::Result<(bindings::stat64, Duration)> { + if self.is_virtual(inode) { + return Err(eperm()); + } + self.inner.setattr(ctx, inode, attr, handle, valid) + } + + fn readlink(&self, ctx: Context, inode: Inode) -> io::Result> { + if self.is_virtual(inode) { + return Err(io::Error::from_raw_os_error(LINUX_EINVAL)); + } + self.inner.readlink(ctx, inode) + } + + fn symlink( + &self, + ctx: Context, + linkname: &CStr, + parent: Inode, + name: &CStr, + extensions: Extensions, + ) -> io::Result { + self.inner.symlink(ctx, linkname, parent, name, extensions) + } + + fn mknod( + &self, + ctx: Context, + inode: Inode, + name: &CStr, + mode: u32, + rdev: u32, + umask: u32, + extensions: Extensions, + ) -> io::Result { + self.inner + .mknod(ctx, inode, name, mode, rdev, umask, extensions) + } + + fn mkdir( + &self, + ctx: Context, + 
parent: Inode, + name: &CStr, + mode: u32, + umask: u32, + extensions: Extensions, + ) -> io::Result { + let key = (parent, CString::from(name)); + if self.name_to_inode.read().unwrap().contains_key(&key) { + return Err(io::Error::from_raw_os_error(libc::EEXIST)); + } + self.inner.mkdir(ctx, parent, name, mode, umask, extensions) + } + + fn unlink(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { + self.inner.unlink(ctx, parent, name) + } + + fn rmdir(&self, ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> { + self.inner.rmdir(ctx, parent, name) + } + + fn rename( + &self, + ctx: Context, + olddir: Inode, + oldname: &CStr, + newdir: Inode, + newname: &CStr, + flags: u32, + ) -> io::Result<()> { + self.inner + .rename(ctx, olddir, oldname, newdir, newname, flags) + } + + fn link( + &self, + ctx: Context, + inode: Inode, + newparent: Inode, + newname: &CStr, + ) -> io::Result { + if self.is_virtual(inode) { + return Err(eperm()); + } + self.inner.link(ctx, inode, newparent, newname) + } + + fn open( + &self, + ctx: Context, + inode: Inode, + kill_priv: bool, + flags: u32, + ) -> io::Result<(Option, OpenOptions)> { + if self.is_virtual(inode) { + if (flags as i32 & libc::O_ACCMODE) != libc::O_RDONLY { + return Err(io::Error::from_raw_os_error(libc::EACCES)); + } + return Ok((Some(VIRTUAL_HANDLE), OpenOptions::empty())); + } + self.inner.open(ctx, inode, kill_priv, flags) + } + + fn create( + &self, + ctx: Context, + parent: Inode, + name: &CStr, + mode: u32, + kill_priv: bool, + flags: u32, + umask: u32, + extensions: Extensions, + ) -> io::Result<(Entry, Option, OpenOptions)> { + self.inner + .create(ctx, parent, name, mode, kill_priv, flags, umask, extensions) + } + + fn read( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + mut w: W, + size: u32, + offset: u64, + lock_owner: Option, + flags: u32, + ) -> io::Result { + { + let inodes = self.inodes.read().unwrap(); + if let Some((_, vnode)) = inodes.get(&inode) { + let data = 
vnode.data(); + let off: usize = offset + .try_into() + .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))?; + if off >= data.len() { + return Ok(0); } + let remaining = data.len() - off; + let len = remaining.min(size as usize); + return w.write(&data[off..(off + len)]); + } + } + self.inner + .read(ctx, inode, handle, w, size, offset, lock_owner, flags) + } + + fn write( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + r: R, + size: u32, + offset: u64, + lock_owner: Option, + delayed_write: bool, + kill_priv: bool, + flags: u32, + ) -> io::Result { + if self.is_virtual(inode) { + return Err(eperm()); + } + self.inner.write( + ctx, + inode, + handle, + r, + size, + offset, + lock_owner, + delayed_write, + kill_priv, + flags, + ) + } + + fn flush(&self, ctx: Context, inode: Inode, handle: Handle, lock_owner: u64) -> io::Result<()> { + if self.is_virtual(inode) { + return Ok(()); + } + self.inner.flush(ctx, inode, handle, lock_owner) + } + + fn fsync(&self, ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> { + if self.is_virtual(inode) { + return Ok(()); + } + self.inner.fsync(ctx, inode, datasync, handle) + } + + fn fallocate( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + mode: u32, + offset: u64, + length: u64, + ) -> io::Result<()> { + if self.is_virtual(inode) { + return Err(eperm()); + } + self.inner + .fallocate(ctx, inode, handle, mode, offset, length) + } + + fn release( + &self, + ctx: Context, + inode: Inode, + flags: u32, + handle: Handle, + flush: bool, + flock_release: bool, + lock_owner: Option, + ) -> io::Result<()> { + { + let mut inodes = self.inodes.write().unwrap(); + if let Some((_, vnode)) = inodes.get(&inode) { + if vnode.is_one_shot() { + inodes.remove(&inode); + } + return Ok(()); + } + } + self.inner + .release(ctx, inode, flags, handle, flush, flock_release, lock_owner) + } + + fn statfs(&self, ctx: Context, inode: Inode) -> io::Result { + self.inner.statfs(ctx, inode) + } + + fn 
getxattr( + &self, + ctx: Context, + inode: Inode, + name: &CStr, + size: u32, + ) -> io::Result { + if self.is_virtual(inode) { + return Err(io::Error::from_raw_os_error(LINUX_ENODATA)); + } + self.inner.getxattr(ctx, inode, name, size) + } + + fn listxattr(&self, ctx: Context, inode: Inode, size: u32) -> io::Result { + if self.is_virtual(inode) { + if size == 0 { + return Ok(ListxattrReply::Count(0)); + } + return Ok(ListxattrReply::Names(Vec::new())); + } + self.inner.listxattr(ctx, inode, size) + } + + fn setxattr( + &self, + ctx: Context, + inode: Inode, + name: &CStr, + value: &[u8], + flags: u32, + ) -> io::Result<()> { + if self.is_virtual(inode) { + return Err(eperm()); + } + self.inner.setxattr(ctx, inode, name, value, flags) + } + + fn removexattr(&self, ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> { + if self.is_virtual(inode) { + return Err(eperm()); + } + self.inner.removexattr(ctx, inode, name) + } + + fn opendir( + &self, + ctx: Context, + inode: Inode, + flags: u32, + ) -> io::Result<(Option, OpenOptions)> { + self.inner.opendir(ctx, inode, flags) + } + + fn readdir( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + size: u32, + offset: u64, + add_entry: F, + ) -> io::Result<()> + where + F: FnMut(DirEntry) -> io::Result, + { + self.inner + .readdir(ctx, inode, handle, size, offset, add_entry) + } + + fn readdirplus( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + size: u32, + offset: u64, + add_entry: F, + ) -> io::Result<()> + where + F: FnMut(DirEntry, Entry) -> io::Result, + { + self.inner + .readdirplus(ctx, inode, handle, size, offset, add_entry) + } + + fn fsyncdir( + &self, + ctx: Context, + inode: Inode, + datasync: bool, + handle: Handle, + ) -> io::Result<()> { + self.inner.fsyncdir(ctx, inode, datasync, handle) + } + + fn releasedir(&self, ctx: Context, inode: Inode, flags: u32, handle: Handle) -> io::Result<()> { + self.inner.releasedir(ctx, inode, flags, handle) + } + + fn access(&self, ctx: 
Context, inode: Inode, mask: u32) -> io::Result<()> { + if self.is_virtual(inode) { + if mask & (libc::W_OK as u32) != 0 { + return Err(io::Error::from_raw_os_error(LINUX_EACCES)); + } + return Ok(()); + } + self.inner.access(ctx, inode, mask) + } + + fn lseek( + &self, + ctx: Context, + inode: Inode, + _handle: Handle, + offset: u64, + whence: u32, + ) -> io::Result { + { + let inodes = self.inodes.read().unwrap(); + if let Some((_, vnode)) = inodes.get(&inode) { + let size = vnode.data().len() as u64; + // FUSE lseek is only called for SEEK_DATA/SEEK_HOLE. + return match whence as i32 { + libc::SEEK_DATA => { + if offset < size { + Ok(offset) + } else { + Err(io::Error::from_raw_os_error(LINUX_ENXIO)) + } + } + libc::SEEK_HOLE => { + if offset < size { + Ok(size) + } else { + Err(io::Error::from_raw_os_error(LINUX_ENXIO)) + } + } + _ => Err(io::Error::from_raw_os_error(LINUX_EINVAL)), + }; + } + } + self.inner.lseek(ctx, inode, _handle, offset, whence) + } + + fn copyfilerange( + &self, + ctx: Context, + inode_in: Inode, + handle_in: Handle, + offset_in: u64, + inode_out: Inode, + handle_out: Handle, + offset_out: u64, + len: u64, + flags: u64, + ) -> io::Result { + // Virtual inodes don't have real file descriptors, so copy_file_range + // cannot work. Return EXDEV to tell the kernel to fall back to + // read+write. 
+ if self.is_virtual(inode_in) || self.is_virtual(inode_out) { + return Err(io::Error::from_raw_os_error(LINUX_EXDEV)); + } + self.inner.copyfilerange( + ctx, inode_in, handle_in, offset_in, inode_out, handle_out, offset_out, len, flags, + ) + } + + fn setupmapping( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + foffset: u64, + len: u64, + flags: u64, + moffset: u64, + host_shm_base: u64, + shm_size: u64, + #[cfg(target_os = "macos")] map_sender: &Option>, + ) -> io::Result<()> { + { + let inodes = self.inodes.read().unwrap(); + if let Some((_, vnode)) = inodes.get(&inode) { + let data = vnode.data(); + #[cfg(target_os = "linux")] + { + if (moffset + len) > shm_size { + return Err(io::Error::from_raw_os_error(LINUX_EINVAL)); + } + + let addr = host_shm_base + moffset; + let ret = unsafe { + libc::mmap( + addr as *mut libc::c_void, + len as usize, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_FIXED, + -1, + 0, + ) + }; + if std::ptr::eq(ret, libc::MAP_FAILED) { + return Err(io::Error::last_os_error()); + } + + let foff = foffset as usize; + if foff < data.len() { + let available = data.len() - foff; + let to_copy = (len as usize).min(available); + unsafe { + libc::memcpy( + addr as *mut libc::c_void, + data.as_ptr().add(foff) as *const _, + to_copy, + ) + }; + } + + return Ok(()); + } + + // TODO: implement DAX for virtual files on macOS. + // Needs a shared memory region manager (see setupmapping + // in macos/passthrough.rs for the real-file DAX path). 
+ #[cfg(target_os = "macos")] + { + let _ = data; + return Err(io::Error::from_raw_os_error(libc::ENOSYS)); + } } + } + self.inner.setupmapping( + ctx, + inode, + handle, + foffset, + len, + flags, + moffset, + host_shm_base, + shm_size, + #[cfg(target_os = "macos")] + map_sender, + ) + } + + fn removemapping( + &self, + ctx: Context, + requests: Vec, + host_shm_base: u64, + shm_size: u64, + #[cfg(target_os = "macos")] map_sender: &Option>, + ) -> io::Result<()> { + self.inner.removemapping( + ctx, + requests, + host_shm_base, + shm_size, + #[cfg(target_os = "macos")] + map_sender, + ) + } + + fn ioctl( + &self, + ctx: Context, + inode: Inode, + handle: Handle, + flags: u32, + cmd: u32, + arg: u64, + in_size: u32, + out_size: u32, + exit_code: &Arc, + ) -> io::Result> { + // The ioctl cmd values use Linux encoding regardless of host OS + // because the guest always runs Linux. + const VIRTIO_IOC_EXIT_CODE_REQ: u32 = 0x7602; + + match cmd { + VIRTIO_IOC_EXIT_CODE_REQ => { + exit_code.store(arg as i32, Ordering::SeqCst); + Ok(Vec::new()) + } + _ => self.inner.ioctl( + ctx, inode, handle, flags, cmd, arg, in_size, out_size, exit_code, + ), + } + } +} diff --git a/src/devices/src/virtio/fs/device.rs b/src/devices/src/virtio/fs/device.rs index bc877bc24..b01a296e7 100644 --- a/src/devices/src/virtio/fs/device.rs +++ b/src/devices/src/virtio/fs/device.rs @@ -17,6 +17,7 @@ use super::super::{ VirtioShmRegion, }; use super::passthrough; +use super::virtual_inode::VirtualEntry; use super::worker::FsWorker; use super::ExportTable; use super::{defs, defs::uapi}; @@ -46,8 +47,9 @@ pub struct Fs { device_state: DeviceState, config: VirtioFsConfig, shm_region: Option, - passthrough_cfg: passthrough::Config, + passthrough_cfg: Option, read_only: bool, + virtual_entries: Vec, worker_thread: Option>, worker_stopfd: EventFd, exit_code: Arc, @@ -58,10 +60,10 @@ pub struct Fs { impl Fs { pub fn new( fs_id: String, - shared_dir: String, + shared_dir: Option, exit_code: Arc, - 
allow_root_dir_delete: bool, read_only: bool, + virtual_entries: Vec, ) -> super::Result { let avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX); @@ -70,11 +72,10 @@ impl Fs { config.tag[..tag.len()].copy_from_slice(tag.as_slice()); config.num_request_queues = 1; - let fs_cfg = passthrough::Config { - root_dir: shared_dir, - allow_root_dir_delete, + let fs_cfg = shared_dir.map(|root_dir| passthrough::Config { + root_dir, ..Default::default() - }; + }); Ok(Fs { avail_features, @@ -84,6 +85,7 @@ impl Fs { shm_region: None, passthrough_cfg: fs_cfg, read_only, + virtual_entries, worker_thread: None, worker_stopfd: EventFd::new(EFD_NONBLOCK).map_err(FsError::EventFd)?, exit_code, @@ -103,10 +105,14 @@ impl Fs { pub fn set_export_table(&mut self, export_table: ExportTable) -> u64 { static FS_UNIQUE_ID: AtomicU64 = AtomicU64::new(0); - self.passthrough_cfg.export_fsid = FS_UNIQUE_ID.fetch_add(1, Ordering::Relaxed); - self.passthrough_cfg.export_table = Some(export_table); + let cfg = self + .passthrough_cfg + .as_mut() + .expect("export_table requires a passthrough filesystem"); + cfg.export_fsid = FS_UNIQUE_ID.fetch_add(1, Ordering::Relaxed); + cfg.export_table = Some(export_table); - self.passthrough_cfg.export_fsid + cfg.export_fsid } #[cfg(target_os = "macos")] @@ -180,6 +186,7 @@ impl VirtioDevice for Fs { queue_evts.push(dq.event); } + let virtual_entries = std::mem::take(&mut self.virtual_entries); let worker = FsWorker::new( worker_queues, queue_evts, @@ -188,6 +195,7 @@ impl VirtioDevice for Fs { self.shm_region.clone(), self.passthrough_cfg.clone(), self.read_only, + virtual_entries, self.worker_stopfd.try_clone().unwrap(), self.exit_code.clone(), #[cfg(target_os = "macos")] diff --git a/src/devices/src/virtio/fs/inode_alloc.rs b/src/devices/src/virtio/fs/inode_alloc.rs new file mode 100644 index 000000000..1919b1406 --- /dev/null +++ b/src/devices/src/virtio/fs/inode_alloc.rs @@ -0,0 +1,28 @@ +use std::sync::atomic::{AtomicU64, 
Ordering}; + +use super::fuse; + +/// Allocates unique FUSE inode numbers. +/// +/// FUSE inode numbers are opaque identifiers with two reserved values: +/// - `0` — invalid / negative-entry cache sentinel (never allocated) +/// - `1` (`ROOT_ID`) — the root directory of the filesystem +/// +/// All other numbers are allocated sequentially starting from `ROOT_ID + 1`. +/// The allocator is `Send + Sync` and safe to share across threads. +pub struct InodeAllocator { + next: AtomicU64, +} + +impl InodeAllocator { + pub fn new() -> Self { + Self { + next: AtomicU64::new(fuse::ROOT_ID + 1), + } + } + + /// Allocate the next inode number. Each call returns a unique value. + pub fn next(&self) -> u64 { + self.next.fetch_add(1, Ordering::Relaxed) + } +} diff --git a/src/devices/src/virtio/fs/linux/passthrough.rs b/src/devices/src/virtio/fs/linux/passthrough.rs index e5ca21a03..8272a7e01 100644 --- a/src/devices/src/virtio/fs/linux/passthrough.rs +++ b/src/devices/src/virtio/fs/linux/passthrough.rs @@ -16,7 +16,7 @@ use std::sync::{Arc, RwLock}; use std::time::Duration; use caps::{has_cap, CapSet, Capability}; -use nix::{request_code_none, request_code_read}; +use nix::request_code_read; use vm_memory::ByteValued; @@ -25,15 +25,13 @@ use super::super::filesystem::{ ListxattrReply, OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use super::super::fuse; +use super::super::inode_alloc::InodeAllocator; use super::super::multikey::MultikeyBTreeMap; const CURRENT_DIR_CSTR: &[u8] = b".\0"; const PARENT_DIR_CSTR: &[u8] = b"..\0"; const EMPTY_CSTR: &[u8] = b"\0"; const PROC_CSTR: &[u8] = b"/proc/self/fd\0"; -const INIT_CSTR: &[u8] = b"init.krun\0"; - -static INIT_BINARY: &[u8] = include_bytes!(env!("KRUN_INIT_BINARY_PATH")); type Inode = u64; type Handle = u64; @@ -327,7 +325,6 @@ pub struct Config { pub export_fsid: u64, /// Table of exported FDs to share with other subsystems. 
pub export_table: Option, - pub allow_root_dir_delete: bool, } impl Default for Config { @@ -342,7 +339,6 @@ impl Default for Config { proc_sfd_rawfd: None, export_fsid: 0, export_table: None, - allow_root_dir_delete: false, } } } @@ -358,14 +354,12 @@ pub struct PassthroughFs { // documentation of the `O_PATH` flag in `open(2)` for more details on what one can and cannot // do with an fd opened with this flag. inodes: RwLock>>, - next_inode: AtomicU64, - init_inode: u64, + inode_alloc: Arc, // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be // used for reading and writing data. handles: RwLock>>, next_handle: AtomicU64, - init_handle: u64, // File descriptor pointing to the `/proc/self/fd` directory. This is used to convert an fd from // `inodes` into one that can go into `handles`. This is accomplished by reading the @@ -392,7 +386,7 @@ enum FileOrLink { } impl PassthroughFs { - pub fn new(cfg: Config) -> io::Result { + pub fn new(cfg: Config, inode_alloc: Arc) -> io::Result { let fd = if let Some(fd) = cfg.proc_sfd_rawfd { fd } else { @@ -438,12 +432,10 @@ impl PassthroughFs { Ok(PassthroughFs { inodes: RwLock::new(MultikeyBTreeMap::new()), - next_inode: AtomicU64::new(fuse::ROOT_ID + 2), - init_inode: fuse::ROOT_ID + 1, + inode_alloc, handles: RwLock::new(BTreeMap::new()), next_handle: AtomicU64::new(1), - init_handle: 0, proc_self_fd, @@ -579,7 +571,7 @@ impl PassthroughFs { // There is a possible race here where 2 threads end up adding the same file // into the inode list. However, since each of those will get a unique Inode // value and unique file descriptors this shouldn't be that much of a problem. 
- let inode = self.next_inode.fetch_add(1, Ordering::Relaxed); + let inode = self.inode_alloc.next(); self.inodes.write().unwrap().insert( inode, InodeAltKey { @@ -992,25 +984,7 @@ impl FileSystem for PassthroughFs { fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result { debug!("do_lookup: {name:?}"); - let init_name = unsafe { CStr::from_bytes_with_nul_unchecked(INIT_CSTR) }; - - if self.init_inode != 0 && name == init_name { - let mut st: libc::stat64 = unsafe { mem::zeroed() }; - st.st_size = INIT_BINARY.len() as i64; - st.st_ino = self.init_inode; - st.st_mode = 0o100_755; - - Ok(Entry { - inode: self.init_inode, - generation: 0, - attr: st, - attr_flags: 0, - attr_timeout: self.cfg.attr_timeout, - entry_timeout: self.cfg.entry_timeout, - }) - } else { - self.do_lookup(parent, name) - } + self.do_lookup(parent, name) } fn forget(&self, _ctx: Context, inode: Inode, count: u64) { @@ -1129,11 +1103,7 @@ impl FileSystem for PassthroughFs { kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { - if inode == self.init_inode { - Ok((Some(self.init_handle), OpenOptions::empty())) - } else { - self.do_open(inode, kill_priv, flags) - } + self.do_open(inode, kill_priv, flags) } fn release( @@ -1234,16 +1204,6 @@ impl FileSystem for PassthroughFs { _flags: u32, ) -> io::Result { debug!("read: {inode:?}"); - if inode == self.init_inode { - let off: usize = offset.try_into().map_err(|_| einval())?; - let len = if off + (size as usize) < INIT_BINARY.len() { - size as usize - } else { - INIT_BINARY.len() - off - }; - return w.write(&INIT_BINARY[off..(off + len)]); - } - let data = self .handles .read() @@ -1824,10 +1784,6 @@ impl FileSystem for PassthroughFs { return Err(io::Error::from_raw_os_error(libc::ENOSYS)); } - if inode == self.init_inode { - return Err(io::Error::from_raw_os_error(libc::ENODATA)); - } - let mut buf = vec![0; size as usize]; // The f{set,get,remove,list}xattr functions don't work on an fd opened with `O_PATH` so we 
@@ -2087,36 +2043,6 @@ impl FileSystem for PassthroughFs { debug!("setupmapping: ino {inode:?} addr={addr:x} len={len}"); - if inode == self.init_inode { - let ret = unsafe { - libc::mmap( - addr as *mut libc::c_void, - len as usize, - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_FIXED, - -1, - 0, - ) - }; - if std::ptr::eq(ret, libc::MAP_FAILED) { - return Err(io::Error::last_os_error()); - } - - let to_copy = if len as usize > INIT_BINARY.len() { - INIT_BINARY.len() - } else { - len as usize - }; - unsafe { - libc::memcpy( - addr as *mut libc::c_void, - INIT_BINARY.as_ptr() as *const _, - to_copy, - ) - }; - return Ok(()); - } - let file = self.open_inode(inode, open_flags)?; let fd = file.as_raw_fd(); @@ -2175,10 +2101,10 @@ impl FileSystem for PassthroughFs { handle: Self::Handle, _flags: u32, cmd: u32, - arg: u64, + _arg: u64, _in_size: u32, out_size: u32, - exit_code: &Arc, + _exit_code: &Arc, ) -> io::Result> { const VIRTIO_IOC_MAGIC: u8 = b'v'; @@ -2190,14 +2116,6 @@ impl FileSystem for PassthroughFs { VIRTIO_IOC_EXPORT_FD_SIZE ) as u32; - const VIRTIO_IOC_TYPE_EXIT_CODE: u8 = 2; - const VIRTIO_IOC_EXIT_CODE_REQ: u32 = - request_code_none!(VIRTIO_IOC_MAGIC, VIRTIO_IOC_TYPE_EXIT_CODE) as u32; - - const VIRTIO_IOC_REMOVE_ROOT_DIR_CODE: u8 = 3; - const VIRTIO_IOC_REMOVE_ROOT_DIR_REQ: u32 = - request_code_none!(VIRTIO_IOC_MAGIC, VIRTIO_IOC_REMOVE_ROOT_DIR_CODE) as u32; - match cmd { VIRTIO_IOC_EXPORT_FD_REQ => { if out_size as usize != VIRTIO_IOC_EXPORT_FD_SIZE { @@ -2228,14 +2146,6 @@ impl FileSystem for PassthroughFs { ret.extend_from_slice(&handle.to_ne_bytes()); Ok(ret) } - VIRTIO_IOC_EXIT_CODE_REQ => { - exit_code.store(arg as i32, Ordering::SeqCst); - Ok(Vec::new()) - } - VIRTIO_IOC_REMOVE_ROOT_DIR_REQ if self.cfg.allow_root_dir_delete => { - std::fs::remove_dir_all(&self.cfg.root_dir)?; - Ok(Vec::new()) - } _ => Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)), } } diff --git 
a/src/devices/src/virtio/fs/macos/passthrough.rs b/src/devices/src/virtio/fs/macos/passthrough.rs index 53680bd92..c09ecdd67 100644 --- a/src/devices/src/virtio/fs/macos/passthrough.rs +++ b/src/devices/src/virtio/fs/macos/passthrough.rs @@ -29,16 +29,14 @@ use super::super::filesystem::{ ListxattrReply, OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use super::super::fuse; +use super::super::inode_alloc::InodeAllocator; use super::super::multikey::MultikeyBTreeMap; -const INIT_CSTR: &[u8] = b"init.krun\0"; const XATTR_KEY: &[u8] = b"user.containers.override_stat\0"; const SECURITY_CAPABILITY: &[u8] = b"security.capability\0"; const UID_MAX: u32 = u32::MAX - 1; -static INIT_BINARY: &[u8] = include_bytes!(env!("KRUN_INIT_BINARY_PATH")); - type Inode = u64; type Handle = u64; @@ -516,7 +514,6 @@ pub struct Config { pub export_fsid: u64, /// Table of exported FDs to share with other subsystems. Not supported for macos. pub export_table: Option, - pub allow_root_dir_delete: bool, } impl Default for Config { @@ -531,7 +528,6 @@ impl Default for Config { proc_sfd_rawfd: None, export_fsid: 0, export_table: None, - allow_root_dir_delete: false, } } } @@ -543,12 +539,10 @@ impl Default for Config { /// combination of mount namespaces and the pivot_root system call. pub struct PassthroughFs { inodes: RwLock>>, - next_inode: AtomicU64, - init_inode: u64, + inode_alloc: Arc, handles: RwLock>>, next_handle: AtomicU64, - init_handle: u64, map_windows: Mutex>, @@ -560,7 +554,7 @@ pub struct PassthroughFs { } impl PassthroughFs { - pub fn new(cfg: Config) -> io::Result { + pub fn new(cfg: Config, inode_alloc: Arc) -> io::Result { let root = CString::new(cfg.root_dir.as_str()).expect("CString::new failed"); // Safe because this doesn't modify any memory and we check the return value. 
@@ -579,12 +573,10 @@ impl PassthroughFs { Ok(PassthroughFs { inodes: RwLock::new(MultikeyBTreeMap::new()), - next_inode: AtomicU64::new(fuse::ROOT_ID + 2), - init_inode: fuse::ROOT_ID + 1, + inode_alloc, handles: RwLock::new(BTreeMap::new()), next_handle: AtomicU64::new(1), - init_handle: 0, map_windows: Mutex::new(HashMap::new()), @@ -723,7 +715,7 @@ impl PassthroughFs { // There is a possible race here where 2 threads end up adding the same file // into the inode list. However, since each of those will get a unique Inode // value and unique file descriptors this shouldn't be that much of a problem. - let inode = self.next_inode.fetch_add(1, Ordering::Relaxed); + let inode = self.inode_alloc.next(); self.inodes.write().unwrap().insert( inode, InodeAltKey { @@ -1201,25 +1193,7 @@ impl FileSystem for PassthroughFs { fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result { debug!("lookup: {name:?}"); - let _init_name = unsafe { CStr::from_bytes_with_nul_unchecked(INIT_CSTR) }; - - if self.init_inode != 0 && name == _init_name { - let mut st: bindings::stat64 = unsafe { mem::zeroed() }; - st.st_size = INIT_BINARY.len() as i64; - st.st_ino = self.init_inode; - st.st_mode = 0o100_755; - - Ok(Entry { - inode: self.init_inode, - generation: 0, - attr: st, - attr_flags: 0, - attr_timeout: self.cfg.attr_timeout, - entry_timeout: self.cfg.entry_timeout, - }) - } else { - self.do_lookup(parent, name) - } + self.do_lookup(parent, name) } fn forget(&self, _ctx: Context, inode: Inode, count: u64) { @@ -1339,11 +1313,7 @@ impl FileSystem for PassthroughFs { kill_priv: bool, flags: u32, ) -> io::Result<(Option, OpenOptions)> { - if inode == self.init_inode { - Ok((Some(self.init_handle), OpenOptions::empty())) - } else { - self.do_open(inode, kill_priv, flags) - } + self.do_open(inode, kill_priv, flags) } fn release( @@ -1456,18 +1426,6 @@ impl FileSystem for PassthroughFs { _flags: u32, ) -> io::Result { debug!("read: {inode:?}"); - if inode == self.init_inode 
{ - let off: usize = offset - .try_into() - .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))?; - let len = if off + (size as usize) < INIT_BINARY.len() { - size as usize - } else { - INIT_BINARY.len() - off - }; - return w.write(&INIT_BINARY[off..(off + len)]); - } - let data = self .handles .read() @@ -2053,10 +2011,6 @@ impl FileSystem for PassthroughFs { return Err(linux_error(io::Error::from_raw_os_error(libc::ENOSYS))); } - if inode == self.init_inode { - return Err(linux_error(io::Error::from_raw_os_error(libc::ENODATA))); - } - if name.to_bytes() == XATTR_KEY { return Err(linux_error(io::Error::from_raw_os_error(libc::EACCES))); } @@ -2469,34 +2423,4 @@ impl FileSystem for PassthroughFs { Ok(()) } - - fn ioctl( - &self, - _ctx: Context, - _inode: Self::Inode, - _handle: Self::Handle, - _flags: u32, - cmd: u32, - arg: u64, - _in_size: u32, - _out_size: u32, - exit_code: &Arc, - ) -> io::Result> { - // We can't use nix::request_code_none here since it's system-dependent - // and we need the value from Linux. 
- const VIRTIO_IOC_EXIT_CODE_REQ: u32 = 0x7602; - const VIRTIO_IOC_REMOVE_ROOT_DIR_REQ: u32 = 0x7603; - - match cmd { - VIRTIO_IOC_EXIT_CODE_REQ => { - exit_code.store(arg as i32, Ordering::SeqCst); - Ok(Vec::new()) - } - VIRTIO_IOC_REMOVE_ROOT_DIR_REQ if self.cfg.allow_root_dir_delete => { - std::fs::remove_dir_all(&self.cfg.root_dir)?; - Ok(Vec::new()) - } - _ => Err(io::Error::from_raw_os_error(libc::EOPNOTSUPP)), - } - } } diff --git a/src/devices/src/virtio/fs/mod.rs b/src/devices/src/virtio/fs/mod.rs index 7ce9d48c2..6868a2495 100644 --- a/src/devices/src/virtio/fs/mod.rs +++ b/src/devices/src/virtio/fs/mod.rs @@ -2,10 +2,14 @@ mod device; #[allow(dead_code)] mod filesystem; pub mod fuse; +mod inode_alloc; #[allow(dead_code)] mod multikey; +mod null_fs; mod read_only; mod server; +mod augment_fs; +pub mod virtual_inode; mod worker; #[cfg(target_os = "linux")] diff --git a/src/devices/src/virtio/fs/null_fs.rs b/src/devices/src/virtio/fs/null_fs.rs new file mode 100644 index 000000000..f2dd93a68 --- /dev/null +++ b/src/devices/src/virtio/fs/null_fs.rs @@ -0,0 +1,44 @@ +// A minimal filesystem that serves an empty root directory. +// +// Used with AugmentFs to provide a virtual-only filesystem (e.g. for +// booting from a block device where the virtiofs root only needs init.krun). + +use std::io; +use std::mem; +use std::time::Duration; + +use super::filesystem::{Context, FileSystem, FsOptions}; +use super::fuse; +use crate::virtio::bindings; + +/// An empty filesystem with just a root directory and nothing in it. 
+pub struct NullFs; + +type Inode = u64; +type Handle = u64; + +impl FileSystem for NullFs { + type Inode = Inode; + type Handle = Handle; + + fn init(&self, _capable: FsOptions) -> io::Result { + Ok(FsOptions::empty()) + } + + fn getattr( + &self, + _ctx: Context, + inode: Inode, + _handle: Option, + ) -> io::Result<(bindings::stat64, Duration)> { + if inode == fuse::ROOT_ID { + let mut st: bindings::stat64 = unsafe { mem::zeroed() }; + st.st_ino = fuse::ROOT_ID; + st.st_mode = libc::S_IFDIR | 0o755; + st.st_nlink = 2; + st.st_blksize = 4096; + return Ok((st, Duration::from_secs(86400))); + } + Err(io::Error::from_raw_os_error(libc::ENOENT)) + } +} diff --git a/src/devices/src/virtio/fs/read_only.rs b/src/devices/src/virtio/fs/read_only.rs index e975f2dda..5495db1ed 100644 --- a/src/devices/src/virtio/fs/read_only.rs +++ b/src/devices/src/virtio/fs/read_only.rs @@ -25,6 +25,7 @@ use super::filesystem::{ OpenOptions, SetattrValid, ZeroCopyReader, ZeroCopyWriter, }; use super::fuse; +use super::inode_alloc::InodeAllocator; use super::passthrough::{self, PassthroughFs}; use crate::virtio::bindings; @@ -35,10 +36,6 @@ fn erofs() -> io::Error { io::Error::from_raw_os_error(libc::EROFS) } -// Keep the Linux ioctl number so read-only virtio-fs can still handle -// non-mutating control ioctls while rejecting host-side root deletion. 
-const VIRTIO_IOC_REMOVE_ROOT_DIR_REQ: u32 = 0x7603; - fn read_only_open_flags(flags: u32) -> io::Result { let f = flags as i32; if f & libc::O_ACCMODE != libc::O_RDONLY { @@ -60,9 +57,9 @@ pub struct PassthroughFsRo { } impl PassthroughFsRo { - pub fn new(cfg: passthrough::Config) -> io::Result { + pub fn new(cfg: passthrough::Config, inode_alloc: Arc) -> io::Result { Ok(Self { - inner: PassthroughFs::new(cfg)?, + inner: PassthroughFs::new(cfg, inode_alloc)?, }) } } @@ -318,10 +315,6 @@ impl FileSystem for PassthroughFsRo { out_size: u32, exit_code: &Arc, ) -> io::Result> { - if cmd == VIRTIO_IOC_REMOVE_ROOT_DIR_REQ { - return Err(erofs()); - } - self.inner.ioctl( ctx, inode, handle, flags, cmd, arg, in_size, out_size, exit_code, ) diff --git a/src/devices/src/virtio/fs/virtual_inode.rs b/src/devices/src/virtio/fs/virtual_inode.rs new file mode 100644 index 000000000..a5c795157 --- /dev/null +++ b/src/devices/src/virtio/fs/virtual_inode.rs @@ -0,0 +1,64 @@ +// Virtual inode types for the virtiofs overlay. +// +// A `VirtualInode` represents a synthetic inode injected into the guest +// filesystem without any corresponding host file or directory. + +use std::ffi::CString; +use std::mem; + +use crate::virtio::bindings; + +/// A synthetic inode that exists only in memory. +pub enum VirtualInode { + /// A read-only file backed by a static byte slice. + File { + data: &'static [u8], + /// If true, the file can only be looked up once. + one_shot: bool, + }, + /// A directory containing other virtual entries. + Dir { children: Vec }, +} + +impl VirtualInode { + pub fn is_dir(&self) -> bool { + matches!(self, Self::Dir { .. }) + } + + pub fn is_one_shot(&self) -> bool { + matches!(self, Self::File { one_shot: true, .. }) + } + + pub fn data(&self) -> &'static [u8] { + match self { + Self::File { data, .. } => data, + Self::Dir { .. } => &[], + } + } + + /// Synthesize a stat result for this virtual inode. 
+ pub fn stat(&self, inode: u64, mode: u32) -> bindings::stat64 { + let mut st: bindings::stat64 = unsafe { mem::zeroed() }; + st.st_ino = inode; + st.st_mode = mode as _; + st.st_blksize = 4096; + match self { + Self::File { data, .. } => { + st.st_size = data.len() as i64; + st.st_nlink = 1; + st.st_blocks = ((data.len() as i64) + 511) / 512; + } + Self::Dir { .. } => { + st.st_nlink = 2; + } + } + st + } +} + +/// An entry to register as a virtual inode. +pub struct VirtualEntry { + pub name: CString, + pub mode: u32, + pub inode: VirtualInode, +} diff --git a/src/devices/src/virtio/fs/worker.rs b/src/devices/src/virtio/fs/worker.rs index c612b3e9b..2809cb89c 100644 --- a/src/devices/src/virtio/fs/worker.rs +++ b/src/devices/src/virtio/fs/worker.rs @@ -16,14 +16,19 @@ use vm_memory::GuestMemoryMmap; use super::super::{FsError, Queue}; use super::defs::{HPQ_INDEX, REQ_INDEX}; use super::descriptor_utils::{Reader, Writer}; +use super::inode_alloc::InodeAllocator; +use super::null_fs::NullFs; use super::passthrough::{self, PassthroughFs}; use super::read_only::PassthroughFsRo; use super::server::Server; +use super::augment_fs::AugmentFs; +use super::virtual_inode::VirtualEntry; use crate::virtio::{InterruptTransport, VirtioShmRegion}; enum FsServer { - ReadWrite(Server), - ReadOnly(Server), + ReadWrite(Server>), + ReadOnly(Server>), + Null(Server>), } impl FsServer { @@ -52,6 +57,14 @@ impl FsServer { #[cfg(target_os = "macos")] map_sender, ), + FsServer::Null(s) => s.handle_message( + r, + w, + shm_region, + exit_code, + #[cfg(target_os = "macos")] + map_sender, + ), } } } @@ -77,16 +90,36 @@ impl FsWorker { interrupt: InterruptTransport, mem: GuestMemoryMmap, shm_region: Option, - passthrough_cfg: passthrough::Config, + passthrough_cfg: Option, read_only: bool, + virtual_entries: Vec, stop_fd: EventFd, exit_code: Arc, #[cfg(target_os = "macos")] map_sender: Option>, ) -> Result { - let server = if read_only { - 
FsServer::ReadOnly(Server::new(PassthroughFsRo::new(passthrough_cfg)?)) - } else { - FsServer::ReadWrite(Server::new(PassthroughFs::new(passthrough_cfg)?)) + let inode_alloc = Arc::new(InodeAllocator::new()); + let server = match passthrough_cfg { + Some(cfg) if read_only => { + let inner = PassthroughFsRo::new(cfg, inode_alloc.clone())?; + FsServer::ReadOnly(Server::new(AugmentFs::new( + inner, + &inode_alloc, + virtual_entries, + ))) + } + Some(cfg) => { + let inner = PassthroughFs::new(cfg, inode_alloc.clone())?; + FsServer::ReadWrite(Server::new(AugmentFs::new( + inner, + &inode_alloc, + virtual_entries, + ))) + } + None => FsServer::Null(Server::new(AugmentFs::new( + NullFs, + &inode_alloc, + virtual_entries, + ))), }; Ok(Self { queues, diff --git a/src/init-blob/Cargo.toml b/src/init-blob/Cargo.toml new file mode 100644 index 000000000..acb21f626 --- /dev/null +++ b/src/init-blob/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "init-blob" +version = "0.1.0" +edition = "2021" +description = "Default init binary blob for libkrun guests" +license = "Apache-2.0" +repository = "https://github.com/containers/libkrun" +build = "build.rs" + +[features] +amd-sev = [] +tdx = [] +timesync = [] + +[lib] +path = "src/lib.rs" diff --git a/src/init-blob/build.rs b/src/init-blob/build.rs new file mode 100644 index 000000000..20295d146 --- /dev/null +++ b/src/init-blob/build.rs @@ -0,0 +1,163 @@ +use std::path::PathBuf; +use std::process::Command; + +fn musl_target_for_host() -> &'static str { + let host = std::env::var("HOST").unwrap_or_default(); + if host.starts_with("aarch64") { + "aarch64-unknown-linux-musl" + } else { + "x86_64-unknown-linux-musl" + } +} + +fn musl_supported(rustc: &str) -> bool { + let musl_target = musl_target_for_host(); + let output = Command::new(rustc) + .args(["--target", musl_target, "--print", "sysroot"]) + .output(); + match output { + Ok(o) if o.status.success() => { + let sysroot = PathBuf::from(String::from_utf8_lossy(&o.stdout).trim()); + 
sysroot + .join("lib/rustlib") + .join(musl_target) + .join("lib") + .exists() + } + _ => false, + } +} + +/// Return a rustc binary that has the musl target's std library available. +/// +/// Tries the active rustc first. If that fails, searches ~/.rustup/toolchains/ +/// for a stable toolchain that does support musl — covering the common case +/// where the system package manager's rustc (e.g. Fedora's /usr/bin/rustc) +/// is used as the workspace compiler but the user also has a rustup toolchain +/// with musl support installed. +fn find_musl_rustc(default_rustc: &str) -> Option { + if musl_supported(default_rustc) { + return Some(PathBuf::from(default_rustc)); + } + + let home = std::env::var_os("HOME")?; + let toolchains = PathBuf::from(home).join(".rustup").join("toolchains"); + let mut candidates: Vec = std::fs::read_dir(&toolchains) + .ok()? + .flatten() + .map(|e| e.path().join("bin").join("rustc")) + .filter(|p| p.exists()) + .collect(); + + // Prefer stable toolchains over nightly/beta. + candidates.sort_by_key(|p| !p.to_string_lossy().contains("stable")); + + candidates + .into_iter() + .find(|rustc| musl_supported(rustc.to_str().unwrap_or(""))) +} + +fn build_rust_init() -> PathBuf { + let manifest_dir = PathBuf::from(std::env::var_os("CARGO_MANIFEST_DIR").unwrap()); + let workspace_root = manifest_dir.join("../.."); + let init_manifest = workspace_root.join("init/Cargo.toml"); + + let out_dir = PathBuf::from(std::env::var_os("OUT_DIR").unwrap()); + // Separate target dir avoids conflicting with the parent workspace cargo lock. 
+ let init_target_dir = out_dir.join("init-target"); + let init_bin = out_dir.join("init"); + + let musl_target = musl_target_for_host(); + let profile = std::env::var("PROFILE").unwrap_or_else(|_| "release".to_string()); + let default_cargo = std::env::var("CARGO").unwrap_or_else(|_| "cargo".to_string()); + let default_rustc = std::env::var("RUSTC").unwrap_or_else(|_| "rustc".to_string()); + + println!( + "cargo:rerun-if-changed={}", + workspace_root.join("init/src").display() + ); + println!("cargo:rerun-if-changed={}", init_manifest.display()); + // Resolve which rustc (and paired cargo) to use for the init binary. + let (rustc, cargo, use_musl) = match find_musl_rustc(&default_rustc) { + Some(musl_rustc) => { + // Use the cargo from the same toolchain bin/ directory so that + // it inherits the same sysroot and target support. + let cargo = musl_rustc + .parent() + .map(|bin| bin.join("cargo")) + .filter(|p| p.exists()) + .map(|p| p.to_string_lossy().into_owned()) + .unwrap_or(default_cargo); + (musl_rustc.to_string_lossy().into_owned(), cargo, true) + } + None => { + println!( + "cargo:warning=musl target not available; krun-init will be dynamically linked. \ + Run `rustup target add x86_64-unknown-linux-musl` for a static binary." 
+ ); + (default_rustc, default_cargo, false) + } + }; + + let mut cmd = Command::new(&cargo); + cmd.arg("build") + .arg("--manifest-path") + .arg(&init_manifest) + .arg("--target-dir") + .arg(&init_target_dir) + .env("RUSTC", &rustc); + + if profile == "release" { + cmd.arg("--release"); + } + + if use_musl { + cmd.arg("--target").arg(musl_target); + } + + let mut features: Vec<&str> = Vec::new(); + if cfg!(feature = "amd-sev") { + features.push("amd-sev"); + } + if cfg!(feature = "tdx") { + features.push("tdx"); + } + if cfg!(feature = "timesync") { + features.push("timesync"); + } + if !features.is_empty() { + cmd.arg("--features").arg(features.join(",")); + } + + let status = cmd + .status() + .unwrap_or_else(|e| panic!("failed to run {cargo}: {e}")); + if !status.success() { + panic!("failed to build krun-init"); + } + + let built = if use_musl { + // Cross-compilation: cargo places the binary at /// + init_target_dir + .join(musl_target) + .join(&profile) + .join("krun-init") + } else { + init_target_dir.join(&profile).join("krun-init") + }; + std::fs::copy(&built, &init_bin).unwrap_or_else(|e| panic!("failed to copy krun-init: {e}")); + + init_bin +} + +fn main() { + let init_binary_path = std::env::var_os("KRUN_INIT_BINARY_PATH") + .map(PathBuf::from) + .unwrap_or_else(build_rust_init); + + println!( + "cargo:rustc-env=KRUN_INIT_BINARY_PATH={}", + init_binary_path.display() + ); + println!("cargo:rerun-if-env-changed=KRUN_INIT_BINARY_PATH"); +} diff --git a/src/init-blob/src/lib.rs b/src/init-blob/src/lib.rs new file mode 100644 index 000000000..4397da679 --- /dev/null +++ b/src/init-blob/src/lib.rs @@ -0,0 +1 @@ +pub static INIT_BINARY: &[u8] = include_bytes!(env!("KRUN_INIT_BINARY_PATH")); diff --git a/src/libkrun/Cargo.toml b/src/libkrun/Cargo.toml index 24db7a9ff..a191a82de 100644 --- a/src/libkrun/Cargo.toml +++ b/src/libkrun/Cargo.toml @@ -10,8 +10,8 @@ repository = "https://github.com/containers/libkrun" [features] tee = ["vmm/tee", "devices/tee"] 
-amd-sev = ["blk", "tee", "vmm/amd-sev", "devices/amd-sev"] -tdx = ["blk", "tee", "vmm/tdx", "devices/tdx"] +amd-sev = ["blk", "tee", "vmm/amd-sev", "devices/amd-sev", "init-blob/amd-sev"] +tdx = ["blk", "tee", "vmm/tdx", "devices/tdx", "init-blob/tdx"] net = ["devices/net", "vmm/net"] blk = ["devices/blk", "vmm/blk"] gpu = ["vmm/gpu", "devices/gpu", "krun_display"] @@ -20,6 +20,7 @@ input = ["krun_input", "vmm/input", "devices/input"] virgl_resource_map2 = ["devices/virgl_resource_map2"] aws-nitro = ["vmm/aws-nitro", "devices/aws-nitro", "dep:aws-nitro", "dep:nitro-enclaves"] vhost-user = ["vmm/vhost-user", "devices/vhost-user"] +timesync = ["devices/timesync", "init-blob/timesync"] [dependencies] crossbeam-channel = ">=0.5.15" @@ -32,6 +33,7 @@ krun_display = { package = "krun-display", version = "0.1.0", path = "../display krun_input = { package = "krun-input", version = "0.1.0", path = "../input", optional = true, features = ["bindgen_clang_runtime"] } devices = { package = "krun-devices", version = "=0.1.0-1.18.0", path = "../devices" } +init-blob = { path = "../init-blob" } polly = { package = "krun-polly", version = "=0.1.0-1.18.0", path = "../polly" } utils = { package = "krun-utils", version = "=0.1.0-1.18.0", path = "../utils" } vmm = { package = "krun-vmm", version = "=0.1.0-1.18.0", path = "../vmm" } diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 1d8b3fcb1..1a9adafa3 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -14,16 +14,15 @@ use env_logger::{Env, Target}; #[cfg(feature = "gpu")] use krun_display::DisplayBackend; +#[cfg(not(feature = "tee"))] +use devices::virtio::fs::virtual_inode::{VirtualEntry, VirtualInode}; use libc::{c_char, c_int, size_t}; use once_cell::sync::Lazy; use polly::event_manager::EventManager; -#[cfg(all(feature = "blk", not(feature = "tee")))] -use rand::distr::{Alphanumeric, SampleString}; use std::collections::hash_map::Entry; use std::collections::HashMap; use std::convert::TryInto; use 
std::env; -#[cfg(target_os = "linux")] use std::ffi::CString; use std::ffi::{c_void, CStr}; use std::fs::File; @@ -90,6 +89,20 @@ static KRUN_NITRO_DEBUG: Mutex = Mutex::new(false); // Path to the init binary to be executed inside the VM. const INIT_PATH: &str = "/init.krun"; +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +const DEFAULT_INIT_PAYLOAD: &[u8] = init_blob::INIT_BINARY; + +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +fn init_virtual_entry() -> VirtualEntry { + VirtualEntry { + name: std::ffi::CString::new("init.krun").unwrap(), + mode: 0o100_755, + inode: VirtualInode::File { + data: DEFAULT_INIT_PAYLOAD, + one_shot: true, + }, + } +} static KRUNFW: LazyLock> = LazyLock::new(|| unsafe { libloading::Library::new(KRUNFW_NAME).ok() }); @@ -167,6 +180,8 @@ struct ContextConfig { console_output: Option, vmm_uid: Option, vmm_gid: Option, + #[cfg(not(feature = "tee"))] + disable_implicit_init: bool, } impl ContextConfig { @@ -592,13 +607,17 @@ pub unsafe extern "C" fn krun_set_root(ctx_id: u32, c_root_path: *const c_char) match CTX_MAP.lock().unwrap().entry(ctx_id) { Entry::Occupied(mut ctx_cfg) => { let cfg = ctx_cfg.get_mut(); + let mut virtual_entries = Vec::new(); + if !cfg.disable_implicit_init { + virtual_entries.push(init_virtual_entry()); + } cfg.vmr.add_fs_device(FsDeviceConfig { fs_id, - shared_dir, + shared_dir: Some(shared_dir), // Default to a conservative 512 MB window. 
shm_size: Some(1 << 29), - allow_root_dir_delete: false, read_only: false, + virtual_entries, }); } Entry::Vacant(_) => return -libc::ENOENT, @@ -640,7 +659,7 @@ pub unsafe extern "C" fn krun_add_virtiofs3( shm_size: u64, read_only: bool, ) -> i32 { - if c_tag.is_null() || c_path.is_null() { + if c_tag.is_null() { return -libc::EINVAL; } @@ -648,9 +667,15 @@ pub unsafe extern "C" fn krun_add_virtiofs3( Ok(tag) => tag, Err(_) => return -libc::EINVAL, }; - let path = match CStr::from_ptr(c_path).to_str() { - Ok(path) => path, - Err(_) => return -libc::EINVAL, + + // NULL path means NullFs (virtual-only filesystem, no host directory). + let path = if c_path.is_null() { + None + } else { + match CStr::from_ptr(c_path).to_str() { + Ok(path) => Some(path), + Err(_) => return -libc::EINVAL, + } }; let shm = if shm_size > 0 { @@ -665,12 +690,16 @@ pub unsafe extern "C" fn krun_add_virtiofs3( match CTX_MAP.lock().unwrap().entry(ctx_id) { Entry::Occupied(mut ctx_cfg) => { let cfg = ctx_cfg.get_mut(); + let mut virtual_entries = Vec::new(); + if tag == "/dev/root" && !cfg.disable_implicit_init { + virtual_entries.push(init_virtual_entry()); + } cfg.vmr.add_fs_device(FsDeviceConfig { fs_id: tag.to_string(), - shared_dir: path.to_string(), + shared_dir: path.map(|p| p.to_string()), shm_size: shm, - allow_root_dir_delete: false, read_only, + virtual_entries, }); } Entry::Vacant(_) => return -libc::ENOENT, @@ -2396,25 +2425,31 @@ pub unsafe extern "C" fn krun_set_root_disk_remount( return -libc::EINVAL; } - // To boot from a filesystem other than virtiofs, - // we need to setup a temporary root from which init.krun can be executed. - // Otherwise, it would have to be copied to the target filesystem beforehand. - // Instead, init.krun will run from virtiofs and then switch to the real root. 
- let root_dir_suffix = Alphanumeric.sample_string(&mut rand::rng(), 6); - let empty_root = env::temp_dir().join(format!("krun-empty-root-{root_dir_suffix}")); - - if let Err(e) = std::fs::create_dir_all(&empty_root) { - error!("Failed to create empty root directory: {e:?}"); - return -libc::EINVAL; + // Boot from a block device: the virtiofs root only needs to + // serve init.krun and provide mount points for /dev, /proc, /sys. + // Use a NullFs (no host directory) with the inode overlay. + let mut virtual_entries = Vec::new(); + if !ctx_cfg.disable_implicit_init { + virtual_entries.push(init_virtual_entry()); + } + // init.c needs these directories as mount points before + // pivoting to the block device root. + for name in ["dev", "proc", "sys", "newroot"] { + virtual_entries.push(VirtualEntry { + name: CString::new(name).unwrap(), + mode: libc::S_IFDIR as u32 | 0o755, + inode: VirtualInode::Dir { + children: Vec::new(), + }, + }); } - ctx_cfg.vmr.add_fs_device(FsDeviceConfig { fs_id: "/dev/root".into(), - shared_dir: empty_root.to_string_lossy().into(), + shared_dir: None, // Default to a conservative 512 MB window. 
shm_size: Some(1 << 29), - allow_root_dir_delete: true, read_only: false, + virtual_entries, }); ctx_cfg.set_block_root(device, fstype, options); @@ -2425,6 +2460,132 @@ pub unsafe extern "C" fn krun_set_root_disk_remount( KRUN_SUCCESS } +#[no_mangle] +#[cfg(not(feature = "tee"))] +pub extern "C" fn krun_disable_implicit_init(ctx_id: u32) -> i32 { + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + ctx_cfg.get_mut().disable_implicit_init = true; + } + Entry::Vacant(_) => return -libc::ENOENT, + } + + KRUN_SUCCESS +} + +#[allow(clippy::missing_safety_doc)] +#[no_mangle] +#[cfg(not(any(feature = "tee", feature = "aws-nitro")))] +pub unsafe extern "C" fn krun_get_default_init( + data_out: *mut *const u8, + len_out: *mut size_t, +) -> i32 { + if data_out.is_null() || len_out.is_null() { + return -libc::EINVAL; + } + *data_out = DEFAULT_INIT_PAYLOAD.as_ptr(); + *len_out = DEFAULT_INIT_PAYLOAD.len(); + KRUN_SUCCESS +} + +#[allow(clippy::missing_safety_doc)] +#[no_mangle] +#[cfg(not(feature = "tee"))] +pub unsafe extern "C" fn krun_fs_add_overlay_file( + ctx_id: u32, + c_fs_tag: *const c_char, + c_filename: *const c_char, + data: *const u8, + data_len: size_t, + mode: u32, + one_shot: bool, +) -> i32 { + if c_fs_tag.is_null() || c_filename.is_null() || data.is_null() || data_len == 0 { + return -libc::EINVAL; + } + + let fs_tag = match CStr::from_ptr(c_fs_tag).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + let filename = match CString::new(CStr::from_ptr(c_filename).to_bytes()) { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + // SAFETY: The caller guarantees the memory remains valid for the VM + // lifetime (see the C header contract). 
+ let payload: &'static [u8] = slice::from_raw_parts(data, data_len); + + let entry = VirtualEntry { + name: filename, + mode, + inode: VirtualInode::File { + data: payload, + one_shot, + }, + }; + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + match cfg.vmr.fs.iter_mut().find(|fs| fs.fs_id == fs_tag) { + Some(fs_cfg) => fs_cfg.virtual_entries.push(entry), + None => return -libc::ENOENT, + } + } + Entry::Vacant(_) => return -libc::ENOENT, + } + + KRUN_SUCCESS +} + +#[allow(clippy::missing_safety_doc)] +#[no_mangle] +#[cfg(not(feature = "tee"))] +pub unsafe extern "C" fn krun_fs_add_overlay_dir( + ctx_id: u32, + c_fs_tag: *const c_char, + c_dirname: *const c_char, + mode: u32, +) -> i32 { + if c_fs_tag.is_null() || c_dirname.is_null() { + return -libc::EINVAL; + } + + let fs_tag = match CStr::from_ptr(c_fs_tag).to_str() { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + let dirname = match CString::new(CStr::from_ptr(c_dirname).to_bytes()) { + Ok(s) => s, + Err(_) => return -libc::EINVAL, + }; + + let entry = VirtualEntry { + name: dirname, + mode, + inode: VirtualInode::Dir { + children: Vec::new(), + }, + }; + + match CTX_MAP.lock().unwrap().entry(ctx_id) { + Entry::Occupied(mut ctx_cfg) => { + let cfg = ctx_cfg.get_mut(); + match cfg.vmr.fs.iter_mut().find(|fs| fs.fs_id == fs_tag) { + Some(fs_cfg) => fs_cfg.virtual_entries.push(entry), + None => return -libc::ENOENT, + } + } + Entry::Vacant(_) => return -libc::ENOENT, + } + + KRUN_SUCCESS +} + #[no_mangle] pub extern "C" fn krun_disable_implicit_console(ctx_id: u32) -> i32 { match CTX_MAP.lock().unwrap().entry(ctx_id) { @@ -2817,7 +2978,7 @@ pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { let (sender, _receiver) = unbounded(); let _vmm = match vmm::builder::build_microvm( - &ctx_cfg.vmr, + &mut ctx_cfg.vmr, &mut event_manager, ctx_cfg.shutdown_efd, sender, @@ -2874,3 +3035,58 @@ fn krun_start_enter_nitro(ctx_id: u32) -> i32 { } 
} } + +#[cfg(all(test, not(feature = "tee")))] +mod tests { + use super::*; + + use std::ffi::CString; + use std::ptr::null; + + static TEST_LOCK: Lazy> = Lazy::new(|| Mutex::new(())); + + #[test] + fn root_virtiofs_injects_default_init_by_default() { + let _guard = TEST_LOCK.lock().unwrap(); + + let tag = CString::new("/dev/root").unwrap(); + let ctx = krun_create_ctx() as u32; + + unsafe { + assert_eq!(krun_add_virtiofs3(ctx, tag.as_ptr(), null(), 0, false), KRUN_SUCCESS); + } + + let ctx_map = CTX_MAP.lock().unwrap(); + let cfg = ctx_map.get(&ctx).unwrap(); + assert_eq!(cfg.vmr.fs.len(), 1); + assert_eq!(cfg.vmr.fs[0].virtual_entries.len(), 1); + assert_eq!(cfg.vmr.fs[0].virtual_entries[0].name.to_bytes(), b"init.krun"); + drop(ctx_map); + + assert_eq!(krun_free_ctx(ctx), KRUN_SUCCESS); + } + + #[test] + fn root_virtiofs_respects_disable_implicit_init() { + let _guard = TEST_LOCK.lock().unwrap(); + + let tag = CString::new("/dev/root").unwrap(); + let ctx = krun_create_ctx() as u32; + + assert_eq!(krun_disable_implicit_init(ctx), KRUN_SUCCESS); + unsafe { + assert_eq!(krun_add_virtiofs3(ctx, tag.as_ptr(), null(), 0, false), KRUN_SUCCESS); + } + + let ctx_map = CTX_MAP.lock().unwrap(); + let cfg = ctx_map.get(&ctx).unwrap(); + assert_eq!(cfg.vmr.fs.len(), 1); + assert!( + cfg.vmr.fs[0].virtual_entries.is_empty(), + "root virtiofs should not inject init.krun after krun_disable_implicit_init()" + ); + drop(ctx_map); + + assert_eq!(krun_free_ctx(ctx), KRUN_SUCCESS); + } +} diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 1aa9c5c48..aa4b8cec4 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -571,7 +571,7 @@ fn choose_payload(vm_resources: &VmResources) -> Result, _sender: Sender, @@ -1064,7 +1064,7 @@ pub fn build_microvm( #[cfg(not(any(feature = "tee", feature = "aws-nitro")))] attach_fs_devices( &mut vmm, - &vm_resources.fs, + std::mem::take(&mut vm_resources.fs), &mut _shm_manager, #[cfg(not(feature = "tee"))] export_table, @@ 
-2040,7 +2040,7 @@ fn attach_mmio_device( #[cfg(not(any(feature = "tee", feature = "aws-nitro")))] fn attach_fs_devices( vmm: &mut Vmm, - fs_devs: &[FsDeviceConfig], + fs_devs: Vec, shm_manager: &mut ShmManager, #[cfg(not(feature = "tee"))] export_table: Option, intc: IrqChip, @@ -2049,14 +2049,14 @@ fn attach_fs_devices( ) -> std::result::Result<(), StartMicrovmError> { use self::StartMicrovmError::*; - for (i, config) in fs_devs.iter().enumerate() { + for (i, config) in fs_devs.into_iter().enumerate() { let fs = Arc::new(Mutex::new( devices::virtio::Fs::new( - config.fs_id.clone(), - config.shared_dir.clone(), + config.fs_id, + config.shared_dir, exit_code.clone(), - config.allow_root_dir_delete, config.read_only, + config.virtual_entries, ) .unwrap(), )); diff --git a/src/vmm/src/vmm_config/fs.rs b/src/vmm/src/vmm_config/fs.rs index ccf86f5cd..b95982bf7 100644 --- a/src/vmm/src/vmm_config/fs.rs +++ b/src/vmm/src/vmm_config/fs.rs @@ -1,8 +1,11 @@ -#[derive(Clone, Debug)] +use devices::virtio::fs::virtual_inode::VirtualEntry; + pub struct FsDeviceConfig { pub fs_id: String, - pub shared_dir: String, + /// Host directory to pass through. None means a virtual-only filesystem + /// (NullFs + InodeOverlay, no host directory). 
+ pub shared_dir: Option, pub shm_size: Option, - pub allow_root_dir_delete: bool, pub read_only: bool, + pub virtual_entries: Vec, } diff --git a/tests/guest-agent/Cargo.toml b/tests/guest-agent/Cargo.toml index 47617a0e4..9ab3fcbfc 100644 --- a/tests/guest-agent/Cargo.toml +++ b/tests/guest-agent/Cargo.toml @@ -2,6 +2,9 @@ name = "guest-agent" edition = "2021" +[features] +blk = ["test_cases/blk"] + [dependencies] test_cases = { path = "../test_cases", features = ["guest"] } anyhow = "1.0.95" diff --git a/tests/run.sh b/tests/run.sh index 3d7b1e6ef..13e57b0eb 100755 --- a/tests/run.sh +++ b/tests/run.sh @@ -44,8 +44,17 @@ if [ "$OS" = "Darwin" ]; then echo "Cross-compiling guest-agent for $GUEST_TARGET" fi -cargo build --target=$GUEST_TARGET -p guest-agent -cargo build -p runner +# KRUN_TEST_FEATURES can be set to pass extra features to test_cases/runner +# (e.g. "blk" when libkrun was built with BLK=1). +TEST_FEATURES="${KRUN_TEST_FEATURES:-}" + +if [ -n "$TEST_FEATURES" ]; then + cargo build --target=$GUEST_TARGET -p guest-agent --features "$TEST_FEATURES" + cargo build -p runner --features "$TEST_FEATURES" +else + cargo build --target=$GUEST_TARGET -p guest-agent + cargo build -p runner +fi # On macOS, the runner needs entitlements to use Hypervisor.framework if [ "$OS" = "Darwin" ]; then diff --git a/tests/runner/Cargo.toml b/tests/runner/Cargo.toml index 8133341b8..141d07a5f 100644 --- a/tests/runner/Cargo.toml +++ b/tests/runner/Cargo.toml @@ -2,6 +2,9 @@ name = "runner" edition = "2021" +[features] +blk = ["test_cases/blk"] + [dependencies] test_cases = { path = "../test_cases", features = ["host"] } anyhow = "1.0.95" diff --git a/tests/test_cases/Cargo.toml b/tests/test_cases/Cargo.toml index 8c9bcc924..4ecc6801b 100644 --- a/tests/test_cases/Cargo.toml +++ b/tests/test_cases/Cargo.toml @@ -5,6 +5,7 @@ edition = "2021" [features] host = ["krun-sys", "serde", "serde_json"] guest = [] +blk = [] [lib] name = "test_cases" diff --git 
a/tests/test_cases/src/lib.rs b/tests/test_cases/src/lib.rs index 83f3b6b14..ea9ec98f5 100644 --- a/tests/test_cases/src/lib.rs +++ b/tests/test_cases/src/lib.rs @@ -27,6 +27,13 @@ use test_pjdfstest::TestPjdfstest; mod test_virtiofs_misc; use test_virtiofs_misc::TestVirtioFsMisc; +mod test_augmentfs; +use test_augmentfs::TestAugmentFs; + +#[cfg(feature = "blk")] +mod test_root_disk_remount; +#[cfg(feature = "blk")] +use test_root_disk_remount::TestRootDiskRemount; pub enum TestOutcome { Pass, @@ -86,6 +93,9 @@ pub fn test_cases() -> Vec { TestCase::new("virtiofs-root-ro", Box::new(TestVirtiofsRootRo)), TestCase::new("virtiofs-misc", Box::new(TestVirtioFsMisc)), TestCase::new("pjdfstest", Box::new(TestPjdfstest)), + TestCase::new("augmentfs", Box::new(TestAugmentFs)), + #[cfg(feature = "blk")] + TestCase::new("root-disk-remount", Box::new(TestRootDiskRemount)), TestCase::new("perf-net-passt-tx", Box::new(TestNetPerf::new_passt_tx())), TestCase::new("perf-net-passt-rx", Box::new(TestNetPerf::new_passt_rx())), TestCase::new("perf-net-tap-tx", Box::new(TestNetPerf::new_tap_tx())), diff --git a/tests/test_cases/src/test_augmentfs.rs b/tests/test_cases/src/test_augmentfs.rs new file mode 100644 index 000000000..b25ec81c3 --- /dev/null +++ b/tests/test_cases/src/test_augmentfs.rs @@ -0,0 +1,278 @@ +// Test the AugmentFs overlay over a NullFs. +// +// Boots a VM with NO host filesystem — the root virtiofs is backed entirely +// by virtual inodes: init.krun (one-shot), the guest-agent binary (one-shot), +// a .krun_config.json (one-shot), persistent test files, and virtual +// directories as mount points for /dev, /proc, /sys. 
+ +use macros::{guest, host}; + +pub struct TestAugmentFs; + +fn make_test_payload() -> Vec { + (0..8192u32).map(|i| (i % 251) as u8).collect() +} + +#[host] +mod host { + use super::*; + + use crate::{krun_call, krun_call_u32}; + use crate::{Test, TestSetup}; + use krun_sys::*; + use std::ffi::CString; + use std::ptr::null_mut; + + impl Test for TestAugmentFs { + fn start_vm(self: Box, test_setup: TestSetup) -> anyhow::Result<()> { + let test_case = CString::new(test_setup.test_case)?; + + // Read the guest-agent binary into memory. Leaked because + // krun_start_enter never returns. + let guest_agent_path = std::env::var("KRUN_TEST_GUEST_AGENT_PATH") + .expect("KRUN_TEST_GUEST_AGENT_PATH not set"); + let guest_agent_bytes: &'static [u8] = + Vec::leak(std::fs::read(&guest_agent_path).expect("Failed to read guest-agent")); + + // Build JSON config: exec the guest-agent with our test name. + let json = format!( + r#"{{"args": ["/guest-agent", "{}"], "cwd": "/"}}"#, + test_case.to_str().unwrap() + ); + let json_bytes: &'static [u8] = Vec::leak(json.into_bytes()); + + // Deterministic test payload for range-read tests. + let payload: &'static [u8] = Vec::leak(make_test_payload()); + + // A small marker file to test persistent reads. + let marker: &'static [u8] = b"virtual-file-marker-content-12345"; + + unsafe { + krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?; + let ctx = krun_call_u32!(krun_create_ctx())?; + krun_call!(krun_set_vm_config(ctx, 1, 512))?; + + // Disable the implicit init — we'll inject it ourselves. + krun_call!(krun_disable_implicit_init(ctx))?; + + // Get the default init binary. + let mut init_data: *const u8 = null_mut(); + let mut init_len: usize = 0; + krun_call!(krun_get_default_init(&mut init_data, &mut init_len))?; + + // Set up root with NO host directory (NullFs). 
+ krun_call!(krun_add_virtiofs3( + ctx, + c"/dev/root".as_ptr(), + std::ptr::null(), // NULL path → NullFs + 0, // no SHM window + false, // not read-only + ))?; + + // Virtual directories needed by init as mount points. + for dir in [c"dev", c"proc", c"sys"] { + krun_call!(krun_fs_add_overlay_dir( + ctx, + c"/dev/root".as_ptr(), + dir.as_ptr(), + 0o040_755, + ))?; + } + + // Overlay init.krun (one-shot, executable). + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"init.krun".as_ptr(), + init_data, + init_len, + 0o100_755, + true, + ))?; + + // Overlay guest-agent (one-shot, executable). After init + // execs it, the file should no longer be visible. + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"guest-agent".as_ptr(), + guest_agent_bytes.as_ptr(), + guest_agent_bytes.len(), + 0o100_755, + true, + ))?; + + // Overlay .krun_config.json (one-shot). + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c".krun_config.json".as_ptr(), + json_bytes.as_ptr(), + json_bytes.len(), + 0o100_644, + true, + ))?; + + // Overlay a persistent marker file. + krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"marker.txt".as_ptr(), + marker.as_ptr(), + marker.len(), + 0o100_644, + false, + ))?; + + // Overlay a deterministic 8 KiB payload for range-read tests. 
+ krun_call!(krun_fs_add_overlay_file( + ctx, + c"/dev/root".as_ptr(), + c"testdata.bin".as_ptr(), + payload.as_ptr(), + payload.len(), + 0o100_444, + false, + ))?; + + krun_call!(krun_set_workdir(ctx, c"/".as_ptr()))?; + krun_call!(krun_start_enter(ctx))?; + } + Ok(()) + } + } +} + +#[guest] +mod guest { + use super::*; + use crate::Test; + use std::fs; + use std::io::{ErrorKind, Read, Seek, SeekFrom}; + use std::path::Path; + + impl Test for TestAugmentFs { + fn in_guest(self: Box) { + // --- One-shot files should be gone --- + assert!( + !Path::new("/.krun_config.json").exists(), + ".krun_config.json should be gone (one-shot)" + ); + assert!( + !Path::new("/init.krun").exists(), + "init.krun should be gone (one-shot)" + ); + + // --- One-shot guest-agent can't see itself --- + assert!( + !Path::new("/guest-agent").exists(), + "guest-agent should be gone (one-shot)" + ); + + // --- Virtual directories should be accessible --- + // init already mounted over these, but let's verify they + // exist as directories (the mount points came from our + // virtual dir overlay). + for dir in ["/dev", "/proc", "/sys"] { + let meta = fs::metadata(dir) + .unwrap_or_else(|e| panic!("{dir} should exist: {e}")); + assert!(meta.is_dir(), "{dir} should be a directory"); + } + + // Verify the mounts actually worked by checking known entries. + assert!( + Path::new("/dev/null").exists(), + "/dev/null should exist (devtmpfs)" + ); + assert!( + Path::new("/proc/self").exists(), + "/proc/self should exist (procfs)" + ); + assert!( + Path::new("/sys/kernel").exists(), + "/sys/kernel should exist (sysfs)" + ); + + // Verify directory listing works on each mounted fs. 
+ let dev_entries: Vec<_> = fs::read_dir("/dev") + .expect("read_dir /dev") + .collect(); + assert!(!dev_entries.is_empty(), "/dev listing should not be empty"); + + let proc_entries: Vec<_> = fs::read_dir("/proc") + .expect("read_dir /proc") + .collect(); + assert!(!proc_entries.is_empty(), "/proc listing should not be empty"); + + let sys_entries: Vec<_> = fs::read_dir("/sys") + .expect("read_dir /sys") + .collect(); + assert!(!sys_entries.is_empty(), "/sys listing should not be empty"); + + // --- Persistent files should still exist --- + assert!(Path::new("/marker.txt").exists(), "marker.txt should exist"); + assert!( + Path::new("/testdata.bin").exists(), + "testdata.bin should exist" + ); + + // --- Read + verify marker content --- + let content = fs::read_to_string("/marker.txt").expect("read marker.txt"); + assert_eq!(content, "virtual-file-marker-content-12345"); + + // --- Repeated reads return the same data --- + let content2 = fs::read_to_string("/marker.txt").expect("re-read marker.txt"); + assert_eq!(content, content2, "repeated reads differ"); + + // --- Write should fail --- + let err = fs::OpenOptions::new() + .write(true) + .open("/marker.txt") + .expect_err("write-open should fail"); + assert_eq!(err.kind(), ErrorKind::PermissionDenied); + + // --- stat reports correct size --- + let meta = fs::metadata("/testdata.bin").expect("stat testdata.bin"); + assert_eq!(meta.len(), 8192, "testdata.bin size mismatch"); + + // --- Range reads on the 8 KiB payload --- + let expected = make_test_payload(); + let mut f = fs::File::open("/testdata.bin").expect("open testdata.bin"); + + // Full read. + let got = fs::read("/testdata.bin").expect("full read"); + assert_eq!(got, expected, "full read mismatch"); + + // Read first 256 bytes. + let mut buf = vec![0u8; 256]; + f.read_exact(&mut buf).expect("read first 256"); + assert_eq!(buf, &expected[..256], "first 256 bytes mismatch"); + + // Seek to offset 4000, read 512 bytes. 
+ f.seek(SeekFrom::Start(4000)).expect("seek to 4000"); + let mut buf = vec![0u8; 512]; + f.read_exact(&mut buf).expect("read at offset 4000"); + assert_eq!(buf, &expected[4000..4512], "range [4000..4512] mismatch"); + + // Seek to last 10 bytes. + f.seek(SeekFrom::End(-10)).expect("seek to end-10"); + let mut buf = vec![0u8; 10]; + f.read_exact(&mut buf).expect("read last 10"); + assert_eq!(buf, &expected[8182..8192], "last 10 bytes mismatch"); + + // Read past EOF should return 0 bytes. + f.seek(SeekFrom::Start(8192)).expect("seek to EOF"); + let mut buf = vec![0u8; 100]; + let n = f.read(&mut buf).expect("read past EOF"); + assert_eq!(n, 0, "read past EOF should return 0"); + + // Seek back to start, re-read, verify consistency. + f.seek(SeekFrom::Start(0)).expect("seek to start"); + let mut full = Vec::new(); + f.read_to_end(&mut full).expect("read_to_end"); + assert_eq!(full, expected, "read_to_end mismatch"); + + println!("OK"); + } + } +} diff --git a/tests/test_cases/src/test_root_disk_remount.rs b/tests/test_cases/src/test_root_disk_remount.rs new file mode 100644 index 000000000..d0dfaf567 --- /dev/null +++ b/tests/test_cases/src/test_root_disk_remount.rs @@ -0,0 +1,124 @@ +// Test that krun_set_root_disk_remount works with NullFs. +// +// Creates a tiny ext4 disk image containing only the guest-agent binary, +// boots from it via krun_set_root_disk_remount (which uses NullFs for the +// initial virtiofs root with init.krun overlaid), and verifies the guest +// successfully pivoted to the block device root. + +use macros::{guest, host}; + +pub struct TestRootDiskRemount; + +#[host] +mod host { + use super::*; + + use crate::{krun_call, krun_call_u32}; + use crate::{Test, TestSetup}; + use krun_sys::*; + use std::ffi::CString; + use std::process::Command; + use std::ptr::null; + + fn create_disk_image(guest_agent_path: &str, output_path: &str) { + // Populate from a staging directory using mke2fs -d (no root needed). 
+ let staging = format!("{output_path}.staging"); + std::fs::create_dir_all(&staging).expect("mkdir staging"); + + std::fs::copy(guest_agent_path, format!("{staging}/guest-agent")) + .expect("copy guest-agent"); + + // Marker file to verify the guest booted from the block device. + std::fs::write( + format!("{staging}/block-marker"), + "booted-from-block-device", + ) + .expect("write marker"); + + let status = Command::new("mke2fs") + .args(["-q", "-t", "ext4", "-d", &staging, output_path, "32M"]) + .status() + .expect("mke2fs failed"); + assert!(status.success(), "mke2fs failed"); + + std::fs::remove_dir_all(&staging).expect("cleanup staging"); + } + + impl Test for TestRootDiskRemount { + fn start_vm(self: Box, test_setup: TestSetup) -> anyhow::Result<()> { + let guest_agent_path = std::env::var("KRUN_TEST_GUEST_AGENT_PATH") + .expect("KRUN_TEST_GUEST_AGENT_PATH not set"); + + let disk_path = format!("{}/rootfs.ext4", test_setup.tmp_dir.display()); + create_disk_image(&guest_agent_path, &disk_path); + + let c_disk_path = CString::new(disk_path)?; + let test_case = CString::new(test_setup.test_case)?; + + unsafe { + krun_call!(krun_set_log_level(KRUN_LOG_LEVEL_TRACE))?; + let ctx = krun_call_u32!(krun_create_ctx())?; + krun_call!(krun_set_vm_config(ctx, 1, 512))?; + + // Set up the exec path for the guest-agent on the block + // device root. + let argv = [test_case.as_ptr(), null()]; + let envp = [null()]; + krun_call!(krun_set_exec( + ctx, + c"/guest-agent".as_ptr(), + argv.as_ptr(), + envp.as_ptr(), + ))?; + + krun_call!(krun_set_workdir(ctx, c"/".as_ptr()))?; + + // Add a block device with the ext4 image. + krun_call!(krun_add_disk( + ctx, + c"vda".as_ptr(), + c_disk_path.as_ptr(), + false, // not read-only + ))?; + + // Configure block device as root, pivot from NullFs. 
+ krun_call!(krun_set_root_disk_remount( + ctx, + c"/dev/vda".as_ptr(), + c"ext4".as_ptr(), + std::ptr::null(), // no mount options + ))?; + + krun_call!(krun_start_enter(ctx))?; + } + Ok(()) + } + } +} + +#[guest] +mod guest { + use super::*; + use crate::Test; + use std::fs; + use std::path::Path; + + impl Test for TestRootDiskRemount { + fn in_guest(self: Box) { + // Verify we're running from the block device root. + let marker = fs::read_to_string("/block-marker") + .expect("Failed to read /block-marker — not on block device root?"); + assert_eq!(marker, "booted-from-block-device"); + + // The init.krun virtual file should be gone (one-shot, and we + // pivoted away from the NullFs root anyway). + assert!(!Path::new("/init.krun").exists()); + + // /proc and /dev should be mounted (init re-mounts after pivot). + assert!(Path::new("/proc/self").exists(), "/proc/self missing"); + assert!(Path::new("/dev/null").exists(), "/dev/null missing"); + + println!("OK"); + } + } +}