From 4883e4585273a6b1adca34887abf50cc2154385f Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 22 Jun 2026 20:41:12 -0700 Subject: [PATCH] nvidia: skip services when hardware is absent --- .../images/dstack-rootfs-nvidia.inc | 1 + ...idia-fabricmanager-nvswitch-condition.conf | 6 +++ .../nvidia/files/nvidia-gpu-detect | 51 +++++++++++++++++++ .../nvidia/files/nvidia-persistenced.service | 3 ++ .../nvidia/nvidia-fabricmanager_%.bbappend | 15 ++++++ .../nvidia/nvidia-gpu-detect_1.0.bb | 16 ++++++ .../nvidia/nvidia-persistenced_1.0.bb | 2 + 7 files changed, 94 insertions(+) create mode 100644 meta-nvidia/recipes-graphics/nvidia/files/nvidia-fabricmanager-nvswitch-condition.conf create mode 100755 meta-nvidia/recipes-graphics/nvidia/files/nvidia-gpu-detect create mode 100644 meta-nvidia/recipes-graphics/nvidia/nvidia-fabricmanager_%.bbappend create mode 100644 meta-nvidia/recipes-graphics/nvidia/nvidia-gpu-detect_1.0.bb diff --git a/meta-dstack/recipes-core/images/dstack-rootfs-nvidia.inc b/meta-dstack/recipes-core/images/dstack-rootfs-nvidia.inc index 2b2355b..914084e 100644 --- a/meta-dstack/recipes-core/images/dstack-rootfs-nvidia.inc +++ b/meta-dstack/recipes-core/images/dstack-rootfs-nvidia.inc @@ -1,6 +1,7 @@ NVIDIA_GROUP = "acpid \ nvidia \ nvidia-firmware \ + nvidia-gpu-detect \ nvidia-persistenced \ nvidia-fabricmanager \ libnvidia-nscq \ diff --git a/meta-nvidia/recipes-graphics/nvidia/files/nvidia-fabricmanager-nvswitch-condition.conf b/meta-nvidia/recipes-graphics/nvidia/files/nvidia-fabricmanager-nvswitch-condition.conf new file mode 100644 index 0000000..61d490a --- /dev/null +++ b/meta-nvidia/recipes-graphics/nvidia/files/nvidia-fabricmanager-nvswitch-condition.conf @@ -0,0 +1,6 @@ +# Skip the fabric manager cleanly on hosts without NVSwitch (i.e. any non +# HGX/DGX instance, including GPU instances that have no NVSwitch) instead of +# letting it fail. ExecCondition exit 1 -> systemd marks the unit as skipped, +# not failed. 
+[Service]
+ExecCondition=/usr/bin/nvidia-gpu-detect nvswitch
diff --git a/meta-nvidia/recipes-graphics/nvidia/files/nvidia-gpu-detect b/meta-nvidia/recipes-graphics/nvidia/files/nvidia-gpu-detect
new file mode 100755
index 0000000..1fdb6d2
--- /dev/null
+++ b/meta-nvidia/recipes-graphics/nvidia/files/nvidia-gpu-detect
@@ -0,0 +1,51 @@
+#!/bin/sh
+# nvidia-gpu-detect - report NVIDIA GPU / NVSwitch presence via sysfs PCI.
+#
+# Intended for use as a systemd ExecCondition= so that GPU-only services skip
+# cleanly (instead of failing) on instances without a GPU / without NVSwitch.
+# This lets the NVIDIA and plain images be merged into a single image.
+#
+#   exit 0 -> hardware present (systemd runs the unit)
+#   exit 1 -> hardware absent (systemd skips the unit, no failure)
+#
+# Detection mirrors scripts/bin/enable_vfio_passthrough.sh, but reads PCI class
+# codes from sysfs so it works before the nvidia driver is loaded and without
+# relying on lspci / the pci.ids name database:
+#
+#   GPU = NVIDIA (0x10de) VGA (0x0300xx) or 3D controller (0x0302xx)
+#   NVSwitch = NVIDIA (0x10de) bridge (0x06xxxx, any bridge subclass)
+#
+# NVSwitch matches any NVIDIA bridge-class device (mirrors the `grep 'Bridge'`
+# heuristic in scripts/bin/enable_vfio_passthrough.sh) rather than only the
+# "Other bridge" subclass 0x0680, so it stays correct if a future NVSwitch
+# enumerates under a different bridge subclass. This is safe here because these
+# images run as TDX guests whose only NVIDIA-vendor devices are passed-through
+# GPUs (class 0x03xx) and NVSwitches (class 0x06xx).
+
+NVIDIA_VENDOR="0x10de"  # compared as text against each device's sysfs "vendor" file
+
+# match_vendor_class <class-glob>: succeed if any PCI device has NVIDIA's
+# vendor id and a class matching the given glob (e.g. "0x0302*").
+match_vendor_class() {
+    for dev in /sys/bus/pci/devices/*; do
+        [ -r "$dev/vendor" ] && [ -r "$dev/class" ] || continue  # skip entries without readable vendor/class files
+        [ "$(cat "$dev/vendor")" = "$NVIDIA_VENDOR" ] || continue  # skip devices whose vendor id is not NVIDIA's
+        case "$(cat "$dev/class")" in
+            $1) return 0 ;;  # $1 is left unquoted so it is matched as a glob pattern
+        esac
+    done
+    return 1  # no device matched both the vendor id and the class glob
+}
+
+case "${1:-gpu}" in
+    gpu)  # also the default mode when no argument is given
+        match_vendor_class '0x0300*' || match_vendor_class '0x0302*'
+        ;;
+    nvswitch)
+        match_vendor_class '0x06*'
+        ;;
+    *)
+        echo "usage: ${0##*/} {gpu|nvswitch}" >&2
+        exit 64  # NOTE(review): systemd documents ExecCondition= exit codes 1-254 as "skip", so a bad argument skips the unit instead of failing it
+        ;;
+esac  # nothing follows, so the script exits with the status of the selected branch
diff --git a/meta-nvidia/recipes-graphics/nvidia/files/nvidia-persistenced.service b/meta-nvidia/recipes-graphics/nvidia/files/nvidia-persistenced.service
index 07e956b..3960c51 100644
--- a/meta-nvidia/recipes-graphics/nvidia/files/nvidia-persistenced.service
+++ b/meta-nvidia/recipes-graphics/nvidia/files/nvidia-persistenced.service
@@ -6,6 +6,9 @@ After=systemd-modules-load.service modprobe@.service
 
 [Service]
 Type=oneshot
+# Skip cleanly on instances without an NVIDIA GPU (exit 1 -> unit skipped, not
+# failed), so the NVIDIA image can run on GPU-less hosts.
+ExecCondition=/usr/bin/nvidia-gpu-detect gpu
 ExecStart=/usr/bin/env nvidia-persistenced --uvm-persistence-mode
 ExecStartPost=/usr/bin/nvidia-smi conf-compute -srs 1
 RemainAfterExit=yes
diff --git a/meta-nvidia/recipes-graphics/nvidia/nvidia-fabricmanager_%.bbappend b/meta-nvidia/recipes-graphics/nvidia/nvidia-fabricmanager_%.bbappend
new file mode 100644
index 0000000..8ae5bd8
--- /dev/null
+++ b/meta-nvidia/recipes-graphics/nvidia/nvidia-fabricmanager_%.bbappend
@@ -0,0 +1,15 @@
+FILESEXTRAPATHS:prepend := "${THISDIR}/files:"
+
+# Only start the fabric manager when NVSwitch hardware is present, so the
+# service is silently skipped (not failed) on non-NVSwitch instances.
+SRC_URI += "file://nvidia-fabricmanager-nvswitch-condition.conf"
+
+RDEPENDS:${PN} += "nvidia-gpu-detect"
+
+do_install:append() {
+    install -d ${D}${systemd_system_unitdir}/nvidia-fabricmanager.service.d  # drop-in directory for nvidia-fabricmanager.service
+    install -m 0644 ${UNPACKDIR}/nvidia-fabricmanager-nvswitch-condition.conf \
+        ${D}${systemd_system_unitdir}/nvidia-fabricmanager.service.d/10-nvswitch-condition.conf
+}
+
+FILES:${PN} += "${systemd_system_unitdir}/nvidia-fabricmanager.service.d/10-nvswitch-condition.conf"
diff --git a/meta-nvidia/recipes-graphics/nvidia/nvidia-gpu-detect_1.0.bb b/meta-nvidia/recipes-graphics/nvidia/nvidia-gpu-detect_1.0.bb
new file mode 100644
index 0000000..036aa81
--- /dev/null
+++ b/meta-nvidia/recipes-graphics/nvidia/nvidia-gpu-detect_1.0.bb
@@ -0,0 +1,16 @@
+SUMMARY = "Detect NVIDIA GPU / NVSwitch presence for conditional systemd services"
+DESCRIPTION = "Small sysfs-based helper used as a systemd ExecCondition= so that \
+GPU-only services (nvidia-persistenced, nvidia-fabricmanager) skip cleanly on \
+instances without a GPU or without NVSwitch, allowing a single merged image."
+LICENSE = "CLOSED"
+
+SRC_URI = "file://nvidia-gpu-detect"
+
+S = "${UNPACKDIR}"
+
+do_install() {
+    install -d ${D}${bindir}
+    install -m 0755 ${UNPACKDIR}/nvidia-gpu-detect ${D}${bindir}/nvidia-gpu-detect  # executable helper; units call it as /usr/bin/nvidia-gpu-detect
+}
+
+FILES:${PN} = "${bindir}/nvidia-gpu-detect"
diff --git a/meta-nvidia/recipes-graphics/nvidia/nvidia-persistenced_1.0.bb b/meta-nvidia/recipes-graphics/nvidia/nvidia-persistenced_1.0.bb
index 0c3ee94..e7026f3 100644
--- a/meta-nvidia/recipes-graphics/nvidia/nvidia-persistenced_1.0.bb
+++ b/meta-nvidia/recipes-graphics/nvidia/nvidia-persistenced_1.0.bb
@@ -9,6 +9,8 @@ S = "${UNPACKDIR}"
 
 inherit systemd
 
+RDEPENDS:${PN} += "nvidia-gpu-detect"
+
 SYSTEMD_PACKAGES = "${PN}"
 SYSTEMD_SERVICE:${PN} = "nvidia-persistenced.service"
 SYSTEMD_AUTO_ENABLE:${PN} = "enable"