Let an LXD container automatically use the host machine's NVIDIA driver and CUDA environment
该摘要展示了在LXD容器中配置和验证NVIDIA GPU的过程。用户首先检查了容器配置,设置了NVIDIA运行时(nvidia.runtime),并在尝试按PCI地址添加GPU设备时遇到验证错误。通过nvidia-smi确认宿主机已安装NVIDIA Titan RTX显卡(24GB显存),驱动版本为575.64.03,支持CUDA 12.9。经过多次调试后,在不指定id的情况下成功添加GPU设备,并验证容器内可正常访问GPU资源。最后在conda的torch环境中确认PyTorch 2.8.0能够正常识别并使用该GPU。
(torch) d437@d437-MS-7C71:~$ lxc config show gpu-base
architecture: x86_64
config:
  image.architecture: amd64
  image.description: ubuntu 22.04 LTS amd64 (release) (20250725)
  image.label: release
  image.os: ubuntu
  image.release: jammy
  image.serial: "20250725"
  image.type: squashfs
  image.version: "22.04"
  security.nesting: "true"
  volatile.base_image: 52b1b7d517ed140528dde80adca9a1b4bde7724a92127278a24918536b1a6b9b
  volatile.cloud-init.instance-id: 7d1d05fe-f303-4c5a-b2d6-f2f5c722743a
  volatile.eth0.host_name: vethfce604a4
  volatile.eth0.hwaddr: 00:16:3e:41:e7:89
  volatile.idmap.base: "0"
  volatile.idmap.current: '[{"Isuid":true,"Isgid":false,"Hostid":1000000,"Nsid":0,"Maprange":1000000000},{"Isuid":false,"Isgid":true,"Hostid":1000000,"Nsid":0,"Maprange":1000000000}]'
  volatile.idmap.next: '[{"Isuid":true,"Isgid":false,"Hostid":1000000,"Nsid":0,"Maprange":1000000000},{"Isuid":false,"Isgid":true,"Hostid":1000000,"Nsid":0,"Maprange":1000000000}]'
  volatile.last_state.idmap: '[]'
  volatile.last_state.power: RUNNING
  volatile.last_state.ready: "false"
  volatile.uuid: 93570466-38fd-4619-9610-4ff47860d5f6
  volatile.uuid.generation: 93570466-38fd-4619-9610-4ff47860d5f6
devices:
  gpu:
    type: gpu
  libnvidia-ml-575:
    path: /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.575.64.03
    source: /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.575.64.03
    type: disk
  nvidia-smi-bin:
    path: /usr/bin/nvidia-smi
    source: /usr/bin/nvidia-smi
    type: disk
  nvidia-uvm:
    path: /dev/nvidia-uvm
    source: /dev/nvidia-uvm
    type: unix-char
  nvidiactl:
    path: /dev/nvidiactl
    source: /dev/nvidiactl
    type: unix-char
ephemeral: false
profiles:
- default
stateful: false
description: ""
(torch) d437@d437-MS-7C71:~$ lxc exec gpu-base -- ls /usr/lib/libcuda.so*
ls: cannot access '/usr/lib/libcuda.so*': No such file or directory
(torch) d437@d437-MS-7C71:~$ lxc config set gpu-base nvidia.runtime true
(torch) d437@d437-MS-7C71:~$ lxc exec gpu-base -- ls /usr/lib/libcuda.so*
ls: cannot access '/usr/lib/libcuda.so*': No such file or directory
(torch) d437@d437-MS-7C71:~$ ls -l /usr/lib/x86_64-linux-gnu/libcuda.so*
lrwxrwxrwx 1 root root 12 Jul 2 18:56 /usr/lib/x86_64-linux-gnu/libcuda.so -> libcuda.so.1
lrwxrwxrwx 1 root root 20 Jul 2 18:56 /usr/lib/x86_64-linux-gnu/libcuda.so.1 -> libcuda.so.575.64.03
-rw-r--r-- 1 root root 92316728 Jul 2 18:56 /usr/lib/x86_64-linux-gnu/libcuda.so.575.64.03
(torch) d437@d437-MS-7C71:~$ lxc config device remove gpu-base gpu
Device gpu removed from gpu-base
(torch) d437@d437-MS-7C71:~$ lxc config device add gpu-base gpu0 gpu gputype=physical id=00000000:01:00.0
Error: Invalid devices: Device validation failed for "gpu0": ID must be integer DRM ID or CDI ID: Invalid CDI ID: unqualified device "00000000:01:00.0", missing vendor
(torch) d437@d437-MS-7C71:~$ nvidia-smi
Sun Aug 17 11:44:43 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.64.03 Driver Version: 575.64.03 CUDA Version: 12.9 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA TITAN RTX Off | 00000000:01:00.0 Off | N/A |
| 41% 37C P8 26W / 280W | 281MiB / 24576MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 2064 G /usr/lib/xorg/Xorg 239MiB |
| 0 N/A N/A 2374 G /usr/bin/gnome-shell 9MiB |
| 0 N/A N/A 3310 G ...exec/xdg-desktop-portal-gnome 21MiB |
+-----------------------------------------------------------------------------------------+
(torch) d437@d437-MS-7C71:~$ lxc config device add gpu-base gpu0 gpu gputype=physical id=00000000:01:00.0
Error: Invalid devices: Device validation failed for "gpu0": ID must be integer DRM ID or CDI ID: Invalid CDI ID: unqualified device "00000000:01:00.0", missing vendor
(torch) d437@d437-MS-7C71:~$ lxd --version
6.4
(torch) d437@d437-MS-7C71:~$ lxc config set gpu-base nvidia.runtime true
(torch) d437@d437-MS-7C71:~$ lxc config device remove gpu-base gpu
Error: Device doesn't exist
(torch) d437@d437-MS-7C71:~$ lxc config device add gpu-base gpu0 gpu gputype=physical
Device gpu0 added to gpu-base
(torch) d437@d437-MS-7C71:~$ lxc restart gpu-base
(torch) d437@d437-MS-7C71:~$ lxc exec gpu-base -- nvidia-smi
Sun Aug 17 03:47:11 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 575.64.03 Driver Version: 575.64.03 CUDA Version: 12.9 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA TITAN RTX Off | 00000000:01:00.0 Off | N/A |
| 40% 37C P8 26W / 280W | 286MiB / 24576MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
+-----------------------------------------------------------------------------------------+
(torch) d437@d437-MS-7C71:~$ lxc exec gpu-base -- ls -l /dev/nvidia*
ls: cannot access '/dev/nvidia-modeset': No such file or directory
crw-rw-rw- 1 nobody nogroup 511, 0 Aug 15 13:39 /dev/nvidia-uvm
crw-rw-rw- 1 nobody nogroup 511, 1 Aug 15 13:39 /dev/nvidia-uvm-tools
crw-rw-rw- 1 root root 195, 0 Aug 17 03:46 /dev/nvidia0
crw-rw-rw- 1 nobody nogroup 195, 255 Aug 15 13:39 /dev/nvidiactl
(torch) d437@d437-MS-7C71:~$ lxc exec gpu-base -- ls -l /usr/lib/x86_64-linux-gnu/libcuda.so*
lrwxrwxrwx 1 root root 12 Jun 6 14:46 /usr/lib/x86_64-linux-gnu/libcuda.so -> libcuda.so.1
lrwxrwxrwx 1 root root 20 Aug 17 03:47 /usr/lib/x86_64-linux-gnu/libcuda.so.1 -> libcuda.so.575.64.03
-rw-r--r-- 1 nobody nogroup 92316728 Jul 2 10:56 /usr/lib/x86_64-linux-gnu/libcuda.so.575.64.03
(torch) d437@d437-MS-7C71:~$ lxc exec gpu-base -- bash
conda activate torch
python3 -c "import torch; print(torch.__version__); print(torch.cuda.is_available()); print(torch.cuda.device_count()); print(torch.cuda.get_device_name(0))"
(base) root@gpu-base:~# conda activate torch
(torch) root@gpu-base:~# python3 -c "import torch; print(torch.__version__); print(torch.cuda.is_available()); print(torch.cuda.device_count()); print(torch.cuda.get_device_name(0))"
2.8.0+cu129
True
1
NVIDIA TITAN RTX
更多推荐

所有评论(0)