NVIDIA / go-nvml

Go Bindings for the NVIDIA Management Library (NVML)

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Getting the wrong MIG count per GPU Index

anaconda2196 opened this issue · comments

Hello @klueska @elezar

I am getting the wrong MIG count per GPU Index.

Sample Program:

package main

import (
	"fmt"
	"log"
	"strconv"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

func main() {
	ret := nvml.Init()
	if ret != nvml.SUCCESS {
		log.Fatalf("Unable to initialize NVML: %v", nvml.ErrorString(ret))
	}
	defer func() {
		ret := nvml.Shutdown()
		if ret != nvml.SUCCESS {
			log.Fatalf("Unable to shutdown NVML: %v", nvml.ErrorString(ret))
		}
	}()

	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		log.Fatalf("Unable to get device count: %v", nvml.ErrorString(ret))
	}

	fmt.Println("Total GPU COUNT", count)

	for i := 0; i < count; i++ {
		device, ret := nvml.DeviceGetHandleByIndex(i)
		gpuindex := i

		if ret != nvml.SUCCESS {
			log.Fatalf("Unable to get device at index %d: %v", i, nvml.ErrorString(ret))
		}

		uuid, ret := device.GetUUID()
		if ret != nvml.SUCCESS {
			log.Fatalf("Unable to get uuid of device at index %d: %v", i, nvml.ErrorString(ret))
		}

		fmt.Printf("%v\n", uuid)

		// MIG Device Count per GPU Index
		migcount, ret := nvml.DeviceGetMaxMigDeviceCount(device)
		if ret != nvml.SUCCESS {
			log.Fatalln("Unable to get mig device count for GPU ", i, nvml.ErrorString(ret))
		}

		fmt.Println("MIG COUNT", migcount)

		for i := 0; i < migcount; i++ {

			migdevice, ret := nvml.DeviceGetMigDeviceHandleByIndex(device, i)

			if ret != nvml.SUCCESS {
				log.Fatalln("Unable to get mig gpu instance id for MIG index ", i, "of GPU Index", gpuindex, nvml.ErrorString(ret))
			}

			id, _ := migdevice.GetGpuInstanceId()
			migid := strconv.Itoa(id)

			uuid, ret := migdevice.GetUUID()
			if ret != nvml.SUCCESS {
				log.Fatalln("Unable to get miguuid for MIG index ", i, "of GPU Index ", gpuindex, nvml.ErrorString(ret))
			}

			miguuid := string(uuid)
			fmt.Printf("MIG ID %v\n", migid)
			fmt.Printf("MIG UUID %v\n", miguuid)
		}
	}

}

GPU - A100 (MIG enabled but no MIG devices are created)
-> Wrong output, should be MIG Count - 0

go run nvml.go 
Total GPU COUNT 1
GPU-3b722aa5-db24-5b93-84c8-e4e0efcef078
MIG COUNT 7
2023/04/18 18:54:44 Unable to get mig gpu instance id for MIG index  0 of GPU Index 0 Not Found
exit status 1


~ # nvidia-smi -L
GPU 0: NVIDIA A100-PCIE-40GB (UUID: GPU-3b722aa5-db24-5b93-84c8-e4e0efcef078)

~ # nvidia-smi
Tue Apr 18 18:55:05 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA A100-PCI...  Off  | 00000000:86:00.0 Off |                   On |
| N/A   29C    P0    37W / 250W |      0MiB / 40960MiB |     N/A      Default |
|                               |                      |              Enabled |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| MIG devices:                                                                |
+------------------+----------------------+-----------+-----------------------+
| GPU  GI  CI  MIG |         Memory-Usage |        Vol|         Shared        |
|      ID  ID  Dev |           BAR1-Usage | SM     Unc| CE  ENC  DEC  OFA  JPG|
|                  |                      |        ECC|                       |
|==================+======================+===========+=======================|
|  No MIG devices found                                                       |
+-----------------------------------------------------------------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

GPU - A100 (MIG enabled - Mix Strategy)
-> Wrong output, should be MIG Count - 3

# go run nvml.go 
Total GPU COUNT 1
GPU-3b722aa5-db24-5b93-84c8-e4e0efcef078
MIG COUNT 7
MIG ID 2
MIG UUID MIG-23de08f7-6fcc-5be3-9094-cc3b4162f0bb
MIG ID 3
MIG UUID MIG-9d822d55-dc61-5acb-8c24-544828b47b03
MIG ID 9
MIG UUID MIG-5f16849a-8ae9-571a-bc2c-486916c4a324
2023/04/18 18:55:49 Unable to get mig gpu instance id for MIG index  3 of GPU Index 0 Not Found
exit status 1

~ # nvidia-smi -L
GPU 0: NVIDIA A100-PCIE-40GB (UUID: GPU-3b722aa5-db24-5b93-84c8-e4e0efcef078)
  MIG 3g.20gb     Device  0: (UUID: MIG-23de08f7-6fcc-5be3-9094-cc3b4162f0bb)
  MIG 2g.10gb     Device  1: (UUID: MIG-9d822d55-dc61-5acb-8c24-544828b47b03)
  MIG 1g.5gb      Device  2: (UUID: MIG-5f16849a-8ae9-571a-bc2c-486916c4a324)


 # nvidia-smi
Tue Apr 18 18:56:11 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA A100-PCI...  Off  | 00000000:86:00.0 Off |                   On |
| N/A   29C    P0    39W / 250W |     39MiB / 40960MiB |     N/A      Default |
|                               |                      |              Enabled |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| MIG devices:                                                                |
+------------------+----------------------+-----------+-----------------------+
| GPU  GI  CI  MIG |         Memory-Usage |        Vol|         Shared        |
|      ID  ID  Dev |           BAR1-Usage | SM     Unc| CE  ENC  DEC  OFA  JPG|
|                  |                      |        ECC|                       |
|==================+======================+===========+=======================|
|  0    2   0   0  |     19MiB / 19968MiB | 42      0 |  3   0    2    0    0 |
|                  |      0MiB / 32767MiB |           |                       |
+------------------+----------------------+-----------+-----------------------+
|  0    3   0   1  |     13MiB /  9856MiB | 28      0 |  2   0    1    0    0 |
|                  |      0MiB / 16383MiB |           |                       |
+------------------+----------------------+-----------+-----------------------+
|  0    9   0   2  |      6MiB /  4864MiB | 14      0 |  1   0    0    0    0 |
|                  |      0MiB /  8191MiB |           |                       |
+------------------+----------------------+-----------+-----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+


GPU - A100 (MIG disabled)
-> Correct output

nvidia-smi -mig 0
Disabled MIG Mode for GPU 00000000:86:00.0

Warning: persistence mode is disabled on device 00000000:86:00.0. See the Known Issues section of the nvidia-smi(1) man page for more information. Run with [--help | -h] switch to get more information on how to enable persistence mode.
All done.

 # nvidia-smi -r
GPU 00000000:86:00.0 was successfully reset.
All done.

 # go run nvml.go 
Total GPU COUNT 1
GPU-3b722aa5-db24-5b93-84c8-e4e0efcef078
MIG COUNT 0

 # nvidia-smi -L
GPU 0: NVIDIA A100-PCIE-40GB (UUID: GPU-3b722aa5-db24-5b93-84c8-e4e0efcef078)

 # nvidia-smi
Tue Apr 18 19:04:13 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA A100-PCI...  Off  | 00000000:86:00.0 Off |                    0 |
| N/A   29C    P0    34W / 250W |      0MiB / 40960MiB |      4%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+


Any help would be greatly appreciated

Hi @anaconda2196. From the nvml.h docs:

/**
 * Get the maximum number of MIG devices that can exist under a given parent NVML device.
 *
 * Returns zero if MIG is not supported or enabled.
 *
 * For Ampere &tm; or newer fully supported devices.
 * Supported on Linux only.
 *
 * @param device                               Target device handle
 * @param count                                Count of MIG devices
 *
 * @return
 *         - \ref NVML_SUCCESS                 if \a count was successfully retrieved
 *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
 *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a count reference is invalid
 *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
 */
nvmlReturn_t DECLDIR nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device, unsigned int *count);

This means that the MIG count being returned is correct. This is the maximum number of possible MIG devices and not the number of MIG devices actually allocated. For a GPU with MIG disabled this is 0.

To get the number of actual MIG devices you need to iterate over possible MIG devices and call DeviceGetMigDeviceHandleByIndex for each of these. The indices that return success would be valid MIG devices.

From the docs:

/**
 * Get MIG device handle for the given index under its parent NVML device.
 *
 * If the compute instance is destroyed either explicitly or by destroying,
 * resetting or unbinding the parent GPU instance or the GPU device itself
 * the MIG device handle would remain invalid and must be requested again
 * using this API. Handles may be reused and their properties can change in
 * the process.
 *
 * For Ampere &tm; or newer fully supported devices.
 * Supported on Linux only.
 *
 * @param device                               Reference to the parent GPU device handle
 * @param index                                Index of the MIG device
 * @param migDevice                            Reference to the MIG device handle
 *
 * @return
 *         - \ref NVML_SUCCESS                 if \a migDevice handle was successfully created
 *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
 *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a index or \a migDevice reference is invalid
 *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
 *         - \ref NVML_ERROR_NOT_FOUND         if no valid MIG device was found at \a index
 *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
 */
nvmlReturn_t DECLDIR nvmlDeviceGetMigDeviceHandleByIndex(nvmlDevice_t device, unsigned int index,
                                                         nvmlDevice_t *migDevice);

A return value of NVML_ERROR_NOT_FOUND would indicate an invalid index, for example.

@klueska @elezar Oh okay got it. Thanks for sharing this.

Could you please tell me which exact function I should call to get the number of actual MIG devices created per GPU index?

I would like to first get the total MIG device count per GPU index, and then later use DeviceGetMigDeviceHandleByIndex to get the MIG ID and MIG UUID as per the above sample program.

@anaconda2196 there is no specific function to get the number of MIG devices. If you need to get the available MIG devices you would need to:

  1. For each GPU with MIG enabled
    1. Get the maximum number of MIG devices
    2. Iterate for this many devices and check whether this is a valid device index (see the link that @klueska supplied)
    3. For valid devices store the deviceIndex-migIndex tuple (or increase a counter).