topology_inspector.py
Inspect your GPU topology and understand communication paths
This script examines your GPU setup and reports on NVLink connections, PCIe topology, and NUMA affinity.
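If you prefer to query similar information from Python rather than parsing nvidia-smi output, the NVML bindings expose much of it directly. Below is a minimal sketch, assuming the nvidia-ml-py (pynvml) package is installed; note that the common-ancestor query describes PCIe proximity only, while NVLink is what nvidia-smi topo -m shows as NV#.

# Hypothetical sketch using the NVML Python bindings (pip install nvidia-ml-py).
# nvmlDeviceGetTopologyCommonAncestor reports PCIe proximity (lower = closer);
# it does not cover NVLink.
import pynvml

pynvml.nvmlInit()
try:
    count = pynvml.nvmlDeviceGetCount()
    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(count)]
    for i, h in enumerate(handles):
        print(f"GPU {i}: {pynvml.nvmlDeviceGetName(h)}")
    for i in range(count):
        for j in range(i + 1, count):
            level = pynvml.nvmlDeviceGetTopologyCommonAncestor(handles[i], handles[j])
            print(f"GPU {i} <-> GPU {j}: PCIe ancestor level {level}")
finally:
    pynvml.nvmlShutdown()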
What It Does
- Detects available GPUs and their properties
- Identifies NVLink connections between GPU pairs
- Maps PCIe topology (bridges, switches)
- Shows CPU/NUMA affinity for each GPU
- Suggests optimal process placement
Run It
python tutorial/part1-distributed/chapter04-nccl-topology/scripts/topology_inspector.py
Example Output (8-GPU DGX)
=== GPU Topology Inspector ===
Found 8 GPUs:
GPU 0: NVIDIA A100-SXM4-80GB
GPU 1: NVIDIA A100-SXM4-80GB
...
NVLink Connections:
GPU 0 <--NV12--> GPU 1
GPU 0 <--NV12--> GPU 2
GPU 0 <--NV12--> GPU 3
...
PCIe Topology:
GPU 0-3: Same PCIe switch (fast)
GPU 4-7: Same PCIe switch (fast)
GPU 0-4: Cross-switch (slower)
NUMA Affinity:
GPU 0-3: NUMA node 0 (CPUs 0-31)
GPU 4-7: NUMA node 1 (CPUs 32-63)
Recommendations:
- For 4-GPU jobs, use GPUs 0-3 or 4-7 (same switch)
- For 8-GPU jobs, expect ~10% overhead from cross-switch communication
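One way to act on the NUMA affinity report is to pin each rank's CPU affinity to its GPU's local NUMA node before spawning data-loader workers. A minimal sketch, assuming the example mapping above (GPUs 0-3 on CPUs 0-31, GPUs 4-7 on CPUs 32-63) and a torchrun-style LOCAL_RANK environment variable; the CPU ranges are illustrative and should be taken from your own topology report.

# Hypothetical sketch: bind this rank to the CPUs local to its GPU (Linux only).
import os

LOCAL_RANK = int(os.environ.get("LOCAL_RANK", "0"))   # set by torchrun
NUMA_CPUS = {0: range(0, 32), 1: range(32, 64)}       # example mapping from the output above

numa_node = 0 if LOCAL_RANK < 4 else 1                # GPUs 0-3 -> node 0, GPUs 4-7 -> node 1
os.sched_setaffinity(0, NUMA_CPUS[numa_node])         # pid 0 = current process
print(f"rank {LOCAL_RANK}: pinned to NUMA node {numa_node}")

Data-loader worker processes inherit this affinity, which avoids cross-socket memory traffic when staging batches for the GPU.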
Why This Matters
Understanding topology helps you:
- Place processes optimally - Keep heavily communicating processes on the fastest interconnects
- Debug performance issues - Unexpectedly slow? Check whether traffic is going over PCIe instead of NVLink
- Choose a parallelism strategy - Tensor parallelism is the most communication-intensive, so keep it within an NVLink-connected group (see the sketch after this list)
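For instance, with the GPU 0-3 / 4-7 grouping from the example output (same PCIe switch and NUMA node), you might keep tensor parallelism inside one island and let data parallelism cross the slower link. A minimal sketch with torch.distributed, assuming 8 ranks launched via torchrun; the group layout is an assumption taken from the example topology, not something this script configures for you.

# Hypothetical sketch: build process groups that follow the topology islands.
import os
import torch
import torch.distributed as dist

dist.init_process_group(backend="nccl")               # env:// rendezvous from torchrun
rank = dist.get_rank()
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

# Tensor-parallel groups stay inside an island (heavy, latency-sensitive traffic).
# new_group must be called on every rank, in the same order, even for groups
# the rank does not belong to.
tp_groups = [dist.new_group(ranks=[0, 1, 2, 3]),
             dist.new_group(ranks=[4, 5, 6, 7])]
tp_group = tp_groups[0] if rank < 4 else tp_groups[1]

# Data-parallel groups cross the slower link (bulk gradient traffic that can
# be overlapped with the backward pass).
dp_groups = [dist.new_group(ranks=[i, i + 4]) for i in range(4)]
dp_group = dp_groups[rank % 4]

Collectives issued on tp_group then stay on the fast interconnect, while dp_group carries the cross-island traffic.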
Source Code
#!/usr/bin/env python3
"""
GPU Topology Inspector
This script inspects your GPU topology and provides insights about:
- Number of GPUs and their properties
- NVLink connections between GPUs
- PCIe topology
- NUMA affinity
- Recommended process placement
Usage:
python topology_inspector.py
Note: This requires NVIDIA GPUs. On systems without GPUs, it will
display a simulated topology for educational purposes.
"""
import os
import subprocess
import sys
from typing import Dict, List, Optional, Tuple
def check_nvidia_smi() -> bool:
    """Check if nvidia-smi is available."""
    try:
        subprocess.run(
            ["nvidia-smi", "--version"],
            capture_output=True,
            check=True
        )
        return True
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False
def get_gpu_count() -> int:
    """Get the number of GPUs."""
    try:
        result = subprocess.run(
            ["nvidia-smi", "-L"],
            capture_output=True,
            text=True,
            check=True
        )
        return len(result.stdout.strip().split('\n'))
    except (subprocess.CalledProcessError, FileNotFoundError):
        return 0
def get_gpu_info() -> List[Dict]:
    """Get detailed GPU information."""
    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=index,name,memory.total,pci.bus_id",
             "--format=csv,noheader,nounits"],
            capture_output=True,
            text=True,
            check=True
        )
        gpus = []
        for line in result.stdout.strip().split('\n'):
            parts = [p.strip() for p in line.split(',')]
            if len(parts) >= 4:
                gpus.append({
                    'index': int(parts[0]),
                    'name': parts[1],
                    'memory_mb': int(parts[2]),
                    'pci_bus': parts[3]
                })
        return gpus
    except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
        return []
def get_topology_matrix() -> Optional[str]:
    """Get the GPU topology matrix from nvidia-smi."""
    try:
        result = subprocess.run(
            ["nvidia-smi", "topo", "-m"],
            capture_output=True,
            text=True,
            check=True
        )
        return result.stdout
    except (subprocess.CalledProcessError, FileNotFoundError):
        return None
def get_nvlink_status() -> Optional[str]:
    """Get NVLink status for GPU 0."""
    try:
        result = subprocess.run(
            ["nvidia-smi", "nvlink", "--status", "-i", "0"],
            capture_output=True,
            text=True,
            check=True
        )
        return result.stdout
    except (subprocess.CalledProcessError, FileNotFoundError):
        return None
def parse_topology_matrix(matrix_str: str) -> Dict[Tuple[int, int], str]:
    """Parse topology matrix into a dict of GPU pairs to connection types."""
    connections = {}
    lines = matrix_str.strip().split('\n')
    # Find the header line with GPU columns
    header_idx = None
    for i, line in enumerate(lines):
        if 'GPU0' in line or 'GPU 0' in line:
            header_idx = i
            break
    if header_idx is None:
        return connections
    # Parse the matrix
    for line in lines[header_idx + 1:]:
        if not line.strip() or 'Legend' in line:
            break
        parts = line.split()
        if not parts or not parts[0].startswith('GPU'):
            continue
        try:
            gpu_from = int(parts[0].replace('GPU', ''))
            for col_idx, conn in enumerate(parts[1:]):
                if conn in ['X', 'NV1', 'NV2', 'NV3', 'NV4', 'NV5', 'NV6',
                            'NV7', 'NV8', 'NV12', 'NV18', 'SYS', 'NODE',
                            'PHB', 'PXB', 'PIX']:
                    connections[(gpu_from, col_idx)] = conn
        except (ValueError, IndexError):
            continue
    return connections
def print_header(title: str) -> None:
    """Print a formatted header."""
    print("\n" + "═" * 60)
    print(f" {title}")
    print("═" * 60)
def print_simulated_topology() -> None:
    """Print a simulated topology for educational purposes."""
    print_header("SIMULATED GPU TOPOLOGY (No GPUs Detected)")
    print("""
This is a simulated DGX-A100 topology for educational purposes.
In a real DGX-A100 with 8 A100 GPUs:

        GPU0  GPU1  GPU2  GPU3  GPU4  GPU5  GPU6  GPU7
  GPU0   X    NV12  NV12  NV12  NV12  NV12  NV12  NV12
  GPU1  NV12   X    NV12  NV12  NV12  NV12  NV12  NV12
  GPU2  NV12  NV12   X    NV12  NV12  NV12  NV12  NV12
  GPU3  NV12  NV12  NV12   X    NV12  NV12  NV12  NV12
  GPU4  NV12  NV12  NV12  NV12   X    NV12  NV12  NV12
  GPU5  NV12  NV12  NV12  NV12  NV12   X    NV12  NV12
  GPU6  NV12  NV12  NV12  NV12  NV12  NV12   X    NV12
  GPU7  NV12  NV12  NV12  NV12  NV12  NV12  NV12   X

Legend:
  X    = Self
  NV#  = Connection traversing a bonded set of # NVLinks
  SYS  = Connection traversing PCIe and the inter-socket interconnect
         between NUMA nodes (slowest)
  NODE = Connection traversing PCIe and the interconnect between PCIe
         host bridges within a NUMA node
  PHB  = Connection traversing PCIe and a PCIe host bridge (the CPU)
  PXB  = Connection traversing multiple PCIe bridges (no host bridge)
  PIX  = Connection traversing at most a single PCIe bridge

Performance implications:
  NV12 (12 NVLinks): ~300 GB/s per direction (~600 GB/s bidirectional)
  SYS:               ~12 GB/s effective (PCIe 4.0 x16 through the CPUs)

This shows why NVLink matters: roughly 25x higher bandwidth!
""")
def analyze_topology(connections: Dict[Tuple[int, int], str], num_gpus: int) -> None:
    """Analyze and report on the topology."""
    print_header("TOPOLOGY ANALYSIS")
    # Count NVLink connections
    nvlink_pairs = []
    pcie_pairs = []
    for (g1, g2), conn in connections.items():
        if g1 < g2:  # Avoid double counting
            if conn.startswith('NV'):
                nvlink_pairs.append((g1, g2, conn))
            elif conn in ['SYS', 'NODE', 'PHB', 'PXB', 'PIX']:
                pcie_pairs.append((g1, g2, conn))
    print(f"\nNVLink Connections ({len(nvlink_pairs)} pairs):")
    if nvlink_pairs:
        for g1, g2, conn in sorted(nvlink_pairs):
            num_links = conn.replace('NV', '')
            print(f"  GPU{g1} <-> GPU{g2}: {conn} ({num_links} links)")
    else:
        print("  None detected")
    print(f"\nPCIe-only Connections ({len(pcie_pairs)} pairs):")
    if pcie_pairs:
        for g1, g2, conn in sorted(pcie_pairs):
            print(f"  GPU{g1} <-> GPU{g2}: {conn}")
    else:
        print("  None (all pairs have NVLink)")
    # Recommendations
    print_header("RECOMMENDATIONS")
    if len(nvlink_pairs) == (num_gpus * (num_gpus - 1)) // 2:
        print("✓ All GPU pairs connected via NVLink")
        print("  → Ideal for all-reduce operations")
        print("  → Can use any GPU grouping")
    elif nvlink_pairs:
        print("⚠ Mixed NVLink/PCIe topology")
        print("  → Group NVLink-connected GPUs together when possible")
        print("  → Use process groups to exploit fast connections")
    else:
        print("⚠ No NVLink detected")
        print("  → Performance will be limited by PCIe bandwidth")
        print("  → Consider gradient accumulation to amortize communication")
def main():
    print("╔" + "═" * 58 + "╗")
    print("║" + " GPU TOPOLOGY INSPECTOR".center(58) + "║")
    print("╚" + "═" * 58 + "╝")

    # Check if we have nvidia-smi
    if not check_nvidia_smi():
        print("\n[INFO] nvidia-smi not found. Showing simulated topology.")
        print_simulated_topology()
        return

    # Get GPU count
    num_gpus = get_gpu_count()
    if num_gpus == 0:
        print("\n[INFO] No NVIDIA GPUs detected. Showing simulated topology.")
        print_simulated_topology()
        return

    # Get GPU information
    print_header("GPU INFORMATION")
    gpus = get_gpu_info()
    for gpu in gpus:
        print(f"\nGPU {gpu['index']}: {gpu['name']}")
        print(f"  Memory: {gpu['memory_mb']} MB")
        print(f"  PCI Bus: {gpu['pci_bus']}")

    # Get topology matrix
    print_header("TOPOLOGY MATRIX")
    topo_matrix = get_topology_matrix()
    if topo_matrix:
        print(topo_matrix)
        connections = parse_topology_matrix(topo_matrix)
        analyze_topology(connections, num_gpus)
    else:
        print("Could not retrieve topology matrix")
    # Get NVLink status for GPU 0 (empty output means no active NVLinks)
    nvlink_status = get_nvlink_status()
    if nvlink_status and nvlink_status.strip():
        print_header("NVLINK STATUS (GPU 0)")
        print(nvlink_status)
    # PyTorch CUDA information
    print_header("PYTORCH CUDA INFO")
    try:
        import torch
        if torch.cuda.is_available():
            print(f"PyTorch version: {torch.__version__}")
            print(f"CUDA version: {torch.version.cuda}")
            print(f"cuDNN version: {torch.backends.cudnn.version()}")
            print(f"GPU count (torch): {torch.cuda.device_count()}")
            for i in range(torch.cuda.device_count()):
                props = torch.cuda.get_device_properties(i)
                print(f"\nGPU {i}: {props.name}")
                print(f"  Compute capability: {props.major}.{props.minor}")
                print(f"  Total memory: {props.total_memory / 1e9:.1f} GB")
                print(f"  Multi-processor count: {props.multi_processor_count}")
        else:
            print("PyTorch CUDA not available")
    except ImportError:
        print("PyTorch not installed")
    # Summary
    print_header("QUICK REFERENCE")
    print("""
NCCL Environment Variables for Tuning:
  NCCL_DEBUG=INFO       Show what NCCL is doing
  NCCL_ALGO=Ring        Force ring algorithm
  NCCL_ALGO=Tree        Force tree algorithm
  NCCL_NTHREADS=256     Set thread count
  NCCL_P2P_DISABLE=1    Disable peer-to-peer (for debugging)

Common Commands:
  nvidia-smi topo -m            Show topology matrix
  nvidia-smi nvlink --status    Show NVLink connections
  nvidia-smi -q -d MEMORY       Show memory usage details
""")
if __name__ == "__main__":
    main()
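As a quick cross-check from PyTorch, you can also ask whether peer-to-peer access is available between device pairs; NCCL relies on P2P for NVLink and same-switch PCIe paths. A minimal sketch; only torch.cuda.can_device_access_peer is assumed here.

# Hypothetical sketch: verify P2P reachability from PyTorch. If a pair that
# nvidia-smi reports as NVLink-connected shows False here, check the driver,
# IOMMU/ACS settings, or whether NCCL_P2P_DISABLE is set in the environment.
import torch

n = torch.cuda.device_count()
for i in range(n):
    for j in range(n):
        if i != j:
            ok = torch.cuda.can_device_access_peer(i, j)
            print(f"GPU {i} -> GPU {j}: P2P {'available' if ok else 'unavailable'}")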