# parallel_strategy_calculator.py

Calculate an optimal parallelism configuration for your model and hardware.

This script helps you design a parallelism strategy by estimating memory requirements, communication volume, and feasibility for different configurations.
## What It Does

- Takes model specifications (parameters, layers, hidden size)
- Takes hardware specifications (GPU memory, count, interconnect)
- Calculates memory per GPU for each parallelism strategy
- Recommends an optimal configuration
## Run It

```bash
python tutorial/part2-parallelism/chapter07-pipeline-expert/scripts/parallel_strategy_calculator.py
```
## Example Usage

```text
=== Parallelism Strategy Calculator ===

Model: 70B parameters, 80 layers, hidden=8192
Hardware: 64 GPUs (8 per node), 80GB each, NVLink intra-node

Configuration options:

| Strategy   | TP | PP | DP | Memory/GPU | Comm Volume | Feasible? |
|------------|----|----|----|------------|-------------|-----------|
| Pure DP    |  1 |  1 | 64 | 840 GB     | 2D/step     | No        |
| TP=8, DP=8 |  8 |  1 |  8 | 105 GB     | 8D/layer    | No        |
| TP=8, PP=2 |  8 |  2 |  4 | 52 GB      | 8D/layer    | Yes       |
| TP=8, PP=4 |  8 |  4 |  2 | 26 GB      | 8D/layer    | Yes (rec) |

Recommended: TP=8 (within node), PP=4 (across nodes), DP=2

Reasoning:
- TP=8 uses NVLink bandwidth efficiently
- PP=4 distributes 80 layers across 4 stages (20 layers each)
- DP=2 provides batch parallelism for throughput
```
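
The Memory/GPU column follows a useful rule of thumb: mixed-precision Adam training costs about 12 bytes per parameter (2 for weights, 2 for gradients, 8 for FP32 optimizer states), divided across the TP×PP GPUs that hold each model replica. A quick sketch reproducing the column (ignoring activations and ZeRO sharding):

```python
params = 70e9  # 70B model, ~12 bytes/param in mixed-precision Adam
for tp, pp in [(1, 1), (8, 1), (8, 2), (8, 4)]:
    print(f"TP={tp} PP={pp}: {params * 12 / (tp * pp) / 1e9:.0f} GB")
# -> 840, 105, 52, 26 GB, matching the table above
```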
## Input Parameters

The calculator considers:

- Model: parameters, layers, hidden dimension, precision
- Hardware: GPU count, memory per GPU, interconnect bandwidth
- Training: batch size, sequence length, microbatch count
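
For example, a hypothetical run sizing a 405B MoE model on 128 GPUs (all flags are defined in the source below; the specific values here are illustrative):

```bash
python parallel_strategy_calculator.py --params 405 --gpus 128 --memory 80 \
    --batch-size 1024 --seq-len 8192 --moe --num-experts 64
```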
## What It Calculates

For each configuration:

- Memory per GPU: parameters + gradients + optimizer states + activations
- Communication volume: per-step all_reduce / all_gather / send-recv traffic
- Bubble fraction: for pipeline configurations (see the sketch below)
- Feasibility: does it fit in GPU memory?
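
The bubble fraction follows the standard fill/drain argument: with `p` pipeline stages and `m` microbatches per step, roughly `(p - 1) / (m + p - 1)` of the step is idle under a GPipe/1F1B-style schedule. A minimal sketch (the function name and microbatch count are illustrative, not part of the script):

```python
# Pipeline bubble: fraction of a step lost to pipeline fill/drain
def bubble_fraction(p: int, m: int) -> float:
    return (p - 1) / (m + p - 1) if p > 1 else 0.0

print(f"{bubble_fraction(4, 16):.0%}")  # 4 stages, 16 microbatches -> ~16%
```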
## Source Code

```python
#!/usr/bin/env python3
"""
Parallel Strategy Calculator

Given model specifications and hardware constraints, this script helps
you determine the optimal parallelism strategy.

It calculates:
- Memory requirements for different strategies
- Communication volumes
- Recommended configuration

Usage:
    python parallel_strategy_calculator.py
    python parallel_strategy_calculator.py --params 70 --gpus 64 --memory 80
"""

import argparse
from dataclasses import dataclass


@dataclass
class ModelConfig:
    """Model configuration."""
    params_billions: float
    hidden_size: int = 8192
    num_layers: int = 80
    num_heads: int = 64
    vocab_size: int = 128000
    intermediate_ratio: float = 4.0
    is_moe: bool = False
    num_experts: int = 1
    top_k: int = 1  # Experts activated per token


@dataclass
class HardwareConfig:
    """Hardware configuration."""
    num_gpus: int
    memory_per_gpu_gb: float
    intra_node_bandwidth_gbps: float = 900  # NVLink
    inter_node_bandwidth_gbps: float = 50   # InfiniBand
    gpus_per_node: int = 8


@dataclass
class TrainingConfig:
    """Training configuration."""
    batch_size: int = 1024
    sequence_length: int = 4096
    num_microbatches: int = 16  # Microbatches per step (for PP activation estimates)
    dtype_bytes: int = 2  # FP16/BF16


def estimate_model_memory(config: ModelConfig, dtype_bytes: int = 2) -> dict:
    """Estimate memory requirements for model parameters."""
    # Parameter count estimation:
    #   Embedding: vocab_size * hidden_size
    #   Per layer: 4 * hidden^2 (QKV + O) + 2 * hidden * intermediate (FFN)
    embedding_params = config.vocab_size * config.hidden_size
    attention_params = 4 * config.hidden_size ** 2  # Q, K, V, O projections
    ffn_params = 2 * config.hidden_size * int(config.hidden_size * config.intermediate_ratio)
    if config.is_moe:
        # MoE: multiply FFN by number of experts
        ffn_params *= config.num_experts
    layer_params = attention_params + ffn_params
    total_params = embedding_params + (layer_params * config.num_layers)

    # Convert to bytes
    params_bytes = total_params * dtype_bytes
    gradients_bytes = params_bytes  # Same size as params

    # Optimizer states (Adam: momentum + variance, both in FP32)
    optimizer_bytes = total_params * 4 * 2

    return {
        'params': params_bytes / 1e9,  # GB
        'gradients': gradients_bytes / 1e9,
        'optimizer': optimizer_bytes / 1e9,
        'total': (params_bytes + gradients_bytes + optimizer_bytes) / 1e9,
        'param_count': total_params,
    }
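
# Sanity check (illustrative): with the defaults (hidden=8192, 80 layers,
# 4x FFN) this yields ~0.8B params per layer and ~65B total, near the 70B
# nominal figure; at ~12 bytes/param (2 weights + 2 grads + 8 Adam states)
# that is on the order of 800 GB before any sharding.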


def estimate_activation_memory(
    config: ModelConfig,
    training: TrainingConfig,
    tp_degree: int = 1,
    pp_degree: int = 1,
    dp_degree: int = 1,
) -> float:
    """Estimate activation memory per GPU in GB (simplified)."""
    # Each DP replica sees batch_size / dp samples, split into microbatches
    microbatch = max(1, training.batch_size // (dp_degree * training.num_microbatches))
    seq_len = training.sequence_length
    hidden = config.hidden_size

    # Per-layer activations (simplified): input to attention, attention
    # output, FFN intermediate, etc. TP shards most of these tensors.
    # Assumes roughly one microbatch of activations live per stage.
    activations_per_layer = 10 * microbatch * seq_len * hidden // tp_degree
    layers_per_stage = config.num_layers // pp_degree
    total_activation_bytes = activations_per_layer * layers_per_stage * training.dtype_bytes
    return total_activation_bytes / 1e9
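
# Rough scale (illustrative, using the defaults above): batch 512, DP=2,
# 16 microbatches -> microbatch of 16; with TP=8, PP=4 (20 layers/stage):
# 10 * 16 * 4096 * 8192 / 8 * 20 * 2 bytes ≈ 27 GB of activations per GPU,
# which is often the argument for activation checkpointing in practice.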


def calculate_communication_volume(
    config: ModelConfig,
    training: TrainingConfig,
    tp_degree: int,
    dp_degree: int,
    pp_degree: int,
) -> dict:
    """Calculate per-step communication volume for each parallelism type."""
    batch = training.batch_size
    seq = training.sequence_length
    hidden = config.hidden_size
    dtype = training.dtype_bytes

    # TP communication: all_reduces within each transformer layer
    # (attention + FFN, forward and backward), ring cost ~(t-1)/t per pass
    tp_volume_per_layer = 4 * batch * seq * hidden * dtype * (tp_degree - 1) / tp_degree
    tp_volume_total = tp_volume_per_layer * config.num_layers / pp_degree

    # PP communication: activations between stages (forward and backward)
    pp_volume = 2 * batch * seq * hidden * dtype

    # DP communication: gradient all_reduce over each GPU's parameter shard
    # (TP and PP both divide the parameters held per GPU)
    params_per_gpu = config.params_billions * 1e9 / (tp_degree * pp_degree)
    dp_volume = 2 * params_per_gpu * dtype * (dp_degree - 1) / dp_degree

    return {
        'tp_per_step_gb': tp_volume_total / 1e9,
        'pp_per_step_gb': pp_volume / 1e9,
        'dp_per_step_gb': dp_volume / 1e9,
        'total_gb': (tp_volume_total + pp_volume + dp_volume) / 1e9,
    }
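
# Example (illustrative): for a 70B model at TP=8, PP=4, each GPU holds
# ~2.2B parameters; with DP=2 and 2-byte gradients the all_reduce moves
# about 2 * 2.19e9 * 2 * (1/2) bytes ≈ 4.4 GB per step -- the incentive
# for overlapping gradient communication with the backward pass.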


def find_optimal_strategy(
    model: ModelConfig,
    hardware: HardwareConfig,
    training: TrainingConfig,
) -> list:
    """Enumerate parallelism strategies and report feasibility for each."""
    mem = estimate_model_memory(model, training.dtype_bytes)
    results = []

    # Try different configurations
    for tp in [1, 2, 4, 8]:
        if tp > hardware.gpus_per_node:
            continue
        for pp in [1, 2, 4, 8, 16]:
            if tp * pp > hardware.num_gpus:
                continue
            dp = hardware.num_gpus // (tp * pp)
            if dp < 1:
                continue

            # Memory per GPU: TP and PP both shard parameters and gradients
            params_per_gpu = mem['params'] / (tp * pp)
            grads_per_gpu = mem['gradients'] / (tp * pp)
            optimizer_per_gpu = mem['optimizer'] / (tp * pp)
            # ZeRO-1 additionally shards optimizer states across DP ranks
            optimizer_per_gpu = optimizer_per_gpu / dp

            activation_mem = estimate_activation_memory(model, training, tp, pp, dp)
            total_mem = params_per_gpu + grads_per_gpu + optimizer_per_gpu + activation_mem

            # Communication
            comm = calculate_communication_volume(model, training, tp, dp, pp)

            # Does TP stay within a node? (always true given the filter above)
            tp_is_intra_node = tp <= hardware.gpus_per_node

            results.append({
                'tp': tp,
                'pp': pp,
                'dp': dp,
                'memory_per_gpu': total_mem,
                'fits': total_mem < hardware.memory_per_gpu_gb * 0.9,  # 90% threshold
                'communication': comm,
                'tp_intra_node': tp_is_intra_node,
            })

    return results


def print_results(results: list, hardware: HardwareConfig) -> None:
    """Print strategy comparison."""
    print("\n" + "=" * 80)
    print(" STRATEGY COMPARISON")
    print("=" * 80)

    # Header
    print(f"\n{'TP':>4} {'PP':>4} {'DP':>4} {'Mem/GPU':>10} {'Fits?':>8} "
          f"{'TP Comm':>10} {'PP Comm':>10} {'DP Comm':>10}")
    print("-" * 80)

    valid_configs = []
    for r in results:
        fits = "✓" if r['fits'] else "✗"
        tp_note = "" if r['tp_intra_node'] else "*"
        print(f"{r['tp']:>4}{tp_note:<1} {r['pp']:>3} {r['dp']:>4} "
              f"{r['memory_per_gpu']:>9.1f}GB {fits:>8} "
              f"{r['communication']['tp_per_step_gb']:>9.2f}GB "
              f"{r['communication']['pp_per_step_gb']:>9.2f}GB "
              f"{r['communication']['dp_per_step_gb']:>9.2f}GB")
        if r['fits']:
            valid_configs.append(r)

    print("\n* TP crosses node boundary (slower inter-node communication)")

    # Recommendation
    print("\n" + "=" * 80)
    print(" RECOMMENDATION")
    print("=" * 80)

    if not valid_configs:
        print("\n⚠ No configuration fits in memory!")
        print("  Consider: more GPUs, larger GPU memory, or a smaller batch size")
        return

    # Prefer intra-node TP, then the lowest total communication volume
    valid_configs.sort(key=lambda x: (
        0 if x['tp_intra_node'] else 1,
        x['communication']['total_gb'],
    ))
    best = valid_configs[0]

    print(f"""
Recommended configuration:
  Tensor Parallelism (TP):   {best['tp']}
  Pipeline Parallelism (PP): {best['pp']}
  Data Parallelism (DP):     {best['dp']}

  Memory per GPU: {best['memory_per_gpu']:.1f} GB (limit: {hardware.memory_per_gpu_gb} GB)
  Total communication: {best['communication']['total_gb']:.2f} GB per step

Reasoning:
  - TP={best['tp']} {"stays within a node (NVLink speed)" if best['tp_intra_node'] else "crosses nodes (slower)"}
  - PP={best['pp']} splits the model into {best['pp']} stages
  - DP={best['dp']} {"provides batch parallelism for throughput" if best['dp'] > 1 else "runs a single replica"}
""")


def main():
    parser = argparse.ArgumentParser(description="Parallel Strategy Calculator")
    parser.add_argument("--params", "-p", type=float, default=70,
                        help="Model parameters in billions (default: 70)")
    parser.add_argument("--gpus", "-g", type=int, default=64,
                        help="Total number of GPUs (default: 64)")
    parser.add_argument("--memory", "-m", type=float, default=80,
                        help="GPU memory in GB (default: 80)")
    parser.add_argument("--batch-size", "-b", type=int, default=512,
                        help="Global batch size (default: 512)")
    parser.add_argument("--seq-len", "-s", type=int, default=4096,
                        help="Sequence length (default: 4096)")
    parser.add_argument("--moe", action="store_true",
                        help="Model is Mixture of Experts")
    parser.add_argument("--num-experts", type=int, default=64,
                        help="Number of experts for MoE (default: 64)")
    args = parser.parse_args()

    print("╔" + "═" * 78 + "╗")
    print("║" + " PARALLEL STRATEGY CALCULATOR".center(78) + "║")
    print("╚" + "═" * 78 + "╝")

    # Configure model, hardware, and training
    model = ModelConfig(
        params_billions=args.params,
        is_moe=args.moe,
        num_experts=args.num_experts if args.moe else 1,
    )
    hardware = HardwareConfig(
        num_gpus=args.gpus,
        memory_per_gpu_gb=args.memory,
    )
    training = TrainingConfig(
        batch_size=args.batch_size,
        sequence_length=args.seq_len,
    )

    # Print configuration
    print(f"\n{'─' * 40}")
    print(" MODEL CONFIGURATION")
    print(f"{'─' * 40}")
    mem = estimate_model_memory(model, training.dtype_bytes)
    print(f"Parameters: {args.params}B ({mem['param_count'] / 1e9:.1f}B actual)")
    print(f"Parameters memory: {mem['params']:.1f} GB")
    print(f"Gradients memory: {mem['gradients']:.1f} GB")
    print(f"Optimizer memory: {mem['optimizer']:.1f} GB")
    print(f"Total model memory: {mem['total']:.1f} GB")
    if args.moe:
        print(f"MoE: {args.num_experts} experts")

    print(f"\n{'─' * 40}")
    print(" HARDWARE CONFIGURATION")
    print(f"{'─' * 40}")
    print(f"GPUs: {args.gpus} ({args.gpus // hardware.gpus_per_node} nodes)")
    print(f"Memory per GPU: {args.memory} GB")

    print(f"\n{'─' * 40}")
    print(" TRAINING CONFIGURATION")
    print(f"{'─' * 40}")
    print(f"Batch size: {args.batch_size}")
    print(f"Sequence length: {args.seq_len}")

    # Calculate strategies
    results = find_optimal_strategy(model, hardware, training)
    print_results(results, hardware)

    # Additional MoE considerations
    if args.moe:
        print("\n" + "=" * 80)
        print(" MOE-SPECIFIC CONSIDERATIONS")
        print("=" * 80)
        print(f"""
For MoE models, also consider Expert Parallelism (EP):

With {args.num_experts} experts and 8-way EP:
  - {args.num_experts // 8} experts per GPU
  - Communication: 2 all-to-all per layer

EP vs TP for MoE:
  - EP: keeps full expert matrices → better GEMM efficiency
  - TP: slices experts → smaller GEMMs, worse efficiency
  - EP is preferred when num_experts >> TP degree

Recommendation: use EP instead of, or in addition to, TP for the FFN experts.
""")


if __name__ == "__main__":
    main()
```