Étape 1 : diagnostiquer le goulot d'étranglement
# Watch GPU utilization, re-querying every second (-l 1); it should be 90%+
# while training is running.
nvidia-smi -l 1
# If GPU utilization stays low, the GPU is waiting for input data:
# suspect a CPU/DataLoader bottleneck.
Corriger le goulot du DataLoader
# Build a DataLoader tuned to keep the GPU supplied with batches.
loader = DataLoader(
    dataset,
    batch_size=64,
    # Key: load batches in parallel worker processes, capped at 8.
    # os.cpu_count() may return None when the CPU count cannot be determined,
    # which would make min() raise TypeError. Fall back to 1 (not 0) because
    # persistent_workers and prefetch_factor both require num_workers > 0.
    num_workers=min(os.cpu_count() or 1, 8),
    # Key: page-locked host memory makes host-to-GPU copies faster.
    pin_memory=True,
    # Key: keep worker processes alive between epochs instead of re-creating
    # them every time the loader is iterated.
    persistent_workers=True,
    # Number of batches each worker loads ahead of time.
    prefetch_factor=2,
)
Gestion de la mémoire CUDA
# Clear cache between experiments.
# NOTE(review): per the PyTorch docs this only releases cached blocks that are
# currently unused; memory held by live tensors is not freed, so drop those
# references first.
torch.cuda.empty_cache()
# Monitor GPU memory usage. Both values are divided by 1e9 and printed with
# one decimal, i.e. reported in decimal gigabytes.
# Allocated: value of torch.cuda.memory_allocated().
print(f'Allocated: {torch.cuda.memory_allocated()/1e9:.1f}GB')
# Reserved: value of torch.cuda.memory_reserved(); per the PyTorch docs this is
# the memory managed by the caching allocator.
print(f'Reserved: {torch.cuda.memory_reserved()/1e9:.1f}GB')
Accumulation de gradient (pour de grands batches sur VRAM limitée)
# Gradient accumulation: run `accumulation_steps` micro-batches per optimizer
# step, giving an effective batch size of batch_size * accumulation_steps
# without holding a large batch in VRAM.
#
# Expects `model`, `criterion` (the loss function, e.g. nn.CrossEntropyLoss()),
# `optimizer`, `loader` and `accumulation_steps` to be defined by the
# surrounding training script.
pending = 0  # micro-batches back-propagated since the last optimizer step
for x, y in loader:
    # Compute the loss against the targets. Calling backward() on the raw
    # model output would ignore `y` and fail for non-scalar outputs.
    # Dividing by accumulation_steps makes the summed gradients match the
    # average over one large batch.
    loss = criterion(model(x), y) / accumulation_steps
    loss.backward()  # gradients add up in .grad across micro-batches
    pending += 1
    if pending == accumulation_steps:
        optimizer.step()
        optimizer.zero_grad()
        pending = 0

# Apply the gradients of a trailing partial group. Without this, the last
# batches are silently dropped whenever the number of batches is not a
# multiple of accumulation_steps. That final update is still scaled by
# 1 / accumulation_steps, so it is proportionally smaller.
if pending:
    optimizer.step()
    optimizer.zero_grad()