الخطوة 1: تشخيص الاختناق
# GPU utilization — should be 90%+
# (-l 1 keeps nvidia-smi looping, refreshing the report every second)
nvidia-smi -l 1
# If GPU utilization is low → the bottleneck is likely on the CPU/DataLoader side
إصلاح اختناق DataLoader
# Build the DataLoader with the settings that matter for keeping the GPU fed.
loader = DataLoader(
    dataset,
    batch_size=64,
    # key: load batches in parallel worker processes, capped at 8.
    # os.cpu_count() may return None when the CPU count cannot be
    # determined, so fall back to 1 instead of letting min() raise TypeError.
    # Staying >= 1 also satisfies persistent_workers/prefetch_factor, which
    # are only valid with num_workers > 0.
    num_workers=min(os.cpu_count() or 1, 8),
    pin_memory=True,  # key: page-locked host memory for faster host-to-GPU copies
    persistent_workers=True,  # key: keep worker processes alive between epochs
    prefetch_factor=2,  # batches loaded in advance by each worker
)
إدارة ذاكرة CUDA
# Clear PyTorch's CUDA cache between experiments.
torch.cuda.empty_cache()

# Monitor: report allocated and reserved CUDA memory in GB (bytes / 1e9).
allocated_gb = torch.cuda.memory_allocated() / 1e9
print(f'Allocated: {allocated_gb:.1f}GB')
reserved_gb = torch.cuda.memory_reserved() / 1e9
print(f'Reserved: {reserved_gb:.1f}GB')
تراكم التدرج (للدفعات الكبيرة على VRAM محدودة)
# Gradient accumulation: apply one optimizer update per `accumulation_steps`
# mini-batches, so a large effective batch fits in limited VRAM.
# NOTE(review): `y` is unused and model(x) is used directly as the loss —
# presumably the model returns a scalar loss; confirm, or compute the loss
# from model(x) and y before scaling.
pending = 0  # mini-batches whose gradients have not been applied yet
for i, (x, y) in enumerate(loader):
    # Scale each mini-batch loss so the accumulated gradient is an average
    # over the window rather than a sum.
    loss = model(x) / accumulation_steps
    loss.backward()
    pending += 1
    if pending == accumulation_steps:
        optimizer.step()
        optimizer.zero_grad()
        pending = 0

# Flush a trailing partial window. Without this, when the number of batches
# is not a multiple of accumulation_steps, the last gradients are never
# applied and stay accumulated in .grad after the loop.
# The partial window is still scaled by 1/accumulation_steps, so this final
# update is proportionally smaller than a full one.
if pending:
    optimizer.step()
    optimizer.zero_grad()