AI/ML Troubleshooting: Complete Diagnostic and Solution Guide

AI/ML Troubleshooting Guide

Complete diagnostic and solution guide for common AI and machine learning issues across Python, TensorFlow, PyTorch, and CUDA environments.

Python Installation Issues

Python Not Found or Version Conflicts

# Check Python installation
which python
which python3
python --version
python3 --version

# Check all Python installations
ls -la /usr/bin/python*

# Create symlink to correct version
sudo ln -s /usr/bin/python3.11 /usr/bin/python

# Add to PATH (entries must be directories, so add the directory that contains python3.11)
export PATH="/usr/local/bin:$PATH"
echo 'export PATH="/usr/local/bin:$PATH"' >> ~/.bashrc
source ~/.bashrc

# Windows - check PATH
where python
echo %PATH%

pip Installation Failures

# Reinstall pip
python -m ensurepip --upgrade
python -m pip install --upgrade pip

# Use --user flag for permission issues
pip install --user package_name

# Work around SSL certificate errors (temporary only: skips certificate verification for these hosts)
pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org package_name

# Clear pip cache
pip cache purge

# Install specific version
pip install package_name==specific_version

# Install with verbose output for debugging
pip install -vvv package_name

CUDA and GPU Issues

GPU Not Detected

# Check NVIDIA driver
nvidia-smi

# If command not found, install driver
sudo apt update
sudo apt install nvidia-driver-535

# Reboot after driver installation
sudo reboot

# Check CUDA installation
nvcc --version
which nvcc

# Verify CUDA path
echo $PATH
echo $LD_LIBRARY_PATH

# Add CUDA to PATH
export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Check GPU with Python
python -c "import torch; print(torch.cuda.is_available())"
python -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"

CUDA Out of Memory Errors

# TensorFlow memory growth
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# Set memory limit
tf.config.set_logical_device_configuration(
    gpus[0],
    [tf.config.LogicalDeviceConfiguration(memory_limit=4096)]
)

# PyTorch clear cache
import torch
torch.cuda.empty_cache()

# Reduce batch size
batch_size = 16  # Instead of 32 or 64

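# Enable gradient checkpointing (recomputes activations during backward to save memory)
from torch.utils.checkpoint import checkpoint
# Usage: output = checkpoint(model_segment, input, use_reentrant=False)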

# Use mixed precision
from torch.cuda.amp import autocast
with autocast():
    output = model(input)

# Monitor GPU memory
nvidia-smi -l 1  # Update every second
watch -n 1 nvidia-smi

CUDA Version Mismatch

# Check CUDA version
nvcc --version
nvidia-smi  # Check driver-supported CUDA version

# Check PyTorch CUDA version
python -c "import torch; print(torch.version.cuda)"

# Check TensorFlow CUDA version
python -c "import tensorflow as tf; print(tf.sysconfig.get_build_info()['cuda_version'])"

# Install matching versions
# For CUDA 12.1
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# For CUDA 11.8
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Reinstall TensorFlow with correct CUDA
pip uninstall tensorflow
pip install tensorflow[and-cuda]

Training Issues

Model Not Learning (Loss Not Decreasing)

# Check learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Try different learning rates
for lr in [0.1, 0.01, 0.001, 0.0001]:
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Train and evaluate

# Check data normalization
# Ensure data is properly normalized
mean = data.mean()
std = data.std()
normalized_data = (data - mean) / std

# Check labels
print(labels.unique())  # Ensure labels are correct
print(labels.min(), labels.max())

# Verify loss function
# For classification
criterion = nn.CrossEntropyLoss()
# For regression
criterion = nn.MSELoss()

# Check gradient flow
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f'{name}: grad={param.grad is not None}')

# Use learning rate scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5
)

Overfitting Issues

# Add dropout
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Dropout(0.5),  # Add dropout
    nn.Linear(256, 10)
)

# Add L2 regularization
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Use data augmentation
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor()
])

# Early stopping
best_val_loss = float('inf')
patience = 5
counter = 0

for epoch in range(epochs):
    val_loss = validate()
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping")
            break

# Reduce model complexity
# Use fewer layers or smaller hidden sizes

NaN or Inf in Loss

# Check for NaN/Inf
if torch.isnan(loss) or torch.isinf(loss):
    print("NaN or Inf detected!")

# Gradient clipping
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

# Lower learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Check input data
print(f"Data min: {data.min()}, max: {data.max()}")
print(f"Data has NaN: {torch.isnan(data).any()}")
print(f"Data has Inf: {torch.isinf(data).any()}")

# Replace NaN/Inf
data = torch.nan_to_num(data, nan=0.0, posinf=1.0, neginf=-1.0)

# Use stable loss functions
# Instead of log(x), use log(x + epsilon)
epsilon = 1e-8
loss = -torch.log(predictions + epsilon)

Data Loading Issues

Slow Data Loading

# Increase num_workers
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,  # Increase for faster loading
    pin_memory=True  # For GPU training
)

# Prefetch data
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
    prefetch_factor=2
)

# Use persistent workers
train_loader = DataLoader(
    dataset,
    batch_size=32,
    num_workers=4,
    persistent_workers=True
)

Memory Errors with Large Datasets

# Use generator/iterator
def data_generator():
    for item in large_dataset:
        yield preprocess(item)

# Process in chunks
chunk_size = 1000
for i in range(0, len(dataset), chunk_size):
    chunk = dataset[i:i+chunk_size]
    process(chunk)

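# Memory-mapped files
import numpy as np
# Raw binary file (no header): dtype and shape must be given explicitly
data = np.memmap('data.dat', dtype='float32', mode='r', shape=(100000, 784))
# .npy file: use np.load so the file header is parsed instead of being read as data
data = np.load('data.npy', mmap_mode='r')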

# Clear unused variables
import gc
del large_variable
gc.collect()
torch.cuda.empty_cache()

Model Loading/Saving Issues

Model Load Errors

# Check model architecture matches
model = YourModel()  # Must match saved model
model.load_state_dict(torch.load('model.pth'))

# Load with map_location for CPU/GPU
model.load_state_dict(torch.load('model.pth', map_location='cpu'))
model.load_state_dict(torch.load('model.pth', map_location='cuda:0'))

# Load with weights_only for security
model.load_state_dict(torch.load('model.pth', weights_only=True))

# Handle missing keys
state_dict = torch.load('model.pth')
model.load_state_dict(state_dict, strict=False)

# Check saved keys
checkpoint = torch.load('model.pth')
print(checkpoint.keys())

Package Dependency Conflicts

Resolve Version Conflicts

# Check installed versions
pip list | grep torch
pip list | grep tensorflow
pip list | grep numpy

# Check package dependencies
pip show package_name

# Create clean environment
python -m venv clean_env
source clean_env/bin/activate
pip install -r requirements.txt

# Use specific versions
pip install numpy==1.24.0 pandas==2.0.0

# Uninstall conflicting packages
pip uninstall package_name -y
pip install package_name

# Install from requirements with constraints
pip install -r requirements.txt --constraint constraints.txt

Performance Optimization

Slow Training Speed

# Use GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Increase batch size
batch_size = 64  # or 128

# Use DataLoader num_workers
train_loader = DataLoader(dataset, batch_size=32, num_workers=4)

# Enable cuDNN benchmark
torch.backends.cudnn.benchmark = True

# Use mixed precision
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()

# Profile code
import torch.profiler
with torch.profiler.profile() as prof:
    model(input_data)
print(prof.key_averages().table(sort_by="cuda_time_total"))

# Use torch.compile (PyTorch 2.0+)
compiled_model = torch.compile(model)

Environment Issues

Import Errors

# Check Python path
import sys
print(sys.path)

# Add custom path
sys.path.append('/path/to/custom/modules')

# Set PYTHONPATH
export PYTHONPATH="${PYTHONPATH}:/path/to/modules"

# Reinstall package
pip uninstall package_name
pip install package_name --no-cache-dir

# Check module location
import package_name
print(package_name.__file__)

# Install in development mode
pip install -e .

Debugging Tools

Debug Commands

# Enable warnings
import warnings
warnings.filterwarnings('default')

# Verbose TensorFlow
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'

# PyTorch debugging
torch.autograd.set_detect_anomaly(True)

# Check gradients
for name, param in model.named_parameters():
    print(f'{name}: {param.grad}')

# Print tensor statistics
print(f'Mean: {tensor.mean()}, Std: {tensor.std()}')
print(f'Min: {tensor.min()}, Max: {tensor.max()}')

# Use pdb debugger
import pdb
pdb.set_trace()

# TensorBoard for visualization
tensorboard --logdir=runs

Conclusion

This comprehensive troubleshooting guide covers common AI/ML issues and solutions. VCCLHOSTING provides expert support and pre-configured GPU servers to minimize these issues.