DINOv3 HuggingFace Tutorial

Complete step-by-step guide to using DINOv3 with HuggingFace Transformers. From installation to production deployment.

🚀 Quick Start Guide 💻 Ready-to-run Code 🔧 Troubleshooting Tips 📈 Performance Optimization

📋 Tutorial Contents

1

Installation & Setup

Prerequisites

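Before getting started, ensure you have a working Python environment (the conda example below uses Python 3.9), `pip` or `conda` for installing packages, and an internet connection for downloading the model weights. A CUDA-capable GPU is optional, but the mixed-precision and inference-optimization sections assume one.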

Installation Methods

📦 Method 1: pip (Recommended)

# Install transformers and dependencies
pip install transformers torch torchvision
pip install pillow requests

# Optional: Install with specific PyTorch version
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118

🐍 Method 2: conda

# Create new environment
conda create -n dinov3 python=3.9
conda activate dinov3

# Install packages
conda install pytorch torchvision -c pytorch
pip install transformers pillow
✅ Verification: Run the following to verify your installation:
python -c "import transformers; print(transformers.__version__)"
2

Quick Start Example

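Let's start with a simple example that gets a feature extractor running in just a few lines of code. Note that the snippets in this tutorial load the DINOv2 classes and checkpoints from Transformers (`Dinov2Model`, `Dinov2ImageProcessor`, `facebook/dinov2-large`); to run a DINOv3 checkpoint, substitute the corresponding model class and checkpoint name.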

# Quick Start: DINOv3 Feature Extraction
from transformers import Dinov2Model, Dinov2ImageProcessor
from PIL import Image
import torch
import requests

# Load model and processor (both from the same checkpoint)
checkpoint = "facebook/dinov2-large"
processor = Dinov2ImageProcessor.from_pretrained(checkpoint)
model = Dinov2Model.from_pretrained(checkpoint)
model.eval()  # inference only

# Load a sample image
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png"
# The timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors directly instead of letting PIL fail
# later on a non-image response body.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw).convert("RGB")

# Process image and extract features
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Get feature embeddings
features = outputs.pooler_output  # Shape: [1, 1024]
print(f"Feature shape: {features.shape}")
print(f"Feature norm: {torch.norm(features):.4f}")
Feature shape: torch.Size([1, 1024])
Feature norm: 32.1847
⚠️ Note: The first run downloads the model weights (~1.1GB), which may take a few minutes depending on your internet connection.
3

Image Classification with DINOv3

While DINOv3 is primarily a feature extractor, you can easily build a classifier on top of it:

# Image Classification Example
import torch
import torch.nn as nn
from transformers import Dinov2Model, Dinov2ImageProcessor
from PIL import Image
import requests

class DINOv3Classifier(nn.Module):
    """Pretrained backbone followed by a single linear classification layer.

    Args:
        num_classes: Number of output classes of the linear head.
        model_name: Checkpoint passed to ``Dinov2Model.from_pretrained``.
    """

    def __init__(self, num_classes, model_name="facebook/dinov2-large"):
        super().__init__()
        backbone = Dinov2Model.from_pretrained(model_name)
        embed_dim = backbone.config.hidden_size
        self.backbone = backbone
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, pixel_values):
        """Return the classifier logits for a batch of ``pixel_values``."""
        pooled = self.backbone(pixel_values=pixel_values).pooler_output
        return self.classifier(pooled)

# Initialize model for 1000 classes (ImageNet)
# NOTE: only the backbone is pretrained. The linear head is freshly created by
# nn.Linear, so the predictions printed below are not meaningful until the head
# has been trained.
model = DINOv3Classifier(num_classes=1000)
model.eval()
processor = Dinov2ImageProcessor.from_pretrained("facebook/dinov2-large")

# Load and process image
url = "https://upload.wikimedia.org/wikipedia/commons/3/30/Vulpes_vulpes_ssp_fulvus.jpg"
# Fail fast on network problems instead of handing PIL an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw).convert("RGB")
inputs = processor(images=image, return_tensors="pt")

# Forward pass
with torch.no_grad():
    logits = model(inputs['pixel_values'])
    predictions = torch.nn.functional.softmax(logits, dim=-1)

# Get top 5 predictions (row 0 is the single image in the batch)
top5_prob, top5_catid = torch.topk(predictions, 5)
for prob, catid in zip(top5_prob[0], top5_catid[0]):
    print(f"Class {catid}: {prob*100:.2f}%")
4

Advanced Feature Extraction

DINOv3 excels at extracting rich visual features. Here's how to get different types of features:

# Advanced Feature Extraction
from transformers import Dinov2Model, Dinov2ImageProcessor
import torch
import numpy as np

def extract_features(images, model, processor):
    """Extract several feature views from one forward pass.

    Args:
        images: An image or list of images accepted by ``processor``.
        model: Backbone, called as ``model(**inputs, output_hidden_states=True)``.
        processor: Called as ``processor(images=..., return_tensors="pt")``;
            must return a mapping of tensors.

    Returns:
        dict with four entries:
            ``'cls_token'``: ``outputs.pooler_output``.
            ``'patch_embeddings'``: ``outputs.last_hidden_state`` (the full
            token sequence).
            ``'mean_pooled'``: mean of ``last_hidden_state`` over the token axis.
            ``'all_layers'``: ``outputs.hidden_states``.
    """

    # Process images
    inputs = processor(images=images, return_tensors="pt")

    # Run on whatever device the model lives on. Without this, the call fails
    # as soon as the model has been moved to a GPU. Objects that expose no
    # ``device`` attribute are used as-is.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    last_hidden = outputs.last_hidden_state

    # NOTE(review): the sample output in this tutorial shows 257 tokens, so
    # 'patch_embeddings' and 'mean_pooled' presumably include the [CLS] token
    # at index 0 — slice [:, 1:] if only patch tokens are wanted.
    return {
        'cls_token': outputs.pooler_output,
        'patch_embeddings': last_hidden,
        'mean_pooled': last_hidden.mean(dim=1),
        'all_layers': outputs.hidden_states,
    }

# Load model
model = Dinov2Model.from_pretrained("facebook/dinov2-large")
processor = Dinov2ImageProcessor.from_pretrained("facebook/dinov2-large")

# Extract features
features = extract_features(image, model, processor)

# Print feature shapes
for key, value in features.items():
    if key == 'all_layers':
        print(f"{key}: {len(value)} layers, each {value[0].shape}")
    else:
        print(f"{key}: {value.shape}")
cls_token: torch.Size([1, 1024])
patch_embeddings: torch.Size([1, 257, 1024])
mean_pooled: torch.Size([1, 1024])
all_layers: 25 layers, each torch.Size([1, 257, 1024])

Feature Similarity Comparison

# Compare similarity between two images
def compute_similarity(image1, image2, model, processor):
    """Return the cosine similarity of the two images' 'cls_token' features.

    Each image is passed through ``extract_features`` separately; the result
    is converted to a Python float with ``.item()``.
    """
    cls_pair = [
        extract_features(img, model, processor)['cls_token']
        for img in (image1, image2)
    ]
    score = torch.nn.functional.cosine_similarity(cls_pair[0], cls_pair[1])
    return score.item()

# Example usage
similarity_score = compute_similarity(image1, image2, model, processor)
print(f"Similarity score: {similarity_score:.4f}")
5

Efficient Batch Processing

For production use, you'll want to process multiple images efficiently:

# Batch Processing Example
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import Dinov2Model, Dinov2ImageProcessor
from PIL import Image
import os

class ImageDataset(Dataset):
    """Dataset that opens and preprocesses one image file per item.

    Args:
        image_paths: Sequence of image file paths.
        processor: Called as ``processor(images=..., return_tensors="pt")``;
            its result must provide a ``'pixel_values'`` tensor.
    """

    def __init__(self, image_paths, processor):
        self.processor = processor
        self.image_paths = image_paths

    def __len__(self):
        """Number of image paths held by the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``{'pixel_values': tensor, 'path': str}`` for item ``idx``."""
        path = self.image_paths[idx]
        rgb_image = Image.open(path).convert('RGB')
        encoded = self.processor(images=rgb_image, return_tensors="pt")
        # squeeze(0) drops the leading axis when it has size 1, so that the
        # DataLoader can stack items into a batch itself.
        pixel_values = encoded['pixel_values'].squeeze(0)
        return {'pixel_values': pixel_values, 'path': path}

def process_images_batch(image_dir, batch_size=8, model_name="facebook/dinov2-large"):
    """Extract pooled features for every image in a directory, in batches.

    Args:
        image_dir: Directory scanned (non-recursively) for files ending in
            ``.jpg``, ``.jpeg`` or ``.png`` (case-insensitive).
        batch_size: Number of images per forward pass.
        model_name: Checkpoint used for both the processor and the model.

    Returns:
        ``(features, paths)``: ``features`` is the concatenation of each
        batch's ``pooler_output`` along dim 0, and ``paths`` lists the image
        paths in the same (sorted) order. When no images are found, an empty
        ``(0, hidden_size)`` tensor and an empty list are returned.
    """

    # Setup
    processor = Dinov2ImageProcessor.from_pretrained(model_name)
    model = Dinov2Model.from_pretrained(model_name)
    model.eval()

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # feature/path order reproducible across runs and machines.
    image_paths = sorted(
        os.path.join(image_dir, f) for f in os.listdir(image_dir)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )

    if not image_paths:
        # torch.cat() raises on an empty list, so return an empty result.
        return torch.empty((0, model.config.hidden_size)), []

    # Create dataset and dataloader
    dataset = ImageDataset(image_paths, processor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    all_features = []
    all_paths = []

    with torch.no_grad():
        for batch in dataloader:
            paths = batch['path']

            # Forward pass
            outputs = model(pixel_values=batch['pixel_values'])

            all_features.append(outputs.pooler_output)
            all_paths.extend(paths)

            print(f"Processed batch: {len(paths)} images")

    # Combine all features
    return torch.cat(all_features, dim=0), all_paths

# Usage example
# features, paths = process_images_batch("path/to/your/images")
6

Fine-tuning DINOv3

Fine-tune DINOv3 on your custom dataset for domain-specific tasks:

# Fine-tuning Setup
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import Dinov2Model, Dinov2ImageProcessor, AdamW
from transformers import get_linear_schedule_with_warmup

class DINOv3ForClassification(nn.Module):
    """Pretrained backbone with a small MLP classification head.

    Args:
        num_classes: Number of output classes.
        freeze_backbone: When True, gradients are disabled for every backbone
            parameter so that only the head is trained.
        model_name: Checkpoint passed to ``Dinov2Model.from_pretrained``.
            Defaults to the checkpoint that was previously hard-coded.
    """

    def __init__(self, num_classes, freeze_backbone=False,
                 model_name="facebook/dinov2-large"):
        super().__init__()
        self.backbone = Dinov2Model.from_pretrained(model_name)

        # Optionally freeze backbone
        if freeze_backbone:
            self.backbone.requires_grad_(False)

        # Classification head
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(self.backbone.config.hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, num_classes)
        )

    def forward(self, pixel_values, labels=None):
        """Run the backbone and head.

        Returns:
            dict with ``'logits'`` and ``'loss'``; the loss is the
            cross-entropy between logits and ``labels``, or ``None`` when no
            labels are given.
        """
        outputs = self.backbone(pixel_values=pixel_values)
        logits = self.classifier(outputs.pooler_output)

        loss = None
        if labels is not None:
            # Functional form avoids building a new loss module on each call.
            loss = nn.functional.cross_entropy(logits, labels)

        return {'loss': loss, 'logits': logits}

def train_model(model, train_dataloader, val_dataloader, num_epochs=3):
    """Fine-tune ``model`` and report validation metrics after every epoch.

    Args:
        model: Module called as ``model(pixel_values=..., labels=...)`` that
            returns a dict with ``'loss'`` and ``'logits'``.
        train_dataloader: Yields dicts with ``'pixel_values'`` and ``'labels'``.
        val_dataloader: Same batch format; pass ``None`` to skip validation.
        num_epochs: Number of passes over ``train_dataloader``.

    Raises:
        ValueError: If ``train_dataloader`` yields no batches.
    """

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    num_batches = len(train_dataloader)
    if num_batches == 0:
        # Previously this surfaced as a ZeroDivisionError when averaging.
        raise ValueError("train_dataloader is empty")

    # torch.optim.AdamW replaces the deprecated transformers.AdamW. Only the
    # parameters that require gradients are optimized, so a frozen backbone is
    # skipped entirely.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=5e-5, weight_decay=0.01)
    total_steps = num_batches * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    for epoch in range(num_epochs):
        # Re-enter training mode each epoch: validation switches to eval mode.
        model.train()
        total_loss = 0.0

        for batch_idx, batch in enumerate(train_dataloader):
            pixel_values = batch['pixel_values'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            # Forward pass
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs['loss']

            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(trainable_params, 1.0)

            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

            if batch_idx % 100 == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.4f}")

        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch} completed. Average loss: {avg_loss:.4f}")

        if val_dataloader is not None:
            val_loss, val_acc = _evaluate(model, val_dataloader, device)
            print(f"Epoch {epoch} validation. Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}")


def _evaluate(model, dataloader, device):
    """Return ``(mean loss, accuracy)`` of ``model`` over ``dataloader``."""
    model.eval()
    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.no_grad():
        for batch in dataloader:
            pixel_values = batch['pixel_values'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(pixel_values=pixel_values, labels=labels)

            batch_size = labels.size(0)
            # Weight by batch size so a short final batch does not skew the
            # mean (assumes the model's loss is a per-batch mean).
            loss_sum += outputs['loss'].item() * batch_size
            correct += (outputs['logits'].argmax(dim=-1) == labels).sum().item()
            seen += batch_size

    if seen == 0:
        return float('nan'), float('nan')
    return loss_sum / seen, correct / seen

# Initialize model for 10 classes
model = DINOv3ForClassification(num_classes=10, freeze_backbone=True)

# Train the model
# train_model(model, train_dataloader, val_dataloader)
💡 Fine-tuning Tips:
  • Start with a frozen backbone and only train the classification head
  • Use a smaller learning rate (1e-5 to 5e-5) when unfreezing the backbone
  • Apply gradual unfreezing: unfreeze the last few layers first
  • Use data augmentation to prevent overfitting
7

Performance Optimization

Mixed Precision Training

# Mixed Precision for Faster Training
import torch
from torch.cuda.amp import autocast, GradScaler

def train_with_amp(model, dataloader, optimizer):
    """Run one pass over ``dataloader`` using autocast and a gradient scaler.

    Each batch must provide ``'pixel_values'`` and ``'labels'`` tensors, which
    are moved to CUDA. ``model`` is called with both and must return a mapping
    containing ``'loss'``.
    """
    grad_scaler = GradScaler()
    model.train()

    for step_batch in dataloader:
        images = step_batch['pixel_values'].cuda()
        targets = step_batch['labels'].cuda()

        optimizer.zero_grad()

        # Forward pass runs under autocast; the backward pass does not.
        with autocast():
            step_loss = model(pixel_values=images, labels=targets)['loss']

        # Scale the loss before backward, then let the scaler drive the
        # optimizer step and update its scale factor.
        grad_scaler.scale(step_loss).backward()
        grad_scaler.step(optimizer)
        grad_scaler.update()

Model Quantization

# Post-training Quantization
import torch
from transformers import Dinov2Model

def quantize_model(model):
    """Apply dynamic int8 quantization to the ``nn.Linear`` layers of ``model``.

    Returns whatever ``torch.quantization.quantize_dynamic`` produces for the
    given model.
    """
    # Only linear layers are targeted; every other module type is left alone.
    target_layers = {torch.nn.Linear}
    return torch.quantization.quantize_dynamic(
        model, target_layers, dtype=torch.qint8
    )

# Load and quantize model
model = Dinov2Model.from_pretrained("facebook/dinov2-base")  # Use base for speed
quantized_model = quantize_model(model)

print(f"Original model size: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")
print("Quantized model ready for faster inference")

Inference Optimization

# Optimized Inference Pipeline
class OptimizedDINOv3:
    """Feature extractor wrapper with compilation and CUDA mixed precision.

    Args:
        model_name: Checkpoint used for both the processor and the model.
        device: Device the model is moved to and inputs are sent to. It must
            be available on the machine.
    """

    def __init__(self, model_name="facebook/dinov2-base", device="cuda"):
        self.device = device
        self.processor = Dinov2ImageProcessor.from_pretrained(model_name)
        self.model = Dinov2Model.from_pretrained(model_name).to(device)
        self.model.eval()

        # Compile model for faster inference (PyTorch 2.0+)
        if hasattr(torch, 'compile'):
            self.model = torch.compile(self.model)

    def extract_features(self, images):
        """Return the pooled features for one image or a list/tuple of images.

        Returns:
            ``outputs.pooler_output`` moved to the CPU, one row per image.
        """
        # A tuple is a collection too; wrapping it in a list would hand the
        # processor a nested sequence instead of a batch.
        if isinstance(images, (list, tuple)):
            batch = list(images)
        else:
            batch = [images]

        device_type = torch.device(self.device).type

        with torch.no_grad():
            inputs = self.processor(images=batch, return_tensors="pt")
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            if device_type == "cuda":
                # torch.autocast replaces the deprecated torch.cuda.amp.autocast
                # and needs no extra import.
                with torch.autocast(device_type="cuda"):
                    outputs = self.model(**inputs)
            else:
                # Mixed precision is only applied on CUDA.
                outputs = self.model(**inputs)

            return outputs.pooler_output.cpu()

# Usage
dinov3 = OptimizedDINOv3()
features = dinov3.extract_features(image)
8

Common Issues & Solutions

❌ CUDA Out of Memory

Problem: RuntimeError: CUDA out of memory

Solutions:

  • Use smaller batch sizes
  • Use DINOv2-base instead of large
  • Enable gradient checkpointing
  • Use CPU for inference if needed
# Solution: Reduce batch size
batch_size = 1  # Instead of 8
model = Dinov2Model.from_pretrained(
    "facebook/dinov2-base"  # Instead of large
)

❌ Slow Inference

Problem: Model inference is too slow

Solutions:

  • Use GPU instead of CPU
  • Enable mixed precision
  • Use model compilation (PyTorch 2.0+)
  • Batch multiple images together
# Solution: Enable optimizations
model = model.half().cuda()  # Half precision
model = torch.compile(model)  # Compilation

❌ Import Errors

Problem: ModuleNotFoundError or version conflicts

Solutions:

  • Update transformers: pip install -U transformers
  • Check PyTorch compatibility
  • Use virtual environment
  • Clear pip cache if needed
# Solution: Fresh install
pip uninstall transformers torch
pip install transformers torch torchvision

❌ Poor Performance

Problem: Model doesn't perform well on your data

Solutions:

  • Check image preprocessing
  • Ensure proper image format (RGB)
  • Consider fine-tuning on your domain
  • Use appropriate model size
# Solution: Proper preprocessing
image = image.convert('RGB')  # Ensure RGB
inputs = processor(
    images=image, 
    return_tensors="pt",
    do_resize=True,
    size={"height": 518, "width": 518}
)

🔧 Quick Diagnostic Script

# Run this to diagnose your setup
import torch
import transformers
from PIL import Image

print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Test model loading
try:
    from transformers import Dinov2Model
    model = Dinov2Model.from_pretrained("facebook/dinov2-base")
    print("✅ Model loading successful")
except Exception as e:
    print(f"❌ Model loading failed: {e}")

🚀 Advanced Use Cases

1. Image Retrieval System

# Build an image retrieval system
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class ImageRetrieval:
    """In-memory image retrieval over pooled backbone embeddings.

    ``features_db`` holds one 1-D feature vector per indexed image and
    ``image_paths`` holds the matching paths, in insertion order.
    """

    def __init__(self):
        self.model = Dinov2Model.from_pretrained("facebook/dinov2-large")
        self.processor = Dinov2ImageProcessor.from_pretrained("facebook/dinov2-large")
        self.model.eval()
        self.features_db = []
        self.image_paths = []

    def _extract_features(self, image):
        """Return the pooled embedding of ``image`` as a 1-D NumPy array."""
        inputs = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Row 0 is the single image in the batch.
        return outputs.pooler_output[0].cpu().numpy()

    def add_images(self, image_paths):
        """Embed every image in ``image_paths`` and add it to the index."""
        for path in image_paths:
            # The context manager closes the file; convert('RGB') normalizes
            # grayscale, palette and RGBA inputs.
            with Image.open(path) as img:
                features = self._extract_features(img.convert('RGB'))
            self.features_db.append(features)
            self.image_paths.append(path)

    def search(self, query_image, top_k=5):
        """Return up to ``top_k`` ``(path, similarity)`` pairs, best first.

        An empty index yields an empty list.
        """
        if not self.features_db:
            return []

        query_features = self._extract_features(query_image)
        similarities = cosine_similarity([query_features], self.features_db)[0]
        top_indices = np.argsort(similarities)[::-1][:top_k]

        return [(self.image_paths[i], similarities[i]) for i in top_indices]

2. Zero-shot Object Detection

# Zero-shot object detection with patch features
def detect_objects(image, model, processor, threshold=0.7):
    """Highlight salient regions using the last layer's [CLS] attention.

    Args:
        image: Image accepted by ``processor``.
        model: Called as ``model(**inputs, output_attentions=True)``.
        processor: Called as ``processor(images=..., return_tensors="pt")``.
        threshold: Cut-off in [0, 1], applied to the min-max normalized map.

    Returns:
        ``(attention_map, high_attention)``: the raw head-averaged CLS-to-patch
        attention reshaped to a square grid, and a boolean mask of the cells
        whose normalized attention exceeds ``threshold``.

    Raises:
        ValueError: If the model returns no attention weights, or the number
            of patch tokens is zero or not a perfect square.
    """

    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)

    if outputs.attentions is None:
        # NOTE(review): some attention backends may not expose weights;
        # loading the model with attn_implementation="eager" may be needed.
        raise ValueError("model returned no attention weights")

    # Get attention weights from last layer
    attention = outputs.attentions[-1]  # [batch, heads, seq_len, seq_len]

    # Average across heads and get CLS attention to patches
    cls_attention = attention[0, :, 0, 1:].mean(dim=0)  # Remove CLS token

    # Reshape to spatial dimensions
    num_patches = cls_attention.shape[0]
    side = int(round(num_patches ** 0.5))
    if num_patches == 0 or side * side != num_patches:
        raise ValueError(
            f"cannot arrange {num_patches} patch tokens into a square grid"
        )
    attention_map = cls_attention.reshape(side, side)

    # Raw attention is a softmax over all tokens, so individual patch values
    # are tiny and a threshold like 0.7 would almost never fire. Rescale to
    # [0, 1] so that the threshold is relative to this image's range.
    lowest = attention_map.min()
    value_range = (attention_map.max() - lowest).clamp_min(1e-12)
    normalized = (attention_map - lowest) / value_range

    # Find high attention regions
    high_attention = normalized > threshold

    return attention_map, high_attention

🔗 Additional Resources

📚 Documentation

💻 Code Examples

🎯 Applications

  • Image Classification
  • Feature Extraction
  • Zero-shot Detection
  • Image Retrieval