Installation & Setup
Prerequisites
Before getting started, ensure you have:
- Python 3.8 or higher
- PyTorch 1.11.0 or higher
- At least 8GB of RAM (16GB recommended)
- CUDA-compatible GPU (optional but recommended)
Installation Methods
📦 Method 1: pip (Recommended)
# Install transformers and dependencies pip install transformers torch torchvision pip install pillow requests # Optional: Install with specific PyTorch version pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118
🐍 Method 2: conda
# Create new environment conda create -n dinov3 python=3.9 conda activate dinov3 # Install packages conda install pytorch torchvision -c pytorch pip install transformers pillow
# Import transformers and print its version to confirm the installation works
python -c "import transformers; print(transformers.__version__)"
Quick Start Example
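Let's start with a simple example to get DINOv3 running in just a few lines of code. Note that the snippets in this guide load the `facebook/dinov2-*` checkpoints through the `Dinov2Model` class: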
# Quick Start: DINOv3 Feature Extraction
from transformers import AutoImageProcessor, Dinov2Model
from PIL import Image
import torch
import requests

# Load model and processor.
# AutoImageProcessor picks the processor class declared by the checkpoint,
# so no model-specific processor import is needed.
processor = AutoImageProcessor.from_pretrained("facebook/dinov2-large")
model = Dinov2Model.from_pretrained("facebook/dinov2-large")

# Load a sample image directly from the HTTP response stream
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png"
image = Image.open(requests.get(url, stream=True).raw)

# Process image and extract features (no gradients are needed for inference)
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Get feature embeddings
features = outputs.pooler_output  # Shape: [1, 1024]
print(f"Feature shape: {features.shape}")
print(f"Feature norm: {torch.norm(features):.4f}")
Feature shape: torch.Size([1, 1024])
Feature norm: 32.1847
Image Classification with DINOv3
While DINOv3 is primarily a feature extractor, you can easily build a classifier on top of it:
# Image Classification Example
import torch
import torch.nn as nn
from transformers import AutoImageProcessor, Dinov2Model
from PIL import Image
import requests


class DINOv3Classifier(nn.Module):
    """Pretrained backbone followed by a single linear classification layer.

    Args:
        num_classes: Number of output logits.
        model_name: Checkpoint name passed to ``Dinov2Model.from_pretrained``.
    """

    def __init__(self, num_classes, model_name="facebook/dinov2-large"):
        super().__init__()
        self.backbone = Dinov2Model.from_pretrained(model_name)
        # The head is a brand-new nn.Linear: its weights are randomly
        # initialised, so predictions are not meaningful until it is trained.
        self.classifier = nn.Linear(self.backbone.config.hidden_size, num_classes)

    def forward(self, pixel_values):
        """Return class logits for a batch of preprocessed images."""
        outputs = self.backbone(pixel_values=pixel_values)
        return self.classifier(outputs.pooler_output)


# Initialize model for 1000 classes (ImageNet)
model = DINOv3Classifier(num_classes=1000)
# AutoImageProcessor picks the processor class declared by the checkpoint,
# so no model-specific processor import is needed.
processor = AutoImageProcessor.from_pretrained("facebook/dinov2-large")

# Load and process image
url = "https://upload.wikimedia.org/wikipedia/commons/3/30/Vulpes_vulpes_ssp_fulvus.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(images=image, return_tensors="pt")

# Forward pass
with torch.no_grad():
    logits = model(inputs['pixel_values'])
    predictions = torch.nn.functional.softmax(logits, dim=-1)

# Get top 5 predictions (first and only image in the batch)
top5_prob, top5_catid = torch.topk(predictions, 5)
for class_id, prob in zip(top5_catid[0], top5_prob[0]):
    print(f"Class {class_id}: {prob*100:.2f}%")
Advanced Feature Extraction
DINOv3 excels at extracting rich visual features. Here's how to get different types of features:
# Advanced Feature Extraction
from transformers import AutoImageProcessor, Dinov2Model
import torch
import numpy as np


def extract_features(images, model, processor):
    """Extract different types of features from DINOv3.

    Args:
        images: Image (or images) in any form accepted by ``processor``.
        model: Callable backbone; invoked with the processor output plus
            ``output_hidden_states=True``.
        processor: Image processor returning a mapping of PyTorch tensors.

    Returns:
        dict with the keys ``'cls_token'`` (pooled output),
        ``'patch_embeddings'`` (last hidden state), ``'mean_pooled'``
        (last hidden state averaged over dim 1) and ``'all_layers'``
        (hidden states from every layer).
    """
    # Process images
    inputs = processor(images=images, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    last_hidden = outputs.last_hidden_state
    return {
        'cls_token': outputs.pooler_output,      # [CLS] token features
        'patch_embeddings': last_hidden,         # All token features of the last layer
        'mean_pooled': last_hidden.mean(dim=1),  # Mean pooled over the token axis
        'all_layers': outputs.hidden_states,     # Features from all layers
    }


# Load model
model = Dinov2Model.from_pretrained("facebook/dinov2-large")
# AutoImageProcessor picks the processor class declared by the checkpoint,
# so no model-specific processor import is needed.
processor = AutoImageProcessor.from_pretrained("facebook/dinov2-large")

# Extract features (`image` must already hold an image accepted by the processor)
features = extract_features(image, model, processor)

# Print feature shapes
for key, value in features.items():
    if key == 'all_layers':
        # A sequence of per-layer tensors rather than a single tensor
        print(f"{key}: {len(value)} layers, each {value[0].shape}")
    else:
        print(f"{key}: {value.shape}")
cls_token: torch.Size([1, 1024])
patch_embeddings: torch.Size([1, 257, 1024])
mean_pooled: torch.Size([1, 1024])
all_layers: 25 layers, each torch.Size([1, 257, 1024])
Feature Similarity Comparison
# Compare similarity between two images
def compute_similarity(image1, image2, model, processor):
    """Return the cosine similarity of the ``'cls_token'`` features of two images.

    Both images go through ``extract_features`` with the same model and
    processor; the result is converted to a Python float with ``.item()``.
    """
    first, second = (
        extract_features(img, model, processor)['cls_token']
        for img in (image1, image2)
    )
    score = torch.nn.functional.cosine_similarity(first, second)
    return score.item()


# Example usage (image1 and image2 must be defined by the caller beforehand)
similarity_score = compute_similarity(image1, image2, model, processor)
print(f"Similarity score: {similarity_score:.4f}")
Efficient Batch Processing
For production use, you'll want to process multiple images efficiently:
# Batch Processing Example
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoImageProcessor, Dinov2Model
from PIL import Image
import os


class ImageDataset(Dataset):
    """Dataset that opens image files and preprocesses them one at a time.

    Args:
        image_paths: Sequence of image file paths.
        processor: Image processor called as
            ``processor(images=..., return_tensors="pt")``.
    """

    def __init__(self, image_paths, processor):
        self.image_paths = image_paths
        self.processor = processor

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``{'pixel_values': tensor, 'path': str}`` for item ``idx``."""
        image_path = self.image_paths[idx]
        # Close the file handle as soon as the pixels are decoded; convert()
        # returns an independent image, so it stays valid after the `with`.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        inputs = self.processor(images=image, return_tensors="pt")
        return {
            # Drop the leading batch axis added by the processor so the
            # DataLoader can stack the samples itself.
            'pixel_values': inputs['pixel_values'].squeeze(0),
            'path': image_path,
        }


def process_images_batch(image_dir, batch_size=8):
    """Process images in batches for efficiency.

    Args:
        image_dir: Directory scanned (non-recursively) for .jpg/.jpeg/.png files.
        batch_size: Number of images per forward pass.

    Returns:
        ``(features, paths)``: the pooled features concatenated along dim 0 and
        the matching file paths in the same (sorted) order. For a directory
        without matching images, an empty ``(0, hidden_size)`` tensor and an
        empty list are returned.
    """
    # Setup. AutoImageProcessor picks the processor class declared by the
    # checkpoint, so no model-specific processor import is needed.
    processor = AutoImageProcessor.from_pretrained("facebook/dinov2-large")
    model = Dinov2Model.from_pretrained("facebook/dinov2-large")
    model.eval()

    # Get image paths; sorted so results are reproducible (os.listdir order is arbitrary)
    image_paths = sorted(
        os.path.join(image_dir, f)
        for f in os.listdir(image_dir)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )

    # torch.cat() rejects an empty list, so handle "no images" explicitly.
    if not image_paths:
        return torch.empty(0, model.config.hidden_size), []

    # Create dataset and dataloader
    dataset = ImageDataset(image_paths, processor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    all_features = []
    all_paths = []

    with torch.no_grad():
        for batch in dataloader:
            paths = batch['path']

            # Forward pass
            outputs = model(pixel_values=batch['pixel_values'])

            all_features.append(outputs.pooler_output)
            all_paths.extend(paths)

            print(f"Processed batch: {len(paths)} images")

    # Combine all features
    return torch.cat(all_features, dim=0), all_paths


# Usage example
# features, paths = process_images_batch("path/to/your/images")
Fine-tuning DINOv3
Fine-tune DINOv3 on your custom dataset for domain-specific tasks:
# Fine-tuning Setup
import torch
import torch.nn as nn
# AdamW comes from PyTorch; the copy that used to live in transformers is
# deprecated and missing from recent releases.
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import Dinov2Model
from transformers import get_linear_schedule_with_warmup


class DINOv3ForClassification(nn.Module):
    """Pretrained backbone with a two-layer MLP classification head.

    Args:
        num_classes: Number of output logits.
        freeze_backbone: When True, every backbone parameter gets
            ``requires_grad = False`` so only the head is trained.
    """

    def __init__(self, num_classes, freeze_backbone=False):
        super().__init__()
        self.backbone = Dinov2Model.from_pretrained("facebook/dinov2-large")

        # Optionally freeze backbone
        if freeze_backbone:
            for param in self.backbone.parameters():
                param.requires_grad = False

        # Classification head
        self.classifier = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(self.backbone.config.hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, num_classes),
        )

    def forward(self, pixel_values, labels=None):
        """Return ``{'loss': ..., 'logits': ...}``; loss is None without labels."""
        outputs = self.backbone(pixel_values=pixel_values)
        logits = self.classifier(outputs.pooler_output)

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)

        return {'loss': loss, 'logits': logits}


def train_model(model, train_dataloader, val_dataloader, num_epochs=3):
    """Training loop for fine-tuning.

    Args:
        model: Module called as ``model(pixel_values=..., labels=...)`` that
            returns a dict with a ``'loss'`` entry.
        train_dataloader: Sized iterable of batches holding ``'pixel_values'``
            and ``'labels'`` tensors.
        val_dataloader: Accepted for interface compatibility; this loop does
            not run validation.
        num_epochs: Number of passes over ``train_dataloader``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Only hand trainable parameters to the optimizer and the gradient
    # clipper; frozen backbone weights never receive gradients.
    trainable_params = [p for p in model.parameters() if p.requires_grad]

    # Optimizer and scheduler
    optimizer = AdamW(trainable_params, lr=5e-5, weight_decay=0.01)
    num_batches = len(train_dataloader)
    total_steps = num_batches * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps,
    )

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0

        for batch_idx, batch in enumerate(train_dataloader):
            pixel_values = batch['pixel_values'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            # Forward pass
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs['loss']

            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(trainable_params, 1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

            if batch_idx % 100 == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.4f}")

        # max(..., 1) avoids a ZeroDivisionError for an empty dataloader.
        avg_loss = total_loss / max(num_batches, 1)
        print(f"Epoch {epoch} completed. Average loss: {avg_loss:.4f}")


# Initialize model for 10 classes
model = DINOv3ForClassification(num_classes=10, freeze_backbone=True)

# Train the model
# train_model(model, train_dataloader, val_dataloader)
- Start with a frozen backbone and only train the classification head
- Use a smaller learning rate (1e-5 to 5e-5) when unfreezing the backbone
- Apply gradual unfreezing: unfreeze the last few layers first
- Use data augmentation to prevent overfitting
Performance Optimization
Mixed Precision Training
# Mixed Precision for Faster Training
import torch
from torch.cuda.amp import autocast, GradScaler


def train_with_amp(model, dataloader, optimizer):
    """Training with Automatic Mixed Precision.

    Runs one pass over ``dataloader``. Each batch must provide
    ``'pixel_values'`` and ``'labels'`` tensors, which are moved to the GPU;
    ``model`` must return a dict with a ``'loss'`` entry.
    """
    grad_scaler = GradScaler()
    model.train()

    for step_batch in dataloader:
        images, targets = (
            step_batch[key].cuda() for key in ('pixel_values', 'labels')
        )

        optimizer.zero_grad()

        # Forward pass with autocast
        with autocast():
            step_loss = model(pixel_values=images, labels=targets)['loss']

        # Scaled backward pass: scale the loss, step through the scaler,
        # then let it update its scale factor
        grad_scaler.scale(step_loss).backward()
        grad_scaler.step(optimizer)
        grad_scaler.update()
Model Quantization
# Post-training Quantization
import torch
from transformers import Dinov2Model


def quantize_model(model):
    """Quantize model for faster inference.

    Returns the result of ``torch.quantization.quantize_dynamic`` applied to
    the ``torch.nn.Linear`` modules of ``model`` with ``torch.qint8`` weights.
    """
    linear_only = {torch.nn.Linear}
    return torch.quantization.quantize_dynamic(
        model, linear_only, dtype=torch.qint8
    )


# Load and quantize model
model = Dinov2Model.from_pretrained("facebook/dinov2-base")  # Use base for speed
quantized_model = quantize_model(model)

# Parameter count of the original (unquantized) model, in millions
print(f"Original model size: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")
print("Quantized model ready for faster inference")
Inference Optimization
# Optimized Inference Pipeline
import contextlib

import torch
from transformers import AutoImageProcessor, Dinov2Model


class OptimizedDINOv3:
    """Feature extractor that keeps the model on one device and compiles it.

    Args:
        model_name: Checkpoint name passed to ``from_pretrained``.
        device: Device the model and the inputs are moved to.
    """

    def __init__(self, model_name="facebook/dinov2-base", device="cuda"):
        self.device = device
        # AutoImageProcessor picks the processor class declared by the
        # checkpoint, so no model-specific processor import is needed.
        self.processor = AutoImageProcessor.from_pretrained(model_name)
        self.model = Dinov2Model.from_pretrained(model_name).to(device)
        self.model.eval()

        # Compile model for faster inference (PyTorch 2.0+)
        if hasattr(torch, 'compile'):
            self.model = torch.compile(self.model)

    def extract_features(self, images):
        """Optimized feature extraction.

        Args:
            images: A single image or a list of images accepted by the processor.

        Returns:
            The model's pooled output, moved to the CPU.
        """
        batch = images if isinstance(images, list) else [images]

        # Mixed precision is only enabled on CUDA; elsewhere the forward
        # pass runs in the model's own precision.
        if torch.device(self.device).type == "cuda":
            amp_context = torch.autocast(device_type="cuda")
        else:
            amp_context = contextlib.nullcontext()

        with torch.no_grad():
            inputs = self.processor(images=batch, return_tensors="pt")
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Use half precision for inference
            with amp_context:
                outputs = self.model(**inputs)

        return outputs.pooler_output.cpu()


# Usage (`image` must already hold an image accepted by the processor)
dinov3 = OptimizedDINOv3()
features = dinov3.extract_features(image)
Common Issues & Solutions
❌ CUDA Out of Memory
Problem: RuntimeError: CUDA out of memory
Solutions:
- Use smaller batch sizes
- Use DINOv2-base instead of large
- Enable gradient checkpointing
- Use CPU for inference if needed
# Solution: Reduce batch size and load the smaller checkpoint
batch_size = 1  # Instead of 8
model = Dinov2Model.from_pretrained("facebook/dinov2-base")  # Instead of large
❌ Slow Inference
Problem: Model inference is too slow
Solutions:
- Use GPU instead of CPU
- Enable mixed precision
- Use model compilation (PyTorch 2.0+)
- Batch multiple images together
# Solution: Enable optimizations
# Half precision on the GPU first, then compilation of the converted model
model = torch.compile(model.half().cuda())
❌ Import Errors
Problem: ModuleNotFoundError or version conflicts
Solutions:
- Update transformers: `pip install -U transformers`
- Check PyTorch compatibility
- Use virtual environment
- Clear pip cache if needed
# Solution: Fresh install pip uninstall transformers torch pip install transformers torch torchvision
❌ Poor Performance
Problem: Model doesn't perform well on your data
Solutions:
- Check image preprocessing
- Ensure proper image format (RGB)
- Consider fine-tuning on your domain
- Use appropriate model size
# Solution: Proper preprocessing
image = image.convert('RGB')  # Ensure RGB
inputs = processor(
    images=image,
    return_tensors="pt",
    do_resize=True,
    size={"height": 518, "width": 518},
    # Crop at the resize resolution so the 518x518 image reaches the model
    # intact. NOTE(review): the dinov2 checkpoints' processor config is
    # believed to center-crop to 224x224 by default, which would discard most
    # of the resized image — confirm against preprocessor_config.json.
    crop_size={"height": 518, "width": 518},
)
🔧 Quick Diagnostic Script
# Run this to diagnose your setup
import torch
import transformers
from PIL import Image

# Library versions
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")

# GPU visibility; device details are only queried when CUDA is usable
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Test model loading; any failure is reported instead of aborting the script
try:
    from transformers import Dinov2Model
    model = Dinov2Model.from_pretrained("facebook/dinov2-base")
except Exception as e:
    print(f"❌ Model loading failed: {e}")
else:
    print("✅ Model loading successful")
🚀 Advanced Use Cases
1. Image Retrieval System
# Build an image retrieval system
import numpy as np
import torch
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoImageProcessor, Dinov2Model


class ImageRetrieval:
    """In-memory image index searched by cosine similarity of pooled features."""

    def __init__(self):
        self.model = Dinov2Model.from_pretrained("facebook/dinov2-large")
        # AutoImageProcessor picks the processor class declared by the
        # checkpoint, so no model-specific processor import is needed.
        self.processor = AutoImageProcessor.from_pretrained("facebook/dinov2-large")
        self.features_db = []   # one 1-D feature vector per indexed image
        self.image_paths = []   # path of each indexed image, same order as features_db

    def _extract_features(self, image):
        """Return the pooled feature vector of one image as a 1-D NumPy array."""
        inputs = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Index 0 drops the batch axis so the vector can be stored and
        # compared as a single row.
        return outputs.pooler_output[0].cpu().numpy()

    def add_images(self, image_paths):
        """Extract and store features for every path in ``image_paths``."""
        for path in image_paths:
            # Close the file once decoded; convert() yields an independent image.
            with Image.open(path) as raw_image:
                image = raw_image.convert('RGB')
            self.features_db.append(self._extract_features(image))
            self.image_paths.append(path)

    def search(self, query_image, top_k=5):
        """Return up to ``top_k`` ``(path, similarity)`` pairs, best match first.

        An empty index yields an empty list.
        """
        if not self.features_db:
            return []

        query_features = self._extract_features(query_image)
        similarities = cosine_similarity([query_features], self.features_db)[0]

        # argsort is ascending; reverse it and keep the first top_k entries
        top_indices = np.argsort(similarities)[::-1][:top_k]
        return [(self.image_paths[i], similarities[i]) for i in top_indices]
2. Zero-shot Object Detection
# Zero-shot object detection with patch features
def detect_objects(image, model, processor, threshold=0.7):
    """Simple object detection using patch attention.

    Args:
        image: Image accepted by ``processor``.
        model: Callable returning an object whose ``attentions[-1]`` is indexed
            as [batch, heads, seq_len, seq_len], with token 0 being CLS.
        processor: Image processor returning a mapping of PyTorch tensors.
        threshold: Cut-off in [0, 1], applied to the attention map after
            min-max normalisation.

    Returns:
        ``(attention_map, high_attention)``: the raw CLS-to-patch attention
        reshaped to a square grid, and a boolean mask of the same shape marking
        cells whose normalised attention exceeds ``threshold``.
    """
    import math  # local import: this snippet has no import section of its own

    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)

    # Get attention weights from last layer
    attention = outputs.attentions[-1]  # [batch, heads, seq_len, seq_len]

    # First image: attention from CLS (row 0) to every other token
    # (columns 1:), averaged across heads
    cls_attention = attention[0, :, 0, 1:].mean(dim=0)

    # Reshape to spatial dimensions (assumes a square patch grid)
    side = math.isqrt(cls_attention.shape[0])
    attention_map = cls_attention.reshape(side, side)

    if attention_map.numel() == 0:
        return attention_map, attention_map > threshold

    # Attention weights are spread over all tokens, so raw per-patch values
    # are typically far below a threshold such as 0.7. Rescale the map to
    # [0, 1] first so the threshold is relative to the strongest patch.
    lowest = attention_map.min()
    value_range = (attention_map.max() - lowest).clamp_min(
        torch.finfo(attention_map.dtype).tiny
    )
    normalized = (attention_map - lowest) / value_range

    # Find high attention regions
    high_attention = normalized > threshold

    return attention_map, high_attention