Source code for pyod.utils.encoders.huggingface

# -*- coding: utf-8 -*-
"""HuggingFaceEncoder for EmbeddingOD."""
# Author: Yue Zhao <yzhao062@gmail.com>
# License: BSD 2 clause

import numpy as np

try:
    import torch
    from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
except ImportError:
    torch = None
    AutoModel = None

from . import BaseEncoder


class HuggingFaceEncoder(BaseEncoder):
    """Encoder using HuggingFace transformers.

    Supports both text (AutoTokenizer + AutoModel) and image
    (AutoImageProcessor + AutoModel) modalities. The model and its
    tokenizer/processor are loaded lazily on the first call to
    :meth:`encode`, not at construction time.

    Parameters
    ----------
    model_name : str
        HuggingFace model name or path.

    device : str or None, optional (default=None)
        Device for inference. None for auto-detection
        (CUDA, then MPS, then CPU).

    pooling : str, optional (default='cls')
        Pooling strategy: 'cls' for CLS token, 'mean' for mean of
        all token embeddings.

    modality : str, optional (default='text')
        Input modality: 'text' or 'image'. Any value other than
        'image' is handled as text.

    Attributes
    ----------
    device_ : torch.device
        Device actually used for inference. Set on first use.

    model_ : object
        Model returned by ``AutoModel.from_pretrained``, in eval mode.
        Set on first use.

    processor_ : object
        Tokenizer (text) or image processor (image). Set on first use.

    Examples
    --------
    >>> from pyod.utils.encoders.huggingface import HuggingFaceEncoder
    >>> encoder = HuggingFaceEncoder('bert-base-uncased', modality='text')
    >>> embeddings = encoder.encode(["hello", "world"])
    """

    def __init__(self, model_name, device=None, pooling='cls',
                 modality='text'):
        # AutoModel is None when the optional torch/transformers import
        # at module level failed.
        if AutoModel is None:
            raise ImportError(
                "HuggingFaceEncoder requires 'transformers' and 'torch'. "
                "Install with: pip install transformers torch")
        self.model_name = model_name
        self.device = device
        self.pooling = pooling
        self.modality = modality

    def _load_model(self):
        """Load model and processor/tokenizer on first use."""
        if self.device is None:
            # Auto-detect: prefer CUDA, then Apple MPS, then CPU.
            if torch.cuda.is_available():
                self.device_ = torch.device('cuda')
            elif hasattr(torch.backends, 'mps') and \
                    torch.backends.mps.is_available():
                self.device_ = torch.device('mps')
            else:
                self.device_ = torch.device('cpu')
        else:
            self.device_ = torch.device(self.device)

        self.model_ = AutoModel.from_pretrained(
            self.model_name).to(self.device_)
        self.model_.eval()

        if self.modality == 'image':
            self.processor_ = AutoImageProcessor.from_pretrained(
                self.model_name)
        else:
            self.processor_ = AutoTokenizer.from_pretrained(
                self.model_name)

    def encode(self, X, batch_size=32, show_progress=True):
        """Encode text or images to embeddings.

        Parameters
        ----------
        X : list of str (text) or list of PIL.Image (image)
            Input data. Must contain at least one sample.

        batch_size : int, optional (default=32)
            Batch size for encoding. Must be >= 1.

        show_progress : bool, optional (default=True)
            Show progress bar. Accepted for API compatibility; this
            implementation does not currently display one.

        Returns
        -------
        embeddings : numpy array of shape (n_samples, n_features)

        Raises
        ------
        ValueError
            If ``pooling`` is not 'cls' or 'mean', if ``batch_size`` is
            smaller than 1, or if ``X`` is empty.
        """
        # Validate cheap arguments before loading the model or running
        # a forward pass, so misconfiguration fails fast.
        if self.pooling not in ('cls', 'mean'):
            raise ValueError(
                "pooling must be 'cls' or 'mean', got '%s'" % self.pooling)
        if batch_size < 1:
            raise ValueError(
                "batch_size must be >= 1, got %r" % (batch_size,))
        n_samples = len(X)
        if n_samples == 0:
            # np.concatenate([]) would otherwise fail with an opaque error.
            raise ValueError("X must contain at least one sample.")

        if not hasattr(self, 'model_'):
            self._load_model()

        # Single source of truth for the modality branch: anything that
        # is not 'image' is treated as text, matching _load_model.
        is_image = self.modality == 'image'

        all_embeddings = []
        for start in range(0, n_samples, batch_size):
            batch = list(X[start:start + batch_size])

            if is_image:
                inputs = self.processor_(
                    images=batch, return_tensors='pt'
                ).to(self.device_)
            else:
                inputs = self.processor_(
                    batch, return_tensors='pt',
                    padding=True, truncation=True, max_length=512
                ).to(self.device_)

            with torch.no_grad():
                outputs = self.model_(**inputs)
            hidden = outputs.last_hidden_state

            if self.pooling == 'cls':
                emb = hidden[:, 0, :]
            elif is_image:
                # Skips position 0 before averaging -- presumably the CLS
                # token of ViT-style models; TODO confirm for image
                # models that have no such token.
                emb = hidden[:, 1:, :].mean(dim=1)
            else:
                # Masked mean so padding positions do not contribute.
                mask = inputs['attention_mask'].unsqueeze(-1).float()
                # Clamp guards against 0/0 -> NaN for an all-zero mask;
                # it leaves rows with a non-empty mask unchanged.
                denom = mask.sum(dim=1).clamp(min=1e-9)
                emb = (hidden * mask).sum(dim=1) / denom

            all_embeddings.append(emb.cpu().numpy())

        embeddings = np.concatenate(all_embeddings, axis=0)
        return self._validate_output(embeddings, n_samples=n_samples)