Source code for pyod.utils.encoders.openai_encoder

# -*- coding: utf-8 -*-
"""OpenAIEncoder for EmbeddingOD."""
# Author: Yue Zhao <yzhao062@gmail.com>
# License: BSD 2 clause

import os

import numpy as np

try:
    from openai import OpenAI
except ImportError:
    OpenAI = None

from . import BaseEncoder

# Maximum number of inputs sent in a single embeddings request (OpenAI API
# limit per request); OpenAIEncoder.encode clamps its batch_size to this.
_MAX_BATCH_SIZE = 2048  # OpenAI API limit per request


class OpenAIEncoder(BaseEncoder):
    """Encoder using OpenAI Embeddings API.

    Produces text embeddings via the OpenAI API. Handles batching
    (max 2048 items per request) internally.

    Parameters
    ----------
    model_name : str, optional (default='text-embedding-3-small')
        OpenAI embedding model name.

    dimensions : int or None, optional (default=None)
        Truncate embeddings to this dimensionality (Matryoshka).
        Only supported by text-embedding-3-* models.

    api_key : str or None, optional (default=None)
        OpenAI API key. Falls back to OPENAI_API_KEY environment
        variable.

    Attributes
    ----------
    client_ : openai.OpenAI
        API client. Created lazily on the first call to ``encode`` and
        reused by later calls.

    Examples
    --------
    >>> from pyod.utils.encoders.openai_encoder import OpenAIEncoder
    >>> encoder = OpenAIEncoder('text-embedding-3-small')
    >>> embeddings = encoder.encode(["normal text", "anomalous text"])
    """

    def __init__(self, model_name='text-embedding-3-small',
                 dimensions=None, api_key=None):
        # ``OpenAI`` is set to None at import time when the optional
        # 'openai' package is missing; fail here with an actionable hint.
        if OpenAI is None:
            raise ImportError(
                "OpenAIEncoder requires 'openai'. "
                "Install with: pip install openai")
        self.model_name = model_name
        self.dimensions = dimensions
        # An explicitly passed key wins over the environment variable.
        self.api_key = api_key or os.environ.get('OPENAI_API_KEY')

    def encode(self, X, batch_size=2048, show_progress=True):
        """Encode text strings to embeddings via OpenAI API.

        Parameters
        ----------
        X : list of str
            Text strings to encode.

        batch_size : int, optional (default=2048)
            Batch size. Must be at least 1. Capped at 2048
            (OpenAI API limit).

        show_progress : bool, optional (default=True)
            Show progress bar (not used for API calls).

        Returns
        -------
        embeddings : numpy array of shape (n_samples, n_features)

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1.
        """
        # Reject non-positive batch sizes up front. A value of 0 would
        # otherwise surface as an opaque "range() arg 3 must not be zero"
        # error, and a negative value would yield an empty range, so no
        # request would be sent at all.
        if batch_size < 1:
            raise ValueError(
                "batch_size must be a positive integer, "
                "got {!r}".format(batch_size))

        # Build the API client on first use and keep it for later calls.
        if not hasattr(self, 'client_'):
            self.client_ = OpenAI(api_key=self.api_key)

        batch_size = min(batch_size, _MAX_BATCH_SIZE)

        all_embeddings = []
        for start in range(0, len(X), batch_size):
            batch = list(X[start:start + batch_size])
            kwargs = {
                'model': self.model_name,
                'input': batch,
                'encoding_format': 'float',
            }
            # Only forward 'dimensions' when the caller asked for
            # truncation; it is only supported by text-embedding-3-*.
            if self.dimensions is not None:
                kwargs['dimensions'] = self.dimensions

            response = self.client_.embeddings.create(**kwargs)
            all_embeddings.extend(item.embedding for item in response.data)

        embeddings = np.array(all_embeddings)
        return self._validate_output(embeddings, n_samples=len(X))