# Embeddings_Create.py
# Description: Functions for creating and managing embeddings in ChromaDB via Llama.cpp/OpenAI/Transformers
#
# Imports:
import logging
import os
import time
from functools import wraps
from threading import Lock, Timer
from typing import List
#
# 3rd-Party Imports:
import numpy as np
import onnxruntime as ort
import requests
from transformers import AutoTokenizer, AutoModel
import torch
#
# Local Imports:
from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings
from App_Function_Libraries.Utils.Utils import load_comprehensive_config
from App_Function_Libraries.Metrics.metrics_logger import log_counter, log_histogram
#
#######################################################################################################################
#
# Functions:
# FIXME - Version 2
# Load configuration
loaded_config = load_comprehensive_config()
embedding_provider = loaded_config['Embeddings']['embedding_provider']
embedding_model = loaded_config['Embeddings']['embedding_model']
embedding_api_url = loaded_config['Embeddings']['embedding_api_url']
embedding_api_key = loaded_config['Embeddings']['embedding_api_key']
model_dir = loaded_config['Embeddings'].get('model_dir', './App_Function_Libraries/models/embedding_models/')
# Embedding Chunking Settings
chunk_size = loaded_config['Embeddings']['chunk_size']
overlap = loaded_config['Embeddings']['overlap']
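# NOTE: Depending on how load_comprehensive_config parses values, chunk_size and
# overlap may come back as strings (configparser behaves this way). A minimal
# coercion sketch, assuming string values (skip if the loader already returns ints):
#   chunk_size = int(chunk_size)
#   overlap = int(overlap)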
# Global cache for embedding models
embedding_models = {}
# Pinned commit hashes per model, passed as revision= to from_pretrained so downloads are reproducible
commit_hashes = {
"jinaai/jina-embeddings-v3": "4be32c2f5d65b95e4bcce473545b7883ec8d2edd",
"Alibaba-NLP/gte-large-en-v1.5": "104333d6af6f97649377c2afbde10a7704870c7b",
"dunzhang/setll_en_400M_v5": "2aa5579fcae1c579de199a3866b6e514bbbf5d10"
}
class HuggingFaceEmbedder:
def __init__(self, model_name, cache_dir, timeout_seconds=30):
self.model_name = model_name
self.cache_dir = cache_dir # Store cache_dir
self.tokenizer = None
self.model = None
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.timeout_seconds = timeout_seconds
self.last_used_time = 0
self.unload_timer = None
log_counter("huggingface_embedder_init", labels={"model_name": model_name})
def load_model(self):
log_counter("huggingface_model_load_attempt", labels={"model_name": self.model_name})
start_time = time.time()
# https://huggingface.co/docs/transformers/custom_models
if self.model is None:
# Pass cache_dir to from_pretrained to specify download directory
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_name,
trust_remote_code=True,
cache_dir=self.cache_dir, # Specify cache directory
revision=commit_hashes.get(self.model_name, None) # Pass commit hash
)
self.model = AutoModel.from_pretrained(
self.model_name,
trust_remote_code=True,
cache_dir=self.cache_dir, # Specify cache directory
revision=commit_hashes.get(self.model_name, None) # Pass commit hash
)
self.model.to(self.device)
self.last_used_time = time.time()
self.reset_timer()
load_time = time.time() - start_time
log_histogram("huggingface_model_load_duration", load_time, labels={"model_name": self.model_name})
log_counter("huggingface_model_load_success", labels={"model_name": self.model_name})
def unload_model(self):
log_counter("huggingface_model_unload", labels={"model_name": self.model_name})
if self.model is not None:
del self.model
del self.tokenizer
if torch.cuda.is_available():
torch.cuda.empty_cache()
self.model = None
self.tokenizer = None
if self.unload_timer:
self.unload_timer.cancel()
def reset_timer(self):
if self.unload_timer:
self.unload_timer.cancel()
self.unload_timer = Timer(self.timeout_seconds, self.unload_model)
self.unload_timer.start()
def create_embeddings(self, texts):
log_counter("huggingface_create_embeddings_attempt", labels={"model_name": self.model_name})
start_time = time.time()
self.load_model()
# https://huggingface.co/docs/transformers/custom_models
inputs = self.tokenizer(
texts,
return_tensors="pt",
padding=True,
truncation=True,
max_length=512
)
inputs = {k: v.to(self.device) for k, v in inputs.items()}
        try:
            with torch.no_grad():
                outputs = self.model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1)  # simple mean pooling (padding tokens included)
            embedding_time = time.time() - start_time
            log_histogram("huggingface_create_embeddings_duration", embedding_time,
                          labels={"model_name": self.model_name})
            log_counter("huggingface_create_embeddings_success", labels={"model_name": self.model_name})
            return embeddings.cpu().float().numpy()  # Convert to float32 before returning
        except RuntimeError as e:
            if "Got unsupported ScalarType BFloat16" in str(e):
                logging.warning("BFloat16 not supported. Falling back to float32.")
                # Convert model weights to float32 and retry once
                self.model = self.model.float()
                with torch.no_grad():
                    outputs = self.model(**inputs)
                embeddings = outputs.last_hidden_state.mean(dim=1)
                embedding_time = time.time() - start_time
                log_histogram("huggingface_create_embeddings_duration", embedding_time,
                              labels={"model_name": self.model_name})
                log_counter("huggingface_create_embeddings_success", labels={"model_name": self.model_name})
                return embeddings.cpu().float().numpy()
            else:
                log_counter("huggingface_create_embeddings_failure", labels={"model_name": self.model_name})
                raise
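# Example usage of HuggingFaceEmbedder (a minimal sketch; the model name below is
# illustrative, any of the pinned models in commit_hashes would work the same way):
#
#   embedder = HuggingFaceEmbedder("jinaai/jina-embeddings-v3", model_dir, timeout_seconds=60)
#   vectors = embedder.create_embeddings(["first chunk", "second chunk"])
#   # vectors is a (2, hidden_size) float32 numpy array; the model loads lazily on
#   # first use and unloads itself after 60 idle seconds.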
class ONNXEmbedder:
def __init__(self, model_name, onnx_model_dir, timeout_seconds=30):
self.model_name = model_name
self.model_path = os.path.join(onnx_model_dir, f"{model_name}.onnx")
# https://huggingface.co/docs/transformers/custom_models
self.tokenizer = AutoTokenizer.from_pretrained(
model_name,
trust_remote_code=True,
cache_dir=onnx_model_dir, # Ensure tokenizer uses the same directory
revision=commit_hashes.get(model_name, None) # Pass commit hash
)
self.session = None
self.timeout_seconds = timeout_seconds
self.last_used_time = 0
self.unload_timer = None
self.device = "cpu" # ONNX Runtime will default to CPU unless GPU is configured
log_counter("onnx_embedder_init", labels={"model_name": model_name})
def load_model(self):
log_counter("onnx_model_load_attempt", labels={"model_name": self.model_name})
start_time = time.time()
if self.session is None:
if not os.path.exists(self.model_path):
raise FileNotFoundError(f"ONNX model not found at {self.model_path}")
logging.info(f"Loading ONNX model from {self.model_path}")
self.session = ort.InferenceSession(self.model_path)
self.last_used_time = time.time()
self.reset_timer()
load_time = time.time() - start_time
log_histogram("onnx_model_load_duration", load_time, labels={"model_name": self.model_name})
log_counter("onnx_model_load_success", labels={"model_name": self.model_name})
def unload_model(self):
log_counter("onnx_model_unload", labels={"model_name": self.model_name})
if self.session is not None:
logging.info("Unloading ONNX model to free resources.")
self.session = None
if self.unload_timer:
self.unload_timer.cancel()
def reset_timer(self):
if self.unload_timer:
self.unload_timer.cancel()
self.unload_timer = Timer(self.timeout_seconds, self.unload_model)
self.unload_timer.start()
def create_embeddings(self, texts: List[str]) -> List[List[float]]:
log_counter("onnx_create_embeddings_attempt", labels={"model_name": self.model_name})
start_time = time.time()
self.load_model()
try:
inputs = self.tokenizer(
texts,
return_tensors="np",
padding=True,
truncation=True,
max_length=512
)
input_ids = inputs["input_ids"].astype(np.int64)
attention_mask = inputs["attention_mask"].astype(np.int64)
ort_inputs = {
"input_ids": input_ids,
"attention_mask": attention_mask
}
ort_outputs = self.session.run(None, ort_inputs)
            last_hidden_state = ort_outputs[0]
            # Simple mean pooling over the sequence dimension (padding tokens included)
            embeddings = np.mean(last_hidden_state, axis=1)
embedding_time = time.time() - start_time
log_histogram("onnx_create_embeddings_duration", embedding_time, labels={"model_name": self.model_name})
log_counter("onnx_create_embeddings_success", labels={"model_name": self.model_name})
return embeddings.tolist()
except Exception as e:
log_counter("onnx_create_embeddings_failure", labels={"model_name": self.model_name})
logging.error(f"Error creating embeddings with ONNX model: {str(e)}")
raise
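# Example usage of ONNXEmbedder (a minimal sketch; assumes an ONNX export already
# exists at os.path.join(model_dir, f"{model_name}.onnx"), since this class loads
# but does not export models):
#
#   onnx_embedder = ONNXEmbedder("dunzhang/stella_en_400M_v5", model_dir)
#   vectors = onnx_embedder.create_embeddings(["some text"])
#   # vectors is a list of float lists; the session unloads after the idle timeout.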
class RateLimiter:
def __init__(self, max_calls, period):
self.max_calls = max_calls
self.period = period
self.calls = []
self.lock = Lock()
    def __call__(self, func):
        @wraps(func)  # preserve the wrapped function's name/docstring for stacked decorators
        def wrapper(*args, **kwargs):
with self.lock:
now = time.time()
self.calls = [call for call in self.calls if call > now - self.period]
if len(self.calls) >= self.max_calls:
sleep_time = self.calls[0] - (now - self.period)
time.sleep(sleep_time)
self.calls.append(time.time())
return func(*args, **kwargs)
return wrapper
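# Example usage of RateLimiter (illustrative; embed_remote is a hypothetical function):
#
#   @RateLimiter(max_calls=10, period=1.0)
#   def embed_remote(text):
#       ...
#
# At most max_calls invocations are allowed per rolling window of `period` seconds;
# callers over the limit sleep until the oldest call ages out of the window.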
def exponential_backoff(max_retries=5, base_delay=1):
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
for attempt in range(max_retries):
try:
return func(*args, **kwargs)
except Exception as e:
if attempt == max_retries - 1:
raise
delay = base_delay * (2 ** attempt)
logging.warning(f"Attempt {attempt + 1} failed. Retrying in {delay} seconds. Error: {str(e)}")
time.sleep(delay)
return wrapper
return decorator
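# Example usage of exponential_backoff (illustrative; flaky_call is hypothetical):
#
#   @exponential_backoff(max_retries=3, base_delay=2)
#   def flaky_call():
#       ...
#
# Failed attempts wait base_delay * (2 ** attempt) seconds (here 2s, then 4s)
# before retrying; the exception from the final attempt propagates to the caller.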
@exponential_backoff()
@RateLimiter(max_calls=50, period=60)
def create_embeddings_batch(texts: List[str],
provider: str,
model: str,
api_url: str,
timeout_seconds: int = 300
) -> List[List[float]]:
global embedding_models
log_counter("create_embeddings_batch_attempt", labels={"provider": provider, "model": model})
start_time = time.time()
try:
        if provider.lower() == 'huggingface':
            if model not in embedding_models:
                if model == "dunzhang/stella_en_400M_v5":
                    embedding_models[model] = ONNXEmbedder(model, model_dir, timeout_seconds)
                else:
                    # Pass model_dir to HuggingFaceEmbedder so downloads land in the configured directory
                    embedding_models[model] = HuggingFaceEmbedder(model, model_dir, timeout_seconds)
            embedder = embedding_models[model]
            embeddings = embedder.create_embeddings(texts)
            # Record duration/success only after the embeddings have actually been created
            embedding_time = time.time() - start_time
            log_histogram("create_embeddings_batch_duration", embedding_time,
                          labels={"provider": provider, "model": model})
            log_counter("create_embeddings_batch_success", labels={"provider": provider, "model": model})
            return embeddings
        elif provider.lower() == 'openai':
            logging.debug(f"Creating embeddings for {len(texts)} texts using OpenAI API")
            embeddings = [create_openai_embedding(text, model) for text in texts]
            embedding_time = time.time() - start_time
            log_histogram("create_embeddings_batch_duration", embedding_time,
                          labels={"provider": provider, "model": model})
            log_counter("create_embeddings_batch_success", labels={"provider": provider, "model": model})
            return embeddings
elif provider.lower() == 'local':
            response = requests.post(
                api_url,
                json={"texts": texts, "model": model},
                headers={"Authorization": f"Bearer {embedding_api_key}"},
                timeout=timeout_seconds  # honor the caller-supplied timeout for the local API as well
            )
if response.status_code == 200:
embedding_time = time.time() - start_time
log_histogram("create_embeddings_batch_duration", embedding_time,
labels={"provider": provider, "model": model})
log_counter("create_embeddings_batch_success", labels={"provider": provider, "model": model})
return response.json()['embeddings']
else:
raise Exception(f"Error from local API: {response.text}")
else:
raise ValueError(f"Unsupported embedding provider: {provider}")
except Exception as e:
log_counter("create_embeddings_batch_error", labels={"provider": provider, "model": model, "error": str(e)})
logging.error(f"Error in create_embeddings_batch: {str(e)}")
raise
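# Example usage of create_embeddings_batch (a minimal sketch; the model name is
# illustrative, substitute whatever your config specifies):
#
#   vectors = create_embeddings_batch(
#       ["chunk one", "chunk two"],
#       provider="huggingface",
#       model="jinaai/jina-embeddings-v3",
#       api_url=embedding_api_url,  # only consulted by the 'local' provider
#   )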
def create_embedding(text: str, provider: str, model: str, api_url: str) -> List[float]:
log_counter("create_embedding_attempt", labels={"provider": provider, "model": model})
start_time = time.time()
embedding = create_embeddings_batch([text], provider, model, api_url)[0]
if isinstance(embedding, np.ndarray):
embedding = embedding.tolist()
embedding_time = time.time() - start_time
log_histogram("create_embedding_duration", embedding_time, labels={"provider": provider, "model": model})
log_counter("create_embedding_success", labels={"provider": provider, "model": model})
return embedding
def create_openai_embedding(text: str, model: str) -> List[float]:
log_counter("create_openai_embedding_attempt", labels={"model": model})
start_time = time.time()
embedding = get_openai_embeddings(text, model)
embedding_time = time.time() - start_time
log_histogram("create_openai_embedding_duration", embedding_time, labels={"model": model})
log_counter("create_openai_embedding_success", labels={"model": model})
return embedding
# End of File.
#######################################################################################################################