Spaces:
Running
Running
# Embeddings_Create.py | |
# Description: Functions for Creating and managing Embeddings in ChromaDB with LLama.cpp/OpenAI/Transformers | |
# | |
# Imports: | |
import logging | |
from typing import List, Dict, Any | |
import numpy as np | |
# | |
# 3rd-Party Imports: | |
import requests | |
from transformers import AutoTokenizer, AutoModel | |
import torch | |
# | |
# Local Imports: | |
from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings | |
from App_Function_Libraries.Summarization_General_Lib import summarize | |
from App_Function_Libraries.Utils.Utils import load_comprehensive_config | |
from App_Function_Libraries.Chunk_Lib import chunk_options, improved_chunking_process#, determine_chunk_position | |
# | |
# | |
####################################################################################################################### | |
# | |
# Functions: | |
# FIXME - Add all globals to summarize.py | |
# Module-level settings.
# The comprehensive config is loaded once when this module is imported.
loaded_config = load_comprehensive_config()

# Default embedding backend settings (hard-coded here, not read from loaded_config).
embedding_provider: str = 'openai'
embedding_model: str = 'text-embedding-3-large'
embedding_api_url: str = 'https://127.0.0.1'
# NOTE(review): looks like a placeholder value rather than a real credential - confirm.
embedding_api_key: str = 'busted_api_key_lol'

# Embedding Chunking Settings (kept as strings, not integers).
chunk_size: str = '500'
overlap: str = '200'
# FIXME - Add logging | |
# FIXME - refactor/setup to use config file & perform chunking | |
def create_embedding(text: str, provider: str, model: str, api_url: str = None, api_key: str = None) -> List[float]:
    """Create an embedding for ``text`` using the selected provider.

    Args:
        text: Text to embed.
        provider: One of ``'openai'``, ``'local'``, ``'huggingface'`` or ``'llamacpp'``.
        model: Model name; forwarded to every backend except ``'llamacpp'``.
        api_url: Endpoint URL; required by the ``'local'`` and ``'llamacpp'`` providers.
        api_key: API key; only forwarded to the ``'local'`` provider.

    Returns:
        The backend's result. NumPy arrays and torch tensors are converted to
        plain Python lists; any other value is returned unchanged.

    Raises:
        ValueError: If ``provider`` is not supported, or if an HTTP-backed
            provider (``'local'``/``'llamacpp'``) is selected without ``api_url``.
        Exception: Any error raised by a backend is logged and re-raised as-is.
    """
    try:
        # Fail fast with an actionable message instead of letting the HTTP
        # client reject a missing URL with an unrelated-looking error.
        if provider in ('local', 'llamacpp') and not api_url:
            raise ValueError(f"api_url is required for embedding provider: {provider}")

        if provider == 'openai':
            embedding = get_openai_embeddings(text, model)
        elif provider == 'local':
            embedding = create_local_embedding(text, model, api_url, api_key)
        elif provider == 'huggingface':
            embedding = create_huggingface_embedding(text, model)
        elif provider == 'llamacpp':
            embedding = create_llamacpp_embedding(text, api_url)
        else:
            raise ValueError(f"Unsupported embedding provider: {provider}")

        # Normalise array-like results so callers always get a plain list.
        if isinstance(embedding, np.ndarray):
            embedding = embedding.tolist()
        elif isinstance(embedding, torch.Tensor):
            embedding = embedding.detach().cpu().numpy().tolist()

        return embedding
    except Exception as e:
        # Log for visibility, then let the caller decide how to recover.
        logging.error("Error creating embedding: %s", e)
        raise
# Loaded (tokenizer, model) pairs keyed by model name, so repeated calls do not
# reload the weights every time. Entries live for the lifetime of the process.
_HF_EMBEDDING_CACHE: Dict[str, Any] = {}


def create_huggingface_embedding(text: str, model: str) -> List[float]:
    """Embed ``text`` with a Hugging Face Transformers model.

    The tokenizer and model named by ``model`` are loaded with
    ``AutoTokenizer``/``AutoModel`` on first use and reused on later calls.
    The embedding is the mean over the sequence dimension of the model's
    ``last_hidden_state``.

    Args:
        text: Text to embed; tokenised with truncation at 512 tokens.
        model: Model name or path accepted by ``from_pretrained``.

    Returns:
        The first row of the pooled output as a plain list of floats.
    """
    cached = _HF_EMBEDDING_CACHE.get(model)
    if cached is None:
        tokenizer = AutoTokenizer.from_pretrained(model)
        # Use a separate name so the ``model`` argument (the cache key) is not
        # overwritten by the loaded model object.
        hf_model = AutoModel.from_pretrained(model)
        cached = (tokenizer, hf_model)
        _HF_EMBEDDING_CACHE[model] = cached
    tokenizer, hf_model = cached

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = hf_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings[0].tolist()
# FIXME | |
def create_stella_embeddings(text: str) -> List[float]:
    """Embed ``text`` according to the module-level ``embedding_provider``.

    * ``'local'``: runs the ``dunzhang/stella_en_400M_v5`` checkpoint through
      ``create_huggingface_embedding`` (tokenise with truncation at 512 tokens,
      then take the mean of the last hidden state).
    * ``'openai'``: calls ``get_openai_embeddings`` with the module-level
      ``embedding_model``.

    Args:
        text: Text to embed.

    Returns:
        The embedding produced by the selected backend.

    Raises:
        ValueError: If ``embedding_provider`` is neither ``'local'`` nor ``'openai'``.
    """
    if embedding_provider == 'local':
        # Same load/tokenise/mean-pool pipeline as the generic Hugging Face
        # helper, so delegate instead of duplicating it here.
        # NOTE(review): this checkpoint may need extra ``from_pretrained``
        # options (e.g. ``trust_remote_code``) - confirm against its model card.
        return create_huggingface_embedding(text, "dunzhang/stella_en_400M_v5")
    if embedding_provider == 'openai':
        return get_openai_embeddings(text, embedding_model)
    raise ValueError(f"Unsupported embedding provider: {embedding_provider}")
def create_llamacpp_embedding(text: str, api_url: str, timeout: float = 120.0) -> List[float]:
    """Request an embedding for ``text`` from an HTTP endpoint at ``api_url``.

    Sends a POST with JSON body ``{"input": text}`` and returns the
    ``'embedding'`` field of the JSON response.

    Args:
        text: Text to embed.
        api_url: Full URL of the embedding endpoint.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive server.

    Returns:
        The value stored under ``'embedding'`` in the response JSON.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        KeyError: If the response JSON has no ``'embedding'`` key.
    """
    # NOTE(review): embedding servers differ in request/response schema;
    # confirm the target endpoint accepts {"input": ...} and returns a
    # top-level "embedding" field.
    response = requests.post(
        api_url,
        json={"input": text},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()['embedding']
def create_local_embedding(text: str, model: str, api_url: str, api_key: str, timeout: float = 120.0) -> List[float]:
    """Request an embedding for ``text`` from a local HTTP embedding service.

    Sends a POST with JSON body ``{"text": text, "model": model}`` and returns
    the ``'embedding'`` field of the JSON response.

    Args:
        text: Text to embed.
        model: Model name passed through to the service.
        api_url: Full URL of the embedding endpoint.
        api_key: Bearer token; when empty or ``None`` no ``Authorization``
            header is sent (instead of the literal ``"Bearer None"``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive server.

    Returns:
        The value stored under ``'embedding'`` in the response JSON, or
        ``None`` if the field is absent (a warning is logged in that case).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    response = requests.post(
        api_url,
        json={"text": text, "model": model},
        headers=headers,
        timeout=timeout,
    )
    response.raise_for_status()
    embedding = response.json().get('embedding', None)
    if embedding is None:
        # Keep returning None for backward compatibility, but make it visible.
        logging.warning("Embedding response from %s contained no 'embedding' field", api_url)
    return embedding
def chunk_for_embedding(text: str, file_name: str, api_name, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
    """Split ``text`` into chunks and prefix each chunk with a context header.

    Args:
        text: Full document text.
        file_name: Name recorded in each header and in each chunk's metadata.
        api_name: Passed to ``summarize`` to produce a whole-document summary;
            when ``None`` a fixed "not available" message is used instead.
        custom_chunk_options: Optional overrides applied on top of a copy of
            the module-level ``chunk_options``.

    Returns:
        The chunks returned by ``improved_chunking_process``, each updated in
        place: ``chunk['text']`` gains the header and
        ``chunk['metadata']['file_name']`` is set to ``file_name``.
    """
    options = chunk_options.copy()
    if custom_chunk_options:
        options.update(custom_chunk_options)

    # FIXME (carried over from the original code)
    if api_name is None:
        full_summary = "Full document summary not available."
    else:
        # Generate a summary of the full document.
        full_summary = summarize(text, None, api_name, None, None, None)

    chunks = improved_chunking_process(text, options)
    total_chunks = len(chunks)
    # DIRTY HACK: fixed placeholder; determine_chunk_position(chunk['metadata']['relative_position']) is disabled.
    chunk_position = 1

    chunked_text_with_headers = []
    for index, chunk in enumerate(chunks, start=1):
        header = (
            "\n"
            f"Original Document: {file_name}\n"
            f"Full Document Summary: {full_summary}\n"
            f"Chunk: {index} of {total_chunks}\n"
            f"Position: {chunk_position}\n"
            "--- Chunk Content ---\n"
        )
        chunk['text'] = header + chunk['text']
        chunk['metadata']['file_name'] = file_name
        chunked_text_with_headers.append(chunk)

    return chunked_text_with_headers
def create_openai_embedding(text: str, model: str) -> List[float]:
    """Return the result of ``get_openai_embeddings`` for ``text`` and ``model``.

    Thin pass-through wrapper; both arguments are forwarded unchanged.
    """
    return get_openai_embeddings(text, model)
# | |
# End of File. | |
####################################################################################################################### | |