"""Create text embeddings via OpenAI, a local embedding server, Hugging Face
models, or a llama.cpp server, and prepare chunked text for embedding."""

import logging
from typing import Any, Dict, List, Optional

import numpy as np
import requests
import torch
from transformers import AutoModel, AutoTokenizer

from App_Function_Libraries.Chunk_Lib import (
    chunk_options,
    determine_chunk_position,
    improved_chunking_process,
)
from App_Function_Libraries.LLM_API_Calls import get_openai_embeddings
from App_Function_Libraries.Summarization_General_Lib import summarize
from App_Function_Libraries.Utils.Utils import load_comprehensive_config
# Load embedding settings from the comprehensive config file.
loaded_config = load_comprehensive_config()

embedding_provider = loaded_config['Embeddings']['embedding_provider']
embedding_model = loaded_config['Embeddings']['embedding_model']
embedding_api_url = loaded_config['Embeddings']['embedding_api_url']
embedding_api_key = loaded_config['Embeddings']['embedding_api_key']

# Default chunking parameters for embedding preparation.
chunk_size = loaded_config['Embeddings']['chunk_size']
overlap = loaded_config['Embeddings']['overlap']
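# For reference, a minimal sketch of the expected [Embeddings] config section.
# Key names come from the lookups above; the values are illustrative only:
#
#   [Embeddings]
#   embedding_provider = openai
#   embedding_model = text-embedding-3-small
#   embedding_api_url = http://localhost:8080/embedding
#   embedding_api_key = your-api-key
#   chunk_size = 400
#   overlap = 200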
def create_embedding(text: str, provider: str, model: str, api_url: Optional[str] = None,
                     api_key: Optional[str] = None) -> List[float]:
    """Create an embedding for `text` with the given provider, normalizing the
    result to a plain list of floats."""
    try:
        if provider == 'openai':
            embedding = get_openai_embeddings(text, model)
        elif provider == 'local':
            embedding = create_local_embedding(text, model, api_url, api_key)
        elif provider == 'huggingface':
            embedding = create_huggingface_embedding(text, model)
        elif provider == 'llamacpp':
            embedding = create_llamacpp_embedding(text, api_url)
        else:
            raise ValueError(f"Unsupported embedding provider: {provider}")

        # Normalize NumPy arrays and PyTorch tensors to plain Python lists.
        if isinstance(embedding, np.ndarray):
            embedding = embedding.tolist()
        elif isinstance(embedding, torch.Tensor):
            embedding = embedding.detach().cpu().numpy().tolist()

        return embedding
    except Exception as e:
        logging.error(f"Error creating embedding: {str(e)}")
        raise
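# Example usage (a sketch; assumes the configured provider is reachable):
#
#   vector = create_embedding(
#       "Some text to embed",
#       embedding_provider,
#       embedding_model,
#       embedding_api_url,
#       embedding_api_key,
#   )
#   logging.info(f"Embedding dimensionality: {len(vector)}")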
def create_huggingface_embedding(text: str, model: str) -> List[float]:
    """Embed `text` with a Hugging Face model, mean-pooling the final hidden states."""
    tokenizer = AutoTokenizer.from_pretrained(model)
    hf_model = AutoModel.from_pretrained(model)  # avoid shadowing the `model` name string

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = hf_model(**inputs)

    # Mean-pool token embeddings into a single sentence vector.
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings[0].tolist()
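# Note: the mean over `last_hidden_state` ignores the attention mask, which is
# fine for a single unpadded sequence. A sketch of mask-aware mean pooling,
# useful if this is ever extended to batches of padded inputs:
#
#   mask = inputs["attention_mask"].unsqueeze(-1).float()
#   pooled = (outputs.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1)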
def create_stella_embeddings(text: str) -> List[float]:
    """Embed `text` with the Stella model locally, or fall back to OpenAI."""
    if embedding_provider == 'local':
        tokenizer = AutoTokenizer.from_pretrained("dunzhang/stella_en_400M_v5")
        model = AutoModel.from_pretrained("dunzhang/stella_en_400M_v5")

        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)

        # Mean-pool token embeddings into a single sentence vector.
        embeddings = outputs.last_hidden_state.mean(dim=1)
        return embeddings[0].tolist()
    elif embedding_provider == 'openai':
        return get_openai_embeddings(text, embedding_model)
    else:
        raise ValueError(f"Unsupported embedding provider: {embedding_provider}")
def create_llamacpp_embedding(text: str, api_url: str) -> List[float]:
    """Request an embedding from a llama.cpp server endpoint."""
    response = requests.post(
        api_url,
        json={"input": text}
    )
    response.raise_for_status()
    return response.json()['embedding']
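# Example (a sketch; the exact endpoint path and response shape depend on how
# the llama.cpp server is configured, so treat the URL as illustrative):
#
#   vec = create_llamacpp_embedding("hello world", "http://localhost:8080/embedding")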
def create_local_embedding(text: str, model: str, api_url: str, api_key: str) -> List[float]:
    """Request an embedding from a local embedding server."""
    response = requests.post(
        api_url,
        json={"text": text, "model": model},
        headers={"Authorization": f"Bearer {api_key}"}
    )
    response.raise_for_status()
    # Returns None if the response lacks an 'embedding' key; callers that need
    # a hard failure should check for it.
    return response.json().get('embedding', None)
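# Note: both HTTP helpers above call requests.post() without a timeout, which
# can hang indefinitely if the server stalls. A safer variant, assuming 30
# seconds is acceptable for the deployment:
#
#   response = requests.post(api_url, json=payload, timeout=30)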
def chunk_for_embedding(text: str, file_name: str, api_name: Optional[str],
                        custom_chunk_options: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
    """Split `text` into chunks and prefix each with a contextual header
    (source file, overall summary, and position) before embedding."""
    options = chunk_options.copy()
    if custom_chunk_options:
        options.update(custom_chunk_options)

    # Summarize the full document once so every chunk header can carry it.
    if api_name is not None:
        full_summary = summarize(text, None, api_name, None, None, None)
    else:
        full_summary = "Full document summary not available."

    chunks = improved_chunking_process(text, options)
    total_chunks = len(chunks)

    chunked_text_with_headers = []
    for i, chunk in enumerate(chunks, 1):
        chunk_text = chunk['text']
        chunk_position = determine_chunk_position(chunk['metadata']['relative_position'])

        chunk_header = f"""
Original Document: {file_name}
Full Document Summary: {full_summary}
Chunk: {i} of {total_chunks}
Position: {chunk_position}

--- Chunk Content ---
"""

        chunk['text'] = chunk_header + chunk_text
        chunk['metadata']['file_name'] = file_name
        chunked_text_with_headers.append(chunk)

    return chunked_text_with_headers
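# Example (a sketch; `document_text` is a placeholder, and passing None for
# api_name skips the full-document summary):
#
#   chunks = chunk_for_embedding(document_text, "report.txt", None)
#   # chunks[0]['text'] now begins with a header naming the source file,
#   # the chunk's position, and the (unavailable) document summary.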
def create_openai_embedding(text: str, model: str) -> List[float]:
    """Thin wrapper around get_openai_embeddings for API symmetry."""
    return get_openai_embeddings(text, model)
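# A minimal smoke test, assuming the configured provider is reachable:
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample = "The quick brown fox jumps over the lazy dog."
    vector = create_embedding(sample, embedding_provider, embedding_model,
                              embedding_api_url, embedding_api_key)
    logging.info(f"Got embedding of length {len(vector)}")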