import os
import time

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter

from src.helper import download_hugging_face_embeddings
# Extract all text from a single PDF file
def load_pdf(file_path):
    all_text = ""
    with open(file_path, "rb") as file:
        reader = PdfReader(file)
        for page in reader.pages:
            # extract_text() may return None for image-only pages
            page_text = page.extract_text() or ""
            all_text += page_text + "\n"
    return all_text if all_text.strip() else None
# Split text into fixed-size chunks for embedding
def text_split(text):
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return text_splitter.split_text(text)
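# Quick behavior sketch (hypothetical sample text, not from the PDF):
# CharacterTextSplitter splits on its default "\n\n" separator and then
# merges pieces up to chunk_size, so a single long paragraph can still
# produce a chunk larger than 1000 characters.
_sample_chunks = text_split("First paragraph.\n\nSecond paragraph.")
print(f"Sample split produced {len(_sample_chunks)} chunk(s).")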
# Load environment variables from a .env file if present
load_dotenv()
# Load and process data
pdf_file_path = "data/Okelloetal.2008TourismanalysisManka.pdf"  # Update this path to your PDF file
extracted_data = load_pdf(pdf_file_path)
if extracted_data is None:
    raise ValueError("No text could be extracted from the PDF. Check the load_pdf function and the file path.")
print(f"Extracted {len(extracted_data)} characters of text.")
# Split the extracted text into chunks
text_chunks = text_split(extracted_data)
if not text_chunks:
    raise ValueError("text_split returned no chunks. Check the text_split function.")
print(f"Split text into {len(text_chunks)} chunks.")
# Download the embedding model
embeddings = download_hugging_face_embeddings()
if embeddings is None:
    raise ValueError("download_hugging_face_embeddings returned None.")
print(f"Loaded embedding model: {embeddings}")
# Ensure the Pinecone API key is available
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise ValueError("PINECONE_API_KEY environment variable not set.")

# Initialize the Pinecone client
pc = Pinecone(api_key=api_key)
# Specify cloud and region for the serverless index
cloud = os.environ.get("PINECONE_CLOUD") or "aws"
region = os.environ.get("PINECONE_REGION") or "us-east-1"
spec = ServerlessSpec(cloud=cloud, region=region)

# Define the index name
index_name = "healthbot"
# Create the index if it does not already exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the embedding model's output dimension
        metric="cosine",
        spec=spec,
    )
    # Wait for the newly created index to be ready
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

# Connect to the index
index = pc.Index(index_name)
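# Optional connectivity check: describe_index_stats() reports vector counts
# per namespace, so it doubles as a quick confirmation that the index exists
print(index.describe_index_stats())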
# Simulate external storage of text chunks so that only a small
# reference ID has to be kept in Pinecone metadata
text_chunk_store = {}

# Store a text chunk and return its reference ID
def store_text_chunk(text_chunk):
    chunk_id = f"chunk_{len(text_chunk_store)}"
    text_chunk_store[chunk_id] = text_chunk
    return chunk_id
# Add text chunks to Pinecone, storing only the reference ID as metadata
for i, text_chunk in enumerate(text_chunks):
    chunk_id = store_text_chunk(text_chunk)
    embedding = embeddings.embed_query(text_chunk)  # embed the chunk text
    index.upsert(
        vectors=[
            {
                "id": f"vec_{i}",
                "values": embedding,
                "metadata": {"chunk_id": chunk_id},  # reference ID only
            }
        ],
        namespace="ns1",
    )
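# Note: one upsert call per chunk means one network round-trip per chunk.
# The same upsert() call accepts a list of many vectors, so batching
# (e.g. ~100 vectors per call) is the usual pattern for larger documents.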
print("Indexing completed successfully.") |