import datetime import os import uuid import fitz import streamlit as st from langchain_text_splitters import RecursiveCharacterTextSplitter from llama_cpp import Llama from datastore import ChromaStore from embeddings import Embedding #### state if "chat_history" not in st.session_state: st.session_state.chat_history = [] if "document_submitted" not in st.session_state: st.session_state.document_submitted = False def phi3(input: str, relevant_chunks: list): llm = Llama( model_path=os.path.join( os.getcwd(), "models", "Phi-3.1-mini-4k-instruct-Q4_K_M.gguf", ), n_ctx=2000, n_threads=1, # The number of CPU threads to use, n_gpu_layers=0, # The number of layers to offload to GPU, ) prompt = f"""CONTENT: {relevant_chunks}\n\nQUESTION: {input}\n\nFrom the given CONTENT, Please answer the QUESTION.""" output = llm( f"<|user|>\n{prompt}<|end|>\n<|assistant|>", max_tokens=2000, stop=["<|end|>"], echo=True, ) cleaned_output = output["choices"][0]["text"].split("<|assistant|>", 1)[-1].strip() return cleaned_output def generate_unique_id(): unique_id = uuid.uuid4() current_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f") combined_id = f"{unique_id}-{current_time}" return combined_id def add_to_vectorstore(content: str, chunk_size: int = 500, chunk_overlap: int = 20): chromastore = ChromaStore(collection_name="pdf_store") # delete if already exist if "pdf_store" in chromastore.list_collections(): chromastore.delete("pdf_store") st.toast("Old database cleaned!") collection = chromastore.create() # chunkify content text_splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len, is_separator_regex=False, ) chunks = text_splitter.split_text(content) # generate embeddings and ids embeddings, ids = [], [] for i, chunk in enumerate(chunks): embeddings.append(Embedding.encode_text(chunk).tolist()) ids.append(generate_unique_id()) # add to vectorstore chromastore.add( collection=collection, embeddings=embeddings, documents=chunks, ids=ids, ) def similarity_search(query: str): chromastore = ChromaStore(collection_name="pdf_store") collection = chromastore.create() query_embedding = Embedding.encode_text(query).tolist() return chromastore.query(collection=collection, query_embedding=query_embedding) def main(): st.set_page_config(page_icon="🤖", page_title="Phi 3 RAG", layout="wide") st.markdown( """