TheDavidYoungblood committed on
Commit 927f45c
1 Parent(s): 51d559c

Add application file and requirements

Files changed (2)
  1. app.py +83 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,83 @@
+ import gradio as gr
+ from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
+ import fitz  # PyMuPDF
+ from datasets import load_dataset
+ from llama_index.core import Document, VectorStoreIndex, StorageContext, load_index_from_storage, Settings
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ from llama_index.llms.ollama import Ollama
+
+ # Load the RAG model components (facebook/rag-sequence-nq)
+ tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
+ retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", index_name="custom", passages_path="my_knowledge_base.faiss")  # a custom index also expects index_path for the FAISS file
+ model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)
+
+ # Load the embedding model
+ embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
+
+ # Create an LLM object using the deployed Llama 3 Ollama instance
+ llm = Ollama(model="llama3:instruct", request_timeout=60.0)
+
+ # Set global settings for the LLM, chunk size, and embedding model
+ Settings.llm = llm
+ Settings.chunk_size = 512
+ Settings.embed_model = embed_model
+
+ # Function to extract text from PDFs
+ def extract_text_from_pdf(pdf_files):
+     texts = []
+     for pdf in pdf_files:
+         doc = fitz.open(pdf.name)
+         text = ""
+         for page in doc:
+             text += page.get_text()
+         texts.append(text)
+     return texts
+
+ # Function to provide answers based on questions and PDFs
+ def rag_answer(question, pdf_files):
+     texts = extract_text_from_pdf(pdf_files)
+     context = " ".join(texts)  # PDF text is folded into the encoder input below; the retriever adds its own passages
+     inputs = tokenizer(question + " " + context, return_tensors="pt", truncation=True, max_length=512)
+     outputs = model.generate(input_ids=inputs["input_ids"])
+     return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
+
+ # Function to create the Vector Store Index from documents
+ def create_vector_store_index(documents):
+     index = VectorStoreIndex.from_documents(documents)
+     index.storage_context.persist(persist_dir="pdf_docs")
+     return index
+
+ # Load dataset and convert to Document format
+ pdf_docs = load_dataset('your-dataset-name', split='train')  # Replace with your actual dataset name
+ documents = [Document(text=row['text'], metadata={'title': row['title']}) for row in pdf_docs]  # dataset rows are dicts, so iterate directly
+
+ # Create or load the vector store index
+ try:
+     storage_context = StorageContext.from_defaults(persist_dir="pdf_docs")
+     vector_index = load_index_from_storage(storage_context)
+ except Exception:
+     vector_index = create_vector_store_index(documents)
+
+ # Define the query engine powered by the Vector Store
+ query_engine = vector_index.as_query_engine(similarity_top_k=10)
+
+ # Functions for Gradio UI
+ def query(text):
+     z = query_engine.query(text)
+     return z
+
+ def interface(text):
+     z = query(text)
+     response = z.response
+     return response
+
+ # Gradio interface
+ with gr.Blocks(theme=gr.themes.Glass().set(block_title_text_color="black", body_background_fill="black", input_background_fill="black", body_text_color="white")) as demo:
+     gr.Markdown("# Information Custodian Chat Agent")
+     with gr.Row():
+         output_text = gr.Textbox(lines=20)
+     with gr.Row():
+         input_text = gr.Textbox(label='Enter your query here')
+     input_text.submit(fn=interface, inputs=input_text, outputs=output_text)
+
+ demo.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ transformers
+ gradio
+ faiss-cpu
+ datasets
+ PyMuPDF
+ llama-index-embeddings-instructor
+ llama-index-embeddings-huggingface
+ llama-index-llms-ollama
+ llama-index
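
Usage note: once app.py has run and persisted its index under the pdf_docs directory, that index can presumably be reloaded and queried from a separate script using the same llama_index calls the app makes. The sketch below assumes a local Ollama server with the llama3:instruct model pulled; the example question is purely illustrative and is not part of this commit.

from llama_index.core import StorageContext, load_index_from_storage, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

# Mirror the global settings from app.py so the reloaded index uses the same models.
Settings.llm = Ollama(model="llama3:instruct", request_timeout=60.0)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Reload the vector store index that app.py persisted under pdf_docs.
storage_context = StorageContext.from_defaults(persist_dir="pdf_docs")
vector_index = load_index_from_storage(storage_context)

# Query it the same way the Gradio handler does.
query_engine = vector_index.as_query_engine(similarity_top_k=10)
print(query_engine.query("What topics do the indexed PDFs cover?").response)  # hypothetical example question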