Spaces:
Sleeping
Sleeping
added application
Browse files- .gitignore +1 -0
- Dockerfile +14 -0
- __pycache__/RAG.cpython-310.pyc +0 -0
- __pycache__/app.cpython-310.pyc +0 -0
- __pycache__/app.cpython-39.pyc +0 -0
- __pycache__/file_processing.cpython-310.pyc +0 -0
- __pycache__/file_processing.cpython-39.pyc +0 -0
- __pycache__/main.cpython-310.pyc +0 -0
- __pycache__/query_processing.cpython-310.pyc +0 -0
- __pycache__/query_processing.cpython-39.pyc +0 -0
- app.py +54 -0
- file_processing.py +105 -0
- main.py +90 -0
- query_processing.py +29 -0
- requirements.txt +42 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.vercel
|
Dockerfile
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Use the official Python 3.10.9 image
FROM python:3.10.9

# Work out of /app instead of the image root (the original COPY'd into /
# before setting WORKDIR, spraying the app across the filesystem root).
WORKDIR /app

# Install dependencies first, in their own layer, so code-only changes
# do not invalidate the pip-install cache.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the application code into the working directory.
COPY . .

# Start the FastAPI app on port 7860, the default port expected by Spaces
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
__pycache__/RAG.cpython-310.pyc
ADDED
Binary file (3.26 kB). View file
|
|
__pycache__/app.cpython-310.pyc
ADDED
Binary file (2.14 kB). View file
|
|
__pycache__/app.cpython-39.pyc
ADDED
Binary file (1.15 kB). View file
|
|
__pycache__/file_processing.cpython-310.pyc
ADDED
Binary file (3.34 kB). View file
|
|
__pycache__/file_processing.cpython-39.pyc
ADDED
Binary file (1.69 kB). View file
|
|
__pycache__/main.cpython-310.pyc
ADDED
Binary file (1.47 kB). View file
|
|
__pycache__/query_processing.cpython-310.pyc
ADDED
Binary file (1.18 kB). View file
|
|
__pycache__/query_processing.cpython-39.pyc
ADDED
Binary file (995 Bytes). View file
|
|
app.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.middleware.cors import CORSMiddleware
from file_processing import load_documents, chunk_documents, create_embeddings
from query_processing import load_qa_chain, process_query
from dotenv import load_dotenv
import os

# Load environment variables from a .env file found relative to the current
# working directory. The previous hard-coded absolute Windows path does not
# exist inside the Docker image (load_dotenv on a missing path is a no-op).
load_dotenv()

openai_api_key = os.environ.get('OPENAI_API_KEY')
# SECURITY: never print the API key itself; only report whether it is set.
if not openai_api_key:
    print("WARNING: OPENAI_API_KEY is not set")

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3000"],  # Allows only requests from your React app
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)
|
22 |
+
|
23 |
+
@app.post("/process-file")
async def process_file(collection_name: str = Form(...), file: UploadFile = File(...)):
    """Ingest an uploaded file: load it, chunk it, embed it, return previews."""
    print("Received collection_name:", collection_name)
    print("Received file:", file.filename)

    # Read the upload into LangChain Document objects.
    docs = await load_documents(file)

    # Split the documents into overlapping chunks for embedding.
    chunks = chunk_documents(docs, chunk_size=500, chunk_overlap=100)

    # Embed the chunks and store them in the named Chroma collection.
    create_embeddings(chunks, collection_name)

    # Build short previews from the original (un-chunked) documents.
    preview_length = 750  # Adjust based on desired preview size
    previews = [doc.page_content[:preview_length] for doc in docs]

    # Return the success message along with the document previews.
    return {"message": "File processed successfully", "document_preview": previews}
|
40 |
+
from pydantic import BaseModel


class QueryRequest(BaseModel):
    """Payload for /query: target collection plus the user's question."""
    collection_name: str
    query: str


@app.post("/query")
async def query(request: QueryRequest):
    """Answer a question against a previously ingested collection."""
    # Log the incoming payload for debugging.
    print(request.dict())

    # Build the RetrievalQA chain for the requested collection.
    qa_chain = load_qa_chain(request.collection_name)

    # Run the question through the chain and return its answer.
    result = process_query(request.query, qa_chain)
    return {"result": result}
|
file_processing.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.document_loaders import PyPDFLoader, PDFMinerLoader, DirectoryLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from os.path import join
import os
from dotenv import load_dotenv

# Search for a .env file relative to the working directory; the previous
# hard-coded absolute Windows path made this module machine-specific and is
# a silent no-op inside the Docker image.
load_dotenv()
openai_api_key = os.environ.get('OPENAI_API_KEY')
from langchain.document_loaders import TextLoader, PDFMinerLoader, UnstructuredWordDocumentLoader, CSVLoader
|
11 |
+
|
12 |
+
# def load_documents(file_path):
|
13 |
+
# if file_path.endswith('.txt'):
|
14 |
+
# loader = TextLoader(file_path)
|
15 |
+
# elif file_path.endswith('.pdf'):
|
16 |
+
# loader = PyPDFLoader(file_path)
|
17 |
+
# elif file_path.endswith('.doc') or file_path.endswith('.docx'):
|
18 |
+
# loader = UnstructuredWordDocumentLoader(file_path)
|
19 |
+
# elif file_path.endswith('.csv'):
|
20 |
+
# loader = CSVLoader(file_path)
|
21 |
+
# else:
|
22 |
+
# raise ValueError(f"Unsupported file format: {file_path}")
|
23 |
+
|
24 |
+
# documents = loader.load()
|
25 |
+
# return documents
|
26 |
+
from fastapi import UploadFile
|
27 |
+
from typing import List
|
28 |
+
import fitz # PyMuPDF
|
29 |
+
import pandas as pd
|
30 |
+
import docx
|
31 |
+
from langchain.docstore.document import Document
|
32 |
+
def read_pdf(file_path: str) -> str:
    """Extract and concatenate the plain text of every page of a PDF.

    Uses PyMuPDF (fitz). The document handle is always closed — the original
    leaked the open file handle on every call.
    """
    doc = fitz.open(file_path)
    try:
        # Pages iterate in order; join their text at C speed.
        return "".join(page.get_text() for page in doc)
    finally:
        doc.close()  # Release the file handle even if extraction fails.
|
38 |
+
|
39 |
+
def read_docx(file_path: str) -> str:
    """Return the text of a .docx file, one paragraph per line."""
    document = docx.Document(file_path)
    # Collect paragraph text lazily and join with newlines.
    return '\n'.join(para.text for para in document.paragraphs)
|
45 |
+
|
46 |
+
def read_csv(file_path: str) -> str:
    """Render an entire CSV file as a plain-text table."""
    frame = pd.read_csv(file_path)
    return frame.to_string()
|
49 |
+
|
50 |
+
def read_txt(file_path: str) -> str:
    """Read a UTF-8 text file and return its full contents."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()
|
53 |
+
|
54 |
+
async def load_documents(file: UploadFile) -> List[Document]:
    """Save an upload to a temp file, extract its text, and wrap it in a Document.

    On any extraction error the document body is the sentinel string
    "Error processing document." — errors are deliberately swallowed so the
    endpoint still responds (preserved from the original behavior).
    """
    # Dispatch table: lower-cased extension -> reader function.
    readers = {
        '.pdf': read_pdf,
        '.docx': read_docx,
        '.csv': read_csv,
        '.txt': read_txt,
    }
    temp_file_path = f"temp_{file.filename}"
    try:
        # Save the uploaded file to a temporary file on disk.
        with open(temp_file_path, "wb") as temp_file:
            temp_file.write(await file.read())

        # Match the extension case-insensitively; the original rejected
        # uploads like "report.PDF" because it compared exact case.
        ext = os.path.splitext(file.filename)[1].lower()
        reader = readers.get(ext)
        if reader is None:
            raise ValueError("Unsupported file format")
        content = reader(temp_file_path)
    except Exception as e:
        # Best-effort: log and fall back to the sentinel body.
        print(f"Error processing document: {e}")
        content = "Error processing document."
    finally:
        # Cleanup: remove the temporary file.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)

    metadata = {'source': file.filename}
    return [Document(page_content=content, metadata=metadata)]
|
84 |
+
|
85 |
+
|
86 |
+
from langchain.text_splitter import CharacterTextSplitter
|
87 |
+
|
88 |
+
def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split Documents into overlapping character chunks for embedding."""
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
|
95 |
+
|
96 |
+
|
97 |
+
from langchain.embeddings import OpenAIEmbeddings
|
98 |
+
from langchain.vectorstores import Chroma
|
99 |
+
|
100 |
+
def create_embeddings(chunked_docs, collection_name):
    """Embed chunks with OpenAI and store them in a named Chroma collection."""
    store = Chroma.from_documents(
        chunked_docs,
        OpenAIEmbeddings(openai_api_key=openai_api_key),
        collection_name=collection_name,
    )
    store.persist()
    return store
|
main.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# from dotenv import load_dotenv
|
2 |
+
# from typing import Any
|
3 |
+
# from fastapi import FastAPI, HTTPException
|
4 |
+
# from fastapi.middleware.cors import CORSMiddleware
|
5 |
+
# from pydantic import BaseModel
|
6 |
+
# import RAG
|
7 |
+
# # Load environment variables from .env file (if any)
|
8 |
+
# load_dotenv()
|
9 |
+
|
10 |
+
|
11 |
+
# class Response(BaseModel):
|
12 |
+
# result: str | None
|
13 |
+
|
14 |
+
# class UserQuery(BaseModel):
|
15 |
+
# messages: str
|
16 |
+
|
17 |
+
# origins = [
|
18 |
+
# "http://localhost",
|
19 |
+
# "http://localhost:8080",
|
20 |
+
# "http://localhost:3000"
|
21 |
+
# ]
|
22 |
+
|
23 |
+
# app = FastAPI()
|
24 |
+
# app.add_middleware(
|
25 |
+
# CORSMiddleware,
|
26 |
+
# allow_origins=origins,
|
27 |
+
# allow_credentials=True,
|
28 |
+
# allow_methods=["*"],
|
29 |
+
# allow_headers=["*"],
|
30 |
+
# )
|
31 |
+
|
32 |
+
# initialize_model()
|
33 |
+
# # @app.post("/predict", response_model = Response)
|
34 |
+
# # def predict() -> Any:
|
35 |
+
|
36 |
+
# # #implement this code block
|
37 |
+
|
38 |
+
# # return {"result": "hello world!"}
|
39 |
+
# # @app.get("/hello")
|
40 |
+
# # async def hello():
|
41 |
+
# # return 'Hello World'
|
42 |
+
# @app.post("/home")
|
43 |
+
# def home_route(home: UserQuery):
|
44 |
+
# try:
|
45 |
+
# if not home.messages:
|
46 |
+
# raise HTTPException(status_code=400, detail="Empty value")
|
47 |
+
|
48 |
+
# # Call the custom function to generate a response using RetrievalQA
|
49 |
+
# answer, generation = generate_response(home.messages)
|
50 |
+
|
51 |
+
# return {"response": answer, "reasoning": generation}
|
52 |
+
# except Exception as e:
|
53 |
+
# print(f"An error occurred: {e}")
|
54 |
+
# raise HTTPException(status_code=500, detail="Internal Server Error")
|
55 |
+
|
56 |
+
|
57 |
+
from file_processing import load_documents, chunk_documents, create_embeddings
|
58 |
+
from query_processing import load_qa_chain, process_query
|
59 |
+
from dotenv import load_dotenv
|
60 |
+
import os
|
61 |
+
|
62 |
+
def main():
    """CLI smoke test: ingest a local file and answer queries interactively."""
    import asyncio

    load_dotenv()
    file_path = r'C:\Users\sksha\Desktop\llm-assignment-master\llm-assignment-master\backend\files\Option for Residence Accommodation.pdf'
    collection_name = 'my_collection'

    # BUG FIX: load_documents() is an *async* function that expects an
    # UploadFile-like object (``.filename`` plus an awaitable ``.read()``),
    # not a bare path string. The original called it synchronously with a
    # path, so chunk_documents received an un-awaited coroutine and crashed.
    class _LocalUpload:
        """Minimal UploadFile stand-in for a file on the local disk."""

        def __init__(self, path):
            self.filename = os.path.basename(path)
            self._path = path

        async def read(self):
            with open(self._path, 'rb') as fh:
                return fh.read()

    # Load documents
    documents = asyncio.run(load_documents(_LocalUpload(file_path)))

    # Chunk documents
    chunked_docs = chunk_documents(documents, chunk_size=500, chunk_overlap=100)

    # Create embeddings and store in Chroma
    vector_store = create_embeddings(chunked_docs, collection_name)

    # Load the RetrievalQA chain
    qa_chain = load_qa_chain(collection_name)

    # Process user queries until the user types 'exit'.
    while True:
        query = input("Enter your query (or 'exit' to quit): ")
        if query.lower() == 'exit':
            break
        print(process_query(query, qa_chain))


if __name__ == '__main__':
    main()
|
query_processing.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
import os
from dotenv import load_dotenv

# Look for a .env file relative to the working directory; the previous
# hard-coded absolute Windows path made this module machine-specific and is
# silently skipped anywhere else.
load_dotenv()
openai_api_key = os.environ.get('OPENAI_API_KEY')
|
9 |
+
def load_qa_chain(collection_name):
    """Build a RetrievalQA chain over an existing Chroma collection.

    Returns a map_reduce chain that answers queries using the top-2 most
    similar chunks from the collection.
    """
    # Load the vector store from disk; pass the API key explicitly so the
    # embeddings work without relying on the ambient environment variable.
    vector_store = Chroma(
        collection_name=collection_name,
        embedding_function=OpenAIEmbeddings(openai_api_key=openai_api_key),
    )

    # Create an instance of the OpenAI language model.
    llm = OpenAI(openai_api_key=openai_api_key)

    # Retrieve the 2 closest chunks per query. The original constructed this
    # retriever and then ignored it, handing the chain a fresh default
    # retriever instead — dead code and an unintended k value.
    retriever = vector_store.as_retriever(search_kwargs={"k": 2})

    # Assemble the RetrievalQA chain around the k=2 retriever.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="map_reduce",
        retriever=retriever,
    )
    return qa_chain
|
24 |
+
|
25 |
+
def process_query(query, qa_chain):
    """Run a single query through the given RetrievalQA chain."""
    # Delegate directly to the chain; .run returns the answer string.
    return qa_chain.run(query)
|
requirements.txt
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# annotated-types==0.6.0
|
2 |
+
# anyio==4.3.0
|
3 |
+
# click==8.1.7
|
4 |
+
# colorama==0.4.6
|
5 |
+
# exceptiongroup==1.2.0
|
6 |
+
# fastapi==0.110.0
|
7 |
+
# h11==0.14.0
|
8 |
+
# httptools==0.6.1
|
9 |
+
# idna==3.6
|
10 |
+
# pydantic==2.6.3
|
11 |
+
# pydantic_core==2.16.3
|
12 |
+
# pyspark==3.3.1
|
13 |
+
# python-dotenv==1.0.1
|
14 |
+
# PyYAML==6.0.1
|
15 |
+
# sniffio==1.3.1
|
16 |
+
# starlette==0.36.3
|
17 |
+
# typing_extensions==4.10.0
|
18 |
+
# uvicorn==0.28.0
|
19 |
+
# watchfiles==0.21.0
|
20 |
+
# websockets==12.0
|
21 |
+
# langchain
|
22 |
+
# sentence-transformers
|
23 |
+
# chromadb
|
24 |
+
# torch==2.1.0
|
25 |
+
# accelerate==0.22.0
|
26 |
+
# bitsandbytes
|
27 |
+
langchain
|
28 |
+
openai
|
29 |
+
chromadb
|
30 |
+
tiktoken
|
31 |
+
PyPDF2
|
32 |
+
pypdf
|
33 |
+
python-docx
|
34 |
+
pandas
|
35 |
+
python-dotenv
|
36 |
+
fastapi
|
37 |
+
uvicorn
|
38 |
+
python-multipart
|
39 |
+
# chardet
|
40 |
+
PyMuPDF>=1.18.19
|
41 |
+
python-docx
|
42 |
+
pandas
|