Sk4467 commited on
Commit
d0fbfa7
1 Parent(s): 4f9bf7e

added application

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .vercel
Dockerfile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use the official Python 3.10.9 image
FROM python:3.10.9

# Set the working directory first so subsequent COPY/RUN paths resolve from /
WORKDIR /

# Install dependencies before copying the rest of the source so the pip
# layer is cached across code-only rebuilds.
COPY requirements.txt /requirements.txt
RUN pip install --no-cache-dir --upgrade -r /requirements.txt

# Copy the application code into the container at /
COPY . .

# Start the FastAPI app on port 7860, the default port expected by Spaces
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
__pycache__/RAG.cpython-310.pyc ADDED
Binary file (3.26 kB). View file
 
__pycache__/app.cpython-310.pyc ADDED
Binary file (2.14 kB). View file
 
__pycache__/app.cpython-39.pyc ADDED
Binary file (1.15 kB). View file
 
__pycache__/file_processing.cpython-310.pyc ADDED
Binary file (3.34 kB). View file
 
__pycache__/file_processing.cpython-39.pyc ADDED
Binary file (1.69 kB). View file
 
__pycache__/main.cpython-310.pyc ADDED
Binary file (1.47 kB). View file
 
__pycache__/query_processing.cpython-310.pyc ADDED
Binary file (1.18 kB). View file
 
__pycache__/query_processing.cpython-39.pyc ADDED
Binary file (995 Bytes). View file
 
app.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""FastAPI backend: upload a document, embed it into Chroma, and answer queries."""
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from dotenv import load_dotenv
import os

from file_processing import load_documents, chunk_documents, create_embeddings
from query_processing import load_qa_chain, process_query

# Load environment variables from a local .env file (python-dotenv is a no-op
# when none exists, so the inherited process environment still applies).
# The previous hard-coded absolute Windows path made the app non-portable.
load_dotenv()

openai_api_key = os.environ.get('OPENAI_API_KEY')
# NOTE: never print the raw API key -- it would leak into server logs.

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3000"],  # only the local React dev server
    allow_credentials=True,
    allow_methods=["*"],  # allow all HTTP methods
    allow_headers=["*"],  # allow all headers
)


@app.post("/process-file")
async def process_file(collection_name: str = Form(...), file: UploadFile = File(...)):
    """Ingest an uploaded file: load, chunk, embed, and store it in Chroma.

    Returns a success message plus a short preview of each loaded document.
    """
    print("Received collection_name:", collection_name)
    print("Received file:", file.filename)

    # Load the raw document(s) from the upload.
    documents = await load_documents(file)

    # Split into overlapping chunks suitable for embedding.
    chunked_docs = chunk_documents(documents, chunk_size=500, chunk_overlap=100)

    # Create embeddings and persist them in the named Chroma collection.
    vector_store = create_embeddings(chunked_docs, collection_name)

    preview_length = 750  # leading characters of each document shown to the client
    document_previews = [doc.page_content[:preview_length] for doc in documents]

    return {"message": "File processed successfully", "document_preview": document_previews}


class QueryRequest(BaseModel):
    """Request body for the /query endpoint."""
    collection_name: str
    query: str


@app.post("/query")
async def query(request: QueryRequest):
    """Answer a user query against the named collection via RetrievalQA."""
    print(request.dict())

    # Build the RetrievalQA chain for the requested collection.
    qa_chain = load_qa_chain(request.collection_name)

    # Run the query through the chain.
    result = process_query(request.query, qa_chain)

    return {"result": result}
file_processing.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.document_loaders import PyPDFLoader, PDFMinerLoader, DirectoryLoader
2
+ from langchain.embeddings import SentenceTransformerEmbeddings
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain.vectorstores import Chroma
5
+ from os.path import join
6
+ import os
7
+ from dotenv import load_dotenv
8
+ load_dotenv(r'C:\Users\sksha\Desktop\llm-assignment-master\llm-assignment-master\llm-assignment-master_\backend\.env')
9
+ openai_api_key = os.environ.get('OPENAI_API_KEY')
10
+ from langchain.document_loaders import TextLoader, PDFMinerLoader, UnstructuredWordDocumentLoader, CSVLoader
11
+
12
+ # def load_documents(file_path):
13
+ # if file_path.endswith('.txt'):
14
+ # loader = TextLoader(file_path)
15
+ # elif file_path.endswith('.pdf'):
16
+ # loader = PyPDFLoader(file_path)
17
+ # elif file_path.endswith('.doc') or file_path.endswith('.docx'):
18
+ # loader = UnstructuredWordDocumentLoader(file_path)
19
+ # elif file_path.endswith('.csv'):
20
+ # loader = CSVLoader(file_path)
21
+ # else:
22
+ # raise ValueError(f"Unsupported file format: {file_path}")
23
+
24
+ # documents = loader.load()
25
+ # return documents
26
+ from fastapi import UploadFile
27
+ from typing import List
28
+ import fitz # PyMuPDF
29
+ import pandas as pd
30
+ import docx
31
+ from langchain.docstore.document import Document
32
def read_pdf(file_path: str) -> str:
    """Extract and concatenate the text of every page of a PDF.

    Args:
        file_path: Path to the PDF on disk.

    Returns:
        The full plain-text content of the document.
    """
    doc = fitz.open(file_path)
    try:
        # Join page texts in a single pass instead of quadratic += growth.
        return "".join(page.get_text() for page in doc)
    finally:
        doc.close()  # the original leaked the underlying file handle
38
+
39
def read_docx(file_path: str) -> str:
    """Return the text of a .docx file, paragraphs joined by newlines."""
    document = docx.Document(file_path)
    paragraphs = [paragraph.text for paragraph in document.paragraphs]
    return '\n'.join(paragraphs)
45
+
46
def read_csv(file_path: str) -> str:
    """Render the contents of a CSV file as a plain-text table."""
    frame = pd.read_csv(file_path)
    return frame.to_string()
49
+
50
def read_txt(file_path: str) -> str:
    """Read a UTF-8 text file and return its entire contents."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()
53
+
54
async def load_documents(file: UploadFile) -> List[Document]:
    """Spool an uploaded file to disk, extract its text, and wrap it in a
    single LangChain Document.

    Supported extensions: .pdf, .docx, .csv, .txt.  Extraction failures are
    deliberately reported inside the document body instead of being raised,
    so the caller always receives a document.
    """
    temp_file_path = f"temp_{file.filename}"
    # Dispatch table: filename suffix -> reader function.
    readers = {
        '.pdf': read_pdf,
        '.docx': read_docx,
        '.csv': read_csv,
        '.txt': read_txt,
    }
    try:
        # Persist the upload so the path-based readers can open it.
        with open(temp_file_path, "wb") as temp_file:
            temp_file.write(await file.read())

        content = ""
        for suffix, reader in readers.items():
            if file.filename.endswith(suffix):
                content = reader(temp_file_path)
                break
        else:
            raise ValueError("Unsupported file format")
    except Exception as e:
        # Best-effort: surface the failure as document content, not an error.
        print(f"Error processing document: {e}")
        content = "Error processing document."
    finally:
        # Always remove the temporary spool file.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)

    metadata = {'source': file.filename}
    return [Document(page_content=content, metadata=metadata)]
84
+
85
+
86
+ from langchain.text_splitter import CharacterTextSplitter
87
+
88
def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping chunks for embedding.

    Args:
        documents: Iterable of LangChain Documents.
        chunk_size: Target number of characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        The list of chunked Documents produced by CharacterTextSplitter.
    """
    splitter = CharacterTextSplitter(chunk_size=chunk_size,
                                     chunk_overlap=chunk_overlap)
    return splitter.split_documents(documents)
95
+
96
+
97
+ from langchain.embeddings import OpenAIEmbeddings
98
+ from langchain.vectorstores import Chroma
99
+
100
def create_embeddings(chunked_docs, collection_name):
    """Embed the chunks with OpenAI and store them in a named Chroma collection.

    Returns the populated (and persisted) Chroma vector store.
    """
    vector_store = Chroma.from_documents(
        chunked_docs,
        OpenAIEmbeddings(openai_api_key=openai_api_key),
        collection_name=collection_name,
    )
    vector_store.persist()  # flush the collection to disk
    return vector_store
main.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from dotenv import load_dotenv
2
+ # from typing import Any
3
+ # from fastapi import FastAPI, HTTPException
4
+ # from fastapi.middleware.cors import CORSMiddleware
5
+ # from pydantic import BaseModel
6
+ # import RAG
7
+ # # Load environment variables from .env file (if any)
8
+ # load_dotenv()
9
+
10
+
11
+ # class Response(BaseModel):
12
+ # result: str | None
13
+
14
+ # class UserQuery(BaseModel):
15
+ # messages: str
16
+
17
+ # origins = [
18
+ # "http://localhost",
19
+ # "http://localhost:8080",
20
+ # "http://localhost:3000"
21
+ # ]
22
+
23
+ # app = FastAPI()
24
+ # app.add_middleware(
25
+ # CORSMiddleware,
26
+ # allow_origins=origins,
27
+ # allow_credentials=True,
28
+ # allow_methods=["*"],
29
+ # allow_headers=["*"],
30
+ # )
31
+
32
+ # initialize_model()
33
+ # # @app.post("/predict", response_model = Response)
34
+ # # def predict() -> Any:
35
+
36
+ # # #implement this code block
37
+
38
+ # # return {"result": "hello world!"}
39
+ # # @app.get("/hello")
40
+ # # async def hello():
41
+ # # return 'Hello World'
42
+ # @app.post("/home")
43
+ # def home_route(home: UserQuery):
44
+ # try:
45
+ # if not home.messages:
46
+ # raise HTTPException(status_code=400, detail="Empty value")
47
+
48
+ # # Call the custom function to generate a response using RetrievalQA
49
+ # answer, generation = generate_response(home.messages)
50
+
51
+ # return {"response": answer, "reasoning": generation}
52
+ # except Exception as e:
53
+ # print(f"An error occurred: {e}")
54
+ # raise HTTPException(status_code=500, detail="Internal Server Error")
55
+
56
+
57
from file_processing import load_documents, chunk_documents, create_embeddings
from query_processing import load_qa_chain, process_query
from dotenv import load_dotenv
import os


def main():
    """CLI driver: ingest a local document, then answer queries in a loop."""
    # Local imports keep these heavyweight deps out of module import time.
    import asyncio
    from fastapi import UploadFile

    load_dotenv()  # pick up OPENAI_API_KEY from a local .env, if present

    # TODO: take the input path as a CLI argument instead of hard-coding it.
    file_path = r'C:\Users\sksha\Desktop\llm-assignment-master\llm-assignment-master\backend\files\Option for Residence Accommodation.pdf'
    collection_name = 'my_collection'

    # BUG FIX: load_documents is an async coroutine that expects an
    # UploadFile.  The original called it synchronously with a bare path,
    # yielding an un-awaited coroutine that crashed in chunk_documents.
    with open(file_path, 'rb') as handle:
        upload = UploadFile(file=handle, filename=os.path.basename(file_path))
        documents = asyncio.run(load_documents(upload))

    # Chunk documents for embedding.
    chunked_docs = chunk_documents(documents, chunk_size=500, chunk_overlap=100)

    # Create embeddings and store them in Chroma.
    create_embeddings(chunked_docs, collection_name)

    # Build the RetrievalQA chain once, then answer queries interactively.
    qa_chain = load_qa_chain(collection_name)

    while True:
        query = input("Enter your query (or 'exit' to quit): ")
        if query.lower() == 'exit':
            break
        print(process_query(query, qa_chain))


if __name__ == '__main__':
    main()
query_processing.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.embeddings import OpenAIEmbeddings
2
+ from langchain.vectorstores import Chroma
3
+ from langchain.llms import OpenAI
4
+ from langchain.chains import RetrievalQA
5
+ import os
6
+ from dotenv import load_dotenv
7
+ load_dotenv(r'C:\Users\sksha\Desktop\llm-assignment-master\llm-assignment-master\llm-assignment-master_\backend\.env')
8
+ openai_api_key = os.environ.get('OPENAI_API_KEY')
9
def load_qa_chain(collection_name):
    """Build a RetrievalQA chain over an existing Chroma collection.

    Args:
        collection_name: Name of the persisted Chroma collection to query.

    Returns:
        A RetrievalQA chain that answers questions using the collection.
    """
    # Re-open the vector store; pass the API key explicitly, consistent with
    # the LLM below (OpenAIEmbeddings otherwise relies on the env var alone).
    vector_store = Chroma(
        collection_name=collection_name,
        embedding_function=OpenAIEmbeddings(openai_api_key=openai_api_key),
    )

    # Create an instance of the OpenAI language model.
    llm = OpenAI(openai_api_key=openai_api_key)

    # Retrieve only the 2 most similar chunks.  BUG FIX: the original built
    # this retriever and then ignored it, passing a fresh default retriever
    # to the chain, so the k=2 setting was dead code.
    retriever = vector_store.as_retriever(search_kwargs={"k": 2})

    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="map_reduce",
        retriever=retriever,
    )
24
+
25
def process_query(query, qa_chain):
    """Run *query* through the given RetrievalQA chain and return its answer."""
    return qa_chain.run(query)
requirements.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # annotated-types==0.6.0
2
+ # anyio==4.3.0
3
+ # click==8.1.7
4
+ # colorama==0.4.6
5
+ # exceptiongroup==1.2.0
6
+ # fastapi==0.110.0
7
+ # h11==0.14.0
8
+ # httptools==0.6.1
9
+ # idna==3.6
10
+ # pydantic==2.6.3
11
+ # pydantic_core==2.16.3
12
+ # pyspark==3.3.1
13
+ # python-dotenv==1.0.1
14
+ # PyYAML==6.0.1
15
+ # sniffio==1.3.1
16
+ # starlette==0.36.3
17
+ # typing_extensions==4.10.0
18
+ # uvicorn==0.28.0
19
+ # watchfiles==0.21.0
20
+ # websockets==12.0
21
+ # langchain
22
+ # sentence-transformers
23
+ # chromadb
24
+ # torch==2.1.0
25
+ # accelerate==0.22.0
26
+ # bitsandbytes
27
+ langchain
28
+ openai
29
+ chromadb
30
+ tiktoken
31
+ PyPDF2
32
+ pypdf
33
+ python-docx
34
+ pandas
35
+ python-dotenv
36
+ fastapi
37
+ uvicorn
38
+ python-multipart
39
+ # chardet
40
+ PyMuPDF>=1.18.19