Spaces:

junipark
/

gemma_paper_qa

Paused

App Files Files Community

halyn committed on Oct 3

Commit

f085c10

•

1 Parent(s): 80a4c83

update code

Browse files

Files changed (1) hide show

app.py +217 -88

app.py CHANGED Viewed

@@ -1,93 +1,222 @@
-import streamlit as st
 import requests
 from PyPDF2 import PdfReader
-st.title("Welcome to GemmaPaperQA")
-st.subheader("Upload Your Paper")
-# def main_page():
-#     paper = st.file_uploader("Upload Here!", type="pdf", label_visibility="hidden")
-#     if paper:
-#         st.write(f"Upload complete! File name is {paper.name}")
-#         st.write("Please click the button below.")
-#         # pdf_reader = PdfReader(paper)
-#         # for page in pdf_reader.pages:
-#         #     paper_title.append(page.extract_text())
-#         #     break
-#         # paper_name = paper_title[0].split("\n")[0]
-#         # st.subheader(f"You upload the <{paper_name}> paper")
-#         if st.button("Click Here :)"):
-#             # FastAPI 서버에 PDF 파일 전송
-#             try:
-#                 files = {"file": (paper.name, paper, "application/pdf")}
-#                 response = requests.post(f"{FASTAPI_URL}/upload_pdf", files=files)
-#                 if response.status_code == 200:
-#                     st.success("PDF successfully uploaded to the model! Please click the button again")
-#                     st.session_state.messages = []
-#                     st.session_state.paper_name = paper.name[:-4]
-#                     st.session_state.page = "chat"
-#                 else:
-#                     st.error(f"Failed to upload PDF to the model. Error: {response.text}")
-#             except requests.RequestException as e:
-#                 st.error(f"Error connecting to the server: {str(e)}")
-# def chat_page():
-#     st.title(f"Welcome to GemmaPaperQA")
-#     st.subheader(f"Ask anything about {st.session_state.paper_name}")
-#     if "messages" not in st.session_state:
-#         st.session_state.messages = []
-#     for message in st.session_state.messages:
-#         with st.chat_message(message["role"]):
-#             st.markdown(message["content"])
-#     if prompt := st.chat_input("Chat here !"):
-#         # Add user message to chat history
-#         st.session_state.messages.append({"role": "user", "content": prompt})
-#         # Display user message in chat message container
-#         with st.chat_message("user"):
-#             st.markdown(prompt)
-#         # Get response from FastAPI server
-#         response = get_response_from_fastapi(prompt)
-#         # Display assistant response in chat message container
-#         with st.chat_message("assistant"):
-#             st.markdown(response)
-#         # Add assistant response to chat history
-#         st.session_state.messages.append({"role": "assistant", "content": response})
-#     if st.button("Go back to main page"):
-#         st.session_state.page = "main"
-# def get_response_from_fastapi(prompt):
-#     try:
-#         response = requests.post(f"{FASTAPI_URL}/ask", json={"text": prompt})
-#         if response.status_code == 200:
-#             return response.json()["response"]
-#         else:
-#             return f"Sorry, I couldn't generate a response. Error: {response.text}"
-#     except requests.RequestException as e:
-#         return f"Sorry, there was an error connecting to the server: {str(e)}"
-# # 초기 페이지 설정
-# if "page" not in st.session_state:
-#     st.session_state.page = "main"
-# # paper_name 초기화
-# if "paper_name" not in st.session_state:
-#     st.session_state.paper_name = ""
-# # 페이지 렌더링
-# if st.session_state.page == "main":
-#     main_page()
-# elif st.session_state.page == "chat":
-#     chat_page()

+import os
+import io
 import requests
+from dotenv import load_dotenv
+from fastapi import FastAPI, HTTPException, UploadFile, File
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
 from PyPDF2 import PdfReader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.chains.question_answering import load_qa_chain
+from langchain.llms import HuggingFacePipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+import streamlit as st
+# Disable Weights & Biases logging.
+os.environ['WANDB_DISABLED'] = "true"
+# Constants.
+# Both values can be overridden through environment variables so the app is
+# not tied to a single machine; the defaults are the original hard-coded
+# values. They are read at import time, i.e. before load_dotenv() runs in the
+# startup hook, so overrides must be present in the real process environment.
+MODEL_PATH = os.environ.get(
+    "GEMMA_MODEL_PATH", "/home/lab/halyn/gemma/halyn/paper/models/gemma-2-9b-it"
+)
+# Base URL the Streamlit UI uses to reach the FastAPI server.
+# NOTE(review): the default targets port 8080 while the uvicorn call at the
+# bottom of this file binds port 8050 - confirm a proxy/port-forward bridges them.
+FASTAPI_URL = os.environ.get("GEMMA_FASTAPI_URL", "http://203.249.64.50:8080")
+app = FastAPI()
+# CORS configuration.
+# NOTE(review): allowing every origin together with credentials is very
+# permissive - confirm this is intended before exposing the API publicly.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # allow all origins
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Global variables to store the knowledge base and QA chain.
+# knowledge_base is set by the /upload_pdf handler, qa_chain by the startup hook.
+knowledge_base = None
+qa_chain = None
+def load_pdf(pdf_file):
+    """
+    Load and extract text from a PDF.
+    Args:
+        pdf_file : The PDF source handed to PdfReader. The /upload_pdf
+            handler passes an in-memory io.BytesIO of the uploaded bytes.
+    Returns:
+        str: Text of every page, concatenated in page order with no separator.
+    """
+    pdf_reader = PdfReader(pdf_file)
+    # Guard against a page yielding None instead of a string (e.g. pages
+    # without a text layer in some PyPDF2 versions); "".join would raise
+    # TypeError on it, so such pages contribute an empty string.
+    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
+def split_text(text):
+    """
+    Break the document text into overlapping chunks.
+    Args:
+        text (str) : The full text extracted from the PDF.
+    Returns:
+        list : The chunks produced by CharacterTextSplitter.
+    """
+    # Split on newlines with a chunk size of 1000 and an overlap of 200,
+    # both measured with len().
+    splitter_options = {
+        "separator": "\n",
+        "chunk_size": 1000,
+        "chunk_overlap": 200,
+        "length_function": len,
+    }
+    splitter = CharacterTextSplitter(**splitter_options)
+    pieces = splitter.split_text(text)
+    return pieces
+def create_knowledge_base(chunks):
+    """
+    Create a FAISS knowledge base from text chunks.
+    Args:
+        chunks (list) : A list of text chunks.
+    Returns:
+        FAISS: A FAISS knowledge base object built with HuggingFaceEmbeddings.
+    Raises:
+        ValueError: If chunks is empty, e.g. the PDF had no extractable text.
+    """
+    # Fail early with a readable message instead of letting the vector store
+    # fail on an empty input; the /upload_pdf handler turns this into a 400.
+    if not chunks:
+        raise ValueError("No extractable text found in the PDF")
+    embeddings = HuggingFaceEmbeddings()
+    return FAISS.from_texts(chunks, embeddings)
+def load_model(model_path):
+    """
+    Build a text-generation pipeline from a pre-trained causal language model.
+    Args:
+        model_path (str) : The path to the pre-trained model.
+    Returns:
+        pipeline: A HuggingFace "text-generation" pipeline wrapping the loaded
+            model and tokenizer.
+    """
+    # Generation settings forwarded unchanged to the pipeline.
+    generation_settings = {"max_new_tokens": 150, "temperature": 0.1}
+    tok = AutoTokenizer.from_pretrained(model_path)
+    lm = AutoModelForCausalLM.from_pretrained(model_path)
+    generator = pipeline("text-generation", model=lm, tokenizer=tok, **generation_settings)
+    return generator
+@app.on_event("startup")
+async def startup_event():
+    """
+    Load the language model and build the QA chain when the server starts.
+    Reads the .env file, loads the model at MODEL_PATH and stores the
+    resulting "stuff" question-answering chain in the global qa_chain.
+    Raises:
+        RuntimeError: If the model or the chain cannot be created. The
+            original error is attached as the cause.
+    """
+    global qa_chain
+    load_dotenv()
+    # Load the language model
+    try:
+        pipe = load_model(MODEL_PATH)
+        llm = HuggingFacePipeline(pipeline=pipe)
+        qa_chain = load_qa_chain(llm, chain_type="stuff")
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        # An HTTPException only makes sense while answering a request; at
+        # startup there is no request, so raise a plain error that aborts
+        # startup and keeps the original traceback as its cause.
+        raise RuntimeError("Failed to load the language model") from e
+@app.post("/upload_pdf")
+async def upload_pdf(file: UploadFile = File(...)):
+    """
+    Receive a PDF upload and rebuild the global knowledge base from it.
+    Args:
+        file (UploadFile) : The uploaded PDF.
+    Returns:
+        dict: A confirmation message on success.
+    Raises:
+        HTTPException: 400 if reading, parsing, splitting or indexing fails.
+    """
+    global knowledge_base
+    try:
+        raw_bytes = await file.read()
+        # PdfReader needs a file-like object, so wrap the bytes in memory.
+        extracted = load_pdf(io.BytesIO(raw_bytes))
+        pieces = split_text(extracted)
+        knowledge_base = create_knowledge_base(pieces)
+    except Exception as err:
+        raise HTTPException(status_code=400, detail=f"Failed to process PDF: {str(err)}")
+    return {"message": "PDF uploaded and processed successfully"}
+class Question(BaseModel):
+    """Request body of the /ask endpoint."""
+    # The question text; the Streamlit client sends it as {"text": ...}.
+    text: str
+@app.post("/ask")
+async def ask_question(question: Question):
+    """
+    Answer a question about the most recently uploaded PDF.
+    Args:
+        question (Question) : Request body carrying the question text.
+    Returns:
+        dict: {"response": <answer text>}.
+    Raises:
+        HTTPException: 400 if no PDF has been uploaded, 500 if the QA chain
+            is missing or if retrieval/generation fails.
+    """
+    # Compare against None explicitly so the checks do not depend on how the
+    # knowledge base or chain objects define truthiness.
+    if knowledge_base is None:
+        raise HTTPException(status_code=400, detail="No PDF has been uploaded yet")
+    if qa_chain is None:
+        raise HTTPException(status_code=500, detail="QA chain is not initialized")
+    try:
+        docs = knowledge_base.similarity_search(question.text)
+        response = qa_chain.run(input_documents=docs, question=question.text)
+        # When the output contains the "Helpful Answer:" marker, keep only the
+        # text that follows its first occurrence.
+        if "Helpful Answer:" in response:
+            response = response.split("Helpful Answer:")[1].strip()
+        return {"response": response}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing question: {str(e)}") from e
+# Streamlit UI
+def main_page():
+    """
+    Render the upload page and send the chosen PDF to the FastAPI server.
+    On a successful upload the chat history is reset, the paper name (file
+    name without its extension) is stored, and the session is switched to the
+    chat page. Failures are shown as Streamlit error messages.
+    """
+    st.title("Welcome to GemmaPaperQA")
+    st.subheader("Upload Your Paper")
+    paper = st.file_uploader("Upload Here!", type="pdf", label_visibility="hidden")
+    if paper:
+        st.write(f"Upload complete! File name is {paper.name}")
+        st.write("Please click the button below.")
+        if st.button("Click Here :)"):
+            # Send the PDF file to the FastAPI server
+            try:
+                files = {"file": (paper.name, paper, "application/pdf")}
+                # Bound only the connection phase (10 s) so an unreachable
+                # server cannot hang the UI; the read side stays unlimited
+                # because processing the PDF on the server may take long.
+                response = requests.post(
+                    f"{FASTAPI_URL}/upload_pdf", files=files, timeout=(10, None)
+                )
+                if response.status_code == 200:
+                    st.success("PDF successfully uploaded to the model! Please click the button again")
+                    st.session_state.messages = []
+                    # Strip the extension properly instead of assuming it is
+                    # exactly four characters long.
+                    st.session_state.paper_name = os.path.splitext(paper.name)[0]
+                    st.session_state.page = "chat"
+                else:
+                    st.error(f"Failed to upload PDF to the model. Error: {response.text}")
+            except requests.RequestException as e:
+                st.error(f"Error connecting to the server: {str(e)}")
+def chat_page():
+    """
+    Render the chat page: replay the stored conversation, forward a new
+    question to the FastAPI server, and offer a button back to the main page.
+    """
+    state = st.session_state
+    st.title("Welcome to GemmaPaperQA")
+    st.subheader(f"Ask anything about {state.paper_name}")
+    if "messages" not in state:
+        state.messages = []
+    # Replay the conversation stored so far.
+    for entry in state.messages:
+        with st.chat_message(entry["role"]):
+            st.markdown(entry["content"])
+    user_text = st.chat_input("Chat here !")
+    if user_text:
+        # Record and show the user's message.
+        state.messages.append({"role": "user", "content": user_text})
+        with st.chat_message("user"):
+            st.markdown(user_text)
+        # Ask the FastAPI server, then show and record its answer.
+        answer = get_response_from_fastapi(user_text)
+        with st.chat_message("assistant"):
+            st.markdown(answer)
+        state.messages.append({"role": "assistant", "content": answer})
+    if st.button("Go back to main page"):
+        state.page = "main"
+def get_response_from_fastapi(prompt):
+    """
+    Send a question to the FastAPI /ask endpoint and return the answer text.
+    Args:
+        prompt (str) : The user's question.
+    Returns:
+        str: The server's answer, or a human-readable apology message when
+            the request fails, the server returns a non-200 status, or the
+            response body is not the expected JSON object.
+    """
+    try:
+        # Bound only the connection phase (10 s); generation itself may be
+        # slow, so the read side is left without a limit.
+        response = requests.post(
+            f"{FASTAPI_URL}/ask", json={"text": prompt}, timeout=(10, None)
+        )
+    except requests.RequestException as e:
+        return f"Sorry, there was an error connecting to the server: {str(e)}"
+    if response.status_code != 200:
+        return f"Sorry, I couldn't generate a response. Error: {response.text}"
+    try:
+        return response.json()["response"]
+    except (ValueError, KeyError, TypeError):
+        # A 200 reply whose body is not JSON, or lacks the "response" key,
+        # is reported like any other failed generation instead of crashing
+        # the page.
+        return f"Sorry, I couldn't generate a response. Error: {response.text}"
+# Streamlit - initial page setup
+# NOTE(review): these statements run at import time, so they also execute when
+# this module is loaded only to serve the FastAPI app - consider splitting the
+# UI and the API into separate modules.
+if "page" not in st.session_state:
+    st.session_state.page = "main"
+# Initialise paper_name
+if "paper_name" not in st.session_state:
+    st.session_state.paper_name = ""
+# Render the page selected in the session state
+if st.session_state.page == "main":
+    main_page()
+elif st.session_state.page == "chat":
+    chat_page()
+# Entry point for running the FastAPI app directly
+# NOTE(review): this binds port 8050, while FASTAPI_URL above points at port
+# 8080 - confirm that a proxy/port-forward connects the two.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8050)