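"""Streamlit document Q&A app.

Upload a PDF or Word document in the sidebar, preview its contents, and ask
questions about it. Answers are produced by a LangChain question-answering
chain backed by a Hugging Face hosted Mistral-7B-Instruct endpoint.
"""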
import os
import streamlit as st
import fitz
from PIL import Image
import tempfile
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.chains.question_answering import load_qa_chain
from docx import Document
import io
# from langchain_community.llms import HuggingFaceHub
from langchain_huggingface import HuggingFaceEndpoint

# Ensure you have your Hugging Face token stored in an environment variable
huggingface_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')

if huggingface_token is None:
    raise ValueError("No Hugging Face token found. Please set the HUGGINGFACEHUB_API_TOKEN environment variable.")

llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.3", huggingfacehub_api_token=huggingface_token)

# Initialize conversation history list
if "conversation_history" not in st.session_state:
    st.session_state.conversation_history = []

# Function to load a document and perform question answering (cached)
@st.cache_data
def process_document(uploaded_file, query):
    # Save uploaded file to temporary directory
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        # getvalue() always returns the full upload, even if the buffer has
        # already been read earlier in this script run
        tmp_file.write(uploaded_file.getvalue())

        # Load document based on file type
        file_extension = os.path.splitext(uploaded_file.name)[1].lower()
        if file_extension == ".pdf":
            loader = PyPDFLoader(tmp_file.name)
            document_text = None
        elif file_extension == ".docx":
            loader = Docx2txtLoader(tmp_file.name)
            document = Document(tmp_file.name)
            document_text = "\n".join([paragraph.text for paragraph in document.paragraphs])
        else:
            st.error("Unsupported file format. Please upload a text file (.txt), a PDF file (.pdf), or a Word document (.docx).")
            return "", None

    documents = loader.load()

    # Load QA chain
    # chain = load_qa_chain(llm=OpenAI(), verbose=True)
    chain = load_qa_chain(llm=llm, verbose=True)

    # Perform question answering
    response = chain.invoke({"input_documents": documents, "question": query})

    # Remove temporary file
    os.unlink(tmp_file.name)

    return response["output_text"], document_text

# Function to update conversation history
def update_conversation(query, response):
    st.session_state.conversation_history.append({"question": query, "answer": response})

# Function to convert PDF pages to PIL images using PyMuPDF (fitz)
def pdf_to_images(pdf_bytes):
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    images = []

    for page_num in range(doc.page_count):
        page = doc[page_num]
        # Render the page to a raster pixmap, then wrap it in a PIL image
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        images.append(img)

    return images

# Streamlit UI
def main():
    # Set sidebar title
    st.sidebar.title("7steps.AI")
    st.sidebar.markdown("---")

    # File uploader for document in sidebar
    uploaded_file = st.sidebar.file_uploader("Upload a document", type=["pdf", "docx"])

    # Display document content or images
    if uploaded_file is not None:
        st.title("Document Content")
        file_extension = os.path.splitext(uploaded_file.name)[1].lower()
        if file_extension in [".docx"]:
            _, document_text = process_document(uploaded_file, "")
            if document_text is not None:
                st.text_area("Document Text", value=document_text, height=300)
        elif file_extension == ".pdf":
            images = pdf_to_images(uploaded_file.getvalue())
            if images:
                page_number = st.number_input("Page Number", value=1, min_value=1, max_value=len(images))
                st.image(images[page_number - 1], caption=f"Page {page_number}", use_column_width=True)

                # Download button for images
                img_bytes = io.BytesIO()
                images[page_number - 1].save(img_bytes, format='PNG')
                st.download_button("Download Image", img_bytes.getvalue(), f'Page_{page_number}.png')

    # Text box for new question in sidebar
    query = st.sidebar.text_input("Enter your question:")

    # "Ask" button in sidebar
    if st.sidebar.button("Ask"):
        if uploaded_file is not None:
            # Process document and display response
            response, _ = process_document(uploaded_file, query)
            if response:  # Check if response is not empty
                # Update conversation history
                # st.write(response)
                st.write("You:", query)
                st.write("AI:", response)
                update_conversation(query, response)
        else:
            st.sidebar.write("Please upload a document first.")

    # # Display conversation history
    # st.title("Conversation History")
    # for item in st.session_state.conversation_history:
    #     st.write("You:", item["question"])
    #     st.write("AI:", item["answer"])

# Run the application
if __name__ == "__main__":
    main()
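
# A minimal way to try this app locally (a sketch; the filename "app.py" and the
# dependency list, inferred from the imports above, are assumptions):
#   pip install streamlit langchain langchain-community langchain-huggingface pypdf docx2txt python-docx pymupdf pillow
#   export HUGGINGFACEHUB_API_TOKEN=<your-token>
#   streamlit run app.py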