Spaces:

nonstopio
/

Brize_RAG_POC

Sleeping

App Files Files Community

Nikhil-Murade committed on Jun 17

Commit

befaea8

•

1 Parent(s): ed217fe

Upload 3 files

Browse files

Files changed (3) hide show

ingestion.py +70 -0
main.py +138 -0
requirements.txt +123 -0

ingestion.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import os
+import nest_asyncio
+nest_asyncio.apply()
+# bring in our LLAMA_CLOUD_API_KEY
+from dotenv import load_dotenv
+load_dotenv()
+##### LLAMAPARSE #####
+from llama_parse import LlamaParse
+from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
+from llama_index.vector_stores.qdrant import QdrantVectorStore
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.core import Settings
+##### Qdrant #######
+import qdrant_client
+from qdrant_client import QdrantClient, models
+llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
+# set up parser
+parser = LlamaParse(api_key=llamaparse_api_key, result_type="text")
+# use SimpleDirectoryReader to parse our file
+file_extractor = {".pdf": parser}
+documents = SimpleDirectoryReader(
+    input_dir="./documents", file_extractor=file_extractor
+).load_data()
+qdrant_url = os.getenv("QDRANT_URL")
+qdrant_api_key = os.getenv("QDRANT_API_KEY")
+embed_model = OpenAIEmbedding(model="text-embedding-3-large")
+Settings.embed_model = embed_model
+from llama_index.llms.openai import OpenAI
+openai_api_key = os.getenv("OPENAI_API_KEY")
+llm = OpenAI(model="gpt-3.5-turbo", api_key=openai_api_key)
+Settings.llm = llm
+client = qdrant_client.QdrantClient(
+    api_key=qdrant_api_key,
+    url=qdrant_url,
+)
+###Creating New Collection on Qdrant Not needed###
+# client.create_collection(
+#     collection_name="RAG_test",
+#     vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
+# )
+vector_store = QdrantVectorStore(client=client, collection_name="RAG_Test")
+storage_context = StorageContext.from_defaults(vector_store=vector_store)
+index = VectorStoreIndex.from_documents(
+    documents=documents, storage_context=storage_context, show_progress=True
+)
+index.storage_context.persist()

main.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import os
+import nest_asyncio
+nest_asyncio.apply()
+# bring in our LLAMA_CLOUD_API_KEY
+from dotenv import load_dotenv
+load_dotenv()
+# UI
+import streamlit as st
+from llama_index.core import VectorStoreIndex, StorageContext
+from llama_index.vector_stores.qdrant import QdrantVectorStore
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.core import Settings
+from llama_index.core.postprocessor import SentenceEmbeddingOptimizer
+##### Qdrant #######
+import qdrant_client
+@st.cache_resource(show_spinner=False)
+def get_index() -> VectorStoreIndex:
+    embed_model = OpenAIEmbedding(model="text-embedding-3-large")
+    Settings.embed_model = embed_model
+    from llama_index.llms.openai import OpenAI
+    openai_api_key = os.getenv("OPENAI_API_KEY")
+    llm = OpenAI(model="gpt-3.5-turbo", api_key=openai_api_key)
+    Settings.llm = llm
+    qdrant_url = os.getenv("QDRANT_URL")
+    qdrant_api_key = os.getenv("QDRANT_API_KEY")
+    client = qdrant_client.QdrantClient(
+        api_key=qdrant_api_key,
+        url=qdrant_url,
+    )
+    vector_store = QdrantVectorStore(client=client, collection_name="RAG_FINAL")
+    storage_context = StorageContext.from_defaults(vector_store=vector_store)
+    return VectorStoreIndex.from_vector_store(
+        vector_store,
+        storage_context=storage_context,
+        embed_model=embed_model,
+    )
+index = get_index()
+if "chat_engine" not in st.session_state.keys():
+    # postprocessor = SentenceEmbeddingOptimizer(
+    #     percentile_cutoff=0.5, threshold_cutoff=0.7
+    # )
+    st.session_state.chat_engine = index.as_chat_engine(
+        chat_mode="context",
+        verbose=True
+        # system_prompt = ("""You are an AI assistant for the Brize learning platform chat interface.
+        #             Brize, a continuous learning platform, leverages the GROW career coaching framework to guide employee growth at every career stage.
+        #             Follow these instructions to provide the best user experience:
+        #             * Relevance Check:
+        #             Ensure the user's questions are relevant to data, retrieval, or specific topics related to
+        #             1 Strategic Presence Momentum,
+        #             2 Managing Others
+        #             3 Leading Others
+        #             4 Brize Related Information
+        #             (don't show the above list in your response)
+        #             If a question is not relevant, respond with: "Please ask relevant questions."
+        #             * Clarity and Conciseness:
+        #             Provide clear and concise answers.
+        #             Avoid lengthy responses unless the complexity of the question necessitates a detailed explanation.
+        #             * Specificity:
+        #             Encourage users to be specific in their queries to provide the most accurate answers.
+        #             If a question is too broad or vague or When in doubt, ask the user for more details to provide the best possible assistance.
+        #             * Sensitive Information:
+        #             Remind users not to share sensitive personal data or proprietary information.
+        #             Inform them that the system is designed to provide assistance and information, not to handle confidential data.
+        #             * Guidelines:
+        #             Always prioritize clarity and usefulness in your responses.
+        #             Maintain a professional, helpful and Kind tone.
+        #             Be succinct unless a detailed response is necessary.""")
+        # node_postprocessors=[postprocessor]
+    )
+st.set_page_config(
+    page_title="Chat with Llamaindex docs powered by Llamaindex",
+    page_icon=":nonstop:",
+    layout="centered",
+    initial_sidebar_state="auto",
+    menu_items=None,
+)
+st.title("Chat with Brize 💬📚")
+if "messages" not in st.session_state.keys():
+    st.session_state.messages = [
+        {
+            "role": "assistant",
+            "content": "Ask me a question about Brize Courses",
+        }
+    ]
+if prompt := st.chat_input("Your question"):
+    st.session_state.messages.append({"role": "user", "content": prompt})
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.write(message["content"])
+if st.session_state.messages[-1]["role"] != "assistant":
+    with st.chat_message("assistant"):
+        with st.spinner("Thinking..."):
+            response = st.session_state.chat_engine.chat(message=prompt)
+            st.write(response.response)
+            nodes = [node for node in response.source_nodes]
+            for col, node, i in zip(st.columns(len(nodes)), nodes, range(len(nodes))):
+                with col:
+                    st.header(f"Source Node {i+1}: score = {node.score}")
+                    # st.write(node.text)
+                    st.subheader(f"File Path: {node.metadata['file_name']}")
+                    st.write(node.metadata)
+                    st.header("Source :")
+                    st.write(node.get_content()[:1000] + "...")
+                break
+            message = {"role": "assistant", "content": response.response}
+            st.session_state.messages.append(message)

requirements.txt ADDED Viewed

	@@ -0,0 +1,123 @@

+aiohttp==3.9.5
+aiosignal==1.3.1
+altair==5.3.0
+annotated-types==0.7.0
+anyio==4.4.0
+async-timeout==4.0.3
+attrs==23.2.0
+beautifulsoup4==4.12.3
+black==24.4.2
+blinker==1.8.2
+cachetools==5.3.3
+certifi==2024.6.2
+charset-normalizer==3.3.2
+click==8.1.7
+dataclasses-json==0.6.6
+Deprecated==1.2.14
+dirtyjson==1.0.8
+distro==1.9.0
+entrypoints==0.4
+exceptiongroup==1.2.1
+frozenlist==1.4.1
+fsspec==2024.6.0
+gitdb==4.0.11
+GitPython==3.1.43
+greenlet==3.0.3
+grpcio==1.64.1
+grpcio-tools==1.64.1
+h11==0.14.0
+h2==4.1.0
+hpack==4.0.0
+httpcore==1.0.5
+httpx==0.27.0
+hyperframe==6.0.1
+idna==3.7
+importlib_metadata==7.1.0
+Jinja2==3.1.4
+joblib==1.4.2
+jsonschema==4.22.0
+jsonschema-specifications==2023.12.1
+llama-index==0.10.43
+llama-index-agent-openai==0.2.7
+llama-index-cli==0.1.12
+llama-index-core==0.10.43
+llama-index-embeddings-openai==0.1.10
+llama-index-indices-managed-llama-cloud==0.1.6
+llama-index-legacy==0.9.48
+llama-index-llms-openai==0.1.22
+llama-index-multi-modal-llms-openai==0.1.6
+llama-index-program-openai==0.1.6
+llama-index-question-gen-openai==0.1.3
+llama-index-readers-file==0.1.23
+llama-index-readers-llama-parse==0.1.4
+llama-index-vector-stores-qdrant==0.2.8
+llama-parse==0.4.4
+llamaindex-py-client==0.1.19
+loguru==0.7.2
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+marshmallow==3.21.2
+mdurl==0.1.2
+multidict==6.0.5
+mypy-extensions==1.0.0
+nest-asyncio==1.6.0
+networkx==3.3
+nltk==3.8.1
+numpy==1.26.4
+openai==1.31.0
+packaging==24.0
+pandas==2.2.2
+pathspec==0.12.1
+pillow==10.3.0
+platformdirs==4.2.2
+portalocker==2.8.2
+protobuf==3.20.3
+pyarrow==16.1.0
+pydantic==2.7.3
+pydantic_core==2.18.4
+pydeck==0.9.1
+Pygments==2.18.0
+Pympler==1.0.1
+pypdf==4.2.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+pytz==2024.1
+PyYAML==6.0.1
+qdrant-client==1.9.1
+referencing==0.35.1
+regex==2024.5.15
+requests==2.32.3
+rich==13.7.1
+rpds-py==0.18.1
+scikit-learn==1.0.2
+scipy==1.13.1
+semver==3.0.2
+shellingham==1.5.4
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+soupsieve==2.5
+SQLAlchemy==2.0.30
+streamlit==1.35.0
+striprtf==0.0.26
+tenacity==8.3.0
+threadpoolctl==3.5.0
+tiktoken==0.7.0
+toml==0.10.2
+tomli==2.0.1
+toolz==0.12.1
+tornado==6.4
+tqdm==4.66.4
+trubrics==1.3.6
+typeguard==2.13.3
+typer==0.12.3
+typing-inspect==0.9.0
+typing_extensions==4.12.1
+tzdata==2024.1
+tzlocal==5.2
+urllib3==2.2.1
+validators==0.28.3
+watchdog==4.0.1
+wrapt==1.16.0
+yarl==1.9.4
+zipp==3.19.2