hayuh commited on
Commit
6a9583a
1 Parent(s): 7becc3b

GPT 3.5 Turbo Model with Multi-Document Agentic RAG (12 documents from EDS dataset)

Browse files
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Ehlers-Danlos-1/2024_EDS_2.pdf filter=lfs diff=lfs merge=lfs -text
37
+ Ehlers-Danlos-1/2024_EDS_3.pdf filter=lfs diff=lfs merge=lfs -text
38
+ Ehlers-Danlos-1/2024_EDS_4.pdf filter=lfs diff=lfs merge=lfs -text
39
+ Ehlers-Danlos-1/2024_EDS_5.pdf filter=lfs diff=lfs merge=lfs -text
40
+ Ehlers-Danlos-1/Unknown_EDS_1.pdf filter=lfs diff=lfs merge=lfs -text
41
+ Ehlers-Danlos-1/Unknown_EDS_5.pdf filter=lfs diff=lfs merge=lfs -text
Ehlers-Danlos-1/2024_EDS_1.pdf ADDED
The diff for this file is too large to render. See raw diff
 
Ehlers-Danlos-1/2024_EDS_2.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46fc736ff4174473e0a846b7ca8430c140d89cd2c9f663e105bc48b33f8d9c99
3
+ size 2616000
Ehlers-Danlos-1/2024_EDS_3.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fef5c8c375297158ad7ad63166405ca7ce4ac511371a8454fe9df972755b0fe
3
+ size 10344738
Ehlers-Danlos-1/2024_EDS_4.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25db35c77fd6aeba6b15278671a462b30ffbb6f97eb5f221e0459f6d11c0f8ed
3
+ size 1071576
Ehlers-Danlos-1/2024_EDS_5.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57ef98bcb445da6abda66de35204634bd81d8c6dcdf53bfc3be54447ec9ad0ad
3
+ size 2772421
Ehlers-Danlos-1/2024_EDS_6.pdf ADDED
Binary file (146 kB). View file
 
Ehlers-Danlos-1/2024_EDS_7.pdf ADDED
The diff for this file is too large to render. See raw diff
 
Ehlers-Danlos-1/Unknown_EDS_1.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbeaf13d3298a00bc1c7acfba3177a0c639f677e0f0941452709fe60542052d4
3
+ size 21553835
Ehlers-Danlos-1/Unknown_EDS_2.pdf ADDED
Binary file (428 kB). View file
 
Ehlers-Danlos-1/Unknown_EDS_3.pdf ADDED
Binary file (817 kB). View file
 
Ehlers-Danlos-1/Unknown_EDS_4.pdf ADDED
Binary file (392 kB). View file
 
Ehlers-Danlos-1/Unknown_EDS_5.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c5a77524b6bb4dca40798af5ff3e3c622216a13ac21a60d9befce255977b47a
3
+ size 1847313
helper.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Add your utilities or helper functions to this file.

import os

from dotenv import load_dotenv, find_dotenv


def load_env():
    """Load environment variables from the nearest .env file.

    The .env file is expected one directory above the lesson, with one
    entry per line in the form:
    API_KEYNAME=AStringThatIsTheLongAPIKeyFromSomeService
    """
    # find_dotenv() walks up the directory tree; load_dotenv() is a no-op
    # if no .env file is found, so this never raises.
    load_dotenv(find_dotenv())


def get_openai_api_key():
    """Return the OPENAI_API_KEY environment value, or None if unset."""
    load_env()
    return os.getenv("OPENAI_API_KEY")
rag.ipynb ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "#import OpenAI key with helper function\n",
10
+ "from helper import get_openai_api_key\n",
11
+ "\n",
12
+ "OPENAI_API_KEY = get_openai_api_key()"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 4,
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "#A lot of modules use async and we want them to be compatible with Jupyter notebook\n",
22
+ "import nest_asyncio\n",
23
+ "\n",
24
+ "nest_asyncio.apply()"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 5,
30
+ "metadata": {},
31
+ "outputs": [
32
+ {
33
+ "name": "stdout",
34
+ "output_type": "stream",
35
+ "text": [
36
+ "['Ehlers-Danlos-1\\\\2024_EDS_1.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_2.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_3.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_4.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_5.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_6.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_7.pdf', 'Ehlers-Danlos-1\\\\Unknown_EDS_1.pdf', 'Ehlers-Danlos-1\\\\Unknown_EDS_2.pdf', 'Ehlers-Danlos-1\\\\Unknown_EDS_3.pdf', 'Ehlers-Danlos-1\\\\Unknown_EDS_4.pdf', 'Ehlers-Danlos-1\\\\Unknown_EDS_5.pdf']\n",
37
+ "['2024_EDS_1.pdf', '2024_EDS_2.pdf', '2024_EDS_3.pdf', '2024_EDS_4.pdf', '2024_EDS_5.pdf', '2024_EDS_6.pdf', '2024_EDS_7.pdf', 'Unknown_EDS_1.pdf', 'Unknown_EDS_2.pdf', 'Unknown_EDS_3.pdf', 'Unknown_EDS_4.pdf', 'Unknown_EDS_5.pdf']\n"
38
+ ]
39
+ }
40
+ ],
41
+ "source": [
42
+ "import os\n",
43
+ "import glob\n",
44
+ "\n",
45
+ "# Define the path to the directory containing the PDF files\n",
46
+ "folder_path = 'Ehlers-Danlos-1'\n",
47
+ "\n",
48
+ "# Get the list of all PDF files in the directory\n",
49
+ "pdf_files = glob.glob(os.path.join(folder_path, '*.pdf'))\n",
50
+ "print(pdf_files)\n",
51
+ "\n",
52
+ "# Extract just the filenames (optional)\n",
53
+ "pdf_filenames = [os.path.basename(pdf) for pdf in pdf_files]\n",
54
+ "\n",
55
+ "# Print the list of PDF filenames\n",
56
+ "print(pdf_filenames)\n"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 6,
62
+ "metadata": {},
63
+ "outputs": [
64
+ {
65
+ "name": "stdout",
66
+ "output_type": "stream",
67
+ "text": [
68
+ "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_1.pdf\n",
69
+ "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_2.pdf\n",
70
+ "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_3.pdf\n",
71
+ "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_4.pdf\n",
72
+ "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_5.pdf\n",
73
+ "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_6.pdf\n",
74
+ "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_7.pdf\n",
75
+ "Getting tools for paper: Ehlers-Danlos-1\\Unknown_EDS_1.pdf\n",
76
+ "Getting tools for paper: Ehlers-Danlos-1\\Unknown_EDS_2.pdf\n",
77
+ "Getting tools for paper: Ehlers-Danlos-1\\Unknown_EDS_3.pdf\n",
78
+ "Getting tools for paper: Ehlers-Danlos-1\\Unknown_EDS_4.pdf\n",
79
+ "Getting tools for paper: Ehlers-Danlos-1\\Unknown_EDS_5.pdf\n"
80
+ ]
81
+ }
82
+ ],
83
+ "source": [
84
+ "from utils import get_doc_tools\n",
85
+ "from pathlib import Path\n",
86
+ "\n",
87
+ "# Ensure function names are within the allowed length limit\n",
88
+ "def truncate_function_name(name, max_length=64):\n",
89
+ " return name if len(name) <= max_length else name[:max_length]\n",
90
+ "\n",
91
+ "paper_to_tools_dict = {}\n",
92
+ "for pdf in pdf_files:\n",
93
+ " print(f\"Getting tools for paper: {pdf}\")\n",
94
+ " vector_tool, summary_tool = get_doc_tools(pdf, Path(pdf).stem)\n",
95
+ " #vector_tool, summary_tool = get_doc_tools(pdf, truncate_function_name(Path(pdf).stem))\n",
96
+ " paper_to_tools_dict[pdf] = [vector_tool, summary_tool]\n",
97
+ " #print(vector_tool)\n",
98
+ " #print(summary_tool)"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": 7,
104
+ "metadata": {},
105
+ "outputs": [],
106
+ "source": [
107
+ "all_tools = [t for pdf in pdf_files for t in paper_to_tools_dict[pdf]]\n",
108
+ "#all_tools = [truncate_function_name(tool) for tool in all_tools]\n"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": 8,
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "# define an \"object\" index and retriever over these tools\n",
118
+ "from llama_index.core import VectorStoreIndex\n",
119
+ "from llama_index.core.objects import ObjectIndex\n",
120
+ "\n",
121
+ "obj_index = ObjectIndex.from_objects(\n",
122
+ " all_tools,\n",
123
+ " index_cls=VectorStoreIndex,\n",
124
+ ")"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": 9,
130
+ "metadata": {},
131
+ "outputs": [],
132
+ "source": [
133
+ "obj_retriever = obj_index.as_retriever(similarity_top_k=3)\n"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": 10,
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "from llama_index.llms.openai import OpenAI\n",
143
+ "\n",
144
+ "llm = OpenAI(model=\"gpt-3.5-turbo\")"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": 11,
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": [
153
+ "from llama_index.core.agent import FunctionCallingAgentWorker\n",
154
+ "from llama_index.core.agent import AgentRunner\n",
155
+ "\n",
156
+ "agent_worker = FunctionCallingAgentWorker.from_tools(\n",
157
+ " tool_retriever=obj_retriever,\n",
158
+ " llm=llm, \n",
159
+ " verbose=True\n",
160
+ ")\n",
161
+ "agent = AgentRunner(agent_worker)"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": 13,
167
+ "metadata": {},
168
+ "outputs": [
169
+ {
170
+ "name": "stdout",
171
+ "output_type": "stream",
172
+ "text": [
173
+ "Added user message to memory: Do people with EDS suffer from dislocations, and if so, how do they manifest?\n",
174
+ "=== Calling Function ===\n",
175
+ "Calling function: summary_tool_Unknown_EDS_1 with args: {\"input\": \"Do people with EDS suffer from dislocations?\"}\n",
176
+ "=== Function Output ===\n",
177
+ "Yes.\n",
178
+ "=== Calling Function ===\n",
179
+ "Calling function: summary_tool_Unknown_EDS_5 with args: {\"input\": \"How do dislocations manifest in people with EDS?\"}\n",
180
+ "=== Function Output ===\n",
181
+ "Dislocations in people with Ehlers-Danlos Syndrome (EDS) typically manifest due to the hypermobility and laxity of joints commonly associated with the condition. This increased joint flexibility can lead to joints easily moving out of their normal positions, resulting in dislocations. Additionally, the weakened connective tissues in individuals with EDS can contribute to joint instability, making dislocations more frequent and easier to occur.\n",
182
+ "=== LLM Response ===\n",
183
+ "Yes, people with Ehlers-Danlos Syndrome (EDS) do suffer from dislocations. Dislocations in individuals with EDS typically manifest due to the hypermobility and laxity of joints commonly associated with the condition. This increased joint flexibility can lead to joints easily moving out of their normal positions, resulting in dislocations. Additionally, the weakened connective tissues in individuals with EDS can contribute to joint instability, making dislocations more frequent and easier to occur.\n",
184
+ "Yes, people with Ehlers-Danlos Syndrome (EDS) do suffer from dislocations. Dislocations in individuals with EDS typically manifest due to the hypermobility and laxity of joints commonly associated with the condition. This increased joint flexibility can lead to joints easily moving out of their normal positions, resulting in dislocations. Additionally, the weakened connective tissues in individuals with EDS can contribute to joint instability, making dislocations more frequent and easier to occur.\n"
185
+ ]
186
+ }
187
+ ],
188
+ "source": [
189
+ "\n",
190
+ "response = agent.query(\n",
191
+ " \"Do people with EDS suffer from dislocations, and if so, how do they manifest?\"\n",
192
+ ")\n",
193
+ "print(str(response))"
194
+ ]
195
+ }
196
+ ],
197
+ "metadata": {
198
+ "kernelspec": {
199
+ "display_name": "Python 3 (ipykernel)",
200
+ "language": "python",
201
+ "name": "python3"
202
+ },
203
+ "language_info": {
204
+ "codemirror_mode": {
205
+ "name": "ipython",
206
+ "version": 3
207
+ },
208
+ "file_extension": ".py",
209
+ "mimetype": "text/x-python",
210
+ "name": "python",
211
+ "nbconvert_exporter": "python",
212
+ "pygments_lexer": "ipython3",
213
+ "version": "3.12.3"
214
+ }
215
+ },
216
+ "nbformat": 4,
217
+ "nbformat_minor": 2
218
+ }
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Requirements file.
2
+ # Note the Python revision this project targets (e.g. 3.12.3).
3
+ # List every pip-installable dependency below, pinned to an exact version.
4
+
5
+
6
+ python-dotenv==1.0.0
7
+
8
+ llama-index==0.10.27
9
+ llama-index-llms-openai==0.1.15
10
+ llama-index-embeddings-openai==0.1.7
utils.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index.core import SimpleDirectoryReader
2
+ from llama_index.core.node_parser import SentenceSplitter
3
+ from llama_index.core import Settings
4
+ from llama_index.llms.openai import OpenAI
5
+ from llama_index.embeddings.openai import OpenAIEmbedding
6
+ from llama_index.core import SummaryIndex, VectorStoreIndex
7
+ from llama_index.core.tools import QueryEngineTool
8
+ from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
9
+ from llama_index.core.selectors import LLMSingleSelector
10
+
11
+ from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, SummaryIndex
12
+ from llama_index.core.node_parser import SentenceSplitter
13
+ from llama_index.core.tools import FunctionTool, QueryEngineTool
14
+ from llama_index.core.vector_stores import MetadataFilters, FilterCondition
15
+ from typing import List, Optional
16
+
17
+
18
+
19
+ def get_doc_tools(
20
+ file_path: str,
21
+ name: str,
22
+ ) -> str:
23
+ """Get vector query and summary query tools from a document."""
24
+
25
+ # load documents
26
+ documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
27
+ splitter = SentenceSplitter(chunk_size=1024)
28
+ nodes = splitter.get_nodes_from_documents(documents)
29
+ vector_index = VectorStoreIndex(nodes)
30
+
31
+ def vector_query(
32
+ query: str,
33
+ page_numbers: Optional[List[str]] = None
34
+ ) -> str:
35
+ """Use to answer questions over a given paper.
36
+
37
+ Useful if you have specific questions over the paper.
38
+ Always leave page_numbers as None UNLESS there is a specific page you want to search for.
39
+
40
+ Args:
41
+ query (str): the string query to be embedded.
42
+ page_numbers (Optional[List[str]]): Filter by set of pages. Leave as NONE
43
+ if we want to perform a vector search
44
+ over all pages. Otherwise, filter by the set of specified pages.
45
+
46
+ """
47
+
48
+ page_numbers = page_numbers or []
49
+ metadata_dicts = [
50
+ {"key": "page_label", "value": p} for p in page_numbers
51
+ ]
52
+
53
+ query_engine = vector_index.as_query_engine(
54
+ similarity_top_k=2,
55
+ filters=MetadataFilters.from_dicts(
56
+ metadata_dicts,
57
+ condition=FilterCondition.OR
58
+ )
59
+ )
60
+ response = query_engine.query(query)
61
+ return response
62
+
63
+
64
+ vector_query_tool = FunctionTool.from_defaults(
65
+ name=f"vector_tool_{name}",
66
+ fn=vector_query
67
+ )
68
+
69
+ summary_index = SummaryIndex(nodes)
70
+ summary_query_engine = summary_index.as_query_engine(
71
+ response_mode="tree_summarize",
72
+ use_async=True,
73
+ )
74
+ summary_tool = QueryEngineTool.from_defaults(
75
+ name=f"summary_tool_{name}",
76
+ query_engine=summary_query_engine,
77
+ description=(
78
+ f"Useful for summarization questions related to {name}"
79
+ ),
80
+ )
81
+
82
+ return vector_query_tool, summary_tool