hayuh commited on
Commit
6a9583a
1 Parent(s): 7becc3b

GPT 3.5 Turbo Model with Multi-Document Agentic RAG (12 documents from EDS dataset)

Browse files
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Ehlers-Danlos-1/2024_EDS_2.pdf filter=lfs diff=lfs merge=lfs -text
37
+ Ehlers-Danlos-1/2024_EDS_3.pdf filter=lfs diff=lfs merge=lfs -text
38
+ Ehlers-Danlos-1/2024_EDS_4.pdf filter=lfs diff=lfs merge=lfs -text
39
+ Ehlers-Danlos-1/2024_EDS_5.pdf filter=lfs diff=lfs merge=lfs -text
40
+ Ehlers-Danlos-1/Unknown_EDS_1.pdf filter=lfs diff=lfs merge=lfs -text
41
+ Ehlers-Danlos-1/Unknown_EDS_5.pdf filter=lfs diff=lfs merge=lfs -text
Ehlers-Danlos-1/2024_EDS_1.pdf ADDED
The diff for this file is too large to render. See raw diff
 
Ehlers-Danlos-1/2024_EDS_2.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46fc736ff4174473e0a846b7ca8430c140d89cd2c9f663e105bc48b33f8d9c99
3
+ size 2616000
Ehlers-Danlos-1/2024_EDS_3.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fef5c8c375297158ad7ad63166405ca7ce4ac511371a8454fe9df972755b0fe
3
+ size 10344738
Ehlers-Danlos-1/2024_EDS_4.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25db35c77fd6aeba6b15278671a462b30ffbb6f97eb5f221e0459f6d11c0f8ed
3
+ size 1071576
Ehlers-Danlos-1/2024_EDS_5.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57ef98bcb445da6abda66de35204634bd81d8c6dcdf53bfc3be54447ec9ad0ad
3
+ size 2772421
Ehlers-Danlos-1/2024_EDS_6.pdf ADDED
Binary file (146 kB). View file
 
Ehlers-Danlos-1/2024_EDS_7.pdf ADDED
The diff for this file is too large to render. See raw diff
 
Ehlers-Danlos-1/Unknown_EDS_1.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbeaf13d3298a00bc1c7acfba3177a0c639f677e0f0941452709fe60542052d4
3
+ size 21553835
Ehlers-Danlos-1/Unknown_EDS_2.pdf ADDED
Binary file (428 kB). View file
 
Ehlers-Danlos-1/Unknown_EDS_3.pdf ADDED
Binary file (817 kB). View file
 
Ehlers-Danlos-1/Unknown_EDS_4.pdf ADDED
Binary file (392 kB). View file
 
Ehlers-Danlos-1/Unknown_EDS_5.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c5a77524b6bb4dca40798af5ff3e3c622216a13ac21a60d9befce255977b47a
3
+ size 1847313
helper.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Add your utilities or helper functions to this file.

import os

from dotenv import load_dotenv, find_dotenv


def load_env():
    """Load environment variables from the nearest .env file.

    The .env file is expected one directory above the lesson, with one
    entry per line in the form:
    API_KEYNAME=AStringThatIsTheLongAPIKeyFromSomeService
    """
    # find_dotenv() walks up the directory tree; load_dotenv() is a no-op
    # if no .env file is found, so this never raises.
    load_dotenv(find_dotenv())


def get_openai_api_key():
    """Return the OPENAI_API_KEY environment value, or None if unset."""
    load_env()
    return os.getenv("OPENAI_API_KEY")
rag.ipynb ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "#import OpenAI key with helper function\n",
10
+ "from helper import get_openai_api_key\n",
11
+ "\n",
12
+ "OPENAI_API_KEY = get_openai_api_key()"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 4,
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "#A lot of modules use async and we want them to be compatible with Jupyter notebook\n",
22
+ "import nest_asyncio\n",
23
+ "\n",
24
+ "nest_asyncio.apply()"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 5,
30
+ "metadata": {},
31
+ "outputs": [
32
+ {
33
+ "name": "stdout",
34
+ "output_type": "stream",
35
+ "text": [
36
+ "['Ehlers-Danlos-1\\\\2024_EDS_1.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_2.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_3.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_4.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_5.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_6.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_7.pdf', 'Ehlers-Danlos-1\\\\Unknown_EDS_1.pdf', 'Ehlers-Danlos-1\\\\Unknown_EDS_2.pdf', 'Ehlers-Danlos-1\\\\Unknown_EDS_3.pdf', 'Ehlers-Danlos-1\\\\Unknown_EDS_4.pdf', 'Ehlers-Danlos-1\\\\Unknown_EDS_5.pdf']\n",
37
+ "['2024_EDS_1.pdf', '2024_EDS_2.pdf', '2024_EDS_3.pdf', '2024_EDS_4.pdf', '2024_EDS_5.pdf', '2024_EDS_6.pdf', '2024_EDS_7.pdf', 'Unknown_EDS_1.pdf', 'Unknown_EDS_2.pdf', 'Unknown_EDS_3.pdf', 'Unknown_EDS_4.pdf', 'Unknown_EDS_5.pdf']\n"
38
+ ]
39
+ }
40
+ ],
41
+ "source": [
42
+ "import os\n",
43
+ "import glob\n",
44
+ "\n",
45
+ "# Define the path to the directory containing the PDF files\n",
46
+ "folder_path = 'Ehlers-Danlos-1'\n",
47
+ "\n",
48
+ "# Get the list of all PDF files in the directory\n",
49
+ "pdf_files = glob.glob(os.path.join(folder_path, '*.pdf'))\n",
50
+ "print(pdf_files)\n",
51
+ "\n",
52
+ "# Extract just the filenames (optional)\n",
53
+ "pdf_filenames = [os.path.basename(pdf) for pdf in pdf_files]\n",
54
+ "\n",
55
+ "# Print the list of PDF filenames\n",
56
+ "print(pdf_filenames)\n"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 6,
62
+ "metadata": {},
63
+ "outputs": [
64
+ {
65
+ "name": "stdout",
66
+ "output_type": "stream",
67
+ "text": [
68
+ "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_1.pdf\n",
69
+ "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_2.pdf\n",
70
+ "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_3.pdf\n",
71
+ "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_4.pdf\n",
72
+ "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_5.pdf\n",
73
+ "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_6.pdf\n",
74
+ "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_7.pdf\n",
75
+ "Getting tools for paper: Ehlers-Danlos-1\\Unknown_EDS_1.pdf\n",
76
+ "Getting tools for paper: Ehlers-Danlos-1\\Unknown_EDS_2.pdf\n",
77
+ "Getting tools for paper: Ehlers-Danlos-1\\Unknown_EDS_3.pdf\n",
78
+ "Getting tools for paper: Ehlers-Danlos-1\\Unknown_EDS_4.pdf\n",
79
+ "Getting tools for paper: Ehlers-Danlos-1\\Unknown_EDS_5.pdf\n"
80
+ ]
81
+ }
82
+ ],
83
+ "source": [
84
+ "from utils import get_doc_tools\n",
85
+ "from pathlib import Path\n",
86
+ "\n",
87
+ "# Ensure function names are within the allowed length limit\n",
88
+ "def truncate_function_name(name, max_length=64):\n",
89
+ " return name if len(name) <= max_length else name[:max_length]\n",
90
+ "\n",
91
+ "paper_to_tools_dict = {}\n",
92
+ "for pdf in pdf_files:\n",
93
+ " print(f\"Getting tools for paper: {pdf}\")\n",
94
+ " vector_tool, summary_tool = get_doc_tools(pdf, Path(pdf).stem)\n",
95
+ " #vector_tool, summary_tool = get_doc_tools(pdf, truncate_function_name(Path(pdf).stem))\n",
96
+ " paper_to_tools_dict[pdf] = [vector_tool, summary_tool]\n",
97
+ " #print(vector_tool)\n",
98
+ " #print(summary_tool)"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": 7,
104
+ "metadata": {},
105
+ "outputs": [],
106
+ "source": [
107
+ "all_tools = [t for pdf in pdf_files for t in paper_to_tools_dict[pdf]]\n",
108
+ "#all_tools = [truncate_function_name(tool) for tool in all_tools]\n"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": 8,
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "# define an \"object\" index and retriever over these tools\n",
118
+ "from llama_index.core import VectorStoreIndex\n",
119
+ "from llama_index.core.objects import ObjectIndex\n",
120
+ "\n",
121
+ "obj_index = ObjectIndex.from_objects(\n",
122
+ " all_tools,\n",
123
+ " index_cls=VectorStoreIndex,\n",
124
+ ")"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": 9,
130
+ "metadata": {},
131
+ "outputs": [],
132
+ "source": [
133
+ "obj_retriever = obj_index.as_retriever(similarity_top_k=3)\n"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": 10,
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "from llama_index.llms.openai import OpenAI\n",
143
+ "\n",
144
+ "llm = OpenAI(model=\"gpt-3.5-turbo\")"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": 11,
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": [
153
+ "from llama_index.core.agent import FunctionCallingAgentWorker\n",
154
+ "from llama_index.core.agent import AgentRunner\n",
155
+ "\n",
156
+ "agent_worker = FunctionCallingAgentWorker.from_tools(\n",
157
+ " tool_retriever=obj_retriever,\n",
158
+ " llm=llm, \n",
159
+ " verbose=True\n",
160
+ ")\n",
161
+ "agent = AgentRunner(agent_worker)"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": 13,
167
+ "metadata": {},
168
+ "outputs": [
169
+ {
170
+ "name": "stdout",
171
+ "output_type": "stream",
172
+ "text": [
173
+ "Added user message to memory: Do people with EDS suffer from dislocations, and if so, how do they manifest?\n",
174
+ "=== Calling Function ===\n",
175
+ "Calling function: summary_tool_Unknown_EDS_1 with args: {\"input\": \"Do people with EDS suffer from dislocations?\"}\n",
176
+ "=== Function Output ===\n",
177
+ "Yes.\n",
178
+ "=== Calling Function ===\n",
179
+ "Calling function: summary_tool_Unknown_EDS_5 with args: {\"input\": \"How do dislocations manifest in people with EDS?\"}\n",
180
+ "=== Function Output ===\n",
181
+ "Dislocations in people with Ehlers-Danlos Syndrome (EDS) typically manifest due to the hypermobility and laxity of joints commonly associated with the condition. This increased joint flexibility can lead to joints easily moving out of their normal positions, resulting in dislocations. Additionally, the weakened connective tissues in individuals with EDS can contribute to joint instability, making dislocations more frequent and easier to occur.\n",
182
+ "=== LLM Response ===\n",
183
+ "Yes, people with Ehlers-Danlos Syndrome (EDS) do suffer from dislocations. Dislocations in individuals with EDS typically manifest due to the hypermobility and laxity of joints commonly associated with the condition. This increased joint flexibility can lead to joints easily moving out of their normal positions, resulting in dislocations. Additionally, the weakened connective tissues in individuals with EDS can contribute to joint instability, making dislocations more frequent and easier to occur.\n",
184
+ "Yes, people with Ehlers-Danlos Syndrome (EDS) do suffer from dislocations. Dislocations in individuals with EDS typically manifest due to the hypermobility and laxity of joints commonly associated with the condition. This increased joint flexibility can lead to joints easily moving out of their normal positions, resulting in dislocations. Additionally, the weakened connective tissues in individuals with EDS can contribute to joint instability, making dislocations more frequent and easier to occur.\n"
185
+ ]
186
+ }
187
+ ],
188
+ "source": [
189
+ "\n",
190
+ "response = agent.query(\n",
191
+ " \"Do people with EDS suffer from dislocations, and if so, how do they manifest?\"\n",
192
+ ")\n",
193
+ "print(str(response))"
194
+ ]
195
+ }
196
+ ],
197
+ "metadata": {
198
+ "kernelspec": {
199
+ "display_name": "Python 3 (ipykernel)",
200
+ "language": "python",
201
+ "name": "python3"
202
+ },
203
+ "language_info": {
204
+ "codemirror_mode": {
205
+ "name": "ipython",
206
+ "version": 3
207
+ },
208
+ "file_extension": ".py",
209
+ "mimetype": "text/x-python",
210
+ "name": "python",
211
+ "nbconvert_exporter": "python",
212
+ "pygments_lexer": "ipython3",
213
+ "version": "3.12.3"
214
+ }
215
+ },
216
+ "nbformat": 4,
217
+ "nbformat_minor": 2
218
+ }
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Requirements file.
2
+ # Note the Python revision this project targets (e.g. 3.12.3).
3
+ # List every pip-installable dependency below, pinned to an exact version.
4
+
5
+
6
+ python-dotenv==1.0.0
7
+
8
+ llama-index==0.10.27
9
+ llama-index-llms-openai==0.1.15
10
+ llama-index-embeddings-openai==0.1.7
utils.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index.core import SimpleDirectoryReader
2
+ from llama_index.core.node_parser import SentenceSplitter
3
+ from llama_index.core import Settings
4
+ from llama_index.llms.openai import OpenAI
5
+ from llama_index.embeddings.openai import OpenAIEmbedding
6
+ from llama_index.core import SummaryIndex, VectorStoreIndex
7
+ from llama_index.core.tools import QueryEngineTool
8
+ from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
9
+ from llama_index.core.selectors import LLMSingleSelector
10
+
11
+ from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, SummaryIndex
12
+ from llama_index.core.node_parser import SentenceSplitter
13
+ from llama_index.core.tools import FunctionTool, QueryEngineTool
14
+ from llama_index.core.vector_stores import MetadataFilters, FilterCondition
15
+ from typing import List, Optional
16
+
17
+
18
+
19
+ def get_doc_tools(
20
+ file_path: str,
21
+ name: str,
22
+ ) -> str:
23
+ """Get vector query and summary query tools from a document."""
24
+
25
+ # load documents
26
+ documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
27
+ splitter = SentenceSplitter(chunk_size=1024)
28
+ nodes = splitter.get_nodes_from_documents(documents)
29
+ vector_index = VectorStoreIndex(nodes)
30
+
31
+ def vector_query(
32
+ query: str,
33
+ page_numbers: Optional[List[str]] = None
34
+ ) -> str:
35
+ """Use to answer questions over a given paper.
36
+
37
+ Useful if you have specific questions over the paper.
38
+ Always leave page_numbers as None UNLESS there is a specific page you want to search for.
39
+
40
+ Args:
41
+ query (str): the string query to be embedded.
42
+ page_numbers (Optional[List[str]]): Filter by set of pages. Leave as NONE
43
+ if we want to perform a vector search
44
+ over all pages. Otherwise, filter by the set of specified pages.
45
+
46
+ """
47
+
48
+ page_numbers = page_numbers or []
49
+ metadata_dicts = [
50
+ {"key": "page_label", "value": p} for p in page_numbers
51
+ ]
52
+
53
+ query_engine = vector_index.as_query_engine(
54
+ similarity_top_k=2,
55
+ filters=MetadataFilters.from_dicts(
56
+ metadata_dicts,
57
+ condition=FilterCondition.OR
58
+ )
59
+ )
60
+ response = query_engine.query(query)
61
+ return response
62
+
63
+
64
+ vector_query_tool = FunctionTool.from_defaults(
65
+ name=f"vector_tool_{name}",
66
+ fn=vector_query
67
+ )
68
+
69
+ summary_index = SummaryIndex(nodes)
70
+ summary_query_engine = summary_index.as_query_engine(
71
+ response_mode="tree_summarize",
72
+ use_async=True,
73
+ )
74
+ summary_tool = QueryEngineTool.from_defaults(
75
+ name=f"summary_tool_{name}",
76
+ query_engine=summary_query_engine,
77
+ description=(
78
+ f"Useful for summarization questions related to {name}"
79
+ ),
80
+ )
81
+
82
+ return vector_query_tool, summary_tool