xicocdi committed
Commit • d597087
1 Parent(s): 4acec17
retrieval evals
Browse files
- Embedding_Model_Eval.ipynb +702 -0
- Retrieval_Strat_Eval.ipynb +0 -0
- app.py +2 -2
- contextual_compression_metrics.csv +6 -0
- contextual_compression_ragas_results.csv +0 -0
- multiquery_ft_embedding_metrics.csv +6 -0
- multiquery_ft_embedding_ragas_results.csv +0 -0
- multiquery_metrics.csv +6 -0
- multiquery_ragas_results.csv +0 -0
Embedding_Model_Eval.ipynb
ADDED
@@ -0,0 +1,702 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install langchain langchain_community langchain_openai pypdf langsmith qdrant-client ragas pandas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import openai\n",
+    "from getpass import getpass\n",
+    "\n",
+    "openai.api_key = getpass(\"Please provide your OpenAI Key: \")\n",
+    "os.environ[\"OPENAI_API_KEY\"] = openai.api_key"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "test_df = pd.read_csv(\"synthetic_midterm_question_dataset.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_questions = test_df[\"question\"].values.tolist()\n",
+    "test_groundtruths = test_df[\"ground_truth\"].values.tolist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.document_loaders import PyPDFLoader\n",
+    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
+    "from langchain_openai import OpenAIEmbeddings\n",
+    "from langchain_community.vectorstores.chroma import Chroma\n",
+    "from langchain_openai import ChatOpenAI\n",
+    "from langchain.prompts import PromptTemplate\n",
+    "from langchain.chains import ConversationalRetrievalChain\n",
+    "from langchain_community.vectorstores import Qdrant\n",
+    "from langchain.memory import ConversationBufferMemory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pdf_paths = [\"/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf\",\n",
+    "\"/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pdf_documents = []\n",
+    "for pdf_path in pdf_paths:\n",
+    "    loader = PyPDFLoader(pdf_path)\n",
+    "    pdf_documents.extend(loader.load())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text_splitter = RecursiveCharacterTextSplitter(\n",
+    "    chunk_size=2000,\n",
+    "    chunk_overlap=100,\n",
+    "    )\n",
+    "pdf_docs = text_splitter.split_documents(pdf_documents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Metric</th>\n",
+       "      <th>Baseline</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>faithfulness</td>\n",
+       "      <td>0.895359</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>answer_relevancy</td>\n",
+       "      <td>0.955419</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>context_recall</td>\n",
+       "      <td>0.934028</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>context_precision</td>\n",
+       "      <td>0.937500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>answer_correctness</td>\n",
+       "      <td>0.629267</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "               Metric  Baseline\n",
+       "0        faithfulness  0.895359\n",
+       "1    answer_relevancy  0.955419\n",
+       "2      context_recall  0.934028\n",
+       "3   context_precision  0.937500\n",
+       "4  answer_correctness  0.629267"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "baseline_metrics = pd.read_csv(\"medium_chunk_metrics.csv\")\n",
+    "baseline_metrics.rename(columns={'MediumChunk': 'Baseline'}, inplace=True)\n",
+    "baseline_metrics\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pip install sentence-transformers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sentence_transformers import SentenceTransformer\n",
+    "\n",
+    "model = SentenceTransformer(\"XicoC/midterm-finetuned-arctic\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.embeddings import HuggingFaceEmbeddings\n",
+    "\n",
+    "embedding = HuggingFaceEmbeddings(model_name=\"XicoC/midterm-finetuned-arctic\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vectorstore = Qdrant.from_documents(\n",
+    "    documents=pdf_docs,\n",
+    "    embedding=embedding,\n",
+    "    location=\":memory:\",\n",
+    "    collection_name=\"Midterm Embedding Eval\"\n",
+    ")\n",
+    "\n",
+    "retriever = vectorstore.as_retriever(\n",
+    "    search_type=\"mmr\",\n",
+    "    search_kwargs={\"k\": 4, \"fetch_k\": 10},\n",
+    ")\n",
+    "\n",
+    "memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True, output_key=\"answer\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.retrievers.multi_query import MultiQueryRetriever\n",
+    "\n",
+    "retriever_llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)\n",
+    "multiquery_ft_embedding_retriever = MultiQueryRetriever.from_llm(\n",
+    "    retriever=retriever, llm=retriever_llm\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm = ChatOpenAI(\n",
+    "    model=\"gpt-4o-mini\",\n",
+    "    temperature=0,\n",
+    "    streaming=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "custom_template = \"\"\"\n",
+    "You are an expert in artificial intelligence policy, ethics, and industry trends. Your task is to provide clear and accurate answers to questions related to AI's role in politics, government regulations, and its ethical implications for enterprises. Use reliable and up-to-date information from government documents, industry reports, and academic research to inform your responses. Make sure to consider how AI is evolving, especially in relation to the current political landscape, and provide answers in a way that is easy to understand for both AI professionals and non-experts.\n",
+    "\n",
+    "Remember these key points:\n",
+    "1. Use \"you\" when addressing the user and \"I\" when referring to yourself.\n",
+    "2. If you encounter complex or legal language in the context, simplify it for easy understanding. Imagine you're explaining it to someone who isn't familiar with legal terms.\n",
+    "3. Be prepared for follow-up questions and maintain context from previous exchanges.\n",
+    "4. If there's no information from a retrieved document in the context to answer a question or if there are no documents to cite, say: \"I'm sorry, I don't know the answer to that question.\"\n",
+    "5. When providing information, always cite the source document and page number in parentheses at the end of the relevant sentence or paragraph, like this: (Source: [document name], p. [page number]).\n",
+    "\n",
+    "Here are a few example questions you might receive:\n",
+    "\n",
+    "How are governments regulating AI, and what new policies have been implemented?\n",
+    "What are the ethical risks of using AI in political decision-making?\n",
+    "How can enterprises ensure their AI applications meet government ethical standards?\n",
+    "\n",
+    "One final rule for you to remember. You CANNOT under any circumstance, answer any question that does not pertain to the AI. If you do answer an out-of-scope question, you could lose your job. If you are asked a question that does not have to do with AI, you must say: \"I'm sorry, I don't know the answer to that question.\"\n",
+    "Context: {context}\n",
+    "Chat History: {chat_history}\n",
+    "Human: {question}\n",
+    "AI:\"\"\"\n",
+    "\n",
+    "PROMPT = PromptTemplate(\n",
+    "    template=custom_template, input_variables=[\"context\", \"question\", \"chat_history\"]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "multiquery_ft_embedding_rag_chain = ConversationalRetrievalChain.from_llm(\n",
+    "    llm,\n",
+    "    retriever=multiquery_ft_embedding_retriever,\n",
+    "    memory=memory,\n",
+    "    combine_docs_chain_kwargs={\"prompt\": PROMPT},\n",
+    "    return_source_documents=True,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'question': 'What are Trustworthy AI Characteristics?',\n",
+       " 'chat_history': [HumanMessage(content='What are Trustworthy AI Characteristics?'),\n",
+       "  AIMessage(content='Trustworthy AI characteristics refer to the essential qualities that artificial intelligence systems should possess to ensure they are reliable, ethical, and beneficial to society. Here are some key characteristics:\\n\\n1. **Accountable and Transparent**: AI systems should be designed in a way that their decision-making processes can be understood and scrutinized. This means providing clear explanations for how decisions are made and who is responsible for them.\\n\\n2. **Privacy Enhanced**: AI systems must prioritize user privacy and data protection. This involves implementing measures to safeguard personal information and ensuring that data is used responsibly.\\n\\n3. **Safe, Secure, and Resilient**: AI systems should be robust against attacks and failures. They must be designed to operate safely, even in unexpected situations, and should have mechanisms in place to recover from errors.\\n\\n4. **Explainable and Interpretable**: Users should be able to understand how AI systems arrive at their conclusions. This is crucial for building trust and ensuring that users can make informed decisions based on AI outputs.\\n\\n5. **Fair with Harmful Bias Managed**: AI systems should actively work to identify and mitigate biases that could lead to unfair treatment of individuals or groups. This includes ensuring that the data used to train AI models is representative and free from harmful stereotypes.\\n\\n6. **Valid and Reliable**: AI systems must produce accurate and consistent results. This involves regular testing and validation to ensure that the systems perform as intended across different scenarios.\\n\\n7. **Information Integrity**: AI systems should provide high-integrity information that can be trusted. This means distinguishing between fact and opinion, being transparent about the sources of information, and ensuring that the information is accurate and reliable.\\n\\n8. **Environmental Impact Consideration**: AI systems should be designed with an awareness of their environmental footprint, including energy consumption and carbon emissions associated with their training and operation (Source: National Institute of Standards and Technology, 2023, p. [specific page number not provided]).\\n\\nThese characteristics are essential for fostering trust in AI technologies and ensuring that they are used ethically and responsibly.')],\n",
+       " 'answer': 'Trustworthy AI characteristics refer to the essential qualities that artificial intelligence systems should possess to ensure they are reliable, ethical, and beneficial to society. Here are some key characteristics:\\n\\n1. **Accountable and Transparent**: AI systems should be designed in a way that their decision-making processes can be understood and scrutinized. This means providing clear explanations for how decisions are made and who is responsible for them.\\n\\n2. **Privacy Enhanced**: AI systems must prioritize user privacy and data protection. This involves implementing measures to safeguard personal information and ensuring that data is used responsibly.\\n\\n3. **Safe, Secure, and Resilient**: AI systems should be robust against attacks and failures. They must be designed to operate safely, even in unexpected situations, and should have mechanisms in place to recover from errors.\\n\\n4. **Explainable and Interpretable**: Users should be able to understand how AI systems arrive at their conclusions. This is crucial for building trust and ensuring that users can make informed decisions based on AI outputs.\\n\\n5. **Fair with Harmful Bias Managed**: AI systems should actively work to identify and mitigate biases that could lead to unfair treatment of individuals or groups. This includes ensuring that the data used to train AI models is representative and free from harmful stereotypes.\\n\\n6. **Valid and Reliable**: AI systems must produce accurate and consistent results. This involves regular testing and validation to ensure that the systems perform as intended across different scenarios.\\n\\n7. **Information Integrity**: AI systems should provide high-integrity information that can be trusted. This means distinguishing between fact and opinion, being transparent about the sources of information, and ensuring that the information is accurate and reliable.\\n\\n8. **Environmental Impact Consideration**: AI systems should be designed with an awareness of their environmental footprint, including energy consumption and carbon emissions associated with their training and operation (Source: National Institute of Standards and Technology, 2023, p. [specific page number not provided]).\\n\\nThese characteristics are essential for fostering trust in AI technologies and ensuring that they are used ethically and responsibly.',\n",
+       " 'source_documents': [Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 11, '_id': '5acf90a4d7bb43b4a0eb744ffc20429f', '_collection_name': 'Midterm Embedding Eval'}, page_content='8 Trustworthy AI Characteristics: Accountable and Transparent, Privacy Enhanced, Safe, Secure and \\nResilient \\n2.5. Environmental Impacts \\nTraining, maint aining, and operating (running inference on) GAI systems are resource -intensive activities , \\nwith potentially large energy and environmental footprints. Energy and carbon emissions vary based on \\nwhat is being done with the GAI model (i.e., pre -training, fine -tuning, inference), the modality of the \\ncontent , hardware used, and type of task or application . \\nCurrent e stimates suggest that training a single transformer LLM can emit as much carbon as 300 round-\\ntrip flights between San Francisco and New York. In a study comparing energy consumption and carbon \\nemissions for LLM inference, generative tasks ( e.g., text summarization) were found to be more energy - \\nand carbon -i ntensive th an discriminative or non- generative tasks (e.g., text classification). \\nMethods for creating smaller versions of train ed models, such as model distillation or compression, \\ncould reduce environmental impacts at inference time, but training and tuning such models may still \\ncontribute to their environmental impacts . Currently there is no agreed upon method to estimate \\nenvironmental impacts from GAI . \\nTrustworthy AI Characteristics: Accountable and Transparent, Safe \\n2.6. Harmful Bias and Homogenization \\nBias exists in many forms and can become ingrained in automated systems. AI systems , including GAI \\nsystems, can increase the speed and scale at which harmful biases manifest and are acted upon, \\npotentially perpetuati ng and amplify ing harms to individuals, groups, communities, organizations, and \\nsociety . For example, when prompted to generate images of CEOs, doctors, lawyers, and judges, current \\ntext-to-image models underrepresent women and/or racial minorities , and people with disabilities . \\nImage generator models have also produce d biased or stereotyped output for various demographic'),\n",
+       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 59, '_id': '6860cf964d8240dcb54e6868b373eb14', '_collection_name': 'Midterm Embedding Eval'}, page_content='National Institute of Standards and Technology (2024) Adversarial Machine Learning: A Taxonomy and \\nTerminology of Attacks and Mitigations https://csrc.nist.gov/pubs/ai/100/2/e2023/final \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework . \\nhttps://www.nist.gov/itl/ai -risk-management -framework \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework, Chapter 3: AI \\nRisks and Trustworthiness. \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Foundational_Information/3- sec-characteristics \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework , Chapter 6 : AI \\nRMF Profiles. https://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Core_And_Profiles/6 -sec-profile \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework, Appendix A: \\nDescriptions of AI Actor Tasks . \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Appendices/Appendix_A#:~:text=AI%20actors%\\n20in%20this%20category,data%20providers%2C%20system%20funders%2C%20product'),\n",
+       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 12, '_id': '0ad84096cbc64d01b44ef852fc024e0f', '_collection_name': 'Midterm Embedding Eval'}, page_content='There may also be concerns about emotional entanglement between humans and GAI systems, which \\ncould lead to negative psychological impacts . \\nTrustworthy AI Characteristics: Accountable and Transparent, Explainable and Interpretable, Fair with \\nHarmful Bias Managed, Privacy Enhanced, Safe , Valid and Reliable \\n2.8. Information Integrity \\nInformation integrity describes the “ spectrum of information and associated patterns of its creation, \\nexchange, and consumption in society .” High-integrity information can be trusted; “distinguishes fact \\nfrom fiction, opinion, and inference; acknowledges uncertainties; and is transparent about its level of \\nvetting. This information can be linked to the original source(s) with appropriate evidence. High- integrity \\ninformation is also accurate and reliable, can be verified and authenticated, has a clear chain of custody, \\nand creates reasonable expectations about when its validity may expire. ”11 \\n \\n \\n11 This definition of information integrity is derived from the 2022 White House Roadmap for Researchers on \\nPriorities Related to Information Integrity Research and Development.'),\n",
+       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 44, '_id': '1354701fd1f14d3b88b2d1fe0d433bb1', '_collection_name': 'Midterm Embedding Eval'}, page_content='generation of artificially intelligent partners.95 The National Science Foundation’s program on Fairness in \\nArtificial Intelligence also includes a specific interest in research foundations for explainable AI.96\\n45'),\n",
+       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 21, '_id': 'c5c0419fd2424b0eaa852cd42b94375e', '_collection_name': 'Midterm Embedding Eval'}, page_content=\"SAFE AND EFFECTIVE \\nSYSTEMS \\nHOW THESE PRINCIPLES CAN MOVE INTO PRACTICE\\nReal-life examples of how these principles can become reality, through laws, policies, and practical \\ntechnical and sociotechnical approaches to protecting rights, opportunities, and access. \\nSome U.S government agencies have developed specific frameworks for ethical use of AI \\nsystems. The Department of Energy (DOE) has activated the AI Advancement Council that oversees coordina -\\ntion and advises on implementation of the DOE AI Strategy and addresses issues and/or escalations on the \\nethical use and development of AI systems.20 The Department of Defense has adopted Artificial Intelligence \\nEthical Principles, and tenets for Responsible Artificial Intelligence specifically tailored to its national \\nsecurity and defense activities.21 Similarl y, the U.S. Intelligence Community (IC) has developed the Principles \\nof Artificial Intelligence Ethics for the Intelligence Community to guide personnel on whether and how to \\ndevelop and use AI in furtherance of the IC's mission, as well as an AI Ethics Framework to help implement \\nthese principles.22\\nThe National Science Foundation (NSF) funds extensive research to help foster the \\ndevelopment of automated systems that adhere to and advance their safety, security and \\neffectiveness. Multiple NSF programs support research that directly addresses many of these principles: \\nthe National AI Research Institutes23 support research on all aspects of safe, trustworth y, fai r, and explainable \\nAI algorithms and systems; the Cyber Physical Systems24 program supports research on developing safe \\nautonomous and cyber physical systems with AI components; the Secure and Trustworthy Cyberspace25 \\nprogram supports research on cybersecurity and privacy enhancing technologies in automated systems; the \\nFormal Methods in the Field26 program supports research on rigorous formal verification and analysis of\"),\n",
+       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 35, '_id': '4a03397cb66849518171d94b407075a5', '_collection_name': 'Midterm Embedding Eval'}, page_content='32 MEASURE 2.6: The AI system is evaluated regularly for safety risks – as identified in the MAP function. The AI system to be \\ndeployed is demonstrated to be safe, its residual negative risk does not exceed the risk tolerance, and it can fail safely, p articularly if \\nmade to operate beyond its knowledge limits. Safety metrics reflect system reliability and robustness, real- time monitoring, and \\nresponse times for AI system failures. \\nAction ID Suggested Action GAI Risks \\nMS-2.6-001 Assess adverse impacts , including health and wellbeing impacts for value chain \\nor other AI Actors that are exposed to sexually explicit, offensive , or violent \\ninformation during GAI training and maintenance. Human -AI Configuration ; Obscene, \\nDegrading, and/or Abusive \\nContent ; Value Chain and \\nComponent Integration; Dangerous , Violent, or Hateful \\nContent \\nMS-2.6-002 Assess existence or levels of harmful bias , intellectual property infringement, \\ndata privacy violations, obscenity, extremism, violence, or CBRN information in \\nsystem training data. Data Privacy ; Intellectual Property ; \\nObscene, Degrading, and/or Abusive Content ; Harmful Bias and \\nHomogenization ; Dangerous , \\nViolent, or Hateful Content ; CBRN \\nInformation or Capabilities \\nMS-2.6-003 Re-evaluate safety features of fine -tuned models when the negative risk exceeds \\norganizational risk tolerance. Dangerous , Violent, or Hateful \\nContent \\nMS-2.6-004 Review GAI system outputs for validity and safety: Review generated code to assess risks that may arise from unreliable downstream decision -making. Value Chain and Component \\nIntegration ; Dangerous , Violent, or \\nHateful Content \\nMS-2.6-005 Verify that GAI system architecture can monitor outputs and performance, and \\nhandle, recover from, and repair errors when security anomalies, threats and impacts are detected. Confabulation ; Information \\nIntegrity ; Information Security'),\n",
+       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 34, '_id': '9c2f57bf30814277bf1a20c88ed563be', '_collection_name': 'Midterm Embedding Eval'}, page_content='31 MS-2.3-004 Utilize a purpose -built testing environment such as NIST Dioptra to empirically \\nevaluate GAI trustworthy characteristics. CBRN Information or Capabilities ; \\nData Privacy ; Confabulation ; \\nInformation Integrity ; Information \\nSecurity ; Dangerous , Violent, or \\nHateful Content ; Harmful Bias and \\nHomogenization \\nAI Actor Tasks: AI Deployment, TEVV \\n \\nMEASURE 2.5: The AI system to be deployed is demonstrated to be valid and reliable. Limitations of the generalizability beyond the \\nconditions under which the technology was developed are documented. \\nAction ID Suggested Action Risks \\nMS-2.5-001 Avoid extrapolating GAI system performance or capabilities from narrow, non -\\nsystematic, and anecdotal assessments. Human -AI Configuration ; \\nConfabulation \\nMS-2.5-002 Document the extent to which human domain knowledge is employed to \\nimprove GAI system performance, via, e.g., RLHF, fine -tuning, retrieval-\\naugmented generation, content moderation, business rules. Human -AI Configuration \\nMS-2.5-003 Review and verify sources and citations in GAI system outputs during pre -\\ndeployment risk measurement and ongoing monitoring activities. Confabulation \\nMS-2.5-004 Track and document instances of anthropomorphization (e.g., human images, \\nmentions of human feelings, cyborg imagery or motifs) in GAI system interfaces. Human -AI Configuration \\nMS-2.5-0 05 Verify GAI system training data and TEVV data provenance, and that fine -tuning \\nor retrieval- augmented generation data is grounded. Information Integrity \\nMS-2.5-0 06 Regularly review security and safety guardrails, especially if the GAI system is \\nbeing operated in novel circumstances. This includes reviewing reasons why the \\nGAI system was initially assessed as being safe to deploy. Information Security ; Dangerous , \\nViolent, or Hateful Content \\nAI Actor Tasks: Domain Experts, TEVV'),\n",
+       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 41, '_id': '318ce3f5be254529966a22fbe12f9e13', '_collection_name': 'Midterm Embedding Eval'}, page_content='Information Integrity \\nMS-3.3-003 Evaluate potential biases and stereotypes that could emerge from the AI -\\ngenerated content using appropriate methodologies including computational testing methods as well as evaluating structured feedback input. Harmful Bias and Homogenization')]}"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "multiquery_ft_embedding_rag_chain.invoke({\"question\": \"What are Trustworthy AI Characteristics?\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "answers = []\n",
+    "contexts = []\n",
+    "\n",
+    "for question in test_questions:\n",
+    "    response = multiquery_ft_embedding_rag_chain.invoke({\"question\" : question})\n",
+    "    answers.append(response[\"answer\"])\n",
+    "    contexts.append([context.page_content for context in response[\"source_documents\"]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import Dataset\n",
+    "\n",
+    "multiquery_ft_embedding_dataset = Dataset.from_dict({\n",
+    "    \"question\" : test_questions,\n",
+    "    \"answer\" : answers,\n",
+    "    \"contexts\" : contexts,\n",
+    "    \"ground_truth\" : test_groundtruths\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'question': 'What is the significance of providing notice and explanation as a legal requirement in the context of automated systems?',\n",
+       " 'answer': \"Providing notice and explanation as a legal requirement in the context of automated systems is significant for several reasons:\\n\\n1. **Transparency**: It ensures that individuals are aware when automated systems are being used to make decisions that affect them. This transparency is crucial for building trust between the public and the entities deploying these systems.\\n\\n2. **Accountability**: By clearly identifying the entities responsible for the design and use of automated systems, it holds them accountable for the decisions made by these systems. This accountability is essential for addressing any potential harms or errors that may arise from automated decision-making.\\n\\n3. **Informed Decision-Making**: Notice and explanation allow individuals to understand how and why decisions are made, which is vital for them to contest or appeal those decisions if necessary. For example, if someone is denied a loan or a job, knowing the criteria used by the automated system can help them address any inaccuracies or biases.\\n\\n4. **Protection of Rights**: Automated systems can significantly impact people's lives, from employment opportunities to legal outcomes. Providing notice and explanations helps protect individuals' rights by ensuring they have the information needed to understand and challenge decisions that may adversely affect them.\\n\\n5. **Encouraging Fairness**: When individuals are informed about the use of automated systems and the rationale behind decisions, it can help mitigate biases and promote fairness. This is particularly important in sensitive areas like criminal justice, healthcare, and employment.\\n\\n6. **Facilitating Oversight**: Clear notice and explanations enable oversight bodies, researchers, and the public to scrutinize automated systems, ensuring they operate fairly and effectively. This oversight is crucial for identifying and correcting any issues that may arise.\\n\\nIn summary, the legal requirement for notice and explanation in automated systems is essential for ensuring transparency, accountability, and fairness, ultimately protecting individuals' rights and fostering trust in these technologies (Source: [document name], p. [specific page number not provided]).\",\n",
+       " 'contexts': ['NOTICE & \\nEXPLANATION \\nWHAT SHOULD BE EXPECTED OF AUTOMATED SYSTEMS\\nThe expectations for automated systems are meant to serve as a blueprint for the development of additional \\ntechnical standards and practices that are tailored for particular sectors and contexts. \\nAn automated system should provide demonstrably clear, timely, understandable, and accessible notice of use, and \\nexplanations as to how and why a decision was made or an action was taken by the system. These expectations are explained below. \\nProvide clear, timely, understandable, and accessible notice of use and explanations \\nGenerally accessible plain language documentation. The entity responsible for using the automated \\nsystem should ensure that documentation describing the overall system (including any human components) is \\npublic and easy to find. The documentation should describe, in plain language, how the system works and how \\nany automated component is used to determine an action or decision. It should also include expectations about \\nreporting described throughout this framework, such as the algorithmic impact assessments described as \\npart of Algorithmic Discrimination Protections. \\nAccount able. Notices should clearly identify the entity r esponsible for designing each component of the \\nsystem and the entity using it. \\nTimely and up-to-date. Users should receive notice of the use of automated systems in advance of using or \\nwhile being impacted by the technolog y. An explanation should be available with the decision itself, or soon \\nthereafte r. Notice should be kept up-to-date and people impacted by the system should be notified of use case \\nor key functionality changes. \\nBrief and clear. Notices and explanations should be assessed, such as by research on users’ experiences, \\nincluding user testing, to ensure that the people using or impacted by the automated system are able to easily',\n",
+       "  'NOTICE & \\nEXPLANATION \\nWHY THIS PRINCIPLE IS IMPORTANT\\nThis section provides a brief summary of the problems which the principle seeks to address and protect \\nagainst, including illustrative examples. \\nAutomated systems now determine opportunities, from employment to credit, and directly shape the American \\npublic’s experiences, from the courtroom to online classrooms, in ways that profoundly impact people’s lives. But this expansive impact is not always visible. An applicant might not know whether a person rejected their resume or a hiring algorithm moved them to the bottom of the list. A defendant in the courtroom might not know if a judge deny\\n-\\ning their bail is informed by an automated system that labeled them “high risk.” From correcting errors to contesting decisions, people are often denied the knowledge they need to address the impact of automated systems on their lives. Notice and explanations also serve an important safety and efficacy purpose, allowing experts to verify the reasonable\\n-\\nness of a recommendation before enacting it. \\nIn order to guard against potential harms, the American public needs to know if an automated system is being used. Clear, brief, and understandable notice is a prerequisite for achieving the other protections in this framework. Like\\n-\\nwise, the public is often unable to ascertain how or why an automated system has made a decision or contributed to a particular outcome. The decision-making processes of automated systems tend to be opaque, complex, and, therefore, unaccountable, whether by design or by omission. These factors can make explanations both more challenging and more important, and should not be used as a pretext to avoid explaining important decisions to the people impacted by those choices. In the context of automated systems, clear and valid explanations should be recognized as a baseline requirement.',\n",
+       "  'or label to ensure the goal of the automated system is appropriately identified and measured. Additionally , \\njustification should be documented for each data attribute and source to explain why it is appropriate to use \\nthat data to inform the results of the automated system and why such use will not violate any applicable laws. \\nIn cases of high-dimensional and/or derived attributes, such justifications can be provided as overall \\ndescriptions of the attribute generation process and appropriateness. \\n19',\n",
+       "  'Meaningful access to examine the system. Designers, developers, and deployers of automated \\nsystems should consider limited waivers of confidentiality (including those related to trade secrets) where necessary in order to provide meaningful oversight of systems used in sensitive domains, incorporating mea\\n-\\nsures to protect intellectual property and trade secrets from unwarranted disclosure as appropriate. This includes (potentially private and protected) meaningful access to source code, documentation, and related data during any associated legal discovery, subject to effective confidentiality or court orders. Such meaning\\n-\\nful access should include (but is not limited to) adhering to the principle on Notice and Explanation using the highest level of risk so the system is designed with built-in explanations; such systems should use fully-trans\\n-\\nparent models where the model itself can be understood by people needing to directly examine it. \\nDemonstrate access to human alternatives, consideration, and fallback \\nReporting. Reporting should include an assessment of timeliness and the extent of additional burden for human alternatives, aggregate statistics about who chooses the human alternative, along with the results of the assessment about brevity, clarity, and accessibility of notice and opt-out instructions. Reporting on the accessibility, timeliness, and effectiveness of human consideration and fallback should be made public at regu\\n-',\n",
+       "  \"Providing notice has long been a standard practice, and in many cases is a legal requirement, when, for example, making a video recording of someone (outside of a law enforcement or national security context). In some cases, such as credit, lenders are required to provide notice and explanation to consumers. Techniques used to automate the process of explaining such systems are under active research and improvement and such explanations can take many forms. Innovative companies and researchers are rising to the challenge and creating and deploying explanatory systems that can help the public better understand decisions that impact them. \\nWhile notice and explanation requirements are already in place in some sectors or situations, the American public deserve to know consistently and across sectors if an automated system is being used in a way that impacts their rights, opportunities, or access. This knowledge should provide confidence in how the public is being treated, and trust in the validity and reasonable use of automated systems. \\n• A lawyer representing an older client with disabilities who had been cut off from Medicaid-funded home\\nhealth-care assistance couldn't determine why\\n, especially since the decision went against historical access\\npractices. In a court hearing, the lawyer learned from a witness that the state in which the older client\\nlived \\nhad recently adopted a new algorithm to determine eligibility.83 The lack of a timely explanation made it\\nharder \\nto understand and contest the decision.\\n•\\nA formal child welfare investigation is opened against a parent based on an algorithm and without the parent\\never \\nbeing notified that data was being collected and used as part of an algorithmic child maltreatment\\nrisk assessment.84 The lack of notice or an explanation makes it harder for those performing child\\nmaltreatment assessments to validate the risk assessment and denies parents knowledge that could help them\\ncontest a decision.\\n41\",\n",
+       "  \"find notices and explanations, read them quickl y, and understand and act on them. This includes ensuring that \\nnotices and explanations are accessible to users with disabilities and are available in the language(s) and read-\\ning level appropriate for the audience. Notices and explanations may need to be available in multiple forms, \\n(e.g., on pape r, on a physical sign, or online), in order to meet these expectations and to be accessible to the \\nAmerican public. \\nProvide explanations as to how and why a decision was made or an action was taken by an \\nautomated system \\nTailored to the purpose. Explanations should be tailored to the specific purpose for which the user is \\nexpected to use the explanation, and should clearly state that purpose. An informational explanation might differ from an explanation provided to allow for the possibility of recourse, an appeal, or one provided in the context of a dispute or contestation process. For the purposes of this framework, 'explanation' should be construed broadly. An explanation need not be a plain-language statement about causality but could consist of any mechanism that allows the recipient to build the necessary understanding and intuitions to achieve the stated purpose. Tailoring should be assessed (e.g., via user experience research). \\nTailored to the target of the explanation. Explanations should be targeted to specific audiences and clearly state that audience. An explanation provided to the subject of a decision might differ from one provided to an advocate, or to a domain expert or decision maker. Tailoring should be assessed (e.g., via user experience research). \\n43\"],\n",
+       " 'ground_truth': 'Providing notice and explanation as a legal requirement in the context of automated systems is significant because it allows individuals to understand how automated systems are impacting their lives. It helps in correcting errors, contesting decisions, and verifying the reasonableness of recommendations before enacting them. Clear and valid explanations are essential to ensure transparency, accountability, and trust in the use of automated systems across various sectors.'}"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "multiquery_ft_embedding_dataset[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from ragas import evaluate\n",
+    "from ragas.metrics import (\n",
+    "    faithfulness,\n",
+    "    answer_relevancy,\n",
+    "    answer_correctness,\n",
+    "    context_recall,\n",
+    "    context_precision,\n",
+    ")\n",
+    "\n",
+    "metrics = [\n",
+    "    faithfulness,\n",
+    "    answer_relevancy,\n",
+    "    context_recall,\n",
+    "    context_precision,\n",
+    "    answer_correctness,\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "190157ee020d4f2191f04a2682ba8737",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Evaluating:   0%|          | 0/120 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "multiquery_ft_embedding_results = evaluate(multiquery_ft_embedding_dataset, metrics)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'faithfulness': 0.8684, 'answer_relevancy': 0.9558, 'context_recall': 0.9444, 'context_precision': 0.9537, 'answer_correctness': 0.6034}"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "multiquery_ft_embedding_results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "multiquery_ft_embedding_results_df = multiquery_ft_embedding_results.to_pandas()\n",
+    "multiquery_ft_embedding_results_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "multiquery_ft_embedding_results_df.to_csv(\"multiquery_ft_embedding_ragas_results.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "multiquery_ft_embedding_metrics_df = pd.DataFrame(list(multiquery_ft_embedding_results.items()), columns=['Metric', 'Fine-Tune Embedding'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Metric</th>\n",
+       "      <th>Fine-Tune Embedding</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>faithfulness</td>\n",
+       "      <td>0.868351</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>answer_relevancy</td>\n",
+       "      <td>0.955777</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>context_recall</td>\n",
+       "      <td>0.944444</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>context_precision</td>\n",
+       "      <td>0.953668</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>answer_correctness</td>\n",
+       "      <td>0.603407</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "               Metric  Fine-Tune Embedding\n",
+       "0        faithfulness             0.868351\n",
+       "1    answer_relevancy             0.955777\n",
+       "2      context_recall             0.944444\n",
+       "3   context_precision             0.953668\n",
+       "4  answer_correctness             0.603407"
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "multiquery_ft_embedding_metrics_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "multiquery_ft_embedding_metrics_df.to_csv(\"multiquery_ft_embedding_metrics.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Metric</th>\n",
+       "      <th>Baseline</th>\n",
+       "      <th>Fine-Tune Embedding</th>\n",
+       "      <th>Baseline -> Fine-Tune Embedding</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>faithfulness</td>\n",
+       "      <td>0.895359</td>\n",
+       "      <td>0.868351</td>\n",
+       "      <td>-0.027007</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>answer_relevancy</td>\n",
+       "      <td>0.955419</td>\n",
+       "      <td>0.955777</td>\n",
+       "      <td>0.000358</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>context_recall</td>\n",
+       "      <td>0.934028</td>\n",
+       "      <td>0.944444</td>\n",
+       "      <td>0.010417</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>context_precision</td>\n",
+       "      <td>0.937500</td>\n",
+       "      <td>0.953668</td>\n",
+       "      <td>0.016168</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>answer_correctness</td>\n",
+       "      <td>0.629267</td>\n",
+       "      <td>0.603407</td>\n",
+       "      <td>-0.025861</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "               Metric  Baseline  Fine-Tune Embedding  \\\n",
+       "0        faithfulness  0.895359             0.868351   \n",
+       "1    answer_relevancy  0.955419             0.955777   \n",
+       "2      context_recall  0.934028             0.944444   \n",
+       "3   context_precision  0.937500             0.953668   \n",
+       "4  answer_correctness  0.629267             0.603407   \n",
+       "\n",
+       "   Baseline -> Fine-Tune Embedding  \n",
+       "0                        -0.027007  \n",
+       "1                         0.000358  \n",
+       "2                         0.010417  \n",
+       "3                         0.016168  \n",
+       "4                        -0.025861  "
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_baseline_ft_embeddings = pd.merge(baseline_metrics, multiquery_ft_embedding_metrics_df, on='Metric')\n",
+    "\n",
+    "df_baseline_ft_embeddings['Baseline -> Fine-Tune Embedding'] = df_baseline_ft_embeddings['Fine-Tune Embedding'] - df_baseline_ft_embeddings['Baseline']\n",
+    "\n",
+    "df_baseline_ft_embeddings"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
Retrieval_Strat_Eval.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
app.py
CHANGED
@@ -33,8 +33,8 @@ else:
     documents.extend(loader.load())

     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=
-        chunk_overlap=
+        chunk_size=1000,
+        chunk_overlap=200,
     )

     docs = text_splitter.split_documents(documents)
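The hunk above fills in splitter arguments that were previously left blank. As a minimal sketch (not a verbatim excerpt of app.py), the resulting configuration looks like the following, assuming app.py imports the splitter from langchain_text_splitters as the notebook does and that `documents` is the list built by the loaders earlier in the file:

# Sketch of the chunking configuration app.py now pins down.
# Assumption: `documents` holds the pages produced by loader.load() above.
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # max characters per chunk
    chunk_overlap=200,  # characters shared by adjacent chunks to preserve context
)
docs = text_splitter.split_documents(documents)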
contextual_compression_metrics.csv
ADDED
@@ -0,0 +1,6 @@
+Metric,ContextualCompression
+faithfulness,0.7490919748849417
+answer_relevancy,0.9139929840714447
+context_recall,0.7256944444444443
+context_precision,0.9050925925582561
+answer_correctness,0.5706849416579475
contextual_compression_ragas_results.csv
ADDED
The diff for this file is too large to render.
See raw diff
multiquery_ft_embedding_metrics.csv
ADDED
@@ -0,0 +1,6 @@
+Metric,Fine-Tune Embedding
+faithfulness,0.8683514087024626
+answer_relevancy,0.955776890181432
+context_recall,0.9444444444444443
+context_precision,0.9536680366355096
+answer_correctness,0.6034069059659971
multiquery_ft_embedding_ragas_results.csv
ADDED
The diff for this file is too large to render.
See raw diff
multiquery_metrics.csv
ADDED
@@ -0,0 +1,6 @@
+Metric,MultiQuery
+faithfulness,0.8968038152225203
+answer_relevancy,0.9532110694802333
+context_recall,0.890625
+context_precision,0.9207324735259621
+answer_correctness,0.690058377538071
multiquery_ragas_results.csv
ADDED
The diff for this file is too large to render.
See raw diff
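Not part of this commit, but the per-strategy metric CSVs added above are shaped for side-by-side comparison; a minimal sketch of joining them, mirroring the pd.merge at the end of Embedding_Model_Eval.ipynb and assuming the three files sit in the working directory:

# Sketch (not in this commit): merge the metric CSVs on their shared "Metric"
# column to compare retrieval strategies in one table.
from functools import reduce
import pandas as pd

frames = [
    pd.read_csv("multiquery_metrics.csv"),
    pd.read_csv("multiquery_ft_embedding_metrics.csv"),
    pd.read_csv("contextual_compression_metrics.csv"),
]
comparison = reduce(lambda left, right: left.merge(right, on="Metric"), frames)
print(comparison)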