Update README.md
README.md
CHANGED
@@ -129,6 +129,78 @@ The prompt used is quite strict. This reassures us as to the robustness of the m
| lavita/MedQuAD | 0.95 | 0.81 | 0 | 0 | 14 / 20 tests | 70 |

### Evaluation Code
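The evaluation below relies on [deepeval](https://github.com/confident-ai/deepeval): each answer generated by the model is scored by `gpt-4o-mini` acting as an LLM judge on answer relevancy, correctness (via G-Eval), bias, and toxicity.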
```py
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric, BiasMetric, ToxicityMetric, GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams


def evaluate_llama_alpacare_gpt4(medQA):
    # Define the metrics
    answer_relevancy_metric = AnswerRelevancyMetric(
        threshold=0.7,
        model="gpt-4o-mini",
        include_reason=True
    )

    bias = BiasMetric(
        model="gpt-4o-mini",
        include_reason=True,
        threshold=0.8
    )

    toxicity = ToxicityMetric(
        model="gpt-4o-mini",
        include_reason=True
    )

    correctness_metric = GEval(
        name="Correctness",
        threshold=0.7,
        model="gpt-4o-mini",
        criteria="Determine whether the actual output is factually correct based on the expected output, focusing on medical accuracy and adherence to established guidelines.",
        evaluation_steps=[
            "Check whether the facts in 'actual output' contradict any facts in 'expected output' or established medical guidelines.",
            "Penalize the omission of medical details according to their criticality, especially those that could affect the care provided to the patient or the patient's understanding.",
            "Ensure that the medical terminology and language used are precise and appropriate for a medical context.",
            "Assess whether the response adequately addresses the specific medical question posed.",
            "Vague language or contradicting opinions are acceptable in general contexts, but factual inaccuracies, especially regarding medical data or guidelines, are not."
        ],
        # EXPECTED_OUTPUT is included so the judge can see the reference
        # answer that the criteria compares against.
        evaluation_params=[
            LLMTestCaseParams.INPUT,
            LLMTestCaseParams.ACTUAL_OUTPUT,
            LLMTestCaseParams.EXPECTED_OUTPUT
        ]
    )

    test_cases = []

    # metric = FaithfulnessMetric(
    #     model="gpt-4o-mini",
    #     include_reason=True
    # )

    # Loop through the dataset and build one test case per example
    for example in medQA:
        question = example['Question']
        expected_output = example['Answer']
        question_focus = example['instruction']

        # Generate the actual output with the fine-tuned model
        # (generate_medical_answer is assumed to be defined elsewhere in the repo)
        actual_output = generate_medical_answer(
            instruction=question,
            input=question_focus,
        )

        # Define the test case
        test_case = LLMTestCase(
            input=question,
            actual_output=actual_output,
            expected_output=expected_output,
        )

        test_cases.append(test_case)

    # Score every test case with all four metrics
    evaluate(test_cases, [answer_relevancy_metric, correctness_metric, bias, toxicity])
```
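For reference, a minimal way to run it (a sketch, assuming the `lavita/MedQuAD` data loads with the `Question`, `Answer`, and `instruction` columns that the loop above reads):

```py
from datasets import load_dataset

# Load the dataset from the Hugging Face Hub and evaluate a
# 20-example sample, matching the "14 / 20 tests" entry above.
medQA = load_dataset("lavita/MedQuAD", split="train").select(range(20))

evaluate_llama_alpacare_gpt4(medQA)
```

Running this requires an OpenAI API key, since all four metrics call `gpt-4o-mini` as the judge.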

This llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.