Spaces: Running on CPU Upgrade
Minseok Bae committed on
Commit • 156ef43
1 Parent(s): 2864204

Refine the code style
Files changed:
- app.py +2 -2
- src/backend/evaluate_model.py +3 -7
- src/backend/manage_requests.py +1 -0
- src/backend/model_operations.py +5 -6
- src/backend/util.py +9 -11
- src/display/about.py +3 -4
- src/envs.py +2 -1
app.py CHANGED
@@ -97,7 +97,7 @@ def filter_models(
     if show_deleted:
         filtered_df = df
     else: # Show only still on the hub models
-        filtered_df = df[df[utils.AutoEvalColumn.still_on_hub.name] == True]
+        filtered_df = df[df[utils.AutoEvalColumn.still_on_hub.name]]
 
     type_emoji = [t[0] for t in type_query]
     filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
@@ -181,7 +181,7 @@ with demo:
                 elem_id="leaderboard-table",
                 interactive=False,
                 visible=True,
-                column_widths=["2%", "33%"]
+                column_widths=["2%", "33%"]
             )
 
     # Dummy leaderboard for handling the case when the user uses backspace key
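The first app.py hunk reads as a style cleanup of the still_on_hub filter: the column already holds booleans, so it can be used directly as a mask instead of being compared against True. A minimal sketch with a toy DataFrame (the column names here are illustrative; the real code indexes the column through utils.AutoEvalColumn.still_on_hub.name):

import pandas as pd

# Toy stand-in for the leaderboard dataframe.
df = pd.DataFrame({
    "model": ["a", "b", "c"],
    "still_on_hub": [True, False, True],
})

old_style = df[df["still_on_hub"] == True]  # before: explicit comparison
new_style = df[df["still_on_hub"]]          # after: boolean column used as mask

assert old_style.equals(new_style)  # both select the same rows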
src/backend/evaluate_model.py CHANGED
@@ -69,13 +69,11 @@ class Evaluator:
             dict: A dictionary containing evaluation results.
         """
         try:
-
-            df = pd.read_csv(envs.SAMPLE_DATASET_PATH)
+            df = pd.read_csv(envs.DATASET_PATH)
             generated_summaries_df = self.summary_generator.generate_summaries(df)
 
             avg_summary_len = self.summary_generator.avg_length
             answer_rate = self.summary_generator.answer_rate
-            # error_rate = self.summary_generator.error_rate
 
             hallucination_scores = self.eval_model.evaluate_hallucination(
                 generated_summaries_df)
@@ -83,16 +81,14 @@ class Evaluator:
             hallucination_rate = self.eval_model.hallucination_rate
 
             results = util.format_results(model_name=self.model, revision=self.revision,
-                                          precision=self.precision,
+                                          precision=self.precision,
                                           factual_consistency_rate=factual_consistency_rate,
                                           hallucination_rate=hallucination_rate,
                                           answer_rate=answer_rate,
                                           avg_summary_len=avg_summary_len)
-
             return results
         except FileNotFoundError:
-
-            logging.error(f"File not found: {envs.SAMPLE_DATASET_PATH}")
+            logging.error(f"File not found: {envs.DATASET_PATH}")
             raise
         except Exception as e:
             logging.error(f"Error during evaluation: {e}")
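The evaluator now reads envs.DATASET_PATH (the full leaderboard dataset added in src/envs.py below) instead of envs.SAMPLE_DATASET_PATH, and the FileNotFoundError message follows suit. A small sketch of how one might keep the sample file for local smoke tests; the USE_SAMPLE_DATASET toggle is hypothetical and not part of this commit:

import os
import pandas as pd

# Paths as defined in src/envs.py after this commit.
DATASET_PATH = "src/datasets/leaderboard_dataset.csv"
SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"

# Hypothetical toggle for local runs; the committed code always uses DATASET_PATH.
path = SAMPLE_DATASET_PATH if os.environ.get("USE_SAMPLE_DATASET") else DATASET_PATH
df = pd.read_csv(path)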
src/backend/manage_requests.py CHANGED
@@ -6,6 +6,7 @@ from typing import Optional
 
 from huggingface_hub import HfApi, snapshot_download
 
+
 @dataclass
 class EvalRequest:
     model: str
src/backend/model_operations.py CHANGED
@@ -105,11 +105,11 @@ class SummaryGenerator:
         for index, row in df.iterrows():
             _source = row['text']
             _dataset = row['dataset']
-
+
             system_prompt = envs.SYSTEM_PROMPT
             user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
 
-            while True:
+            while True:
                 try:
                     _summary = generate_summary(self.model, system_prompt,
                                                 user_prompt, self.api_base)
@@ -129,7 +129,7 @@ class SummaryGenerator:
             summary.append(_summary)
             source.append(_source)
             dataset.append(_dataset)
-
+
             time.sleep(1)
 
         self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
@@ -199,10 +199,9 @@ class EvaluationModel:
         Returns:
             list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
         """
-
-        generated_summaries = np.array(summaries_df['summary'])
+        source_summary_pairs = util.create_pairs(summaries_df)
         try:
-            scores = self.model.predict(generated_summaries)
+            scores = self.model.predict(source_summary_pairs)
             self.scores = scores
             return self.scores
         except Exception as e:
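The scoring change in EvaluationModel pairs each summary with its source document via util.create_pairs before calling self.model.predict, so the hallucination model judges the summary against the passage rather than in isolation. A minimal sketch of that call, assuming the model behind envs.HEM_PATH is loaded as a sentence-transformers CrossEncoder whose predict() accepts a list of [source, summary] pairs (an assumption; the loading code is not shown in this diff):

import pandas as pd
from sentence_transformers import CrossEncoder

# Assumption: the evaluation model is used as a CrossEncoder over (source, summary) pairs.
model = CrossEncoder("vectara/hallucination_evaluation_model")

summaries_df = pd.DataFrame({
    "source": ["The cat sat on the mat."],
    "summary": ["A cat is sitting on a mat."],
})

# Same input shape that util.create_pairs produces: [[source, summary], ...]
pairs = [[row["source"], row["summary"]] for _, row in summaries_df.iterrows()]
scores = model.predict(pairs)  # one consistency score per pair
print(scores)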
src/backend/util.py CHANGED
@@ -1,23 +1,21 @@
-def
+def create_pairs(df):
     """
-
+    Creates pairs of source and summary from the dataframe.
 
     Args:
-
+        df (DataFrame): The dataframe containing source and summary columns.
 
     Returns:
-
+        list: A list of pairs [source, summary].
     """
-
-
+    pairs = []
+    for _, row in df.iterrows():
+        pairs.append([row['source'], row['summary']])
 
-    return
-    You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described:'
-    Passage:\n {source_passage}
-    """
+    return pairs
 
 
-def format_results(model_name: str, revision: str, precision: str,
+def format_results(model_name: str, revision: str, precision: str,
                    factual_consistency_rate: float, hallucination_rate: float,
                    answer_rate: float, avg_summary_len: float) -> dict:
     """
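For reference, the new create_pairs helper can be checked in isolation with toy data (the rows below are illustrative only):

import pandas as pd

def create_pairs(df):
    """Creates pairs of source and summary from the dataframe (as added above)."""
    pairs = []
    for _, row in df.iterrows():
        pairs.append([row['source'], row['summary']])
    return pairs

toy = pd.DataFrame({
    "source": ["Passage A", "Passage B"],
    "summary": ["Summary of A", "Summary of B"],
})
print(create_pairs(toy))
# [['Passage A', 'Summary of A'], ['Passage B', 'Summary of B']]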
src/display/about.py CHANGED
@@ -9,15 +9,14 @@ class Task:
 
 
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    hallucination_rate = Task("hallucination_rate",
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    hallucination_rate = Task("hallucination_rate",
                               "hallucination_rate", "Hallucination Rate")
     accuracy = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate")
     answer_rate = Task("answer_rate", "answer_rate", "Answer Rate")
-    average_summary_length = Task("average_summary_length",
+    average_summary_length = Task("average_summary_length",
                                   "average_summary_length", "Average Summary Length")
     # error_rate = Task("error_rate", "error_rate", "Error Rate")
-
 
 
 # Your leaderboard name
src/envs.py CHANGED
@@ -2,6 +2,7 @@ import os
 
 from huggingface_hub import HfApi
 
+
 # replace this with our token
 TOKEN = os.environ.get("HF_TOKEN", None)
 
@@ -21,7 +22,7 @@ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 DEVICE = "cpu"
 API = HfApi(token=TOKEN)
 
-
+DATASET_PATH = "src/datasets/leaderboard_dataset.csv"
 SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"
 HEM_PATH = 'vectara/hallucination_evaluation_model'
 