Minseok Bae committed
Commit: 156ef43
Parent: 2864204

Refine the code style

app.py CHANGED
@@ -97,7 +97,7 @@ def filter_models(
     if show_deleted:
         filtered_df = df
     else: # Show only still on the hub models
-        filtered_df = df[df[utils.AutoEvalColumn.still_on_hub.name] == True]
+        filtered_df = df[df[utils.AutoEvalColumn.still_on_hub.name]]
 
     type_emoji = [t[0] for t in type_query]
     filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
@@ -181,7 +181,7 @@ with demo:
             elem_id="leaderboard-table",
             interactive=False,
             visible=True,
-            column_widths=["2%", "33%"]
+            column_widths=["2%", "33%"]
         )
 
     # Dummy leaderboard for handling the case when the user uses backspace key
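Note on the filter change: dropping the explicit == True comparison in favor of indexing with the boolean column directly is the idiomatic pandas form (and what linters flag as E712). A minimal sketch with a toy frame, not the real leaderboard schema:

    import pandas as pd

    # Toy stand-in for the leaderboard dataframe (hypothetical rows).
    df = pd.DataFrame({"model": ["a", "b", "c"],
                       "still_on_hub": [True, False, True]})

    legacy = df[df["still_on_hub"] == True]  # noqa: E712 - old style
    masked = df[df["still_on_hub"]]          # boolean mask, same rows

    assert legacy.equals(masked)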
src/backend/evaluate_model.py CHANGED
@@ -69,13 +69,11 @@ class Evaluator:
             dict: A dictionary containing evaluation results.
         """
         try:
-            # df = pd.read_csv(envs.SOURCE_PATH)
-            df = pd.read_csv(envs.SAMPLE_DATASET_PATH)
+            df = pd.read_csv(envs.DATASET_PATH)
             generated_summaries_df = self.summary_generator.generate_summaries(df)
 
             avg_summary_len = self.summary_generator.avg_length
             answer_rate = self.summary_generator.answer_rate
-            # error_rate = self.summary_generator.error_rate
 
             hallucination_scores = self.eval_model.evaluate_hallucination(
                 generated_summaries_df)
@@ -83,16 +81,14 @@ class Evaluator:
             hallucination_rate = self.eval_model.hallucination_rate
 
             results = util.format_results(model_name=self.model, revision=self.revision,
-                                          precision=self.precision,
+                                          precision=self.precision,
                                           factual_consistency_rate=factual_consistency_rate,
                                           hallucination_rate=hallucination_rate,
                                           answer_rate=answer_rate,
                                           avg_summary_len=avg_summary_len)
-
             return results
         except FileNotFoundError:
-            # logging.error(f"File not found: {envs.SOURCE_PATH}")
-            logging.error(f"File not found: {envs.SAMPLE_DATASET_PATH}")
+            logging.error(f"File not found: {envs.DATASET_PATH}")
            raise
         except Exception as e:
             logging.error(f"Error during evaluation: {e}")
src/backend/manage_requests.py CHANGED
@@ -6,6 +6,7 @@ from typing import Optional
 
 from huggingface_hub import HfApi, snapshot_download
 
+
 @dataclass
 class EvalRequest:
     model: str
src/backend/model_operations.py CHANGED
@@ -105,11 +105,11 @@ class SummaryGenerator:
         for index, row in df.iterrows():
             _source = row['text']
             _dataset = row['dataset']
-
+
             system_prompt = envs.SYSTEM_PROMPT
             user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
 
-            while True:
+            while True:
                 try:
                     _summary = generate_summary(self.model, system_prompt,
                                                 user_prompt, self.api_base)
@@ -129,7 +129,7 @@ class SummaryGenerator:
             summary.append(_summary)
             source.append(_source)
             dataset.append(_dataset)
-
+
             time.sleep(1)
 
         self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
@@ -199,10 +199,9 @@ class EvaluationModel:
         Returns:
            list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
         """
-        source_docs = np.array(summaries_df['source'])
-        generated_summaries = np.array(summaries_df['summary'])
+        source_summary_pairs = util.create_pairs(summaries_df)
         try:
-            scores = self.model.predict(source_docs, generated_summaries)
+            scores = self.model.predict(source_summary_pairs)
             self.scores = scores
             return self.scores
         except Exception as e:
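The scoring path now builds [source, summary] pairs and scores them with a single self.model.predict call. Assuming the checkpoint at envs.HEM_PATH is loaded through the sentence-transformers CrossEncoder wrapper (an assumption; the constructor is not shown in this diff), pair-based scoring looks roughly like this sketch:

    from sentence_transformers import CrossEncoder

    # Assumption: the HHEM checkpoint is wrapped as a CrossEncoder, which scores
    # a list of [premise, hypothesis] pairs in one predict() call.
    model = CrossEncoder("vectara/hallucination_evaluation_model")

    pairs = [
        ["The cat sat on the mat.", "A cat was sitting on a mat."],    # consistent
        ["The cat sat on the mat.", "The dog chased a ball outside."], # hallucinated
    ]

    scores = model.predict(pairs)  # one consistency score per pair, higher = more factually consistent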
src/backend/util.py CHANGED
@@ -1,23 +1,21 @@
-def generate_prompt(source_passage: str) -> str:
+def create_pairs(df):
     """
-    Generates a prompt for a chatbot to summarize a given passage.
+    Creates pairs of source and summary from the dataframe.
 
     Args:
-        source_passage (str): The passage to be summarized.
+        df (DataFrame): The dataframe containing source and summary columns.
 
     Returns:
-        str: A formatted prompt string for the chatbot.
+        list: A list of pairs [source, summary].
     """
-    if not source_passage:
-        raise ValueError("Source passage is empty.")
+    pairs = []
+    for _, row in df.iterrows():
+        pairs.append([row['source'], row['summary']])
 
-    return f"""You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided.
-    You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described:'
-    Passage:\n {source_passage}
-    """
+    return pairs
 
 
-def format_results(model_name: str, revision: str, precision: str,
+def format_results(model_name: str, revision: str, precision: str,
                    factual_consistency_rate: float, hallucination_rate: float,
                    answer_rate: float, avg_summary_len: float) -> dict:
     """
src/display/about.py CHANGED
@@ -9,15 +9,14 @@ class Task:
 
 
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    hallucination_rate = Task("hallucination_rate",
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    hallucination_rate = Task("hallucination_rate",
                               "hallucination_rate", "Hallucination Rate")
     accuracy = Task("factual_consistency_rate", "factual_consistency_rate", "Factual Consistency Rate")
     answer_rate = Task("answer_rate", "answer_rate", "Answer Rate")
-    average_summary_length = Task("average_summary_length",
+    average_summary_length = Task("average_summary_length",
                                   "average_summary_length", "Average Summary Length")
     # error_rate = Task("error_rate", "error_rate", "Error Rate")
-
 
 
 # Your leaderboard name
src/envs.py CHANGED
@@ -2,6 +2,7 @@ import os
 
 from huggingface_hub import HfApi
 
+
 # replace this with our token
 TOKEN = os.environ.get("HF_TOKEN", None)
 
@@ -21,7 +22,7 @@ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 DEVICE = "cpu"
 API = HfApi(token=TOKEN)
 
-SOURCE_PATH = "src/datasets/leaderboard_dataset.csv"
+DATASET_PATH = "src/datasets/leaderboard_dataset.csv"
 SAMPLE_DATASET_PATH = "src/datasets/sample_dataset.csv"
 HEM_PATH = 'vectara/hallucination_evaluation_model'
 