Yeyito committed on
Commit c13858c
Parent: 4996c9a

Trying to stop OOMs on MMLU and GSM8K by halving seq len
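The change caps how much text is generated for the two datasets that were running out of memory. A minimal sketch of the intended guard, assuming the dataset names and the halving factor that appear in the diff below (the helper name is illustrative, not part of the repo):

    def capped_new_tokens(sentence: str, prefix_len: int, data_name: str) -> int:
        # Full continuation budget for most datasets.
        budget = len(sentence.split()) - prefix_len
        # Halve the generation budget for the datasets that were hitting OOMs.
        if data_name in ("cais/mmlu", "gsm8k"):
            budget //= 2
        return max(1, budget)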

Files changed (1)
  1. app.py +231 -307
app.py CHANGED
@@ -1,311 +1,235 @@
1
- import gradio as gr
2
- import subprocess
3
  import os
4
  import sys
5
- import time
6
- import pandas as pd
7
- from threading import Thread
8
-
9
- # Add the path to the "src" directory of detect-pretrain-code-contamination to the sys.path
10
- project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "detect-pretrain-code-contamination"))
11
- src_dir = os.path.join(project_root, "src")
12
- sys.path.insert(0, src_dir)
13
-
14
- import run as evaluator # Import the run module
15
- from src.css_html import custom_css
16
- from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT, SUBMISSION_TEXT_2
17
- from src.envs import API, H4_TOKEN, REPO_ID
18
- from huggingface_hub import HfApi
19
- from src.utils import (
20
- AutoEvalColumn,
21
- fields,
22
- is_model_on_hub,
23
- make_clickable_names,
24
- styled_error,
25
- styled_message,
26
- )
27
-
28
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
29
- TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
30
- COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
31
- TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
32
-
33
- # CONFIGURATION:
34
- ref_model = "huggyllama/llama-7b"
35
- test_datasets = ["truthful_qa","cais/mmlu","ai2_arc","gsm8k","Rowan/hellaswag","winogrande"]
36
- modelQueue = []
37
-
38
- def restart_space(): # The dumbest update function to ever exist; I'm sobbing in tears, as I've tried to make Gradio update the leaderboard literally any other way.
39
- API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
40
-
41
-
42
- def save_to_txt(model, results, model_type):
43
- file_path = "data/code_eval_board.csv"
44
-
45
- with open(file_path, "a") as f:
46
- f.write(f"\n{model_type},{model}," + str(results["arc"]) + "," + str(results["hellaswag"]) + "," + str(results["mmlu"]) + "," + str(results["truthfulQA"]) + "," + str(results["winogrande"]) + "," + str(results["gsm8k"]))
47
- f.close()
48
-
49
- restart_space()
50
-
51
- def run_test(model,ref_model,data):
52
- print(f"|| TESTING {data} ||")
53
- return evaluator.main(
54
- target_model=f"{model}",
55
- ref_model=f"{ref_model}",
56
- output_dir="out",
57
- data=f"{data}",
58
- length=64,
59
- key_name="input",
60
- ratio_gen=0.4
61
- ) # Call the main function in detect-pretrain-code-contamination/src/run.py
62
-
63
- def evaluate(model,model_type):
64
- global ref_model
65
- print(f"|| EVALUATING {model} ||")
66
- results = {
67
- "arc": run_test(model, ref_model, test_datasets[2]),
68
- "hellaswag": run_test(model, ref_model, test_datasets[4]),
69
- "mmlu": run_test(model, ref_model, test_datasets[1]),
70
- "truthfulQA": run_test(model, ref_model, test_datasets[0]),
71
- "winogrande": run_test(model, ref_model, test_datasets[5]),
72
- "gsm8k": run_test(model, ref_model, test_datasets[3]),
73
- "ref_model": ref_model,
74
- }
75
-
76
- # Save to .txt file in /Evaluations/{model}
77
- save_to_txt(model, results, model_type)
78
- return "\n".join([f"{k}:{results[k]}" for k in results])
79
-
80
- def worker_thread():
81
- global modelQueue, server
82
- while True:
83
- for submission in modelQueue:
84
- evaluate(submission[0],submission[1].split(" ")[0])
85
- modelQueue.pop(modelQueue.index(submission))
86
- time.sleep(1)
87
- time.sleep(1)
88
-
89
- def queue(model,model_type):
90
- global modelQueue
91
- modelQueue.append([model,model_type])
92
- print(f"QUEUE:\n{modelQueue}")
93
-
94
-
95
- ### bigcode/bigcode-models-leaderboard
96
- def add_new_eval(
97
- model: str,
98
- revision: str,
99
- precision: str,
100
- model_type: str,
101
- ):
102
- precision = precision
103
-
104
- if model_type is None or model_type == "" or model_type == []:
105
- return styled_error("Please select a model type.")
106
- print(model_type)
107
- # check the model actually exists before adding the eval
108
- if revision == "":
109
- revision = "main"
110
-
111
- model_on_hub, error = is_model_on_hub(model, revision)
112
- if not model_on_hub:
113
- return styled_error(f'Model "{model}" {error}')
114
-
115
- print("Adding new eval")
116
- queue(model,model_type)
117
- return styled_message("Your request has been submitted to the evaluation queue!\n")
118
-
119
- def select_columns(df, columns):
120
- always_here_cols = [
121
- AutoEvalColumn.model_type_symbol.name,
122
- AutoEvalColumn.model.name,
123
- ]
124
- # We use COLS to maintain sorting
125
- filtered_df = df[
126
- always_here_cols + [c for c in COLS if c in df.columns and c in columns]
127
- ]
128
- return filtered_df
129
-
130
-
131
- def filter_items(df, leaderboard_table, query):
132
- if query == "All":
133
- return df[leaderboard_table.columns]
134
  else:
135
- query = query[0] # take only the emoji character
136
- filtered_df = df[(df["T"] == query)]
137
- return filtered_df[leaderboard_table.columns]
138
-
139
- def search_table(df, leaderboard_table, query):
140
- filtered_df = df[(df["Models"].str.contains(query, case=False))]
141
- return filtered_df[leaderboard_table.columns]
142
-
143
- demo = gr.Blocks(css=custom_css)
144
- with demo:
145
- with gr.Row():
146
- gr.Markdown(
147
- """<div style="text-align: center;"><h1> 📄 LLM Contamination Detector </h1></div>\
148
- <br>\
149
- <p>Inspired by the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a> and the <a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">🤗 Big Code Models Leaderboard ⭐</a>, we use an implementation of the paper <a href="https://huggingface.co/papers/2310.16789">Detecting Pretraining Data from Large Language Models</a> found in <a href="https://github.com/swj0419/detect-pretrain-code-contamination/tree/master">this GitHub repo</a> to provide contamination scores for LLMs on the datasets used by the Open LLM Leaderboard.\
150
- This space should NOT be used to flag or accuse models of cheating / being contaminated; instead, it should form part of a holistic assessment by the parties involved.</p>""",
151
- elem_classes="markdown-text",
152
- )
153
-
154
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
155
- with gr.Column():
156
- with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
157
- with gr.TabItem("🔍 Evaluations", id=0):
158
- with gr.Column():
159
- with gr.Accordion("➡️ See filters", open=False):
160
- shown_columns = gr.CheckboxGroup(
161
- choices=[
162
- c
163
- for c in COLS
164
- if c
165
- not in [
166
- AutoEvalColumn.dummy.name,
167
- AutoEvalColumn.model.name,
168
- AutoEvalColumn.model_type_symbol.name,
169
- ]
170
- ],
171
- value=[
172
- c
173
- for c in COLS_LITE
174
- if c
175
- not in [
176
- AutoEvalColumn.dummy.name,
177
- AutoEvalColumn.model.name,
178
- AutoEvalColumn.model_type_symbol.name,
179
- ]
180
- ],
181
- label="",
182
- elem_id="column-select",
183
- interactive=True,
184
- )
185
- # with gr.Column(min_width=780):
186
- with gr.Row():
187
- search_bar = gr.Textbox(
188
- placeholder="🔍 Search for a model and press ENTER...",
189
- show_label=False,
190
- elem_id="search-bar",
191
- )
192
- filter_columns = gr.Radio(
193
- label="⏚ Filter model types",
194
- choices=["All", "🟢 Base", "🔶 Finetuned"],
195
- value="All",
196
- elem_id="filter-columns",
197
- )
198
-
199
- df = pd.read_csv("data/code_eval_board.csv")
200
- leaderboard_df = gr.components.Dataframe(
201
- value=df[
202
- [
203
- AutoEvalColumn.model_type_symbol.name,
204
- AutoEvalColumn.model.name,
205
- ]
206
- + shown_columns.value
207
- ],
208
- headers=[
209
- AutoEvalColumn.model_type_symbol.name,
210
- AutoEvalColumn.model.name,
211
- ]
212
- + shown_columns.value,
213
- datatype=TYPES,
214
- elem_id="leaderboard-table",
215
- interactive=False,
216
- )
217
-
218
- hidden_leaderboard_df = gr.components.Dataframe(
219
- value=df,
220
- headers=COLS,
221
- datatype=["str" for _ in range(len(COLS))],
222
- visible=False,
223
- )
224
-
225
- search_bar.submit(
226
- search_table,
227
- [hidden_leaderboard_df, leaderboard_df, search_bar],
228
- leaderboard_df,
229
- )
230
-
231
- filter_columns.change(
232
- filter_items,
233
- [hidden_leaderboard_df, leaderboard_df, filter_columns],
234
- leaderboard_df,
235
- )
236
-
237
- shown_columns.change(
238
- select_columns,
239
- [hidden_leaderboard_df, shown_columns],
240
- leaderboard_df,
241
- )
242
-
243
- gr.Markdown(
244
- """
245
- **Notes:**
246
- - The Hugging Face team is working on their own implementation of this paper as a Space; I'll be leaving this Space up until that's available.
247
- - Some scores may not be entirely accurate with respect to the cited paper while I work out the kinks and inaccuracies of this implementation.
248
- - For any issues, questions, or comments, either open a discussion in this Space's community tab or message me directly on Discord: yeyito777.
249
- - Make sure to check the pinned discussion in this Space's community tab for implementation details I'm not 100% sure about.
250
- """,
251
- elem_classes="markdown-text",
252
- )
253
-
254
- with gr.TabItem("📝 About", id=2):
255
- gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
256
- with gr.TabItem("🛠️ Submit models", id=3):
257
- gr.Markdown(SUBMISSION_TEXT)
258
- gr.Markdown(
259
- "## 📤 Submit a model here:", elem_classes="markdown-text"
260
- )
261
- with gr.Column():
262
- with gr.Row():
263
- model_name = gr.Textbox(label="Model name")
264
- revision_name = gr.Textbox(
265
- label="revision", placeholder="main"
266
- )
267
- with gr.Row():
268
- precision = gr.Dropdown(
269
- choices=[
270
- "float16",
271
- "bfloat16",
272
- "8bit",
273
- "4bit",
274
- ],
275
- label="Precision",
276
- multiselect=False,
277
- value="float16",
278
- interactive=True,
279
- )
280
- model_type = gr.Dropdown(
281
- choices=["🟢 base", "🔶 instruction-tuned"],
282
- label="Model type",
283
- multiselect=False,
284
- value=None,
285
- interactive=True,
286
- )
287
- submit_button = gr.Button("Submit Eval")
288
- submission_result = gr.Markdown()
289
- submit_button.click(
290
- add_new_eval,
291
- inputs=[model_name, revision_name, precision, model_type],
292
- outputs=[submission_result],
293
- )
294
- gr.Markdown(SUBMISSION_TEXT_2)
295
-
296
- thread = Thread(target=worker_thread)
297
- thread.start()
298
- demo.launch(share=True)
299
-
300
- # Some worries:
301
- # 1. Am I testing things correctly in eval.py, following the template format?
302
-
303
- # 2. Am I choosing the correct splits in run.py? The hierarchy I use is: test > val > train
304
- # (As in: if a test split exists I go with that, then validation, then train; see the sketch after these notes.)
305
-
306
- # 3. I decided to go with winogrande_debiased instead of winogrande_l arbitrarily.
307
- # (Not sure which one open llm leaderboard uses, or what is the standard)
308
-
309
- # 4. I'm unsure why in eval.py we append the output at the end of the input.
310
 
311
- # 5. Currently I'm using huggyllama/llama-7b as ref_model, should I switch to llama2-7B? Maybe Mistral-7B?
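A small helper illustrating the split hierarchy described in note 2 (test > validation > train); this is only a sketch of the stated rule, not code from the repository:

    from datasets import load_dataset

    def load_preferred_split(dataset_name, config=None):
        # Prefer the test split, then validation, then train, per note 2 above.
        dataset_dict = load_dataset(dataset_name, config)
        for split in ("test", "validation", "train"):
            if split in dataset_dict:
                return dataset_dict[split]
        raise ValueError(f"No test/validation/train split found for {dataset_name}")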
 
1
+ import logging
2
+ logging.basicConfig(level='ERROR')
3
+ import numpy as np
4
+ from pathlib import Path
5
+ import openai
6
+ import torch
7
+ import zlib
8
+ import statistics
9
+ from torch.utils.data import DataLoader
10
+ from transformers import AutoTokenizer, AutoModelForCausalLM
11
+ from tqdm import tqdm
12
+ import math
14
+ from datasets import load_dataset
15
+ from options import Options
16
+ from ipdb import set_trace as bp
17
+ from eval import *
18
+ from utils import evaluate_model
19
+ from analyze import analyze_data
20
+ import argparse
21
  import os
22
  import sys
23
+ import gc
+ import random  # used below for shuffling the test data
24
+ import pickle
25
+
26
+ models = {}
27
+
28
+ def save_data(filename, data):
29
+ with open(filename, 'wb') as filehandle:
30
+ # store the data as binary data stream
31
+ pickle.dump(data, filehandle)
32
+
33
+ def load_data(filename):
34
+ with open(filename, 'rb') as filehandle:
35
+ # read the data as binary data stream
36
+ loaded_data = pickle.load(filehandle)
37
+
38
+ return loaded_data
39
+
40
+ def unload_model(model,tokenizer):
41
+ print("[X] Cannot unload model! Functionality not implemented!")
42
+
43
+ def load_model(name1):
44
+ if name1 not in models:
45
+ model1 = AutoModelForCausalLM.from_pretrained(name1, return_dict=True, device_map='auto')
46
+ model1.eval()
47
+ tokenizer1 = AutoTokenizer.from_pretrained(name1)
48
+
49
+ tokenizer1.pad_token = tokenizer1.eos_token
50
+ models[name1] = model1
51
+ models[name1 + "_tokenizer"] = tokenizer1
52
+ return models[name1], models[name1 + "_tokenizer"]
53
+
54
+ def calculatePerplexity(sentence, model, tokenizer, gpu):
55
+ """
56
+ Return the perplexity exp(loss), the per-token log-probabilities, and the raw loss for the given sentence.
57
+ """
58
+ input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
59
+ input_ids = input_ids.to(gpu)
60
+ with torch.no_grad():
61
+ outputs = model(input_ids, labels=input_ids)
62
+ loss, logits = outputs[:2]
63
+
64
+ # Extract per-token log-probabilities by applying log-softmax over the vocabulary dimension.
68
+ probabilities = torch.nn.functional.log_softmax(logits, dim=-1)
69
+ # probabilities = torch.nn.functional.softmax(logits, dim=-1)
70
+ all_prob = []
71
+ input_ids_processed = input_ids[0][1:]
72
+
73
+ for i, token_id in enumerate(input_ids_processed):
74
+ probability = probabilities[0, i, token_id].item()
75
+ all_prob.append(probability)
76
+ return torch.exp(loss).item(), all_prob, loss.item()
77
+
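A quick usage sketch of calculatePerplexity, mirroring how inference_model1 calls it further below (the model id is just an example):

    tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
    model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", return_dict=True, device_map="auto")
    model.eval()
    ppl, token_logprobs, loss = calculatePerplexity("The quick brown fox jumps over the lazy dog.", model, tokenizer, gpu=model.device)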
78
+ def sample_generation(sentence, model, tokenizer, args,data_name):
79
+ half_sentence_index = math.ceil(len(sentence.split())*args['prefix_length'])
80
+
81
+ if half_sentence_index > 0:
82
+ prefix = " ".join(sentence.split()[:half_sentence_index])
83
  else:
84
+ prefix = '<|startoftext|> '
85
+
86
+ input_ids = torch.tensor(tokenizer.encode(prefix)).unsqueeze(0)
87
+ input_ids = input_ids.to(model.device)
88
+
89
+ output = None
90
+ if data_name != "cais/mmlu" and data_name != "gsm8k":
91
+ output = model.generate(input_ids, max_new_tokens=len(sentence.split())-half_sentence_index, min_new_tokens=1, num_return_sequences=args['num_z'], pad_token_id=tokenizer.eos_token_id, **args['generate_args'])
92
+ else:
93
+ output = model.generate(input_ids, max_new_tokens=max(1, (len(sentence.split())-half_sentence_index)//2), min_new_tokens=1, num_return_sequences=args['num_z'], pad_token_id=tokenizer.eos_token_id, **args['generate_args'])
94
+ # print(output)
95
+ complete_generated_text = tokenizer.batch_decode(output, skip_special_tokens=True)
96
+
97
+ return complete_generated_text
98
+
99
+
100
+ def RMIA_1(text,target_loss,ref_loss,model1,tokenizer1,ratio_gen,neighbors_dl):
101
+ target_losses_z = evaluate_model(model1,tokenizer1,neighbors_dl)
102
+ result = torch.count_nonzero(target_losses_z < target_loss).item() / len(target_losses_z)
103
+ return result
104
+
105
+ def get_neighbors(text,ref_loss,model2,tokenizer2,ratio_gen,data_name):
106
+ cur_args = {'prefix_length': ratio_gen, 'num_z': 100, 'generate_args': {'do_sample': True}}
107
+ neighbors = sample_generation(text, model2, tokenizer2, cur_args,data_name)
108
+ neighbors_dl = DataLoader(neighbors, batch_size=32, shuffle=False)
109
+ return neighbors_dl
110
+
111
+ def evaluate_data(test_data, col_name, target_model, ref_model, ratio_gen, data_name):
112
+ global model1,model2,tokenizer1,tokenizer2
113
+ print(f"all data size: {len(test_data)}")
114
+ random.seed(0)
115
+ random.shuffle(test_data)
116
+ test_data = test_data[:100]
117
+
118
+ inference2_pass = None
119
+ neighbors_dls = None
120
+ ref_model_clean = ref_model.replace("/","-")
121
+ data_name_clean = data_name.replace("/","-")
122
+ os.makedirs(os.path.join(f"saves/{ref_model_clean}",f"{data_name_clean}"),exist_ok=True)
123
+ try:
124
+ inference2_pass = load_data(f'saves/{ref_model_clean}/{data_name_clean}/inference2_pass.txt')
125
+ neighbors_dls = load_data(f'saves/{ref_model_clean}/{data_name_clean}/neighbors_dls.txt')
126
+ except Exception: # no cached reference-model data yet; compute and save it below
127
+ ### MODEL 2 likelihoods
128
+ model2, tokenizer2 = load_model(ref_model)
129
+ inference2_pass = [] #0: p_ref, #1: all_prob_ref, #2: p_ref_likelihood
130
+ for ex in tqdm(test_data):
131
+ text = ex[col_name]
132
+ new_ex = inference_model2(model2, tokenizer2, text)
133
+ inference2_pass.append(new_ex)
134
+ # Invariant across target models: this pass only depends on the reference model, so it can be cached and reused.
135
+
136
+ ### Neighbors:
137
+ neighbors_dls = []
138
+ counter = 0
139
+ for ex in tqdm(test_data):
140
+ text = ex[col_name]
141
+ new_ex = get_neighbors(text,inference2_pass[counter][2],model2,tokenizer2,ratio_gen,data_name)
142
+ counter = counter + 1
143
+ neighbors_dls.append(new_ex)
144
+ unload_model(model2,tokenizer2)
145
+ # Sampling uses temperature, so this pass is not deterministic; caching a single snapshot of the neighbors is fine, though.
146
+ save_data(f'saves/{ref_model_clean}/{data_name_clean}/inference2_pass.txt',inference2_pass)
147
+ save_data(f'saves/{ref_model_clean}/{data_name_clean}/neighbors_dls.txt',neighbors_dls)
148
+ print("Saved ref data.")
149
+
150
+ ### MODEL 1 likelihoods
151
+ model1, tokenizer1 = load_model(target_model)
152
+ inference1_pass = [] #0: p1, #1: all_prob, #2: p1_likelihood, #3: p_lower, #4: p_lower_likelihood
153
+ for ex in tqdm(test_data):
154
+ text = ex[col_name]
155
+ new_ex = inference_model1(model1,tokenizer1,text)
156
+ inference1_pass.append(new_ex)
157
+
158
+ ### RMIA results
159
+ model1, tokenizer1 = load_model(target_model)
160
+ counter = 0
161
+ results = []
162
+ for ex in tqdm(test_data):
163
+ text = ex[col_name]
164
+ new_ex = RMIA_1(text,inference1_pass[counter][2],inference2_pass[counter][2],model1,tokenizer1,ratio_gen,neighbors_dls[counter])
165
+ counter = counter + 1
166
+ results.append(new_ex)
167
+ unload_model(model1,tokenizer1)
168
+
169
+ ### Inference ex
170
+ all_output = []
171
+ counter = 0
172
+ for ex in tqdm(test_data):
173
+ text = ex[col_name]
174
+ pred = {}
175
+ pred["minkprob_w/_ref"] = results[counter]
176
+ pred["ppl"] = inference1_pass[counter][0]
177
+ pred["ppl/Ref_ppl (calibrate PPL to the reference model)"] = inference1_pass[counter][2]-inference2_pass[counter][2]
178
+ pred["ppl/lowercase_ppl"] = -(np.log(inference1_pass[counter][3]) / np.log(inference1_pass[counter][0])).item()
179
+ zlib_entropy = len(zlib.compress(bytes(text, 'utf-8')))
180
+ pred["ppl/zlib"] = np.log(inference1_pass[counter][0])/zlib_entropy
181
+ ex["pred"] = pred
182
+ counter = counter + 1
183
+ all_output.append(ex)
184
+ return all_output
185
+
186
+ def inference_model1 (model1, tokenizer1, text):
187
+ p1, all_prob, p1_likelihood = calculatePerplexity(text, model1, tokenizer1, gpu=model1.device)
188
+ p_lower, _, p_lower_likelihood = calculatePerplexity(text.lower(), model1, tokenizer1, gpu=model1.device)
189
+ return [p1, all_prob, p1_likelihood, p_lower, p_lower_likelihood]
190
+
191
+ def inference_model2 (model2, tokenizer2, text):
192
+ p_ref, all_prob_ref, p_ref_likelihood = calculatePerplexity(text, model2, tokenizer2, gpu=model2.device)
193
+ return [p_ref,all_prob_ref,p_ref_likelihood]
194
+
195
+ def main(target_model,ref_model,output_dir,data,length,key_name,ratio_gen):
196
+ output_dir = f"{output_dir}/{target_model}_{ref_model}/{key_name}"
197
+ Path(output_dir).mkdir(parents=True, exist_ok=True)
198
+ # load model and data
199
+ data_name = data
200
+ if "jsonl" in data:
201
+ data = load_jsonl(f"{data}")
202
+ elif data == "truthful_qa":
203
+ # bp()
204
+ dataset = load_dataset(data, "multiple_choice", split="validation")
205
+ data = convert_huggingface_data_to_list_dic(dataset)
206
+ data = process_truthful_qa(data)
207
+ elif data == "cais/mmlu":
208
+ dataset = load_dataset(data, "all", split="test")
209
+ data = convert_huggingface_data_to_list_dic(dataset)
210
+ data = process_mmlu(data)
211
+ elif data == "ai2_arc":
212
+ dataset = load_dataset(data, "ARC-Challenge", split="test")
213
+ data = convert_huggingface_data_to_list_dic(dataset)
214
+ data = process_arc(data)
215
+ elif data == "gsm8k":
216
+ dataset = load_dataset(data, "main", split="test")
217
+ data = convert_huggingface_data_to_list_dic(dataset)
218
+ data = process_gsm8k(data)
219
+ elif data == "Rowan/hellaswag":
220
+ dataset = load_dataset(data, "default", split="validation")
221
+ # We use the validation split since labels for the test set are not available.
222
+ data = convert_huggingface_data_to_list_dic(dataset)
223
+ data = process_hellaswag(data)
224
+ elif data == "winogrande":
225
+ dataset = load_dataset(data,"winogrande_debiased", split="validation")
226
+ data = convert_huggingface_data_to_list_dic(dataset)
227
+ data = process_winogrande(data)
228
+
229
+ #model1, model2, tokenizer1, tokenizer2 = load_model(target_model, ref_model)
230
+
231
+ all_output = evaluate_data(data,key_name, target_model, ref_model,ratio_gen,data_name)
232
+ dump_jsonl(all_output, f"{output_dir}/all_output.jsonl")
233
+ return analyze_data(all_output)
234
+ # fig_fpr_tpr(all_output, output_dir)
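For reference, the old app.py invoked this entry point through run_test (shown in the removed code above); a usage sketch with a placeholder target model id:

    import run as evaluator  # detect-pretrain-code-contamination/src/run.py

    result = evaluator.main(
        target_model="your-org/your-model",   # placeholder: any causal LM on the Hub
        ref_model="huggyllama/llama-7b",      # reference model used by the old app.py
        output_dir="out",
        data="gsm8k",                         # one of the leaderboard datasets
        length=64,
        key_name="input",
        ratio_gen=0.4,
    )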