Spaces:
Sleeping
Sleeping
File size: 14,054 Bytes
9d3e407 9361469 9d3e407 a0059eb 9d3e407 9361469 9d3e407 a096a44 9d3e407 848f476 9d3e407 428da0b 9361469 428da0b 9d3e407 9361469 428da0b a0059eb 9d3e407 cb8f97d 9d3e407 a0059eb 9d3e407 0d59e36 9d3e407 0d59e36 9d3e407 837e020 0d59e36 9d3e407 0d59e36 9d3e407 0d59e36 9d3e407 0eec9d7 9d3e407 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 |
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# #TODO: license: MIT pending (evaluation suite itself can be completely open, nothing copyleft from the dataset reaches us here)
"""TODO: Add a description here."""
# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2023}
}
"""
# TODO: Add description of the module here
_DESCRIPTION = """\
This EvaluationSuite currently solves {1} tasks to test code intelligence of genereative language models for "creative programming" (fragment shaders).
"""
# via https://huggingface.co/docs/evaluate/evaluation_suite
import evaluate
from evaluate import evaluator #used by Suite.run()
from evaluate.evaluator.utils import DatasetColumn # used in .prepare_data()
from evaluate.evaluation_suite import SubTask
from datasets import Dataset
from typing import Any, Callable, Dict, List, Optional, Union # used in .prepare_pipeline()
import transformers
from transformers import Pipeline, pipeline, GenerationConfig, AutoTokenizer #GenerationConfig to specify greedy and avoid error
from datasets import load_dataset #used by Suite.run()
# write a custom evaluator, inheriting from: https://github.com/huggingface/evaluate/blob/v0.4.0/src/evaluate/evaluator/text_generation.py#L31
class ReturnGenerationEvaluator(evaluate.TextGenerationEvaluator):
    """Custom text-generation evaluator for completing shader ``return`` statements.

    Differences from the stock ``TextGenerationEvaluator``:
      * decoding is forced to be greedy (``do_sample=False``, ``num_beams=1``) so
        results are reproducible,
      * once a tokenizer is available, every vocab token containing ";" is
        registered as an EOS token so generation stops at the end of the
        return statement,
      * predictions and references are both cut at the first semicolon and
        stripped before scoring (default metric: ``exact_match``).
    """

    def __init__(self, task="text-generation", default_metric_name="exact_match", predictions_prefix: str = "generated"):
        """
        Args:
            task (`str`): pipeline task name, forwarded to the base Evaluator.
            default_metric_name (`str`): metric used when none is passed to ``.compute``.
            predictions_prefix (`str`): key prefix of the pipeline output dicts
                (the pipeline returns ``{f"{predictions_prefix}_text": ...}``).
        """
        super().__init__(task=task, default_metric_name=default_metric_name)
        self.predictions_prefix = predictions_prefix
        # Per-instance (NOT class-level) objects: both are mutated during a run
        # (_update_generation_config sets eos/pad token ids on greedy_cfg,
        # prepare_data updates PIPELINE_KWARGS with max_new_tokens), so sharing
        # them as class attributes would leak state across evaluator instances.
        self.greedy_cfg = GenerationConfig(
            do_sample=False,  # default, to ensure greedy decoding
            num_beams=1,      # same as above
        )
        # these kwargs are for the pipeline *call*, not the pipeline init - but that seems to still work.
        self.PIPELINE_KWARGS = {"return_full_text": False, "generation_config": self.greedy_cfg}

    # for the pipeline init we need to copy the whole function and add two kwargs.
    # this still prints warnings due to the pad_token_id = eos_token_id change.
    # from: https://github.com/huggingface/evaluate/blob/v0.4.0/src/evaluate/evaluator/base.py#L375
    def prepare_pipeline(
        self,
        model_or_pipeline: Union[str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"],  # noqa: F821
        tokenizer: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"] = None,  # noqa: F821
        feature_extractor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"] = None,  # noqa: F821
        device: Optional[int] = None,
    ):
        """
        Prepare pipeline.
        Args:
            model_or_pipeline (`str` or `Pipeline` or `Callable` or `PreTrainedModel` or `TFPreTrainedModel`,
            defaults to `None`):
                If the argument is not specified, we initialize the default pipeline for the task. If the argument is of the type `str` or
                is a model instance, we use it to initialize a new `Pipeline` with the given model. Otherwise we assume the
                argument specifies a pre-initialized pipeline.
            preprocessor (`PreTrainedTokenizerBase` or `FeatureExtractionMixin`, *optional*, defaults to `None`):
                Argument can be used to overwrite a default preprocessor if `model_or_pipeline` represents a model for
                which we build a pipeline. If `model_or_pipeline` is `None` or a pre-initialized pipeline, we ignore
                this argument.
        Returns:
            The initialized pipeline, with modifications for the specific task of generating text, even with long inputs.
        """
        if device is None:
            device = self._infer_device()
        if (
            isinstance(model_or_pipeline, str)
            or isinstance(model_or_pipeline, transformers.PreTrainedModel)
            or isinstance(model_or_pipeline, transformers.TFPreTrainedModel)
        ):
            if isinstance(model_or_pipeline, str):
                # load the tokenizer manually, since the pipeline fails to do so at times.
                # needed for bigcode/santacoder for example.
                tokenizer = AutoTokenizer.from_pretrained(model_or_pipeline, trust_remote_code=True)
            pipe = pipeline(
                self.task,
                model=model_or_pipeline,
                tokenizer=tokenizer,
                feature_extractor=feature_extractor,
                device=device,
                # additions over the upstream implementation:
                handle_long_generation="hole",  # our solution? relevant: https://github.com/huggingface/transformers/issues/14033#issuecomment-948385227
                # pad_token_id=tokenizer.eos_token_id,  # would avoid the warning, but tokenizers may name this differently.
                do_sample=False,  # important for reproducible results, but we need to make sure the generator is deterministic
                trust_remote_code=True,  # needed for some custom models, e.g. bigcode/santacoder - TODO: test if it works right here.
            )
        else:
            if model_or_pipeline is None:
                pipe = pipeline(self.task, device=device)
            else:
                pipe = model_or_pipeline
            # upstream warns here that tokenizer/feature_extractor are ignored;
            # omitted because no logger is imported in this module.
        if (pipe.task != self.task) and not (self.task == "translation" and pipe.task.startswith("translation")):
            raise ValueError(
                f"Incompatible `model_or_pipeline`. Please specify `model_or_pipeline` compatible with the `{self.task}` task."
            )
        # fix the default for max_length
        pipe.model.config.max_length = self._resolve_context_length(pipe=pipe)
        # update the generation config with information from the pipe
        self._update_generation_config(pipe)
        return pipe

    def _update_generation_config(self, pipe):
        """
        Update the generation config with information from the pipe. Sets eos_token_id and pad_token_id.
        Args:
            pipe (:class:`~transformers.Pipeline`): we need to access the tokenizer.vocab
        returns:
            None
        """
        semicolon_token_ids = [v for k, v in pipe.tokenizer.vocab.items() if ";" in k]  # this requires the tokenizer, which we only have once a pipe is made.
        # GenerationConfig.update also exists, but it only replaces, it does not add kwargs.
        self.greedy_cfg.eos_token_id = semicolon_token_ids  # eos_token_id can be a list, so we give it all candidate tokens.
        self.greedy_cfg.pad_token_id = semicolon_token_ids[0]  # pad_token_id has to be an int, so we just take the first one.
        return None

    def _resolve_context_length(self, model_or_pipeline=None, pipe=None):  # TODO: should really copy the typing hints here.
        """Best-effort lookup of the model context length; falls back to 1024."""
        if isinstance(model_or_pipeline, transformers.GPT2Model):
            return model_or_pipeline.config.n_ctx  # how GPT2-style models store it
        if pipe is not None:  # should I figure out a way to always pass this?
            return pipe.tokenizer.model_max_length  # can be set to something small for the pipeline default task, but we would want the max instead.
        # the tokenizer needs to know the context length for our "hole" pipe strategy,
        # but it has to be passed to the tokenizer, not the model.
        # the tokenizer should read from the model config, but that can be wrong, or have
        # a task overwrite (for "text-generation" for example you get 50).
        # model_or_pipeline only exists via the .compute call, so we have to take it in.
        return 1024  # TODO: we shouldn't return it, but overwrite the tokenizer config, which the pipeline relies on.

    # backwards-compatible alias for the previously misspelled method name
    _resolve_context_lenght = _resolve_context_length

    def _estimate_stopping(self, labels, **kwargs):
        """Estimate max_new_tokens for the pipeline call
        by counting the characters in the longest reference string plus 5
        (for good measure, probably not needed; a char count over-estimates the token count).
        Args:
            labels: a list of reference strings
        Returns:
            `int`: the estimated max_new_tokens, never larger than the context length
        """
        context_length = self._resolve_context_length(**kwargs)
        # does the min call get done inside the pipeline anyway? is there even a
        # single case where the return statement is this long?
        return min(max(len(ref) for ref in labels) + 5, context_length)

    # this one needs to be adjusted (vs. the upstream text-generation evaluator)
    def predictions_processor(self, predictions, *args, **kwargs):
        """
        Process the output of the pipeline to be compatible with the metric.
        Generated texts are cut off at the first semicolon and whitespace is stripped (using python str builtins).
        Args:
            predictions: a list of lists of dicts
        Returns:
            `dict`: all the processed texts, flattened and stored under the "predictions" key.
        """
        return {"predictions": [pred[f"{self.predictions_prefix}_text"].split(";")[0].strip() for pred_list in predictions for pred in pred_list]}

    def prepare_data(self, data: Dataset, input_column: str, label_column: str, *args, **kwargs):
        """
        Prepare data.
        Args:
            data (`Dataset`): Specifies the dataset we will run evaluation on.
            input_column (`str`, defaults to `"text"`):
                the name of the column containing the text feature in the dataset specified by `data`.
            label_column (`str`, defaults to `"label"`):
                the name of the column containing the labels in the dataset specified by `data`.
        Returns:
            `dict`: metric inputs. everything before the first semicolon, whitespace stripped (python str builtins, just like the pred prep)
            `list`: pipeline inputs.
        """
        self.check_required_columns(data, {"input_column": input_column, "label_column": label_column})  # this will throw an exception with useful error messages
        # don't put everything in the return statement, so you keep control...
        references = [ref.split(";")[0].strip() for ref in data[label_column]]
        self.PIPELINE_KWARGS.update({"max_new_tokens": self._estimate_stopping(references)})  # this is a hack, does it work tho?
        # DatasetColumn(data, input_column) doesn't seem to work. data[input_column] does,
        # but ignores any of the features of the helper class..
        return {"references": references}, data[input_column]
# via: https://huggingface.co/docs/evaluate/evaluation_suite
# relevant source: https://github.com/huggingface/evaluate/blob/v0.4.0/src/evaluate/evaluation_suite/__init__.py
class Suite(evaluate.EvaluationSuite):
    """EvaluationSuite testing code intelligence of generative language models
    on "creative programming" (fragment shader) tasks.

    Currently a single subtask: return-statement completion on
    Vipitis/Shadertoys-fine, scored by ReturnGenerationEvaluator.

    via: https://huggingface.co/docs/evaluate/evaluation_suite
    relevant source: https://github.com/huggingface/evaluate/blob/v0.4.0/src/evaluate/evaluation_suite/__init__.py
    """

    def __init__(self, name):
        super().__init__(name)
        # kept for reference only: the semicolon split was refactored into
        # ReturnGenerationEvaluator.prepare_data / predictions_processor
        self.preprocessor = lambda x: {"return_statement": x["return_statement"].split(";")[0]}
        self.suite = [
            # more subtasks are only possible once we can pass custom evaluators. -> https://github.com/huggingface/evaluate/pull/367
            SubTask(  # this one is adjusted already
                task_type="text-generation",  # run() below swaps in our custom evaluator for this task type
                data="Vipitis/Shadertoys-fine",
                subset="return_completion",
                split="test",  # use this to select a subset of the data during testing, perhaps remove later?
                args_for_task={
                    # the metric defaults to "exact_match" inside ReturnGenerationEvaluator
                    "input_column": "body",
                    "label_column": "return_statement",
                }
            )
        ]

    # adapted from: https://github.com/huggingface/evaluate/blob/v0.4.0/src/evaluate/evaluation_suite/__init__.py#LL103C5-L129C27
    def run(
        self,
        model_or_pipeline: Union[str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"] = "Vipitis/santacoder-finetuned-Shadertoys-fine",  # noqa: F821  # not so useful default model?
        snippet: Union[int, str] = "",  # int to evaluate only the first `snippet` rows; "" (default) keeps the whole split
    ) -> List[Dict[str, Any]]:
        """Run every subtask in the suite and return one result dict per task.

        Args:
            model_or_pipeline: checkpoint name, model instance, or ready pipeline.
            snippet: slice bound inserted into the split spec as ``split[:snippet]``;
                the empty-string default selects the full split.
        Returns:
            `list` of result dicts: metric scores plus the bookkeeping keys
            "model_cp", "task_name" and "data_preprocessor".
        """
        self.assert_suite_nonempty()
        results_all = []
        for task in self.suite:
            task_name = task.data
            if task.data_preprocessor:  # task requires extra preprocessing; otherwise it is all done inside the Evaluator
                ds = load_dataset(task.data, name=task.subset, split=(task.split + f"[:{snippet}]"))
                task.data = ds.map(task.data_preprocessor)
            task_evaluator = ReturnGenerationEvaluator()  # this is the change we make: specify our custom evaluator from above.
            args_for_task = task.args_for_task
            args_for_task["model_or_pipeline"] = model_or_pipeline
            args_for_task["data"] = task.data
            args_for_task["subset"] = task.subset
            args_for_task["split"] = (task.split + f"[:{snippet}]")  # down-select the split via the `snippet` kwarg of the .run() call
            results = task_evaluator.compute(**args_for_task)
            results["model_cp"] = model_or_pipeline  # useful in the output, but be careful when passed something that is not a string. #TODO: currently the same for all tasks, maybe move to the list?
            results["task_name"] = task_name + "/" + task.subset if task.subset else task_name
            results["data_preprocessor"] = str(task.data_preprocessor) if task.data_preprocessor is not None else None
            results_all.append(results)
        return results_all