# TIGERScore / app.py
# Scraped from a repository file view: last commit "update" (2107a44) by DongfuJiang, 6.5 kB.
import os
import gradio as gr
import sys
import copy
import spaces
from datasets import load_dataset
from string import Template
from tigerscore import TIGERScorer
# Markdown blurb for the demo page: a one-paragraph summary of TIGERScore
# followed by links to the website, paper, code and model checkpoints.
# It is rendered through gr.Markdown(DESCRIPTIONS) in the UI section below.
DESCRIPTIONS = """
We present ***TIGERScore***, a **T**rained metric that follows **I**nstruction **G**uidance to perform **E**xplainable, and **R**eference-free evaluation over a wide spectrum of text generation tasks. Different from other automatic evaluation methods that only provide arcane scores, TIGERScore is guided by the natural language instruction to provide error analysis to pinpoint the mistakes in the generated text.
[**Website**](https://tiger-ai-lab.github.io/TIGERScore/) |
[**Paper**](https://arxiv.org/abs/2310.00752) |
[**Code**](https://github.com/TIGER-AI-Lab/TIGERScore) |
[**TIGERScore-7B**](https://huggingface.co/TIGER-Lab/TIGERScore-7B) |
[**TIGERScore-13B**](https://huggingface.co/TIGER-Lab/TIGERScore-13B)
"""
# Build the rows shown in the demo's example table.
#
# The MetricInstruct train split is opened in streaming mode, shuffled with a
# fixed seed, and the first 100 streamed rows are turned into
# [instruction, input_context, hypo_output] lists (the order given by `fields`).
EXAMPLES_DATASET = load_dataset("TIGER-Lab/MetricInstruct", split="train", streaming=True)
SHUFFLED_EXAMPLES_DATASET = EXAMPLES_DATASET.shuffle(seed=42)
fields = ["instruction", "input_context", "hypo_output"]
print("Loading examples...")
# Rows are taken as-is: a filter that skipped rows with any empty field used to
# sit here but was disabled, so examples may contain empty strings.
EXAMPLES = [
    [ex[field] for field in fields]
    for ex in SHUFFLED_EXAMPLES_DATASET.take(100)
]
# Prompt skeleton in string.Template syntax. generate_text_llamacpp fills the
# three placeholders (${generation_instruction}, ${input_context},
# ${hypothesis_output}) and strips surrounding newlines/spaces before sending
# the result to the model.
TEMPLATE = """You are evaluating errors in a model-generated output for a given instruction.
Instruction:
${generation_instruction}
${input_context}
Model-generated Output:
${hypothesis_output}
For each error you give in the response, please also elaborate the following information:
- error location (the words that are wrong in the output)
- error aspect it belongs to.
- explanation why it's an error, and the correction suggestions.
- severity of the error ("Major" or "Minor").
- reduction of score (between 0.5 and 5 given the severity of the error)
Your evaluation output:
"""
# Disabled alternative backend: the commented-out lines below would download a
# GGUF checkpoint and create a module-level `llm` (llama.cpp) object.
# generate_text_llamacpp reads that `llm` global, so it cannot run while this
# block stays commented out.
# from huggingface_hub import hf_hub_download
# from llama_cpp import Llama
# llm = Llama(
# model_path=hf_hub_download(
# repo_id=os.environ.get("REPO_ID", "TIGER-Lab/TIGERScore-13B-GGUF"),
# filename=os.environ.get("MODEL_FILE", "ggml-model-q4_0.gguf"),
# ),
# n_ctx=2048,
# n_gpu_layers=50, # change n_gpu_layers if you have more or less VRAM
# )
# Active backend: one TIGERScorer built at import time and shared by every
# request; generate_text_hf moves its model to "cuda" before generating.
scorer = TIGERScorer(model_name="TIGER-Lab/TIGERScore-13B")
@spaces.GPU(duration=60)
def generate_text_hf(input_context, generation_instruction, hypo_output, max_new_tokens=1024, temperature=0.7, top_p=1.0):
    """Stream an evaluation of ``hypo_output`` from the module-level ``scorer``.

    Args:
        input_context: Source/context text the instruction was applied to.
        generation_instruction: Instruction the evaluated output was meant to follow.
        hypo_output: Model-generated text to be evaluated.
        max_new_tokens: Forwarded to ``scorer.generate_stream``.
        temperature: Forwarded to ``scorer.generate_stream``.
        top_p: Forwarded to ``scorer.generate_stream``.

    Yields:
        Each item produced by ``scorer.generate_stream``, unchanged.
    """
    # Only an attribute of the shared scorer is rebound, so no ``global``
    # declaration is needed. The move to "cuda" is repeated on every call.
    # NOTE(review): presumably because the GPU is only attached inside this
    # @spaces.GPU-decorated call -- confirm.
    scorer.model = scorer.model.to("cuda")
    # generate_stream takes (instruction, hypothesis, context): a different
    # order from this function's own parameters.
    yield from scorer.generate_stream(
        generation_instruction,
        hypo_output,
        input_context,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
    )
def generate_text_llamacpp(input_context, generation_instruction, hypo_output, max_new_tokens=1024, temperature=0.7, top_p=1.0):
    """Stream an evaluation of ``hypo_output`` from a llama.cpp-style backend.

    The prompt is built by filling ``TEMPLATE`` and is sent to the
    module-level ``llm`` callable with streaming enabled.

    Args:
        input_context: Source/context text the instruction was applied to.
        generation_instruction: Instruction the evaluated output was meant to follow.
        hypo_output: Model-generated text to be evaluated.
        max_new_tokens: Passed to the backend as ``max_tokens``.
        temperature: Sampling temperature passed to the backend.
        top_p: Nucleus-sampling threshold passed to the backend.

    Yields:
        The text generated so far (cumulative), once per streamed chunk.

    Raises:
        NameError: If no module-level ``llm`` backend has been created.
    """
    # Resolve the backend at call time so a missing ``llm`` produces an
    # actionable message instead of a bare "name 'llm' is not defined".
    backend = globals().get("llm")
    if backend is None:
        raise NameError(
            "name 'llm' is not defined: create the module-level llama.cpp "
            "`llm` object before calling generate_text_llamacpp"
        )

    prompt = Template(TEMPLATE).substitute(
        generation_instruction=generation_instruction,
        input_context=input_context,
        hypothesis_output=hypo_output,
    ).strip("\n ")
    gen_params = {
        "max_tokens": max_new_tokens,
        "top_p": top_p,
        "top_k": 40,
        "temperature": temperature,
        "frequency_penalty": 0.0,
        "presence_penalty": 0.0,
        "echo": False,
        "stream": True,
    }

    generated = ""
    for chunk in backend(prompt, **gen_params):
        # Chunks are only read, so no defensive copy is taken.
        generated += chunk["choices"][0]["text"]
        yield generated
def get_examples(inst_textbox, input_textbox, hypo_output_textbox):
    """Return the three example fields unchanged.

    Registered as the ``fn`` of the examples table, whose inputs and outputs
    are the same three textboxes, so selecting an example just echoes its
    values back.

    Args:
        inst_textbox: Instruction text.
        input_textbox: Input-context text.
        hypo_output_textbox: Hypothesis-output text.

    Returns:
        A 3-tuple ``(inst_textbox, input_textbox, hypo_output_textbox)``.
    """
    return inst_textbox, input_textbox, hypo_output_textbox
def clear_all(inst_textbox, input_textbox, hypo_output_textbox):
    """Return empty strings for the three input textboxes.

    The current textbox values are accepted because the Clear button passes
    them as inputs, but they are ignored.

    Returns:
        A 3-tuple of empty strings, one per textbox.
    """
    return "", "", ""
# Demo page: header, three input textboxes, Clear/Submit buttons, sampling
# options, the output textbox, the example table and the citation.
# NOTE(review): the source had lost its indentation; which components sit
# inside each gr.Row / gr.Accordion below is reconstructed -- confirm against
# the intended layout.
with gr.Blocks(theme='gradio/soft') as demo:
    gr.Markdown("# 🐯 TIGERScore Demo")
    with gr.Row():
        gr.Markdown(DESCRIPTIONS)
        gr.Image("https://jdf-prog.github.io/assets/img/publication_preview/tigerscore_preview.png")

    gr.Markdown("## TIGERScore Inputs")
    inst_textbox = gr.Textbox(lines=1, label="Instruction", placeholder="Enter instruction here", show_label=True)
    input_textbox = gr.Textbox(lines=4, label="Input Context", placeholder="Enter input context here", show_label=True)
    hypo_output_textbox = gr.Textbox(lines=4, label="Hypothesis Output", placeholder="Enter hypothesis output to be evaluated here", show_label=True)

    with gr.Row():
        clear_button = gr.Button('Clear', variant='primary')
        submit_button = gr.Button('Submit', variant='primary')

    # Generation settings forwarded to generate_text_hf on submit.
    with gr.Accordion(label='Advanced options', open=False):
        max_new_tokens = gr.Slider(
            label='Max new tokens to generate',
            minimum=256,
            maximum=1024,
            step=1,
            value=1024,
        )
        temperature = gr.Slider(
            label='Temperature of generation',
            minimum=0.1,
            maximum=2.0,
            step=0.1,
            value=0.7,
        )
        top_p = gr.Slider(
            label='Top-p of generation',
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=1.0,
        )

    gr.Markdown("## TIGERScore Outputs")
    evaluation_output_textbox = gr.Textbox(lines=4, label="Evaluation Output", placeholder="Evaluation output", show_label=True)

    # Input order follows generate_text_hf's signature: context first, then
    # instruction, then hypothesis, then the three sampling settings.
    submit_button.click(
        fn=generate_text_hf,
        inputs=[input_textbox, inst_textbox, hypo_output_textbox, max_new_tokens, temperature, top_p],
        outputs=evaluation_output_textbox,
    )
    clear_button.click(
        fn=clear_all,
        inputs=[inst_textbox, input_textbox, hypo_output_textbox],
        outputs=[inst_textbox, input_textbox, hypo_output_textbox],
    )

    # Example rows are [instruction, input_context, hypo_output]; get_examples
    # echoes them back into the same three textboxes.
    batch_examples = gr.Examples(
        examples=EXAMPLES,
        fn=get_examples,
        cache_examples=True,
        examples_per_page=5,
        inputs=[inst_textbox, input_textbox, hypo_output_textbox],
        outputs=[inst_textbox, input_textbox, hypo_output_textbox],
    )

    citations = gr.Markdown("""## Citation
```txt
@article{jiang2023TIGERScore,
title={TIGERScore: Towards Building Explainable Metric for All Text Generation Tasks},
author={Dongfu Jiang, Yishan Li, Ge Zhang, Wenhao Huang, Bill Yuchen Lin, Wenhu Chen},
journal={arXiv preprint arXiv:2310.00752},
year={2023}
}
```""")

# Enable the request queue (max_size=20) and start serving the app.
demo.queue(max_size=20).launch()