Spaces:
Sleeping
Sleeping
File size: 3,943 Bytes
92da267 13305fa 92da267 ba2db22 92da267 66c4010 92da267 66c4010 92da267 a9b92de 8399959 8812439 a9b92de 937fe63 92da267 9182f58 f60a73d a9b92de 937fe63 9182f58 937fe63 9182f58 92da267 66c4010 e5cdcd9 8812439 e5cdcd9 799fc30 009000b f60a73d 799fc30 009000b d4e5151 799fc30 ee3fab3 cea7242 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import gradio as gr
from transformers import T5ForConditionalGeneration, T5TokenizerFast
import nltk
from nltk import tokenize
# sent_tokenize (used by chunk_text / create_frames below) needs the
# punkt sentence-splitter models.
nltk.download('punkt')
# Base Dutch T5 tokenizer; a '<sep>' token is registered so the model's
# generated output can be split into individual questions (see hf_run_model).
checkpoint = "yhavinga/t5-base-dutch"
tokenizer = T5TokenizerFast.from_pretrained(checkpoint)
tokenizer.sep_token = '<sep>'
tokenizer.add_tokens(['<sep>'])
# End-to-end question-generation model. NOTE(review): the tokenizer is loaded
# from a different checkpoint than the model — presumably the fine-tune kept
# the base vocabulary; confirm they are compatible.
hfmodel = T5ForConditionalGeneration.from_pretrained("Michelvh/t5-end2end-questions-generation-dutch")
def hf_run_model(input_string, **generator_args):
    """Generate question candidates for one chunk of Dutch text.

    Args:
        input_string: The source text to generate questions from.
        **generator_args: Optional overrides for the generation settings
            passed to ``hfmodel.generate`` (e.g. ``num_beams=8``).

    Returns:
        A list with one entry per returned sequence; each entry is the
        decoded output split on the ``<sep>`` token, i.e. a list of
        question strings.
    """
    # Defaults for generation. The original code reassigned generator_args
    # here, silently discarding any caller-supplied overrides; merging the
    # dicts keeps the defaults but lets callers override them.
    args = {
        "max_length": 256,
        "num_beams": 4,
        "length_penalty": 1.5,
        "no_repeat_ngram_size": 3,
        "early_stopping": True,
        "num_return_sequences": 1,
    }
    args.update(generator_args)
    # T5-style end-of-sequence marker expected by the fine-tuned model.
    input_string = input_string + " </s>"
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    res = hfmodel.generate(input_ids, **args)
    output = tokenizer.batch_decode(res, skip_special_tokens=True)
    # Each decoded sequence contains several questions joined by '<sep>'.
    return [item.split("<sep>") for item in output]
def chunk_text(text, framesize=5):
    """Split *text* into overlapping windows of ``framesize`` sentences.

    Consecutive frames advance by one sentence (maximal overlap).
    NOTE(review): this helper appears unused — run_model_with_frames uses
    create_frames instead; kept for backward compatibility.

    Args:
        text: Input text; sentence boundaries come from nltk's sent_tokenize.
        framesize: Number of sentences per frame.

    Returns:
        A list of frame strings. The original returned [] whenever the text
        had fewer than ``framesize`` sentences, silently dropping the whole
        input; short texts now yield a single frame with all sentences.
    """
    sentences = tokenize.sent_tokenize(text)
    if not sentences:
        return []
    if len(sentences) <= framesize:
        # Short text: one frame containing everything instead of nothing.
        return [" ".join(sentences)]
    lastindex = len(sentences) - framesize + 1
    return [" ".join(sentences[i:i + framesize]) for i in range(lastindex)]
def flatten(l):
    """Flatten one level of nesting: concatenate the sub-lists of *l*."""
    result = []
    for sublist in l:
        result.extend(sublist)
    return result
def run_model_with_frames(text, framesize=4, overlap=3, progress=gr.Progress()):
    """Generate questions for *text* by running the model on sentence frames.

    Args:
        text: The input Dutch text.
        framesize: Sentences per frame ("batch size" in the UI).
        overlap: Sentences shared between consecutive frames; must be
            strictly smaller than ``framesize``.
        progress: Gradio progress tracker (injected by Gradio at call time).

    Returns:
        The deduplicated questions, one per line, or an error message when
        the overlap/framesize combination is invalid.
    """
    # Must be >=, not >: overlap == framesize gives create_frames a step of
    # zero, which previously looped forever. The message already said
    # "smaller than"; the check now matches it.
    if overlap >= framesize:
        return "Overlap should be smaller than batch size"
    frames = create_frames(text, framesize, overlap)
    counter = 0
    total_steps = len(frames)
    progress((counter, total_steps), desc="Starting...")
    # A set deduplicates questions that appear in several overlapping frames.
    result = set()
    for frame in frames:
        for question in flatten(hf_run_model(frame)):
            result.add(ensure_questionmark(question.strip()))
        counter += 1
        progress((counter, total_steps), desc="Generating...")
    progress((counter, total_steps), desc="Done")
    # join instead of repeated += (keeps the original trailing newline).
    return "".join(entry + "\n" for entry in result)
def create_frames(text, framesize=4, overlap=3):
    """Split *text* into frames of ``framesize`` sentences that overlap.

    Consecutive frames share ``overlap`` sentences. The final frame is always
    the last ``framesize`` sentences of the text, so it may overlap its
    predecessor by more than ``overlap``.

    Args:
        text: Input text, split with nltk's sent_tokenize.
        framesize: Number of sentences per frame.
        overlap: Sentences shared between consecutive frames.

    Returns:
        List of frame strings (empty for empty input).

    Raises:
        ValueError: If ``overlap >= framesize`` — the resulting step of zero
            (or less) previously made this loop hang forever.
    """
    stepsize = framesize - overlap
    if stepsize <= 0:
        # Fail loudly instead of spinning in an infinite loop.
        raise ValueError("overlap must be smaller than framesize")
    sentences = tokenize.sent_tokenize(text)
    sentenceslength = len(sentences)
    frames = []
    index = 0
    while index < sentenceslength:
        endindex = index + framesize
        if endindex >= sentenceslength:
            # Last frame: anchor to the end of the text, then stop.
            frames.append(" ".join(sentences[-framesize:]))
            break
        frames.append(" ".join(sentences[index:endindex]))
        index += stepsize
    return frames
def ensure_questionmark(question):
    """Return *question* with a trailing question mark, adding one if absent."""
    return question if question.endswith("?") else question + "?"
description = """
# Dutch question generator
Input some Dutch text and click the button to generate some questions!
The model is currently set up to generate as many questions, but this
can take a couple of minutes so have some patience ;)
The optimal text lenght is probably around 8-10 lines. Longer text
will obviously take longer. Please keep in mind that this is a work in
progress and might still be a little bit buggy."""
# Build the Gradio UI: an input textbox, two tuning numbers, and an output
# textbox wired to run_model_with_frames.
with gr.Blocks() as iface:
    gr.Markdown(description)
    context = gr.Textbox(label="Input text")
    frame_size = gr.Number(
        value=5,
        label="Batch size",
        info="Size of the subparts that are used to generate questions. Increase to speed up the generation",
        precision=0,
    )
    # The validation in run_model_with_frames requires overlap < batch size,
    # so the help text must say "smaller", not "bigger" as it previously did.
    overlap = gr.Number(
        value=4,
        label="Overlap",
        info="Overlap between batches. Should be smaller than batch size. Decrease to speed up generation",
        precision=0,
    )
    questions = gr.Textbox(label="Questions")
    generate_btn = gr.Button("Generate questions")
    generate_btn.click(
        fn=run_model_with_frames,
        inputs=[context, frame_size, overlap],
        outputs=questions,
        api_name="generate_questions",
    )

iface.launch()