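"""Gradio app that generates Dutch questions from input text.

The text is split into overlapping frames of sentences, and each frame is
fed to a T5 model fine-tuned for end-to-end Dutch question generation.
"""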
import gradio as gr
from transformers import T5ForConditionalGeneration, T5TokenizerFast
import nltk
from nltk import tokenize

nltk.download('punkt')

# The tokenizer comes from the Dutch T5 base checkpoint; <sep> is registered
# as an extra token because the question-generation model separates the
# questions it emits with <sep>.
checkpoint = "yhavinga/t5-base-dutch"
tokenizer = T5TokenizerFast.from_pretrained(checkpoint)
tokenizer.sep_token = '<sep>'
tokenizer.add_tokens(['<sep>'])

hfmodel = T5ForConditionalGeneration.from_pretrained("Michelvh/t5-end2end-questions-generation-dutch")

def hf_run_model(input_string, **generator_args):
    # Default generation settings; keyword arguments passed by the caller
    # override them (the original code silently discarded caller overrides).
    args = {
        "max_length": 256,
        "num_beams": 4,
        "length_penalty": 1.5,
        "no_repeat_ngram_size": 3,
        "early_stopping": True,
        "num_return_sequences": 1,
    }
    args.update(generator_args)
    input_string = input_string + " </s>"
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    res = hfmodel.generate(input_ids, **args)
    output = tokenizer.batch_decode(res, skip_special_tokens=True)
    # The model packs several questions into one sequence, separated by <sep>.
    output = [item.split("<sep>") for item in output]
    return output
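# Illustrative call (the output shown is hypothetical; the actual questions
# depend on the model):
#   hf_run_model("Amsterdam is de hoofdstad van Nederland.")
#   -> [["Wat is de hoofdstad van Nederland?"]]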


# Stride-1 sliding-window chunker. Currently unused by the app; the UI path
# uses create_frames below, which supports a configurable overlap.
def chunk_text(text, framesize=5):
    sentences = tokenize.sent_tokenize(text)
    frames = []
    lastindex = len(sentences) - framesize + 1
    for index in range(lastindex):
        frames.append(" ".join(sentences[index:index+framesize]))
    return frames


def flatten(l):
    return [item for sublist in l for item in sublist]


def run_model_with_frames(text, framesize=4, overlap=3, progress=gr.Progress()):
    # The overlap must leave a positive step size; with overlap == framesize
    # the framing loop in create_frames would never advance.
    if overlap >= framesize:
        return "Overlap must be smaller than the batch size"
    frames = create_frames(text, framesize, overlap)
    counter = 0
    total_steps = len(frames)
    progress((counter, total_steps), desc="Starting...")
    # Collect questions in a set to deduplicate across overlapping frames.
    result = set()
    for frame in frames:
        questions = flatten(hf_run_model(frame))
        for question in questions:
            result.add(ensure_questionmark(question.strip()))
        counter += 1
        progress((counter, total_steps), desc="Generating...")
    progress((counter, total_steps), desc="Done")
    return "\n".join(result)


def create_frames(text, framesize=4, overlap=3):
    # Split the text into sentences and group them into overlapping frames of
    # `framesize` sentences, advancing `framesize - overlap` sentences per
    # step so that context is shared between consecutive frames.
    sentences = tokenize.sent_tokenize(text)
    frames = []
    stepsize = framesize - overlap
    index = 0
    sentenceslength = len(sentences)
    while index < sentenceslength:
        endindex = index + framesize
        if endindex >= sentenceslength:
            # Last frame: take the final `framesize` sentences and stop.
            frame = " ".join(sentences[-framesize:])
            index = sentenceslength
        else:
            frame = " ".join(sentences[index:endindex])
            index += stepsize
        frames.append(frame)
    return frames
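# Example: with framesize=4 and overlap=3 the step size is 1, so six
# sentences s1..s6 yield three frames: s1-s4, s2-s5, and s3-s6 (the last
# frame always covers the final `framesize` sentences).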


def ensure_questionmark(question):
    if question.endswith("?"):
        return question
    return question + "?"

description = """
# Dutch question generator

Input some Dutch text and click the button to generate some questions!
The model is currently set up to generate as many questions as possible,
which can take a couple of minutes, so have some patience ;)

The optimal text length is probably around 8-10 lines. Longer texts will
obviously take longer. Please keep in mind that this is a work in
progress and might still be a little bit buggy."""

with gr.Blocks() as iface:
    gr.Markdown(description)
    context = gr.Textbox(label="Input text")
    frame_size = gr.Number(value=5, label="Batch size", info="Size of the subparts that are used to generate questions. Increase to speed up the generation.", precision=0)
    overlap = gr.Number(value=4, label="Overlap", info="Overlap between batches. Must be smaller than the batch size. Decrease to speed up generation.", precision=0)
    questions = gr.Textbox(label="Questions")
    generate_btn = gr.Button("Generate questions")
    generate_btn.click(fn=run_model_with_frames, inputs=[context, frame_size, overlap], outputs=questions, api_name="generate_questions")

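# Depending on the installed Gradio version, the progress bar may require the
# request queue to be enabled, e.g. iface.queue().launch().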
iface.launch()