File size: 2,718 Bytes
b849606
81b7cc0
b849606
a88db4c
b849606
81b7cc0
 
d655623
81b7cc0
b849606
 
 
 
 
7fcb7f9
81b7cc0
 
 
 
 
 
d655623
401530d
7994488
b849606
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81b7cc0
 
 
 
 
 
 
 
 
 
 
7fcb7f9
81b7cc0
b849606
 
 
 
 
 
 
81b7cc0
b849606
 
81b7cc0
a88db4c
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from transformers import MT5ForConditionalGeneration, AutoTokenizer, Text2TextGenerationPipeline, AutoModelForSeq2SeqLM
import gradio as gr
import re

# Translation task setup: the tokenizer and the MT5 model are loaded from the
# same checkpoint at import time and wrapped in a text2text pipeline.
TRANS_MODEL_ID = "K024/mt5-zh-ja-en-trimmed"
trans_tokenizer = AutoTokenizer.from_pretrained(TRANS_MODEL_ID)
trans_mdl = MT5ForConditionalGeneration.from_pretrained(TRANS_MODEL_ID)
trans_pipe = Text2TextGenerationPipeline(
    model=trans_mdl,
    tokenizer=trans_tokenizer,
)

# Summarization task setup: the tokenizer and the seq2seq model are loaded
# from the same checkpoint at import time.
SUM_MODEL_ID = "csebuetnlp/mT5_multilingual_XLSum"
sum_tokenizer = AutoTokenizer.from_pretrained(SUM_MODEL_ID)
sum_mdl = AutoModelForSeq2SeqLM.from_pretrained(SUM_MODEL_ID)


# Maps each UI job label to the task prefix prepended to the source text.
# Built once at import time instead of on every call.
_JOB_PREFIXES = {
    "中译日": "zh2ja:",
    "中译英": "zh2en:",
    "日译中": "ja2zh:",
    "英译中": "en2zh:",
    "日译英": "ja2en:",
    "英译日": "en2ja:",
}


def translation_job(job, text):
    """Translate ``text`` according to the selected translation job.

    Args:
        job: A job label that is a key of ``_JOB_PREFIXES`` (e.g. ``"中译英"``).
        text: Source text to translate.

    Returns:
        The ``generated_text`` of the first pipeline result, or ``""`` when
        ``text`` is empty or whitespace-only (the model is not invoked).

    Raises:
        KeyError: If ``job`` is not a known job label (this includes ``None``).
    """
    try:
        prefix = _JOB_PREFIXES[job]
    except KeyError:
        # Same exception type as a plain dict lookup, but with a message that
        # says what went wrong instead of just echoing the key.
        raise KeyError(f"unknown translation job: {job!r}") from None

    # Nothing to translate: skip beam search on a bare task prefix, which
    # would only produce text unrelated to any user input.
    if not text or not text.strip():
        return ""

    prompt = prefix + text
    # Echo the exact prompt sent to the model on stdout.
    print(prompt)
    response = trans_pipe(prompt, max_length=100, num_beams=4)
    return response[0]['generated_text']


# Matches any run of whitespace (spaces, tabs, newlines, ...). Raw string so
# that ``\s`` is a regex class rather than an invalid string escape.
_WHITESPACE_RUN = re.compile(r"\s+")


def sum_job(text):
    """Generate a summary of ``text`` with the summarization model.

    The text is stripped and every run of whitespace (including newlines) is
    collapsed to a single space before tokenization.

    Args:
        text: Article text to summarize.

    Returns:
        The decoded summary string, or ``""`` when ``text`` is empty or
        whitespace-only (the model is not invoked).
    """
    if not text:
        return ""

    # ``\s+`` already covers newlines, so one substitution is enough.
    cleaned = _WHITESPACE_RUN.sub(" ", text.strip())
    if not cleaned:
        # Avoid running generation on an input made only of padding.
        return ""

    input_ids = sum_tokenizer(
        [cleaned],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )["input_ids"]

    # A single sequence is submitted, so take the first generated sequence.
    output_ids = sum_mdl.generate(
        input_ids=input_ids,
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4,
    )[0]

    return sum_tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )


with gr.Blocks() as app:
    # Tab 1: Chinese / English / Japanese translation
    with gr.Tab("中英日三语翻译"):
        job_choices = ["中译日", "中译英", "日译中", "英译中", "日译英", "英译日"]
        job_name = gr.Dropdown(job_choices, label="翻译任务选择", info="单选")
        source_text = gr.Textbox(
            lines=1,
            label="翻译文本",
            placeholder="请输入要翻译的文本",
        )
        trans_result = gr.Textbox(lines=1, label="翻译结果")
        trans_btn = gr.Button("翻译")

    # Tab 2: multilingual automatic summarization
    with gr.Tab("多语言自动摘要"):
        article_text = gr.Textbox(
            lines=8,
            label="待总结文本",
            placeholder="请输入要进行摘要的文本",
        )
        sum_result = gr.Textbox(lines=2, label="摘要结果")
        sum_btn = gr.Button("摘要")

    # Wire each button to its handler once all components exist.
    trans_btn.click(
        fn=translation_job,
        inputs=[job_name, source_text],
        outputs=trans_result,
    )
    sum_btn.click(fn=sum_job, inputs=article_text, outputs=sum_result)

# Start the Gradio server as soon as the module is executed.
app.launch()