File size: 2,716 Bytes
f0396e5
 
 
 
c27e127
f0396e5
c27e127
f0396e5
c27e127
f0396e5
c27e127
 
 
 
 
 
 
 
 
 
 
f0396e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c27e127
f0396e5
 
 
 
 
 
c27e127
f0396e5
 
 
 
 
 
 
 
c27e127
f0396e5
 
c27e127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0396e5
 
 
 
c27e127
 
f0396e5
 
 
 
c27e127
f0396e5
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""TED Talks Summarizer App."""

from re import sub

from gradio import Interface, Textbox
from requests import get
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Checkpoint identifier handed to ``from_pretrained`` for both the tokenizer
# and the model, so the two always come from the same source.
repo_id = "pszemraj/led-base-book-summary"

tokenizer = AutoTokenizer.from_pretrained(repo_id)

# NOTE(review): ``low_cpu_mem_usage`` is forwarded to transformers as-is;
# presumably it lowers peak RAM while the weights load - confirm in its docs.
model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, low_cpu_mem_usage=True)

# Built once at import time and reused by ``text_summarizer`` on every call.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)


def clean_text(text: str) -> str:
    """Cleans subtitle text of ted talks.

    The first line of the input is always discarded (for a ``.vtt`` subtitle
    file this is the ``WEBVTT`` header), parenthesised annotations such as
    ``(Applause)`` are removed, lines containing ``-->`` (cue timings) are
    dropped and the remaining words are joined with single spaces.

    Args:
        text (str): subtitle of ted talk

    Returns:
        cleaned_text (str): cleaned version of subtitle text
    """
    # Remove every parenthesised annotation on its own. A greedy ``\(.*\)``
    # would swallow the spoken words between two annotations on one line,
    # e.g. "(Laughter) Hello (Applause)" would become empty.
    without_annotations = sub(r"\([^)\n]*\)", "", text)
    # splitlines() copes with both "\n" and "\r\n"; [1:] skips the header line.
    lines = without_annotations.splitlines()[1:]
    # Timing lines are recognised by their "-->" separator.
    spoken = [line for line in lines if "-->" not in line]
    # split() + join drops blank lines and collapses the gaps that removed
    # annotations leave behind, so the result never contains double spaces.
    cleaned_text = " ".join(" ".join(spoken).split())
    return cleaned_text


def ted_talk_transcriber(link: str, *, timeout: float = 30.0) -> str:
    """Creates transcription of ted talks from url.

    Downloads the talk page, extracts the talk id that follows
    ``project_masters/`` in its markup and then downloads the English
    subtitle file (``full.vtt``) for that id.

    Args:
        link (str): url link of ted talks
        timeout (float): seconds to wait on each HTTP request before giving
            up; without it a stalled connection would block forever

    Returns:
        raw_text (str): raw transcription of the ted talk

    Raises:
        ValueError: if the page contains no ``project_masters/<id>/`` marker
        requests.HTTPError: if either request is answered with an error status
    """
    # request link of the talk; fail loudly on 4xx/5xx instead of parsing
    # an error page
    page = get(link.strip(), timeout=timeout)
    page.raise_for_status()

    # extract unique talk id to reach subtitle file
    _, marker, remainder = page.text.partition("project_masters/")
    talk_id = remainder.split("/")[0]
    if not marker or not talk_id:
        raise ValueError(f"Could not find a TED talk id in the page at {link!r}")

    subtitles = get(
        f"https://hls.ted.com/project_masters/{talk_id}/subtitles/en/full.vtt",
        timeout=timeout,
    )
    subtitles.raise_for_status()
    # WebVTT files are UTF-8 by specification; set it explicitly so requests
    # does not fall back to ISO-8859-1 when the server omits a charset.
    subtitles.encoding = "utf-8"
    raw_text = subtitles.text
    return raw_text


def text_summarizer(text: str) -> str:
    """Summarizes given text.

    Runs the module-level ``summarizer`` pipeline on ``text`` with a fixed
    set of generation options and returns the text of the first result.

    Args:
        text (str): ted talks transcription

    Returns:
        str: summary
    """
    # Generation options forwarded unchanged to the pipeline call.
    generation_options = {
        "min_length": 8,
        "max_length": 256,
        "no_repeat_ngram_size": 3,
        "encoder_no_repeat_ngram_size": 3,
        "repetition_penalty": 3.5,
        "num_beams": 4,
        "do_sample": False,
        "early_stopping": True,
    }
    summaries = summarizer(text, **generation_options)
    first_summary = summaries[0]
    return first_summary["summary_text"]


def main(link: str) -> str:
    """Summarizes ted talks given link.

    Chains the three steps of the app: download the subtitles for ``link``,
    clean them, and summarize the cleaned transcript.

    Args:
        link (str): url link of ted talks

    Returns:
        str: summary
    """
    return text_summarizer(clean_text(ted_talk_transcriber(link)))


# HTML snippet used as the interface description; it points at file/TED.png.
logo = "<center><img src='file/TED.png' width=180px></center>"

# One text box in (the talk link), one text box out (the summary), wired to
# ``main``; the interface is launched as soon as the module is executed.
demo = Interface(
    main,
    inputs=Textbox(label="Type the TED Talks link"),
    outputs=Textbox(label="Summary"),
    examples=[
        "https://www.ted.com/talks/jen_gunter_the_truth_about_yeast_in_your_body",
    ],
    description=logo,
    allow_flagging="never",
)
demo.launch()