remzicam's picture
Update app.py
5d8a17a
raw
history blame
1.82 kB
"""TED Talks Summarizer App."""
from re import sub
from gradio import Interface, Series, Textbox
from requests import get
def clean_text(text):
    """Cleans subtitle text of ted talks.

    Removes parenthesised annotations such as "(Applause)", drops the first
    line of the file and every timestamp line, and joins the remaining
    caption text into a single space-separated string.

    Args:
        text (str): subtitle of ted talk

    Returns:
        cleaned_text (str): cleaned version of subtitle text
    """
    # Remove strings inside parentheses (e.g. applause). The match is
    # non-greedy so that two annotations on the same line do not also swallow
    # the spoken text between them.
    text = sub(r"\(.*?\)", "", text)
    # Split into lines and skip the first one (the header line of the file).
    lines = text.split("\n")[1:]
    # Timestamp lines are recognised by the "-->" separator they contain.
    caption_lines = [line for line in lines if "-->" not in line]
    # Splitting on whitespace and re-joining drops empty lines, stray "\r"
    # from CRLF files and the gaps left behind by removed annotations.
    cleaned_text = " ".join(" ".join(caption_lines).split())
    return cleaned_text
# Upper bound (in seconds) for each HTTP request so that a stalled server
# cannot hang the app indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30


def ted_talk_transcriber(link):
    """Creates transcription of ted talks from url.

    Downloads the talk page, extracts the talk id that follows
    "project_masters/" in the page source, downloads the English subtitle
    file for that id and returns its cleaned text.

    Args:
        link (str): url link of ted talks

    Returns:
        cleaned_transcript (str): transcription of the ted talk

    Raises:
        ValueError: if the page source contains no "project_masters/" marker
        requests.HTTPError: if the page or subtitle request returns an
            error status
    """
    marker = "project_masters/"

    # request link of the talk
    page = get(link, timeout=_REQUEST_TIMEOUT_SECONDS)
    page.raise_for_status()
    # Decode explicitly instead of searching the repr of the raw bytes; the
    # id itself is plain ASCII, so undecodable bytes elsewhere are harmless.
    html = page.content.decode("utf-8", errors="replace")

    # extract unique talk id to reach subtitle file
    _, found, tail = html.partition(marker)
    if not found:
        raise ValueError(
            f"Could not find a talk id ('{marker}') in the page at {link!r}"
        )
    talk_id = tail.split("/")[0]

    subtitles = get(
        f"https://hls.ted.com/project_masters/{talk_id}/subtitles/en/full.vtt",
        timeout=_REQUEST_TIMEOUT_SECONDS,
    )
    subtitles.raise_for_status()
    # WebVTT files are UTF-8; set it explicitly so requests does not fall back
    # to ISO-8859-1 when the response carries no charset.
    subtitles.encoding = "utf-8"
    raw_text = subtitles.text

    cleaned_transcript = clean_text(raw_text)
    return cleaned_transcript
# HTML snippet shown as the app description (centered TED logo).
logo = "<center><img src='file/TED.png' width=180px></center>"

# First stage: text in (talk url) -> text out (cleaned transcript).
transcriber = Interface(
    fn=ted_talk_transcriber,
    inputs="text",
    outputs="text",
)

# Second stage: summarization model loaded from the Hugging Face hub.
summarizer = Interface.load(
    "huggingface/Shobhank-iiitdwd/long-t5-tglobal-base-16384-book-summary"
)

# Chain both stages so the transcript is fed straight into the summarizer.
app = Series(
    transcriber,
    summarizer,
    inputs=Textbox(label="Type the TED Talks link"),
    examples=[
        "https://www.ted.com/talks/jen_gunter_the_truth_about_yeast_in_your_body"
    ],
    allow_flagging="never",
    description=logo,
)
app.launch()