# app.py — Gradio app: extract an article's abstract from a PDF, summarize it,
# and read the summary aloud via text-to-speech.
import torch
import PyPDF2
import gradio as gr
from IPython.display import Audio, display
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import numpy as np
import scipy
from gtts import gTTS
from io import BytesIO
def extract_text(article):
    """Return the text of the first page of the given PDF file.

    Parameters
    ----------
    article : file-like or path
        PDF document accepted by ``PyPDF2.PdfReader``.

    Returns
    -------
    str
        Raw text extracted from page 0.
    """
    reader = PyPDF2.PdfReader(article)
    first_page = reader.pages[0]
    return first_page.extract_text()
def summarize_abstract(text):
    """Summarize the abstract section of an article's extracted text.

    Splits *text* into sentences, takes the seven sentences that follow the
    first sentence containing "Abstract", and runs them through the
    ``pszemraj/led-base-book-summary`` seq2seq model.

    Parameters
    ----------
    text : str
        Full text of the article's first page.

    Returns
    -------
    str
        Model-generated summary, trimmed at its last full stop.
    """
    sentences = text.split(". ")
    # Locate the "Abstract" marker and summarize what follows it.
    # Fall back to the start of the text when no marker is present —
    # the original left `start` unbound in that case and raised NameError.
    start = 0
    for i, sentence in enumerate(sentences):
        if "Abstract" in sentence:
            start = i + 1
            break
    end = start + 6
    abstract = ". ".join(sentences[start:end + 1])

    tokenizer = AutoTokenizer.from_pretrained("pszemraj/led-base-book-summary")
    model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/led-base-book-summary")

    # Tokenize abstract (truncated to the model's 1024-token input limit).
    inputs = tokenizer(abstract, max_length=1024, return_tensors="pt", truncation=True)

    # Generate summary. NOTE(review): do_sample=True makes output
    # non-deterministic — confirm this is intended.
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=50,
        min_length=30,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=3.5,
        num_beams=4,
        do_sample=True,
        early_stopping=False,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Drop any trailing partial sentence: keep text up to the last period.
    # (str.rindex raises ValueError rather than returning -1, so the "in"
    # membership test is the correct guard; the original's dead "!= -1"
    # check is removed.)
    if "." in summary:
        summary = summary[: summary.rindex(".") + 1]
    return summary
def abstract_to_audio(text):
    """Convert *text* to English speech and return the audio as raw bytes.

    Parameters
    ----------
    text : str
        Text to synthesize.

    Returns
    -------
    bytes
        MP3 audio produced by gTTS.
    """
    speech = gTTS(text, lang='en')
    audio_buffer = BytesIO()
    speech.write_to_fp(audio_buffer)
    # Rewind before reading so the full stream is returned.
    audio_buffer.seek(0)
    return audio_buffer.read()
def abstract_audio(article):
    """End-to-end pipeline: PDF -> abstract summary -> spoken audio.

    Parameters
    ----------
    article : file-like or path
        PDF document to process.

    Returns
    -------
    tuple[str, bytes]
        The text summary and its synthesized audio.
    """
    page_text = extract_text(article)
    summary_text = summarize_abstract(page_text)
    return summary_text, abstract_to_audio(summary_text)
# --- Gradio UI --------------------------------------------------------------
# Build the interface from the pre-declared components. (The original
# constructed these three components and then instantiated unused duplicates
# inline in gr.Interface.)
inputs = gr.File()
summary_text = gr.Text()
audio_summary = gr.Audio()

myApp = gr.Interface(
    fn=abstract_audio,
    inputs=inputs,
    outputs=[summary_text, audio_summary],
    title="Summary of Abstract to Audio ",
    # Fixed user-facing text: "Article\Journal" rendered a literal backslash,
    # and "helps you summarises" was ungrammatical.
    description=(
        "An App that helps you summarise the abstract of an Article/Journal "
        "and gives the audio of the summary"
    ),
    # NOTE(review): /content/... is a Colab-specific path — confirm the example
    # file exists in this deployment.
    examples=["/content/NIPS-2015-hidden-technical-debt-in-machine-learning-systems-Paper.pdf"],
)
myApp.launch()