import torch
import PyPDF2
import gradio as gr
from IPython.display import Audio, display
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import numpy as np
import scipy
from gtts import gTTS
from io import BytesIO


def extract_text(article):
    """Return the extracted text of the FIRST page of a PDF file.

    :param article: path or file-like object accepted by ``PyPDF2.PdfReader``.
    :return: plain-text content of page 0.
    """
    pdf_reader = PyPDF2.PdfReader(article)
    return pdf_reader.pages[0].extract_text()


def summarize_abstract(text):
    """Locate the abstract inside *text* and return a short generated summary.

    Takes the ~7 sentences following the first sentence containing the word
    "Abstract" and summarizes them with the LED book-summary checkpoint.

    :param text: full plain text of (at least) the article's first page.
    :return: summary string, trimmed at its last complete sentence.
    """
    sentences = text.split(". ")

    # Fall back to the start of the document when no "Abstract" marker is
    # found — the original code left ``start`` unbound and raised NameError.
    start = 0
    for i, sentence in enumerate(sentences):
        if "Abstract" in sentence:
            start = i + 1
            break
    end = start + 6
    abstract = ". ".join(sentences[start:end + 1])

    tokenizer = AutoTokenizer.from_pretrained("pszemraj/led-base-book-summary")
    model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/led-base-book-summary")

    # Tokenize the abstract (truncated to the model's comfortable length).
    inputs = tokenizer(abstract, max_length=1024, return_tensors="pt", truncation=True)

    # Generate the summary. NOTE(review): do_sample=True makes the output
    # nondeterministic even with beam search — confirm this is intentional.
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=50,
        min_length=30,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=3.5,
        num_beams=4,
        do_sample=True,
        early_stopping=False,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Drop any trailing partial sentence after the last full stop.
    # (str.rindex raises ValueError rather than returning -1, so the old
    # ``if index != -1`` guard was dead code; the ``in`` check suffices.)
    if "." in summary:
        summary = summary[: summary.rindex(".") + 1]
    return summary


def abstract_to_audio(text):
    """Synthesize *text* to MP3 speech with gTTS and return the raw bytes.

    :param text: English text to speak.
    :return: MP3 data as ``bytes`` (gr.Audio accepts raw audio bytes —
        presumably interpreted by filetype sniffing; verify against the
        installed Gradio version).
    """
    tts = gTTS(text, lang="en")
    buffer = BytesIO()
    tts.write_to_fp(buffer)
    buffer.seek(0)
    return buffer.read()


def abstract_audio(article):
    """Gradio handler: PDF file -> (summary text, spoken-summary audio)."""
    text = extract_text(article)
    summary = summarize_abstract(text)
    audio = abstract_to_audio(summary)
    return summary, audio


myApp = gr.Interface(
    fn=abstract_audio,
    inputs=gr.File(),
    outputs=[gr.Text(), gr.Audio()],
    title="Summary of Abstract to Audio ",
    # Fixed grammar and the "Article\Journal" backslash typo in the UI text.
    description=(
        "An App that helps you summarise the abstract of an Article/Journal "
        "and gives the audio of the summary"
    ),
    examples=["/content/NIPS-2015-hidden-technical-debt-in-machine-learning-systems-Paper.pdf"],
)

# Guard the launch so importing this module does not start the server.
if __name__ == "__main__":
    myApp.launch()