Spaces: Runtime error
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import pickle
import torch
import tensorflow as tf
from tensorflow.python.lib.io import file_io
from nltk.tokenize import sent_tokenize
import io
import nltk
from finbert_embedding.embedding import FinbertEmbedding
import pandas as pd
from nltk.cluster import KMeansClusterer
import numpy as np
import os
from scipy.spatial import distance_matrix

tf.compat.v1.disable_eager_execution()

# Let's load the Pegasus model and tokenizer fine-tuned for financial summarization
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model2 = PegasusForConditionalGeneration.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

nltk.download('punkt')
def pegasus(text):
    '''Obtain an abstractive summary for the input text.
    Returns the summarized document as a string.'''
    # Write the input text to /tmp and read it back (kept from the original flow)
    data_path = "/tmp/"
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    input_ = "/tmp/input.txt"
    with open(input_, "w") as file:
        file.write(text)
    # Read the written txt back into a variable
    with open(input_, 'r') as f:
        text_ = f.read()
    def tokenized_sentences(file):
        '''Split the text into chunks of sentences that fit the model input.
        Returns the list of chunks and the number of sentences in the last chunk.'''
        # Create empty arrays
        tokenized_sentences = []
        sentences = []
        length = 0
        for sentence in sent_tokenize(file):
            length += len(sentence)
            # 512 is the maximum input length for the Pegasus model
            # (character count is used here as a rough proxy for token count)
            if length < 512:
                sentences.append(sentence)
            else:
                tokenized_sentences.append(sentences)
                sentences = [sentence]
                length = len(sentence)
        sentences = [sentence.strip() for sentence in sentences]
        size = len(sentences)
        # Append the remaining sentences
        if sentences:
            tokenized_sentences.append(sentences)
        # Also return size: it was a local of this function but is needed
        # below to set the generation length
        return tokenized_sentences, size

    tokenized, size = tokenized_sentences(text_)
    # Use GPU if available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model2.to(device)

    # Collect the summaries of all chunks
    summary = []
    if size <= 4:
        max_length = size
    else:
        max_length = size // 4

    # Encode each chunk, generate an abstractive summary, then decode it
    for token in tokenized:
        # Encoding
        inputs = tokenizer.encode(' '.join(token), truncation=True, return_tensors='pt')
        # Use CPU or GPU
        inputs = inputs.to(device)
        # Get summaries from the transformer model
        all_summary = model.generate(inputs, do_sample=True,
                                     max_length=max_length, top_k=50, top_p=0.95,
                                     num_beams=5, early_stopping=True)
        # Other generation settings tried: num_return_sequences=5,
        # length_penalty=0.2, no_repeat_ngram_size=2, min_length=10, max_length=50
        # Decoding
        output = [tokenizer.decode(each_summary, skip_special_tokens=True,
                                   clean_up_tokenization_spaces=False)
                  for each_summary in all_summary]
        # Append each output to the array
        summary.append(output)

    # Get the final summary
    summary = [sentence for each in summary for sentence in each]
    final = "".join(summary)
    return final
import gradio as gr

# gr.inputs / gr.outputs are removed in recent Gradio releases; the components
# are used directly instead.
interface1 = gr.Interface(fn=pegasus,
                          inputs=gr.Textbox(lines=15, placeholder="Enter your text !!",
                                            label='Input-10k Sections'),
                          outputs=gr.Textbox(label='Output- Pegasus'))
interface1.launch()
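For local debugging, a minimal sanity check (a sketch, not part of the original Space) is to call pegasus() directly on a short paragraph, either in a separate Python shell or placed before the launch() call; sample_text below is an assumed placeholder.

# Quick local check of the summarization pipeline (assumed sample_text)
sample_text = (
    "The company reported quarterly revenue of 5.2 billion dollars, up 12% year over year. "
    "Operating margin improved as cost controls took effect, and management raised "
    "full-year guidance while announcing a new share buyback program."
)
print(pegasus(sample_text))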