import gc
import torch
import nltk
from nltk import sent_tokenize
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer
import language_tool_python
import re

nltk.download("punkt")

GPU_IDX = 1  # which GPU to use, starts from 0
BATCH_SIZE = 64  # number of sentences to process in one batch

# autodetect the available device
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")
    assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
    device = torch.device(f"cuda:{GPU_IDX}")
    print(f"Using GPU: {GPU_IDX}")
else:
    print("CUDA is not available. Using CPU instead.")
    device = torch.device("cpu")
# ----------------------------
# load encoder-decoder (sequence-to-sequence) language model
# NOTE: loading is currently commented out, so the "Standard Model" option below
# will not work until these lines are restored.
# seq2seq = "polygraf-ai/poly-humanizer-XL-merged-v2"
# seq2seq_model = T5ForConditionalGeneration.from_pretrained(seq2seq, torch_dtype=torch.bfloat16).to(device)
# seq2seq_tokenizer = T5Tokenizer.from_pretrained(seq2seq)
# print(f"Loaded model: {seq2seq}, Num. params: {seq2seq_model.num_parameters()}")
seq2seq_model = None
seq2seq_tokenizer = None
# ----------------------------
# load decoder-only (causal) language model
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

# can only use GPU 0 when using unsloth FastLanguageModel
max_seq_length = 2048  # any value can be chosen since RoPE scaling is used
dtype = None  # None for auto-detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
load_in_4bit = True  # use 4-bit quantization to reduce memory usage

dec_only = "polygraf-ai/phi-3-mini-rank-128"
dec_only_model, dec_only_tokenizer = FastLanguageModel.from_pretrained(
    model_name=dec_only,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="cuda:0",
)
FastLanguageModel.for_inference(dec_only_model)  # native 2x faster inference
print(f"Loaded model: {dec_only}, Num. params: {dec_only_model.num_parameters()}")

# grammar correction tool
tool = language_tool_python.LanguageTool("en-US")
def format_and_correct_language_check(text: str) -> str:
    return tool.correct(text)


def extract_citations(text):
    # citations are marked as <N> in the text, e.g. "<12>"
    citations = re.findall(r"<(\d+)>", text)
    return [int(citation) for citation in citations]


def remove_citations(text):
    text = re.sub(r"<\d+>", "", text)
    text = re.sub(r"\[\d+\]", "", text)  # also drop bracketed citations such as [12]
    return text
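# Illustrative example (assumed input format, not part of the original flow):
# for the string "Transformers are effective <3>.", extract_citations returns [3]
# and remove_citations returns "Transformers are effective ." with the marker stripped.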
def humanize_batch_seq2seq(
    model,
    tokenizer,
    sentences,
    temperature,
    repetition_penalty,
    top_k,
    length_penalty,
):
    inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
    inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        max_length=128,
        top_k=top_k,
        length_penalty=length_penalty,
    )
    answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return answers
def humanize_batch_decoder_only(
    model,
    tokenizer,
    sentences,
    temperature,
    repetition_penalty,
    top_k,
    length_penalty,
):
    pre_prompt = "As a humanizer model, your task is to rewrite the following sentence to make it more human-like. Return only the paraphrased sentence. \n\n"
    # Construct the messages batch from the input sentences
    messages_batch = [{"from": "human", "value": f"{pre_prompt}{sentence}"} for sentence in sentences]
    # Initialize the tokenizer with the chat template
    tokenizer = get_chat_template(
        tokenizer,
        chat_template="phi-3",
        mapping={
            "role": "from",
            "content": "value",
            "user": "human",
            "assistant": "gpt",
        },  # ShareGPT style
    )
    # Enable native 2x faster inference
    FastLanguageModel.for_inference(model)
    # Collect the generated responses
    responses = []
    # Process each message individually
    for message in messages_batch:
        # Apply the chat template to the individual message
        inputs = tokenizer.apply_chat_template(
            [message],  # wrap the message in a list
            tokenize=True,
            add_generation_prompt=True,  # must be added for generation
            return_tensors="pt",
        ).to("cuda")
        # Generate the response for the individual message
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=1024,
            use_cache=True,
            do_sample=True,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            top_k=top_k,
            length_penalty=length_penalty,
        )
        # Decode the output and store it
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=False)
        responses.append(decoded_output[0])
    # Extract the generated sentence from each response
    generated_sentences = []
    for idx, response in enumerate(responses):
        generated_sentence = response.split("<|assistant|>")[1].split("<|end|>")[0].strip()
        generated_sentences.append(generated_sentence)
        print(sentences[idx])
        print(generated_sentence)
        print()
    return generated_sentences
def humanize_text(
    text,
    progress=gr.Progress(),
    model_name="Standard Model",
    temperature=1.2,
    repetition_penalty=1.0,
    top_k=50,
    length_penalty=1.0,
):
    """
    Paraphrase a block of text sentence by sentence.

    The optimization here is to feed sentences to the model in batches.
    Paragraph boundaries are remembered as the number of sentences per paragraph
    so the output can be reassembled afterwards.
    """
    progress(0, desc="Starting to Humanize")
    # Map model names to their respective processing functions
    model_map = {
        "Standard Model": humanize_batch_seq2seq,
        "Advanced Model (Beta)": humanize_batch_decoder_only,
    }
    assert model_name in model_map, f"Invalid model name: {model_name}"
    process_function = model_map[model_name]

    # Split the text into paragraphs and then into sentences
    paragraphs = text.split("\n")
    all_sentences = []
    sentences_per_paragraph = []
    citations_per_paragraph = []
    for paragraph in paragraphs:
        citations_per_paragraph.append(extract_citations(paragraph))
        paragraph = remove_citations(paragraph)
        sentences = sent_tokenize(paragraph)
        sentences_per_paragraph.append(len(sentences))
        all_sentences.extend(sentences)

    # Process all sentences in batches
    paraphrased_sentences = []
    current_batch_size = BATCH_SIZE
    i = 0
    while i < len(all_sentences):
        try:
            batch_sentences = all_sentences[i : i + current_batch_size]
            # Call the selected processing function
            paraphrased_batch = process_function(
                (seq2seq_model if model_name == "Standard Model" else dec_only_model),
                (seq2seq_tokenizer if model_name == "Standard Model" else dec_only_tokenizer),
                batch_sentences,
                temperature,
                repetition_penalty,
                top_k,
                length_penalty,
            )
            paraphrased_sentences.extend(paraphrased_batch)
            i += current_batch_size  # move to the next batch
            torch.cuda.empty_cache()
            gc.collect()
            progress(min(i / len(all_sentences), 1.0), desc="Humanizing")
        except RuntimeError as e:
            if "out of memory" in str(e):
                # Reduce the batch size by half and retry the same batch
                current_batch_size = max(1, current_batch_size // 2)
                print(f"Out of memory, reducing batch size to {current_batch_size}. Retrying...")
                torch.cuda.empty_cache()
                gc.collect()
            else:
                raise

    # Reconstruct paragraphs
    humanized_paragraphs = []
    sentence_index = 0
    for num_sentences in sentences_per_paragraph:
        humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
        humanized_paragraphs.append(humanized_paragraph)
        sentence_index += num_sentences

    # Re-attach the citations that were stripped from each paragraph
    for i, paragraph in enumerate(humanized_paragraphs):
        citation_texts = [f"<{cid}>" for cid in citations_per_paragraph[i]]
        humanized_paragraphs[i] = paragraph + " " + "".join(citation_texts)

    humanized_text = "\n\n".join(humanized_paragraphs)
    return humanized_text
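# ----------------------------
# Minimal sketch of how humanize_text could be wired into a Gradio UI.
# This is an illustrative assumption, not part of the original Space code:
# component choices, labels, and the title below are hypothetical. Gradio
# injects the gr.Progress parameter itself, so only text and model_name
# need input components.
if __name__ == "__main__":
    demo = gr.Interface(
        fn=humanize_text,
        inputs=[
            gr.Textbox(lines=10, label="Input text"),
            gr.Radio(
                choices=["Standard Model", "Advanced Model (Beta)"],
                value="Advanced Model (Beta)",  # the seq2seq "Standard Model" is currently disabled above
                label="Model",
            ),
        ],
        outputs=gr.Textbox(lines=10, label="Humanized text"),
        title="Humanizer",
    )
    demo.launch()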