# article_writer/humanize.py
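# Paraphrases ("humanizes") text sentence by sentence with a fine-tuned language model,
# preserving inline citation markers such as <1>, and exposes the batched helpers used
# by the Gradio article-writer app.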
import gc
import torch
import nltk
from nltk import sent_tokenize
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer
import language_tool_python
import re
nltk.download("punkt")
GPU_IDX = 1 # which GPU to use, starts from 0
BATCH_SIZE = 64 # number of sentences to process in one batch
# autodetect the available device
if torch.cuda.is_available():
num_gpus = torch.cuda.device_count()
print(f"Number of available GPUs: {num_gpus}")
assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
device = torch.device(f"cuda:{GPU_IDX}")
print(f"Using GPU: {GPU_IDX}")
else:
print("CUDA is not available. Using CPU instead.")
device = torch.device("cpu")
# ----------------------------
# load encoder-decoder (sequence to sequence) language model
# seq2seq = "polygraf-ai/poly-humanizer-XL-merged-v2"
# seq2seq_model = T5ForConditionalGeneration.from_pretrained(seq2seq, torch_dtype=torch.bfloat16).to(device)
# seq2seq_tokenizer = T5Tokenizer.from_pretrained(seq2seq)
# print(f"Loaded model: {seq2seq}, Num. params: {seq2seq_model.num_parameters()}")
# The seq2seq "Standard Model" is currently disabled (its weights are commented out above),
# so these stay None and only the decoder-only model below is served.
seq2seq_model = None
seq2seq_tokenizer = None
# ----------------------------
# load decoder-only (causal) language model
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
# can only use GPU 0 when using unsloth FastLanguageModel
max_seq_length = 2048  # any value can be chosen since RoPE scaling is used
dtype = None  # None for auto detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage
dec_only = "polygraf-ai/phi-3-mini-rank-128"
dec_only_model, dec_only_tokenizer = FastLanguageModel.from_pretrained(
model_name=dec_only,
max_seq_length=max_seq_length,
dtype=dtype,
load_in_4bit=load_in_4bit,
device_map="cuda:0",
)
FastLanguageModel.for_inference(dec_only_model) # native 2x faster inference
print(f"Loaded model: {dec_only}, Num. params: {dec_only_model.num_parameters()}")
# grammar correction tool
tool = language_tool_python.LanguageTool("en-US")
def format_and_correct_language_check(text: str) -> str:
return tool.correct(text)
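# Illustrative: tool.correct("He go to school.") would typically return "He goes to school."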
def extract_citations(text):
citations = re.findall(r"<(\d+)>", text)
return [int(citation) for citation in citations]
def remove_citations(text):
    # strip citation markers like <3> and [3] while leaving other digits in the text untouched
    text = re.sub(r"<\d+>", "", text)
    text = re.sub(r"\[\d+\]", "", text)
    return text
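# Illustrative behaviour of the citation helpers:
#   extract_citations("Cats purr <1><2>.") -> [1, 2]
#   remove_citations("Cats purr <1>.")     -> "Cats purr ."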
def humanize_batch_seq2seq(
model,
tokenizer,
sentences,
temperature,
repetition_penalty,
top_k,
length_penalty,
):
inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
outputs = model.generate(
**inputs,
do_sample=True,
temperature=temperature,
repetition_penalty=repetition_penalty,
max_length=128,
top_k=top_k,
length_penalty=length_penalty,
)
answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
return answers
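# Illustrative call (assumes the commented-out seq2seq weights above are loaded):
#   humanize_batch_seq2seq(seq2seq_model, seq2seq_tokenizer,
#                          ["The experiment was conducted twice."],
#                          temperature=1.2, repetition_penalty=1.0, top_k=50, length_penalty=1.0)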
def humanize_batch_decoder_only(
model,
tokenizer,
sentences,
temperature,
repetition_penalty,
top_k,
length_penalty,
):
pre_prompt = "As a humanizer model, your task is to rewrite the following sentence to make it more human-like. Return only the paraphrased sentence. \n\n"
    # Build the ShareGPT-style message batch: one "human" turn per input sentence
    messages_batch = [{"from": "human", "value": f"{pre_prompt}{sentence}"} for sentence in sentences]
# Initialize the tokenizer with the chat template
tokenizer = get_chat_template(
tokenizer,
chat_template="phi-3",
mapping={
"role": "from",
"content": "value",
"user": "human",
"assistant": "gpt",
}, # ShareGPT style
)
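    # The phi-3 template wraps each turn roughly as "<|user|> ... <|end|> <|assistant|>",
    # which is why the generated text is recovered below by splitting on <|assistant|> and <|end|>.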
# Enable native 2x faster inference
FastLanguageModel.for_inference(model)
# Initialize an empty list to store responses
responses = []
# Process each message individually
for message in messages_batch:
# Apply the chat template to the individual message
inputs = tokenizer.apply_chat_template(
[message], # Wrap the message in a list
tokenize=True,
add_generation_prompt=True, # Must add for generation
return_tensors="pt",
).to("cuda")
# Generate the response for the individual message
outputs = model.generate(
input_ids=inputs,
max_new_tokens=1024,
use_cache=True,
do_sample=True,
temperature=temperature,
repetition_penalty=repetition_penalty,
top_k=top_k,
length_penalty=length_penalty,
)
# Decode the output and store it
decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=False)
responses.append(decoded_output[0])
    # Extract the assistant turn from each raw decoded response
    generated_sentences = []
for idx, response in enumerate(responses):
generated_sentence = response.split("<|assistant|>")[1].split("<|end|>")[0].strip()
generated_sentences.append(generated_sentence)
print(sentences[idx])
print(generated_sentence)
print()
return generated_sentences
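# Illustrative call (uses the decoder-only model loaded above):
#   humanize_batch_decoder_only(dec_only_model, dec_only_tokenizer,
#                               ["The experiment was conducted twice."],
#                               temperature=1.2, repetition_penalty=1.0, top_k=50, length_penalty=1.0)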
def humanize_text(
text,
progress=gr.Progress(),
model_name="Standard Model",
temperature=1.2,
repetition_penalty=1.0,
top_k=50,
length_penalty=1.0,
):
"""
Optimization here is to feed all sentences at once to the model.
Paragraphs are stored as a number of sentences per paragraph.
"""
progress(0, desc="Starting to Humanize")
# Map model names to their respective processing functions
model_map = {
"Standard Model": humanize_batch_seq2seq,
"Advanced Model (Beta)": humanize_batch_decoder_only,
}
    assert model_name in model_map, f"Invalid model name: {model_name}"
    if model_name == "Standard Model" and seq2seq_model is None:
        raise ValueError("The seq2seq 'Standard Model' is currently disabled; use 'Advanced Model (Beta)'.")
    process_function = model_map[model_name]
# Split the text into paragraphs and then into sentences
paragraphs = text.split("\n")
all_sentences = []
sentences_per_paragraph = []
citations_per_paragraph = []
for paragraph in paragraphs:
citations_per_paragraph.append(extract_citations(paragraph))
paragraph = remove_citations(paragraph)
sentences = sent_tokenize(paragraph)
sentences_per_paragraph.append(len(sentences))
all_sentences.extend(sentences)
# Process all sentences in batches
paraphrased_sentences = []
current_batch_size = BATCH_SIZE
i = 0
while i < len(all_sentences):
try:
batch_sentences = all_sentences[i : i + current_batch_size]
# Call the selected processing function
paraphrased_batch = process_function(
(seq2seq_model if model_name == "Standard Model" else dec_only_model),
(seq2seq_tokenizer if model_name == "Standard Model" else dec_only_tokenizer),
batch_sentences,
temperature,
repetition_penalty,
top_k,
length_penalty,
)
paraphrased_sentences.extend(paraphrased_batch)
i += current_batch_size # Move to the next batch
torch.cuda.empty_cache()
gc.collect()
            progress(i / len(all_sentences), desc="Humanizing")
except RuntimeError as e:
if "out of memory" in str(e):
# Reduce the batch size by half and retry
current_batch_size = max(1, current_batch_size // 2)
print(f"Out of memory, reducing batch size to {current_batch_size}. Retrying...")
torch.cuda.empty_cache()
gc.collect()
else:
raise e
# Reconstruct paragraphs
humanized_paragraphs = []
sentence_index = 0
for num_sentences in sentences_per_paragraph:
humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
humanized_paragraphs.append(humanized_paragraph)
sentence_index += num_sentences
for i, paragraph in enumerate(humanized_paragraphs):
citation_texts = [f"<{cid}>" for cid in citations_per_paragraph[i]]
humanized_paragraphs[i] = paragraph + " " + "".join(citation_texts)
humanized_text = "\n\n".join(humanized_paragraphs)
return humanized_text
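
# Minimal smoke test (illustrative sample text; requires the decoder-only model loaded above).
if __name__ == "__main__":
    sample = (
        "Large language models can paraphrase text fluently. <1>\n"
        "They are trained on large text corpora. <2>"
    )
    print(humanize_text(sample, model_name="Advanced Model (Beta)"))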