import gc
import torch
import nltk
from nltk import sent_tokenize
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer
import language_tool_python
import re

nltk.download("punkt")

GPU_IDX = 1  # which GPU to use (0-indexed)
BATCH_SIZE = 64  # number of sentences to process in one batch

# autodetect the available device
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")
    assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
    device = torch.device(f"cuda:{GPU_IDX}")
    print(f"Using GPU: {GPU_IDX}")
else:
    print("CUDA is not available. Using CPU instead.")
    device = torch.device("cpu")

# ----------------------------
# load encoder-decoder (sequence-to-sequence) language model
# seq2seq = "polygraf-ai/poly-humanizer-XL-merged-v2"
# seq2seq_model = T5ForConditionalGeneration.from_pretrained(seq2seq, torch_dtype=torch.bfloat16).to(device)
# seq2seq_tokenizer = T5Tokenizer.from_pretrained(seq2seq)
# print(f"Loaded model: {seq2seq}, Num. params: {seq2seq_model.num_parameters()}")
# NOTE: the seq2seq path is currently disabled, so the "Standard Model" option
# in humanize_text will fail until the lines above are restored.
seq2seq_model = None
seq2seq_tokenizer = None

# ----------------------------
# load decoder-only (causal) language model
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

# can only use GPU 0 when using unsloth FastLanguageModel
max_seq_length = 2048  # any value can be chosen since RoPE scaling is used
dtype = None  # None for auto-detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
load_in_4bit = True  # use 4-bit quantization to reduce memory usage
dec_only = "polygraf-ai/phi-3-mini-rank-128"
dec_only_model, dec_only_tokenizer = FastLanguageModel.from_pretrained(
    model_name=dec_only,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="cuda:0",
)
FastLanguageModel.for_inference(dec_only_model)  # native 2x faster inference
print(f"Loaded model: {dec_only}, Num. params: {dec_only_model.num_parameters()}")

# grammar correction tool
tool = language_tool_python.LanguageTool("en-US")


def format_and_correct_language_check(text: str) -> str:
    return tool.correct(text)
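
# NOTE: format_and_correct_language_check is defined but not called anywhere in
# this module; presumably the surrounding app invokes it. A plausible (assumed,
# not confirmed) integration point is a final grammar pass over the humanized
# output, e.g.:
#
#     cleaned = format_and_correct_language_check(humanize_text(raw_text))
#
# Running it once on the full output rather than per sentence keeps the number
# of LanguageTool calls small.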


def extract_citations(text):
    # citations are expected to look like "<3>" (see humanize_text below)
    citations = re.findall(r"<(\d+)>", text)
    return [int(citation) for citation in citations]


def remove_citations(text):
    text = re.sub(r"<\d+>", "", text)  # angle-bracket citations, e.g. "<3>"
    text = re.sub(r"\[\d+\]", "", text)  # bracketed citations, e.g. "[3]"
    return text


def humanize_batch_seq2seq(
    model,
    tokenizer,
    sentences,
    temperature,
    repetition_penalty,
    top_k,
    length_penalty,
):
    inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
    inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        max_length=128,
        top_k=top_k,
        length_penalty=length_penalty,
    )
    answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return answers


def humanize_batch_decoder_only(
    model,
    tokenizer,
    sentences,
    temperature,
    repetition_penalty,
    top_k,
    length_penalty,
):
    pre_prompt = (
        "As a humanizer model, your task is to rewrite the following sentence to make it "
        "more human-like. Return only the paraphrased sentence. \n\n"
    )
    # Construct the message batch from the input sentences
    messages_batch = [{"from": "human", "value": f"{pre_prompt}{sentence}"} for sentence in sentences]
    # Initialize the tokenizer with the chat template
    tokenizer = get_chat_template(
        tokenizer,
        chat_template="phi-3",
        mapping={
            "role": "from",
            "content": "value",
            "user": "human",
            "assistant": "gpt",
        },  # ShareGPT style
    )
    # Enable native 2x faster inference
    FastLanguageModel.for_inference(model)
    # Generate a response for each message individually
    responses = []
    for message in messages_batch:
        # Apply the chat template to the individual message
        inputs = tokenizer.apply_chat_template(
            [message],  # wrap the message in a list
            tokenize=True,
            add_generation_prompt=True,  # must add for generation
            return_tensors="pt",
        ).to("cuda")
        # Generate the response for the individual message
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=1024,
            use_cache=True,
            do_sample=True,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            top_k=top_k,
            length_penalty=length_penalty,
        )
        # Decode the output and store it
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=False)
        responses.append(decoded_output[0])
    # Strip the prompt and special tokens, keeping only the assistant's reply
    generated_sentences = []
    for idx, response in enumerate(responses):
        generated_sentence = response.split("<|assistant|>")[1].split("<|end|>")[0].strip()
        generated_sentences.append(generated_sentence)
        print(sentences[idx])
        print(generated_sentence)
        print()
    return generated_sentences
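
# Example call for the decoder-only path (illustrative only; the sentences below
# are made up and the sampling values mirror the defaults of humanize_text
# further down):
#
#     humanize_batch_decoder_only(
#         dec_only_model,
#         dec_only_tokenizer,
#         ["The results demonstrate a significant improvement.",
#          "Further experiments are required to confirm this."],
#         temperature=1.2,
#         repetition_penalty=1.0,
#         top_k=50,
#         length_penalty=1.0,
#     )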


def humanize_text(
    text,
    progress=gr.Progress(),
    model_name="Standard Model",
    temperature=1.2,
    repetition_penalty=1.0,
    top_k=50,
    length_penalty=1.0,
):
    """
    Paraphrase `text` with the selected model. As an optimization, all sentences
    are collected and fed to the model in batches; paragraph structure is
    preserved by recording the number of sentences per paragraph.
    """
    progress(0, desc="Starting to Humanize")

    # Map model names to their respective processing functions
    model_map = {
        "Standard Model": humanize_batch_seq2seq,
        "Advanced Model (Beta)": humanize_batch_decoder_only,
    }
    assert model_name in model_map, f"Invalid model name: {model_name}"
    process_function = model_map[model_name]

    # Split the text into paragraphs and then into sentences,
    # stripping citations and remembering them per paragraph
    paragraphs = text.split("\n")
    all_sentences = []
    sentences_per_paragraph = []
    citations_per_paragraph = []
    for paragraph in paragraphs:
        citations_per_paragraph.append(extract_citations(paragraph))
        paragraph = remove_citations(paragraph)
        sentences = sent_tokenize(paragraph)
        sentences_per_paragraph.append(len(sentences))
        all_sentences.extend(sentences)

    # Process all sentences in batches
    paraphrased_sentences = []
    current_batch_size = BATCH_SIZE
    i = 0
    while i < len(all_sentences):
        try:
            batch_sentences = all_sentences[i : i + current_batch_size]
            # Call the selected processing function
            paraphrased_batch = process_function(
                (seq2seq_model if model_name == "Standard Model" else dec_only_model),
                (seq2seq_tokenizer if model_name == "Standard Model" else dec_only_tokenizer),
                batch_sentences,
                temperature,
                repetition_penalty,
                top_k,
                length_penalty,
            )
            paraphrased_sentences.extend(paraphrased_batch)
            i += current_batch_size  # move to the next batch
            torch.cuda.empty_cache()
            gc.collect()
            progress(i / len(all_sentences))
        except RuntimeError as e:
            if "out of memory" in str(e):
                # Halve the batch size and retry the same batch
                current_batch_size = max(1, current_batch_size // 2)
                print(f"Out of memory, reducing batch size to {current_batch_size}. Retrying...")
                torch.cuda.empty_cache()
                gc.collect()
            else:
                raise e

    # Reconstruct paragraphs from the paraphrased sentences
    humanized_paragraphs = []
    sentence_index = 0
    for num_sentences in sentences_per_paragraph:
        humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
        humanized_paragraphs.append(humanized_paragraph)
        sentence_index += num_sentences

    # Re-append each paragraph's citations at its end
    for i, paragraph in enumerate(humanized_paragraphs):
        citation_texts = [f"<{cid}>" for cid in citations_per_paragraph[i]]
        humanized_paragraphs[i] = paragraph + " " + "".join(citation_texts)

    humanized_text = "\n\n".join(humanized_paragraphs)
    return humanized_text
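

# ----------------------------
# Minimal launch sketch (an assumption: the original Gradio app wiring is not
# part of this file). It exposes humanize_text through a simple gr.Interface.
# "Advanced Model (Beta)" is preselected because the seq2seq "Standard Model"
# load is commented out above. Gradio injects the gr.Progress() argument
# automatically, so the inputs map to the remaining parameters in order;
# repetition_penalty, top_k, and length_penalty keep their defaults.
if __name__ == "__main__":
    demo = gr.Interface(
        fn=humanize_text,
        inputs=[
            gr.Textbox(lines=10, label="Input text"),
            gr.Dropdown(
                choices=["Standard Model", "Advanced Model (Beta)"],
                value="Advanced Model (Beta)",
                label="Model",
            ),
            gr.Slider(minimum=0.5, maximum=2.0, value=1.2, label="Temperature"),
        ],
        outputs=gr.Textbox(lines=10, label="Humanized text"),
        title="Humanizer (sketch)",
    )
    # queue() is required for progress tracking to be displayed
    demo.queue().launch()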