import gc
import torch
import nltk
from nltk import sent_tokenize
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer
import language_tool_python
import re

nltk.download("punkt")

GPU_IDX = 1  # which GPU to use, starts from 0
BATCH_SIZE = 64  # number of sentences to process in one batch

# autodetect the available device
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")
    assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
    device = torch.device(f"cuda:{GPU_IDX}")
    print(f"Using GPU: {GPU_IDX}")
else:
    print("CUDA is not available. Using CPU instead.")
    device = torch.device("cpu")
# ----------------------------
# load encoder-decoder (sequence-to-sequence) language model
# NOTE: loading is currently commented out, so the "Standard Model" option below
# will not work until these lines are restored.
# seq2seq = "polygraf-ai/poly-humanizer-XL-merged-v2"
# seq2seq_model = T5ForConditionalGeneration.from_pretrained(seq2seq, torch_dtype=torch.bfloat16).to(device)
# seq2seq_tokenizer = T5Tokenizer.from_pretrained(seq2seq)
# print(f"Loaded model: {seq2seq}, Num. params: {seq2seq_model.num_parameters()}")
seq2seq_model = None
seq2seq_tokenizer = None
# ----------------------------
# load decoder-only (causal) language model
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

# can only use GPU 0 when using unsloth FastLanguageModel
max_seq_length = 2048  # any value can be chosen since RoPE scaling is used
dtype = None  # None for auto-detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
load_in_4bit = True  # use 4-bit quantization to reduce memory usage

dec_only = "polygraf-ai/phi-3-mini-rank-128"
dec_only_model, dec_only_tokenizer = FastLanguageModel.from_pretrained(
    model_name=dec_only,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="cuda:0",
)
FastLanguageModel.for_inference(dec_only_model)  # native 2x faster inference
print(f"Loaded model: {dec_only}, Num. params: {dec_only_model.num_parameters()}")

# grammar correction tool
tool = language_tool_python.LanguageTool("en-US")
def format_and_correct_language_check(text: str) -> str:
    return tool.correct(text)


def extract_citations(text):
    # citations are marked as <N> in the text, e.g. "<12>"
    citations = re.findall(r"<(\d+)>", text)
    return [int(citation) for citation in citations]


def remove_citations(text):
    text = re.sub(r"<\d+>", "", text)
    text = re.sub(r"\[\d+\]", "", text)  # also drop bracketed citations such as [12]
    return text
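# Illustrative example (assumed input format, not part of the original flow):
# for the string "Transformers are effective <3>.", extract_citations returns [3]
# and remove_citations returns "Transformers are effective ." with the marker stripped.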
def humanize_batch_seq2seq(
    model,
    tokenizer,
    sentences,
    temperature,
    repetition_penalty,
    top_k,
    length_penalty,
):
    inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
    inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        max_length=128,
        top_k=top_k,
        length_penalty=length_penalty,
    )
    answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return answers
def humanize_batch_decoder_only(
    model,
    tokenizer,
    sentences,
    temperature,
    repetition_penalty,
    top_k,
    length_penalty,
):
    pre_prompt = "As a humanizer model, your task is to rewrite the following sentence to make it more human-like. Return only the paraphrased sentence. \n\n"
    # Construct the messages batch from the input sentences
    messages_batch = [{"from": "human", "value": f"{pre_prompt}{sentence}"} for sentence in sentences]
    # Initialize the tokenizer with the chat template
    tokenizer = get_chat_template(
        tokenizer,
        chat_template="phi-3",
        mapping={
            "role": "from",
            "content": "value",
            "user": "human",
            "assistant": "gpt",
        },  # ShareGPT style
    )
    # Enable native 2x faster inference
    FastLanguageModel.for_inference(model)
    # Collect the generated responses
    responses = []
    # Process each message individually
    for message in messages_batch:
        # Apply the chat template to the individual message
        inputs = tokenizer.apply_chat_template(
            [message],  # wrap the message in a list
            tokenize=True,
            add_generation_prompt=True,  # must be added for generation
            return_tensors="pt",
        ).to("cuda")
        # Generate the response for the individual message
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=1024,
            use_cache=True,
            do_sample=True,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            top_k=top_k,
            length_penalty=length_penalty,
        )
        # Decode the output and store it
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=False)
        responses.append(decoded_output[0])
    # Extract the generated sentence from each response
    generated_sentences = []
    for idx, response in enumerate(responses):
        generated_sentence = response.split("<|assistant|>")[1].split("<|end|>")[0].strip()
        generated_sentences.append(generated_sentence)
        print(sentences[idx])
        print(generated_sentence)
        print()
    return generated_sentences
def humanize_text(
    text,
    progress=gr.Progress(),
    model_name="Standard Model",
    temperature=1.2,
    repetition_penalty=1.0,
    top_k=50,
    length_penalty=1.0,
):
    """
    Paraphrase a block of text sentence by sentence.

    The optimization here is to feed sentences to the model in batches.
    Paragraph boundaries are remembered as the number of sentences per paragraph
    so the output can be reassembled afterwards.
    """
    progress(0, desc="Starting to Humanize")
    # Map model names to their respective processing functions
    model_map = {
        "Standard Model": humanize_batch_seq2seq,
        "Advanced Model (Beta)": humanize_batch_decoder_only,
    }
    assert model_name in model_map, f"Invalid model name: {model_name}"
    process_function = model_map[model_name]

    # Split the text into paragraphs and then into sentences
    paragraphs = text.split("\n")
    all_sentences = []
    sentences_per_paragraph = []
    citations_per_paragraph = []
    for paragraph in paragraphs:
        citations_per_paragraph.append(extract_citations(paragraph))
        paragraph = remove_citations(paragraph)
        sentences = sent_tokenize(paragraph)
        sentences_per_paragraph.append(len(sentences))
        all_sentences.extend(sentences)

    # Process all sentences in batches
    paraphrased_sentences = []
    current_batch_size = BATCH_SIZE
    i = 0
    while i < len(all_sentences):
        try:
            batch_sentences = all_sentences[i : i + current_batch_size]
            # Call the selected processing function
            paraphrased_batch = process_function(
                (seq2seq_model if model_name == "Standard Model" else dec_only_model),
                (seq2seq_tokenizer if model_name == "Standard Model" else dec_only_tokenizer),
                batch_sentences,
                temperature,
                repetition_penalty,
                top_k,
                length_penalty,
            )
            paraphrased_sentences.extend(paraphrased_batch)
            i += current_batch_size  # move to the next batch
            torch.cuda.empty_cache()
            gc.collect()
            progress(min(i / len(all_sentences), 1.0), desc="Humanizing")
        except RuntimeError as e:
            if "out of memory" in str(e):
                # Reduce the batch size by half and retry the same batch
                current_batch_size = max(1, current_batch_size // 2)
                print(f"Out of memory, reducing batch size to {current_batch_size}. Retrying...")
                torch.cuda.empty_cache()
                gc.collect()
            else:
                raise

    # Reconstruct paragraphs
    humanized_paragraphs = []
    sentence_index = 0
    for num_sentences in sentences_per_paragraph:
        humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
        humanized_paragraphs.append(humanized_paragraph)
        sentence_index += num_sentences

    # Re-attach the citations that were stripped from each paragraph
    for i, paragraph in enumerate(humanized_paragraphs):
        citation_texts = [f"<{cid}>" for cid in citations_per_paragraph[i]]
        humanized_paragraphs[i] = paragraph + " " + "".join(citation_texts)

    humanized_text = "\n\n".join(humanized_paragraphs)
    return humanized_text
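# ----------------------------
# Minimal sketch of how humanize_text could be wired into a Gradio UI.
# This is an illustrative assumption, not part of the original Space code:
# component choices, labels, and the title below are hypothetical. Gradio
# injects the gr.Progress parameter itself, so only text and model_name
# need input components.
if __name__ == "__main__":
    demo = gr.Interface(
        fn=humanize_text,
        inputs=[
            gr.Textbox(lines=10, label="Input text"),
            gr.Radio(
                choices=["Standard Model", "Advanced Model (Beta)"],
                value="Advanced Model (Beta)",  # the seq2seq "Standard Model" is currently disabled above
                label="Model",
            ),
        ],
        outputs=gr.Textbox(lines=10, label="Humanized text"),
        title="Humanizer",
    )
    demo.launch()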