lucidmorto committed
Commit 0f051eb · 1 Parent(s): 80915e3
feat: Update model and parameters for improved text humanization
Switch to the custom "humanize_model" to align with training. Adjust text pre-processing by removing the "humanize: " prefix. Revise generation parameters to match the training setup for better results. Improve the title and description in the Gradio interface to reflect the human-like text generation.
Streamline data preparation by removing the placeholder formal-text generation. Update training parameters with fewer epochs, a larger batch size, more warmup steps, and modified logging, evaluation, and save steps for more efficient training.
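
The pre-processing change is only partly visible in the diff below, because the removed tokenizer.encode line is truncated. As a hedged illustration, the old call presumably prepended the "humanize: " prefix, while the committed version encodes the stripped input directly; the exact form of the old line is an assumption:

# Before (assumed form of the truncated removed line):
# input_ids = tokenizer.encode("humanize: " + input_text, return_tensors="pt")
# After (as committed in app.py):
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=256, truncation=True)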
- app.py +22 -14
- humanizer.py +14 -17
app.py
CHANGED
@@ -1,29 +1,37 @@
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from huggingface_hub import HfApi
 
-
-
-
+# Get the latest model from your space
+api = HfApi()
+space_name = "umut-bozdag/humanizer_model" # Replace with your actual space name
+model_files = api.list_repo_files(space_name)
+model_file = next(file for file in model_files if file.endswith('.bin'))
+model_revision = api.get_repo_info(space_name).sha
+
+# Load the model and tokenizer from the space
+tokenizer = AutoTokenizer.from_pretrained(space_name, revision=model_revision)
+model = AutoModelForSeq2SeqLM.from_pretrained(space_name, revision=model_revision)
 
 def generate_text(input_text):
     # Preprocess input text
     input_text = input_text.strip()
 
     # Prepare input for the model
-    input_ids = tokenizer.encode(
+    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=256, truncation=True)
 
-    # Generate text with
+    # Generate text with parameters matching your training setup
     outputs = model.generate(
         input_ids,
-        max_length=
-        min_length=30,
+        max_length=256,
         num_return_sequences=1,
-        no_repeat_ngram_size=
-        top_k=
-        top_p=0.
-        temperature=0.
+        no_repeat_ngram_size=2,
+        top_k=30,
+        top_p=0.9,
+        temperature=0.7,
         do_sample=True,
-        early_stopping=True
+        early_stopping=True,
+        num_beams=4
     )
 
     # Decode and clean up the generated text
@@ -34,8 +42,8 @@ iface = gr.Interface(
     fn=generate_text,
     inputs=gr.Textbox(lines=5, label="Input Text"),
     outputs=gr.Textbox(label="Generated Text"),
-    title="Text
-    description="Enter text to generate a
+    title="Text Humanizer",
+    description="Enter text to generate a more human-like version."
 )
 
 iface.launch()
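
For quick verification of the updated interface, the Space can also be queried programmatically. This is a minimal sketch, assuming the Space is reachable as "umut-bozdag/humanizer_model" and exposes the default /predict endpoint for the single-textbox interface above; both names are assumptions, not part of this commit:

from gradio_client import Client

# Connect to the Space hosting this Gradio interface (Space name assumed).
client = Client("umut-bozdag/humanizer_model")

# The interface has one text input and one text output, so a single
# positional argument is passed; the endpoint name is assumed.
result = client.predict(
    "Enter some machine-sounding text here.",
    api_name="/predict",
)
print(result)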
humanizer.py
CHANGED
@@ -25,15 +25,9 @@ dataset = DatasetDict({
     "validation": test_valid["train"]
 })
 
-# Function to generate more formal text (placeholder - replace with actual implementation)
-def generate_formal_text(text):
-    # Implement formal text generation here
-    return text # Placeholder
-
 # Prepare the dataset
 def prepare_data(example):
-
-    return example
+    return {"input_text": example["body"], "target_text": example["body"]}
 
 logger.info("Preparing dataset...")
 processed_dataset = {split: data.map(prepare_data) for split, data in dataset.items()}
@@ -44,8 +38,8 @@ model_name = "t5-large"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 def tokenize_function(examples):
-    model_inputs = tokenizer(examples["
-    labels = tokenizer(examples["
+    model_inputs = tokenizer(examples["input_text"], max_length=256, truncation=True, padding="max_length")
+    labels = tokenizer(examples["target_text"], max_length=256, truncation=True, padding="max_length")
     model_inputs["labels"] = labels["input_ids"]
     return model_inputs
 
@@ -63,22 +57,25 @@ model = T5ForConditionalGeneration.from_pretrained(model_name)
 
 training_args = Seq2SeqTrainingArguments(
     output_dir="./results",
-    num_train_epochs=5,
-    per_device_train_batch_size=16,
-    per_device_eval_batch_size=
-    warmup_steps=1000,
+    num_train_epochs=3, # Reduced from 5 to 3
+    per_device_train_batch_size=32, # Increased from 16 to 32
+    per_device_eval_batch_size=32,
+    warmup_steps=2000, # Increased from 1000 to 2000
     weight_decay=0.01,
     logging_dir="./logs",
-    logging_steps=100,
+    logging_steps=500, # Increased from 100 to 500
     evaluation_strategy="steps",
-    eval_steps=500,
-    save_steps=500,
+    eval_steps=2000, # Increased from 500 to 2000
+    save_steps=2000, # Increased from 500 to 2000
     use_cpu=False,
     load_best_model_at_end=True,
     metric_for_best_model="eval_loss",
    greater_is_better=False,
     fp16=True,
-    gradient_accumulation_steps=4,
+    gradient_accumulation_steps=2, # Reduced from 4 to 2
+    predict_with_generate=True,
+    generation_max_length=256,
+    generation_num_beams=4,
 )
 
 optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5) # Slightly lower learning rate
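
The diff stops at the optimizer, so the trainer wiring itself is not shown. Below is a minimal sketch of one common way to combine the Seq2SeqTrainingArguments and the custom AdamW optimizer above with a Seq2SeqTrainer; the tokenized_dataset variable name and the data collator are assumptions, not part of this commit:

from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer

# Batch-time padding using the tokenizer and model defined above.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],       # assumed name of the tokenized splits
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, None),  # None lets the Trainer build its default LR scheduler
)

trainer.train()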