lucidmorto committed on
Commit
1375ad0
1 Parent(s): 5afe41c

feat: Add text humanization model training and interface

Browse files

Introduce a Gradio interface for the text humanization model, allowing users to input formal text and receive humanized output. Implement a training script that loads and processes the dataset, fine-tunes the model, and logs the progress and results. Include dependencies in the requirements file and automate the post-training deployment process to the Hugging Face Hub. This enhances the accessibility and usability of the text humanization functionality.

Files changed (4) hide show
  1. app.py +21 -0
  2. humanizer.py +107 -0
  3. requirements.txt +4 -0
  4. train.py +18 -0
app.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer, T5ForConditionalGeneration
3
+
4
# Hub repository that holds the fine-tuned humanizer checkpoint.
model_name = "umutbozdag/humanizer_model"

# Both objects are created once at import time and reused by every call to humanize().
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
7
+
8
def humanize(text):
    """Humanize formal text with the loaded T5 model.

    Args:
        text: The formal text entered by the user.

    Returns:
        The decoded model output with special tokens removed, or an empty
        string when ``text`` is ``None``, empty, or only whitespace.
    """
    # Nothing to rewrite: skip the model call instead of decoding whatever it
    # would generate from an empty prompt.
    if not text or not text.strip():
        return ""

    # Inputs longer than 128 tokens are truncated; generation is capped at the
    # same length.
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True)
    outputs = model.generate(**inputs, max_length=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
12
+
13
# Build the input and output widgets first so the Interface call stays short.
formal_input = gr.Textbox(lines=5, label="Formal Text")
humanized_output = gr.Textbox(label="Humanized Text")

iface = gr.Interface(
    fn=humanize,
    inputs=formal_input,
    outputs=humanized_output,
    title="Text Humanizer",
    description="Enter formal text to humanize it.",
)

# Start the web UI as soon as the script runs.
iface.launch()
humanizer.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset, DatasetDict
2
+ from transformers import AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
3
+ from transformers import EarlyStoppingCallback
4
+ from transformers.integrations import TensorBoardCallback
5
+ import torch
6
+ import logging
7
+
8
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Upper bound on how many training rows are kept after shuffling.
MAX_SAMPLES = 10_000

# Load the dataset, shuffle it, and keep at most MAX_SAMPLES rows
logger.info("Loading dataset...")
dataset = load_dataset("LucasChu/reddit_comments")
dataset = dataset.shuffle(seed=42)
# Clamp to the split size: select() raises when an index is out of range.
sample_count = min(MAX_SAMPLES, len(dataset["train"]))
dataset["train"] = dataset["train"].select(range(sample_count))
logger.info(f"Dataset loaded, shuffled, and truncated to {sample_count:,} samples.")

# Split the kept rows into train (80%), validation (10%), and test (10%)
train_testvalid = dataset["train"].train_test_split(test_size=0.2, seed=42)
test_valid = train_testvalid["test"].train_test_split(test_size=0.5, seed=42)

dataset = DatasetDict({
    "train": train_testvalid["train"],
    "test": test_valid["test"],
    "validation": test_valid["train"],
})
28
+
29
def generate_formal_text(text):
    """Return the formal counterpart of ``text``.

    Placeholder: no formalization is implemented yet, so the input is handed
    back unchanged.
    """
    formal_text = text  # TODO: replace with the actual formal text generation
    return formal_text
33
+
34
# Prepare the dataset
def prepare_data(example):
    """Attach a ``formal_text`` field derived from the example's ``text`` field.

    The example is updated in place and returned.
    """
    formal_version = generate_formal_text(example["text"])
    example["formal_text"] = formal_version
    return example
38
+
39
logger.info("Preparing dataset...")
# Apply prepare_data to every split, keeping the split names as keys.
processed_dataset = {
    split_name: split_data.map(prepare_data)
    for split_name, split_data in dataset.items()
}
logger.info("Dataset prepared.")

# Base checkpoint whose tokenizer is used for preprocessing
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
46
+
47
def tokenize_function(examples):
    """Tokenize a batch into model inputs and training labels.

    Intended for ``Dataset.map(..., batched=True)``: each column is a list of
    strings.

    Args:
        examples: Batch with ``formal_text`` (source) and ``text`` (target)
            columns.

    Returns:
        The tokenizer output for ``formal_text`` with an added ``labels``
        entry holding the target token ids, where padding positions are
        replaced by -100.
    """
    model_inputs = tokenizer(examples["formal_text"], max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(examples["text"], max_length=128, truncation=True, padding="max_length")

    # -100 is the label value the loss ignores. Without this the model is also
    # trained to predict the padding that fills every target up to 128 tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs
52
+
53
logger.info("Tokenizing dataset...")
# Tokenize every split in batches, keeping the split names as keys.
tokenized_dataset = {
    split_name: split_data.map(tokenize_function, batched=True)
    for split_name, split_data in processed_dataset.items()
}
logger.info("Dataset tokenized.")

# Record which splits are present after preprocessing
available_splits = [*tokenized_dataset]
logger.info(f"Available splits in the dataset: {available_splits}")
60
+
61
# Set up the model and trainer
logger.info("Setting up model and trainer...")
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Model selection during training uses the validation split so that the test
# split stays untouched until the final evaluation. Fall back to the test
# split only when no validation split exists.
eval_split = next(
    (name for name in ("validation", "test") if name in tokenized_dataset),
    None,
)
has_eval = eval_split is not None

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    evaluation_strategy="steps" if has_eval else "no",
    eval_steps=500,
    save_steps=1000,
    use_cpu=True,
    # Best-model tracking needs periodic evaluation, so it is switched off
    # together with the evaluation strategy.
    load_best_model_at_end=has_eval,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # The Trainer registers its own TensorBoard callback for this setting;
    # passing a second one by hand would log everything twice.
    report_to=["tensorboard"],
)

callbacks = []
if has_eval:
    # Early stopping watches the evaluation metric, so it is only registered
    # when evaluation actually runs.
    callbacks.append(EarlyStoppingCallback(early_stopping_patience=3))

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset[eval_split] if has_eval else None,
    tokenizer=tokenizer,
    callbacks=callbacks,
)
logger.info("Model and trainer set up.")

# Train the model
logger.info("Starting training...")
trainer.train()
logger.info("Training completed.")

# Log final results on the held-out test split. When there is no test split
# the argument is None and the trainer falls back to its own eval dataset.
logger.info("Evaluating model...")
results = trainer.evaluate(eval_dataset=tokenized_dataset.get("test"))
logger.info(f"Final evaluation results: {results}")
101
+
102
# Save the model and tokenizer to the Hugging Face Hub
logger.info("Saving model and tokenizer to Hugging Face Hub...")
# NOTE(review): app.py loads "umutbozdag/humanizer_model"; confirm which
# repository id is the intended one.
model_name = "umut-bozdag/humanize_model"
# Trainer.push_to_hub() takes a commit message as its first argument and
# pushes to the repository configured in the training arguments, so the
# repository id has to go through the model's own push_to_hub().
trainer.model.push_to_hub(model_name)
tokenizer.push_to_hub(model_name)
logger.info(f"Model and tokenizer saved successfully as '{model_name}'")
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ datasets
4
+ torch
train.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import subprocess
import sys

from dotenv import load_dotenv
from huggingface_hub import HfApi

# Read environment variables such as HF_TOKEN from a local .env file.
load_dotenv()

# Run the training script with the current interpreter. check=True raises if
# training exits with an error, so a failed run is never deployed.
subprocess.run([sys.executable, "humanizer.py"], check=True)

# Push the changes to the main branch
api = HfApi()
api.upload_folder(
    folder_path=".",
    repo_id="umutbozdag/humanizer_model",
    repo_type="space",
    commit_message="Update after training",
    token=os.getenv("HF_TOKEN"),
    # The working directory is uploaded as a whole; keep the local secrets
    # file that holds HF_TOKEN out of the published Space.
    ignore_patterns=[".env"],
)