lucidmorto committed
Commit 0f051eb · 1 Parent(s): 80915e3
feat: Update model and parameters for improved text humanization
Switch to the custom "humanize_model" to align with training. Adjust text pre-processing by removing the "humanize: " prefix. Revise generation parameters to match the training setup for better results. Improve the title and description in the Gradio interface to reflect the human-like text generation.
Streamline data preparation by removing the placeholder formal-text generation. Update training parameters with fewer epochs, a larger batch size, more warmup steps, and modified logging, evaluation, and save steps for more efficient training.
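
The pre-processing change is only partly visible in the diff below, because the removed tokenizer.encode line is truncated. As a hedged illustration, the old call presumably prepended the "humanize: " prefix, while the committed version encodes the stripped input directly; the exact form of the old line is an assumption:

# Before (assumed form of the truncated removed line):
# input_ids = tokenizer.encode("humanize: " + input_text, return_tensors="pt")
# After (as committed in app.py):
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=256, truncation=True)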
- app.py +22 -14
- humanizer.py +14 -17
app.py
CHANGED
@@ -1,29 +1,37 @@
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from huggingface_hub import HfApi
 
-
-
-
+# Get the latest model from your space
+api = HfApi()
+space_name = "umut-bozdag/humanizer_model" # Replace with your actual space name
+model_files = api.list_repo_files(space_name)
+model_file = next(file for file in model_files if file.endswith('.bin'))
+model_revision = api.get_repo_info(space_name).sha
+
+# Load the model and tokenizer from the space
+tokenizer = AutoTokenizer.from_pretrained(space_name, revision=model_revision)
+model = AutoModelForSeq2SeqLM.from_pretrained(space_name, revision=model_revision)
 
 def generate_text(input_text):
     # Preprocess input text
     input_text = input_text.strip()
 
     # Prepare input for the model
-    input_ids = tokenizer.encode(
+    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=256, truncation=True)
 
-    # Generate text with
+    # Generate text with parameters matching your training setup
     outputs = model.generate(
         input_ids,
-        max_length=
-        min_length=30,
+        max_length=256,
         num_return_sequences=1,
-        no_repeat_ngram_size=
-        top_k=
-        top_p=0.
-        temperature=0.
+        no_repeat_ngram_size=2,
+        top_k=30,
+        top_p=0.9,
+        temperature=0.7,
         do_sample=True,
-        early_stopping=True
+        early_stopping=True,
+        num_beams=4
     )
 
     # Decode and clean up the generated text
@@ -34,8 +42,8 @@ iface = gr.Interface(
     fn=generate_text,
     inputs=gr.Textbox(lines=5, label="Input Text"),
     outputs=gr.Textbox(label="Generated Text"),
-    title="Text
-    description="Enter text to generate a
+    title="Text Humanizer",
+    description="Enter text to generate a more human-like version."
 )
 
 iface.launch()
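
For quick verification of the updated interface, the Space can also be queried programmatically. This is a minimal sketch, assuming the Space is reachable as "umut-bozdag/humanizer_model" and exposes the default /predict endpoint for the single-textbox interface above; both names are assumptions, not part of this commit:

from gradio_client import Client

# Connect to the Space hosting this Gradio interface (Space name assumed).
client = Client("umut-bozdag/humanizer_model")

# The interface has one text input and one text output, so a single
# positional argument is passed; the endpoint name is assumed.
result = client.predict(
    "Enter some machine-sounding text here.",
    api_name="/predict",
)
print(result)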
humanizer.py
CHANGED
@@ -25,15 +25,9 @@ dataset = DatasetDict({
     "validation": test_valid["train"]
 })
 
-# Function to generate more formal text (placeholder - replace with actual implementation)
-def generate_formal_text(text):
-    # Implement formal text generation here
-    return text # Placeholder
-
 # Prepare the dataset
 def prepare_data(example):
-
-    return example
+    return {"input_text": example["body"], "target_text": example["body"]}
 
 logger.info("Preparing dataset...")
 processed_dataset = {split: data.map(prepare_data) for split, data in dataset.items()}
@@ -44,8 +38,8 @@ model_name = "t5-large"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 def tokenize_function(examples):
-    model_inputs = tokenizer(examples["
-    labels = tokenizer(examples["
+    model_inputs = tokenizer(examples["input_text"], max_length=256, truncation=True, padding="max_length")
+    labels = tokenizer(examples["target_text"], max_length=256, truncation=True, padding="max_length")
     model_inputs["labels"] = labels["input_ids"]
     return model_inputs
 
@@ -63,22 +57,25 @@ model = T5ForConditionalGeneration.from_pretrained(model_name)
 
 training_args = Seq2SeqTrainingArguments(
     output_dir="./results",
-    num_train_epochs=5,
-    per_device_train_batch_size=16,
-    per_device_eval_batch_size=
-    warmup_steps=1000,
+    num_train_epochs=3, # Reduced from 5 to 3
+    per_device_train_batch_size=32, # Increased from 16 to 32
+    per_device_eval_batch_size=32,
+    warmup_steps=2000, # Increased from 1000 to 2000
     weight_decay=0.01,
     logging_dir="./logs",
-    logging_steps=100,
+    logging_steps=500, # Increased from 100 to 500
     evaluation_strategy="steps",
-    eval_steps=500,
-    save_steps=500,
+    eval_steps=2000, # Increased from 500 to 2000
+    save_steps=2000, # Increased from 500 to 2000
     use_cpu=False,
     load_best_model_at_end=True,
     metric_for_best_model="eval_loss",
    greater_is_better=False,
     fp16=True,
-    gradient_accumulation_steps=4,
+    gradient_accumulation_steps=2, # Reduced from 4 to 2
+    predict_with_generate=True,
+    generation_max_length=256,
+    generation_num_beams=4,
 )
 
 optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5) # Slightly lower learning rate
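
The diff stops at the optimizer, so the trainer wiring itself is not shown. Below is a minimal sketch of one common way to combine the Seq2SeqTrainingArguments and the custom AdamW optimizer above with a Seq2SeqTrainer; the tokenized_dataset variable name and the data collator are assumptions, not part of this commit:

from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer

# Batch-time padding using the tokenizer and model defined above.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],       # assumed name of the tokenized splits
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optimizer, None),  # None lets the Trainer build its default LR scheduler
)

trainer.train()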