lucidmorto committed on
Commit 0f051eb
1 Parent(s): 80915e3

feat: Update model and parameters for improved text humanization


Switch to the custom "humanize_model" to align with training. Adjust text pre-processing by removing the "humanize: " prefix. Revise generation parameters to match the training setup for better results. Improve the title and description in the Gradio interface to reflect human-like text generation.

Streamline data preparation by removing the placeholder formal-text generation. Update training parameters with fewer epochs, a larger batch size, more warmup steps, and adjusted logging, evaluation, and save intervals for more efficient training.

Files changed (2)
  1. app.py +22 -14
  2. humanizer.py +14 -17
app.py CHANGED
@@ -1,29 +1,37 @@
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from huggingface_hub import HfApi
 
-model_name = "t5-large"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+# Get the latest model from your space
+api = HfApi()
+space_name = "umut-bozdag/humanizer_model" # Replace with your actual space name
+model_files = api.list_repo_files(space_name)
+model_file = next(file for file in model_files if file.endswith('.bin'))
+model_revision = api.get_repo_info(space_name).sha
+
+# Load the model and tokenizer from the space
+tokenizer = AutoTokenizer.from_pretrained(space_name, revision=model_revision)
+model = AutoModelForSeq2SeqLM.from_pretrained(space_name, revision=model_revision)
 
 def generate_text(input_text):
     # Preprocess input text
     input_text = input_text.strip()
 
     # Prepare input for the model
-    input_ids = tokenizer.encode("humanize: " + input_text, return_tensors="pt", max_length=512, truncation=True)
+    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=256, truncation=True)
 
-    # Generate text with improved parameters
+    # Generate text with parameters matching your training setup
     outputs = model.generate(
         input_ids,
-        max_length=300,
-        min_length=30,
+        max_length=256,
         num_return_sequences=1,
-        no_repeat_ngram_size=3,
-        top_k=50,
-        top_p=0.95,
-        temperature=0.8,
+        no_repeat_ngram_size=2,
+        top_k=30,
+        top_p=0.9,
+        temperature=0.7,
         do_sample=True,
-        early_stopping=True
+        early_stopping=True,
+        num_beams=4
     )
 
     # Decode and clean up the generated text
@@ -34,8 +42,8 @@ iface = gr.Interface(
     fn=generate_text,
     inputs=gr.Textbox(lines=5, label="Input Text"),
     outputs=gr.Textbox(label="Generated Text"),
-    title="Text Generator",
-    description="Enter text to generate a summary or continuation."
+    title="Text Humanizer",
+    description="Enter text to generate a more human-like version."
 )
 
 iface.launch()
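
Both versions of the hunk end at the "# Decode and clean up the generated text" comment, so the decode step itself falls outside the diff. As a rough sketch, the tail of generate_text presumably looks something like the following (assuming the usual tokenizer.decode call with skip_special_tokens; the file's actual cleanup may differ):

    # Sketch of the un-shown tail of generate_text (assumed, not part of the diff)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text.strip()

Whatever string generate_text returns is what the Gradio Textbox labeled "Generated Text" displays.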
humanizer.py CHANGED
@@ -25,15 +25,9 @@ dataset = DatasetDict({
     "validation": test_valid["train"]
 })
 
-# Function to generate more formal text (placeholder - replace with actual implementation)
-def generate_formal_text(text):
-    # Implement formal text generation here
-    return text # Placeholder
-
 # Prepare the dataset
 def prepare_data(example):
-    example["formal_text"] = generate_formal_text(example["body"]) # Changed from "text" to "body"
-    return example
+    return {"input_text": example["body"], "target_text": example["body"]}
 
 logger.info("Preparing dataset...")
 processed_dataset = {split: data.map(prepare_data) for split, data in dataset.items()}
@@ -44,8 +38,8 @@ model_name = "t5-large"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 def tokenize_function(examples):
-    model_inputs = tokenizer(examples["formal_text"], max_length=256, truncation=True, padding="max_length")
-    labels = tokenizer(examples["body"], max_length=256, truncation=True, padding="max_length")
+    model_inputs = tokenizer(examples["input_text"], max_length=256, truncation=True, padding="max_length")
+    labels = tokenizer(examples["target_text"], max_length=256, truncation=True, padding="max_length")
     model_inputs["labels"] = labels["input_ids"]
     return model_inputs
 
@@ -63,22 +57,25 @@ model = T5ForConditionalGeneration.from_pretrained(model_name)
 
 training_args = Seq2SeqTrainingArguments(
     output_dir="./results",
-    num_train_epochs=5, # Increased epochs
-    per_device_train_batch_size=16, # Reduced batch size due to larger model
-    per_device_eval_batch_size=16,
-    warmup_steps=1000, # Increased warmup steps
+    num_train_epochs=3, # Reduced from 5 to 3
+    per_device_train_batch_size=32, # Increased from 16 to 32
+    per_device_eval_batch_size=32,
+    warmup_steps=2000, # Increased from 1000 to 2000
     weight_decay=0.01,
     logging_dir="./logs",
-    logging_steps=100,
+    logging_steps=500, # Increased from 100 to 500
     evaluation_strategy="steps",
-    eval_steps=500,
-    save_steps=500,
+    eval_steps=2000, # Increased from 500 to 2000
+    save_steps=2000, # Increased from 500 to 2000
     use_cpu=False,
     load_best_model_at_end=True,
     metric_for_best_model="eval_loss",
     greater_is_better=False,
     fp16=True,
-    gradient_accumulation_steps=4, # Increased to simulate larger batch sizes
+    gradient_accumulation_steps=2, # Reduced from 4 to 2
+    predict_with_generate=True,
+    generation_max_length=256,
+    generation_num_beams=4,
 )
 
 optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5) # Slightly lower learning rate
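
The humanizer.py hunks stop at the custom AdamW optimizer, so the trainer wiring itself is not part of this diff. For context, a minimal sketch of how these pieces are typically handed to transformers' Seq2SeqTrainer; the tokenized_dataset name is an assumption for illustration, not taken from the file:

from transformers import Seq2SeqTrainer

# Assumed name: tokenized_dataset is processed_dataset after .map(tokenize_function, batched=True)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),  # custom AdamW; None lets the Trainer build its default LR scheduler
)
trainer.train()

Passing the optimizer through the optimizers tuple is what makes the lr=3e-5 AdamW above take effect instead of the Trainer's default optimizer.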