NoaiGPT committed on
Commit
9004b7f
1 Parent(s): 34bd341
Files changed (1) hide show
  1. app.py +180 -16
app.py CHANGED
@@ -1,18 +1,182 @@
1
- import gradio as gr
2
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
# Get the Hugging Face token from the environment variable
hf_token = os.getenv("hf_token")

# Fail fast: without a token the hosted model cannot be loaded below.
if not hf_token:
    raise ValueError("No hf_token found in environment variables")

# Load the model using the token
try:
    interface = gr.load("models/NoaiGPT/777", api_key=hf_token)
    # Launch the Gradio interface
    interface.launch()
# The package is imported as `gr`; the bare name `gradio` is never bound in
# this module, so spelling the handler as `gradio.exceptions...` would raise
# NameError while the exception is being matched instead of printing the
# message below.
except gr.exceptions.ModelNotFoundError as e:
    print(f"Model not found: {e}")
except Exception as e:
    # Top-level boundary of the script: report the failure instead of
    # letting a traceback end the process.
    print(f"An error occurred: {e}")
 
 
1
  import os
2
+ import json
3
+ import gradio as gr
4
+ import spaces
5
+ import torch
6
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
7
+ from sentence_splitter import SentenceSplitter
8
+ from itertools import product
9
+
10
# Get the Hugging Face token from environment variable (None when unset)
hf_token = os.getenv('HF_TOKEN')

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
print(f"Using device: {device}")

# Initialize paraphraser model and tokenizer.
# `token=` is the current keyword for passing credentials to
# `from_pretrained`; the older `use_auth_token=` spelling is deprecated.
paraphraser_model_name = "NoaiGPT/777"
paraphraser_tokenizer = AutoTokenizer.from_pretrained(
    paraphraser_model_name,
    token=hf_token,
)
paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(
    paraphraser_model_name,
    token=hf_token,
).to(device)

# Initialize classifier model and tokenizer (loaded without the token)
classifier_model_name = "andreas122001/roberta-mixed-detector"
classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name)
classifier_model = AutoModelForSequenceClassification.from_pretrained(
    classifier_model_name
).to(device)

# Initialize sentence splitter
splitter = SentenceSplitter(language='en')
29
+
30
def classify_text(text):
    """Classify *text* with the detector model.

    The text is tokenized (truncated to at most 512 tokens), moved to the
    module-level ``device`` and scored by ``classifier_model`` without
    gradient tracking. The logits are turned into probabilities with a
    softmax over the last dimension.

    Args:
        text: The text to classify. The ``.item()`` call on the argmax
            means a single input sequence is expected.

    Returns:
        A ``(label, score)`` tuple: the ``id2label`` name of the most
        probable class and that class's probability as a Python float.
    """
    encoded = classifier_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        model_output = classifier_model(**encoded)

    probs = torch.nn.functional.softmax(model_output.logits, dim=-1)
    top_index = torch.argmax(probs, dim=-1).item()

    label = classifier_model.config.id2label[top_index]
    score = probs[0][top_index].item()
    return label, score
39
+
40
# Sampling presets keyed by the UI setting. Each tuple is
# (num_return_sequences, repetition_penalty, no_repeat_ngram_size,
#  temperature, max_length).
_GENERATION_PRESETS = {
    1: (5, 1.1, 2, 1.0, 128),
    2: (10, 1.2, 3, 1.2, 192),
    3: (15, 1.3, 4, 1.4, 256),
    4: (20, 1.4, 5, 1.6, 320),
}
# Used for every setting that is not 1-4 (the UI offers 5 as the maximum).
_DEFAULT_GENERATION_PRESET = (25, 1.5, 6, 1.8, 384)

# Upper bound on how many sentence-wise combinations are assembled and scored.
_MAX_COMBINED_VERSIONS = 50
# A "machine-generated" verdict below this confidence is still kept.
_MACHINE_CONFIDENCE_CUTOFF = 0.98
# Number of least-confident versions shown when nothing passes the filter.
_FALLBACK_VERSION_COUNT = 5


def _format_classified_version(index, version, label, score):
    """Return the two report lines describing one classified version."""
    return (
        f"Version {index}:\n{version}\n"
        f"Classification: {label} (confidence: {score:.2%})\n\n"
    )


@spaces.GPU
def generate_paraphrases(text, setting, output_format):
    """Paraphrase *text* sentence by sentence and filter the combinations.

    Each sentence produced by the module-level splitter is paraphrased
    several times by sampling from the paraphraser model. The per-sentence
    paraphrases are combined in Cartesian-product order (first
    ``_MAX_COMBINED_VERSIONS`` combinations only), every combination is
    scored once with ``classify_text``, and the ones labelled
    ``"human-produced"`` -- or ``"machine-generated"`` with a confidence
    below ``_MACHINE_CONFIDENCE_CUTOFF`` -- are selected. If none qualifies,
    the ``_FALLBACK_VERSION_COUNT`` lowest-confidence combinations are
    selected instead.

    Args:
        text: Input text to paraphrase.
        setting: Preset selector; 1-4 pick increasingly aggressive sampling
            parameters, any other value uses the strongest preset.
        output_format: ``"text"`` for the plain-text report; any other
            value yields the JSON document.

    Returns:
        A 2-tuple ``(report, selected)``: ``report`` is the plain-text
        report or the JSON dump (indent 2), and ``selected`` is the selected
        versions joined by blank lines. The JSON ``human_like_versions``
        entry holds only versions that passed the filter, not the fallback
        picks.
    """
    # Function-scope import: the file's import block only provides `product`.
    from itertools import islice

    (
        num_return_sequences,
        repetition_penalty,
        no_repeat_ngram_size,
        temperature,
        max_length,
    ) = _GENERATION_PRESETS.get(setting, _DEFAULT_GENERATION_PRESET)

    top_k = 50
    top_p = 0.95
    length_penalty = 1.0

    sentences = splitter.split(text)
    all_sentence_paraphrases = []

    # Report fragments are collected and joined once at the end instead of
    # growing a string with repeated `+=`.
    report = ["Original text:\n" + text + "\n\n", "Paraphrased versions:\n"]

    json_output = {
        "original_text": text,
        "paraphrased_versions": [],
        "combined_versions": [],
        "human_like_versions": [],
    }

    for i, sentence in enumerate(sentences):
        inputs = paraphraser_tokenizer(
            f'paraphraser: {sentence}',
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=max_length,
        ).to(device)

        # Generate paraphrases using the specified parameters
        outputs = paraphraser_model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            num_return_sequences=num_return_sequences,
            repetition_penalty=repetition_penalty,
            no_repeat_ngram_size=no_repeat_ngram_size,
            temperature=temperature,
            max_length=max_length,
            top_k=top_k,
            top_p=top_p,
            do_sample=True,
            early_stopping=False,
            length_penalty=length_penalty,
        )

        paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)

        report.append(f"Original sentence {i+1}: {sentence}\n")
        report.extend(
            f" Paraphrase {j}: {paraphrase}\n"
            for j, paraphrase in enumerate(paraphrases, 1)
        )

        json_output["paraphrased_versions"].append({
            f"original_sentence_{i+1}": sentence,
            "paraphrases": paraphrases,
        })

        all_sentence_paraphrases.append(paraphrases)
        report.append("\n")

    if all_sentence_paraphrases:
        # Take the first combinations lazily: the full product holds
        # num_return_sequences ** len(sentences) tuples and must not be
        # materialized just to keep a handful of them.
        combined_versions = [
            " ".join(combination)
            for combination in islice(
                product(*all_sentence_paraphrases), _MAX_COMBINED_VERSIONS
            )
        ]
    else:
        # With no sentences, product() would yield one empty combination and
        # an empty string would be classified and reported; skip that.
        combined_versions = []

    json_output["combined_versions"] = combined_versions

    # Classify combined versions once; the results feed both the filter and
    # the fallback ranking so the classifier is not re-run.
    report.append("\nCombined paraphrased versions:\n")
    classified = []
    for i, version in enumerate(combined_versions, 1):
        label, score = classify_text(version)
        classified.append((version, label, score))
        report.append(_format_classified_version(i, version, label, score))

    human_versions = [
        (version, label, score)
        for version, label, score in classified
        if label == "human-produced"
        or (label == "machine-generated" and score < _MACHINE_CONFIDENCE_CUTOFF)
    ]

    report.append("\nHuman-like or Less Confident Machine-generated versions:\n")
    report.extend(
        _format_classified_version(i, version, label, score)
        for i, (version, label, score) in enumerate(human_versions, 1)
    )

    json_output["human_like_versions"] = [
        {"version": version, "label": label, "confidence_score": score}
        for version, label, score in human_versions
    ]

    # If no human-like versions, include the top 5 least confident versions
    if not human_versions:
        human_versions = sorted(classified, key=lambda entry: entry[2])[
            :_FALLBACK_VERSION_COUNT
        ]
        report.append(
            "\nNo human-like versions found. Showing top 5 least confident machine-generated versions:\n"
        )
        report.extend(
            _format_classified_version(i, version, label, score)
            for i, (version, label, score) in enumerate(human_versions, 1)
        )

    selected_text = "\n\n".join(version for version, _, _ in human_versions)

    if output_format == "text":
        return "".join(report), selected_text
    return json.dumps(json_output, indent=2), selected_text
164
+
165
# Input widgets, in the positional order generate_paraphrases expects:
# the text, the preset setting, and the output format.
_input_components = [
    gr.Textbox(lines=5, label="Input Text"),
    gr.Slider(minimum=1, maximum=5, step=1, label="Readability to Human-like Setting"),
    gr.Radio(["text", "json"], label="Output Format"),
]

# Output widgets, matching the two values generate_paraphrases returns.
_output_components = [
    gr.Textbox(lines=20, label="Detailed Paraphrases and Classifications"),
    gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases"),
]

# Build the Gradio interface around the paraphrasing function.
iface = gr.Interface(
    fn=generate_paraphrases,
    inputs=_input_components,
    outputs=_output_components,
    title="Advanced Diverse Paraphraser with Human-like Filter",
    description="Enter a text, select a setting from readable to human-like, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output.",
)

# Start serving the interface.
iface.launch()