Upload synth.py
Browse files
synth.py
ADDED
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import csv
|
2 |
+
import torch
|
3 |
+
from transformers import pipeline
|
4 |
+
import random
|
5 |
+
|
6 |
+
# Initialize the chatbot with half-precision.
# The model weights are loaded as float16 and the pipeline is pinned to
# device index 0.
# NOTE(review): device=0 assumes a GPU is present at that index; confirm the
# target machine has one before running.
chatbot = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.3",
    torch_dtype=torch.float16,
    device=0,  # Assuming you are using a GPU
)
|
13 |
+
|
14 |
+
# Sentiments and their labels.
# The position of each entry in this list is the integer label written to the
# CSV (0 = positive, 1 = neutral, 2 = negative), so the order must not change.
sentiments = [
    "Positive or Encouraging",
    "Neutral or Factual",
    "Negative or Toxic",
]
|
16 |
+
|
17 |
+
# List of content formats to cycle through.
# Each entry is interpolated verbatim into the "Format: ..." part of the prompt.
formats = [
    "Feature Stories", "Instructional Manuals", "FAQs", "Policy Documents",
    "Live Stream Descriptions",
    "Editorial Content", "Research Papers", "User Manuals", "Commentaries",
    "Opinion Pieces",
    "Newsletters", "Online Courses", "Photo Essays", "Annual Reports",
    "User-Generated Content",
    "Testimonials", "DIY Content", "How-To Videos", "Campaign Reports",
    "Legal Briefs",
    "Blog Posts", "Case Studies", "Tutorials", "Interviews", "Press Releases",
    "eBooks", "Infographics", "Webinars", "Podcast Descriptions",
    "Video Scripts",
    "Advertisements", "Forum Discussions", "Whitepapers", "Surveys",
    "Product Reviews",
    "Event Summaries", "Opinion Editorials", "Letters to the Editor",
    "Round-Up Posts",
    "Buying Guides", "Checklists", "Cheat Sheets", "Recipes", "Travel Guides",
    "Profiles", "Lists", "Q&A Sessions", "Debates", "Polls",
]
|
30 |
+
|
31 |
+
# List of topics to cycle through.
# Each entry is interpolated verbatim into the "Topic: ..." part of the prompt.
topics = [
    "Family", "Travel", "Politics", "Science", "Health", "Technology", "Sports",
    "Education", "Environment", "Economics", "Culture", "History", "Music",
    "Literature", "Food", "Art", "Fashion", "Entertainment", "Business",
    "Relationships", "Fitness", "Automotive", "Finance", "Real Estate", "Law",
    "Psychology", "Philosophy", "Religion", "Gardening", "DIY", "Hobbies",
    "Pets", "Career", "Marketing", "Customer Service", "Networking", "Innovation",
    "Artificial Intelligence", "Sustainability", "Social Issues", "Digital Media",
    "Programming", "Cybersecurity", "Astronomy", "Geography", "Travel Tips",
    "Cooking", "Parenting", "Productivity", "Mindfulness", "Mental Health",
    "Self-Improvement", "Leadership", "Teamwork", "Volunteering", "Nonprofits",
    "Gaming", "E-commerce", "Photography", "Videography", "Film", "Television",
    "Streaming Services", "Podcasts", "Public Speaking", "Event Planning",
    "Interior Design", "Architecture", "Urban Development", "Agriculture",
    "Climate Change", "Renewable Energy", "Space Exploration", "Biotechnology",
    "Cryptocurrency", "Blockchain", "Robotics", "Automated Systems", "Genetics",
    "Medicine", "Pharmacy", "Veterinary Science", "Marine Biology", "Ecology",
    "Conservation", "Wildlife", "Botany", "Zoology", "Geology", "Meteorology",
    "Aviation", "Maritime", "Logistics", "Supply Chain", "Human Resources",
    "Diversity and Inclusion", "Ethics", "Corporate Governance", "Public Relations",
    "Journalism", "Advertising", "Sales", "Customer Experience", "Retail",
    "Hospitality", "Tourism", "Luxury Goods", "Consumer Electronics", "Fashion Design",
    "Textiles", "Jewelry", "Cosmetics", "Skincare", "Perfume", "Toys", "Gadgets",
    "Home Appliances", "Furniture", "Home Improvement", "Landscaping",
    "Real Estate Investment",
]
|
57 |
+
|
58 |
+
# List of styles to cycle through.
# Each entry is interpolated verbatim into the "Style: ..." part of the prompt.
styles = [
    "Super Casual", "Internet Slang", "Every Day", "Formal", "Conversational",
    "Bad Grammar and Spelling", "Lazy typing", "Professional", "Academic",
    "Technical", "Narrative", "Descriptive", "Analytical", "Critical",
    "Objective", "Subjective", "Third Person", "First Person",
    "Persuasive", "Informative", "Journalistic", "Reflective",
    "DM", "Social", "Informal", "Casual", "Colloquial",
]
|
67 |
+
|
68 |
+
# List of starting phrases.
# One phrase is picked with random.choice per prompt and the model is asked to
# open its paragraph with it. Entries are kept unique so that every phrase is
# drawn with the same probability.
starting_phrases = [
    "Have you ever wondered", "Let's talk about", "It's interesting how",
    "Did you know", "The reality is", "Many people believe",
    "It's surprising that", "You might not know", "Let's dive into",
    "Here's the thing", "A common misconception is", "It's clear that",
    "Most people don't realize", "One thing to note is",
    "The fact is", "Consider this", "Here's an example",
    "Think about", "For instance", "To illustrate",
    "In my experience", "A key point is", "It's worth noting",
    "Let's explore", "Interestingly enough", "I want to highlight",
    "When it comes to", "The truth is", "Many experts agree",
    "Research shows", "Statistics indicate", "It's often said",
    "In reality", "From my perspective", "Surprisingly",
    "One thing I've noticed", "In recent studies", "Let's break down",
    "People often forget", "You should know", "Interestingly",
    "It turns out", "As it happens", "Experts suggest",
    "The surprising fact is", "It's commonly known", "Let's be honest",
    "The reality of", "It's fascinating that", "Have you noticed",
    "The thing is", "It's a fact that", "Let's not forget",
    "Studies have shown", "A notable point is", "It's often overlooked",
    "An important aspect is", "Let's take a closer look",
    "It's essential to understand", "Interestingly, research suggests",
    "One aspect to consider is", "It's beneficial to know",
    "It's worth considering", "The interesting thing is", "Let's examine",
    "A surprising fact is", "It's helpful to know", "One surprising element is",
    "Imagine this", "Here's a thought", "You might be surprised",
    "Think of it this way", "Here's an idea", "It's funny how",
    "Let me tell you", "Picture this", "The question is",
    "Believe it or not", "You won't believe", "Let's face it",
    "The best part is", "What's interesting is", "I discovered that",
    "It's amazing how", "The funny thing is", "Here's why",
    "What if I told you", "It's worth mentioning", "This reminds me of",
    "Let me explain", "Here's something new", "I realized that",
    "Have you seen", "You might enjoy", "I learned that",
    "It's clear to see", "What's fascinating is", "Here's a question",
    "I heard that", "The cool part is", "Here's what happened",
    "It appears that", "It's evident that", "Let me share",
    "You'll find that", "What's notable is", "Consider the fact that",
    "It's interesting to note", "Hello everyone", "Hi there",
    "Greetings", "Hey folks", "Good morning", "Good afternoon",
    "Good evening", "Hey", "What's up", "Hi", "Hello",
    "Amazing!", "Serious?", "Wow...", "That's pretty cool.",
    "Can you believe it?", "Unbelievable!", "Incredible!", "No way!",
    "Check this out", "Guess what?", "Surprise!", "Fascinating!",
    "Impressive!", "I don't get it?", "Really?", "What?",
    "Why?", "How come?", "Is that so?", "Are you sure?",
    "What do you think?", "By the way", "Just so you know",
    "For your information", "Incidentally", "On a side note",
    "As a reminder", "In addition", "Besides that",
    "While we're on the subject", "Speaking of which",
    "Have you", "Has anyone", "Would we", "Would it be",
    "OK, now", "OK but", "OK you", "OK nobody",
    "Here's a quick fact", "To put it simply", "Here's why this matters",
    "Let's consider", "Now, think about this", "Take this into account",
    "Here's something to think about", "On that note",
    "Just imagine", "That reminds me",
    "As it turns out", "Here's a fun fact", "The reality of it is",
    "By the way, did you know", "Speaking of",
    "Now, let's dive in", "You'll be surprised to know",
    "I recently discovered", "Would you believe", "Can you imagine",
    "What's more", "Even more interesting is",
]
|
131 |
+
|
132 |
+
# CSV file setup with utf-8 encoding and minimal quoting.
# Opening in 'w' mode truncates any existing file, so each run starts from a
# file that contains only the header row. newline='' lets the csv module
# control line endings itself.
csv_file = "sentences.csv"
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["text", "label"])
|
137 |
+
|
138 |
+
# Function to ensure correct quoting
def ensure_correct_quoting(text):
    """Return *text* wrapped in double quotes unless it is already wrapped.

    A string counts as already wrapped only when it is at least two
    characters long and both begins and ends with ``"``. A lone ``"`` is a
    single character that happens to satisfy both checks, so it is treated
    as unquoted content and gets wrapped like any other text.

    NOTE: the quotes added here are ordinary characters of the returned
    string. ``csv.writer`` applies its own quoting, so a value produced by
    this function and then written through ``csv.writer`` is stored with the
    quote characters as literal (doubled) data.

    Args:
        text: The string to wrap.

    Returns:
        ``text`` unchanged if it is already wrapped in double quotes,
        otherwise ``text`` with a ``"`` added at each end.
    """
    already_quoted = len(text) >= 2 and text.startswith('"') and text.endswith('"')
    if already_quoted:
        return text
    return f'"{text}"'
|
145 |
+
|
146 |
+
# Collect and save responses until reaching the target number of rows.
TARGET_ROWS = 100000
row_count = 0

while row_count < TARGET_ROWS:
    # Rotate through the sentiments by row number so the labels stay balanced.
    # The list index doubles as the integer label written to the CSV. Because
    # the index is derived from row_count, a skipped (empty) generation is
    # retried with the same sentiment.
    idx = row_count % len(sentiments)
    sentiment = sentiments[idx]

    # Draw format, topic and style independently of the sentiment. Stepping
    # them in lockstep with the sentiment ties every style to exactly one
    # label whenever len(styles) is a multiple of len(sentiments) (27 and 3 in
    # this file), which would let a classifier learn the style instead of the
    # sentiment.
    format_type = random.choice(formats)
    topic = random.choice(topics)
    style = random.choice(styles)
    start_phrase = random.choice(starting_phrases)

    # Add the current sentiment prompt with the format, topic, and style
    prompt = (
        f"Start your paragraph with '{start_phrase}'. "
        f"Write a single paragraph of text. "
        f"Format: {format_type}. Topic: {topic}. "
        f"Vibe: {sentiment}. Style: {style}."
    )

    response = chatbot(prompt, max_new_tokens=100)

    # Debug print to check response format
    print(f"Full model response: {response}")

    # Extract the generated text from the response structure
    generated_text = response[0]['generated_text']

    # Remove the echoed prompt, then keep only the first line of what is left
    clean_text = generated_text.replace(prompt, "").strip().split('\n')[0]

    # An empty generation would become a labelled row with no text; skip it
    # and try again with a fresh prompt.
    if not clean_text:
        continue

    # Append the clean response text to the CSV. The text is written as-is:
    # csv.writer adds whatever quoting the field needs, so wrapping it in
    # quotes by hand would store literal quote characters in the data.
    # The file is reopened per row so each row reaches disk before the next
    # (slow) generation starts.
    with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow([clean_text, idx])

    row_count += 1
    print(f"Response for sentiment '{sentiment}' saved to {csv_file}. Total rows: {row_count}")

print("All responses saved. Total rows:", row_count)
|