capradeepgujaran committed on
Commit
78decde
1 Parent(s): e72b30a

Create openai_tts_tool.py

Browse files
Files changed (1) hide show
  1. openai_tts_tool.py +211 -0
openai_tts_tool.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import openai
3
+ import PyPDF2
4
+ from deep_translator import GoogleTranslator
5
+ from dotenv import load_dotenv
6
+ import tiktoken
7
+ import pytesseract
8
+ import fitz # PyMuPDF for PDF processing
9
+ import docx # For processing DOCX files
10
+ from PIL import Image
11
+
12
# Load variables defined in a .env file (if any) into the process environment
load_dotenv()

# Module-wide OpenAI client, keyed by the OPENAI_API_KEY environment variable
openai_api_key = os.environ.get("OPENAI_API_KEY")
client = openai.OpenAI(api_key=openai_api_key)
18
+
19
# Per-model token limits, keyed by model name:
#   max_context_tokens - size of the context window
#   max_output_tokens  - largest completion the model may return
MODEL_SPECS = {
    model: {
        'max_context_tokens': context_limit,
        'max_output_tokens': output_limit,
    }
    for model, context_limit, output_limit in (
        ('gpt-4o', 128000, 4096),
        ('gpt-4o-mini', 128000, 16384),
        ('gpt-4', 8192, 8192),
        # Add other models as needed
    )
}
35
+
36
# Point pytesseract at the Tesseract OCR executable.
# An explicit TESSERACT_CMD environment variable always wins. Otherwise the
# default Windows install location is used, but only on Windows and only when
# the executable is actually there; every other platform keeps pytesseract's
# own default so the `tesseract` binary on PATH is used.
_DEFAULT_WINDOWS_TESSERACT = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
_tesseract_cmd = os.getenv("TESSERACT_CMD")
if not _tesseract_cmd and os.name == 'nt' and os.path.isfile(_DEFAULT_WINDOWS_TESSERACT):
    _tesseract_cmd = _DEFAULT_WINDOWS_TESSERACT
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd
38
+
39
+ # Function to extract text from PDF, using OCR for scanned documents
40
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Pages whose extracted text is empty or whitespace-only (typically scanned
    pages) are rendered to an image and passed through Tesseract OCR instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined in page order, with no separator added.
    """
    doc = fitz.open(pdf_path)
    try:
        page_texts = []
        for page in doc:
            page_text = page.get_text()

            # If no text (i.e., scanned PDF), use OCR
            if not page_text.strip():
                # alpha=False keeps the samples at 3 bytes per pixel, which is
                # what the "RGB" mode passed to Pillow below expects.
                pix = page.get_pixmap(alpha=False)
                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                page_text = pytesseract.image_to_string(img)

            page_texts.append(page_text)
        return "".join(page_texts)
    finally:
        # Release the file handle even if extraction or OCR fails midway.
        doc.close()
55
+
56
+ # Function to handle .docx files
57
def load_docx_file(docx_path):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The text of each entry in the document's ``paragraphs`` collection,
        joined with newline characters.
    """
    paragraphs = docx.Document(docx_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)
63
+
64
+ # Function to handle .txt files
65
def load_txt_file(txt_path):
    """Return the full contents of a UTF-8 text file.

    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark
    (written by some Windows editors) is dropped instead of leaking into the
    text; files without a BOM decode exactly as plain UTF-8.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The decoded file contents as a single string.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(txt_path, 'r', encoding='utf-8-sig') as f:
        return f.read()
68
+
69
+ # Function to handle file based on its extension
70
def load_file_based_on_extension(file_path):
    """Extract text from a file, choosing the loader by its extension.

    The extension is matched case-insensitively, so ``REPORT.PDF`` is handled
    the same way as ``report.pdf``.

    Args:
        file_path: Path of a ``.pdf``, ``.docx`` or ``.txt`` file.

    Returns:
        The text produced by the loader for that file type.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith('.pdf'):
        return extract_text_from_pdf(file_path)
    if lowered_path.endswith('.docx'):
        return load_docx_file(file_path)
    if lowered_path.endswith('.txt'):
        return load_txt_file(file_path)
    raise ValueError(f"Unsupported file format: {file_path}")
79
+
80
+ # Function to process a folder and index all files within it
81
def process_folder(folder_path):
    """Extract and combine the text of every supported file in a folder.

    Only regular files directly inside ``folder_path`` are read; nested
    directories are ignored. Entries are visited in sorted name order because
    ``os.listdir`` returns them in arbitrary order, which would otherwise make
    the combined text differ between runs and platforms.

    Args:
        folder_path: Directory whose files should be loaded.

    Returns:
        The text of all loaded files joined with single spaces, or an empty
        string when nothing could be loaded.
    """
    documents = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        try:
            documents.append(load_file_based_on_extension(file_path))
        except ValueError as e:
            # Covers unsupported extensions; UnicodeDecodeError is a ValueError
            # subclass, so undecodable text files are skipped here as well.
            print(f"Skipping unsupported file: {file_path} ({e})")
    return ' '.join(documents)  # Combine all documents text
92
+
93
+ # Function to count tokens
94
def count_tokens(text, model_name):
    """Return the number of tokens ``text`` occupies for the given model.

    Args:
        text: The string to measure.
        model_name: Model name used to select the tiktoken encoding.

    Returns:
        The token count as an integer.
    """
    try:
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # The installed tiktoken does not know this model name; fall back to a
        # general-purpose encoding so the count is still a usable estimate.
        encoding = tiktoken.get_encoding("cl100k_base")
    # disallowed_special=() makes text such as "<|endoftext|>" inside a
    # document encode as ordinary text instead of raising ValueError.
    return len(encoding.encode(text, disallowed_special=()))
98
+
99
+ # Function to split text into chunks
100
def split_text_into_chunks(text, max_tokens, model_name):
    """Split text into consecutive pieces of at most ``max_tokens`` tokens.

    Args:
        text: The string to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        model_name: Model name used to select the tiktoken encoding.

    Returns:
        A list of decoded chunk strings in original order (empty when ``text``
        encodes to no tokens).

    Raises:
        ValueError: If ``max_tokens`` is not positive. A non-positive size
            would never advance through the token list.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens}")

    try:
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # Unknown model name for the installed tiktoken: use a general encoding.
        encoding = tiktoken.get_encoding("cl100k_base")

    # disallowed_special=() lets special-token text in a document encode as
    # ordinary text instead of raising ValueError.
    tokens = encoding.encode(text, disallowed_special=())

    # NOTE(review): chunks are cut at token boundaries, not word or sentence
    # boundaries, so a chunk may begin or end mid-word.
    return [
        encoding.decode(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]
115
+
116
+ # Function to summarize text, splitting input that exceeds the model's context window into chunks
117
def summarize_text(text, length, model_name, additional_prompt):
    """Summarize text with an OpenAI chat model, chunking input that is too long.

    When the text, a fixed buffer and the requested length together exceed the
    model's context window, the text is split into chunks, each chunk is
    summarized, and the joined chunk summaries are summarized again.

    Args:
        text: The text to summarize.
        length: Target summary length. It is quoted to the model as a word
            count and also used as the completion token cap; values above the
            model's output limit are reduced to that limit.
        model_name: A key of ``MODEL_SPECS``.
        additional_prompt: Optional extra instructions appended to the prompt.

    Returns:
        The summary with surrounding whitespace removed, or an empty string if
        the model returned no text content.

    Raises:
        ValueError: If ``model_name`` has no entry in ``MODEL_SPECS``, or if
            the requested length leaves no room for input text in the model's
            context window.
    """
    model_specs = MODEL_SPECS.get(model_name)
    if not model_specs:
        raise ValueError(f"Model specifications not found for model {model_name}")

    max_output_tokens = model_specs['max_output_tokens']
    max_context_tokens = model_specs['max_context_tokens']

    # The length is sent as max_tokens below, so it may not exceed the model's
    # output limit.
    length = min(length, max_output_tokens)

    input_token_count = count_tokens(text, model_name)
    buffer_tokens = 500  # presumably headroom for the instructions wrapped around the text

    if input_token_count + buffer_tokens + length > max_context_tokens:
        max_chunk_tokens = max_context_tokens - buffer_tokens - length
        if max_chunk_tokens <= 0:
            # Without this guard the chunker would be asked for zero or
            # negative sized chunks and could never make progress.
            raise ValueError(
                f"Requested summary length ({length} tokens) leaves no room for input text "
                f"in the {max_context_tokens}-token context window of model {model_name}"
            )
        chunks = split_text_into_chunks(text, max_chunk_tokens, model_name)
        summaries = [
            summarize_text(chunk, length, model_name, additional_prompt)
            for chunk in chunks
        ]
        combined_summary = ' '.join(summaries)
        # NOTE(review): this second pass assumes the chunk summaries are
        # shorter than the original text - confirm for large `length` values.
        return summarize_text(combined_summary, length, model_name, additional_prompt)

    prompt = (
        f"Please provide a clear and concise summary of the following text in approximately {length} words. "
        "Ensure that the summary does not include any special characters, symbols, or markdown formatting. "
        "Use plain language and proper punctuation."
    )
    if additional_prompt:
        prompt += f"\n\nAdditional instructions: {additional_prompt}"
    prompt += f"\n\nText to summarize:\n{text}"

    # NOTE(review): `length` is quoted to the model as words but enforced as
    # tokens, so a summary of the full requested word count may be cut short.
    completion = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": prompt}
        ],
        max_tokens=length
    )
    # The message content can be None when the model returns no text.
    content = completion.choices[0].message.content
    return (content or "").strip()
158
+
159
+ # Function to calculate summary length based on desired audio duration
160
def calculate_summary_length_by_duration(duration_minutes, voice_speed):
    """Return how many words fill the requested audio duration.

    Args:
        duration_minutes: Desired audio length in minutes.
        voice_speed: ``'normal'`` selects 150 words per minute; any other
            value selects 120 words per minute.

    Returns:
        The word budget, truncated to an integer.
    """
    if voice_speed == 'normal':
        pace = 150
    else:
        pace = 120
    return int(duration_minutes * pace)
164
+
165
+ # Function to translate the summarized text using deep-translator
166
def _split_for_translation(text, max_chars):
    """Split text into pieces of at most ``max_chars``, preferring sentence ends."""
    pieces = []
    remaining = text
    while len(remaining) > max_chars:
        window = remaining[:max_chars]
        # Prefer the last sentence end or line break, then any space, and only
        # cut mid-word when the window contains no whitespace at all.
        cut = max(window.rfind(mark) for mark in ('. ', '! ', '? ', '\n'))
        if cut <= 0:
            cut = window.rfind(' ')
        if cut <= 0:
            cut = max_chars - 1
        piece = remaining[:cut + 1]
        if piece.strip():
            pieces.append(piece)
        remaining = remaining[cut + 1:]
    if remaining.strip():
        pieces.append(remaining)
    return pieces


def translate_text(text, target_language):
    """Translate text into ``target_language`` using deep-translator's Google backend.

    The source language is auto-detected. Text longer than a single request
    allows is translated piece by piece, because deep-translator rejects
    Google requests of 5000 characters or more.

    Args:
        text: The text to translate.
        target_language: Target language identifier passed to ``GoogleTranslator``.

    Returns:
        The translated text. Empty or whitespace-only input yields an empty
        string without contacting the translation service. For long input the
        translated pieces are joined with single spaces.
    """
    max_chars = 4500  # safely below the translator's 5000-character limit

    if isinstance(text, str) and not text.strip():
        return ""

    translator = GoogleTranslator(source='auto', target=target_language)
    if not isinstance(text, str) or len(text) <= max_chars:
        # Short (or non-string) input takes the original single-request path.
        return translator.translate(text)

    translated_pieces = [
        translator.translate(piece)
        for piece in _split_for_translation(text, max_chars)
    ]
    return ' '.join(piece.strip() for piece in translated_pieces if piece)
169
+
170
+ # Function to estimate audio duration
171
def estimate_audio_duration(text, voice_speed):
    """Estimate how long the spoken text will last.

    Args:
        text: The text that will be read aloud; words are counted by
            splitting on whitespace.
        voice_speed: ``'normal'`` assumes 150 words per minute; any other
            value assumes 120 words per minute.

    Returns:
        The estimated duration in seconds as a float.
    """
    if voice_speed == 'normal':
        pace = 150
    else:
        pace = 120
    minutes = len(text.split()) / pace
    return minutes * 60
177
+
178
+ # Function to convert text to audio using the OpenAI tts-1-hd model
179
def text_to_speech_openai(text, audio_path, voice, speed):
    """Synthesize speech for ``text`` with OpenAI's ``tts-1-hd`` model and save it.

    Args:
        text: The text to read aloud.
        audio_path: Destination file for the generated audio; its parent
            directory is created if it does not exist.
        voice: Voice name passed to the speech API.
        speed: Playback speed. A numeric value is clamped to the API's
            0.25-4.0 range and sent with the request; any other value (for
            example a label such as ``'normal'``) leaves the API default in
            place, as before.

    Returns:
        None. The audio is written to ``audio_path``.
    """
    request = {
        "model": "tts-1-hd",
        "voice": voice,
        "input": text,
    }
    # bool is a subclass of int, so exclude it explicitly.
    if isinstance(speed, (int, float)) and not isinstance(speed, bool):
        request["speed"] = min(4.0, max(0.25, float(speed)))

    output_dir = os.path.dirname(audio_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Stream the audio to disk through the streaming-response API; calling
    # stream_to_file on a regular (non-streaming) response is deprecated.
    with client.audio.speech.with_streaming_response.create(**request) as response:
        response.stream_to_file(audio_path)
186
+
187
def process_input(pdf_path=None, input_text=None, summary_length=None, voice=None, language=None, voice_speed=None, model_name=None, additional_prompt=None, generate_audio=True, folder_path=None):
    """Summarize input, translate the summary, save it, and optionally voice it.

    Exactly one source is used, checked in this order: ``folder_path``,
    ``pdf_path``, ``input_text``.

    Args:
        pdf_path: Path of a single file to load; the loader is chosen by
            extension, so despite the name it is not limited to PDFs.
        input_text: Raw text to summarize.
        summary_length: Target summary length passed to ``summarize_text``.
        voice: Voice name for speech synthesis.
        language: Target language for translation; also part of the output
            file names.
        voice_speed: Speed setting used for the duration estimate and passed
            to speech synthesis.
        model_name: Chat model used for summarization.
        additional_prompt: Optional extra instructions for the summary.
        generate_audio: When true, an audio file is generated as well.
        folder_path: Directory whose supported files are all loaded.

    Returns:
        A 4-tuple ``(translated_summary, audio_file_path_or_None,
        summary_file_path, estimated_audio_duration)``. The audio path is
        ``None`` when ``generate_audio`` is false.

    Raises:
        ValueError: If none of ``folder_path``, ``pdf_path`` or ``input_text``
            is provided.
    """
    if folder_path:
        extracted_text = process_folder(folder_path)
    elif pdf_path:
        extracted_text = load_file_based_on_extension(pdf_path)
    elif input_text:
        extracted_text = input_text
    else:
        raise ValueError("No input provided for processing.")

    summary_text = summarize_text(extracted_text, summary_length, model_name, additional_prompt)
    translated_summary = translate_text(summary_text, language)
    estimated_audio_duration = estimate_audio_duration(translated_summary, voice_speed)

    # Output names derive from the input file, or 'document' when there is none.
    base_filename = os.path.splitext(os.path.basename(pdf_path or 'document'))[0]
    output_dir = 'uploads'
    # Create the output directory up front so writing does not fail on a
    # fresh checkout where it does not exist yet.
    os.makedirs(output_dir, exist_ok=True)
    audio_file_path = os.path.join(output_dir, f"{base_filename}_audio_{language}.mp3")
    summary_file_path = os.path.join(output_dir, f"{base_filename}_summary_{language}.txt")

    with open(summary_file_path, "w", encoding="utf-8") as summary_file:
        summary_file.write(translated_summary)

    if generate_audio:
        text_to_speech_openai(translated_summary, audio_file_path, voice, voice_speed)

    return (
        translated_summary,
        audio_file_path if generate_audio else None,
        summary_file_path,
        estimated_audio_duration,
    )