Commit e15e1c7
oceansweep committed
Parent(s): af37bcf
Upload 22 files
- App_Function_Libraries/Audio/Audio_Files.py +692 -0
- App_Function_Libraries/Audio/Audio_Transcription_Lib.py +329 -0
- App_Function_Libraries/Audio/Diarization_Lib.py +275 -0
- App_Function_Libraries/Audio/__init__.py +0 -0
- App_Function_Libraries/Chat.py +54 -16
- App_Function_Libraries/Chunk_Lib.py +34 -21
- App_Function_Libraries/Gradio_Related.py +15 -6
- App_Function_Libraries/Local_File_Processing_Lib.py +2 -2
- App_Function_Libraries/Local_LLM_Inference_Engine_Lib.py +2 -2
- App_Function_Libraries/MediaWiki/Media_Wiki.py +70 -33
- App_Function_Libraries/PDF/PDF_Ingestion_Lib.py +318 -0
- App_Function_Libraries/PDF/__init__.py +0 -0
- App_Function_Libraries/Summarization/Summarization_General_Lib.py +2 -2
- App_Function_Libraries/Video_DL_Ingestion_Lib.py +332 -331
- App_Function_Libraries/Web_Scraping/Article_Extractor_Lib.py +381 -0
- App_Function_Libraries/Web_Scraping/Article_Summarization_Lib.py +249 -0
- App_Function_Libraries/Web_Scraping/__init__.py +0 -0
App_Function_Libraries/Audio/Audio_Files.py
ADDED
@@ -0,0 +1,692 @@
# Audio_Files.py
#########################################
# Audio Processing Library
# This library is used to download or load audio files from a local directory.
#
####
#
# Functions:
#
# download_audio_file(url, save_path)
# process_audio(
# process_audio_file(audio_url, audio_file, whisper_model="small.en", api_name=None, api_key=None)
#
#
#########################################
# Imports
import json
import logging
import os
import subprocess
import tempfile
import uuid
from datetime import datetime
from pathlib import Path

import requests
import yt_dlp

from App_Function_Libraries.Audio.Audio_Transcription_Lib import speech_to_text
from App_Function_Libraries.Chunk_Lib import improved_chunking_process
#
# Local Imports
from App_Function_Libraries.DB.DB_Manager import add_media_to_database, add_media_with_keywords, \
    check_media_and_whisper_model
from App_Function_Libraries.Summarization.Summarization_General_Lib import save_transcription_and_summary, perform_transcription, \
    perform_summarization
from App_Function_Libraries.Utils.Utils import create_download_directory, save_segments_to_json, downloaded_files, \
    sanitize_filename
from App_Function_Libraries.Video_DL_Ingestion_Lib import extract_metadata

#
#######################################################################################################################
# Function Definitions
#

MAX_FILE_SIZE = 500 * 1024 * 1024


def download_audio_file(url, current_whisper_model="", use_cookies=False, cookies=None):
    try:
        # Check if media already exists in the database and compare whisper models
        should_download, reason = check_media_and_whisper_model(
            url=url,
            current_whisper_model=current_whisper_model
        )

        if not should_download:
            logging.info(f"Skipping audio download: {reason}")
            return None

        logging.info(f"Proceeding with audio download: {reason}")

        # Set up the request headers
        headers = {}
        if use_cookies and cookies:
            try:
                cookie_dict = json.loads(cookies)
                headers['Cookie'] = '; '.join([f'{k}={v}' for k, v in cookie_dict.items()])
            except json.JSONDecodeError:
                logging.warning("Invalid cookie format. Proceeding without cookies.")

        # Make the request
        response = requests.get(url, headers=headers, stream=True)
        # Raise an exception for bad status codes
        response.raise_for_status()

        # Get the file size
        file_size = int(response.headers.get('content-length', 0))
        if file_size > 500 * 1024 * 1024:  # 500 MB limit
            raise ValueError("File size exceeds the 500MB limit.")

        # Generate a unique filename
        file_name = f"audio_{uuid.uuid4().hex[:8]}.mp3"
        save_path = os.path.join('downloads', file_name)

        # Ensure the downloads directory exists
        os.makedirs('downloads', exist_ok=True)

        # Download the file
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

        logging.info(f"Audio file downloaded successfully: {save_path}")
        return save_path

    except requests.RequestException as e:
        logging.error(f"Error downloading audio file: {str(e)}")
        raise
    except ValueError as e:
        logging.error(str(e))
        raise
    except Exception as e:
        logging.error(f"Unexpected error downloading audio file: {str(e)}")
        raise


def process_audio(
        audio_file_path,
        num_speakers=2,
        whisper_model="small.en",
        custom_prompt_input=None,
        offset=0,
        api_name=None,
        api_key=None,
        vad_filter=False,
        rolling_summarization=False,
        detail_level=0.01,
        keywords="default,no_keyword_set",
        chunk_text_by_words=False,
        max_words=0,
        chunk_text_by_sentences=False,
        max_sentences=0,
        chunk_text_by_paragraphs=False,
        max_paragraphs=0,
        chunk_text_by_tokens=False,
        max_tokens=0
):
    try:

        # Perform transcription
        audio_file_path, segments = perform_transcription(audio_file_path, offset, whisper_model, vad_filter)

        if audio_file_path is None or segments is None:
            logging.error("Process_Audio: Transcription failed or segments not available.")
            return "Process_Audio: Transcription failed.", None, None, None, None, None

        logging.debug(f"Process_Audio: Transcription audio_file: {audio_file_path}")
        logging.debug(f"Process_Audio: Transcription segments: {segments}")

        transcription_text = {'audio_file': audio_file_path, 'transcription': segments}
        logging.debug(f"Process_Audio: Transcription text: {transcription_text}")

        # Save segments to JSON
        segments_json_path = save_segments_to_json(segments)

        # Perform summarization
        summary_text = None
        if api_name:
            if rolling_summarization is not None:
                pass
                # FIXME rolling summarization
                # summary_text = rolling_summarize_function(
                #     transcription_text,
                #     detail=detail_level,
                #     api_name=api_name,
                #     api_key=api_key,
                #     custom_prompt=custom_prompt_input,
                #     chunk_by_words=chunk_text_by_words,
                #     max_words=max_words,
                #     chunk_by_sentences=chunk_text_by_sentences,
                #     max_sentences=max_sentences,
                #     chunk_by_paragraphs=chunk_text_by_paragraphs,
                #     max_paragraphs=max_paragraphs,
                #     chunk_by_tokens=chunk_text_by_tokens,
                #     max_tokens=max_tokens
                # )
            else:
                summary_text = perform_summarization(api_name, segments_json_path, custom_prompt_input, api_key)

            if summary_text is None:
                logging.error("Summary text is None. Check summarization function.")
                summary_file_path = None
        else:
            summary_text = 'Summary not available'
            summary_file_path = None

        # Save transcription and summary
        download_path = create_download_directory("Audio_Processing")
        json_file_path, summary_file_path = save_transcription_and_summary(transcription_text, summary_text,
                                                                           download_path)

        # Update function call to add_media_to_database so that it properly applies the title, author and file type
        # Add to database
        add_media_to_database(None, {'title': 'Audio File', 'author': 'Unknown'}, segments, summary_text, keywords,
                              custom_prompt_input, whisper_model)

        return transcription_text, summary_text, json_file_path, summary_file_path, None, None

    except Exception as e:
        logging.error(f"Error in process_audio: {str(e)}")
        return str(e), None, None, None, None, None


def process_single_audio(audio_file_path, whisper_model, api_name, api_key, keep_original, custom_keywords, source,
                         custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
                         use_multi_level_chunking, chunk_language):
    progress = []
    transcription = ""
    summary = ""

    def update_progress(message):
        progress.append(message)
        return "\n".join(progress)

    try:
        # Check file size before processing
        file_size = os.path.getsize(audio_file_path)
        if file_size > MAX_FILE_SIZE:
            update_progress(f"File size ({file_size / (1024 * 1024):.2f} MB) exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f} MB. Skipping this file.")
            return "\n".join(progress), "", ""

        # Perform transcription
        update_progress("Starting transcription...")
        segments = speech_to_text(audio_file_path, whisper_model=whisper_model)
        transcription = " ".join([segment['Text'] for segment in segments])
        update_progress("Audio transcribed successfully.")

        # Perform summarization if API is provided
        if api_name and api_key:
            update_progress("Starting summarization...")
            summary = perform_summarization(api_name, transcription, "Summarize the following audio transcript",
                                            api_key)
            update_progress("Audio summarized successfully.")
        else:
            summary = "No summary available"

        # Prepare keywords
        keywords = "audio,transcription"
        if custom_keywords:
            keywords += f",{custom_keywords}"

        # Add to database
        add_media_with_keywords(
            url=source,
            title=os.path.basename(audio_file_path),
            media_type='audio',
            content=transcription,
            keywords=keywords,
            prompt="Summarize the following audio transcript",
            summary=summary,
            transcription_model=whisper_model,
            author="Unknown",
            ingestion_date=None  # This will use the current date
        )
        update_progress("Audio file added to database successfully.")

        if not keep_original and source != "Uploaded File":
            os.remove(audio_file_path)
            update_progress(f"Temporary file {audio_file_path} removed.")
        elif keep_original and source != "Uploaded File":
            update_progress(f"Original audio file kept at: {audio_file_path}")

    except Exception as e:
        update_progress(f"Error processing {source}: {str(e)}")
        transcription = f"Error: {str(e)}"
        summary = "No summary due to error"

    return "\n".join(progress), transcription, summary


def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key, use_cookies, cookies, keep_original,
                        custom_keywords, custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap,
                        use_adaptive_chunking, use_multi_level_chunking, chunk_language, diarize):
    progress = []
    temp_files = []
    all_transcriptions = []
    all_summaries = []

    def update_progress(message):
        progress.append(message)
        return "\n".join(progress)

    def cleanup_files():
        for file in temp_files:
            try:
                if os.path.exists(file):
                    os.remove(file)
                    update_progress(f"Temporary file {file} removed.")
            except Exception as e:
                update_progress(f"Failed to remove temporary file {file}: {str(e)}")

    def reencode_mp3(mp3_file_path):
        try:
            reencoded_mp3_path = mp3_file_path.replace(".mp3", "_reencoded.mp3")
            subprocess.run([ffmpeg_cmd, '-i', mp3_file_path, '-codec:a', 'libmp3lame', reencoded_mp3_path], check=True)
            update_progress(f"Re-encoded {mp3_file_path} to {reencoded_mp3_path}.")
            return reencoded_mp3_path
        except subprocess.CalledProcessError as e:
            update_progress(f"Error re-encoding {mp3_file_path}: {str(e)}")
            raise

    def convert_mp3_to_wav(mp3_file_path):
        try:
            wav_file_path = mp3_file_path.replace(".mp3", ".wav")
            subprocess.run([ffmpeg_cmd, '-i', mp3_file_path, wav_file_path], check=True)
            update_progress(f"Converted {mp3_file_path} to {wav_file_path}.")
            return wav_file_path
        except subprocess.CalledProcessError as e:
            update_progress(f"Error converting {mp3_file_path} to WAV: {str(e)}")
            raise

    try:
        # Check and set the ffmpeg command
        global ffmpeg_cmd
        if os.name == "nt":
            logging.debug("Running on Windows")
            ffmpeg_cmd = os.path.join(os.getcwd(), "Bin", "ffmpeg.exe")
        else:
            ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems

        # Ensure ffmpeg is accessible
        if not os.path.exists(ffmpeg_cmd) and os.name == "nt":
            raise FileNotFoundError(f"ffmpeg executable not found at path: {ffmpeg_cmd}")

        # Define chunk options early to avoid undefined errors
        chunk_options = {
            'method': chunk_method,
            'max_size': max_chunk_size,
            'overlap': chunk_overlap,
            'adaptive': use_adaptive_chunking,
            'multi_level': use_multi_level_chunking,
            'language': chunk_language
        }

        # Process multiple URLs
        urls = [url.strip() for url in audio_urls.split('\n') if url.strip()]

        for i, url in enumerate(urls):
            update_progress(f"Processing URL {i + 1}/{len(urls)}: {url}")

            # Download and process audio file
            audio_file_path = download_audio_file(url, use_cookies, cookies)
            if not os.path.exists(audio_file_path):
                update_progress(f"Downloaded file not found: {audio_file_path}")
                continue

            temp_files.append(audio_file_path)
            update_progress("Audio file downloaded successfully.")

            # Re-encode MP3 to fix potential issues
            reencoded_mp3_path = reencode_mp3(audio_file_path)
            if not os.path.exists(reencoded_mp3_path):
                update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
                continue

            temp_files.append(reencoded_mp3_path)

            # Convert re-encoded MP3 to WAV
            wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
            if not os.path.exists(wav_file_path):
                update_progress(f"Converted WAV file not found: {wav_file_path}")
                continue

            temp_files.append(wav_file_path)

            # Initialize transcription
            transcription = ""

            # Transcribe audio
            if diarize:
                segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True)
            else:
                segments = speech_to_text(wav_file_path, whisper_model=whisper_model)

            # Handle segments nested under 'segments' key
            if isinstance(segments, dict) and 'segments' in segments:
                segments = segments['segments']

            if isinstance(segments, list):
                transcription = " ".join([segment.get('Text', '') for segment in segments])
                update_progress("Audio transcribed successfully.")
            else:
                update_progress("Unexpected segments format received from speech_to_text.")
                logging.error(f"Unexpected segments format: {segments}")
                continue

            if not transcription.strip():
                update_progress("Transcription is empty.")
            else:
                # Apply chunking
                chunked_text = improved_chunking_process(transcription, chunk_options)

                # Summarize
                if api_name:
                    try:
                        summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
                        update_progress("Audio summarized successfully.")
                    except Exception as e:
                        logging.error(f"Error during summarization: {str(e)}")
                        summary = "Summary generation failed"
                else:
                    summary = "No summary available (API not provided)"

                all_transcriptions.append(transcription)
                all_summaries.append(summary)

                # Add to database
                add_media_with_keywords(
                    url=url,
                    title=os.path.basename(wav_file_path),
                    media_type='audio',
                    content=transcription,
                    keywords=custom_keywords,
                    prompt=custom_prompt_input,
                    summary=summary,
                    transcription_model=whisper_model,
                    author="Unknown",
                    ingestion_date=datetime.now().strftime('%Y-%m-%d')
                )
                update_progress("Audio file processed and added to database.")

        # Process uploaded file if provided
        if audio_file:
            if os.path.getsize(audio_file.name) > MAX_FILE_SIZE:
                update_progress(
                    f"Uploaded file size exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB. Skipping this file.")
            else:
                # Re-encode MP3 to fix potential issues
                reencoded_mp3_path = reencode_mp3(audio_file.name)
                if not os.path.exists(reencoded_mp3_path):
                    update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
                    return update_progress("Processing failed: Re-encoded file not found"), "", ""

                temp_files.append(reencoded_mp3_path)

                # Convert re-encoded MP3 to WAV
                wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
                if not os.path.exists(wav_file_path):
                    update_progress(f"Converted WAV file not found: {wav_file_path}")
                    return update_progress("Processing failed: Converted WAV file not found"), "", ""

                temp_files.append(wav_file_path)

                # Initialize transcription
                transcription = ""

                if diarize:
                    segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True)
                else:
                    segments = speech_to_text(wav_file_path, whisper_model=whisper_model)

                # Handle segments nested under 'segments' key
                if isinstance(segments, dict) and 'segments' in segments:
                    segments = segments['segments']

                if isinstance(segments, list):
                    transcription = " ".join([segment.get('Text', '') for segment in segments])
                else:
                    update_progress("Unexpected segments format received from speech_to_text.")
                    logging.error(f"Unexpected segments format: {segments}")

                chunked_text = improved_chunking_process(transcription, chunk_options)

                if api_name and api_key:
                    try:
                        summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
                        update_progress("Audio summarized successfully.")
                    except Exception as e:
                        logging.error(f"Error during summarization: {str(e)}")
                        summary = "Summary generation failed"
                else:
                    summary = "No summary available (API not provided)"

                all_transcriptions.append(transcription)
                all_summaries.append(summary)

                add_media_with_keywords(
                    url="Uploaded File",
                    title=os.path.basename(wav_file_path),
                    media_type='audio',
                    content=transcription,
                    keywords=custom_keywords,
                    prompt=custom_prompt_input,
                    summary=summary,
                    transcription_model=whisper_model,
                    author="Unknown",
                    ingestion_date=datetime.now().strftime('%Y-%m-%d')
                )
                update_progress("Uploaded file processed and added to database.")

        # Final cleanup
        if not keep_original:
            cleanup_files()

        final_progress = update_progress("All processing complete.")
        final_transcriptions = "\n\n".join(all_transcriptions)
        final_summaries = "\n\n".join(all_summaries)

        return final_progress, final_transcriptions, final_summaries

    except Exception as e:
        logging.error(f"Error processing audio files: {str(e)}")
        cleanup_files()
        return update_progress(f"Processing failed: {str(e)}"), "", ""


def download_youtube_audio(url):
    try:
        # Determine ffmpeg path based on the operating system.
        ffmpeg_path = './Bin/ffmpeg.exe' if os.name == 'nt' else 'ffmpeg'

        # Create a temporary directory
        with tempfile.TemporaryDirectory() as temp_dir:
            # Extract information about the video
            with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
                info_dict = ydl.extract_info(url, download=False)
                sanitized_title = sanitize_filename(info_dict['title'])

            # Setup the temporary filenames
            temp_video_path = Path(temp_dir) / f"{sanitized_title}_temp.mp4"
            temp_audio_path = Path(temp_dir) / f"{sanitized_title}.mp3"

            # Initialize yt-dlp with options for downloading
            ydl_opts = {
                'format': 'bestaudio[ext=m4a]/best[height<=480]',  # Prefer best audio, or video up to 480p
                'ffmpeg_location': ffmpeg_path,
                'outtmpl': str(temp_video_path),
                'noplaylist': True,
                'quiet': True
            }

            # Execute yt-dlp to download the video/audio
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])

            # Check if the file exists
            if not temp_video_path.exists():
                raise FileNotFoundError(f"Expected file was not found: {temp_video_path}")

            # Use ffmpeg to extract audio
            ffmpeg_command = [
                ffmpeg_path,
                '-i', str(temp_video_path),
                '-vn',  # No video
                '-acodec', 'libmp3lame',
                '-b:a', '192k',
                str(temp_audio_path)
            ]
            subprocess.run(ffmpeg_command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

            # Check if the audio file was created
            if not temp_audio_path.exists():
                raise FileNotFoundError(f"Expected audio file was not found: {temp_audio_path}")

            # Create a persistent directory for the download if it doesn't exist
            persistent_dir = Path("downloads")
            persistent_dir.mkdir(exist_ok=True)

            # Move the file from the temporary directory to the persistent directory
            persistent_file_path = persistent_dir / f"{sanitized_title}.mp3"
            os.replace(str(temp_audio_path), str(persistent_file_path))

            # Add the file to the list of downloaded files
            downloaded_files.append(str(persistent_file_path))

            return str(persistent_file_path), f"Audio downloaded successfully: {sanitized_title}.mp3"
    except Exception as e:
        return None, f"Error downloading audio: {str(e)}"


def process_podcast(url, title, author, keywords, custom_prompt, api_name, api_key, whisper_model,
                    keep_original=False, enable_diarization=False, use_cookies=False, cookies=None,
                    chunk_method=None, max_chunk_size=300, chunk_overlap=0, use_adaptive_chunking=False,
                    use_multi_level_chunking=False, chunk_language='english'):
    progress = []
    error_message = ""
    temp_files = []

    def update_progress(message):
        progress.append(message)
        return "\n".join(progress)

    def cleanup_files():
        if not keep_original:
            for file in temp_files:
                try:
                    if os.path.exists(file):
                        os.remove(file)
                        update_progress(f"Temporary file {file} removed.")
                except Exception as e:
                    update_progress(f"Failed to remove temporary file {file}: {str(e)}")

    try:
        # Download podcast
        audio_file = download_audio_file(url, use_cookies, cookies)
        temp_files.append(audio_file)
        update_progress("Podcast downloaded successfully.")

        # Extract metadata
        metadata = extract_metadata(url)
        title = title or metadata.get('title', 'Unknown Podcast')
        author = author or metadata.get('uploader', 'Unknown Author')

        # Format metadata for storage
        metadata_text = f"""
Metadata:
Title: {title}
Author: {author}
Series: {metadata.get('series', 'N/A')}
Episode: {metadata.get('episode', 'N/A')}
Season: {metadata.get('season', 'N/A')}
Upload Date: {metadata.get('upload_date', 'N/A')}
Duration: {metadata.get('duration', 'N/A')} seconds
Description: {metadata.get('description', 'N/A')}
"""

        # Update keywords
        new_keywords = []
        if metadata.get('series'):
            new_keywords.append(f"series:{metadata['series']}")
        if metadata.get('episode'):
            new_keywords.append(f"episode:{metadata['episode']}")
        if metadata.get('season'):
            new_keywords.append(f"season:{metadata['season']}")

        keywords = f"{keywords},{','.join(new_keywords)}" if keywords else ','.join(new_keywords)

        update_progress(f"Metadata extracted - Title: {title}, Author: {author}, Keywords: {keywords}")

        # Transcribe the podcast
        try:
            if enable_diarization:
                segments = speech_to_text(audio_file, whisper_model=whisper_model, diarize=True)
            else:
                segments = speech_to_text(audio_file, whisper_model=whisper_model)
            transcription = " ".join([segment['Text'] for segment in segments])
            update_progress("Podcast transcribed successfully.")
        except Exception as e:
            error_message = f"Transcription failed: {str(e)}"
            raise

        # Apply chunking
        chunk_options = {
            'method': chunk_method,
            'max_size': max_chunk_size,
            'overlap': chunk_overlap,
            'adaptive': use_adaptive_chunking,
            'multi_level': use_multi_level_chunking,
            'language': chunk_language
        }
        chunked_text = improved_chunking_process(transcription, chunk_options)

        # Combine metadata and transcription
        full_content = metadata_text + "\n\nTranscription:\n" + transcription

        # Summarize if API is provided
        summary = None
        if api_name and api_key:
            try:
                summary = perform_summarization(api_name, chunked_text, custom_prompt, api_key)
                update_progress("Podcast summarized successfully.")
            except Exception as e:
                error_message = f"Summarization failed: {str(e)}"
                raise

        # Add to database
        try:
            add_media_with_keywords(
                url=url,
                title=title,
                media_type='podcast',
                content=full_content,
                keywords=keywords,
                prompt=custom_prompt,
                summary=summary or "No summary available",
                transcription_model=whisper_model,
                author=author,
                ingestion_date=datetime.now().strftime('%Y-%m-%d')
            )
            update_progress("Podcast added to database successfully.")
        except Exception as e:
            error_message = f"Error adding podcast to database: {str(e)}"
            raise

        # Cleanup
        cleanup_files()

        return (update_progress("Processing complete."), full_content, summary or "No summary generated.",
                title, author, keywords, error_message)

    except Exception as e:
        logging.error(f"Error processing podcast: {str(e)}")
        cleanup_files()
        return update_progress(f"Processing failed: {str(e)}"), "", "", "", "", "", str(e)


#
#
#######################################################################################################################
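
Usage note (not part of the commit): a minimal sketch of how the process_audio_files() entry point above might be driven from a script, assuming the surrounding tldw package (DB_Manager, Summarization_General_Lib, Utils, Chunk_Lib) is installed and configured. The URL, keyword, prompt, and chunking values are illustrative placeholders only.

# Hypothetical driver for process_audio_files(); all argument values are examples.
from App_Function_Libraries.Audio.Audio_Files import process_audio_files

progress, transcriptions, summaries = process_audio_files(
    audio_urls="https://example.com/episode.mp3",  # newline-separated list of URLs
    audio_file=None,              # or an uploaded-file object exposing a .name path
    whisper_model="small.en",
    api_name=None,                # no summarization API configured in this sketch
    api_key=None,
    use_cookies=False,
    cookies=None,
    keep_original=False,
    custom_keywords="demo",
    custom_prompt_input="Summarize the following audio transcript",
    chunk_method="words",         # assumed to be a method accepted by improved_chunking_process
    max_chunk_size=300,
    chunk_overlap=0,
    use_adaptive_chunking=False,
    use_multi_level_chunking=False,
    chunk_language="english",
    diarize=False,
)
print(progress)
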
App_Function_Libraries/Audio/Audio_Transcription_Lib.py
ADDED
@@ -0,0 +1,329 @@
# Audio_Transcription_Lib.py
#########################################
# Transcription Library
# This library is used to perform transcription of audio files.
# Currently, uses faster_whisper for transcription.
#
####################
# Function List
#
# 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
# 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False)
#
####################
#
# Import necessary libraries to run solo for testing
import gc
import json
import logging
import os
import queue
import sys
import subprocess
import tempfile
import threading
import time
# DEBUG Imports
#from memory_profiler import profile
import pyaudio
from faster_whisper import WhisperModel as OriginalWhisperModel
from typing import Optional, Union, List, Dict, Any
#
# Import Local
from App_Function_Libraries.Utils.Utils import load_comprehensive_config
#
#######################################################################################################################
# Function Definitions
#

# Convert video .m4a into .wav using ffmpeg
# ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
# https://www.gyan.dev/ffmpeg/builds/
#

whisper_model_instance = None
config = load_comprehensive_config()
processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')


class WhisperModel(OriginalWhisperModel):
    tldw_dir = os.path.dirname(os.path.dirname(__file__))
    default_download_root = os.path.join(tldw_dir, 'App_Function_Libraries', 'models', 'Whisper')

    valid_model_sizes = [
        "tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium",
        "large-v1", "large-v2", "large-v3", "large", "distil-large-v2", "distil-medium.en",
        "distil-small.en", "distil-large-v3"
    ]

    def __init__(
        self,
        model_size_or_path: str,
        device: str = "auto",
        device_index: Union[int, List[int]] = 0,
        compute_type: str = "default",
        cpu_threads: int = 16,
        num_workers: int = 1,
        download_root: Optional[str] = None,
        local_files_only: bool = False,
        files: Optional[Dict[str, Any]] = None,
        **model_kwargs: Any
    ):
        if download_root is None:
            download_root = self.default_download_root

        os.makedirs(download_root, exist_ok=True)

        # FIXME - validate....
        # Also write an integration test...
        # Check if model_size_or_path is a valid model size
        if model_size_or_path in self.valid_model_sizes:
            # It's a model size, so we'll use the download_root
            model_path = os.path.join(download_root, model_size_or_path)
            if not os.path.isdir(model_path):
                # If it doesn't exist, we'll let the parent class download it
                model_size_or_path = model_size_or_path  # Keep the original model size
            else:
                # If it exists, use the full path
                model_size_or_path = model_path
        else:
            # It's not a valid model size, so assume it's a path
            model_size_or_path = os.path.abspath(model_size_or_path)

        super().__init__(
            model_size_or_path,
            device=device,
            device_index=device_index,
            compute_type=compute_type,
            cpu_threads=cpu_threads,
            num_workers=num_workers,
            download_root=download_root,
            local_files_only=local_files_only,
            # Maybe? idk, FIXME
            # files=files,
            # **model_kwargs
        )

def get_whisper_model(model_name, device):
    global whisper_model_instance
    if whisper_model_instance is None:
        logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
        whisper_model_instance = WhisperModel(model_name, device=device)
    return whisper_model_instance

# # FIXME: This is a temporary solution.
# # This doesn't clear older models, which means potentially a lot of memory is being used...
# def get_whisper_model(model_name, device):
#     global whisper_model_instance
#     if whisper_model_instance is None:
#         from faster_whisper import WhisperModel
#         logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
#
#         # FIXME - add logic to detect if the model is already downloaded
#         #   want to first check if the model is already downloaded
#         #   if not, download it using the existing logic in 'WhisperModel'
#         #   https://github.com/SYSTRAN/faster-whisper/blob/d57c5b40b06e59ec44240d93485a95799548af50/faster_whisper/transcribe.py#L584
#         #   Designated path should be `tldw/App_Function_Libraries/models/Whisper/`
#         WhisperModel.download_root = os.path.join(os.path.dirname(__file__), 'models', 'Whisper')
#         os.makedirs(WhisperModel.download_root, exist_ok=True)
#         whisper_model_instance = WhisperModel(model_name, device=device)
#     return whisper_model_instance


# os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
#DEBUG
#@profile
def convert_to_wav(video_file_path, offset=0, overwrite=False):
    out_path = os.path.splitext(video_file_path)[0] + ".wav"

    if os.path.exists(out_path) and not overwrite:
        print(f"File '{out_path}' already exists. Skipping conversion.")
        logging.info(f"Skipping conversion as file already exists: {out_path}")
        return out_path
    print("Starting conversion process of .m4a to .WAV")
    out_path = os.path.splitext(video_file_path)[0] + ".wav"

    try:
        if os.name == "nt":
            logging.debug("ffmpeg being run on Windows")

            if sys.platform.startswith('win'):
                ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
                logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
            else:
                ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems

            command = [
                ffmpeg_cmd,  # Assuming the working directory is correctly set where .\Bin exists
                "-ss", "00:00:00",  # Start at the beginning of the video
                "-i", video_file_path,
                "-ar", "16000",  # Audio sample rate
                "-ac", "1",  # Number of audio channels
                "-c:a", "pcm_s16le",  # Audio codec
                out_path
            ]
            try:
                # Redirect stdin from null device to prevent ffmpeg from waiting for input
                with open(os.devnull, 'rb') as null_file:
                    result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
                if result.returncode == 0:
                    logging.info("FFmpeg executed successfully")
                    logging.debug("FFmpeg output: %s", result.stdout)
                else:
                    logging.error("Error in running FFmpeg")
                    logging.error("FFmpeg stderr: %s", result.stderr)
                    raise RuntimeError(f"FFmpeg error: {result.stderr}")
            except Exception as e:
                logging.error("Error occurred - ffmpeg doesn't like windows")
                raise RuntimeError("ffmpeg failed")
        elif os.name == "posix":
            os.system(f'ffmpeg -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
        else:
            raise RuntimeError("Unsupported operating system")
        logging.info("Conversion to WAV completed: %s", out_path)
    except subprocess.CalledProcessError as e:
        logging.error("Error executing FFmpeg command: %s", str(e))
        raise RuntimeError("Error converting video file to WAV")
    except Exception as e:
        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
        return {"error": str(e)}
    gc.collect()
    return out_path


# Transcribe .wav into .segments.json
#DEBUG
#@profile
def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
    global whisper_model_instance, processing_choice
    logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)

    time_start = time.time()
    if audio_file_path is None:
        raise ValueError("speech-to-text: No audio file provided")
    logging.info("speech-to-text: Audio file path: %s", audio_file_path)

    try:
        _, file_ending = os.path.splitext(audio_file_path)
        out_file = audio_file_path.replace(file_ending, ".segments.json")
        prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
        if os.path.exists(out_file):
            logging.info("speech-to-text: Segments file already exists: %s", out_file)
            with open(out_file) as f:
                global segments
                segments = json.load(f)
            return segments

        logging.info('speech-to-text: Starting transcription...')
        options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
        transcribe_options = dict(task="transcribe", **options)
        # use function and config at top of file
        logging.debug("speech-to-text: Using whisper model: %s", whisper_model)
        whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
        segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)

        segments = []
        for segment_chunk in segments_raw:
            chunk = {
                "Time_Start": segment_chunk.start,
                "Time_End": segment_chunk.end,
                "Text": segment_chunk.text
            }
            logging.debug("Segment: %s", chunk)
            segments.append(chunk)
            # Print to verify it's working
            print(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")

            # Log it as well.
            logging.debug(
                f"Transcribed Segment: {segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")

        if segments:
            segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]

        if not segments:
            raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")
        logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)

        # Save the segments to a JSON file - prettified and non-prettified
        # FIXME so this is an optional flag to save either the prettified json file or the normal one
        save_json = True
        if save_json:
            logging.info("speech-to-text: Saving segments to JSON file")
            output_data = {'segments': segments}

            logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
            with open(prettified_out_file, 'w') as f:
                json.dump(output_data, f, indent=2)

            logging.info("speech-to-text: Saving JSON to %s", out_file)
            with open(out_file, 'w') as f:
                json.dump(output_data, f)

        logging.debug(f"speech-to-text: returning {segments[:500]}")
        gc.collect()
        return segments

    except Exception as e:
        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
        raise RuntimeError("speech-to-text: Error transcribing audio")


def record_audio(duration, sample_rate=16000, chunk_size=1024):
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=sample_rate,
                    input=True,
                    frames_per_buffer=chunk_size)

    print("Recording...")
    frames = []
    stop_recording = threading.Event()
    audio_queue = queue.Queue()

    def audio_callback():
        for _ in range(0, int(sample_rate / chunk_size * duration)):
            if stop_recording.is_set():
                break
            data = stream.read(chunk_size)
            audio_queue.put(data)

    audio_thread = threading.Thread(target=audio_callback)
    audio_thread.start()

    return p, stream, audio_queue, stop_recording, audio_thread


def stop_recording(p, stream, audio_queue, stop_recording_event, audio_thread):
    stop_recording_event.set()
    audio_thread.join()

    frames = []
    while not audio_queue.empty():
        frames.append(audio_queue.get())

    print("Recording finished.")

    stream.stop_stream()
    stream.close()
    p.terminate()

    return b''.join(frames)

def save_audio_temp(audio_data, sample_rate=16000):
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        import wave
        wf = wave.open(temp_file.name, 'wb')
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio_data)
        wf.close()
    return temp_file.name

#
#
#######################################################################################################################
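
Usage note (not part of the commit): a short sketch of calling convert_to_wav() and speech_to_text() from this library directly, assuming ffmpeg and faster_whisper are available and a config file with a 'Processing' section is present; the input filename is a placeholder.

# Hypothetical standalone transcription example; the file name is illustrative.
from App_Function_Libraries.Audio.Audio_Transcription_Lib import convert_to_wav, speech_to_text

wav_path = convert_to_wav("sample_talk.m4a")  # writes sample_talk.wav next to the input
segments = speech_to_text(wav_path, whisper_model="small.en", vad_filter=True)

# A cached .segments.json comes back as a dict wrapped under 'segments',
# mirroring the handling in Audio_Files.py above.
if isinstance(segments, dict) and 'segments' in segments:
    segments = segments['segments']

for seg in segments[:5]:
    print(f"{seg['Time_Start']:.2f}-{seg['Time_End']:.2f}: {seg['Text']}")
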
App_Function_Libraries/Audio/Diarization_Lib.py
ADDED
@@ -0,0 +1,275 @@
# Diarization_Lib.py
#########################################
# Diarization Library
# This library is used to perform diarization of audio files.
# Currently, uses FIXME for transcription.
#
####################
####################
# Function List
#
# 1. speaker_diarize(video_file_path, segments, embedding_model = "pyannote/embedding", embedding_size=512, num_speakers=0)
#
####################
# Import necessary libraries
import logging
from pathlib import Path
from typing import Dict, List, Any

#
# Import Local Libraries
from App_Function_Libraries.Audio.Audio_Transcription_Lib import speech_to_text
#
# Import 3rd Party Libraries
from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
import yaml
#
#######################################################################################################################
# Function Definitions
#

def load_pipeline_from_pretrained(path_to_config: str | Path) -> SpeakerDiarization:
    path_to_config = Path(path_to_config).resolve()
    logging.debug(f"Loading pyannote pipeline from {path_to_config}...")

    if not path_to_config.exists():
        raise FileNotFoundError(f"Config file not found: {path_to_config}")

    # Load the YAML configuration
    with open(path_to_config, 'r') as config_file:
        config = yaml.safe_load(config_file)

    # Debug: print the entire config
    logging.debug(f"Loaded config: {config}")

    # Create the SpeakerDiarization pipeline
    try:
        pipeline = SpeakerDiarization(
            segmentation=config['pipeline']['params']['segmentation'],
            embedding=config['pipeline']['params']['embedding'],
            clustering=config['pipeline']['params']['clustering'],
        )
    except KeyError as e:
        logging.error(f"Error accessing config key: {e}")
        raise

    # Set other parameters
    try:
        pipeline_params = {
            "segmentation": {},
            "clustering": {},
        }

        if 'params' in config and 'segmentation' in config['params']:
            if 'min_duration_off' in config['params']['segmentation']:
                pipeline_params["segmentation"]["min_duration_off"] = config['params']['segmentation']['min_duration_off']

        if 'params' in config and 'clustering' in config['params']:
            if 'method' in config['params']['clustering']:
                pipeline_params["clustering"]["method"] = config['params']['clustering']['method']
            if 'min_cluster_size' in config['params']['clustering']:
                pipeline_params["clustering"]["min_cluster_size"] = config['params']['clustering']['min_cluster_size']
            if 'threshold' in config['params']['clustering']:
                pipeline_params["clustering"]["threshold"] = config['params']['clustering']['threshold']

        if 'pipeline' in config and 'params' in config['pipeline']:
            if 'embedding_batch_size' in config['pipeline']['params']:
                pipeline_params["embedding_batch_size"] = config['pipeline']['params']['embedding_batch_size']
            if 'embedding_exclude_overlap' in config['pipeline']['params']:
                pipeline_params["embedding_exclude_overlap"] = config['pipeline']['params']['embedding_exclude_overlap']
            if 'segmentation_batch_size' in config['pipeline']['params']:
                pipeline_params["segmentation_batch_size"] = config['pipeline']['params']['segmentation_batch_size']

        logging.debug(f"Pipeline params: {pipeline_params}")
        pipeline.instantiate(pipeline_params)
    except KeyError as e:
        logging.error(f"Error accessing config key: {e}")
        raise
    except Exception as e:
        logging.error(f"Error instantiating pipeline: {e}")
        raise

    return pipeline


def audio_diarization(audio_file_path: str) -> list:
    logging.info('audio-diarization: Loading pyannote pipeline')

    base_dir = Path(__file__).parent.resolve()
    config_path = base_dir / 'models' / 'pyannote_diarization_config.yaml'
    logging.info(f"audio-diarization: Loading pipeline from {config_path}")

    try:
        pipeline = load_pipeline_from_pretrained(config_path)
    except Exception as e:
        logging.error(f"Failed to load pipeline: {str(e)}")
        raise

    logging.info(f"audio-diarization: Audio file path: {audio_file_path}")

    try:
        logging.info('audio-diarization: Starting diarization...')
        diarization_result = pipeline(audio_file_path)

        segments = []
        for turn, _, speaker in diarization_result.itertracks(yield_label=True):
            segment = {
                "start": turn.start,
                "end": turn.end,
                "speaker": speaker
            }
            logging.debug(f"Segment: {segment}")
            segments.append(segment)
        logging.info("audio-diarization: Diarization completed with pyannote")

        return segments

    except Exception as e:
        logging.error(f"audio-diarization: Error performing diarization: {str(e)}")
        raise RuntimeError("audio-diarization: Error performing diarization") from e


# Old
# def audio_diarization(audio_file_path):
#     logging.info('audio-diarization: Loading pyannote pipeline')
#
#     #config file loading
#     current_dir = os.path.dirname(os.path.abspath(__file__))
#     # Construct the path to the config file
#     config_path = os.path.join(current_dir, 'Config_Files', 'config.txt')
#     # Read the config file
#     config = configparser.ConfigParser()
#     config.read(config_path)
#     processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
#
#     base_dir = Path(__file__).parent.resolve()
#     config_path = base_dir / 'models' / 'config.yaml'
#     pipeline = load_pipeline_from_pretrained(config_path)
#
#     time_start = time.time()
#     if audio_file_path is None:
#         raise ValueError("audio-diarization: No audio file provided")
#     logging.info("audio-diarization: Audio file path: %s", audio_file_path)
#
#     try:
#         _, file_ending = os.path.splitext(audio_file_path)
#         out_file = audio_file_path.replace(file_ending, ".diarization.json")
#         prettified_out_file = audio_file_path.replace(file_ending, ".diarization_pretty.json")
#         if os.path.exists(out_file):
#             logging.info("audio-diarization: Diarization file already exists: %s", out_file)
#             with open(out_file) as f:
#                 global diarization_result
#                 diarization_result = json.load(f)
#                 return diarization_result
#
#         logging.info('audio-diarization: Starting diarization...')
#         diarization_result = pipeline(audio_file_path)
#
#         segments = []
#         for turn, _, speaker in diarization_result.itertracks(yield_label=True):
#             chunk = {
#                 "Time_Start": turn.start,
#                 "Time_End": turn.end,
#                 "Speaker": speaker
#             }
#             logging.debug("Segment: %s", chunk)
#             segments.append(chunk)
#         logging.info("audio-diarization: Diarization completed with pyannote")
#
#         output_data = {'segments': segments}
#
#         logging.info("audio-diarization: Saving prettified JSON to %s", prettified_out_file)
#         with open(prettified_out_file, 'w') as f:
#             json.dump(output_data, f, indent=2)
#
#         logging.info("audio-diarization: Saving JSON to %s", out_file)
#         with open(out_file, 'w') as f:
#             json.dump(output_data, f)
#
#     except Exception as e:
#         logging.error("audio-diarization: Error performing diarization: %s", str(e))
#         raise RuntimeError("audio-diarization: Error performing diarization")
#     return segments

def combine_transcription_and_diarization(audio_file_path: str) -> List[Dict[str, Any]]:
    logging.info('combine-transcription-and-diarization: Starting transcription and diarization...')

    try:
        logging.info('Performing speech-to-text...')
        transcription_result = speech_to_text(audio_file_path)
        logging.info(f"Transcription result type: {type(transcription_result)}")
        logging.info(f"Transcription result: {transcription_result[:3] if isinstance(transcription_result, list) and len(transcription_result) > 3 else transcription_result}")

        logging.info('Performing audio diarization...')
        diarization_result = audio_diarization(audio_file_path)
        logging.info(f"Diarization result type: {type(diarization_result)}")
        logging.info(f"Diarization result sample: {diarization_result[:3] if isinstance(diarization_result, list) and len(diarization_result) > 3 else diarization_result}")

        if not transcription_result:
            logging.error("Empty result from transcription")
            return []

        if not diarization_result:
            logging.error("Empty result from diarization")
            return []

        # Handle the case where transcription_result is a dict with a 'segments' key
        if isinstance(transcription_result, dict) and 'segments' in transcription_result:
            transcription_segments = transcription_result['segments']
        elif isinstance(transcription_result, list):
            transcription_segments = transcription_result
        else:
            logging.error(f"Unexpected transcription result format: {type(transcription_result)}")
            return []

        logging.info(f"Number of transcription segments: {len(transcription_segments)}")
        logging.info(f"Transcription segments sample: {transcription_segments[:3] if len(transcription_segments) > 3 else transcription_segments}")

        if not isinstance(diarization_result, list):
            logging.error(f"Unexpected diarization result format: {type(diarization_result)}")
            return []

        combined_result = []
        for transcription_segment in transcription_segments:
            if not isinstance(transcription_segment, dict):
                logging.warning(f"Unexpected transcription segment format: {transcription_segment}")
                continue

            for diarization_segment in diarization_result:
                if not isinstance(diarization_segment, dict):
                    logging.warning(f"Unexpected diarization segment format: {diarization_segment}")
                    continue

                try:
                    trans_start = transcription_segment.get('Time_Start', 0)
                    trans_end = transcription_segment.get('Time_End', 0)
                    diar_start = diarization_segment.get('start', 0)
                    diar_end = diarization_segment.get('end', 0)
|
248 |
+
|
249 |
+
if trans_start >= diar_start and trans_end <= diar_end:
|
250 |
+
combined_segment = {
|
251 |
+
"Time_Start": trans_start,
|
252 |
+
"Time_End": trans_end,
|
253 |
+
"Speaker": diarization_segment.get('speaker', 'Unknown'),
|
254 |
+
"Text": transcription_segment.get('Text', '')
|
255 |
+
}
|
256 |
+
combined_result.append(combined_segment)
|
257 |
+
break
|
258 |
+
except Exception as e:
|
259 |
+
logging.error(f"Error processing segment: {str(e)}")
|
260 |
+
logging.error(f"Transcription segment: {transcription_segment}")
|
261 |
+
logging.error(f"Diarization segment: {diarization_segment}")
|
262 |
+
continue
|
263 |
+
|
264 |
+
logging.info(f"Combined result length: {len(combined_result)}")
|
265 |
+
logging.info(f"Combined result sample: {combined_result[:3] if len(combined_result) > 3 else combined_result}")
|
266 |
+
return combined_result
|
267 |
+
|
268 |
+
except Exception as e:
|
269 |
+
logging.error(f"Error in combine_transcription_and_diarization: {str(e)}", exc_info=True)
|
270 |
+
return []
|
271 |
+
|
272 |
+
|
273 |
+
#
|
274 |
+
#
|
275 |
+
#######################################################################################################################
|
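
A minimal illustrative sketch, not part of the commit itself, of the interval-matching rule used by combine_transcription_and_diarization above. The key names (Time_Start/Time_End/Text and start/end/speaker) come from the diff; the segment values are made up:

# Toy inputs mirroring the shapes the combiner expects (hypothetical values).
transcription_segments = [
    {"Time_Start": 0.5, "Time_End": 2.0, "Text": "Hello there."},
    {"Time_Start": 2.2, "Time_End": 4.0, "Text": "General Kenobi."},
]
diarization_segments = [
    {"start": 0.0, "end": 2.1, "speaker": "SPEAKER_00"},
    {"start": 2.1, "end": 5.0, "speaker": "SPEAKER_01"},
]

combined = []
for t in transcription_segments:
    for d in diarization_segments:
        # A transcription segment is attributed to the first diarization turn
        # that fully contains it, then the inner loop stops (the `break` above).
        if t["Time_Start"] >= d["start"] and t["Time_End"] <= d["end"]:
            combined.append({
                "Time_Start": t["Time_Start"],
                "Time_End": t["Time_End"],
                "Speaker": d["speaker"],
                "Text": t["Text"],
            })
            break

print(combined)
# [{'Time_Start': 0.5, 'Time_End': 2.0, 'Speaker': 'SPEAKER_00', 'Text': 'Hello there.'},
#  {'Time_Start': 2.2, 'Time_End': 4.0, 'Speaker': 'SPEAKER_01', 'Text': 'General Kenobi.'}]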

App_Function_Libraries/Audio/__init__.py ADDED
File without changes

App_Function_Libraries/Chat.py CHANGED
@@ -1,6 +1,6 @@
 # Chat.py
 # Chat functions for interacting with the LLMs as chatbots
-
+import base64
 # Imports
 import json
 import logging
@@ -15,9 +15,9 @@ from pathlib import Path
 # Local Imports
 from App_Function_Libraries.DB.DB_Manager import get_conversation_name, save_chat_history_to_database
 from App_Function_Libraries.LLM_API_Calls import chat_with_openai, chat_with_anthropic, chat_with_cohere, \
-    chat_with_groq, chat_with_openrouter, chat_with_deepseek, chat_with_mistral, chat_with_huggingface#, chat_with_vllm
+    chat_with_groq, chat_with_openrouter, chat_with_deepseek, chat_with_mistral, chat_with_huggingface #, chat_with_vllm
 from App_Function_Libraries.LLM_API_Calls_Local import chat_with_aphrodite, chat_with_local_llm, chat_with_ollama, \
-    chat_with_kobold, chat_with_llama, chat_with_oobabooga, chat_with_tabbyapi
+    chat_with_kobold, chat_with_llama, chat_with_oobabooga, chat_with_tabbyapi, chat_with_vllm, chat_with_custom_openai
 from App_Function_Libraries.DB.SQLite_DB import load_media_content
 from App_Function_Libraries.Utils.Utils import generate_unique_filename
 #
@@ -54,8 +54,8 @@ def chat_api_call(api_endpoint, api_key, input_data, prompt, temp, system_messag
         response = chat_with_oobabooga(input_data, api_key, prompt, temp, system_message)
     elif api_endpoint.lower() == "tabbyapi":
         response = chat_with_tabbyapi(input_data, prompt, temp, system_message)
-
-
+    elif api_endpoint.lower() == "vllm":
+        response = chat_with_vllm(input_data, prompt, system_message)
     elif api_endpoint.lower() == "local-llm":
         response = chat_with_local_llm(input_data, prompt, temp, system_message)
     elif api_endpoint.lower() == "huggingface":
@@ -64,6 +64,8 @@ def chat_api_call(api_endpoint, api_key, input_data, prompt, temp, system_messag
         response = chat_with_ollama(input_data, prompt, temp, system_message)
     elif api_endpoint.lower() == "aphrodite":
         response = chat_with_aphrodite(input_data, prompt, temp, system_message)
+    elif api_endpoint.lower() == "custom-openai-api":
+        response = chat_with_custom_openai(api_key, input_data, prompt, temp, system_message)
     else:
         raise ValueError(f"Unsupported API endpoint: {api_endpoint}")
 
@@ -114,6 +116,8 @@ def chat(message, history, media_content, selected_parts, api_endpoint, api_key,
 
         # Use the existing API request code based on the selected endpoint
         response = chat_api_call(api_endpoint, api_key, input_data, prompt, temp, system_message)
+
+        return response
     except Exception as e:
         logging.error(f"Error in chat function: {str(e)}")
         return f"An error occurred: {str(e)}"
@@ -279,26 +283,60 @@ def update_chat_content(selected_item, use_content, use_summary, use_prompt, ite
         print(f"Debug - Update Chat Content - No item selected or item not in mapping")
         return {}, []
 
+#
+# End of Chat functions
+##########################################################################################################################
+
+
+##########################################################################################################################
+#
+# Character Card Functions
 
 CHARACTERS_FILE = Path('.', 'Helper_Scripts', 'Character_Cards', 'Characters.json')
 
+
 def save_character(character_data):
-
-
-        characters = json.load(f)
-    else:
-        characters = {}
-
-
+    characters_file = os.path.join(os.path.dirname(__file__), '..', 'Helper_Scripts', 'Character_Cards', 'Characters.json')
+    characters_dir = os.path.dirname(characters_file)
+
+    try:
+        if os.path.exists(characters_file):
+            with open(characters_file, 'r') as f:
+                characters = json.load(f)
+        else:
+            characters = {}
+
+        char_name = character_data['name']
+
+        # Save the image separately if it exists
+        if 'image' in character_data:
+            img_data = base64.b64decode(character_data['image'])
+            img_filename = f"{char_name.replace(' ', '_')}.png"
+            img_path = os.path.join(characters_dir, img_filename)
+            with open(img_path, 'wb') as f:
+                f.write(img_data)
+            character_data['image_path'] = os.path.abspath(img_path)
+            del character_data['image']  # Remove the base64 image data from the JSON
+
+        characters[char_name] = character_data
 
-    with CHARACTERS_FILE.open('w') as f:
-        json.dump(characters, f, indent=2)
+        with open(characters_file, 'w') as f:
+            json.dump(characters, f, indent=2)
+
+        logging.info(f"Character '{char_name}' saved successfully.")
+    except Exception as e:
+        logging.error(f"Error saving character: {str(e)}")
 
 
 def load_characters():
-
-
-
+    characters_file = os.path.join(os.path.dirname(__file__), '..', 'Helper_Scripts', 'Character_Cards', 'Characters.json')
+    if os.path.exists(characters_file):
+        with open(characters_file, 'r') as f:
+            characters = json.load(f)
+        logging.debug(f"Loaded {len(characters)} characters from {characters_file}")
+        return characters
+    logging.warning(f"Characters file not found: {characters_file}")
     return {}
 
 
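A hypothetical round-trip with the save_character/load_characters helpers added above; the PNG bytes and card fields below are placeholder data, not from the commit:

import base64

fake_png_bytes = b"\x89PNG\r\n\x1a\n"  # not a real image, just a stand-in
card = {
    "name": "Example Character",
    "description": "A placeholder card used to illustrate the storage format.",
    "image": base64.b64encode(fake_png_bytes).decode("utf-8"),
}

save_character(card)            # writes Characters.json and Example_Character.png next to it
characters = load_characters()  # returns a dict keyed by character name
print(characters["Example Character"]["image_path"])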

App_Function_Libraries/Chunk_Lib.py CHANGED
@@ -7,6 +7,7 @@
 ####
 # Import necessary libraries
 import hashlib
+import json
 import logging
 import re
 from typing import Any, Dict, List, Optional, Tuple
@@ -72,42 +73,53 @@ def load_document(file_path):
 
 def improved_chunking_process(text: str, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
     logging.debug("Improved chunking process started...")
+
+    # Extract JSON metadata if present
+    json_content = {}
+    try:
+        json_end = text.index("}\n") + 1
+        json_content = json.loads(text[:json_end])
+        text = text[json_end:].strip()
+        logging.debug(f"Extracted JSON metadata: {json_content}")
+    except (ValueError, json.JSONDecodeError):
+        logging.debug("No JSON metadata found at the beginning of the text")
+
+    # Extract any additional header text
+    header_match = re.match(r"(This text was transcribed using.*?)\n\n", text, re.DOTALL)
+    header_text = ""
+    if header_match:
+        header_text = header_match.group(1)
+        text = text[len(header_text):].strip()
+        logging.debug(f"Extracted header text: {header_text}")
+
     options = chunk_options.copy()
     if custom_chunk_options:
         options.update(custom_chunk_options)
 
     chunk_method = options.get('method', 'words')
-    base_size = options.get('base_size', 1000)
-    min_size = options.get('min_size', 100)
     max_size = options.get('max_size', 2000)
     overlap = options.get('overlap', 0)
     language = options.get('language', None)
-    adaptive = options.get('adaptive', False)
-    multi_level = options.get('multi_level', False)
 
     if language is None:
         language = detect_language(text)
 
-
-        max_chunk_size = adaptive_chunk_size(text, base_size, min_size, max_size)
-    else:
-        max_chunk_size = base_size
-
-    if multi_level:
-        chunks = multi_level_chunking(text, chunk_method, max_chunk_size, overlap, language)
-    else:
-        chunks = chunk_text(text, chunk_method, max_chunk_size, overlap, language)
+    chunks = chunk_text(text, chunk_method, max_size, overlap, language)
 
     chunks_with_metadata = []
+    total_chunks = len(chunks)
     for i, chunk in enumerate(chunks):
-        metadata =
-
-
-
-
-
-
-
+        metadata = {
+            'chunk_index': i,
+            'total_chunks': total_chunks,
+            'chunk_method': chunk_method,
+            'max_size': max_size,
+            'overlap': overlap,
+            'language': language,
+            'relative_position': i / total_chunks
+        }
+        metadata.update(json_content)  # Add the extracted JSON content to metadata
+        metadata['header_text'] = header_text  # Add the header text to metadata
 
         chunks_with_metadata.append({
             'text': chunk,
@@ -117,6 +129,7 @@ def improved_chunking_process(text: str, custom_chunk_options: Dict[str, Any] =
     return chunks_with_metadata
 
 
+
 def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
     logging.debug("Multi-level chunking process started...")
     # First level: chunk by paragraphs
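A standalone sketch of the metadata-stripping step added above, replicating just the JSON and header extraction on a made-up transcript-style input (it does not call improved_chunking_process itself):

import json, re

text = '{"title": "Demo", "whisper_model": "small"}\n' \
       'This text was transcribed using the small model.\n\n' \
       'Actual transcript body starts here...'

json_content = {}
try:
    json_end = text.index("}\n") + 1          # end of the one-line JSON header
    json_content = json.loads(text[:json_end])
    text = text[json_end:].strip()
except (ValueError, json.JSONDecodeError):
    pass

header_match = re.match(r"(This text was transcribed using.*?)\n\n", text, re.DOTALL)
header_text = header_match.group(1) if header_match else ""
if header_match:
    text = text[len(header_text):].strip()

print(json_content)   # {'title': 'Demo', 'whisper_model': 'small'}
print(header_text)    # This text was transcribed using the small model.
print(text)           # Actual transcript body starts here...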

App_Function_Libraries/Gradio_Related.py CHANGED
@@ -16,6 +16,8 @@ import gradio as gr
 # Local Imports
 from App_Function_Libraries.DB.DB_Manager import get_db_config
 from App_Function_Libraries.Gradio_UI.Audio_ingestion_tab import create_audio_processing_tab
+from App_Function_Libraries.Gradio_UI.Character_Interaction_tab import create_character_card_interaction_tab, \
+    create_multiple_character_chat_tab, create_narrator_controlled_conversation_tab
 from App_Function_Libraries.Gradio_UI.Chat_ui import create_chat_management_tab, \
     create_chat_interface_four, create_chat_interface_multi_api, create_chat_interface_stacked, create_chat_interface
 from App_Function_Libraries.Gradio_UI.Config_tab import create_config_editor_tab
@@ -39,8 +41,9 @@ from App_Function_Libraries.Gradio_UI.RAG_QA_Chat_tab import create_rag_qa_chat_
 from App_Function_Libraries.Gradio_UI.Re_summarize_tab import create_resummary_tab
 from App_Function_Libraries.Gradio_UI.Search_Tab import create_prompt_view_tab, create_prompt_search_tab, \
     create_search_summaries_tab, create_viewing_tab, create_search_tab
-from App_Function_Libraries.Gradio_UI.RAG_Chat_tab import
-
+from App_Function_Libraries.Gradio_UI.RAG_Chat_tab import create_rag_tab
+from App_Function_Libraries.Gradio_UI.Embeddings_tab import create_embeddings_tab, create_view_embeddings_tab, \
+    create_purge_embeddings_tab
 from App_Function_Libraries.Gradio_UI.Trash import create_view_trash_tab, create_empty_trash_tab, \
     create_delete_trash_tab, create_search_and_mark_trash_tab
 from App_Function_Libraries.Gradio_UI.Utilities import create_utilities_yt_timestamp_tab, create_utilities_yt_audio_tab, \
@@ -260,11 +263,9 @@ def launch_ui(share_public=None, server_mode=False):
             create_search_tab()
             create_search_summaries_tab()
 
-        with gr.TabItem("RAG Search
+        with gr.TabItem("RAG Search"):
             create_rag_tab()
             create_rag_qa_chat_tab()
-            create_embeddings_tab()
-            create_view_embeddings_tab()
 
         with gr.TabItem("Chat with an LLM"):
             create_chat_interface()
@@ -274,9 +275,12 @@ def launch_ui(share_public=None, server_mode=False):
             create_chat_with_llamafile_tab()
             create_chat_management_tab()
             chat_workflows_tab()
-
+            create_multiple_character_chat_tab()
+            create_narrator_controlled_conversation_tab()
             create_character_card_interaction_tab()
 
+
+
         with gr.TabItem("View DB Items"):
             create_viewing_tab()
             create_prompt_view_tab()
@@ -295,6 +299,11 @@ def launch_ui(share_public=None, server_mode=False):
             # FIXME
             #create_compare_transcripts_tab()
 
+        with gr.TabItem("Embeddings Management"):
+            create_embeddings_tab()
+            create_view_embeddings_tab()
+            create_purge_embeddings_tab()
+
         with gr.TabItem("Writing Tools"):
             with gr.Tabs():
                 from App_Function_Libraries.Gradio_UI.Writing_tab import create_document_feedback_tab
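A minimal sketch, separate from the commit, of the tab-nesting pattern launch_ui() uses; the two builder functions below are placeholders standing in for the real create_*_tab() helpers:

import gradio as gr

def create_embeddings_tab():
    gr.Markdown("Create embeddings here")

def create_view_embeddings_tab():
    gr.Markdown("View embeddings here")

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Embeddings Management"):
            # Each helper simply declares its own components inside the open TabItem.
            create_embeddings_tab()
            create_view_embeddings_tab()

if __name__ == "__main__":
    demo.launch()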

App_Function_Libraries/Local_File_Processing_Lib.py CHANGED
@@ -20,10 +20,10 @@
 
 # Import necessary libraries
 # Import Local
-from App_Function_Libraries.Audio_Transcription_Lib import convert_to_wav
+from App_Function_Libraries.Audio.Audio_Transcription_Lib import convert_to_wav
 from App_Function_Libraries.Video_DL_Ingestion_Lib import *
 from App_Function_Libraries.Video_DL_Ingestion_Lib import get_youtube
-from App_Function_Libraries.Utils import normalize_title, create_download_directory
+from App_Function_Libraries.Utils.Utils import normalize_title, create_download_directory
 
 #######################################################################################################################
 # Function Definitions

App_Function_Libraries/Local_LLM_Inference_Engine_Lib.py CHANGED
@@ -27,11 +27,11 @@ import subprocess
 import sys
 import time
 
-from App_Function_Libraries.Utils import download_file
+from App_Function_Libraries.Utils.Utils import download_file
 # Import 3rd-pary Libraries
 #
 # Import Local
-from Article_Summarization_Lib import *
+from App_Function_Libraries.Web_Scraping.Article_Summarization_Lib import *
 
 #
 #

App_Function_Libraries/MediaWiki/Media_Wiki.py CHANGED
@@ -7,6 +7,7 @@ import json
 import logging
 import os
 import re
+import traceback
 from typing import List, Dict, Any, Iterator, Optional
 # 3rd-Party Imports
 import mwparserfromhell
@@ -14,12 +15,19 @@ import mwxml
 import yaml
 #
 # Local Imports
-from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
+from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
 from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content
 #
 #######################################################################################################################
 #
 # Functions:
+# Load configuration
+def load_mediawiki_import_config():
+    with open(os.path.join('Config_Files', 'mediawiki_import_config.yaml'), 'r') as f:
+        return yaml.safe_load(f)
+
+config = load_mediawiki_import_config()
+
 
 def setup_logger(name: str, level: int = logging.INFO, log_file: Optional[str] = None) -> logging.Logger:
     """Set up and return a logger with the given name and level."""
@@ -41,11 +49,11 @@ def setup_logger(name: str, level: int = logging.INFO, log_file: Optional[str] =
 # Usage
 logger = setup_logger('mediawiki_import', log_file='mediawiki_import.log')
 
-#
-
-
-
-
+# End of setup
+#######################################################################################################################
+#
+# Functions:
+
 
 def parse_mediawiki_dump(file_path: str, namespaces: List[int] = None, skip_redirects: bool = False) -> Iterator[
     Dict[str, Any]]:
@@ -57,11 +65,11 @@ def parse_mediawiki_dump(file_path: str, namespaces: List[int] = None, skip_redi
             continue
 
         for revision in page:
-
-
+            wikicode = mwparserfromhell.parse(revision.text)
+            plain_text = wikicode.strip_code()
             yield {
                 "title": page.title,
-                "content":
+                "content": plain_text,
                 "namespace": page.namespace,
                 "page_id": page.id,
                 "revision_id": revision.id,
@@ -76,6 +84,7 @@ def optimized_chunking(text: str, chunk_options: Dict[str, Any]) -> List[Dict[st
     current_chunk = ""
    current_size = 0
 
+    logging.debug(f"optimized_chunking: Processing text with {len(sections) // 2} sections")
     for i in range(0, len(sections), 2):
         section_title = sections[i] if i > 0 else "Introduction"
         section_content = sections[i + 1] if i + 1 < len(sections) else ""
@@ -95,33 +104,54 @@ def optimized_chunking(text: str, chunk_options: Dict[str, Any]) -> List[Dict[st
     return chunks
 
 
+
+
+
 def process_single_item(content: str, title: str, wiki_name: str, chunk_options: Dict[str, Any],
-                        is_combined: bool = False, item: Dict[str, Any] = None):
+                        is_combined: bool = False, item: Dict[str, Any] = None, api_name: str = None):
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        logging.debug(f"process_single_item: Processing item: {title}")
+
+        # Create a unique URL using the wiki name and article title
+        encoded_title = title.replace(" ", "_")
+        url = f"mediawiki:{wiki_name}:{encoded_title}"
+        logging.debug(f"Generated URL: {url}")
+
+        result = add_media_with_keywords(
+            url=url,  # Use the generated URL here
+            title=title,
+            media_type="mediawiki_dump" if is_combined else "mediawiki_article",
+            content=content,
+            keywords=f"mediawiki,{wiki_name}" + (",full_dump" if is_combined else ",article"),
+            prompt="",
+            summary="",
+            transcription_model="",
+            author="MediaWiki",
+            ingestion_date=item['timestamp'].strftime('%Y-%m-%d') if item else None
+        )
+        logging.debug(f"Result from add_media_with_keywords: {result}")
+
+        # Unpack the result
+        media_id, message = result
+        logging.info(f"Media item result: {message}")
+        logging.debug(f"Final media_id: {media_id}")
+
+        chunks = optimized_chunking(content, chunk_options)
+        for i, chunk in enumerate(chunks):
+            logging.debug(f"Processing chunk {i + 1}/{len(chunks)} for item: {title}")
+
+            # FIXME
+            # def process_and_store_content(content: str, collection_name: str, media_id: int, file_name: str,
+            #                               create_embeddings: bool = False, create_summary: bool = False,
+            #                               api_name: str = None):
+            if api_name:
+                process_and_store_content(chunk['text'], f"mediawiki_{wiki_name}", media_id, title, True, True, api_name)
+            else:
                 process_and_store_content(chunk['text'], f"mediawiki_{wiki_name}", media_id, title)
-
-    else:
-        logger.info(f"Skipping existing article: {title}")
+        logging.info(f"Successfully processed item: {title}")
     except Exception as e:
-
+        logging.error(f"Error processing item {title}: {str(e)}")
+        logging.error(f"Exception details: {traceback.format_exc()}")
 
 
 def load_checkpoint(file_path: str) -> int:
@@ -143,9 +173,12 @@ def import_mediawiki_dump(
         skip_redirects: bool = False,
         chunk_options: Dict[str, Any] = None,
         single_item: bool = False,
-        progress_callback: Any = None
+        progress_callback: Any = None,
+        api_name: str = None,
+        api_key: str = None
 ) -> Iterator[str]:
     try:
+        logging.info(f"Importing MediaWiki dump: {file_path}")
         if chunk_options is None:
             chunk_options = config['chunking']
 
@@ -160,6 +193,10 @@ def import_mediawiki_dump(
         for item in parse_mediawiki_dump(file_path, namespaces, skip_redirects):
             if item['page_id'] <= last_processed_id:
                 continue
+            # FIXME - ensure this works...
+            if api_name is not None:
+                # FIXME - add API key to the call/params
+                process_single_item(item['content'], item['title'], wiki_name, chunk_options, False, item, api_name)
             process_single_item(item['content'], item['title'], wiki_name, chunk_options, False, item)
             save_checkpoint(checkpoint_file, item['page_id'])
             processed_pages += 1
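A small sketch of the synthetic URL scheme used by process_single_item above; the wiki name and article title are made-up example values:

wiki_name = "examplewiki"
title = "Main Page"

# Spaces are replaced with underscores and the result is namespaced under "mediawiki:".
encoded_title = title.replace(" ", "_")
url = f"mediawiki:{wiki_name}:{encoded_title}"
print(url)  # mediawiki:examplewiki:Main_Page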

App_Function_Libraries/PDF/PDF_Ingestion_Lib.py ADDED
@@ -0,0 +1,318 @@
# PDF_Ingestion_Lib.py
#########################################
# Library to hold functions for ingesting PDF files.#
#
####################
# Function List
#
# 1. convert_pdf_to_markdown(pdf_path)
# 2. ingest_pdf_file(file_path, title=None, author=None, keywords=None):
# 3.
#
#
####################
import re

# Import necessary libraries


# Import Local

#######################################################################################################################
# Function Definitions
#

# Ingest a text file into the database with Title/Author/Keywords


# Constants
MAX_FILE_SIZE_MB = 50
CONVERSION_TIMEOUT_SECONDS = 300

# Marker PDF solution
# def convert_pdf_to_markdown(pdf_path):
#     """
#     Convert a PDF file to Markdown by calling a script in another virtual environment.
#     """
#
#     logging.debug(f"Marker: Converting PDF file to Markdown: {pdf_path}")
#     # Check if the file size exceeds the maximum allowed size
#     file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
#     if file_size_mb > MAX_FILE_SIZE_MB:
#         raise ValueError(f"File size ({file_size_mb:.2f} MB) exceeds the maximum allowed size of {MAX_FILE_SIZE_MB} MB")
#
#     logging.debug("Marker: Converting PDF file to Markdown using Marker virtual environment")
#     # Path to the Python interpreter in the other virtual environment
#     other_venv_python = "Helper_Scripts/marker_venv/bin/python"
#
#     # Path to the conversion script
#     converter_script = "Helper_Scripts/PDF_Converter.py"
#
#     logging.debug("Marker: Attempting to convert PDF file to Markdown...")
#     try:
#         result = subprocess.run(
#             [other_venv_python, converter_script, pdf_path],
#             capture_output=True,
#             text=True,
#             timeout=CONVERSION_TIMEOUT_SECONDS
#         )
#         if result.returncode != 0:
#             raise Exception(f"Conversion failed: {result.stderr}")
#         return result.stdout
#     except subprocess.TimeoutExpired:
#         raise Exception(f"PDF conversion timed out after {CONVERSION_TIMEOUT_SECONDS} seconds")
#
#
# def process_and_ingest_pdf(file, title, author, keywords):
#     if file is None:
#         return "Please select a PDF file to upload."
#
#     try:
#         # Create a temporary directory
#         with tempfile.TemporaryDirectory() as temp_dir:
#             # Create a path for the temporary PDF file
#             temp_path = os.path.join(temp_dir, "temp.pdf")
#
#             # Copy the contents of the uploaded file to the temporary file
#             shutil.copy(file.name, temp_path)
#
#             # Call the ingest_pdf_file function with the temporary file path
#             result = ingest_pdf_file(temp_path, title, author, keywords)
#
#             return result
#     except Exception as e:
#         return f"Error processing PDF: {str(e)}"
#
#
# def ingest_pdf_file(file_path, title=None, author=None, keywords=None):
#     try:
#         # Convert PDF to Markdown
#         markdown_content = convert_pdf_to_markdown(file_path)
#
#         # If title is not provided, use the filename without extension
#         if not title:
#             title = os.path.splitext(os.path.basename(file_path))[0]
#
#         # If author is not provided, set it to 'Unknown'
#         if not author:
#             author = 'Unknown'
#
#         # If keywords are not provided, use a default keyword
#         if not keywords:
#             keywords = 'pdf_file,markdown_converted'
#         else:
#             keywords = f'pdf_file,markdown_converted,{keywords}'
#
#         # Add the markdown content to the database
#         add_media_with_keywords(
#             url=file_path,
#             title=title,
#             media_type='document',
#             content=markdown_content,
#             keywords=keywords,
#             prompt='No prompt for PDF files',
#             summary='No summary for PDF files',
#             transcription_model='None',
#             author=author,
#             ingestion_date=datetime.now().strftime('%Y-%m-%d')
#         )
#
#         return f"PDF file '{title}' converted to Markdown and ingested successfully.", file_path
#     except ValueError as e:
#         logging.error(f"File size error: {str(e)}")
#         return f"Error: {str(e)}", file_path
#     except Exception as e:
#         logging.error(f"Error ingesting PDF file: {str(e)}")
#         return f"Error ingesting PDF file: {str(e)}", file_path
#
#
# def process_and_cleanup_pdf(file, title, author, keywords):
#     # FIXME - Update to validate file upload/filetype is pdf....
#     if file is None:
#         return "No file uploaded. Please upload a PDF file."
#
#     temp_dir = tempfile.mkdtemp()
#     temp_file_path = os.path.join(temp_dir, "temp.pdf")
#
#     try:
#         # Copy the uploaded file to a temporary location
#         shutil.copy2(file.name, temp_file_path)
#
#         # Process the file
#         result, _ = ingest_pdf_file(temp_file_path, title, author, keywords)
#
#         return result
#     except Exception as e:
#         logging.error(f"Error in processing and cleanup: {str(e)}")
#         return f"Error: {str(e)}"
#     finally:
#         # Clean up the temporary directory and its contents
#         try:
#             shutil.rmtree(temp_dir)
#             logging.info(f"Removed temporary directory: {temp_dir}")
#         except Exception as cleanup_error:
#             logging.error(f"Error during cleanup: {str(cleanup_error)}")
#             result += f"\nWarning: Could not remove temporary files: {str(cleanup_error)}"


import logging
#
#
#######################################################################################################################
#
# Non-Marker implementation
import os
import shutil
import tempfile
from datetime import datetime

import pymupdf

from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords


def extract_text_and_format_from_pdf(pdf_path):
    """
    Extract text from a PDF file and convert it to Markdown, preserving formatting.
    """
    try:
        markdown_text = ""
        with pymupdf.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, 1):
                markdown_text += f"## Page {page_num}\n\n"
                blocks = page.get_text("dict")["blocks"]
                current_paragraph = ""
                for block in blocks:
                    if block["type"] == 0:  # Text block
                        for line in block["lines"]:
                            line_text = ""
                            for span in line["spans"]:
                                text = span["text"]
                                font_size = span["size"]
                                font_flags = span["flags"]

                                # Apply formatting based on font size and flags
                                if font_size > 20:
                                    text = f"# {text}"
                                elif font_size > 16:
                                    text = f"## {text}"
                                elif font_size > 14:
                                    text = f"### {text}"

                                if font_flags & 2 ** 0:  # Bold
                                    text = f"**{text}**"
                                if font_flags & 2 ** 1:  # Italic
                                    text = f"*{text}*"

                                line_text += text + " "

                            # Remove hyphens at the end of lines
                            line_text = line_text.rstrip()
                            if line_text.endswith('-'):
                                line_text = line_text[:-1]
                            else:
                                line_text += " "

                            current_paragraph += line_text

                        # End of block, add paragraph
                        if current_paragraph:
                            # Remove extra spaces
                            current_paragraph = re.sub(r'\s+', ' ', current_paragraph).strip()
                            markdown_text += current_paragraph + "\n\n"
                            current_paragraph = ""
                    elif block["type"] == 1:  # Image block
                        markdown_text += "[Image]\n\n"
                markdown_text += "\n---\n\n"  # Page separator

        # Clean up hyphenated words
        markdown_text = re.sub(r'(\w+)-\s*\n(\w+)', r'\1\2', markdown_text)

        return markdown_text
    except Exception as e:
        logging.error(f"Error extracting text and formatting from PDF: {str(e)}")
        raise


def extract_metadata_from_pdf(pdf_path):
    """
    Extract metadata from a PDF file using PyMuPDF.
    """
    try:
        with pymupdf.open(pdf_path) as doc:
            metadata = doc.metadata
            return metadata
    except Exception as e:
        logging.error(f"Error extracting metadata from PDF: {str(e)}")
        return {}


def process_and_ingest_pdf(file, title, author, keywords):
    if file is None:
        return "Please select a PDF file to upload."

    try:
        # Create a temporary directory
        with tempfile.TemporaryDirectory() as temp_dir:
            # Create a path for the temporary PDF file
            temp_path = os.path.join(temp_dir, "temp.pdf")

            # Copy the contents of the uploaded file to the temporary file
            shutil.copy(file.name, temp_path)

            # Extract text and convert to Markdown
            markdown_text = extract_text_and_format_from_pdf(temp_path)

            # Extract metadata from PDF
            metadata = extract_metadata_from_pdf(temp_path)

            # Use metadata for title and author if not provided
            if not title:
                title = metadata.get('title', os.path.splitext(os.path.basename(file.name))[0])
            if not author:
                author = metadata.get('author', 'Unknown')

            # If keywords are not provided, use a default keyword
            if not keywords:
                keywords = 'pdf_file,markdown_converted'
            else:
                keywords = f'pdf_file,markdown_converted,{keywords}'

            # Add metadata-based keywords
            if 'subject' in metadata:
                keywords += f",{metadata['subject']}"

            # Add the PDF content to the database
            add_media_with_keywords(
                url=file.name,
                title=title,
                media_type='document',
                content=markdown_text,
                keywords=keywords,
                prompt='No prompt for PDF files',
                summary='No summary for PDF files',
                transcription_model='None',
                author=author,
                ingestion_date=datetime.now().strftime('%Y-%m-%d')
            )

        return f"PDF file '{title}' by {author} ingested successfully and converted to Markdown."
    except Exception as e:
        logging.error(f"Error ingesting PDF file: {str(e)}")
        return f"Error ingesting PDF file: {str(e)}"


def process_and_cleanup_pdf(file, title, author, keywords):
    if file is None:
        return "No file uploaded. Please upload a PDF file."

    try:
        result = process_and_ingest_pdf(file, title, author, keywords)
        return result
    except Exception as e:
        logging.error(f"Error in processing and cleanup: {str(e)}")
        return f"Error: {str(e)}"

#
# End of PDF_Ingestion_Lib.py
#######################################################################################################################

App_Function_Libraries/PDF/__init__.py ADDED
File without changes

App_Function_Libraries/Summarization/Summarization_General_Lib.py CHANGED
@@ -25,10 +25,10 @@ from typing import Optional
 import requests
 from requests import RequestException
 
-from App_Function_Libraries.Audio_Transcription_Lib import convert_to_wav, speech_to_text
+from App_Function_Libraries.Audio.Audio_Transcription_Lib import convert_to_wav, speech_to_text
 from App_Function_Libraries.Chunk_Lib import semantic_chunking, rolling_summarize, recursive_summarize_chunks, \
     improved_chunking_process
-from App_Function_Libraries.Diarization_Lib import combine_transcription_and_diarization
+from App_Function_Libraries.Audio.Diarization_Lib import combine_transcription_and_diarization
 from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_llama, summarize_with_kobold, \
     summarize_with_oobabooga, summarize_with_tabbyapi, summarize_with_vllm, summarize_with_local_llm
 from App_Function_Libraries.DB.DB_Manager import add_media_to_database
App_Function_Libraries/Video_DL_Ingestion_Lib.py
CHANGED
@@ -1,331 +1,332 @@
|
|
1 |
-
# Video_DL_Ingestion_Lib.py
|
2 |
-
#########################################
|
3 |
-
# Video Downloader and Ingestion Library
|
4 |
-
# This library is used to handle downloading videos from YouTube and other platforms.
|
5 |
-
# It also handles the ingestion of the videos into the database.
|
6 |
-
# It uses yt-dlp to extract video information and download the videos.
|
7 |
-
####
|
8 |
-
import json
|
9 |
-
####################
|
10 |
-
# Function List
|
11 |
-
#
|
12 |
-
# 1. get_video_info(url)
|
13 |
-
# 2. create_download_directory(title)
|
14 |
-
# 3. sanitize_filename(title)
|
15 |
-
# 4. normalize_title(title)
|
16 |
-
# 5. get_youtube(video_url)
|
17 |
-
# 6. get_playlist_videos(playlist_url)
|
18 |
-
# 7. download_video(video_url, download_path, info_dict, download_video_flag)
|
19 |
-
# 8. save_to_file(video_urls, filename)
|
20 |
-
# 9. save_summary_to_file(summary, file_path)
|
21 |
-
# 10. process_url(url, num_speakers, whisper_model, custom_prompt, offset, api_name, api_key, vad_filter, download_video, download_audio, rolling_summarization, detail_level, question_box, keywords, chunk_summarization, chunk_duration_input, words_per_second_input)
|
22 |
-
#
|
23 |
-
#
|
24 |
-
####################
|
25 |
-
# Import necessary libraries to run solo for testing
|
26 |
-
import logging
|
27 |
-
import os
|
28 |
-
import re
|
29 |
-
import sys
|
30 |
-
from urllib.parse import urlparse, parse_qs
|
31 |
-
|
32 |
-
import unicodedata
|
33 |
-
# 3rd-Party Imports
|
34 |
-
import yt_dlp
|
35 |
-
|
36 |
-
from App_Function_Libraries.DB.DB_Manager import check_media_and_whisper_model
|
37 |
-
|
38 |
-
|
39 |
-
# Import Local
|
40 |
-
#
|
41 |
-
#######################################################################################################################
|
42 |
-
# Function Definitions
|
43 |
-
#
|
44 |
-
|
45 |
-
def normalize_title(title):
|
46 |
-
# Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
|
47 |
-
title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
|
48 |
-
title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '').replace('*', '').replace('?',
|
49 |
-
'').replace(
|
50 |
-
'<', '').replace('>', '').replace('|', '')
|
51 |
-
return title
|
52 |
-
|
53 |
-
def get_video_info(url: str) -> dict:
|
54 |
-
ydl_opts = {
|
55 |
-
'quiet': True,
|
56 |
-
'no_warnings': True,
|
57 |
-
'skip_download': True,
|
58 |
-
}
|
59 |
-
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
60 |
-
try:
|
61 |
-
info_dict = ydl.extract_info(url, download=False)
|
62 |
-
return info_dict
|
63 |
-
except Exception as e:
|
64 |
-
logging.error(f"Error extracting video info: {e}")
|
65 |
-
return None
|
66 |
-
|
67 |
-
|
68 |
-
def get_youtube(video_url):
|
69 |
-
ydl_opts = {
|
70 |
-
'format': 'bestaudio[ext=m4a]',
|
71 |
-
'noplaylist': False,
|
72 |
-
'quiet': True,
|
73 |
-
'extract_flat': True
|
74 |
-
}
|
75 |
-
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
76 |
-
logging.debug("About to extract youtube info")
|
77 |
-
info_dict = ydl.extract_info(video_url, download=False)
|
78 |
-
logging.debug("Youtube info successfully extracted")
|
79 |
-
return info_dict
|
80 |
-
|
81 |
-
|
82 |
-
def get_playlist_videos(playlist_url):
|
83 |
-
ydl_opts = {
|
84 |
-
'extract_flat': True,
|
85 |
-
'skip_download': True,
|
86 |
-
'quiet': True
|
87 |
-
}
|
88 |
-
|
89 |
-
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
90 |
-
info = ydl.extract_info(playlist_url, download=False)
|
91 |
-
|
92 |
-
if 'entries' in info:
|
93 |
-
video_urls = [entry['url'] for entry in info['entries']]
|
94 |
-
playlist_title = info['title']
|
95 |
-
return video_urls, playlist_title
|
96 |
-
else:
|
97 |
-
print("No videos found in the playlist.")
|
98 |
-
return [], None
|
99 |
-
|
100 |
-
|
101 |
-
def download_video(video_url, download_path, info_dict, download_video_flag, current_whisper_model):
|
102 |
-
global video_file_path, ffmpeg_path
|
103 |
-
global audio_file_path
|
104 |
-
|
105 |
-
# Normalize Video Title name
|
106 |
-
logging.debug("About to normalize downloaded video title")
|
107 |
-
if 'title' not in info_dict or 'ext' not in info_dict:
|
108 |
-
logging.error("info_dict is missing 'title' or 'ext'")
|
109 |
-
return None
|
110 |
-
|
111 |
-
normalized_video_title = normalize_title(info_dict['title'])
|
112 |
-
|
113 |
-
#
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
'
|
146 |
-
'
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
'
|
168 |
-
'
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
'
|
199 |
-
'
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
'
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
# expanded_urls
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
'
|
271 |
-
'
|
272 |
-
'
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
'
|
288 |
-
'
|
289 |
-
'
|
290 |
-
'
|
291 |
-
'
|
292 |
-
'
|
293 |
-
'
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
'
|
300 |
-
'
|
301 |
-
'
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
#
|
331 |
-
|
|
|
|
1 |
+
# Video_DL_Ingestion_Lib.py
|
2 |
+
#########################################
|
3 |
+
# Video Downloader and Ingestion Library
|
4 |
+
# This library is used to handle downloading videos from YouTube and other platforms.
|
5 |
+
# It also handles the ingestion of the videos into the database.
|
6 |
+
# It uses yt-dlp to extract video information and download the videos.
|
7 |
+
####
|
8 |
+
import json
|
9 |
+
####################
|
10 |
+
# Function List
|
11 |
+
#
|
12 |
+
# 1. get_video_info(url)
|
13 |
+
# 2. create_download_directory(title)
|
14 |
+
# 3. sanitize_filename(title)
|
15 |
+
# 4. normalize_title(title)
|
16 |
+
# 5. get_youtube(video_url)
|
17 |
+
# 6. get_playlist_videos(playlist_url)
|
18 |
+
# 7. download_video(video_url, download_path, info_dict, download_video_flag)
|
19 |
+
# 8. save_to_file(video_urls, filename)
|
20 |
+
# 9. save_summary_to_file(summary, file_path)
|
21 |
+
# 10. process_url(url, num_speakers, whisper_model, custom_prompt, offset, api_name, api_key, vad_filter, download_video, download_audio, rolling_summarization, detail_level, question_box, keywords, chunk_summarization, chunk_duration_input, words_per_second_input)
|
22 |
+
#
|
23 |
+
#
|
24 |
+
####################
|
25 |
+
# Import necessary libraries to run solo for testing
|
26 |
+
import logging
|
27 |
+
import os
|
28 |
+
import re
|
29 |
+
import sys
|
30 |
+
from urllib.parse import urlparse, parse_qs
|
31 |
+
|
32 |
+
import unicodedata
|
33 |
+
# 3rd-Party Imports
|
34 |
+
import yt_dlp
|
35 |
+
|
36 |
+
from App_Function_Libraries.DB.DB_Manager import check_media_and_whisper_model
|
37 |
+
|
38 |
+
|
39 |
+
# Import Local
|
40 |
+
#
|
41 |
+
#######################################################################################################################
|
42 |
+
# Function Definitions
|
43 |
+
#
|
44 |
+
|
45 |
+
def normalize_title(title):
|
46 |
+
# Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
|
47 |
+
title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
|
48 |
+
title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '').replace('*', '').replace('?',
|
49 |
+
'').replace(
|
50 |
+
'<', '').replace('>', '').replace('|', '')
|
51 |
+
return title
|
52 |
+
|
53 |
+
def get_video_info(url: str) -> dict:
|
54 |
+
ydl_opts = {
|
55 |
+
'quiet': True,
|
56 |
+
'no_warnings': True,
|
57 |
+
'skip_download': True,
|
58 |
+
}
|
59 |
+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
60 |
+
try:
|
61 |
+
info_dict = ydl.extract_info(url, download=False)
|
62 |
+
return info_dict
|
63 |
+
except Exception as e:
|
64 |
+
logging.error(f"Error extracting video info: {e}")
|
65 |
+
return None
|
66 |
+
|
67 |
+
|
68 |
+
def get_youtube(video_url):
|
69 |
+
ydl_opts = {
|
70 |
+
'format': 'bestaudio[ext=m4a]',
|
71 |
+
'noplaylist': False,
|
72 |
+
'quiet': True,
|
73 |
+
'extract_flat': True
|
74 |
+
}
|
75 |
+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
76 |
+
logging.debug("About to extract youtube info")
|
77 |
+
info_dict = ydl.extract_info(video_url, download=False)
|
78 |
+
logging.debug("Youtube info successfully extracted")
|
79 |
+
return info_dict
|
80 |
+
|
81 |
+
|
82 |
+
def get_playlist_videos(playlist_url):
|
83 |
+
ydl_opts = {
|
84 |
+
'extract_flat': True,
|
85 |
+
'skip_download': True,
|
86 |
+
'quiet': True
|
87 |
+
}
|
88 |
+
|
89 |
+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
90 |
+
info = ydl.extract_info(playlist_url, download=False)
|
91 |
+
|
92 |
+
if 'entries' in info:
|
93 |
+
video_urls = [entry['url'] for entry in info['entries']]
|
94 |
+
playlist_title = info['title']
|
95 |
+
return video_urls, playlist_title
|
96 |
+
else:
|
97 |
+
print("No videos found in the playlist.")
|
98 |
+
return [], None
|
99 |
+
|
100 |
+
|
101 |
+
def download_video(video_url, download_path, info_dict, download_video_flag, current_whisper_model):
|
102 |
+
global video_file_path, ffmpeg_path
|
103 |
+
global audio_file_path
|
104 |
+
|
105 |
+
# Normalize Video Title name
|
106 |
+
logging.debug("About to normalize downloaded video title")
|
107 |
+
if 'title' not in info_dict or 'ext' not in info_dict:
|
108 |
+
logging.error("info_dict is missing 'title' or 'ext'")
|
109 |
+
return None
|
110 |
+
|
111 |
+
normalized_video_title = normalize_title(info_dict['title'])
|
112 |
+
|
113 |
+
# FIXME - make sure this works/checks against the current model
|
114 |
+
# Check if media already exists in the database and compare whisper models
|
115 |
+
should_download, reason = check_media_and_whisper_model(
|
116 |
+
title=normalized_video_title,
|
117 |
+
url=video_url,
|
118 |
+
current_whisper_model=current_whisper_model
|
119 |
+
)
|
120 |
+
|
121 |
+
if not should_download:
|
122 |
+
logging.info(f"Skipping download: {reason}")
|
123 |
+
return None
|
124 |
+
|
125 |
+
logging.info(f"Proceeding with download: {reason}")
|
126 |
+
|
127 |
+
video_file_path = os.path.join(download_path, f"{normalized_video_title}.{info_dict['ext']}")
|
128 |
+
|
129 |
+
# Check for existence of video file
|
130 |
+
if os.path.exists(video_file_path):
|
131 |
+
logging.info(f"Video file already exists: {video_file_path}")
|
132 |
+
return video_file_path
|
133 |
+
|
134 |
+
# Setup path handling for ffmpeg on different OSs
|
135 |
+
if sys.platform.startswith('win'):
|
136 |
+
ffmpeg_path = os.path.join(os.getcwd(), 'Bin', 'ffmpeg.exe')
|
137 |
+
elif sys.platform.startswith('linux'):
|
138 |
+
ffmpeg_path = 'ffmpeg'
|
139 |
+
elif sys.platform.startswith('darwin'):
|
140 |
+
ffmpeg_path = 'ffmpeg'
|
141 |
+
|
142 |
+
if download_video_flag:
|
143 |
+
video_file_path = os.path.join(download_path, f"{normalized_video_title}.mp4")
|
144 |
+
ydl_opts_video = {
|
145 |
+
'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]',
|
146 |
+
'outtmpl': video_file_path,
|
147 |
+
'ffmpeg_location': ffmpeg_path
|
148 |
+
}
|
149 |
+
|
150 |
+
try:
|
151 |
+
with yt_dlp.YoutubeDL(ydl_opts_video) as ydl:
|
152 |
+
logging.debug("yt_dlp: About to download video with youtube-dl")
|
153 |
+
ydl.download([video_url])
|
154 |
+
logging.debug("yt_dlp: Video successfully downloaded with youtube-dl")
|
155 |
+
if os.path.exists(video_file_path):
|
156 |
+
return video_file_path
|
157 |
+
else:
|
158 |
+
logging.error("yt_dlp: Video file not found after download")
|
159 |
+
return None
|
160 |
+
except Exception as e:
|
161 |
+
logging.error(f"yt_dlp: Error downloading video: {e}")
|
162 |
+
return None
|
163 |
+
elif not download_video_flag:
|
164 |
+
video_file_path = os.path.join(download_path, f"{normalized_video_title}.mp4")
|
165 |
+
# Set options for video and audio
|
166 |
+
ydl_opts = {
|
167 |
+
'format': 'bestaudio[ext=m4a]',
|
168 |
+
'quiet': True,
|
169 |
+
'outtmpl': video_file_path
|
170 |
+
}
|
171 |
+
|
172 |
+
try:
|
173 |
+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
174 |
+
logging.debug("yt_dlp: About to download video with youtube-dl")
|
175 |
+
ydl.download([video_url])
|
176 |
+
logging.debug("yt_dlp: Video successfully downloaded with youtube-dl")
|
177 |
+
if os.path.exists(video_file_path):
|
178 |
+
return video_file_path
|
179 |
+
else:
|
180 |
+
logging.error("yt_dlp: Video file not found after download")
|
181 |
+
return None
|
182 |
+
except Exception as e:
|
183 |
+
logging.error(f"yt_dlp: Error downloading video: {e}")
|
184 |
+
return None
|
185 |
+
|
186 |
+
else:
|
187 |
+
logging.debug("download_video: Download video flag is set to False and video file path is not found")
|
188 |
+
return None
|
189 |
+
|
190 |
+
|
191 |
+
def extract_video_info(url):
|
192 |
+
try:
|
193 |
+
with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
|
194 |
+
info = ydl.extract_info(url, download=False)
|
195 |
+
|
196 |
+
# Log only a subset of the info to avoid overwhelming the logs
|
197 |
+
log_info = {
|
198 |
+
'title': info.get('title'),
|
199 |
+
'duration': info.get('duration'),
|
200 |
+
'upload_date': info.get('upload_date')
|
201 |
+
}
|
202 |
+
logging.debug(f"Extracted info for {url}: {log_info}")
|
203 |
+
|
204 |
+
return info
|
205 |
+
except Exception as e:
|
206 |
+
logging.error(f"Error extracting video info for {url}: {str(e)}", exc_info=True)
|
207 |
+
return None
|
208 |
+
|
209 |
+
|
210 |
+
def get_youtube_playlist_urls(playlist_id):
|
211 |
+
ydl_opts = {
|
212 |
+
'extract_flat': True,
|
213 |
+
'quiet': True,
|
214 |
+
}
|
215 |
+
|
216 |
+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
217 |
+
result = ydl.extract_info(f'https://www.youtube.com/playlist?list={playlist_id}', download=False)
|
218 |
+
return [entry['url'] for entry in result['entries'] if entry.get('url')]
|
219 |
+
|
220 |
+
|
221 |
+
def parse_and_expand_urls(urls):
|
222 |
+
logging.info(f"Starting parse_and_expand_urls with input: {urls}")
|
223 |
+
expanded_urls = []
|
224 |
+
|
225 |
+
for url in urls:
|
226 |
+
try:
|
227 |
+
logging.info(f"Processing URL: {url}")
|
228 |
+
parsed_url = urlparse(url)
|
229 |
+
logging.debug(f"Parsed URL components: {parsed_url}")
|
230 |
+
|
231 |
+
# YouTube playlist handling
|
232 |
+
if 'youtube.com' in parsed_url.netloc and 'list' in parsed_url.query:
|
233 |
+
playlist_id = parse_qs(parsed_url.query)['list'][0]
|
234 |
+
logging.info(f"Detected YouTube playlist with ID: {playlist_id}")
|
235 |
+
playlist_urls = get_youtube_playlist_urls(playlist_id)
|
236 |
+
logging.info(f"Expanded playlist URLs: {playlist_urls}")
|
237 |
+
expanded_urls.extend(playlist_urls)
|
238 |
+
|
239 |
+
# YouTube short URL handling
|
240 |
+
elif 'youtu.be' in parsed_url.netloc:
|
241 |
+
video_id = parsed_url.path.lstrip('/')
|
242 |
+
full_url = f'https://www.youtube.com/watch?v={video_id}'
|
243 |
+
logging.info(f"Expanded YouTube short URL to: {full_url}")
|
244 |
+
expanded_urls.append(full_url)
|
245 |
+
|
246 |
+
# Vimeo handling
|
247 |
+
elif 'vimeo.com' in parsed_url.netloc:
|
248 |
+
video_id = parsed_url.path.lstrip('/')
|
249 |
+
full_url = f'https://vimeo.com/{video_id}'
|
250 |
+
logging.info(f"Processed Vimeo URL: {full_url}")
|
251 |
+
expanded_urls.append(full_url)
|
252 |
+
|
253 |
+
# Add more platform-specific handling here
|
254 |
+
|
255 |
+
else:
|
256 |
+
logging.info(f"URL not recognized as special case, adding as-is: {url}")
|
257 |
+
expanded_urls.append(url)
|
258 |
+
|
259 |
+
except Exception as e:
|
260 |
+
logging.error(f"Error processing URL {url}: {str(e)}", exc_info=True)
|
261 |
+
# Optionally, you might want to add the problematic URL to expanded_urls
|
262 |
+
# expanded_urls.append(url)
|
263 |
+
|
264 |
+
logging.info(f"Final expanded URLs: {expanded_urls}")
|
265 |
+
return expanded_urls
|
266 |
+
|
267 |
+
|
268 |
+
def extract_metadata(url, use_cookies=False, cookies=None):
|
269 |
+
ydl_opts = {
|
270 |
+
'quiet': True,
|
271 |
+
'no_warnings': True,
|
272 |
+
'extract_flat': True,
|
273 |
+
'skip_download': True,
|
274 |
+
}
|
275 |
+
|
276 |
+
if use_cookies and cookies:
|
277 |
+
try:
|
278 |
+
cookie_dict = json.loads(cookies)
|
279 |
+
ydl_opts['cookiefile'] = cookie_dict
|
280 |
+
except json.JSONDecodeError:
|
281 |
+
logging.warning("Invalid cookie format. Proceeding without cookies.")
|
282 |
+
|
283 |
+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
284 |
+
try:
|
285 |
+
info = ydl.extract_info(url, download=False)
|
286 |
+
metadata = {
|
287 |
+
'title': info.get('title'),
|
288 |
+
'uploader': info.get('uploader'),
|
289 |
+
'upload_date': info.get('upload_date'),
|
290 |
+
'view_count': info.get('view_count'),
|
291 |
+
'like_count': info.get('like_count'),
|
292 |
+
'duration': info.get('duration'),
|
293 |
+
'tags': info.get('tags'),
|
294 |
+
'description': info.get('description')
|
295 |
+
}
|
296 |
+
|
297 |
+
# Create a safe subset of metadata to log
|
298 |
+
safe_metadata = {
|
299 |
+
'title': metadata.get('title', 'No title'),
|
300 |
+
'duration': metadata.get('duration', 'Unknown duration'),
|
301 |
+
'upload_date': metadata.get('upload_date', 'Unknown upload date'),
|
302 |
+
'uploader': metadata.get('uploader', 'Unknown uploader')
|
303 |
+
}
|
304 |
+
|
305 |
+
logging.info(f"Successfully extracted metadata for {url}: {safe_metadata}")
|
306 |
+
return metadata
|
307 |
+
except Exception as e:
|
308 |
+
logging.error(f"Error extracting metadata for {url}: {str(e)}", exc_info=True)
|
309 |
+
return None
|
310 |
+
|
311 |
+
|
312 |
+
def generate_timestamped_url(url, hours, minutes, seconds):
|
313 |
+
# Extract video ID from the URL
|
314 |
+
video_id_match = re.search(r'(?:v=|)([0-9A-Za-z_-]{11}).*', url)
|
315 |
+
if not video_id_match:
|
316 |
+
return "Invalid YouTube URL"
|
317 |
+
|
318 |
+
video_id = video_id_match.group(1)
|
319 |
+
|
320 |
+
# Calculate total seconds
|
321 |
+
total_seconds = int(hours) * 3600 + int(minutes) * 60 + int(seconds)
|
322 |
+
|
323 |
+
# Generate the new URL
|
324 |
+
new_url = f"https://www.youtube.com/watch?v={video_id}&t={total_seconds}s"
|
325 |
+
|
326 |
+
return new_url
|
327 |
+
|
328 |
+
|
329 |
+
|
330 |
+
#
|
331 |
+
#
|
332 |
+
#######################################################################################################################
|
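For orientation, a minimal, hypothetical driver for the functions added above follows; it is not part of this commit. The list of input URLs, the download directory, and the whisper model name are placeholders, the import path is assumed from the file layout in this commit, and download_video is assumed to have the project's database available since it calls check_media_and_whisper_model.

# Hypothetical usage sketch for Video_DL_Ingestion_Lib (not part of this commit).
import os
from App_Function_Libraries.Video_DL_Ingestion_Lib import (
    parse_and_expand_urls, extract_video_info, download_video,
)

def ingest_video_urls(raw_urls, download_path="Downloads", whisper_model="small.en"):
    # raw_urls is a list of URL strings; "Downloads" and "small.en" are placeholder values.
    os.makedirs(download_path, exist_ok=True)
    downloaded = []
    for url in parse_and_expand_urls(raw_urls):      # expands playlists and youtu.be short links
        info = extract_video_info(url)               # yt-dlp metadata dict, or None on failure
        if info is None:
            continue
        # download_video returns a local file path, or None if the download is skipped or fails
        path = download_video(url, download_path, info, download_video_flag=False,
                              current_whisper_model=whisper_model)
        if path:
            downloaded.append((url, path))
    return downloaded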
App_Function_Libraries/Web_Scraping/Article_Extractor_Lib.py
ADDED
@@ -0,0 +1,381 @@
1 |
+
# Article_Extractor_Lib.py
|
2 |
+
#########################################
|
3 |
+
# Article Extraction Library
|
4 |
+
# This library is used to handle scraping and extraction of articles from web pages.
|
5 |
+
#
|
6 |
+
####################
|
7 |
+
# Function List
|
8 |
+
#
|
9 |
+
# 1. get_page_title(url)
|
10 |
+
# 2. get_article_text(url)
|
11 |
+
# 3. get_article_title(article_url_arg)
|
12 |
+
#
|
13 |
+
####################
|
14 |
+
#
|
15 |
+
# Import necessary libraries
|
16 |
+
import logging
|
17 |
+
# 3rd-Party Imports
|
18 |
+
import asyncio
|
19 |
+
import os
|
20 |
+
import tempfile
|
21 |
+
from datetime import datetime
|
22 |
+
from typing import List, Dict
|
23 |
+
from urllib.parse import urljoin, urlparse
|
24 |
+
from xml.dom import minidom
|
25 |
+
from playwright.async_api import async_playwright
|
26 |
+
from bs4 import BeautifulSoup
|
27 |
+
import requests
|
28 |
+
import trafilatura
|
29 |
+
import xml.etree.ElementTree as ET
|
30 |
+
|
31 |
+
|
32 |
+
# Import Local
|
33 |
+
#
|
34 |
+
#######################################################################################################################
|
35 |
+
# Function Definitions
|
36 |
+
#
|
37 |
+
|
38 |
+
def get_page_title(url: str) -> str:
|
39 |
+
try:
|
40 |
+
response = requests.get(url)
|
41 |
+
response.raise_for_status()
|
42 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
43 |
+
title_tag = soup.find('title')
|
44 |
+
return title_tag.string.strip() if title_tag else "Untitled"
|
45 |
+
except requests.RequestException as e:
|
46 |
+
logging.error(f"Error fetching page title: {e}")
|
47 |
+
return "Untitled"
|
48 |
+
|
49 |
+
|
50 |
+
async def scrape_article(url):
|
51 |
+
async def fetch_html(url: str) -> str:
|
52 |
+
async with async_playwright() as p:
|
53 |
+
browser = await p.chromium.launch(headless=True)
|
54 |
+
context = await browser.new_context(
|
55 |
+
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
|
56 |
+
page = await context.new_page()
|
57 |
+
await page.goto(url)
|
58 |
+
await page.wait_for_load_state("networkidle") # Wait for the network to be idle
|
59 |
+
content = await page.content()
|
60 |
+
await browser.close()
|
61 |
+
return content
|
62 |
+
|
63 |
+
# FIXME - Add option for extracting comments/tables/images
|
64 |
+
def extract_article_data(html: str, url: str) -> dict:
|
65 |
+
downloaded = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
|
66 |
+
metadata = trafilatura.extract_metadata(html)
|
67 |
+
|
68 |
+
result = {
|
69 |
+
'title': 'N/A',
|
70 |
+
'author': 'N/A',
|
71 |
+
'content': '',
|
72 |
+
'date': 'N/A',
|
73 |
+
'url': url,
|
74 |
+
'extraction_successful': False
|
75 |
+
}
|
76 |
+
|
77 |
+
if downloaded:
|
78 |
+
result['content'] = downloaded
|
79 |
+
result['extraction_successful'] = True
|
80 |
+
|
81 |
+
if metadata:
|
82 |
+
result.update({
|
83 |
+
'title': metadata.title if metadata.title else 'N/A',
|
84 |
+
'author': metadata.author if metadata.author else 'N/A',
|
85 |
+
'date': metadata.date if metadata.date else 'N/A'
|
86 |
+
})
|
87 |
+
else:
|
88 |
+
logging.warning("Metadata extraction failed.")
|
89 |
+
|
90 |
+
if not downloaded:
|
91 |
+
logging.warning("Content extraction failed.")
|
92 |
+
|
93 |
+
return result
|
94 |
+
|
95 |
+
def convert_html_to_markdown(html: str) -> str:
|
96 |
+
soup = BeautifulSoup(html, 'html.parser')
|
97 |
+
for para in soup.find_all('p'):
|
98 |
+
# Add a newline at the end of each paragraph for markdown separation
|
99 |
+
para.append('\n')
|
100 |
+
# Use .get_text() with separator to keep paragraph separation
|
101 |
+
return soup.get_text(separator='\n\n')
|
102 |
+
|
103 |
+
html = await fetch_html(url)
|
104 |
+
article_data = extract_article_data(html, url)
|
105 |
+
if article_data['extraction_successful']:
|
106 |
+
article_data['content'] = convert_html_to_markdown(article_data['content'])
|
107 |
+
return article_data
|
108 |
+
|
109 |
+
|
110 |
+
def collect_internal_links(base_url: str) -> set:
|
111 |
+
visited = set()
|
112 |
+
to_visit = {base_url}
|
113 |
+
|
114 |
+
while to_visit:
|
115 |
+
current_url = to_visit.pop()
|
116 |
+
if current_url in visited:
|
117 |
+
continue
|
118 |
+
|
119 |
+
try:
|
120 |
+
response = requests.get(current_url)
|
121 |
+
response.raise_for_status()
|
122 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
123 |
+
|
124 |
+
# Collect internal links
|
125 |
+
for link in soup.find_all('a', href=True):
|
126 |
+
full_url = urljoin(base_url, link['href'])
|
127 |
+
# Only process links within the same domain
|
128 |
+
if urlparse(full_url).netloc == urlparse(base_url).netloc:
|
129 |
+
if full_url not in visited:
|
130 |
+
to_visit.add(full_url)
|
131 |
+
|
132 |
+
visited.add(current_url)
|
133 |
+
except requests.RequestException as e:
|
134 |
+
logging.error(f"Error visiting {current_url}: {e}")
|
135 |
+
continue
|
136 |
+
|
137 |
+
return visited
|
138 |
+
|
139 |
+
|
140 |
+
def generate_temp_sitemap_from_links(links: set) -> str:
|
141 |
+
"""
|
142 |
+
Generate a temporary sitemap file from collected links and return its path.
|
143 |
+
|
144 |
+
:param links: A set of URLs to include in the sitemap
|
145 |
+
:return: Path to the temporary sitemap file
|
146 |
+
"""
|
147 |
+
# Create the root element
|
148 |
+
urlset = ET.Element("urlset")
|
149 |
+
urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
|
150 |
+
|
151 |
+
# Add each link to the sitemap
|
152 |
+
for link in links:
|
153 |
+
url = ET.SubElement(urlset, "url")
|
154 |
+
loc = ET.SubElement(url, "loc")
|
155 |
+
loc.text = link
|
156 |
+
lastmod = ET.SubElement(url, "lastmod")
|
157 |
+
lastmod.text = datetime.now().strftime("%Y-%m-%d")
|
158 |
+
changefreq = ET.SubElement(url, "changefreq")
|
159 |
+
changefreq.text = "daily"
|
160 |
+
priority = ET.SubElement(url, "priority")
|
161 |
+
priority.text = "0.5"
|
162 |
+
|
163 |
+
# Create the tree and get it as a string
|
164 |
+
xml_string = ET.tostring(urlset, 'utf-8')
|
165 |
+
|
166 |
+
# Pretty print the XML
|
167 |
+
pretty_xml = minidom.parseString(xml_string).toprettyxml(indent=" ")
|
168 |
+
|
169 |
+
# Create a temporary file
|
170 |
+
with tempfile.NamedTemporaryFile(mode="w", suffix=".xml", delete=False) as temp_file:
|
171 |
+
temp_file.write(pretty_xml)
|
172 |
+
temp_file_path = temp_file.name
|
173 |
+
|
174 |
+
logging.info(f"Temporary sitemap created at: {temp_file_path}")
|
175 |
+
return temp_file_path
|
176 |
+
|
177 |
+
|
178 |
+
def generate_sitemap_for_url(url: str) -> List[Dict[str, str]]:
|
179 |
+
"""
|
180 |
+
Generate a sitemap for the given URL using the create_filtered_sitemap function.
|
181 |
+
|
182 |
+
Args:
|
183 |
+
url (str): The base URL to generate the sitemap for
|
184 |
+
|
185 |
+
Returns:
|
186 |
+
List[Dict[str, str]]: A list of dictionaries, each containing 'url' and 'title' keys
|
187 |
+
"""
|
188 |
+
with tempfile.NamedTemporaryFile(mode="w+", suffix=".xml", delete=False) as temp_file:
|
189 |
+
create_filtered_sitemap(url, temp_file.name, is_content_page)
|
190 |
+
temp_file.seek(0)
|
191 |
+
tree = ET.parse(temp_file.name)
|
192 |
+
root = tree.getroot()
|
193 |
+
|
194 |
+
sitemap = []
|
195 |
+
for url_elem in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}url"):
|
196 |
+
loc = url_elem.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text
|
197 |
+
sitemap.append({"url": loc, "title": loc.split("/")[-1] or url}) # Use the last part of the URL as a title
|
198 |
+
|
199 |
+
return sitemap
|
200 |
+
|
201 |
+
async def scrape_entire_site(base_url: str) -> List[Dict]:
|
202 |
+
"""
|
203 |
+
Scrape the entire site by generating a temporary sitemap and extracting content from each page.
|
204 |
+
|
205 |
+
:param base_url: The base URL of the site to scrape
|
206 |
+
:return: A list of dictionaries containing scraped article data
|
207 |
+
"""
|
208 |
+
# Step 1: Collect internal links from the site
|
209 |
+
links = collect_internal_links(base_url)
|
210 |
+
logging.info(f"Collected {len(links)} internal links.")
|
211 |
+
|
212 |
+
# Step 2: Generate the temporary sitemap
|
213 |
+
temp_sitemap_path = generate_temp_sitemap_from_links(links)
|
214 |
+
|
215 |
+
# Step 3: Scrape each URL in the sitemap
|
216 |
+
scraped_articles = []
|
217 |
+
try:
|
218 |
+
async def scrape_and_log(link):
|
219 |
+
logging.info(f"Scraping {link} ...")
|
220 |
+
article_data = await scrape_article(link)
|
221 |
+
|
222 |
+
if article_data:
|
223 |
+
logging.info(f"Title: {article_data['title']}")
|
224 |
+
logging.info(f"Author: {article_data['author']}")
|
225 |
+
logging.info(f"Date: {article_data['date']}")
|
226 |
+
logging.info(f"Content: {article_data['content'][:500]}...")
|
227 |
+
|
228 |
+
return article_data
|
229 |
+
return None
|
230 |
+
|
231 |
+
# Use asyncio.gather to scrape multiple articles concurrently
|
232 |
+
scraped_articles = await asyncio.gather(*[scrape_and_log(link) for link in links])
|
233 |
+
# Remove any None values (failed scrapes)
|
234 |
+
scraped_articles = [article for article in scraped_articles if article is not None]
|
235 |
+
|
236 |
+
finally:
|
237 |
+
# Clean up the temporary sitemap file
|
238 |
+
os.unlink(temp_sitemap_path)
|
239 |
+
logging.info("Temporary sitemap file deleted")
|
240 |
+
|
241 |
+
return scraped_articles
|
242 |
+
|
243 |
+
|
244 |
+
def scrape_by_url_level(base_url: str, level: int) -> list:
|
245 |
+
"""Scrape articles from URLs up to a certain level under the base URL."""
|
246 |
+
|
247 |
+
def get_url_level(url: str) -> int:
|
248 |
+
return len(urlparse(url).path.strip('/').split('/'))
|
249 |
+
|
250 |
+
links = collect_internal_links(base_url)
|
251 |
+
filtered_links = [link for link in links if get_url_level(link) <= level]
|
252 |
+
|
253 |
+
return [article for link in filtered_links if (article := asyncio.run(scrape_article(link)))]  # scrape_article is async
|
254 |
+
|
255 |
+
|
256 |
+
def scrape_from_sitemap(sitemap_url: str) -> list:
|
257 |
+
"""Scrape articles from a sitemap URL."""
|
258 |
+
try:
|
259 |
+
response = requests.get(sitemap_url)
|
260 |
+
response.raise_for_status()
|
261 |
+
root = ET.fromstring(response.content)
|
262 |
+
|
263 |
+
return [article for url in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
|
264 |
+
if (article := asyncio.run(scrape_article(url.text)))]
|
265 |
+
except requests.RequestException as e:
|
266 |
+
logging.error(f"Error fetching sitemap: {e}")
|
267 |
+
return []
|
268 |
+
|
269 |
+
|
270 |
+
def convert_to_markdown(articles: list) -> str:
|
271 |
+
"""Convert a list of article data into a single markdown document."""
|
272 |
+
markdown = ""
|
273 |
+
for article in articles:
|
274 |
+
markdown += f"# {article['title']}\n\n"
|
275 |
+
markdown += f"Author: {article['author']}\n"
|
276 |
+
markdown += f"Date: {article['date']}\n\n"
|
277 |
+
markdown += f"{article['content']}\n\n"
|
278 |
+
markdown += "---\n\n" # Separator between articles
|
279 |
+
return markdown
|
280 |
+
|
281 |
+
|
282 |
+
def is_content_page(url: str) -> bool:
|
283 |
+
"""
|
284 |
+
Determine if a URL is likely to be a content page.
|
285 |
+
This is a basic implementation and may need to be adjusted based on the specific website structure.
|
286 |
+
|
287 |
+
:param url: The URL to check
|
288 |
+
:return: True if the URL is likely a content page, False otherwise
|
289 |
+
"""
|
290 |
+
# Add more specific checks here based on the website's structure
|
291 |
+
# Exclude common non-content pages
|
292 |
+
exclude_patterns = [
|
293 |
+
'/tag/', '/category/', '/author/', '/search/', '/page/',
|
294 |
+
'wp-content', 'wp-includes', 'wp-json', 'wp-admin',
|
295 |
+
'login', 'register', 'cart', 'checkout', 'account',
|
296 |
+
'.jpg', '.png', '.gif', '.pdf', '.zip'
|
297 |
+
]
|
298 |
+
return not any(pattern in url.lower() for pattern in exclude_patterns)
|
299 |
+
|
300 |
+
|
301 |
+
def create_filtered_sitemap(base_url: str, output_file: str, filter_function):
|
302 |
+
"""
|
303 |
+
Create a sitemap from internal links and filter them based on a custom function.
|
304 |
+
|
305 |
+
:param base_url: The base URL of the website
|
306 |
+
:param output_file: The file to save the sitemap to
|
307 |
+
:param filter_function: A function that takes a URL and returns True if it should be included
|
308 |
+
"""
|
309 |
+
links = collect_internal_links(base_url)
|
310 |
+
filtered_links = set(filter(filter_function, links))
|
311 |
+
|
312 |
+
root = ET.Element("urlset")
|
313 |
+
root.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
|
314 |
+
|
315 |
+
for link in filtered_links:
|
316 |
+
url = ET.SubElement(root, "url")
|
317 |
+
loc = ET.SubElement(url, "loc")
|
318 |
+
loc.text = link
|
319 |
+
|
320 |
+
tree = ET.ElementTree(root)
|
321 |
+
tree.write(output_file, encoding='utf-8', xml_declaration=True)
|
322 |
+
print(f"Filtered sitemap saved to {output_file}")
|
323 |
+
|
324 |
+
|
325 |
+
def scrape_from_filtered_sitemap(sitemap_file: str, filter_function) -> list:
|
326 |
+
"""
|
327 |
+
Scrape articles from a sitemap file, applying an additional filter function.
|
328 |
+
|
329 |
+
:param sitemap_file: Path to the sitemap file
|
330 |
+
:param filter_function: A function that takes a URL and returns True if it should be scraped
|
331 |
+
:return: List of scraped articles
|
332 |
+
"""
|
333 |
+
try:
|
334 |
+
tree = ET.parse(sitemap_file)
|
335 |
+
root = tree.getroot()
|
336 |
+
|
337 |
+
articles = []
|
338 |
+
for url in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):
|
339 |
+
if filter_function(url.text):
|
340 |
+
article_data = asyncio.run(scrape_article(url.text))  # scrape_article is async
|
341 |
+
if article_data:
|
342 |
+
articles.append(article_data)
|
343 |
+
|
344 |
+
return articles
|
345 |
+
except ET.ParseError as e:
|
346 |
+
logging.error(f"Error parsing sitemap: {e}")
|
347 |
+
return []
|
348 |
+
|
349 |
+
|
350 |
+
def scrape_and_convert_with_filter(source: str, output_file: str, filter_function=is_content_page, level: int = None):
|
351 |
+
"""
|
352 |
+
Scrape articles from a sitemap or by URL level, apply filtering, and convert to a single markdown file.
|
353 |
+
|
354 |
+
:param source: URL of the sitemap, base URL for level-based scraping, or path to a local sitemap file
|
355 |
+
:param output_file: Path to save the output markdown file
|
356 |
+
:param filter_function: Function to filter URLs (default is is_content_page)
|
357 |
+
:param level: URL level for scraping (None if using sitemap)
|
358 |
+
"""
|
359 |
+
if level is not None:
|
360 |
+
# Scraping by URL level
|
361 |
+
articles = scrape_by_url_level(source, level)
|
362 |
+
articles = [article for article in articles if filter_function(article['url'])]
|
363 |
+
elif source.startswith('http'):
|
364 |
+
# Scraping from online sitemap
|
365 |
+
articles = scrape_from_sitemap(source)
|
366 |
+
articles = [article for article in articles if filter_function(article['url'])]
|
367 |
+
else:
|
368 |
+
# Scraping from local sitemap file
|
369 |
+
articles = scrape_from_filtered_sitemap(source, filter_function)
|
370 |
+
|
371 |
+
articles = [article for article in articles if filter_function(article['url'])]
|
372 |
+
markdown_content = convert_to_markdown(articles)
|
373 |
+
|
374 |
+
with open(output_file, 'w', encoding='utf-8') as f:
|
375 |
+
f.write(markdown_content)
|
376 |
+
|
377 |
+
logging.info(f"Scraped and filtered content saved to {output_file}")
|
378 |
+
|
379 |
+
#
|
380 |
+
#
|
381 |
+
#######################################################################################################################
|
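As a quick orientation, here is a minimal, hypothetical usage sketch for the extractor added above; it is not part of this commit, and the page URL, sitemap URL, and output filename are placeholders.

# Hypothetical usage sketch for Article_Extractor_Lib (not part of this commit).
import asyncio
from App_Function_Libraries.Web_Scraping.Article_Extractor_Lib import (
    scrape_article, scrape_and_convert_with_filter,
)

# Single page: scrape_article is a coroutine, so drive it with asyncio.run
article = asyncio.run(scrape_article("https://example.com/some-post"))  # placeholder URL
if article["extraction_successful"]:
    print(article["title"], len(article["content"]))

# Whole sitemap, filtered to likely content pages, written out as one markdown file
scrape_and_convert_with_filter("https://example.com/sitemap.xml", "example_site.md")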
App_Function_Libraries/Web_Scraping/Article_Summarization_Lib.py
ADDED
@@ -0,0 +1,249 @@
1 |
+
# Article_Summarization_Lib.py
|
2 |
+
#########################################
|
3 |
+
# Article Summarization Library
|
4 |
+
# This library is used to handle summarization of articles.
|
5 |
+
import asyncio
|
6 |
+
# FIXME - this library should be refactored into `Article_Extractor_Lib` and then renamed to `Web_Scraping_Lib`
|
7 |
+
|
8 |
+
#
|
9 |
+
####
|
10 |
+
#
|
11 |
+
####################
|
12 |
+
# Function List
|
13 |
+
#
|
14 |
+
# 1.
|
15 |
+
#
|
16 |
+
####################
|
17 |
+
#
|
18 |
+
# Import necessary libraries
|
19 |
+
import datetime
|
20 |
+
from datetime import datetime
|
21 |
+
import gradio as gr
|
22 |
+
import json
|
23 |
+
import os
|
24 |
+
import logging
|
25 |
+
import requests
|
26 |
+
# 3rd-Party Imports
|
27 |
+
#
|
28 |
+
# Local Imports
|
29 |
+
from App_Function_Libraries.Utils.Utils import sanitize_filename
|
30 |
+
from App_Function_Libraries.Web_Scraping.Article_Extractor_Lib import scrape_article
|
31 |
+
from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, summarize_with_tabbyapi, \
|
32 |
+
summarize_with_vllm, summarize_with_kobold, save_summary_to_file, summarize_with_local_llm
|
33 |
+
from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere, \
|
34 |
+
summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, summarize_with_huggingface, \
|
35 |
+
summarize_with_mistral
|
36 |
+
from App_Function_Libraries.DB.DB_Manager import ingest_article_to_db
|
37 |
+
#
|
38 |
+
#######################################################################################################################
|
39 |
+
# Function Definitions
|
40 |
+
#
|
41 |
+
|
42 |
+
async def scrape_and_summarize_multiple(urls, custom_prompt_arg, api_name, api_key, keywords, custom_article_titles, system_message=None):
|
43 |
+
urls = [url.strip() for url in urls.split('\n') if url.strip()]
|
44 |
+
custom_titles = custom_article_titles.split('\n') if custom_article_titles else []
|
45 |
+
|
46 |
+
results = []
|
47 |
+
errors = []
|
48 |
+
|
49 |
+
# Create a progress bar
|
50 |
+
progress = gr.Progress()
|
51 |
+
|
52 |
+
# FIXME - add progress tracking to the gradio UI
|
53 |
+
for i, url in enumerate(urls):
|
54 |
+
custom_title = custom_titles[i] if i < len(custom_titles) else None
|
55 |
+
try:
|
56 |
+
article = await scrape_article(url)
|
57 |
+
if article and article['extraction_successful']:
|
58 |
+
if custom_title:
|
59 |
+
article['title'] = custom_title
|
60 |
+
results.append(article)
|
61 |
+
except Exception as e:
|
62 |
+
error_message = f"Error processing URL {i + 1} ({url}): {str(e)}"
|
63 |
+
errors.append(error_message)
|
64 |
+
|
65 |
+
# Update progress
|
66 |
+
progress((i + 1) / len(urls), desc=f"Processed {i + 1}/{len(urls)} URLs")
|
67 |
+
|
68 |
+
if errors:
|
69 |
+
logging.error("\n".join(errors))
|
70 |
+
|
71 |
+
return results
|
72 |
+
|
73 |
+
|
74 |
+
|
75 |
+
def scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_article_title, system_message=None):
|
76 |
+
try:
|
77 |
+
# Step 1: Scrape the article
|
78 |
+
article_data = asyncio.run(scrape_article(url))
|
79 |
+
print(f"Scraped Article Data: {article_data}") # Debugging statement
|
80 |
+
if not article_data:
|
81 |
+
return "Failed to scrape the article."
|
82 |
+
|
83 |
+
# Use the custom title if provided, otherwise use the scraped title
|
84 |
+
title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled')
|
85 |
+
author = article_data.get('author', 'Unknown')
|
86 |
+
content = article_data.get('content', '')
|
87 |
+
ingestion_date = datetime.now().strftime('%Y-%m-%d')
|
88 |
+
|
89 |
+
print(f"Title: {title}, Author: {author}, Content Length: {len(content)}") # Debugging statement
|
90 |
+
|
91 |
+
# Custom system prompt for the article
|
92 |
+
system_message = system_message or "Act as a professional summarizer and summarize this article."
|
93 |
+
# Custom prompt for the article
|
94 |
+
article_custom_prompt = custom_prompt_arg or "Act as a professional summarizer and summarize this article."
|
95 |
+
|
96 |
+
# Step 2: Summarize the article
|
97 |
+
summary = None
|
98 |
+
if api_name:
|
99 |
+
logging.debug(f"Article_Summarizer: Summarization being performed by {api_name}")
|
100 |
+
|
101 |
+
# Sanitize filename for saving the JSON file
|
102 |
+
sanitized_title = sanitize_filename(title)
|
103 |
+
json_file_path = os.path.join("Results", f"{sanitized_title}_segments.json")
|
104 |
+
|
105 |
+
with open(json_file_path, 'w') as json_file:
|
106 |
+
json.dump([{'text': content}], json_file, indent=2)
|
107 |
+
|
108 |
+
try:
|
109 |
+
if api_name.lower() == 'openai':
|
110 |
+
# def summarize_with_openai(api_key, input_data, custom_prompt_arg)
|
111 |
+
summary = summarize_with_openai(api_key, json_file_path, article_custom_prompt, system_message)
|
112 |
+
|
113 |
+
elif api_name.lower() == "anthropic":
|
114 |
+
# def summarize_with_anthropic(api_key, input_data, model, custom_prompt_arg, max_retries=3, retry_delay=5):
|
115 |
+
summary = summarize_with_anthropic(api_key, json_file_path, article_custom_prompt, system_message)
|
116 |
+
elif api_name.lower() == "cohere":
|
117 |
+
# def summarize_with_cohere(api_key, input_data, model, custom_prompt_arg)
|
118 |
+
summary = summarize_with_cohere(api_key, json_file_path, article_custom_prompt, system_message)
|
119 |
+
|
120 |
+
elif api_name.lower() == "groq":
|
121 |
+
logging.debug(f"MAIN: Trying to summarize with groq")
|
122 |
+
# def summarize_with_groq(api_key, input_data, model, custom_prompt_arg):
|
123 |
+
summary = summarize_with_groq(api_key, json_file_path, article_custom_prompt, system_message)
|
124 |
+
|
125 |
+
elif api_name.lower() == "openrouter":
|
126 |
+
logging.debug(f"MAIN: Trying to summarize with OpenRouter")
|
127 |
+
# def summarize_with_openrouter(api_key, input_data, custom_prompt_arg):
|
128 |
+
summary = summarize_with_openrouter(api_key, json_file_path, article_custom_prompt, system_message)
|
129 |
+
|
130 |
+
elif api_name.lower() == "deepseek":
|
131 |
+
logging.debug(f"MAIN: Trying to summarize with DeepSeek")
|
132 |
+
# def summarize_with_deepseek(api_key, input_data, custom_prompt_arg):
|
133 |
+
summary = summarize_with_deepseek(api_key, json_file_path, article_custom_prompt, system_message)
|
134 |
+
|
135 |
+
elif api_name.lower() == "mistral":
|
136 |
+
summary = summarize_with_mistral(api_key, json_file_path, article_custom_prompt, system_message)
|
137 |
+
|
138 |
+
elif api_name.lower() == "llama.cpp":
|
139 |
+
logging.debug(f"MAIN: Trying to summarize with Llama.cpp")
|
140 |
+
# def summarize_with_llama(api_url, file_path, token, custom_prompt)
|
141 |
+
summary = summarize_with_llama(json_file_path, article_custom_prompt, system_message)
|
142 |
+
|
143 |
+
elif api_name.lower() == "kobold":
|
144 |
+
logging.debug(f"MAIN: Trying to summarize with Kobold.cpp")
|
145 |
+
# def summarize_with_kobold(input_data, kobold_api_token, custom_prompt_input, api_url):
|
146 |
+
summary = summarize_with_kobold(json_file_path, api_key, article_custom_prompt, system_message)
|
147 |
+
|
148 |
+
elif api_name.lower() == "ooba":
|
149 |
+
# def summarize_with_oobabooga(input_data, api_key, custom_prompt, api_url):
|
150 |
+
summary = summarize_with_oobabooga(json_file_path, api_key, article_custom_prompt, system_message)
|
151 |
+
|
152 |
+
elif api_name.lower() == "tabbyapi":
|
153 |
+
# def summarize_with_tabbyapi(input_data, tabby_model, custom_prompt_input, api_key=None, api_IP):
|
154 |
+
summary = summarize_with_tabbyapi(json_file_path, article_custom_prompt, system_message)
|
155 |
+
|
156 |
+
elif api_name.lower() == "vllm":
|
157 |
+
logging.debug(f"MAIN: Trying to summarize with VLLM")
|
158 |
+
# def summarize_with_vllm(api_key, input_data, custom_prompt_input):
|
159 |
+
summary = summarize_with_vllm(json_file_path, article_custom_prompt, system_message)
|
160 |
+
|
161 |
+
elif api_name.lower() == "local-llm":
|
162 |
+
logging.debug(f"MAIN: Trying to summarize with Local LLM")
|
163 |
+
summary = summarize_with_local_llm(json_file_path, article_custom_prompt, system_message)
|
164 |
+
|
165 |
+
elif api_name.lower() == "huggingface":
|
166 |
+
logging.debug(f"MAIN: Trying to summarize with huggingface")
|
167 |
+
# def summarize_with_huggingface(api_key, input_data, custom_prompt_arg):
|
168 |
+
summary = summarize_with_huggingface(api_key, json_file_path, article_custom_prompt, system_message)
|
169 |
+
# Add additional API handlers here...
|
170 |
+
|
171 |
+
except requests.exceptions.ConnectionError as e:
|
172 |
+
logging.error(f"Connection error while trying to summarize with {api_name}: {str(e)}")
|
173 |
+
|
174 |
+
if summary:
|
175 |
+
logging.info(f"Article_Summarizer: Summary generated using {api_name} API")
|
176 |
+
save_summary_to_file(summary, json_file_path)
|
177 |
+
else:
|
178 |
+
summary = "Summary not available"
|
179 |
+
logging.warning(f"Failed to generate summary using {api_name} API")
|
180 |
+
|
181 |
+
else:
|
182 |
+
summary = "Article Summarization: No API provided for summarization."
|
183 |
+
|
184 |
+
print(f"Summary: {summary}") # Debugging statement
|
185 |
+
|
186 |
+
# Step 3: Ingest the article into the database
|
187 |
+
ingestion_result = ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date,
|
188 |
+
article_custom_prompt)
|
189 |
+
|
190 |
+
return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nSummary: {summary}\n\nArticle Contents: {content}"
|
191 |
+
except Exception as e:
|
192 |
+
logging.error(f"Error processing URL {url}: {str(e)}")
|
193 |
+
return f"Failed to process URL {url}: {str(e)}"
|
194 |
+
|
195 |
+
|
196 |
+
def scrape_and_no_summarize_then_ingest(url, keywords, custom_article_title):
|
197 |
+
try:
|
198 |
+
# Step 1: Scrape the article
|
199 |
+
article_data = asyncio.run(scrape_article(url))
|
200 |
+
print(f"Scraped Article Data: {article_data}") # Debugging statement
|
201 |
+
if not article_data:
|
202 |
+
return "Failed to scrape the article."
|
203 |
+
|
204 |
+
# Use the custom title if provided, otherwise use the scraped title
|
205 |
+
title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled')
|
206 |
+
author = article_data.get('author', 'Unknown')
|
207 |
+
content = article_data.get('content', '')
|
208 |
+
ingestion_date = datetime.now().strftime('%Y-%m-%d')
|
209 |
+
|
210 |
+
print(f"Title: {title}, Author: {author}, Content Length: {len(content)}") # Debugging statement
|
211 |
+
|
212 |
+
# Step 2: Ingest the article into the database
|
213 |
+
ingestion_result = ingest_article_to_db(url, title, author, content, keywords, None, ingestion_date, None)  # no summary when skipping summarization
|
214 |
+
|
215 |
+
return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nArticle Contents: {content}"
|
216 |
+
except Exception as e:
|
217 |
+
logging.error(f"Error processing URL {url}: {str(e)}")
|
218 |
+
return f"Failed to process URL {url}: {str(e)}"
|
219 |
+
|
220 |
+
|
221 |
+
def ingest_unstructured_text(text, custom_prompt, api_name, api_key, keywords, custom_article_title, system_message=None):
|
222 |
+
title = custom_article_title.strip() if custom_article_title else "Unstructured Text"
|
223 |
+
author = "Unknown"
|
224 |
+
ingestion_date = datetime.now().strftime('%Y-%m-%d')
|
225 |
+
|
226 |
+
# Summarize the unstructured text
|
227 |
+
if api_name:
|
228 |
+
json_file_path = f"Results/{title.replace(' ', '_')}_segments.json"
|
229 |
+
with open(json_file_path, 'w') as json_file:
|
230 |
+
json.dump([{'text': text}], json_file, indent=2)
|
231 |
+
|
232 |
+
if api_name.lower() == 'openai':
|
233 |
+
summary = summarize_with_openai(api_key, json_file_path, custom_prompt, system_message)
|
234 |
+
# Add other APIs as needed
|
235 |
+
else:
|
236 |
+
summary = "Unsupported API."
|
237 |
+
else:
|
238 |
+
summary = "No API provided for summarization."
|
239 |
+
|
240 |
+
# Ingest the unstructured text into the database
|
241 |
+
ingestion_result = ingest_article_to_db('Unstructured Text', title, author, text, keywords, summary, ingestion_date,
|
242 |
+
custom_prompt)
|
243 |
+
return f"Title: {title}\nSummary: {summary}\nIngestion Result: {ingestion_result}"
|
244 |
+
|
245 |
+
|
246 |
+
|
247 |
+
#
|
248 |
+
#
|
249 |
+
#######################################################################################################################
|
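A minimal, hypothetical call into the pipeline added above, not part of this commit: the URL, prompt, API name, key, and keywords are placeholders, and a Results/ directory is created up front because scrape_and_summarize writes its intermediate segments JSON there.

# Hypothetical usage sketch for Article_Summarization_Lib (not part of this commit).
import os
from App_Function_Libraries.Web_Scraping.Article_Summarization_Lib import scrape_and_summarize

os.makedirs("Results", exist_ok=True)                # the function writes {title}_segments.json here
result = scrape_and_summarize(
    url="https://example.com/some-post",             # placeholder URL
    custom_prompt_arg="Summarize the key points.",   # placeholder prompt
    api_name="openai",                               # any API name handled above
    api_key="sk-...",                                # placeholder key
    keywords="example,testing",
    custom_article_title=None,
)
print(result)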
App_Function_Libraries/Web_Scraping/__init__.py
ADDED
File without changes
|