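"""
Audio ingestion and processing library.

Provides helpers to download audio (direct URLs, YouTube videos, podcasts),
re-encode and convert it with ffmpeg, transcribe it with a whisper model,
optionally summarize the transcription through an external API, and store
the results in the media database, logging metrics along the way.
"""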
import json
import logging
import os
import subprocess
import tempfile
import time
import uuid
from datetime import datetime
from pathlib import Path

import requests
import yt_dlp

from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords, \
    check_media_and_whisper_model
from App_Function_Libraries.Metrics.metrics_logger import log_counter, log_histogram
from App_Function_Libraries.Summarization.Summarization_General_Lib import perform_summarization
from App_Function_Libraries.Utils.Utils import downloaded_files, \
    sanitize_filename, generate_unique_id, temp_files
from App_Function_Libraries.Video_DL_Ingestion_Lib import extract_metadata
from App_Function_Libraries.Audio.Audio_Transcription_Lib import speech_to_text
from App_Function_Libraries.Chunk_Lib import improved_chunking_process
MAX_FILE_SIZE = 500 * 1024 * 1024  # 500 MB cap, enforced for both downloads and uploads


def download_audio_file(url, current_whisper_model="", use_cookies=False, cookies=None):
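    """
    Download an audio file from a direct URL, with optional cookie support
    and a size cap of MAX_FILE_SIZE.

    Skips the download (and returns None) when check_media_and_whisper_model
    reports that the media already exists with the current whisper model;
    otherwise returns the local path of the saved MP3.
    """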
    try:
        # Check whether this media already exists with the same whisper model
        should_download, reason = check_media_and_whisper_model(
            url=url,
            current_whisper_model=current_whisper_model
        )

        if not should_download:
            logging.info(f"Skipping audio download: {reason}")
            return None

        logging.info(f"Proceeding with audio download: {reason}")

        # Set up request headers, attaching cookies if provided
        headers = {}
        if use_cookies and cookies:
            try:
                cookie_dict = json.loads(cookies)
                headers['Cookie'] = '; '.join([f'{k}={v}' for k, v in cookie_dict.items()])
            except json.JSONDecodeError:
                logging.warning("Invalid cookie format. Proceeding without cookies.")

        # Stream the response so large files are not held in memory
        response = requests.get(url, headers=headers, stream=True)
        response.raise_for_status()

        # Enforce the size cap using the Content-Length header when available
        file_size = int(response.headers.get('content-length', 0))
        if file_size > MAX_FILE_SIZE:
            raise ValueError("File size exceeds the 500MB limit.")

        # Generate a unique filename and ensure the downloads directory exists
        file_name = f"audio_{uuid.uuid4().hex[:8]}.mp3"
        save_path = os.path.join('downloads', file_name)
        os.makedirs('downloads', exist_ok=True)

        # Write the file to disk in 8 KB chunks
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

        logging.info(f"Audio file downloaded successfully: {save_path}")
        return save_path

    except requests.RequestException as e:
        logging.error(f"Error downloading audio file: {str(e)}")
        raise
    except ValueError as e:
        logging.error(str(e))
        raise
    except Exception as e:
        logging.error(f"Unexpected error downloading audio file: {str(e)}")
        raise


def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key, use_cookies, cookies, keep_original,
                        custom_keywords, custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap,
                        use_adaptive_chunking, use_multi_level_chunking, chunk_language, diarize,
                        keep_timestamps, custom_title):
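    """
    Process a batch of audio sources: newline-separated URLs and, optionally,
    a single uploaded file. Each item is downloaded (if remote), re-encoded to
    MP3, converted to WAV, transcribed with the selected whisper model,
    optionally summarized through the configured API, and stored in the
    database. Failures are counted per item and logged as metrics.

    Returns:
        tuple: (progress_log, combined_transcriptions, combined_summaries)
    """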
    start_time = time.time()
    processed_count = 0
    failed_count = 0
    progress = []
    all_transcriptions = []
    all_summaries = []

    def format_transcription_with_timestamps(segments):
        if keep_timestamps:
            formatted_segments = []
            for segment in segments:
                start = segment.get('Time_Start', 0)
                end = segment.get('Time_End', 0)
                text = segment.get('Text', '').strip()
                formatted_segments.append(f"[{start:.2f}-{end:.2f}] {text}")
            return "\n".join(formatted_segments)
        else:
            return "\n".join([segment.get('Text', '').strip() for segment in segments])
    def update_progress(message):
        progress.append(message)
        return "\n".join(progress)

    def cleanup_files():
        for file in temp_files:
            try:
                if os.path.exists(file):
                    os.remove(file)
                    update_progress(f"Temporary file {file} removed.")
            except Exception as e:
                update_progress(f"Failed to remove temporary file {file}: {str(e)}")

    def reencode_mp3(mp3_file_path):
        try:
            reencoded_mp3_path = mp3_file_path.replace(".mp3", "_reencoded.mp3")
            subprocess.run([ffmpeg_cmd, '-i', mp3_file_path, '-codec:a', 'libmp3lame', reencoded_mp3_path], check=True)
            update_progress(f"Re-encoded {mp3_file_path} to {reencoded_mp3_path}.")
            return reencoded_mp3_path
        except subprocess.CalledProcessError as e:
            update_progress(f"Error re-encoding {mp3_file_path}: {str(e)}")
            raise

    def convert_mp3_to_wav(mp3_file_path):
        try:
            wav_file_path = mp3_file_path.replace(".mp3", ".wav")
            subprocess.run([ffmpeg_cmd, '-i', mp3_file_path, wav_file_path], check=True)
            update_progress(f"Converted {mp3_file_path} to {wav_file_path}.")
            return wav_file_path
        except subprocess.CalledProcessError as e:
            update_progress(f"Error converting {mp3_file_path} to WAV: {str(e)}")
            raise
    try:
        # Resolve the ffmpeg command: use the bundled binary on Windows,
        # otherwise rely on ffmpeg being available on the PATH
        global ffmpeg_cmd
        if os.name == "nt":
            logging.debug("Running on Windows")
            ffmpeg_cmd = os.path.join(os.getcwd(), "Bin", "ffmpeg.exe")
        else:
            ffmpeg_cmd = 'ffmpeg'

        if os.name == "nt" and not os.path.exists(ffmpeg_cmd):
            raise FileNotFoundError(f"ffmpeg executable not found at path: {ffmpeg_cmd}")

        chunk_options = {
            'method': chunk_method,
            'max_size': max_chunk_size,
            'overlap': chunk_overlap,
            'adaptive': use_adaptive_chunking,
            'multi_level': use_multi_level_chunking,
            'language': chunk_language
        }

        # One URL per line; ignore blank lines
        urls = [url.strip() for url in audio_urls.split('\n') if url.strip()]
        for i, url in enumerate(urls):
            update_progress(f"Processing URL {i + 1}/{len(urls)}: {url}")

            # Download the audio file, passing the whisper model and cookie options
            audio_file_path = download_audio_file(url, whisper_model, use_cookies, cookies)
            if audio_file_path is None or not os.path.exists(audio_file_path):
                update_progress(f"Downloaded file not found: {audio_file_path}")
                failed_count += 1
                log_counter(
                    metric_name="audio_files_failed_total",
                    labels={"whisper_model": whisper_model, "api_name": api_name},
                    value=1
                )
                continue

            temp_files.append(audio_file_path)
            update_progress("Audio file downloaded successfully.")
            reencoded_mp3_path = reencode_mp3(audio_file_path)
            if not os.path.exists(reencoded_mp3_path):
                update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
                failed_count += 1
                log_counter(
                    metric_name="audio_files_failed_total",
                    labels={"whisper_model": whisper_model, "api_name": api_name},
                    value=1
                )
                continue

            temp_files.append(reencoded_mp3_path)

            wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
            if not os.path.exists(wav_file_path):
                update_progress(f"Converted WAV file not found: {wav_file_path}")
                failed_count += 1
                log_counter(
                    metric_name="audio_files_failed_total",
                    labels={"whisper_model": whisper_model, "api_name": api_name},
                    value=1
                )
                continue

            temp_files.append(wav_file_path)
            # Transcribe the WAV file, with speaker diarization when requested
            transcription = ""
            if diarize:
                segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True)
            else:
                segments = speech_to_text(wav_file_path, whisper_model=whisper_model)

            # speech_to_text may return either a dict wrapping the segments or a bare list
            if isinstance(segments, dict) and 'segments' in segments:
                segments = segments['segments']

            if isinstance(segments, list):
                logging.debug(f"Segments before formatting: {segments[:5]}")
                transcription = format_transcription_with_timestamps(segments)
                logging.debug(f"Formatted transcription (first 500 chars): {transcription[:500]}")
                update_progress("Audio transcribed successfully.")
            else:
                update_progress("Unexpected segments format received from speech_to_text.")
                logging.error(f"Unexpected segments format: {segments}")
                failed_count += 1
                log_counter(
                    metric_name="audio_files_failed_total",
                    labels={"whisper_model": whisper_model, "api_name": api_name},
                    value=1
                )
                continue
            if not transcription.strip():
                update_progress("Transcription is empty.")
                failed_count += 1
                log_counter(
                    metric_name="audio_files_failed_total",
                    labels={"whisper_model": whisper_model, "api_name": api_name},
                    value=1
                )
            else:
                # Chunk the transcription before summarization
                chunked_text = improved_chunking_process(transcription, chunk_options)

                logging.debug(f"Audio Transcription API Name: {api_name}")
                if api_name:
                    try:
                        summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
                        update_progress("Audio summarized successfully.")
                    except Exception as e:
                        logging.error(f"Error during summarization: {str(e)}")
                        summary = "Summary generation failed"
                        failed_count += 1
                        log_counter(
                            metric_name="audio_files_failed_total",
                            labels={"whisper_model": whisper_model, "api_name": api_name},
                            value=1
                        )
                else:
                    summary = "No summary available (API not provided)"

                all_transcriptions.append(transcription)
                all_summaries.append(summary)

                title = custom_title if custom_title else os.path.basename(wav_file_path)

                add_media_with_keywords(
                    url=url,
                    title=title,
                    media_type='audio',
                    content=transcription,
                    keywords=custom_keywords,
                    prompt=custom_prompt_input,
                    summary=summary,
                    transcription_model=whisper_model,
                    author="Unknown",
                    ingestion_date=datetime.now().strftime('%Y-%m-%d')
                )
                update_progress("Audio file processed and added to database.")
                processed_count += 1
                log_counter(
                    metric_name="audio_files_processed_total",
                    labels={"whisper_model": whisper_model, "api_name": api_name},
                    value=1
                )
        # Process the uploaded file, if any
        if audio_file:
            url = generate_unique_id()
            if os.path.getsize(audio_file.name) > MAX_FILE_SIZE:
                update_progress(
                    f"Uploaded file size exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB. Skipping this file.")
            else:
                try:
                    reencoded_mp3_path = reencode_mp3(audio_file.name)
                    if not os.path.exists(reencoded_mp3_path):
                        update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
                        return update_progress("Processing failed: Re-encoded file not found"), "", ""

                    temp_files.append(reencoded_mp3_path)

                    wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
                    if not os.path.exists(wav_file_path):
                        update_progress(f"Converted WAV file not found: {wav_file_path}")
                        return update_progress("Processing failed: Converted WAV file not found"), "", ""

                    temp_files.append(wav_file_path)
                    transcription = ""
                    if diarize:
                        segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True)
                    else:
                        segments = speech_to_text(wav_file_path, whisper_model=whisper_model)

                    if isinstance(segments, dict) and 'segments' in segments:
                        segments = segments['segments']

                    if isinstance(segments, list):
                        transcription = format_transcription_with_timestamps(segments)
                    else:
                        update_progress("Unexpected segments format received from speech_to_text.")
                        logging.error(f"Unexpected segments format: {segments}")

                    chunked_text = improved_chunking_process(transcription, chunk_options)

                    logging.debug(f"Audio Transcription API Name: {api_name}")
                    if api_name:
                        try:
                            summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
                            update_progress("Audio summarized successfully.")
                        except Exception as e:
                            logging.error(f"Error during summarization: {str(e)}")
                            summary = "Summary generation failed"
                    else:
                        summary = "No summary available (API not provided)"

                    all_transcriptions.append(transcription)
                    all_summaries.append(summary)

                    title = custom_title if custom_title else os.path.basename(wav_file_path)

                    add_media_with_keywords(
                        url="Uploaded File",
                        title=title,
                        media_type='audio',
                        content=transcription,
                        keywords=custom_keywords,
                        prompt=custom_prompt_input,
                        summary=summary,
                        transcription_model=whisper_model,
                        author="Unknown",
                        ingestion_date=datetime.now().strftime('%Y-%m-%d')
                    )
                    update_progress("Uploaded file processed and added to database.")
                    processed_count += 1
                    log_counter(
                        metric_name="audio_files_processed_total",
                        labels={"whisper_model": whisper_model, "api_name": api_name},
                        value=1
                    )
                except Exception as e:
                    update_progress(f"Error processing uploaded file: {str(e)}")
                    logging.error(f"Error processing uploaded file: {str(e)}")
                    failed_count += 1
                    log_counter(
                        metric_name="audio_files_failed_total",
                        labels={"whisper_model": whisper_model, "api_name": api_name},
                        value=1
                    )
                    return update_progress("Processing failed: Error processing uploaded file"), "", ""
        if not keep_original:
            cleanup_files()

        end_time = time.time()
        processing_time = end_time - start_time

        log_histogram(
            metric_name="audio_processing_time_seconds",
            value=processing_time,
            labels={"whisper_model": whisper_model, "api_name": api_name}
        )

        log_counter(
            metric_name="total_audio_files_processed",
            labels={"whisper_model": whisper_model, "api_name": api_name},
            value=processed_count
        )

        log_counter(
            metric_name="total_audio_files_failed",
            labels={"whisper_model": whisper_model, "api_name": api_name},
            value=failed_count
        )

        final_progress = update_progress("All processing complete.")
        final_transcriptions = "\n\n".join(all_transcriptions)
        final_summaries = "\n\n".join(all_summaries)

        return final_progress, final_transcriptions, final_summaries

    except Exception as e:
        logging.error(f"Error processing audio files: {str(e)}")
        log_counter(
            metric_name="audio_files_failed_total",
            labels={"whisper_model": whisper_model, "api_name": api_name},
            value=1
        )
        cleanup_files()
        return update_progress(f"Processing failed: {str(e)}"), "", ""


def format_transcription_with_timestamps(segments, keep_timestamps):
    """
    Formats the transcription segments with or without timestamps.

    Parameters:
        segments (list): List of transcription segments.
        keep_timestamps (bool): Whether to include timestamps.

    Returns:
        str: Formatted transcription.
    """
    if keep_timestamps:
        formatted_segments = []
        for segment in segments:
            start = segment.get('Time_Start', 0)
            end = segment.get('Time_End', 0)
            text = segment.get('Text', '').strip()
            formatted_segments.append(f"[{start:.2f}-{end:.2f}] {text}")
        return "\n".join(formatted_segments)
    else:
        return "\n".join([segment.get('Text', '').strip() for segment in segments])


def download_youtube_audio(url):
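    """
    Download the audio track of a YouTube video as an MP3.

    Uses yt_dlp to fetch the media into a temporary directory, extracts the
    audio with ffmpeg at 192 kbps, and moves the result into the persistent
    'downloads' directory.

    Returns:
        tuple: (file_path_or_None, status_message)
    """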
    try:
        # Determine the ffmpeg path based on the operating system
        ffmpeg_path = './Bin/ffmpeg.exe' if os.name == 'nt' else 'ffmpeg'

        # Work inside a temporary directory for the intermediate files
        with tempfile.TemporaryDirectory() as temp_dir:
            # Extract information about the video without downloading it
            with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
                info_dict = ydl.extract_info(url, download=False)
                sanitized_title = sanitize_filename(info_dict['title'])

            temp_video_path = Path(temp_dir) / f"{sanitized_title}_temp.mp4"
            temp_audio_path = Path(temp_dir) / f"{sanitized_title}.mp3"

            # Download the media, preferring an audio-only stream
            ydl_opts = {
                'format': 'bestaudio[ext=m4a]/best[height<=480]',
                'ffmpeg_location': ffmpeg_path,
                'outtmpl': str(temp_video_path),
                'noplaylist': True,
                'quiet': True
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])

            if not temp_video_path.exists():
                raise FileNotFoundError(f"Expected file was not found: {temp_video_path}")

            # Extract the audio track as a 192 kbps MP3
            ffmpeg_command = [
                ffmpeg_path,
                '-i', str(temp_video_path),
                '-vn',
                '-acodec', 'libmp3lame',
                '-b:a', '192k',
                str(temp_audio_path)
            ]
            subprocess.run(ffmpeg_command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

            if not temp_audio_path.exists():
                raise FileNotFoundError(f"Expected audio file was not found: {temp_audio_path}")

            # Move the MP3 out of the temporary directory before it is deleted
            persistent_dir = Path("downloads")
            persistent_dir.mkdir(exist_ok=True)
            persistent_file_path = persistent_dir / f"{sanitized_title}.mp3"
            os.replace(str(temp_audio_path), str(persistent_file_path))

            # Track the file for later cleanup
            downloaded_files.append(str(persistent_file_path))

            return str(persistent_file_path), f"Audio downloaded successfully: {sanitized_title}.mp3"
    except Exception as e:
        return None, f"Error downloading audio: {str(e)}"


def process_podcast(url, title, author, keywords, custom_prompt, api_name, api_key, whisper_model,
                    keep_original=False, enable_diarization=False, use_cookies=False, cookies=None,
                    chunk_method=None, max_chunk_size=300, chunk_overlap=0, use_adaptive_chunking=False,
                    use_multi_level_chunking=False, chunk_language='english', keep_timestamps=True):
    """
    Processes a podcast by downloading the audio, transcribing it, summarizing the transcription,
    and adding the results to the database. Metrics are logged throughout the process.

    Parameters:
        url (str): URL of the podcast.
        title (str): Title of the podcast.
        author (str): Author of the podcast.
        keywords (str): Comma-separated keywords.
        custom_prompt (str): Custom prompt for summarization.
        api_name (str): API name for summarization.
        api_key (str): API key for summarization.
        whisper_model (str): Whisper model to use for transcription.
        keep_original (bool): Whether to keep the original audio file.
        enable_diarization (bool): Whether to enable speaker diarization.
        use_cookies (bool): Whether to use cookies for authenticated downloads.
        cookies (str): JSON-formatted cookies string.
        chunk_method (str): Method for chunking text.
        max_chunk_size (int): Maximum size for each text chunk.
        chunk_overlap (int): Overlap size between chunks.
        use_adaptive_chunking (bool): Whether to use adaptive chunking.
        use_multi_level_chunking (bool): Whether to use multi-level chunking.
        chunk_language (str): Language for chunking.
        keep_timestamps (bool): Whether to keep timestamps in transcription.

    Returns:
        tuple: (progress_message, transcription, summary, title, author, keywords, error_message)
    """
    start_time = time.time()
    error_message = ""
    temp_files = []
    progress = []

    # Metric labels shared by every counter and histogram in this function
    labels = {
        "whisper_model": whisper_model,
        "api_name": api_name if api_name else "None"
    }
    def update_progress(message):
        """
        Updates the progress messages.

        Parameters:
            message (str): Progress message to append.

        Returns:
            str: Combined progress messages.
        """
        progress.append(message)
        return "\n".join(progress)

    def cleanup_files():
        if not keep_original:
            for file in temp_files:
                try:
                    if os.path.exists(file):
                        os.remove(file)
                        update_progress(f"Temporary file {file} removed.")
                except Exception as e:
                    update_progress(f"Failed to remove temporary file {file}: {str(e)}")
    try:
        # Parse the cookie string into a dict when cookie-based auth is requested
        if use_cookies:
            cookies = json.loads(cookies)

        audio_file = download_audio_file(url, whisper_model, use_cookies, cookies)
        if not audio_file:
            raise RuntimeError("Failed to download podcast audio.")
        temp_files.append(audio_file)
        update_progress("Podcast downloaded successfully.")

        # Fall back to extracted metadata for any fields the caller left blank
        metadata = extract_metadata(url)
        title = title or metadata.get('title', 'Unknown Podcast')
        author = author or metadata.get('uploader', 'Unknown Author')
        metadata_text = f"""
Metadata:
Title: {title}
Author: {author}
Series: {metadata.get('series', 'N/A')}
Episode: {metadata.get('episode', 'N/A')}
Season: {metadata.get('season', 'N/A')}
Upload Date: {metadata.get('upload_date', 'N/A')}
Duration: {metadata.get('duration', 'N/A')} seconds
Description: {metadata.get('description', 'N/A')}
"""

        # Derive extra keywords from series/episode/season metadata
        new_keywords = []
        if metadata.get('series'):
            new_keywords.append(f"series:{metadata['series']}")
        if metadata.get('episode'):
            new_keywords.append(f"episode:{metadata['episode']}")
        if metadata.get('season'):
            new_keywords.append(f"season:{metadata['season']}")

        keywords = f"{keywords},{','.join(new_keywords)}" if keywords else ','.join(new_keywords)
        update_progress(f"Metadata extracted - Title: {title}, Author: {author}, Keywords: {keywords}")
        try:
            if enable_diarization:
                segments = speech_to_text(audio_file, whisper_model=whisper_model, diarize=True)
            else:
                segments = speech_to_text(audio_file, whisper_model=whisper_model)

            def format_segment(segment):
                start = segment.get('start', 0)
                end = segment.get('end', 0)
                text = segment.get('Text', '')
                return f"[{start:.2f}-{end:.2f}] {text}"

            if isinstance(segments, dict) and 'segments' in segments:
                segments = segments['segments']

            if isinstance(segments, list):
                transcription = format_transcription_with_timestamps(segments, keep_timestamps)
                update_progress("Podcast transcribed successfully.")
            else:
                raise ValueError("Unexpected segments format received from speech_to_text.")

            if not transcription.strip():
                raise ValueError("Transcription is empty.")
        except Exception as e:
            error_message = f"Transcription failed: {str(e)}"
            raise RuntimeError(error_message)
        # Chunk the transcription for summarization
        chunk_options = {
            'method': chunk_method,
            'max_size': max_chunk_size,
            'overlap': chunk_overlap,
            'adaptive': use_adaptive_chunking,
            'multi_level': use_multi_level_chunking,
            'language': chunk_language
        }
        chunked_text = improved_chunking_process(transcription, chunk_options)

        # Combine the metadata and transcription into a single document for storage
        full_content = metadata_text + "\n\nTranscription:\n" + transcription
        # Summarize the chunked transcription when an API is configured
        summary = None
        if api_name:
            try:
                summary = perform_summarization(api_name, chunked_text, custom_prompt, api_key)
                update_progress("Podcast summarized successfully.")
            except Exception as e:
                error_message = f"Summarization failed: {str(e)}"
                raise RuntimeError(error_message)
        else:
            summary = "No summary available (API not provided)"

        # Add the podcast to the database
        try:
            add_media_with_keywords(
                url=url,
                title=title,
                media_type='podcast',
                content=full_content,
                keywords=keywords,
                prompt=custom_prompt,
                summary=summary or "No summary available",
                transcription_model=whisper_model,
                author=author,
                ingestion_date=datetime.now().strftime('%Y-%m-%d')
            )
            update_progress("Podcast added to database successfully.")
        except Exception as e:
            error_message = f"Error adding podcast to database: {str(e)}"
            raise RuntimeError(error_message)
        cleanup_files()

        end_time = time.time()
        processing_time = end_time - start_time

        log_counter(
            metric_name="podcasts_processed_total",
            labels=labels,
            value=1
        )
        log_histogram(
            metric_name="podcast_processing_time_seconds",
            value=processing_time,
            labels=labels
        )

        final_progress = update_progress("Processing complete.")
        return (final_progress, full_content, summary or "No summary generated.",
                title, author, keywords, error_message)

    except Exception as e:
        end_time = time.time()
        processing_time = end_time - start_time

        log_counter(
            metric_name="podcasts_failed_total",
            labels=labels,
            value=1
        )
        log_histogram(
            metric_name="podcast_processing_time_seconds",
            value=processing_time,
            labels=labels
        )

        logging.error(f"Error processing podcast: {str(e)}")
        cleanup_files()
        final_progress = update_progress(f"Processing failed: {str(e)}")
        return (final_progress, "", "", "", "", "", str(e))