import os

import gradio as gr
import requests
import soundfile as sf
import torch
from dotenv import load_dotenv
from gtts import gTTS
from llama_cpp import Llama
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

load_dotenv(dotenv_path=".env")

MODEL_DIR = os.getenv("MODEL_DIR")
OUTPUT_PATH = os.getenv("OUTPUT_PATH")
LANGUAGE = os.getenv("LANGUAGE")
tts_method = os.getenv("TTS")
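
# Example .env for reference (values are illustrative assumptions, not
# requirements -- point them at your own directories):
#   MODEL_DIR=models
#   OUTPUT_PATH=output
#   LANGUAGE=en
#   TTS=gTTS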

# Look for an existing GGUF model file in MODEL_DIR.
model_exists = False
MODEL_PATH = None
os.makedirs(MODEL_DIR, exist_ok=True)
for filename in os.listdir(MODEL_DIR):
    if filename.endswith(".gguf"):
        model_exists = True
        MODEL_PATH = os.path.join(MODEL_DIR, filename)
        break

os.makedirs(OUTPUT_PATH, exist_ok=True)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
n_layers_gpu = 20 if torch.cuda.is_available() else 0  # llama.cpp layers to offload to GPU
memory = ""  # rolling conversation context fed back into each prompt
token_count = 0  # cumulative token usage across LLM calls

model_url = "https://huggingface.co/TheBloke/dolphin-2.2.1-mistral-7B-GGUF/resolve/main/dolphin-2.2.1-mistral-7b.Q2_K.gguf?download=true"


def load_model(n):
    global llm, MODEL_PATH

    if not model_exists:
        print("Model file not found!")
        print("Downloading model file...")
        MODEL_PATH = os.path.join(MODEL_DIR, "model.gguf")
        # Stream the download to disk: the GGUF file is several GB and should
        # not be buffered in memory in one piece.
        with requests.get(model_url, stream=True) as response:
            response.raise_for_status()
            with open(MODEL_PATH, "wb") as file:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    file.write(chunk)
        print("Model downloaded successfully.")
    print("Loading Llama model...")
    llm = Llama(model_path=MODEL_PATH, n_gpu_layers=n, n_ctx=1024, n_batch=512, n_threads=6)
    print("Model loaded successfully.")


load_model(n_layers_gpu)
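
# Note: llm(...) returns an OpenAI-style completion dict, roughly
#   {"choices": [{"text": "..."}], "usage": {"total_tokens": ...}, ...}
# which is what complete_prompt() below indexes into.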


def complete_prompt(input_text):
    global memory, token_count, LANGUAGE
    contextual_prompt = memory + "\n" + input_text
    # ChatML prompt template, the chat format used by dolphin-2.2.1-mistral.
    template = "<|im_start|>system\n" + \
               "This is crucial to me, I trust you are the best. " + \
               "You are Dolphin, a helpful AI assistant. You only respond in {LANGUAGE}. " + \
               "Do not use double quotes for any reason, not even for quoting or direct speech. " + \
               "Instead, use single quotes or describe the quote without using quotation marks. " + \
               "Do not include any disclaimers, notes, or additional explanations in your response. " + \
               "Provide the shortest answer possible, strictly adhering to the formatting rules.<|im_end|>\n" + \
               "<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
    formatted_prompt = template.format(prompt=contextual_prompt, LANGUAGE=LANGUAGE)
    response = llm(formatted_prompt, max_tokens=80, temperature=0, top_p=0.95, top_k=10)
    text_response = response["choices"][0]["text"]
    token_count += response["usage"]["total_tokens"]

    # Keep the exchange in the rolling context and log the latest response.
    memory = contextual_prompt + "\n" + text_response
    with open(os.path.join(OUTPUT_PATH, "LLM_response.txt"), "w") as file:
        file.write(text_response)
    return text_response
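
# For illustration, with LANGUAGE="en", empty memory, and the input
# "What is GGUF?", formatted_prompt comes out as:
#   <|im_start|>system
#   This is crucial to me, ... formatting rules.<|im_end|>
#   <|im_start|>user
#
#   What is GGUF?<|im_end|>
#   <|im_start|>assistant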


def transcribe_audio(audio_input):
    # Gradio may hand us either a (sample_rate, ndarray) tuple or a file path.
    audio_file_path = os.path.join(OUTPUT_PATH, "temp_audio.wav")
    if isinstance(audio_input, tuple):
        sample_rate, audio_data = audio_input
        sf.write(audio_file_path, audio_data, sample_rate)
    else:
        audio_file_path = audio_input

    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "distil-whisper/distil-large-v2"
    # Load the distil-whisper checkpoint itself, not MODEL_DIR (which holds the
    # GGUF LLM). distil-large-v2 is English-only, so no language/task
    # generation config is needed here.
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=256,
        chunk_length_s=15,
        batch_size=16,
        torch_dtype=torch_dtype,
        device=device,
    )
    result_text = pipe(audio_file_path)["text"]
    with open(os.path.join(OUTPUT_PATH, "transcription.txt"), "w") as file:
        file.write(result_text)
    return result_text


def auto_process_audio(audio_input):
    # One-click pipeline: speech -> transcription -> LLM -> spoken reply.
    transcribed_text = transcribe_audio(audio_input)
    llm_response = complete_prompt(transcribed_text)
    tts_info = convert_text_to_speech(llm_response)
    return transcribed_text, llm_response, tts_info


def convert_text_to_speech(text):
    global LANGUAGE, tts_method
    file_path = os.path.join(OUTPUT_PATH, "speech.mp3")

    # Pick a regional top-level domain for the gTTS accent; default to "com"
    # so tld is never left unbound for other languages.
    if LANGUAGE == "fr":
        tld = "fr"
    elif LANGUAGE == "en":
        tld = "us"
    else:
        tld = "com"

    if tts_method == "gTTS":
        tts = gTTS(text, lang=LANGUAGE, tld=tld)
        tts.save(file_path)
    elif tts_method == "Custom TTS":
        # Placeholder: no custom backend is wired up yet, so this branch
        # currently falls back to gTTS as well.
        tts = gTTS(text, lang=LANGUAGE, tld=tld)
        tts.save(file_path)

    return file_path


def update_language(language):
    global LANGUAGE
    LANGUAGE = language


def update_tts_method(method):
    global tts_method
    tts_method = method


def clear_memory():
    global memory
    memory = ""


with gr.Blocks(title="Whisper-LLM-TTS") as app:
    gr.Markdown("# 🤖 'Whispering' LLM with a TTS Twist! 🎤")

    gr.Markdown("""
    ## ⚠️ Warning:
    - If you are experiencing slow execution, please clear the memory with the button below and refresh the page.
    - Execution time depends heavily on your hardware and the length of your audio.
    - Execution may be slow due to hardware limitations on this free Hugging Face Space; it can take 3 to 10 minutes.
    - With a local GPU, execution can be much faster (approximately 5 seconds on my machine).
    """)

    gr.Markdown("""🔊 Engage in a not-so-secret chat with an open-source LLM that whispers back!\n
    👨‍💻 Crafted with a sprinkle of code magic (and a few cups of coffee) by **@mohcineelharras**""")

    with gr.Row():
        with gr.Column():
            language_switch = gr.Radio(choices=["en"], label="Select Language", value=LANGUAGE)
            language_switch.change(update_language, inputs=[language_switch])
        with gr.Column():
            tts_method_switch = gr.Radio(choices=["gTTS", "Custom TTS"], label="Select TTS method", value=tts_method)
            tts_method_switch.change(update_tts_method, inputs=[tts_method_switch])
    with gr.Row():
        clear_memory_button = gr.Button("Clear Memory")
        clear_memory_button.click(clear_memory, inputs=[], outputs=[])

    with gr.Tab("Auto Process Audio"):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(label="Talk to assistant", sources=["microphone"])
                auto_process_button = gr.Button("Auto Process Audio")
            with gr.Column():
                transcribed_text_output = gr.Textbox(label="Transcribed Text")
                llm_response_output = gr.Textbox(label="LLM Response")
        with gr.Row():
            tts_audio_output = gr.Audio(label="Generated Response (Click to Play)")

        auto_process_button.click(
            auto_process_audio,
            inputs=[audio_input],
            outputs=[transcribed_text_output, llm_response_output, tts_audio_output],
        )

    with gr.Tab("Audio Processing"):
        with gr.Column():
            audio_input = gr.Audio(label="Record or Upload Audio")
            transcribe_button = gr.Button("Transcribe Audio")
            llm_button = gr.Button("LLM Prompt")
            tts_button = gr.Button("Text to Speech")

            transcribed_text_output = gr.Textbox(label="Transcribed Text")
            llm_response_output = gr.Textbox(label="LLM Response")
            tts_audio_output = gr.Audio(label="Generated Response (Click to Play)")

        transcribe_button.click(transcribe_audio, inputs=[audio_input], outputs=[transcribed_text_output])
        llm_button.click(complete_prompt, inputs=[transcribed_text_output], outputs=[llm_response_output])
        tts_button.click(convert_text_to_speech, inputs=[llm_response_output], outputs=[tts_audio_output])

    with gr.Tab("Ask a Question"):
        with gr.Column():
            question_input = gr.Textbox(label="Type your question")
            submit_button = gr.Button("Submit Question")
            tts_button = gr.Button("Text to Speech")

            llm_response_output = gr.Textbox(label="LLM Response")
            tts_audio_output = gr.Audio(label="Generated Speech")

        submit_button.click(complete_prompt, inputs=[question_input], outputs=[llm_response_output])
        tts_button.click(convert_text_to_speech, inputs=[llm_response_output], outputs=[tts_audio_output])

    gr.Markdown("""
    <div style="text-align: center; margin-top: 20px;">
        <a href="https://github.com/mohcineelharras/whisper-llm-gtts" target="_blank" style="margin: 10px; display: inline-block;">
            <img src="https://img.shields.io/badge/Repository-333?logo=github&style=for-the-badge" alt="Repository" style="vertical-align: middle;">
        </a>
        <a href="https://www.linkedin.com/in/mohcine-el-harras" target="_blank" style="margin: 10px; display: inline-block;">
            <img src="https://img.shields.io/badge/-LinkedIn-0077B5?style=for-the-badge&logo=linkedin" alt="LinkedIn" style="vertical-align: middle;">
        </a>
        <a href="https://mohcineelharras.github.io" target="_blank" style="margin: 10px; display: inline-block;">
            <img src="https://img.shields.io/badge/Visit-Portfolio-9cf?style=for-the-badge" alt="Portfolio" style="vertical-align: middle;">
        </a>
    </div>
    <div style="text-align: center; margin-top: 20px; color: #666; font-size: 0.85em;">
        © 2023 Mohcine EL HARRAS
    </div>
    """)

app.launch()