Spaces:

Teapack1
/

Assistant-Audio-Intent-Classification

Running

App Files Files Community

Teapack1 committed on Jan 2

Commit

183ee92

•

1 Parent(s): 8c837c9

Initial commit

Browse files

Files changed (7) hide show

README.md +2 -1
_app.py +0 -50
app.py +121 -48
requirements.txt +2 -2
static/script.js +49 -0
static/styles.css +73 -0
templates/index.html +55 -0

README.md CHANGED Viewed

@@ -1,10 +1,11 @@
 ---
-title: ASR W ZeroShotClassification Assistant
 emoji: 🦀
 colorFrom: red
 colorTo: pink
 sdk: gradio
 sdk_version: 4.7.1
 app_file: app.py
 pinned: false
 license: apache-2.0

 ---
+title: Smart Assistant - Audio Intent Classification
 emoji: 🦀
 colorFrom: red
 colorTo: pink
 sdk: gradio
 sdk_version: 4.7.1
+python_version: 3.10.4
 app_file: app.py
 pinned: false
 license: apache-2.0

_app.py DELETED Viewed

@@ -1,50 +0,0 @@
-from transformers import pipeline
-from transformers.pipelines.audio_utils import ffmpeg_microphone_live
-import torch
-import gradio as gr
-asr_model = "openai/whisper-tiny.en"
-nlp_model = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
-pipe = pipeline("automatic-speech-recognition", model=asr_model, device=device)
-sampling_rate = pipe.feature_extractor.sampling_rate
-chunk_length_s = 10  # how often returns the text
-stream_chunk_s = 1  # how often the microphone is checked for new audio
-mic = ffmpeg_microphone_live(
-    sampling_rate=sampling_rate,
-    chunk_length_s=chunk_length_s,
-    stream_chunk_s=stream_chunk_s,
-)
-def listen_print_loop(responses):
-    for response in responses:
-        if response["text"]:
-            print(response["text"], end="\r")
-            return response["text"]
-        if not response["partial"]:
-            print("")
-classifier = pipeline("zero-shot-classification", model=nlp_model)
-candidate_labels = ["dim the light", "turn on light fully", "turn off light fully", "raise the light", "nothing about light"]
-while True:
-    context = listen_print_loop(pipe(mic))
-    print(context)
-    output = classifier(context, candidate_labels, multi_label=False)
-    top_label = output['labels'][0]
-    top_score = output['scores'][0]
-    print(f"Top Prediction: {top_label} with a score of {top_score:.2f}")
-iface = gr.Interface(
-    fn=transcribe,
-    inputs=gr.inputs.Audio(source="microphone", type="filepath"),
-    outputs="text",
-    title="Real-Time ASR Transcription",
-    description="Speak into the microphone and get the real-time transcription."
-)
-iface.launch()

app.py CHANGED Viewed

@@ -1,59 +1,132 @@
-import gradio as gr
-from transformers import pipeline
 import numpy as np
-import time
-# Initialize the pipelines
-transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")
-classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")
-candidate_labels = ["dim the light", "turn on light fully", "turn off light fully", "raise the light", "not about lighting"]
-last_update_time = time.time() - 5  # Initialize with a value to ensure immediate first update
-# Buffer to hold the last updated values
-last_transcription = ""
-last_classification = ""
-def transcribe_and_classify(stream, new_chunk):
-    global last_update_time, last_transcription, last_classification
-    sr, y = new_chunk
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y))
-    # Concatenate new audio chunk to the stream
-    if stream is not None:
-        stream = np.concatenate([stream, y])
-    else:
-        stream = y
-    # Transcribe the last 10 seconds of audio
-    transcription = transcriber({"sampling_rate": sr, "task": "transcribe", "language": "english", "raw": stream})["text"]
-    last_transcription = transcription  # Update the buffer
-    # Classify the transcribed text
-    if transcription.strip():
-        output = classifier(transcription, candidate_labels, multi_label=False)
-        top_label = output['labels'][0]
-        top_score = output['scores'][0]
-        last_classification = f"{top_label.upper()}, score: {top_score:.2f}"
-    # Return the last updated transcription and classification
-    return stream, last_transcription, last_classification
-# Define the Gradio interface
-demo = gr.Interface(
-    fn=transcribe_and_classify,
-    inputs=[
-        "state",
-        gr.Audio(sources=["microphone"])
-    ],
-    outputs=[
-        "state",
-        "text",
-        "text"
-    ],
-)
-# Launch the demo
-demo.launch(debug=True, share=True)

+from fastapi import FastAPI, WebSocket, Request, WebSocketDisconnect
+from fastapi.staticfiles import StaticFiles
+from fastapi.responses import HTMLResponse
+from fastapi.templating import Jinja2Templates
+import os
 import numpy as np
+from transformers import pipeline
+import torch
+from transformers.pipelines.audio_utils import ffmpeg_microphone_live
+# Run inference on the first CUDA GPU when torch reports one, otherwise on the CPU.
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# Audio-classification pipeline whose label set launch_fn uses for wake-word detection.
+classifier = pipeline(
+    "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
+)
+# Audio-classification pipeline that listen and is_silence use to label the spoken query.
+intent_class_pipe = pipeline(
+    "audio-classification", model="anton-l/xtreme_s_xlsr_minds14", device=device
+)
+async def launch_fn(
+    wake_word="marvin",
+    prob_threshold=0.5,
+    chunk_length_s=2.0,
+    stream_chunk_s=0.25,
+    debug=False,
+):
+    if wake_word not in classifier.model.config.label2id.keys():
+        raise ValueError(
+            f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
+        )
+    sampling_rate = classifier.feature_extractor.sampling_rate
+    mic = ffmpeg_microphone_live(
+        sampling_rate=sampling_rate,
+        chunk_length_s=chunk_length_s,
+        stream_chunk_s=stream_chunk_s,
+    )
+    print("Listening for wake word...")
+    for prediction in classifier(mic):
+        prediction = prediction[0]
+        if debug:
+            print(prediction)
+        if prediction["label"] == wake_word:
+            if prediction["score"] > prob_threshold:
+                return True
+async def listen(websocket, chunk_length_s=2.0, stream_chunk_s=2.0):
+    sampling_rate = intent_class_pipe.feature_extractor.sampling_rate
+    mic = ffmpeg_microphone_live(
+        sampling_rate=sampling_rate,
+        chunk_length_s=chunk_length_s,
+        stream_chunk_s=stream_chunk_s,
+    )
+    audio_buffer = []
+    print("Listening")
+    for i in range(4):
+        audio_chunk = next(mic)
+        audio_buffer.append(audio_chunk["raw"])
+        prediction = intent_class_pipe(audio_chunk["raw"])
+        await websocket.send_text(f"chunk: {prediction[0]['label']} | {i+1} / 4")
+        if await is_silence(audio_chunk["raw"], threshold=0.7):
+            print("Silence detected, processing audio.")
+            break
+    combined_audio = np.concatenate(audio_buffer)
+    prediction = intent_class_pipe(combined_audio)
+    top_3_predictions = prediction[:3]
+    formatted_predictions = "\n".join([f"{pred['label']}: {pred['score'] * 100:.2f}%" for pred in top_3_predictions])
+    await websocket.send_text(f"classes: \n{formatted_predictions}")
+    return
+async def is_silence(audio_chunk, threshold):
+    silence = intent_class_pipe(audio_chunk)
+    if silence[0]["label"] == "silence" and silence[0]["score"] > threshold:
+        return True
+    else:
+        return False
+# Initialize FastAPI app
+app = FastAPI()
+# Set up static file directory
+app.mount("/static", StaticFiles(directory="static"), name="static")
+# Jinja2 Template for HTML rendering
+templates = Jinja2Templates(directory="templates")
+@app.get("/", response_class=HTMLResponse)
+async def get_home(request: Request):
+    return templates.TemplateResponse("index.html", {"request": request})
+@app.websocket("/ws")
+async def websocket_endpoint(websocket: WebSocket):
+    await websocket.accept()
+    try:
+        process_active = False  # Flag to track the state of the process
+        while True:
+            message = await websocket.receive_text()
+            if message == "start" and not process_active:
+                process_active = True
+                await websocket.send_text("Listening for wake word...")
+                wake_word_detected = await launch_fn(debug=True)
+                if wake_word_detected:
+                    await websocket.send_text("Wake word detected. Listening for your query...")
+                    await listen(websocket)
+                    process_active = False  # Reset the process flag
+            elif message == "stop":
+                if process_active:
+                    # Implement logic to stop the ongoing process
+                    # This might involve setting a flag that your launch_fn and listen functions check
+                    process_active = False
+                    await websocket.send_text("Process stopped. Ready to restart.")
+                    break  # Or keep the loop running if you want to allow restarting without reconnecting
+    except WebSocketDisconnect:
+        print("Client disconnected.")

requirements.txt CHANGED Viewed

@@ -2,5 +2,5 @@ torch
 transformers
 torchaudio
 numpy
-sentencepiece
-gradio

 transformers
 torchaudio
 numpy
+fastapi
+uvicorn[standard]

static/script.js ADDED Viewed

	@@ -0,0 +1,49 @@

+let ws;
+let isRecording = false;
+function toggleRecording() {
+    if (!isRecording) {
+        ws = new WebSocket("ws://localhost:8000/ws");
+        ws.onopen = () => ws.send("start");
+        ws.onmessage = (event) => {
+            const serverMessage = event.data;
+            if (serverMessage.startsWith("chunk:")) {
+                const chunkText = serverMessage.substring(6); // Remove "chunk:" prefix
+                document.getElementById('audio-chunks').innerText = chunkText;
+            } else if (serverMessage === "Restarting system...") {
+                isRecording = false;
+                updateButton();
+            } else {
+                document.getElementById('results').innerText = serverMessage;
+            }
+        };
+        isRecording = true;
+    } else {
+        ws.send("stop");
+        isRecording = false;
+    }
+    updateButton();
+}
+function updateButton() {
+    const startButton = document.getElementById('startBtn');
+    if (isRecording) {
+        startButton.innerText = "Stop";
+        startButton.className = "stop-button";
+    } else {
+        startButton.innerText = "Start";
+        startButton.className = "start-button";
+    }
+}
+document.getElementById('startBtn').addEventListener('click', toggleRecording);
+document.getElementById('toggleClassListBtn').addEventListener('click', function() {
+    var classList = document.getElementById('class-list');
+    if (classList.style.display === "none") {
+        classList.style.display = "block";
+    } else {
+        classList.style.display = "none";
+    }
+});

static/styles.css ADDED Viewed

	@@ -0,0 +1,73 @@

+body, html {
+    margin: 0;
+    padding: 0;
+    font-family: Arial, sans-serif;
+    background-color: #eaeff2;
+}
+.container {
+    text-align: center;
+    margin-top: 50px;
+}
+header {
+    background-color: #007bff;
+    color: white;
+    padding: 20px 0;
+}
+main {
+    background-color: #ffffff;
+    padding: 20px;
+    margin: 20px auto;
+    border-radius: 10px;
+    box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+    width: 80%;
+    max-width: 800px;
+}
+section {
+    margin-bottom: 20px;
+}
+h1, h2 {
+    margin-bottom: 10px;
+}
+button {
+    background-color: #28a745;
+    color: white;
+    border: none;
+    padding: 10px 15px;
+    font-size: 16px;
+    border-radius: 5px;
+    cursor: pointer;
+}
+button:hover {
+    background-color: #218838;
+}
+#results {
+    padding: 20px;
+    background-color: #f4f4f4;
+    border: 1px solid #cccccc;
+    border-radius: 5px;
+}
+.start-button {
+    background-color: #28a745; /* Green */
+    /* other styling */
+}
+.stop-button {
+    background-color: #dc3545; /* Red */
+    /* other styling */
+}
+#audio-chunks {
+    padding: 20px;
+    background-color: #f4f4f4;
+    border: 1px solid #cccccc;
+    border-radius: 5px;
+}

templates/index.html ADDED Viewed

	@@ -0,0 +1,55 @@

+<!DOCTYPE html>
+<html>
+<head>
+    <title>ML Audio Demo</title>
+    <link rel="stylesheet" type="text/css" href="/static/styles.css">
+</head>
+<body>
+    <div class="container">
+        <header>
+        </header>
+        <main>
+            <h1>Audio Intent Classification Demo</h1>
+            <section id="recording-section">
+                <p class="recording-instructions">The system is activated by pressing Start and calling a name <i>Marvin</i>.</p>
+                <p class="recording-instructions">After that the system listens to an audio query and classifies its intention.</p>
+                <p class="recording-instructions">Model is trained on <i>PolyAI/minds14</i>. Dataset covers 14 intents extracted from a commercial system in the e-banking domain.</p>
+                <h3>Start the System:</h3>
+                <button id="startBtn">Start</button>
+                <div id="restart-button-container"></div>
+            </section>
+            <section id="audio-chunks-section">
+                <h3>Partial Predictions:</h3>
+                <div id="audio-chunks">Partial results will appear here...</div>
+            </section>
+            <section id="results-section">
+                <h3>Final Result:</h3>
+                <div id="results"><b>Intent classification will appear here...</b></div>
+            </section>
+            <section id="class-list-section">
+                <button id="toggleClassListBtn">see all classes</button>
+                <div id="class-list" style="display: none;">
+                    <p>1.abroad</p>
+                    <p>2.address</p>
+                    <p>3.app_error</p>
+                    <p>4.atm_limit</p>
+                    <p>5.balance</p>
+                    <p>6.business_loan</p>
+                    <p>7.card_issues</p>
+                    <p>8.cash_deposit</p>
+                    <p>9.direct_debit</p>
+                    <p>10.freeze</p>
+                    <p>11.high_value_payment</p>
+                    <p>12.joint_account</p>
+                    <p>13.latest_transactions</p>
+                    <p>14.pay_bill</p>
+                </div>
+            </section>
+        </main>
+    </div>
+    <script src="/static/script.js?v=8"></script>
+</body>
+</html>