Spaces:

mlnsio
/

videoChat

Sleeping

File size: 1,735 Bytes

import functools
import os
from pathlib import Path

import requests
import torch
from moviepy.editor import VideoFileClip
from transformers import pipeline

from settings import HF_API_URL, DATA_DIR

def convert_video_to_wav(video_path, output_path):
    """
    Converts a video file to a WAV audio file.

    Args:
        video_path (str): The path of the video file to be converted.
        output_path (str or os.PathLike): The desired path for the output
            WAV audio file.

    Returns:
        None

    Raises:
        ValueError: If the video contains no audio track to extract.
    """
    video_clip = VideoFileClip(video_path)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            # The clip exposes ``audio`` as None when the file has no audio
            # stream; fail with a clear message instead of an AttributeError.
            raise ValueError(f"No audio track found in video: {video_path}")
        # str() so callers may pass a pathlib.Path as well as a plain string.
        audio_clip.write_audiofile(str(output_path))
    finally:
        # Always release the reader resources held by the clip, even when
        # the export fails part-way through.
        video_clip.close()

def get_transcript1(filepath, timeout=(10, 300)):
    """
    Transcribes a video by sending its audio track to the API at HF_API_URL.

    The audio is cached as ``<DATA_DIR>/<video stem>.wav``; the video is only
    converted when that file does not exist yet.

    Args:
        filepath (str): The path of the video file to transcribe.
        timeout (float or tuple): Timeout passed to ``requests.post``, either
            a single number of seconds or a ``(connect, read)`` pair.

    Returns:
        The value of the ``"text"`` field of the JSON response.

    Raises:
        KeyError: If the ``HF_KEY`` environment variable is not set, or the
            response JSON has no ``"text"`` field.
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    # Read the credential first so a missing key fails before the (slow)
    # video-to-audio conversion is attempted.
    headers = {"Authorization": f"Bearer {os.environ['HF_KEY']}"}

    audio_file = Path(DATA_DIR).joinpath(Path(filepath).stem + ".wav")
    print(audio_file)
    if not audio_file.exists():
        # Make sure the target directory exists before writing into it.
        audio_file.parent.mkdir(parents=True, exist_ok=True)
        convert_video_to_wav(filepath, audio_file)

    with open(audio_file, "rb") as f:
        data = f.read()
    response = requests.post(HF_API_URL, headers=headers,
                             data=data, timeout=timeout)

    if not response.ok:
        # Show the raw error body for diagnosis, then raise a proper HTTP
        # error instead of failing later on a missing "text" key.
        print(response, response.text)
        response.raise_for_status()

    payload = response.json()  # parse the body once and reuse it
    print(response, payload)
    return payload["text"]

@functools.lru_cache(maxsize=1)
def _load_asr_pipeline():
    """
    Builds the Whisper speech-recognition pipeline once and reuses it.

    Loading the model is expensive, so the pipeline is cached after the first
    call. CUDA device 0 is used when available, otherwise the CPU.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    return pipeline(
        "automatic-speech-recognition", model="openai/whisper-base", device=device
    )


def get_transcript(url):
    """
    Converts an audio file to text using the openai/whisper-base model.

    The pipeline is asked for timestamped chunks, but only the chunk texts
    are returned, concatenated in order; the timestamps are discarded.

    Args:
        url: The audio input handed directly to the speech-recognition
            pipeline (presumably a local file path or URL; verify against
            callers).

    Returns:
        str: The concatenated text of all transcribed chunks.
    """
    asr = _load_asr_pipeline()

    chunks = asr(
        url,
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe"},
        chunk_length_s=30,
        batch_size=8,
        return_timestamps=True,
    )["chunks"]
    return "".join(chunk["text"] for chunk in chunks)