import os
from pathlib import Path

import requests
import torch
from moviepy.editor import VideoFileClip
from transformers import pipeline

from settings import HF_API_URL, DATA_DIR


def convert_video_to_wav(video_path, output_path):
    """
    Converts a video file to a WAV audio file.

    Args:
        video_path (str): The path of the video file to be converted.
        output_path (str): The desired path for the output WAV audio file.

    Returns:
        None
    """
    video_clip = VideoFileClip(video_path)
    audio_clip = video_clip.audio
    audio_clip.write_audiofile(output_path)
    video_clip.close()  # release the ffmpeg reader once the audio is written


def get_transcript1(filepath):
    """
    Transcribes a video file via the hosted inference API at HF_API_URL.

    The video is first converted to a WAV file cached under DATA_DIR (the
    conversion is skipped if the WAV already exists), then the raw audio
    bytes are POSTed with the HF_KEY bearer token.
    """
    audio_file = Path(DATA_DIR).joinpath(Path(filepath).stem + ".wav")
    print(audio_file)
    if not audio_file.exists():
        convert_video_to_wav(filepath, str(audio_file))
    headers = {"Authorization": f"Bearer {os.environ['HF_KEY']}"}
    with open(audio_file, "rb") as f:
        data = f.read()
    response = requests.post(HF_API_URL, headers=headers, data=data)
    response.raise_for_status()  # fail loudly on API errors instead of a KeyError below
    payload = response.json()
    print(response, payload)
    return payload["text"]
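

# Usage sketch for the hosted-API path above (assumptions: HF_API_URL in
# settings.py points at a Whisper-style ASR inference endpoint, HF_KEY holds
# a valid token, and the input path below is hypothetical):
#
#   os.environ["HF_KEY"] = "hf_..."              # or export it in the shell
#   text = get_transcript1("videos/lecture.mp4")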


def get_transcript(url):
    """
    Transcribes an audio file (local path or URL) to text, chunking long
    audio into 30-second windows with timestamps.
    """
    # Run Whisper locally to turn the audio into timestamped text chunks.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    pipe = pipeline(
        "automatic-speech-recognition", model="openai/whisper-base", device=device
    )
    chunks = pipe(
        url,
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe"},
        chunk_length_s=30,
        batch_size=8,
        return_timestamps=True,
    )["chunks"]
    # Concatenate the chunk texts into a single transcript.
    return "".join(chunk["text"] for chunk in chunks)
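

# Minimal end-to-end sketch of the local path, assuming a sample video
# exists under DATA_DIR (the filename below is hypothetical): extract the
# audio track to WAV, then transcribe it with the local Whisper pipeline.
if __name__ == "__main__":
    video = Path(DATA_DIR) / "sample.mp4"
    wav = Path(DATA_DIR) / "sample.wav"
    convert_video_to_wav(str(video), str(wav))
    print(get_transcript(str(wav)))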