import os
from pathlib import Path

import requests
import torch
from moviepy.editor import VideoFileClip
from transformers import pipeline

from settings import DATA_DIR, HF_API_URL


def convert_video_to_wav(video_path, output_path):
    """
    Converts a video file to a WAV audio file.

    Args:
        video_path (str): The path of the video file to be converted.
        output_path (str): The desired path for the output WAV audio file.

    Returns:
        None
    """
    video_clip = VideoFileClip(video_path)
    audio_clip = video_clip.audio
    if audio_clip is None:
        video_clip.close()
        raise ValueError(f"no audio track found in {video_path}")
    # write_audiofile expects a string path in older moviepy releases.
    audio_clip.write_audiofile(str(output_path))
    # Release the ffmpeg readers held by both clips.
    audio_clip.close()
    video_clip.close()
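
# Example usage (hypothetical file names):
#   convert_video_to_wav("lecture.mp4", "lecture.wav")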


def get_transcript1(filepath):
    """
    Transcribes a video file via the Hugging Face Inference API.

    The video is converted to a WAV file cached in DATA_DIR, then the raw
    audio bytes are POSTed to HF_API_URL. Requires a Hugging Face API token
    in the HF_KEY environment variable.
    """
    audio_file = Path(DATA_DIR) / (Path(filepath).stem + ".wav")
    # Reuse a previously extracted WAV file if one exists.
    if not audio_file.exists():
        convert_video_to_wav(filepath, audio_file)
    headers = {"Authorization": f"Bearer {os.environ['HF_KEY']}"}
    with open(audio_file, "rb") as f:
        data = f.read()
    response = requests.post(HF_API_URL, headers=headers, data=data)
    response.raise_for_status()
    return response.json()["text"]
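
# Example (assumes HF_KEY is set and HF_API_URL points at a Whisper-style
# speech-recognition endpoint that returns JSON with a "text" field):
#   transcript = get_transcript1("lecture.mp4")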


def get_transcript(url):
    """
    Transcribes an audio file to text with a local Whisper model. The
    pipeline produces timestamped chunks; only the concatenated text is
    returned.
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    pipe = pipeline(
        "automatic-speech-recognition", model="openai/whisper-base", device=device
    )

    # Transcribe in 30-second chunks; return_timestamps=True makes the
    # pipeline return a list of {"text": ..., "timestamp": (start, end)}.
    chunks = pipe(
        url,
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe"},
        chunk_length_s=30,
        batch_size=8,
        return_timestamps=True,
    )["chunks"]
    return "".join(chunk["text"] for chunk in chunks)
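

# Minimal usage sketch; the sample file name and the fallback logic are
# assumptions for illustration, not part of the original module.
if __name__ == "__main__":
    sample = "lecture.mp4"  # hypothetical input video
    try:
        # Remote path: Hugging Face Inference API (needs HF_KEY/HF_API_URL).
        print(get_transcript1(sample))
    except (KeyError, requests.HTTPError):
        # Local fallback: run Whisper via transformers on the extracted WAV.
        wav_path = Path(DATA_DIR) / (Path(sample).stem + ".wav")
        if not wav_path.exists():
            convert_video_to_wav(sample, wav_path)
        print(get_transcript(str(wav_path)))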