import time
import torch
import torchaudio
import noisereduce as nr
from models.nllb import nllb_translate

def translate(model_nllb, tokenizer_nllb, text, target_lang):
    """Translate `text` into `target_lang` with the NLLB model, timing the call."""
    print("Processing translation...")
    start_time = time.time()
    translation = nllb_translate(model_nllb, tokenizer_nllb, text, target_lang)
    print("Translation:", translation)
    print("Translation time:", time.time() - start_time)
    return translation

def just_inference(model, original_path, output_dir, text, lang):
    """Stream XTTS inference for `text` in `lang`, conditioned on the speaker in
    `original_path`, and save the resulting waveform to `output_dir` (a full
    .wav file path, despite the name)."""
    print("Inference...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    path_to_save = output_dir
    t0 = time.time()

    try:
        # Load the audio
        print("Loading audio...")
        wav, sr = torchaudio.load(original_path)
        print(f"Loaded audio with sample rate: {sr}")

        wav = wav.squeeze().numpy()  # assumes mono input; a multi-channel file stays 2-D here
        print(f"Audio shape after squeezing: {wav.shape}")

        # Apply noise reduction
        print("Applying noise reduction...")
        reduced_noise_audio = nr.reduce_noise(y=wav, sr=sr)
        reduced_noise_audio = torch.tensor(reduced_noise_audio).unsqueeze(0)
        print(f"Reduced noise audio shape: {reduced_noise_audio.shape}")

        # Move the reduced noise audio to the correct device.
        # NOTE: the denoised tensor is never consumed below; get_conditioning_latents
        # reads the original file from disk (see the sketch after that call).
        reduced_noise_audio = reduced_noise_audio.to(device)

        print("Getting conditioning latents...")
        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[original_path])
        print("Got conditioning latents.")

        print("Starting inference stream...")
        chunks = model.inference_stream(
            text,
            lang,
            gpt_cond_latent,
            speaker_embedding,
            stream_chunk_size=15,
            speed=0.95
        )
        print("Inference stream started.")

        full_audio = torch.Tensor().to(device)
        for i, chunk in enumerate(chunks):
            try:
                if i == 0:  # first chunk (enumerate starts at 0, not 1)
                    time_to_first_chunk = time.time() - t0
                    print(f"Time to first chunk: {time_to_first_chunk}")
                full_audio = torch.cat((full_audio, chunk.squeeze().to(device)), dim=-1)
                print(f"Processed chunk {i}, chunk shape: {chunk.shape}")
            except Exception as e:
                print(f"Error processing chunk {i}: {e}")
                raise

        # Move full_audio to CPU before saving
        full_audio = full_audio.cpu()

        print(f"Saving full audio to {path_to_save}...")
        torchaudio.save(path_to_save, full_audio.unsqueeze(0), 24000)
        print("Audio saved.")

        print("Inference finished")
        return full_audio

    except Exception as e:
        print(f"Error during processing: {e}")
        raise
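
# Usage sketch (illustrative; the variable names and language codes below are
# assumptions, not defined in this module): `model` would be a loaded Coqui XTTS
# model exposing get_conditioning_latents/inference_stream, and `model_nllb` /
# `tokenizer_nllb` would come from the same loader that models.nllb expects.
#
#   translated = translate(model_nllb, tokenizer_nllb, "Hello, world.", "fra_Latn")
#   just_inference(model, "speaker.wav", "out/output.wav", translated, "fr")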