File size: 10,248 Bytes

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Unity.Sentis;
using Newtonsoft.Json;


//      Inference for MusicGen-300
//      ==========================
//
//  Details
//  -------
//  The model predicts 4 streams of codes staggered like this:
//  * * * a b c
//  * * a b c
//  * a b c 
//  a b c
//  Then aligns the streams so that it groups all the a's togther etc.

// Put sentis files and json file in Assets/StreamingAssets folder
// Put this script on the Main Camera object
// Put an audiosource on the Main Camera
// Press play and see console window for updates

// See https://github.com/huggingface/transformers/blob/main/src/transformers/models/musicgen/modeling_musicgen.py


public class RunMusicGen : MonoBehaviour
{
    //Change this prompt to whatever you like:
    string prompt = "80s pop track with bassy drums and synth";

    // number of seconds to create clip for (up to 30 seconds)
    const int seconds = 2;

    // Make this value smaller to make music more random
    float predictability = 1f;

    BackendType backendType = BackendType.GPUCompute;

    public AudioClip clip;
    IWorker toWavEngine, decoderEngine, textEngine, projectEngine;

    const int numCodeBooks = 4;

    // Special music decoder tokens
    const int DECODER_START_TOKEN = 2048;

    // Special text encoder tokens
    const int END_TEXT_TOKEN = 1;

    int decoderTokens; //text tokens

    List<int> tokensSoFar = new();
    TensorFloat encoder_hidden_states;
    TensorInt encoder_attention_mask, input_ids;
    Ops ops;
    Model decoder;

    // How much to stagger each code stream by wrt the next one
    int DELAY = 1;

    // Vocab list
    List<string> tokens = new List<string>(); 

    //The output frequency must be 32kHz
    const int outputFrequency = 32000;

    int maxFrames;

    List<int> TOKENS;

    int frame = 0;
    bool hasDecodedMusic = false;
    void Start()
    {
        ops = WorkerFactory.CreateOps(backendType, null);

        maxFrames = 50 * seconds + 3;
        
        LoadVocab(); 

        TOKENS = GetTokens(prompt);

        Debug.Log("Parsed tokens=\n" + string.Join(",", TOKENS));
        
        CreateAttentionMask();
        ParseText();
        LoadDecoderModel();
        
        SetupMusicCodeStreams();
        
        frame = 0;
    }

    void LoadDecoderModel()
    {
        decoder = ModelLoader.Load(Application.streamingAssetsPath + "/decoder.sentis");
        decoderEngine = WorkerFactory.CreateWorker(backendType, decoder);
    }

    void CreateAttentionMask()
    {
        int[] mask = new int[1 * decoderTokens];
        for (int i = 0; i < mask.Length; i++) mask[i] = 1;
        encoder_attention_mask = new TensorInt(new TensorShape(1, decoderTokens), mask);
    }

    void SetupMusicCodeStreams()
    {
        //Sets the staggered start codes 
        tokensSoFar.AddRange(new int[numCodeBooks * maxFrames]);
        for (int j = 0; j < maxFrames; j++)
        {
            for (int i = 0; i < numCodeBooks; i++)
            {
                if ( i * DELAY >= j)
                {
                    tokensSoFar[i * maxFrames + j] = DECODER_START_TOKEN;
                }
                else
                {
                    tokensSoFar[i * maxFrames + j] = -1;
                }
            }
        }
        input_ids = new TensorInt(new TensorShape(numCodeBooks, maxFrames), tokensSoFar.ToArray());
    }

    List<int> GetTokens(string text)
    {
        //split over whitespace
        string[] words = text.ToLower().Split(null);
        for (int i = 0; i < words.Length; i++) words[i] = " " + words[i];

        var ids = new List<int>();

        string s = "";

        foreach (var word in words)
        {
            int start = 0;
            for (int i = word.Length; i >= 0; i--)
            {
                string subword = word.Substring(start, i - start);
                int index = tokens.IndexOf(subword);
                if (index >= 0)
                {
                    ids.Add(index);
                    s += subword + " ";
                    if (i == word.Length) break;
                    start = i;
                    i = word.Length + 1;
                }
            }
        }

        ids.Add(END_TEXT_TOKEN);

        decoderTokens = ids.Count;

        Debug.Log("Tokenized sentece = " + s);

        return ids;
    }

    void ParseText()
    {
        Model textencoder = ModelLoader.Load(Application.streamingAssetsPath + "/textencoder.sentis");
        textEngine = WorkerFactory.CreateWorker(BackendType.GPUCompute, textencoder);

        Model project = ModelLoader.Load(Application.streamingAssetsPath + "/project768_1024.sentis");
        projectEngine = WorkerFactory.CreateWorker(BackendType.GPUCompute, project);
        
        using var input = new TensorInt(new TensorShape(1, decoderTokens), TOKENS.ToArray());

        var inputs = new Dictionary<string, Tensor>
        {
            {"input_ids", input },
            {"attention_mask", encoder_attention_mask }
        };
        textEngine.Execute(inputs);

        var output = textEngine.PeekOutput() as TensorFloat;

        //Convert vectors of size 768 to size 1024
        projectEngine.Execute(output);
        encoder_hidden_states = projectEngine.PeekOutput() as TensorFloat;
        encoder_hidden_states.TakeOwnership();
    }

    private class TokenizerData
    {
        public ModelData model;
    }
    private class ModelData
    {
        public object[][] vocab;
    }

    void LoadVocab()
    {
        var data = Newtonsoft.Json.JsonConvert.DeserializeObject<TokenizerData>(System.IO.File.ReadAllText(
            Application.streamingAssetsPath+"/tokenizer.json"
            ));
        for(int i = 0; i < data.model.vocab.Length; i++)
        {
            string tokenName = (string)data.model.vocab[i][0];
            tokens.Add(tokenName);
        }
    }

    // Update is called once per frame
    void Update()
    {
        if (frame < maxFrames)
        {
            GetOneMusicToken();
        }
        else if(!hasDecodedMusic)
        {
            hasDecodedMusic = true;
            DecodeMusic();
        }
        frame++;
    }

    void GetOneMusicToken()
    {
        var inputs = new Dictionary<string, Tensor>
        {
            {"input_ids", input_ids },
            {"encoder_hidden_states", encoder_hidden_states },
            {"encoder_attention_mask" , encoder_attention_mask }
        };

        decoderEngine.Execute(inputs);
        var decoderOutput = decoderEngine.PeekOutput() as TensorFloat;                                                                     
        using var dec2 = ops.Mul(decoderOutput, predictability);
        using var probs = ops.Softmax(dec2, 2);
        probs.MakeReadable();

        int OFFSET = 1;

        //Add new tokens to code streams
        for (int j = 0; j < numCodeBooks; j++)
        {
            if (frame < maxFrames - OFFSET)
            {
                int N = j * maxFrames + frame + OFFSET;

                if (tokensSoFar[N] != DECODER_START_TOKEN)
                {
                    tokensSoFar[N] = SelectRandomToken(probs, j, frame);
                }
            }
        }
        Replace(ref input_ids, new TensorInt(input_ids.shape, tokensSoFar.ToArray()));
        Debug.Log("Frame=" + frame + "/" + maxFrames);
    }

    int SelectRandomToken(TensorFloat probs,int j, int frame)
    {
        int numItems = probs.shape[2];
        float p = UnityEngine.Random.Range(0, 1f);
        float tot = 0;
        for(int i = 0; i < numItems; i++)
        {
            tot += probs[j, frame, i];
            if (p <= tot) return i;
        }
        return numItems - 1;
    }
    void LoadMusicTokensToWavModel()
    {
        if (toWavEngine != null) return;
        Model toWav = ModelLoader.Load(Application.streamingAssetsPath + "/encodec.sentis");
        toWavEngine = WorkerFactory.CreateWorker(BackendType.GPUCompute, toWav);
    }

    void DecodeMusic()
    {
        Debug.Log("Please wait while music is decoded...");
        LoadMusicTokensToWavModel();

        using var input2 = AlignCodeStreams(input_ids);
        using var wavTokens = input2.ShallowReshape(new TensorShape(1, 1, numCodeBooks, maxFrames - 3));
        
        toWavEngine.Execute(wavTokens);
        var output = toWavEngine.PeekOutput() as TensorFloat;
        output.MakeReadable();

        int numSamples = Mathf.Min(output.shape.length, outputFrequency * seconds);
        Debug.Log("Number of samples=" + numSamples + " / " + output.shape.length);
        clip = AudioClip.Create("music", numSamples, 1, outputFrequency, false);

        float[] wav = new float[numSamples];
        System.Array.Copy(output.ToReadOnlyArray(), wav, numSamples);
        clip.SetData(wav, 0);

        var audioSource = GetComponent<AudioSource>();
        if (audioSource != null)
        {
            audioSource.PlayOneShot(clip);
        }
        else
        {
            Debug.Log("You need to attach audio source to this object to hear the music");
        }
    }

    TensorInt AlignCodeStreams(TensorInt input)
    {
        if (DELAY == 0)
        {
            return ops.Copy(input);
        }
        using var input2 = ops.Cast(input, DataType.Float);
        TensorFloat[] B = new TensorFloat[4];
        for (int i = 0; i < 4; i++) {
            using TensorFloat A = ops.Slice(input2, new int[] { i }, new int[] { i + 1 }, new int[] { 0 }, new int[] { 1 }) as TensorFloat;
            B[i] = ops.Pad(A, new int[] { 0, -i, 0, i - 3 });
        }
        using var input3 = ops.Concat(B, 0) as TensorFloat;
        for(int i = 0; i < 4; i++)
        {
            B[i].Dispose();
        }
        return ops.Cast(input3, DataType.Int) as TensorInt;
    }

    void Replace<T>(ref T A, T B) where T:System.IDisposable
    {
        A?.Dispose();
        A = B;
    }

    private void OnDestroy()
    {
        input_ids?.Dispose();
        encoder_attention_mask?.Dispose();
        encoder_hidden_states?.Dispose();
        ops?.Dispose();
        decoderEngine?.Dispose();
        toWavEngine?.Dispose();
        projectEngine?.Dispose();
        textEngine?.Dispose();
    }
}