yuekai committed on
Commit
6fdf6ff
1 Parent(s): a063960

Upload folder using huggingface_hub

Files changed (26)
  1. model_repo_whisper_qwen_trtllm/tensorrt_llm/1/.gitkeep +0 -0
  2. model_repo_whisper_qwen_trtllm/tensorrt_llm/1/model.py +947 -0
  3. model_repo_whisper_qwen_trtllm/tensorrt_llm/config.pbtxt +577 -0
  4. model_repo_whisper_qwen_trtllm/tensorrt_llm/config.template +577 -0
  5. model_repo_whisper_qwen_trtllm/whisper/0/__pycache__/fbank.cpython-310.pyc +0 -0
  6. model_repo_whisper_qwen_trtllm/whisper/0/__pycache__/model.cpython-310.pyc +0 -0
  7. model_repo_whisper_qwen_trtllm/whisper/0/__pycache__/whisper_trtllm.cpython-310.pyc +0 -0
  8. model_repo_whisper_qwen_trtllm/whisper/0/fbank.py +91 -0
  9. model_repo_whisper_qwen_trtllm/whisper/0/mel_filters.npz +3 -0
  10. model_repo_whisper_qwen_trtllm/whisper/0/model.py +346 -0
  11. model_repo_whisper_qwen_trtllm/whisper/0/whisper_trtllm.py +278 -0
  12. model_repo_whisper_qwen_trtllm/whisper/1/__pycache__/fbank.cpython-310.pyc +0 -0
  13. model_repo_whisper_qwen_trtllm/whisper/1/__pycache__/model.cpython-310.pyc +0 -0
  14. model_repo_whisper_qwen_trtllm/whisper/1/__pycache__/whisper_trtllm.cpython-310.pyc +0 -0
  15. model_repo_whisper_qwen_trtllm/whisper/1/fbank.py +91 -0
  16. model_repo_whisper_qwen_trtllm/whisper/1/mel_filters.npz +3 -0
  17. model_repo_whisper_qwen_trtllm/whisper/1/model.py +318 -0
  18. model_repo_whisper_qwen_trtllm/whisper/1/whisper_trtllm.py +212 -0
  19. model_repo_whisper_qwen_trtllm/whisper/2/__pycache__/fbank.cpython-310.pyc +0 -0
  20. model_repo_whisper_qwen_trtllm/whisper/2/__pycache__/model.cpython-310.pyc +0 -0
  21. model_repo_whisper_qwen_trtllm/whisper/2/__pycache__/whisper_trtllm.cpython-310.pyc +0 -0
  22. model_repo_whisper_qwen_trtllm/whisper/2/fbank.py +91 -0
  23. model_repo_whisper_qwen_trtllm/whisper/2/mel_filters.npz +3 -0
  24. model_repo_whisper_qwen_trtllm/whisper/2/model.py +346 -0
  25. model_repo_whisper_qwen_trtllm/whisper/2/whisper_trtllm.py +278 -0
  26. model_repo_whisper_qwen_trtllm/whisper/config.pbtxt +61 -0
model_repo_whisper_qwen_trtllm/tensorrt_llm/1/.gitkeep ADDED
File without changes
model_repo_whisper_qwen_trtllm/tensorrt_llm/1/model.py ADDED
@@ -0,0 +1,947 @@
1
+ import datetime
2
+ import json
3
+ import os
4
+ import sys
5
+ import time
6
+ from random import randint
7
+ from threading import Lock, Thread
8
+
9
+ import numpy as np
10
+ import torch
11
+ import triton_python_backend_utils as pb_utils
12
+ from torch import from_numpy
13
+ from torch.utils.dlpack import from_dlpack
14
+
15
+ import tensorrt_llm.bindings.executor as trtllm
16
+
17
+
18
+ def get_input_tensor_by_name(request,
19
+ name,
20
+ expected_batch_size=None,
21
+ batch_index=None):
22
+ tensor = pb_utils.get_input_tensor_by_name(request, name)
23
+ if tensor is None:
24
+ return None
25
+
26
+ if tensor.is_cpu():
27
+ tensor = tensor.as_numpy()
28
+ else:
29
+ tensor = from_dlpack(tensor.to_dlpack())
30
+
31
+ if expected_batch_size is not None and tensor.shape[
32
+ 0] != expected_batch_size:
33
+ raise pb_utils.TritonModelException(
34
+ f"Expected batch size doesn't match batch size for tensor {name}. Expected {expected_batch_size} got {tensor.shape[0]}"
35
+ )
36
+
37
+ if batch_index is not None and expected_batch_size is not None and batch_index >= expected_batch_size:
38
+ raise pb_utils.TritonModelException(
39
+ f"Invalid batch index in get_input_tensor_by_name for {name}")
40
+
41
+ if batch_index is not None:
42
+ # Add leading 1 batch dimension
43
+ if isinstance(tensor, np.ndarray):
44
+ return np.expand_dims(tensor[batch_index], axis=0)
45
+ elif isinstance(tensor, torch.Tensor):
46
+ return torch.unsqueeze(tensor[batch_index], dim=0)
47
+ else:
48
+ return tensor
49
+
50
+
51
+ def get_input_scalar_by_name(request,
52
+ name,
53
+ expected_batch_size=1,
54
+ batch_index=0):
55
+ tensor = pb_utils.get_input_tensor_by_name(request, name)
56
+ if tensor is None:
57
+ return None
58
+ tensor = tensor.as_numpy()
59
+
60
+ if tensor.size != expected_batch_size:
61
+ raise pb_utils.TritonModelException(
62
+ f"Expected a scalar tensor for tensor {name}")
63
+
64
+ return tensor.item(batch_index)
65
+
66
+
67
+ def read_parameter_as_type(value, name, pytype=str):
68
+ if value == "":
69
+ return None
70
+ if value.startswith("${") and value.endswith("}"):
71
+ return None
72
+ if pytype is bool:
73
+ return value.lower() in ["1", "true"]
74
+ try:
75
+ result = pytype(value)
76
+ return result
77
+ except:
78
+ pb_utils.Logger.log_warning(
79
+ f"Could not read parameter '{name}' with value '{value}', will use default."
80
+ )
81
+ return None
82
+
83
+
84
+ def get_parameter(model_config, name, pytype=str):
85
+ if name not in model_config['parameters']:
86
+ return None
87
+ return read_parameter_as_type(
88
+ model_config['parameters'][name]['string_value'], name, pytype)
89
+
90
+
91
+ def convert_word_list(word_list):
92
+ if word_list is None:
93
+ return None
94
+ word_list = word_list.tolist()
95
+ if len(word_list) == 0 or len(word_list[0]) != 2:
96
+ raise pb_utils.TritonModelException(f"Invalid format for word list.")
97
+ words, indices = word_list[0]
98
+ result = []
99
+ current_index = 0
100
+ for i in indices:
101
+ if i == -1:
102
+ continue
103
+ if i > len(words):
104
+ raise pb_utils.TritonModelException(
105
+ f"Invalid format for word list.")
106
+ current_word = []
107
+ while current_index < i:
108
+ current_word.append(words[current_index])
109
+ current_index += 1
110
+ result.append(current_word)
111
+ return result
112
+
113
+
114
+ def parse_medusa_choices(medusa_choices):
115
+ if medusa_choices is None:
116
+ return None
117
+ try:
118
+ result = json.loads(
119
+ "[" + medusa_choices.replace("{", "[").replace("}", "]") + "]")
120
+ assert isinstance(result, list) and len(result) > 0
121
+ assert all([isinstance(x, list) for x in result])
122
+ assert all([isinstance(y, int) for x in result for y in x])
123
+ except Exception:
124
+ raise pb_utils.TritonModelException(
125
+ "Invalid format for medusa_choices")
126
+ return result
127
+
128
+
129
+ def get_sampling_config_from_request(request, batch_size=1, batch_index=0):
130
+ kwargs = {}
131
+ kwargs['beam_width'] = get_input_scalar_by_name(
132
+ request, 'beam_width', batch_size, batch_index) or 1
133
+ kwargs['top_k'] = get_input_scalar_by_name(request, 'runtime_top_k',
134
+ batch_size, batch_index)
135
+ kwargs['top_p'] = get_input_scalar_by_name(request, 'runtime_top_p',
136
+ batch_size, batch_index)
137
+ kwargs['top_p'] = None if kwargs['top_p'] is None or kwargs[
138
+ 'top_p'] <= 0 else kwargs['top_p']
139
+ kwargs['random_seed'] = get_input_scalar_by_name(request, 'random_seed',
140
+ batch_size, batch_index)
141
+ kwargs['temperature'] = get_input_scalar_by_name(request, 'temperature',
142
+ batch_size, batch_index)
143
+ kwargs['min_length'] = get_input_scalar_by_name(request, 'min_length',
144
+ batch_size, batch_index)
145
+ kwargs['repetition_penalty'] = get_input_scalar_by_name(
146
+ request, 'repetition_penalty', batch_size, batch_index)
147
+ kwargs['presence_penalty'] = get_input_scalar_by_name(
148
+ request, 'presence_penalty', batch_size, batch_index)
149
+ kwargs['frequency_penalty'] = get_input_scalar_by_name(
150
+ request, 'frequency_penalty', batch_size, batch_index)
151
+ kwargs['length_penalty'] = get_input_scalar_by_name(
152
+ request, 'len_penalty', batch_size, batch_index)
153
+ kwargs['top_p_min'] = get_input_scalar_by_name(request,
154
+ 'runtime_top_p_min',
155
+ batch_size, batch_index)
156
+ kwargs['top_p_reset_ids'] = get_input_scalar_by_name(
157
+ request, 'runtime_top_p_reset_ids', batch_size, batch_index)
158
+ kwargs['top_p_decay'] = get_input_scalar_by_name(request,
159
+ 'runtime_top_p_decay',
160
+ batch_size, batch_index)
161
+ kwargs['beam_search_diversity_rate'] = get_input_scalar_by_name(
162
+ request, 'beam_search_diversity_rate', batch_size, batch_index)
163
+ kwargs['early_stopping'] = get_input_scalar_by_name(
164
+ request, 'early_stopping', batch_size, batch_index)
165
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
166
+ return trtllm.SamplingConfig(**kwargs)
167
+
168
+
169
+ def get_output_config_from_request(request,
170
+ exclude_input_from_output,
171
+ batch_size=1,
172
+ batch_index=0):
173
+ kwargs = {}
174
+ kwargs["return_log_probs"] = get_input_scalar_by_name(
175
+ request, 'return_log_probs', batch_size, batch_index)
176
+ kwargs["return_context_logits"] = get_input_scalar_by_name(
177
+ request, 'return_context_logits', batch_size, batch_index)
178
+ kwargs["return_generation_logits"] = get_input_scalar_by_name(
179
+ request, 'return_generation_logits', batch_size, batch_index)
180
+ kwargs["exclude_input_from_output"] = exclude_input_from_output
181
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
182
+ return trtllm.OutputConfig(**kwargs)
183
+
184
+
185
+ def get_external_draft_tokens_config_from_request(request,
186
+ batch_size=1,
187
+ batch_index=0):
188
+ kwargs = {}
189
+ draft_input_ids = get_input_tensor_by_name(request, 'draft_input_ids',
190
+ batch_size, batch_index)
191
+ if draft_input_ids is not None:
192
+ kwargs['tokens'] = draft_input_ids[0].tolist()
193
+ draft_logits = get_input_tensor_by_name(request, 'draft_logits',
194
+ batch_size, batch_index)
195
+ if draft_logits is not None:
196
+ kwargs['logits'] = from_numpy(draft_logits).squeeze()
197
+ kwargs['acceptance_threshold'] = get_input_scalar_by_name(
198
+ request, 'draft_acceptance_threshold', batch_size, batch_index)
199
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
200
+ if len(kwargs) > 0:
201
+ return trtllm.ExternalDraftTokensConfig(**kwargs)
202
+ return None
203
+
204
+
205
+ def get_prompt_tuning_config_from_request(request,
206
+ batch_size=1,
207
+ batch_index=0):
208
+ # prompt_vocab_size is unused by executor.
209
+ kwargs = {}
210
+ prompt_embedding_table = get_input_tensor_by_name(
211
+ request, 'prompt_embedding_table', batch_size, batch_index)
212
+ if prompt_embedding_table is not None:
213
+ if isinstance(prompt_embedding_table, np.ndarray):
214
+ kwargs["embedding_table"] = from_numpy(
215
+ prompt_embedding_table).squeeze()
216
+ elif isinstance(prompt_embedding_table, torch.Tensor):
217
+ kwargs["embedding_table"] = from_dlpack(
218
+ prompt_embedding_table.to_dlpack()).squeeze(dim=0)
219
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
220
+ if len(kwargs) > 0:
221
+ return trtllm.PromptTuningConfig(**kwargs)
222
+ return None
223
+
224
+
225
+ def get_lora_config_from_request(request, batch_size=1, batch_index=0):
226
+ kwargs = {}
227
+ kwargs["task_id"] = get_input_scalar_by_name(request, 'lora_task_id',
228
+ batch_size, batch_index)
229
+ lora_weights = get_input_tensor_by_name(request, 'lora_weights',
230
+ batch_size, batch_index)
231
+ if lora_weights is not None:
232
+ kwargs["weights"] = from_numpy(lora_weights).squeeze()
233
+ lora_config = get_input_tensor_by_name(request, 'lora_config', batch_size,
234
+ batch_index)
235
+ if lora_config is not None:
236
+ kwargs["config"] = from_numpy(lora_config).squeeze()
237
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
238
+ if len(kwargs) > 0:
239
+ return trtllm.LoraConfig(**kwargs)
240
+ return None
241
+
242
+
243
+ def convert_request(request, exclude_input_from_output, decoupled):
244
+ inputs = {}
245
+ input_token_ids = get_input_tensor_by_name(request, 'input_ids')
246
+ if input_token_ids is None:
247
+ raise pb_utils.TritonModelException(
248
+ "A value is required for input_ids")
249
+ if len(input_token_ids.shape) != 2:
250
+ raise pb_utils.TritonModelException(f"Invalid format for input_ids")
251
+ batch_size = input_token_ids.shape[0]
252
+ requests = []
253
+ for batch_index in range(0, batch_size):
254
+ input_token_ids = get_input_tensor_by_name(request, 'input_ids',
255
+ batch_size, batch_index)[0]
256
+ if input_token_ids is None:
257
+ raise pb_utils.TritonModelException(
258
+ "A value is required for input_ids")
259
+ input_token_ids = input_token_ids.tolist()
260
+ if len(input_token_ids) == 0:
261
+ raise pb_utils.TritonModelException(
262
+ f"Invalid format for input_ids")
263
+
264
+ input_length = get_input_scalar_by_name(request, 'input_lengths',
265
+ batch_size, batch_index)
266
+ if input_length is None:
267
+ input_length = len(input_token_ids)
268
+ # Trim input token ids with input_lengths
269
+ inputs['input_token_ids'] = input_token_ids[0:input_length]
270
+
271
+ inputs['max_new_tokens'] = get_input_scalar_by_name(
272
+ request, 'request_output_len', batch_size, batch_index)
273
+ if inputs['max_new_tokens'] is None:
274
+ raise pb_utils.TritonModelException(
275
+ "A value is required for request_output_len")
276
+ inputs['streaming'] = get_input_scalar_by_name(request, 'streaming',
277
+ batch_size, batch_index)
278
+ if inputs['streaming'] and not decoupled:
279
+ raise pb_utils.TritonModelException(
280
+ "Streaming is only supported in decoupled mode.")
281
+ inputs['end_id'] = get_input_scalar_by_name(request, 'end_id',
282
+ batch_size, batch_index)
283
+ inputs['pad_id'] = get_input_scalar_by_name(request, 'pad_id',
284
+ batch_size, batch_index)
285
+ inputs['stop_words'] = convert_word_list(
286
+ get_input_tensor_by_name(request, 'stop_words_list', batch_size,
287
+ batch_index))
288
+ inputs['bad_words'] = convert_word_list(
289
+ get_input_tensor_by_name(request, 'bad_words_list', batch_size,
290
+ batch_index))
291
+ embedding_bias = get_input_tensor_by_name(request, 'embedding_bias',
292
+ batch_size, batch_index)
293
+ if embedding_bias is not None and embedding_bias.size != 0:
294
+ inputs['embedding_bias'] = from_numpy(embedding_bias).squeeze()
295
+
296
+ sampling_config = get_sampling_config_from_request(
297
+ request, batch_size, batch_index)
298
+ output_config = get_output_config_from_request(
299
+ request, exclude_input_from_output, batch_size, batch_index)
300
+ external_draft_tokens_config = get_external_draft_tokens_config_from_request(
301
+ request, batch_size, batch_index)
302
+ prompt_tuning_config = get_prompt_tuning_config_from_request(
303
+ request, batch_size, batch_index)
304
+ lora_config = get_lora_config_from_request(request, batch_size,
305
+ batch_index)
306
+
307
+ requests.append(
308
+ trtllm.Request(
309
+ **inputs,
310
+ sampling_config=sampling_config,
311
+ output_config=output_config,
312
+ external_draft_tokens_config=external_draft_tokens_config,
313
+ prompt_tuning_config=prompt_tuning_config,
314
+ lora_config=lora_config,
315
+ ))
316
+ return requests
317
+
318
+
319
+ def convert_response(response, batch_index):
320
+ if response.has_error():
321
+ return pb_utils.InferenceResponse(output_tensors=[],
322
+ error=pb_utils.TritonError(
323
+ response.error_msg)), True
324
+ result = response.result
325
+ beam_lengths = np.expand_dims(
326
+ np.array([len(beam) for beam in result.output_token_ids], np.int32), 0)
327
+ max_beam_length = max([len(beam) for beam in result.output_token_ids])
328
+ output_ids = np.full((1, len(result.output_token_ids), max_beam_length),
329
+ -1, np.int32)
330
+ for idx, beam in enumerate(result.output_token_ids):
331
+ output_ids[0, idx, :len(beam)] = beam
332
+ output_tensors = [
333
+ pb_utils.Tensor("output_ids", output_ids),
334
+ pb_utils.Tensor("sequence_length", beam_lengths),
335
+ ]
336
+ output_tensors.append(
337
+ pb_utils.Tensor(
338
+ "cum_log_probs",
339
+ np.expand_dims(np.array(result.cum_log_probs, np.float32), 0)
340
+ if result.cum_log_probs is not None else np.zeros(
341
+ (1, 1), np.float32)))
342
+ output_tensors.append(
343
+ pb_utils.Tensor(
344
+ "output_log_probs",
345
+ np.expand_dims(np.array(result.log_probs, np.float32), 0) if
346
+ result.log_probs is not None else np.zeros((1, 1, 1), np.float32)))
347
+ output_tensors.append(
348
+ pb_utils.Tensor(
349
+ "context_logits",
350
+ np.expand_dims(np.array(result.context_logits, np.float32), 0)
351
+ if result.context_logits is not None else np.zeros(
352
+ (1, 1, 1), np.float32)))
353
+ output_tensors.append(
354
+ pb_utils.Tensor(
355
+ "generation_logits",
356
+ np.expand_dims(np.array(result.generation_logits, np.float32), 0)
357
+ if result.generation_logits is not None else np.zeros(
358
+ (1, 1, 1, 1), np.float32)))
359
+ output_tensors.append(
360
+ pb_utils.Tensor("batch_index",
361
+ np.expand_dims(np.array([batch_index], np.int32), 0)))
362
+
363
+ return pb_utils.InferenceResponse(output_tensors), result.is_final
364
+
365
+
366
+ def convert_scheduler_policy(batch_scheduler_policy: str):
367
+ if batch_scheduler_policy.lower() == "max_utilization":
368
+ return trtllm.CapacitySchedulerPolicy.MAX_UTILIZATION
369
+ elif batch_scheduler_policy.lower() == "guaranteed_no_evict":
370
+ return trtllm.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
371
+ raise pb_utils.TritonModelException(
372
+ f"batch_scheduler_policy value of '{batch_scheduler_policy}' is not supported."
373
+ )
374
+
375
+
376
+ def convert_batching_type(gpt_model_type: str):
377
+ if gpt_model_type is None:
378
+ return None
379
+ if gpt_model_type.lower(
380
+ ) == "inflight_fused_batching" or gpt_model_type.lower(
381
+ ) == "inflight_batching":
382
+ return trtllm.BatchingType.INFLIGHT
383
+ elif gpt_model_type.lower() == "v1":
384
+ return trtllm.BatchingType.STATIC
385
+ raise pb_utils.TritonModelException(
386
+ f"gpt_model_type value of '{gpt_model_type}' is not supported.")
387
+
388
+
389
+ def convert_decoding_mode(decoding_mode: str):
390
+ if decoding_mode is None:
391
+ return None
392
+ elif decoding_mode == "auto":
393
+ return trtllm.DecodingMode.Auto()
394
+ elif decoding_mode == "top_k":
395
+ return trtllm.DecodingMode.TopK()
396
+ elif decoding_mode == "top_p":
397
+ return trtllm.DecodingMode.TopP()
398
+ elif decoding_mode == "top_k_top_p":
399
+ return trtllm.DecodingMode.TopKTopP()
400
+ elif decoding_mode == "beam_search":
401
+ return trtllm.DecodingMode.BeamSearch()
402
+ elif decoding_mode == "medusa":
403
+ return trtllm.DecodingMode.Medusa()
404
+ raise pb_utils.TritonModelException(
405
+ f"decoding_mode value of '{decoding_mode}' is not supported.")
406
+
407
+
408
+ def convert_timestamp_to_seconds(timestamp: str):
409
+ return int(
410
+ datetime.datetime.strptime(timestamp,
411
+ "%m-%d-%Y %H:%M:%S.%f").timestamp())
412
+
413
+
414
+ class TritonPythonModel:
415
+ """Your Python model must use the same class name. Every Python model
416
+ that is created must have "TritonPythonModel" as the class name.
417
+ """
418
+
419
+ def get_scheduler_config(self, model_config):
420
+ batch_scheduler_policy = get_parameter(model_config,
421
+ "batch_scheduler_policy")
422
+ if batch_scheduler_policy is None:
423
+ return trtllm.SchedulerConfig()
424
+ return trtllm.SchedulerConfig(
425
+ convert_scheduler_policy(batch_scheduler_policy))
426
+
427
+ def get_kv_cache_config(self, model_config):
428
+ kwargs = {
429
+ "enable_block_reuse":
430
+ get_parameter(model_config, "enable_kv_cache_reuse", bool),
431
+ "max_tokens":
432
+ get_parameter(model_config, "max_tokens_in_paged_kv_cache", int),
433
+ "sink_token_length":
434
+ get_parameter(model_config, "sink_token_length", int),
435
+ "free_gpu_memory_fraction":
436
+ get_parameter(model_config, "kv_cache_free_gpu_mem_fraction",
437
+ float),
438
+ "host_cache_size":
439
+ get_parameter(model_config, "kv_cache_host_memory_bytes", int),
440
+ "onboard_blocks":
441
+ get_parameter(model_config, "kv_cache_onboard_blocks", bool),
442
+ }
443
+ max_attention_window_size = get_parameter(model_config,
444
+ "max_attention_window_size")
445
+ if max_attention_window_size:
446
+ kwargs["max_attention_window"] = [
447
+ int(x) for x in max_attention_window_size.split(",")
448
+ ]
449
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
450
+ return trtllm.KvCacheConfig(**kwargs)
451
+
452
+ def get_parallel_config(self, model_config):
453
+ kwargs = {}
454
+ gpu_device_ids = get_parameter(model_config, "gpu_device_ids")
455
+ if gpu_device_ids:
456
+ kwargs["device_ids"] = [int(x) for x in gpu_device_ids.split(",")]
457
+ self.use_orchestrator_mode = os.environ.get("TRTLLM_ORCHESTRATOR",
458
+ "0") == "1"
459
+ if self.use_orchestrator_mode:
460
+ kwargs[
461
+ "communication_mode"] = trtllm.CommunicationMode.ORCHESTRATOR
462
+ worker_path = get_parameter(model_config, "worker_path")
463
+ if worker_path is not None:
464
+ raise pb_utils.TritonModelException(
465
+ "worker_path parameter is specified, but this is no longer supported. Please specify executor_worker_path instead to specify the location of the trtllmExecutorWorker executable."
466
+ )
467
+ executor_worker_path = get_parameter(model_config,
468
+ "executor_worker_path")
469
+ kwargs["orchestrator_config"] = trtllm.OrchestratorConfig(
470
+ True, executor_worker_path)
471
+ if len(kwargs) > 0:
472
+ return trtllm.ParallelConfig(**kwargs)
473
+ return None
474
+
475
+ def get_peft_cache_config(self, model_config):
476
+ kwargs = {
477
+ "optimal_adapter_size":
478
+ get_parameter(model_config, "lora_cache_optimal_adapter_size",
479
+ int),
480
+ "max_adapter_size":
481
+ get_parameter(model_config, "lora_cache_max_adapter_size", int),
482
+ "device_cache_percent":
483
+ get_parameter(model_config, "lora_cache_gpu_memory_fraction",
484
+ float),
485
+ "host_cache_size":
486
+ get_parameter(model_config, "lora_cache_host_memory_bytes", int),
487
+ }
488
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
489
+ return trtllm.PeftCacheConfig(**kwargs)
490
+
491
+ def get_decoding_config(self, model_config):
492
+ kwargs = {
493
+ "medusa_choices":
494
+ parse_medusa_choices(get_parameter(model_config,
495
+ "medusa_choices")),
496
+ "decoding_mode":
497
+ convert_decoding_mode(get_parameter(model_config,
498
+ "decoding_mode")),
499
+ }
500
+ print(kwargs)
501
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
502
+ return trtllm.DecodingConfig(**kwargs)
503
+
504
+ def get_extended_runtime_perf_knob_config(self, model_config):
505
+ kwargs = {
506
+ "multi_block_mode":
507
+ get_parameter(model_config, "multi_block_mode", bool),
508
+ "enable_context_fmha_fp32_acc":
509
+ get_parameter(model_config, "enable_context_fmha_fp32_acc", bool)
510
+ }
511
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
512
+ return trtllm.ExtendedRuntimePerfKnobConfig(**kwargs)
513
+
514
+ def get_executor_config(self, model_config):
515
+ kwargs = {
516
+ "max_beam_width":
517
+ get_parameter(model_config, "max_beam_width", int),
518
+ "scheduler_config":
519
+ self.get_scheduler_config(model_config),
520
+ "kv_cache_config":
521
+ self.get_kv_cache_config(model_config),
522
+ "enable_chunked_context":
523
+ get_parameter(model_config, "enable_chunked_context", bool),
524
+ "normalize_log_probs":
525
+ get_parameter(model_config, "normalize_log_probs", bool),
526
+ "batching_type":
527
+ convert_batching_type(get_parameter(model_config,
528
+ "gpt_model_type")),
529
+ "parallel_config":
530
+ self.get_parallel_config(model_config),
531
+ "peft_cache_config":
532
+ self.get_peft_cache_config(model_config),
533
+ "decoding_config":
534
+ self.get_decoding_config(model_config),
535
+ "max_queue_size":
536
+ model_config.get(
537
+ "dynamic_batching",
538
+ {},
539
+ ).get(
540
+ "default_queue_policy",
541
+ {},
542
+ ).get("max_queue_size"),
543
+ "extended_runtime_perf_knob_config":
544
+ self.get_extended_runtime_perf_knob_config(model_config)
545
+ }
546
+ kwargs = {k: v for k, v in kwargs.items() if v is not None}
547
+ return trtllm.ExecutorConfig(**kwargs)
548
+
549
+ def create_metrics(self, model: str, version: str, is_v1_model: bool):
550
+ self.request_metric_family = pb_utils.MetricFamily(
551
+ name="nv_trt_llm_request_metrics",
552
+ description="TRT LLM request metrics",
553
+ kind=pb_utils.MetricFamily.GAUGE,
554
+ )
555
+ self.runtime_memory_metric_family = pb_utils.MetricFamily(
556
+ name="nv_trt_llm_runtime_memory_metrics",
557
+ description="TRT LLM runtime memory metrics",
558
+ kind=pb_utils.MetricFamily.GAUGE,
559
+ )
560
+ self.kv_cache_metric_family = pb_utils.MetricFamily(
561
+ name="nv_trt_llm_kv_cache_block_metrics",
562
+ description="TRT LLM KV cache block metrics",
563
+ kind=pb_utils.MetricFamily.GAUGE,
564
+ )
565
+ model_type = "v1" if is_v1_model else "inflight_batcher"
566
+ self.model_type_metric_family = pb_utils.MetricFamily(
567
+ name=f"nv_trt_llm_{model_type}_metrics",
568
+ description=f"TRT LLM {model_type}-specific metrics",
569
+ kind=pb_utils.MetricFamily.GAUGE,
570
+ )
571
+ self.general_metric_family = pb_utils.MetricFamily(
572
+ name="nv_trt_llm_general_metrics",
573
+ description="General TRT LLM metrics",
574
+ kind=pb_utils.MetricFamily.GAUGE,
575
+ )
576
+ common_labels = {"model": model, "version": version}
577
+ self.all_metrics = {
578
+ # Request metrics
579
+ "num_active_requests":
580
+ self.request_metric_family.Metric(labels={
581
+ "request_type": "active",
582
+ **common_labels
583
+ }),
584
+ "max_num_active_requests":
585
+ self.request_metric_family.Metric(labels={
586
+ "request_type": "max",
587
+ **common_labels
588
+ }),
589
+ "num_scheduled_requests":
590
+ self.request_metric_family.Metric(labels={
591
+ "request_type": "scheduled",
592
+ **common_labels
593
+ }),
594
+ "num_context_requests":
595
+ self.request_metric_family.Metric(labels={
596
+ "request_type": "context",
597
+ **common_labels
598
+ }),
599
+ # Runtime metrics
600
+ "cpu_mem_usage":
601
+ self.runtime_memory_metric_family.Metric(labels={
602
+ "memory_type": "cpu",
603
+ **common_labels
604
+ }),
605
+ "gpu_mem_usage":
606
+ self.runtime_memory_metric_family.Metric(labels={
607
+ "memory_type": "gpu",
608
+ **common_labels
609
+ }),
610
+ "pinned_mem_usage":
611
+ self.runtime_memory_metric_family.Metric(labels={
612
+ "memory_type": "pinned",
613
+ **common_labels
614
+ }),
615
+ # KV cache metrics
616
+ "max_num_blocks":
617
+ self.kv_cache_metric_family.Metric(labels={
618
+ "kv_cache_block_type": "max",
619
+ **common_labels
620
+ }),
621
+ "free_num_blocks":
622
+ self.kv_cache_metric_family.Metric(labels={
623
+ "kv_cache_block_type": "free",
624
+ **common_labels
625
+ }),
626
+ "used_num_blocks":
627
+ self.kv_cache_metric_family.Metric(labels={
628
+ "kv_cache_block_type": "used",
629
+ **common_labels
630
+ }),
631
+ "tokens_per_block":
632
+ self.kv_cache_metric_family.Metric(labels={
633
+ "kv_cache_block_type": "tokens_per",
634
+ **common_labels
635
+ }),
636
+ # General metrics
637
+ "timestamp":
638
+ self.general_metric_family.Metric(labels={
639
+ "general_type": "timestamp",
640
+ **common_labels
641
+ }),
642
+ "iter":
643
+ self.general_metric_family.Metric(labels={
644
+ "general_type": "iteration_counter",
645
+ **common_labels
646
+ }),
647
+ }
648
+ if is_v1_model:
649
+ self.all_metrics.update({
650
+ "num_ctx_tokens":
651
+ self.model_type_metric_family.Metric(labels={
652
+ "v1_specific_metric": "total_context_tokens",
653
+ **common_labels
654
+ }),
655
+ "num_gen_tokens":
656
+ self.model_type_metric_family.Metric(
657
+ labels={
658
+ "v1_specific_metric": "total_generation_tokens",
659
+ **common_labels
660
+ }),
661
+ "empty_gen_slots":
662
+ self.model_type_metric_family.Metric(
663
+ labels={
664
+ "v1_specific_metric": "empty_generation_slots",
665
+ **common_labels
666
+ }),
667
+ })
668
+ else:
669
+ self.all_metrics.update({
670
+ "num_ctx_tokens":
671
+ self.model_type_metric_family.Metric(
672
+ labels={
673
+ "inflight_batcher_specific_metric":
674
+ "total_context_tokens",
675
+ **common_labels
676
+ }),
677
+ "num_gen_requests":
678
+ self.model_type_metric_family.Metric(
679
+ labels={
680
+ "inflight_batcher_specific_metric":
681
+ "generation_requests",
682
+ **common_labels
683
+ }),
684
+ "micro_batch_id":
685
+ self.model_type_metric_family.Metric(
686
+ labels={
687
+ "inflight_batcher_specific_metric": "micro_batch_id",
688
+ **common_labels
689
+ }),
690
+ "num_paused_requests":
691
+ self.model_type_metric_family.Metric(
692
+ labels={
693
+ "inflight_batcher_specific_metric": "paused_requests",
694
+ **common_labels
695
+ }),
696
+ })
697
+
698
+ def initialize(self, args):
699
+ """`initialize` is called only once when the model is being loaded.
700
+ Implementing `initialize` function is optional. This function allows
701
+ the model to initialize any state associated with this model.
702
+
703
+ Parameters
704
+ ----------
705
+ args : dict
706
+ Both keys and values are strings. The dictionary keys and values are:
707
+ * model_config: A JSON string containing the model configuration
708
+ * model_instance_kind: A string containing model instance kind
709
+ * model_instance_device_id: A string containing model instance device ID
710
+ * model_repository: Model repository path
711
+ * model_version: Model version
712
+ * model_name: Model name
713
+ """
714
+ model_config = json.loads(args['model_config'])
715
+ gpt_model_path = get_parameter(model_config, "gpt_model_path")
716
+ if get_parameter(model_config, "enable_trt_overlap", bool):
717
+ raise pb_utils.TritonModelException(
718
+ f"enable_trt_overlap=true is not supported.")
719
+ self.exclude_input_from_output = get_parameter(
720
+ model_config, "exclude_input_in_output", bool)
721
+ executor_config = self.get_executor_config(model_config)
722
+ self.executor = trtllm.Executor(gpt_model_path,
723
+ trtllm.ModelType.DECODER_ONLY,
724
+ executor_config)
725
+ self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
726
+ model_config)
727
+ self.cancellation_check_period_ms = get_parameter(
728
+ model_config, "cancellation_check_period_ms", int) or 100
729
+ self.stats_check_period_ms = get_parameter(
730
+ model_config, "stats_check_period_ms", int) or 100
731
+
732
+ if not self.decoupled:
733
+ raise pb_utils.TritonModelException(
734
+ "Please enable decoupled transaction policy in the model configuration to serve this model"
735
+ )
736
+
737
+ self.create_metrics(args["model_name"],
738
+ args["model_version"],
739
+ is_v1_model=executor_config.batching_type ==
740
+ trtllm.BatchingType.STATIC)
741
+ self.triton_user_id_to_req_ids = {}
742
+ self.triton_req_id_to_req_ids = {}
743
+ self.req_id_to_request_data = {}
744
+ self.lock = Lock()
745
+ self.running = False
746
+ self.awaiter_thread = Thread(target=self.awaiter_loop)
747
+ self.cancellation_thread = Thread(target=self.cancellation_loop)
748
+ self.metrics_thread = Thread(target=self.metrics_loop)
749
+ if self.executor.can_enqueue_requests():
750
+ self.running = True
751
+ self.awaiter_thread.start()
752
+ self.cancellation_thread.start()
753
+ self.metrics_thread.start()
754
+ else:
755
+ # In leader mode, worker ranks will wait here until leader is done.
756
+ self.executor.shutdown()
757
+
758
+ def handle_stop_request(self, triton_user_id, response_sender):
759
+ if triton_user_id is None or triton_user_id == "":
760
+ response_sender.send(
761
+ pb_utils.InferenceResponse(error=pb_utils.TritonError(
762
+ "A request id must be provided for request cancellation")),
763
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
764
+ return
765
+
766
+ with self.lock:
767
+ if triton_user_id in self.triton_user_id_to_req_ids:
768
+ req_ids = self.triton_user_id_to_req_ids[triton_user_id]
769
+ for req_id in req_ids:
770
+ self.executor.cancel_request(req_id)
771
+
772
+ response_sender.send(
773
+ pb_utils.InferenceResponse(),
774
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
775
+
776
+ def execute(self, requests):
777
+ """`execute` must be implemented in every Python model. `execute`
778
+ function receives a list of pb_utils.InferenceRequest as the only
779
+ argument. This function is called when an inference is requested
780
+ for this model.
781
+
782
+ Parameters
783
+ ----------
784
+ requests : list
785
+ A list of pb_utils.InferenceRequest
786
+
787
+ Returns
788
+ -------
789
+ list
790
+ A list of pb_utils.InferenceResponse. The length of this list must
791
+ be the same as `requests`
792
+ """
793
+ if not self.executor.can_enqueue_requests():
794
+ return
795
+
796
+ # Convert to executor requests.
797
+
798
+ triton_requests = []
799
+ executor_requests = []
800
+ batch_indices = []
801
+ triton_user_ids = []
802
+ triton_req_ids = []
803
+
804
+ for request in requests:
805
+
806
+ triton_user_id = request.request_id()
807
+
808
+ response_sender = request.get_response_sender()
809
+ stop = get_input_scalar_by_name(request, 'stop')
810
+
811
+ if stop:
812
+ self.handle_stop_request(triton_user_id, response_sender)
813
+ else:
814
+ #Unique request id used to identify each triton request
815
+ triton_req_id = str(randint(0, sys.maxsize))
816
+ self.triton_req_id_to_req_ids[triton_req_id] = set()
817
+ if triton_user_id is not None and triton_user_id != "":
818
+ self.triton_user_id_to_req_ids[triton_user_id] = set()
819
+
820
+ try:
821
+ converted_reqs = convert_request(
822
+ request, self.exclude_input_from_output,
823
+ self.decoupled)
824
+ except Exception as e:
825
+ response_sender.send(
826
+ pb_utils.InferenceResponse(error=pb_utils.TritonError(
827
+ f"An error occurred when processing the input values for request id {request.request_id()}, the error was '{e}'"
828
+ )),
829
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
830
+ else:
831
+ for batch_index, converted_req in enumerate(
832
+ converted_reqs):
833
+ triton_requests.append(request)
834
+ executor_requests.append(converted_req)
835
+ triton_user_ids.append(triton_user_id)
836
+ triton_req_ids.append(triton_req_id)
837
+ batch_indices.append(batch_index)
838
+
839
+ with self.lock:
840
+ request_ids = self.executor.enqueue_requests(executor_requests)
841
+ for req_id, triton_req_id, triton_user_id, triton_request, batch_index in zip(
842
+ request_ids, triton_req_ids, triton_user_ids,
843
+ triton_requests, batch_indices):
844
+ self.req_id_to_request_data[
845
+ req_id] = triton_req_id, triton_user_id, batch_index, triton_request.get_response_sender(
846
+ )
847
+ self.triton_req_id_to_req_ids[triton_req_id].add(req_id)
848
+ if triton_user_id is not None and triton_user_id != "":
849
+ self.triton_user_id_to_req_ids[triton_user_id].add(req_id)
850
+
851
+ return None
852
+
853
+ def awaiter_loop(self):
854
+ """Gets responses from executor and returns the results."""
855
+ while self.running:
856
+ for response in self.executor.await_responses(
857
+ timeout=datetime.timedelta(milliseconds=1)):
858
+ req_id = response.request_id
859
+ with self.lock:
860
+ if req_id not in self.req_id_to_request_data:
861
+ continue
862
+ triton_req_id, triton_user_id, batch_index, response_sender = self.req_id_to_request_data[
863
+ req_id]
864
+
865
+ triton_response, is_final = convert_response(
866
+ response, batch_index)
867
+
868
+ triton_request_final = False
869
+ if is_final:
870
+ with self.lock:
871
+ # Check if all executor requests part of that triton request are finished
872
+ self.triton_req_id_to_req_ids[triton_req_id].remove(
873
+ req_id)
874
+ if len(self.triton_req_id_to_req_ids[triton_req_id]
875
+ ) == 0:
876
+ pb_utils.Logger.log_info(
877
+ f"DELETING Req id {req_id}, triton_req_id {triton_req_id} "
878
+ )
879
+ triton_request_final = True
880
+ del self.triton_req_id_to_req_ids[triton_req_id]
881
+ if triton_user_id is not None and triton_user_id != "":
882
+ del self.triton_user_id_to_req_ids[
883
+ triton_user_id]
884
+ del self.req_id_to_request_data[req_id]
885
+
886
+ response_sender.send(
887
+ triton_response,
888
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
889
+ if triton_request_final else 0)
890
+
891
+ # Remove local reference so response_sender can be cleaned properly.
892
+ del response_sender
893
+
894
+ def cancellation_loop(self):
895
+ """Checks if any pending requests have been cancelled."""
896
+ while self.running:
897
+ time.sleep(self.cancellation_check_period_ms / 1000.0)
898
+ with self.lock:
899
+ for req_id, (triton_req_id, triton_user_id, batch_index,
900
+ response_sender
901
+ ) in self.req_id_to_request_data.items():
902
+ if response_sender.is_cancelled():
903
+ self.executor.cancel_request(req_id)
904
+ # Remove local reference so response_sender can be cleaned properly.
905
+ del response_sender
906
+
907
+ def metrics_loop(self):
908
+ """Updates triton metrics using stats from the executor."""
909
+ while self.running:
910
+ time.sleep(self.stats_check_period_ms / 1000.0)
911
+ for stat in self.executor.get_latest_iteration_stats():
912
+ try:
913
+ for key, metric in self.all_metrics.items():
914
+ value = None
915
+ if hasattr(stat, key):
916
+ value = getattr(stat, key)
917
+ elif stat.kv_cache_stats is not None and hasattr(
918
+ stat.kv_cache_stats, key):
919
+ value = getattr(stat.kv_cache_stats, key)
920
+ elif stat.static_batching_stats is not None and hasattr(
921
+ stat.static_batching_stats, key):
922
+ value = getattr(stat.static_batching_stats, key)
923
+ elif stat.inflight_batching_stats is not None and hasattr(
924
+ stat.inflight_batching_stats, key):
925
+ value = getattr(stat.inflight_batching_stats, key)
926
+ if value is not None:
927
+ if key == "timestamp":
928
+ value = convert_timestamp_to_seconds(value)
929
+ metric.set(value)
930
+ else:
931
+ pb_utils.Logger.log_warn(
932
+ f"Metric \"{key}\" not found.")
933
+ except Exception as e:
934
+ pb_utils.Logger.log_warn(
935
+ f"Error while processing metrics: {e}")
936
+
937
+ def finalize(self):
938
+ """`finalize` is called only once when the model is being unloaded.
939
+ Implementing `finalize` function is optional. This function allows
940
+ the model to perform any necessary clean ups before exit.
941
+ """
942
+ if self.executor.can_enqueue_requests():
943
+ self.running = False
944
+ self.awaiter_thread.join()
945
+ self.cancellation_thread.join()
946
+ self.metrics_thread.join()
947
+ self.executor.shutdown()
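The execute/convert_request path above consumes Triton tensors named input_ids, input_lengths, and request_output_len. For reference only (not part of this upload), a minimal, hypothetical gRPC client sketch against a non-decoupled tensorrt_llm model such as the one configured in config.pbtxt below could look like this; the server URL and prompt token ids are placeholders:

```python
# Hypothetical client sketch; assumes a Triton server on localhost:8001 serving
# the "tensorrt_llm" model with the tensor names used in convert_request above.
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

token_ids = np.array([[1, 2, 3]], dtype=np.int32)  # placeholder prompt token ids
inputs = [
    grpcclient.InferInput("input_ids", list(token_ids.shape), "INT32"),
    grpcclient.InferInput("input_lengths", [1, 1], "INT32"),
    grpcclient.InferInput("request_output_len", [1, 1], "INT32"),
]
inputs[0].set_data_from_numpy(token_ids)
inputs[1].set_data_from_numpy(np.array([[token_ids.shape[1]]], dtype=np.int32))
inputs[2].set_data_from_numpy(np.array([[64]], dtype=np.int32))  # max new tokens

result = client.infer("tensorrt_llm", inputs)
print(result.as_numpy("output_ids"))
```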
model_repo_whisper_qwen_trtllm/tensorrt_llm/config.pbtxt ADDED
@@ -0,0 +1,577 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "tensorrt_llm"
28
+ backend: "tensorrtllm"
29
+ max_batch_size: 8
30
+
31
+ model_transaction_policy {
32
+ decoupled: false
33
+ }
34
+
35
+ dynamic_batching {
36
+ preferred_batch_size: [ 8 ]
37
+ max_queue_delay_microseconds: 0
38
+ default_queue_policy: { max_queue_size: 0 }
39
+ }
40
+
41
+ input [
42
+ {
43
+ name: "input_ids"
44
+ data_type: TYPE_INT32
45
+ dims: [ -1 ]
46
+ allow_ragged_batch: true
47
+ optional: true
48
+ },
49
+ {
50
+ name: "encoder_input_features"
51
+ data_type: TYPE_FP16
52
+ dims: [ -1, -1 ]
53
+ allow_ragged_batch: true
54
+ optional: true
55
+ },
56
+ {
57
+ name: "encoder_output_lengths"
58
+ data_type: TYPE_INT32
59
+ dims: [ 1 ]
60
+ reshape: { shape: [ ] }
61
+ optional: true
62
+ },
63
+ {
64
+ name: "input_lengths"
65
+ data_type: TYPE_INT32
66
+ dims: [ 1 ]
67
+ reshape: { shape: [ ] }
68
+ },
69
+ {
70
+ name: "request_output_len"
71
+ data_type: TYPE_INT32
72
+ dims: [ 1 ]
73
+ reshape: { shape: [ ] }
74
+ },
75
+ {
76
+ name: "draft_input_ids"
77
+ data_type: TYPE_INT32
78
+ dims: [ -1 ]
79
+ optional: true
80
+ allow_ragged_batch: true
81
+ },
82
+ {
83
+ name: "decoder_input_ids"
84
+ data_type: TYPE_INT32
85
+ dims: [ -1 ]
86
+ optional: true
87
+ allow_ragged_batch: true
88
+ },
89
+ {
90
+ name: "decoder_input_lengths"
91
+ data_type: TYPE_INT32
92
+ dims: [ 1 ]
93
+ optional: true
94
+ reshape: { shape: [ ] }
95
+ },
96
+ {
97
+ name: "draft_logits"
98
+ data_type: TYPE_FP32
99
+ dims: [ -1, -1 ]
100
+ optional: true
101
+ allow_ragged_batch: true
102
+ },
103
+ {
104
+ name: "draft_acceptance_threshold"
105
+ data_type: TYPE_FP32
106
+ dims: [ 1 ]
107
+ reshape: { shape: [ ] }
108
+ optional: true
109
+ },
110
+ {
111
+ name: "end_id"
112
+ data_type: TYPE_INT32
113
+ dims: [ 1 ]
114
+ reshape: { shape: [ ] }
115
+ optional: true
116
+ },
117
+ {
118
+ name: "pad_id"
119
+ data_type: TYPE_INT32
120
+ dims: [ 1 ]
121
+ reshape: { shape: [ ] }
122
+ optional: true
123
+ },
124
+ {
125
+ name: "stop_words_list"
126
+ data_type: TYPE_INT32
127
+ dims: [ 2, -1 ]
128
+ optional: true
129
+ allow_ragged_batch: true
130
+ },
131
+ {
132
+ name: "bad_words_list"
133
+ data_type: TYPE_INT32
134
+ dims: [ 2, -1 ]
135
+ optional: true
136
+ allow_ragged_batch: true
137
+ },
138
+ {
139
+ name: "embedding_bias"
140
+ data_type: TYPE_FP32
141
+ dims: [ -1 ]
142
+ optional: true
143
+ allow_ragged_batch: true
144
+ },
145
+ {
146
+ name: "beam_width"
147
+ data_type: TYPE_INT32
148
+ dims: [ 1 ]
149
+ reshape: { shape: [ ] }
150
+ optional: true
151
+ },
152
+ {
153
+ name: "temperature"
154
+ data_type: TYPE_FP32
155
+ dims: [ 1 ]
156
+ reshape: { shape: [ ] }
157
+ optional: true
158
+ },
159
+ {
160
+ name: "runtime_top_k"
161
+ data_type: TYPE_INT32
162
+ dims: [ 1 ]
163
+ reshape: { shape: [ ] }
164
+ optional: true
165
+ },
166
+ {
167
+ name: "runtime_top_p"
168
+ data_type: TYPE_FP32
169
+ dims: [ 1 ]
170
+ reshape: { shape: [ ] }
171
+ optional: true
172
+ },
173
+ {
174
+ name: "runtime_top_p_min"
175
+ data_type: TYPE_FP32
176
+ dims: [ 1 ]
177
+ reshape: { shape: [ ] }
178
+ optional: true
179
+ },
180
+ {
181
+ name: "runtime_top_p_decay"
182
+ data_type: TYPE_FP32
183
+ dims: [ 1 ]
184
+ reshape: { shape: [ ] }
185
+ optional: true
186
+ },
187
+ {
188
+ name: "runtime_top_p_reset_ids"
189
+ data_type: TYPE_INT32
190
+ dims: [ 1 ]
191
+ reshape: { shape: [ ] }
192
+ optional: true
193
+ },
194
+ {
195
+ name: "len_penalty"
196
+ data_type: TYPE_FP32
197
+ dims: [ 1 ]
198
+ reshape: { shape: [ ] }
199
+ optional: true
200
+ },
201
+ {
202
+ name: "early_stopping"
203
+ data_type: TYPE_BOOL
204
+ dims: [ 1 ]
205
+ reshape: { shape: [ ] }
206
+ optional: true
207
+ },
208
+ {
209
+ name: "repetition_penalty"
210
+ data_type: TYPE_FP32
211
+ dims: [ 1 ]
212
+ reshape: { shape: [ ] }
213
+ optional: true
214
+ },
215
+ {
216
+ name: "min_length"
217
+ data_type: TYPE_INT32
218
+ dims: [ 1 ]
219
+ reshape: { shape: [ ] }
220
+ optional: true
221
+ },
222
+ {
223
+ name: "beam_search_diversity_rate"
224
+ data_type: TYPE_FP32
225
+ dims: [ 1 ]
226
+ reshape: { shape: [ ] }
227
+ optional: true
228
+ },
229
+ {
230
+ name: "presence_penalty"
231
+ data_type: TYPE_FP32
232
+ dims: [ 1 ]
233
+ reshape: { shape: [ ] }
234
+ optional: true
235
+ },
236
+ {
237
+ name: "frequency_penalty"
238
+ data_type: TYPE_FP32
239
+ dims: [ 1 ]
240
+ reshape: { shape: [ ] }
241
+ optional: true
242
+ },
243
+ {
244
+ name: "random_seed"
245
+ data_type: TYPE_UINT64
246
+ dims: [ 1 ]
247
+ reshape: { shape: [ ] }
248
+ optional: true
249
+ },
250
+ {
251
+ name: "return_log_probs"
252
+ data_type: TYPE_BOOL
253
+ dims: [ 1 ]
254
+ reshape: { shape: [ ] }
255
+ optional: true
256
+ },
257
+ {
258
+ name: "return_context_logits"
259
+ data_type: TYPE_BOOL
260
+ dims: [ 1 ]
261
+ reshape: { shape: [ ] }
262
+ optional: true
263
+ },
264
+ {
265
+ name: "return_generation_logits"
266
+ data_type: TYPE_BOOL
267
+ dims: [ 1 ]
268
+ reshape: { shape: [ ] }
269
+ optional: true
270
+ },
271
+ {
272
+ name: "stop"
273
+ data_type: TYPE_BOOL
274
+ dims: [ 1 ]
275
+ reshape: { shape: [ ] }
276
+ optional: true
277
+ },
278
+ {
279
+ name: "streaming"
280
+ data_type: TYPE_BOOL
281
+ dims: [ 1 ]
282
+ reshape: { shape: [ ] }
283
+ optional: true
284
+ },
285
+ {
286
+ name: "prompt_embedding_table"
287
+ data_type: TYPE_FP16
288
+ dims: [ -1, -1 ]
289
+ optional: true
290
+ allow_ragged_batch: true
291
+ },
292
+ {
293
+ name: "prompt_vocab_size"
294
+ data_type: TYPE_INT32
295
+ dims: [ 1 ]
296
+ reshape: { shape: [ ] }
297
+ optional: true
298
+ },
299
+ # the unique task ID for the given LoRA.
300
+ # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given.
301
+ # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
302
+ # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
303
+ {
304
+ name: "lora_task_id"
305
+ data_type: TYPE_UINT64
306
+ dims: [ 1 ]
307
+ reshape: { shape: [ ] }
308
+ optional: true
309
+ },
310
+ # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
311
+ # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
312
+ # each of the in / out tensors are first flattened and then concatenated together in the format above.
313
+ # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
314
+ {
315
+ name: "lora_weights"
316
+ data_type: TYPE_FP16
317
+ dims: [ -1, -1 ]
318
+ optional: true
319
+ allow_ragged_batch: true
320
+ },
321
+ # module identifier (same size as the first dimension of lora_weights)
322
+ # See LoraModule::ModuleType for model id mapping
323
+ #
324
+ # "attn_qkv": 0 # combined qkv adapter
325
+ # "attn_q": 1 # q adapter
326
+ # "attn_k": 2 # k adapter
327
+ # "attn_v": 3 # v adapter
328
+ # "attn_dense": 4 # adapter for the dense layer in attention
329
+ # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
330
+ # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
331
+ # "mlp_gate": 7 # for llama2 adapter for gated mlp layer after attention / RMSNorm: gate
332
+ #
333
+ # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
334
+ {
335
+ name: "lora_config"
336
+ data_type: TYPE_INT32
337
+ dims: [ -1, 3 ]
338
+ optional: true
339
+ allow_ragged_batch: true
340
+ }
341
+ ]
342
+ output [
343
+ {
344
+ name: "output_ids"
345
+ data_type: TYPE_INT32
346
+ dims: [ -1, -1 ]
347
+ },
348
+ {
349
+ name: "sequence_length"
350
+ data_type: TYPE_INT32
351
+ dims: [ -1 ]
352
+ },
353
+ {
354
+ name: "cum_log_probs"
355
+ data_type: TYPE_FP32
356
+ dims: [ -1 ]
357
+ },
358
+ {
359
+ name: "output_log_probs"
360
+ data_type: TYPE_FP32
361
+ dims: [ -1, -1 ]
362
+ },
363
+ {
364
+ name: "context_logits"
365
+ data_type: TYPE_FP32
366
+ dims: [ -1, -1 ]
367
+ },
368
+ {
369
+ name: "generation_logits"
370
+ data_type: TYPE_FP32
371
+ dims: [ -1, -1, -1 ]
372
+ },
373
+ {
374
+ name: "batch_index"
375
+ data_type: TYPE_INT32
376
+ dims: [ 1 ]
377
+ }
378
+ ]
379
+ instance_group [
380
+ {
381
+ count: 1
382
+ kind : KIND_CPU
383
+ }
384
+ ]
385
+ parameters: {
386
+ key: "max_beam_width"
387
+ value: {
388
+ string_value: "1"
389
+ }
390
+ }
391
+ parameters: {
392
+ key: "FORCE_CPU_ONLY_INPUT_TENSORS"
393
+ value: {
394
+ string_value: "no"
395
+ }
396
+ }
397
+ parameters: {
398
+ key: "gpt_model_type"
399
+ value: {
400
+ string_value: "inflight_fused_batching"
401
+ }
402
+ }
403
+ parameters: {
404
+ key: "gpt_model_path"
405
+ value: {
406
+ string_value: "/home/scratch.yuekaiz_wwfo_1/tekit/examples/qwen/qwen2_1.5B_instruct_fp16_merged_max_prompt_embedding_table_size_256"
407
+ }
408
+ }
409
+ parameters: {
410
+ key: "encoder_model_path"
411
+ value: {
412
+ string_value: "${encoder_engine_dir}"
413
+ }
414
+ }
415
+ parameters: {
416
+ key: "max_tokens_in_paged_kv_cache"
417
+ value: {
418
+ string_value: "2560"
419
+ }
420
+ }
421
+ parameters: {
422
+ key: "max_attention_window_size"
423
+ value: {
424
+ string_value: "2000"
425
+ }
426
+ }
427
+ parameters: {
428
+ key: "sink_token_length"
429
+ value: {
430
+ string_value: "${sink_token_length}"
431
+ }
432
+ }
433
+ parameters: {
434
+ key: "batch_scheduler_policy"
435
+ value: {
436
+ string_value: "${batch_scheduler_policy}"
437
+ }
438
+ }
439
+ parameters: {
440
+ key: "kv_cache_free_gpu_mem_fraction"
441
+ value: {
442
+ string_value: "0.5"
443
+ }
444
+ }
445
+ parameters: {
446
+ key: "kv_cache_host_memory_bytes"
447
+ value: {
448
+ string_value: "${kv_cache_host_memory_bytes}"
449
+ }
450
+ }
451
+ parameters: {
452
+ key: "kv_cache_onboard_blocks"
453
+ value: {
454
+ string_value: "${kv_cache_onboard_blocks}"
455
+ }
456
+ }
457
+ # enable_trt_overlap is deprecated and doesn't have any effect on the runtime
458
+ # parameters: {
459
+ # key: "enable_trt_overlap"
460
+ # value: {
461
+ # string_value: "${enable_trt_overlap}"
462
+ # }
463
+ # }
464
+ parameters: {
465
+ key: "exclude_input_in_output"
466
+ value: {
467
+ string_value: "True"
468
+ }
469
+ }
470
+ parameters: {
471
+ key: "cancellation_check_period_ms"
472
+ value: {
473
+ string_value: "${cancellation_check_period_ms}"
474
+ }
475
+ }
476
+ parameters: {
477
+ key: "stats_check_period_ms"
478
+ value: {
479
+ string_value: "${stats_check_period_ms}"
480
+ }
481
+ }
482
+ parameters: {
483
+ key: "iter_stats_max_iterations"
484
+ value: {
485
+ string_value: "${iter_stats_max_iterations}"
486
+ }
487
+ }
488
+ parameters: {
489
+ key: "request_stats_max_iterations"
490
+ value: {
491
+ string_value: "${request_stats_max_iterations}"
492
+ }
493
+ }
494
+ parameters: {
495
+ key: "enable_kv_cache_reuse"
496
+ value: {
497
+ string_value: "False"
498
+ }
499
+ }
500
+ parameters: {
501
+ key: "normalize_log_probs"
502
+ value: {
503
+ string_value: "${normalize_log_probs}"
504
+ }
505
+ }
506
+ parameters: {
507
+ key: "enable_chunked_context"
508
+ value: {
509
+ string_value: "${enable_chunked_context}"
510
+ }
511
+ }
512
+ parameters: {
513
+ key: "gpu_device_ids"
514
+ value: {
515
+ string_value: "${gpu_device_ids}"
516
+ }
517
+ }
518
+ parameters: {
519
+ key: "lora_cache_optimal_adapter_size"
520
+ value: {
521
+ string_value: "${lora_cache_optimal_adapter_size}"
522
+ }
523
+ }
524
+ parameters: {
525
+ key: "lora_cache_max_adapter_size"
526
+ value: {
527
+ string_value: "${lora_cache_max_adapter_size}"
528
+ }
529
+ }
530
+ parameters: {
531
+ key: "lora_cache_gpu_memory_fraction"
532
+ value: {
533
+ string_value: "${lora_cache_gpu_memory_fraction}"
534
+ }
535
+ }
536
+ parameters: {
537
+ key: "lora_cache_host_memory_bytes"
538
+ value: {
539
+ string_value: "${lora_cache_host_memory_bytes}"
540
+ }
541
+ }
542
+ parameters: {
543
+ key: "decoding_mode"
544
+ value: {
545
+ string_value: "${decoding_mode}"
546
+ }
547
+ }
548
+ parameters: {
549
+ key: "executor_worker_path"
550
+ value: {
551
+ string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
552
+ }
553
+ }
554
+ parameters: {
555
+ key: "medusa_choices"
556
+ value: {
557
+ string_value: "${medusa_choices}"
558
+ }
559
+ }
560
+ parameters: {
561
+ key: "gpu_weights_percent"
562
+ value: {
563
+ string_value: "${gpu_weights_percent}"
564
+ }
565
+ }
566
+ parameters: {
567
+ key: "enable_context_fmha_fp32_acc"
568
+ value: {
569
+ string_value: "${enable_context_fmha_fp32_acc}"
570
+ }
571
+ }
572
+ parameters: {
573
+ key: "multi_block_mode"
574
+ value: {
575
+ string_value: "${multi_block_mode}"
576
+ }
577
+ }
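Several string_value fields above (and the whole config.template that follows) use ${...} placeholders. As a rough, hypothetical sketch of how they could be filled without extra tooling, Python's string.Template uses the same ${name} syntax; the concrete values below are only examples taken from this config.pbtxt:

```python
# Hypothetical helper (not part of this commit): fill ${...} placeholders in the
# template with concrete values; placeholders without a value are left untouched.
from pathlib import Path
from string import Template

values = {
    "triton_backend": "tensorrtllm",
    "triton_max_batch_size": "8",
    "decoupled_mode": "false",
    "max_queue_delay_microseconds": "0",
    "max_queue_size": "0",
}

template_path = Path("model_repo_whisper_qwen_trtllm/tensorrt_llm/config.template")
filled = Template(template_path.read_text()).safe_substitute(values)
Path("model_repo_whisper_qwen_trtllm/tensorrt_llm/config.pbtxt").write_text(filled)
```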
model_repo_whisper_qwen_trtllm/tensorrt_llm/config.template ADDED
@@ -0,0 +1,577 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "tensorrt_llm"
28
+ backend: "${triton_backend}"
29
+ max_batch_size: ${triton_max_batch_size}
30
+
31
+ model_transaction_policy {
32
+ decoupled: ${decoupled_mode}
33
+ }
34
+
35
+ dynamic_batching {
36
+ preferred_batch_size: [ ${triton_max_batch_size} ]
37
+ max_queue_delay_microseconds: ${max_queue_delay_microseconds}
38
+ default_queue_policy: { max_queue_size: ${max_queue_size} }
39
+ }
40
+
41
+ input [
42
+ {
43
+ name: "input_ids"
44
+ data_type: TYPE_INT32
45
+ dims: [ -1 ]
46
+ allow_ragged_batch: true
47
+ optional: true
48
+ },
49
+ {
50
+ name: "encoder_input_features"
51
+ data_type: TYPE_FP16
52
+ dims: [ -1, -1 ]
53
+ allow_ragged_batch: true
54
+ optional: true
55
+ },
56
+ {
57
+ name: "encoder_output_lengths"
58
+ data_type: TYPE_INT32
59
+ dims: [ 1 ]
60
+ reshape: { shape: [ ] }
61
+ optional: true
62
+ },
63
+ {
64
+ name: "input_lengths"
65
+ data_type: TYPE_INT32
66
+ dims: [ 1 ]
67
+ reshape: { shape: [ ] }
68
+ },
69
+ {
70
+ name: "request_output_len"
71
+ data_type: TYPE_INT32
72
+ dims: [ 1 ]
73
+ reshape: { shape: [ ] }
74
+ },
75
+ {
76
+ name: "draft_input_ids"
77
+ data_type: TYPE_INT32
78
+ dims: [ -1 ]
79
+ optional: true
80
+ allow_ragged_batch: true
81
+ },
82
+ {
83
+ name: "decoder_input_ids"
84
+ data_type: TYPE_INT32
85
+ dims: [ -1 ]
86
+ optional: true
87
+ allow_ragged_batch: true
88
+ },
89
+ {
90
+ name: "decoder_input_lengths"
91
+ data_type: TYPE_INT32
92
+ dims: [ 1 ]
93
+ optional: true
94
+ reshape: { shape: [ ] }
95
+ },
96
+ {
97
+ name: "draft_logits"
98
+ data_type: TYPE_FP32
99
+ dims: [ -1, -1 ]
100
+ optional: true
101
+ allow_ragged_batch: true
102
+ },
103
+ {
104
+ name: "draft_acceptance_threshold"
105
+ data_type: TYPE_FP32
106
+ dims: [ 1 ]
107
+ reshape: { shape: [ ] }
108
+ optional: true
109
+ },
110
+ {
111
+ name: "end_id"
112
+ data_type: TYPE_INT32
113
+ dims: [ 1 ]
114
+ reshape: { shape: [ ] }
115
+ optional: true
116
+ },
117
+ {
118
+ name: "pad_id"
119
+ data_type: TYPE_INT32
120
+ dims: [ 1 ]
121
+ reshape: { shape: [ ] }
122
+ optional: true
123
+ },
124
+ {
125
+ name: "stop_words_list"
126
+ data_type: TYPE_INT32
127
+ dims: [ 2, -1 ]
128
+ optional: true
129
+ allow_ragged_batch: true
130
+ },
131
+ {
132
+ name: "bad_words_list"
133
+ data_type: TYPE_INT32
134
+ dims: [ 2, -1 ]
135
+ optional: true
136
+ allow_ragged_batch: true
137
+ },
138
+ {
139
+ name: "embedding_bias"
140
+ data_type: TYPE_FP32
141
+ dims: [ -1 ]
142
+ optional: true
143
+ allow_ragged_batch: true
144
+ },
145
+ {
146
+ name: "beam_width"
147
+ data_type: TYPE_INT32
148
+ dims: [ 1 ]
149
+ reshape: { shape: [ ] }
150
+ optional: true
151
+ },
152
+ {
153
+ name: "temperature"
154
+ data_type: TYPE_FP32
155
+ dims: [ 1 ]
156
+ reshape: { shape: [ ] }
157
+ optional: true
158
+ },
159
+ {
160
+ name: "runtime_top_k"
161
+ data_type: TYPE_INT32
162
+ dims: [ 1 ]
163
+ reshape: { shape: [ ] }
164
+ optional: true
165
+ },
166
+ {
167
+ name: "runtime_top_p"
168
+ data_type: TYPE_FP32
169
+ dims: [ 1 ]
170
+ reshape: { shape: [ ] }
171
+ optional: true
172
+ },
173
+ {
174
+ name: "runtime_top_p_min"
175
+ data_type: TYPE_FP32
176
+ dims: [ 1 ]
177
+ reshape: { shape: [ ] }
178
+ optional: true
179
+ },
180
+ {
181
+ name: "runtime_top_p_decay"
182
+ data_type: TYPE_FP32
183
+ dims: [ 1 ]
184
+ reshape: { shape: [ ] }
185
+ optional: true
186
+ },
187
+ {
188
+ name: "runtime_top_p_reset_ids"
189
+ data_type: TYPE_INT32
190
+ dims: [ 1 ]
191
+ reshape: { shape: [ ] }
192
+ optional: true
193
+ },
194
+ {
195
+ name: "len_penalty"
196
+ data_type: TYPE_FP32
197
+ dims: [ 1 ]
198
+ reshape: { shape: [ ] }
199
+ optional: true
200
+ },
201
+ {
202
+ name: "early_stopping"
203
+ data_type: TYPE_BOOL
204
+ dims: [ 1 ]
205
+ reshape: { shape: [ ] }
206
+ optional: true
207
+ },
208
+ {
209
+ name: "repetition_penalty"
210
+ data_type: TYPE_FP32
211
+ dims: [ 1 ]
212
+ reshape: { shape: [ ] }
213
+ optional: true
214
+ },
215
+ {
216
+ name: "min_length"
217
+ data_type: TYPE_INT32
218
+ dims: [ 1 ]
219
+ reshape: { shape: [ ] }
220
+ optional: true
221
+ },
222
+ {
223
+ name: "beam_search_diversity_rate"
224
+ data_type: TYPE_FP32
225
+ dims: [ 1 ]
226
+ reshape: { shape: [ ] }
227
+ optional: true
228
+ },
229
+ {
230
+ name: "presence_penalty"
231
+ data_type: TYPE_FP32
232
+ dims: [ 1 ]
233
+ reshape: { shape: [ ] }
234
+ optional: true
235
+ },
236
+ {
237
+ name: "frequency_penalty"
238
+ data_type: TYPE_FP32
239
+ dims: [ 1 ]
240
+ reshape: { shape: [ ] }
241
+ optional: true
242
+ },
243
+ {
244
+ name: "random_seed"
245
+ data_type: TYPE_UINT64
246
+ dims: [ 1 ]
247
+ reshape: { shape: [ ] }
248
+ optional: true
249
+ },
250
+ {
251
+ name: "return_log_probs"
252
+ data_type: TYPE_BOOL
253
+ dims: [ 1 ]
254
+ reshape: { shape: [ ] }
255
+ optional: true
256
+ },
257
+ {
258
+ name: "return_context_logits"
259
+ data_type: TYPE_BOOL
260
+ dims: [ 1 ]
261
+ reshape: { shape: [ ] }
262
+ optional: true
263
+ },
264
+ {
265
+ name: "return_generation_logits"
266
+ data_type: TYPE_BOOL
267
+ dims: [ 1 ]
268
+ reshape: { shape: [ ] }
269
+ optional: true
270
+ },
271
+ {
272
+ name: "stop"
273
+ data_type: TYPE_BOOL
274
+ dims: [ 1 ]
275
+ reshape: { shape: [ ] }
276
+ optional: true
277
+ },
278
+ {
279
+ name: "streaming"
280
+ data_type: TYPE_BOOL
281
+ dims: [ 1 ]
282
+ reshape: { shape: [ ] }
283
+ optional: true
284
+ },
285
+ {
286
+ name: "prompt_embedding_table"
287
+ data_type: TYPE_FP16
288
+ dims: [ -1, -1 ]
289
+ optional: true
290
+ allow_ragged_batch: true
291
+ },
292
+ {
293
+ name: "prompt_vocab_size"
294
+ data_type: TYPE_INT32
295
+ dims: [ 1 ]
296
+ reshape: { shape: [ ] }
297
+ optional: true
298
+ },
299
+ # the unique task ID for the given LoRA.
300
+ # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given.
301
+ # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
302
+ # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
303
+ {
304
+ name: "lora_task_id"
305
+ data_type: TYPE_UINT64
306
+ dims: [ 1 ]
307
+ reshape: { shape: [ ] }
308
+ optional: true
309
+ },
310
+ # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
311
+ # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
312
+ # each of the in / out tensors are first flattened and then concatenated together in the format above.
313
+ # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
314
+ {
315
+ name: "lora_weights"
316
+ data_type: TYPE_FP16
317
+ dims: [ -1, -1 ]
318
+ optional: true
319
+ allow_ragged_batch: true
320
+ },
321
+ # module identifier (same size as the first dimension of lora_weights)
322
+ # See LoraModule::ModuleType for model id mapping
323
+ #
324
+ # "attn_qkv": 0 # compbined qkv adapter
325
+ # "attn_q": 1 # q adapter
326
+ # "attn_k": 2 # k adapter
327
+ # "attn_v": 3 # v adapter
328
+ # "attn_dense": 4 # adapter for the dense layer in attention
329
+ # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
330
+ # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
331
+ # "mlp_gate": 7 # for llama2 adapter for gated mlp later after attention / RMSNorm: gate
332
+ #
333
+ # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
334
+ {
335
+ name: "lora_config"
336
+ data_type: TYPE_INT32
337
+ dims: [ -1, 3 ]
338
+ optional: true
339
+ allow_ragged_batch: true
340
+ }
341
+ ]
342
+ output [
343
+ {
344
+ name: "output_ids"
345
+ data_type: TYPE_INT32
346
+ dims: [ -1, -1 ]
347
+ },
348
+ {
349
+ name: "sequence_length"
350
+ data_type: TYPE_INT32
351
+ dims: [ -1 ]
352
+ },
353
+ {
354
+ name: "cum_log_probs"
355
+ data_type: TYPE_FP32
356
+ dims: [ -1 ]
357
+ },
358
+ {
359
+ name: "output_log_probs"
360
+ data_type: TYPE_FP32
361
+ dims: [ -1, -1 ]
362
+ },
363
+ {
364
+ name: "context_logits"
365
+ data_type: TYPE_FP32
366
+ dims: [ -1, -1 ]
367
+ },
368
+ {
369
+ name: "generation_logits"
370
+ data_type: TYPE_FP32
371
+ dims: [ -1, -1, -1 ]
372
+ },
373
+ {
374
+ name: "batch_index"
375
+ data_type: TYPE_INT32
376
+ dims: [ 1 ]
377
+ }
378
+ ]
379
+ instance_group [
380
+ {
381
+ count: 1
382
+ kind : KIND_CPU
383
+ }
384
+ ]
385
+ parameters: {
386
+ key: "max_beam_width"
387
+ value: {
388
+ string_value: "${max_beam_width}"
389
+ }
390
+ }
391
+ parameters: {
392
+ key: "FORCE_CPU_ONLY_INPUT_TENSORS"
393
+ value: {
394
+ string_value: "no"
395
+ }
396
+ }
397
+ parameters: {
398
+ key: "gpt_model_type"
399
+ value: {
400
+ string_value: "${batching_strategy}"
401
+ }
402
+ }
403
+ parameters: {
404
+ key: "gpt_model_path"
405
+ value: {
406
+ string_value: "${engine_dir}"
407
+ }
408
+ }
409
+ parameters: {
410
+ key: "encoder_model_path"
411
+ value: {
412
+ string_value: "${encoder_engine_dir}"
413
+ }
414
+ }
415
+ parameters: {
416
+ key: "max_tokens_in_paged_kv_cache"
417
+ value: {
418
+ string_value: "${max_tokens_in_paged_kv_cache}"
419
+ }
420
+ }
421
+ parameters: {
422
+ key: "max_attention_window_size"
423
+ value: {
424
+ string_value: "${max_attention_window_size}"
425
+ }
426
+ }
427
+ parameters: {
428
+ key: "sink_token_length"
429
+ value: {
430
+ string_value: "${sink_token_length}"
431
+ }
432
+ }
433
+ parameters: {
434
+ key: "batch_scheduler_policy"
435
+ value: {
436
+ string_value: "${batch_scheduler_policy}"
437
+ }
438
+ }
439
+ parameters: {
440
+ key: "kv_cache_free_gpu_mem_fraction"
441
+ value: {
442
+ string_value: "${kv_cache_free_gpu_mem_fraction}"
443
+ }
444
+ }
445
+ parameters: {
446
+ key: "kv_cache_host_memory_bytes"
447
+ value: {
448
+ string_value: "${kv_cache_host_memory_bytes}"
449
+ }
450
+ }
451
+ parameters: {
452
+ key: "kv_cache_onboard_blocks"
453
+ value: {
454
+ string_value: "${kv_cache_onboard_blocks}"
455
+ }
456
+ }
457
+ # enable_trt_overlap is deprecated and doesn't have any effect on the runtime
458
+ # parameters: {
459
+ # key: "enable_trt_overlap"
460
+ # value: {
461
+ # string_value: "${enable_trt_overlap}"
462
+ # }
463
+ # }
464
+ parameters: {
465
+ key: "exclude_input_in_output"
466
+ value: {
467
+ string_value: "${exclude_input_in_output}"
468
+ }
469
+ }
470
+ parameters: {
471
+ key: "cancellation_check_period_ms"
472
+ value: {
473
+ string_value: "${cancellation_check_period_ms}"
474
+ }
475
+ }
476
+ parameters: {
477
+ key: "stats_check_period_ms"
478
+ value: {
479
+ string_value: "${stats_check_period_ms}"
480
+ }
481
+ }
482
+ parameters: {
483
+ key: "iter_stats_max_iterations"
484
+ value: {
485
+ string_value: "${iter_stats_max_iterations}"
486
+ }
487
+ }
488
+ parameters: {
489
+ key: "request_stats_max_iterations"
490
+ value: {
491
+ string_value: "${request_stats_max_iterations}"
492
+ }
493
+ }
494
+ parameters: {
495
+ key: "enable_kv_cache_reuse"
496
+ value: {
497
+ string_value: "${enable_kv_cache_reuse}"
498
+ }
499
+ }
500
+ parameters: {
501
+ key: "normalize_log_probs"
502
+ value: {
503
+ string_value: "${normalize_log_probs}"
504
+ }
505
+ }
506
+ parameters: {
507
+ key: "enable_chunked_context"
508
+ value: {
509
+ string_value: "${enable_chunked_context}"
510
+ }
511
+ }
512
+ parameters: {
513
+ key: "gpu_device_ids"
514
+ value: {
515
+ string_value: "${gpu_device_ids}"
516
+ }
517
+ }
518
+ parameters: {
519
+ key: "lora_cache_optimal_adapter_size"
520
+ value: {
521
+ string_value: "${lora_cache_optimal_adapter_size}"
522
+ }
523
+ }
524
+ parameters: {
525
+ key: "lora_cache_max_adapter_size"
526
+ value: {
527
+ string_value: "${lora_cache_max_adapter_size}"
528
+ }
529
+ }
530
+ parameters: {
531
+ key: "lora_cache_gpu_memory_fraction"
532
+ value: {
533
+ string_value: "${lora_cache_gpu_memory_fraction}"
534
+ }
535
+ }
536
+ parameters: {
537
+ key: "lora_cache_host_memory_bytes"
538
+ value: {
539
+ string_value: "${lora_cache_host_memory_bytes}"
540
+ }
541
+ }
542
+ parameters: {
543
+ key: "decoding_mode"
544
+ value: {
545
+ string_value: "${decoding_mode}"
546
+ }
547
+ }
548
+ parameters: {
549
+ key: "executor_worker_path"
550
+ value: {
551
+ string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
552
+ }
553
+ }
554
+ parameters: {
555
+ key: "medusa_choices"
556
+ value: {
557
+ string_value: "${medusa_choices}"
558
+ }
559
+ }
560
+ parameters: {
561
+ key: "gpu_weights_percent"
562
+ value: {
563
+ string_value: "${gpu_weights_percent}"
564
+ }
565
+ }
566
+ parameters: {
567
+ key: "enable_context_fmha_fp32_acc"
568
+ value: {
569
+ string_value: "${enable_context_fmha_fp32_acc}"
570
+ }
571
+ }
572
+ parameters: {
573
+ key: "multi_block_mode"
574
+ value: {
575
+ string_value: "${multi_block_mode}"
576
+ }
577
+ }
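The only real difference between the config.pbtxt above and this config.template is that some of the ${...} placeholders in config.pbtxt have already been substituted (for example, enable_kv_cache_reuse is pinned to "False"). A minimal sketch of how the remaining placeholders could be filled, assuming plain ${key} substitution in the style of the fill_template.py helper from tensorrtllm_backend; the key/value pairs and paths below are illustrative only:

from string import Template

def fill_config(template_path: str, output_path: str, values: dict) -> None:
    # Replace ${key} placeholders that appear in `values`; unknown keys are left as-is.
    with open(template_path) as f:
        text = Template(f.read()).safe_substitute(values)
    with open(output_path, "w") as f:
        f.write(text)

fill_config(
    "model_repo_whisper_qwen_trtllm/tensorrt_llm/config.template",
    "model_repo_whisper_qwen_trtllm/tensorrt_llm/config.pbtxt",
    {
        "triton_backend": "tensorrtllm",
        "triton_max_batch_size": "8",
        "decoupled_mode": "False",
        "max_queue_delay_microseconds": "0",
        "max_queue_size": "0",
        "batching_strategy": "inflight_fused_batching",
        "engine_dir": "/workspace/trt_engines/qwen_decoder",  # illustrative path
        "enable_kv_cache_reuse": "False",
    },
)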
model_repo_whisper_qwen_trtllm/whisper/0/__pycache__/fbank.cpython-310.pyc ADDED
Binary file (3.07 kB).
 
model_repo_whisper_qwen_trtllm/whisper/0/__pycache__/model.cpython-310.pyc ADDED
Binary file (10.9 kB).
 
model_repo_whisper_qwen_trtllm/whisper/0/__pycache__/whisper_trtllm.cpython-310.pyc ADDED
Binary file (9.21 kB).
 
model_repo_whisper_qwen_trtllm/whisper/0/fbank.py ADDED
@@ -0,0 +1,91 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # Reference: https://github.com/openai/whisper/blob/main/whisper/audio.py
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from typing import Union
19
+ import os
20
+
21
+ def mel_filters(device, n_mels: int = 128) -> torch.Tensor:
22
+ """
23
+ load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
24
+ Allows decoupling librosa dependency; saved using:
25
+
26
+ np.savez_compressed(
27
+ "mel_filters.npz",
28
+ mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
29
+ )
30
+ """
31
+ assert n_mels in (80, 128), f"Unsupported n_mels: {n_mels}"
32
+ with np.load(
33
+ os.path.join(os.path.dirname(__file__), "mel_filters.npz")
34
+ ) as f:
35
+ return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
36
+
37
+
38
+ def log_mel_spectrogram(
39
+ audio: torch.Tensor,
40
+ filters: torch.Tensor,
41
+ n_mels: int = 128,
42
+ n_fft: int = 400,
43
+ hop_length: int = 160,
44
+ ):
45
+ """
46
+ Compute the log-Mel spectrogram of the input audio.
47
+
48
+ Parameters
49
+ ----------
50
+ audio: torch.Tensor, shape = (*)
51
+ A Tensor containing the audio waveform, sampled at 16 kHz
52
+
53
+ n_mels: int
54
+ The number of Mel-frequency filters, only 80 or 128 is supported
55
+
56
+ filters: torch.Tensor
57
+
58
+ Returns
59
+ -------
60
+ torch.Tensor, shape = (128, n_frames)
61
+ A Tensor that contains the Mel spectrogram
62
+ """
63
+ window = torch.hann_window(n_fft).to(audio.device)
64
+ stft = torch.stft(audio, n_fft, hop_length, window=window, return_complex=True)
65
+ magnitudes = stft[..., :-1].abs() ** 2
66
+
67
+ mel_spec = filters @ magnitudes
68
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
69
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
70
+ log_spec = (log_spec + 4.0) / 4.0
71
+ # cast to float 16
72
+ log_spec = log_spec.half()
73
+ return log_spec
74
+
75
+ class FeatureExtractor(torch.nn.Module):
76
+ """Your Python model must use the same class name. Every Python model
77
+ that is created must have "TritonPythonModel" as the class name.
78
+ """
79
+
80
+ def __init__(self, n_mels: int = 128):
81
+ self.device = torch.device("cuda")
82
+ self.n_mels = n_mels
83
+ self.filters = mel_filters(self.device, n_mels=self.n_mels)
84
+
85
+ def compute_feature(self, wav, target: int = 3000):
86
+ mel = log_mel_spectrogram(wav, self.filters)
87
+ assert mel.shape[1] <= target, f"{mel.shape[1]} > {target}, audio is too long"
88
+ if mel.shape[1] < target:
89
+ mel = F.pad(mel, (0, target - mel.shape[1]), mode='constant')
90
+ mel = mel.unsqueeze(0)
91
+ return mel
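A minimal usage sketch of this feature extractor, assuming a CUDA device and a 16 kHz mono waveform (the dummy input below is illustrative only):

import torch
# FeatureExtractor is the class defined above in this fbank.py

extractor = FeatureExtractor(n_mels=128)
wav = torch.zeros(16000 * 10, device="cuda")   # 10 s of 16 kHz audio
mel = extractor.compute_feature(wav)           # padded to 3000 frames (30 s)
print(mel.shape, mel.dtype)                    # torch.Size([1, 128, 3000]) torch.float16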
model_repo_whisper_qwen_trtllm/whisper/0/mel_filters.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7450ae70723a5ef9d341e3cee628c7cb0177f36ce42c44b7ed2bf3325f0f6d4c
3
+ size 4271
model_repo_whisper_qwen_trtllm/whisper/0/model.py ADDED
@@ -0,0 +1,346 @@
1
+ # -*- coding: utf-8 -*-
2
+ import triton_python_backend_utils as pb_utils
3
+ import numpy as np
4
+ import json
5
+ import torch
6
+ from torch.utils.dlpack import from_dlpack, to_dlpack
7
+ import re
8
+ import transformers
9
+ from transformers import AutoTokenizer
10
+ from typing import Dict
11
+ from pathlib import Path
12
+ import traceback
13
+
14
+ from .whisper_trtllm import WhisperTRTLLM
15
+ from .fbank import FeatureExtractor
16
+
17
+ DEFAULT_SPEECH_TOKEN = "<speech>"
18
+ def preprocess(
19
+ messages,
20
+ tokenizer: transformers.PreTrainedTokenizer,
21
+ max_len: int = 128,
22
+ ) -> Dict:
23
+ """Preprocesses the data for supervised fine-tuning."""
24
+ texts = []
25
+ TEMPLATE = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if loop.last %}{{''}}{% else %}{{ '<|im_end|>\n' }}{% endif %}{% endfor %}"
26
+ for i, msg in enumerate(messages):
27
+ texts.append(
28
+ tokenizer.apply_chat_template(
29
+ msg,
30
+ tokenize=True,
31
+ add_generation_prompt=False,
32
+ chat_template=TEMPLATE,
33
+ padding="longest",
34
+ max_length=max_len,
35
+ truncation=True,
36
+ )
37
+ )
38
+ max_len_texts = max([len(text) for text in texts])
39
+ if tokenizer.padding_side == "right":
40
+ texts = [
41
+ text + [tokenizer.pad_token_id] * (max_len_texts - len(text))
42
+ for text in texts
43
+ ]
44
+ else:
45
+ texts = [
46
+ [tokenizer.pad_token_id] * (max_len_texts - len(text)) + text
47
+ for text in texts
48
+ ]
49
+
50
+ input_ids = torch.tensor(texts, dtype=torch.int)
51
+
52
+ attention_mask = input_ids.ne(tokenizer.pad_token_id)
53
+
54
+ return input_ids, attention_mask
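+ # For the default messages used below, i.e.
+ # [{"role": "user", "content": "<speech>请转写音频为文字"}, {"role": "assistant", "content": ""}],
+ # the chat template above renders (before tokenization) to
+ # "<|im_start|>user\n<speech>请转写音频为文字<|im_end|>\n<|im_start|>assistant\n"
+ # The user turn asks the model to transcribe the audio, and the single <speech>
+ # placeholder is later replaced by the projected Whisper encoder embeddings.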
55
+
56
+ class TritonPythonModel:
57
+ """Your Python model must use the same class name. Every Python model
58
+ that is created must have "TritonPythonModel" as the class name.
59
+ """
60
+
61
+ def initialize(self, args):
62
+ """`initialize` is called only once when the model is being loaded.
63
+ Implementing `initialize` function is optional. This function allows
64
+ the model to initialize any state associated with this model.
65
+
66
+ Parameters
67
+ ----------
68
+ args : dict
69
+ Both keys and values are strings. The dictionary keys and values are:
70
+ * model_config: A JSON string containing the model configuration
71
+ * model_instance_kind: A string containing model instance kind
72
+ * model_instance_device_id: A string containing model instance device ID
73
+ * model_repository: Model repository path
74
+ * model_version: Model version
75
+ * model_name: Model name
76
+ """
77
+ self.model_config = model_config = json.loads(args['model_config'])
78
+
79
+ # Get OUTPUT0 configuration
80
+ output0_config = pb_utils.get_output_config_by_name(
81
+ model_config, "TRANSCRIPTS")
82
+ # Convert Triton types to numpy types
83
+ self.out0_dtype = pb_utils.triton_string_to_numpy(
84
+ output0_config['data_type'])
85
+
86
+ #self.tokenizer = get_tokenizer(num_languages=100)
87
+ #self.blank = self.tokenizer.encode(" ", allowed_special=self.tokenizer.special_tokens_set)[0]
88
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")
89
+ tokenizer.padding_side = "left"
90
+ special_tokens_dict = {"additional_special_tokens": [DEFAULT_SPEECH_TOKEN]}
91
+ tokenizer.add_special_tokens(special_tokens_dict)
92
+ self.tokenizer = tokenizer
93
+ self.eos = self.tokenizer.eos_token_id
94
+ self.default_speech_token_id = tokenizer.convert_tokens_to_ids(
95
+ DEFAULT_SPEECH_TOKEN
96
+ )
97
+ self.vocab_size = 151936
98
+ # self.vocab_size = 500000
99
+ # self.vocab_size = 160000
100
+
101
+ self.device = torch.device("cuda")
102
+ self.decoupled = False
103
+ self.logger = pb_utils.Logger
104
+ self.init_model(self.model_config['parameters'])
105
+
106
+ def init_model(self, parameters):
107
+ for key,value in parameters.items():
108
+ parameters[key] = value["string_value"]
109
+ engine_dir = parameters["engine_dir"]
110
+ n_mels = int(parameters["n_mels"])
111
+ adapter_dir="/home/scratch.yuekaiz_wwfo_1/icefall_asr_multi-hans_whisper_qwen2_1.5B/epoch-2-avg-6.pt"
112
+ checkpoint = torch.load(
113
+ adapter_dir, map_location="cpu"
114
+ )
115
+ self.model = WhisperTRTLLM(engine_dir)
116
+ missing_keys, _ = self.model.load_state_dict(checkpoint, strict=False)
117
+ # print(f"Missing keys: {missing_keys}")
118
+ self.feature_extractor = FeatureExtractor(n_mels=n_mels)
119
+
120
+ def _tokenize(self, prompt=None, num_speech_tokens=187):
121
+ if prompt is None:
122
+ prompts = [
123
+ [
124
+ {"role": "user", "content": f"{DEFAULT_SPEECH_TOKEN}请转写音频为文字"},
125
+ {"role": "assistant", "content": ""},
126
+ ]
127
+ ]
128
+ # prompts = [
129
+ # [
130
+ # {"role": "user", "content": f"{DEFAULT_SPEECH_TOKEN}你好,你是谁?"},
131
+ # {"role": "assistant", "content": ""},
132
+ # ]
133
+ # ]
134
+
135
+ input_ids, _ = preprocess(prompts, self.tokenizer, max_len=128)
136
+ input_ids = input_ids.tolist()[0]
137
+ speech_token_index = input_ids.index(self.default_speech_token_id)
138
+ # replace 151646 with list(range(self.vocab_size, self.vocab_size + num_speech_tokens))
139
+ prompt_ids = input_ids[:speech_token_index] + list(range(self.vocab_size, self.vocab_size + num_speech_tokens)) + input_ids[speech_token_index + 1:]
140
+ # prompt_ids = input_ids[:speech_token_index] + input_ids[speech_token_index + 1:]
141
+ return prompt_ids
142
+
143
+ def _prepare_inputs(self, request, speech_embeddings, input_ids):
144
+ """
145
+ Prepares inputs for the language model based on the parameters in the
146
+ request, the speech embeddings, and the prompt. It tokenizes the prompt,
147
+ extracts and processes additional parameters from the request:
148
+ - max_tokens: Maximum number of tokens to generate (default: 50)
149
+ - temperature: Controls randomness in generation (default: 0.5)
150
+ - top_k: Top K sampling parameter (default: 1)
151
+ - frequency_penalty: Penalizes frequent tokens (default: 0.7)
152
+ - seed: Random seed for generation (default: 10)
153
+
154
+ Final llm input dictionary is combined out of all processed parameters,
155
+ the prompt's tokens and the speech embeddings. The latter are passed to the llm
156
+ through `prompt_embedding_table`.
157
+
158
+ Parameters
159
+ ----------
160
+ - request: The original request object containing additional parameters.
161
+ - speech_embeddings (torch.Tensor): The projected Whisper encoder outputs.
162
+ - input_ids (list): The tokenized prompt, with placeholder ids reserved for the speech tokens.
163
+
164
+ Returns
165
+ -------
166
+ - dict: A dictionary containing all the prepared inputs for the language model.
167
+ """
168
+ input_ids = np.array(input_ids, dtype=np.int32)
169
+ max_tokens = 200
170
+ input_len = input_ids.shape[0]
171
+
172
+ assert speech_embeddings.shape[1] == 187, "Only support 187 speech tokens"
173
+ embedding_args = {
174
+ "prompt_vocab_size": np.array(
175
+ [[speech_embeddings.shape[1]]], dtype=np.int32
176
+ ),
177
+ "prompt_embedding_table": speech_embeddings.detach().cpu().numpy(),
178
+ }
179
+ # TODO: the output seems identical whether or not this is added??? input_ids beyond the max vocab size also do not raise an error???
180
+ input_dict = {
181
+ "input_ids": np.expand_dims(input_ids, 0),
182
+ "input_lengths": np.array([[input_len]], dtype=np.int32),
183
+ "request_output_len": np.array([[max_tokens]], dtype=np.int32),
184
+ "runtime_top_k": np.array([[1]], dtype=np.int32),
185
+ "end_id": np.array([[self.tokenizer.eos_token_id]], dtype=np.int32),
186
+ "pad_id": np.array([[self.tokenizer.pad_token_id]], dtype=np.int32),
187
+ "streaming": np.array([[0]], dtype=np.bool_),
188
+ **embedding_args,
189
+ }
190
+
191
+ print(input_ids)
192
+ for key, value in input_dict.items():
193
+ print(key, value.shape)
194
+
195
+ input_tensor_list = [pb_utils.Tensor(k, v) for k, v in input_dict.items()]
196
+ return input_tensor_list
197
+
198
+ def _prepare_llm_response(self, llm_request_inputs):
199
+ """
200
+ Prepares the response from the language model based on the provided
201
+ inputs. Creates a `pb_utils.InferenceRequest` object with passed
202
+ `llm_request_inputs` to send to a decoupled TensorRTLLM model.
203
+ For each response from the language model:
204
+ - Checks for errors and raises an exception if any are found.
205
+ - Extracts the "output_ids" tensor from the response.
206
+ - Determines the finish reason based on the presence of the
207
+ end-of-sequence token or reaching the maximum length.
208
+ - Appends the generated token IDs to `output_ids`.
209
+ - If the finish reason is determined, decodes the output IDs to text
210
+ and prepares the final response.
211
+
212
+ The final response includes the generated text, finish reason,
213
+ completion tokens, prompt tokens, and total tokens.
214
+
215
+ Parameters
216
+ ----------
217
+ - llm_request_inputs (dict): A dictionary containing the inputs for the language model.
218
+
219
+ Returns
220
+ -------
221
+ - pb_utils.InferenceResponse: The response object containing the generated text and additional metadata.
222
+ """
223
+
224
+ llm_request = pb_utils.InferenceRequest(
225
+ model_name="tensorrt_llm",
226
+ requested_output_names=["output_ids", "sequence_length"],
227
+ inputs=llm_request_inputs,
228
+ )
229
+ output_ids, output_len = [], 0
230
+ responses = llm_request.exec(decoupled=False)
231
+ responses = [responses]
232
+ for llm_response in responses:
233
+ if llm_response.has_error():
234
+ raise pb_utils.TritonModelException(llm_response.error().message())
235
+ stream_output_ids = (
236
+ pb_utils.get_output_tensor_by_name(llm_response, "output_ids")
237
+ .as_numpy()
238
+ .flatten()
239
+ .tolist()
240
+ )
241
+ finish_reason = "test"
242
+ if len(stream_output_ids) == 0 or (
243
+ len(stream_output_ids) != 0
244
+ and stream_output_ids[-1] == self.eos
245
+ ):
246
+ finish_reason = "stop"
247
+
248
+ output_ids += stream_output_ids
249
+
250
+ last_response = finish_reason != ""
251
+ output_len = len(output_ids)
252
+ if last_response:
253
+ print("final_output_ids", output_ids)
254
+ output_text = self.tokenizer.decode(output_ids).strip()
255
+ # print(output_text)
256
+ # output_text = re.sub(r'<\|.*?\|>', '', output_text)
257
+ response = pb_utils.InferenceResponse(
258
+ output_tensors=[
259
+ pb_utils.Tensor("TRANSCRIPTS", np.array([output_text], np.object_)),
260
+ ]
261
+ )
262
+ yield response
263
+
264
+ def _extract_speech_embeddings(self, mel):
265
+ return self.model.process_batch(mel)
266
+
267
+
268
+ def execute(self, requests):
269
+
270
+ responses = []
271
+
272
+ for request in requests:
273
+ wav = pb_utils.get_input_tensor_by_name(request, "WAV").as_numpy()
274
+ assert wav.shape[0] == 1, "Only support batch size 1"
275
+ # To support batch > 1
276
+ # cat mel,text_prompt, also, need to increase decoder_input_len as a triton input
277
+ wav = torch.from_numpy(wav[0]).to(self.device)
278
+ # mel shape [1, 80, 3000] for remove_input_padding=False
279
+ mel = self.feature_extractor.compute_feature(wav)
280
+ print("==========================================================")
281
+ messages = [
282
+ [
283
+ {"role": "user", "content": f"{DEFAULT_SPEECH_TOKEN}请转写音频为文字"},
284
+ {"role": "assistant", "content": ""},
285
+ ]
286
+ ] * len(mel)
287
+
288
+ input_ids, attention_mask = preprocess(messages, self.tokenizer, max_len=128)
289
+
290
+ generated_ids = self.model.decode(
291
+ mel, input_ids.to(self.device, dtype=torch.long), attention_mask.to(self.device)
292
+ )
293
+ print("pytorch model", generated_ids)
294
+ print("--------------------------------------------------------------------------")
295
+
296
+
297
+ speech_embeddings = self._extract_speech_embeddings(mel)
298
+ input_ids = self._tokenize()
299
+
300
+
301
+ if self.decoupled:
302
+ response_sender = request.get_response_sender()
303
+ try:
304
+
305
+ llm_request_inputs = self._prepare_inputs(
306
+ request, speech_embeddings, input_ids
307
+ )
308
+ if isinstance(llm_request_inputs, pb_utils.TritonError):
309
+ error = pb_utils.InferenceResponse(error=llm_request_inputs)
310
+ if self.decoupled:
311
+ response_sender.send(
312
+ error, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
313
+ )
314
+ else:
315
+ responses.append(error)
316
+ llm_responses = self._prepare_llm_response(llm_request_inputs)
317
+
318
+ for triton_response in llm_responses:
319
+ if self.decoupled:
320
+ response_sender.send(triton_response)
321
+ else:
322
+ responses.append(triton_response)
323
+
324
+ if self.decoupled:
325
+ response_sender.send(
326
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
327
+
328
+ except Exception:
329
+ self.logger.log_error(traceback.format_exc())
330
+ # If encountering an error, send a response with err msg
331
+ error_response = pb_utils.InferenceResponse(
332
+ output_tensors=[],
333
+ error=pb_utils.TritonError(traceback.format_exc()))
334
+
335
+ if self.decoupled:
336
+ response_sender.send(error_response)
337
+ response_sender.send(
338
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
339
+ else:
340
+ responses.append(error_response)
341
+
342
+ if self.decoupled:
343
+ return None
344
+ else:
345
+ assert len(responses) == len(requests)
346
+ return responses
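_prepare_inputs above relies on TensorRT-LLM's prompt-tuning (p-tuning) convention, which also answers the TODO about oversized input_ids: token ids at or above the base vocabulary size are not looked up in the normal word-embedding matrix; the runtime instead maps (id - vocab_size) into the per-request prompt_embedding_table. A conceptual sketch of that lookup in plain PyTorch, not the engine code; the sizes follow this model (vocab_size 151936, 187 speech tokens, hidden size 1536) and the random tensors are stand-ins:

import torch

def lookup_with_prompt_table(input_ids, word_embeddings, prompt_table, vocab_size):
    # ids < vocab_size index the normal embedding matrix,
    # ids >= vocab_size index row (id - vocab_size) of the prompt table.
    is_prompt = input_ids >= vocab_size
    text_ids = torch.where(is_prompt, torch.zeros_like(input_ids), input_ids)
    out = word_embeddings[text_ids]
    out[is_prompt] = prompt_table[input_ids[is_prompt] - vocab_size]
    return out

vocab_size, hidden = 151936, 1536
word_embeddings = torch.randn(vocab_size, hidden)   # stand-in for the Qwen2 embedding table
prompt_table = torch.randn(187, hidden)             # stand-in for the projected Whisper features
ids = torch.tensor([151644] + list(range(vocab_size, vocab_size + 187)) + [151645])
print(lookup_with_prompt_table(ids, word_embeddings, prompt_table, vocab_size).shape)
# torch.Size([189, 1536])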
model_repo_whisper_qwen_trtllm/whisper/0/whisper_trtllm.py ADDED
@@ -0,0 +1,278 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import json
16
+ from collections import OrderedDict
17
+ from pathlib import Path
18
+
19
+ import numpy as np
20
+ import torch
21
+ import torch.nn as nn
22
+ import tensorrt_llm
23
+ from tensorrt_llm.logger import logger
24
+ from tensorrt_llm._utils import (str_dtype_to_torch, str_dtype_to_trt,
25
+ trt_dtype_to_torch)
26
+ from tensorrt_llm.runtime import ModelConfig, SamplingConfig
27
+ from tensorrt_llm.runtime.session import Session, TensorInfo
28
+ from transformers.trainer_pt_utils import LabelSmoother
29
+ from transformers import AutoModelForCausalLM, AutoTokenizer
30
+ IGNORE_TOKEN_ID = LabelSmoother.ignore_index
31
+
32
+ DEFAULT_SPEECH_TOKEN = "<speech>"
33
+ def remove_tensor_padding(input_tensor, input_tensor_lengths=None, pad_value=0):
34
+ if input_tensor.dim() == 2:
35
+ # Text tensor case: batch, seq_len
36
+ assert torch.all(
37
+ input_tensor[:, 0] != pad_value
38
+ ), "First token in each sequence should not be pad_value"
39
+ assert input_tensor_lengths is None
40
+
41
+ # Create a mask for all non-pad tokens
42
+ mask = input_tensor != pad_value
43
+
44
+ # Apply the mask to input_tensor to remove pad tokens
45
+ output_tensor = input_tensor[mask].view(1, -1)
46
+
47
+ elif input_tensor.dim() == 3:
48
+ # Audio tensor case: batch, seq_len, feature_len
49
+ assert input_tensor_lengths is not None, "input_tensor_lengths must be provided for 3D input_tensor"
50
+ batch_size, seq_len, feature_len = input_tensor.shape
51
+
52
+ # Initialize a list to collect valid sequences
53
+ valid_sequences = []
54
+
55
+ for i in range(batch_size):
56
+ valid_length = input_tensor_lengths[i]
57
+ valid_sequences.append(input_tensor[i, :valid_length, :])
58
+
59
+ # Concatenate all valid sequences along the batch dimension
60
+ output_tensor = torch.cat(valid_sequences, dim=0)
61
+
62
+ else:
63
+ raise ValueError("Input tensor must have 2 or 3 dimensions")
64
+
65
+ return output_tensor
66
+
67
+ def read_config(component, engine_dir):
68
+ config_path = engine_dir / component / 'config.json'
69
+ with open(config_path, 'r') as f:
70
+ config = json.load(f)
71
+ model_config = OrderedDict()
72
+ model_config.update(config['pretrained_config'])
73
+ model_config.update(config['build_config'])
74
+ return model_config
75
+
76
+ class WhisperEncoding:
77
+ def __init__(self, engine_dir):
78
+ self.session = self.get_session(engine_dir)
79
+ config = read_config('encoder', engine_dir)
80
+ self.n_mels = config['n_mels']
81
+ self.dtype = config['dtype']
82
+ self.num_languages = config['num_languages']
83
+ self.encoder_config = config
84
+
85
+ def get_session(self, engine_dir):
86
+ serialize_path = engine_dir / 'encoder' / 'rank0.engine'
87
+ with open(serialize_path, 'rb') as f:
88
+ session = Session.from_serialized_engine(f.read())
89
+ return session
90
+
91
+ def get_audio_features(self,
92
+ mel):
93
+ mel_input_lengths = torch.tensor(
94
+ [mel.shape[2] for _ in range(mel.shape[0])],
95
+ dtype=torch.int32,
96
+ device=mel.device)
97
+ if self.encoder_config['plugin_config']['remove_input_padding']:
98
+ # mel B,D,T -> B,T,D -> BxT, D
99
+ mel = mel.transpose(1, 2)
100
+ mel = remove_tensor_padding(mel, mel_input_lengths)
101
+
102
+ inputs = OrderedDict()
103
+ inputs['input_features'] = mel
104
+ inputs['input_lengths'] = mel_input_lengths
105
+
106
+ output_list = [
107
+ TensorInfo('input_features', str_dtype_to_trt(self.dtype),
108
+ mel.shape),
109
+ TensorInfo('input_lengths', str_dtype_to_trt('int32'),
110
+ mel_input_lengths.shape)
111
+ ]
112
+
113
+ output_info = (self.session).infer_shapes(output_list)
114
+
115
+ logger.debug(f'output info {output_info}')
116
+ outputs = {
117
+ t.name: torch.empty(tuple(t.shape),
118
+ dtype=trt_dtype_to_torch(t.dtype),
119
+ device='cuda')
120
+ for t in output_info
121
+ }
122
+ stream = torch.cuda.current_stream()
123
+ ok = self.session.run(inputs=inputs,
124
+ outputs=outputs,
125
+ stream=stream.cuda_stream)
126
+ assert ok, 'Engine execution failed'
127
+ stream.synchronize()
128
+ encoder_output = outputs['encoder_output']
129
+ encoder_output_lengths = mel_input_lengths // 2
130
+
131
+ return encoder_output
132
+
133
+ class EncoderProjector(torch.nn.Module):
134
+ """
135
+ The encoder projector module. It is used to project the encoder outputs to the same dimension as the language model.
136
+ Modified from https://github.com/X-LANCE/SLAM-LLM/blob/main/src/slam_llm/models/projector.py.
137
+ Args:
138
+ encoder_dim (:obj:`int`): The dimension of the encoder outputs.
139
+ llm_dim (:obj:`int`): The dimension of the language model.
140
+ downsample_rate (:obj:`int`, `optional`, defaults to 8): The downsample rate to use.
141
+ """
142
+
143
+ def __init__(self, encoder_dim=1280, llm_dim=1536, downsample_rate=8):
144
+ super().__init__()
145
+ self.downsample_rate = downsample_rate
146
+ self.linear1 = nn.Linear(encoder_dim * self.downsample_rate, llm_dim)
147
+ self.relu = nn.ReLU()
148
+ self.linear2 = nn.Linear(llm_dim, llm_dim)
149
+
150
+ def forward(self, x):
151
+
152
+ batch_size, seq_len, feat_dim = x.size()
153
+ num_frames_to_discard = seq_len % self.downsample_rate
154
+ if num_frames_to_discard > 0:
155
+ x = x[:, :-num_frames_to_discard, :]
156
+ seq_len = x.size(1)
157
+
158
+ x = x.contiguous()
159
+ x = x.view(
160
+ batch_size, seq_len // self.downsample_rate, feat_dim * self.downsample_rate
161
+ )
162
+
163
+ x = self.linear1(x)
164
+ x = self.relu(x)
165
+ x = self.linear2(x)
166
+ return x
167
+
168
+ class SPEECH_LLM(nn.Module):
169
+ """
170
+ The Speech-to-Text model. It consists of an encoder, a language model and an encoder projector.
171
+ The encoder is used to extract speech features from the input speech signal.
172
+ The encoder projector is used to project the encoder outputs to the same dimension as the language model.
173
+ The language model is used to generate the text from the speech features.
174
+ Args:
175
+ encoder (:obj:`nn.Module`): The encoder module.
176
+ llm (:obj:`nn.Module`): The language model module.
177
+ encoder_projector (:obj:`nn.Module`): The encoder projector module.
178
+ """
179
+
180
+ def __init__(
181
+ self,
182
+ encoder: nn.Module,
183
+ llm: nn.Module,
184
+ encoder_projector: nn.Module,
185
+ ):
186
+ super().__init__()
187
+ self.encoder = encoder
188
+ self.llm = llm
189
+ self.encoder_projector = encoder_projector
190
+
191
+ class WhisperTRTLLM(nn.Module):
192
+
193
+ def __init__(self, engine_dir):
194
+ super().__init__()
195
+ world_size = 1
196
+ runtime_rank = tensorrt_llm.mpi_rank()
197
+ runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank)
198
+ torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node)
199
+ engine_dir = Path(engine_dir)
200
+
201
+ self.encoder = WhisperEncoding(engine_dir)
202
+ self.encoder_projector = EncoderProjector()
203
+ self.encoder_projector = self.encoder_projector.half().to("cuda")
204
+
205
+ llm = AutoModelForCausalLM.from_pretrained(
206
+ "/home/scratch.yuekaiz_wwfo_1/Qwen2_1.5B_merged",
207
+ attn_implementation="flash_attention_2",
208
+ torch_dtype=torch.float16,
209
+ )
210
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")
211
+ tokenizer.padding_side = "left"
212
+ special_tokens_dict = {"additional_special_tokens": [DEFAULT_SPEECH_TOKEN]}
213
+ tokenizer.add_special_tokens(special_tokens_dict)
214
+ llm.config.pad_token_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
215
+ llm.config.bos_token_id = tokenizer.convert_tokens_to_ids("<|im_start|>")
216
+ llm.config.eos_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
217
+
218
+ llm.config.default_speech_token_id = tokenizer.convert_tokens_to_ids(
219
+ DEFAULT_SPEECH_TOKEN
220
+ )
221
+ self.llm = llm.half().to("cuda")
222
+ # print llm embedding layer shape
223
+ print("llm embedding layer shape", self.llm.get_input_embeddings().weight.shape)
224
+
225
+
226
+
227
+ def process_batch(
228
+ self,
229
+ mel,
230
+ decoder_input_ids=None,
231
+ eot_id=50257,
232
+ max_new_tokens=96,
233
+ num_beams=1):
234
+ encoder_outputs = self.encoder.get_audio_features(mel)
235
+ speech_features = self.encoder_projector(encoder_outputs)
236
+ speech_features = speech_features.to(torch.float16)
237
+ # [1,187,1536]
238
+ return speech_features
239
+
240
+
241
+ def decode(
242
+ self,
243
+ fbank: torch.Tensor = None,
244
+ input_ids: torch.LongTensor = None,
245
+ attention_mask: torch.Tensor = None,
246
+ **kwargs,
247
+ ):
248
+
249
+ encoder_outs = self.encoder.get_audio_features(fbank)
250
+ speech_features = self.encoder_projector(encoder_outs)
251
+ speech_features = speech_features.to(torch.float16)
252
+ inputs_embeds = self.llm.get_input_embeddings()(input_ids)
253
+ speech_token_index = input_ids.tolist()[0].index(151646)
254
+ print("speech_token_index", speech_token_index, "speech_features_shape", speech_features.shape, "input_ids_shape", input_ids.shape, "inputs_embeds_shape", inputs_embeds.shape)
255
+
256
+ new_length = inputs_embeds.shape[1] + speech_features.shape[1] - 1
257
+ new_inputs_embeds = torch.zeros(1, new_length, 1536).to(inputs_embeds.device).half()
258
+ new_inputs_embeds[:, :3, :] = inputs_embeds[:, :3, :]
259
+ new_inputs_embeds[:, 3:3 + 187, :] = speech_features
260
+ new_inputs_embeds[:, 3 + 187:, :] = inputs_embeds[:, 4:, :]
261
+
262
+ inputs_embeds = new_inputs_embeds
263
+ generated_ids = self.llm.generate(
264
+ inputs_embeds=inputs_embeds,
265
+ max_new_tokens=kwargs.get("max_new_tokens", 200),
266
+ num_beams=kwargs.get("num_beams", 1),
267
+ do_sample=kwargs.get("do_sample", False),
268
+ min_length=kwargs.get("min_length", 1),
269
+ top_p=kwargs.get("top_p", 1.0),
270
+ repetition_penalty=kwargs.get("repetition_penalty", 1.0),
271
+ length_penalty=kwargs.get("length_penalty", 1.0),
272
+ temperature=kwargs.get("temperature", 1.0),
273
+ bos_token_id=self.llm.config.bos_token_id,
274
+ eos_token_id=self.llm.config.eos_token_id,
275
+ pad_token_id=self.llm.config.pad_token_id,
276
+ )
277
+
278
+ return generated_ids
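The hard-coded 187 and 1536 used elsewhere in this repo follow directly from the projector above: a 30 s Whisper segment produces 1500 encoder frames, the projector drops 1500 % 8 = 4 frames, stacks every 8 neighbouring frames (1496 / 8 = 187 speech tokens of width 1280 * 8 = 10240), and the two linear layers map each token to the Qwen2-1.5B hidden size of 1536. A quick shape check with random data:

import torch
# EncoderProjector is the class defined above in this file

proj = EncoderProjector(encoder_dim=1280, llm_dim=1536, downsample_rate=8)
enc_out = torch.randn(1, 1500, 1280)   # Whisper encoder output for one 30 s segment
print(proj(enc_out).shape)             # torch.Size([1, 187, 1536])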
model_repo_whisper_qwen_trtllm/whisper/1/__pycache__/fbank.cpython-310.pyc ADDED
Binary file (3.07 kB).
 
model_repo_whisper_qwen_trtllm/whisper/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (10.4 kB).
 
model_repo_whisper_qwen_trtllm/whisper/1/__pycache__/whisper_trtllm.cpython-310.pyc ADDED
Binary file (6.2 kB).
 
model_repo_whisper_qwen_trtllm/whisper/1/fbank.py ADDED
@@ -0,0 +1,91 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # Reference: https://github.com/openai/whisper/blob/main/whisper/audio.py
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from typing import Union
19
+ import os
20
+
21
+ def mel_filters(device, n_mels: int = 128) -> torch.Tensor:
22
+ """
23
+ load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
24
+ Allows decoupling librosa dependency; saved using:
25
+
26
+ np.savez_compressed(
27
+ "mel_filters.npz",
28
+ mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
29
+ )
30
+ """
31
+ assert n_mels in (80, 128), f"Unsupported n_mels: {n_mels}"
32
+ with np.load(
33
+ os.path.join(os.path.dirname(__file__), "mel_filters.npz")
34
+ ) as f:
35
+ return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
36
+
37
+
38
+ def log_mel_spectrogram(
39
+ audio: torch.Tensor,
40
+ filters: torch.Tensor,
41
+ n_mels: int = 128,
42
+ n_fft: int = 400,
43
+ hop_length: int = 160,
44
+ ):
45
+ """
46
+ Compute the log-Mel spectrogram of the input audio.
47
+
48
+ Parameters
49
+ ----------
50
+ audio: torch.Tensor, shape = (*)
51
+ A Tensor containing the audio waveform, sampled at 16 kHz
52
+
53
+ n_mels: int
54
+ The number of Mel-frequency filters, only 80 or 128 is supported
55
+
56
+ filters: torch.Tensor
57
+
58
+ Returns
59
+ -------
60
+ torch.Tensor, shape = (128, n_frames)
61
+ A Tensor that contains the Mel spectrogram
62
+ """
63
+ window = torch.hann_window(n_fft).to(audio.device)
64
+ stft = torch.stft(audio, n_fft, hop_length, window=window, return_complex=True)
65
+ magnitudes = stft[..., :-1].abs() ** 2
66
+
67
+ mel_spec = filters @ magnitudes
68
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
69
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
70
+ log_spec = (log_spec + 4.0) / 4.0
71
+ # cast to float 16
72
+ log_spec = log_spec.half()
73
+ return log_spec
74
+
75
+ class FeatureExtractor(torch.nn.Module):
76
+ """Your Python model must use the same class name. Every Python model
77
+ that is created must have "TritonPythonModel" as the class name.
78
+ """
79
+
80
+ def __init__(self, n_mels: int = 128):
81
+ self.device = torch.device("cuda")
82
+ self.n_mels = n_mels
83
+ self.filters = mel_filters(self.device, n_mels=self.n_mels)
84
+
85
+ def compute_feature(self, wav, target: int = 3000):
86
+ mel = log_mel_spectrogram(wav, self.filters)
87
+ assert mel.shape[1] <= target, f"{mel.shape[1]} > {target}, audio is too long"
88
+ if mel.shape[1] < target:
89
+ mel = F.pad(mel, (0, target - mel.shape[1]), mode='constant')
90
+ mel = mel.unsqueeze(0)
91
+ return mel
model_repo_whisper_qwen_trtllm/whisper/1/mel_filters.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7450ae70723a5ef9d341e3cee628c7cb0177f36ce42c44b7ed2bf3325f0f6d4c
3
+ size 4271
model_repo_whisper_qwen_trtllm/whisper/1/model.py ADDED
@@ -0,0 +1,318 @@
1
+ # -*- coding: utf-8 -*-
2
+ import triton_python_backend_utils as pb_utils
3
+ import numpy as np
4
+ import json
5
+ import torch
6
+ from torch.utils.dlpack import from_dlpack, to_dlpack
7
+ import re
8
+ import transformers
9
+ from transformers import AutoTokenizer
10
+ from typing import Dict
11
+ from pathlib import Path
12
+ import traceback
13
+
14
+ from .whisper_trtllm import WhisperTRTLLM
15
+ from .fbank import FeatureExtractor
16
+
17
+ DEFAULT_SPEECH_TOKEN = "<speech>"
18
+ def preprocess(
19
+ messages,
20
+ tokenizer: transformers.PreTrainedTokenizer,
21
+ max_len: int = 128,
22
+ ) -> Dict:
23
+ """Preprocesses the data for supervised fine-tuning."""
24
+ texts = []
25
+ TEMPLATE = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if loop.last %}{{''}}{% else %}{{ '<|im_end|>\n' }}{% endif %}{% endfor %}"
26
+ for i, msg in enumerate(messages):
27
+ texts.append(
28
+ tokenizer.apply_chat_template(
29
+ msg,
30
+ tokenize=True,
31
+ add_generation_prompt=False,
32
+ chat_template=TEMPLATE,
33
+ padding="longest",
34
+ max_length=max_len,
35
+ truncation=True,
36
+ )
37
+ )
38
+ max_len_texts = max([len(text) for text in texts])
39
+ if tokenizer.padding_side == "right":
40
+ texts = [
41
+ text + [tokenizer.pad_token_id] * (max_len_texts - len(text))
42
+ for text in texts
43
+ ]
44
+ else:
45
+ texts = [
46
+ [tokenizer.pad_token_id] * (max_len_texts - len(text)) + text
47
+ for text in texts
48
+ ]
49
+
50
+ input_ids = torch.tensor(texts, dtype=torch.int)
51
+
52
+ attention_mask = input_ids.ne(tokenizer.pad_token_id)
53
+
54
+ return input_ids, attention_mask
55
+
56
+ class TritonPythonModel:
57
+ """Your Python model must use the same class name. Every Python model
58
+ that is created must have "TritonPythonModel" as the class name.
59
+ """
60
+
61
+ def initialize(self, args):
62
+ """`initialize` is called only once when the model is being loaded.
63
+ Implementing `initialize` function is optional. This function allows
64
+ the model to initialize any state associated with this model.
65
+
66
+ Parameters
67
+ ----------
68
+ args : dict
69
+ Both keys and values are strings. The dictionary keys and values are:
70
+ * model_config: A JSON string containing the model configuration
71
+ * model_instance_kind: A string containing model instance kind
72
+ * model_instance_device_id: A string containing model instance device ID
73
+ * model_repository: Model repository path
74
+ * model_version: Model version
75
+ * model_name: Model name
76
+ """
77
+ self.model_config = model_config = json.loads(args['model_config'])
78
+
79
+ # Get OUTPUT0 configuration
80
+ output0_config = pb_utils.get_output_config_by_name(
81
+ model_config, "TRANSCRIPTS")
82
+ # Convert Triton types to numpy types
83
+ self.out0_dtype = pb_utils.triton_string_to_numpy(
84
+ output0_config['data_type'])
85
+
86
+ #self.tokenizer = get_tokenizer(num_languages=100)
87
+ #self.blank = self.tokenizer.encode(" ", allowed_special=self.tokenizer.special_tokens_set)[0]
88
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")
89
+ tokenizer.padding_side = "left"
90
+ special_tokens_dict = {"additional_special_tokens": [DEFAULT_SPEECH_TOKEN]}
91
+ tokenizer.add_special_tokens(special_tokens_dict)
92
+ self.tokenizer = tokenizer
93
+ self.eos = self.tokenizer.eos_token_id
94
+ self.default_speech_token_id = tokenizer.convert_tokens_to_ids(
95
+ DEFAULT_SPEECH_TOKEN
96
+ )
97
+ self.vocab_size = 151936
98
+
99
+ self.device = torch.device("cuda")
100
+ self.decoupled = False
101
+ self.logger = pb_utils.Logger
102
+ self.init_model(self.model_config['parameters'])
103
+
104
+ def init_model(self, parameters):
105
+ for key,value in parameters.items():
106
+ parameters[key] = value["string_value"]
107
+ engine_dir = parameters["engine_dir"]
108
+ n_mels = int(parameters["n_mels"])
109
+ adapter_dir="/home/scratch.yuekaiz_wwfo_1/icefall_asr_multi-hans_whisper_qwen2_1.5B/epoch-2-avg-6.pt"
110
+ checkpoint = torch.load(
111
+ adapter_dir, map_location="cpu"
112
+ )
113
+ self.model = WhisperTRTLLM(engine_dir)
114
+ missing_keys, _ = self.model.load_state_dict(checkpoint, strict=False)
115
+ print(f"Missing keys: {missing_keys}")
116
+ self.feature_extractor = FeatureExtractor(n_mels=n_mels)
117
+
118
+ def _tokenize(self, prompt=None, num_speech_tokens=187):
119
+ if prompt is None:
120
+ prompts = [
121
+ [
122
+ {"role": "user", "content": f"{DEFAULT_SPEECH_TOKEN}请转写音频为文字"},
123
+ {"role": "assistant", "content": ""},
124
+ ]
125
+ ]
126
+
127
+ input_ids, _ = preprocess(prompts, self.tokenizer, max_len=128)
128
+ print("prompt input_ids:", input_ids)
129
+ input_ids = input_ids.tolist()[0]
130
+ speech_token_index = input_ids.index(self.default_speech_token_id)
131
+ # replace 151646 with list(range(self.vocab_size, self.vocab_size + num_speech_tokens))
132
+ prompt_ids = input_ids[:speech_token_index] + list(range(self.vocab_size, self.vocab_size + num_speech_tokens)) + input_ids[speech_token_index + 1:]
133
+ print(prompt_ids)
134
+ return prompt_ids
135
+
136
+ def _prepare_inputs(self, request, speech_embeddings, input_ids):
137
+ """
138
+ Prepares inputs for the language model based on the parameters in the
139
+ request, the speech embeddings, and the prompt. It tokenizes the prompt,
140
+ extracts and processes additional parameters from the request:
141
+ - max_tokens: Maximum number of tokens to generate (default: 50)
142
+ - temperature: Controls randomness in generation (default: 0.5)
143
+ - top_k: Top K sampling parameter (default: 1)
144
+ - frequency_penalty: Penalizes frequent tokens (default: 0.7)
145
+ - seed: Random seed for generation (default: 10)
146
+
147
+ Final llm input dictionary is combined out of all processed parameters,
148
+ the prompt's tokens and the speech embeddings. The latter are passed to the llm
149
+ through `prompt_embedding_table`.
150
+
151
+ Parameters
152
+ ----------
153
+ - request: The original request object containing additional parameters.
154
+ - speech_embeddings (torch.Tensor): The projected Whisper encoder outputs.
155
+ - input_ids (list): The tokenized prompt, with placeholder ids reserved for the speech tokens.
156
+
157
+ Returns
158
+ -------
159
+ - dict: A dictionary containing all the prepared inputs for the language model.
160
+ """
161
+ input_ids = np.array(input_ids, dtype=np.int32)
162
+ max_tokens = 50
163
+ input_len = input_ids.shape[0]
164
+ print("speech_embeddings shape:", speech_embeddings.shape)
165
+ assert speech_embeddings.shape[1] == 187, "Only support 187 speech tokens"
166
+ embedding_args = {
167
+ "prompt_vocab_size": np.array(
168
+ [[speech_embeddings.shape[1]]], dtype=np.int32
169
+ ),
170
+ "prompt_embedding_table": speech_embeddings.detach().cpu().numpy(),
171
+ }
172
+
173
+ input_dict = {
174
+ "input_ids": np.expand_dims(input_ids, 0),
175
+ "input_lengths": np.array([[input_len]], dtype=np.int32),
176
+ "request_output_len": np.array([[max_tokens]], dtype=np.int32),
177
+ "end_id": np.array([[self.tokenizer.eos_token_id]], dtype=np.int32),
178
+ "streaming": np.array([[0]], dtype=np.bool_),
179
+ **embedding_args,
180
+ }
181
+
182
+ input_tensor_list = [pb_utils.Tensor(k, v) for k, v in input_dict.items()]
183
+ return input_tensor_list
184
+
185
+ def _prepare_llm_response(self, llm_request_inputs):
186
+ """
187
+ Prepares the response from the language model based on the provided
188
+ inputs. Creates a `pb_utils.InferenceRequest` object with passed
189
+ `llm_request_inputs` to send to a decoupled TensorRTLLM model.
190
+ For each response from the language model:
191
+ - Checks for errors and raises an exception if any are found.
192
+ - Extracts the "output_ids" tensor from the response.
193
+ - Determines the finish reason based on the presence of the
194
+ end-of-sequence token or reaching the maximum length.
195
+ - Appends the generated token IDs to `output_ids`.
196
+ - If the finish reason is determined, decodes the output IDs to text
197
+ and prepares the final response.
198
+
199
+ The final response includes the generated text, finish reason,
200
+ completion tokens, prompt tokens, and total tokens.
201
+
202
+ Parameters
203
+ ----------
204
+ - llm_request_inputs (dict): A dictionary containing the inputs for the language model.
205
+
206
+ Returns
207
+ -------
208
+ - pb_utils.InferenceResponse: The response object containing the generated text and additional metadata.
209
+ """
210
+
211
+ llm_request = pb_utils.InferenceRequest(
212
+ model_name="tensorrt_llm",
213
+ requested_output_names=["output_ids", "sequence_length"],
214
+ inputs=llm_request_inputs,
215
+ )
216
+ output_ids, output_len = [], 0
217
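+ # execute the request synchronously against the "tensorrt_llm" model (decoupled streaming is not used here)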
+ responses = llm_request.exec(decoupled=False)
218
+ responses = [responses]
219
+ for llm_response in responses:
220
+ if llm_response.has_error():
221
+ raise pb_utils.TritonModelException(llm_response.error().message())
222
+ stream_output_ids = (
223
+ pb_utils.get_output_tensor_by_name(llm_response, "output_ids")
224
+ .as_numpy()
225
+ .flatten()
226
+ .tolist()
227
+ )
228
+ finish_reason = "test"
229
+ if len(stream_output_ids) == 0 or (
230
+ len(stream_output_ids) != 0
231
+ and stream_output_ids[-1] == self.eos
232
+ ):
233
+ finish_reason = "stop"
234
+
235
+ output_ids += stream_output_ids
236
+
237
+ last_response = finish_reason != ""
238
+ output_len = len(output_ids)
239
+ if last_response:
240
+ print(output_ids)
241
+ output_text = self.tokenizer.decode(output_ids).strip()
242
+ # print(output_text)
243
+ # output_text = re.sub(r'<\|.*?\|>', '', output_text)
244
+ response = pb_utils.InferenceResponse(
245
+ output_tensors=[
246
+ pb_utils.Tensor("TRANSCRIPTS", np.array([output_text], np.object_)),
247
+ ]
248
+ )
249
+ yield response
250
+
251
+ def _extract_speech_embeddings(self, mel):
252
+ return self.model.process_batch(mel)
253
+
254
+
255
+ def execute(self, requests):
256
+
257
+ responses = []
258
+
259
+ for request in requests:
260
+ wav = pb_utils.get_input_tensor_by_name(request, "WAV").as_numpy()
261
+ assert wav.shape[0] == 1, "Only support batch size 1"
262
+ # To support batch > 1
263
+ # cat mel,text_prompt, also, need to increase decoder_input_len as a triton input
264
+ wav = torch.from_numpy(wav[0]).to(self.device)
265
+ # mel shape [1, 80, 3000] for remove_input_padding=False
266
+ mel = self.feature_extractor.compute_feature(wav)
267
+
268
+ speech_embeddings = self._extract_speech_embeddings(mel)
269
+ print(speech_embeddings.shape)
270
+ input_ids = self._tokenize()
271
+ print(input_ids)
272
+
273
+ if self.decoupled:
274
+ response_sender = request.get_response_sender()
275
+ try:
276
+
277
+ llm_request_inputs = self._prepare_inputs(
278
+ request, speech_embeddings, input_ids
279
+ )
280
+ if isinstance(llm_request_inputs, pb_utils.TritonError):
281
+ error = pb_utils.InferenceResponse(error=llm_request_inputs)
282
+ if self.decoupled:
283
+ response_sender.send(
284
+ error, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
285
+ )
286
+ else:
287
+ responses.append(error)
288
+ llm_responses = self._prepare_llm_response(llm_request_inputs)
289
+
290
+ for triton_response in llm_responses:
291
+ if self.decoupled:
292
+ response_sender.send(triton_response)
293
+ else:
294
+ responses.append(triton_response)
295
+
296
+ if self.decoupled:
297
+ response_sender.send(
298
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
299
+
300
+ except Exception:
301
+ self.logger.log_error(traceback.format_exc())
302
+ # If encountering an error, send a response with err msg
303
+ error_response = pb_utils.InferenceResponse(
304
+ output_tensors=[],
305
+ error=pb_utils.TritonError(traceback.format_exc()))
306
+
307
+ if self.decoupled:
308
+ response_sender.send(error_response)
309
+ response_sender.send(
310
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
311
+ else:
312
+ responses.append(error_response)
313
+
314
+ if self.decoupled:
315
+ return None
316
+ else:
317
+ assert len(responses) == len(requests)
318
+ return responses
model_repo_whisper_qwen_trtllm/whisper/1/whisper_trtllm.py ADDED
@@ -0,0 +1,212 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import json
16
+ from collections import OrderedDict
17
+ from pathlib import Path
18
+
19
+ import numpy as np
20
+ import torch
21
+ import torch.nn as nn
22
+ import tensorrt_llm
23
+ import tensorrt_llm.logger as logger
24
+ from tensorrt_llm._utils import (str_dtype_to_torch, str_dtype_to_trt,
25
+ trt_dtype_to_torch)
26
+ from tensorrt_llm.runtime import ModelConfig, SamplingConfig
27
+ from tensorrt_llm.runtime.session import Session, TensorInfo
28
+
29
+ def remove_tensor_padding(input_tensor, input_tensor_lengths=None, pad_value=0):
30
+ if input_tensor.dim() == 2:
31
+ # Text tensor case: batch, seq_len
32
+ assert torch.all(
33
+ input_tensor[:, 0] != pad_value
34
+ ), "First token in each sequence should not be pad_value"
35
+ assert input_tensor_lengths is None
36
+
37
+ # Create a mask for all non-pad tokens
38
+ mask = input_tensor != pad_value
39
+
40
+ # Apply the mask to input_tensor to remove pad tokens
41
+ output_tensor = input_tensor[mask].view(1, -1)
42
+
43
+ elif input_tensor.dim() == 3:
44
+ # Audio tensor case: batch, seq_len, feature_len
45
+ assert input_tensor_lengths is not None, "input_tensor_lengths must be provided for 3D input_tensor"
46
+ batch_size, seq_len, feature_len = input_tensor.shape
47
+
48
+ # Initialize a list to collect valid sequences
49
+ valid_sequences = []
50
+
51
+ for i in range(batch_size):
52
+ valid_length = input_tensor_lengths[i]
53
+ valid_sequences.append(input_tensor[i, :valid_length, :])
54
+
55
+ # Concatenate all valid sequences along the batch dimension
56
+ output_tensor = torch.cat(valid_sequences, dim=0)
57
+
58
+ else:
59
+ raise ValueError("Input tensor must have 2 or 3 dimensions")
60
+
61
+ return output_tensor
62
+
63
+ def read_config(component, engine_dir):
64
+ config_path = engine_dir / component / 'config.json'
65
+ with open(config_path, 'r') as f:
66
+ config = json.load(f)
67
+ model_config = OrderedDict()
68
+ model_config.update(config['pretrained_config'])
69
+ model_config.update(config['build_config'])
70
+ return model_config
71
+
72
+ class WhisperEncoding:
73
+ def __init__(self, engine_dir):
74
+ self.session = self.get_session(engine_dir)
75
+ config = read_config('encoder', engine_dir)
76
+ self.n_mels = config['n_mels']
77
+ self.dtype = config['dtype']
78
+ self.num_languages = config['num_languages']
79
+ self.encoder_config = config
80
+
81
+ def get_session(self, engine_dir):
82
+ serialize_path = engine_dir / 'encoder' / 'rank0.engine'
83
+ with open(serialize_path, 'rb') as f:
84
+ session = Session.from_serialized_engine(f.read())
85
+ return session
86
+
87
+ def get_audio_features(self,
88
+ mel):
89
+ mel_input_lengths = torch.tensor(
90
+ [mel.shape[2] for _ in range(mel.shape[0])],
91
+ dtype=torch.int32,
92
+ device=mel.device)
93
+ if self.encoder_config['plugin_config']['remove_input_padding']:
94
+ # mel B,D,T -> B,T,D -> BxT, D
95
+ mel = mel.transpose(1, 2)
96
+ mel = remove_tensor_padding(mel, mel_input_lengths)
97
+
98
+ inputs = OrderedDict()
99
+ inputs['input_features'] = mel
100
+ inputs['input_lengths'] = mel_input_lengths
101
+
102
+ output_list = [
103
+ TensorInfo('input_features', str_dtype_to_trt(self.dtype),
104
+ mel.shape),
105
+ TensorInfo('input_lengths', str_dtype_to_trt('int32'),
106
+ mel_input_lengths.shape)
107
+ ]
108
+
109
+ output_info = (self.session).infer_shapes(output_list)
110
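+ # infer_shapes resolves the engine's output shapes so the CUDA output buffers can be pre-allocated below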
+
111
+ logger.debug(f'output info {output_info}')
112
+ outputs = {
113
+ t.name: torch.empty(tuple(t.shape),
114
+ dtype=trt_dtype_to_torch(t.dtype),
115
+ device='cuda')
116
+ for t in output_info
117
+ }
118
+ stream = torch.cuda.current_stream()
119
+ ok = self.session.run(inputs=inputs,
120
+ outputs=outputs,
121
+ stream=stream.cuda_stream)
122
+ assert ok, 'Engine execution failed'
123
+ stream.synchronize()
124
+ encoder_output = outputs['encoder_output']
125
+ encoder_output_lengths = mel_input_lengths // 2
126
+
127
+ return encoder_output
128
+
129
+ class EncoderProjector(torch.nn.Module):
130
+ """
131
+ The encoder projector module. It is used to project the encoder outputs to the same dimension as the language model.
132
+ Modified from https://github.com/X-LANCE/SLAM-LLM/blob/main/src/slam_llm/models/projector.py.
133
+ Args:
134
+ encoder_dim (:obj:`int`): The dimension of the encoder outputs.
135
+ llm_dim (:obj:`int`): The dimension of the language model.
136
+ downsample_rate (:obj:`int`, `optional`, defaults to 8): The downsample rate to use.
137
+ """
138
+
139
+ def __init__(self, encoder_dim=1280, llm_dim=1536, downsample_rate=8):
140
+ super().__init__()
141
+ self.downsample_rate = downsample_rate
142
+ self.linear1 = nn.Linear(encoder_dim * self.downsample_rate, llm_dim)
143
+ self.relu = nn.ReLU()
144
+ self.linear2 = nn.Linear(llm_dim, llm_dim)
145
+
146
+ def forward(self, x):
147
+
148
+ batch_size, seq_len, feat_dim = x.size()
149
+ num_frames_to_discard = seq_len % self.downsample_rate
150
+ if num_frames_to_discard > 0:
151
+ x = x[:, :-num_frames_to_discard, :]
152
+ seq_len = x.size(1)
153
+
154
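+ # concatenate every downsample_rate consecutive frames along the feature dim: (B, T, D) -> (B, T // r, D * r)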
+ x = x.contiguous()
155
+ x = x.view(
156
+ batch_size, seq_len // self.downsample_rate, feat_dim * self.downsample_rate
157
+ )
158
+
159
+ x = self.linear1(x)
160
+ x = self.relu(x)
161
+ x = self.linear2(x)
162
+ return x
163
+
164
+ # class SPEECH_LLM(nn.Module):
165
+ # """
166
+ # The Speech-to-Text model. It consists of an encoder, a language model and an encoder projector.
167
+ # The encoder is used to extract speech features from the input speech signal.
168
+ # The encoder projector is used to project the encoder outputs to the same dimension as the language model.
169
+ # The language model is used to generate the text from the speech features.
170
+ # Args:
171
+ # encoder (:obj:`nn.Module`): The encoder module.
172
+ # llm (:obj:`nn.Module`): The language model module.
173
+ # encoder_projector (:obj:`nn.Module`): The encoder projector module.
174
+ # """
175
+
176
+ # def __init__(
177
+ # self,
178
+ # encoder: nn.Module = None,
179
+ # llm: nn.Module = None,
180
+ # encoder_projector: nn.Module = None,
181
+ # ):
182
+ # super().__init__()
183
+ # self.encoder = encoder
184
+ # self.llm = llm
185
+ # self.encoder_projector = encoder_projector
186
+
187
+ class WhisperTRTLLM(nn.Module):
188
+
189
+ def __init__(self, engine_dir):
190
+ super().__init__()
191
+ world_size = 1
192
+ runtime_rank = tensorrt_llm.mpi_rank()
193
+ runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank)
194
+ torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node)
195
+ engine_dir = Path(engine_dir)
196
+
197
+ self.encoder = WhisperEncoding(engine_dir)
198
+ self.encoder_projector = EncoderProjector()
199
+ self.encoder_projector = self.encoder_projector.half().to("cuda")
200
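+ # the projector weights are loaded later from the adapter checkpoint in model.py (load_state_dict with strict=False)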
+
201
+ def process_batch(
202
+ self,
203
+ mel,
204
+ decoder_input_ids=None,
205
+ eot_id=50257,
206
+ max_new_tokens=96,
207
+ num_beams=1):
208
+ encoder_outputs = self.encoder.get_audio_features(mel)
209
+ speech_features = self.encoder_projector(encoder_outputs)
210
+ speech_features = speech_features.to(torch.float16)
211
+ print("speech_features shape:", speech_features.shape)
212
+ return speech_features
model_repo_whisper_qwen_trtllm/whisper/2/__pycache__/fbank.cpython-310.pyc ADDED
Binary file (3.07 kB). View file
 
model_repo_whisper_qwen_trtllm/whisper/2/__pycache__/model.cpython-310.pyc ADDED
Binary file (10.4 kB). View file
 
model_repo_whisper_qwen_trtllm/whisper/2/__pycache__/whisper_trtllm.cpython-310.pyc ADDED
Binary file (7.37 kB). View file
 
model_repo_whisper_qwen_trtllm/whisper/2/fbank.py ADDED
@@ -0,0 +1,91 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # Reference: https://github.com/openai/whisper/blob/main/whisper/audio.py
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from typing import Union
19
+ import os
20
+
21
+ def mel_filters(device, n_mels: int = 128) -> torch.Tensor:
22
+ """
23
+ load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
24
+ Allows decoupling librosa dependency; saved using:
25
+
26
+ np.savez_compressed(
27
+ "mel_filters.npz",
28
+ mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
29
+ )
30
+ """
31
+ assert n_mels == 80 or n_mels == 128 , f"Unsupported n_mels: {n_mels}"
32
+ with np.load(
33
+ os.path.join(os.path.dirname(__file__), "mel_filters.npz")
34
+ ) as f:
35
+ return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
36
+
37
+
38
+ def log_mel_spectrogram(
39
+ audio: torch.Tensor,
40
+ filters: torch.Tensor,
41
+ n_mels: int = 128,
42
+ n_fft: int = 400,
43
+ hop_length: int = 160,
44
+ ):
45
+ """
46
+ Compute the log-Mel spectrogram of an audio waveform.
47
+
48
+ Parameters
49
+ ----------
50
+ audio: torch.Tensor, shape = (*)
51
+ A Tensor containing the audio waveform, sampled at 16 kHz
52
+
53
+ n_mels: int
54
+ The number of Mel-frequency filters, only 80 or 128 is supported
55
+
56
+ filters: torch.Tensor
57
+
58
+ Returns
59
+ -------
60
+ torch.Tensor, shape = (n_mels, n_frames)
61
+ A Tensor that contains the Mel spectrogram
62
+ """
63
+ window = torch.hann_window(n_fft).to(audio.device)
64
+ stft = torch.stft(audio, n_fft, hop_length, window=window, return_complex=True)
65
+ magnitudes = stft[..., :-1].abs() ** 2
66
+
67
+ mel_spec = filters @ magnitudes
68
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
69
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
70
+ log_spec = (log_spec + 4.0) / 4.0
71
+ # cast to float 16
72
+ log_spec = log_spec.half()
73
+ return log_spec
74
+
75
+ class FeatureExtractor(torch.nn.Module):
76
+ """Your Python model must use the same class name. Every Python model
77
+ that is created must have "TritonPythonModel" as the class name.
78
+ """
79
+
80
+ def __init__(self, n_mels: int = 128):
81
+ self.device = torch.device("cuda")
82
+ self.n_mels = n_mels
83
+ self.filters = mel_filters(self.device, n_mels=self.n_mels)
84
+
85
+ def compute_feature(self, wav, target: int = 3000):
86
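+ # target=3000 frames corresponds to Whisper's fixed 30 s window (10 ms hop at 16 kHz); shorter audio is zero-padded below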
+ mel = log_mel_spectrogram(wav, self.filters)
87
+ assert mel.shape[1] <= target, f"{mel.shape[1]} > {target}, audio is too long"
88
+ if mel.shape[1] < target:
89
+ mel = F.pad(mel, (0, target - mel.shape[1]), mode='constant')
90
+ mel = mel.unsqueeze(0)
91
+ return mel
model_repo_whisper_qwen_trtllm/whisper/2/mel_filters.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7450ae70723a5ef9d341e3cee628c7cb0177f36ce42c44b7ed2bf3325f0f6d4c
3
+ size 4271
model_repo_whisper_qwen_trtllm/whisper/2/model.py ADDED
@@ -0,0 +1,346 @@
1
+ # -*- coding: utf-8 -*-
2
+ import triton_python_backend_utils as pb_utils
3
+ import numpy as np
4
+ import json
5
+ import torch
6
+ from torch.utils.dlpack import from_dlpack, to_dlpack
7
+ import re
8
+ import transformers
9
+ from transformers import AutoTokenizer
10
+ from typing import Dict
11
+ from pathlib import Path
12
+ import traceback
13
+
14
+ from .whisper_trtllm import WhisperTRTLLM
15
+ from .fbank import FeatureExtractor
16
+
17
+ DEFAULT_SPEECH_TOKEN = "<speech>"
18
+ def preprocess(
19
+ messages,
20
+ tokenizer: transformers.PreTrainedTokenizer,
21
+ max_len: int = 128,
22
+ ) -> Dict:
23
+ """Preprocesses the data for supervised fine-tuning."""
24
+ texts = []
25
+ TEMPLATE = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if loop.last %}{{''}}{% else %}{{ '<|im_end|>\n' }}{% endif %}{% endfor %}"
26
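+ # the template drops the final <|im_end|> so the prompt ends right after the empty assistant turn, where generation starts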
+ for i, msg in enumerate(messages):
27
+ texts.append(
28
+ tokenizer.apply_chat_template(
29
+ msg,
30
+ tokenize=True,
31
+ add_generation_prompt=False,
32
+ chat_template=TEMPLATE,
33
+ padding="longest",
34
+ max_length=max_len,
35
+ truncation=True,
36
+ )
37
+ )
38
+ max_len_texts = max([len(text) for text in texts])
39
+ if tokenizer.padding_side == "right":
40
+ texts = [
41
+ text + [tokenizer.pad_token_id] * (max_len_texts - len(text))
42
+ for text in texts
43
+ ]
44
+ else:
45
+ texts = [
46
+ [tokenizer.pad_token_id] * (max_len_texts - len(text)) + text
47
+ for text in texts
48
+ ]
49
+
50
+ input_ids = torch.tensor(texts, dtype=torch.int)
51
+
52
+ attention_mask = input_ids.ne(tokenizer.pad_token_id)
53
+
54
+ return input_ids, attention_mask
55
+
56
+ class TritonPythonModel:
57
+ """Your Python model must use the same class name. Every Python model
58
+ that is created must have "TritonPythonModel" as the class name.
59
+ """
60
+
61
+ def initialize(self, args):
62
+ """`initialize` is called only once when the model is being loaded.
63
+ Implementing `initialize` function is optional. This function allows
64
+ the model to initialize any state associated with this model.
65
+
66
+ Parameters
67
+ ----------
68
+ args : dict
69
+ Both keys and values are strings. The dictionary keys and values are:
70
+ * model_config: A JSON string containing the model configuration
71
+ * model_instance_kind: A string containing model instance kind
72
+ * model_instance_device_id: A string containing model instance device ID
73
+ * model_repository: Model repository path
74
+ * model_version: Model version
75
+ * model_name: Model name
76
+ """
77
+ self.model_config = model_config = json.loads(args['model_config'])
78
+
79
+ # Get OUTPUT0 configuration
80
+ output0_config = pb_utils.get_output_config_by_name(
81
+ model_config, "TRANSCRIPTS")
82
+ # Convert Triton types to numpy types
83
+ self.out0_dtype = pb_utils.triton_string_to_numpy(
84
+ output0_config['data_type'])
85
+
86
+ #self.tokenizer = get_tokenizer(num_languages=100)
87
+ #self.blank = self.tokenizer.encode(" ", allowed_special=self.tokenizer.special_tokens_set)[0]
88
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")
89
+ tokenizer.padding_side = "left"
90
+ special_tokens_dict = {"additional_special_tokens": [DEFAULT_SPEECH_TOKEN]}
91
+ tokenizer.add_special_tokens(special_tokens_dict)
92
+ self.tokenizer = tokenizer
93
+ self.eos = self.tokenizer.eos_token_id
94
+ self.default_speech_token_id = tokenizer.convert_tokens_to_ids(
95
+ DEFAULT_SPEECH_TOKEN
96
+ )
97
+ self.vocab_size = 151936
98
+ # self.vocab_size = 500000
99
+ # self.vocab_size = 160000
100
+
101
+ self.device = torch.device("cuda")
102
+ self.decoupled = False
103
+ self.logger = pb_utils.Logger
104
+ self.init_model(self.model_config['parameters'])
105
+
106
+ def init_model(self, parameters):
107
+ for key,value in parameters.items():
108
+ parameters[key] = value["string_value"]
109
+ engine_dir = parameters["engine_dir"]
110
+ n_mels = int(parameters["n_mels"])
111
+ adapter_dir="/home/scratch.yuekaiz_wwfo_1/icefall_asr_multi-hans_whisper_qwen2_1.5B/epoch-2-avg-6.pt"
112
+ checkpoint = torch.load(
113
+ adapter_dir, map_location="cpu"
114
+ )
115
+ self.model = WhisperTRTLLM(engine_dir)
116
+ missing_keys, _ = self.model.load_state_dict(checkpoint, strict=False)
117
+ # print(f"Missing keys: {missing_keys}")
118
+ self.feature_extractor = FeatureExtractor(n_mels=n_mels)
119
+
120
+ def _tokenize(self, prompt=None, num_speech_tokens=187):
121
+ if prompt is None:
122
+ prompts = [
123
+ [
124
+ {"role": "user", "content": f"{DEFAULT_SPEECH_TOKEN}请转写音频为文字"},
125
+ {"role": "assistant", "content": ""},
126
+ ]
127
+ ]
128
+ # prompts = [
129
+ # [
130
+ # {"role": "user", "content": f"{DEFAULT_SPEECH_TOKEN}你好,你是谁?"},
131
+ # {"role": "assistant", "content": ""},
132
+ # ]
133
+ # ]
134
+
135
+ input_ids, _ = preprocess(prompts, self.tokenizer, max_len=128)
136
+ input_ids = input_ids.tolist()[0]
137
+ speech_token_index = input_ids.index(self.default_speech_token_id)
138
+ # replace 151646 with list(range(self.vocab_size, self.vocab_size + num_speech_tokens))
139
+ prompt_ids = input_ids[:speech_token_index] + list(range(self.vocab_size, self.vocab_size + num_speech_tokens)) + input_ids[speech_token_index + 1:]
140
+ # prompt_ids = input_ids[:speech_token_index] + input_ids[speech_token_index + 1:]
141
+ return prompt_ids
142
+
143
+ def _prepare_inputs(self, request, speech_embeddings, input_ids):
144
+ """
+ Prepares the inputs for the TensorRT-LLM model from the tokenized
+ prompt and the projected speech embeddings. The prompt token IDs,
+ their length, the requested output length, the greedy sampling setting
+ (top-k of 1), the end/pad IDs and the streaming flag are packed into
+ Triton tensors; the speech embeddings are passed to the LLM through
+ `prompt_embedding_table`, with `prompt_vocab_size` giving the number
+ of embedding rows.
+
+ Parameters
+ ----------
+ - request: The original request object containing additional parameters.
+ - speech_embeddings (torch.Tensor): Speech embeddings from the encoder projector.
+ - input_ids (list): Prompt token IDs with the speech placeholder expanded.
+
+ Returns
+ -------
+ - list: A list of `pb_utils.Tensor` objects to send to the TensorRT-LLM model.
+ """
168
+ input_ids = np.array(input_ids, dtype=np.int32)
169
+ max_tokens = 200
170
+ input_len = input_ids.shape[0]
171
+
172
+ assert speech_embeddings.shape[1] == 187, "Only support 187 speech tokens"
173
+ embedding_args = {
174
+ "prompt_vocab_size": np.array(
175
+ [[speech_embeddings.shape[1]]], dtype=np.int32
176
+ ),
177
+ "prompt_embedding_table": speech_embeddings.detach().cpu().numpy(),
178
+ }
179
+ # TODO: the result is the same with or without this??? and input_ids beyond the maximum vocab size do not raise an error???
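+ # (likely because TensorRT-LLM treats ids >= vocab_size as prompt-table indices rather than vocabulary entries)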
180
+ input_dict = {
181
+ "input_ids": np.expand_dims(input_ids, 0),
182
+ "input_lengths": np.array([[input_len]], dtype=np.int32),
183
+ "request_output_len": np.array([[max_tokens]], dtype=np.int32),
184
+ "runtime_top_k": np.array([[1]], dtype=np.int32),
185
+ "end_id": np.array([[self.tokenizer.eos_token_id]], dtype=np.int32),
186
+ "pad_id": np.array([[self.tokenizer.pad_token_id]], dtype=np.int32),
187
+ "streaming": np.array([[0]], dtype=np.bool_),
188
+ **embedding_args,
189
+ }
190
+
191
+ # print(input_ids)
192
+ # for key, value in input_dict.items():
193
+ # print(key, value.shape)
194
+
195
+ input_tensor_list = [pb_utils.Tensor(k, v) for k, v in input_dict.items()]
196
+ return input_tensor_list
197
+
198
+ def _prepare_llm_response(self, llm_request_inputs):
199
+ """
200
+ Prepares the response from the language model based on the provided
201
+ inputs. Creates a `pb_utils.InferenceRequest` object with passed
202
+ `llm_request_inputs` to send to a decoupled TensorRTLLM model.
203
+ For each response from the language model:
204
+ - Checks for errors and raises an exception if any are found.
205
+ - Extracts the "output_ids" tensor from the response.
206
+ - Determines the finish reason based on the presence of the
207
+ end-of-sequence token or reaching the maximum length.
208
+ - Appends the generated token IDs to `output_ids`.
209
+ - If the finish reason is determined, decodes the output IDs to text
210
+ and prepares the final response.
211
+
212
+ The final response contains the decoded transcript returned in the
213
+ "TRANSCRIPTS" output tensor.
214
+
215
+ Parameters
216
+ ----------
217
+ - llm_request_inputs (dict): A dictionary containing the inputs for the language model.
218
+
219
+ Returns
220
+ -------
221
+ - pb_utils.InferenceResponse: The response object containing the generated text and additional metadata.
222
+ """
223
+
224
+ llm_request = pb_utils.InferenceRequest(
225
+ model_name="tensorrt_llm",
226
+ requested_output_names=["output_ids", "sequence_length"],
227
+ inputs=llm_request_inputs,
228
+ )
229
+ output_ids, output_len = [], 0
230
+ responses = llm_request.exec(decoupled=False)
231
+ responses = [responses]
232
+ for llm_response in responses:
233
+ if llm_response.has_error():
234
+ raise pb_utils.TritonModelException(llm_response.error().message())
235
+ stream_output_ids = (
236
+ pb_utils.get_output_tensor_by_name(llm_response, "output_ids")
237
+ .as_numpy()
238
+ .flatten()
239
+ .tolist()
240
+ )
241
+ finish_reason = "test"
242
+ if len(stream_output_ids) == 0 or (
243
+ len(stream_output_ids) != 0
244
+ and stream_output_ids[-1] == self.eos
245
+ ):
246
+ finish_reason = "stop"
247
+
248
+ output_ids += stream_output_ids
249
+
250
+ last_response = finish_reason != ""
251
+ output_len = len(output_ids)
252
+ if last_response:
253
+ print("final_output_ids", output_ids)
254
+ output_text = self.tokenizer.decode(output_ids).strip()
255
+ # print(output_text)
256
+ # output_text = re.sub(r'<\|.*?\|>', '', output_text)
257
+ response = pb_utils.InferenceResponse(
258
+ output_tensors=[
259
+ pb_utils.Tensor("TRANSCRIPTS", np.array([output_text], np.object_)),
260
+ ]
261
+ )
262
+ yield response
263
+
264
+ def _extract_speech_embeddings(self, mel):
265
+ return self.model.process_batch(mel)
266
+
267
+
268
+ def execute(self, requests):
269
+
270
+ responses = []
271
+
272
+ for request in requests:
273
+ wav = pb_utils.get_input_tensor_by_name(request, "WAV").as_numpy()
274
+ assert wav.shape[0] == 1, "Only support batch size 1"
275
+ # To support batch > 1
276
+ # cat mel,text_prompt, also, need to increase decoder_input_len as a triton input
277
+ wav = torch.from_numpy(wav[0]).to(self.device)
278
+ # mel shape [1, 80, 3000] for remove_input_padding=False
279
+ mel = self.feature_extractor.compute_feature(wav)
280
+ # print("==========================================================")
281
+ # messages = [
282
+ # [
283
+ # {"role": "user", "content": f"{DEFAULT_SPEECH_TOKEN}请转写音频为文字"},
284
+ # {"role": "assistant", "content": ""},
285
+ # ]
286
+ # ] * len(mel)
287
+
288
+ # input_ids, attention_mask = preprocess(messages, self.tokenizer, max_len=128)
289
+
290
+ # generated_ids = self.model.decode(
291
+ # mel, input_ids.to(self.device, dtype=torch.long), attention_mask.to(self.device)
292
+ # )
293
+ # print("pytorch model", generated_ids)
294
+ # print("--------------------------------------------------------------------------")
295
+
296
+
297
+ speech_embeddings = self._extract_speech_embeddings(mel)
298
+ input_ids = self._tokenize()
299
+
300
+
301
+ if self.decoupled:
302
+ response_sender = request.get_response_sender()
303
+ try:
304
+
305
+ llm_request_inputs = self._prepare_inputs(
306
+ request, speech_embeddings, input_ids
307
+ )
308
+ if isinstance(llm_request_inputs, pb_utils.TritonError):
309
+ error = pb_utils.InferenceResponse(error=llm_request_inputs)
310
+ if self.decoupled:
311
+ response_sender.send(
312
+ error, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
313
+ )
314
+ else:
315
+ responses.append(error)
316
+ llm_responses = self._prepare_llm_response(llm_request_inputs)
317
+
318
+ for triton_response in llm_responses:
319
+ if self.decoupled:
320
+ response_sender.send(triton_response)
321
+ else:
322
+ responses.append(triton_response)
323
+
324
+ if self.decoupled:
325
+ response_sender.send(
326
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
327
+
328
+ except Exception:
329
+ self.logger.log_error(traceback.format_exc())
330
+ # If encountering an error, send a response with err msg
331
+ error_response = pb_utils.InferenceResponse(
332
+ output_tensors=[],
333
+ error=pb_utils.TritonError(traceback.format_exc()))
334
+
335
+ if self.decoupled:
336
+ response_sender.send(error_response)
337
+ response_sender.send(
338
+ flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
339
+ else:
340
+ responses.append(error_response)
341
+
342
+ if self.decoupled:
343
+ return None
344
+ else:
345
+ assert len(responses) == len(requests)
346
+ return responses
model_repo_whisper_qwen_trtllm/whisper/2/whisper_trtllm.py ADDED
@@ -0,0 +1,278 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ import json
16
+ from collections import OrderedDict
17
+ from pathlib import Path
18
+
19
+ import numpy as np
20
+ import torch
21
+ import torch.nn as nn
22
+ import tensorrt_llm
23
+ import tensorrt_llm.logger as logger
24
+ from tensorrt_llm._utils import (str_dtype_to_torch, str_dtype_to_trt,
25
+ trt_dtype_to_torch)
26
+ from tensorrt_llm.runtime import ModelConfig, SamplingConfig
27
+ from tensorrt_llm.runtime.session import Session, TensorInfo
28
+ from transformers.trainer_pt_utils import LabelSmoother
29
+ from transformers import AutoModelForCausalLM, AutoTokenizer
30
+ IGNORE_TOKEN_ID = LabelSmoother.ignore_index
31
+
32
+ DEFAULT_SPEECH_TOKEN = "<speech>"
33
+ def remove_tensor_padding(input_tensor, input_tensor_lengths=None, pad_value=0):
34
+ if input_tensor.dim() == 2:
35
+ # Text tensor case: batch, seq_len
36
+ assert torch.all(
37
+ input_tensor[:, 0] != pad_value
38
+ ), "First token in each sequence should not be pad_value"
39
+ assert input_tensor_lengths is None
40
+
41
+ # Create a mask for all non-pad tokens
42
+ mask = input_tensor != pad_value
43
+
44
+ # Apply the mask to input_tensor to remove pad tokens
45
+ output_tensor = input_tensor[mask].view(1, -1)
46
+
47
+ elif input_tensor.dim() == 3:
48
+ # Audio tensor case: batch, seq_len, feature_len
49
+ assert input_tensor_lengths is not None, "input_tensor_lengths must be provided for 3D input_tensor"
50
+ batch_size, seq_len, feature_len = input_tensor.shape
51
+
52
+ # Initialize a list to collect valid sequences
53
+ valid_sequences = []
54
+
55
+ for i in range(batch_size):
56
+ valid_length = input_tensor_lengths[i]
57
+ valid_sequences.append(input_tensor[i, :valid_length, :])
58
+
59
+ # Concatenate all valid sequences along the batch dimension
60
+ output_tensor = torch.cat(valid_sequences, dim=0)
61
+
62
+ else:
63
+ raise ValueError("Input tensor must have 2 or 3 dimensions")
64
+
65
+ return output_tensor
66
+
67
+ def read_config(component, engine_dir):
68
+ config_path = engine_dir / component / 'config.json'
69
+ with open(config_path, 'r') as f:
70
+ config = json.load(f)
71
+ model_config = OrderedDict()
72
+ model_config.update(config['pretrained_config'])
73
+ model_config.update(config['build_config'])
74
+ return model_config
75
+
76
+ class WhisperEncoding:
77
+ def __init__(self, engine_dir):
78
+ self.session = self.get_session(engine_dir)
79
+ config = read_config('encoder', engine_dir)
80
+ self.n_mels = config['n_mels']
81
+ self.dtype = config['dtype']
82
+ self.num_languages = config['num_languages']
83
+ self.encoder_config = config
84
+
85
+ def get_session(self, engine_dir):
86
+ serialize_path = engine_dir / 'encoder' / 'rank0.engine'
87
+ with open(serialize_path, 'rb') as f:
88
+ session = Session.from_serialized_engine(f.read())
89
+ return session
90
+
91
+ def get_audio_features(self,
92
+ mel):
93
+ mel_input_lengths = torch.tensor(
94
+ [mel.shape[2] for _ in range(mel.shape[0])],
95
+ dtype=torch.int32,
96
+ device=mel.device)
97
+ if self.encoder_config['plugin_config']['remove_input_padding']:
98
+ # mel B,D,T -> B,T,D -> BxT, D
99
+ mel = mel.transpose(1, 2)
100
+ mel = remove_tensor_padding(mel, mel_input_lengths)
101
+
102
+ inputs = OrderedDict()
103
+ inputs['input_features'] = mel
104
+ inputs['input_lengths'] = mel_input_lengths
105
+
106
+ output_list = [
107
+ TensorInfo('input_features', str_dtype_to_trt(self.dtype),
108
+ mel.shape),
109
+ TensorInfo('input_lengths', str_dtype_to_trt('int32'),
110
+ mel_input_lengths.shape)
111
+ ]
112
+
113
+ output_info = (self.session).infer_shapes(output_list)
114
+
115
+ logger.debug(f'output info {output_info}')
116
+ outputs = {
117
+ t.name: torch.empty(tuple(t.shape),
118
+ dtype=trt_dtype_to_torch(t.dtype),
119
+ device='cuda')
120
+ for t in output_info
121
+ }
122
+ stream = torch.cuda.current_stream()
123
+ ok = self.session.run(inputs=inputs,
124
+ outputs=outputs,
125
+ stream=stream.cuda_stream)
126
+ assert ok, 'Engine execution failed'
127
+ stream.synchronize()
128
+ encoder_output = outputs['encoder_output']
129
+ encoder_output_lengths = mel_input_lengths // 2
130
+
131
+ return encoder_output
132
+
133
+ class EncoderProjector(torch.nn.Module):
134
+ """
135
+ The encoder projector module. It is used to project the encoder outputs to the same dimension as the language model.
136
+ Modified from https://github.com/X-LANCE/SLAM-LLM/blob/main/src/slam_llm/models/projector.py.
137
+ Args:
138
+ encoder_dim (:obj:`int`): The dimension of the encoder outputs.
139
+ llm_dim (:obj:`int`): The dimension of the language model.
140
+ downsample_rate (:obj:`int`, `optional`, defaults to 5): The downsample rate to use.
141
+ """
142
+
143
+ def __init__(self, encoder_dim=1280, llm_dim=1536, downsample_rate=8):
144
+ super().__init__()
145
+ self.downsample_rate = downsample_rate
146
+ self.linear1 = nn.Linear(encoder_dim * self.downsample_rate, llm_dim)
147
+ self.relu = nn.ReLU()
148
+ self.linear2 = nn.Linear(llm_dim, llm_dim)
149
+
150
+ def forward(self, x):
151
+
152
+ batch_size, seq_len, feat_dim = x.size()
153
+ num_frames_to_discard = seq_len % self.downsample_rate
154
+ if num_frames_to_discard > 0:
155
+ x = x[:, :-num_frames_to_discard, :]
156
+ seq_len = x.size(1)
157
+
158
+ x = x.contiguous()
159
+ x = x.view(
160
+ batch_size, seq_len // self.downsample_rate, feat_dim * self.downsample_rate
161
+ )
162
+
163
+ x = self.linear1(x)
164
+ x = self.relu(x)
165
+ x = self.linear2(x)
166
+ return x
167
+
168
+ class SPEECH_LLM(nn.Module):
169
+ """
170
+ The Speech-to-Text model. It consists of an encoder, a language model and an encoder projector.
171
+ The encoder is used to extract speech features from the input speech signal.
172
+ The encoder projector is used to project the encoder outputs to the same dimension as the language model.
173
+ The language model is used to generate the text from the speech features.
174
+ Args:
175
+ encoder (:obj:`nn.Module`): The encoder module.
176
+ llm (:obj:`nn.Module`): The language model module.
177
+ encoder_projector (:obj:`nn.Module`): The encoder projector module.
178
+ """
179
+
180
+ def __init__(
181
+ self,
182
+ encoder: nn.Module,
183
+ llm: nn.Module,
184
+ encoder_projector: nn.Module,
185
+ ):
186
+ super().__init__()
187
+ self.encoder = encoder
188
+ self.llm = llm
189
+ self.encoder_projector = encoder_projector
190
+
191
+ class WhisperTRTLLM(nn.Module):
192
+
193
+ def __init__(self, engine_dir):
194
+ super().__init__()
195
+ world_size = 1
196
+ runtime_rank = tensorrt_llm.mpi_rank()
197
+ runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank)
198
+ torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node)
199
+ engine_dir = Path(engine_dir)
200
+
201
+ self.encoder = WhisperEncoding(engine_dir)
202
+ self.encoder_projector = EncoderProjector()
203
+ self.encoder_projector = self.encoder_projector.half().to("cuda")
204
+
205
+ # llm = AutoModelForCausalLM.from_pretrained(
206
+ # "/home/scratch.yuekaiz_wwfo_1/Qwen2_1.5B_merged",
207
+ # attn_implementation="flash_attention_2",
208
+ # torch_dtype=torch.float16,
209
+ # )
210
+ # tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")
211
+ # tokenizer.padding_side = "left"
212
+ # special_tokens_dict = {"additional_special_tokens": [DEFAULT_SPEECH_TOKEN]}
213
+ # tokenizer.add_special_tokens(special_tokens_dict)
214
+ # llm.config.pad_token_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
215
+ # llm.config.bos_token_id = tokenizer.convert_tokens_to_ids("<|im_start|>")
216
+ # llm.config.eos_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
217
+
218
+ # llm.config.default_speech_token_id = tokenizer.convert_tokens_to_ids(
219
+ # DEFAULT_SPEECH_TOKEN
220
+ # )
221
+ # self.llm = llm.half().to("cuda")
222
+ # # print llm embedding layer shape
223
+ # print("llm embedding layer shape", self.llm.get_input_embeddings().weight.shape)
224
+
225
+
226
+
227
+ def process_batch(
228
+ self,
229
+ mel,
230
+ decoder_input_ids=None,
231
+ eot_id=50257,
232
+ max_new_tokens=96,
233
+ num_beams=1):
234
+ encoder_outputs = self.encoder.get_audio_features(mel)
235
+ speech_features = self.encoder_projector(encoder_outputs)
236
+ speech_features = speech_features.to(torch.float16)
237
+ # [1,187,1536]
238
+ return speech_features
239
+
240
+
241
+ # def decode(
242
+ # self,
243
+ # fbank: torch.Tensor = None,
244
+ # input_ids: torch.LongTensor = None,
245
+ # attention_mask: torch.Tensor = None,
246
+ # **kwargs,
247
+ # ):
248
+
249
+ # encoder_outs = self.encoder.get_audio_features(fbank)
250
+ # speech_features = self.encoder_projector(encoder_outs)
251
+ # speech_features = speech_features.to(torch.float16)
252
+ # inputs_embeds = self.llm.get_input_embeddings()(input_ids)
253
+ # speech_token_index = input_ids.tolist()[0].index(151646)
254
+ # print("speech_token_index", speech_token_index, "speech_features_shape", speech_features.shape, "input_ids_shape", input_ids.shape, "inputs_embeds_shape", inputs_embeds.shape)
255
+
256
+ # new_length = inputs_embeds.shape[1] + speech_features.shape[1] - 1
257
+ # new_inputs_embeds = torch.zeros(1, new_length, 1536).to(inputs_embeds.device).half()
258
+ # new_inputs_embeds[:, :3, :] = inputs_embeds[:, :3, :]
259
+ # new_inputs_embeds[:, 3:3 + 187, :] = speech_features
260
+ # new_inputs_embeds[:, 3 + 187:, :] = inputs_embeds[:, 4:, :]
261
+
262
+ # inputs_embeds = new_inputs_embeds
263
+ # generated_ids = self.llm.generate(
264
+ # inputs_embeds=inputs_embeds,
265
+ # max_new_tokens=kwargs.get("max_new_tokens", 200),
266
+ # num_beams=kwargs.get("num_beams", 1),
267
+ # do_sample=kwargs.get("do_sample", False),
268
+ # min_length=kwargs.get("min_length", 1),
269
+ # top_p=kwargs.get("top_p", 1.0),
270
+ # repetition_penalty=kwargs.get("repetition_penalty", 1.0),
271
+ # length_penalty=kwargs.get("length_penalty", 1.0),
272
+ # temperature=kwargs.get("temperature", 1.0),
273
+ # bos_token_id=self.llm.config.bos_token_id,
274
+ # eos_token_id=self.llm.config.eos_token_id,
275
+ # pad_token_id=self.llm.config.pad_token_id,
276
+ # )
277
+
278
+ # return generated_ids
model_repo_whisper_qwen_trtllm/whisper/config.pbtxt ADDED
@@ -0,0 +1,61 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "whisper"
16
+ backend: "python"
17
+ max_batch_size: 8
18
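+ # note: whisper/*/model.py currently asserts a per-request WAV batch size of 1, so dynamic batching only groups separate requests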
+
19
+ parameters [
20
+ {
21
+ key: "n_mels",
22
+ value: {string_value:"80"} # 128 dim for large-v3, 80 dim for large-v2
23
+ },
24
+ {
25
+ key: "engine_dir"
26
+ value: { string_value: "/home/scratch.yuekaiz_wwfo_1/tekit/examples/whisper/whisper_multi_zh"}
27
+ }
28
+ ]
29
+
30
+
31
+ input [
32
+ {
33
+ name: "TEXT_PREFIX"
34
+ data_type: TYPE_STRING
35
+ dims: [1]
36
+ },
37
+ {
38
+ name: "WAV"
39
+ data_type: TYPE_FP32
40
+ dims: [-1]
41
+ }
42
+ ]
43
+
44
+ output [
45
+ {
46
+ name: "TRANSCRIPTS"
47
+ data_type: TYPE_STRING
48
+ dims: [1]
49
+ }
50
+ ]
51
+
52
+ dynamic_batching {
53
+ preferred_batch_size: [ 4, 8]
54
+ max_queue_delay_microseconds: 1000
55
+ }
56
+ instance_group [
57
+ {
58
+ count: 1
59
+ kind: KIND_CPU
60
+ }
61
+ ]