|
import os |
|
|
|
from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASR_TIMESTAMPS |
|
from nemo.collections.asr.parts.utils.diarization_utils import ASR_DIAR_OFFLINE |
|
from omegaconf import OmegaConf |
|
from pyannote.audio import Pipeline |
|
|
|
# All paths used by this module are anchored at the directory the process
# was launched from.
ROOT = os.getcwd()

# YAML file with the offline diarization + ASR settings; it is loaded with
# OmegaConf inside diarization(). The path is relative, so it is resolved
# against the current working directory at load time.
MODEL_CONFIG = "info/configs/offline_diarization_asr.yaml"

# data_dir: parent folder of the manifests written by diarization().
# output_dir: handed to the diarizer as its ``out_dir``.
data_dir = os.path.join(ROOT, 'info/configs/')
output_dir = os.path.join(ROOT, 'info/transcripts/')

# Make sure both folders exist as soon as the module is imported;
# exist_ok=True makes repeated imports/runs harmless.
os.makedirs(data_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)
|
|
|
|
|
def diarization(file_path, num_speakers=2):
    """Transcribe an audio file and attribute the words to speakers.

    The function performs three stages:

    1. Writes a single-entry input manifest describing ``file_path``.
    2. Runs the pretrained ``pyannote/voice-activity-detection`` pipeline on
       the file and stores every detected speech segment as one JSON line in
       an external VAD manifest.
    3. Loads ``MODEL_CONFIG``, points it at the two manifests, runs ASR with
       word timestamps (``ASR_TIMESTAMPS``) followed by offline diarization
       (``ASR_DIAR_OFFLINE``) and merges both results.

    Both manifests are written below ``data_dir/manifests/`` and are
    overwritten on every call.

    Args:
        file_path: Path of the audio file to process.
        num_speakers: Value stored in the ``num_speakers`` field of the input
            manifest. Defaults to 2, the value that used to be hard-coded.

    Returns:
        The object returned by
        ``ASR_DIAR_OFFLINE.get_transcript_with_speaker_labels`` for the
        diarization hypothesis, word hypothesis and word timestamps.
    """
    import json

    # The manifests folder is not created at module import time, so create
    # it here; otherwise the first open(..., 'w') below fails on a clean tree.
    manifest_dir = os.path.join(data_dir, 'manifests/')
    os.makedirs(manifest_dir, exist_ok=True)
    input_manifest_path = os.path.join(manifest_dir, 'input_manifest.json')
    vad_manifest_path = os.path.join(manifest_dir, 'external_vad_manifest.json')

    # --- Stage 1: input manifest (one JSON object on one line) -------------
    meta = {
        'audio_filepath': file_path,
        'offset': 0,
        'duration': None,
        'label': 'infer',
        'text': '-',
        'num_speakers': num_speakers,
        'rttm_filepath': None,
        'uem_filepath': None,
    }
    with open(input_manifest_path, 'w') as fp:
        json.dump(meta, fp)
        fp.write('\n')

    # --- Stage 2: external VAD manifest from pyannote ----------------------
    pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection")
    vad_output = pipeline(file_path).for_json()
    uniq_id = vad_output["uri"]
    vad_records = [
        {
            "audio_filepath": file_path,
            "offset": entry["segment"]["start"],
            "duration": entry["segment"]["end"] - entry["segment"]["start"],
            "label": "SPEECH",
            "uniq_id": uniq_id,
        }
        for entry in vad_output["content"]
    ]
    with open(vad_manifest_path, 'w') as fp:
        # json.dumps (instead of str(dict) with quote replacement) keeps each
        # line valid JSON even for None values or paths containing quotes.
        fp.writelines(json.dumps(record) + '\n' for record in vad_records)

    # --- Stage 3: ASR with word timestamps, then diarization ---------------
    config2 = OmegaConf.load(MODEL_CONFIG)
    config2.diarizer.asr.model_path = 'QuartzNet15x5Base-En'
    config2.diarizer.manifest_filepath = input_manifest_path
    config2.diarizer.speaker_embeddings.model_path = 'titanet_large'
    config2.diarizer.vad.external_vad_manifest = vad_manifest_path
    config2.diarizer.out_dir = output_dir
    config2.num_workers = 0

    asr_ts_decoder = ASR_TIMESTAMPS(**config2.diarizer)
    asr_model = asr_ts_decoder.set_asr_model()
    word_hyp, word_ts_hyp = asr_ts_decoder.run_ASR(asr_model)
    print(word_hyp)
    print(word_ts_hyp)

    asr_diar_offline = ASR_DIAR_OFFLINE(**config2.diarizer)
    # Reuse the anchor offset chosen by the ASR decoder so that word
    # timestamps and diarization segments are aligned the same way.
    asr_diar_offline.word_ts_anchor_offset = asr_ts_decoder.word_ts_anchor_offset
    diar_hyp, diar_score = asr_diar_offline.run_diarization(config2, word_ts_hyp)
    print("Diarization hypothesis output: \n", diar_hyp)

    result = asr_diar_offline.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp)

    # Path of the generated transcript, printed for convenience only.
    # The extension is appended to the file stem rather than joined as a
    # separate path component, and the path is rooted at output_dir because
    # that is the out_dir given to the diarizer above.
    # NOTE(review): assumes NeMo writes <out_dir>/pred_rttms/<stem>.txt - confirm
    # against the installed NeMo version.
    audio_stem = os.path.splitext(os.path.basename(file_path))[0]
    file_to_show = os.path.join(output_dir, 'pred_rttms', audio_stem + '.txt')
    print(file_to_show)
    print(diar_hyp)
    return result
|
|