|
|
|
|
|
|
|
|
|
|
|
|
|
# Notebook-only step: uses IPython's shell escape to run `pip install webrtcvad`.
get_ipython().system('pip install webrtcvad')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from transformers import pipeline
|
|
# ASR pipeline built from the "cdactvm/w2v-bert-2.0-hindi_new" model id; it is called
# directly on a recording further down in this file.
hindi_pipe = pipeline("automatic-speech-recognition", model="cdactvm/w2v-bert-2.0-hindi_new")
|
|
# ASR pipeline built from the "openai/whisper-large-v3" model id.
whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
|
|
# ASR pipeline built from a model stored under a hard-coded local directory.
eng_pipe = pipeline(task="automatic-speech-recognition", model="C:/Users/WCHL/Desktop/huggingface_english/hf_eng")
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
import re
|
|
import librosa
|
|
import nbimporter
|
|
import torchaudio
|
|
import numpy as np
|
|
import scipy.signal
|
|
import webrtcvad
|
|
import soundfile as sf
|
|
import warnings
|
|
warnings.filterwarnings("ignore")
|
|
|
|
from transformers import pipeline
|
|
from text2int import text_to_int
|
|
from isNumber import is_number
|
|
from Text2List import text_to_list
|
|
from convert2list import convert_to_list
|
|
from processDoubles import process_doubles
|
|
from replaceWords import replace_words
|
|
from applyVad import apply_vad
|
|
from wienerFilter import wiener_filter
|
|
from highPassFilter import high_pass_filter
|
|
|
|
def noise_reduction_pipeline(filepath, output_filepath="processed_output.wav"):
    """Clean up a recording and write the result to disk.

    The audio is loaded with ``librosa``, then passed through
    ``high_pass_filter`` (cutoff 100, order 5), ``wiener_filter`` and
    ``apply_vad`` in that order, and finally written with ``soundfile``.

    Args:
        filepath: Path of the audio file to process.
        output_filepath: Where to write the processed audio. Defaults to
            ``"processed_output.wav"``, the location that used to be
            hard-coded, so existing callers are unaffected.

    Returns:
        The path the processed audio was written to.
    """
    # sr=None asks librosa for the file's own sample rate rather than a fixed one.
    audio, sr = librosa.load(filepath, sr=None)
    print(sr)

    audio_hp = high_pass_filter(audio, sr, cutoff=100, order=5)
    audio_wiener = wiener_filter(audio_hp)
    audio_vad = apply_vad(audio_wiener, sr)

    sf.write(output_filepath, audio_vad, sr)
    return output_filepath
|
|
|
|
|
|
def transcribe_with_huggingface(filepath, model_name="cdactvm/w2v-bert-2.0-hindi_new"):
    """Transcribe an audio file and post-process the recognised text.

    Args:
        filepath: Path of the audio file handed to the ASR pipeline.
        model_name: Model id or local path given to ``transformers.pipeline``.
            Defaults to the model that was previously hard-coded.

    Returns:
        The value returned by ``text_to_int`` after the post-processing
        chain below; it is also printed.
    """
    asr_pipeline = pipeline("automatic-speech-recognition", model=model_name)
    result = asr_pipeline(filepath)

    # Drop the literal "<s>" markers the model leaves in its output text.
    cleaned_text = result['text'].replace("<s>", "")

    # Project-local helpers; presumably they normalise spoken numbers into
    # digits -- confirm against the helper modules.
    converted_to_list = convert_to_list(cleaned_text, text_to_list())
    processed_doubles = process_doubles(converted_to_list)
    replaced_words = replace_words(processed_doubles)
    converted_text = text_to_int(replaced_words)

    print("Transcription: ", converted_text)
    return converted_text
|
|
|
|
if __name__ == "__main__":
    # Recording to process, given as a hard-coded absolute path.
    input_filepath = 'C:/Users/WCHL/Desktop/hp_sounds/101003/crm/hi/1728685442307.wav'

    # Stage 1: denoise the recording; the call returns the processed file's path.
    processed_filepath = noise_reduction_pipeline(input_filepath)

    # Stage 2: run speech recognition on the processed file and keep the result.
    transcription = transcribe_with_huggingface(processed_filepath)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Ad-hoc run: feed one recording (hard-coded path) straight to the module-level
# hindi_pipe, without the noise-reduction stage.
result = hindi_pipe("C:/Users/WCHL/Desktop/hp_sounds/101003/crm/hi/1728685502007.wav")
|
|
|
|
|
|
# Recognised text from the pipeline result.
text_value=result['text']
|
|
# Remove the literal "<s>" markers from the recognised text.
cleaned_text=text_value.replace("<s>", "")
|
|
# The next four calls are the same post-processing chain that
# transcribe_with_huggingface applies; the helpers are project-local
# (presumably spoken-number normalisation -- confirm against the helper modules).
converted_to_list=convert_to_list(cleaned_text,text_to_list())
|
|
processd_doubles=process_doubles(converted_to_list)
|
|
replaced_words = replace_words(processd_doubles)
|
|
converted_text=text_to_int(replaced_words)
|
|
|
|
# Show the final post-processed transcription.
print("Transcription: ", converted_text)
|
|
|
|
नमस्का जी 1 मन 2 पुलिस हेलप्लेन से बात कर रहे बताइए आपकी ाएमर्जेंसी है
|
|
नमिश्का जी 1 मन 2 पुलिस हेलप्लेन से बात कर रह बताइए आपकी क्या एमर्जेंसी है
|
|
नमस्का जी 1 मन 2 पुलिस हेलप्लेन से बात कर रह बताइए आपके क्या एमर्जेंसी हैवेल्कम 2 एमर्जनसी
|
|
वेल्कम 2 एमर्जनसी
|
|
वेलकम 2 एमर्जेंसी
|
|
और 9 र मलीख वेल्कम 2 एमर्जंसीनमस्कार जी 1 ्स 2 बारा पुलस हल्प्लाइन में आपका स्वागत ह बताइए आपकी के एमर्जेंसी है
|
|
नमस्कार जी 1 ्स दौबारा पुलिस हेल्प्लाइ में आपका स्वागत है बताइए आपकी के एमर्जेंसी है
|
|
नमस्कार जी 1 2 बारा पुलिस हल्प्लाइन में आपका स्वागत है बताइए आपकी क् एमर्जेंसी हैमस्कार जी 1 ्स 2 12 पुलस हल्प्लाइन में आपका स्वागत ह बताइए आपकी के एमर्जेंसी है
|
|
नमस्कार जी 1 ्स दौबारा पुलिस हेल्प्लाइ में आपका स्वागत है बताइए आपकी के एमर्जेंसी है
|
|
नमस्कार जी 1 2 12 पुलिस हल्प्लाइन में आपका स्वागत है बताइए आपकी क् एमर्जेंसी हैनमस्कार जी इक्सुबारा में आपका स्वागत हैइनम
|
|
नमस्कार जी इक्सुबारा में आपका स्वागत है कि इनमें
|
|
नमस्कार जी 1 ्सुबारा में आपका स्वागत हैइन
|
|
|
|
|
|
|
|
import os
|
|
import numpy as np
|
|
import scipy.signal
|
|
import webrtcvad
|
|
import soundfile as sf
|
|
import librosa
|
|
import logging
|
|
from transformers import pipeline
|
|
from text2int import text_to_int
|
|
from isNumber import is_number
|
|
from Text2List import text_to_list
|
|
from convert2list import convert_to_list
|
|
from processDoubles import process_doubles
|
|
from replaceWords import replace_words
|
|
|
|
|
|
# Configure the root logger: INFO and above, formatted as "<time> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
|
|
|
|
def high_pass_filter(audio, sr, cutoff=100, order=5):
    """Apply a Butterworth high-pass filter to ``audio``.

    Args:
        audio: Samples to filter.
        sr: Sampling rate, passed to the filter design as ``fs``.
        cutoff: Cutoff frequency, in the same units as ``sr``.
        order: Order of the Butterworth design.

    Returns:
        The filtered samples. If designing or applying the filter raises,
        the error is logged and ``audio`` is returned unchanged.
    """
    try:
        # Second-order sections are requested from the design and applied as such.
        sections = scipy.signal.butter(
            order, cutoff, btype='highpass', fs=sr, output='sos'
        )
        return scipy.signal.sosfilt(sections, audio)
    except Exception as exc:
        logging.error(f"High-pass filter failed: {exc}")
    # Best-effort fallback: hand the caller back the untouched input.
    return audio
|
|
|
|
def wiener_filter(audio):
    """Apply ``scipy.signal.wiener`` to ``audio`` and guard against bad output.

    The SciPy filter divides by a local variance estimate. When that estimate
    and the estimated noise power are both zero (e.g. digital silence), the
    result can contain NaN/inf samples, which would corrupt every later
    stage. Such samples are replaced by the corresponding input samples.

    Args:
        audio: Samples to filter.

    Returns:
        The filtered samples with any non-finite values replaced by the
        original ones. If the filter raises, the error is logged and
        ``audio`` is returned unchanged.
    """
    try:
        # The 0/0 case is handled below, so silence NumPy's warnings for it here.
        with np.errstate(divide='ignore', invalid='ignore'):
            filtered = scipy.signal.wiener(audio)
    except Exception as e:
        logging.error(f"Wiener filter failed: {e}")
        return audio

    finite = np.isfinite(filtered)
    if finite.all():
        return filtered

    logging.warning("Wiener filter produced non-finite samples; keeping the input values there.")
    return np.where(finite, filtered, audio)
|
|
|
|
def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
    """Keep only the frames that webrtcvad classifies as speech.

    Args:
        audio: Float samples, nominally in [-1, 1].
        sr: Sampling rate in Hz. webrtcvad documents support for 8000,
            16000, 32000 and 48000 Hz only; any other rate raises inside
            the ``try`` and triggers the fallback below.
        frame_duration: Frame length in milliseconds (webrtcvad documents
            10, 20 or 30).
        aggressiveness: Mode passed to ``webrtcvad.Vad``.

    Returns:
        The voiced frames concatenated as float32. ``audio`` is returned
        unchanged when no frame is classified as speech or when VAD
        processing raises (the error is logged).
    """
    try:
        vad = webrtcvad.Vad(aggressiveness)

        # Clip before scaling so samples outside [-1, 1] (possible after
        # filtering) saturate instead of wrapping around in int16.
        audio_int16 = np.int16(np.clip(audio, -1.0, 1.0) * 32767)
        frame_size = int(sr * frame_duration / 1000)

        # Only whole frames are evaluated. The trailing partial frame used to
        # be passed to webrtcvad, which rejects frames of the wrong length, so
        # the whole function fell back to the unprocessed audio.
        last_start = len(audio_int16) - frame_size
        frames = [
            audio_int16[start:start + frame_size]
            for start in range(0, last_start + 1, frame_size)
        ]
        voiced_frames = [
            frame for frame in frames
            if vad.is_speech(frame.tobytes(), sample_rate=sr)
        ]

        if not voiced_frames:
            # Same outcome as before (input returned), but handled explicitly
            # rather than through a concatenate() exception.
            logging.warning("VAD found no voiced frames; returning audio unchanged.")
            return audio

        voiced_audio = np.concatenate(voiced_frames)
        return np.float32(voiced_audio) / 32767
    except Exception as e:
        logging.error(f"VAD processing failed: {e}")
        return audio
|
|
|
|
def load_audio(filepath):
    """Load an audio file with librosa.

    Args:
        filepath: Path of the file to read.

    Returns:
        ``(samples, sample_rate)`` on success. If loading raises, the error
        is logged and ``(None, None)`` is returned.
    """
    try:
        # sr=None is forwarded to librosa.load.
        samples, rate = librosa.load(filepath, sr=None)
    except Exception as exc:
        logging.error(f"Failed to load audio: {exc}")
        return None, None
    else:
        return samples, rate
|
|
|
|
def save_audio(filepath, audio, sr):
    """Write ``audio`` to ``filepath`` with soundfile.

    Args:
        filepath: Destination path.
        audio: Samples to write.
        sr: Sampling rate to store with the file.

    Returns:
        True if the file was written, False if writing raised (the error is
        logged). Earlier versions returned None in both cases, so callers
        that ignore the result are unaffected.
    """
    try:
        sf.write(filepath, audio, sr)
    except Exception as e:
        logging.error(f"Failed to save audio: {e}")
        return False
    logging.info(f"Audio saved at {filepath}")
    return True


def noise_reduction_pipeline(filepath, output_filepath="processed_output.wav"):
    """Load, denoise and save a recording.

    Stages, in order: ``load_audio``, ``high_pass_filter``,
    ``wiener_filter``, ``apply_vad``, ``save_audio``.

    Args:
        filepath: Path of the audio file to process.
        output_filepath: Where to write the processed audio. Defaults to
            ``"processed_output.wav"``, the location that used to be
            hard-coded.

    Returns:
        ``output_filepath`` on success, or None if the input could not be
        loaded or the output could not be written.
    """
    audio, sr = load_audio(filepath)
    if audio is None:
        return None

    audio_hp = high_pass_filter(audio, sr)
    audio_wiener = wiener_filter(audio_hp)
    audio_vad = apply_vad(audio_wiener, sr)

    # Do not hand back a path that was never written: a stale file from an
    # earlier run could otherwise be transcribed by mistake.
    if not save_audio(output_filepath, audio_vad, sr):
        return None

    return output_filepath
|
|
|
|
|
|
# Pipelines already built by _get_asr_pipeline, keyed by model name.
_ASR_PIPELINE_CACHE = {}


def _get_asr_pipeline(model_name):
    """Return the ASR pipeline for ``model_name``, building it on first use only."""
    if model_name not in _ASR_PIPELINE_CACHE:
        logging.info("Loading ASR model...")
        _ASR_PIPELINE_CACHE[model_name] = pipeline(
            "automatic-speech-recognition", model=model_name
        )
    return _ASR_PIPELINE_CACHE[model_name]


def transcribe_with_huggingface(filepath, model_name="cdactvm/w2v-bert-2.0-hindi_new"):
    """Transcribe an audio file and post-process the recognised text.

    The pipeline for each ``model_name`` is built once and reused, instead of
    being rebuilt on every call.

    Args:
        filepath: Path of the audio file handed to the ASR pipeline.
        model_name: Model id or local path given to ``transformers.pipeline``.

    Returns:
        The value returned by ``text_to_int`` after the post-processing
        chain, or ``""`` if any step raises (the error is logged).
    """
    try:
        asr_pipeline = _get_asr_pipeline(model_name)

        result = asr_pipeline(filepath)
        text_value = result.get('text', '')

        # Drop the literal "<s>" markers the model leaves in its output text.
        cleaned_text = text_value.replace("<s>", "")

        # Project-local helpers; presumably they normalise spoken numbers into
        # digits -- confirm against the helper modules.
        converted_to_list = convert_to_list(cleaned_text, text_to_list())
        processed_doubles = process_doubles(converted_to_list)
        replaced_words = replace_words(processed_doubles)
        converted_text = text_to_int(replaced_words)

        logging.info("Transcription completed.")
        return converted_text

    except Exception as e:
        logging.error(f"ASR transcription failed: {e}")
        return ""
|
|
|
|
if __name__ == "__main__":
    # Recording to process, given as a hard-coded absolute path.
    input_filepath = 'C:/Users/WCHL/Desktop/hp_sounds/101005/crm/hi/1728268817091.wav'

    # Stage 1: noise reduction; a falsy result means there is nothing to transcribe.
    processed_filepath = noise_reduction_pipeline(input_filepath)

    if not processed_filepath:
        logging.warning("Noise reduction failed; skipping ASR transcription.")
    else:
        # Stage 2: speech recognition on the processed file.
        transcription = transcribe_with_huggingface(processed_filepath)
        if not transcription:
            logging.warning("No transcription could be generated.")
        else:
            print("Transcription:", transcription)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|