In [None]:
!pip install sounddevice scipy torch transformers lang_trans nltk tqdm pyquran

In [1]:
from os import path
import sounddevice as sd
import scipy.io.wavfile as wav
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from lang_trans.arabic import buckwalter
from nltk import edit_distance
from tqdm import tqdm
import pyquran as q

In [2]:
def record(seconds=5, fs=16000):
    """Record a single-channel audio clip with sounddevice.

    Parameters
    ----------
    seconds : int or float, optional
        Duration of the recording in seconds (default 5).
    fs : int, optional
        Sample rate in Hz (default 16000).

    Returns
    -------
    tuple
        ``(fs, samples)`` where ``samples`` is column 0 of the array
        returned by ``sd.rec`` (the only channel, since ``channels=1``).
    """
    print("Recording...")
    myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=1)
    sd.wait()  # Wait until recording is finished
    print("Finished recording.")
    return fs, myrecording[:, 0]

In [3]:
def load_Quran_fine_tuned_elgeish_xlsr_53_model_and_processor():
    """Load the "Nuwaisir/Quran_speech_recognizer" checkpoint into the globals.

    Assigns ``loaded_model`` (switched to eval mode) and ``loaded_processor``,
    replacing whatever model/processor was loaded before.
    """
    global loaded_model, loaded_processor
    checkpoint = "Nuwaisir/Quran_speech_recognizer"
    model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
    loaded_model = model.eval()
    loaded_processor = Wav2Vec2Processor.from_pretrained(checkpoint)

In [4]:
def load_elgeish_xlsr_53_model_and_processor():
    """Load the "elgeish/wav2vec2-large-xlsr-53-arabic" checkpoint into the globals.

    Assigns ``loaded_model`` (switched to eval mode) and ``loaded_processor``,
    replacing whatever model/processor was loaded before.
    """
    global loaded_model, loaded_processor
    checkpoint = "elgeish/wav2vec2-large-xlsr-53-arabic"
    model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
    loaded_model = model.eval()
    loaded_processor = Wav2Vec2Processor.from_pretrained(checkpoint)

In [5]:
def predict(single):
    """Transcribe one audio example with the currently loaded model.

    Relies on the globals ``loaded_model`` and ``loaded_processor``, so one of
    the ``load_*_model_and_processor`` functions must have been called first.

    Parameters
    ----------
    single : dict
        Must contain ``"speech"`` (the audio samples). ``"sampling_rate"`` is
        used when present; otherwise 16000 is assumed.

    Returns
    -------
    dict
        The same ``single`` dict, with the decoded text (converted by
        ``buckwalter.untrans``) stored under ``"predicted"``.
    """
    # Forward the rate the audio was actually captured at instead of a
    # hard-coded constant, so the processor sees the real value.
    # NOTE(review): the processor presumably rejects a rate that differs from
    # the one the model was trained on -- confirm against the transformers docs.
    sampling_rate = single.get("sampling_rate", 16000)
    inputs = loaded_processor(
        single["speech"],
        sampling_rate=sampling_rate,
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():  # inference only, no gradients needed
        logits = loaded_model(inputs.input_values).logits
    predicted = torch.argmax(logits, dim=-1)
    predicted[predicted == -100] = loaded_processor.tokenizer.pad_token_id  # see fine-tuning script
    pred_1 = loaded_processor.tokenizer.batch_decode(predicted)[0]
    single["predicted"] = buckwalter.untrans(pred_1)
    return single

In [6]:
def last_para_str(taskeel=False):
    """Return suras 78 through 114 as one space-separated string.

    For each sura number, the items returned by ``q.quran.get_sura`` (requested
    without the basmalah) are joined with single spaces and followed by one
    extra space, so the result ends with a trailing space.

    Parameters
    ----------
    taskeel : bool, optional
        Forwarded to ``get_sura`` as ``with_tashkeel`` (default False).
    """
    sura_chunks = []
    for sura_number in range(78, 115):
        parts = q.quran.get_sura(sura_number, with_tashkeel=taskeel, basmalah=False)
        sura_chunks.append(' '.join(parts) + ' ')
    return ''.join(sura_chunks)

def find_match_2(q_str, s, spaces, threshhold = 10):
    """Locate the window(s) of ``q_str`` closest to ``s`` by edit distance.

    For every index ``i`` in ``spaces`` a candidate window starting at
    ``i + 1`` and spanning ``len(s) + len(s) // 3`` characters is compared to
    ``s`` with ``edit_distance``. Windows that would run past the end of
    ``q_str`` are truncated to the end of the text rather than dropped, so the
    final words of ``q_str`` can still be matched.

    Parameters
    ----------
    q_str : str
        Text to search in.
    s : str
        Text to look for.
    spaces : iterable of int
        Indices in ``q_str``; each candidate window starts one past an index.
    threshhold : int, optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(min_dist, min_dist_pos)``: the smallest distance found and the list
        of window start positions achieving it. If no window was evaluated the
        result is ``(1000000000, [])``.
    """
    len_q = len(q_str)
    window = len(s) + len(s) // 3  # allow the match to be up to a third longer than s
    min_dist = 1000000000
    min_dist_pos = []
    for i in tqdm(spaces):
        j = i + 1
        if j >= len_q:
            # Nothing left to compare after this index.
            continue
        # Clamp instead of stopping: a tail window is shorter but still valid.
        k = min(j + window, len_q)
        dist = edit_distance(q_str[j:k], s)
        if dist < min_dist:
            min_dist = dist
            min_dist_pos = [j]
        elif dist == min_dist:
            min_dist_pos.append(j)
    return min_dist, min_dist_pos

def find_all_index(s, ch):
    """Return the list of positions in ``s`` whose element equals ``ch``."""
    positions = []
    for idx, item in enumerate(s):
        if item == ch:
            positions.append(idx)
    return positions

In [7]:
# Search text (with diacritics) and the indices that precede each candidate
# window start. -1 is prepended so that index + 1 also yields position 0.
last_para = last_para_str(taskeel=True)
last_para_spaces = [-1] + find_all_index(last_para, ' ')

In [13]:
def pipeline():
    """Record audio, transcribe it, and print the closest passages of ``last_para``.

    Prints the transcription, the best edit distance, the number of positions
    sharing that distance, and 200 characters of ``last_para`` from each one.
    """
    sample_rate, audio = record()
    result = predict({"speech": audio, "sampling_rate": sample_rate})
    transcript = result["predicted"]
    print(transcript)
    best_dist, starts = find_match_2(last_para, transcript, spaces=last_para_spaces)
    print("distance:", best_dist)
    print("number of matches:", len(starts))
    for start in starts:
        print(last_para[start:start + 200], '\n')


### Load the elgeish_xlsr_53 model

In [9]:
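# Optional: uncomment to use the base elgeish_xlsr_53 model. Skip the fine-tuned
# loader cell below in that case, since it overwrites the same globals.
# load_elgeish_xlsr_53_model_and_processor()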

### Load Quran fine-tuned elgeish_xlsr_53 model

In [10]:
# Populate the globals loaded_model / loaded_processor that predict() reads.
load_Quran_fine_tuned_elgeish_xlsr_53_model_and_processor()

In [14]:
# Start reciting as soon as this cell runs: pipeline() records the first
# 5 seconds of audio, transcribes it, and prints the closest matching text.
pipeline()

Recording...
Finished recording.
لِإِلَا فِ قْرَايشِ إِلَا فِيهِ


100%|█████████▉| 2304/2309 [00:03<00:00, 587.76it/s]

distance: 23
number of matches: 1
لِإِيلَفِ قُرَيْشٍ إِلَفِهِمْ رِحْلَةَ الشِّتَاءِ وَالصَّيْفِ فَلْيَعْبُدُوا رَبَّ هَذَا الْبَيْتِ الَّذِى أَطْعَمَهُم مِّن جُوعٍ وَءَامَنَهُم مِّنْ خَوْفٍ أَرَءَيْتَ الَّذِى يُكَذِّبُ بِالدِّينِ فَذَ 




