text fix

Files changed (16) hide show

app/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (152 Bytes). View file

app/__pycache__/main.cpython-310.pyc ADDED Viewed

Binary file (1.61 kB). View file

app/__pycache__/matcher.cpython-310.pyc ADDED Viewed

Binary file (807 Bytes). View file

app/__pycache__/mfcc.cpython-310.pyc ADDED Viewed

Binary file (1.63 kB). View file

app/__pycache__/string_processor.cpython-310.pyc ADDED Viewed

Binary file (657 Bytes). View file

app/__pycache__/transcriber.cpython-310.pyc ADDED Viewed

Binary file (1.16 kB). View file

app/main.py CHANGED Viewed

@@ -15,6 +15,8 @@ app = FastAPI(
         {
             "url": "http://127.0.0.1:8000/api/v1",
             "description": "Local Server",
             "url": "https://r3vibe-mother-tongue.hf.space/api/v1",
             "description": "Huggingface Server",
         }

         {
             "url": "http://127.0.0.1:8000/api/v1",
             "description": "Local Server",
+        },
+        {
             "url": "https://r3vibe-mother-tongue.hf.space/api/v1",
             "description": "Huggingface Server",
         }

app/matcher.py CHANGED Viewed

@@ -18,8 +18,6 @@ def sequence_match(a, b):
     return difflib.SequenceMatcher(None, a, b).ratio()
 def match(original, transcription):
     sequence = sequence_match(original, transcription)
     phonetic = phonetic_match(original, transcription)

     return difflib.SequenceMatcher(None, a, b).ratio()
 def match(original, transcription):
     sequence = sequence_match(original, transcription)
     phonetic = phonetic_match(original, transcription)

app/routers/V1/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (163 Bytes). View file

app/routers/V1/__pycache__/v1_routers.cpython-310.pyc ADDED Viewed

Binary file (411 Bytes). View file

app/routers/V1/voice/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (169 Bytes). View file

app/routers/V1/voice/__pycache__/voice_router.cpython-310.pyc ADDED Viewed

Binary file (1.94 kB). View file

app/routers/V1/voice/voice_router.py CHANGED Viewed

@@ -6,7 +6,7 @@ import os
 from app.transcriber import get_transcription
 from app.matcher import match
 from app.mfcc import mfcc_similarty_check
-from app.string_processor import process_text
 """ initialize the router """
@@ -51,7 +51,8 @@ async def transcribe_audio(
         try:
             text = get_transcription(filename_recorded)
-            sequence, phonetic = match(matcher_text, process_text(text))
             Euclidean, Cosine = mfcc_similarty_check(
                 filename_original, filename_recorded
             )

 from app.transcriber import get_transcription
 from app.matcher import match
 from app.mfcc import mfcc_similarty_check
+from app.string_processor import clean_transcription
 """ initialize the router """
         try:
             text = get_transcription(filename_recorded)
+            text = clean_transcription(text)
+            sequence, phonetic = match(matcher_text, text)
             Euclidean, Cosine = mfcc_similarty_check(
                 filename_original, filename_recorded
             )

app/routers/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (160 Bytes). View file

app/routers/__pycache__/routes.cpython-310.pyc ADDED Viewed

Binary file (378 Bytes). View file

app/string_processor.py CHANGED Viewed

@@ -1,18 +1,22 @@
-import string
 import re
-def process_text(text):
-    # Step 1: Strip whitespace from both ends
-    text = text.strip()
-    # Step 2: Remove all punctuation (including full stops and commas)
-    text = text.translate(str.maketrans("", "", string.punctuation))
-    # Step 3: Extract sentences (assuming you want to keep the text as a whole sentence)
-    sentences = re.split(r"(?<=[.!?]) +", text)
-    # Combine the sentences back into a single string without punctuation
-    processed_text = " ".join(sentences)
-    return processed_text

+import unicodedata
 import re
+def clean_transcription(text):
+    # Normalize the text to NFKD form
+    normalized_text = unicodedata.normalize('NFKD', text)
+    # Remove diacritics by dropping the combining marks that NFKD decomposition split off
+    cleaned_text = ''.join([c for c in normalized_text if not unicodedata.combining(c)])
+    # Explicitly remove every ʻ character (str.replace strips all occurrences, not only a leading one)
+    cleaned_text = cleaned_text.replace('ʻ', '')
+    # Remove every remaining character that is neither a word character nor whitespace (e.g. punctuation)
+    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
+    # Ensure the text is stripped of any unwanted leading or trailing whitespace
+    cleaned_text = cleaned_text.strip()
+    return cleaned_text