anuragshas committed on
Commit
64ae049
1 Parent(s): f1ff9aa

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +7 -5
README.md CHANGED
@@ -38,7 +38,7 @@ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
38
  import pandas as pd
39
 
40
  # Evaluation notebook contains the procedure to download the data
41
- df = pd.read_csv("/content/te/test.tsv", sep="\t")
42
  df["path"] = "/content/te/clips/" + df["path"]
43
  test_dataset = Dataset.from_pandas(df)
44
 
@@ -72,7 +72,7 @@ from sklearn.model_selection import train_test_split
72
  import pandas as pd
73
 
74
  # Evaluation notebook contains the procedure to download the data
75
- df = pd.read_csv("/content/te/test.tsv", sep="\t")
76
  df["path"] = "/content/te/clips/" + df["path"]
77
  test_dataset = Dataset.from_pandas(df)
78
  wer = load_metric("wer")
@@ -81,12 +81,14 @@ processor = Wav2Vec2Processor.from_pretrained("anuragshas/wav2vec2-large-xlsr-53
81
  model = Wav2Vec2ForCTC.from_pretrained("anuragshas/wav2vec2-large-xlsr-53-telugu")
82
  model.to("cuda")
83
 
84
- chars_to_ignore_regex = '[\,\?\.\!\-\_\;\:\"\“\%\‘\”\।\’\'\&]'
85
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
86
 
87
  def normalizer(text):
88
  # Use your custom normalizer
89
- text = text.replace("\\n","\n")
 
 
90
  text = ' '.join(text.split())
91
  text = re.sub(r'''([a-z]+)''','',text,flags=re.IGNORECASE)
92
  text = re.sub(r'''%'''," శాతం ", text)
@@ -117,7 +119,7 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"],
117
 
118
  **Test Result**: 44.98%
119
  ## Training
120
- 70% of the OpenSLR Marathi dataset was used for training.
121
 
122
  Train Split of annotations is [here](https://www.dropbox.com/s/xqc0wtour7f9h4c/train.tsv)
123
 
 
38
  import pandas as pd
39
 
40
  # Evaluation notebook contains the procedure to download the data
41
+ df = pd.read_csv("/content/te/test.tsv", sep="\\t")
42
  df["path"] = "/content/te/clips/" + df["path"]
43
  test_dataset = Dataset.from_pandas(df)
44
 
 
72
  import pandas as pd
73
 
74
  # Evaluation notebook contains the procedure to download the data
75
+ df = pd.read_csv("/content/te/test.tsv", sep="\\t")
76
  df["path"] = "/content/te/clips/" + df["path"]
77
  test_dataset = Dataset.from_pandas(df)
78
  wer = load_metric("wer")
 
81
  model = Wav2Vec2ForCTC.from_pretrained("anuragshas/wav2vec2-large-xlsr-53-telugu")
82
  model.to("cuda")
83
 
84
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\_\\;\\:\\"\\“\\%\\‘\\”\\।\\’\\'\\&]'
85
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
86
 
87
  def normalizer(text):
88
  # Use your custom normalizer
89
+ text = text.replace("\\\
90
+ ","\
91
+ ")
92
  text = ' '.join(text.split())
93
  text = re.sub(r'''([a-z]+)''','',text,flags=re.IGNORECASE)
94
  text = re.sub(r'''%'''," శాతం ", text)
 
119
 
120
  **Test Result**: 44.98%
121
  ## Training
122
+ 70% of the OpenSLR Telugu dataset was used for training.
123
 
124
  Train Split of annotations is [here](https://www.dropbox.com/s/xqc0wtour7f9h4c/train.tsv)
125