Spaces:

cdactvm
/

Hindi_ASR

Running

App Files Files Community

Hindi_ASR / convert2list.py

cdactvm

Upload 12 files

bfde6e2 verified about 1 month ago

raw

history blame

3.34 kB

	#!/usr/bin/env python
	# coding: utf-8

	# In[ ]:


	# import nbimporter
	import nbimporter
	from Text2List import text_to_list
	def convert_to_list(text, text_list):
	    """Split run-together text into known words separated by single spaces.

	    The text is scanned left to right. At each position the longest entry
	    of ``text_list`` that the remaining text starts with is emitted as one
	    piece. Characters at positions where no entry matches are collected
	    into a run, and that run is emitted as a single piece as soon as the
	    next word matches (or the text ends).

	    Args:
	        text: The string to segment. A falsy value yields ``''``.
	        text_list: Iterable of candidate words. Empty entries are ignored.

	    Returns:
	        The matched words and unmatched runs, in their original order,
	        joined with a single space.
	    """
	    if not text:
	        return ''

	    # Longest first, so a longer word wins over any shorter word that is a
	    # prefix of it. Empty entries are dropped: '' is a prefix of every
	    # string, so it would "match" without consuming input and the scan
	    # would never advance.
	    candidates = sorted((word for word in text_list if word),
	                        key=len, reverse=True)

	    pieces = []
	    pending = []  # unmatched characters seen since the last matched word
	    pos = 0
	    end = len(text)

	    while pos < end:
	        # Test the prefix in place instead of re-slicing the remaining text.
	        hit = next(
	            (word for word in candidates if text.startswith(word, pos)),
	            None,
	        )
	        if hit is None:
	            pending.append(text[pos])
	            pos += 1
	            continue

	        # Flush the unmatched run before the word that ends it.
	        if pending:
	            pieces.append(''.join(pending))
	            pending.clear()
	        pieces.append(hit)
	        pos += len(hit)

	    # Trailing characters that never matched any word.
	    if pending:
	        pieces.append(''.join(pending))

	    return ' '.join(pieces)

	if __name__ == "__main__":
	    # Sample input for a manual run: Hindi number words written without
	    # separators. It is defined here because the script previously used
	    # `text` without ever assigning it, which raised NameError.
	    text = "जीरोएकदोतीनचारपांचछहसातआठनौदसजीरोएकदोतीनचारपांच"
	    converted = convert_to_list(text, text_to_list())
	    print(converted)


	# In[ ]:


	# # import nbimporter
	# import nbimporter
	# from Text2List import text_to_list
	# def convert_to_list(text, text_list):
	# matched_words = []
	# unmatched_text = '' # To accumulate unmatched characters

	# # Sort text_list by length in descending order to prioritize longest matches first
	# text_list_sorted = sorted(text_list, key=len, reverse=True)

	# while text:
	# matched = False
	# for word in text_list_sorted:
	# if word in text:
	# # Add any accumulated unmatched text before appending the matched word
	# if unmatched_text:
	# matched_words.append(unmatched_text)
	# unmatched_text = '' # Reset unmatched text accumulator

	# matched_words.append(word)
	# text = text[len(word):] # Remove the matched part from text
	# matched = True
	# break

	# if not matched:
	# # Accumulate unmatched characters
	# unmatched_text += text[0]
	# text = text[1:]

	# # If there's any remaining unmatched text, add it to the result
	# if unmatched_text:
	# matched_words.append(unmatched_text)

	# # Join matched words and unmatched text with a space
	# result = ' '.join(matched_words)
	# return result

	# text = "जीरोएकदोतीनचार"

	# if __name__=="__main__":
	# converted=convert_to_list(text, text_to_list())
	# print(converted)


	# In[ ]:


	# Notebook-export residue: `get_ipython` is only defined when the code runs
	# under IPython/Jupyter. Look it up defensively so that importing or running
	# this file with plain Python does not fail with NameError.
	try:
	    _shell = get_ipython()
	except NameError:
	    _shell = None

	if _shell is not None:
	    # Same shell command the notebook cell ran.
	    _shell.system('git clone https://huggingface.co/StephennFernandes/wav2vec2-XLS-R-300m-konkani')


	# In[ ]: