#!/usr/bin/env python | |
# coding: utf-8 | |
# In[ ]: | |
# import nbimporter | |
import nbimporter | |
from Text2List import text_to_list | |
def convert_to_list(text, text_list):
    """Segment ``text`` into known words and return them joined by spaces.

    The text is scanned from left to right. At each position the longest
    entry of ``text_list`` that the remaining text starts with is taken as
    one token. Characters that do not begin any known word are collected
    into a run, and each run is emitted as a single token at the place
    where it occurred.

    Args:
        text: The string to segment. A falsy value yields ``''``.
        text_list: Iterable of known words. Empty strings are ignored
            because they match everywhere without consuming any input.

    Returns:
        The matched words and unmatched runs, in order, joined with a
        single space.
    """
    if not text:
        return ''

    # Longest first so a word always wins over a shorter word that is its
    # prefix. Empty entries are dropped: an empty word "matches" at every
    # position but never advances the scan, which would loop forever.
    candidates = sorted(
        (word for word in text_list if word), key=len, reverse=True
    )

    tokens = []
    unmatched = []  # characters seen since the last matched word
    pos = 0
    end = len(text)
    while pos < end:
        # startswith(word, pos) tests at the current offset without
        # re-slicing the remaining text on every step.
        hit = next(
            (word for word in candidates if text.startswith(word, pos)),
            None,
        )
        if hit is None:
            unmatched.append(text[pos])
            pos += 1
            continue
        # Flush the pending unmatched run before the matched word so the
        # original ordering of the text is preserved.
        if unmatched:
            tokens.append(''.join(unmatched))
            unmatched = []
        tokens.append(hit)
        pos += len(hit)

    # Trailing characters that never matched any word.
    if unmatched:
        tokens.append(''.join(unmatched))
    return ' '.join(tokens)
# text = "जीरोएकदोतीनचारपांचछहसातआठनौदसजीरोएकदोतीनचारपांच" | |
if __name__ == "__main__":
    # Sample run-together input for a direct script run. Without this
    # assignment ``text`` is undefined here and the call below raises
    # NameError.
    text = "जीरोएकदोतीनचारपांचछहसातआठनौदसजीरोएकदोतीनचारपांच"
    converted = convert_to_list(text, text_to_list())
    print(converted)
# In[ ]: | |
# # import nbimporter | |
# import nbimporter | |
# from Text2List import text_to_list | |
# def convert_to_list(text, text_list): | |
# matched_words = [] | |
# unmatched_text = '' # To accumulate unmatched characters | |
# # Sort text_list by length in descending order to prioritize longest matches first | |
# text_list_sorted = sorted(text_list, key=len, reverse=True) | |
# while text: | |
# matched = False | |
# for word in text_list_sorted: | |
# if word in text: | |
# # Add any accumulated unmatched text before appending the matched word | |
# if unmatched_text: | |
# matched_words.append(unmatched_text) | |
# unmatched_text = '' # Reset unmatched text accumulator | |
# matched_words.append(word) | |
# text = text[len(word):] # Remove the matched part from text | |
# matched = True | |
# break | |
# if not matched: | |
# # Accumulate unmatched characters | |
# unmatched_text += text[0] | |
# text = text[1:] | |
# # If there's any remaining unmatched text, add it to the result | |
# if unmatched_text: | |
# matched_words.append(unmatched_text) | |
# # Join matched words and unmatched text with a space | |
# result = ' '.join(matched_words) | |
# return result | |
# text = "जीरोएकदोतीनचार" | |
# if __name__=="__main__": | |
# converted=convert_to_list(text, text_to_list()) | |
# print(converted) | |
# In[ ]: | |
# Clone the wav2vec2-XLS-R-300m-konkani repository from the Hugging Face Hub
# into the current working directory.
# ``get_ipython`` is only defined inside IPython/Jupyter; under a plain
# Python interpreter the name does not exist, so fall back to ``subprocess``.
_KONKANI_REPO_URL = 'https://huggingface.co/StephennFernandes/wav2vec2-XLS-R-300m-konkani'
try:
    _ipython_shell = get_ipython()
except NameError:
    import subprocess

    # check=False: like the IPython shell escape, a non-zero git exit status
    # (for example, the target directory already exists) does not raise.
    subprocess.run(['git', 'clone', _KONKANI_REPO_URL], check=False)
else:
    _ipython_shell.system('git clone ' + _KONKANI_REPO_URL)
# In[ ]: | |