from collections import Counter

from nltk.tokenize import RegexpTokenizer

from source.config import Config


class Vocab:
    """
    Offers word2index and index2word functionality after counting words in
    input sentences. Allows choosing the size of the vocabulary by taking the
    most common words.

    Explicitly reserves four indices, exposed as attributes:
    PADDING_INDEX (0), SOS (1), EOS (2) and UNKNOWN_WORD_INDEX (3).
    """

    def __init__(self, sentence_splitter=None):
        """
        Args:
            sentence_splitter: tokenizing function; called with a sentence
                string and expected to return an iterable of word tokens.
                When None, an nltk RegexpTokenizer matching plain words and
                words wrapped in angle brackets is used.
        """
        self.config = Config()
        self.counter = Counter()
        self.word2index = dict()
        self.index2word = dict()
        self.size = 0

        # predefined tokens
        self.PADDING_INDEX = 0
        self.SOS = 1
        self.EOS = 2
        self.UNKNOWN_WORD_INDEX = 3

        if sentence_splitter is None:
            # matches runs of word characters, and such runs wrapped in < >
            # (so a bracketed token is kept as a single word)
            word_regex = r'(?:\w+|<\w+>)'
            # tokenize the string into words
            sentence_splitter = RegexpTokenizer(word_regex).tokenize

        self.splitter = sentence_splitter

    def add_sentence(self, sentence: str):
        """
        Update word counts from sentence after tokenizing it into words
        """
        self.counter.update(self.splitter(sentence))

    def word_to_index(self, word: str) -> int:
        """
        Map word to index using the word2index dictionary of the vocabulary

        Args:
            word (str): word to be mapped

        Returns:
            int: index matched to the word, or UNKNOWN_WORD_INDEX when the
                word is not in the vocabulary
        """
        return self.word2index.get(word, self.UNKNOWN_WORD_INDEX)

    def index_to_word(self, index: int) -> str:
        """
        Map index to word using the index2word dictionary of the vocabulary

        Args:
            index (int): index to be mapped

        Returns:
            str: word matched to the index, or the word stored under
                UNKNOWN_WORD_INDEX when the index is not in the vocabulary

        Raises:
            KeyError: if neither the index nor UNKNOWN_WORD_INDEX has an
                entry in index2word (e.g. nothing has been loaded yet)
        """
        try:
            return self.index2word[index]
        except KeyError:
            return self.index2word[self.UNKNOWN_WORD_INDEX]

    def load_vocab(self, filepath: str):
        """
        Load the word2index and index2word dictionaries from a text file.

        Both dictionaries are reset before loading. Errors are reported on
        stdout instead of being raised; entries read before the error stay
        loaded.

        Args:
            filepath (str): path of the text file where the vocabulary is
                saved (i.e 'word2index.txt')

        Note:
            the lines in file are assumed to be in form: 'word SPACE index'.
            A header line is tolerated: if the first non-blank line does not
            parse as 'word SPACE index' it is skipped. Blank lines are
            ignored.
        """
        self.word2index = dict()
        self.index2word = dict()
        # NOTE(review): self.size is not updated here - confirm whether
        # callers expect it to reflect the loaded vocabulary.

        try:
            # NOTE(review): uses the platform default encoding, as before -
            # confirm it matches whatever wrote the file.
            with open(filepath) as file:
                first_content_line = True
                for line in file:
                    stripped = line.strip()
                    if not stripped:
                        # blank (often trailing) line carries no entry
                        continue

                    is_first = first_content_line
                    first_content_line = False

                    fields = stripped.split(' ')
                    try:
                        word, index = fields[0], int(fields[1])
                    except (IndexError, ValueError):
                        if is_first:
                            # header line: not a 'word index' pair, skip it
                            continue
                        raise

                    self.word2index[word] = index
                    self.index2word[index] = word
        except Exception as e:
            # best-effort API kept from the original: report, do not raise
            print(f"Error loading vocabulary from file {filepath}: {e}")