File size: 2,907 Bytes
cb7427c
 
29003f1
cb7427c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2bdc1ae
 
cb7427c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from source.config import Config


class Vocab:
    """
    Word <-> index vocabulary backed by two dictionaries plus a word counter.

    Sentences passed to ``add_sentence`` are tokenized and their words counted in
    ``counter``; the ``word2index`` / ``index2word`` mappings are filled by ``load_vocab``.
    Four indices are explicitly reserved: 0 (<pad>), 1 (<sos>), 2 (<eos>) and 3 (<unk>).

    NOTE(review): building the mappings from the most common counted words is not part
    of the code visible in this class - confirm where that happens.
    """
    def __init__(self, sentence_splitter=None):
        """
        Args:
        sentence_splitter: tokenizing function taking a sentence string and returning
            an iterable of words. When None, a regex tokenizer is used that keeps
            ``<token>``-style markers (e.g. ``<sos>``) together as single words.
        """
        self.config = Config()

        self.counter = Counter()
        self.word2index = dict()
        self.index2word = dict()
        # NOTE(review): size is initialised here but not updated by any method in this class
        self.size = 0

        # predefined tokens
        self.PADDING_INDEX = 0
        self.SOS = 1
        self.EOS = 2
        self.UNKNOWN_WORD_INDEX = 3

        if sentence_splitter is None:
            # matches runs of word characters, or a run of word characters wrapped in < >
            word_regex = r'(?:\w+|<\w+>)'
            # tokenize the string into words
            sentence_splitter = RegexpTokenizer(word_regex).tokenize
        self.splitter = sentence_splitter

    def add_sentence(self, sentence: str) -> None:
        """
        Update word counts from sentence after tokenizing it into words
        """
        self.counter.update(self.splitter(sentence))

    def word_to_index(self, word: str) -> int:
        """ Map word to index from word2index dictionary in vocabulary

        Args:
            word (str): word to be mapped

        Returns:
            int: index matched to the word, or UNKNOWN_WORD_INDEX when the word is not known
        """
        return self.word2index.get(word, self.UNKNOWN_WORD_INDEX)

    def index_to_word(self, index: int) -> str:
        """ Map index to word from index2word dictionary in vocabulary

        Args:
            index (int): index to be mapped

        Returns:
            str: word matched to the index, or the word stored at UNKNOWN_WORD_INDEX
                when the index is not known

        Raises:
            KeyError: if neither the index nor UNKNOWN_WORD_INDEX is present in index2word
                (e.g. before any vocabulary has been loaded)
        """
        try:
            return self.index2word[index]
        except KeyError:
            return self.index2word[self.UNKNOWN_WORD_INDEX]

    def load_vocab(self, filepath: str) -> None:
        """ Load the word2index and index2word dictionaries from a text file.

        Both dictionaries are cleared first. Loading is best-effort: on any error a
        message is printed and the method returns without raising, leaving whatever
        entries were read before the error.

        Args:
            filepath (str): path of the text file where the vocabulary is saved (i.e 'word2index.txt')
                Note: the lines in file are expected to be in form: 'word WHITESPACE index'.
                Blank lines are skipped, and the first non-blank line is treated as a header
                (and skipped) when it does not parse as an entry.
        """

        self.word2index = dict()
        self.index2word = dict()

        try:
            # NOTE(review): opened with the platform default encoding, as before -
            # confirm it matches whatever writes the vocabulary file.
            with open(filepath) as file:
                header_allowed = True
                for line_number, line in enumerate(file, start=1):
                    # split() with no argument tolerates tabs and repeated spaces
                    fields = line.split()
                    if not fields:
                        continue

                    could_be_header = header_allowed
                    header_allowed = False
                    try:
                        word, index = fields[0], int(fields[1])
                    except (IndexError, ValueError):
                        if could_be_header:
                            # e.g. a 'word index' title row
                            continue
                        raise ValueError(
                            f"malformed line {line_number}: {line.rstrip()!r}"
                        ) from None

                    self.word2index[word] = index
                    self.index2word[index] = word
        except Exception as e:
            # deliberately broad: callers rely on this method reporting instead of raising
            print(f"Error loading vocabulary from file {filepath}: {e}")