File size: 4,291 Bytes
2080fde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import nltk
import re
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
from nltk import pos_tag_sents
from .dictionary import Dictionary
from .filter import filter_text as ft
from . import format_ph as ph

# Name of the NLTK package that provides the part-of-speech tagger model
_TAGGER_PACKAGE = 'averaged_perceptron_tagger'

# Look for the tagger model in the local NLTK data paths first and only
# download it when that lookup fails.
# NOTE(review): newer NLTK releases appear to look the tagger up under a
# different resource name ('averaged_perceptron_tagger_eng') - confirm against
# the NLTK version this package pins.
try:
    nltk.data.find('taggers/' + _TAGGER_PACKAGE + '.zip')
except LookupError:
    nltk.download(_TAGGER_PACKAGE)


def replace_first(target, replacement, text):
    """
    Replaces the first whole-word, case-insensitive occurrence of a word

    The target is matched literally (regex metacharacters in it have no special
    meaning) and the replacement is inserted verbatim (backslashes and group
    references in it are not interpreted).

    :param target: Word to search for. None or an empty string leaves text unchanged
    :type target: str
    :param replacement: Text inserted in place of the first match
    :type replacement: str
    :param text: Text to search in
    :type text: str
    :return: The text with the first match replaced, or the text unchanged if there is no match
    :rtype: str
    """
    # Skip if target invalid
    if not target:
        return text
    # Escape the target so tokens such as 'a.b' or 'c++' are matched literally
    # instead of being interpreted as (possibly invalid) regex syntax
    pattern = r'\b' + re.escape(target) + r'\b'
    # A function replacement keeps re.sub from parsing the replacement string
    # as a template (e.g. '\1' or '\g<0>')
    return re.sub(pattern, lambda _match: replacement, text, count=1, flags=re.IGNORECASE)


class H2p:
    """
    Heteronym to phoneme parser

    Tokenizes text, tags every token with its part of speech and replaces the
    words found in the heteronym dictionary with their phonemes.
    """

    def __init__(self, dict_path=None, preload=False, phoneme_format=''):
        """
        Creates a H2p parser

        Supported phoneme formats:
            - Space delimited
            - Space delimited surrounded by { }

        :param dict_path: Path to a heteronym dictionary json file. Built-in dictionary will be used if None
        :type dict_path: str
        :param preload: Preloads the tokenizer and tagger during initialization
        :type preload: bool
        :param phoneme_format: Stored on the instance as phoneme_format. The methods of this class do not read it
        :type phoneme_format: str
        """

        # Supported phoneme formats
        self.phoneme_format = phoneme_format
        self.dict = Dictionary(dict_path)
        self.tokenize = TweetTokenizer().tokenize
        # Single-sentence tagger used by preload(), replace_het() and tag()
        self.get_tags = pos_tag
        if preload:
            self.preload()

    def preload(self):
        """
        Preloads the tokenizer and tagger by running both once on a one-word sample

        The calls are made outside of assert statements so that they still run
        (and the sanity checks still apply) when Python is started with -O.

        :raises AssertionError: If the tokenizer or tagger returns an unexpected result
        """
        tokens = self.tokenize('a')
        if tokens != ['a']:
            raise AssertionError('Unexpected tokenizer output during preload')
        tagged = self.get_tags(tokens)
        if tagged[0][0] != 'a':
            raise AssertionError('Unexpected tagger output during preload')

    def contains_het(self, text):
        """
        Checks if a text line contains a heteronym

        :param text: Text line to check
        :type text: str
        :return: Tuple of (True if at least one heteronym was found, list of the matching tokens)
        :rtype: tuple
        """
        # Filter and tokenize the text
        words = self.tokenize(ft(text))
        # Keep the tokens that are in the dictionary
        hets = [word for word in words if self.dict.contains(word)]
        return bool(hets), hets

    def _replace_tagged(self, tagged_words, text):
        """
        Replaces every dictionary word of one tagged line with its phonemes

        :param tagged_words: Iterable of (word, pos tag) pairs for the line
        :param text: Text line the replacements are applied to
        :type text: str
        :return: The text line after all replacements
        :rtype: str
        """
        for word, pos in tagged_words:
            # Skip if word not in dictionary
            if not self.dict.contains(word):
                continue
            # Get phonemes for the word given its pos tag
            phonemes = self.dict.get_phoneme(word, pos)
            # Format phonemes (presumably space delimited, surrounded by { } - confirm in format_ph)
            f_ph = ph.with_cb(ph.to_sds(phonemes))
            # Replace the first remaining instance of the word with its phonemes
            text = replace_first(word, f_ph, text)
        return text

    def replace_het(self, text):
        """
        Replaces heteronyms in a text line with phonemes

        Tokens and pos tags are computed from the filtered text, the
        replacements are applied to the original text.

        :param text: Text line to process
        :type text: str
        :return: The text line with heteronyms replaced
        :rtype: str
        """
        # Filter the text
        working_text = ft(text, preserve_case=True)
        # Tokenize
        words = self.tokenize(working_text)
        # Get pos tags and replace
        return self._replace_tagged(self.get_tags(words), text)

    def replace_het_list(self, text_list):
        """
        Replaces heteronyms in a list of text lines

        Slightly faster than replace_het() called on each line, all lines are
        tagged with a single pos_tag_sents call.

        :param text_list: Text lines to process. The list is modified in place
        :type text_list: list
        :return: The same list object, with heteronyms replaced in its lines
        :rtype: list
        """
        # Filter the text
        working_text_list = [ft(text, preserve_case=True) for text in text_list]
        # Tokenize
        list_sentence_words = [self.tokenize(text) for text in working_text_list]
        # Get pos tags list
        tags_list = pos_tag_sents(list_sentence_words)
        # Loop through lines
        for index, tagged_words in enumerate(tags_list):
            updated = self._replace_tagged(tagged_words, text_list[index])
            # Only write back lines that changed, lines without heteronyms are left untouched
            if updated != text_list[index]:
                text_list[index] = updated
        return text_list

    def tag(self, text):
        """
        Tags a text line

        :param text: Text line to tag
        :type text: str
        :return: List with the pos tag of each token, in token order
        :rtype: list
        """
        # Filter and tokenize the text
        words = self.tokenize(ft(text, preserve_case=True))
        # Only return element 1 (the tag) of each (word, tag) pair
        return [tagged[1] for tagged in self.get_tags(words)]