hussain-shk Shanks0465 commited on
Commit
ef23634
0 Parent(s):

Duplicate from ai4bharat/IndicTrans-MultilingualTranslation

Browse files

Co-authored-by: Umashankar <[email protected]>

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +31 -0
  2. README.md +14 -0
  3. api/api.py +152 -0
  4. api/punctuate.py +220 -0
  5. app.py +107 -0
  6. indic_nlp_library/LICENSE +9 -0
  7. indic_nlp_library/README.md +142 -0
  8. indic_nlp_library/contrib/README.md +7 -0
  9. indic_nlp_library/contrib/correct_moses_tokenizer.py +29 -0
  10. indic_nlp_library/contrib/hindi_to_kannada_transliterator.py +62 -0
  11. indic_nlp_library/contrib/indic_scraper_project_sample.ipynb +569 -0
  12. indic_nlp_library/docs/Makefile +153 -0
  13. indic_nlp_library/docs/cmd.rst +8 -0
  14. indic_nlp_library/docs/code.rst +5 -0
  15. indic_nlp_library/docs/conf.py +242 -0
  16. indic_nlp_library/docs/index.rst +22 -0
  17. indic_nlp_library/docs/indicnlp.MD +122 -0
  18. indic_nlp_library/docs/indicnlp.cli.rst +11 -0
  19. indic_nlp_library/docs/indicnlp.morph.rst +11 -0
  20. indic_nlp_library/docs/indicnlp.normalize.rst +15 -0
  21. indic_nlp_library/docs/indicnlp.pdf +0 -0
  22. indic_nlp_library/docs/indicnlp.rst +47 -0
  23. indic_nlp_library/docs/indicnlp.script.rst +26 -0
  24. indic_nlp_library/docs/indicnlp.syllable.rst +11 -0
  25. indic_nlp_library/docs/indicnlp.tokenize.rst +26 -0
  26. indic_nlp_library/docs/indicnlp.transliterate.rst +34 -0
  27. indic_nlp_library/docs/make.bat +35 -0
  28. indic_nlp_library/docs/modules.rst +7 -0
  29. indic_nlp_library/indicnlp/__init__.py +10 -0
  30. indic_nlp_library/indicnlp/cli/__init__.py +0 -0
  31. indic_nlp_library/indicnlp/cli/cliparser.py +266 -0
  32. indic_nlp_library/indicnlp/common.py +58 -0
  33. indic_nlp_library/indicnlp/langinfo.py +488 -0
  34. indic_nlp_library/indicnlp/loader.py +35 -0
  35. indic_nlp_library/indicnlp/morph/__init__.py +0 -0
  36. indic_nlp_library/indicnlp/morph/unsupervised_morph.py +142 -0
  37. indic_nlp_library/indicnlp/normalize/__init__.py +0 -0
  38. indic_nlp_library/indicnlp/normalize/indic_normalize.py +984 -0
  39. indic_nlp_library/indicnlp/script/__init__.py +0 -0
  40. indic_nlp_library/indicnlp/script/english_script.py +154 -0
  41. indic_nlp_library/indicnlp/script/indic_scripts.py +301 -0
  42. indic_nlp_library/indicnlp/script/phonetic_sim.py +59 -0
  43. indic_nlp_library/indicnlp/syllable/__init__.py +0 -0
  44. indic_nlp_library/indicnlp/syllable/syllabifier.py +302 -0
  45. indic_nlp_library/indicnlp/test/__init__.py +0 -0
  46. indic_nlp_library/indicnlp/test/unit/__init__.py +0 -0
  47. indic_nlp_library/indicnlp/tokenize/__init__.py +0 -0
  48. indic_nlp_library/indicnlp/tokenize/indic_detokenize.py +134 -0
  49. indic_nlp_library/indicnlp/tokenize/indic_tokenize.py +111 -0
  50. indic_nlp_library/indicnlp/tokenize/sentence_tokenize.py +268 -0
.gitattributes ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.npy filter=lfs diff=lfs merge=lfs -text
13
+ *.npz filter=lfs diff=lfs merge=lfs -text
14
+ *.onnx filter=lfs diff=lfs merge=lfs -text
15
+ *.ot filter=lfs diff=lfs merge=lfs -text
16
+ *.parquet filter=lfs diff=lfs merge=lfs -text
17
+ *.pickle filter=lfs diff=lfs merge=lfs -text
18
+ *.pkl filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pt filter=lfs diff=lfs merge=lfs -text
21
+ *.pth filter=lfs diff=lfs merge=lfs -text
22
+ *.rar filter=lfs diff=lfs merge=lfs -text
23
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
24
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
25
+ *.tflite filter=lfs diff=lfs merge=lfs -text
26
+ *.tgz filter=lfs diff=lfs merge=lfs -text
27
+ *.wasm filter=lfs diff=lfs merge=lfs -text
28
+ *.xz filter=lfs diff=lfs merge=lfs -text
29
+ *.zip filter=lfs diff=lfs merge=lfs -text
30
+ *.zst filter=lfs diff=lfs merge=lfs -text
31
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: IndicTrans MultilingualTranslation
3
+ emoji: 🌍
4
+ colorFrom: indigo
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 3.3.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ duplicated_from: ai4bharat/IndicTrans-MultilingualTranslation
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
api/api.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ import re
4
+ from math import floor, ceil
5
+ from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils
6
+ # from nltk.tokenize import sent_tokenize
7
+ from flask import Flask, request, jsonify
8
+ from flask_cors import CORS, cross_origin
9
+ import webvtt
10
+ from io import StringIO
11
+ from mosestokenizer import MosesSentenceSplitter
12
+
13
+ from indicTrans.inference.engine import Model
14
+ from punctuate import RestorePuncts
15
+ from indicnlp.tokenize.sentence_tokenize import sentence_split
16
+
17
+ app = Flask(__name__)
18
+ cors = CORS(app)
19
+ app.config['CORS_HEADERS'] = 'Content-Type'
20
+
21
+ indic2en_model = Model(expdir='models/v3/indic-en')
22
+ en2indic_model = Model(expdir='models/v3/en-indic')
23
+ m2m_model = Model(expdir='models/m2m')
24
+
25
+ rpunct = RestorePuncts()
26
+
27
+ indic_language_dict = {
28
+ 'Assamese': 'as',
29
+ 'Hindi' : 'hi',
30
+ 'Marathi' : 'mr',
31
+ 'Tamil' : 'ta',
32
+ 'Bengali' : 'bn',
33
+ 'Kannada' : 'kn',
34
+ 'Oriya' : 'or',
35
+ 'Telugu' : 'te',
36
+ 'Gujarati' : 'gu',
37
+ 'Malayalam' : 'ml',
38
+ 'Punjabi' : 'pa',
39
+ }
40
+
41
+ splitter = MosesSentenceSplitter('en')
42
+
43
def get_inference_params():
    """Pick the translation model and language codes for the current request.

    Reads ``source_language`` and ``target_language`` (human-readable names
    such as ``'Hindi'`` or ``'English'``) from the posted form data.

    Returns:
        A ``(model, source_lang, target_lang)`` tuple where ``model`` is one
        of the module-level engines and the language values are the short
        codes from ``indic_language_dict`` (or ``'en'`` for English).

    Raises:
        ValueError: if the requested pair is not one of Indic->English,
            English->Indic or Indic->Indic. Previously this case fell through
            every branch and crashed with ``UnboundLocalError`` on return.
    """
    source_language = request.form['source_language']
    target_language = request.form['target_language']

    source_is_indic = source_language in indic_language_dict
    target_is_indic = target_language in indic_language_dict

    if source_is_indic and target_language == 'English':
        return indic2en_model, indic_language_dict[source_language], 'en'
    if source_language == 'English' and target_is_indic:
        return en2indic_model, 'en', indic_language_dict[target_language]
    if source_is_indic and target_is_indic:
        return (
            m2m_model,
            indic_language_dict[source_language],
            indic_language_dict[target_language],
        )

    raise ValueError(
        "Unsupported language pair: {!r} -> {!r}".format(source_language, target_language)
    )
61
+
62
@app.route('/', methods=['GET'])
def main():
    """Root route: respond with the fixed service banner string."""
    banner = "IndicTrans API"
    return banner
65
+
66
@app.route('/supported_languages', methods=['GET'])
@cross_origin()
def supported_languages():
    """Return the language-name -> language-code table as a JSON response."""
    response = jsonify(indic_language_dict)
    return response
70
+
71
@app.route("/translate", methods=['POST'])
@cross_origin()
def infer_indic_en():
    """Translate the posted ``text`` form field as one paragraph.

    The model and language codes come from ``get_inference_params()``.
    The response carries the translated text and the wall-clock time spent
    inside ``translate_paragraph``, rounded to two decimals.
    """
    model, source_lang, target_lang = get_inference_params()
    source_text = request.form['text']

    began = time.time()
    target_text = model.translate_paragraph(source_text, source_lang, target_lang)
    finished = time.time()

    elapsed = round(finished - began, 2)
    return {'text': target_text, 'duration': elapsed}
81
+
82
@app.route("/translate_vtt", methods=['POST'])
@cross_origin()
def infer_vtt_indic_en():
    """Translate a WebVTT subtitle document while keeping its cue timings.

    Pipeline:
      1. Parse the posted ``text`` form field as WebVTT.
      2. Join all cue texts, lowercase them, strip punctuation and let the
         punctuation-restoration model re-punctuate the paragraph.
      3. Split the punctuated paragraph into sentences and batch-translate.
      4. Redistribute the translated words over the original cues in
         proportion to each cue's share of the source words.

    Returns:
        ``{'text': <translated WebVTT content>}``.
    """
    start_time = time.time()
    model, source_lang, target_lang = get_inference_params()
    source_text = request.form['text']

    vad = webvtt.read_buffer(StringIO(source_text))
    source_sentences = [v.text.replace('\r', '').replace('\n', ' ') for v in vad]

    large_sentence = ' '.join(source_sentences).lower()
    large_sentence = re.sub(r'[^\w\s]', '', large_sentence)

    # Nothing to punctuate or translate: hand the document back unchanged
    # instead of failing inside the punctuator on empty input.
    if not large_sentence.strip():
        return {'text': vad.content}

    punctuated = rpunct.punctuate(large_sentence, batch_size=32)
    end_time = time.time()
    print("Time Taken for punctuation: {} s".format(end_time - start_time))
    start_time = time.time()

    split_sents = splitter([punctuated])
    output_sents = model.batch_translate(split_sents, source_lang, target_lang)

    # Join the sentence lists directly. Routing them through a dict keyed by
    # the source sentence would silently merge repeated sentences.
    len_punct = len(' '.join(split_sents).split())
    nmt_words = ' '.join(output_sents).split()
    len_nmt = len(nmt_words)

    non_empty = [i for i in range(len(vad)) if vad[i].text != '']
    start = 0
    for position, i in enumerate(non_empty):
        if position == len(non_empty) - 1 or len_punct == 0:
            # The last cue takes whatever is left so that words lost to
            # flooring are not dropped from the translation.
            piece = nmt_words[start:]
            start = len_nmt
        else:
            # split() (not split(' ')) so multi-line cues are counted the
            # same way as the newline-flattened paragraph.
            len_caption = len(vad[i].text.split())
            req_nmt_size = floor((len_caption / len_punct) * len_nmt)
            piece = nmt_words[start:start + req_nmt_size]
            start += req_nmt_size

        vad[i].text = ' '.join(piece)

    end_time = time.time()

    print("Time Taken for translation: {} s".format(end_time - start_time))

    return {
        'text': vad.content,
    }
api/punctuate.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # 💾⚙️🔮
3
+
4
+ # taken from https://github.com/Felflare/rpunct/blob/master/rpunct/punctuate.py
5
+ # modified to support batching during gpu inference
6
+
7
+
8
+ __author__ = "Daulet N."
9
+ __email__ = "[email protected]"
10
+
11
+ import time
12
+ import logging
13
+ import webvtt
14
+ import torch
15
+ from io import StringIO
16
+ from nltk.tokenize import sent_tokenize
17
+ #from langdetect import detect
18
+ from simpletransformers.ner import NERModel
19
+
20
+
21
+ class RestorePuncts:
22
+ def __init__(self, wrds_per_pred=250):
23
+ self.wrds_per_pred = wrds_per_pred
24
+ self.overlap_wrds = 30
25
+ self.valid_labels = ['OU', 'OO', '.O', '!O', ',O', '.U', '!U', ',U', ':O', ';O', ':U', "'O", '-O', '?O', '?U']
26
+ self.model = NERModel("bert", "felflare/bert-restore-punctuation", labels=self.valid_labels,
27
+ args={"silent": True, "max_seq_length": 512})
28
+ # use_cuda isnt working and this hack seems to load the model correctly to the gpu
29
+ self.model.device = torch.device("cuda:1")
30
+ # dummy punctuate to load the model onto gpu
31
+ self.punctuate("hello how are you")
32
+
33
+ def punctuate(self, text: str, batch_size:int=32, lang:str=''):
34
+ """
35
+ Performs punctuation restoration on arbitrarily large text.
36
+ Detects if input is not English, if non-English was detected terminates predictions.
37
+ Overrride by supplying `lang='en'`
38
+
39
+ Args:
40
+ - text (str): Text to punctuate, can be few words to as large as you want.
41
+ - lang (str): Explicit language of input text.
42
+ """
43
+ #if not lang and len(text) > 10:
44
+ # lang = detect(text)
45
+ #if lang != 'en':
46
+ # raise Exception(F"""Non English text detected. Restore Punctuation works only for English.
47
+ # If you are certain the input is English, pass argument lang='en' to this function.
48
+ # Punctuate received: {text}""")
49
+
50
+ def chunks(L, n):
51
+ return [L[x : x + n] for x in range(0, len(L), n)]
52
+
53
+
54
+
55
+ # plit up large text into bert digestable chunks
56
+ splits = self.split_on_toks(text, self.wrds_per_pred, self.overlap_wrds)
57
+
58
+ texts = [i["text"] for i in splits]
59
+ batches = chunks(texts, batch_size)
60
+ preds_lst = []
61
+
62
+
63
+ for batch in batches:
64
+ batch_preds, _ = self.model.predict(batch)
65
+ preds_lst.extend(batch_preds)
66
+
67
+
68
+ # predict slices
69
+ # full_preds_lst contains tuple of labels and logits
70
+ #full_preds_lst = [self.predict(i['text']) for i in splits]
71
+ # extract predictions, and discard logits
72
+ #preds_lst = [i[0][0] for i in full_preds_lst]
73
+ # join text slices
74
+ combined_preds = self.combine_results(text, preds_lst)
75
+ # create punctuated prediction
76
+ punct_text = self.punctuate_texts(combined_preds)
77
+ return punct_text
78
+
79
+ def predict(self, input_slice):
80
+ """
81
+ Passes the unpunctuated text to the model for punctuation.
82
+ """
83
+ predictions, raw_outputs = self.model.predict([input_slice])
84
+ return predictions, raw_outputs
85
+
86
+ @staticmethod
87
+ def split_on_toks(text, length, overlap):
88
+ """
89
+ Splits text into predefined slices of overlapping text with indexes (offsets)
90
+ that tie-back to original text.
91
+ This is done to bypass 512 token limit on transformer models by sequentially
92
+ feeding chunks of < 512 toks.
93
+ Example output:
94
+ [{...}, {"text": "...", 'start_idx': 31354, 'end_idx': 32648}, {...}]
95
+ """
96
+ wrds = text.replace('\n', ' ').split(" ")
97
+ resp = []
98
+ lst_chunk_idx = 0
99
+ i = 0
100
+
101
+ while True:
102
+ # words in the chunk and the overlapping portion
103
+ wrds_len = wrds[(length * i):(length * (i + 1))]
104
+ wrds_ovlp = wrds[(length * (i + 1)):((length * (i + 1)) + overlap)]
105
+ wrds_split = wrds_len + wrds_ovlp
106
+
107
+ # Break loop if no more words
108
+ if not wrds_split:
109
+ break
110
+
111
+ wrds_str = " ".join(wrds_split)
112
+ nxt_chunk_start_idx = len(" ".join(wrds_len))
113
+ lst_char_idx = len(" ".join(wrds_split))
114
+
115
+ resp_obj = {
116
+ "text": wrds_str,
117
+ "start_idx": lst_chunk_idx,
118
+ "end_idx": lst_char_idx + lst_chunk_idx,
119
+ }
120
+
121
+ resp.append(resp_obj)
122
+ lst_chunk_idx += nxt_chunk_start_idx + 1
123
+ i += 1
124
+ logging.info(f"Sliced transcript into {len(resp)} slices.")
125
+ return resp
126
+
127
+ @staticmethod
128
+ def combine_results(full_text: str, text_slices):
129
+ """
130
+ Given a full text and predictions of each slice combines predictions into a single text again.
131
+ Performs validataion wether text was combined correctly
132
+ """
133
+ split_full_text = full_text.replace('\n', ' ').split(" ")
134
+ split_full_text = [i for i in split_full_text if i]
135
+ split_full_text_len = len(split_full_text)
136
+ output_text = []
137
+ index = 0
138
+
139
+ if len(text_slices[-1]) <= 3 and len(text_slices) > 1:
140
+ text_slices = text_slices[:-1]
141
+
142
+ for _slice in text_slices:
143
+ slice_wrds = len(_slice)
144
+ for ix, wrd in enumerate(_slice):
145
+ # print(index, "|", str(list(wrd.keys())[0]), "|", split_full_text[index])
146
+ if index == split_full_text_len:
147
+ break
148
+
149
+ if split_full_text[index] == str(list(wrd.keys())[0]) and \
150
+ ix <= slice_wrds - 3 and text_slices[-1] != _slice:
151
+ index += 1
152
+ pred_item_tuple = list(wrd.items())[0]
153
+ output_text.append(pred_item_tuple)
154
+ elif split_full_text[index] == str(list(wrd.keys())[0]) and text_slices[-1] == _slice:
155
+ index += 1
156
+ pred_item_tuple = list(wrd.items())[0]
157
+ output_text.append(pred_item_tuple)
158
+ assert [i[0] for i in output_text] == split_full_text
159
+ return output_text
160
+
161
+ @staticmethod
162
+ def punctuate_texts(full_pred: list):
163
+ """
164
+ Given a list of Predictions from the model, applies the predictions to text,
165
+ thus punctuating it.
166
+ """
167
+ punct_resp = ""
168
+ for i in full_pred:
169
+ word, label = i
170
+ if label[-1] == "U":
171
+ punct_wrd = word.capitalize()
172
+ else:
173
+ punct_wrd = word
174
+
175
+ if label[0] != "O":
176
+ punct_wrd += label[0]
177
+
178
+ punct_resp += punct_wrd + " "
179
+ punct_resp = punct_resp.strip()
180
+ # Append trailing period if doesnt exist.
181
+ if punct_resp[-1].isalnum():
182
+ punct_resp += "."
183
+ return punct_resp
184
+
185
+
186
+ if __name__ == "__main__":
187
+
188
+ start = time.time()
189
+ punct_model = RestorePuncts()
190
+
191
+ load_model = time.time()
192
+ print(f'Time to load model: {load_model - start}')
193
+ # read test file
194
+ # with open('en_lower.txt', 'r') as fp:
195
+ # # test_sample = fp.read()
196
+ # lines = fp.readlines()
197
+
198
+ with open('sample.vtt', 'r') as fp:
199
+ source_text = fp.read()
200
+
201
+ # captions = webvtt.read_buffer(StringIO(source_text))
202
+ captions = webvtt.read('sample.vtt')
203
+ source_sentences = [caption.text.replace('\r', '').replace('\n', ' ') for caption in captions]
204
+
205
+ # print(source_sentences)
206
+
207
+ sent = ' '.join(source_sentences)
208
+ punctuated = punct_model.punctuate(sent)
209
+
210
+ tokenised = sent_tokenize(punctuated)
211
+ # print(tokenised)
212
+
213
+ for i in range(len(tokenised)):
214
+ captions[i].text = tokenised[i]
215
+ # return captions.content
216
+ captions.save('my_captions.vtt')
217
+
218
+ end = time.time()
219
+ print(f'Time for run: {end - load_model}')
220
+ print(f'Total time: {end - start}')
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from inference.engine import Model
4
+
5
+ e2i_model_download = "wget --load-cookies /tmp/cookies.txt \"https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1IpcnaQ2ScX_zodt2aLlXa_5Kkntl0nue' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\\n/p')&id=1IpcnaQ2ScX_zodt2aLlXa_5Kkntl0nue\" -O en-indic.zip && rm -rf /tmp/cookies.txt"
6
+ os.system(e2i_model_download)
7
+ os.system('unzip /home/user/app/en-indic.zip')
8
+
9
+ i2e_model_download = "wget --load-cookies /tmp/cookies.txt \"https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1-hzy09qi-OEogyge7rQG79K7iV4xsNWa' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\\n/p')&id=1-hzy09qi-OEogyge7rQG79K7iV4xsNWa\" -O indic-en.zip && rm -rf /tmp/cookies.txt"
10
+ os.system(i2e_model_download)
11
+ os.system('unzip /home/user/app/indic-en.zip')
12
+
13
+ i2i_model_download = "wget --show-progress -O m2m.tar https://ai4b-my.sharepoint.com/:u:/g/personal/sumanthdoddapaneni_ai4bharat_org/Eajn_jJIp5NEqeyqZ0GW4FgBdiANlZNQiy7dlwkaNr8DHw?download=1"
14
+ os.system(i2i_model_download)
15
+ os.system("tar -xvf /home/user/app/m2m.tar")
16
+
17
+ en2indic_model = Model(expdir='/home/user/app/en-indic')
18
+ indic2en_model = Model(expdir='/home/user/app/indic-en')
19
+ indic2indic_model = Model(expdir='/home/user/app/m2m')
20
+
21
+ LANGUAGES = {"Assamese": "as", "Bengali": "bn", "Gujarati": "gu", "Hindi": "hi", "Kannada": "kn",
22
+ "Malayalam": "ml", "Marathi": "mr", "Odia": "or", "Punjabi": "pa", "Tamil": "ta", "Telugu": "te", "English": "en"}
23
+
24
+
25
+ def translate(text, fromLang, toLang):
26
+ if (fromLang != "English" and toLang == "English"):
27
+ return indic2en_model.translate_paragraph(text, LANGUAGES[fromLang], LANGUAGES[toLang])
28
+ elif (fromLang == "English" and toLang != "English"):
29
+ return en2indic_model.translate_paragraph(text, LANGUAGES[fromLang], LANGUAGES[toLang])
30
+ elif (fromLang != "English" and toLang != "English"):
31
+ return indic2indic_model.translate_paragraph(text, LANGUAGES[fromLang], LANGUAGES[toLang])
32
+ else:
33
+ return text
34
+
35
+
36
+ languages = list(LANGUAGES.keys())
37
+
38
+ fromChoice = gr.inputs.Dropdown(
39
+ languages, type="value", default="Hindi", label="Select Source Language")
40
+
41
+ toChoice = gr.inputs.Dropdown(
42
+ languages, type="value", default="Tamil", label="Select Target Language")
43
+
44
+ text_output = gr.outputs.Textbox(
45
+ type="auto", label=f"Translation")
46
+
47
+ text = gr.inputs.Textbox(lines=5, placeholder="Enter Text to translate",
48
+ default="", label="Enter Text in Source Language")
49
+
50
+ supported_lang = ', '.join(languages)
51
+
52
+ interface_description = f"""
53
+ <html>
54
+ <body>
55
+ <h1>
56
+ Usage:
57
+ </h1>
58
+ <ul>
59
+ <li>Choose the Source Language and Target Language for translation.</li>
60
+ <li>Enter your text in source language in the textbox.</li>
61
+ <li>Click Submit and view your translated output.</li>
62
+ </ul>
63
+ <br/>
64
+ <span>Currently the model supports {supported_lang} </span>
65
+ </body>
66
+ </html>
67
+ """
68
+
69
+ interface_article = """
70
+ <html>
71
+ <body>
72
+ <div>
73
+ <h1>
74
+ About
75
+ </h1>
76
+ <h4>
77
+ Original repository can be found at <a href="https://github.com/AI4Bharat/indicTrans">here</a>.
78
+ </h4>
79
+ <br/>
80
+ <span>
81
+ The models used in this interface are multilingual single-script transformer based models for translating between English and Indian languages. The models are trained using the Samanantar corpus and at the time of their release was the state of the art open source model as evaluated on Facebook's FLORES benchmark.
82
+ </span>
83
+ <br/>
84
+ <h4>
85
+ These models are currently being used on AI Tools/Platforms such as:
86
+ </h4>
87
+ <ul>
88
+ <li><a href="https://ai4bharat.org/shoonya">Shoonya</a></li>
89
+ <li><a href="https://ai4bharat.org/chitralekha">Chitralekha</a> (deployed for NPTEL)</li>
90
+ <li><a href="https://ai4bharat.org/anuvaad">Anuvaad</a> (deployed for Supreme Court of India & Bangladesh)</li>
91
+ <li>Pratham Books</li>
92
+ </ul>
93
+ </div>
94
+ </body>
95
+ </html>
96
+ """
97
+
98
+ examples = [
99
+ ["A farmer lives in a village", "English", "Hindi"],
100
+ ["एक गाव मे एक किसान रहता ता", "Hindi", "English"],
101
+ ["एक गाव मे एक किसान रहता ता", "Hindi", "Tamil"]
102
+ ]
103
+
104
+
105
+ iface = gr.Interface(fn=translate, inputs=[text, fromChoice, toChoice], outputs=text_output,
106
+ title='IndicTrans - Multilingual Translation', description=interface_description, article=interface_article, examples=examples)
107
+ iface.launch(enable_queue=True)
indic_nlp_library/LICENSE ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2013-present Anoop Kunchukuttan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
indic_nlp_library/README.md ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Indic NLP Library
2
+
3
+ The goal of the Indic NLP Library is to build Python based libraries for common text processing and Natural Language Processing in Indian languages. Indian languages share a lot of similarity in terms of script, phonology, language syntax, etc. and this library is an attempt to provide a general solution to very commonly required toolsets for Indian language text.
4
+
5
+ The library provides the following functionalities:
6
+
7
+ - Text Normalization
8
+ - Script Information
9
+ - Word Tokenization and Detokenization
10
+ - Sentence Splitting
11
+ - Word Segmentation
12
+ - Syllabification
13
+ - Script Conversion
14
+ - Romanization
15
+ - Indicization
16
+ - Transliteration
17
+ - Translation
18
+
19
+ The data resources required by the Indic NLP Library are hosted in a different repository. These resources are required for some modules. You can download from the [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) project.
20
+
21
+ **If you are interested in Indian language NLP resources, you should check the [Indic NLP Catalog](https://github.com/indicnlpweb/indicnlp_catalog) for pointers.**
22
+
23
+ ## Pre-requisites
24
+
25
+ - Python 3.x
26
+ - (For Python 2.x version check the tag `PYTHON_2.7_FINAL_JAN_2019`. Not actively supporting Python 2.x anymore, but will try to maintain as much compatibility as possible)
27
+ - [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources)
28
+ - [Urduhack](https://github.com/urduhack/urduhack): Needed only if Urdu normalization is required. It has other dependencies like Tensorflow.
29
+ - Other dependencies are listed in setup.py
30
+
31
+
32
+ ## Configuration
33
+
34
+ - Installation from pip:
35
+
36
+ `pip install indic-nlp-library`
37
+
38
+ - If you want to use the project from the github repo, add the project to the Python Path:
39
+
40
+ - Clone this repository
41
+ - Install dependencies: `pip install -r requirements.txt`
42
+ - Run: `export PYTHONPATH=$PYTHONPATH:<project base directory>`
43
+
44
+ - In either case, export the path to the _Indic NLP Resources_ directory
45
+
46
+ Run: `export INDIC_RESOURCES_PATH=<path to Indic NLP resources>`
47
+
48
+ ## Usage
49
+
50
+ You can use the Python API to access all the features of the library. Many of the most common operations are also accessible via a unified commandline API.
51
+
52
+ ### Getting Started
53
+
54
+ Check [this IPython Notebook](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples.ipynb) for examples to use the Python API.
55
+ - You can find the Python 2.x Notebook [here](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples_2_7.ipynb)
56
+
57
+ ### Documentation
58
+
59
+ You can find detailed documentation [HERE](https://indic-nlp-library.readthedocs.io/en/latest)
60
+
61
+ This documents the Python API as well as the commandline reference.
62
+
63
+ ## Citing
64
+
65
+ If you use this library, please include the following citation:
66
+
67
+ ```
68
+ @misc{kunchukuttan2020indicnlp,
69
+ author = "Anoop Kunchukuttan",
70
+ title = "{The IndicNLP Library}",
71
+ year = "2020",
72
+ howpublished={\url{https://github.com/anoopkunchukuttan/indic_nlp_library/blob/master/docs/indicnlp.pdf}}
73
+ }
74
+ ```
75
+ You can find the document [HERE](docs/indicnlp.pdf)
76
+
77
+ ## Website
78
+
79
+ `http://anoopkunchukuttan.github.io/indic_nlp_library`
80
+
81
+ ## Author
82
+ Anoop Kunchukuttan ([[email protected]]([email protected]))
83
+
84
+ ## Companies, Organizations, Projects using IndicNLP Library
85
+
86
+ - [AI4Bharat-IndicNLPSuite](https://indicnlp.ai4bharat.org)
87
+ - [The Classical Language Toolkit](http://cltk.org)
88
+ - [Microsoft NLP Recipes](https://github.com/microsoft/nlp-recipes)
89
+ - [Facebook M2M-100](https://github.com/pytorch/fairseq/tree/master/examples/m2m_100)
90
+
91
+ ## Revision Log
92
+
93
+
94
+ 0.81 : 26 May 2021
95
+
96
+ - Bug fix in version number extraction
97
+
98
+ 0.80 : 24 May 2021
99
+
100
+ - Improved sentence splitting
101
+ - Bug fixes
102
+ - Support for Urdu Normalizer
103
+
104
+ 0.71 : 03 Sep 2020
105
+
106
+ - Improved documentation
107
+ - Bug fixes
108
+
109
+ 0.7 : 02 Apr 2020:
110
+
111
+ - Unified commandline
112
+ - Improved documentation
113
+ - Added setup.py
114
+
115
+ 0.6 : 16 Dec 2019:
116
+
117
+ - New romanizer and indicizer
118
+ - Script Unifiers
119
+ - Improved script normalizers
120
+ - Added contrib directory for sample uses
121
+ - changed to MIT license
122
+
123
+ 0.5 : 03 Jun 2019:
124
+
125
+ - Improved word tokenizer to handle dates and numbers.
126
+ - Added sentence splitter that can handle common prefixes/honorifics and uses some heuristics.
127
+ - Added detokenizer
128
+ - Added acronym transliterator that can convert English acronyms to Brahmi-derived scripts
129
+
130
+ 0.4 : 28 Jan 2019: Ported to Python 3, and lots of feature additions since last release; primarily around script information, script similarity and syllabification.
131
+
132
+ 0.3 : 21 Oct 2014: Supports morph-analysis between Indian languages
133
+
134
+ 0.2 : 13 Jun 2014: Supports transliteration between Indian languages and tokenization of Indian languages
135
+
136
+ 0.1 : 12 Mar 2014: Initial version. Supports text normalization.
137
+
138
+ ## LICENSE
139
+
140
+ Indic NLP Library is released under the MIT license
141
+
142
+
indic_nlp_library/contrib/README.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Contrib
2
+
3
+ Contains additional utilities and applications using Indic NLP library core
4
+
5
+ - `indic_scraper_project_sample.ipynb`: A simple pipeline for building monolingual corpora for Indian languages from crawled web content, Wikipedia, etc. An extensible framework which allows incorporation of website specific extractors, whereas generic NLP tasks like tokenization, sentence splitting, normalization, etc. are handled by the framework.
6
+ - `correct_moses_tokenizer.py`: This script corrects the incorrect tokenization done by Moses tokenizer. The Moses tokenizer splits on nukta and halant characters.
7
+ - `hindi_to_kannada_transliterator.py`: This script transliterates Hindi to Kannada. It removes/remaps characters only found in Hindi. It also adds halanta to words ending with consonant - as is the convention in Kannada.
indic_nlp_library/contrib/correct_moses_tokenizer.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from indicnlp import langinfo
3
+ from indicnlp import loader
4
+
5
if __name__ == '__main__':
    # Repairs text where the Moses tokenizer has split on the nukta and
    # halant characters, by gluing those characters back onto their
    # neighbours.
    # Usage: python correct_moses_tokenizer.py <infname> <outfname> <langcode>

    loader.load()

    infname = sys.argv[1]
    outfname = sys.argv[2]
    lang = sys.argv[3]

    halant_char = langinfo.offset_to_char(langinfo.HALANTA_OFFSET, lang)
    nukta_char = langinfo.offset_to_char(langinfo.NUKTA_OFFSET, lang)
    nukta_halant = '{}{}'.format(nukta_char, halant_char)

    # (pattern, replacement) pairs, applied to each line in this order.
    rewrites = (
        (' {} '.format(halant_char), halant_char),
        (' {} '.format(nukta_char), nukta_char),
        (' {}{}'.format(nukta_char, halant_char), nukta_halant),
    )

    with open(infname, 'r', encoding='utf-8') as infile, \
            open(outfname, 'w', encoding='utf-8') as outfile:
        for line in infile:
            fixed = line
            for broken, joined in rewrites:
                fixed = fixed.replace(broken, joined)
            outfile.write(fixed)
indic_nlp_library/contrib/hindi_to_kannada_transliterator.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Transliterate Hindi text to Kannada.

Removes/remaps characters only found in Hindi, and adds a halanta to
words ending with a consonant - as is the convention in Kannada.

Usage: python hindi_to_kannada_transliterator.py <infname> <outfname>

<infname> holds one sentence/word per line; sentences should be
space-tokenized.
"""
import os
import sys

from indicnlp import common
from indicnlp import loader
from indicnlp.normalize import indic_normalize
from indicnlp.script import indic_scripts as isc
from indicnlp.transliterate import unicode_transliterate

USAGE = 'Usage: python hindi_to_kannada_transliterator.py <infname> <outfname>'

HALANT = '\u094d'

# Single-pass character remapping applied after normalization.
# None of the targets is itself a source, so this is equivalent to
# applying the replacements one after another.
HINDI_ONLY_CHAR_MAP = str.maketrans({
    # replace chandrabindus with anusvara
    '\u0900': '\u0902',
    '\u0901': '\u0902',
    # replace chandra e and o diacritics with the aa diacritic; this seems
    # to be general usage (the alternative is e '\u0947' and o '\u094b')
    '\u0945': '\u093e',
    '\u0949': '\u093e',
    # remove nukta
    '\u093c': None,
})


def add_final_halant(line):
    """Append a halant to every space-separated word ending in a consonant."""
    outwords = []
    for word in line.split(' '):
        # Empty tokens (blank line, repeated spaces) have no last character;
        # pass them through unchanged instead of raising IndexError.
        if word and isc.is_consonant(isc.get_phonetic_feature_vector(word[-1], 'hi')):
            word = word + HALANT
        outwords.append(word)
    return ' '.join(outwords)


def main(argv):
    """Transliterate the Hindi file argv[1] into the Kannada file argv[2].

    Returns None on success, or 1 after printing the usage message when
    too few arguments are supplied.
    """
    if len(argv) < 3:
        print(USAGE, file=sys.stderr)
        return 1

    infname = argv[1]
    outfname = argv[2]

    # NOTE(review): the original passed an undefined name
    # (INDIC_NLP_RESOURCES) to common.set_resources_path. The path is now
    # taken from the INDIC_RESOURCES_PATH environment variable when it is
    # set - confirm this matches how the resources are deployed.
    resources_path = os.environ.get('INDIC_RESOURCES_PATH')
    if resources_path:
        common.set_resources_path(resources_path)

    loader.load()

    normalizer_factory = indic_normalize.IndicNormalizerFactory()
    normalizer = normalizer_factory.get_normalizer('hi')

    with open(infname, 'r', encoding='utf-8') as infile, \
            open(outfname, 'w', encoding='utf-8') as outfile:
        for line in infile:
            line = normalizer.normalize(line.strip())
            line = line.translate(HINDI_ONLY_CHAR_MAP)

            ## add halant if word ends with consonant
            line = add_final_halant(line)

            ## script conversion
            line = unicode_transliterate.UnicodeIndicTransliterator.transliterate(line, 'hi', 'kn')

            outfile.write(line + '\n')


if __name__ == '__main__':
    sys.exit(main(sys.argv))
indic_nlp_library/contrib/indic_scraper_project_sample.ipynb ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Pre-requisites\n",
8
+ "\n",
9
+ "- Python 3.5+\n",
10
+ "- Python packages: \n",
11
+ " - `pip install bs4 pandas mmh3`\n",
12
+ "- [Indic NLP Library](https://github.com/anoopkunchukuttan/indic_nlp_library)\n",
13
+ "- [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources)"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "markdown",
18
+ "metadata": {},
19
+ "source": [
20
+ "# Initialize the Indic NLP Library\n",
21
+ "\n",
22
+ "Run the cell below to initialize the Indic NLP Library"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "# The path to the local git repo for Indic NLP Library\n",
32
+ "INDIC_NLP_LIB_HOME=\"/disk1/src/indic_nlp_library\"\n",
33
+ "\n",
34
+ "# The path to the local git repo for Indic NLP Resources\n",
35
+ "INDIC_NLP_RESOURCES=\"/disk1/src/indic_nlp_resources\"\n",
36
+ "\n",
37
+ "import sys\n",
38
+ "sys.path.append('{}/src'.format(INDIC_NLP_LIB_HOME))\n",
39
+ "\n",
40
+ "from indicnlp import common\n",
41
+ "common.set_resources_path(INDIC_NLP_RESOURCES)\n",
42
+ "\n",
43
+ "from indicnlp import loader\n",
44
+ "loader.load()"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "from bs4 import BeautifulSoup\n",
54
+ "import os\n",
55
+ "import string\n",
56
+ "import indicnlp\n",
57
+ "from indicnlp.tokenize import indic_tokenize\n",
58
+ "from indicnlp.normalize import indic_normalize\n",
59
+ "from indicnlp.transliterate import unicode_transliterate\n",
60
+ "from indicnlp.tokenize import sentence_tokenize\n",
61
+ "import re\n",
62
+ "import collections\n",
63
+ "import random\n",
64
+ "import mmh3"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "markdown",
69
+ "metadata": {},
70
+ "source": [
71
+ "# Common Functions"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "def preprocess_sent(text,lang,normalizer):\n",
81
+ " \"\"\"\n",
82
+ " Pre-process text (normalization and tokenization)\n",
83
+ " \n",
84
+ " text: text string to preprocess\n",
85
+ " lang: language code (2-letter ISO code)\n",
86
+ " normalizer: normalizer object for language\n",
87
+ " \n",
88
+ " returns the processed text string\n",
89
+ " \"\"\"\n",
90
+ " return ' '.join(indic_tokenize.trivial_tokenize(normalizer.normalize(text.replace('\\n',' ')),lang)) \n",
91
+ "\n",
92
+ "def sent_split(text,lang):\n",
93
+ " \"\"\"\n",
94
+ " Sentence splitter\n",
95
+ " \n",
96
+ " text: text to sentence split \n",
97
+ " lang: language\n",
98
+ " \n",
99
+ " returns list of sentences \n",
100
+ " \"\"\"\n",
101
+ " return sentence_tokenize.sentence_split(text,lang)\n",
102
+ "\n",
103
+ "def extract_all_content(indir,lang,\n",
104
+ " article_extract_fn,\n",
105
+ " preprocess_fn=preprocess_sent,\n",
106
+ " narticles=-1,\n",
107
+ " start_artid=0):\n",
108
+ " \"\"\"\n",
109
+ " This method reads all files from the input directory, extracts text content from each file,\n",
110
+ " and pre-processes the text. This method is a generator. \n",
111
+ " For each sentence, the method yields a tuple of the format: \n",
112
+ " \n",
113
+ " (artid, fname, paraid, sentid, processed_text)\n",
114
+ " \n",
115
+ " indir: path to input directory containing files to be parsed \n",
116
+ " \n",
117
+ " lang: language of the files in the input directory\n",
118
+ " \n",
119
+ " article_extract_fn: the function to extract text content from each file. \n",
120
+ " Signature of the function: get_article_contents(fname,lang,encoding) \n",
121
+ " `fname` is name of the file, `lang` is langcode, \n",
122
+ " `encoding` is text-encoding (default=utf-8). \n",
123
+ " The function yields a tuple (paraid, sentid, extracted_text) \n",
124
+ " for each sentence.\n",
125
+ " \n",
126
+ " preprocess_fn: pre-processing function to apply to the extracted text. \n",
127
+ " The function takes a string as input and returns processed string as output.\n",
128
+ " \n",
129
+ " narticles: extract and process the first `narticles` from input directory. \n",
130
+ " if narticles=-1 (default), all files are extracted\n",
131
+ " \n",
132
+ " start_artid: the start of the article id to assign to extracted articles (default=0)\n",
133
+ " \n",
134
+ " \"\"\"\n",
135
+ "\n",
136
+ " fnames = os.listdir(indir)\n",
137
+ " if narticles>0:\n",
138
+ " fnames=fnames[:narticles]\n",
139
+ " nsent=0\n",
140
+ "\n",
141
+ " normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
142
+ " normalizer=normalizer_factory.get_normalizer(lang)\n",
143
+ " \n",
144
+ " print('Number of articles: {}'.format(len(fnames)))\n",
145
+ " for artid, fname in enumerate(fnames,start_artid):\n",
146
+ "# print(fname)\n",
147
+ " if artid%100 == 0:\n",
148
+ " print('({}|{})'.format(artid,nsent),end=' ... ')\n",
149
+ " \n",
150
+ " try:\n",
151
+ " fpath=os.sep.join([indir,fname])\n",
152
+ " for paraid, sentid, sent in article_extract_fn(fpath,lang):\n",
153
+ " nsent+=1\n",
154
+ " yield( ( artid, fname, paraid, sentid, preprocess_fn(sent,lang,normalizer) ) )\n",
155
+ " except:\n",
156
+ " print('Cannot parse {}'.format(fname))\n",
157
+ " \n",
158
+ "def write_corpus(corpus_iterator,content_fname,article_mapping_fname,delimiter=' ||| ', encoding='utf-8'):\n",
159
+ " \"\"\"\n",
160
+ " Writes the extracted corpus to a file. The extracted data is organized in terms of articles, paragraphs \n",
161
+ " and sentences. The following is the format of the output file: \n",
162
+ " - one line per sentence\n",
163
+ " - format of line: article_id, para_id, sent_id, sentence\n",
164
+ " In addition to the content file mentioned above, a metadata file which maps the article id to the filename is also written. \n",
165
+ " \n",
166
+ " corpus_iterator: iterator over the corpus, yielding tuple (artid, fname, paraid, sentid, processed_text). \n",
167
+ " The function `extract_all_content` yields a generator in this format. \n",
168
+ " content_fname: output content file to write the extracted data to in the format mentioned above\n",
169
+ " article_mapping_fname: output metadata file to write article id to filename mapping.\n",
170
+ " delimiter=' ||| ': delimiter for the content file. The default delimiter is the same \n",
171
+ " as used in the Moses phrase table\n",
172
+ " encoding: text encoding default - 'utf-8'\n",
173
+ " \n",
174
+ " \"\"\"\n",
175
+ " \n",
176
+ " artid_name_mapping={}\n",
177
+ " with open(content_fname,'w',encoding=encoding) as contentfile:\n",
178
+ " for artid, fname, paraid, sentid, text in corpus_iterator:\n",
179
+ " contentfile.write(delimiter.join([str(artid), str(paraid), str(sentid), text]) + '\\n')\n",
180
+ " artid_name_mapping[artid]=fname\n",
181
+ "\n",
182
+ " with open(article_mapping_fname,'w',encoding=encoding) as artmappingfile:\n",
183
+ " for artid, name in sorted(artid_name_mapping.items(),key=lambda x: x[0]):\n",
184
+ " artmappingfile.write('{} {} {}\\n'.format(artid,delimiter,name))\n",
185
+ "\n",
186
+ "def convert_txt_to_csv_format(infname, outfname, encoding='utf-8'):\n",
187
+ " \"\"\"\n",
188
+ " convert txt file to csv format. This method is used when the text file is directly available.\n",
189
+ " The input file has one sentence per line. Assumed to be preprocessed (tokenized, normalized)\n",
190
+ " \n",
191
+ " \"\"\"\n",
192
+ " with open(infname,'r',encoding=encoding) as infile, \\\n",
193
+ " open(outfname,'w',encoding=encoding) as outfile: \n",
194
+ " for i, line in enumerate(infile):\n",
195
+ " outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,line.strip()))\n",
196
+ " \n",
197
+ "def preprocess_convert_txt_to_csv_format(infname, outfname, lang, encoding='utf-8'):\n",
198
+ " \"\"\"\n",
199
+ " Convert raw text file to csv format\n",
200
+ " \"\"\"\n",
201
+ " \n",
202
+ " normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
203
+ " normalizer=normalizer_factory.get_normalizer(lang)\n",
204
+ " \n",
205
+ " with open(infname,'r',encoding=encoding) as infile, \\\n",
206
+ " open(outfname,'w',encoding=encoding) as outfile: \n",
207
+ " i=0\n",
208
+ " for line in infile:\n",
209
+ " sents = sent_split(line.strip(),lang)\n",
210
+ " for sent in sents:\n",
211
+ " outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,\n",
212
+ " preprocess_sent(sent.strip(), lang, normalizer)) )\n",
213
+ " i=i+1\n",
214
+ "\n",
215
+ "def print_txt(infnames, outfname, encoding='utf-8'):\n",
216
+ " \"\"\"\n",
217
+ " Extract only the text from the content csv file. The output file has one sentence per line.\n",
218
+ " \"\"\"\n",
219
+ " with open(outfname,'w',encoding=encoding) as outfile: \n",
220
+ " for infname in filter(lambda x: os.path.isfile(x),infnames):\n",
221
+ " with open(infname,'r',encoding=encoding) as infile:\n",
222
+ " for i, line in enumerate(infile):\n",
223
+ " fields=line.strip().split('|||')\n",
224
+ " if len(fields) >=4:\n",
225
+ " outfile.write('{}\\n'.format(fields[3].strip()))\n",
226
+ " \n",
227
+ "# def dedup_and_print_txt(infnames, outfname, encoding='utf-8'):\n",
228
+ " \n",
229
+ "# total=0\n",
230
+ "# unique=0\n",
231
+ "# hash_codes=set()\n",
232
+ " \n",
233
+ "# with open(outfname,'w',encoding=encoding) as outfile: \n",
234
+ "# for infname in filter(lambda x: os.path.isfile(x),infnames):\n",
235
+ "# with open(infname,'r',encoding=encoding) as infile:\n",
236
+ "# for i, line in enumerate(infile):\n",
237
+ "# fields=line.strip().split('|||')\n",
238
+ "# if len(fields) >=4:\n",
239
+ "# sent=fields[3].strip()\n",
240
+ "# total+=1\n",
241
+ "# hs=hash(sent)\n",
242
+ "# if hs not in hash_codes:\n",
243
+ "# outfile.write('{}\\n'.format(sent))\n",
244
+ "# hash_codes.add(hs)\n",
245
+ "# unique+=1\n",
246
+ " \n",
247
+ "# print('Total: {}'.format(total))\n",
248
+ "# print('Unique: {}'.format(unique))\n",
249
+ "\n",
250
+ "def dedup_shuffle_and_print_txt(infnames, outfname, max_buf_size=100000,encoding='utf-8'):\n",
251
+ " \"\"\"\n",
252
+ " The method creates a sentence level corpora from multiple content csv files.\n",
253
+ " All sentences are extracted, they are de-duplicated using murmurhash and shuffled\n",
254
+ " before writing the entire corpus to the output file. The output file has one sentence per line.\n",
255
+ "\n",
256
+ " \"\"\"\n",
257
+ " \n",
258
+ " total=0\n",
259
+ " unique=0\n",
260
+ " hash_codes=set()\n",
261
+ " sent_buffer=[]\n",
262
+ " \n",
263
+ " with open(outfname,'w',encoding=encoding) as outfile: \n",
264
+ " for infname in filter(lambda x: os.path.isfile(x),infnames):\n",
265
+ " print('Processing: {}'.format(infname))\n",
266
+ " with open(infname,'r',encoding=encoding) as infile:\n",
267
+ " for i, line in enumerate(infile):\n",
268
+ " fields=line.strip().split('|||')\n",
269
+ " if len(fields) >=4:\n",
270
+ " sent=fields[3].strip()\n",
271
+ " total+=1\n",
272
+ "# hs=hash(sent)\n",
273
+ " hs=mmh3.hash128(sent)\n",
274
+ " if hs not in hash_codes:\n",
275
+ "# outfile.write('{}\\n'.format(sent))\n",
276
+ " sent_buffer.append(sent)\n",
277
+ " hash_codes.add(hs)\n",
278
+ " unique+=1\n",
279
+ " if len(sent_buffer)>=max_buf_size:\n",
280
+ " random.shuffle(sent_buffer)\n",
281
+ " for sent in sent_buffer: \n",
282
+ " outfile.write('{}\\n'.format(sent))\n",
283
+ " sent_buffer.clear()\n",
284
+ " \n",
285
+ " if len(sent_buffer)>0:\n",
286
+ " random.shuffle(sent_buffer)\n",
287
+ " for sent in sent_buffer: \n",
288
+ " outfile.write('{}\\n'.format(sent))\n",
289
+ " sent_buffer.clear() \n",
290
+ " \n",
291
+ " print('Total: {}'.format(total))\n",
292
+ " print('Unique: {}'.format(unique))\n",
293
+ "\n",
294
+ "def extract_wikiextractor_file(infname, outfname, lang, \n",
295
+ " encoding='utf-8', delimiter=' ||| ', preprocess_fn=preprocess_sent):\n",
296
+ " \"\"\"\n",
297
+ " Extract text content into a content csv file from wikipedia article page. \n",
298
+ " The wikipedia article page is the output from `wikiextractor` [https://github.com/attardi/wikiextractor] \n",
299
+ " \n",
300
+ " \"\"\"\n",
301
+ " normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
302
+ " normalizer=normalizer_factory.get_normalizer(lang)\n",
303
+ " \n",
304
+ " with open(infname,'r',encoding=encoding) as infile, \\\n",
305
+ " open(outfname,'w',encoding=encoding) as outfile: \n",
306
+ " artid=-1\n",
307
+ " paraid=0\n",
308
+ " for line in infile:\n",
309
+ " if line.find('<doc')==0:\n",
310
+ " artid+=1\n",
311
+ " paraid=0\n",
312
+ " continue\n",
313
+ " if line.find('</doc')==0:\n",
314
+ " continue\n",
315
+ " if len(line.strip())>0:\n",
316
+ " for sentid, sent in enumerate(sent_split(line.strip(),lang)):\n",
317
+ " sent=sent.strip()\n",
318
+ " if sent!='':\n",
319
+ " sent = preprocess_fn(sent,lang,normalizer)\n",
320
+ " outfile.write(delimiter.join([str(artid), str(paraid), str(sentid), sent]) + '\\n')\n",
321
+ " paraid+=1\n",
322
+ "\n",
323
+ " \n",
324
+ "def extract_leipzig_corpus(infname,outfname,lang,encoding='utf-8'):\n",
325
+ " \"\"\"\n",
326
+ " Extractor for files from the Leipzig corpus\n",
327
+ " [http://wortschatz.uni-leipzig.de/en/download/]\n",
328
+ " \n",
329
+ " \"\"\"\n",
330
+ " normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
331
+ " normalizer=normalizer_factory.get_normalizer(lang) \n",
332
+ "\n",
333
+ " with open(infname,'r',encoding=encoding) as infile, \\\n",
334
+ " open(outfname,'w',encoding=encoding) as outfile: \n",
335
+ " for i, line in enumerate(infile):\n",
336
+ " outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,preprocess_sent(line,lang,normalizer))) \n",
337
+ " \n",
338
+ "def dataset_stats(fname):\n",
339
+ " \"\"\"\n",
340
+ " Extracts dataset statistics from the final extracted file. This input file contains\n",
341
+ " one sentence per line. The sentences are tokenized.\n",
342
+ " \"\"\"\n",
343
+ "\n",
344
+ " all_puncs=set(string.punctuation+'\\u0964\\u0965')\n",
345
+ " \n",
346
+ " sent_count=0\n",
347
+ " token_cnt=0\n",
348
+ " true_token_cnt=0\n",
349
+ " tokens=set()\n",
350
+ " \n",
351
+ " with open(fname,'r',encoding='utf-8') as infile:\n",
352
+ " for line in infile:\n",
353
+ " sent_count+=1\n",
354
+ " a=line.strip().split(' ')\n",
355
+ " token_cnt+=len(a)\n",
356
+ " b=list(filter(lambda x: x not in all_puncs,a))\n",
357
+ " true_token_cnt+=len(b)\n",
358
+ " tokens.update(b)\n",
359
+ " \n",
360
+ " print('== Stats ==')\n",
361
+ " print('Sent count: {}'.format(sent_count))\n",
362
+ " print('Token count: {}'.format(token_cnt))\n",
363
+ " print('True Token count: {}'.format(true_token_cnt))\n",
364
+ " print('Unique Token count: {}'.format(len(tokens)))\n"
365
+ ]
366
+ },
367
+ {
368
+ "cell_type": "markdown",
369
+ "metadata": {},
370
+ "source": [
371
+ "# Marathi"
372
+ ]
373
+ },
374
+ {
375
+ "cell_type": "markdown",
376
+ "metadata": {},
377
+ "source": [
378
+ "## Wikipedia"
379
+ ]
380
+ },
381
+ {
382
+ "cell_type": "markdown",
383
+ "metadata": {},
384
+ "source": [
385
+ "### Wikipedia extraction commands using wikiextractor\n",
386
+ "\n",
387
+ "```\n",
388
+ "### This uses WikiExtractor (https://github.com/attardi/wikiextractor)\n",
389
+ "\n",
390
+ "x=/disk1/crawl_project/ta/wikipedia\n",
391
+ "mkdir $x\n",
392
+ "cd $x\n",
393
+ "wget https://dumps.wikimedia.org/tawiki/20190501/tawiki-20190501-pages-articles-multistream.xml.bz2\n",
394
+ "cd /disk1/src/wikiextractor\n",
395
+ "python3 WikiExtractor.py -cb 250k -o $x/extracted $x/tawiki-20190501-pages-articles-multistream.xml.bz2\n",
396
+ "cd -\n",
397
+ "find extracted -name '*bz2' -exec bunzip2 -c {} \\; > text.xml\n",
398
+ "rm text.xml\n",
399
+ "rm tawiki-20190501-pages-articles-multistream.xml.bz2\n",
400
+ "rm -rf extracted\n",
401
+ "```"
402
+ ]
403
+ },
404
+ {
405
+ "cell_type": "markdown",
406
+ "metadata": {},
407
+ "source": [
408
+ "mrwiki-20190401-pages-articles-multistream.xml.bz2\n",
409
+ "\n",
410
+ "INFO: Finished 1-process extraction of 53715 articles in 123.6s (434.7 art/s)\n",
411
+ "\n",
412
+ "INFO: total of page: 102025, total of articl page: 53715; total of used articl page: 53715"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "markdown",
417
+ "metadata": {},
418
+ "source": [
419
+ "### Post-processing output generated by wikiextractor"
420
+ ]
421
+ },
422
+ {
423
+ "cell_type": "code",
424
+ "execution_count": null,
425
+ "metadata": {},
426
+ "outputs": [],
427
+ "source": [
428
+ "## text.xml is extracted as shown in commands above\n",
429
+ "extract_wikiextractor_file('text.xml',\n",
430
+ " 'content_fname1.csv',\n",
431
+ " 'mr')"
432
+ ]
433
+ },
434
+ {
435
+ "cell_type": "markdown",
436
+ "metadata": {},
437
+ "source": [
438
+ "## Loksatta"
439
+ ]
440
+ },
441
+ {
442
+ "cell_type": "markdown",
443
+ "metadata": {},
444
+ "source": [
445
+ "**Extractor function for Marathi Loksatta page**"
446
+ ]
447
+ },
448
+ {
449
+ "cell_type": "code",
450
+ "execution_count": null,
451
+ "metadata": {},
452
+ "outputs": [],
453
+ "source": [
454
+ "def get_article_contents_mr_loksatta(fname,lang,encoding='utf-8'):\n",
455
+ " with open(fname,'r',encoding=encoding) as infile: \n",
456
+ " soup = BeautifulSoup(infile)\n",
457
+ " for elem in soup.find_all('div'):\n",
458
+ " if 'itemprop' in elem.attrs and 'articleBody' in elem['itemprop']:\n",
459
+ " filtered_paras=list(filter(lambda x: x.name=='p' and len(x.attrs)==0,elem.children))\n",
460
+ " paraid=0\n",
461
+ " for blockid, block in enumerate(filtered_paras):\n",
462
+ "# print('Para: {}'.format(blockid))\n",
463
+ "# print(list(block.strings))\n",
464
+ " text=' '.join(block.strings)\n",
465
+ " if blockid==0 and text.find(':')>=0 and text.find(':')<20:\n",
466
+ " text=':'.join(text.split(':')[1:])\n",
467
+ " for para_text in text.split('\\n'): \n",
468
+ " for sentid, sent in enumerate(sent_split(para_text,lang)):\n",
469
+ " sent=sent.strip()\n",
470
+ " if sent!='':\n",
471
+ " # print('{}: {}'.format(sentid, sent))\n",
472
+ " yield((paraid,sentid,sent))\n",
473
+ " # yield((paraid,sentid,preprocess_sent(sent,'ml',normalizer)))\n",
474
+ " # print() \n",
475
+ " paraid+=1"
476
+ ]
477
+ },
478
+ {
479
+ "cell_type": "markdown",
480
+ "metadata": {},
481
+ "source": [
482
+ "**Extracting data from crawled HTML files**"
483
+ ]
484
+ },
485
+ {
486
+ "cell_type": "code",
487
+ "execution_count": null,
488
+ "metadata": {},
489
+ "outputs": [],
490
+ "source": [
491
+ "lang='mr'\n",
492
+ "posts_dir='directory_containing_crawled_html_pages'\n",
493
+ "content_fname='content_fname2.csv'\n",
494
+ "article_mapping_fname='article_mapping_fname'\n",
495
+ "get_article_contents=get_article_contents_mr_loksatta\n",
496
+ "narticles=-1"
497
+ ]
498
+ },
499
+ {
500
+ "cell_type": "code",
501
+ "execution_count": null,
502
+ "metadata": {},
503
+ "outputs": [],
504
+ "source": [
505
+ "write_corpus(\n",
506
+ " extract_all_content(posts_dir, lang, article_extract_fn=get_article_contents,narticles=narticles),\n",
507
+ " content_fname,\n",
508
+ " article_mapping_fname\n",
509
+ " )"
510
+ ]
511
+ },
512
+ {
513
+ "cell_type": "markdown",
514
+ "metadata": {},
515
+ "source": [
516
+ "## Aggregating all crawled data"
517
+ ]
518
+ },
519
+ {
520
+ "cell_type": "code",
521
+ "execution_count": null,
522
+ "metadata": {},
523
+ "outputs": [],
524
+ "source": [
525
+ "### aggregating, de-duplicating and shuffling all the data \n",
526
+ "dedup_shuffle_and_print_txt([ 'content_fname1.csv', 'content_fname2.csv' ], 'output_fname.txt' )\n",
527
+ "### extract dataset statistics\n",
528
+ "dataset_stats('output_fname.txt')"
529
+ ]
530
+ }
531
+ ],
532
+ "metadata": {
533
+ "kernelspec": {
534
+ "display_name": "Python 3",
535
+ "language": "python",
536
+ "name": "python3"
537
+ },
538
+ "language_info": {
539
+ "codemirror_mode": {
540
+ "name": "ipython",
541
+ "version": 3
542
+ },
543
+ "file_extension": ".py",
544
+ "mimetype": "text/x-python",
545
+ "name": "python",
546
+ "nbconvert_exporter": "python",
547
+ "pygments_lexer": "ipython3",
548
+ "version": "3.6.7"
549
+ },
550
+ "toc": {
551
+ "base_numbering": 1,
552
+ "nav_menu": {
553
+ "height": "703px",
554
+ "width": "326px"
555
+ },
556
+ "number_sections": true,
557
+ "sideBar": true,
558
+ "skip_h1_title": false,
559
+ "title_cell": "Table of Contents",
560
+ "title_sidebar": "Contents",
561
+ "toc_cell": false,
562
+ "toc_position": {},
563
+ "toc_section_display": true,
564
+ "toc_window_display": false
565
+ }
566
+ },
567
+ "nbformat": 4,
568
+ "nbformat_minor": 2
569
+ }
indic_nlp_library/docs/Makefile ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Makefile for Sphinx documentation
#
# Each builder target below runs $(SPHINXBUILD) with the matching -b builder
# and writes its output to a sub-directory of $(BUILDDIR).

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
PAPER         =
BUILDDIR      = _build

# Internal variables.
PAPEROPT_a4     = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

# Every target in this file is a command, not a file; texinfo and info are
# listed too so that a stray file of that name cannot shadow them.
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man texinfo info changes linkcheck doctest gettext

help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo " html to make standalone HTML files"
	@echo " dirhtml to make HTML files named index.html in directories"
	@echo " singlehtml to make a single large HTML file"
	@echo " pickle to make pickle files"
	@echo " json to make JSON files"
	@echo " htmlhelp to make HTML files and a HTML help project"
	@echo " qthelp to make HTML files and a qthelp project"
	@echo " devhelp to make HTML files and a Devhelp project"
	@echo " epub to make an epub"
	@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo " latexpdf to make LaTeX files and run them through pdflatex"
	@echo " text to make text files"
	@echo " man to make manual pages"
	@echo " texinfo to make Texinfo files"
	@echo " info to make Texinfo files and run them through makeinfo"
	@echo " gettext to make PO message catalogs"
	@echo " changes to make an overview of all changed/added/deprecated items"
	@echo " linkcheck to check all external links for integrity"
	@echo " doctest to run all doctests embedded in the documentation (if enabled)"

# The leading '-' lets the build continue even if rm reports an error.
clean:
	-rm -rf $(BUILDDIR)/*

html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/IndicNLPLibrary.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/IndicNLPLibrary.qhc"

devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/IndicNLPLibrary"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/IndicNLPLibrary"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

# Uses $(MAKE) rather than a literal `make` for the recursive call, matching
# the latexpdf target, so the sub-make is the same program as the parent.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."
indic_nlp_library/docs/cmd.rst ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Commandline
2
+ ===========
3
+
4
+ .. argparse::
5
+ :module: indicnlp.cli.cliparser
6
+ :func: get_parser
7
+ :prog: cliparser.py
8
+
indic_nlp_library/docs/code.rst ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Auto Generated Documentation
2
+ ============================
3
+
4
+ .. automodule:: indicnlp.langinfo indicnlp.common
5
+ :members:
indic_nlp_library/docs/conf.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Indic NLP Library documentation build configuration file, created by
4
+ # sphinx-quickstart on Tue Nov 3 01:50:37 2015.
5
+ #
6
+ # This file is execfile()d with the current directory set to its containing dir.
7
+ #
8
+ # Note that not all possible configuration values are present in this
9
+ # autogenerated file.
10
+ #
11
+ # All configuration values have a default; values that are commented out
12
+ # serve to show the default.
13
+
14
+ import sys, os
15
+
16
+ # If extensions (or modules to document with autodoc) are in another directory,
17
+ # add these directories to sys.path here. If the directory is relative to the
18
+ # documentation root, use os.path.abspath to make it absolute, like shown here.
19
+ sys.path.insert(0, os.path.abspath('..'))
20
+
21
+ # -- General configuration -----------------------------------------------------
22
+
23
+ # If your documentation needs a minimal Sphinx version, state it here.
24
+ #needs_sphinx = '1.0'
25
+
26
+ # Add any Sphinx extension module names here, as strings. They can be extensions
27
+ # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
28
+ extensions = ['sphinx.ext.autodoc', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon', 'sphinxarg.ext']
29
+
30
+ # Add any paths that contain templates here, relative to this directory.
31
+ templates_path = ['_templates']
32
+
33
+ # The suffix of source filenames.
34
+ source_suffix = '.rst'
35
+
36
+ # The encoding of source files.
37
+ #source_encoding = 'utf-8-sig'
38
+
39
+ # The master toctree document.
40
+ master_doc = 'index'
41
+
42
+ # General information about the project.
43
+ project = 'Indic NLP Library'
44
+ copyright = '2015, Anoop Kunchukuttan'
45
+
46
+ # The version info for the project you're documenting, acts as replacement for
47
+ # |version| and |release|, also used in various other places throughout the
48
+ # built documents.
49
+ #
50
+ # The short X.Y version.
51
+ version = '0.2'
52
+ # The full version, including alpha/beta/rc tags.
53
+ release = '0.2'
54
+
55
+ # The language for content autogenerated by Sphinx. Refer to documentation
56
+ # for a list of supported languages.
57
+ #language = None
58
+
59
+ # There are two options for replacing |today|: either, you set today to some
60
+ # non-false value, then it is used:
61
+ #today = ''
62
+ # Else, today_fmt is used as the format for a strftime call.
63
+ #today_fmt = '%B %d, %Y'
64
+
65
+ # List of patterns, relative to source directory, that match files and
66
+ # directories to ignore when looking for source files.
67
+ exclude_patterns = ['_build']
68
+
69
+ # The reST default role (used for this markup: `text`) to use for all documents.
70
+ #default_role = None
71
+
72
+ # If true, '()' will be appended to :func: etc. cross-reference text.
73
+ #add_function_parentheses = True
74
+
75
+ # If true, the current module name will be prepended to all description
76
+ # unit titles (such as .. function::).
77
+ #add_module_names = True
78
+
79
+ # If true, sectionauthor and moduleauthor directives will be shown in the
80
+ # output. They are ignored by default.
81
+ #show_authors = False
82
+
83
+ # The name of the Pygments (syntax highlighting) style to use.
84
+ pygments_style = 'sphinx'
85
+
86
+ # A list of ignored prefixes for module index sorting.
87
+ #modindex_common_prefix = []
88
+
89
+
90
+ # -- Options for HTML output ---------------------------------------------------
91
+
92
+ # The theme to use for HTML and HTML Help pages. See the documentation for
93
+ # a list of builtin themes.
94
+ html_theme = 'sphinx_rtd_theme'
95
+
96
+ # Theme options are theme-specific and customize the look and feel of a theme
97
+ # further. For a list of options available for each theme, see the
98
+ # documentation.
99
+ #html_theme_options = {}
100
+
101
+ # Add any paths that contain custom themes here, relative to this directory.
102
+ #html_theme_path = []
103
+
104
+ # The name for this set of Sphinx documents. If None, it defaults to
105
+ # "<project> v<release> documentation".
106
+ #html_title = None
107
+
108
+ # A shorter title for the navigation bar. Default is the same as html_title.
109
+ #html_short_title = None
110
+
111
+ # The name of an image file (relative to this directory) to place at the top
112
+ # of the sidebar.
113
+ #html_logo = None
114
+
115
+ # The name of an image file (within the static path) to use as favicon of the
116
+ # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
117
+ # pixels large.
118
+ #html_favicon = None
119
+
120
+ # Add any paths that contain custom static files (such as style sheets) here,
121
+ # relative to this directory. They are copied after the builtin static files,
122
+ # so a file named "default.css" will overwrite the builtin "default.css".
123
+ html_static_path = ['_static']
124
+
125
+ # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
126
+ # using the given strftime format.
127
+ #html_last_updated_fmt = '%b %d, %Y'
128
+
129
+ # If true, SmartyPants will be used to convert quotes and dashes to
130
+ # typographically correct entities.
131
+ #html_use_smartypants = True
132
+
133
+ # Custom sidebar templates, maps document names to template names.
134
+ #html_sidebars = {}
135
+
136
+ # Additional templates that should be rendered to pages, maps page names to
137
+ # template names.
138
+ #html_additional_pages = {}
139
+
140
+ # If false, no module index is generated.
141
+ #html_domain_indices = True
142
+
143
+ # If false, no index is generated.
144
+ #html_use_index = True
145
+
146
+ # If true, the index is split into individual pages for each letter.
147
+ #html_split_index = False
148
+
149
+ # If true, links to the reST sources are added to the pages.
150
+ #html_show_sourcelink = True
151
+
152
+ # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
153
+ #html_show_sphinx = True
154
+
155
+ # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
156
+ #html_show_copyright = True
157
+
158
+ # If true, an OpenSearch description file will be output, and all pages will
159
+ # contain a <link> tag referring to it. The value of this option must be the
160
+ # base URL from which the finished HTML is served.
161
+ #html_use_opensearch = ''
162
+
163
+ # This is the file name suffix for HTML files (e.g. ".xhtml").
164
+ #html_file_suffix = None
165
+
166
+ # Output file base name for HTML help builder.
167
+ htmlhelp_basename = 'IndicNLPLibrarydoc'
168
+
169
+
170
+ # -- Options for LaTeX output --------------------------------------------------
171
+
172
+ latex_elements = {
173
+ # The paper size ('letterpaper' or 'a4paper').
174
+ #'papersize': 'letterpaper',
175
+
176
+ # The font size ('10pt', '11pt' or '12pt').
177
+ #'pointsize': '10pt',
178
+
179
+ # Additional stuff for the LaTeX preamble.
180
+ #'preamble': '',
181
+ }
182
+
183
+ # Grouping the document tree into LaTeX files. List of tuples
184
+ # (source start file, target name, title, author, documentclass [howto/manual]).
185
+ latex_documents = [
186
+ ('index', 'IndicNLPLibrary.tex', 'Indic NLP Library Documentation',
187
+ 'Anoop Kunchukuttan', 'manual'),
188
+ ]
189
+
190
+ # The name of an image file (relative to this directory) to place at the top of
191
+ # the title page.
192
+ #latex_logo = None
193
+
194
+ # For "manual" documents, if this is true, then toplevel headings are parts,
195
+ # not chapters.
196
+ #latex_use_parts = False
197
+
198
+ # If true, show page references after internal links.
199
+ #latex_show_pagerefs = False
200
+
201
+ # If true, show URL addresses after external links.
202
+ #latex_show_urls = False
203
+
204
+ # Documents to append as an appendix to all manuals.
205
+ #latex_appendices = []
206
+
207
+ # If false, no module index is generated.
208
+ #latex_domain_indices = True
209
+
210
+
211
+ # -- Options for manual page output --------------------------------------------
212
+
213
+ # One entry per manual page. List of tuples
214
+ # (source start file, name, description, authors, manual section).
215
+ man_pages = [
216
+ ('index', 'indicnlplibrary', 'Indic NLP Library Documentation',
217
+ ['Anoop Kunchukuttan'], 1)
218
+ ]
219
+
220
+ # If true, show URL addresses after external links.
221
+ #man_show_urls = False
222
+
223
+
224
+ # -- Options for Texinfo output ------------------------------------------------
225
+
226
+ # Grouping the document tree into Texinfo files. List of tuples
227
+ # (source start file, target name, title, author,
228
+ # dir menu entry, description, category)
229
+ texinfo_documents = [
230
+ ('index', 'IndicNLPLibrary', 'Indic NLP Library Documentation',
231
+ 'Anoop Kunchukuttan', 'IndicNLPLibrary', 'NLP library for Indian languages',
232
+ 'NLP'),
233
+ ]
234
+
235
+ # Documents to append as an appendix to all manuals.
236
+ #texinfo_appendices = []
237
+
238
+ # If false, no module index is generated.
239
+ #texinfo_domain_indices = True
240
+
241
+ # How to display URL addresses: 'footnote', 'no', or 'inline'.
242
+ #texinfo_show_urls = 'footnote'
indic_nlp_library/docs/index.rst ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .. Indic NLP Library documentation master file, created by
2
+ sphinx-quickstart on Tue Nov 3 01:50:37 2015.
3
+ You can adapt this file completely to your liking, but it should at least
4
+ contain the root `toctree` directive.
5
+
6
+ :github_url: https://github.com/anoopkunchukuttan/indic_nlp_library
7
+
8
+ .. toctree::
9
+ :maxdepth: 2
10
+ :caption: Packages
11
+
12
+ indicnlp
13
+
14
+ .. toctree::
15
+ :maxdepth: 2
16
+ :caption: Commandline
17
+
18
+ cmd
19
+
20
+
21
+
22
+
indic_nlp_library/docs/indicnlp.MD ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Indic NLP Library
2
+ ## A unified approach to NLP for Indian languages
3
+
4
+ ### Anoop Kunchukuttan (`[email protected]`)
5
+
6
+ The goal of the Indic NLP Library is to build Python based libraries for common text processing and Natural Language Processing in Indian languages. Indian languages share a lot of similarity in terms of script, phonology, language syntax, etc. and this library is an attempt to provide a general solution to very commonly required toolsets for Indian language text.
7
+
8
+ The library provides the following functionalities:
9
+
10
+ - Text Normalization
11
+ - Script Information
12
+ - Word Tokenization and Detokenization
13
+ - Sentence Splitting
14
+ - Word Segmentation
15
+ - Syllabification
16
+ - Script Conversion
17
+ - Romanization
18
+ - Indicization
19
+ - Transliteration
20
+ - Translation
21
+
22
+ The data resources required by the Indic NLP Library are hosted in a different repository. These resources are required for some modules. You can download from the [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) project.
23
+
24
+ **If you are interested in Indian language NLP resources, you should check the [Indic NLP Catalog](https://github.com/anoopkunchukuttan/indic_nlp_library) for pointers.**
25
+
26
+ ## Pre-requisites
27
+
28
+ - Python 3.x
29
+ - (For Python 2.x version check the tag `PYTHON_2.7_FINAL_JAN_2019`. Not actively supporting Python 2.x anymore, but will try to maintain as much compatibility as possible)
30
+ - [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources)
31
+ - Other dependencies are listed in setup.py
32
+
33
+
34
+ ## Configuration
35
+
36
+ - Installation from pip:
37
+
38
+ `pip install indic-nlp-library`
39
+
40
+ - If you want to use the project from the github repo, add the project to the Python Path:
41
+
42
+ - Clone this repository
43
+ - Install dependencies: `pip install -r requirements.txt`
44
+ - Run: `export PYTHONPATH=$PYTHONPATH:<project base directory>`
45
+
46
+ - In either case, export the path to the _Indic NLP Resources_ directory
47
+
48
+ Run: `export INDIC_RESOURCES_PATH=<path to Indic NLP resources>`
49
+
50
+ ## Usage
51
+
52
+ You can use the Python API to access all the features of the library. Many of the most common operations are also accessible via a unified commandline API.
53
+
54
+ ### Getting Started
55
+
56
+ Check [this IPython Notebook](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples.ipynb) for examples to use the Python API.
57
+ - You can find the Python 2.x Notebook [here](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples_2_7.ipynb)
58
+
59
+ ### Documentation
60
+
61
+ You can find detailed documentation [HERE](https://indic-nlp-library.readthedocs.io/en/latest)
62
+
63
+ This documents the Python API as well as the commandline reference.
64
+
65
+ ## Citing
66
+
67
+ If you use this library, please include the following citation:
68
+
69
+ ```
70
+ @unpublished{kunchukuttan2020indicnlp,
71
+ author = "Anoop Kunchukuttan",
72
+ title = "The IndicNLP Library",
73
+ year = "2020",
74
+ }
75
+ ```
76
+ You can find the document [HERE](docs/indicnlp.pdf)
77
+
78
+ ## Website
79
+
80
+ `http://anoopkunchukuttan.github.io/indic_nlp_library`
81
+
82
+ ## Author
83
+ Anoop Kunchukuttan ([[email protected]]([email protected]))
84
+
85
+ ## Version: 0.7
86
+
87
+ ## Revision Log
88
+
89
+ 0.7 : 02 Apr 2020:
90
+
91
+ - Unified commandline
92
+ - Improved documentation
93
+ - Added setup.py
94
+
95
+ 0.6 : 16 Dec 2019:
96
+
97
+ - New romanizer and indicizer
98
+ - Script Unifiers
99
+ - Improved script normalizers
100
+ - Added contrib directory for sample uses
101
+ - changed to MIT license
102
+
103
+ 0.5 : 03 Jun 2019:
104
+
105
+ - Improved word tokenizer to handle dates and numbers.
106
+ - Added sentence splitter that can handle common prefixes/honorifics and uses some heuristics.
107
+ - Added detokenizer
108
+ - Added acronym transliterator that can convert English acronyms to Brahmi-derived scripts
109
+
110
+ 0.4 : 28 Jan 2019: Ported to Python 3, and lots of feature additions since last release; primarily around script information, script similarity and syllabification.
111
+
112
+ 0.3 : 21 Oct 2014: Supports morph-analysis between Indian languages
113
+
114
+ 0.2 : 13 Jun 2014: Supports transliteration between Indian languages and tokenization of Indian languages
115
+
116
+ 0.1 : 12 Mar 2014: Initial version. Supports text normalization.
117
+
118
+ ## LICENSE
119
+
120
+ Indic NLP Library is released under the MIT license
121
+
122
+
indic_nlp_library/docs/indicnlp.cli.rst ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cli Package
2
+ =============
3
+
4
+ :mod:`cliparser` Module
5
+ --------------------------------
6
+
7
+ .. automodule:: indicnlp.cli.cliparser
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
indic_nlp_library/docs/indicnlp.morph.rst ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ morph Package
2
+ =============
3
+
4
+ :mod:`unsupervised_morph` Module
5
+ --------------------------------
6
+
7
+ .. automodule:: indicnlp.morph.unsupervised_morph
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
indic_nlp_library/docs/indicnlp.normalize.rst ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ normalize Package
2
+ =================
3
+
4
+ :mod:`indic_normalize` Module
5
+ -----------------------------
6
+
7
+ .. automodule:: indicnlp.normalize.indic_normalize
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
12
+ .. autoclass:: indicnlp.normalize.indic_normalize.
13
+ :members:
14
+ :undoc-members:
15
+ :show-inheritance:
indic_nlp_library/docs/indicnlp.pdf ADDED
Binary file (38.1 kB). View file
 
indic_nlp_library/docs/indicnlp.rst ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ indicnlp Package
2
+ ================
3
+
4
+ :mod:`common` Module
5
+ --------------------
6
+
7
+ .. automodule:: indicnlp.common
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
12
+ :mod:`langinfo` Module
13
+ ----------------------
14
+
15
+ .. automodule:: indicnlp.langinfo
16
+ :members:
17
+ :undoc-members:
18
+ :show-inheritance:
19
+
20
+ :mod:`loader` Module
21
+ --------------------
22
+
23
+ .. automodule:: indicnlp.loader
24
+ :members:
25
+ :undoc-members:
26
+ :show-inheritance:
27
+
28
+ Subpackages
29
+ -----------
30
+
31
+ .. toctree::
32
+
33
+ indicnlp.cli
34
+ indicnlp.morph
35
+ indicnlp.normalize
36
+ indicnlp.script
37
+ indicnlp.syllable
38
+ indicnlp.tokenize
39
+ indicnlp.transliterate
40
+
41
+ Indices and tables
42
+ ==================
43
+
44
+ * :ref:`genindex`
45
+ * :ref:`modindex`
46
+ * :ref:`search`
47
+
indic_nlp_library/docs/indicnlp.script.rst ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ script Package
2
+ ==============
3
+
4
+ :mod:`indic_scripts` Module
5
+ ---------------------------
6
+
7
+ .. automodule:: indicnlp.script.indic_scripts
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
12
+ :mod:`english_script` Module
13
+ ----------------------------
14
+
15
+ .. automodule:: indicnlp.script.english_script
16
+ :members:
17
+ :undoc-members:
18
+ :show-inheritance:
19
+
20
+ :mod:`phonetic_sim` Module
21
+ ---------------------------
22
+
23
+ .. automodule:: indicnlp.script.phonetic_sim
24
+ :members:
25
+ :undoc-members:
26
+ :show-inheritance:
indic_nlp_library/docs/indicnlp.syllable.rst ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ syllable Package
2
+ ================
3
+
4
+ :mod:`syllabifier` Module
5
+ ---------------------------
6
+
7
+ .. automodule:: indicnlp.syllable.syllabifier
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
indic_nlp_library/docs/indicnlp.tokenize.rst ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tokenize Package
2
+ ================
3
+
4
+ :mod:`indic_tokenize` Module
5
+ ----------------------------
6
+
7
+ .. automodule:: indicnlp.tokenize.indic_tokenize
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
12
+ :mod:`indic_detokenize` Module
13
+ ------------------------------
14
+
15
+ .. automodule:: indicnlp.tokenize.indic_detokenize
16
+ :members:
17
+ :undoc-members:
18
+ :show-inheritance:
19
+
20
+ :mod:`sentence_tokenize` Module
21
+ ----------------------------
22
+
23
+ .. automodule:: indicnlp.tokenize.sentence_tokenize
24
+ :members:
25
+ :undoc-members:
26
+ :show-inheritance:
indic_nlp_library/docs/indicnlp.transliterate.rst ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transliterate Package
2
+ =====================
3
+
4
+ :mod:`sinhala_transliterator` Module
5
+ ------------------------------------
6
+
7
+ .. automodule:: indicnlp.transliterate.sinhala_transliterator
8
+ :members:
9
+ :undoc-members:
10
+ :show-inheritance:
11
+
12
+ :mod:`unicode_transliterate` Module
13
+ -----------------------------------
14
+
15
+ .. automodule:: indicnlp.transliterate.unicode_transliterate
16
+ :members:
17
+ :undoc-members:
18
+ :show-inheritance:
19
+
20
+ :mod:`acronym_transliterator` Module
21
+ -----------------------------------
22
+
23
+ .. automodule:: indicnlp.transliterate.acronym_transliterator
24
+ :members:
25
+ :undoc-members:
26
+ :show-inheritance:
27
+
28
+ :mod:`script_unifier` Module
29
+ -----------------------------------
30
+
31
+ .. automodule:: indicnlp.transliterate.script_unifier
32
+ :members:
33
+ :undoc-members:
34
+ :show-inheritance:
indic_nlp_library/docs/make.bat ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @ECHO OFF
2
+
3
+ pushd %~dp0
4
+
5
+ REM Command file for Sphinx documentation
6
+
7
+ if "%SPHINXBUILD%" == "" (
8
+ set SPHINXBUILD=sphinx-build
9
+ )
10
+ set SOURCEDIR=.
11
+ set BUILDDIR=_build
12
+
13
+ if "%1" == "" goto help
14
+
15
+ %SPHINXBUILD% >NUL 2>NUL
16
+ if errorlevel 9009 (
17
+ echo.
18
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19
+ echo.installed, then set the SPHINXBUILD environment variable to point
20
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
21
+ echo.may add the Sphinx directory to PATH.
22
+ echo.
23
+ echo.If you don't have Sphinx installed, grab it from
24
+ echo.http://sphinx-doc.org/
25
+ exit /b 1
26
+ )
27
+
28
+ %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29
+ goto end
30
+
31
+ :help
32
+ %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33
+
34
+ :end
35
+ popd
indic_nlp_library/docs/modules.rst ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ indicnlp
2
+ ========
3
+
4
+ .. toctree::
5
+ :maxdepth: 4
6
+
7
+ indicnlp
indic_nlp_library/indicnlp/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ try:
5
+ from .version import __version__ # noqa
6
+ except ImportError:
7
+ version_txt = os.path.join(os.path.dirname(__file__), "version.txt")
8
+ with open(version_txt) as f:
9
+ __version__ = f.read().strip()
10
+
indic_nlp_library/indicnlp/cli/__init__.py ADDED
File without changes
indic_nlp_library/indicnlp/cli/cliparser.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import sys
3
+
4
+ from indicnlp import loader
5
+ from indicnlp.tokenize import indic_tokenize
6
+ from indicnlp.tokenize import indic_detokenize
7
+ from indicnlp.normalize import indic_normalize
8
+ from indicnlp.morph import unsupervised_morph
9
+ from indicnlp.tokenize import sentence_tokenize
10
+ from indicnlp.syllable import syllabifier
11
+ from indicnlp.transliterate import unicode_transliterate
12
+ from indicnlp.transliterate import script_unifier
13
+
14
+ DEFAULT_ENCODING='utf-8'
15
+
16
def run_detokenize(args):
    """Detokenize ``args.infile`` line by line into ``args.outfile``.

    Every input line is handed unchanged to
    ``indic_detokenize.trivial_detokenize`` together with ``args.lang``;
    whatever it returns is written as-is (no newline is appended here).
    """
    for raw_line in args.infile:
        detokenized = indic_detokenize.trivial_detokenize(raw_line, args.lang)
        args.outfile.write(detokenized)
19
+
20
def run_tokenize(args):
    """Tokenize ``args.infile`` line by line into ``args.outfile``.

    The tokens returned by ``indic_tokenize.trivial_tokenize`` for a line
    are joined with single spaces and written without adding a newline.
    """
    for raw_line in args.infile:
        tokens = indic_tokenize.trivial_tokenize(raw_line, args.lang)
        args.outfile.write(' '.join(tokens))
24
+
25
def run_sentence_split(args):
    """Split the whole input into sentences, one sentence per output line.

    All input lines are first stripped of ``\\n`` and ``\\r`` characters and
    joined with single spaces into one text, which is then passed to
    ``sentence_tokenize.sentence_split`` with ``args.lang``.
    """
    cleaned_lines = []
    for raw_line in args.infile:
        cleaned_lines.append(raw_line.replace('\n', '').replace('\r', ''))
    document = ' '.join(cleaned_lines)

    sentences = sentence_tokenize.sentence_split(document, args.lang)
    for sentence in sentences:
        args.outfile.write(sentence + '\n')
30
+
31
def run_normalize(args):
    """Normalize ``args.infile`` line by line into ``args.outfile``.

    A normalizer for ``args.lang`` is obtained from
    ``indic_normalize.IndicNormalizerFactory`` with nukta removal disabled
    and nasals left untouched; each normalized line is written as returned.
    """
    # TODO: add more options to cli
    normalizer = indic_normalize.IndicNormalizerFactory().get_normalizer(
        args.lang,
        remove_nuktas=False,
        nasals_mode='do_nothing',
    )

    for raw_line in args.infile:
        args.outfile.write(normalizer.normalize(raw_line))
47
+
48
def run_morph(args):
    """Run unsupervised morphological analysis on every input line.

    Each line is stripped, split on single spaces and analysed with an
    ``UnsupervisedMorphAnalyzer`` built for ``args.lang`` (morpheme markers
    disabled). The resulting tokens are written space-joined, one line per
    input line.
    """
    analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer(args.lang, False)
    for raw_line in args.infile:
        words = raw_line.strip().split(' ')
        morphemes = analyzer.morph_analyze_document(words)
        args.outfile.write(' '.join(morphemes) + '\n')
55
+
56
def run_syllabify(args):
    """Syllabify every word of every input line.

    Each line is stripped and split on single spaces; every word is passed to
    ``syllabifier.orthographic_syllabify`` with ``args.lang``. Syllables and
    words are both joined with single spaces, and one output line is written
    per input line.
    """
    for raw_line in args.infile:
        syllabified_words = []
        for word in raw_line.strip().split(' '):
            syllables = syllabifier.orthographic_syllabify(word, args.lang)
            syllabified_words.append(' '.join(syllables))
        args.outfile.write(' '.join(syllabified_words) + '\n')
63
+
64
def run_wc(args):
    """Print the line, word and character counts of ``args.infile``.

    The three counts are printed to standard output on one line, separated
    by single spaces, in the order lines, words, characters.

    Words are maximal runs of non-whitespace characters, so blank lines
    contribute no words and repeated spaces or tabs between words are not
    counted as extra (empty) words. Characters are counted per decoded
    character of each line, including its line terminator.
    """
    line_count = 0
    word_count = 0
    char_count = 0

    for line in args.infile:
        line_count += 1
        # split() without an argument splits on runs of any whitespace and
        # never yields empty strings, unlike split(' ').
        word_count += len(line.split())
        char_count += len(line)

    print('{} {} {}'.format(line_count, word_count, char_count))
78
+
79
def run_indic2roman(args):
    """Romanize ``args.infile`` line by line into ``args.outfile``.

    Each line is converted with ``ItransTransliterator.to_itrans`` for
    ``args.lang`` and written exactly as returned.
    """
    to_itrans = unicode_transliterate.ItransTransliterator.to_itrans
    for raw_line in args.infile:
        args.outfile.write(to_itrans(raw_line, args.lang))
84
+
85
def run_roman2indic(args):
    """Convert romanized (ITRANS) input lines to the script of ``args.lang``.

    Each line is converted with ``ItransTransliterator.from_itrans`` and
    written to ``args.outfile`` exactly as returned.
    """
    from_itrans = unicode_transliterate.ItransTransliterator.from_itrans
    for raw_line in args.infile:
        args.outfile.write(from_itrans(raw_line, args.lang))
90
+
91
def run_script_unify(args):
    """Map every input line into the common script selected on the CLI.

    ``args.mode`` picks the unifier class from ``script_unifier``
    (``aggressive``, ``basic`` or ``naive``); ``args.common_lang`` is
    forwarded to it. Each line is transformed with ``args.lang`` as the
    source language and written as returned.
    """
    if args.mode == 'aggressive':
        unifier = script_unifier.AggressiveScriptUnifier(
            nasals_mode='to_anusvaara_relaxed',
            common_lang=args.common_lang,
        )
    elif args.mode == 'basic':
        unifier = script_unifier.BasicScriptUnifier(
            nasals_mode='do_nothing',
            common_lang=args.common_lang,
        )
    elif args.mode == 'naive':
        unifier = script_unifier.NaiveScriptUnifier(common_lang=args.common_lang)
    else:
        unifier = None

    # Any other mode value leaves no unifier to work with.
    assert unifier is not None

    for raw_line in args.infile:
        args.outfile.write(unifier.transform(raw_line, args.lang))
110
+
111
def run_script_convert(args):
    """Convert input lines from the script of one language to another.

    Each line is passed to ``UnicodeIndicTransliterator.transliterate`` with
    ``args.srclang`` and ``args.tgtlang`` and written as returned.
    """
    transliterate = unicode_transliterate.UnicodeIndicTransliterator.transliterate
    for raw_line in args.infile:
        converted = transliterate(raw_line, args.srclang, args.tgtlang)
        args.outfile.write(converted)
116
+
117
def add_common_monolingual_args(task_parser):
    """Add the arguments shared by all single-language subcommands.

    Registers two optional positionals, ``infile`` (default: stdin) and
    ``outfile`` (default: stdout), both opened with ``DEFAULT_ENCODING``,
    plus the ``-l/--lang`` option.
    """
    reader = argparse.FileType('r', encoding=DEFAULT_ENCODING)
    writer = argparse.FileType('w', encoding=DEFAULT_ENCODING)

    task_parser.add_argument(
        'infile', type=reader, nargs='?', default=sys.stdin,
        help='Input File path',
    )
    task_parser.add_argument(
        'outfile', type=writer, nargs='?', default=sys.stdout,
        help='Output File path',
    )
    task_parser.add_argument('-l', '--lang', help='Language')
133
+
134
def add_common_bilingual_args(task_parser):
    """Add the arguments shared by subcommands that take two languages.

    Registers two optional positionals, ``infile`` (default: stdin) and
    ``outfile`` (default: stdout), both opened with ``DEFAULT_ENCODING``,
    plus the ``-s/--srclang`` and ``-t/--tgtlang`` options.
    """
    reader = argparse.FileType('r', encoding=DEFAULT_ENCODING)
    writer = argparse.FileType('w', encoding=DEFAULT_ENCODING)

    task_parser.add_argument(
        'infile', type=reader, nargs='?', default=sys.stdin,
        help='Input File path',
    )
    task_parser.add_argument(
        'outfile', type=writer, nargs='?', default=sys.stdout,
        help='Output File path',
    )
    task_parser.add_argument('-s', '--srclang', help='Source Language')
    task_parser.add_argument('-t', '--tgtlang', help='Target Language')
154
+
155
def add_tokenize_parser(subparsers):
    """Register the ``tokenize`` subcommand, handled by ``run_tokenize``."""
    tokenize_cmd = subparsers.add_parser('tokenize', help='tokenizer help')
    add_common_monolingual_args(tokenize_cmd)
    tokenize_cmd.set_defaults(func=run_tokenize)
160
+
161
def add_detokenize_parser(subparsers):
    """Register the ``detokenize`` subcommand, handled by ``run_detokenize``."""
    detokenize_cmd = subparsers.add_parser('detokenize', help='de-tokenizer help')
    add_common_monolingual_args(detokenize_cmd)
    detokenize_cmd.set_defaults(func=run_detokenize)
166
+
167
def add_sentence_split_parser(subparsers):
    """Register the ``sentence_split`` subcommand, handled by ``run_sentence_split``."""
    split_cmd = subparsers.add_parser(
        'sentence_split',
        help='sentence split help',
    )
    add_common_monolingual_args(split_cmd)
    split_cmd.set_defaults(func=run_sentence_split)
171
+
172
def add_normalize_parser(subparsers):
    """Register the ``normalize`` subcommand, handled by ``run_normalize``."""
    normalize_cmd = subparsers.add_parser('normalize', help='normalizer help')
    add_common_monolingual_args(normalize_cmd)
    normalize_cmd.set_defaults(func=run_normalize)
176
+
177
def add_morph_parser(subparsers):
    """Register the ``morph`` subcommand, handled by ``run_morph``."""
    morph_cmd = subparsers.add_parser('morph', help='morph help')
    add_common_monolingual_args(morph_cmd)
    morph_cmd.set_defaults(func=run_morph)
181
+
182
def add_syllabify_parser(subparsers):
    """Register the ``syllabify`` subcommand, handled by ``run_syllabify``."""
    syllabify_cmd = subparsers.add_parser('syllabify', help='syllabify help')
    add_common_monolingual_args(syllabify_cmd)
    syllabify_cmd.set_defaults(func=run_syllabify)
186
+
187
def add_wc_parser(subparsers):
    """Register the ``wc`` subcommand, handled by ``run_wc``.

    Only an optional ``infile`` positional (default: stdin, opened with
    ``DEFAULT_ENCODING``) is accepted; there is no output file or language
    option for this subcommand.
    """
    wc_cmd = subparsers.add_parser('wc', help='wc help')

    wc_cmd.add_argument(
        'infile',
        type=argparse.FileType('r', encoding=DEFAULT_ENCODING),
        nargs='?',
        default=sys.stdin,
        help='Input File path',
    )
    # NOTE: per-count switches (-l / -w / -c) are not registered here.

    wc_cmd.set_defaults(func=run_wc)
204
+
205
def add_indic2roman_parser(subparsers):
    """Register the ``indic2roman`` subcommand, handled by ``run_indic2roman``."""
    romanize_cmd = subparsers.add_parser('indic2roman', help='indic2roman help')
    add_common_monolingual_args(romanize_cmd)
    romanize_cmd.set_defaults(func=run_indic2roman)
209
+
210
def add_roman2indic_parser(subparsers):
    """Register the ``roman2indic`` subcommand, handled by ``run_roman2indic``.

    The subcommand takes the common monolingual arguments (``infile``,
    ``outfile``, ``-l/--lang``).
    """
    task_parser = subparsers.add_parser('roman2indic', help='roman2indic help')
    add_common_monolingual_args(task_parser)
    # Dispatch to the roman -> indic converter; wiring run_indic2roman here
    # would make this subcommand perform the opposite conversion.
    task_parser.set_defaults(func=run_roman2indic)
214
+
215
def add_script_unify_parser(subparsers):
    """Register the ``script_unify`` subcommand, handled by ``run_script_unify``.

    On top of the common monolingual arguments it accepts ``-m/--mode``
    (``naive``, ``basic`` or ``aggressive``; default ``basic``) and
    ``-c/--common_lang`` (default ``hi``).
    """
    unify_cmd = subparsers.add_parser('script_unify', help='script_unify help')
    add_common_monolingual_args(unify_cmd)

    unify_cmd.add_argument(
        '-m', '--mode',
        default='basic',
        choices=['naive', 'basic', 'aggressive'],
        help='Script unification mode',
    )
    unify_cmd.add_argument(
        '-c', '--common_lang',
        default='hi',
        help='Common language in which all languages are represented',
    )

    unify_cmd.set_defaults(func=run_script_unify)
229
+
230
def add_script_convert_parser(subparsers):
    """Register the ``script_convert`` subcommand, handled by ``run_script_convert``."""
    convert_cmd = subparsers.add_parser(
        'script_convert',
        help='script convert help',
    )
    add_common_bilingual_args(convert_cmd)
    convert_cmd.set_defaults(func=run_script_convert)
234
+
235
def get_parser():
    """Build and return the top-level ``indicnlp`` argument parser.

    One sub-parser is registered per operation; the chosen subcommand name
    is stored under ``subcommand`` in the parsed namespace.
    """
    parser = argparse.ArgumentParser(prog='indicnlp')
    subparsers = parser.add_subparsers(
        help='Invoke each operation with one of the subcommands',
        dest='subcommand',
    )

    # Registration order is kept stable because it determines the order in
    # which subcommands are listed in the help output.
    registrars = (
        add_tokenize_parser,
        add_detokenize_parser,
        add_sentence_split_parser,
        add_normalize_parser,
        add_morph_parser,
        add_syllabify_parser,
        add_wc_parser,
        add_indic2roman_parser,
        add_roman2indic_parser,
        add_script_unify_parser,
        add_script_convert_parser,
    )
    for register in registrars:
        register(subparsers)

    return parser
256
+
257
def main():
    """Parse the command line and run the selected subcommand.

    Each sub-parser stores its handler under ``func`` in the parsed
    namespace. When no subcommand is given there is no ``func`` attribute,
    so a usage error is reported (message on stderr, exit status 2) instead
    of failing with an ``AttributeError``.
    """
    parser = get_parser()
    args = parser.parse_args()
    if not hasattr(args, 'func'):
        # Sub-parsers are optional by default in Python 3's argparse.
        parser.error('a subcommand is required')
    args.func(args)
262
+
263
+ if __name__ == '__main__':
264
+ loader.load()
265
+ main()
266
+
indic_nlp_library/indicnlp/common.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ import os
10
+
11
+ """
12
+ Path to the Indic NLP Resources directory
13
+ """
14
+ INDIC_RESOURCES_PATH=''
15
+
16
def init():
    """
    Initialize the module. The following actions are performed:

    - Checks if the INDIC_RESOURCES_PATH variable is set. If not, tries to initialize it from
      the INDIC_RESOURCES_PATH environment variable. If that fails, an exception is raised.

    @raise IndicNlpException: if the resources path is neither set nor available from the environment
    """
    global INDIC_RESOURCES_PATH
    if INDIC_RESOURCES_PATH=='':
        # A missing environment variable is handled like an empty one; both
        # fall through to the check below instead of a broad except clause.
        INDIC_RESOURCES_PATH=os.environ.get('INDIC_RESOURCES_PATH','')

    if INDIC_RESOURCES_PATH=='':
        raise IndicNlpException('INDIC_RESOURCES_PATH not set')
32
+
33
+
34
+
35
def get_resources_path():
    """
    Return the currently configured path of the Indic NLP Resources directory
    (the module-level INDIC_RESOURCES_PATH value)
    """
    return INDIC_RESOURCES_PATH
40
+
41
def set_resources_path(resources_path):
    """
    Store `resources_path` as the path of the Indic NLP Resources directory

    @param resources_path: new value for the module-level INDIC_RESOURCES_PATH
    """
    global INDIC_RESOURCES_PATH
    INDIC_RESOURCES_PATH = resources_path
47
+
48
class IndicNlpException(Exception):
    """
    Exceptions thrown by Indic NLP Library components are instances of this class.
    'msg' attribute contains exception details.
    """
    def __init__(self, msg):
        # Forward the message to Exception so that `args` is populated
        # (needed for a faithful repr() and for pickling/copying).
        super().__init__(msg)
        self.msg = msg

    def __str__(self):
        return repr(self.msg)
58
+
indic_nlp_library/indicnlp/langinfo.py ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ ## language codes
10
LC_TA='ta'

# Language code -> [first, last] Unicode codepoint of the script block used for
# that language. The first entry is the base that character offsets are taken from.
# NOTE(review): 'kK' looks like a mistyped language code - confirm against callers
# before renaming, since the key is looked up by language code.
SCRIPT_RANGES={
    'pa':[0x0a00,0x0a7f] ,
    'gu':[0x0a80,0x0aff] ,
    'or':[0x0b00,0x0b7f] ,
    'ta':[0x0b80,0x0bff] ,
    'te':[0x0c00,0x0c7f] ,
    'kn':[0x0c80,0x0cff] ,
    'ml':[0x0d00,0x0d7f] ,
    'si':[0x0d80,0x0dff] ,
    'hi':[0x0900,0x097f] ,
    'mr':[0x0900,0x097f] ,
    'kK':[0x0900,0x097f] ,
    'sa':[0x0900,0x097f] ,
    'ne':[0x0900,0x097f] ,
    'sd':[0x0900,0x097f] ,
    'bn':[0x0980,0x09ff] ,
    'as':[0x0980,0x09ff] ,
}

# Language groupings by family, plus the languages that may use danda as a delimiter
DRAVIDIAN_LANGUAGES=['ta', 'te', 'kn', 'ml',]
IE_LANGUAGES=['hi', 'mr', 'kK', 'sa', 'ne', 'sd', 'bn', 'as', 'pa', 'gu', 'or', 'si', ]
DANDA_DELIM_LANGUAGES=['as','bn','hi','ne','or','pa','sa','sd']

# [first, last] codepoint ranges listed for Urdu
URDU_RANGES=[
    [0x0600,0x06ff],
    [0x0750,0x077f],
    [0xfb50,0xfdff],
    [0xfe70,0xfeff],
]

# All values below named *_OFFSET / *_RANGE_* are offsets relative to the
# first codepoint of a script block (see SCRIPT_RANGES), not absolute codepoints.
COORDINATED_RANGE_START_INCLUSIVE=0
COORDINATED_RANGE_END_INCLUSIVE=0x6f

NUMERIC_OFFSET_START=0x66
NUMERIC_OFFSET_END=0x6f

HALANTA_OFFSET=0x4d
AUM_OFFSET=0x50
NUKTA_OFFSET=0x3c

# Absolute codepoints
RUPEE_SIGN=0x20b9

DANDA=0x0964
DOUBLE_DANDA=0x0965
56
+
57
+ #TODO: add missing fricatives and approximants
58
# Inclusive [first, last] offsets (relative to the script block start) of the
# consonant rows, grouped by place of articulation
VELAR_RANGE=[0x15,0x19]
PALATAL_RANGE=[0x1a,0x1e]
RETROFLEX_RANGE=[0x1f,0x23]
DENTAL_RANGE=[0x24,0x29]
LABIAL_RANGE=[0x2a,0x2e]

# verify
VOICED_LIST=[0x17,0x18,0x1c,0x1d,0x21,0x22,0x26,0x27,0x2c,0x2d]
UNVOICED_LIST=[0x15,0x16,0x1a,0x1b,0x1f,0x20,0x24,0x25,0x2a,0x2b] #TODO: add sibilants/sonorants
ASPIRATED_LIST=[0x16,0x18,0x1b,0x1d,0x20,0x22,0x25,0x27,0x2b,0x2d]
UNASPIRATED_LIST=[0x15,0x17,0x1a,0x1c,0x1f,0x21,0x24,0x26,0x2a,0x2c]
# The labial nasal is offset 0x2e (the last entry of LABIAL_RANGE). The list used
# to contain 0x2d, which is already classified above as a voiced aspirated stop.
NASAL_LIST=[0x19,0x1e,0x23,0x28,0x29,0x2e]
FRICATIVE_LIST=[0x36,0x37,0x38]
APPROXIMANT_LIST=[0x2f,0x30,0x31,0x32,0x33,0x34,0x35]
72
+
73
+ #TODO: ha has to be properly categorized
74
+
75
def is_danda_delim(lang):
    """
    Tell whether `lang` is listed in DANDA_DELIM_LANGUAGES, i.e. danda/double danda
    is a possible delimiter for the language
    """
    return lang in DANDA_DELIM_LANGUAGES
80
+
81
def get_offset(c,lang):
    """
    Offset of character `c` from the first codepoint of the script block of `lang`.
    Applicable to Brahmi derived Indic scripts
    """
    codepoint = ord(c)
    script_start = SCRIPT_RANGES[lang][0]
    return codepoint - script_start
86
+
87
def offset_to_char(c,lang):
    """
    Character found at offset `c` from the first codepoint of the script block of `lang`.
    Applicable to Brahmi derived Indic scripts
    """
    script_start = SCRIPT_RANGES[lang][0]
    return chr(c + script_start)
92
+
93
def in_coordinated_range(c_offset):
    """
    Tell whether the offset lies within the coordinated range (both ends inclusive).
    Applicable to Brahmi derived Indic scripts
    """
    return COORDINATED_RANGE_START_INCLUSIVE <= c_offset <= COORDINATED_RANGE_END_INCLUSIVE
98
+
99
def is_indiclang_char(c,lang):
    """
    Tell whether `c` falls in the 0x00-0x7f offset window of the script of `lang`,
    or is the danda / double danda character.
    Applicable to Brahmi derived Indic scripts
    """
    offset = get_offset(c,lang)
    if 0 <= offset <= 0x7f:
        return True
    return ord(c)==DANDA or ord(c)==DOUBLE_DANDA
105
+
106
+ # def is_vowel(c,lang):
107
+ # """
108
+ # Is the character a vowel
109
+ # """
110
+ # o=get_offset(c,lang)
111
+ # return (o>=0x04 and o<=0x14)
112
+
113
+ # def is_vowel_sign(c,lang):
114
+ # """
115
+ # Is the character a vowel sign (maatraa)
116
+ # """
117
+ # o=get_offset(c,lang)
118
+ # return (o>=0x3e and o<=0x4c)
119
+
120
+ # def is_halanta(c,lang):
121
+ # """
122
+ # Is the character the halanta character
123
+ # """
124
+ # o=get_offset(c,lang)
125
+ # return (o==HALANTA_OFFSET)
126
+
127
+ # def is_nukta(c,lang):
128
+ # """
129
+ # Is the character the halanta character
130
+ # """
131
+ # o=get_offset(c,lang)
132
+ # return (o==NUKTA_OFFSET)
133
+
134
+ # def is_aum(c,lang):
135
+ # """
136
+ # Is the character a vowel sign (maatraa)
137
+ # """
138
+ # o=get_offset(c,lang)
139
+ # return (o==AUM_OFFSET)
140
+
141
+ # def is_consonant(c,lang):
142
+ # """
143
+ # Is the character a consonant
144
+ # """
145
+ # o=get_offset(c,lang)
146
+ # return (o>=0x15 and o<=0x39)
147
+
148
+ # def is_velar(c,lang):
149
+ # """
150
+ # Is the character a velar
151
+ # """
152
+ # o=get_offset(c,lang)
153
+ # return (o>=VELAR_RANGE[0] and o<=VELAR_RANGE[1])
154
+
155
+ # def is_palatal(c,lang):
156
+ # """
157
+ # Is the character a palatal
158
+ # """
159
+ # o=get_offset(c,lang)
160
+ # return (o>=PALATAL_RANGE[0] and o<=PALATAL_RANGE[1])
161
+
162
+ # def is_retroflex(c,lang):
163
+ # """
164
+ # Is the character a retroflex
165
+ # """
166
+ # o=get_offset(c,lang)
167
+ # return (o>=RETROFLEX_RANGE[0] and o<=RETROFLEX_RANGE[1])
168
+
169
+ # def is_dental(c,lang):
170
+ # """
171
+ # Is the character a dental
172
+ # """
173
+ # o=get_offset(c,lang)
174
+ # return (o>=DENTAL_RANGE[0] and o<=DENTAL_RANGE[1])
175
+
176
+ # def is_labial(c,lang):
177
+ # """
178
+ # Is the character a labial
179
+ # """
180
+ # o=get_offset(c,lang)
181
+ # return (o>=LABIAL_RANGE[0] and o<=LABIAL_RANGE[1])
182
+
183
+ # def is_voiced(c,lang):
184
+ # """
185
+ # Is the character a voiced consonant
186
+ # """
187
+ # o=get_offset(c,lang)
188
+ # return o in VOICED_LIST
189
+
190
+ # def is_unvoiced(c,lang):
191
+ # """
192
+ # Is the character a unvoiced consonant
193
+ # """
194
+ # o=get_offset(c,lang)
195
+ # return o in UNVOICED_LIST
196
+
197
+ # def is_aspirated(c,lang):
198
+ # """
199
+ # Is the character a aspirated consonant
200
+ # """
201
+ # o=get_offset(c,lang)
202
+ # return o in ASPIRATED_LIST
203
+
204
+ # def is_unaspirated(c,lang):
205
+ # """
206
+ # Is the character a unaspirated consonant
207
+ # """
208
+ # o=get_offset(c,lang)
209
+ # return o in UNASPIRATED_LIST
210
+
211
+ # def is_nasal(c,lang):
212
+ # """
213
+ # Is the character a nasal consonant
214
+ # """
215
+ # o=get_offset(c,lang)
216
+ # return o in NASAL_LIST
217
+
218
+ # def is_fricative(c,lang):
219
+ # """
220
+ # Is the character a fricative consonant
221
+ # """
222
+ # o=get_offset(c,lang)
223
+ # return o in FRICATIVE_LIST
224
+
225
+ # def is_approximant(c,lang):
226
+ # """
227
+ # Is the character an approximant consonant
228
+ # """
229
+ # o=get_offset(c,lang)
230
+ # return o in APPROXIMANT_LIST
231
+
232
+ # def is_number(c,lang):
233
+ # """
234
+ # Is the character a number
235
+ # """
236
+ # o=get_offset(c,lang)
237
+ # return (o>=0x66 and o<=0x6f)
238
+
239
+
240
def is_vowel(c,lang):
    """
    Is character `c` of language `lang` a vowel
    """
    return is_vowel_offset(get_offset(c,lang))
246
+
247
def is_vowel_sign(c,lang):
    """
    Is character `c` of language `lang` a vowel sign (maatraa)
    """
    return is_vowel_sign_offset(get_offset(c,lang))
253
+
254
def is_halanta(c,lang):
    """
    Is character `c` of language `lang` the halanta character
    """
    return is_halanta_offset(get_offset(c,lang))
260
+
261
def is_nukta(c,lang):
    """
    Is character `c` of language `lang` the nukta character
    """
    return is_nukta_offset(get_offset(c,lang))
267
+
268
def is_aum(c,lang):
    """
    Is character `c` of language `lang` the aum character
    """
    return is_aum_offset(get_offset(c,lang))
274
+
275
def is_consonant(c,lang):
    """
    Is character `c` of language `lang` a consonant
    """
    return is_consonant_offset(get_offset(c,lang))
281
+
282
def is_velar(c,lang):
    """
    Is character `c` of language `lang` a velar
    """
    return is_velar_offset(get_offset(c,lang))
288
+
289
def is_palatal(c,lang):
    """
    Is character `c` of language `lang` a palatal
    """
    return is_palatal_offset(get_offset(c,lang))
295
+
296
def is_retroflex(c,lang):
    """
    Is character `c` of language `lang` a retroflex
    """
    return is_retroflex_offset(get_offset(c,lang))
302
+
303
def is_dental(c,lang):
    """
    Is character `c` of language `lang` a dental
    """
    return is_dental_offset(get_offset(c,lang))
309
+
310
def is_labial(c,lang):
    """
    Is character `c` of language `lang` a labial
    """
    return is_labial_offset(get_offset(c,lang))
316
+
317
def is_voiced(c,lang):
    """
    Is character `c` of language `lang` a voiced consonant
    """
    return is_voiced_offset(get_offset(c,lang))
323
+
324
def is_unvoiced(c,lang):
    """
    Is character `c` of language `lang` an unvoiced consonant
    """
    return is_unvoiced_offset(get_offset(c,lang))
330
+
331
def is_aspirated(c,lang):
    """
    Is character `c` of language `lang` an aspirated consonant
    """
    return is_aspirated_offset(get_offset(c,lang))
337
+
338
def is_unaspirated(c,lang):
    """
    Is character `c` of language `lang` an unaspirated consonant
    """
    return is_unaspirated_offset(get_offset(c,lang))
344
+
345
def is_nasal(c,lang):
    """
    Is character `c` of language `lang` a nasal consonant
    """
    return is_nasal_offset(get_offset(c,lang))
351
+
352
def is_fricative(c,lang):
    """
    Is character `c` of language `lang` a fricative consonant
    """
    return is_fricative_offset(get_offset(c,lang))
358
+
359
def is_approximant(c,lang):
    """
    Is character `c` of language `lang` an approximant consonant
    """
    return is_approximant_offset(get_offset(c,lang))
365
+
366
def is_number(c,lang):
    """
    Is character `c` of language `lang` a number (digit)
    """
    return is_number_offset(get_offset(c,lang))
372
+
373
+
374
+ ##################################################
375
+
376
def is_vowel_offset(c_offset):
    """
    Does the offset fall in the vowel range (0x04-0x14, inclusive)
    """
    return 0x04 <= c_offset <= 0x14
381
+
382
def is_vowel_sign_offset(c_offset):
    """
    Does the offset fall in the vowel sign (maatraa) range (0x3e-0x4c, inclusive)
    """
    return 0x3e <= c_offset <= 0x4c
387
+
388
def is_halanta_offset(c_offset):
    """
    Does the offset equal HALANTA_OFFSET
    """
    return c_offset == HALANTA_OFFSET
393
+
394
def is_nukta_offset(c_offset):
    """
    Does the offset equal NUKTA_OFFSET
    """
    return c_offset == NUKTA_OFFSET
399
+
400
def is_aum_offset(c_offset):
    """
    Does the offset equal AUM_OFFSET
    """
    return c_offset == AUM_OFFSET
405
+
406
def is_consonant_offset(c_offset):
    """
    Does the offset fall in the consonant range (0x15-0x39, inclusive)
    """
    return 0x15 <= c_offset <= 0x39
411
+
412
def is_velar_offset(c_offset):
    """
    Does the offset fall within VELAR_RANGE (both ends inclusive)
    """
    return VELAR_RANGE[0] <= c_offset <= VELAR_RANGE[1]
417
+
418
def is_palatal_offset(c_offset):
    """
    Does the offset fall within PALATAL_RANGE (both ends inclusive)
    """
    return PALATAL_RANGE[0] <= c_offset <= PALATAL_RANGE[1]
423
+
424
def is_retroflex_offset(c_offset):
    """
    Does the offset fall within RETROFLEX_RANGE (both ends inclusive)
    """
    return RETROFLEX_RANGE[0] <= c_offset <= RETROFLEX_RANGE[1]
429
+
430
def is_dental_offset(c_offset):
    """
    Does the offset fall within DENTAL_RANGE (both ends inclusive)
    """
    return DENTAL_RANGE[0] <= c_offset <= DENTAL_RANGE[1]
435
+
436
def is_labial_offset(c_offset):
    """
    Does the offset fall within LABIAL_RANGE (both ends inclusive)
    """
    return LABIAL_RANGE[0] <= c_offset <= LABIAL_RANGE[1]
441
+
442
def is_voiced_offset(c_offset):
    """
    Is the offset listed in VOICED_LIST (voiced consonants)
    """
    return c_offset in VOICED_LIST
447
+
448
def is_unvoiced_offset(c_offset):
    """
    Is the offset listed in UNVOICED_LIST (unvoiced consonants)
    """
    return c_offset in UNVOICED_LIST
453
+
454
def is_aspirated_offset(c_offset):
    """
    Is the offset listed in ASPIRATED_LIST (aspirated consonants)
    """
    return c_offset in ASPIRATED_LIST
459
+
460
def is_unaspirated_offset(c_offset):
    """
    Is the offset listed in UNASPIRATED_LIST (unaspirated consonants)
    """
    return c_offset in UNASPIRATED_LIST
465
+
466
def is_nasal_offset(c_offset):
    """
    Is the offset listed in NASAL_LIST (nasal consonants)
    """
    return c_offset in NASAL_LIST
471
+
472
def is_fricative_offset(c_offset):
    """
    Is the offset listed in FRICATIVE_LIST (fricative consonants)
    """
    return c_offset in FRICATIVE_LIST
477
+
478
def is_approximant_offset(c_offset):
    """
    Is the offset listed in APPROXIMANT_LIST (approximant consonants)
    """
    return c_offset in APPROXIMANT_LIST
483
+
484
def is_number_offset(c_offset):
    """
    Does the offset fall in the number (digit) range (0x66-0x6f, inclusive)
    """
    return 0x66 <= c_offset <= 0x6f
indic_nlp_library/indicnlp/loader.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ from indicnlp import common
10
+ from indicnlp.script import indic_scripts
11
+ from indicnlp.script import english_script
12
+ from indicnlp.transliterate import unicode_transliterate
13
+
14
def load():
    """
    Initializes the Indic NLP library. Clients should call this method before using the library.

    Any module requiring initialization should have a init() method, to which a call must be made from this method
    """

    ### Order of initialization may matter: common has to come first to get
    ### access to resources, followed by the Indic scripts, English script and
    ### unicode_transliterate modules.
    modules_in_init_order = (
        common,
        indic_scripts,
        english_script,
        unicode_transliterate,
    )
    for module in modules_in_init_order:
        module.init()
34
+
35
+
indic_nlp_library/indicnlp/morph/__init__.py ADDED
File without changes
indic_nlp_library/indicnlp/morph/unsupervised_morph.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ import codecs, sys, itertools,re,os
10
+ import morfessor
11
+
12
+ from functools import lru_cache
13
+
14
+ from indicnlp import langinfo
15
+ from indicnlp import common
16
+ from indicnlp.tokenize import indic_tokenize
17
+
18
+ # Unsupervised Morphological Analyser for Indian languages.
19
+ #
20
+ # @author Anoop Kunchukuttan
21
+ #
22
+
23
class MorphAnalyzerI(object):
    """
    Interface for Morph Analyzer
    """

    # Both methods previously lacked `self`, so calling them on an instance
    # raised TypeError instead of behaving as overridable no-ops.

    def morph_analyze(self,word):
        """
        Morphanalyze a single word. To be overridden; the base version does nothing.

        @param word: string input word
        """
        pass

    def morph_analyze_document(self,tokens):
        """
        Morphanalyze a document given as a sequence of tokens. To be overridden;
        the base version does nothing.

        @param tokens: string sequence of words
        """
        pass
33
+
34
class UnsupervisedMorphAnalyzer(MorphAnalyzerI):
    """
    Unsupervised Morphological analyser built using Morfessor 2.0
    """

    def __init__(self,lang,add_marker=False):
        """
        Load the Morfessor model '<resources>/morph/morfessor/<lang>.model'.

        @param lang: language code; must be a key of langinfo.SCRIPT_RANGES
        @param add_marker: if True, morphemes are suffixed with '_R_' (first morpheme of a word),
            '_S_' (following morphemes) or '_E_' (word that was not analyzed)
        """
        self.lang=lang
        self.add_marker=add_marker

        io = morfessor.MorfessorIO()
        self._morfessor_model=io.read_any_model(os.path.join(common.INDIC_RESOURCES_PATH,'morph','morfessor','{}.model'.format(lang)))

        # Matches words made up only of characters from the script block of the language
        self._script_range_pat=r'^[{}-{}]+$'.format(chr(langinfo.SCRIPT_RANGES[lang][0]),chr(langinfo.SCRIPT_RANGES[lang][1]))
        self._script_check_re=re.compile(self._script_range_pat)

    def _contains_number(self,text):
        """
        Return True if any character of `text` lies in the numeric offsets of the script
        """
        if self.lang in langinfo.SCRIPT_RANGES:
            for c in text:
                offset=ord(c)-langinfo.SCRIPT_RANGES[self.lang][0]
                if offset >=langinfo.NUMERIC_OFFSET_START and offset <= langinfo.NUMERIC_OFFSET_END:
                    return True
        return False

    def _morphanalysis_needed(self,word):
        """
        A word is analyzed only if it is entirely in the language's script and has no digits
        """
        return self._script_check_re.match(word) and not self._contains_number(word)

    # NOTE: lru_cache on a method keys on `self`, so the cache keeps analyzer
    # instances alive for as long as their entries remain in it.
    @lru_cache(maxsize=16384)
    def _morph_analyze_cached(self,word):
        """
        Cached worker for morph_analyze(). Returns an immutable tuple so that
        cache entries cannot be modified through a returned value.
        """
        if self._morphanalysis_needed(word):
            val=self._morfessor_model.viterbi_segment(word)
            m_list=val[0]
            if self.add_marker:
                m_list= [ '{}_S_'.format(m) if i>0 else '{}_R_'.format(m)  for i,m in enumerate(m_list)]
        else:
            if self.add_marker:
                word='{}_E_'.format(word)
            m_list=[word]
        return tuple(m_list)

    def morph_analyze(self,word):
        """
        Morphanalyzes a single word and returns a list of component morphemes

        @param word: string input word

        @return a new list of morphemes on every call; the cached result used to be
            handed out directly, so a caller mutating it corrupted later lookups
        """
        return list(self._morph_analyze_cached(word))

    def morph_analyze_document(self,tokens):
        """
        Morphanalyzes a document, represented as a list of tokens
        Each word is morphanalyzed and result is a list of morphemes constituting the document

        @param tokens: string sequence of words

        @return list of segments in the document after morph analysis
        """

        out_tokens=[]
        for token in tokens:
            morphs=self.morph_analyze(token)
            out_tokens.extend(morphs)
        return out_tokens
102
+
103
+ #### Older implementation
104
+ #out_tokens=[]
105
+ #for token in tokens:
106
+ # if self._morphanalysis_needed(token):
107
+ # morphs=self.morph_analyze(token)
108
+ # out_tokens.extend(morphs)
109
+ # else:
110
+ # if self.add_marker:
111
+ # token=u'{}_E_'.format(token)
112
+ # out_tokens.append(token)
113
+ #return out_tokens
114
+
115
+
116
if __name__ == '__main__':

    # <infile> <outfile> <language> <indic_resources_path> are all mandatory, so
    # argv needs at least 5 entries including the script name. The check used to
    # be `<4`, which let a missing resources path crash on sys.argv[4].
    if len(sys.argv)<5:
        print("Usage: python unsupervised_morph.py <infile> <outfile> <language> <indic_resources_path> [<add_marker>]")
        sys.exit(1)

    language=sys.argv[3]
    common.INDIC_RESOURCES_PATH=sys.argv[4]

    add_marker=False

    if len(sys.argv)==6:
        add_marker= sys.argv[5] == 'True'

    print('Loading morph analyser for ' + language)
    analyzer=UnsupervisedMorphAnalyzer(language,add_marker)
    print('Loaded morph analyser for ' + language)

    with codecs.open(sys.argv[1],'r','utf-8') as ifile:
        with codecs.open(sys.argv[2],'w','utf-8') as ofile:
            # Stream the input line by line instead of loading the whole file
            for line in ifile:
                line=line.strip()
                tokens=indic_tokenize.trivial_tokenize(line)
                morph_tokens=analyzer.morph_analyze_document(tokens)
                ofile.write(' '.join(morph_tokens))
                ofile.write('\n')
142
+
indic_nlp_library/indicnlp/normalize/__init__.py ADDED
File without changes
indic_nlp_library/indicnlp/normalize/indic_normalize.py ADDED
@@ -0,0 +1,984 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ #
4
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
5
+ # All rights reserved.
6
+ #
7
+ # This source code is licensed under the MIT license found in the
8
+ # LICENSE file in the root directory of this source tree.
9
+ #
10
+
11
+ #Program for normalization of text written in Unicode. This is mainly geared towards Indic scripts
12
+ #
13
+ # @author Anoop Kunchukuttan
14
+ #
15
+
16
+ import sys, codecs, string, itertools, re
17
+ from indicnlp import langinfo
18
+
19
+
20
class NormalizerI(object):
    """
    Base interface of the normalizer classes, which do the following:

    * Some characters have multiple Unicode codepoints. The normalizer chooses a single standard representation
    * Some control characters are deleted
    * While typing using the Latin keyboard, certain typical mistakes occur which are corrected by the module

    This class only holds the shared character constants and the punctuation
    normalization helper; normalize() itself is a no-op here.

    Script specific normalizers should derive from this class and override the normalize() method.
    """

    # Characters that normalizers delete
    BYTE_ORDER_MARK='\uFEFF'
    BYTE_ORDER_MARK_2='\uFFFE'
    WORD_JOINER='\u2060'
    SOFT_HYPHEN='\u00AD'

    # Characters that normalizers replace by a plain space
    ZERO_WIDTH_SPACE='\u200B'
    NO_BREAK_SPACE='\u00A0'

    ZERO_WIDTH_NON_JOINER='\u200C'
    ZERO_WIDTH_JOINER='\u200D'

    def _normalize_punctuations(self, text):
        """
        Map typographic quotes, dashes and ellipsis to their plain ASCII forms.
        Applies many of the punctuation normalizations that are part of MosesNormalizer
        from sacremoses

        @param text: input string
        @return the string with punctuation normalized
        """
        # Applied strictly in this order: later entries see the output of earlier ones
        replacements=(
            (NormalizerI.BYTE_ORDER_MARK,''),
            ('„', r'"'),
            ('“', r'"'),
            ('”', r'"'),
            ('–', r'-'),
            ('—', r' - '),
            ('´', r"'"),
            ('‘', r"'"),
            ('‚', r"'"),
            ('’', r"'"),
            ("''", r'"'),
            ('´´', r'"'),
            ('…', r'...'),
        )
        for source, target in replacements:
            text=text.replace(source,target)

        return text

    def normalize(self,text):
        """
        To be overridden by subclasses; the base version does nothing and returns None
        """
        pass
74
+
75
+
76
+ class BaseNormalizer(NormalizerI):
77
+
78
+ def __init__(self,lang,
79
+ remove_nuktas=False,
80
+ nasals_mode='do_nothing',
81
+ do_normalize_chandras=False,
82
+ do_normalize_vowel_ending=False):
83
+
84
+ self.lang=lang
85
+ self.remove_nuktas=remove_nuktas
86
+ self.nasals_mode=nasals_mode
87
+ self.do_normalize_chandras=do_normalize_chandras
88
+ self.do_normalize_vowel_ending=do_normalize_vowel_ending
89
+
90
+ self._init_normalize_chandras()
91
+ self._init_normalize_nasals()
92
+ self._init_normalize_vowel_ending()
93
+ #self._init_visarga_correction()
94
+
95
+ def _init_normalize_vowel_ending(self):
96
+
97
+ if self.lang in langinfo.IE_LANGUAGES:
98
+ self.fn_vowel_ending=self._normalize_word_vowel_ending_ie
99
+ elif self.lang in langinfo.DRAVIDIAN_LANGUAGES:
100
+ self.fn_vowel_ending=self._normalize_word_vowel_ending_dravidian
101
+ else:
102
+ self.fn_vowel_ending=lambda x: x
103
+
104
+ def _init_normalize_chandras(self):
105
+
106
+ substitution_offsets =\
107
+ [
108
+ [0x0d , 0x0f], # chandra e, independent
109
+ [0x11 , 0x13], # chandra o, independent
110
+ [0x45 , 0x47], # chandra e , 0xde],pendent
111
+ [0x49 , 0x4b], # chandra o , 0xde],pendent
112
+ # [0x72 , 0x0f], # mr: chandra e, independent
113
+
114
+ [0x00 , 0x02], # chandrabindu
115
+ [0x01 , 0x02], # chandrabindu
116
+ ]
117
+
118
+ self.chandra_substitutions = [
119
+ (langinfo.offset_to_char(x[0],self.lang), langinfo.offset_to_char(x[1],self.lang))
120
+ for x in substitution_offsets ]
121
+
122
+ def _normalize_chandras(self,text):
123
+ for match, repl in self.chandra_substitutions:
124
+ text=text.replace(match,repl)
125
+ return text
126
+
127
+ def _init_to_anusvaara_strict(self):
128
+ """
129
+ `r1_nasal=re.compile(r'\\u0919\\u094D([\\u0915-\\u0918])')`
130
+ """
131
+
132
+ pat_signatures=\
133
+ [
134
+ [0x19,0x15,0x18],
135
+ [0x1e,0x1a,0x1d],
136
+ [0x23,0x1f,0x22],
137
+ [0x28,0x24,0x27],
138
+ [0x29,0x24,0x27],
139
+ [0x2e,0x2a,0x2d],
140
+ ]
141
+
142
+ halant_offset=0x4d
143
+ anusvaara_offset=0x02
144
+
145
+ pats=[]
146
+
147
+ for pat_signature in pat_signatures:
148
+ pat=re.compile(r'{nasal}{halant}([{start_r}-{end_r}])'.format(
149
+ nasal=langinfo.offset_to_char(pat_signature[0],self.lang),
150
+ halant=langinfo.offset_to_char(halant_offset,self.lang),
151
+ start_r=langinfo.offset_to_char(pat_signature[1],self.lang),
152
+ end_r=langinfo.offset_to_char(pat_signature[2],self.lang),
153
+ ))
154
+ pats.append(pat)
155
+
156
+ repl_string='{anusvaara}\\1'.format(anusvaara=langinfo.offset_to_char(anusvaara_offset,self.lang))
157
+
158
+ self.pats_repls=(pats,repl_string)
159
+
160
+ def _to_anusvaara_strict(self,text):
161
+
162
+ pats, repl_string = self.pats_repls
163
+ for pat in pats:
164
+ text=pat.sub(repl_string,text)
165
+
166
+ return text
167
+
168
+ def _init_to_anusvaara_relaxed(self):
169
+ """
170
+ `r1_nasal=re.compile(r'\\u0919\\u094D([\\u0915-\\u0918])')`
171
+ """
172
+
173
+ nasals_list=[0x19,0x1e,0x23,0x28,0x29,0x2e]
174
+ nasals_list_str=','.join([langinfo.offset_to_char(x,self.lang) for x in nasals_list])
175
+
176
+ halant_offset=0x4d
177
+ anusvaara_offset=0x02
178
+
179
+ pat=re.compile(r'[{nasals_list_str}]{halant}'.format(
180
+ nasals_list_str=nasals_list_str,
181
+ halant=langinfo.offset_to_char(halant_offset,self.lang),
182
+ ))
183
+
184
+ repl_string='{anusvaara}'.format(anusvaara=langinfo.offset_to_char(anusvaara_offset,self.lang))
185
+
186
+ self.pats_repls = (pat,repl_string)
187
+
188
+ def _to_anusvaara_relaxed(self,text):
189
+ pat, repl_string = self.pats_repls
190
+ return pat.sub(repl_string,text)
191
+
192
+
193
+ def _init_to_nasal_consonants(self):
194
+ """
195
+ `r1_nasal=re.compile(r'\\u0919\\u094D([\\u0915-\\u0918])')`
196
+ """
197
+
198
+ pat_signatures=\
199
+ [
200
+ [0x19,0x15,0x18],
201
+ [0x1e,0x1a,0x1d],
202
+ [0x23,0x1f,0x22],
203
+ [0x28,0x24,0x27],
204
+ [0x29,0x24,0x27],
205
+ [0x2e,0x2a,0x2d],
206
+ ]
207
+
208
+ halant_offset=0x4d
209
+ anusvaara_offset=0x02
210
+
211
+ pats=[]
212
+ repl_strings=[]
213
+
214
+ for pat_signature in pat_signatures:
215
+ pat=re.compile(r'{anusvaara}([{start_r}-{end_r}])'.format(
216
+ anusvaara=langinfo.offset_to_char(anusvaara_offset,self.lang),
217
+ start_r=langinfo.offset_to_char(pat_signature[1],self.lang),
218
+ end_r=langinfo.offset_to_char(pat_signature[2],self.lang),
219
+ ))
220
+ pats.append(pat)
221
+ repl_string='{nasal}{halant}\\1'.format(
222
+ nasal=langinfo.offset_to_char(pat_signature[0],self.lang),
223
+ halant=langinfo.offset_to_char(halant_offset,self.lang),
224
+ )
225
+ repl_strings.append(repl_string)
226
+
227
+ self.pats_repls=list(zip(pats,repl_strings))
228
+
229
+ def _to_nasal_consonants(self,text):
230
+
231
+ for pat, repl in self.pats_repls:
232
+ text=pat.sub(repl,text)
233
+
234
+ return text
235
+
236
+ def _init_normalize_nasals(self):
237
+
238
+ if self.nasals_mode == 'to_anusvaara_strict':
239
+ self._init_to_anusvaara_strict()
240
+ elif self.nasals_mode == 'to_anusvaara_relaxed':
241
+ self._init_to_anusvaara_relaxed()
242
+ elif self.nasals_mode == 'to_nasal_consonants':
243
+ self._init_to_nasal_consonants()
244
+
245
+ def _normalize_nasals(self,text):
246
+ if self.nasals_mode == 'to_anusvaara_strict':
247
+ return self._to_anusvaara_strict(text)
248
+ elif self.nasals_mode == 'to_anusvaara_relaxed':
249
+ return self._to_anusvaara_relaxed(text)
250
+ elif self.nasals_mode == 'to_nasal_consonants':
251
+ return self._to_nasal_consonants(text)
252
+ else:
253
+ return text
254
+
255
+
256
+ def _normalize_word_vowel_ending_dravidian(self,word):
257
+ """
258
+ for Dravidian
259
+ - consonant ending: add 'a' ki maatra
260
+ - halant ending: no change
261
+ - 'a' ki maatra: no change
262
+ """
263
+ if len(word)>0 and langinfo.is_consonant(word[-1],self.lang):
264
+ return word+langinfo.offset_to_char(0x3e,self.lang)
265
+ else:
266
+ return word
267
+
268
+ def _normalize_word_vowel_ending_ie(self,word):
269
+ """
270
+ for IE
271
+ - consonant ending: add halant
272
+ - halant ending: no change
273
+ - 'a' ki maatra: no change
274
+ """
275
+ if len(word)>0 and langinfo.is_consonant(word[-1],self.lang):
276
+ return word+langinfo.offset_to_char(langinfo.HALANTA_OFFSET,self.lang)
277
+ else:
278
+ return word
279
+
280
+ def _normalize_vowel_ending(self,text):
281
+ return ' '.join([ self.fn_vowel_ending(w) for w in text.split(' ') ])
282
+
283
def normalize(self,text):
    """
    Normalization shared by all scripts.

    Removes invisible formatting characters, maps special space characters to
    a plain space, normalizes punctuation and then applies the optional
    chandra, nasal and vowel-ending normalizations configured on this object.
    """
    # characters that are dropped outright
    for invisible in (NormalizerI.BYTE_ORDER_MARK,
                      NormalizerI.BYTE_ORDER_MARK_2,
                      NormalizerI.WORD_JOINER,
                      NormalizerI.SOFT_HYPHEN):
        text=text.replace(invisible,'')

    # characters that become an ordinary space
    for spacing in (NormalizerI.ZERO_WIDTH_SPACE,
                    NormalizerI.NO_BREAK_SPACE):
        text=text.replace(spacing,' ')

    # joiner controls are dropped as well
    for joiner in (NormalizerI.ZERO_WIDTH_NON_JOINER,
                   NormalizerI.ZERO_WIDTH_JOINER):
        text=text.replace(joiner,'')

    text=self._normalize_punctuations(text)

    if self.do_normalize_chandras:
        text=self._normalize_chandras(text)

    # nasal handling decides for itself whether there is anything to do
    text=self._normalize_nasals(text)

    if self.do_normalize_vowel_ending:
        text=self._normalize_vowel_ending(text)

    return text
307
+
308
+
309
def get_char_stats(self,text):
    """
    Print, one number per line, how many times each of the special characters
    handled by the common normalization occurs in text.
    """
    tracked_chars=(
        NormalizerI.BYTE_ORDER_MARK,
        NormalizerI.BYTE_ORDER_MARK_2,
        NormalizerI.WORD_JOINER,
        NormalizerI.SOFT_HYPHEN,
        NormalizerI.ZERO_WIDTH_SPACE,
        NormalizerI.NO_BREAK_SPACE,
        NormalizerI.ZERO_WIDTH_NON_JOINER,
        NormalizerI.ZERO_WIDTH_JOINER,
    )
    for special_char in tracked_chars:
        print(len(re.findall(special_char,text)))
320
+
321
+ #for mobj in re.finditer(NormalizerI.ZERO_WIDTH_NON_JOINER,text):
322
+ # print text[mobj.start()-10:mobj.end()+10].replace('\n', ' ').replace(NormalizerI.ZERO_WIDTH_NON_JOINER,'').encode('utf-8')
323
+ #print hex(ord(text[mobj.end():mobj.end()+1]))
324
+
325
def correct_visarga(self,text,visarga_char,char_range):
    """
    Replace a colon ':' that directly follows a character of the script by the
    script's visarga character.

    Parameters:
    |text: string to correct
    |visarga_char: visarga character to substitute for the colon
    |char_range: body of a regular expression character class describing the
    |            script's characters, e.g. the range U+0900-U+097F written as
    |            'first-last' (the caller must escape any regex metacharacters)

    Returns the corrected string. (Previously the result was computed for a
    hard-coded Devanagari range and then discarded, so callers received None.)
    """
    # a callable replacement keeps visarga_char literal even if it contains
    # characters that are special in a regex replacement template
    return re.sub('(['+char_range+']):',
                  lambda match: match.group(1)+visarga_char,
                  text)
327
+
328
+
329
+
330
class DevanagariNormalizer(BaseNormalizer):
    """
    Normalizer for the Devanagari script. On top of the common normalization
    done by the super class it

    * replaces U+0972 by U+090F (chandra a replacement for Marathi)
    * decomposes the precomposed nukta letters into base letter + nukta
    * optionally removes all nuktas
    * replaces the pipe character '|' by the poorna virama character
    * replaces a colon ':' by the visarga if the colon follows a character in this script
    """

    NUKTA='\u093C'

    # (precomposed letter, base letter); decomposition appends NUKTA to the base letter
    _NUKTA_LETTERS=(
        ('\u0929','\u0928'),
        ('\u0931','\u0930'),
        ('\u0934','\u0933'),
        ('\u0958','\u0915'),
        ('\u0959','\u0916'),
        ('\u095A','\u0917'),
        ('\u095B','\u091C'),
        ('\u095C','\u0921'),
        ('\u095D','\u0922'),
        ('\u095E','\u092B'),
        ('\u095F','\u092F'),
    )

    def __init__(self,lang='hi',remove_nuktas=False,nasals_mode='do_nothing',
            do_normalize_chandras=False,do_normalize_vowel_ending=False):
        super(DevanagariNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)

    def normalize(self,text):
        """Return text normalized for the Devanagari script."""

        # common normalization for Indic scripts
        normalized=super(DevanagariNormalizer,self).normalize(text)

        # chandra a replacement for Marathi
        normalized=normalized.replace('\u0972','\u090f')

        # decomposing Nukta based composite characters
        for composed, base in DevanagariNormalizer._NUKTA_LETTERS:
            normalized=normalized.replace(composed,base+DevanagariNormalizer.NUKTA)

        if self.remove_nuktas:
            normalized=normalized.replace(DevanagariNormalizer.NUKTA,'')

        # replace pipe character for poorna virama
        normalized=normalized.replace('\u007c','\u0964')

        # correct visarga
        return re.sub(r'([\u0900-\u097f]):','\\1\u0903',normalized)

    def get_char_stats(self,text):
        """Print the common statistics followed by one count per precomposed nukta letter."""
        super(DevanagariNormalizer,self).get_char_stats(text)

        for composed, _ in DevanagariNormalizer._NUKTA_LETTERS:
            print((len(re.findall(composed,text))))
392
+
393
+ #print(len(re.findall(u'\u0928'+DevanagariNormalizer.NUKTA,text)))
394
+ #print(len(re.findall(u'\u0930'+DevanagariNormalizer.NUKTA,text)))
395
+ #print(len(re.findall(u'\u0933'+DevanagariNormalizer.NUKTA,text)))
396
+ #print(len(re.findall(u'\u0915'+DevanagariNormalizer.NUKTA,text)))
397
+ #print(len(re.findall(u'\u0916'+DevanagariNormalizer.NUKTA,text)))
398
+ #print(len(re.findall(u'\u0917'+DevanagariNormalizer.NUKTA,text)))
399
+ #print(len(re.findall(u'\u091C'+DevanagariNormalizer.NUKTA,text)))
400
+ #print(len(re.findall(u'\u0921'+DevanagariNormalizer.NUKTA,text)))
401
+ #print(len(re.findall(u'\u0922'+DevanagariNormalizer.NUKTA,text)))
402
+ #print(len(re.findall(u'\u092B'+DevanagariNormalizer.NUKTA,text)))
403
+ #print(len(re.findall(u'\u092F'+DevanagariNormalizer.NUKTA,text)))
404
+
405
class GurmukhiNormalizer(BaseNormalizer):
    """
    Normalizer for the Gurmukhi script. On top of the common normalization
    done by the super class it

    * optionally canonicalizes addak and tippi
    * canonicalizes the independent vowels written with the vowel carriers
    * decomposes the precomposed nukta letters into base letter + nukta
    * replaces the reserved poorna virama code points (if used) by the generic Indic ones
    * replaces the pipe character '|' by the poorna virama character
    * replaces a colon ':' by the visarga if the colon follows a character in this script
    """

    NUKTA='\u0A3C'

    VOWEL_NORM_MAPS={
        ## http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
        ## Table 12-16
        '\u0a05\u0a3e': '\u0a06',
        '\u0a72\u0a3f': '\u0a07',
        '\u0a72\u0a40': '\u0a08',
        '\u0a73\u0a41': '\u0a09',
        '\u0a73\u0a42': '\u0a0a',
        '\u0a72\u0a47': '\u0a0f',
        '\u0a05\u0a48': '\u0a10',
        '\u0a73\u0a4b': '\u0a13',
        '\u0a05\u0a4c': '\u0a14',
    }

    # (precomposed letter, base letter); decomposition appends NUKTA to the base letter
    _NUKTA_LETTERS=(
        ('\u0a33','\u0a32'),
        ('\u0a36','\u0a38'),
        ('\u0a59','\u0a16'),
        ('\u0a5a','\u0a17'),
        ('\u0a5b','\u0a1c'),
        ('\u0a5e','\u0a2b'),
    )

    def __init__(self,lang='pa',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
                do_normalize_vowel_ending=False,
                do_canonicalize_addak=False,
                do_canonicalize_tippi=False,
                do_replace_vowel_bases=False):
        super(GurmukhiNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
        self.do_canonicalize_addak=do_canonicalize_addak
        self.do_canonicalize_tippi=do_canonicalize_tippi
        self.do_replace_vowel_bases=do_replace_vowel_bases

    def _normalize_vowels(self,text):
        """
        Replace vowel-carrier + vowel-sign sequences by the independent vowel
        (Table 12-16 of the Unicode 12.1 core specification, chapter 12) and,
        when do_replace_vowel_bases is set, replace the remaining bare
        carriers U+0A72 / U+0A73 by U+0A07 / U+0A09.
        """
        for sequence, vowel in GurmukhiNormalizer.VOWEL_NORM_MAPS.items():
            text=text.replace(sequence,vowel)

        ## carriers left over at this point have no vowel sign after them that
        ## the table covers; map them to the closest equivalent vowels on request
        if self.do_replace_vowel_bases:
            for carrier, vowel in (('\u0a72','\u0a07'),('\u0a73','\u0a09')):
                text=text.replace(carrier,vowel)

        return text

    def normalize(self,text):
        """Return text normalized for the Gurmukhi script."""

        # Addak: replace addak+consonant with consonant+halant+consonant
        if self.do_canonicalize_addak:
            text=re.sub(r'\u0a71(.)','\\1\u0a4d\\1',text)

        # Tippi
        if self.do_canonicalize_tippi:
            text=text.replace('\u0a70','\u0a02')

        # Vowels: Gurmukhi has multiple ways of representing independent vowels
        # due to the characters 'iri' and 'ura'.
        text=self._normalize_vowels(text)

        # common normalization for Indic scripts
        text=super(GurmukhiNormalizer,self).normalize(text)

        # decomposing Nukta based composite characters
        for composed, base in GurmukhiNormalizer._NUKTA_LETTERS:
            text=text.replace(composed,base+GurmukhiNormalizer.NUKTA)

        if self.remove_nuktas:
            text=text.replace(GurmukhiNormalizer.NUKTA,'')

        # script specific poorna virama codes -> generic Indic codes,
        # followed by the pipe character -> poorna virama
        for source, target in (('\u0a64','\u0964'),
                               ('\u0a65','\u0965'),
                               ('\u007c','\u0964')):
            text=text.replace(source,target)

        # correct visarga
        return re.sub(r'([\u0a00-\u0a7f]):','\\1\u0a03',text)
509
+
510
+
511
class GujaratiNormalizer(BaseNormalizer):
    """
    Normalizer for the Gujarati script. On top of the common normalization
    done by the super class it

    * optionally removes all nuktas
    * replaces the reserved poorna virama code points (if used) by the generic Indic ones
    * replaces a colon ':' by the visarga if the colon follows a character in this script
    """

    NUKTA='\u0ABC'

    def __init__(self,lang='gu',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
                    do_normalize_vowel_ending=False):
        super(GujaratiNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)

    def normalize(self,text):
        """Return text normalized for the Gujarati script."""

        # common normalization for Indic scripts
        normalized=super(GujaratiNormalizer,self).normalize(text)

        if self.remove_nuktas:
            normalized=normalized.replace(GujaratiNormalizer.NUKTA,'')

        # script specific poorna virama codes -> generic Indic codes
        for reserved, generic in (('\u0ae4','\u0964'),('\u0ae5','\u0965')):
            normalized=normalized.replace(reserved,generic)

        # correct visarga
        return re.sub(r'([\u0a80-\u0aff]):','\\1\u0a83',normalized)
544
+
545
+
546
class OriyaNormalizer(BaseNormalizer):
    """
    Normalizer for the Oriya script. In addition to basic normalization by the super class,

    * Replaces the composite characters containing nuktas by their decomposed form
    * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
    * Canonicalize two part dependent vowels
    * Replace 'va' with 'ba' (and, if do_remap_wa is set, 'wa' with 'ba')
    * replace pipe character '|' by poorna virama character
    * replace colon ':' by visarga if the colon follows a character in this script
    """

    NUKTA='\u0B3C'

    VOWEL_NORM_MAPS={
        ## See Table 12-22 in http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
        '\u0b05\u0b3e': '\u0b06',
        '\u0b0f\u0b57': '\u0b10',
        '\u0b13\u0b57': '\u0b14',
    }


    def __init__(self,lang='or',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
                do_normalize_vowel_ending=False,
                do_remap_wa=False):
        super(OriyaNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
        self.do_remap_wa=do_remap_wa

    def normalize(self,text):
        """Return text normalized for the Oriya script."""

        # common normalization for Indic scripts
        text=super(OriyaNormalizer,self).normalize(text)

        ## standard vowel replacements as per suggestions in Unicode documents
        for k,v in OriyaNormalizer.VOWEL_NORM_MAPS.items():
            text=text.replace(k,v)

        # decomposing Nukta based composite characters
        text=text.replace('\u0b5c','\u0b21'+OriyaNormalizer.NUKTA)
        text=text.replace('\u0b5d','\u0b22'+OriyaNormalizer.NUKTA)

        if self.remove_nuktas:
            text=text.replace(OriyaNormalizer.NUKTA,'')

        # replace the poorna virama codes specific to script
        # with generic Indic script codes
        text=text.replace('\u0b64','\u0964')
        text=text.replace('\u0b65','\u0965')

        # replace pipe character for poorna virama
        # (the pipe is U+007C; the previous code looked for U+0B7C and so never matched '|')
        text=text.replace('\u007c','\u0964')

        # replace wa with ba
        if self.do_remap_wa:
            text=text.replace('\u0b71','\u0b2c')

        # replace va with ba
        # NOTE: documentation (chapter on Indic scripts) and codepoint chart seem contradictory
        # (this applied to wa to ba rule also above)
        text=text.replace('\u0b35','\u0b2c')

        # AI dependent vowel sign: U+0B47 U+0B56 is the two-part form of U+0B48
        # (the previous target U+0B58 is not an assigned Oriya character)
        text=text.replace('\u0b47\u0b56','\u0b48')

        # two part dependent vowels
        text=text.replace('\u0b47\u0b3e','\u0b4b')
        text=text.replace('\u0b47\u0b57','\u0b4c')

        # additional consonant - not clear how to handle this
        # ignore

        # correct visarga
        text=re.sub(r'([\u0b00-\u0b7f]):','\\1\u0b03',text)

        return text
622
+
623
+
624
class BengaliNormalizer(BaseNormalizer):
    """
    Normalizer for the Bengali script. On top of the common normalization
    done by the super class it

    * decomposes the precomposed nukta letters into base letter + nukta
    * optionally remaps the Assamese characters U+09F0 / U+09F1 (only when lang is 'as')
    * replaces the reserved poorna virama code points (if used) by the generic Indic ones
    * replaces the pipe character '|' and U+09F7 by the poorna virama character
    * canonicalizes two part dependent vowels
    * replaces a colon ':' by the visarga if the colon follows a character in this script
    """

    NUKTA='\u09BC'

    # (precomposed letter, base letter); decomposition appends NUKTA to the base letter
    _NUKTA_LETTERS=(
        ('\u09dc','\u09a1'),
        ('\u09dd','\u09a2'),
        ('\u09df','\u09af'),
    )

    def __init__(self,lang='bn',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
                    do_normalize_vowel_ending=False,
                    do_remap_assamese_chars=False):
        super(BengaliNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
        self.do_remap_assamese_chars=do_remap_assamese_chars

    def normalize(self,text):
        """Return text normalized for the Bengali script."""

        # common normalization for Indic scripts
        normalized=super(BengaliNormalizer,self).normalize(text)

        # decomposing Nukta based composite characters
        for composed, base in BengaliNormalizer._NUKTA_LETTERS:
            normalized=normalized.replace(composed,base+BengaliNormalizer.NUKTA)

        if self.remove_nuktas:
            normalized=normalized.replace(BengaliNormalizer.NUKTA,'')

        if self.do_remap_assamese_chars and self.lang=='as':
            normalized=normalized.replace('\u09f0','\u09b0')  # 'ra' character
            normalized=normalized.replace('\u09f1','\u09ac')  # 'va' character

        simple_replacements=(
            # script specific poorna virama codes -> generic Indic codes
            ('\u09e4','\u0964'),
            ('\u09e5','\u0965'),
            # pipe character -> poorna virama
            ('\u007c','\u0964'),
            # bengali currency numerator four looks similar to the poorna
            # virama and is used as a substitute for it
            ('\u09f7','\u0964'),
            # two part dependent vowels
            ('\u09c7\u09be','\u09cb'),
            ('\u09c7\u09d7','\u09cc'),
        )
        for source, target in simple_replacements:
            normalized=normalized.replace(source,target)

        # correct visarga
        return re.sub(r'([\u0980-\u09ff]):','\\1\u0983',normalized)
679
+
680
+
681
class TamilNormalizer(BaseNormalizer):
    """
    Normalizer for the Tamil script. On top of the common normalization
    done by the super class it

    * replaces the reserved poorna virama code points (if used) by the generic Indic ones
    * canonicalizes two-part vowels and dependent vowel signs
    * replaces a colon ':' by U+0B83 if the colon follows a character in this script
    """

    def __init__(self,lang='ta',remove_nuktas=False,nasals_mode='do_nothing',
            do_normalize_chandras=False,do_normalize_vowel_ending=False):
        super(TamilNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)

    def normalize(self,text):
        """Return text normalized for the Tamil script."""

        # common normalization for Indic scripts
        normalized=super(TamilNormalizer,self).normalize(text)

        simple_replacements=(
            # script specific poorna virama codes -> generic Indic codes
            ('\u0be4','\u0964'),
            ('\u0be5','\u0965'),
            # two part vowels
            ('\u0b92\u0bd7','\u0b94'),
            ('\u0bc6\u0bbe','\u0bca'),
            ('\u0bc7\u0bbe','\u0bcb'),
            ('\u0bc6\u0bd7','\u0bcc'),
        )
        for source, target in simple_replacements:
            normalized=normalized.replace(source,target)

        # correct visarga
        return re.sub(r'([\u0b80-\u0bff]):','\\1\u0b83',normalized)
714
+
715
+
716
class TeluguNormalizer(BaseNormalizer):
    """
    Normalizer for the Telugu script. On top of the common normalization
    done by the super class it

    * replaces the reserved poorna virama code points (if used) by the generic Indic ones
    * canonicalizes the two-part dependent vowel sign
    * replaces a colon ':' by the visarga if the colon follows a character in this script
    """

    def __init__(self,lang='te',remove_nuktas=False,nasals_mode='do_nothing',
                do_normalize_chandras=False,do_normalize_vowel_ending=False):
        super(TeluguNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)

    def normalize(self,text):
        """Return text normalized for the Telugu script."""

        # common normalization for Indic scripts
        normalized=super(TeluguNormalizer,self).normalize(text)

        simple_replacements=(
            # script specific poorna virama codes -> generic Indic codes
            ('\u0c64','\u0964'),
            ('\u0c65','\u0965'),
            # dependent vowels
            ('\u0c46\u0c56','\u0c48'),
        )
        for source, target in simple_replacements:
            normalized=normalized.replace(source,target)

        # correct visarga
        return re.sub(r'([\u0c00-\u0c7f]):','\\1\u0c03',normalized)

    def get_char_stats(self,text):
        """Deliberately prints nothing for Telugu."""
        pass
749
+
750
class KannadaNormalizer(BaseNormalizer):
    """
    Normalizer for the Kannada script. On top of the common normalization
    done by the super class it

    * replaces the reserved poorna virama code points (if used) by the generic Indic ones
    * canonicalizes two-part dependent vowel signs
    * replaces a colon ':' by the visarga if the colon follows a character in this script
    """

    def __init__(self,lang='kn',remove_nuktas=False,nasals_mode='do_nothing',
            do_normalize_chandras=False,do_normalize_vowel_ending=False):
        super(KannadaNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)

    def normalize(self,text):
        """Return text normalized for the Kannada script."""

        # common normalization for Indic scripts
        normalized=super(KannadaNormalizer,self).normalize(text)

        # The order matters: U+0CCA produced by the fourth vowel rule is
        # consumed by the fifth one.
        ordered_replacements=(
            # script specific poorna virama codes -> generic Indic codes
            ('\u0ce4','\u0964'),
            ('\u0ce5','\u0965'),
            # dependent vowels
            ('\u0cbf\u0cd5','\u0cc0'),
            ('\u0cc6\u0cd5','\u0cc7'),
            ('\u0cc6\u0cd6','\u0cc8'),
            ('\u0cc6\u0cc2','\u0cca'),
            ('\u0cca\u0cd5','\u0ccb'),
        )
        for source, target in ordered_replacements:
            normalized=normalized.replace(source,target)

        # correct visarga
        return re.sub(r'([\u0c80-\u0cff]):','\\1\u0c83',normalized)
785
+
786
+
787
class MalayalamNormalizer(BaseNormalizer):
    """
    Normalizer for the Malayalam script. On top of the common normalization
    done by the super class it

    * changes the old encoding of chillus (till Unicode 5.0) to the new encoding
    * optionally rewrites chillus as base consonant + U+0D4D
    * replaces the reserved poorna virama code points (if used) by the generic Indic ones
    * canonicalizes two-part dependent vowel signs
    * optionally corrects the geminated T
    * replaces a colon ':' by the visarga if the colon follows a character in this script
    """

    # chillu character -> base consonant
    CHILLU_CHAR_MAP= {
                '\u0d7a': '\u0d23',
                '\u0d7b': '\u0d28',
                '\u0d7c': '\u0d30',
                '\u0d7d': '\u0d32',
                '\u0d7e': '\u0d33',
                '\u0d7f': '\u0d15',
            }

    def _canonicalize_chillus(self,text):
        """Replace every chillu character by its base consonant followed by U+0D4D."""
        for chillu, consonant in MalayalamNormalizer.CHILLU_CHAR_MAP.items():
            text=text.replace(chillu,consonant+'\u0d4d')
        return text

    def _correct_geminated_T(self,text):
        """Replace the sequence U+0D31 U+0D4D U+0D31 by U+0D1F U+0D4D U+0D1F."""
        wrong, right = '\u0d31\u0d4d\u0d31', '\u0d1f\u0d4d\u0d1f'
        return text.replace(wrong,right)

    def __init__(self,lang='ml',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
                do_normalize_vowel_ending=False,
                do_canonicalize_chillus=False, do_correct_geminated_T=False):
        super(MalayalamNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
        self.do_canonicalize_chillus=do_canonicalize_chillus
        self.do_correct_geminated_T=do_correct_geminated_T

    def normalize(self,text):
        """Return text normalized for the Malayalam script."""

        # Change from old encoding of chillus (till Unicode 5.0) to new encoding.
        # This has to happen before the common normalization, which strips the
        # zero width joiner that marks the old encoding.
        for old_sequence, chillu in (('\u0d23\u0d4d\u200d','\u0d7a'),
                                     ('\u0d28\u0d4d\u200d','\u0d7b'),
                                     ('\u0d30\u0d4d\u200d','\u0d7c'),
                                     ('\u0d32\u0d4d\u200d','\u0d7d'),
                                     ('\u0d33\u0d4d\u200d','\u0d7e'),
                                     ('\u0d15\u0d4d\u200d','\u0d7f')):
            text=text.replace(old_sequence,chillu)

        # Normalize chillus
        if self.do_canonicalize_chillus:
            text=self._canonicalize_chillus(text)

        # common normalization for Indic scripts
        text=super(MalayalamNormalizer,self).normalize(text)

        ordered_replacements=(
            # script specific poorna virama codes -> generic Indic codes
            ('\u0d64','\u0964'),
            ('\u0d65','\u0965'),
            # dependent vowels
            ('\u0d46\u0d3e','\u0d4a'),
            ('\u0d47\u0d3e','\u0d4b'),
            # au forms: first the two-part form, then any remaining U+0D57
            ('\u0d46\u0d57','\u0d4c'),
            ('\u0d57','\u0d4c'),
        )
        for source, target in ordered_replacements:
            text=text.replace(source,target)

        # correct geminated T
        if self.do_correct_geminated_T:
            text=self._correct_geminated_T(text)

        # correct visarga
        return re.sub(r'([\u0d00-\u0d7f]):','\\1\u0d03',text)
859
+
860
class UrduNormalizer(NormalizerI):
    '''Uses UrduHack library.
    https://docs.urduhack.com/en/stable/_modules/urduhack/normalization/character.html#normalize
    '''

    def __init__(self, lang, remove_nuktas=True):
        self.lang = lang
        self.remove_nuktas = remove_nuktas

    # The urduhack helpers are imported in the class body so that they are
    # reachable as UrduNormalizer.<name> from normalize().
    from urduhack.normalization import (
        remove_diacritics,
        normalize_characters,
        normalize_combine_characters
    ) # TODO: Use only required normalizers
    from urduhack.preprocessing import (
        normalize_whitespace,
        digits_space,
        all_punctuations_space,
        english_characters_space
    )

    def normalize(self, text):
        '''Normalize punctuation, then run the urduhack steps in a fixed order.

        Diacritics are removed only when remove_nuktas is set.
        '''
        steps = [UrduNormalizer.normalize_whitespace]
        if self.remove_nuktas:
            steps.append(UrduNormalizer.remove_diacritics)
        steps.extend([
            UrduNormalizer.normalize_characters,
            UrduNormalizer.normalize_combine_characters,
            UrduNormalizer.digits_space,
            UrduNormalizer.all_punctuations_space,
            UrduNormalizer.english_characters_space,
        ])

        normalized = self._normalize_punctuations(text)
        for step in steps:
            normalized = step(normalized)
        return normalized
892
+
893
+
894
class IndicNormalizerFactory(object):
    """
    Factory class to create language specific normalizers.

    """

    # language codes handled by the Devanagari normalizer
    _DEVANAGARI_LANGUAGES=('hi','mr','sa','kK','ne','sd')

    # every language code that has a dedicated normalizer
    _SUPPORTED_LANGUAGES=_DEVANAGARI_LANGUAGES+(
            'ur',
            'pa',
            'gu',
            'bn','as',
            'or',
            'ml',
            'kn',
            'ta',
            'te')

    # the only keyword options UrduNormalizer.__init__ accepts
    _URDU_OPTIONS=('remove_nuktas',)

    def get_normalizer(self,language,**kwargs):
        """
            Call the get_normalizer function to get the language specific normalizer

            Parameters:
            |language: language code
            |remove_nuktas: boolean, should the normalizer remove nukta characters
            |any further keyword arguments are passed on to the normalizer's constructor

            Returns a normalizer object; languages without a dedicated
            normalizer get a BaseNormalizer.
        """
        if language in IndicNormalizerFactory._DEVANAGARI_LANGUAGES:
            return DevanagariNormalizer(lang=language, **kwargs)

        if language == 'ur':
            # UrduNormalizer takes none of the Indic-script options such as
            # nasals_mode; forwarding them raised a TypeError, so only the
            # options it understands are passed on.
            urdu_kwargs={name: value for name, value in kwargs.items()
                         if name in IndicNormalizerFactory._URDU_OPTIONS}
            return UrduNormalizer(lang=language, **urdu_kwargs)

        # resolved here rather than at class level so the factory class itself
        # does not depend on definition order
        normalizer_classes={
            'pa': GurmukhiNormalizer,
            'gu': GujaratiNormalizer,
            'bn': BengaliNormalizer,
            'as': BengaliNormalizer,
            'or': OriyaNormalizer,
            'ml': MalayalamNormalizer,
            'kn': KannadaNormalizer,
            'ta': TamilNormalizer,
            'te': TeluguNormalizer,
        }
        normalizer_class=normalizer_classes.get(language,BaseNormalizer)
        return normalizer_class(lang=language, **kwargs)

    def is_language_supported(self,language):
        """
        Is the language supported?
        """
        return language in IndicNormalizerFactory._SUPPORTED_LANGUAGES
953
+
954
+
955
if __name__ == '__main__':

    if len(sys.argv)<4:
        print("Usage: python normalize.py <infile> <outfile> <language> [<replace_nukta(True,False)>] [<normalize_nasals(do_nothing|to_anusvaara_strict|to_anusvaara_relaxed|to_nasal_consonants)>]")
        sys.exit(1)

    language=sys.argv[3]
    remove_nuktas=False
    normalize_nasals='do_nothing'
    if len(sys.argv)>=5:
        # bool() of any non-empty string is True, so "False" used to switch
        # nukta removal ON; the flag text is parsed explicitly instead
        remove_nuktas=sys.argv[4].strip().lower() in ('true','1','yes')
    if len(sys.argv)>=6:
        normalize_nasals=sys.argv[5]

    # create normalizer
    factory=IndicNormalizerFactory()
    normalizer=factory.get_normalizer(language,remove_nuktas=remove_nuktas,nasals_mode=normalize_nasals)

    # DO normalization, streaming line by line instead of loading the whole file
    with codecs.open(sys.argv[1],'r','utf-8') as ifile:
        with codecs.open(sys.argv[2],'w','utf-8') as ofile:
            for line in ifile:
                ofile.write(normalizer.normalize(line))
979
+
980
+ ## gather status about normalization
981
+ #with codecs.open(sys.argv[1],'r','utf-8') as ifile:
982
+ # normalizer=DevanagariNormalizer()
983
+ # text=string.join(ifile.readlines(),sep='')
984
+ # normalizer.get_char_stats(text)
indic_nlp_library/indicnlp/script/__init__.py ADDED
File without changes
indic_nlp_library/indicnlp/script/english_script.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ import pandas as pd
10
+ import numpy as np
11
+
12
+ from indicnlp import common
13
+ from indicnlp.common import IndicNlpException
14
+
15
+
16
+ #### Maps from ARPABET to Internal Id
17
+ ARPABET_ID_MAP={}
18
+ ID_ARPABET_MAP={}
19
+
20
+
21
+ ###
22
+ # Phonetic Information about script characters
23
+ ###
24
+
25
+ """ Phonetic data for English """
26
+ ENGLISH_PHONETIC_DATA=None
27
+
28
+ """ Phonetic vector for English"""
29
+ ENGLISH_PHONETIC_VECTORS=None
30
+
31
+ """ Length of phonetic vector """
32
+ PHONETIC_VECTOR_LENGTH=38
33
+
34
+ """ Start offset for the phonetic feature vector in the phonetic data vector """
35
+ PHONETIC_VECTOR_START_OFFSET=6
36
+
37
+ ## PHONETIC PROPERTIES in order in which they occur in the vector
38
+ ## This list must be in sync with the keys in the PV_PROP_RANGES dictionary
39
+ PV_PROP=['basic_type',
40
+ 'vowel_length',
41
+ 'vowel_strength',
42
+ 'vowel_status',
43
+ 'consonant_type',
44
+ 'articulation_place',
45
+ 'aspiration',
46
+ 'voicing',
47
+ 'nasalization',
48
+ 'vowel_horizontal',
49
+ 'vowel_vertical',
50
+ 'vowel_roundness',
51
+ ]
52
+
53
+ ###
54
+ # Bit vector ranges for various properties
55
+ ###
56
+
57
+ PV_PROP_RANGES={
58
+ 'basic_type': [0,6],
59
+ 'vowel_length': [6,8],
60
+ 'vowel_strength': [8,11],
61
+ 'vowel_status': [11,13],
62
+ 'consonant_type': [13,18],
63
+ 'articulation_place': [18,23],
64
+ 'aspiration': [23,25],
65
+ 'voicing': [25,27],
66
+ 'nasalization': [27,29],
67
+ 'vowel_horizontal': [29,32],
68
+ 'vowel_vertical': [32,36],
69
+ 'vowel_roundness': [36,38],
70
+ }
71
+
72
+
73
+ ####
74
+ # Indexes into the Phonetic Vector
75
+ ####
76
+ PVIDX_BT_VOWEL=0
77
+ PVIDX_BT_CONSONANT=1
78
+ PVIDX_BT_NUKTA=2
79
+ PVIDX_BT_HALANT=3
80
+ PVIDX_BT_ANUSVAAR=4
81
+ PVIDX_BT_MISC=5
82
+ PVIDX_BT_S=PVIDX_BT_VOWEL
83
+ PVIDX_BT_E=PVIDX_BT_MISC+1
84
+
85
+ PVIDX_VSTAT_DEP=12
86
+
87
+ ####
88
+ SCRIPT_RANGE_START=0x0D00
89
+ ## TBD
90
+ SCRIPT_RANGE_END=0x0D2E
91
+
92
+
93
def init():
    """
    To be called by library loader, do not call it in your program

    Loads the English phonetic data table and the ARPABET phoneme list from
    the resources directory into the module level variables.
    """

    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET
    global ARPABET_ID_MAP, ID_ARPABET_MAP

    phonetic_data_path=common.get_resources_path()+'/script/english_script_phonetic_data.csv'
    ENGLISH_PHONETIC_DATA=pd.read_csv(phonetic_data_path,encoding='utf-8')

    # the feature vector is everything from the start offset column onwards
    ENGLISH_PHONETIC_VECTORS=ENGLISH_PHONETIC_DATA.iloc[:,PHONETIC_VECTOR_START_OFFSET:].values

    PHONETIC_VECTOR_LENGTH=ENGLISH_PHONETIC_VECTORS.shape[1]

    ### Load mapping from ARPABET representation of phoneme to internal ID:
    ### the id of a phoneme is its (0-based) line number in the list file
    arpabet_list_path=common.get_resources_path()+'/script/english_arpabet_list.csv'
    with open(arpabet_list_path,'r',encoding='utf-8') as infile:
        for ph_id, raw_line in enumerate(infile):
            phoneme=raw_line.strip()
            ARPABET_ID_MAP[phoneme]=ph_id
            ID_ARPABET_MAP[ph_id]=phoneme
114
+
115
+
116
def phoneme_to_offset(ph):
    """Return the internal id of the ARPABET phoneme ph (KeyError if it is not in ARPABET_ID_MAP)."""
    ph_id=ARPABET_ID_MAP[ph]
    return ph_id
118
+
119
def offset_to_phoneme(ph_id):
    """Return the ARPABET phoneme with internal id ph_id (KeyError if it is not in ID_ARPABET_MAP)."""
    phoneme=ID_ARPABET_MAP[ph_id]
    return phoneme
121
+
122
def phoneme_to_enc(ph):
    """Return the single character that encodes phoneme ph: the code point SCRIPT_RANGE_START + id of ph."""
    codepoint=SCRIPT_RANGE_START+phoneme_to_offset(ph)
    return chr(codepoint)
124
+
125
def enc_to_phoneme(ph):
    """Return the ARPABET phoneme encoded by the character ph (inverse of phoneme_to_enc)."""
    ph_id=enc_to_offset(ph)
    return offset_to_phoneme(ph_id)
127
+
128
def enc_to_offset(c):
    """Return the position of character c relative to SCRIPT_RANGE_START."""
    codepoint=ord(c)
    return codepoint-SCRIPT_RANGE_START
130
+
131
def in_range(offset):
    """Return True if offset lies in the half-open interval [SCRIPT_RANGE_START, SCRIPT_RANGE_END)."""
    return SCRIPT_RANGE_START<=offset<SCRIPT_RANGE_END
133
+
134
def get_phonetic_info(lang):
    """Return the pair (ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS); lang is not consulted."""
    return ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS
136
+
137
def invalid_vector():
    """Return a new all-zero numpy array with PHONETIC_VECTOR_LENGTH elements."""
    ##  TODO: check if np datatype is correct?
    zeros=[0]*PHONETIC_VECTOR_LENGTH
    return np.array(zeros)
140
+
141
def get_phonetic_feature_vector(p,lang):
    """
    Return the phonetic feature vector of the encoded phoneme character p.

    p: a single character as produced by phoneme_to_enc()
    lang: passed through to get_phonetic_info()

    Returns the row of the phonetic vector table for p, or invalid_vector()
    when p lies outside the script range, has no row in the table, or is
    marked as having no valid vector representation.
    """

    # in_range() compares against absolute code points, so it has to be given
    # ord(p); passing the relative offset rejected every valid character
    if not in_range(ord(p)):
        return invalid_vector()

    offset=enc_to_offset(p)

    phonetic_data, phonetic_vectors= get_phonetic_info(lang)

    # SCRIPT_RANGE_END is marked TBD in this module, so the range can be wider
    # than the data actually loaded
    if offset>=len(phonetic_vectors):
        return invalid_vector()

    if phonetic_data.iloc[offset]['Valid Vector Representation']==0:
        return invalid_vector()

    return phonetic_vectors[offset]
154
+
indic_nlp_library/indicnlp/script/indic_scripts.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ import pandas as pd
10
+ import numpy as np
11
+ import os
12
+
13
+ from indicnlp import common
14
+ from indicnlp.common import IndicNlpException
15
+ from indicnlp import langinfo as li
16
+
17
+ ###
18
+ # Phonetic Information about script characters
19
+ ###
20
+
21
+ """ Phonetic data about all languages except Tamil """
22
+ ALL_PHONETIC_DATA=None
23
+
24
+ """ Phonetic data for Tamil """
25
+ TAMIL_PHONETIC_DATA=None
26
+
27
+ """ Phonetic vector for all languages except Tamil """
28
+ ALL_PHONETIC_VECTORS=None
29
+
30
+ """ Phonetic vector for Tamil """
31
+ TAMIL_PHONETIC_VECTORS=None
32
+
33
+ """ Length of phonetic vector """
34
+ PHONETIC_VECTOR_LENGTH=38
35
+
36
+ """ Start offset for the phonetic feature vector in the phonetic data vector """
37
+ PHONETIC_VECTOR_START_OFFSET=6
38
+
39
+ ## PHONETIC PROPERTIES in order in which they occur in the vector
40
+ ## This list must be in sync with the keys in the PV_PROP_RANGES dictionary
41
+ PV_PROP=['basic_type',
42
+ 'vowel_length',
43
+ 'vowel_strength',
44
+ 'vowel_status',
45
+ 'consonant_type',
46
+ 'articulation_place',
47
+ 'aspiration',
48
+ 'voicing',
49
+ 'nasalization',
50
+ 'vowel_horizontal',
51
+ 'vowel_vertical',
52
+ 'vowel_roundness',
53
+ ]
54
+
55
+ ###
56
+ # Bit vector ranges for various properties
57
+ ###
58
+
59
+ PV_PROP_RANGES={
60
+ 'basic_type': [0,6],
61
+ 'vowel_length': [6,8],
62
+ 'vowel_strength': [8,11],
63
+ 'vowel_status': [11,13],
64
+ 'consonant_type': [13,18],
65
+ 'articulation_place': [18,23],
66
+ 'aspiration': [23,25],
67
+ 'voicing': [25,27],
68
+ 'nasalization': [27,29],
69
+ 'vowel_horizontal': [29,32],
70
+ 'vowel_vertical': [32,36],
71
+ 'vowel_roundness': [36,38],
72
+ }
73
+
74
+
75
+ ####
76
+ # Indexes into the Phonetic Vector
77
+ ####
78
+ PVIDX_BT_VOWEL=0
79
+ PVIDX_BT_CONSONANT=1
80
+ PVIDX_BT_NUKTA=2
81
+ PVIDX_BT_HALANT=3
82
+ PVIDX_BT_ANUSVAAR=4
83
+ PVIDX_BT_MISC=5
84
+ PVIDX_BT_S=PVIDX_BT_VOWEL
85
+ PVIDX_BT_E=PVIDX_BT_MISC+1
86
+
87
+ PVIDX_VSTAT_DEP=12
88
+
89
+ #####
90
+ # Unicode information about characters
91
+ #####
92
+
93
+ SCRIPT_OFFSET_START=0
94
+ SCRIPT_OFFSET_RANGE=0x80
95
+
96
def init():
    """
    To be called by library loader, do not call it in your program

    Reads the two phonetic CSV tables from the 'script' resource directory and
    fills the module-level data and vector globals from them.
    """

    global ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS, TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    all_path = os.path.join(common.get_resources_path(), 'script', 'all_script_phonetic_data.csv')
    ALL_PHONETIC_DATA = pd.read_csv(all_path, encoding='utf-8')

    tamil_path = os.path.join(common.get_resources_path(), 'script', 'tamil_script_phonetic_data.csv')
    TAMIL_PHONETIC_DATA = pd.read_csv(tamil_path, encoding='utf-8')

    # The feature columns start at PHONETIC_VECTOR_START_OFFSET; everything
    # before that is kept only in the data frames.
    first_feature_col = PHONETIC_VECTOR_START_OFFSET
    ALL_PHONETIC_VECTORS = ALL_PHONETIC_DATA.iloc[:, first_feature_col:].values
    TAMIL_PHONETIC_VECTORS = TAMIL_PHONETIC_DATA.iloc[:, first_feature_col:].values

    PHONETIC_VECTOR_LENGTH = ALL_PHONETIC_VECTORS.shape[1]
110
+
111
def is_supported_language(lang):
    """Return True when `lang` is a key of li.SCRIPT_RANGES.

    Uses direct mapping membership instead of materializing a list of all keys
    on every call; this predicate is invoked by most functions in this module,
    including once per character pair in lcsr_indic.
    """
    try:
        return lang in li.SCRIPT_RANGES
    except TypeError:
        # An unhashable `lang` can never be a key; the list-based check
        # answered False for it, so keep doing so.
        return False
113
+
114
def get_offset(c,lang):
    """Return the code point of `c` relative to the start of `lang`'s script range.

    Raises:
        IndicNlpException: if `lang` is not supported
    """
    if is_supported_language(lang):
        script_start = li.SCRIPT_RANGES[lang][0]
        return ord(c) - script_start
    raise IndicNlpException('Language {} not supported'.format(lang))
118
+
119
def offset_to_char(off,lang):
    """
    Applicable to Brahmi derived Indic scripts

    Return the character located `off` code points after the start of
    `lang`'s script range.

    Raises:
        IndicNlpException: if `lang` is not supported
    """
    if is_supported_language(lang):
        script_start = li.SCRIPT_RANGES[lang][0]
        return chr(off + script_start)
    raise IndicNlpException('Language {} not supported'.format(lang))
126
+
127
def is_indiclang_char(c,lang):
    """
    Applicable to Brahmi derived Indic scripts
    Note that DANDA and DOUBLE_DANDA have the same Unicode codepoint for all Indic scripts

    Return True when `c` falls inside the script block of `lang`
    (offset in [SCRIPT_OFFSET_START, SCRIPT_OFFSET_RANGE)) or is DANDA /
    DOUBLE_DANDA.

    Raises:
        IndicNlpException: if `lang` is not supported
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))

    offset = get_offset(c, lang)
    if SCRIPT_OFFSET_START <= offset < SCRIPT_OFFSET_RANGE:
        return True
    return ord(c) == li.DANDA or ord(c) == li.DOUBLE_DANDA
137
+
138
def in_coordinated_range_offset(c_offset):
    """
    Applicable to Brahmi derived Indic scripts

    Return True when `c_offset` lies between
    li.COORDINATED_RANGE_START_INCLUSIVE and li.COORDINATED_RANGE_END_INCLUSIVE,
    both ends included.
    """
    lower = li.COORDINATED_RANGE_START_INCLUSIVE
    return lower <= c_offset and c_offset <= li.COORDINATED_RANGE_END_INCLUSIVE
143
+
144
def in_coordinated_range(c,lang):
    """Return True when character `c` of `lang` falls in the coordinated offset range.

    Raises:
        IndicNlpException: if `lang` is not supported
    """
    if is_supported_language(lang):
        offset = get_offset(c, lang)
        return in_coordinated_range_offset(offset)
    raise IndicNlpException('Language {} not supported'.format(lang))
148
+
149
def get_phonetic_info(lang):
    """Return the (phonetic data, phonetic vectors) tables to use for `lang`.

    Tamil (li.LC_TA) has its own tables; every other supported language shares
    the common ones.

    Raises:
        IndicNlpException: if `lang` is not supported
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))

    if lang != li.LC_TA:
        return (ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS)
    return (TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS)
156
+
157
def invalid_vector():
    """Return an all-zero array with PHONETIC_VECTOR_LENGTH entries."""
    ## TODO: check if np datatype is correct?
    zeros = [0 for _ in range(PHONETIC_VECTOR_LENGTH)]
    return np.array(zeros)
160
+
161
def get_phonetic_feature_vector(c,lang):
    """Return the phonetic feature vector for character `c` of language `lang`.

    The character is converted to its script-relative offset and looked up via
    get_phonetic_feature_vector_offset, which returns the all-zero
    invalid_vector() for offsets outside the coordinated range or rows marked
    as having no valid vector representation.

    Raises:
        IndicNlpException: if `lang` is not supported
    """
    offset = get_offset(c, lang)
    return get_phonetic_feature_vector_offset(offset, lang)
174
+
175
def get_phonetic_feature_vector_offset(offset,lang):
    """Return the phonetic feature vector stored at script-relative `offset`.

    The all-zero invalid_vector() is returned when `offset` is outside the
    coordinated range, or when its row in the phonetic data has
    'Valid Vector Representation' equal to 0.
    """
    if in_coordinated_range_offset(offset):
        phonetic_data, phonetic_vectors = get_phonetic_info(lang)
        row = phonetic_data.iloc[offset]
        if row['Valid Vector Representation'] != 0:
            return phonetic_vectors[offset]

    return invalid_vector()
186
+
187
+ ### Unary operations on vectors
188
def is_valid(v):
    """Return True when the elements of phonetic vector `v` sum to more than 0."""
    return np.sum(v)>0

def is_vowel(v):
    """Return True when the PVIDX_BT_VOWEL bit of `v` is 1."""
    return v[PVIDX_BT_VOWEL]==1

def is_consonant(v):
    """Return True when the PVIDX_BT_CONSONANT bit of `v` is 1."""
    return v[PVIDX_BT_CONSONANT]==1

def is_halant(v):
    """Return True when the PVIDX_BT_HALANT bit of `v` is 1."""
    return v[PVIDX_BT_HALANT]==1

def is_nukta(v):
    """Return True when the PVIDX_BT_NUKTA bit of `v` is 1."""
    return v[PVIDX_BT_NUKTA]==1

def is_anusvaar(v):
    """Return True when the PVIDX_BT_ANUSVAAR bit of `v` is 1."""
    return v[PVIDX_BT_ANUSVAAR]==1

def is_misc(v):
    """Return True when the PVIDX_BT_MISC bit of `v` is 1."""
    return v[PVIDX_BT_MISC]==1

def is_dependent_vowel(v):
    """Return True when `v` is a vowel and its PVIDX_VSTAT_DEP bit is 1."""
    return is_vowel(v) and v[PVIDX_VSTAT_DEP]==1

def is_plosive(v):
    """Return True when `v` is a consonant whose first 'consonant_type' bit is 1."""
    return is_consonant(v) and get_property_vector(v,'consonant_type')[0]==1
214
+
215
+ ### Binary operations on phonetic vectors
216
+
217
def or_vectors(v1,v2):
    """Return the element-wise OR of two bit vectors as an array of 0/1.

    Pairs are formed with zip, so the result is as long as the shorter input.
    """
    merged = []
    for left, right in zip(v1, v2):
        merged.append(int((left + right) >= 1))
    return np.array(merged)


def xor_vectors(v1,v2):
    """Return the element-wise XOR of two bit vectors as an array of 0/1.

    Pairs are formed with zip, so the result is as long as the shorter input.
    """
    differing = []
    for left, right in zip(v1, v2):
        differing.append(int(left != right))
    return np.array(differing)
222
+
223
+ ### Getting properties from phonetic vectors
224
+
225
def get_property_vector(v,prop_name):
    """Return the slice of phonetic vector `v` that encodes property `prop_name`.

    The slice bounds come from PV_PROP_RANGES[prop_name] as [start, end).
    """
    bounds = PV_PROP_RANGES[prop_name]
    start, end = bounds[0], bounds[1]
    return v[start:end]
227
+
228
def get_property_value(v,prop_name):
    """Return the bits of property `prop_name` in `v` read as a binary number.

    The last bit of the property slice is the least significant one.
    """
    bits = get_property_vector(v, prop_name).tolist()

    total = 0
    weight = 1
    for bit in reversed(bits):
        total += weight * bit
        weight = weight * 2.0

    return int(total)
238
+
239
def lcsr_indic(srcw,tgtw,slang,tlang):
    """
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.
    This works for Indic scripts by mapping both languages to a common script

    Two characters match when both offsets are in the coordinated range and
    equal, or when neither is in the coordinated range and the characters
    themselves are equal.

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language

    Returns a tuple (lcs_length / max(len(srcw), len(tgtw)), len(srcw), len(tgtw)),
    with the lengths as floats.
    """
    src_len, tgt_len = len(srcw), len(tgtw)
    table = np.zeros((src_len + 1, tgt_len + 1))

    for row, src_char in enumerate(srcw, 1):
        for col, tgt_char in enumerate(tgtw, 1):
            src_off = get_offset(src_char, slang)
            tgt_off = get_offset(tgt_char, tlang)
            src_coord = in_coordinated_range_offset(src_off)
            tgt_coord = in_coordinated_range_offset(tgt_off)

            same_coordinated = src_coord and tgt_coord and src_off == tgt_off
            same_literal = not (src_coord or tgt_coord) and src_char == tgt_char

            if same_coordinated or same_literal:
                table[row, col] = table[row - 1, col - 1] + 1.0
            else:
                table[row, col] = max(table[row, col - 1], table[row - 1, col])

    longest = float(max(src_len, tgt_len))
    return (table[-1, -1] / longest, float(src_len), float(tgt_len))
266
+
267
def lcsr_any(srcw,tgtw):
    """
    LCSR computation if both languages have the same script

    Characters are compared directly. Returns a tuple
    (lcs_length / max(len(srcw), len(tgtw)), len(srcw), len(tgtw)),
    with the lengths as floats.
    """
    src_len, tgt_len = len(srcw), len(tgtw)
    table = np.zeros((src_len + 1, tgt_len + 1))

    for row in range(1, src_len + 1):
        for col in range(1, tgt_len + 1):
            if srcw[row - 1] == tgtw[col - 1]:
                table[row, col] = table[row - 1, col - 1] + 1.0
            else:
                table[row, col] = max(table[row, col - 1], table[row - 1, col])

    longest = float(max(src_len, tgt_len))
    return (table[-1, -1] / longest, float(src_len), float(tgt_len))
284
+
285
def lcsr(srcw,tgtw,slang,tlang):
    """
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.

    Characters are compared directly (lcsr_any) when both languages are the
    same or either one is not a supported Indic language; otherwise both
    strings are compared through their script-relative offsets (lcsr_indic).

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language

    Returns the tuple produced by lcsr_any / lcsr_indic:
    (ratio, len(srcw), len(tgtw)).
    """

    if slang==tlang or not is_supported_language(slang) or not is_supported_language(tlang):
        # lcsr_any takes only the two strings
        return lcsr_any(srcw,tgtw)
    else:
        # lcsr_indic needs the language codes to compute script offsets
        return lcsr_indic(srcw,tgtw,slang,tlang)
299
+
300
+
301
+
indic_nlp_library/indicnlp/script/phonetic_sim.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ from indicnlp import loader
10
+ from indicnlp import langinfo
11
+ from indicnlp.script.indic_scripts import *
12
+ import numpy as np
13
+ import gzip
14
+ import pandas as pd
15
+ import sys
16
+
17
def equal(v1,v2):
    """Return 1.0 when the two bit vectors are identical, otherwise 0.0."""
    if np.sum(xor_vectors(v1, v2)) > 0:
        return 0.0
    return 1.0


def dice(v1,v2):
    """Return 2 * (v1 . v2) / (len(v1) + len(v2)).

    NOTE(review): the denominator uses the vector lengths, not the number of
    set bits as in the textbook Dice coefficient - confirm this is intended.
    """
    overlap = float(np.dot(v1, v2.T))
    return (2 * overlap) / float(len(v1) + len(v2))


def jaccard(v1,v2):
    """Return (v1 . v2) / (len(v1) + len(v2) - v1 . v2).

    NOTE(review): as in dice, vector lengths stand in for set sizes - confirm.
    """
    overlap = float(np.dot(v1, v2.T))
    union = float(len(v1) + len(v2) - overlap)
    return overlap / union


def cosine(v1,v2):
    """Return the cosine similarity of `v1` and `v2`.

    0.00001 is added to the denominator so all-zero vectors do not divide by zero.
    """
    numerator = float(np.dot(v1, v2.T))
    sq_norm1 = float(np.dot(v1, v1.T))
    sq_norm2 = float(np.dot(v2, v2.T))
    denominator = np.sqrt(sq_norm1 * sq_norm2) + 0.00001
    return numerator / denominator


def dotprod(v1,v2):
    """Return the dot product of `v1` and `v2` as a float."""
    product = np.dot(v1, v2.T)
    return float(product)


def sim1(v1,v2,base=5.0):
    """Return `base` raised to the dot product of `v1` and `v2`."""
    exponent = dotprod(v1, v2)
    return np.power(base, exponent)


def softmax(v1,v2):
    """Return e raised to the dot product of `v1` and `v2` (sim1 with base e)."""
    return sim1(v1, v2, base=np.e)
42
+
43
def create_similarity_matrix(sim_func,slang,tlang,normalize=True):
    """Build the matrix of `sim_func` scores between the characters of two scripts.

    One row per source offset and one column per target offset, over the
    coordinated range [COORDINATED_RANGE_START_INCLUSIVE,
    COORDINATED_RANGE_END_INCLUSIVE].

    Args:
        sim_func: callable taking two phonetic feature vectors and returning a number
        slang: source language code
        tlang: target language code
        normalize: when true, divide every row by its sum

    Returns:
        numpy.ndarray of shape (dim, dim), dim being the size of the coordinated range
    """

    start = langinfo.COORDINATED_RANGE_START_INCLUSIVE
    end = langinfo.COORDINATED_RANGE_END_INCLUSIVE
    dim = end - start + 1
    sim_mat = np.zeros((dim, dim))

    offsets = range(start, end + 1)

    # The feature vectors depend only on the offset, so look each one up once
    # instead of repeating the target lookup for every source row.
    src_vectors = [get_phonetic_feature_vector(offset_to_char(o, slang), slang) for o in offsets]
    tgt_vectors = [get_phonetic_feature_vector(offset_to_char(o, tlang), tlang) for o in offsets]

    # Index by position within the range rather than by absolute offset, so the
    # matrix stays in bounds even if the range does not start at 0.
    for row, v1 in enumerate(src_vectors):
        for col, v2 in enumerate(tgt_vectors):
            sim_mat[row, col] = sim_func(v1, v2)

    if normalize:
        sums = np.sum(sim_mat, axis=1)
        sim_mat = (sim_mat.transpose() / sums).transpose()

    return sim_mat
59
+
indic_nlp_library/indicnlp/syllable/__init__.py ADDED
File without changes
indic_nlp_library/indicnlp/syllable/syllabifier.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ import codecs, sys
10
+ from indicnlp.script import indic_scripts as si
11
+ import re
12
+
13
# Malayalam chillu character -> the consonant it is rewritten to
# (followed by a halant) during normalization
chillu_char_map = dict([
    ('\u0d7a', '\u0d23'),
    ('\u0d7b', '\u0d28'),
    ('\u0d7c', '\u0d30'),
    ('\u0d7d', '\u0d32'),
    ('\u0d7e', '\u0d33'),
    ('\u0d7f', '\u0d15'),
])

# reverse lookup: consonant -> chillu character
char_chillu_map = {consonant: chillu for chillu, consonant in chillu_char_map.items()}
25
+
26
def normalize_malayalam(word):
    """Rewrite each chillu character in `word` as consonant + halant.

    Returns (normalized_word, mask). The mask is a digit string of the same
    length as the normalized word: '41' marks the two characters that replaced
    a chillu, every other position is '0'.
    """
    mask = re.sub(r'[0-9]', '0', word)

    # instead of chillu characters, use consonant+halant
    for chillu, consonant in chillu_char_map.items():
        word = word.replace(chillu, consonant + '\u0d4d')
        mask = mask.replace(chillu, '41')

    mask = re.sub(r'[^0-9]', '0', mask)
    return word, mask
38
+
39
def denormalize_malayalam(word, word_mask):
    """Undo normalize_malayalam: turn masked consonant + halant pairs back into chillus.

    Args:
        word: normalized text
        word_mask: digit mask aligned with `word`; each '4' marks the first of
            two characters that are to be replaced by one chillu character

    Returns:
        str: the text with chillu characters restored
    """
    chars = list(word)
    mask = list(word_mask)

    ## pattern 4
    # Every replacement overwrites the '4' it handled, so the loop ends once
    # the mask holds no more of them.
    while '4' in mask:
        idx = mask.index('4')
        chars[idx:idx+2] = char_chillu_map[chars[idx]]
        mask[idx:idx+2] = '0'

    return ''.join(chars)
56
+
57
def normalize_punjabi(word):
    """Rewrite tippi and addak in `word` into forms the syllabifier handles.

    Returns (normalized_word, mask). The mask is a digit string of the same
    length as the normalized word: '2' marks an anusvaar that replaced a tippi,
    '311' marks the three characters that replaced addak + consonant, every
    other position is '0'.
    """
    mask = re.sub(r'[0-9]', '0', word)

    ## replace tippi with anusvaar
    tippi = '\u0a70'
    word = word.replace(tippi, '\u0a02')
    mask = mask.replace(tippi, '2')

    ## replace addak+consonant with consonant+halant+consonant
    addak_pat = r'\u0a71(.)'
    word = re.sub(addak_pat, '\\1\u0a4d\\1', word)
    mask = re.sub(addak_pat, '311', mask)

    return word, re.sub(r'[^0-9]', '0', mask)
71
+
72
def denormalize_punjabi(word, word_mask):
    """Undo normalize_punjabi: restore tippi and addak from the masked positions.

    Args:
        word: normalized text
        word_mask: digit mask aligned with `word`; '2' marks an anusvaar to be
            turned back into a tippi, '3' marks the first of three characters
            (consonant, halant, consonant) to be turned back into addak + consonant

    Returns:
        str: the text with tippi and addak restored
    """
    chars = list(word)
    mask = list(word_mask)

    ## pattern 2
    # Each replacement overwrites the digit it handled, so both loops end once
    # the mask holds no more of that digit.
    while '2' in mask:
        idx = mask.index('2')
        chars[idx] = '\u0a70'
        mask[idx] = '0'

    ## pattern 3
    while '3' in mask:
        idx = mask.index('3')
        # three characters collapse into two, and so does the mask
        chars[idx:idx+3] = '\u0a71{}'.format(chars[idx])
        mask[idx:idx+3] = '00'

    return ''.join(chars)
100
+
101
def char_backoff(syllables_list,vocab):
    """Split every syllable that is not in `vocab` into its single characters.

    When `vocab` is None the input list is returned unchanged.
    """
    if vocab is None:
        return syllables_list

    expanded = []
    for syllable in syllables_list:
        if syllable in vocab:
            expanded.append(syllable)
        else:
            # back off to one entry per character
            expanded.extend(syllable)
    return expanded
115
+
116
+
117
def orthographic_syllabify_improved(word,lang,vocab=None):
    """Split `word` into orthographic syllables, with Malayalam/Punjabi normalization.

    For lang 'ml' and 'pa' the word is first rewritten by normalize_malayalam /
    normalize_punjabi, and a digit mask aligned with the text is carried along
    so that the original characters can be restored after the syllable
    boundaries (spaces) have been inserted.

    Args:
        word: text to syllabify
        lang: language code passed to the phonetic feature lookup
        vocab: optional collection of known syllables; syllables not in it are
            split into characters by char_backoff

    Returns:
        list: the syllables, after char_backoff
    """

    # default mask: nothing was rewritten
    word_mask=['0']*len(word)

    if lang=='ml':
        word, word_mask = normalize_malayalam(word)
        # no-op assignment
        word=word
    elif lang=='pa':
        word, word_mask = normalize_punjabi(word)

    p_vectors=[si.get_phonetic_feature_vector(c,lang) for c in word]

    syllables=[]
    syllables_mask=[]

    for i in range(len(word)):
        v=p_vectors[i]

        # emit the character, then decide whether a boundary follows it
        syllables.append(word[i])
        syllables_mask.append(word_mask[i])

        ### the simplified rule set lives in orthographic_simple_syllabify

        #### better syllabification
        # boundary before a following invalid/misc character
        if i+1<len(word) and (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
            syllables.append(' ')
            syllables_mask.append('0')

        # boundary after an invalid/misc character
        elif not si.is_valid(v) or si.is_misc(v) :
            syllables.append(' ')
            syllables_mask.append('0')

        elif si.is_vowel(v):

            # vowel + anusvaar followed by a non-plosive: no boundary here
            anu_nonplos= ( i+2<len(word) and \
                            si.is_anusvaar(p_vectors[i+1]) and \
                            not si.is_plosive(p_vectors[i+2])\
                        )

            # vowel + anusvaar at the end of the word: no boundary here
            anu_eow= ( i+2==len(word) and \
                        si.is_anusvaar(p_vectors[i+1]) )

            if not(anu_nonplos or anu_eow):
                syllables.append(' ')
                syllables_mask.append('0')

        elif i+1<len(word) and \
                (si.is_consonant(v) or si.is_nukta(v)):
            # consonant/nukta followed by another consonant
            if si.is_consonant(p_vectors[i+1]):
                syllables.append(' ')
                syllables_mask.append('0')
            # consonant/nukta followed by an independent vowel
            elif si.is_vowel(p_vectors[i+1]) and \
                    not si.is_dependent_vowel(p_vectors[i+1]):
                syllables.append(' ')
                syllables_mask.append('0')
            # consonant/nukta followed by anusvaar: same exceptions as for vowels
            elif si.is_anusvaar(p_vectors[i+1]):
                anu_nonplos= ( i+2<len(word) and \
                                not si.is_plosive(p_vectors[i+2])\
                            )

                anu_eow= i+2==len(word)

                if not(anu_nonplos or anu_eow):
                    syllables.append(' ')
                    syllables_mask.append('0')

    syllables_mask=''.join(syllables_mask)
    syllables=''.join(syllables)

    #assert len(syllables_mask) == len(syllables)
    #assert syllables_mask.find('01') == -1
    # NOTE(review): a '0' directly followed by '1' presumably means a boundary
    # fell inside a rewritten '41' / '311' group - confirm
    if syllables_mask.find('01') >= 0:
        print('Warning')

    # restore the original characters using the mask
    if lang=='ml':
        syllables = denormalize_malayalam(syllables,syllables_mask)
    elif lang=='pa':
        syllables = denormalize_punjabi(syllables,syllables_mask)

    syllables_list = syllables.strip().split(' ')
    return(char_backoff(syllables_list,vocab))
212
+
213
def orthographic_syllabify(word,lang,vocab=None):
    """Split `word` into orthographic syllables.

    Each character is emitted in turn and a space is inserted after it
    whenever the phonetic feature vectors of the character and its successors
    indicate a syllable boundary.

    Args:
        word: text to syllabify
        lang: language code passed to the phonetic feature lookup
        vocab: optional collection of known syllables; syllables not in it are
            split into characters by char_backoff

    Returns:
        list: the syllables, after char_backoff
    """

    p_vectors=[si.get_phonetic_feature_vector(c,lang) for c in word]

    syllables=[]

    for i in range(len(word)):
        v=p_vectors[i]

        # emit the character, then decide whether a boundary follows it
        syllables.append(word[i])

        ### the simplified rule set lives in orthographic_simple_syllabify

        #### better syllabification
        # boundary before a following invalid/misc character
        if i+1<len(word) and (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
            syllables.append(' ')

        # boundary after an invalid/misc character
        elif not si.is_valid(v) or si.is_misc(v) :
            syllables.append(' ')

        elif si.is_vowel(v):

            # vowel + anusvaar followed by a non-plosive: no boundary here
            anu_nonplos= ( i+2<len(word) and \
                            si.is_anusvaar(p_vectors[i+1]) and \
                            not si.is_plosive(p_vectors[i+2])\
                        )

            # vowel + anusvaar at the end of the word: no boundary here
            anu_eow= ( i+2==len(word) and \
                        si.is_anusvaar(p_vectors[i+1]) )

            if not(anu_nonplos or anu_eow):
                syllables.append(' ')

        elif i+1<len(word) and \
                (si.is_consonant(v) or si.is_nukta(v)):
            # consonant/nukta followed by another consonant
            if si.is_consonant(p_vectors[i+1]):
                syllables.append(' ')
            # consonant/nukta followed by an independent vowel
            elif si.is_vowel(p_vectors[i+1]) and \
                    not si.is_dependent_vowel(p_vectors[i+1]):
                syllables.append(' ')
            # consonant/nukta followed by anusvaar: same exceptions as for vowels
            elif si.is_anusvaar(p_vectors[i+1]):
                anu_nonplos= ( i+2<len(word) and \
                                not si.is_plosive(p_vectors[i+2])\
                            )

                anu_eow= i+2==len(word)

                if not(anu_nonplos or anu_eow):
                    syllables.append(' ')

    syllables_list = ''.join(syllables).strip().split(' ')
    return(char_backoff(syllables_list,vocab))
276
+
277
def orthographic_simple_syllabify(word,lang,vocab=None):
    """Split `word` into orthographic syllables using the simplified rule set.

    A boundary is placed after a character when the next character is
    invalid/misc, when the character itself is invalid, misc or a vowel, or
    when a consonant/nukta is followed by a consonant or anusvaar.

    Returns the syllable list after char_backoff(..., vocab).
    """
    p_vectors = [si.get_phonetic_feature_vector(c, lang) for c in word]
    word_len = len(word)

    pieces = []
    for pos, (char, vec) in enumerate(zip(word, p_vectors)):
        pieces.append(char)
        has_next = pos + 1 < word_len

        ## simplified syllabification
        next_breaks = has_next and \
            (not si.is_valid(p_vectors[pos + 1]) or si.is_misc(p_vectors[pos + 1]))

        boundary = (
            next_breaks
            or not si.is_valid(vec) or si.is_misc(vec) or si.is_vowel(vec)
            or (has_next
                and (si.is_consonant(vec) or si.is_nukta(vec))
                and (si.is_consonant(p_vectors[pos + 1]) or si.is_anusvaar(p_vectors[pos + 1])))
        )
        if boundary:
            pieces.append(' ')

    syllables_list = ''.join(pieces).strip().split(' ')
    return char_backoff(syllables_list, vocab)
indic_nlp_library/indicnlp/test/__init__.py ADDED
File without changes
indic_nlp_library/indicnlp/test/unit/__init__.py ADDED
File without changes
indic_nlp_library/indicnlp/tokenize/__init__.py ADDED
File without changes
indic_nlp_library/indicnlp/tokenize/indic_detokenize.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ #Program for detokenizing Indian language input
10
+ #
11
+ # @author Anoop Kunchukuttan
12
+ #
13
+ """
14
+ De-tokenizer for Indian languages.
15
+ """
16
+
17
+ import string, re, sys
18
+ from indicnlp.common import IndicNlpException
19
+
20
+ ## detokenizer patterns
21
+ left_attach=r'!%)\]},.:;>?\u0964\u0965'
22
+ pat_la=re.compile(r'[ ](['+left_attach+r'])')
23
+
24
+ right_attach=r'#$(\[{<@'
25
+ pat_ra=re.compile(r'(['+right_attach+r'])[ ]')
26
+
27
+ lr_attach=r'-/\\'
28
+ pat_lra=re.compile(r'[ ](['+lr_attach+r'])[ ]')
29
+
30
+ #donknow=u'&*+=^_|~'
31
+
32
+ ## date, numbers, section/article numbering
33
+ ## TODO: handle indic numbers
34
+ pat_num_seq=re.compile(r'([0-9]+ [,.:/] )+[0-9]+')
35
+
36
+ ### e-mail address
37
+ #pat_num=re.compile(ur'[a-zA-Z]+[ ]?
38
+
39
def trivial_detokenize_indic(text):
    """detokenize string for Indian language scripts using Brahmi-derived scripts

    A trivial detokenizer which:

     - decides whether punctuation attaches to left/right or both
     - handles number sequences
     - handles quotes smartly (deciding left or right attachment)

    Args:
        text (str): tokenized text to process

    Returns:
        str: detokenized string
    """

    ### some normalizations

    # numbers and dates: drop the spaces inside every matched sequence
    s = pat_num_seq.sub(lambda m: m.group(0).replace(' ', ''), text)

    # punctuation that attaches on both sides, to the left, to the right
    for pattern in (pat_lra, pat_la, pat_ra):
        s = pattern.sub('\\1', s)

    # assumes well formedness of quotes and alternates between right and left attach
    for punc in '\'"`':
        seen = 0
        pieces = []
        for ch in s:
            if ch != punc:
                pieces.append(ch)
                continue
            # even occurrences open (attach right), odd ones close (attach left)
            pieces.append('@LA' if seen % 2 else '@RA')
            seen += 1

        marked = ''.join(pieces)
        marked = marked.replace('@RA ', punc)
        marked = marked.replace(' @LA', punc)
        marked = marked.replace('@RA', punc)
        s = marked.replace('@LA', punc)

    return s
100
+
101
def trivial_detokenize(text,lang='hi'):
    """detokenize string for languages of the Indian subcontinent

    A trivial detokenizer which:

     - decides whether punctuation attaches to left/right or both
     - handles number sequences
     - handles quotes smartly (deciding left or right attachment)

    Args:
        text (str): tokenized text to process
        lang (str): language code; 'ur' is rejected, every other value uses
            the Brahmi-script detokenizer

    Returns:
        str: detokenized string

    Raises:
        IndicNlpException: If language is not supported
    """
    if lang != 'ur':
        return trivial_detokenize_indic(text)
    raise IndicNlpException('No detokenizer available for Urdu')
123
+
124
+ # if __name__ == '__main__':
125
+
126
+ # if len(sys.argv)<4:
127
+ # print("Usage: python indic_detokenize.py <infile> <outfile> <language>")
128
+ # sys.exit(1)
129
+
130
+ # with open(sys.argv[1],'r', encoding='utf-8') as ifile:
131
+ # with open(sys.argv[2],'w', encoding='utf-8') as ofile:
132
+ # for line in ifile:
133
+ # detokenized_line=trivial_detokenize(line,sys.argv[3])
134
+ # ofile.write(detokenized_line)
indic_nlp_library/indicnlp/tokenize/indic_tokenize.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ #Program for tokenizing Indian language input
10
+ #
11
+ # @author Anoop Kunchukuttan
12
+ #
13
+ """
14
+ Tokenizer for Indian languages. Currently, simple punctuation-based tokenizers
15
+ are supported (see `trivial_tokenize`). Major Indian language punctuations are
16
+ handled.
17
+ """
18
+ import string, re, sys
19
+
20
+ from indicnlp.common import IndicNlpException
21
+
22
+ ### tokenizer patterns
23
+ triv_tokenizer_indic_pat=re.compile(r'(['+string.punctuation+r'\u0964\u0965'+r'])')
24
+ triv_tokenizer_urdu_pat=re.compile(r'(['+string.punctuation+r'\u0609\u060A\u060C\u061E\u066A\u066B\u066C\u066D\u06D4'+r'])')
25
+
26
+ ## date, numbers, section/article numbering
27
+ pat_num_seq=re.compile(r'([0-9]+ [,.:/] )+[0-9]+')
28
+
29
def trivial_tokenize_indic(text):
    """tokenize string for Indian language scripts using Brahmi-derived scripts

    A trivial tokenizer which just tokenizes on the punctuation boundaries.
    This also includes punctuations for the Indian language scripts (the
    purna virama and the deergha virama). This is a language independent
    tokenizer

    Args:
        text (str): text to tokenize

    Returns:
        list: list of tokens

    """
    # surround every punctuation mark with spaces, then squeeze runs of spaces
    spaced = triv_tokenizer_indic_pat.sub(r' \1 ', text.replace('\t', ' '))
    squeezed = re.sub(r'[ ]+', ' ', spaced).strip(' ')

    # do not tokenize numbers and dates: glue each matched sequence back together
    rejoined = pat_num_seq.sub(lambda m: m.group(0).replace(' ', ''), squeezed)

    return rejoined.split(' ')
64
+
65
def trivial_tokenize_urdu(text):
    """tokenize Urdu string

    A trivial tokenizer which just tokenizes on the punctuation boundaries.
    This also includes punctuations for the Urdu script.
    These punctuations characters were identified from the Unicode database
    for Arabic script by looking for punctuation symbols.

    Args:
        text (str): text to tokenize

    Returns:
        list: list of tokens
    """
    without_tabs = text.replace('\t', ' ')
    spaced = triv_tokenizer_urdu_pat.sub(r' \1 ', without_tabs)
    squeezed = re.sub(r'[ ]+', ' ', spaced)
    return squeezed.strip(' ').split(' ')
81
+
82
def trivial_tokenize(text,lang='hi'):
    """trivial tokenizer for Indian languages using Brahmi or Arabic scripts

    A trivial tokenizer which just tokenizes on the punctuation boundaries.
    Major punctuations specific to Indian languages are handled.
    These punctuations characters were identified from the Unicode database.

    Args:
        text (str): text to tokenize
        lang (str): language code; 'ur' selects the Urdu tokenizer, every
            other value the Brahmi-script tokenizer

    Returns:
        list: list of tokens
    """
    tokenizer = trivial_tokenize_urdu if lang == 'ur' else trivial_tokenize_indic
    return tokenizer(text)
100
+
101
+ # if __name__ == '__main__':
102
+
103
+ # if len(sys.argv)<4:
104
+ # print("Usage: python indic_tokenize.py <infile> <outfile> <language>")
105
+ # sys.exit(1)
106
+
107
+ # with open(sys.argv[1],'r', encoding='utf-8') as ifile:
108
+ # with open(sys.argv[2],'w', encoding='utf-8') as ofile:
109
+ # for line in ifile:
110
+ # tokenized_line=' '.join(trivial_tokenize(line,sys.argv[3]))
111
+ # ofile.write(tokenized_line)
indic_nlp_library/indicnlp/tokenize/sentence_tokenize.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright (c) 2013-present, Anoop Kunchukuttan
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ #Program for sentence splitting of Indian language input
10
+ #
11
+ # @author Anoop Kunchukuttan
12
+ #
13
+ """
14
+ Sentence splitter for Indian languages. Contains a rule-based
15
+ sentence splitter that can understand common non-breaking phrases
16
+ in many Indian languages.
17
+ """
18
+
19
+ import re
20
+
21
+ from indicnlp.transliterate import unicode_transliterate
22
+ from indicnlp import langinfo
23
+
24
+
25
# U+0964 (danda) and U+0965 (double danda) are accepted as sentence
# delimiters by both delimiter patterns below.

# Delimiters for text that uses the danda: the period is deliberately left
# out of the character class, so '.' never ends a sentence here.
DELIM_PAT_DANDA = re.compile(r'[\?!\u0964\u0965]')

# Delimiters for text that does not rely on the danda: the period is a
# sentence delimiter as well.
DELIM_PAT_NO_DANDA = re.compile(r'[\.\?!\u0964\u0965]')

# Matches when a text contains at least one danda or double danda.
CONTAINS_DANDA = re.compile(r'[\u0964\u0965]')
34
+
35
# Non-breaking tokens, all written in Devanagari. Built once at import time
# rather than on every call to is_acronym_abbvr(), which is invoked for
# candidate sentences while splitting.
_NON_BREAKING_TOKENS_HI = frozenset({
    ## acronym for latin characters
    'ए', 'ऎ',
    'बी', 'बि',
    'सी', 'सि',
    'डी', 'डि',
    'ई', 'इ',
    'एफ', 'ऎफ',
    'जी', 'जि',
    'एच','ऎच',
    'आई', 'आइ','ऐ',
    'जे', 'जॆ',
    'के', 'कॆ',
    'एल', 'ऎल',
    'एम','ऎम',
    'एन','ऎन',
    'ओ', 'ऒ',
    'पी', 'पि',
    'क्यू', 'क्यु',
    'आर',
    'एस','ऎस',
    'टी', 'टि',
    'यू', 'यु',
    'वी', 'वि', 'व्ही', 'व्हि',
    'डब्ल्यू', 'डब्ल्यु',
    'एक्स','ऎक्स',
    'वाय',
    'जेड', 'ज़ेड',
    ## add halant to the previous English character mappings.
    'एफ्',
    'ऎफ्',
    'एच्',
    'ऎच्',
    'एल्',
    'ऎल्',
    'एम्',
    'ऎम्',
    'एन्',
    'ऎन्',
    'आर्',
    'एस्',
    'ऎस्',
    'एक्स्',
    'ऎक्स्',
    'वाय्',
    'जेड्', 'ज़ेड्',

    # Indic vowels
    'ऄ',
    'अ',
    'आ',
    'इ',
    'ई',
    'उ',
    'ऊ',
    'ऋ',
    'ऌ',
    'ऍ',
    'ऎ',
    'ए',
    'ऐ',
    'ऑ',
    'ऒ',
    'ओ',
    'औ',
    'ॠ',
    'ॡ',

    # Indic consonants
    'क',
    'ख',
    'ग',
    'घ',
    'ङ',
    'च',
    'छ',
    'ज',
    'झ',
    'ञ',
    'ट',
    'ठ',
    'ड',
    'ढ',
    'ण',
    'त',
    'थ',
    'द',
    'ध',
    'न',
    'ऩ',
    'प',
    'फ',
    'ब',
    'भ',
    'म',
    'य',
    'र',
    'ऱ',
    'ल',
    'ळ',
    'ऴ',
    'व',
    'श',
    'ष',
    'स',
    'ह',

    ## abbreviation
    'श्री',
    'डॉ',
    'कु',
    'चि',
    'सौ',
})


def is_acronym_abbvr(text,lang):
    """Is the text a non-breaking phrase

    The text is transliterated with target ``'hi'`` and the result is looked
    up in a fixed set of Devanagari tokens: spelled-out Latin letter names
    (with and without a trailing halant), single Indic vowels and
    consonants, and a few common abbreviations/honorifics.

    Args:
        text (str): text to check for non-breaking phrase (without the
            trailing period)
        lang (str): language code of ``text``; forwarded to the
            transliterator as its second argument

    Returns:
        boolean: true if `text` is a non-breaking phrase
    """
    text_hi = unicode_transliterate.UnicodeIndicTransliterator.transliterate(text, lang, 'hi')
    return text_hi in _NON_BREAKING_TOKENS_HI
162
+
163
def sentence_split(text,lang,delim_pat='auto'):
    """Split the text into sentences.

    A rule-based sentence splitter for Indian languages written in
    Brahmi-derived scripts. The text is split at sentence delimiter
    boundaries. The delimiters can be configured by passing appropriate
    parameters.

    The sentence splitter can identify non-breaking phrases like
    single letters and common abbreviations/honorifics for some Indian
    languages.

    Args:
        text (str): text to split into sentences
        lang (str): language code; passed to ``langinfo.is_danda_delim``
            and ``is_acronym_abbvr``
        delim_pat (str or compiled pattern): regular expression matching a
            single sentence delimiter character, given either as a pattern
            string or as an already compiled pattern. If set to 'auto', the
            delimiter pattern is chosen automatically based on the language
            and text.

    Returns:
        list: list of sentences identified from the input text
    """
    if isinstance(delim_pat, str):
        if delim_pat == 'auto':
            # In modern texts a period may be used as the delimiter instead
            # of the danda, so the danda-only pattern is used only when the
            # language is danda-delimited AND the text contains a danda.
            if langinfo.is_danda_delim(lang) and CONTAINS_DANDA.search(text) is not None:
                delim_pat = DELIM_PAT_DANDA
            else:
                delim_pat = DELIM_PAT_NO_DANDA
        else:
            # The pattern is documented as a string; compile it so that the
            # pattern methods used below are available.
            delim_pat = re.compile(delim_pat)
    ## otherwise, assume the caller passed a compiled delimiter pattern

    ### Phase 1: break on sentence delimiters.
    text = text.strip()
    cand_sentences = []
    begin = 0
    for mo in delim_pat.finditer(text):
        p1 = mo.start()

        # A delimiter that directly follows a numeric character does not
        # end the sentence (keeps e.g. '3.14' in one piece).
        if p1 > 0 and text[p1-1].isnumeric():
            continue

        end = p1 + 1  # keep the delimiter character with its sentence
        s = text[begin:end].strip()
        if s:
            cand_sentences.append(s)
        begin = end

    s = text[begin:].strip()
    if s:
        cand_sentences.append(s)

    if not delim_pat.search('.'):
        ## run phase 2 only if the delimiter pattern matches a period
        return cand_sentences

    ### Phase 2: Address the fact that '.' may not always be a sentence delimiter
    ### Method: If there is a run of candidates consisting of a single word
    ### followed by '.', merge that run with the one sentence preceding and
    ### the one sentence succeeding it. A candidate whose last word is a
    ### known acronym/abbreviation is merged with what follows it.
    final_sentences = []
    sen_buffer = ''
    bad_state = False  # True while the buffer ends in a non-breaking period

    for sentence in cand_sentences:
        words = sentence.split(' ')
        ends_with_period = sentence[-1] == '.'

        if len(words) == 1 and ends_with_period:
            # Join with a space only when there is something to join to, so
            # the result never starts with a stray space.
            sen_buffer = sen_buffer + ' ' + sentence if sen_buffer else sentence
            bad_state = True
        elif ends_with_period and is_acronym_abbvr(words[-1][:-1], lang):
            if bad_state:
                # The buffer is still an unfinished sentence: extend it
                # instead of overwriting it, so no text is dropped.
                sen_buffer = sen_buffer + ' ' + sentence
            else:
                if sen_buffer:
                    final_sentences.append(sen_buffer)
                sen_buffer = sentence
            bad_state = True
        elif bad_state:
            # This candidate completes the pending run.
            final_sentences.append(sen_buffer + ' ' + sentence)
            sen_buffer = ''
            bad_state = False
        else:  ## good state
            if sen_buffer:
                final_sentences.append(sen_buffer)
            sen_buffer = sentence

    if sen_buffer:
        final_sentences.append(sen_buffer)

    return final_sentences