Update README.md
Browse files
README.md
CHANGED
@@ -319,6 +319,91 @@ target = results[0].hypotheses[0][1:]
|
|
319 |
print(tokenizer.decode(tokenizer.convert_tokens_to_ids(target)))
|
320 |
```
|
321 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
322 |
## Available languages
|
323 |
- https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200
|
324 |
|
|
|
319 |
print(tokenizer.decode(tokenizer.convert_tokens_to_ids(target)))
|
320 |
```
|
321 |
|
322 |
+
## How to run this model (batch syntax)
|
323 |
+
```
|
324 |
+
import os
|
325 |
+
import ctranslate2
|
326 |
+
import transformers
|
327 |
+
|
328 |
+
#set defaults
|
329 |
+
home_path=os.path.expanduser('~')
|
330 |
+
#model_folder=home_path+'/Downloads/models/nllb-200-distilled-600M-ctranslate2' #3 GB of memory
|
331 |
+
#model_folder=home_path+'/Downloads/models/nllb-200-distilled-1.3B-ctranslate2' #5.5 GB of memory
|
332 |
+
model_folder=home_path+'/Downloads/models/nllb-200-3.3B-ctranslate2-float16' #13 GB of memory in almost all cases, 7.6 GB on CUDA + GeForce RTX 2000 series and newer
|
333 |
+
#model_folder=home_path+'/Downloads/models/nllb-200-3.3B-ctranslate2' #13 GB of memory
|
334 |
+
|
335 |
+
string1='Hello world!'
|
336 |
+
string2='Awesome.'
|
337 |
+
raw_list=[string1, string2]
|
338 |
+
|
339 |
+
#https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200
|
340 |
+
source_language_code = "eng_Latn"
|
341 |
+
target_language_code = "fra_Latn"
|
342 |
+
|
343 |
+
device='cpu'
|
344 |
+
#device='cuda'
|
345 |
+
|
346 |
+
#load models
|
347 |
+
translator = ctranslate2.Translator(model_folder,device=device)
|
348 |
+
tokenizer = transformers.AutoTokenizer.from_pretrained(model_folder, src_lang=source_language_code, clean_up_tokenization_spaces=True)
|
349 |
+
|
350 |
+
#tokenize input
|
351 |
+
encoded_list=[]
|
352 |
+
for text in raw_list:
|
353 |
+
    encoded_list.append(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))
|
354 |
+
|
355 |
+
#translate
|
356 |
+
#https://opennmt.net/CTranslate2/python/ctranslate2.Translator.html?#ctranslate2.Translator.translate_batch
|
357 |
+
translated_list = translator.translate_batch(encoded_list, target_prefix=[[target_language_code]]*len(raw_list))
|
358 |
+
assert(len(raw_list)==len(translated_list))
|
359 |
+
|
360 |
+
#decode
|
361 |
+
for counter,tokens in enumerate(translated_list):
|
362 |
+
    translated_list[counter]=tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens.hypotheses[0][1:]))
|
363 |
+
|
364 |
+
#output
|
365 |
+
for text in translated_list:
|
366 |
+
    print(text)
|
367 |
+
```
|
368 |
+
|
369 |
+
The same example, written in a [functional programming](https://docs.python.org/3/howto/functional.html) style:
|
370 |
+
|
371 |
+
```
|
372 |
+
import os
|
373 |
+
import ctranslate2
|
374 |
+
import transformers
|
375 |
+
|
376 |
+
#set defaults
|
377 |
+
home_path=os.path.expanduser('~')
|
378 |
+
#model_folder=home_path+'/Downloads/models/nllb-200-distilled-600M-ctranslate2' #3 GB of memory
|
379 |
+
#model_folder=home_path+'/Downloads/models/nllb-200-distilled-1.3B-ctranslate2' #5.5 GB of memory
|
380 |
+
model_folder=home_path+'/Downloads/models/nllb-200-3.3B-ctranslate2-float16' #13 GB of memory in almost all cases, 7.6 GB on CUDA + GeForce RTX 2000 series and newer
|
381 |
+
#model_folder=home_path+'/Downloads/models/nllb-200-3.3B-ctranslate2' #13 GB of memory
|
382 |
+
|
383 |
+
string1='Hello world!'
|
384 |
+
string2='Awesome.'
|
385 |
+
raw_list=[string1, string2]
|
386 |
+
|
387 |
+
#https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200
|
388 |
+
source_language_code = "eng_Latn"
|
389 |
+
target_language_code = "fra_Latn"
|
390 |
+
|
391 |
+
device='cpu'
|
392 |
+
#device='cuda'
|
393 |
+
|
394 |
+
#load models
|
395 |
+
translator = ctranslate2.Translator(model_folder,device=device)
|
396 |
+
tokenizer = transformers.AutoTokenizer.from_pretrained(model_folder, src_lang=source_language_code, clean_up_tokenization_spaces=True)
|
397 |
+
|
398 |
+
#tokenize, translate, and decode in a single expression
|
399 |
+
translated_list=[tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens.hypotheses[0][1:])) for tokens in translator.translate_batch([tokenizer.convert_ids_to_tokens(tokenizer.encode(text)) for text in raw_list], target_prefix=[[target_language_code]]*len(raw_list))]
|
400 |
+
assert(len(raw_list)==len(translated_list))
|
401 |
+
|
402 |
+
#output
|
403 |
+
for text in translated_list:
|
404 |
+
    print(text)
|
405 |
+
```
|
406 |
+
|
407 |
## Available languages
|
408 |
- https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200
|
409 |
|