entai2965 committed on
Commit
5901938
1 Parent(s): 3483171

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +85 -0
README.md CHANGED
@@ -319,6 +319,91 @@ target = results[0].hypotheses[0][1:]
319
  print(tokenizer.decode(tokenizer.convert_tokens_to_ids(target)))
320
  ```
321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  ## Available languages
323
  - https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200
324
 
 
319
  print(tokenizer.decode(tokenizer.convert_tokens_to_ids(target)))
320
  ```
321
 
322
+ ## How to run this model (batch syntax)
323
+ ```
324
+ import os
325
+ import ctranslate2
326
+ import transformers
327
+
328
+ #set defaults
329
+ home_path=os.path.expanduser('~')
330
+ model_folder=home_path+'/Downloads/models/nllb-200-distilled-600M-ctranslate2' #3 GB of memory
331
+ #model_folder=home_path+'/Downloads/models/nllb-200-distilled-1.3B-ctranslate2' #5.5 GB of memory
332
+ #model_folder=home_path+'/Downloads/models/nllb-200-3.3B-ctranslate2-float16' #13 GB of memory in almost all cases, 7.6 GB on CUDA + GeForce RTX 2000 series and newer
333
+ #model_folder=home_path+'/Downloads/models/nllb-200-3.3B-ctranslate2' #13 GB of memory
334
+
335
+ string1='Hello world!'
336
+ string2='Awesome.'
337
+ raw_list=[string1, string2]
338
+
339
+ #https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200
340
+ source_language_code = "eng_Latn"
341
+ target_language_code = "fra_Latn"
342
+
343
+ device='cpu'
344
+ #device='cuda'
345
+
346
+ #load models
347
+ translator = ctranslate2.Translator(model_folder,device=device)
348
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_folder, src_lang=source_language_code, clean_up_tokenization_spaces=True)
349
+
350
+ #tokenize input
351
+ encoded_list=[]
352
+ for text in raw_list:
353
+     encoded_list.append(tokenizer.convert_ids_to_tokens(tokenizer.encode(text)))
354
+
355
+ #translate
356
+ #https://opennmt.net/CTranslate2/python/ctranslate2.Translator.html?#ctranslate2.Translator.translate_batch
357
+ translated_list = translator.translate_batch(encoded_list, target_prefix=[[target_language_code]]*len(raw_list))
358
+ assert(len(raw_list)==len(translated_list))
359
+
360
+ #decode
361
+ for counter,tokens in enumerate(translated_list):
362
+     translated_list[counter]=tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens.hypotheses[0][1:]))
363
+
364
+ #output
365
+ for text in translated_list:
366
+     print(text)
367
+ ```
368
+
369
+ [Functional programming](https://docs.python.org/3/howto/functional.html) version
370
+
371
+ ```
372
+ import os
373
+ import ctranslate2
374
+ import transformers
375
+
376
+ #set defaults
377
+ home_path=os.path.expanduser('~')
378
+ #model_folder=home_path+'/Downloads/models/nllb-200-distilled-600M-ctranslate2' #3 GB of memory
379
+ model_folder=home_path+'/Downloads/models/nllb-200-distilled-1.3B-ctranslate2' #5.5 GB of memory
380
+ #model_folder=home_path+'/Downloads/models/nllb-200-3.3B-ctranslate2-float16' #13 GB of memory in almost all cases, 7.6 GB on CUDA + GeForce RTX 2000 series and newer
381
+ #model_folder=home_path+'/Downloads/models/nllb-200-3.3B-ctranslate2' #13 GB of memory
382
+
383
+ string1='Hello world!'
384
+ string2='Awesome.'
385
+ raw_list=[string1, string2]
386
+
387
+ #https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200
388
+ source_language_code = "eng_Latn"
389
+ target_language_code = "fra_Latn"
390
+
391
+ device='cpu'
392
+ #device='cuda'
393
+
394
+ #load models
395
+ translator = ctranslate2.Translator(model_folder,device=device)
396
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_folder, src_lang=source_language_code, clean_up_tokenization_spaces=True)
397
+
398
+ #invoke black magic
399
+ translated_list=[tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens.hypotheses[0][1:])) for tokens in translator.translate_batch([tokenizer.convert_ids_to_tokens(tokenizer.encode(text)) for text in raw_list], target_prefix=[[target_language_code]]*len(raw_list))]
400
+ assert(len(raw_list)==len(translated_list))
401
+
402
+ #output
403
+ for text in translated_list:
404
+     print(text)
405
+ ```
406
+
407
  ## Available languages
408
  - https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200
409