krsnaman committed on
Commit
99815a4
1 Parent(s): 5867b86

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +2 -2
README.md CHANGED
@@ -60,9 +60,9 @@ pad_id = tokenizer._convert_token_to_id_with_added_voc("<pad>")
60
  # To get lang_id use any of ['<2as>', '<2bn>', '<2gu>', '<2hi>', '<2kn>', '<2ml>', '<2mr>', '<2or>', '<2pa>', '<2ta>', '<2te>']
61
 
62
  # First tokenize the input and outputs. The format below is how IndicBARTSS was trained so the input should be "Sentence </s> <2xx>" where xx is the language code. Similarly, the output should be "<2yy> Sentence </s>".
63
- inp = tokenizer("7 फरवरी, 2016 [SEP] खेल 7 फरवरी, 2016 को कैलिफोर्निया के सांता क्लारा में सैन फ्रांसिस्को खाड़ी क्षेत्र में लेवी स्टेडियम में खेला गया था। </s> <2hi>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids # tensor([[ 466, 1981, 80, 25573, 64001, 64004]])
64
 
65
- out = tokenizer("<2hi> सुपर बाउल किस दिन खेला गया? </s>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids # tensor([[64006, 942, 43, 32720, 8384, 64001]])
66
 
67
  model_outputs=model(input_ids=inp, decoder_input_ids=out[:,0:-1], labels=out[:,1:])
68
 
 
60
  # To get lang_id use any of ['<2as>', '<2bn>', '<2gu>', '<2hi>', '<2kn>', '<2ml>', '<2mr>', '<2or>', '<2pa>', '<2ta>', '<2te>']
61
 
62
  # First tokenize the input and outputs. The format below is how IndicBARTSS was trained so the input should be "Sentence </s> <2xx>" where xx is the language code. Similarly, the output should be "<2yy> Sentence </s>".
63
+ inp = tokenizer("7 फरवरी, 2016 [SEP] खेल 7 फरवरी, 2016 को कैलिफोर्निया के सांता क्लारा में सैन फ्रांसिस्को खाड़ी क्षेत्र में लेवी स्टेडियम में खेला गया था। </s> <2hi>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids
64
 
65
+ out = tokenizer("<2hi> सुपर बाउल किस दिन खेला गया? </s>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids
66
 
67
  model_outputs=model(input_ids=inp, decoder_input_ids=out[:,0:-1], labels=out[:,1:])
68