Vedant Vyas
committed on
Commit
•
c1cfbda
1
Parent(s):
446d0be
generating data
Browse files- data/create_data.py +32 -0
- data/data.json +100 -0
- readme.md +10 -0
- requirements.txt +2 -1
- trainModel.py +16 -0
data/create_data.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
|
3 |
+
|
4 |
+
def main(output_path='data.json', num_samples=100):
    """Generate random letter strings paired with their reversals.

    Each output line is one JSON object of the form
    ``{"translation":{"let":"<word>", "rev" :"<word reversed>"}}``.
    Words are 1 to 50 characters long and use only the letters 'a'-'q'.

    Args:
        output_path: File to (over)write with the generated records.
        num_samples: Number of records (lines) to generate.
    """
    print('Creating data...')

    letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
               'm', 'n', 'o', 'p', 'q']

    # The context manager closes the file even if a write raises part-way.
    with open(output_path, 'w', encoding='utf-8') as out:
        for _ in range(num_samples):
            length = random.randint(1, 50)
            # Draw `length` distinct integers and fold them onto the letter
            # list; letters may repeat because several integers share a
            # remainder.
            picks = random.sample(range(1, 100), length)
            word = ''.join(letters[i % len(letters)] for i in picks)
            # Hand-formatted on purpose: keeps the exact byte layout of the
            # existing records. Safe because words contain only plain
            # lowercase letters that need no JSON escaping.
            out.write(
                f'{{"translation":{{"let":"{word}", "rev" :"{word[::-1]}"}}}}\n'
            )


if __name__ == '__main__':
    main()
|
data/data.json
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"translation":{"let":"kmeihdamhqbppnknbgjmaibqhqiiemodjcecfqkofb", "rev" :"bfokqfcecjdomeiiqhqbiamjgbnknppbqhmadhiemk"}}
|
2 |
+
{"translation":{"let":"feqcklngqpkljodgijbqkaanpeqnalcohjiaaoepbeblcigdl", "rev" :"ldgiclbebpeoaaijhoclanqepnaakqbjigdojlkpqgnlkcqef"}}
|
3 |
+
{"translation":{"let":"hmbfeeiadfcmqnknipqfbgomgjkadakq", "rev" :"qkadakjgmogbfqpinknqmcfdaieefbmh"}}
|
4 |
+
{"translation":{"let":"mo", "rev" :"om"}}
|
5 |
+
{"translation":{"let":"fjablbnhokiedcdhp", "rev" :"phdcdeikohnblbajf"}}
|
6 |
+
{"translation":{"let":"jhnmmhani", "rev" :"inahmmnhj"}}
|
7 |
+
{"translation":{"let":"nmoamhfnbhdoiphd", "rev" :"dhpiodhbnfhmaomn"}}
|
8 |
+
{"translation":{"let":"nfqhpofdegmkbpcolikpoqkopm", "rev" :"mpokqopkilocpbkmgedfophqfn"}}
|
9 |
+
{"translation":{"let":"fkbqiehnnfqeqoom", "rev" :"mooqeqfnnheiqbkf"}}
|
10 |
+
{"translation":{"let":"ilfaicflhofjjmqoeegk", "rev" :"kgeeoqmjjfohlfciafli"}}
|
11 |
+
{"translation":{"let":"fhbdnaeicfopbddgqcjl", "rev" :"ljcqgddbpofcieandbhf"}}
|
12 |
+
{"translation":{"let":"ablpjlgffoionbikekqidfnqljmoddkhea", "rev" :"aehkddomjlqnfdiqkekibnoioffgljplba"}}
|
13 |
+
{"translation":{"let":"kon", "rev" :"nok"}}
|
14 |
+
{"translation":{"let":"afdbqdkqhoahejpelgnkloccchpjlggceiiofmmnbcldcgqg", "rev" :"gqgcdlcbnmmfoiiecggljphcccolknglepjehaohqkdqbdfa"}}
|
15 |
+
{"translation":{"let":"fgiigcgfqpokmdmophkdljh", "rev" :"hjldkhpomdmkopqfgcgiigf"}}
|
16 |
+
{"translation":{"let":"fokedojicgiph", "rev" :"hpigcijodekof"}}
|
17 |
+
{"translation":{"let":"icjpenbbcpifcgjhddcaolkan", "rev" :"nakloacddhjgcfipcbbnepjci"}}
|
18 |
+
{"translation":{"let":"hlhkopqcnhabkibjqkocjm", "rev" :"mjcokqjbikbahncqpokhlh"}}
|
19 |
+
{"translation":{"let":"clghiobfq", "rev" :"qfboihglc"}}
|
20 |
+
{"translation":{"let":"dgncnppnqnmfoqdlifimcpajjmpjojeepgkgibhkanilfikeo", "rev" :"oekiflinakhbigkgpeejojpmjjapcmifildqofmnqnppncngd"}}
|
21 |
+
{"translation":{"let":"iodhnekfnjbmeqpaca", "rev" :"acapqembjnfkenhdoi"}}
|
22 |
+
{"translation":{"let":"khlolfpibfk", "rev" :"kfbipflolhk"}}
|
23 |
+
{"translation":{"let":"hjlekcmdndinkp", "rev" :"pknidndmckeljh"}}
|
24 |
+
{"translation":{"let":"cgemjdplnadoeq", "rev" :"qeodanlpdjmegc"}}
|
25 |
+
{"translation":{"let":"gkiaegqnopelkqlhfopcdighac", "rev" :"cahgidcpofhlqkleponqgeaikg"}}
|
26 |
+
{"translation":{"let":"hjcmfgmijhgbeigonjjqi", "rev" :"iqjjnogiebghjimgfmcjh"}}
|
27 |
+
{"translation":{"let":"niponelgnqe", "rev" :"eqnglenopin"}}
|
28 |
+
{"translation":{"let":"cfqnenkonajmdknilbcfglgnplbod", "rev" :"doblpnglgfcblinkdmjanoknenqfc"}}
|
29 |
+
{"translation":{"let":"cqkfqaofhdcjniafpmbei", "rev" :"iebmpfainjcdhfoaqfkqc"}}
|
30 |
+
{"translation":{"let":"knlnkmjkenlqp", "rev" :"pqlnekjmknlnk"}}
|
31 |
+
{"translation":{"let":"cjhfkdoonfqdapepihjgpdgfpf", "rev" :"fpfgdpgjhipepadqfnoodkfhjc"}}
|
32 |
+
{"translation":{"let":"ipqfhlbeebgqcdqbgbbaicjinkkdogochlnfpneclcnibddka", "rev" :"akddbinclcenpfnlhcogodkknijciabbgbqdcqgbeeblhfqpi"}}
|
33 |
+
{"translation":{"let":"eclpohqf", "rev" :"fqhoplce"}}
|
34 |
+
{"translation":{"let":"ocnikilfdbqhocmnhjgalifejelhqafgjaipkohmhdfjb", "rev" :"bjfdhmhokpiajgfaqhlejefilagjhnmcohqbdflikinco"}}
|
35 |
+
{"translation":{"let":"eoeeibhkmhogkillhlbgbnhjfocpqhqqoicgajjlfoddogbmd", "rev" :"dmbgoddofljjagcioqqhqpcofjhnbgblhllikgohmkhbieeoe"}}
|
36 |
+
{"translation":{"let":"cdpojifjbndfobmnejiljjmojonhqfdidiqekehflkqpc", "rev" :"cpqklfhekeqididfqhnojomjjlijenmbofdnbjfijopdc"}}
|
37 |
+
{"translation":{"let":"ckcehejhfjp", "rev" :"pjfhjeheckc"}}
|
38 |
+
{"translation":{"let":"lgplgdffqicglolnienmqajqkkcknmlejaqkd", "rev" :"dkqajelmnkckkqjaqmneinlolgciqffdglpgl"}}
|
39 |
+
{"translation":{"let":"niolfclqnahgimdngflfaidcdbpibljhddh", "rev" :"hddhjlbipbdcdiaflfgndmighanqlcfloin"}}
|
40 |
+
{"translation":{"let":"pgcfpnoknmmkhbdlbifhjebhlageigofidlcgjnmaaqlcq", "rev" :"qclqaamnjgcldifogiegalhbejhfibldbhkmmnkonpfcgp"}}
|
41 |
+
{"translation":{"let":"hoknmkdaegfiqgcgdqknfbffgcgepablj", "rev" :"jlbapegcgffbfnkqdgcgqifgeadkmnkoh"}}
|
42 |
+
{"translation":{"let":"dbgbdmjqfmgeqbidfalooplcdjjdgaaqlnjlecnloifkm", "rev" :"mkfiolnceljnlqaagdjjdclpoolafdibqegmfqjmdbgbd"}}
|
43 |
+
{"translation":{"let":"dildqcnbihlkfjhcgqpajqbfblifqgncemidfnejqhnpmmmhgn", "rev" :"nghmmmpnhqjenfdimecngqfilbfbqjapqgchjfklhibncqdlid"}}
|
44 |
+
{"translation":{"let":"qnekalgommcllnffapjhibgc", "rev" :"cgbihjpaffnllcmmoglakenq"}}
|
45 |
+
{"translation":{"let":"ffcfklnpjifmlqjmbpgq", "rev" :"qgpbmjqlmfijpnlkfcff"}}
|
46 |
+
{"translation":{"let":"fmbdaeqepgbpnnpfhnkg", "rev" :"gknhfpnnpbgpeqeadbmf"}}
|
47 |
+
{"translation":{"let":"keondhhneimpehpfphkdfkpdoqgejbbqe", "rev" :"eqbbjegqodpkfdkhpfphepmienhhdnoek"}}
|
48 |
+
{"translation":{"let":"qijcppbglonpjhfoebhnlfcdilnbnacjjalfdgmakgeofmq", "rev" :"qmfoegkamgdflajjcanbnlidcflnhbeofhjpnolgbppcjiq"}}
|
49 |
+
{"translation":{"let":"mhffcenmodhppj", "rev" :"jpphdomnecffhm"}}
|
50 |
+
{"translation":{"let":"djfkkhjibaaghqllligeoigb", "rev" :"bgioegilllqhgaabijhkkfjd"}}
|
51 |
+
{"translation":{"let":"mmo", "rev" :"omm"}}
|
52 |
+
{"translation":{"let":"omiqhheqcplkjcbqaqpqbaddnennjcghkmmidkfh", "rev" :"hfkdimmkhgcjnnenddabqpqaqbcjklpcqehhqimo"}}
|
53 |
+
{"translation":{"let":"hcadoqjnmkilf", "rev" :"flikmnjqodach"}}
|
54 |
+
{"translation":{"let":"jgmpeebjakplbbjamhminpfncplghmogccdkih", "rev" :"hikdccgomhglpcnfpnimhmajbblpkajbeepmgj"}}
|
55 |
+
{"translation":{"let":"gelbbogqmcgmiklcilfcopmlionehhhdodkoakmjjdgkiqd", "rev" :"dqikgdjjmkaokdodhhhenoilmpocfliclkimgcmqgobbleg"}}
|
56 |
+
{"translation":{"let":"dkqndjqcappkijgomohhcbacnklpfbndfgebidaegiglhlebf", "rev" :"fbelhlgigeadibegfdnbfplkncabchhomogjikppacqjdnqkd"}}
|
57 |
+
{"translation":{"let":"hnlepbfcdqmfqcpiqobbeakmog", "rev" :"gomkaebboqipcqfmqdcfbpelnh"}}
|
58 |
+
{"translation":{"let":"ednabgfmdjmigfj", "rev" :"jfgimjdmfgbande"}}
|
59 |
+
{"translation":{"let":"qmfi", "rev" :"ifmq"}}
|
60 |
+
{"translation":{"let":"jdkqgahqcgbfidnmlghodmmhafcbnpmklcpbai", "rev" :"iabpclkmpnbcfahmmdohglmndifbgcqhagqkdj"}}
|
61 |
+
{"translation":{"let":"bfolnmibffeeheigljffhkdbn", "rev" :"nbdkhffjlgieheeffbimnlofb"}}
|
62 |
+
{"translation":{"let":"ncliiah", "rev" :"haiilcn"}}
|
63 |
+
{"translation":{"let":"bbdnkjcepnkifibpnhecgjnqjaq", "rev" :"qajqnjgcehnpbifiknpecjkndbb"}}
|
64 |
+
{"translation":{"let":"jhnaabidkllfib", "rev" :"bifllkdibaanhj"}}
|
65 |
+
{"translation":{"let":"mck", "rev" :"kcm"}}
|
66 |
+
{"translation":{"let":"bhfhnljipjhdfokgaiofgidjeciallabq", "rev" :"qballaicejdigfoiagkofdhjpijlnhfhb"}}
|
67 |
+
{"translation":{"let":"fjhkloiepbehdbeipefohlabfcfcqlceeomdnl", "rev" :"lndmoeeclqcfcfbalhofepiebdhebpeiolkhjf"}}
|
68 |
+
{"translation":{"let":"icfehgglfbfbkkdnadkmp", "rev" :"pmkdandkkbfbflgghefci"}}
|
69 |
+
{"translation":{"let":"ipjebbacgkpjecnqinjm", "rev" :"mjniqncejpkgcabbejpi"}}
|
70 |
+
{"translation":{"let":"akapeiojceaaqnbomplnkobbjifmhadqcdgp", "rev" :"pgdcqdahmfijbboknlpmobnqaaecjoiepaka"}}
|
71 |
+
{"translation":{"let":"neokiciombfbhboaedcq", "rev" :"qcdeaobhbfbmoicikoen"}}
|
72 |
+
{"translation":{"let":"eiafgnflg", "rev" :"glfngfaie"}}
|
73 |
+
{"translation":{"let":"egcnlf", "rev" :"flncge"}}
|
74 |
+
{"translation":{"let":"kgmiqofjoldadammbkj", "rev" :"jkbmmadadlojfoqimgk"}}
|
75 |
+
{"translation":{"let":"igldqkjkblic", "rev" :"cilbkjkqdlgi"}}
|
76 |
+
{"translation":{"let":"gkgnheeigjgbfeajofjcomhpioddnldfkhaanpepljkglp", "rev" :"plgkjlpepnaahkfdlnddoiphmocjfojaefbgjgieehngkg"}}
|
77 |
+
{"translation":{"let":"jiigeqelhlcfqfhobnjphkligohnidodekccao", "rev" :"oacckedodinhogilkhpjnbohfqfclhleqegiij"}}
|
78 |
+
{"translation":{"let":"agplnbjeije", "rev" :"ejiejbnlpga"}}
|
79 |
+
{"translation":{"let":"egehjhligd", "rev" :"dgilhjhege"}}
|
80 |
+
{"translation":{"let":"dgfnfqmhdchnabnieglhllmkieqkoigqkjcfapd", "rev" :"dpafcjkqgiokqeikmllhlgeinbanhcdhmqfnfgd"}}
|
81 |
+
{"translation":{"let":"jocgjhllacodkmnfljqfjbobg", "rev" :"gbobjfqjlfnmkdocallhjgcoj"}}
|
82 |
+
{"translation":{"let":"laagaadjhmnqifqgomichdnqheihbmbdbkoiompbe", "rev" :"ebpmoiokbdbmbhiehqndhcimogqfiqnmhjdaagaal"}}
|
83 |
+
{"translation":{"let":"dbfchgqihmpmeeanmoonhpgljhogphfpifqckceainlokjcdd", "rev" :"ddcjkolniaeckcqfipfhpgohjlgphnoomnaeempmhiqghcfbd"}}
|
84 |
+
{"translation":{"let":"ndebjooabakojoqqeqhbbolbkphjqllhkpmjf", "rev" :"fjmpkhllqjhpkblobbhqeqqojokabaoojbedn"}}
|
85 |
+
{"translation":{"let":"lqnlqppkccnbndjofpoaiajiikgcjjbbmlmkkhemmqfifaoddc", "rev" :"cddoafifqmmehkkmlmbbjjcgkiijaiaopfojdnbncckppqlnql"}}
|
86 |
+
{"translation":{"let":"hpakdpdjgfnejkodfj", "rev" :"jfdokjenfgjdpdkaph"}}
|
87 |
+
{"translation":{"let":"naeqqqondp", "rev" :"pdnoqqqean"}}
|
88 |
+
{"translation":{"let":"ohdfeimqoejmphackodikhnajaoqfgmbebikfpininllndcd", "rev" :"dcdnllninipfkibebmgfqoajanhkidokcahpmjeoqmiefdho"}}
|
89 |
+
{"translation":{"let":"qggmfdhp", "rev" :"phdfmggq"}}
|
90 |
+
{"translation":{"let":"iohgjcigepmjmk", "rev" :"kmjmpegicjghoi"}}
|
91 |
+
{"translation":{"let":"qpohdhafocfalploejgdpmqlmkdgaklknpiicqaaibjliik", "rev" :"kiiljbiaaqciipnklkagdkmlqmpdgjeolplafcofahdhopq"}}
|
92 |
+
{"translation":{"let":"dnligfpmlead", "rev" :"daelmpfgilnd"}}
|
93 |
+
{"translation":{"let":"feja", "rev" :"ajef"}}
|
94 |
+
{"translation":{"let":"gnmieihi", "rev" :"ihieimng"}}
|
95 |
+
{"translation":{"let":"qqbcpoengg", "rev" :"ggneopcbqq"}}
|
96 |
+
{"translation":{"let":"ihbgaenomidfdagfcqo", "rev" :"oqcfgadfdimoneagbhi"}}
|
97 |
+
{"translation":{"let":"ahfdemaejhmcaefeeiaqpcipbckldfnlodnmlokopqcjlhfj", "rev" :"jfhljcqpokolmndolnfdlkcbpicpqaieefeacmhjeamedfha"}}
|
98 |
+
{"translation":{"let":"ggqjbnqedemjonpddahc", "rev" :"chaddpnojmedeqnbjqgg"}}
|
99 |
+
{"translation":{"let":"akpjobfl", "rev" :"lfbojpka"}}
|
100 |
+
{"translation":{"let":"qbm", "rev" :"mbq"}}
|
readme.md
CHANGED
@@ -1 +1,11 @@
|
|
1 |
## Readme
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
## Readme
|
2 |
+
|
3 |
+
- 'let' = word generated using random letters
|
4 |
+
- 'rev' = the reverse of the generated word
|
5 |
+
|
6 |
+
|
7 |
+
### Goal
|
8 |
+
- Train a model on this dataset so that it can generate the reverse of a given word.
|
9 |
+
|
10 |
+
### Test DataSet
|
11 |
+
- The training set does not use every letter of the alphabet (only 'a' through 'q'), so the test set should consist only of words formed from the remaining letters.
|
requirements.txt
CHANGED
@@ -5,4 +5,5 @@ protobuf
|
|
5 |
sacrebleu >= 1.4.12
|
6 |
py7zr
|
7 |
torch >= 1.3
|
8 |
-
evaluate
|
|
|
|
5 |
sacrebleu >= 1.4.12
|
6 |
py7zr
|
7 |
torch >= 1.3
|
8 |
+
evaluate
|
9 |
+
transformers
|
trainModel.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
2 |
+
|
3 |
+
# Load the pretrained "t5-small" tokenizer and model.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Token-length caps applied when encoding the source and target text;
# longer sequences are truncated to these lengths.
max_source_length = 128
max_target_length = 128

input_ids = tokenizer(
    "translate English to German: The house is wonderful.",
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
).input_ids
labels = tokenizer(
    "Das Haus ist wunderbar.",
    max_length=max_target_length,
    truncation=True,
    return_tensors="pt",
).input_ids

# the forward function automatically creates the correct decoder_input_ids
loss = model(input_ids=input_ids, labels=labels).loss
# Single forward pass only: report the loss for this one example.
print(loss.item())
|