Vedant Vyas committed on
Commit
c1cfbda
1 Parent(s): 446d0be

generating data

Browse files
Files changed (5) hide show
  1. data/create_data.py +32 -0
  2. data/data.json +100 -0
  3. readme.md +10 -0
  4. requirements.txt +2 -1
  5. trainModel.py +16 -0
data/create_data.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+
3
+
4
def main(path='data.json', num_records=100, max_length=50):
    """Write random "word -> reversed word" records to a JSON-lines file.

    Each output line is one JSON object of the form
    ``{"translation":{"let":"<word>", "rev" :"<word reversed>"}}``, where
    ``<word>`` consists of 1 to ``max_length`` letters drawn uniformly, with
    repetition, from ``a``..``q``.

    Args:
        path: Destination file; it is overwritten if it already exists.
        num_records: Number of records (lines) to write.
        max_length: Length of the longest word to generate; at least 1.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError('max_length must be at least 1')

    print('Creating data...')

    # Only the letters a-q are used; the rest of the alphabet never appears
    # in the generated words.
    letters = 'abcdefghijklmnopq'

    # The context manager closes the file even if a write fails part-way.
    with open(path, 'w') as out:
        for _ in range(num_records):
            length = random.randint(1, max_length)
            # random.choices picks every letter with equal probability and
            # does not depend on a hard-coded alphabet size.
            word = ''.join(random.choices(letters, k=length))
            reversed_word = word[::-1]
            # Spacing inside the record is kept exactly as in the existing
            # data file so regenerated data has the same layout.
            out.write(
                f'{{"translation":{{"let":"{word}", "rev" :"{reversed_word}"}}}}\n'
            )


if __name__ == '__main__':
    main()
data/data.json ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"translation":{"let":"kmeihdamhqbppnknbgjmaibqhqiiemodjcecfqkofb", "rev" :"bfokqfcecjdomeiiqhqbiamjgbnknppbqhmadhiemk"}}
2
+ {"translation":{"let":"feqcklngqpkljodgijbqkaanpeqnalcohjiaaoepbeblcigdl", "rev" :"ldgiclbebpeoaaijhoclanqepnaakqbjigdojlkpqgnlkcqef"}}
3
+ {"translation":{"let":"hmbfeeiadfcmqnknipqfbgomgjkadakq", "rev" :"qkadakjgmogbfqpinknqmcfdaieefbmh"}}
4
+ {"translation":{"let":"mo", "rev" :"om"}}
5
+ {"translation":{"let":"fjablbnhokiedcdhp", "rev" :"phdcdeikohnblbajf"}}
6
+ {"translation":{"let":"jhnmmhani", "rev" :"inahmmnhj"}}
7
+ {"translation":{"let":"nmoamhfnbhdoiphd", "rev" :"dhpiodhbnfhmaomn"}}
8
+ {"translation":{"let":"nfqhpofdegmkbpcolikpoqkopm", "rev" :"mpokqopkilocpbkmgedfophqfn"}}
9
+ {"translation":{"let":"fkbqiehnnfqeqoom", "rev" :"mooqeqfnnheiqbkf"}}
10
+ {"translation":{"let":"ilfaicflhofjjmqoeegk", "rev" :"kgeeoqmjjfohlfciafli"}}
11
+ {"translation":{"let":"fhbdnaeicfopbddgqcjl", "rev" :"ljcqgddbpofcieandbhf"}}
12
+ {"translation":{"let":"ablpjlgffoionbikekqidfnqljmoddkhea", "rev" :"aehkddomjlqnfdiqkekibnoioffgljplba"}}
13
+ {"translation":{"let":"kon", "rev" :"nok"}}
14
+ {"translation":{"let":"afdbqdkqhoahejpelgnkloccchpjlggceiiofmmnbcldcgqg", "rev" :"gqgcdlcbnmmfoiiecggljphcccolknglepjehaohqkdqbdfa"}}
15
+ {"translation":{"let":"fgiigcgfqpokmdmophkdljh", "rev" :"hjldkhpomdmkopqfgcgiigf"}}
16
+ {"translation":{"let":"fokedojicgiph", "rev" :"hpigcijodekof"}}
17
+ {"translation":{"let":"icjpenbbcpifcgjhddcaolkan", "rev" :"nakloacddhjgcfipcbbnepjci"}}
18
+ {"translation":{"let":"hlhkopqcnhabkibjqkocjm", "rev" :"mjcokqjbikbahncqpokhlh"}}
19
+ {"translation":{"let":"clghiobfq", "rev" :"qfboihglc"}}
20
+ {"translation":{"let":"dgncnppnqnmfoqdlifimcpajjmpjojeepgkgibhkanilfikeo", "rev" :"oekiflinakhbigkgpeejojpmjjapcmifildqofmnqnppncngd"}}
21
+ {"translation":{"let":"iodhnekfnjbmeqpaca", "rev" :"acapqembjnfkenhdoi"}}
22
+ {"translation":{"let":"khlolfpibfk", "rev" :"kfbipflolhk"}}
23
+ {"translation":{"let":"hjlekcmdndinkp", "rev" :"pknidndmckeljh"}}
24
+ {"translation":{"let":"cgemjdplnadoeq", "rev" :"qeodanlpdjmegc"}}
25
+ {"translation":{"let":"gkiaegqnopelkqlhfopcdighac", "rev" :"cahgidcpofhlqkleponqgeaikg"}}
26
+ {"translation":{"let":"hjcmfgmijhgbeigonjjqi", "rev" :"iqjjnogiebghjimgfmcjh"}}
27
+ {"translation":{"let":"niponelgnqe", "rev" :"eqnglenopin"}}
28
+ {"translation":{"let":"cfqnenkonajmdknilbcfglgnplbod", "rev" :"doblpnglgfcblinkdmjanoknenqfc"}}
29
+ {"translation":{"let":"cqkfqaofhdcjniafpmbei", "rev" :"iebmpfainjcdhfoaqfkqc"}}
30
+ {"translation":{"let":"knlnkmjkenlqp", "rev" :"pqlnekjmknlnk"}}
31
+ {"translation":{"let":"cjhfkdoonfqdapepihjgpdgfpf", "rev" :"fpfgdpgjhipepadqfnoodkfhjc"}}
32
+ {"translation":{"let":"ipqfhlbeebgqcdqbgbbaicjinkkdogochlnfpneclcnibddka", "rev" :"akddbinclcenpfnlhcogodkknijciabbgbqdcqgbeeblhfqpi"}}
33
+ {"translation":{"let":"eclpohqf", "rev" :"fqhoplce"}}
34
+ {"translation":{"let":"ocnikilfdbqhocmnhjgalifejelhqafgjaipkohmhdfjb", "rev" :"bjfdhmhokpiajgfaqhlejefilagjhnmcohqbdflikinco"}}
35
+ {"translation":{"let":"eoeeibhkmhogkillhlbgbnhjfocpqhqqoicgajjlfoddogbmd", "rev" :"dmbgoddofljjagcioqqhqpcofjhnbgblhllikgohmkhbieeoe"}}
36
+ {"translation":{"let":"cdpojifjbndfobmnejiljjmojonhqfdidiqekehflkqpc", "rev" :"cpqklfhekeqididfqhnojomjjlijenmbofdnbjfijopdc"}}
37
+ {"translation":{"let":"ckcehejhfjp", "rev" :"pjfhjeheckc"}}
38
+ {"translation":{"let":"lgplgdffqicglolnienmqajqkkcknmlejaqkd", "rev" :"dkqajelmnkckkqjaqmneinlolgciqffdglpgl"}}
39
+ {"translation":{"let":"niolfclqnahgimdngflfaidcdbpibljhddh", "rev" :"hddhjlbipbdcdiaflfgndmighanqlcfloin"}}
40
+ {"translation":{"let":"pgcfpnoknmmkhbdlbifhjebhlageigofidlcgjnmaaqlcq", "rev" :"qclqaamnjgcldifogiegalhbejhfibldbhkmmnkonpfcgp"}}
41
+ {"translation":{"let":"hoknmkdaegfiqgcgdqknfbffgcgepablj", "rev" :"jlbapegcgffbfnkqdgcgqifgeadkmnkoh"}}
42
+ {"translation":{"let":"dbgbdmjqfmgeqbidfalooplcdjjdgaaqlnjlecnloifkm", "rev" :"mkfiolnceljnlqaagdjjdclpoolafdibqegmfqjmdbgbd"}}
43
+ {"translation":{"let":"dildqcnbihlkfjhcgqpajqbfblifqgncemidfnejqhnpmmmhgn", "rev" :"nghmmmpnhqjenfdimecngqfilbfbqjapqgchjfklhibncqdlid"}}
44
+ {"translation":{"let":"qnekalgommcllnffapjhibgc", "rev" :"cgbihjpaffnllcmmoglakenq"}}
45
+ {"translation":{"let":"ffcfklnpjifmlqjmbpgq", "rev" :"qgpbmjqlmfijpnlkfcff"}}
46
+ {"translation":{"let":"fmbdaeqepgbpnnpfhnkg", "rev" :"gknhfpnnpbgpeqeadbmf"}}
47
+ {"translation":{"let":"keondhhneimpehpfphkdfkpdoqgejbbqe", "rev" :"eqbbjegqodpkfdkhpfphepmienhhdnoek"}}
48
+ {"translation":{"let":"qijcppbglonpjhfoebhnlfcdilnbnacjjalfdgmakgeofmq", "rev" :"qmfoegkamgdflajjcanbnlidcflnhbeofhjpnolgbppcjiq"}}
49
+ {"translation":{"let":"mhffcenmodhppj", "rev" :"jpphdomnecffhm"}}
50
+ {"translation":{"let":"djfkkhjibaaghqllligeoigb", "rev" :"bgioegilllqhgaabijhkkfjd"}}
51
+ {"translation":{"let":"mmo", "rev" :"omm"}}
52
+ {"translation":{"let":"omiqhheqcplkjcbqaqpqbaddnennjcghkmmidkfh", "rev" :"hfkdimmkhgcjnnenddabqpqaqbcjklpcqehhqimo"}}
53
+ {"translation":{"let":"hcadoqjnmkilf", "rev" :"flikmnjqodach"}}
54
+ {"translation":{"let":"jgmpeebjakplbbjamhminpfncplghmogccdkih", "rev" :"hikdccgomhglpcnfpnimhmajbblpkajbeepmgj"}}
55
+ {"translation":{"let":"gelbbogqmcgmiklcilfcopmlionehhhdodkoakmjjdgkiqd", "rev" :"dqikgdjjmkaokdodhhhenoilmpocfliclkimgcmqgobbleg"}}
56
+ {"translation":{"let":"dkqndjqcappkijgomohhcbacnklpfbndfgebidaegiglhlebf", "rev" :"fbelhlgigeadibegfdnbfplkncabchhomogjikppacqjdnqkd"}}
57
+ {"translation":{"let":"hnlepbfcdqmfqcpiqobbeakmog", "rev" :"gomkaebboqipcqfmqdcfbpelnh"}}
58
+ {"translation":{"let":"ednabgfmdjmigfj", "rev" :"jfgimjdmfgbande"}}
59
+ {"translation":{"let":"qmfi", "rev" :"ifmq"}}
60
+ {"translation":{"let":"jdkqgahqcgbfidnmlghodmmhafcbnpmklcpbai", "rev" :"iabpclkmpnbcfahmmdohglmndifbgcqhagqkdj"}}
61
+ {"translation":{"let":"bfolnmibffeeheigljffhkdbn", "rev" :"nbdkhffjlgieheeffbimnlofb"}}
62
+ {"translation":{"let":"ncliiah", "rev" :"haiilcn"}}
63
+ {"translation":{"let":"bbdnkjcepnkifibpnhecgjnqjaq", "rev" :"qajqnjgcehnpbifiknpecjkndbb"}}
64
+ {"translation":{"let":"jhnaabidkllfib", "rev" :"bifllkdibaanhj"}}
65
+ {"translation":{"let":"mck", "rev" :"kcm"}}
66
+ {"translation":{"let":"bhfhnljipjhdfokgaiofgidjeciallabq", "rev" :"qballaicejdigfoiagkofdhjpijlnhfhb"}}
67
+ {"translation":{"let":"fjhkloiepbehdbeipefohlabfcfcqlceeomdnl", "rev" :"lndmoeeclqcfcfbalhofepiebdhebpeiolkhjf"}}
68
+ {"translation":{"let":"icfehgglfbfbkkdnadkmp", "rev" :"pmkdandkkbfbflgghefci"}}
69
+ {"translation":{"let":"ipjebbacgkpjecnqinjm", "rev" :"mjniqncejpkgcabbejpi"}}
70
+ {"translation":{"let":"akapeiojceaaqnbomplnkobbjifmhadqcdgp", "rev" :"pgdcqdahmfijbboknlpmobnqaaecjoiepaka"}}
71
+ {"translation":{"let":"neokiciombfbhboaedcq", "rev" :"qcdeaobhbfbmoicikoen"}}
72
+ {"translation":{"let":"eiafgnflg", "rev" :"glfngfaie"}}
73
+ {"translation":{"let":"egcnlf", "rev" :"flncge"}}
74
+ {"translation":{"let":"kgmiqofjoldadammbkj", "rev" :"jkbmmadadlojfoqimgk"}}
75
+ {"translation":{"let":"igldqkjkblic", "rev" :"cilbkjkqdlgi"}}
76
+ {"translation":{"let":"gkgnheeigjgbfeajofjcomhpioddnldfkhaanpepljkglp", "rev" :"plgkjlpepnaahkfdlnddoiphmocjfojaefbgjgieehngkg"}}
77
+ {"translation":{"let":"jiigeqelhlcfqfhobnjphkligohnidodekccao", "rev" :"oacckedodinhogilkhpjnbohfqfclhleqegiij"}}
78
+ {"translation":{"let":"agplnbjeije", "rev" :"ejiejbnlpga"}}
79
+ {"translation":{"let":"egehjhligd", "rev" :"dgilhjhege"}}
80
+ {"translation":{"let":"dgfnfqmhdchnabnieglhllmkieqkoigqkjcfapd", "rev" :"dpafcjkqgiokqeikmllhlgeinbanhcdhmqfnfgd"}}
81
+ {"translation":{"let":"jocgjhllacodkmnfljqfjbobg", "rev" :"gbobjfqjlfnmkdocallhjgcoj"}}
82
+ {"translation":{"let":"laagaadjhmnqifqgomichdnqheihbmbdbkoiompbe", "rev" :"ebpmoiokbdbmbhiehqndhcimogqfiqnmhjdaagaal"}}
83
+ {"translation":{"let":"dbfchgqihmpmeeanmoonhpgljhogphfpifqckceainlokjcdd", "rev" :"ddcjkolniaeckcqfipfhpgohjlgphnoomnaeempmhiqghcfbd"}}
84
+ {"translation":{"let":"ndebjooabakojoqqeqhbbolbkphjqllhkpmjf", "rev" :"fjmpkhllqjhpkblobbhqeqqojokabaoojbedn"}}
85
+ {"translation":{"let":"lqnlqppkccnbndjofpoaiajiikgcjjbbmlmkkhemmqfifaoddc", "rev" :"cddoafifqmmehkkmlmbbjjcgkiijaiaopfojdnbncckppqlnql"}}
86
+ {"translation":{"let":"hpakdpdjgfnejkodfj", "rev" :"jfdokjenfgjdpdkaph"}}
87
+ {"translation":{"let":"naeqqqondp", "rev" :"pdnoqqqean"}}
88
+ {"translation":{"let":"ohdfeimqoejmphackodikhnajaoqfgmbebikfpininllndcd", "rev" :"dcdnllninipfkibebmgfqoajanhkidokcahpmjeoqmiefdho"}}
89
+ {"translation":{"let":"qggmfdhp", "rev" :"phdfmggq"}}
90
+ {"translation":{"let":"iohgjcigepmjmk", "rev" :"kmjmpegicjghoi"}}
91
+ {"translation":{"let":"qpohdhafocfalploejgdpmqlmkdgaklknpiicqaaibjliik", "rev" :"kiiljbiaaqciipnklkagdkmlqmpdgjeolplafcofahdhopq"}}
92
+ {"translation":{"let":"dnligfpmlead", "rev" :"daelmpfgilnd"}}
93
+ {"translation":{"let":"feja", "rev" :"ajef"}}
94
+ {"translation":{"let":"gnmieihi", "rev" :"ihieimng"}}
95
+ {"translation":{"let":"qqbcpoengg", "rev" :"ggneopcbqq"}}
96
+ {"translation":{"let":"ihbgaenomidfdagfcqo", "rev" :"oqcfgadfdimoneagbhi"}}
97
+ {"translation":{"let":"ahfdemaejhmcaefeeiaqpcipbckldfnlodnmlokopqcjlhfj", "rev" :"jfhljcqpokolmndolnfdlkcbpicpqaieefeacmhjeamedfha"}}
98
+ {"translation":{"let":"ggqjbnqedemjonpddahc", "rev" :"chaddpnojmedeqnbjqgg"}}
99
+ {"translation":{"let":"akpjobfl", "rev" :"lfbojpka"}}
100
+ {"translation":{"let":"qbm", "rev" :"mbq"}}
readme.md CHANGED
@@ -1 +1,11 @@
1
  ## Readme
 
 
 
 
 
 
 
 
 
 
 
1
  ## Readme
2
+
3
+ - 'let' = word generated using random letters
4
+ - 'rev' = the 'let' word reversed
5
+
6
+
7
+ ### Goal
8
+ - Train a model on this dataset so that it can generate the reverse of a given word
9
+
10
+ ### Test DataSet
11
+ - The training set does not use every letter of the alphabet, so the test set should contain only words formed from the remaining letters
requirements.txt CHANGED
@@ -5,4 +5,5 @@ protobuf
5
  sacrebleu >= 1.4.12
6
  py7zr
7
  torch >= 1.3
8
- evaluate
 
 
5
  sacrebleu >= 1.4.12
6
  py7zr
7
  torch >= 1.3
8
+ evaluate
9
+ transformers
trainModel.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
2
+
3
# Load the pretrained "t5-small" tokenizer and sequence-to-sequence model.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Intended token-length limits for source and target sequences.
# NOTE(review): neither value is referenced anywhere in the visible script yet.
max_source_length = 128
max_target_length = 128

# A single hard-coded example pair: the task-prefixed source sentence and the
# target sentence, each tokenized into a PyTorch tensor of token ids.
input_ids = tokenizer(
    "translate English to German: The house is wonderful.",
    return_tensors="pt",
).input_ids
labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids

# the forward function automatically creates the correct decoder_input_ids
loss = model(input_ids=input_ids, labels=labels).loss
# Report the scalar loss once; the earlier bare `loss.item()` statement
# discarded its result and has been removed.
print(loss.item())