Shankhdhar committed on Jul 16, 2021

Commit

5944e36

•

1 Parent(s): ec34615

Saving weights and logs of step 100

Browse files

Files changed (24) hide show

.gitattributes +3 -0
Train.tsv +3 -0
Untitled.ipynb +356 -134
Val.tsv +0 -0
events.out.tfevents.1626357469.t1v-n-1926f308-w-0.95268.3.v2 +2 -2
events.out.tfevents.1626357733.t1v-n-1926f308-w-0.96764.3.v2 +2 -2
events.out.tfevents.1626358530.t1v-n-1926f308-w-0.97646.3.v2 +3 -0
events.out.tfevents.1626359662.t1v-n-1926f308-w-0.99131.3.v2 +3 -0
events.out.tfevents.1626359960.t1v-n-1926f308-w-0.100528.3.v2 +3 -0
events.out.tfevents.1626447876.t1v-n-1926f308-w-0.111464.3.v2 +3 -0
events.out.tfevents.1626451747.t1v-n-1926f308-w-0.115707.3.v2 +3 -0
events.out.tfevents.1626451941.t1v-n-1926f308-w-0.117207.3.v2 +3 -0
flax_model.msgpack +1 -1
flowmasteri.txt +0 -0
run.sh +7 -7
run_clm_flax.py +5 -3
text_collection/text_collection.py +104 -0
text_collection/text_collection.py.lock +0 -0
train/train2.txt +3 -0
train/val2.txt +0 -0
val/val2.txt +0 -0
wutang.txt +0 -0
ye.txt +0 -0
zenske.txt +0 -0

.gitattributes CHANGED Viewed

@@ -15,3 +15,6 @@
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Train.tsv filter=lfs diff=lfs merge=lfs -text
+train/train2.tsv filter=lfs diff=lfs merge=lfs -text
+train/train2.txt filter=lfs diff=lfs merge=lfs -text

Train.tsv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:60458763e1bfeeeb3239b0a1b58bbdb329b50dffdf5508500266350461fc6bdf
+size 19817903

Untitled.ipynb CHANGED Viewed

@@ -2,17 +2,19 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 12,
    "id": "81fd300c",
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
    "id": "1237ddf7",
    "metadata": {},
    "outputs": [
@@ -20,20 +22,33 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "3418\n"
      ]
     }
    ],
    "source": [
     "with open(\"Lilgpt.txt\",'r') as file:\n",
     "    data = file.read()\n",
-    "List = data.split(\"<EOS>\")\n",
     "print(len(List))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
    "id": "18b93c2e",
    "metadata": {},
    "outputs": [
@@ -120,7 +135,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
    "id": "9c7698db",
    "metadata": {},
    "outputs": [
@@ -128,11 +143,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\n",
-      "<BOS>\n",
-      "3 Headed Goat[Intro]\n",
       "(Aviator)\n",
-      "\n",
       "[Chorus: Lil Baby]\n",
       "These ain't no Guess jeans\n",
       "I dropped out of school, I'm still good at math, but, nigga, don't test me\n",
@@ -143,7 +155,6 @@
       "He say I'm hard and he say I'm garbage, I'm rich regardless\n",
       "We in Miami in the middle of the winter, and we on them jet skis\n",
       "If we in Atlanta, I'm runnin' the 'Cat and I'm workin' the red key\n",
-      "\n",
       "[Verse 1: Lil Durk]\n",
       "I cannot mention my homies inside of my song 'cause I know they be trappin' a lot\n",
       "I can't keep takin' these pills, when I'm in the trenches, they say I be cappin' a lot\n",
@@ -160,7 +171,6 @@
       "Only twenty-five, livin' like a boss, ridin' 'round with a chauffeur\n",
       "I don't sell drugs, still be paranoid, keep lookin' over my shoulder\n",
       "Niggas lyin' like I'm stealin' swag, boy, that's my shit like I wrote it\n",
-      "\n",
       "[Verse 3: Polo G]\n",
       "Uh\n",
       "These rappers really nice as hell\n",
@@ -196,22 +206,24 @@
       "Play like I'm dumb, as soon as it pop, I'm goin' retarded\n",
       "He say I'm hard and he say I'm garbage, I'm rich regardless\n",
       "We in Miami in the middle of the winter, and we on them jet skis\n",
-      "If we in Atlanta, I'm runnin' the 'Cat and I'm workin' the red key35EmbedShare URLCopyEmbedCopy\n",
-      "<EOS>\n"
      ]
     }
    ],
    "source": [
     "NewList = []\n",
     "for l in List:\n",
-    "    n = l+\"<EOS>\"\n",
     "    NewList.append(n)\n",
     "print(NewList[1])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
    "id": "f31e28d8",
    "metadata": {},
    "outputs": [
@@ -219,21 +231,21 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "330\n",
-      "3088\n"
      ]
     }
    ],
    "source": [
     "counter = 0\n",
     "List_val = []\n",
     "List_train = []\n",
-    "for l in List:\n",
-    "    n = l+\"<EOS>\"\n",
-    "    if counter<330:\n",
-    "        List_val.append(n)\n",
     "    else:\n",
-    "        List_train.append(n)\n",
     "    counter += 1\n",
     "print(len(List_val))\n",
     "print(len(List_train))"
@@ -241,7 +253,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
    "id": "9efd0b25",
    "metadata": {},
    "outputs": [
@@ -249,165 +261,375 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "<BOS>\n",
-      "My Beyoncé[Chorus: Lil Durk]\n",
-      "Ooh, I like the way she move\n",
-      "Shorty my baby, my everything, she the truth\n",
-      "Together we cool, me and her can't lose\n",
-      "Keep 'em on their feet, baby, I know they so confused\n",
-      "Shorty my Beyoncé\n",
-      "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
-      "Shorty my Beyoncé\n",
-      "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
-      "My Beyoncé\n",
-      "\n",
-      "[Verse 1: Lil Durk]\n",
-      "Trippin' on that drank, but I know she worth it\n",
-      "Independent baby, I know she workin'\n",
-      "Adriana's serving drinks, 20 bottles, urgent\n",
-      "I know it can be better but nobody's perfect\n",
-      "We flirted for a minute, DeJ, that's my baby\n",
-      "I ain't trippin', I'm like Henny, yeah I'm in her kidneys\n",
-      "She like to play her songs to the way I'm hittin' it\n",
-      "Turn around like, \"Damn Durk, I like the way you hittin' it\"\n",
-      "Don't believe the rumors, girl\n",
-      "You know I'll do you, girl\n",
-      "I don't wanna hear the shit about the niggas\n",
-      "That tried to do you, girl\n",
-      "Fuck the past right now\n",
-      "Shawty got you right now\n",
-      "And you hot right now\n",
-      "You can get it right now, baby\n",
-      "[Chorus: Lil Durk]\n",
-      "Ooh, I like the way she move\n",
-      "Shorty my baby, my everything, she the truth\n",
-      "Together we cool, me and her can't lose\n",
-      "Keep 'em on their feet, baby, I know they so confused\n",
-      "Shorty my Beyoncé\n",
-      "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
-      "Shorty my Beyoncé\n",
-      "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
-      "My Beyoncé\n",
-      "[Verse 2: DeJ Loaf]\n",
-      "I let him get it when he want it, knock it down and push up on it\n",
-      "I was plottin' for a while, now I got him where I want him\n",
-      "They didn't understand none of this was planned\n",
-      "99 problems but a bitch better not be none\n",
-      "Na na, na na, yeah yeah\n",
-      "This ain't what he want, I told him that\n",
-      "Leave your girl, be through with that\n",
-      "Get with DeJ, he ain't ever goin' back\n",
-      "He was shy when I seen him, now he smile\n",
-      "Heard a few rumors but they ain't my style\n",
-      "I be hatin' when he out of town\n",
-      "Hotel, I FaceTime you, no towel\n",
-      "They ain't get it but they ain't our problem\n",
-      "What the fuck can they do about it?\n",
-      "Durk and DeJ\n",
-      "I'm thinkin' 'bout changin' my last name\n",
-      "[Chorus: Lil Durk]\n",
-      "Ooh, I like the way she move\n",
-      "Shorty my baby, my everything, she the truth\n",
-      "Together we cool, me and her can't lose\n",
-      "Keep 'em on their feet, baby, I know they so confused\n",
-      "Shorty my Beyoncé\n",
-      "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
-      "Shorty my Beyoncé\n",
-      "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
-      "My Beyoncé\n",
-      "\n",
-      "[Verse 3: Lil Durk]\n",
-      "You and I\n",
-      "White dress, flowers, and a suit and tie\n",
-      "Me and you like Bonnie and Clyde\n",
-      "No beat the case, we're do or die\n",
-      "Who am I to say you ain't natural?\n",
-      "Your haters my haters, ain't switchin' up, baby, I got you\n",
-      "I'm with her like a tattoo\n",
-      "The way you wear that dress, they gon' attack you\n",
-      "The way you look at me, baby, I got you\n",
-      "I hit it from the front, I like the back too\n",
-      "She say, \"Lay down so I can ride you\"\n",
-      "I know that she fiending\n",
-      "She scratchin' my back, I like how she screamin'\n",
-      "I fuck her and leave her, she fiending\n",
-      "Shawty my Beyoncé\n",
-      "[Chorus: Lil Durk]\n",
-      "Ooh, I like the way she move\n",
-      "Shorty my baby, my everything, she the truth\n",
-      "Together we cool, me and her can't lose\n",
-      "Keep 'em on their feet, baby, I know they so confused\n",
-      "Shorty my Beyoncé\n",
-      "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
-      "Shorty my Beyoncé\n",
-      "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
-      "My Beyoncé\n",
-      "\n",
-      "[Outro: Lil Durk]\n",
-      "Ooh, ooh\n",
-      "Durk and DeJ, Durk and DeJ, Durk and DeJ\n",
-      "Ooh, ooh43EmbedShare URLCopyEmbedCopy\n",
-      "<EOS>\n"
      ]
     }
    ],
    "source": [
-    "print(List_val[0])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
    "id": "5dfecab1",
    "metadata": {},
    "outputs": [],
    "source": [
     "val_set =List_val[0]\n",
     "for i in range(1,len(List_val)):\n",
-    "    val_set = val_set+List_val[i]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
    "id": "3a895d2f",
    "metadata": {},
    "outputs": [],
    "source": [
     "train_set =List_train[0]\n",
     "for i in range(1,len(List_train)):\n",
-    "    train_set = train_set+List_train[i]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
    "id": "74ea6efc",
    "metadata": {},
    "outputs": [],
    "source": [
-    "file1 = open(\"train.txt\",\"w+\")\n",
     "file1.write(train_set)\n",
     "file1.close()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
    "id": "3416e6da",
    "metadata": {},
    "outputs": [],
    "source": [
-    "file2 = open(\"val.txt\",\"w+\")\n",
     "file2.write(val_set)\n",
     "file2.close()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "1bd0ca86",
    "metadata": {},
    "outputs": [],
    "source": []
   }

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 1,
    "id": "81fd300c",
    "metadata": {},
    "outputs": [],
    "source": [
+    "import pandas as pd\n",
+    "import os\n",
+    "import random"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "id": "1237ddf7",
    "metadata": {},
    "outputs": [
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "8500\n"
      ]
     }
    ],
    "source": [
     "with open(\"Lilgpt.txt\",'r') as file:\n",
     "    data = file.read()\n",
+    "List1 = data.split(\"<EOS>\")\n",
+    "with open(\"ye.txt\",'r') as file:\n",
+    "    data = file.read()\n",
+    "List2 = data.split(\"<EOS>\")\n",
+    "with open(\"zenske.txt\",'r') as file:\n",
+    "    data = file.read()\n",
+    "List3 = data.split(\"<EOS>\")\n",
+    "with open(\"flowmasteri.txt\",'r') as file:\n",
+    "    data = file.read()\n",
+    "List4 = data.split(\"<EOS>\")\n",
+    "with open(\"wutang.txt\",'r') as file:\n",
+    "    data = file.read()\n",
+    "List5 = data.split(\"<EOS>\")\n",
+    "List = List1+List2+List3+List4+List5\n",
     "print(len(List))"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "id": "18b93c2e",
    "metadata": {},
    "outputs": [
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
    "id": "9c7698db",
    "metadata": {},
    "outputs": [
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "<BOS>3 Headed Goat[Intro]\n",
       "(Aviator)\n",
       "[Chorus: Lil Baby]\n",
       "These ain't no Guess jeans\n",
       "I dropped out of school, I'm still good at math, but, nigga, don't test me\n",
       "He say I'm hard and he say I'm garbage, I'm rich regardless\n",
       "We in Miami in the middle of the winter, and we on them jet skis\n",
       "If we in Atlanta, I'm runnin' the 'Cat and I'm workin' the red key\n",
       "[Verse 1: Lil Durk]\n",
       "I cannot mention my homies inside of my song 'cause I know they be trappin' a lot\n",
       "I can't keep takin' these pills, when I'm in the trenches, they say I be cappin' a lot\n",
       "Only twenty-five, livin' like a boss, ridin' 'round with a chauffeur\n",
       "I don't sell drugs, still be paranoid, keep lookin' over my shoulder\n",
       "Niggas lyin' like I'm stealin' swag, boy, that's my shit like I wrote it\n",
       "[Verse 3: Polo G]\n",
       "Uh\n",
       "These rappers really nice as hell\n",
       "Play like I'm dumb, as soon as it pop, I'm goin' retarded\n",
       "He say I'm hard and he say I'm garbage, I'm rich regardless\n",
       "We in Miami in the middle of the winter, and we on them jet skis\n",
+      "If we in Atlanta, I'm runnin' the 'Cat and I'm workin' the red key35EmbedShare URLCopyEmbedCopy<EOS>\n"
      ]
     }
    ],
    "source": [
     "NewList = []\n",
     "for l in List:\n",
+    "    n = l\n",
+    "    n = n.replace(\"<BOS>\",\"\")\n",
+    "    n = os.linesep.join([s for s in n.splitlines() if s])\n",
+    "    n = \"<BOS>\"+ n + \"<EOS>\"\n",
     "    NewList.append(n)\n",
     "print(NewList[1])"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 5,
    "id": "f31e28d8",
    "metadata": {},
    "outputs": [
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "800\n",
+      "7700\n"
      ]
     }
    ],
    "source": [
+    "random.shuffle(NewList)\n",
     "counter = 0\n",
     "List_val = []\n",
     "List_train = []\n",
+    "for l in NewList:\n",
+    "    if counter<800:\n",
+    "        List_val.append(l)\n",
     "    else:\n",
+    "        List_train.append(l)\n",
     "    counter += 1\n",
     "print(len(List_val))\n",
     "print(len(List_train))"
   },
   {
    "cell_type": "code",
+   "execution_count": 6,
    "id": "9efd0b25",
    "metadata": {},
    "outputs": [
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "<BOS>PainI finally moved out my momma house\n",
+      "Got a happy home\n",
+      "Only thing fucked up, Daddy gone (Gone forever man)\n",
+      "Listen up this is real shit (Real shit)\n",
+      "It's fucked up when u got nobody to share it with (Fucked up)\n",
+      "Give my last so my grandma can see me now (See me now)\n",
+      "Smilin tellin everybody bout her grandbaby (her grandbaby)\n",
+      "T-Lady hold on all I got is you (Mama)\n",
+      "It's fucked up what I done put me and my mama through\n",
+      "What's happenin Ivory I know you lookin down watchin me\n",
+      "Everyday I got yo kids fresh from head to feet (All of em)\n",
+      "Wish I can put u in the bentley with the brown [? ]\n",
+      "I wish we never would've went out to that club\n",
+      "On the road to riches mane lost a flock of niggas\n",
+      "[? ] a whole block of niggas\n",
+      "Niggas bite the hand that feed em I done seen it all\n",
+      "Jus got a call... Slo dead... Not my fuckin dawg\n",
+      "I never thought this pain last this many years\n",
+      "I... I never thought this pain last this many years\n",
+      "Still in tears\n",
+      "I... I never thought this pain last this many years\n",
+      "I lost my daddy in a heartbeat\n",
+      "Real talk eyes turnin right in front me\n",
+      "Ever since I saw that shit... it hunt me\n",
+      "Pray for me cause I shake in my fuckin sleep\n",
+      "Wish I could[? ] shoot the 50 nigga\n",
+      "I shoulda put him under my wing and I miss him nigga\n",
+      "I told [? ] way back stop ridin with ya pack\n",
+      "He ain't listen\n",
+      "And now my nigga right back in penitentury he don't listen\n",
+      "I lost Ivory and it fucked me up\n",
+      "My whole life changed\n",
+      "Nigga put me up in beast mode\n",
+      "My heart cold\n",
+      "And as the years role\n",
+      "I wish that he could have everything I got\n",
+      "And a lil mo'\n",
+      "Yo kids love me nigga\n",
+      "My kids and yo kids call each other sister and brothers\n",
+      "You forever my lil hustler\n",
+      "Just saw yo gangsta at the car lot nigga and a tear dropped nigga\n",
+      "I never thought this pain last this many years\n",
+      "I... I never thought this pain last this many years\n",
+      "Still in tears\n",
+      "I... I never thought this pain last this many years\n",
+      "Seem like it was yesterday\n",
+      "The mo money I get this shit hurt\n",
+      "I thought that would take the stress away\n",
+      "To see u smilin with yo chain countin money nigga\n",
+      "When Trina pulled you on stage you was stuntin nigga\n",
+      "But through it all mane you kno how we rock nigga\n",
+      "From the yo to the block to the hospital\n",
+      "Never again would I have another big dawg\n",
+      "But ya mama and kids I got em all but I\n",
+      "I never thought this pain last this many years\n",
+      "I... I never thought this pain last this many years\n",
+      "Still in tears\n",
+      "I... I never thought this pain last this many yea2EmbedShare URLCopyEmbedCopy<EOS>\n"
      ]
     }
    ],
    "source": [
+    "print(List_train[1])"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 7,
    "id": "5dfecab1",
    "metadata": {},
    "outputs": [],
    "source": [
     "val_set =List_val[0]\n",
     "for i in range(1,len(List_val)):\n",
+    "    val_set = val_set+\"\\n\\n\"+ List_val[i]"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 8,
    "id": "3a895d2f",
    "metadata": {},
    "outputs": [],
    "source": [
     "train_set =List_train[0]\n",
     "for i in range(1,len(List_train)):\n",
+    "    train_set = train_set+\"\\n\\n\"+List_train[i]"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 9,
    "id": "74ea6efc",
    "metadata": {},
    "outputs": [],
    "source": [
+    "file1 = open(\"train2.txt\",\"w+\")\n",
     "file1.write(train_set)\n",
     "file1.close()"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 10,
    "id": "3416e6da",
    "metadata": {},
    "outputs": [],
    "source": [
+    "file2 = open(\"val2.txt\",\"w+\")\n",
     "file2.write(val_set)\n",
     "file2.close()"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 15,
    "id": "1bd0ca86",
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Songs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>&lt;BOS&gt;Playa Hater[Intro: The Notorious B.I.G.]\\...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>&lt;BOS&gt;PainI finally moved out my momma house\\nG...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>&lt;BOS&gt;I Don’t Do Much[Flip talking]\\nCounting m...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>&lt;BOS&gt;Fuck the Weatherman[Intro]\\nLike seriousl...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>&lt;BOS&gt;It’s Kim Bitches (Get That Money)[Intro]\\...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                               Songs\n",
+       "0  <BOS>Playa Hater[Intro: The Notorious B.I.G.]\\...\n",
+       "1  <BOS>PainI finally moved out my momma house\\nG...\n",
+       "2  <BOS>I Don’t Do Much[Flip talking]\\nCounting m...\n",
+       "3  <BOS>Fuck the Weatherman[Intro]\\nLike seriousl...\n",
+       "4  <BOS>It’s Kim Bitches (Get That Money)[Intro]\\..."
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_train = pd.DataFrame(List_train,columns = [\"Songs\"])\n",
+    "df_train.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "f78a2abc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Songs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>&lt;BOS&gt;Straight Gutta[Verse 1: Streetlife]\\nI’m ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>&lt;BOS&gt;Georgia... Bush / Weezy’z Ambitionz[Part ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>&lt;BOS&gt;1991 Freestyle[Intro: Ol' Dirty Bastard]\\...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>&lt;BOS&gt;The Wolf* iTunes bonus track\\n[RZA]\\nWatc...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>&lt;BOS&gt;Never Mind[Pre-Chorus]\\nI'm a pimp under ...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                               Songs\n",
+       "0  <BOS>Straight Gutta[Verse 1: Streetlife]\\nI’m ...\n",
+       "1  <BOS>Georgia... Bush / Weezy’z Ambitionz[Part ...\n",
+       "2  <BOS>1991 Freestyle[Intro: Ol' Dirty Bastard]\\...\n",
+       "3  <BOS>The Wolf* iTunes bonus track\\n[RZA]\\nWatc...\n",
+       "4  <BOS>Never Mind[Pre-Chorus]\\nI'm a pimp under ..."
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_val = pd.DataFrame(List_val,columns = [\"Songs\"])\n",
+    "df_val.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "240e7d42",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train.to_csv(\"Train.tsv\",sep=\"\\t\",index = False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "5899b85b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_val.to_csv(\"Val.tsv\",sep=\"\\t\",index = False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "c0dd966c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(\"Train.tsv\",sep = \"\\t\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "922af4c8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Songs</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>&lt;BOS&gt;Playa Hater[Intro: The Notorious B.I.G.]\\...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>&lt;BOS&gt;PainI finally moved out my momma house\\nG...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>&lt;BOS&gt;I Don’t Do Much[Flip talking]\\nCounting m...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>&lt;BOS&gt;Fuck the Weatherman[Intro]\\nLike seriousl...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>&lt;BOS&gt;It’s Kim Bitches (Get That Money)[Intro]\\...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                               Songs\n",
+       "0  <BOS>Playa Hater[Intro: The Notorious B.I.G.]\\...\n",
+       "1  <BOS>PainI finally moved out my momma house\\nG...\n",
+       "2  <BOS>I Don’t Do Much[Flip talking]\\nCounting m...\n",
+       "3  <BOS>Fuck the Weatherman[Intro]\\nLike seriousl...\n",
+       "4  <BOS>It’s Kim Bitches (Get That Money)[Intro]\\..."
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "be9f103f",
+   "metadata": {},
    "outputs": [],
    "source": []
   }

Val.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff

events.out.tfevents.1626357469.t1v-n-1926f308-w-0.95268.3.v2 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:65d6d947b3da9df3dfc09cc3a657592d85fc8ba3783af91c196fbab5f0846dd3
-size 60409

 version https://git-lfs.github.com/spec/v1
+oid sha256:d2254be5c9709a35e6a20a7a795789ea4d605f4e2bde68aeda8de3b6623dbd7f
+size 113336

events.out.tfevents.1626357733.t1v-n-1926f308-w-0.96764.3.v2 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cb1f4267c9667e83a6c35e620806d8776ce4dc66d1d56559186813567ba5443e
-size 40

 version https://git-lfs.github.com/spec/v1
+oid sha256:5c5b373c809857af52071b4078af469dfd5781cc3edccee286293ccf860202be
+size 7499

events.out.tfevents.1626358530.t1v-n-1926f308-w-0.97646.3.v2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:00566fffb65e756600911e0f37b177d585f68f4ad68d1a6c0a3530b59a66079c
+size 113336

events.out.tfevents.1626359662.t1v-n-1926f308-w-0.99131.3.v2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:52dc8b0bbb392d47d3f14c2270a79387f36808449954df4e8b48828f69f720c8
+size 40

events.out.tfevents.1626359960.t1v-n-1926f308-w-0.100528.3.v2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b36a686716fbe468b3e656c14476497e60407502dd59b6819aa337f76722e0d
+size 113336

events.out.tfevents.1626447876.t1v-n-1926f308-w-0.111464.3.v2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e57a9d1e97e640dbcc715deff93ca140880825c478cc2e3e93dc019996fb5082
+size 40

events.out.tfevents.1626451747.t1v-n-1926f308-w-0.115707.3.v2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8da8c6d1c7dddaf6f0c6d63a47f39127cfe30dd180c6b96de36aabe985bcd54a
+size 40

events.out.tfevents.1626451941.t1v-n-1926f308-w-0.117207.3.v2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1d2fbd80ea41fd70f1baff6dc2dbb4107fe920c2bdecd42a07b86a1bad0b24e
+size 44657

flax_model.msgpack CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:52a21818a42f9e9e503a5cd0d5d0279f31672a49dfc997a12eea5a025b0b4109
 size 497764120

 version https://git-lfs.github.com/spec/v1
+oid sha256:742e9e9fd4077234f5e0a5f68fd466c23b7ffae51c771abaafa5ae4d083edfca
 size 497764120

flowmasteri.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

run.sh CHANGED Viewed

@@ -4,18 +4,18 @@ python3 run_clm_flax.py \
     --model_type="gpt2" \
     --config_name="./" \
     --tokenizer_name="./" \
-    --train_file="/home/anantshankhdhar/gpt2-rap-lyric-generator/train.txt" \
-    --validation_file="/home/anantshankhdhar/gpt2-rap-lyric-generator/val.txt" \
     --do_train \
     --do_eval \
-    --block_size="512" \
     --per_device_train_batch_size="64" \
     --per_device_eval_batch_size="32" \
-    --learning_rate="5e-5"  \
     --adam_beta1="0.9" --adam_beta2="0.98" --weight_decay="0.01" \
     --overwrite_output_dir \
     --num_train_epochs="100" \
-    --logging_steps="50" \
-    --save_steps="750" \
-    --eval_steps="50" \
     --push_to_hub

     --model_type="gpt2" \
     --config_name="./" \
     --tokenizer_name="./" \
+    --train_file="/home/anantshankhdhar/gpt2-rap-lyric-generator/Train.tsv" \
+    --validation_file="/home/anantshankhdhar/gpt2-rap-lyric-generator/Val.tsv" \
+    --block_size="512" \
     --do_train \
     --do_eval \
     --per_device_train_batch_size="64" \
     --per_device_eval_batch_size="32" \
+    --learning_rate="5e-4"  \
     --adam_beta1="0.9" --adam_beta2="0.98" --weight_decay="0.01" \
     --overwrite_output_dir \
     --num_train_epochs="100" \
+    --logging_steps="100" \
+    --save_steps="100" \
+    --eval_steps="100" \
     --push_to_hub

run_clm_flax.py CHANGED Viewed

@@ -161,10 +161,10 @@ class DataTrainingArguments:
         else:
             if self.train_file is not None:
                 extension = self.train_file.split(".")[-1]
-                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
             if self.validation_file is not None:
                 extension = self.validation_file.split(".")[-1]
-                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
 class TrainState(train_state.TrainState):
@@ -306,7 +306,9 @@ def main():
         extension = data_args.train_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
-        dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.

         else:
             if self.train_file is not None:
                 extension = self.train_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt","tsv"], "`train_file` should be a csv, a json or a txt file."
             if self.validation_file is not None:
                 extension = self.validation_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt","tsv"], "`validation_file` should be a csv, a json or a txt file."
 class TrainState(train_state.TrainState):
         extension = data_args.train_file.split(".")[-1]
         if extension == "txt":
             extension = "text"
+        if extension == "tsv":
+            extension = "csv"
+        dataset = load_dataset(extension, data_files=data_files, delimiter='\t',column_names = ["Songs"],cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
     # https://huggingface.co/docs/datasets/loading_datasets.html.

text_collection/text_collection.py ADDED Viewed

	@@ -0,0 +1,104 @@

+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Large-scale Indonesian Summarization Dataset"""
+import glob
+import json
+import os
+import re
+from pathlib import Path
+import datasets
+logger = datasets.logging.get_logger(__name__)
+_CITATION = """\
+"""
+_DESCRIPTION = """\
+This module load text dataset from local directory. The text dataset should have the format like Oscar dataset
+where each new entry is separated by empty lines.
+"""
+_HOMEPAGE = ""
+_LICENSE = ""
+class TextCollectionConfig(datasets.BuilderConfig):
+    """Configuration object for the TextCollection builder.
+
+    It adds no options of its own; everything is handled by the base class.
+    """
+    def __init__(self, **kwargs):
+        """Create the config.
+
+        Args:
+          **kwargs: keyword arguments passed through unchanged to
+            ``datasets.BuilderConfig``.
+        """
+        super().__init__(**kwargs)
+class TextCollection(datasets.GeneratorBasedBuilder):
+    """Builder that turns the ``.txt`` files of a local directory into text examples.
+
+    Each run of consecutive non-blank lines in a file becomes one example with
+    an integer ``id`` and a ``text`` field. Only a train split is produced.
+    """
+    VERSION = datasets.Version("1.0.0")
+    BUILDER_CONFIGS = [
+        TextCollectionConfig(
+            name="text_collection",
+            version=VERSION,
+            description="Id Collection dataset",
+        ),
+    ]
+    @property
+    def manual_download_instructions(self):
+        """Return the instructions shown to users for supplying the data directory."""
+        return """\
+            You need to manually collect text datasets in a directory.  The text dataset can then be loaded
+            using the following command:
+            `datasets.load_dataset("text_collection", data_dir="<path/to/dataset>")`.
+            """
+    def _info(self):
+        """Describe the dataset: an int64 ``id`` and a string ``text`` per example."""
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features({"id": datasets.Value("int64"), "text": datasets.Value("string")}),
+            supervised_keys=None,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+    def _split_generators(self, dl_manager):
+        """Resolve the manual data directory and declare the single train split.
+
+        Args:
+          dl_manager: download manager whose ``manual_dir`` points at the
+            directory holding the text files.
+
+        Returns:
+          A one-element list with the train ``SplitGenerator``.
+
+        Raises:
+          FileNotFoundError: if the resolved data directory does not exist.
+        """
+        data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
+        print("# Data directory", data_dir)
+        if not os.path.exists(data_dir):
+            # The message names this loader so it agrees with
+            # `manual_download_instructions` (it used to name another dataset).
+            raise FileNotFoundError(
+                "{} does not exist. Make sure you insert a manual dir via "
+                "`datasets.load_dataset('text_collection', data_dir=...)`. "
+                "Manual download instructions:\n{}".format(
+                    data_dir, self.manual_download_instructions
+                )
+            )
+        split_generators = [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    # Joining with "" only appends a trailing path separator.
+                    "article_dir": os.path.join(data_dir, ""),
+                    "split": "train",
+                },
+            )
+        ]
+        return split_generators
+    def _generate_examples(self, article_dir, split):
+        """Yield ``(key, example)`` pairs, one per blank-line-separated paragraph.
+
+        Every ``.txt`` file under ``article_dir`` (searched recursively, visited
+        in sorted path order) is read line by line. Ids count up from 0 and
+        keep increasing across files.
+
+        Args:
+          article_dir: directory to search for ``.txt`` files.
+          split: split name, used only in the log message.
+        """
+        logger.info("⏳ Generating %s examples from = %s", split, article_dir)
+        id_ = 0
+        current_lines = []
+        for path in sorted(glob.glob(os.path.join(article_dir, "**/*.txt"), recursive=True)):
+            # Decode explicitly as UTF-8 so the result does not depend on the
+            # machine's locale.
+            with open(path, "r", encoding="utf-8") as f:
+                print("# Reading", path)
+                for line in f:
+                    if len(line.strip()) > 0:
+                        current_lines.append(line)
+                    elif current_lines:
+                        # A blank line closes the paragraph collected so far.
+                        yield id_, {"id": id_, "text": "".join(current_lines).rstrip()}
+                        id_ += 1
+                        current_lines = []
+                # Flush the last paragraph when the file does not end with a
+                # blank line, so paragraphs never span two files.
+                if current_lines:
+                    yield id_, {"id": id_, "text": "".join(current_lines).rstrip()}
+                    id_ += 1
+                    current_lines = []

text_collection/text_collection.py.lock ADDED Viewed

File without changes

train/train2.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5874cf342a7153d6949d6ecdbe72cabfde2bede960a4409bd7f82e88d6d4ed0f
+size 19715139

train/val2.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

val/val2.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

wutang.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

ye.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

zenske.txt ADDED Viewed

The diff for this file is too large to render. See raw diff