Doron Adler committed on
Commit • 363236f
1 Parent(s): 3437d8c
Hebrew text generator: Science Fiction and Fantasy (GPT-Neo)

Files changed:
- README.md +4 -5
- app.py +129 -0
- model/.gitattributes +1 -0
- model/added_tokens.json +6 -0
- model/config.json +54 -0
- model/merges.txt +0 -0
- model/pytorch_model.bin +3 -0
- model/special_tokens_map.json +6 -0
- model/tokenizer.json +0 -0
- model/tokenizer_config.json +22 -0
- model/vocab.json +0 -0
- requirements.txt +4 -0
- start.sh +10 -0
README.md CHANGED
@@ -1,10 +1,9 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Hebrew GPT Neo - Science Fiction and Fantasy
+emoji: 🧝‍♀️
+colorFrom: yellow
+colorTo: blue
 sdk: streamlit
-sdk_version: 1.10.0
 app_file: app.py
 pinned: false
 license: mit
app.py ADDED
@@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+
+import argparse
+import re
+import os
+
+import streamlit as st
+import random
+import numpy as np
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import tokenizers
+
+#os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+random.seed(None)
+# Suggested Hebrew prompts (some literals were corrupted by a bad encoding pass;
+# only the fully decodable ones are restored here, the rest are left as rendered).
+suggested_text_list = ['ΧΧ©Χ ΧΧΧ€ΧΧ’ ΧΧΧ', 'Χ§ΧΧΧ Χ©ΧΧ€Χ ΧΧͺ', 'פעם אחת לפני שנים רבות', 'הארי פוטר ΧΧΧΧ ΧΧΧΧ Χ ΧΧΧ', 'ואז הפרתי את כל כללי הטקס ΧΧ©']
18 |
+
|
19 |
+
@st.cache(hash_funcs={tokenizers.Tokenizer: id, tokenizers.AddedToken: id})
|
20 |
+
def load_model(model_name):
|
21 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
22 |
+
model = AutoModelForCausalLM.from_pretrained(model_name)
|
23 |
+
return model, tokenizer
|
24 |
+
|
25 |
+
def extend(input_text, max_size=20, top_k=50, top_p=0.95):
|
26 |
+
if len(input_text) == 0:
|
27 |
+
input_text = ""
|
28 |
+
|
29 |
+
encoded_prompt = tokenizer.encode(
|
30 |
+
input_text, add_special_tokens=False, return_tensors="pt")
|
31 |
+
|
32 |
+
encoded_prompt = encoded_prompt.to(device)
|
33 |
+
|
34 |
+
if encoded_prompt.size()[-1] == 0:
|
35 |
+
input_ids = None
|
36 |
+
else:
|
37 |
+
input_ids = encoded_prompt
|
38 |
+
|
39 |
+
output_sequences = model.generate(
|
40 |
+
input_ids=input_ids,
|
41 |
+
max_length=max_size + len(encoded_prompt[0]),
|
42 |
+
top_k=top_k,
|
43 |
+
top_p=top_p,
|
44 |
+
do_sample=True,
|
45 |
+
repetition_penalty=5.0,
|
46 |
+
num_return_sequences=1)
|
47 |
+
|
48 |
+
# Remove the batch dimension when returning multiple sequences
|
49 |
+
if len(output_sequences.shape) > 2:
|
50 |
+
output_sequences.squeeze_()
|
51 |
+
|
52 |
+
generated_sequences = []
|
53 |
+
|
54 |
+
for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
|
55 |
+
generated_sequence = generated_sequence.tolist()
|
56 |
+
|
57 |
+
# Decode text
|
58 |
+
text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
|
59 |
+
|
60 |
+
# Remove all text after the stop token
|
61 |
+
text = text[: text.find(stop_token) if stop_token else None]
|
62 |
+
|
63 |
+
# Remove all text after 3 newlines
|
64 |
+
text = text[: text.find(new_lines) if new_lines else None]
|
65 |
+
|
66 |
+
# Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
|
67 |
+
total_sequence = (
|
68 |
+
input_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :]
|
69 |
+
)
|
70 |
+
|
71 |
+
generated_sequences.append(total_sequence)
|
72 |
+
|
73 |
+
parsed_text = total_sequence.replace("<|startoftext|>", "").replace("\r","").replace("\n\n", "\n")
|
74 |
+
if len(parsed_text) == 0:
|
75 |
+
parsed_text = "Χ©ΧΧΧΧ"
|
76 |
+
return parsed_text
|
77 |
+
|
78 |
+
|
79 |
+
|
80 |
+
if __name__ == "__main__":
|
81 |
+
st.title("Hebrew text generator: Science Fiction and Fantasy (GPT-Neo)")
|
82 |
+
model, tokenizer = load_model("./model")
|
83 |
+
|
84 |
+
stop_token = "<|endoftext|>"
|
85 |
+
new_lines = "\n\n\n"
|
86 |
+
|
87 |
+
np.random.seed(None)
|
88 |
+
random_seed = np.random.randint(10000,size=1)
|
89 |
+
|
90 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
91 |
+
n_gpu = 0 if torch.cuda.is_available()==False else torch.cuda.device_count()
|
92 |
+
|
93 |
+
torch.manual_seed(random_seed)
|
94 |
+
if n_gpu > 0:
|
95 |
+
torch.cuda.manual_seed_all(random_seed)
|
96 |
+
|
97 |
+
model.to(device)
|
98 |
+
|
99 |
+
text_area = st.text_area("Enter the first few words (or leave blank), tap on \"Generate Text\" below. Tapping again will produce a different result.", 'ΧΧΧΧ ΧΧΧΧ¨ΧΧ Χ’ΧΧ ΧΧΧΧΧͺ ΧΧ©Χ ΧΧΧ ΧΧΧΧ¨Χ ΧΧ©ΧΧ€ΧͺΧ’ Χ Χ©ΧΧ’Χ ΧΧ€ΧΧ§Χ')
|
100 |
+
|
101 |
+
st.sidebar.subheader("Configurable parameters")
|
102 |
+
|
103 |
+
max_len = st.sidebar.slider("Max-Length", 0, 512, 160,help="The maximum length of the sequence to be generated.")
|
104 |
+
top_k = st.sidebar.slider("Top-K", 0, 100, 40, help="The number of highest probability vocabulary tokens to keep for top-k-filtering.")
|
105 |
+
top_p = st.sidebar.slider("Top-P", 0.0, 1.0, 0.92, help="If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.")
|
106 |
+
|
107 |
+
if st.button("Generate Text"):
|
108 |
+
with st.spinner(text="Generating results..."):
|
109 |
+
st.subheader("Result")
|
110 |
+
print(f"device:{device}, n_gpu:{n_gpu}, random_seed:{random_seed}, maxlen:{max_len}, top_k:{top_k}, top_p:{top_p}")
|
111 |
+
if len(text_area.strip()) == 0:
|
112 |
+
text_area = random.choice(suggested_text_list)
|
113 |
+
result = extend(input_text=text_area,
|
114 |
+
max_size=int(max_len),
|
115 |
+
top_k=int(top_k),
|
116 |
+
top_p=float(top_p))
|
117 |
+
|
118 |
+
print("Done length: " + str(len(result)) + " bytes")
|
119 |
+
#<div class="rtl" dir="rtl" style="text-align:right;">
|
120 |
+
st.markdown(f"<p dir=\"rtl\" style=\"text-align:right;\"> {result} </p>", unsafe_allow_html=True)
|
121 |
+
st.write("\n\nResult length: " + str(len(result)) + " bytes\n Random seed: " + str(random_seed) + "\ntop_k: " + str(top_k) + "\ntop_p: " + str(top_p) + "\nmax_len: " + str(max_len) + "\ndevice: " + str(device) + "\nn_gpu: " + str(n_gpu))
|
122 |
+
print(f"\"{result}\"")
|
123 |
+
|
124 |
+
st.markdown(
|
125 |
+
"""Hebrew text generation model based on EleutherAI's gpt-neo architecture. Originally trained on a TPUv3-8 which was made avilable to me via the [TPU Research Cloud Program](https://sites.research.google/trc/). The model was then slightly fine-tuned upon science fiction and fantasy text."""
|
126 |
+
)
|
127 |
+
|
128 |
+
st.markdown("<footer><hr><p style=\"font-size:14px\">The site is fan made and is not affiliated with any author in any way.</p><p style=\"font-size:12px\">By <a href=\"https://linktr.ee/Norod78\">Doron Adler</a></p></footer> ", unsafe_allow_html=True)
|
129 |
+
|
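
For reference, the generation flow in extend() can be exercised outside Streamlit. The sketch below is not part of the commit; it assumes the published checkpoint "Norod78/hebrew-gpt_neo-small" (named as _name_or_path in model/config.json below) as a stand-in for the Space's local, fine-tuned ./model folder, and reuses the app's default sampling settings.

# Standalone sketch of the app's generation flow (assumptions noted above).
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "Norod78/hebrew-gpt_neo-small"  # assumed stand-in for ./model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

prompt = "פעם אחת לפני שנים רבות"  # one of the app's suggested prompts
input_ids = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")

# Same sampling setup as extend(): top-k/top-p sampling with a repetition penalty.
with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        max_length=160 + input_ids.shape[-1],
        do_sample=True,
        top_k=40,
        top_p=0.92,
        repetition_penalty=5.0,
        num_return_sequences=1,
    )

text = tokenizer.decode(output[0], clean_up_tokenization_spaces=True)
print(text.split("<|endoftext|>")[0])  # trim at the stop token, as extend() does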
model/.gitattributes ADDED
@@ -0,0 +1 @@
+pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
model/added_tokens.json ADDED
@@ -0,0 +1,6 @@
+{
+  "<|endoftext|>": 50257,
+  "<|pad|>": 50260,
+  "<|startoftext|>": 50258,
+  "<|unknown|>": 50259
+}
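
These four ids sit directly above the base vocabulary (ids 0-50256, i.e. 50,257 tokens), which is consistent with "vocab_size": 50261 in model/config.json below. A quick check, a sketch assuming the model/ folder from this commit is available locally:

# Sketch: the added tokens should resolve to the ids listed above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./model")
for token in ["<|endoftext|>", "<|startoftext|>", "<|unknown|>", "<|pad|>"]:
    print(token, tokenizer.convert_tokens_to_ids(token))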
model/config.json ADDED
@@ -0,0 +1,54 @@
+{
+  "_name_or_path": "Norod78/hebrew-gpt_neo-small",
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPTNeoForCausalLM"
+  ],
+  "attention_dropout": 0,
+  "attention_layers": [
+    "global",
+    "global",
+    "global",
+    "global",
+    "global",
+    "global",
+    "global",
+    "global",
+    "global",
+    "global",
+    "global",
+    "global"
+  ],
+  "attention_types": [
+    [
+      [
+        "global"
+      ],
+      12
+    ]
+  ],
+  "bos_token_id": 50256,
+  "embed_dropout": 0,
+  "eos_token_id": 50256,
+  "gradient_checkpointing": false,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": null,
+  "layer_norm_epsilon": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "gpt_neo",
+  "num_heads": 12,
+  "num_layers": 12,
+  "pad_token_id": 50256,
+  "resid_dropout": 0,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.21.0",
+  "use_cache": true,
+  "vocab_size": 50261,
+  "window_size": 256
+}
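
The config describes a 12-layer, 12-head GPT-Neo with hidden size 768, global attention in every layer, and a 2048-token context: essentially a GPT-2-small-sized model. A small sketch to read back what the file declares (assuming the local model/ folder):

# Sketch: load the config and print the key architecture fields.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("./model")
print(cfg.model_type, cfg.num_layers, cfg.num_heads, cfg.hidden_size)  # gpt_neo 12 12 768
print(cfg.max_position_embeddings, cfg.vocab_size)  # 2048 50261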
model/merges.txt ADDED
The diff for this file is too large to render.
model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25e83f166977308069becae45ac59d48a2c08c0de8b3135a9acb63455fc0aec9
+size 551197393
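
This is a Git LFS pointer file, not the weights themselves; the actual ~551 MB float32 checkpoint is fetched with git lfs pull. A sketch for verifying a downloaded copy against the pointer's sha256:

# Sketch: hash the fetched blob and compare with the oid above.
import hashlib

h = hashlib.sha256()
with open("model/pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
print(h.hexdigest() == "25e83f166977308069becae45ac59d48a2c08c0de8b3135a9acb63455fc0aec9")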
model/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+{
+  "bos_token": "<|startoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|pad|>",
+  "unk_token": "<unk>"
+}
model/tokenizer.json ADDED
The diff for this file is too large to render.
model/tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "bos_token": "<|startoftext|>",
+  "do_lower_case": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "full_tokenizer_file": null,
+  "max_len": 1024,
+  "name_or_path": "Norod78/hebrew-gpt_neo-small",
+  "pad_token": "<|pad|>",
+  "special_tokens_map_file": "special_tokens_map.json",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
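
One quirk worth noting: here unk_token's content is "<|endoftext|>", while special_tokens_map.json above says "<unk>" and added_tokens.json defines "<|unknown|>". Whichever value wins at load time, the resolved tokens can be inspected directly (sketch, assuming the local model/ folder):

# Sketch: inspect the special tokens the loaded tokenizer actually uses.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./model")
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token, tokenizer.unk_token)
print(len(tokenizer))  # base vocabulary plus the four added tokens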
model/vocab.json ADDED
The diff for this file is too large to render.
requirements.txt ADDED
@@ -0,0 +1,4 @@
+streamlit
+transformers
+tokenizers
+torch
start.sh ADDED
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+set -e
+
+# With DEBUG=true, run under nodemon so Streamlit restarts on file changes;
+# nodemon is not listed in requirements.txt and must already be on the host.
+if [ "$DEBUG" = true ] ; then
+    echo 'Debugging - ON'
+    nodemon --exec streamlit run app.py
+else
+    echo 'Debugging - OFF'
+    streamlit run app.py
+fi