Spaces: trminhnam20082002 committed · Commit 8b57e03 · 1 Parent(s): 9e4fcf2
feat: add repo
Files changed:
- .gitattributes               +1   -0
- .gitignore                   +3   -0
- README.md                    +4   -4
- app.py                       +115 -0
- config/config.json           +26  -0
- demo.ipynb                   +702 -0
- download_model.py            +11  -0
- model.py                     +263 -0
- models/pytorch_model.bin     +3   -0
- models/pytorch_model_cpu.bin +3   -0
- requirements.txt             +7   -0
- st_utils.py                  +232 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+models/*.bin filter=lfs diff=lfs merge=lfs -text
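The new models/*.bin pattern routes the checkpoint binaries added below through Git LFS, which is what produces the LFS pointer files for models/pytorch_model.bin and models/pytorch_model_cpu.bin at the end of this diff.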
.gitignore ADDED
@@ -0,0 +1,3 @@
cache/*
# models/*
__pycache__/*
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
-title: Code Summarization
-emoji:
-colorFrom:
-colorTo:
+title: Codebert Code Summarization
+emoji: π
+colorFrom: yellow
+colorTo: gray
 sdk: streamlit
 sdk_version: 1.19.0
 app_file: app.py
app.py ADDED
@@ -0,0 +1,115 @@
import streamlit as st
from st_utils import (
    load_tokenizer_and_model,
    generate_docstring,
    download_model,
    # list_files,
)
from huggingface_hub import hf_hub_download
import os

# list_files(os.getcwd())

# Set the title and description of the app
st.title("Text Summarization App")
st.write(
    """
    This app uses the Hugging Face transformers library to generate summaries of input text.
    Simply select one of the sample Python functions from the dropdown menu below, and click the 'Summarize' button to generate a summary.
    """
)

# Download the model from the Hugging Face Hub if it doesn't exist
download_model()

# load the tokenizer and model
tokenizer, model, device = load_tokenizer_and_model("./models/pytorch_model.bin")

# Create a dropdown menu for the user to select a sample Python function
values = [
    "",
    "def multiply(a, b):\n    return a * b",
    "def get_data():\n    data = []\n    for i in range(10):\n        data.append(i)\n    return data",
    "def search(data, target):\n    for i in range(len(data)):\n        if data[i] == target:\n            return i\n    return -1",
]

st.subheader("Select a sample Python function:")
selected_value = st.selectbox("", values)

# Create a text input area for the user to enter their text
text_input = st.text_area(
    "Or enter your Python function here:",
    height=300,
    value=values[0],
)


# Define a function to generate a summary
def generate_summary(text):
    summary = generate_docstring(model, tokenizer, device, text, max_length=30)
    return summary


# When the user clicks the 'Summarize' button, generate a summary
if st.button("Summarize") and (len(selected_value) > 0 or len(text_input) > 0):
    with st.spinner("Generating summary..."):
        if len(selected_value) > 0:
            summaries = generate_summary(selected_value)
            st.subheader("Docstrings:")
            for i, summary in enumerate(summaries):
                st.write(f"{i + 1}. " + summary)
        else:
            summaries = generate_summary(text_input)
            st.subheader("Docstrings:")
            for i, summary in enumerate(summaries):
                st.write(f"{i + 1}. " + summary)


# import streamlit as st
# from st_utils import load_tokenizer_and_model, generate_docstring, download_model

# # Download the model from the Hugging Face Hub if it doesn't exist


# # Set the title and description of the app
# st.title("Text Summarization App")
# st.write(
#     """
#     This app uses the Hugging Face transformers library to generate summaries of input text.
#     Simply enter your text in the input area below, and click the 'Summarize' button to generate a summary.
#     """
# )

# tokenizer, model, device = load_tokenizer_and_model("./models/pytorch_model.bin")

# # Create a text input area for the user to enter their text
# values = [
#     "def multiply(a, b):\n    return a * b",
#     "def get_data():\n    data = []\n    for i in range(10):\n        data.append(i)\n    return data",
#     "def search(data, target):\n    for i in range(len(data)):\n        if data[i] == target:\n            return i\n    return -1",
# ]

# st.subheader("Enter your Python function here:")
# text_input = st.text_area(
#     "Input text here...",
#     height=300,
#     value=values[2],
# )


# # Define a function to generate a summary
# def generate_summary(text):
#     summary = generate_docstring(model, tokenizer, device, text, max_length=30)
#     return summary


# # When the user clicks the 'Summarize' button, generate a summary
# if st.button("Summarize") and len(text_input) > 0:
#     with st.spinner("Generating summary..."):
#         # summary = generate_summary(text_input)
#         # st.write("Summary:")
#         # st.code(summary, language="text")
#         summaries = generate_summary(text_input)
#         st.subheader("Summary:")
#         for i, summary in enumerate(summaries):
#             st.write(f"{i + 1}. " + summary)
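For reference, a minimal sketch of the same pipeline outside Streamlit, assuming the checkpoint has been downloaded and that generate_docstring returns one string per beam candidate, as st_utils.py below suggests:

from st_utils import load_tokenizer_and_model, generate_docstring, download_model

download_model()
tokenizer, model, device = load_tokenizer_and_model("./models/pytorch_model.bin")

# generate_docstring runs beam search (beam_size=10 in CONFIG), so it
# returns a ranked list of candidate docstrings, best hypothesis first.
candidates = generate_docstring(
    model, tokenizer, device,
    "def multiply(a, b):\n    return a * b",
    max_length=30,
)
for rank, doc in enumerate(candidates, start=1):
    print(f"{rank}. {doc}")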
config/config.json ADDED
@@ -0,0 +1,26 @@
{
  "architectures": [
    "RobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}
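This is the RoBERTa-base configuration that microsoft/codebert-base uses; st_utils.py below writes it here via model_config.save_pretrained("config"). A small sketch of reloading it offline instead of from the Hub:

from transformers import RobertaConfig

# Load the committed config file directly, without a Hub round-trip.
config = RobertaConfig.from_json_file("config/config.json")
assert config.hidden_size == 768 and config.num_attention_heads == 12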
demo.ipynb ADDED
@@ -0,0 +1,702 @@

In [ ]:
# !pip install transformers

In [8]:
from __future__ import absolute_import
import torch
import logging
import torch.nn as nn
from model import Seq2Seq
from transformers import (
    RobertaConfig,
    RobertaModel,
    RobertaTokenizer
)

import regex as re

# disable warnings
import warnings
warnings.filterwarnings("ignore")

# base model is RoBERTa
MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}

# initialize logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
class CONFIG:
    max_source_length = 256
    max_target_length = 128
    beam_size = 10
    local_rank = -1
    no_cuda = False

    do_train = True
    do_eval = True
    do_test = True
    train_batch_size = 12
    eval_batch_size = 32

    model_type = "roberta"
    model_name_or_path = "microsoft/codebert-base"
    output_dir = "/content/drive/MyDrive/CodeSummarization"
    load_model_path = None
    train_filename = "dataset/python/train.jsonl"
    dev_filename = "dataset/python/valid.jsonl"
    test_filename = "dataset/python/test.jsonl"
    config_name = ""
    tokenizer_name = ""
    cache_dir = "cache"

    save_every = 5000

    gradient_accumulation_steps = 1
    learning_rate = 5e-5
    weight_decay = 1e-4
    adam_epsilon = 1e-8
    max_grad_norm = 1.0
    num_train_epochs = 3.0
    max_steps = -1
    warmup_steps = 0
    train_steps = 100000
    eval_steps = 10000
    n_gpu = torch.cuda.device_count()

## Load tokenizer

In [4]:
import logging
from transformers import RobertaTokenizer
logger = logging.getLogger(__name__)
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', cache_dir=CONFIG.cache_dir)

print(f'{tokenizer.cls_token} index: {tokenizer.cls_token_id}')
print(f'{tokenizer.sep_token} index: {tokenizer.sep_token_id}')
print(f'{tokenizer.pad_token} index: {tokenizer.pad_token_id}')
print(f'{tokenizer.mask_token} index: {tokenizer.mask_token_id}')

Out (after five download progress bars for the tokenizer and config files: 899k, 456k, 150, 25.0, 498 bytes):
<s> index: 0
</s> index: 2
<pad> index: 1
<mask> index: 50264

In [ ]:
input_str = "def sina_xml_to_url_list(xml_data):\n    \"\"\"str->list\n    Convert XML to URL List.\n    From Biligrab.\n    \"\"\"\n    rawurl = []\n    dom = parseString(xml_data)\n    for node in dom.getElementsByTagName('durl'):\n        url = node.getElementsByTagName('url')[0]\n        rawurl.append(url.childNodes[0].data)\n    return rawurl"
input_tokens = tokenizer.tokenize(input_str)
print(input_tokens)

In [46]:
def preprocessing(code_segment):

    # remove newlines
    code_segment = re.sub(r'\n', ' ', code_segment)

    # remove docstring
    code_segment = re.sub(r'""".*?"""', '', code_segment, flags=re.DOTALL)

    # remove multiple spaces
    code_segment = re.sub(r'\s+', ' ', code_segment)

    # remove comments
    code_segment = re.sub(r'#.*', '', code_segment)

    # remove html tags
    code_segment = re.sub(r'<.*?>', '', code_segment)

    # remove urls
    code_segment = re.sub(r'http\S+', '', code_segment)

    # split special chars into different tokens
    code_segment = re.sub(r'([^\w\s])', r' \1 ', code_segment)

    return code_segment.split()

preprocessing(input_str)

Out[46]:
['def', 'sina_xml_to_url_list', '(', 'xml_data', ')', ':', 'rawurl', '=', '[', ']',
 'dom', '=', 'parseString', '(', 'xml_data', ')', 'for', 'node', 'in', 'dom', '.',
 'getElementsByTagName', '(', "'", 'durl', "'", ')', ':', 'url', '=', 'node', '.',
 'getElementsByTagName', '(', "'", 'url', "'", ')', '[', '0', ']', 'rawurl', '.',
 'append', '(', 'url', '.', 'childNodes', '[', '0', ']', '.', 'data', ')',
 'return', 'rawurl']

In [48]:
input_str = "def get_data():\n    data = []\n    for i in range(10):\n        data.append(i)\n    return data"
input_tokens = preprocessing(input_str)
print(f'Tokens = {input_tokens}')
# tokenizer.encode_plus(input_tokens, max_length=CONFIG.max_source_length, pad_to_max_length=True, truncation=True, return_tensors="pt")

Out:
Tokens = ['def', 'get_data', '(', ')', ':', 'data', '=', '[', ']', 'for', 'i', 'in', 'range', '(', '10', ')', ':', 'data', '.', 'append', '(', 'i', ')', 'return', 'data']

In [27]:
input_str = "def sina_xml_to_url_list(xml_data):\n    \"\"\"str->list\n    Convert XML to URL List.\n    From Biligrab.\n    \"\"\"\n    rawurl = []\n    dom = parseString(xml_data)\n    for node in dom.getElementsByTagName('durl'):\n        url = node.getElementsByTagName('url')[0]\n        rawurl.append(url.childNodes[0].data)\n    return rawurl"
input_tokens = preprocessing(input_str)
print(f'Tokens = {input_tokens}')
# tokenizer.encode_plus(input_tokens, max_length=CONFIG.max_source_length, pad_to_max_length=True, truncation=True, return_tensors="pt")

Out:
Tokens = ['def', 'sina_xml_to_url_list', '(', 'xml_data', ')', ':', 'rawurl', '=', '[', ']', 'dom', '=', 'parseString', '(', 'xml_data', ')', 'for', 'node', 'in', 'dom', '.', 'getElementsByTagName', '(', "'", 'durl', "'", ')', ':', 'url', '=', 'node', '.', 'getElementsByTagName', '(', "'", 'url', "'", ')', '[', '0', ']', 'rawurl', '.', 'append', '(', 'url', '.', 'childNodes', '[', '0', ']', '.', 'data', ')', 'return', 'rawurl']
{'input_ids': tensor([[    0,  9232,     3,  1640,     3,    43,    35,     3,  5214, 10975,
           742, 12623,  5214,     3,  1640,     3,    43,  1990, 46840,   179,
         12623,     4,     3,  1640,   108,     3,   108,    43,    35,  6423,
          5214, 46840,     4,     3,  1640,   108,  6423,   108,    43, 10975,
           288,   742,     3,     4, 48696,  1640,  6423,     4,     3, 10975,
           288,   742,     4, 23687,    43, 30921,     3,     2,     1,     1,
          ...]]),   # <pad> (id 1) repeated out to max_source_length=256
 'attention_mask': tensor([[1, 1, ..., 1, 0, 0, ..., 0]])}   # 58 ones, then zeros

In [43]:
encoded_input = tokenizer.encode_plus(
    input_tokens,
    max_length=CONFIG.max_source_length,
    pad_to_max_length=True,
    truncation=True,
    return_tensors="pt"
)
print(encoded_input)

Out:
{'input_ids': tensor([[    0,  9232,     3,  1640,    43,    35, 23687,  5214, 10975,   742,
          1990,   118,   179,  9435,  1640,   698,    43,    35, 23687,     4,
         48696,  1640,   118,    43, 30921, 23687,     2,     1,     1,     1,
          ...]]),   # <pad> (id 1) repeated out to max_source_length=256
 'attention_mask': tensor([[1, 1, ..., 1, 0, 0, ..., 0]])}   # 27 ones, then zeros

## Load model

In [51]:
# Config model
config_class, model_class, tokenizer_class = (RobertaConfig, RobertaModel, RobertaTokenizer)
model_config = config_class.from_pretrained(CONFIG.config_name if CONFIG.config_name else CONFIG.model_name_or_path, cache_dir=CONFIG.cache_dir)
model_config.save_pretrained('config')

# load tokenizer
tokenizer = tokenizer_class.from_pretrained(
    CONFIG.tokenizer_name if CONFIG.tokenizer_name else CONFIG.model_name_or_path,
    cache_dir=CONFIG.cache_dir,
    # do_lower_case=args.do_lower_case
)

# load encoder from pretrained RoBERTa
encoder = model_class.from_pretrained(CONFIG.model_name_or_path, config=model_config, cache_dir=CONFIG.cache_dir)

# build decoder
decoder_layer = nn.TransformerDecoderLayer(d_model=model_config.hidden_size, nhead=model_config.num_attention_heads)
decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)

# build seq2seq model from pretrained encoder and from-scratch decoder
model = Seq2Seq(
    encoder=encoder,
    decoder=decoder,
    config=model_config,
    beam_size=CONFIG.beam_size,
    max_length=CONFIG.max_target_length,
    sos_id=tokenizer.cls_token_id,
    eos_id=tokenizer.sep_token_id
)

In [52]:
state_dict = torch.load("./models/pytorch_model.bin")
model.load_state_dict(state_dict)

Out[52]:
<All keys matched successfully>

## Prediction

In [53]:
# move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() and not CONFIG.no_cuda else "cpu")
model = model.to(device)

In [54]:
input_str = "def get_data():\n    data = []\n    for i in range(10):\n        data.append(i)\n    return data"
input_tokens = preprocessing(input_str)
encoded_input = tokenizer.encode_plus(
    input_tokens,
    max_length=CONFIG.max_source_length,
    pad_to_max_length=True,
    truncation=True,
    return_tensors="pt"
)
print(encoded_input)

input_ids = encoded_input["input_ids"].to(device)
input_mask = encoded_input["attention_mask"].to(device)

Out: (the same padded encoding of get_data as printed by In [43] above)

In [59]:
output = model(input_ids, input_mask)
print(f'Summary.shape = {output.shape}')
print(f'Summary = {output}')

Out:
Summary.shape = torch.Size([1, 10, 128])
Summary = tensor([[[42555,    10,   889,  ...,     0,     0,     0],
         [42555,    10,   889,  ...,     0,     0,     0],
         [42555,    10,   889,  ...,     0,     0,     0],
         ...,
         [42555,    10,   889,  ...,     0,     0,     0],
         [42555,    10,   889,  ...,     0,     0,     0],
         [42555,    10,   889,  ...,     0,     0,     0]]], device='cuda:0')

In [61]:
# decode summary with tokenizer
summary = output[0]
for i in range(10):
    print(f'{summary[i].shape}')
    pred = tokenizer.decode(summary[i], skip_special_tokens=True)
    print(pred)

Out:
torch.Size([128])
Return a list of data.
torch.Size([128])
Return a list of int values.
torch.Size([128])
Return a list of ints.
torch.Size([128])
Return a list of ints
torch.Size([128])
Return a list of the number of integers.
torch.Size([128])
Return a list of the number of data.
torch.Size([128])
Return a list of the number of digits.
torch.Size([128])
Return a list of the number of numbers.
torch.Size([128])
Return a list of data in a list.
torch.Size([128])
Return a list of data in a list of data

In [ ]:

(Notebook metadata: kernelspec "aio" / python3; language_info Python 3.8.16; orig_nbformat 4; nbformat 4, nbformat_minor 2.)
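One small note on the tokenizer calls above: pad_to_max_length=True is deprecated in the transformers 4.x line pinned by this repo. A sketch of the equivalent call with the current argument, assuming unchanged behavior:

encoded_input = tokenizer(
    input_tokens,
    is_split_into_words=True,      # input is an already-split token list
    max_length=CONFIG.max_source_length,
    padding="max_length",          # replaces the deprecated pad_to_max_length=True
    truncation=True,
    return_tensors="pt",
)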
download_model.py ADDED
@@ -0,0 +1,11 @@
import os
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="tmnam20/codebert-code-summarization",
    filename="pytorch_model.bin",
    cache_dir="cache",
    local_dir="models",
)

print(path)
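A guarded variant (a sketch; same repo and filename assumed) skips the fetch when the checkpoint already exists, which is what st_utils.download_model below does:

import os
from huggingface_hub import hf_hub_download

if not os.path.exists("models/pytorch_model.bin"):
    # ~700 MB LFS file; only fetch it once.
    hf_hub_download(
        repo_id="tmnam20/codebert-code-summarization",
        filename="pytorch_model.bin",
        cache_dir="cache",
        local_dir="models",
    )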
model.py ADDED
@@ -0,0 +1,263 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch
import torch.nn as nn
from torch.autograd import Variable
import copy


class Seq2Seq(nn.Module):
    """
    Build Sequence-to-Sequence.

    Parameters:

    * `encoder`- encoder of seq2seq model. e.g. roberta
    * `decoder`- decoder of seq2seq model. e.g. transformer
    * `config`- configuration of encoder model.
    * `beam_size`- beam size for beam search.
    * `max_length`- max length of target for beam search.
    * `sos_id`- start of symbol ids in target for beam search.
    * `eos_id`- end of symbol ids in target for beam search.
    """

    def __init__(
        self,
        encoder,
        decoder,
        config,
        beam_size=None,
        max_length=None,
        sos_id=None,
        eos_id=None,
    ):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.config = config
        self.register_buffer("bias", torch.tril(torch.ones(2048, 2048)))
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.lsm = nn.LogSoftmax(dim=-1)
        self.tie_weights()

        self.beam_size = beam_size
        self.max_length = max_length
        self.sos_id = sos_id
        self.eos_id = eos_id

    def _tie_or_clone_weights(self, first_module, second_module):
        """Tie or clone module weights depending on whether we are using TorchScript or not"""
        if self.config.torchscript:
            first_module.weight = nn.Parameter(second_module.weight.clone())
        else:
            first_module.weight = second_module.weight

    def tie_weights(self):
        """Make sure we are sharing the input and output embeddings.
        Export to TorchScript can't handle parameter sharing so we are cloning them instead.
        """
        self._tie_or_clone_weights(
            self.lm_head, self.encoder.embeddings.word_embeddings
        )

    def forward(
        self,
        source_ids=None,
        source_mask=None,
        target_ids=None,
        target_mask=None,
        args=None,
    ):
        outputs = self.encoder(source_ids, attention_mask=source_mask)
        encoder_output = outputs[0].permute([1, 0, 2]).contiguous()
        if target_ids is not None:
            attn_mask = -1e4 * (
                1 - self.bias[: target_ids.shape[1], : target_ids.shape[1]]
            )
            tgt_embeddings = (
                self.encoder.embeddings(target_ids).permute([1, 0, 2]).contiguous()
            )
            out = self.decoder(
                tgt_embeddings,
                encoder_output,
                tgt_mask=attn_mask,
                memory_key_padding_mask=(1 - source_mask).bool(),
            )
            hidden_states = torch.tanh(self.dense(out)).permute([1, 0, 2]).contiguous()
            lm_logits = self.lm_head(hidden_states)
            # Shift so that tokens < n predict n
            active_loss = target_mask[..., 1:].ne(0).view(-1) == 1
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = target_ids[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1))[active_loss],
                shift_labels.view(-1)[active_loss],
            )

            outputs = loss, loss * active_loss.sum(), active_loss.sum()
            return outputs
        else:
            # Predict
            preds = []
            zero = torch.cuda.LongTensor(1).fill_(0)
            for i in range(source_ids.shape[0]):
                context = encoder_output[:, i : i + 1]
                context_mask = source_mask[i : i + 1, :]
                beam = Beam(self.beam_size, self.sos_id, self.eos_id)
                input_ids = beam.getCurrentState()
                context = context.repeat(1, self.beam_size, 1)
                context_mask = context_mask.repeat(self.beam_size, 1)
                for _ in range(self.max_length):
                    if beam.done():
                        break
                    attn_mask = -1e4 * (
                        1 - self.bias[: input_ids.shape[1], : input_ids.shape[1]]
                    )
                    tgt_embeddings = (
                        self.encoder.embeddings(input_ids)
                        .permute([1, 0, 2])
                        .contiguous()
                    )
                    out = self.decoder(
                        tgt_embeddings,
                        context,
                        tgt_mask=attn_mask,
                        memory_key_padding_mask=(1 - context_mask).bool(),
                    )
                    out = torch.tanh(self.dense(out))
                    hidden_states = out.permute([1, 0, 2]).contiguous()[:, -1, :]
                    out = self.lsm(self.lm_head(hidden_states)).data
                    beam.advance(out)
                    input_ids.data.copy_(
                        input_ids.data.index_select(0, beam.getCurrentOrigin())
                    )
                    input_ids = torch.cat((input_ids, beam.getCurrentState()), -1)
                hyp = beam.getHyp(beam.getFinal())
                pred = beam.buildTargetTokens(hyp)[: self.beam_size]
                pred = [
                    torch.cat(
                        [x.view(-1) for x in p] + [zero] * (self.max_length - len(p))
                    ).view(1, -1)
                    for p in pred
                ]
                preds.append(torch.cat(pred, 0).unsqueeze(0))

            preds = torch.cat(preds, 0)
            return preds


class Beam(object):
    def __init__(self, size, sos, eos):
        self.size = size
        self.tt = torch.cuda
        # The score for each translation on the beam.
        self.scores = self.tt.FloatTensor(size).zero_()
        # The backpointers at each time-step.
        self.prevKs = []
        # The outputs at each time-step.
        self.nextYs = [self.tt.LongTensor(size).fill_(0)]
        self.nextYs[0][0] = sos
        # Has EOS topped the beam yet.
        self._eos = eos
        self.eosTop = False
        # Time and k pair for finished.
        self.finished = []

    def getCurrentState(self):
        "Get the outputs for the current timestep."
        batch = self.tt.LongTensor(self.nextYs[-1]).view(-1, 1)
        return batch

    def getCurrentOrigin(self):
        "Get the backpointers for the current timestep."
        return self.prevKs[-1]

    def advance(self, wordLk):
        """
        Given prob over words for every last beam `wordLk` and attention
        `attnOut`: Compute and update the beam search.

        Parameters:

        * `wordLk`- probs of advancing from the last step (K x words)
        * `attnOut`- attention at the last step

        Returns: True if beam search is complete.
        """
        numWords = wordLk.size(1)

        # Sum the previous scores.
        if len(self.prevKs) > 0:
            beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)

            # Don't let EOS have children.
            for i in range(self.nextYs[-1].size(0)):
                if self.nextYs[-1][i] == self._eos:
                    beamLk[i] = -1e20
        else:
            beamLk = wordLk[0]
        flatBeamLk = beamLk.view(-1)
        bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)

        self.scores = bestScores

        # bestScoresId is flattened beam x word array, so calculate which
        # word and beam each score came from
        prevK = bestScoresId // numWords
        self.prevKs.append(prevK)
        self.nextYs.append((bestScoresId - prevK * numWords))

        for i in range(self.nextYs[-1].size(0)):
            if self.nextYs[-1][i] == self._eos:
                s = self.scores[i]
                self.finished.append((s, len(self.nextYs) - 1, i))

        # End condition is when top-of-beam is EOS and no global score.
        if self.nextYs[-1][0] == self._eos:
            self.eosTop = True

    def done(self):
        return self.eosTop and len(self.finished) >= self.size

    def getFinal(self):
        if len(self.finished) == 0:
            self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
        self.finished.sort(key=lambda a: -a[0])
        if len(self.finished) != self.size:
            unfinished = []
            for i in range(self.nextYs[-1].size(0)):
                if self.nextYs[-1][i] != self._eos:
                    s = self.scores[i]
                    unfinished.append((s, len(self.nextYs) - 1, i))
            unfinished.sort(key=lambda a: -a[0])
            self.finished += unfinished[: self.size - len(self.finished)]
        return self.finished[: self.size]

    def getHyp(self, beam_res):
        """
        Walk back to construct the full hypothesis.
        """
        hyps = []
        for _, timestep, k in beam_res:
            hyp = []
            for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
                hyp.append(self.nextYs[j + 1][k])
                k = self.prevKs[j][k]
            hyps.append(hyp[::-1])
        return hyps

    def buildTargetTokens(self, preds):
        sentence = []
        for pred in preds:
            tokens = []
            for tok in pred:
                if tok == self._eos:
                    break
                tokens.append(tok)
            sentence.append(tokens)
        return sentence
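Note that the prediction path hard-codes CUDA tensors (torch.cuda.LongTensor in Seq2Seq.forward, self.tt = torch.cuda in Beam), so beam search as committed only runs on a GPU. A minimal device-agnostic sketch of those allocations, as an illustration rather than the committed code:

import torch

def beam_state(size: int, sos_id: int, device: torch.device):
    """Device-agnostic versions of the tensors Beam.__init__ allocates via torch.cuda."""
    scores = torch.zeros(size, device=device)                     # was torch.cuda.FloatTensor(size).zero_()
    next_ys = torch.zeros(size, dtype=torch.long, device=device)  # was torch.cuda.LongTensor(size).fill_(0)
    next_ys[0] = sos_id
    return scores, [next_ys]

# Inside Seq2Seq.forward, the padding token could likewise follow the inputs:
# zero = torch.zeros(1, dtype=torch.long, device=source_ids.device)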
models/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d0ed191f9e4881d50dac7787d5508aee66719f84ec52d7690e4398636bdb000e
size 706920105

models/pytorch_model_cpu.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:81b3d88069dc37314eee6668a48af6a004df66b84cf8cb339d100453d525720b
size 706917005
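Both entries are Git LFS pointer files, caught by the new models/*.bin rule in .gitattributes; the ~706 MB checkpoints themselves live in LFS storage. The _cpu variant matches the checkpoint that load_tokenizer_and_model in st_utils.py below re-saves after moving the model to CPU.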
requirements.txt ADDED
@@ -0,0 +1,7 @@
huggingface_hub==0.14.1
numpy==1.23.1
regex==2022.10.31
streamlit==1.15.1
torch[cu116]==1.13.1
tqdm==4.64.1
transformers==4.25.1
st_utils.py ADDED
@@ -0,0 +1,232 @@
from __future__ import absolute_import
import streamlit as st
import torch
import os
import sys
import pickle
import torch
import json
import random
import logging
import argparse
import numpy as np
from io import open
from itertools import cycle
import torch.nn as nn
from model import Seq2Seq
from tqdm import tqdm, trange
import regex as re
from torch.utils.data import (
    DataLoader,
    Dataset,
    SequentialSampler,
    RandomSampler,
    TensorDataset,
)
from torch.utils.data.distributed import DistributedSampler
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    get_linear_schedule_with_warmup,
    RobertaConfig,
    RobertaModel,
    RobertaTokenizer,
)
from huggingface_hub import hf_hub_download
import io

# def list_files(startpath, prev_level=0):
#     # list files recursively
#     for root, dirs, files in os.walk(startpath):
#         level = root.replace(startpath, "").count(os.sep) + prev_level
#         indent = " " * 4 * (level)

#         print("{}{}/".format(indent, os.path.basename(root)))
#         # st.write("{}{}/".format(indent, os.path.basename(root)))

#         subindent = " " * 4 * (level + 1)
#         for f in files:
#             print("{}{}".format(subindent, f))
#             # st.write("{}{}".format(subindent, f))

#         for d in dirs:
#             list_files(d, level + 1)


class CONFIG:
    max_source_length = 256
    max_target_length = 128
    beam_size = 10
    local_rank = -1
    no_cuda = False

    do_train = True
    do_eval = True
    do_test = True
    train_batch_size = 12
    eval_batch_size = 32

    model_type = "roberta"
    model_name_or_path = "microsoft/codebert-base"
    output_dir = "/content/drive/MyDrive/CodeSummarization"
    load_model_path = None
    train_filename = "dataset/python/train.jsonl"
    dev_filename = "dataset/python/valid.jsonl"
    test_filename = "dataset/python/test.jsonl"
    config_name = ""
    tokenizer_name = ""
    cache_dir = "cache"

    save_every = 5000

    gradient_accumulation_steps = 1
    learning_rate = 5e-5
    weight_decay = 1e-4
    adam_epsilon = 1e-8
    max_grad_norm = 1.0
    num_train_epochs = 3.0
    max_steps = -1
    warmup_steps = 0
    train_steps = 100000
    eval_steps = 10000
    n_gpu = torch.cuda.device_count()


# download model with streamlit cache decorator
@st.cache(persist=False, show_spinner=True)
def download_model():
    if not os.path.exists(r"models/pytorch_model.bin"):
        os.makedirs("./models", exist_ok=True)
        path = hf_hub_download(
            repo_id="tmnam20/codebert-code-summarization",
            filename="pytorch_model.bin",
            cache_dir="cache",
            local_dir=os.path.join(os.getcwd(), "models"),
            local_dir_use_symlinks=False,
            force_download=True,
        )


# load with streamlit cache decorator
@st.cache(persist=False, show_spinner=True)
def load_tokenizer_and_model(pretrained_path):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Config model
    config_class, model_class, tokenizer_class = (
        RobertaConfig,
        RobertaModel,
        RobertaTokenizer,
    )
    model_config = config_class.from_pretrained(
        CONFIG.config_name if CONFIG.config_name else CONFIG.model_name_or_path,
        cache_dir=CONFIG.cache_dir,
    )
    model_config.save_pretrained("config")

    # load tokenizer
    tokenizer = tokenizer_class.from_pretrained(
        CONFIG.tokenizer_name if CONFIG.tokenizer_name else CONFIG.model_name_or_path,
        cache_dir=CONFIG.cache_dir,
        # do_lower_case=args.do_lower_case
    )

    # load encoder from pretrained RoBERTa
    encoder = model_class.from_pretrained(
        CONFIG.model_name_or_path, config=model_config, cache_dir=CONFIG.cache_dir
    )

    # build decoder
    decoder_layer = nn.TransformerDecoderLayer(
        d_model=model_config.hidden_size, nhead=model_config.num_attention_heads
    )
    decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)

    # build seq2seq model from pretrained encoder and from-scratch decoder
    model = Seq2Seq(
        encoder=encoder,
        decoder=decoder,
        config=model_config,
        beam_size=CONFIG.beam_size,
        max_length=CONFIG.max_target_length,
        sos_id=tokenizer.cls_token_id,
        eos_id=tokenizer.sep_token_id,
    )

    try:
        state_dict = torch.load(
            os.path.join(os.getcwd(), "models", "pytorch_model.bin"),
            map_location=device,
        )
    except RuntimeError as e:
        try:
            state_dict = torch.load(
                os.path.join(os.getcwd(), "models", "pytorch_model.bin"),
                map_location="cpu",
            )
        except RuntimeError as e:
            state_dict = torch.load(
                os.path.join(os.getcwd(), "models", "pytorch_model_cpu.bin"),
                map_location="cpu",
            )
    model.load_state_dict(state_dict)

    model = model.to("cpu")
    torch.save(
        model.state_dict(), os.path.join(os.getcwd(), "models", "pytorch_model_cpu.bin")
    )

    model = model.to(device)

    return tokenizer, model, device


def preprocessing(code_segment):
    # remove newlines
    code_segment = re.sub(r"\n", " ", code_segment)

    # remove docstring
    code_segment = re.sub(r'""".*?"""', "", code_segment, flags=re.DOTALL)

    # remove multiple spaces
    code_segment = re.sub(r"\s+", " ", code_segment)

    # remove comments
    code_segment = re.sub(r"#.*", "", code_segment)

    # remove html tags
    code_segment = re.sub(r"<.*?>", "", code_segment)

    # remove urls
    code_segment = re.sub(r"http\S+", "", code_segment)

    # split special chars into different tokens
    code_segment = re.sub(r"([^\w\s])", r" \1 ", code_segment)

    return code_segment.split()


def generate_docstring(model, tokenizer, device, code_segment, max_length=None):
    input_tokens = preprocessing(code_segment)
    encoded_input = tokenizer.encode_plus(
        input_tokens,
        max_length=CONFIG.max_source_length,
        pad_to_max_length=True,
        truncation=True,
        return_tensors="pt",
    )

    input_ids = encoded_input["input_ids"].to(device)
    input_mask = encoded_input["attention_mask"].to(device)

    if max_length is not None:
        model.max_length = max_length

    summary = model(input_ids, input_mask)

    # decode summary with tokenizer
    summaries = []
    for i in range(summary.shape[1]):
        summaries.append(tokenizer.decode(summary[0][i], skip_special_tokens=True))
    return summaries
    # return tokenizer.decode(summary[0][0], skip_special_tokens=True)
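One caching caveat, offered as an assumption rather than a fix: on some Streamlit versions, st.cache hashes the return value of the wrapped function, and a mutable torch model can trip that check. A sketch of the same decorator with output hashing disabled:

import streamlit as st

# A sketch, not the committed code: allow_output_mutation=True tells st.cache
# to skip hashing the returned (tokenizer, model, device) tuple.
@st.cache(persist=False, show_spinner=True, allow_output_mutation=True)
def load_tokenizer_and_model(pretrained_path):
    ...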