kisooofficial and minseokKoo committed on
Commit
0c774af
0 Parent(s):

Duplicate from minseokKoo/Auto_Classifier


Co-authored-by: KooMinSeok <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
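These rules route matching paths through Git LFS, so large binaries are committed as small pointer files. A rough sketch of how the basename globs apply, using Python's fnmatch as an approximation of gitattributes matching (patterns without a slash match the basename at any depth; the pattern subset and sample paths below are illustrative):

```python
# Approximate check of which repo paths the LFS rules above would cover.
# Note: fnmatch only approximates gitattributes glob semantics.
from fnmatch import fnmatch

lfs_patterns = ["*.bin", "*.pt", "*.model", "*tfevents*"]  # subset of the rules above
paths = [
    "models/CFA-CodeBERTa-small.pt/pytorch_model.bin",
    "models/CFA-codeT5/spiece.model",
    "app.py",
]
for p in paths:
    tracked = any(fnmatch(p.rsplit("/", 1)[-1], pat) for pat in lfs_patterns)
    print(f"{p} -> {'LFS' if tracked else 'plain git'}")
```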
.vs/VSWorkspaceState.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "ExpandedNodes": [
+     ""
+   ],
+   "SelectedNode": "\\predict.py",
+   "PreviewInSolutionExplorer": false
+ }
.vs/false_alarm_detection/FileContentIndex/c4b8d8b1-bdcf-465a-8b48-9a548a0e40d4.vsidx ADDED
Binary file (298 kB)
.vs/false_alarm_detection/FileContentIndex/read.lock ADDED
File without changes
.vs/false_alarm_detection/v17/.wsuo ADDED
Binary file (22 kB)
.vs/slnx.sqlite ADDED
Binary file (90.1 kB)
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Auto Classifier
+ emoji: 💻
+ colorFrom: red
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 3.16.2
+ app_file: app.py
+ pinned: false
+ license: openrail
+ duplicated_from: minseokKoo/Auto_Classifier
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,231 @@
+ import pandas as pd
+ import numpy as np
+ import re
+ import os
+ import sys
+ import random
+ import transformers
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ from transformers import RobertaTokenizer, RobertaForSequenceClassification
+ import torch
+ import torch.nn.functional as F
+ from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
+ import gradio as gr
+
+
+ def greet(co):
+     # Normalize the input: strip /* ... */ block comments (non-greedy, so
+     # code between two comments survives), strip // line comments, and
+     # collapse escaped "\n" sequences into real newlines.
+     code_text = co
+     code_text = re.sub(r'/\*[\s\S]*?\*/', '', code_text)
+     code_text = re.sub(r'//.*', '', code_text)
+     code_text = re.sub(r'(\\n)+', '\n', code_text)
+
+     # 1. CFA-CodeBERTa-small.pt -> CodeBERTa-small-v1 fine-tuned model
+     path = os.getcwd() + '/models/CFA-CodeBERTa-small.pt'
+     tokenizer = AutoTokenizer.from_pretrained("huggingface/CodeBERTa-small-v1")
+     input_ids = tokenizer.encode(
+         code_text, max_length=512, truncation=True, padding='max_length')
+     input_ids = torch.tensor([input_ids])
+     model = RobertaForSequenceClassification.from_pretrained(
+         path, num_labels=2)
+     model.to('cpu')
+     pred_1 = model(input_ids)[0].detach().cpu().numpy()[0]
+
+     # 2. CFA-codebert-c.pt -> codebert-c fine-tuned model
+     path = os.getcwd() + '/models/CFA-codebert-c.pt'
+     tokenizer = AutoTokenizer.from_pretrained(path)
+     input_ids = tokenizer(code_text, padding=True, max_length=512,
+                           truncation=True, return_token_type_ids=True)['input_ids']
+     input_ids = torch.tensor([input_ids])
+     model = AutoModelForSequenceClassification.from_pretrained(
+         path, num_labels=2)
+     model.to('cpu')
+     pred_2 = model(input_ids)[0].detach().cpu().numpy()[0]
+
+     # 3. CFA-codebert-c-v2.pt -> undersampling + codebert-c fine-tuned model
+     path = os.getcwd() + '/models/CFA-codebert-c-v2.pt'
+     tokenizer = RobertaTokenizer.from_pretrained(path)
+     input_ids = tokenizer(code_text, padding=True, max_length=512,
+                           truncation=True, return_token_type_ids=True)['input_ids']
+     input_ids = torch.tensor([input_ids])
+     model = RobertaForSequenceClassification.from_pretrained(
+         path, num_labels=2)
+     model.to('cpu')
+     # Index [0] keeps the shape consistent with pred_1 and pred_2.
+     pred_3 = model(input_ids)[0].detach().cpu().numpy()[0]
+
+     # 4. codeT5 fine-tuned model
+     path = os.getcwd() + '/models/CFA-codeT5'
+     model_params = {
+         "MODEL": path,                  # local checkpoint directory
+         "TRAIN_BATCH_SIZE": 8,          # training batch size
+         "VALID_BATCH_SIZE": 8,          # validation batch size
+         "VAL_EPOCHS": 1,                # number of validation epochs
+         "MAX_SOURCE_TEXT_LENGTH": 512,  # max length of source text
+         "MAX_TARGET_TEXT_LENGTH": 3,    # max length of target text
+         "SEED": 2022,                   # set seed for reproducibility
+     }
+     data = pd.DataFrame({'code': [code_text]})
+     pred_4 = T5Trainer(
+         dataframe=data,
+         source_text="code",
+         model_params=model_params
+     )
+     pred_4 = int(pred_4[0])
+
+     # Weighted ensemble. Note that pred_4 is a scalar label, so its term
+     # raises both class scores equally; the argmax is effectively decided
+     # by the three logit vectors.
+     tot_result = (pred_1 * 0.8 + pred_2 * 0.1 +
+                   pred_3 * 0.1 + pred_4 * 0.1).argmax()
+     if tot_result == 0:
+         return "false positive !!"
+     else:
+         return "true positive !!"
+
+
+ # codeT5
+ class YourDataSetClass(Dataset):
+     """Wraps one dataframe column as tokenized encoder inputs for codeT5."""
+
+     def __init__(self, dataframe, tokenizer, source_len, source_text):
+         self.tokenizer = tokenizer
+         self.data = dataframe
+         self.source_len = source_len
+         self.source_text = self.data[source_text]
+
+     def __len__(self):
+         return len(self.source_text)
+
+     def __getitem__(self, index):
+         source_text = str(self.source_text[index])
+         source_text = " ".join(source_text.split())
+         source = self.tokenizer.batch_encode_plus(
+             [source_text],
+             max_length=self.source_len,
+             truncation=True,
+             padding="max_length",
+             return_tensors="pt",
+         )
+         source_ids = source["input_ids"].squeeze()
+         source_mask = source["attention_mask"].squeeze()
+         return {
+             "source_ids": source_ids.to(dtype=torch.long),
+             "source_mask": source_mask.to(dtype=torch.long),
+         }
+
+
+ def validate(epoch, tokenizer, model, device, loader):
+     model.eval()
+     predictions = []
+     with torch.no_grad():
+         for _, data in enumerate(loader, 0):
+             ids = data['source_ids'].to(device, dtype=torch.long)
+             mask = data['source_mask'].to(device, dtype=torch.long)
+
+             generated_ids = model.generate(
+                 input_ids=ids,
+                 attention_mask=mask,
+                 max_length=150,
+                 num_beams=2,
+                 repetition_penalty=2.5,
+                 length_penalty=1.0,
+                 early_stopping=True
+             )
+
+             preds = [tokenizer.decode(
+                 g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
+             # Fall back to '0' for any generation that is not a valid label.
+             preds = [p if p in ('0', '1') else '0' for p in preds]
+
+             predictions.extend(preds)
+     return predictions
+
+
+ def T5Trainer(dataframe, source_text, model_params, step="test"):
+     torch.manual_seed(model_params["SEED"])  # pytorch random seed
+     np.random.seed(model_params["SEED"])  # numpy random seed
+     torch.backends.cudnn.deterministic = True
+
+     tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
+
+     model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
+     model = model.to('cpu')
+
+     dataframe = dataframe[[source_text]]
+
+     val_dataset = dataframe
+     val_set = YourDataSetClass(
+         val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], source_text)
+
+     val_params = {
+         'batch_size': model_params["VALID_BATCH_SIZE"],
+         'shuffle': False,
+         'num_workers': 0
+     }
+
+     val_loader = DataLoader(val_set, **val_params)
+
+     for epoch in range(model_params["VAL_EPOCHS"]):
+         predictions = validate(epoch, tokenizer, model, 'cpu', val_loader)
+
+     return predictions
+
+
+ #################################################################################
+
+ '''demo = gr.Interface(
+     fn=greet,
+     inputs="text",
+     outputs="number")
+ demo.launch(share=True)
+ '''
+ with gr.Blocks() as demo1:
+     gr.Markdown(
+         """
+         <h1 align="center">
+         False-Alarm-Detector
+         </h1>
+         """)
+
+     gr.Markdown(
+         """
+         Paste code that a static analyzer has reported as a defect, and this
+         app classifies the report as a true positive or a false positive.
+         """)
+
+     with gr.Accordion(label='About the models (click here)', open=False):
+         gr.Markdown(
+             """
+             A total of three models are used:
+             1. codeBERTa-small-v1
+                 - description of codeBERTa-small-v1
+             2. codeBERT-C
+                 - description of codeBERT-C
+             3. codeT5
+                 - description of codeT5
+             """
+         )
+     with gr.Row():
+         with gr.Column():
+             inputs_1 = gr.Textbox(placeholder="Enter code.", label='Code')
+             with gr.Row():
+                 btn = gr.Button("Show result")
+         with gr.Column():
+             outputs_1 = gr.Text(label='Result')
+     btn.click(fn=greet, inputs=inputs_1, outputs=outputs_1)
+
+ if __name__ == "__main__":
+     demo1.launch()
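Taken together, greet() reloads four checkpoints from ./models on every call, normalizes the snippet, and returns the weighted argmax over the classifiers' outputs. A minimal local sanity check, assuming the repo was cloned with git-lfs so the checkpoints are real weights rather than pointer files (the C snippet is purely illustrative):

```python
# Importing app builds the Gradio Blocks but does not start the server,
# because launch() is guarded by the __main__ check in app.py.
from app import greet

# Illustrative example of code a static analyzer might flag.
flagged_code = '''
int main(int argc, char **argv) {
    char buf[8];
    strcpy(buf, argv[1]);  /* analyzer: possible buffer overflow */
    return 0;
}
'''

print(greet(flagged_code))  # "false positive !!" or "true positive !!"
```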
models/CFA-CodeBERTa-small.pt/config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "_name_or_path": "/content/drive/MyDrive/sanhak_d2a/checkpoint_ver2/model_48000.pt",
+   "architectures": [
+     "RobertaForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 6,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.25.1",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 52000
+ }
models/CFA-CodeBERTa-small.pt/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ebc62a218f82300fb2dff958c293f6526b00be7bc92adf3d4012ce0c79f340e
+ size 333849845
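These three lines are a Git LFS pointer, not the weights themselves; the ~334 MB payload is stored separately under the listed sha256. A minimal sketch of materializing the real file with huggingface_hub, which resolves LFS pointers transparently (the repo_id is an assumption based on this duplicated Space's name):

```python
# Sketch: fetch the real weights behind the LFS pointer file above.
# Assumption: the Space id is "kisooofficial/Auto_Classifier" (duplicated
# from minseokKoo/Auto_Classifier); adjust repo_id if it differs.
from huggingface_hub import hf_hub_download

local_path = hf_hub_download(
    repo_id="kisooofficial/Auto_Classifier",
    filename="models/CFA-CodeBERTa-small.pt/pytorch_model.bin",
    repo_type="space",  # this repo is a Space, not a model repo
)
print(local_path)  # cached local path to the 333,849,845-byte binary
```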
models/CFA-codeT5/config.json ADDED
@@ -0,0 +1,60 @@
+ {
+   "_name_or_path": "t5-small",
+   "architectures": [
+     "T5ForConditionalGeneration"
+   ],
+   "d_ff": 2048,
+   "d_kv": 64,
+   "d_model": 512,
+   "decoder_start_token_id": 0,
+   "dense_act_fn": "relu",
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "relu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "is_gated_act": false,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "t5",
+   "n_positions": 512,
+   "num_decoder_layers": 6,
+   "num_heads": 8,
+   "num_layers": 6,
+   "output_past": true,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "task_specific_params": {
+     "summarization": {
+       "early_stopping": true,
+       "length_penalty": 2.0,
+       "max_length": 200,
+       "min_length": 30,
+       "no_repeat_ngram_size": 3,
+       "num_beams": 4,
+       "prefix": "summarize: "
+     },
+     "translation_en_to_de": {
+       "early_stopping": true,
+       "max_length": 300,
+       "num_beams": 4,
+       "prefix": "translate English to German: "
+     },
+     "translation_en_to_fr": {
+       "early_stopping": true,
+       "max_length": 300,
+       "num_beams": 4,
+       "prefix": "translate English to French: "
+     },
+     "translation_en_to_ro": {
+       "early_stopping": true,
+       "max_length": 300,
+       "num_beams": 4,
+       "prefix": "translate English to Romanian: "
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.25.1",
+   "use_cache": true,
+   "vocab_size": 32128
+ }
models/CFA-codeT5/logs.txt ADDED
@@ -0,0 +1,3 @@
+ [14:31:15] [Data]: Reading data... <ipython-input-13-7b186dfd070a>:122
+
+ VALID Dataset: (1, 1)
models/CFA-codeT5/predictions.csv ADDED
@@ -0,0 +1,2 @@
+ ,Generated Text
+ 0,0
models/CFA-codeT5/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:116cfaf06330d424e695fb5780b8dd91eb9593a68f14d4b7bb60edb5a6d603c6
+ size 242071641
models/CFA-codeT5/special_tokens_map.json ADDED
@@ -0,0 +1,107 @@
+ {
+   "additional_special_tokens": [
+     "<extra_id_0>",
+     "<extra_id_1>",
+     "<extra_id_2>",
+     "<extra_id_3>",
+     "<extra_id_4>",
+     "<extra_id_5>",
+     "<extra_id_6>",
+     "<extra_id_7>",
+     "<extra_id_8>",
+     "<extra_id_9>",
+     "<extra_id_10>",
+     "<extra_id_11>",
+     "<extra_id_12>",
+     "<extra_id_13>",
+     "<extra_id_14>",
+     "<extra_id_15>",
+     "<extra_id_16>",
+     "<extra_id_17>",
+     "<extra_id_18>",
+     "<extra_id_19>",
+     "<extra_id_20>",
+     "<extra_id_21>",
+     "<extra_id_22>",
+     "<extra_id_23>",
+     "<extra_id_24>",
+     "<extra_id_25>",
+     "<extra_id_26>",
+     "<extra_id_27>",
+     "<extra_id_28>",
+     "<extra_id_29>",
+     "<extra_id_30>",
+     "<extra_id_31>",
+     "<extra_id_32>",
+     "<extra_id_33>",
+     "<extra_id_34>",
+     "<extra_id_35>",
+     "<extra_id_36>",
+     "<extra_id_37>",
+     "<extra_id_38>",
+     "<extra_id_39>",
+     "<extra_id_40>",
+     "<extra_id_41>",
+     "<extra_id_42>",
+     "<extra_id_43>",
+     "<extra_id_44>",
+     "<extra_id_45>",
+     "<extra_id_46>",
+     "<extra_id_47>",
+     "<extra_id_48>",
+     "<extra_id_49>",
+     "<extra_id_50>",
+     "<extra_id_51>",
+     "<extra_id_52>",
+     "<extra_id_53>",
+     "<extra_id_54>",
+     "<extra_id_55>",
+     "<extra_id_56>",
+     "<extra_id_57>",
+     "<extra_id_58>",
+     "<extra_id_59>",
+     "<extra_id_60>",
+     "<extra_id_61>",
+     "<extra_id_62>",
+     "<extra_id_63>",
+     "<extra_id_64>",
+     "<extra_id_65>",
+     "<extra_id_66>",
+     "<extra_id_67>",
+     "<extra_id_68>",
+     "<extra_id_69>",
+     "<extra_id_70>",
+     "<extra_id_71>",
+     "<extra_id_72>",
+     "<extra_id_73>",
+     "<extra_id_74>",
+     "<extra_id_75>",
+     "<extra_id_76>",
+     "<extra_id_77>",
+     "<extra_id_78>",
+     "<extra_id_79>",
+     "<extra_id_80>",
+     "<extra_id_81>",
+     "<extra_id_82>",
+     "<extra_id_83>",
+     "<extra_id_84>",
+     "<extra_id_85>",
+     "<extra_id_86>",
+     "<extra_id_87>",
+     "<extra_id_88>",
+     "<extra_id_89>",
+     "<extra_id_90>",
+     "<extra_id_91>",
+     "<extra_id_92>",
+     "<extra_id_93>",
+     "<extra_id_94>",
+     "<extra_id_95>",
+     "<extra_id_96>",
+     "<extra_id_97>",
+     "<extra_id_98>",
+     "<extra_id_99>"
+   ],
+   "eos_token": "</s>",
+   "pad_token": "<pad>",
+   "unk_token": "<unk>"
+ }
models/CFA-codeT5/spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
+ size 791656
models/CFA-codeT5/tokenizer_config.json ADDED
@@ -0,0 +1,113 @@
+ {
+   "additional_special_tokens": [
+     "<extra_id_0>",
+     "<extra_id_1>",
+     "<extra_id_2>",
+     "<extra_id_3>",
+     "<extra_id_4>",
+     "<extra_id_5>",
+     "<extra_id_6>",
+     "<extra_id_7>",
+     "<extra_id_8>",
+     "<extra_id_9>",
+     "<extra_id_10>",
+     "<extra_id_11>",
+     "<extra_id_12>",
+     "<extra_id_13>",
+     "<extra_id_14>",
+     "<extra_id_15>",
+     "<extra_id_16>",
+     "<extra_id_17>",
+     "<extra_id_18>",
+     "<extra_id_19>",
+     "<extra_id_20>",
+     "<extra_id_21>",
+     "<extra_id_22>",
+     "<extra_id_23>",
+     "<extra_id_24>",
+     "<extra_id_25>",
+     "<extra_id_26>",
+     "<extra_id_27>",
+     "<extra_id_28>",
+     "<extra_id_29>",
+     "<extra_id_30>",
+     "<extra_id_31>",
+     "<extra_id_32>",
+     "<extra_id_33>",
+     "<extra_id_34>",
+     "<extra_id_35>",
+     "<extra_id_36>",
+     "<extra_id_37>",
+     "<extra_id_38>",
+     "<extra_id_39>",
+     "<extra_id_40>",
+     "<extra_id_41>",
+     "<extra_id_42>",
+     "<extra_id_43>",
+     "<extra_id_44>",
+     "<extra_id_45>",
+     "<extra_id_46>",
+     "<extra_id_47>",
+     "<extra_id_48>",
+     "<extra_id_49>",
+     "<extra_id_50>",
+     "<extra_id_51>",
+     "<extra_id_52>",
+     "<extra_id_53>",
+     "<extra_id_54>",
+     "<extra_id_55>",
+     "<extra_id_56>",
+     "<extra_id_57>",
+     "<extra_id_58>",
+     "<extra_id_59>",
+     "<extra_id_60>",
+     "<extra_id_61>",
+     "<extra_id_62>",
+     "<extra_id_63>",
+     "<extra_id_64>",
+     "<extra_id_65>",
+     "<extra_id_66>",
+     "<extra_id_67>",
+     "<extra_id_68>",
+     "<extra_id_69>",
+     "<extra_id_70>",
+     "<extra_id_71>",
+     "<extra_id_72>",
+     "<extra_id_73>",
+     "<extra_id_74>",
+     "<extra_id_75>",
+     "<extra_id_76>",
+     "<extra_id_77>",
+     "<extra_id_78>",
+     "<extra_id_79>",
+     "<extra_id_80>",
+     "<extra_id_81>",
+     "<extra_id_82>",
+     "<extra_id_83>",
+     "<extra_id_84>",
+     "<extra_id_85>",
+     "<extra_id_86>",
+     "<extra_id_87>",
+     "<extra_id_88>",
+     "<extra_id_89>",
+     "<extra_id_90>",
+     "<extra_id_91>",
+     "<extra_id_92>",
+     "<extra_id_93>",
+     "<extra_id_94>",
+     "<extra_id_95>",
+     "<extra_id_96>",
+     "<extra_id_97>",
+     "<extra_id_98>",
+     "<extra_id_99>"
+   ],
+   "eos_token": "</s>",
+   "extra_ids": 100,
+   "model_max_length": 512,
+   "name_or_path": "t5-small",
+   "pad_token": "<pad>",
+   "sp_model_kwargs": {},
+   "special_tokens_map_file": null,
+   "tokenizer_class": "T5Tokenizer",
+   "unk_token": "<unk>"
+ }
models/CFA-codebert-c-v2.pt/config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "_name_or_path": "neulab/codebert-c",
+   "architectures": [
+     "RobertaForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "output_past": true,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.25.1",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
models/CFA-codebert-c-v2.pt/merges.txt ADDED
The diff for this file is too large to render.
models/CFA-codebert-c-v2.pt/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4700941c692199dd089f27a8e009b9b16a07807cc00dde7efd2b786e171438e
+ size 498662069
models/CFA-codebert-c-v2.pt/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "bos_token": "<s>",
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "unk_token": "<unk>"
+ }
models/CFA-codebert-c-v2.pt/tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "mask_token": {
+     "__type": "AddedToken",
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "model_max_length": 512,
+   "name_or_path": "neulab/codebert-c",
+   "pad_token": {
+     "__type": "AddedToken",
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "special_tokens_map_file": "/home/ualon/.cache/huggingface/transformers/6b6d54aefb63b9d58f063d74c065c9b46f06a8d4021859f4a1334aa6779e2528.0dc5b1041f62041ebbd23b1297f2f573769d5c97d8b7c28180ec86b8f6185aa8",
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
models/CFA-codebert-c-v2.pt/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51f91209766606979ae7781ee99d8c562fd6a4679bf01919c4b35438b5192fbb
+ size 3387
models/CFA-codebert-c-v2.pt/vocab.json ADDED
The diff for this file is too large to render.
models/CFA-codebert-c.pt/config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "_name_or_path": "neulab/codebert-c",
+   "architectures": [
+     "RobertaForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "output_past": true,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.25.1",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
models/CFA-codebert-c.pt/merges.txt ADDED
The diff for this file is too large to render.
models/CFA-codebert-c.pt/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7822986c75e0f31144128a9f4e4601053c46862ce51ae9d4eb10b579df4ce769
+ size 498662069
models/CFA-codebert-c.pt/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "bos_token": "<s>",
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "unk_token": "<unk>"
+ }
models/CFA-codebert-c.pt/tokenizer.json ADDED
The diff for this file is too large to render.
models/CFA-codebert-c.pt/tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "mask_token": {
+     "__type": "AddedToken",
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "model_max_length": 512,
+   "name_or_path": "neulab/codebert-c",
+   "pad_token": {
+     "__type": "AddedToken",
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "special_tokens_map_file": "/home/ualon/.cache/huggingface/transformers/6b6d54aefb63b9d58f063d74c065c9b46f06a8d4021859f4a1334aa6779e2528.0dc5b1041f62041ebbd23b1297f2f573769d5c97d8b7c28180ec86b8f6185aa8",
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
models/CFA-codebert-c.pt/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d79cc06ab8ecfdc58775613d97db99b1198279b90238c9c5b6801ea168e6c393
+ size 3387
models/CFA-codebert-c.pt/vocab.json ADDED
The diff for this file is too large to render.
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ numpy
+ pandas
+ torch
+ transformers
+ tensorflow-cpu
+ sentencepiece
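requirements.txt pins no versions, so the Space resolves the latest compatible releases at build time; gradio itself is supplied by the `sdk: gradio` runtime declared in README.md, which is presumably why it is absent here, and tensorflow-cpu is declared but never imported by app.py. A quick hedged import check of what app.py actually needs:

```python
# Verify the declared dependencies (plus SDK-provided gradio) are importable.
import importlib

for mod in ["numpy", "pandas", "torch", "transformers", "sentencepiece", "gradio"]:
    importlib.import_module(mod)
    print(f"{mod}: OK")
```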