File size: 1,389 Bytes
2274e74 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
from typing import List
from tokenizers import NormalizedString, PreTokenizedString
from tokenizers.pre_tokenizers import PreTokenizer
from transformers import PreTrainedTokenizerFast
try:
from clang import cindex
except ModuleNotFoundError as e:
raise ModuleNotFoundError(
"VulBERTa Clang tokenizer requires `libclang`. Please install it via `pip install libclang`.",
) from e
class ClangPreTokenizer:
    """Custom pre-tokenizer that splits text into tokens produced by libclang.

    An instance is meant to be wrapped with ``PreTokenizer.custom(...)``; the
    wrapper calls :meth:`pre_tokenize`, which delegates the actual splitting
    to :meth:`clang_split`.
    """

    # Single libclang index, created when the class body is executed and
    # reached through ``self.cidx`` by every instance.
    cidx = cindex.Index.create()

    def clang_split(
        self,
        i: int,
        normalized_string: NormalizedString,
    ) -> List[NormalizedString]:
        """Split one piece of text into its Clang token spellings.

        The text is handed to libclang as the contents of an in-memory file
        named ``tmp.c`` and the resulting translation unit is tokenized over
        its full extent.

        Args:
            i: Unused here; presumably the index of the split supplied by
                ``PreTokenizedString.split`` (confirm against the tokenizers
                documentation).
            normalized_string: The piece to split. Its ``original`` text
                (not the normalized one) is what gets parsed.

        Returns:
            One freshly built ``NormalizedString`` per token, in token order.
            Each spelling is stripped of surrounding whitespace, and tokens
            whose stripped spelling is empty are dropped.
        """
        source_text = str(normalized_string.original)
        translation_unit = self.cidx.parse(
            "tmp.c",
            args=[""],
            unsaved_files=[("tmp.c", source_text)],
            options=0,
        )
        stripped_spellings = (
            token.spelling.strip()
            for token in translation_unit.get_tokens(
                extent=translation_unit.cursor.extent
            )
        )
        return [NormalizedString(text) for text in stripped_spellings if text]

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Split ``pretok`` in place by applying :meth:`clang_split` to it."""
        pretok.split(self.clang_split)
class VulBERTaTokenizer(PreTrainedTokenizerFast):
    """``PreTrainedTokenizerFast`` subclass that pre-tokenizes with libclang.

    Construction is identical to the parent class; afterwards the backend
    tokenizer's pre-tokenizer is replaced by a custom ``ClangPreTokenizer``.
    """

    def __init__(self, *args, **kwargs):
        """Build the parent tokenizer, then install the Clang pre-tokenizer.

        Args:
            *args: Positional arguments forwarded unchanged to
                ``PreTrainedTokenizerFast.__init__``.
            **kwargs: Keyword arguments forwarded unchanged to
                ``PreTrainedTokenizerFast.__init__``.
        """
        super().__init__(*args, **kwargs)
        # Overrides whatever pre-tokenizer the parent constructor configured.
        clang_pre_tokenizer = PreTokenizer.custom(ClangPreTokenizer())
        self._tokenizer.pre_tokenizer = clang_pre_tokenizer
|