import os
import sys

import py_vncorenlp
import torch
from transformers import AutoTokenizer, pipeline

from model.keyword_extraction_utils import extract_keywords

# Absolute path to this file's directory, so model files resolve regardless
# of the current working directory (note that py_vncorenlp changes the
# process working directory when it loads).
dir_path = os.path.dirname(os.path.realpath(__file__))


class KeyBERTVi:
    """Vietnamese keyword extraction using VnCoreNLP preprocessing, PhoBERT
    embeddings, and a Vietnamese NER pipeline."""

    def __init__(self, stopwords_file_path=None):
        # Word segmentation + POS tagging for Vietnamese.
        self.annotator = py_vncorenlp.VnCoreNLP(
            annotators=["wseg", "pos"],
            save_dir=f'{dir_path}/pretrained-models/vncorenlp')

        print("Loading PhoBERT model")
        self.phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
        # Locally serialized model; loaded via absolute path (see note on dir_path above).
        self.phobert = torch.load(f'{dir_path}/pretrained-models/phobert.pt')
        self.phobert.eval()

        print("Loading NER model")
        ner_tokenizer = AutoTokenizer.from_pretrained("NlpHUST/ner-vietnamese-electra-base")
        ner_model = torch.load(f'{dir_path}/pretrained-models/ner-vietnamese-electra-base.pt')
        ner_model.eval()
        self.ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)

        if stopwords_file_path is None:
            stopwords_file_path = f'{dir_path}/vietnamese-stopwords-dash.txt'
        with open(stopwords_file_path, encoding='utf-8') as f:
            self.stopwords = [w.strip() for w in f]

    def extract_keywords(self, title, text, ngram_range=(1, 3), top_n=5,
                         use_kmeans=False, use_mmr=False, min_freq=1):
        """Return the top_n (keyword, score) pairs extracted from text."""
        return extract_keywords(text, title,
                                self.ner_pipeline,
                                self.annotator,
                                self.phobert_tokenizer,
                                self.phobert,
                                self.stopwords,
                                ngram_n=ngram_range,
                                top_n=top_n,
                                use_kmeans=use_kmeans,
                                use_mmr=use_mmr,
                                min_freq=min_freq)

    def highlight(self, text, keywords):
        """Return text with each extracted keyword marked.

        VnCoreNLP word segmentation joins the syllables of multi-syllable
        words with underscores, so underscores are converted back to spaces
        before matching against the raw text.
        """
        kw_ls = [' '.join(kw.split('_')) for kw, score in keywords]
        for key in kw_ls:
            # ** is an assumed marker format; swap in whatever markup the caller needs.
            text = text.replace(f" {key}", f" **{key}**")
        return text


if __name__ == "__main__":
    # Minimal CLI: pass a text file path (relative to this directory) as the
    # first argument.
    text_file_path = sys.argv[1]
    with open(f'{dir_path}/{text_file_path}', encoding='utf-8') as f:
        text = ' '.join(ln.strip() for ln in f)

    kw_model = KeyBERTVi()
    title = None
    keyword_ls = kw_model.extract_keywords(title, text, ngram_range=(1, 3), top_n=5)
    print(keyword_ls)
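    # A hedged follow-on sketch: render the extracted keywords back into the
    # source text via highlight(); the ** marker format is the assumption
    # noted in that method.
    print(kw_model.highlight(text, keyword_ls))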