File size: 3,126 Bytes
664c81e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# import py_vncorenlp
# from transformers import AutoTokenizer, pipeline
# import torch
# import os
# from model.keyword_extraction_utils import extract_keywords
#
#
# class KeyBERTVi:
#
#     def __init__(self, stopwords_file_path=None):
#         self.annotator = py_vncorenlp.VnCoreNLP(annotators=["wseg", "pos"],
#                                                 save_dir=f'{dir_path}/pretrained-models/vncorenlp')
#         # model = py_vncorenlp.VnCoreNLP(save_dir='/absolute/path/to/vncorenlp')
#         print("Loading PhoBERT model")
#         self.phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
#
#         # use absolute path because torch is cached
#         self.phobert = torch.load(f'{dir_path}/pretrained-models/phobert.pt')
#         self.phobert.eval()
#
#         print("Loading NER model")
#         ner_tokenizer = AutoTokenizer.from_pretrained("NlpHUST/ner-vietnamese-electra-base")
#         ner_model = torch.load(f'{dir_path}/pretrained-models/ner-vietnamese-electra-base.pt')
#         ner_model.eval()
#         self.ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)
#
#         if stopwords_file_path is None:
#             stopwords_file_path = f'{dir_path}/vietnamese-stopwords-dash.txt'
#         with open(stopwords_file_path) as f:
#             self.stopwords = [w.strip() for w in f.readlines()]
#
#     def extract_keywords(self, title, text, ngram_range=(1, 3), top_n=5, use_kmeans=False, use_mmr=False, min_freq=1):
#         keyword_ls = extract_keywords(text, title,
#                                       self.ner_pipeline,
#                                       self.annotator,
#                                       self.phobert_tokenizer,
#                                       self.phobert,
#                                       self.stopwords,
#                                       ngram_n=ngram_range,
#                                       top_n=top_n,
#                                       use_kmeans=use_kmeans,
#                                       use_mmr=use_mmr,
#                                       min_freq=min_freq)
#         return keyword_ls
#
#     def highlight(self, text, keywords):
#         kw_ls = [' '.join(kw.split('_')) for kw, score in keywords]
#         for key in kw_ls:
#             text = text.replace(f" {key}", f" <mark>{key}</mark>")
#         return text
#
#
# dir_path = os.path.dirname(os.path.realpath(__file__))
# if __name__ == "__main__":
#     # args
#     # print(dir_path)
#
#     stopwords_file_path = f'{dir_path}/vietnamese-stopwords-dash.txt'
#
#     # text_file_path = sys.argv[1]
#     # with open(f'{dir_path}/{text_file_path}', 'r') as f:
#     #     text = ' '.join([ln.strip() for ln in f.readlines()])
#         # print(text)
#
#     # kw_model = KeyBERTVi()
#     # model_name_on_hub = "KeyBERTVi"
#     # kw_model.save_pretrained(model_name_on_hub)
#     # kw_model.phobert_tokenizer.save_pretrained(model_name_on_hub)
#
#     # title = None
#     # keyword_ls = kw_model.extract_keywords(title, text, ngram_range=(1, 3), top_n=5)
#     # print(keyword_ls)