added alternatives,
Browse files
app.py
CHANGED
@@ -46,7 +46,7 @@ def calculateEmbeddings(sentences,tokenizer,model):
|
|
46 |
# explicit no operation hash function, because model and tokenizer are not going to change
|
47 |
@st.cache(hash_funcs={transformers.models.bert.tokenization_bert_fast.BertTokenizerFast: lambda _: None, transformers.models.bert.modeling_bert.BertModel: lambda _: None})
|
48 |
def load_model_and_tokenizer():
|
49 |
-
multilingual_checkpoint = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
|
50 |
tokenizer = AutoTokenizer.from_pretrained(multilingual_checkpoint)
|
51 |
model = AutoModel.from_pretrained(multilingual_checkpoint)
|
52 |
print(type(tokenizer))
|
@@ -58,7 +58,7 @@ model,tokenizer = load_model_and_tokenizer();
|
|
58 |
raw_text_file = 'joint_text_filtered.md'
|
59 |
all_sentences = load_raw_sentences(raw_text_file)
|
60 |
|
61 |
-
embeddings_file = 'multibert_embedded.pt'
|
62 |
all_embeddings = load_embeddings(embeddings_file)
|
63 |
|
64 |
|
@@ -68,7 +68,7 @@ st.caption('[HU] Adjon meg egy tetszőleges kifejezést és a rendszer visszaadj
|
|
68 |
|
69 |
|
70 |
|
71 |
-
text_area_input_query = st.text_area('[HU] Beviteli mező - [EN] Query input',value='')
|
72 |
|
73 |
if text_area_input_query:
|
74 |
query_embedding = calculateEmbeddings([text_area_input_query],tokenizer,model)
|
|
|
46 |
# explicit no operation hash function, because model and tokenizer are not going to change
|
47 |
@st.cache(hash_funcs={transformers.models.bert.tokenization_bert_fast.BertTokenizerFast: lambda _: None, transformers.models.bert.modeling_bert.BertModel: lambda _: None})
|
48 |
def load_model_and_tokenizer():
|
49 |
+
multilingual_checkpoint = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2' #alternative: SZTAKI-HLT/hubert-base-cc
|
50 |
tokenizer = AutoTokenizer.from_pretrained(multilingual_checkpoint)
|
51 |
model = AutoModel.from_pretrained(multilingual_checkpoint)
|
52 |
print(type(tokenizer))
|
|
|
58 |
raw_text_file = 'joint_text_filtered.md'
|
59 |
all_sentences = load_raw_sentences(raw_text_file)
|
60 |
|
61 |
+
embeddings_file = 'multibert_embedded.pt' #alternative: hunbert_embedded.pt
|
62 |
all_embeddings = load_embeddings(embeddings_file)
|
63 |
|
64 |
|
|
|
68 |
|
69 |
|
70 |
|
71 |
+
text_area_input_query = st.text_area('[HU] Beviteli mező - [EN] Query input',value='Mik a részfeladatok?')
|
72 |
|
73 |
if text_area_input_query:
|
74 |
query_embedding = calculateEmbeddings([text_area_input_query],tokenizer,model)
|