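# Spaces:
# Runtime error
# Runtime error
# (The three lines above appear to be status text captured from the hosting page, not program code.)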
import datasets as ds
import sentence_transformers as sent
import streamlit as st
import torch
# Load the "20220301.simple" configuration of the "wikipedia" dataset
# (the Simple English dump this app searches over).
d = ds.load_dataset("wikipedia", "20220301.simple")
# Only the "train" split is used; individual rows are later fetched by index
# to read their 'url' and 'text' fields.
t = d["train"]
# Whole 'title' column, indexed by the corpus ids that the semantic search
# returns for each hit.
titles = t['title']
def _cache_resource(func):
    """Wrap *func* with Streamlit's shared-resource cache when one exists.

    Streamlit re-executes the script on every widget interaction, so an
    uncached loader rebuilds its object on each rerun.  The decorator is
    looked up by name so the script still starts on Streamlit releases that
    only ship the older ``experimental_singleton`` API, or neither.

    Args:
        func: Zero-argument loader whose return value should be shared.

    Returns:
        The cached wrapper, or *func* unchanged if no cache API is found.
    """
    for attr in ("cache_resource", "experimental_singleton"):
        decorator = getattr(st, attr, None)
        if decorator is not None:
            return decorator(func)
    return func


@_cache_resource
def load_model():
    """Return the sentence-transformers model used to embed search queries."""
    return sent.SentenceTransformer("distiluse-base-multilingual-cased-v1")


@_cache_resource
def load_wikipedia_embeddings():
    """Return the tensor stored in ``titles-simple-0.pt``, mapped onto the CPU.

    The result is used as the search corpus; presumably it holds one
    embedding per dataset title, in dataset order (TODO confirm against the
    script that produced the file).

    NOTE(review): ``torch.load`` unpickles the file, so it must only ever be
    pointed at a trusted, locally produced artifact.
    """
    return torch.load("titles-simple-0.pt", map_location=torch.device('cpu'))
# Page header followed by the usage notes and example queries shown above the
# search box.  The markdown text is user-facing and is kept as written.
st.title("Multilingual Semantic Search for Wikipedia Simple English")
st.markdown("""
Use semantic search to find related articles in Wikipedia Simple English: using a language model (sentence-transformers/distiluse-base-multilingual-cased-v1) we can find the closests titles from Wikipedia Simple English (wikipedia) queried in any of the model's trained languages: Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Portuguese, Russian, Spanish, Turkish:
- colesterol
- développement humain
- Crise dos mísseis de Cuba
Also, "near natural language" queries are usually enough to bring up relevant results. Try:
- ¿cuál es el edificio más alto del mundo?
- comment préparer du poulet frit
- melhores películas de pixar
(note: search is done only on the article titles, not the content)
""")
# Streamlit runs this script top to bottom on each interaction, so both
# loaders are called on every run.
model = load_model()
embeddings = load_wikipedia_embeddings()

query = st.text_input("Query (es, fr, pt, ...)")
# Skip the search while the box is empty or holds only whitespace.
if query.strip():
    # The encoder and the search helper both work on batches, so the single
    # query is wrapped in a list; `hits` then holds one result list per query.
    queries = [query]
    queries_emb = model.encode(queries, convert_to_tensor=True)
    hits = sent.util.semantic_search(queries_emb, embeddings, top_k=5)
    for q, query_hits in zip(queries, hits):
        st.write(f"----\n{q}:\n")
        for hit in query_hits:
            cid = hit['corpus_id']
            title = titles[cid]
            # Fetch the dataset row once and read both fields from it.
            row = t[cid]
            url = row['url']
            text = row['text'][:500] + "..."
            st.header(f"{title}")
            st.write(url)
            st.write(text)
            # Raw hit record, shown as returned by the search helper.
            st.write(hit)