Spaces:
Runtime error
Runtime error
Konrad Wojtasik
committed on
Commit
•
f1acfeb
1
Parent(s):
ffcea41
Add encoded corpus
Browse files- .gitattributes +9 -0
- app.py +26 -8
- distiluse-base-multilingual-cased-v1-fiqa-pl-corpus +3 -0
- distiluse-base-multilingual-cased-v1-nfcorpus-pl-corpus +3 -0
- distiluse-base-multilingual-cased-v1-scifact-pl-corpus +3 -0
- mcontriever-fiqa-pl-corpus +3 -0
- mcontriever-nfcorpus-pl-corpus +3 -0
- mcontriever-scifacts-pl-corpus +3 -0
- multilingual-e5-base-fiqa-pl-corpus +3 -0
- multilingual-e5-base-nfcorpus-pl-corpus +3 -0
- multilingual-e5-base-scifact-pl-corpus +3 -0
.gitattributes
CHANGED
@@ -32,3 +32,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
distiluse-base-multilingual-cased-v1-fiqa-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
36 |
+
distiluse-base-multilingual-cased-v1-nfcorpus-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
37 |
+
distiluse-base-multilingual-cased-v1-scifact-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
38 |
+
mcontriever-nfcorpus-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
39 |
+
mcontriever-scifacts-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
40 |
+
mcontriever-fiqa-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
41 |
+
multilingual-e5-base-nfcorpus-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
42 |
+
multilingual-e5-base-scifact-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
43 |
+
multilingual-e5-base-fiqa-pl-corpus filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -32,20 +32,35 @@ def load_data(dataset_type):
|
|
32 |
return queries, corpus
|
33 |
|
34 |
@st.cache_data()
|
35 |
-
def bi_encode(
|
36 |
|
37 |
global bi_encoder
|
38 |
# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
|
39 |
-
bi_encoder = SentenceTransformer(
|
40 |
|
41 |
-
|
|
|
42 |
|
43 |
-
|
44 |
|
45 |
-
|
46 |
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
|
51 |
st.success(f"Embeddings computed. Shape: {corpus_embeddings.shape}")
|
@@ -150,7 +165,10 @@ def search_func(query, bi_encoder_type, top_k=top_k):
|
|
150 |
|
151 |
##### Semantic Search #####
|
152 |
# Encode the query using the bi-encoder and find potentially relevant passages
|
153 |
-
|
|
|
|
|
|
|
154 |
question_embedding = question_embedding.cpu()
|
155 |
hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k,score_function=util.dot_score)
|
156 |
hits = hits[0] # Get the hits for the first query
|
|
|
32 |
return queries, corpus
|
33 |
|
34 |
@st.cache_data()
|
35 |
+
def bi_encode(bi_encoder_name,passages, dataset_name='scifact-pl'):
|
36 |
|
37 |
global bi_encoder
|
38 |
# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
|
39 |
+
bi_encoder = SentenceTransformer(bi_encoder_name,use_auth_token=auth_token)
|
40 |
|
41 |
+
# This code would be used if we were embedding the passages here; to keep it fast, we load already-embedded tensors instead:
|
42 |
+
# with st.spinner('Encoding passages into a vector space...'):
|
43 |
|
44 |
+
# if bi_encoder_name == 'intfloat/multilingual-e5-base':
|
45 |
|
46 |
+
# corpus_embeddings = bi_encoder.encode(['passage: ' + sentence for sentence in passages], convert_to_tensor=True)
|
47 |
|
48 |
+
# else:
|
49 |
+
# corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True)
|
50 |
+
|
51 |
+
with st.spinner('Loading encoded passages...'):
|
52 |
+
|
53 |
+
if bi_encoder_name == "sentence-transformers/distiluse-base-multilingual-cased-v1":
|
54 |
+
name = 'distiluse-base-multilingual-cased-v1'
|
55 |
+
|
56 |
+
elif bi_encoder_name == 'intfloat/multilingual-e5-base':
|
57 |
+
name = 'multilingual-e5-base'
|
58 |
+
|
59 |
+
elif bi_encoder_name == 'nthakur/mcontriever-base-msmarco':
|
60 |
+
name = 'mcontriever'
|
61 |
+
|
62 |
+
corpus_embeddings_name = "-".join([name, dataset_name, "corpus"])
|
63 |
+
corpus_embeddings = torch.load(corpus_embeddings_name)
|
64 |
|
65 |
|
66 |
st.success(f"Embeddings computed. Shape: {corpus_embeddings.shape}")
|
|
|
165 |
|
166 |
##### Semantic Search #####
|
167 |
# Encode the query using the bi-encoder and find potentially relevant passages
|
168 |
+
if bi_encoder_type == 'intfloat/multilingual-e5-base':
|
169 |
+
question_embedding = bi_encoder.encode("query: " + query, convert_to_tensor=True)
|
170 |
+
else:
|
171 |
+
question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
|
172 |
question_embedding = question_embedding.cpu()
|
173 |
hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k,score_function=util.dot_score)
|
174 |
hits = hits[0] # Get the hits for the first query
|
distiluse-base-multilingual-cased-v1-fiqa-pl-corpus
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2312ba328c80b17088d5ae7d704c0362ed086fe58ac7899230047ddc530e8f3f
|
3 |
+
size 118043631
|
distiluse-base-multilingual-cased-v1-nfcorpus-pl-corpus
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:855b2b61bb109bd2c4964af10b924e15a70b0f79e08380dfb9c39697e189c513
|
3 |
+
size 7441403
|
distiluse-base-multilingual-cased-v1-scifact-pl-corpus
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0e09e44d2269375c67add8ce85666882551fb547fee01e3259ecdf88842c3301
|
3 |
+
size 10615800
|
mcontriever-fiqa-pl-corpus
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7b7e6fe50d931782dd6c24019ebc1db17e5543d247fb4a212c3ef4faf2007b62
|
3 |
+
size 177064804
|
mcontriever-nfcorpus-pl-corpus
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8440623d38b4bd95c56cecbd21d9ca86f479393228a2934f3d0cc3c5edc9cad6
|
3 |
+
size 11161456
|
mcontriever-scifacts-pl-corpus
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:291dd709b7bbe015f1176c9fe37fe8e7a64d55673cac2729a304c7b26a40addd
|
3 |
+
size 15923056
|
multilingual-e5-base-fiqa-pl-corpus
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7f8183a87b7c515c54aeddd770e1531cc85fcf99b4f9e3715c0716d020689a8d
|
3 |
+
size 177064831
|
multilingual-e5-base-nfcorpus-pl-corpus
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:db3b211fc13de3a311addae59b061049e55a2794639127740c64b0680af8615b
|
3 |
+
size 11161547
|
multilingual-e5-base-scifact-pl-corpus
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4166d06d8037cc70990c3f1ac894aaae0b6187bf69e8f6a6e25d9236edf891ea
|
3 |
+
size 15923144
|