Konrad Wojtasik committed on
Commit
f1acfeb
1 Parent(s): ffcea41

Add encoded corpus

Browse files
.gitattributes CHANGED
@@ -32,3 +32,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ distiluse-base-multilingual-cased-v1-fiqa-pl-corpus filter=lfs diff=lfs merge=lfs -text
36
+ distiluse-base-multilingual-cased-v1-nfcorpus-pl-corpus filter=lfs diff=lfs merge=lfs -text
37
+ distiluse-base-multilingual-cased-v1-scifact-pl-corpus filter=lfs diff=lfs merge=lfs -text
38
+ mcontriever-nfcorpus-pl-corpus filter=lfs diff=lfs merge=lfs -text
39
+ mcontriever-scifacts-pl-corpus filter=lfs diff=lfs merge=lfs -text
40
+ mcontriever-fiqa-pl-corpus filter=lfs diff=lfs merge=lfs -text
41
+ multilingual-e5-base-nfcorpus-pl-corpus filter=lfs diff=lfs merge=lfs -text
42
+ multilingual-e5-base-scifact-pl-corpus filter=lfs diff=lfs merge=lfs -text
43
+ multilingual-e5-base-fiqa-pl-corpus filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -32,20 +32,35 @@ def load_data(dataset_type):
32
  return queries, corpus
33
 
34
  @st.cache_data()
35
- def bi_encode(bi_enc,passages):
36
 
37
  global bi_encoder
38
  #We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
39
- bi_encoder = SentenceTransformer(bi_enc,use_auth_token=auth_token)
40
 
41
- with st.spinner('Encoding passages into a vector space...'):
 
42
 
43
- if bi_enc == 'intfloat/multilingual-e5-base':
44
 
45
- corpus_embeddings = bi_encoder.encode(['passage: ' + sentence for sentence in passages], convert_to_tensor=True)
46
 
47
- else:
48
- corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
 
51
  st.success(f"Embeddings computed. Shape: {corpus_embeddings.shape}")
@@ -150,7 +165,10 @@ def search_func(query, bi_encoder_type, top_k=top_k):
150
 
151
  ##### Semantic Search #####
152
  # Encode the query using the bi-encoder and find potentially relevant passages
153
- question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
 
 
 
154
  question_embedding = question_embedding.cpu()
155
  hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k,score_function=util.dot_score)
156
  hits = hits[0] # Get the hits for the first query
 
32
  return queries, corpus
33
 
34
  @st.cache_data()
35
+ def bi_encode(bi_encoder_name,passages, dataset_name='scifact-pl'):
36
 
37
  global bi_encoder
38
  #We use the Bi-Encoder to encode all passages, so that we can use it with semantic search
39
+ bi_encoder = SentenceTransformer(bi_encoder_name,use_auth_token=auth_token)
40
 
41
+ # This code would be used if we were embedding the passages here; to keep things fast, we load pre-computed embedding tensors instead:
42
+ # with st.spinner('Encoding passages into a vector space...'):
43
 
44
+ # if bi_encoder_name == 'intfloat/multilingual-e5-base':
45
 
46
+ # corpus_embeddings = bi_encoder.encode(['passage: ' + sentence for sentence in passages], convert_to_tensor=True)
47
 
48
+ # else:
49
+ # corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True)
50
+
51
+ with st.spinner('Loading encoded passages...'):
52
+
53
+ if bi_encoder_name == "sentence-transformers/distiluse-base-multilingual-cased-v1":
54
+ name = 'distiluse-base-multilingual-cased-v1'
55
+
56
+ elif bi_encoder_name == 'intfloat/multilingual-e5-base':
57
+ name = 'multilingual-e5-base'
58
+
59
+ elif bi_encoder_name == 'nthakur/mcontriever-base-msmarco':
60
+ name = 'mcontriever'
61
+
62
+ corpus_embeddings_name = "-".join([name, dataset_name, "corpus"])
63
+ corpus_embeddings = torch.load(corpus_embeddings_name)
64
 
65
 
66
  st.success(f"Embeddings computed. Shape: {corpus_embeddings.shape}")
 
165
 
166
  ##### Semantic Search #####
167
  # Encode the query using the bi-encoder and find potentially relevant passages
168
+ if bi_encoder_type == 'intfloat/multilingual-e5-base':
169
+ question_embedding = bi_encoder.encode("query: " + query, convert_to_tensor=True)
170
+ else:
171
+ question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
172
  question_embedding = question_embedding.cpu()
173
  hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k,score_function=util.dot_score)
174
  hits = hits[0] # Get the hits for the first query
distiluse-base-multilingual-cased-v1-fiqa-pl-corpus ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2312ba328c80b17088d5ae7d704c0362ed086fe58ac7899230047ddc530e8f3f
3
+ size 118043631
distiluse-base-multilingual-cased-v1-nfcorpus-pl-corpus ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:855b2b61bb109bd2c4964af10b924e15a70b0f79e08380dfb9c39697e189c513
3
+ size 7441403
distiluse-base-multilingual-cased-v1-scifact-pl-corpus ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e09e44d2269375c67add8ce85666882551fb547fee01e3259ecdf88842c3301
3
+ size 10615800
mcontriever-fiqa-pl-corpus ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b7e6fe50d931782dd6c24019ebc1db17e5543d247fb4a212c3ef4faf2007b62
3
+ size 177064804
mcontriever-nfcorpus-pl-corpus ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8440623d38b4bd95c56cecbd21d9ca86f479393228a2934f3d0cc3c5edc9cad6
3
+ size 11161456
mcontriever-scifacts-pl-corpus ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:291dd709b7bbe015f1176c9fe37fe8e7a64d55673cac2729a304c7b26a40addd
3
+ size 15923056
multilingual-e5-base-fiqa-pl-corpus ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f8183a87b7c515c54aeddd770e1531cc85fcf99b4f9e3715c0716d020689a8d
3
+ size 177064831
multilingual-e5-base-nfcorpus-pl-corpus ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db3b211fc13de3a311addae59b061049e55a2794639127740c64b0680af8615b
3
+ size 11161547
multilingual-e5-base-scifact-pl-corpus ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4166d06d8037cc70990c3f1ac894aaae0b6187bf69e8f6a6e25d9236edf891ea
3
+ size 15923144