File size: 1,152 Bytes
5e17fcf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import ir_datasets
import numpy as np
import pandas as pd
from autogluon.multimodal import MultiModalPredictor


# Load the "beir/fiqa/dev" dataset handle once; both the documents and the
# queries below are read from this single handle.
dataset = ir_datasets.load("beir/fiqa/dev")

# Corpus indexed by doc_id, cut down to a random 0.01% of its rows so the demo
# only has to embed a small subset of documents.
# NOTE: sample() is called without a seed, so the subset differs between runs.
docs_df = pd.DataFrame(dataset.docs_iter()).set_index("doc_id").sample(frac=0.0001)

# Queries that ship with the dataset, indexed by query_id.
query_df = pd.DataFrame(dataset.queries_iter()).set_index("query_id")

# Inputs for the embedding step: the checkpoint name passed to the predictor's
# `model.hf_text.checkpoint_name` setting, and the free-text question to
# search the sampled documents for.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
query = "What happened when the dot com bubble burst?"

# Predictor configured for the "feature_extraction" pipeline with that checkpoint.
predictor = MultiModalPredictor(
    pipeline="feature_extraction",
    hyperparameters={"model.hf_text.checkpoint_name": model_name},
)

# Embed the sampled documents first, then the query; the query is wrapped in
# a one-element list because it is a single string.
document_embedding = predictor.extract_embedding(docs_df)
query_embedding = predictor.extract_embedding([query])

# Cosine similarity between the query and every sampled document: scale both
# sides to unit L2 norm along the last axis, so a plain dot product between a
# document vector and the query vector is their cosine.
# The query vectors are read under key '0' and the document vectors under key
# 'text' (presumably the input column names -- confirm against the AutoGluon
# version in use).
q_norm = query_embedding['0'] / np.linalg.norm(query_embedding['0'], axis=-1, keepdims=True)
d_norm = document_embedding['text'] / np.linalg.norm(document_embedding['text'], axis=-1, keepdims=True)
# Only one query was embedded, so take its single row and score all documents
# against it.
scores = d_norm.dot(q_norm[0])


# Print the question, then the two best-matching documents, best first.
print(f'Question: {query}')
print()
# Negating the scores makes argsort order row positions from highest to lowest
# similarity; the slice keeps at most the first two.
for rank, idx in enumerate(np.argsort(-scores)[:2], start=1):
    # Label each hit with its 1-based rank. The row position (idx) is not a
    # ranking and would read as e.g. "Top 4 result" followed by "Top 1 result".
    print(f'Top {rank} result:')
    print('-----------------')
    # idx is a positional index into the sampled frame, hence iloc.
    print(docs_df['text'].iloc[idx])
    print()