suzhoum committed on
Commit
5e17fcf
1 Parent(s): d59a442
Files changed (3) hide show
  1. app.py +27 -7
  2. automm_semantic_embedding.py +43 -0
  3. requirements.txt +1 -0
app.py CHANGED
@@ -1,18 +1,24 @@
1
  import gradio as gr
 
 
 
2
  from autogluon.multimodal import MultiModalPredictor
3
 
4
 
5
  def text_embedding(query: str):
6
  model_name = "sentence-transformers/all-MiniLM-L6-v2"
7
-
 
8
  predictor = MultiModalPredictor(
9
  pipeline="feature_extraction",
10
  hyperparameters={
11
  "model.hf_text.checkpoint_name": model_name
12
  }
13
  )
 
 
14
  query_embedding = predictor.extract_embedding([query])
15
- return query_embedding["0"]
16
 
17
 
18
  def main():
@@ -20,13 +26,27 @@ def main():
20
  gr.Markdown("# Text Embedding for Search Queries")
21
  gr.Markdown("Ask an open question!")
22
  with gr.Row():
23
- inp = gr.Textbox(show_label=False)
24
  with gr.Row():
25
- btn = gr.Button("Generate Embedding")
 
 
 
26
  with gr.Row():
27
- out = gr.DataFrame(label="Embedding", show_label=True)
28
-
29
- btn.click(fn=text_embedding, inputs=inp, outputs=out)
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  demo.launch()
32
 
 
1
  import gradio as gr
2
+ import ir_datasets
3
+ import pandas as pd
4
+
5
  from autogluon.multimodal import MultiModalPredictor
6
 
7
 
8
  def text_embedding(query: str):
9
  model_name = "sentence-transformers/all-MiniLM-L6-v2"
10
+ # dataset = ir_datasets.load("beir/fiqa/dev")
11
+ # docs_df = pd.DataFrame(dataset.docs_iter()).set_index("doc_id").sample(frac=0.001)
12
  predictor = MultiModalPredictor(
13
  pipeline="feature_extraction",
14
  hyperparameters={
15
  "model.hf_text.checkpoint_name": model_name
16
  }
17
  )
18
+ # query_embedding = predictor.extract_embedding(docs_df)
19
+ # return query_embedding["text"]
20
  query_embedding = predictor.extract_embedding([query])
21
+ return query_embedding["0"]
22
 
23
 
24
  def main():
 
26
  gr.Markdown("# Text Embedding for Search Queries")
27
  gr.Markdown("Ask an open question!")
28
  with gr.Row():
29
+ inp_single = gr.Textbox(show_label=False)
30
  with gr.Row():
31
+ btn_single = gr.Button("Generate Embedding")
32
+ with gr.Row():
33
+ out_single = gr.DataFrame(label="Embedding", show_label=True)
34
+ gr.Markdown("You can select one of the sample datasets for batch inference")
35
  with gr.Row():
36
+ with gr.Column():
37
+ btn_fiqa = gr.Button("fiqa")
38
+ with gr.Column():
39
+ btn_faiss = gr.Button("faiss")
40
+ with gr.Row():
41
+ out_batch = gr.DataFrame(label="Embedding", show_label=True)
42
+ gr.Markdown("You can also try out our batch inference by uploading a file")
43
+ with gr.Row():
44
+ out_batch = gr.File(interactive=True)
45
+ with gr.Row():
46
+ btn_file = gr.Button("Generate Embedding")
47
+
48
+ btn_single.click(fn=text_embedding, inputs=inp_single, outputs=out_single)
49
+ btn_file.click(fn=text_embedding, inputs=inp_single, outputs=out_single)
50
 
51
  demo.launch()
52
 
automm_semantic_embedding.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import ir_datasets
import numpy as np
import pandas as pd
from autogluon.multimodal import MultiModalPredictor

# Demo script: embed a small random sample of the BEIR FiQA documents and one
# hard-coded query with the same text model, then print the two sampled
# documents whose embeddings have the highest cosine similarity to the query.

# Load the dataset once (the original script issued this call twice).
dataset = ir_datasets.load("beir/fiqa/dev")

# Only a tiny random fraction of the documents is embedded to keep the demo
# fast; the sample is not seeded, so results vary between runs.
docs_df = pd.DataFrame(dataset.docs_iter()).set_index("doc_id").sample(frac=0.0001)
# Kept for parity with the original script; not used further below.
query_df = pd.DataFrame(dataset.queries_iter()).set_index("query_id")

model_name = "sentence-transformers/all-MiniLM-L6-v2"

predictor = MultiModalPredictor(
    pipeline="feature_extraction",
    hyperparameters={
        "model.hf_text.checkpoint_name": model_name
    }
)

document_embedding = predictor.extract_embedding(docs_df)

query = "What happened when the dot com bubble burst?"
query_embedding = predictor.extract_embedding([query])

# L2-normalise both sides so that the dot product is the cosine similarity.
q_norm = query_embedding['0'] / np.linalg.norm(query_embedding['0'], axis=-1, keepdims=True)
d_norm = document_embedding['text'] / np.linalg.norm(document_embedding['text'], axis=-1, keepdims=True)
scores = d_norm.dot(q_norm[0])


print(f'Question: {query}')
print()
# argsort on the negated scores yields document positions from best to worst.
# The heading shows the 1-based rank; the original printed the document's
# positional index there, which mislabelled the results.
for rank, idx in enumerate(np.argsort(-scores)[:2], start=1):
    print(f'Top {rank} result:')
    print('-----------------')
    print(docs_df['text'].iloc[idx])
    print()
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  gradio
2
  wheel
3
  setuptools
 
4
  git+https://github.com/awslabs/autogluon.git@master#subdirectory=autogluon
 
1
  gradio
2
  wheel
3
  setuptools
4
+ ir_datasets
5
  git+https://github.com/awslabs/autogluon.git@master#subdirectory=autogluon