import datetime
import time

import faiss
import gradio as gr
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings
from usearch.index import Index


# Load titles and texts
title_text_dataset = load_dataset("suanan/BP_POC", split="train", num_proc=4).select_columns(["url", "title", "text"])

# Load the int8 and binary indices. Int8 is loaded as a view to save memory, as we never actually perform search with it.
int8_view = Index.restore("index/BP_CBG_int8_usearch_1m.index", view=True)
binary_index: faiss.IndexBinaryFlat = faiss.read_index_binary("index/BP_CBG_ubinary_faiss_1m.index")
# binary_ivf: faiss.IndexBinaryIVF = faiss.read_index_binary("BP_ubinary_ivf_faiss_50m.index")
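
# The two index files above are only read here; the script that builds them is not part of
# this file. The function below is a minimal, illustrative sketch of how they could be
# produced with the same libraries (sentence-transformers, faiss, usearch). The corpus
# argument, the 1024-dim assumption for BAAI/bge-m3, and the int8 calibration choice are
# assumptions; the function is never called in this app and uses the `model` defined below.
def build_indices_sketch(corpus_texts, dim: int = 1024):
    import numpy as np

    # Embed the corpus with the same model used for queries. `prompt=""` skips the
    # retrieval prompt, which is intended for queries rather than documents.
    float_embeddings = model.encode(corpus_texts, prompt="", normalize_embeddings=True)

    # Binary index (faiss): pack each dimension into one bit and add to an exact flat index.
    ubinary_embeddings = quantize_embeddings(float_embeddings, precision="ubinary")
    binary_idx = faiss.IndexBinaryFlat(dim)
    binary_idx.add(ubinary_embeddings)
    faiss.write_index_binary(binary_idx, "index/BP_CBG_ubinary_faiss_1m.index")

    # Int8 index (usearch): scalar-quantize to int8 (ranges calibrated from the corpus
    # itself) and save, so it can later be memory-mapped via Index.restore(..., view=True).
    int8_embeddings = quantize_embeddings(float_embeddings, precision="int8")
    int8_idx = Index(ndim=dim, metric="ip", dtype="i8")
    int8_idx.add(np.arange(len(int8_embeddings)), int8_embeddings)
    int8_idx.save("index/BP_CBG_int8_usearch_1m.index")
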

# Load the SentenceTransformer model for embedding the queries
model = SentenceTransformer(
    "BAAI/bge-m3",
    prompts={
        "retrieval": "Represent this sentence for searching relevant passages: ",
    },
    default_prompt_name="retrieval",
)
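# Because `default_prompt_name` is set, model.encode() prepends the retrieval prompt above
# to every input by default, which is what search() below relies on for the queries.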


def search(query, top_k: int = 100, rescore_multiplier: int = 1, use_approx: bool = False):
    # Log the time the query was received
    now = datetime.datetime.now()
    print(f"Current time: {now}, query: {query}")
    # 1. Embed the query as float32
    start_time = time.time()
    query_embedding = model.encode(query)
    embed_time = time.time() - start_time

    # 2. Quantize the query to ubinary
    start_time = time.time()
    query_embedding_ubinary = quantize_embeddings(query_embedding.reshape(1, -1), "ubinary")
    quantize_time = time.time() - start_time

    # 3. Search the binary index (either exact or approximate)
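    # NOTE: the approximate IVF index is not loaded (see the commented-out read above),
    # so `use_approx` currently has no effect and the exact binary index is always used.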
    # index = binary_ivf if use_approx else binary_index
    index = binary_index
    start_time = time.time()
    _scores, binary_ids = index.search(query_embedding_ubinary, top_k * rescore_multiplier)
    binary_ids = binary_ids[0]
    search_time = time.time() - start_time

    # 4. Load the corresponding int8 embeddings
    start_time = time.time()
    int8_embeddings = int8_view[binary_ids].astype(int)
    load_time = time.time() - start_time

    # 5. Rescore the top_k * rescore_multiplier using the float32 query embedding and the int8 document embeddings
    start_time = time.time()
    scores = query_embedding @ int8_embeddings.T
    rescore_time = time.time() - start_time

    # 6. Sort the scores and return the top_k
    start_time = time.time()
    indices = scores.argsort()[::-1][:top_k]
    top_k_indices = binary_ids[indices]
    top_k_scores = scores[indices]
    top_k_urls, top_k_titles, top_k_texts = zip(
        *[(title_text_dataset[idx]["url"], title_text_dataset[idx]["title"], title_text_dataset[idx]["text"]) for idx in top_k_indices.tolist()]
    )
    df = pd.DataFrame(
        {"Score": [round(value, 2) for value in top_k_scores], "Url": top_k_urls, "Title": top_k_titles, "Text": top_k_texts}
    )
    sort_time = time.time() - start_time

    return df, {
        "Embed Time": f"{embed_time:.4f} s",
        "Quantize Time": f"{quantize_time:.4f} s",
        "Search Time": f"{search_time:.4f} s",
        "Load Time": f"{load_time:.4f} s",
        "Rescore Time": f"{rescore_time:.4f} s",
        "Sort Time": f"{sort_time:.4f} s",
        "Total search Time": f"{quantize_time + search_time + load_time + rescore_time + sort_time:.4f} s",
    }


def update_info(value):
    return f"Showing the top {value} results"

with gr.Blocks(title="") as demo:
    gr.Markdown(
        """
## Official-site Dataset & open-source model BAAI/bge-m3
### v1 test POC

Details:
1. Chinese search works well. English queries are case-sensitive because the text is not lowercased during embedding, e.g. "iphone 15" must be written as "iPhone 15" to be matched accurately.
2. Environment: Python 3.10, Ubuntu 22.04 (Linux), CPU only, RAM usage between 4.5 GB (min) and 7.7 GB (max).

Build steps:
1. Convert the Excel file into a [dataset](https://huggingface.co/datasets/suanan/BP_POC) (takes about 10 seconds).
2. Embed the title & text fields of the dataset; keywords can be added later to push relevant results higher in the ranking.
3. Search with the Quantized Retrieval - Binary Search solution.

"""
    )
    with gr.Row():
        with gr.Column(scale=75):
            query = gr.Textbox(
                label="官網 Dataset & opensource model BAAI/bge-m3,  v1 測試POC",
                placeholder="輸入搜尋關鍵字或問句",
            )
        with gr.Column(scale=25):
            use_approx = gr.Radio(
                choices=[("精確搜尋", False), ("相關搜尋", True)],
                value=False,
                label="搜尋方法",
            )

    with gr.Row():
        with gr.Column(scale=2):
            top_k = gr.Slider(
                minimum=10,
                maximum=1000,
                step=5,
                value=100,
                label="顯示搜尋前幾筆",
            )
            info_text = gr.Textbox(value=update_info(top_k.value), interactive=False)
        with gr.Column(scale=2):
            rescore_multiplier = gr.Slider(
                minimum=1,
                maximum=10,
                step=1,
                value=1,
                label="Rescore multiplier",
                info="Search for `rescore_multiplier` as many documents to rescore",
            )

    search_button = gr.Button(value="Search")

    with gr.Row():
        with gr.Column(scale=4):
            output = gr.Dataframe(headers=["Score", "Url", "Title", "Text"])
        with gr.Column(scale=1):
            json = gr.JSON()
    top_k.change(fn=update_info, inputs=top_k, outputs=info_text)
    query.submit(search, inputs=[query, top_k, rescore_multiplier, use_approx], outputs=[output, json])
    search_button.click(search, inputs=[query, top_k, rescore_multiplier, use_approx], outputs=[output, json])

demo.queue()
demo.launch(share=True)