Upload 17 files
#22
by awinml - opened
- app.py +34 -7
- requirements.txt +1 -1
- utils/models.py +60 -13
- utils/nltkmodules.py +3 -2
- utils/retriever.py +120 -47
- utils/vector_index.py +13 -1
app.py
CHANGED
@@ -1,5 +1,5 @@
 import re
-
+import numpy as np
 import openai
 import streamlit_scrollable_textbox as stx
 
@@ -8,23 +8,27 @@ import streamlit as st
 
 st.set_page_config(layout="wide")  # isort: split
 
+from utils import nltkmodules
 from utils.entity_extraction import (
     clean_entities,
+    extract_keywords,
     extract_quarter_year,
     extract_ticker_spacy,
     format_entities_flan_alpaca,
     generate_alpaca_ner_prompt,
-    extract_keywords
 )
 from utils.models import (
     generate_entities_flan_alpaca_checkpoint,
     generate_entities_flan_alpaca_inference_api,
     generate_text_flan_t5,
-    get_data,
     get_alpaca_model,
+    get_data,
     get_flan_alpaca_xl_model,
     get_flan_t5_model,
     get_instructor_embedding_model,
+    get_instructor_embedding_model_api,
+    get_bm25_model,
+    preprocess_text,
     get_mpnet_embedding_model,
     get_sgpt_embedding_model,
     get_spacy_model,
@@ -55,6 +59,7 @@ from utils.retriever import (
     sentence_id_combine,
     text_lookup,
     year_quarter_range,
+    get_bm25_search_hits,
 )
 from utils.transcript_retrieval import retrieve_transcript
 from utils.vector_index import (
@@ -62,7 +67,6 @@ from utils.vector_index import (
     create_sparse_embeddings,
     hybrid_score_norm,
 )
-from utils import nltkmodules
 
 st.title("Question Answering on Earnings Call Transcripts")
 
@@ -75,6 +79,8 @@ col1, col2 = st.columns([3, 3], gap="medium")
 
 
 with st.sidebar:
+    use_bm25 = st.checkbox("Use BM25 for filtering results")
+
     ner_choice = st.selectbox("Select NER Model", ["Spacy", "Alpaca"])
     document_type = st.selectbox(
         "Select Query Type", ["Single-Document", "Multi-Document"]
@@ -85,6 +91,18 @@ with st.sidebar:
        ["Single-Company", "Compare Companies"],
    )
 
+
+corpus, bm25 = get_bm25_model(data)
+
+tokenized_query = preprocess_text(query_text).split()
+sparse_scores = np.argsort(bm25.get_scores(tokenized_query), axis=0)[::-1]
+indices_hits = get_bm25_search_hits(corpus, sparse_scores, 50)
+
+if use_bm25 == True:
+    indices = indices_hits
+else:
+    indices = None
+
 if ner_choice == "Spacy":
     ner_model = get_spacy_model()
 
@@ -305,7 +323,7 @@ elif encoder_model == "Instructor":
     )
     pinecone_index_name = "week13-instructor-xl"
     pinecone_index = pinecone.Index(pinecone_index_name)
-    retriever_model = get_instructor_embedding_model()
+    retriever_model = get_instructor_embedding_model_api()
     instruction = (
         "Represent the financial question for retrieving supporting documents:"
     )
@@ -318,7 +336,7 @@ elif encoder_model == "Hybrid Instructor - SPLADE":
     )
     pinecone_index_name = "week13-splade-instructor-xl"
     pinecone_index = pinecone.Index(pinecone_index_name)
-    retriever_model = get_instructor_embedding_model()
+    retriever_model = get_instructor_embedding_model_api()
     (
         sparse_retriever_model,
         sparse_retriever_tokenizer,
@@ -382,6 +400,7 @@ if document_type == "Single-Document":
         dense_query_embedding, sparse_query_embedding = hybrid_score_norm(
             dense_query_embedding, sparse_query_embedding, 0.3
         )
+
         query_results = query_pinecone_sparse(
             dense_query_embedding,
             sparse_query_embedding,
@@ -392,6 +411,7 @@ if document_type == "Single-Document":
            ticker,
            participant_type,
            keywords,
+           indices,
            threshold,
        )
 
@@ -413,6 +433,7 @@ if document_type == "Single-Document":
            ticker,
            participant_type,
            keywords,
+           indices,
            threshold,
        )
 
@@ -459,6 +480,7 @@ else:
            ticker,
            participant_type,
            keywords,
+           indices,
            threshold,
        )
        results_list = sentence_id_combine(
@@ -490,6 +512,7 @@ else:
            ticker,
            participant_type,
            keywords,
+           indices,
            threshold,
        )
        results_list = sentence_id_combine(
@@ -535,6 +558,7 @@ else:
            ticker_first,
            participant_type,
            keywords,
+           indices,
            threshold,
        )
        results_list = sentence_id_combine(
@@ -557,6 +581,7 @@ else:
            ticker_second,
            participant_type,
            keywords,
+           indices,
            threshold,
        )
        results_list = sentence_id_combine(
@@ -591,6 +616,7 @@ else:
            ticker_first,
            participant_type,
            keywords,
+           indices,
            threshold,
        )
        results_list = sentence_id_combine(
@@ -612,6 +638,7 @@ else:
            ticker_second,
            participant_type,
            keywords,
+           indices,
            threshold,
        )
        results_list = sentence_id_combine(
@@ -778,7 +805,7 @@ if decoder_model == "GPT-J":
     )
     submitted = st.form_submit_button("Submit")
 
-tab1, tab2 = st.tabs(["
+tab1, tab2 = st.tabs(["Retrieved Text", "Retrieved Documents"])
 
 
 with tab1:
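Note on the app.py change: the new sidebar checkbox gates a lexical pre-filter. The query is run through preprocess_text, scored against every document with BM25, and the top-50 hit indices are either passed on to the retriever (use_bm25 checked) or set to None (no filtering). A minimal sketch of that ranking step, with a toy corpus standing in for the earnings-call data:

import numpy as np
from rank_bm25 import BM25Plus

corpus = [
    "revenue grew nine percent year over year",
    "gross margin declined on higher supply costs",
    "cloud revenue accelerated again this quarter",
]
bm25 = BM25Plus([doc.split(" ") for doc in corpus])

tokenized_query = "revenue growth".split()
# Same idiom as the diff: argsort the BM25 scores, reverse for best-first order.
sparse_scores = np.argsort(bm25.get_scores(tokenized_query), axis=0)[::-1]
top_indices = [int(i) for i in sparse_scores[:2]]
print(top_indices)  # candidate document ids to pass into the Pinecone filter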
requirements.txt
CHANGED
@@ -14,4 +14,4 @@ streamlit-scrollable-textbox
 openai
 InstructorEmbedding
 gradio_client
-
+rank_bm25
utils/models.py
CHANGED
@@ -20,26 +20,59 @@ from transformers import (
     T5Tokenizer,
     pipeline,
 )
-
-import
+from rank_bm25 import BM25Okapi, BM25L, BM25Plus
+import numpy as np
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem.porter import PorterStemmer
+import re
 import streamlit as st
 
 
-@st.cache
+@st.cache_resource
 def get_data():
     data = pd.read_csv("earnings_calls_cleaned_metadata.csv")
     return data
 
 
+# Preprocessing for BM25
+
+
+def tokenizer(
+    string, reg="[a-zA-Z'-]+|[0-9]{1,}%|[0-9]{1,}\.[0-9]{1,}%|\d+\.\d+%}"
+):
+    regex = reg
+    string = string.replace("-", " ")
+    return " ".join(re.findall(regex, string))
+
+
+def preprocess_text(text):
+    # Convert to lowercase
+    text = text.lower()
+    # Tokenize the text
+    tokens = word_tokenize(text)
+    # Remove stop words
+    stop_words = set(stopwords.words("english"))
+    tokens = [token for token in tokens if token not in stop_words]
+    # Stem the tokens
+    porter_stemmer = PorterStemmer()
+    tokens = [porter_stemmer.stem(token) for token in tokens]
+    # Join the tokens back into a single string
+    preprocessed_text = " ".join(tokens)
+    preprocessed_text = tokenizer(preprocessed_text)
+
+    return preprocessed_text
+
+
 # Initialize Spacy Model
 
 
-@st.cache
+@st.cache_resource
 def get_spacy_model():
     return spacy.load("en_core_web_trf")
 
 
-@st.cache
+@st.cache_resource
 def get_flan_alpaca_xl_model():
     model = AutoModelForSeq2SeqLM.from_pretrained(
         "/home/user/app/models/flan-alpaca-xl/"
@@ -53,19 +86,19 @@ def get_flan_alpaca_xl_model():
 # Initialize models from HuggingFace
 
 
-@st.cache
+@st.cache_resource
 def get_t5_model():
     return pipeline("summarization", model="t5-small", tokenizer="t5-small")
 
 
-@st.cache
+@st.cache_resource
 def get_flan_t5_model():
     tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
     model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")
     return model, tokenizer
 
 
-@st.cache
+@st.cache_resource
 def get_mpnet_embedding_model():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = SentenceTransformer(
@@ -75,7 +108,7 @@ def get_mpnet_embedding_model():
     return model
 
 
-@st.cache
+@st.cache_resource
 def get_splade_sparse_embedding_model():
     model_sparse = "naver/splade-cocondenser-ensembledistil"
     # check device
@@ -87,7 +120,7 @@ def get_splade_sparse_embedding_model():
     return model_sparse, tokenizer
 
 
-@st.cache
+@st.cache_resource
 def get_sgpt_embedding_model():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = SentenceTransformer(
@@ -97,20 +130,34 @@ def get_sgpt_embedding_model():
     return model
 
 
-@st.cache
+@st.cache_resource
 def get_instructor_embedding_model():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = INSTRUCTOR("hkunlp/instructor-xl")
     return model
 
+
+@st.cache_resource
+def get_instructor_embedding_model_api():
+    client = Client("https://awinml-api-instructor-xl-2.hf.space/")
+    return client
+
 
-@st.cache
+@st.cache_resource
 def get_alpaca_model():
     client = Client("https://awinml-alpaca-cpp.hf.space")
     return client
 
 
-@st.cache
+@st.cache_resource
+def get_bm25_model(data):
+    corpus = data.Text.tolist()
+    corpus_clean = [preprocess_text(x) for x in corpus]
+    tokenized_corpus = [doc.split(" ") for doc in corpus_clean]
+    bm25 = BM25Plus(tokenized_corpus)
+    return corpus, bm25
+
+
+@st.cache_resource
 def save_key(api_key):
     return api_key
 
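Note on the models.py change: every loader swaps the old caching decorator for @st.cache_resource, Streamlit's current API for global resources; the decorated function runs once per process and all sessions share the returned object. A hedged sketch of the pattern, with a toy loader in place of the real models:

import streamlit as st

@st.cache_resource  # body executes once; reruns and other sessions reuse the object
def load_heavy_resource():
    # stand-in for an expensive load such as INSTRUCTOR("hkunlp/instructor-xl")
    return {"weights": [0.1, 0.2, 0.3]}

model = load_heavy_resource()  # cached after the first call

Two review observations: the reg default in tokenizer() ends with a stray } ("...\d+\.\d+%}"), which looks unintended, and get_bm25_model caches both the raw corpus and the BM25Plus index, so the per-document preprocessing runs only once per process.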
utils/nltkmodules.py
CHANGED
@@ -1,4 +1,5 @@
 import nltk
 
-nltk.download(
-nltk.download(
+nltk.download("wordnet")
+nltk.download("punkt")
+nltk.download("stopwords")
utils/retriever.py
CHANGED
@@ -1,6 +1,16 @@
-def query_pinecone_sparse(
+def get_bm25_search_hits(corpus, sparse_scores, top_n=50):
+    bm25_search = []
+    indices = []
+    for idx in sparse_scores:
+        if len(bm25_search) <= top_n:
+            bm25_search.append(corpus[idx])
+            indices.append(idx)
+    indices = [int(x) for x in indices]
+    return indices
+
+
+def query_pinecone(
     dense_vec,
-    sparse_vec,
     top_k,
     index,
     year,
@@ -8,6 +18,7 @@ def query_pinecone_sparse(
     ticker,
     participant_type,
     keywords=None,
+    indices=None,
     threshold=0.25,
 ):
     if participant_type == "Company Speaker":
@@ -16,68 +27,126 @@ def query_pinecone_sparse(
     participant = "Question"
 
     # Create filter dictionary based on keywords
-    filter_dict = [{
+    filter_dict = [{"Keywords": word} for word in keywords]
 
     if year == "All":
         if quarter == "All":
-            xc = index.query(
-                vector=dense_vec,
-                sparse_vector=sparse_vec,
-                top_k=top_k,
-                filter={
-                    "Year": {
-                        "$in": [
-                            int("2020"),
-                            int("2019"),
-                            int("2018"),
-                            int("2017"),
-                            int("2016"),
-                        ]
-                    },
-                    "Quarter": {"$in": ["Q1", "Q2", "Q3", "Q4"]},
-                    "Ticker": {"$eq": ticker},
-                    "QA_Flag": {"$eq": participant},
-                    "Keywords": {"$in": keywords}
-                },
-                include_metadata=True,
-            )
-        else:
-            xc = index.query(
-                vector=dense_vec,
-                sparse_vector=sparse_vec,
-                top_k=top_k,
-                filter={
-                    "Year": {
-                        "$in": [
-                            int("2020"),
-                            int("2019"),
-                            int("2018"),
-                            int("2017"),
-                            int("2016"),
-                        ]
-                    },
-                    "Quarter": {"$eq": quarter},
-                    "Ticker": {"$eq": ticker},
-                    "QA_Flag": {"$eq": participant},
-                    "Keywords": {"$in": keywords}
-                },
-                include_metadata=True,
-            )
-    else:
-        # search pinecone index for context passage with the answer
-        xc = index.query(
-            vector=dense_vec,
-            sparse_vector=sparse_vec,
-            top_k=top_k,
-            filter={
-                "Year": int(year),
-                "Quarter": {"$eq": quarter},
-                "Ticker": {"$eq": ticker},
-                "QA_Flag": {"$eq": participant},
-                "Keywords": {"$in": keywords}
-            },
-            include_metadata=True,
-        )
+            if indices != None:
+                xc = index.query(
+                    vector=dense_vec,
+                    top_k=top_k,
+                    filter={
+                        "Year": {
+                            "$in": [
+                                int("2020"),
+                                int("2019"),
+                                int("2018"),
+                                int("2017"),
+                                int("2016"),
+                            ]
+                        },
+                        "Quarter": {"$in": ["Q1", "Q2", "Q3", "Q4"]},
+                        "Ticker": {"$eq": ticker},
+                        "QA_Flag": {"$eq": participant},
+                        "Keywords": {"$in": keywords},
+                        "index": {"$in": indices},
+                    },
+                    include_metadata=True,
+                )
+            else:
+                xc = index.query(
+                    vector=dense_vec,
+                    top_k=top_k,
+                    filter={
+                        "Year": {
+                            "$in": [
+                                int("2020"),
+                                int("2019"),
+                                int("2018"),
+                                int("2017"),
+                                int("2016"),
+                            ]
+                        },
+                        "Quarter": {"$in": ["Q1", "Q2", "Q3", "Q4"]},
+                        "Ticker": {"$eq": ticker},
+                        "QA_Flag": {"$eq": participant},
+                        "Keywords": {"$in": keywords},
+                    },
+                    include_metadata=True,
+                )
+        else:
+            if indices != None:
+                xc = index.query(
+                    vector=dense_vec,
+                    top_k=top_k,
+                    filter={
+                        "Year": {
+                            "$in": [
+                                int("2020"),
+                                int("2019"),
+                                int("2018"),
+                                int("2017"),
+                                int("2016"),
+                            ]
+                        },
+                        "Quarter": {"$eq": quarter},
+                        "Ticker": {"$eq": ticker},
+                        "QA_Flag": {"$eq": participant},
+                        "Keywords": {"$in": keywords},
+                        "index": {"$in": indices},
+                    },
+                    include_metadata=True,
+                )
+            else:
+                xc = index.query(
+                    vector=dense_vec,
+                    top_k=top_k,
+                    filter={
+                        "Year": {
+                            "$in": [
+                                int("2020"),
+                                int("2019"),
+                                int("2018"),
+                                int("2017"),
+                                int("2016"),
+                            ]
+                        },
+                        "Quarter": {"$eq": quarter},
+                        "Ticker": {"$eq": ticker},
+                        "QA_Flag": {"$eq": participant},
+                        "Keywords": {"$in": keywords},
+                    },
+                    include_metadata=True,
+                )
+    else:
+        # search pinecone index for context passage with the answer
+        if indices != None:
+            xc = index.query(
+                vector=dense_vec,
+                top_k=top_k,
+                filter={
+                    "Year": int(year),
+                    "Quarter": {"$eq": quarter},
+                    "Ticker": {"$eq": ticker},
+                    "QA_Flag": {"$eq": participant},
+                    "Keywords": {"$in": keywords},
+                    "index": {"$in": indices},
+                },
+                include_metadata=True,
+            )
+        else:
+            xc = index.query(
+                vector=dense_vec,
+                top_k=top_k,
+                filter={
+                    "Year": int(year),
+                    "Quarter": {"$eq": quarter},
+                    "Ticker": {"$eq": ticker},
+                    "QA_Flag": {"$eq": participant},
+                    "Keywords": {"$in": keywords},
+                },
+                include_metadata=True,
+            )
     # filter the context passages based on the score threshold
     filtered_matches = []
     for match in xc["matches"]:
@@ -87,8 +156,9 @@ def query_pinecone_sparse(
     return xc
 
 
-def query_pinecone(
+def query_pinecone_sparse(
     dense_vec,
+    sparse_vec,
     top_k,
     index,
     year,
@@ -96,6 +166,7 @@ def query_pinecone(
     ticker,
     participant_type,
     keywords=None,
+    indices=None,
     threshold=0.25,
 ):
     if participant_type == "Company Speaker":
@@ -104,13 +175,13 @@ def query_pinecone(
     participant = "Question"
 
     # Create filter dictionary based on keywords
-    filter_dict = [{
-
+    filter_dict = [{"Keywords": word} for word in keywords]
 
     if year == "All":
         if quarter == "All":
             xc = index.query(
                 vector=dense_vec,
+                sparse_vector=sparse_vec,
                 top_k=top_k,
                 filter={
                     "Year": {
@@ -125,13 +196,14 @@ def query_pinecone(
                     "Quarter": {"$in": ["Q1", "Q2", "Q3", "Q4"]},
                     "Ticker": {"$eq": ticker},
                     "QA_Flag": {"$eq": participant},
-                    "Keywords": {"$in": keywords}
+                    "Keywords": {"$in": keywords},
                 },
                 include_metadata=True,
             )
         else:
             xc = index.query(
                 vector=dense_vec,
+                sparse_vector=sparse_vec,
                 top_k=top_k,
                 filter={
                     "Year": {
@@ -146,7 +218,7 @@ def query_pinecone(
                     "Quarter": {"$eq": quarter},
                     "Ticker": {"$eq": ticker},
                     "QA_Flag": {"$eq": participant},
-                    "Keywords": {"$in": keywords}
+                    "Keywords": {"$in": keywords},
                 },
                 include_metadata=True,
             )
@@ -154,13 +226,14 @@ def query_pinecone(
         # search pinecone index for context passage with the answer
         xc = index.query(
             vector=dense_vec,
+            sparse_vector=sparse_vec,
             top_k=top_k,
             filter={
                 "Year": int(year),
                 "Quarter": {"$eq": quarter},
                 "Ticker": {"$eq": ticker},
                 "QA_Flag": {"$eq": participant},
-                "Keywords": {"$in": keywords}
+                "Keywords": {"$in": keywords},
             },
             include_metadata=True,
         )
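Note on the retriever.py change: both query functions accept the new indices argument; when it is not None they add "index": {"$in": indices} to the metadata filter, so Pinecone only considers vectors whose stored index field is among the BM25 hits. One small catch: the len(bm25_search) <= top_n guard in get_bm25_search_hits admits top_n + 1 hits (< would cap it at exactly top_n). A sketch of the resulting filter shape, with hypothetical values in place of the real sidebar inputs:

# Hypothetical inputs; the real ones come from the sidebar and the BM25 pass.
keywords = ["margin", "guidance"]
indices = [3, 17, 42]

pinecone_filter = {
    "Year": {"$in": [2016, 2017, 2018, 2019, 2020]},
    "Quarter": {"$in": ["Q1", "Q2", "Q3", "Q4"]},
    "Ticker": {"$eq": "AAPL"},
    "QA_Flag": {"$eq": "Question"},
    "Keywords": {"$in": keywords},
    "index": {"$in": indices},  # restricts scoring to the BM25 candidates
}
print(pinecone_filter)

(The filter_dict list comprehension both functions build is still unused in the queries shown here.)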
utils/vector_index.py
CHANGED
@@ -1,11 +1,23 @@
 import torch
+import json
+import numpy as np
 
 
 def create_dense_embeddings(query, model, instruction=None):
     if instruction == None:
         dense_emb = model.encode([query]).tolist()
     else:
-        dense_emb = model.encode([[instruction, query]]).tolist()
+        # Fetching embedding from API for Instructor
+        json_output_embedding = model.predict(
+            instruction,
+            query,
+            api_name="/predict",
+        )
+
+        json_file = open(json_output_embedding, "r")
+        json_dict = json.load(json_file)
+        dense_array = np.array(json_dict["data"], dtype=np.float64)
+        dense_emb = dense_array.tolist()
     return dense_emb
 
 
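Note on the vector_index.py change: for the Instructor path, model is now a gradio_client Client rather than a local INSTRUCTOR model, and predict(..., api_name="/predict") on a file-returning endpoint hands back a local filepath whose JSON payload holds the embedding. A sketch of just the parsing half, with a hand-written file standing in for the API response:

import json
import numpy as np

# Stand-in for the filepath returned by model.predict(instruction, query, api_name="/predict")
with open("embedding.json", "w") as f:
    json.dump({"data": [[0.12, -0.05, 0.33]]}, f)

with open("embedding.json") as f:
    dense_emb = np.array(json.load(f)["data"], dtype=np.float64).tolist()
print(dense_emb)  # same list-of-lists shape model.encode() returns

The diff's json_file handle is never closed; a with block (as above) would be the tidier form.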