awinml committed
Commit e375940
Parent: 5755682

Upload 191 files

Files changed (3):
  1. Transcripts/AMZN/2019-Apr-25-AMZN.txt +1 -1
  2. app.py +86 -35
  3. utils.py +177 -20
Transcripts/AMZN/2019-Apr-25-AMZN.txt CHANGED
@@ -69,7 +69,7 @@ With that, we will move to Q&A. Operator, please remind our listeners how to ini
 
 ================================================================================
 Questions and Answers
-================================================================================s
+================================================================================
 --------------------------------------------------------------------------------
 Operator [1]
 --------------------------------------------------------------------------------
app.py CHANGED
@@ -1,29 +1,31 @@
+import openai
+import streamlit_scrollable_textbox as stx
+
 import pinecone
 import streamlit as st
-
-st.set_page_config(layout="wide")
-
-import streamlit_scrollable_textbox as stx
-import openai
 from utils import (
+    create_dense_embeddings,
+    create_sparse_embeddings,
+    format_query,
+    generate_prompt,
     get_data,
+    get_flan_t5_model,
     get_mpnet_embedding_model,
     get_sgpt_embedding_model,
-    get_flan_t5_model,
+    get_splade_sparse_embedding_model,
     get_t5_model,
-    save_key,
-)
-
-from utils import (
-    retrieve_transcript,
+    gpt_model,
+    hybrid_score_norm,
     query_pinecone,
-    format_query,
+    query_pinecone_sparse,
+    retrieve_transcript,
+    save_key,
     sentence_id_combine,
     text_lookup,
-    generate_prompt,
-    gpt_model,
 )
 
+st.set_page_config(layout="wide")
+
 
 st.title("Abstractive Question Answering")
 
@@ -73,12 +75,14 @@ with st.sidebar:
     st.subheader("Select Options:")
 
 with st.sidebar:
-    num_results = int(st.number_input("Number of Results to query", 1, 15, value=6))
+    num_results = int(
+        st.number_input("Number of Results to query", 1, 15, value=6)
+    )
 
 
 # Choose encoder model
 
-encoder_models_choice = ["MPNET", "SGPT"]
+encoder_models_choice = ["MPNET", "SGPT", "Hybrid MPNET - SPLADE"]
 with st.sidebar:
     encoder_model = st.selectbox("Select Encoder Model", encoder_models_choice)
 
@@ -97,18 +101,34 @@ with st.sidebar:
 
 if encoder_model == "MPNET":
     # Connect to pinecone environment
-    pinecone.init(api_key=st.secrets["pinecone_mpnet"], environment="us-east1-gcp")
+    pinecone.init(
+        api_key=st.secrets["pinecone_mpnet"], environment="us-east1-gcp"
+    )
    pinecone_index_name = "week2-all-mpnet-base"
    pinecone_index = pinecone.Index(pinecone_index_name)
    retriever_model = get_mpnet_embedding_model()

elif encoder_model == "SGPT":
    # Connect to pinecone environment
-    pinecone.init(api_key=st.secrets["pinecone_sgpt"], environment="us-east1-gcp")
+    pinecone.init(
+        api_key=st.secrets["pinecone_sgpt"], environment="us-east1-gcp"
+    )
    pinecone_index_name = "week2-sgpt-125m"
    pinecone_index = pinecone.Index(pinecone_index_name)
    retriever_model = get_sgpt_embedding_model()
 
+elif encoder_model == "Hybrid MPNET - SPLADE":
+    pinecone.init(
+        api_key=st.secrets["pinecone_hybrid_splade_mpnet"],
+        environment="us-central1-gcp",
+    )
+    pinecone_index_name = "splade-mpnet"
+    pinecone_index = pinecone.Index(pinecone_index_name)
+    retriever_model = get_mpnet_embedding_model()
+    (
+        sparse_retriever_model,
+        sparse_retriever_tokenizer,
+    ) = get_splade_sparse_embedding_model()
 
 with st.sidebar:
     window = int(st.number_input("Sentence Window Size", 0, 10, value=1))
@@ -116,23 +136,52 @@ with st.sidebar:
 with st.sidebar:
     threshold = float(
         st.number_input(
-            label="Similarity Score Threshold", step=0.05, format="%.2f", value=0.25
+            label="Similarity Score Threshold",
+            step=0.05,
+            format="%.2f",
+            value=0.25,
         )
     )
 
 data = get_data()
 
-query_results = query_pinecone(
-    query_text,
-    num_results,
-    retriever_model,
-    pinecone_index,
-    year,
-    quarter,
-    ticker,
-    participant_type,
-    threshold,
-)
+if encoder_model == "Hybrid MPNET - SPLADE":
+    dense_query_embedding = create_dense_embeddings(
+        query_text, retriever_model
+    )
+    sparse_query_embedding = create_sparse_embeddings(
+        query_text, sparse_retriever_model, sparse_retriever_tokenizer
+    )
+    dense_query_embedding, sparse_query_embedding = hybrid_score_norm(
+        dense_query_embedding, sparse_query_embedding, 0
+    )
+    query_results = query_pinecone_sparse(
+        dense_query_embedding,
+        sparse_query_embedding,
+        num_results,
+        pinecone_index,
+        year,
+        quarter,
+        ticker,
+        participant_type,
+        threshold,
+    )
+
+else:
+    dense_query_embedding = create_dense_embeddings(
+        query_text, retriever_model
+    )
+    query_results = query_pinecone(
+        dense_query_embedding,
+        num_results,
+        pinecone_index,
+        year,
+        quarter,
+        ticker,
+        participant_type,
+        threshold,
+    )
+
 
 if threshold <= 0.90:
     context_list = sentence_id_combine(data, query_results, lag=window)
@@ -145,7 +194,9 @@ prompt = generate_prompt(query_text, context_list)
 if decoder_model == "GPT3 - (text-davinci-003)":
     with col2:
         with st.form("my_form"):
-            edited_prompt = st.text_area(label="Model Prompt", value=prompt, height=270)
+            edited_prompt = st.text_area(
+                label="Model Prompt", value=prompt, height=270
+            )
 
             openai_key = st.text_input(
                 "Enter OpenAI key",
@@ -166,20 +217,20 @@ elif decoder_model == "T5":
     output_text = []
     for context_text in context_list:
         output_text.append(t5_pipeline(context_text)[0]["summary_text"])
-    generated_text = ". ".join(output_text)
     with col2:
         st.subheader("Answer:")
-        st.write(t5_pipeline(generated_text)[0]["summary_text"])
+        for text in output_text:
+            st.markdown(f"- {text}")
 
 elif decoder_model == "FLAN-T5":
     flan_t5_pipeline = get_flan_t5_model()
     output_text = []
     for context_text in context_list:
         output_text.append(flan_t5_pipeline(context_text)[0]["summary_text"])
-    generated_text = ". ".join(output_text)
     with col2:
         st.subheader("Answer:")
-        st.write(flan_t5_pipeline(generated_text)[0]["summary_text"])
+        for text in output_text:
+            st.markdown(f"- {text}")
 
 with col1:
     with st.expander("See Retrieved Text"):
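
Note on the hybrid query path above: app.py calls hybrid_score_norm(dense_query_embedding, sparse_query_embedding, 0), i.e. alpha = 0. Per the convex combination alpha * dense + (1 - alpha) * sparse defined in utils.py, this zeroes the dense component, so the hybrid index is effectively queried on the SPLADE sparse weights alone. A minimal standalone sketch of that weighting, with toy vectors (the numbers are illustrative, not taken from the app):

# Mirrors hybrid_score_norm from utils.py: a convex combination
# alpha * dense + (1 - alpha) * sparse of the two query representations.
def hybrid_score_norm(dense, sparse, alpha):
    if alpha < 0 or alpha > 1:
        raise ValueError("Alpha must be between 0 and 1")
    hs = {
        "indices": sparse["indices"],
        "values": [v * (1 - alpha) for v in sparse["values"]],
    }
    return [v * alpha for v in dense], hs

# Toy inputs (illustrative only).
dense = [0.2, 0.4]
sparse = {"indices": [7, 42], "values": [1.0, 0.5]}

# alpha=0, as app.py calls it: dense is zeroed, retrieval is sparse-only.
print(hybrid_score_norm(dense, sparse, 0))
# -> ([0.0, 0.0], {'indices': [7, 42], 'values': [1.0, 0.5]})

# alpha=0.5 weights both representations equally.
print(hybrid_score_norm(dense, sparse, 0.5))
# -> ([0.1, 0.2], {'indices': [7, 42], 'values': [0.5, 0.25]})

Tuning alpha trades lexical matching (sparse SPLADE terms) against semantic matching (dense MPNET vectors).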
utils.py CHANGED
@@ -1,18 +1,18 @@
-import streamlit as st
-import pandas as pd
+import openai
 import pandas as pd
-from tqdm import tqdm
-import pinecone
+import streamlit_scrollable_textbox as stx
 import torch
 from sentence_transformers import SentenceTransformer
+from tqdm import tqdm
 from transformers import (
-    pipeline,
-    AutoTokenizer,
-    AutoModelForCausalLM,
+    AutoModelForMaskedLM,
     AutoModelForSeq2SeqLM,
+    AutoTokenizer,
+    pipeline,
 )
-import openai
-import streamlit_scrollable_textbox as stx
+
+import pinecone
+import streamlit as st
 
 
 @st.experimental_singleton
@@ -32,7 +32,11 @@ def get_t5_model():
 @st.experimental_singleton
 def get_flan_t5_model():
     return pipeline(
-        "summarization", model="google/flan-t5-small", tokenizer="google/flan-t5-small"
+        "summarization",
+        model="google/flan-t5-small",
+        tokenizer="google/flan-t5-small",
+        max_length=512,
+        # length_penalty = 0
     )
 
 
@@ -46,6 +50,18 @@ def get_mpnet_embedding_model():
     return model
 
 
+@st.experimental_singleton
+def get_splade_sparse_embedding_model():
+    model_sparse = "naver/splade-cocondenser-ensembledistil"
+    # check device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    tokenizer = AutoTokenizer.from_pretrained(model_sparse)
+    model_sparse = AutoModelForMaskedLM.from_pretrained(model_sparse)
+    # move to GPU if available
+    model_sparse.to(device)
+    return model_sparse, tokenizer
+
+
 @st.experimental_singleton
 def get_sgpt_embedding_model():
     device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -61,20 +77,152 @@ def save_key(api_key):
     return api_key
 
 
+def create_dense_embeddings(query, model):
+    dense_emb = model.encode([query]).tolist()
+    return dense_emb
+
+
+def create_sparse_embeddings(query, model, tokenizer):
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    inputs = tokenizer(query, return_tensors="pt").to(device)
+
+    with torch.no_grad():
+        logits = model(**inputs).logits
+
+    inter = torch.log1p(torch.relu(logits[0]))
+    token_max = torch.max(inter, dim=0)  # max over input tokens
+    nz_tokens = torch.where(token_max.values > 0)[0]
+    nz_weights = token_max.values[nz_tokens]
+
+    order = torch.sort(nz_weights, descending=True)
+    nz_weights = nz_weights[order[1]]
+    nz_tokens = nz_tokens[order[1]]
+    return {
+        "indices": nz_tokens.cpu().numpy().tolist(),
+        "values": nz_weights.cpu().numpy().tolist(),
+    }
+
+
+def hybrid_score_norm(dense, sparse, alpha: float):
+    """Hybrid score using a convex combination
+
+    alpha * dense + (1 - alpha) * sparse
+
+    Args:
+        dense: an array of floats representing the dense query vector
+        sparse: a dict of `indices` and `values`
+        alpha: scale between 0 and 1
+    """
+    if alpha < 0 or alpha > 1:
+        raise ValueError("Alpha must be between 0 and 1")
+    hs = {
+        "indices": sparse["indices"],
+        "values": [v * (1 - alpha) for v in sparse["values"]],
+    }
+    return [v * alpha for v in dense], hs
+
+
+def query_pinecone_sparse(
+    dense_vec,
+    sparse_vec,
+    top_k,
+    index,
+    year,
+    quarter,
+    ticker,
+    participant_type,
+    threshold=0.25,
+):
+    if participant_type == "Company Speaker":
+        participant = "Answer"
+    else:
+        participant = "Question"
+
+    if year == "All":
+        if quarter == "All":
+            xc = index.query(
+                vector=dense_vec,
+                sparse_vector=sparse_vec,
+                top_k=top_k,
+                filter={
+                    "Year": {
+                        "$in": [
+                            int("2020"),
+                            int("2019"),
+                            int("2018"),
+                            int("2017"),
+                            int("2016"),
+                        ]
+                    },
+                    "Quarter": {"$in": ["Q1", "Q2", "Q3", "Q4"]},
+                    "Ticker": {"$eq": ticker},
+                    "QA_Flag": {"$eq": participant},
+                },
+                include_metadata=True,
+            )
+        else:
+            xc = index.query(
+                vector=dense_vec,
+                sparse_vector=sparse_vec,
+                top_k=top_k,
+                filter={
+                    "Year": {
+                        "$in": [
+                            int("2020"),
+                            int("2019"),
+                            int("2018"),
+                            int("2017"),
+                            int("2016"),
+                        ]
+                    },
+                    "Quarter": {"$eq": quarter},
+                    "Ticker": {"$eq": ticker},
+                    "QA_Flag": {"$eq": participant},
+                },
+                include_metadata=True,
+            )
+    else:
+        # search pinecone index for context passage with the answer
+        xc = index.query(
+            vector=dense_vec,
+            sparse_vector=sparse_vec,
+            top_k=top_k,
+            filter={
+                "Year": int(year),
+                "Quarter": {"$eq": quarter},
+                "Ticker": {"$eq": ticker},
+                "QA_Flag": {"$eq": participant},
+            },
+            include_metadata=True,
+        )
+    # filter the context passages based on the score threshold
+    filtered_matches = []
+    for match in xc["matches"]:
+        if match["score"] >= threshold:
+            filtered_matches.append(match)
+    xc["matches"] = filtered_matches
+    return xc
+
+
 def query_pinecone(
-    query, top_k, model, index, year, quarter, ticker, participant_type, threshold=0.25
+    dense_vec,
+    top_k,
+    index,
+    year,
+    quarter,
+    ticker,
+    participant_type,
+    threshold=0.25,
 ):
     if participant_type == "Company Speaker":
         participant = "Answer"
     else:
         participant = "Question"
-    # generate embeddings for the query
-    xq = model.encode([query]).tolist()
 
     if year == "All":
         if quarter == "All":
             xc = index.query(
-                xq,
+                vector=dense_vec,
                 top_k=top_k,
                 filter={
                     "Year": {
@@ -94,7 +242,7 @@ def query_pinecone(
             )
         else:
             xc = index.query(
-                xq,
+                vector=dense_vec,
                 top_k=top_k,
                 filter={
                     "Year": {
@@ -115,7 +263,7 @@ def query_pinecone(
     else:
         # search pinecone index for context passage with the answer
         xc = index.query(
-            xq,
+            vector=dense_vec,
            top_k=top_k,
            filter={
                "Year": int(year),
@@ -136,24 +284,33 @@ def query_pinecone(
 
 def format_query(query_results):
     # extract passage_text from Pinecone search result
-    context = [result["metadata"]["Text"] for result in query_results["matches"]]
+    context = [
+        result["metadata"]["Text"] for result in query_results["matches"]
+    ]
     return context
 
 
 def sentence_id_combine(data, query_results, lag=1):
     # Extract sentence IDs from query results
-    ids = [result["metadata"]["Sentence_id"] for result in query_results["matches"]]
+    ids = [
+        result["metadata"]["Sentence_id"]
+        for result in query_results["matches"]
+    ]
     # Generate new IDs by adding a lag value to the original IDs
     new_ids = [id + i for id in ids for i in range(-lag, lag + 1)]
     # Remove duplicates and sort the new IDs
     new_ids = sorted(set(new_ids))
     # Create a list of lookup IDs by grouping the new IDs in groups of lag*2+1
     lookup_ids = [
-        new_ids[i : i + (lag * 2 + 1)] for i in range(0, len(new_ids), lag * 2 + 1)
+        new_ids[i : i + (lag * 2 + 1)]
+        for i in range(0, len(new_ids), lag * 2 + 1)
     ]
     # Create a list of context sentences by joining the sentences corresponding to the lookup IDs
     context_list = [
-        " ".join(data.Text.iloc[lookup_id].to_list()) for lookup_id in lookup_ids
+        " ".join(
+            data.loc[data["Sentence_id"].isin(lookup_id), "Text"].to_list()
+        )
+        for lookup_id in lookup_ids
    ]
    return context_list
 
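
For reference, a usage sketch chaining the new retrieval helpers end to end. This is a sketch under assumptions: utils.py is importable outside the Streamlit app (the @st.experimental_singleton caches tolerate bare-script use, with a warning), the models download on first call, and the query string is made up. Note that hybrid_score_norm's docstring expects a flat array of floats, while create_dense_embeddings returns a nested list ([[...]]), hence the [0] below:

# Usage sketch (assumptions noted above); all functions are from utils.py.
from utils import (
    create_dense_embeddings,
    create_sparse_embeddings,
    get_mpnet_embedding_model,
    get_splade_sparse_embedding_model,
    hybrid_score_norm,
)

dense_model = get_mpnet_embedding_model()
sparse_model, sparse_tokenizer = get_splade_sparse_embedding_model()

query = "What drove AWS revenue growth?"  # illustrative query

# create_dense_embeddings returns [[floats]]; take the inner vector so the
# convex combination in hybrid_score_norm scales floats, not a list.
dense_vec = create_dense_embeddings(query, dense_model)[0]
sparse_vec = create_sparse_embeddings(query, sparse_model, sparse_tokenizer)
# sparse_vec is {"indices": [...], "values": [...]}, sorted by weight
# descending, the format Pinecone expects for its sparse_vector argument.

dense_vec, sparse_vec = hybrid_score_norm(dense_vec, sparse_vec, 0.3)
# The scaled pair can then be passed to query_pinecone_sparse(...) along
# with an initialized pinecone.Index and the metadata filter arguments.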