Spaces:
Sleeping
Sleeping
david-oplatka
commited on
Commit
•
c4f995d
1
Parent(s):
e763e8a
Upload query.py
Browse files
query.py
ADDED
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import json
|
3 |
+
import re
|
4 |
+
from urllib.parse import quote
|
5 |
+
|
6 |
+
def extract_between_tags(text, start_tag, end_tag):
|
7 |
+
start_index = text.find(start_tag)
|
8 |
+
end_index = text.find(end_tag, start_index)
|
9 |
+
return text[start_index+len(start_tag):end_index-len(end_tag)]
|
10 |
+
|
11 |
+
class CitationNormalizer():
|
12 |
+
|
13 |
+
def __init__(self, responses, docs):
|
14 |
+
self.docs = docs
|
15 |
+
self.responses = responses
|
16 |
+
self.refs = []
|
17 |
+
|
18 |
+
def normalize_citations(self, summary):
|
19 |
+
start_tag = "%START_SNIPPET%"
|
20 |
+
end_tag = "%END_SNIPPET%"
|
21 |
+
|
22 |
+
# find all references in the summary
|
23 |
+
pattern = r'\[\d{1,2}\]'
|
24 |
+
matches = [match.span() for match in re.finditer(pattern, summary)]
|
25 |
+
|
26 |
+
# figure out unique list of references
|
27 |
+
for match in matches:
|
28 |
+
start, end = match
|
29 |
+
response_num = int(summary[start+1:end-1])
|
30 |
+
doc_num = self.responses[response_num-1]['documentIndex']
|
31 |
+
metadata = {item['name']: item['value'] for item in self.docs[doc_num]['metadata']}
|
32 |
+
text = extract_between_tags(self.responses[response_num-1]['text'], start_tag, end_tag)
|
33 |
+
if 'url' in metadata.keys():
|
34 |
+
url = f"{metadata['url']}#:~:text={quote(text)}"
|
35 |
+
if url not in self.refs:
|
36 |
+
self.refs.append(url)
|
37 |
+
|
38 |
+
# replace references with markdown links
|
39 |
+
refs_dict = {url:(inx+1) for inx,url in enumerate(self.refs)}
|
40 |
+
for match in reversed(matches):
|
41 |
+
start, end = match
|
42 |
+
response_num = int(summary[start+1:end-1])
|
43 |
+
doc_num = self.responses[response_num-1]['documentIndex']
|
44 |
+
metadata = {item['name']: item['value'] for item in self.docs[doc_num]['metadata']}
|
45 |
+
text = extract_between_tags(self.responses[response_num-1]['text'], start_tag, end_tag)
|
46 |
+
if 'url' in metadata.keys():
|
47 |
+
url = f"{metadata['url']}#:~:text={quote(text)}"
|
48 |
+
citation_inx = refs_dict[url]
|
49 |
+
summary = summary[:start] + f'[\[{citation_inx}\]]({url})' + summary[end:]
|
50 |
+
else:
|
51 |
+
summary = summary[:start] + summary[end:]
|
52 |
+
|
53 |
+
return summary
|
54 |
+
|
55 |
+
class VectaraQuery():
|
56 |
+
def __init__(self, api_key: str, customer_id: str, corpus_ids: list[str], prompt_name: str = None):
|
57 |
+
self.customer_id = customer_id
|
58 |
+
self.corpus_ids = corpus_ids
|
59 |
+
self.api_key = api_key
|
60 |
+
self.prompt_name = prompt_name if prompt_name else "vectara-experimental-summary-ext-2023-12-11-sml"
|
61 |
+
self.conv_id = None
|
62 |
+
|
63 |
+
def get_body(self, query_str: str):
|
64 |
+
corpora_key_list = [{
|
65 |
+
'customer_id': self.customer_id, 'corpus_id': corpus_id, 'lexical_interpolation_config': {'lambda': 0.025}
|
66 |
+
} for corpus_id in self.corpus_ids
|
67 |
+
]
|
68 |
+
|
69 |
+
return {
|
70 |
+
'query': [
|
71 |
+
{
|
72 |
+
'query': query_str,
|
73 |
+
'start': 0,
|
74 |
+
'numResults': 50,
|
75 |
+
'corpusKey': corpora_key_list,
|
76 |
+
'context_config': {
|
77 |
+
'sentences_before': 2,
|
78 |
+
'sentences_after': 2,
|
79 |
+
'start_tag': "%START_SNIPPET%",
|
80 |
+
'end_tag': "%END_SNIPPET%",
|
81 |
+
},
|
82 |
+
'rerankingConfig':
|
83 |
+
{
|
84 |
+
'rerankerId': 272725718,
|
85 |
+
'mmrConfig': {
|
86 |
+
'diversityBias': 0.3
|
87 |
+
}
|
88 |
+
},
|
89 |
+
'summary': [
|
90 |
+
{
|
91 |
+
'responseLang': 'eng',
|
92 |
+
'maxSummarizedResults': 5,
|
93 |
+
'summarizerPromptName': self.prompt_name,
|
94 |
+
'chat': {
|
95 |
+
'store': True,
|
96 |
+
'conversationId': self.conv_id
|
97 |
+
},
|
98 |
+
}
|
99 |
+
]
|
100 |
+
}
|
101 |
+
]
|
102 |
+
}
|
103 |
+
|
104 |
+
def get_headers(self):
|
105 |
+
return {
|
106 |
+
"Content-Type": "application/json",
|
107 |
+
"Accept": "application/json",
|
108 |
+
"customer-id": self.customer_id,
|
109 |
+
"x-api-key": self.api_key,
|
110 |
+
"grpc-timeout": "60S"
|
111 |
+
}
|
112 |
+
|
113 |
+
def submit_query(self, query_str: str):
|
114 |
+
|
115 |
+
endpoint = f"https://api.vectara.io/v1/query"
|
116 |
+
body = self.get_body(query_str)
|
117 |
+
|
118 |
+
response = requests.post(endpoint, data=json.dumps(body), verify=True, headers=self.get_headers())
|
119 |
+
if response.status_code != 200:
|
120 |
+
print(f"Query failed with code {response.status_code}, reason {response.reason}, text {response.text}")
|
121 |
+
return "Sorry, something went wrong in my brain. Please try again later."
|
122 |
+
|
123 |
+
res = response.json()
|
124 |
+
|
125 |
+
top_k = 10
|
126 |
+
summary = res['responseSet'][0]['summary'][0]['text']
|
127 |
+
responses = res['responseSet'][0]['response'][:top_k]
|
128 |
+
docs = res['responseSet'][0]['document']
|
129 |
+
chat = res['responseSet'][0]['summary'][0].get('chat', None)
|
130 |
+
|
131 |
+
if chat and chat['status'] is not None:
|
132 |
+
st_code = chat['status']
|
133 |
+
print(f"Chat query failed with code {st_code}")
|
134 |
+
if st_code == 'RESOURCE_EXHAUSTED':
|
135 |
+
self.conv_id = None
|
136 |
+
return 'Sorry, Vectara chat turns exceeds plan limit.'
|
137 |
+
return 'Sorry, something went wrong in my brain. Please try again later.'
|
138 |
+
|
139 |
+
self.conv_id = chat['conversationId'] if chat else None
|
140 |
+
summary = CitationNormalizer(responses, docs).normalize_citations(summary)
|
141 |
+
return summary
|
142 |
+
|
143 |
+
def submit_query_streaming(self, query_str: str):
|
144 |
+
|
145 |
+
endpoint = f"https://api.vectara.io/v1/stream-query"
|
146 |
+
body = self.get_body(query_str)
|
147 |
+
|
148 |
+
response = requests.post(endpoint, data=json.dumps(body), verify=True, headers=self.get_headers(), stream=True)
|
149 |
+
if response.status_code != 200:
|
150 |
+
print(f"Query failed with code {response.status_code}, reason {response.reason}, text {response.text}")
|
151 |
+
return "Sorry, something went wrong in my brain. Please try again later."
|
152 |
+
|
153 |
+
chunks = []
|
154 |
+
accumulated_text = "" # Initialize text accumulation
|
155 |
+
pattern_max_length = 50 # Example heuristic
|
156 |
+
for line in response.iter_lines():
|
157 |
+
if line: # filter out keep-alive new lines
|
158 |
+
data = json.loads(line.decode('utf-8'))
|
159 |
+
res = data['result']
|
160 |
+
response_set = res['responseSet']
|
161 |
+
if response_set is None:
|
162 |
+
# grab next chunk and yield it as output
|
163 |
+
summary = res.get('summary', None)
|
164 |
+
if summary is None or len(summary)==0:
|
165 |
+
continue
|
166 |
+
else:
|
167 |
+
chat = summary.get('chat', None)
|
168 |
+
if chat and chat.get('status', None):
|
169 |
+
st_code = chat['status']
|
170 |
+
print(f"Chat query failed with code {st_code}")
|
171 |
+
if st_code == 'RESOURCE_EXHAUSTED':
|
172 |
+
self.conv_id = None
|
173 |
+
return 'Sorry, Vectara chat turns exceeds plan limit.'
|
174 |
+
return 'Sorry, something went wrong in my brain. Please try again later.'
|
175 |
+
conv_id = chat.get('conversationId', None) if chat else None
|
176 |
+
if conv_id:
|
177 |
+
self.conv_id = conv_id
|
178 |
+
|
179 |
+
chunk = summary['text']
|
180 |
+
accumulated_text += chunk # Append current chunk to accumulation
|
181 |
+
if len(accumulated_text) > pattern_max_length:
|
182 |
+
accumulated_text = re.sub(r"\[\d+\]", "", accumulated_text)
|
183 |
+
accumulated_text = re.sub(r"\s+\.", ".", accumulated_text)
|
184 |
+
out_chunk = accumulated_text[:-pattern_max_length]
|
185 |
+
chunks.append(out_chunk)
|
186 |
+
yield out_chunk
|
187 |
+
accumulated_text = accumulated_text[-pattern_max_length:]
|
188 |
+
|
189 |
+
if summary['done']:
|
190 |
+
break
|
191 |
+
|
192 |
+
# yield the last piece
|
193 |
+
if len(accumulated_text) > 0:
|
194 |
+
accumulated_text = re.sub(r" \[\d+\]\.", ".", accumulated_text)
|
195 |
+
chunks.append(accumulated_text)
|
196 |
+
yield accumulated_text
|
197 |
+
|
198 |
+
return ''.join(chunks)
|