Spaces:
Sleeping
Sleeping
david-oplatka
committed on
Commit
•
9cc5d1c
1
Parent(s):
3ec200d
Update query.py
Browse files
query.py
CHANGED
@@ -1,56 +1,6 @@
|
|
1 |
import requests
|
2 |
import json
|
3 |
-
import re
|
4 |
-
from urllib.parse import quote
|
5 |
-
|
6 |
-
def extract_between_tags(text, start_tag, end_tag):
|
7 |
-
start_index = text.find(start_tag)
|
8 |
-
end_index = text.find(end_tag, start_index)
|
9 |
-
return text[start_index+len(start_tag):end_index-len(end_tag)]
|
10 |
-
|
11 |
-
class CitationNormalizer():
|
12 |
-
|
13 |
-
def __init__(self, responses, docs):
|
14 |
-
self.docs = docs
|
15 |
-
self.responses = responses
|
16 |
-
self.refs = []
|
17 |
-
|
18 |
-
def normalize_citations(self, summary):
|
19 |
-
start_tag = "%START_SNIPPET%"
|
20 |
-
end_tag = "%END_SNIPPET%"
|
21 |
-
|
22 |
-
# find all references in the summary
|
23 |
-
pattern = r'\[\d{1,2}\]'
|
24 |
-
matches = [match.span() for match in re.finditer(pattern, summary)]
|
25 |
-
|
26 |
-
# figure out unique list of references
|
27 |
-
for match in matches:
|
28 |
-
start, end = match
|
29 |
-
response_num = int(summary[start+1:end-1])
|
30 |
-
doc_num = self.responses[response_num-1]['documentIndex']
|
31 |
-
metadata = {item['name']: item['value'] for item in self.docs[doc_num]['metadata']}
|
32 |
-
text = extract_between_tags(self.responses[response_num-1]['text'], start_tag, end_tag)
|
33 |
-
if 'url' in metadata.keys():
|
34 |
-
url = f"{metadata['url']}#:~:text={quote(text)}"
|
35 |
-
if url not in self.refs:
|
36 |
-
self.refs.append(url)
|
37 |
-
|
38 |
-
# replace references with markdown links
|
39 |
-
refs_dict = {url:(inx+1) for inx,url in enumerate(self.refs)}
|
40 |
-
for match in reversed(matches):
|
41 |
-
start, end = match
|
42 |
-
response_num = int(summary[start+1:end-1])
|
43 |
-
doc_num = self.responses[response_num-1]['documentIndex']
|
44 |
-
metadata = {item['name']: item['value'] for item in self.docs[doc_num]['metadata']}
|
45 |
-
text = extract_between_tags(self.responses[response_num-1]['text'], start_tag, end_tag)
|
46 |
-
if 'url' in metadata.keys():
|
47 |
-
url = f"{metadata['url']}#:~:text={quote(text)}"
|
48 |
-
citation_inx = refs_dict[url]
|
49 |
-
summary = summary[:start] + f'[\[{citation_inx}\]]({url})' + summary[end:]
|
50 |
-
else:
|
51 |
-
summary = summary[:start] + summary[end:]
|
52 |
|
53 |
-
return summary
|
54 |
|
55 |
class VectaraQuery():
|
56 |
def __init__(self, api_key: str, customer_id: str, corpus_ids: list[str], prompt_name: str = None):
|
@@ -62,7 +12,7 @@ class VectaraQuery():
|
|
62 |
|
63 |
def get_body(self, query_str: str):
|
64 |
corpora_key_list = [{
|
65 |
-
'customer_id': self.customer_id, 'corpus_id': corpus_id, 'lexical_interpolation_config': {'lambda': 0.
|
66 |
} for corpus_id in self.corpus_ids
|
67 |
]
|
68 |
|
@@ -81,25 +31,26 @@ class VectaraQuery():
|
|
81 |
},
|
82 |
'rerankingConfig':
|
83 |
{
|
84 |
-
'rerankerId':
|
85 |
-
'mmrConfig': {
|
86 |
-
'diversityBias': 0.3
|
87 |
-
}
|
88 |
},
|
89 |
'summary': [
|
90 |
{
|
91 |
'responseLang': 'eng',
|
92 |
-
'maxSummarizedResults':
|
93 |
'summarizerPromptName': self.prompt_name,
|
94 |
'chat': {
|
95 |
'store': True,
|
96 |
'conversationId': self.conv_id
|
97 |
},
|
|
|
|
|
|
|
98 |
}
|
99 |
]
|
100 |
}
|
101 |
]
|
102 |
}
|
|
|
103 |
|
104 |
def get_headers(self):
|
105 |
return {
|
@@ -122,10 +73,7 @@ class VectaraQuery():
|
|
122 |
|
123 |
res = response.json()
|
124 |
|
125 |
-
top_k = 10
|
126 |
summary = res['responseSet'][0]['summary'][0]['text']
|
127 |
-
responses = res['responseSet'][0]['response'][:top_k]
|
128 |
-
docs = res['responseSet'][0]['document']
|
129 |
chat = res['responseSet'][0]['summary'][0].get('chat', None)
|
130 |
|
131 |
if chat and chat['status'] is not None:
|
@@ -137,12 +85,11 @@ class VectaraQuery():
|
|
137 |
return 'Sorry, something went wrong in my brain. Please try again later.'
|
138 |
|
139 |
self.conv_id = chat['conversationId'] if chat else None
|
140 |
-
summary = CitationNormalizer(responses, docs).normalize_citations(summary)
|
141 |
return summary
|
142 |
|
143 |
def submit_query_streaming(self, query_str: str):
|
144 |
|
145 |
-
endpoint =
|
146 |
body = self.get_body(query_str)
|
147 |
|
148 |
response = requests.post(endpoint, data=json.dumps(body), verify=True, headers=self.get_headers(), stream=True)
|
@@ -151,8 +98,6 @@ class VectaraQuery():
|
|
151 |
return "Sorry, something went wrong in my brain. Please try again later."
|
152 |
|
153 |
chunks = []
|
154 |
-
accumulated_text = "" # Initialize text accumulation
|
155 |
-
pattern_max_length = 50 # Example heuristic
|
156 |
for line in response.iter_lines():
|
157 |
if line: # filter out keep-alive new lines
|
158 |
data = json.loads(line.decode('utf-8'))
|
@@ -177,22 +122,10 @@ class VectaraQuery():
|
|
177 |
self.conv_id = conv_id
|
178 |
|
179 |
chunk = summary['text']
|
180 |
-
|
181 |
-
|
182 |
-
accumulated_text = re.sub(r"\[\d+\]", "", accumulated_text)
|
183 |
-
accumulated_text = re.sub(r"\s+\.", ".", accumulated_text)
|
184 |
-
out_chunk = accumulated_text[:-pattern_max_length]
|
185 |
-
chunks.append(out_chunk)
|
186 |
-
yield out_chunk
|
187 |
-
accumulated_text = accumulated_text[-pattern_max_length:]
|
188 |
|
189 |
if summary['done']:
|
190 |
break
|
191 |
-
|
192 |
-
# yield the last piece
|
193 |
-
if len(accumulated_text) > 0:
|
194 |
-
accumulated_text = re.sub(r" \[\d+\]\.", ".", accumulated_text)
|
195 |
-
chunks.append(accumulated_text)
|
196 |
-
yield accumulated_text
|
197 |
|
198 |
return ''.join(chunks)
|
|
|
1 |
import requests
|
2 |
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
|
|
4 |
|
5 |
class VectaraQuery():
|
6 |
def __init__(self, api_key: str, customer_id: str, corpus_ids: list[str], prompt_name: str = None):
|
|
|
12 |
|
13 |
def get_body(self, query_str: str):
|
14 |
corpora_key_list = [{
|
15 |
+
'customer_id': self.customer_id, 'corpus_id': corpus_id, 'lexical_interpolation_config': {'lambda': 0.005}
|
16 |
} for corpus_id in self.corpus_ids
|
17 |
]
|
18 |
|
|
|
31 |
},
|
32 |
'rerankingConfig':
|
33 |
{
|
34 |
+
'rerankerId': 272725719,
|
|
|
|
|
|
|
35 |
},
|
36 |
'summary': [
|
37 |
{
|
38 |
'responseLang': 'eng',
|
39 |
+
'maxSummarizedResults': 10,
|
40 |
'summarizerPromptName': self.prompt_name,
|
41 |
'chat': {
|
42 |
'store': True,
|
43 |
'conversationId': self.conv_id
|
44 |
},
|
45 |
+
'citationParams': {
|
46 |
+
"style": "NONE",
|
47 |
+
}
|
48 |
}
|
49 |
]
|
50 |
}
|
51 |
]
|
52 |
}
|
53 |
+
|
54 |
|
55 |
def get_headers(self):
|
56 |
return {
|
|
|
73 |
|
74 |
res = response.json()
|
75 |
|
|
|
76 |
summary = res['responseSet'][0]['summary'][0]['text']
|
|
|
|
|
77 |
chat = res['responseSet'][0]['summary'][0].get('chat', None)
|
78 |
|
79 |
if chat and chat['status'] is not None:
|
|
|
85 |
return 'Sorry, something went wrong in my brain. Please try again later.'
|
86 |
|
87 |
self.conv_id = chat['conversationId'] if chat else None
|
|
|
88 |
return summary
|
89 |
|
90 |
def submit_query_streaming(self, query_str: str):
|
91 |
|
92 |
+
endpoint = "https://api.vectara.io/v1/stream-query"
|
93 |
body = self.get_body(query_str)
|
94 |
|
95 |
response = requests.post(endpoint, data=json.dumps(body), verify=True, headers=self.get_headers(), stream=True)
|
|
|
98 |
return "Sorry, something went wrong in my brain. Please try again later."
|
99 |
|
100 |
chunks = []
|
|
|
|
|
101 |
for line in response.iter_lines():
|
102 |
if line: # filter out keep-alive new lines
|
103 |
data = json.loads(line.decode('utf-8'))
|
|
|
122 |
self.conv_id = conv_id
|
123 |
|
124 |
chunk = summary['text']
|
125 |
+
chunks.append(chunk)
|
126 |
+
yield chunk
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
|
128 |
if summary['done']:
|
129 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
|
131 |
return ''.join(chunks)
|