david-oplatka committed on
Commit: 9cc5d1c
Parent: 3ec200d

Update query.py

Files changed (1)
  1. query.py +10 -77
query.py CHANGED
@@ -1,56 +1,6 @@
  import requests
  import json
- import re
- from urllib.parse import quote
-
- def extract_between_tags(text, start_tag, end_tag):
-     start_index = text.find(start_tag)
-     end_index = text.find(end_tag, start_index)
-     return text[start_index+len(start_tag):end_index-len(end_tag)]
-
- class CitationNormalizer():
-
-     def __init__(self, responses, docs):
-         self.docs = docs
-         self.responses = responses
-         self.refs = []
-
-     def normalize_citations(self, summary):
-         start_tag = "%START_SNIPPET%"
-         end_tag = "%END_SNIPPET%"
-
-         # find all references in the summary
-         pattern = r'\[\d{1,2}\]'
-         matches = [match.span() for match in re.finditer(pattern, summary)]
-
-         # figure out unique list of references
-         for match in matches:
-             start, end = match
-             response_num = int(summary[start+1:end-1])
-             doc_num = self.responses[response_num-1]['documentIndex']
-             metadata = {item['name']: item['value'] for item in self.docs[doc_num]['metadata']}
-             text = extract_between_tags(self.responses[response_num-1]['text'], start_tag, end_tag)
-             if 'url' in metadata.keys():
-                 url = f"{metadata['url']}#:~:text={quote(text)}"
-                 if url not in self.refs:
-                     self.refs.append(url)
-
-         # replace references with markdown links
-         refs_dict = {url:(inx+1) for inx,url in enumerate(self.refs)}
-         for match in reversed(matches):
-             start, end = match
-             response_num = int(summary[start+1:end-1])
-             doc_num = self.responses[response_num-1]['documentIndex']
-             metadata = {item['name']: item['value'] for item in self.docs[doc_num]['metadata']}
-             text = extract_between_tags(self.responses[response_num-1]['text'], start_tag, end_tag)
-             if 'url' in metadata.keys():
-                 url = f"{metadata['url']}#:~:text={quote(text)}"
-                 citation_inx = refs_dict[url]
-                 summary = summary[:start] + f'[\[{citation_inx}\]]({url})' + summary[end:]
-             else:
-                 summary = summary[:start] + summary[end:]

-         return summary

  class VectaraQuery():
      def __init__(self, api_key: str, customer_id: str, corpus_ids: list[str], prompt_name: str = None):
@@ -62,7 +12,7 @@ class VectaraQuery():

      def get_body(self, query_str: str):
          corpora_key_list = [{
-                 'customer_id': self.customer_id, 'corpus_id': corpus_id, 'lexical_interpolation_config': {'lambda': 0.025}
+                 'customer_id': self.customer_id, 'corpus_id': corpus_id, 'lexical_interpolation_config': {'lambda': 0.005}
              } for corpus_id in self.corpus_ids
          ]

@@ -81,25 +31,26 @@ class VectaraQuery():
                      },
                      'rerankingConfig':
                      {
-                         'rerankerId': 272725718,
-                         'mmrConfig': {
-                             'diversityBias': 0.3
-                         }
+                         'rerankerId': 272725719,
                      },
                      'summary': [
                          {
                              'responseLang': 'eng',
-                             'maxSummarizedResults': 5,
+                             'maxSummarizedResults': 10,
                              'summarizerPromptName': self.prompt_name,
                              'chat': {
                                  'store': True,
                                  'conversationId': self.conv_id
                              },
+                             'citationParams': {
+                                 "style": "NONE",
+                             }
                          }
                      ]
                  }
              ]
          }
+

      def get_headers(self):
          return {
@@ -122,10 +73,7 @@ class VectaraQuery():

          res = response.json()

-         top_k = 10
          summary = res['responseSet'][0]['summary'][0]['text']
-         responses = res['responseSet'][0]['response'][:top_k]
-         docs = res['responseSet'][0]['document']
          chat = res['responseSet'][0]['summary'][0].get('chat', None)

          if chat and chat['status'] is not None:
@@ -137,12 +85,11 @@ class VectaraQuery():
              return 'Sorry, something went wrong in my brain. Please try again later.'

          self.conv_id = chat['conversationId'] if chat else None
-         summary = CitationNormalizer(responses, docs).normalize_citations(summary)
          return summary

      def submit_query_streaming(self, query_str: str):

-         endpoint = f"https://api.vectara.io/v1/stream-query"
+         endpoint = "https://api.vectara.io/v1/stream-query"
          body = self.get_body(query_str)

          response = requests.post(endpoint, data=json.dumps(body), verify=True, headers=self.get_headers(), stream=True)
@@ -151,8 +98,6 @@ class VectaraQuery():
              return "Sorry, something went wrong in my brain. Please try again later."

          chunks = []
-         accumulated_text = ""  # Initialize text accumulation
-         pattern_max_length = 50  # Example heuristic
          for line in response.iter_lines():
              if line:  # filter out keep-alive new lines
                  data = json.loads(line.decode('utf-8'))
@@ -177,22 +122,10 @@ class VectaraQuery():
                              self.conv_id = conv_id

                          chunk = summary['text']
-                         accumulated_text += chunk  # Append current chunk to accumulation
-                         if len(accumulated_text) > pattern_max_length:
-                             accumulated_text = re.sub(r"\[\d+\]", "", accumulated_text)
-                             accumulated_text = re.sub(r"\s+\.", ".", accumulated_text)
-                             out_chunk = accumulated_text[:-pattern_max_length]
-                             chunks.append(out_chunk)
-                             yield out_chunk
-                             accumulated_text = accumulated_text[-pattern_max_length:]
+                         chunks.append(chunk)
+                         yield chunk

                          if summary['done']:
                              break
-
-         # yield the last piece
-         if len(accumulated_text) > 0:
-             accumulated_text = re.sub(r" \[\d+\]\.", ".", accumulated_text)
-             chunks.append(accumulated_text)
-             yield accumulated_text

          return ''.join(chunks)
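
For reference, a minimal sketch of how the VectaraQuery class touched by this commit can be driven, assuming query.py is importable as a module named query; the API key, customer ID, and corpus IDs below are placeholders, not values from this repo:

# Minimal usage sketch (placeholder credentials; substitute real Vectara values).
from query import VectaraQuery

query = VectaraQuery(
    api_key="zqt_EXAMPLE_API_KEY",   # placeholder API key
    customer_id="1234567890",        # placeholder customer ID
    corpus_ids=["1"],                # placeholder corpus ID list
)

# submit_query_streaming is a generator: it POSTs to the v1 stream-query
# endpoint and yields each summary text chunk as the API streams it back.
for chunk in query.submit_query_streaming("What is this corpus about?"):
    print(chunk, end="", flush=True)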