github-actions committed on
Commit e63fa66
Parent: 7c382ab

Sync updates from source repository

Files changed (1)
  1. query.py +10 -74
query.py CHANGED
@@ -1,56 +1,6 @@
 import requests
 import json
-import re
-from urllib.parse import quote
-
-def extract_between_tags(text, start_tag, end_tag):
-    start_index = text.find(start_tag)
-    end_index = text.find(end_tag, start_index)
-    return text[start_index+len(start_tag):end_index-len(end_tag)]
-
-class CitationNormalizer():
-
-    def __init__(self, responses, docs):
-        self.docs = docs
-        self.responses = responses
-        self.refs = []
-
-    def normalize_citations(self, summary):
-        start_tag = "%START_SNIPPET%"
-        end_tag = "%END_SNIPPET%"
-
-        # find all references in the summary
-        pattern = r'\[\d{1,2}\]'
-        matches = [match.span() for match in re.finditer(pattern, summary)]
-
-        # figure out unique list of references
-        for match in matches:
-            start, end = match
-            response_num = int(summary[start+1:end-1])
-            doc_num = self.responses[response_num-1]['documentIndex']
-            metadata = {item['name']: item['value'] for item in self.docs[doc_num]['metadata']}
-            text = extract_between_tags(self.responses[response_num-1]['text'], start_tag, end_tag)
-            if 'url' in metadata.keys():
-                url = f"{metadata['url']}#:~:text={quote(text)}"
-                if url not in self.refs:
-                    self.refs.append(url)
-
-        # replace references with markdown links
-        refs_dict = {url:(inx+1) for inx,url in enumerate(self.refs)}
-        for match in reversed(matches):
-            start, end = match
-            response_num = int(summary[start+1:end-1])
-            doc_num = self.responses[response_num-1]['documentIndex']
-            metadata = {item['name']: item['value'] for item in self.docs[doc_num]['metadata']}
-            text = extract_between_tags(self.responses[response_num-1]['text'], start_tag, end_tag)
-            if 'url' in metadata.keys():
-                url = f"{metadata['url']}#:~:text={quote(text)}"
-                citation_inx = refs_dict[url]
-                summary = summary[:start] + f'[\[{citation_inx}\]]({url})' + summary[end:]
-            else:
-                summary = summary[:start] + summary[end:]
 
-        return summary
 
 class VectaraQuery():
     def __init__(self, api_key: str, customer_id: str, corpus_ids: list[str], prompt_name: str = None):
@@ -82,9 +32,6 @@ class VectaraQuery():
                     'rerankingConfig':
                     {
                         'rerankerId': 272725719,
-                        'mmrConfig': {
-                            'diversityBias': 0.3
-                        }
                     },
                     'summary': [
                         {
@@ -95,11 +42,15 @@ class VectaraQuery():
                                 'store': True,
                                 'conversationId': self.conv_id
                             },
+                            'citationParams': {
+                                "style": "NONE",
+                            }
                         }
                     ]
                 }
             ]
         }
+
 
     def get_headers(self):
        return {
@@ -122,10 +73,10 @@ class VectaraQuery():
 
         res = response.json()
 
-        top_k = 10
+        #top_k = 10
         summary = res['responseSet'][0]['summary'][0]['text']
-        responses = res['responseSet'][0]['response'][:top_k]
-        docs = res['responseSet'][0]['document']
+        #responses = res['responseSet'][0]['response'][:top_k]
+        #docs = res['responseSet'][0]['document']
         chat = res['responseSet'][0]['summary'][0].get('chat', None)
 
         if chat and chat['status'] is not None:
@@ -137,12 +88,11 @@ class VectaraQuery():
             return 'Sorry, something went wrong in my brain. Please try again later.'
 
         self.conv_id = chat['conversationId'] if chat else None
-        summary = CitationNormalizer(responses, docs).normalize_citations(summary)
         return summary
 
     def submit_query_streaming(self, query_str: str):
 
-        endpoint = f"https://api.vectara.io/v1/stream-query"
+        endpoint = "https://api.vectara.io/v1/stream-query"
         body = self.get_body(query_str)
 
         response = requests.post(endpoint, data=json.dumps(body), verify=True, headers=self.get_headers(), stream=True)
@@ -151,8 +101,6 @@ class VectaraQuery():
             return "Sorry, something went wrong in my brain. Please try again later."
 
         chunks = []
-        accumulated_text = "" # Initialize text accumulation
-        pattern_max_length = 50 # Example heuristic
         for line in response.iter_lines():
             if line: # filter out keep-alive new lines
                 data = json.loads(line.decode('utf-8'))
@@ -177,22 +125,10 @@ class VectaraQuery():
                     self.conv_id = conv_id
 
                 chunk = summary['text']
-                accumulated_text += chunk # Append current chunk to accumulation
-                if len(accumulated_text) > pattern_max_length:
-                    accumulated_text = re.sub(r"\[\d+\]", "", accumulated_text)
-                    accumulated_text = re.sub(r"\s+\.", ".", accumulated_text)
-                    out_chunk = accumulated_text[:-pattern_max_length]
-                    chunks.append(out_chunk)
-                    yield out_chunk
-                    accumulated_text = accumulated_text[-pattern_max_length:]
+                chunks.append(chunk)
+                yield chunk
 
                 if summary['done']:
                     break
-
-        # yield the last piece
-        if len(accumulated_text) > 0:
-            accumulated_text = re.sub(r" \[\d+\]\.", ".", accumulated_text)
-            chunks.append(accumulated_text)
-            yield accumulated_text
 
         return ''.join(chunks)
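For context, the removed CitationNormalizer rewrote numeric markers like [1] in the summary into markdown links built on scroll-to-text URL fragments, so a browser following the link jumps to and highlights the cited snippet. Below is a minimal self-contained sketch of just that linking step; the document URL and snippet are hypothetical stand-ins for values that, in the removed code, came from the document metadata and the %START_SNIPPET%/%END_SNIPPET% span of the matched response:

    from urllib.parse import quote

    # Hypothetical inputs, for illustration only.
    summary = "Vectara offers RAG as a service [1]."
    doc_url = "https://example.com/overview"
    snippet = "RAG as a service"

    # Build a scroll-to-text deep link and swap it in for the bare marker.
    deep_link = f"{doc_url}#:~:text={quote(snippet)}"
    summary = summary.replace("[1]", f"[\\[1\\]]({deep_link})")
    print(summary)
    # Vectara offers RAG as a service [\[1\]](https://example.com/overview#:~:text=RAG%20as%20a%20service).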
 
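And a minimal usage sketch of the streaming path as it now stands, with placeholder credentials (the ID values and query are illustrative; the constructor signature and submit_query_streaming match the file above). With citations disabled server-side via the 'citationParams' style "NONE", each yielded chunk is plain summary text and can be printed as-is:

    from query import VectaraQuery

    # Placeholder credentials; real values come from a Vectara account.
    vq = VectaraQuery(api_key="<API_KEY>", customer_id="<CUSTOMER_ID>",
                      corpus_ids=["<CORPUS_ID>"])

    # Chunks arrive incrementally; the generator also accumulates them and
    # returns the joined text once the stream reports done.
    for chunk in vq.submit_query_streaming("What does Vectara do?"):
        print(chunk, end="", flush=True)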