ola13 committed on
Commit
6c25d13
1 Parent(s): 394e502

add exact match metadata

Browse files
Files changed (1) hide show
  1. app.py +98 -112
app.py CHANGED
@@ -3,6 +3,8 @@ import os
3
  import gradio as gr
4
  import requests
5
  from huggingface_hub import HfApi
 
 
6
 
7
  hf_api = HfApi()
8
  roots_datasets = {
@@ -52,11 +54,32 @@ def process_pii(text):
52
  return text
53
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def process_results(results, highlight_terms):
56
  if len(results) == 0:
57
  return """<br><p style='font-family: Arial; color:Silver; text-align: center;'>
58
  No results retrieved.</p><br><hr>"""
59
-
60
  results_html = ""
61
  for result in results:
62
  tokens = result["text"].split()
@@ -68,136 +91,102 @@ def process_results(results, highlight_terms):
68
  tokens_html.append(token)
69
  tokens_html = " ".join(tokens_html)
70
  tokens_html = process_pii(tokens_html)
71
- meta_html = (
72
- """
73
- <p class='underline-on-hover' style='font-size:12px; font-family: Arial; color:#585858; text-align: left;'>
74
- <a href='{}' target='_blank'>{}</a></p>""".format(
75
- result["meta"]["url"], result["meta"]["url"]
76
- )
77
- if "meta" in result
78
- and result["meta"] is not None
79
- and "url" in result["meta"]
80
- else ""
81
- )
82
- docid_html = get_docid_html(result["docid"])
83
- results_html += """{}
84
- <p style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {}</p>
85
- <p style='font-size:12px; font-family: Arial; color:MediumAquaMarine'>Language: {}</p>
86
  <p style='font-family: Arial;'>{}</p>
87
  <br>
88
  """.format(
89
- meta_html, docid_html, result["lang"], tokens_html
90
  )
 
91
  return results_html + "<hr>"
92
 
93
 
94
- def scisearch(query, language, num_results=10, exact_search=False):
95
- try:
96
- query = " ".join(query.split())
97
- if query == "" or query is None:
98
- return ""
99
-
100
- post_data = {"query": query, "k": num_results, "exact_search": exact_search}
101
- if language != "detect_language":
102
- post_data["lang"] = language
103
-
104
- output = requests.post(
105
- os.environ.get("address"),
106
- headers={"Content-type": "application/json"},
107
- data=json.dumps(post_data),
108
- timeout=60,
109
- )
110
 
111
- payload = json.loads(output.text)
112
 
113
- if "err" in payload:
114
- if payload["err"]["type"] == "unsupported_lang":
115
- detected_lang = payload["err"]["meta"]["detected_lang"]
116
- return f"""
117
- <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
118
- Detected language <b>{detected_lang}</b> is not supported.<br>
119
- Please choose a language from the dropdown or type another query.
120
- </p><br><hr><br>"""
121
-
122
- results = payload["results"]
123
- highlight_terms = payload["highlight_terms"]
124
-
125
- if language == "detect_language":
126
- return (
127
- (
128
- f"""<p style='font-family: Arial; color:MediumAquaMarine; text-align: center; line-height: 3em'>
129
- Detected language: <b>{results[0]["lang"]}</b></p><br><hr><br>"""
130
- if len(results) > 0 and language == "detect_language"
131
- else ""
132
- )
133
- + process_results(results, highlight_terms)
134
  )
 
 
135
 
136
- if language == "all":
137
- results_html = ""
138
- for lang, results_for_lang in results.items():
139
- if len(results_for_lang) == 0:
140
- results_html += f"""<p style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
141
- No results for language: <b>{lang}</b><hr></p>"""
142
- continue
143
-
144
- collapsible_results = f"""
145
- <details>
146
- <summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
147
- Results for language: <b>{lang}</b><hr>
148
- </summary>
149
- {process_results(results_for_lang, highlight_terms)}
150
- </details>"""
151
- results_html += collapsible_results
152
- return results_html
153
-
154
- return process_results(results, highlight_terms)
155
 
156
- except Exception as e:
157
- results_html = f"""
158
- <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
159
- Raised {type(e).__name__}</p>
160
- <p style='font-size:14px; font-family: Arial; '>
161
- Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.
162
- </p>
163
- """
 
164
 
165
- return results_html
166
 
167
 
168
- def perform_exact_search(query, num_results=10):
169
  try:
170
- print("perform_exact_search")
171
  query = " ".join(query.split())
172
  if query == "" or query is None:
173
  return ""
174
-
175
- post_data = {"query": query, "k": num_results, "exact_search": True}
176
-
177
- print("post_data", post_data)
178
-
 
179
  output = requests.post(
180
- "http://34.105.160.81:8080",
181
  headers={"Content-type": "application/json"},
182
  data=json.dumps(post_data),
183
  timeout=60,
184
  )
185
-
186
  payload = json.loads(output.text)
187
- results = payload["results"]
188
-
189
- results_html = ""
190
- for result in results:
191
- print(result)
192
- result_html = """<br><hr><br>"""
193
- query_start = result.find(query)
194
- query_end = query_start + len(query)
195
- result_html += result[0:query_start]
196
- result_html += "<b>{}</b>".format(result[query_start:query_end])
197
- result_html += result[query_end:]
198
- results_html += result_html
199
- return results_html + "<hr>"
200
-
201
  except Exception as e:
202
  results_html = f"""
203
  <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
@@ -206,6 +195,9 @@ def perform_exact_search(query, num_results=10):
206
  Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.
207
  </p>
208
  """
 
 
 
209
 
210
 
211
  def flag(query, language, num_results, issue_description):
@@ -308,12 +300,6 @@ if __name__ == "__main__":
308
  query = query.strip()
309
  if query is None or query == "":
310
  return "", ""
311
-
312
- if exact_search:
313
- return {
314
- results: perform_exact_search(query, k),
315
- flagging_form: gr.update(visible=True),
316
- }
317
  return {
318
  results: scisearch(query, lang, k, exact_search),
319
  flagging_form: gr.update(visible=True),
 
3
  import gradio as gr
4
  import requests
5
  from huggingface_hub import HfApi
6
+ import traceback
7
+
8
 
9
  hf_api = HfApi()
10
  roots_datasets = {
 
54
  return text
55
 
56
 
57
def format_meta(result):
    """Render the metadata header (source link, document ID, language) of one hit.

    Args:
        result: Mapping describing a single search result. ``"docid"`` is
            required. ``"meta"`` (optionally holding ``"url"``) and ``"lang"``
            may be absent.

    Returns:
        An HTML fragment: an optional source link, then the document ID line
        and the language line. A missing language is rendered as ``None``.
    """
    # Local import keeps this block self-contained.
    import html

    meta = result.get("meta")
    if meta is not None and "url" in meta:
        # The URL comes from crawled data: escape it so it cannot break out of
        # the quoted href attribute or inject markup into the link text.
        url = html.escape(str(meta["url"]), quote=True)
        link_html = """
            <p class='underline-on-hover' style='font-size:12px; font-family: Arial; color:#585858; text-align: left;'>
            <a href='{}' target='_blank'>{}</a></p>""".format(url, url)
    else:
        link_html = ""

    docid_html = get_docid_html(result["docid"])
    # Look the language up by its *key*; the previous `lang in result` referenced
    # an undefined name and raised NameError on every call.
    language = result.get("lang")
    return """{}
        <p style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {}</p>
        <p style='font-size:12px; font-family: Arial; color:MediumAquaMarine'>Language: {}</p>
        """.format(link_html, docid_html, language)
77
+
78
+
79
  def process_results(results, highlight_terms):
80
  if len(results) == 0:
81
  return """<br><p style='font-family: Arial; color:Silver; text-align: center;'>
82
  No results retrieved.</p><br><hr>"""
 
83
  results_html = ""
84
  for result in results:
85
  tokens = result["text"].split()
 
91
  tokens_html.append(token)
92
  tokens_html = " ".join(tokens_html)
93
  tokens_html = process_pii(tokens_html)
94
+ meta_html = format_meta(result)
95
+ meta_html += """
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  <p style='font-family: Arial;'>{}</p>
97
  <br>
98
  """.format(
99
+ tokens_html
100
  )
101
+ results_html += meta_html
102
  return results_html + "<hr>"
103
 
104
 
105
def process_exact_match_payload(payload, query):
    """Render the hits of an exact-match search as HTML.

    Args:
        payload: Decoded backend response; ``payload["results"]`` is an
            iterable of result mappings, each with a ``"text"`` entry plus
            whatever ``format_meta`` reads.
        query: The query string whose first occurrence in each text is bolded.

    Returns:
        Concatenated HTML for all hits followed by a closing ``<hr>``.
        With no hits this is just ``"<hr>"``.
    """
    rendered = []
    for result in payload["results"]:
        text = result["text"]
        header = "<br><hr><br>" + format_meta(result)

        start = text.find(query)
        if start == -1:
            # str.find signals "not found" with -1. Using that as a slice index
            # would move the last character and bold the wrong span, so the
            # text is emitted unhighlighted instead.
            body = text
        else:
            end = start + len(query)
            body = "{}<b>{}</b>{}".format(text[:start], text[start:end], text[end:])

        # NOTE(review): `text` is inserted as raw HTML, as the original did.
        # Confirm whether corpus text should be escaped before display.
        rendered.append(header + body)

    return "".join(rendered) + "<hr>"
 
120
 
 
121
 
122
def process_bm25_match_payload(payload, language):
    """Turn a BM25 backend response into the HTML shown in the results pane.

    Args:
        payload: Decoded JSON response. It carries either an ``"err"`` entry
            or ``"results"`` together with ``"highlight_terms"``.
        language: Language selector used for the query. ``"detect_language"``
            prepends a banner with the language of the first hit; ``"all"``
            expects ``results`` to be a mapping of language -> hits and renders
            one collapsible section per language; any other value renders the
            hits directly.

    Returns:
        An HTML string.

    Raises:
        RuntimeError: The backend reported an error other than
            ``unsupported_lang`` and sent no results. Previously this surfaced
            as an uninformative ``KeyError`` on ``payload["results"]``.
    """
    error = payload.get("err")
    if error is not None:
        if error["type"] == "unsupported_lang":
            detected_lang = error["meta"]["detected_lang"]
            return f"""
                <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
                Detected language <b>{detected_lang}</b> is not supported.<br>
                Please choose a language from the dropdown or type another query.
                </p><br><hr><br>"""
        if "results" not in payload:
            # Keep the backend's own error details visible in the traceback.
            raise RuntimeError(f"Search backend returned an error: {error!r}")

    results = payload["results"]
    highlight_terms = payload["highlight_terms"]

    if language == "detect_language":
        banner = ""
        if len(results) > 0:
            first_lang = results[0]["lang"]
            banner = f"""<p style='font-family: Arial; color:MediumAquaMarine; text-align: center; line-height: 3em'>
                Detected language: <b>{first_lang}</b></p><br><hr><br>"""
        return banner + process_results(results, highlight_terms)

    if language == "all":
        sections = []
        for lang, results_for_lang in results.items():
            if len(results_for_lang) == 0:
                sections.append(
                    f"""<p style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
                    No results for language: <b>{lang}</b><hr></p>"""
                )
                continue

            sections.append(
                f"""
                <details>
                    <summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
                        Results for language: <b>{lang}</b><hr>
                    </summary>
                    {process_results(results_for_lang, highlight_terms)}
                </details>"""
            )
        return "".join(sections)

    return process_results(results, highlight_terms)
165
 
166
 
167
+ def scisearch(query, language, num_results=10, exact_search=False):
168
  try:
 
169
  query = " ".join(query.split())
170
  if query == "" or query is None:
171
  return ""
172
+ post_data = {"query": query, "k": num_results}
173
+ if language != "detect_language":
174
+ post_data["lang"] = language
175
+ address = (
176
+ "http://34.105.160.81:8080" if exact_search else os.environ.get("address")
177
+ )
178
  output = requests.post(
179
+ address,
180
  headers={"Content-type": "application/json"},
181
  data=json.dumps(post_data),
182
  timeout=60,
183
  )
 
184
  payload = json.loads(output.text)
185
+ return (
186
+ process_bm25_match_payload(payload, language)
187
+ if not exact_search
188
+ else process_exact_match_payload(payload, query)
189
+ )
 
 
 
 
 
 
 
 
 
190
  except Exception as e:
191
  results_html = f"""
192
  <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
 
195
  Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.
196
  </p>
197
  """
198
+ print(e)
199
+ print(traceback.format_exc())
200
+ return results_html
201
 
202
 
203
  def flag(query, language, num_results, issue_description):
 
300
  query = query.strip()
301
  if query is None or query == "":
302
  return "", ""
 
 
 
 
 
 
303
  return {
304
  results: scisearch(query, lang, k, exact_search),
305
  flagging_form: gr.update(visible=True),