Abhaykoul committed on
Commit
7c57092
1 Parent(s): a2699e6

Create webscout.py

Browse files
Files changed (1) hide show
  1. webscout.py +1065 -0
webscout.py ADDED
@@ -0,0 +1,1065 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import warnings
3
+ from concurrent.futures import ThreadPoolExecutor
4
+ from datetime import datetime, timezone
5
+ from decimal import Decimal
6
+ from functools import cached_property
7
+ from itertools import cycle, islice
8
+ from threading import Event
9
+ from types import TracebackType
10
+ from typing import Dict, List, Optional, Tuple, Type, Union, cast
11
+
12
+ import pyreqwest_impersonate as pri # type: ignore
13
+
14
+ try:
15
+ from lxml.etree import _Element
16
+ from lxml.html import HTMLParser as LHTMLParser
17
+ from lxml.html import document_fromstring
18
+
19
+ LXML_AVAILABLE = True
20
+ except ImportError:
21
+ LXML_AVAILABLE = False
22
+
23
+ from .exceptions import WebscoutE, RatelimitE, TimeoutE
24
+ from .utils import (
25
+ _calculate_distance,
26
+ _extract_vqd,
27
+ _normalize,
28
+ _normalize_url,
29
+ _text_extract_json,
30
+ json_loads,
31
+ )
32
+
33
+ logger = logging.getLogger("webscout.WEBS")
34
+
35
+
36
+ class WEBS:
37
+ """webscout class to get search results from duckduckgo.com."""
38
+
39
+ _executor: ThreadPoolExecutor = ThreadPoolExecutor()
40
+
41
def __init__(
    self,
    headers: Optional[Dict[str, str]] = None,
    proxy: Optional[str] = None,
    proxies: Union[Dict[str, str], str, None] = None,  # deprecated
    timeout: Optional[int] = 10,
) -> None:
    """Initialize the WEBS object.

    Args:
        headers (dict, optional): Dictionary of headers for the HTTP client. The mapping is
            copied, so the caller's dictionary is never modified. Defaults to None.
        proxy (str, optional): proxy for the HTTP client, supports http/https/socks5 protocols.
            example: "http://user:pass@example.com:3128". Defaults to None.
        proxies (dict or str, optional): Deprecated, use ``proxy``. Only consulted when ``proxy``
            is falsy; for a dict the "http" entry is preferred over "https". Defaults to None.
        timeout (int, optional): Timeout value for the HTTP client. Defaults to 10.
    """
    self.proxy: Optional[str] = proxy
    assert self.proxy is None or isinstance(self.proxy, str), "proxy must be a str"
    if not proxy and proxies:
        # stacklevel=2 attributes the warning to the code that instantiated WEBS.
        warnings.warn("'proxies' is deprecated, use 'proxy' instead.", stacklevel=2)
        if isinstance(proxies, dict):
            self.proxy = proxies.get("http") or proxies.get("https")
        else:
            self.proxy = proxies
    # Work on a copy: adding the Referer below must not leak into the caller's dict.
    self.headers = dict(headers) if headers else {}
    self.headers["Referer"] = "https://duckduckgo.com/"
    # NOTE(review): verify=False presumably disables TLS certificate verification in
    # pyreqwest_impersonate - confirm this is intentional.
    self.client = pri.Client(
        headers=self.headers,
        proxy=self.proxy,
        timeout=timeout,
        cookie_store=True,
        referer=True,
        impersonate="chrome_124",
        follow_redirects=False,
        verify=False,
    )
    # Set by _get_url() when a request fails; later _get_url() calls then refuse to run.
    self._exception_event = Event()
    # Conversation state used by chat().
    self._chat_messages: List[Dict[str, str]] = []
    self._chat_vqd: str = ""
76
+
77
+ def __enter__(self) -> "WEBS":
78
+ return self
79
+
80
+ def __exit__(
81
+ self,
82
+ exc_type: Optional[Type[BaseException]] = None,
83
+ exc_val: Optional[BaseException] = None,
84
+ exc_tb: Optional[TracebackType] = None,
85
+ ) -> None:
86
+ pass
87
+
88
@cached_property
def parser(self) -> "LHTMLParser":
    """Return the lxml HTML parser; built on first access and then cached on the instance."""
    parser_options = {
        "remove_blank_text": True,
        "remove_comments": True,
        "remove_pis": True,
        "collect_ids": False,
    }
    return LHTMLParser(**parser_options)
92
+
93
def _get_url(
    self,
    method: str,
    url: str,
    params: Optional[Dict[str, str]] = None,
    content: Optional[bytes] = None,
    data: Optional[Union[Dict[str, str], bytes]] = None,
) -> bytes:
    """Send one HTTP request through the shared client and return the body of a 200 response.

    Args:
        method: HTTP method name passed to the client.
        url: target URL.
        params: query-string parameters.
        content: raw request body.
        data: form data for the request body.

    Returns:
        The response content as bytes.

    Raises:
        WebscoutE: a previous call already failed, the client raised an error, or the
            status code is neither 200 nor one of the rate-limit codes.
        TimeoutE: the client raised an error whose text contains "time".
        RatelimitE: the status code is 202, 301 or 403.
    """
    # Once any call has failed, the instance refuses further requests.
    if self._exception_event.is_set():
        raise WebscoutE("Exception occurred in previous call.")
    try:
        resp = self.client.request(method, url, params=params, content=content, data=data)
    except Exception as ex:
        self._exception_event.set()
        # Timeouts are recognised only by the wording of the client's error message.
        error_cls = TimeoutE if "time" in str(ex).lower() else WebscoutE
        raise error_cls(f"{url} {type(ex).__name__}: {ex}") from ex
    logger.debug(f"_get_url() {resp.url} {resp.status_code} {len(resp.content)}")
    status = resp.status_code
    if status != 200:
        self._exception_event.set()
        if status in (202, 301, 403):
            raise RatelimitE(f"{resp.url} {resp.status_code} Ratelimit")
        raise WebscoutE(f"{resp.url} return None. {params=} {content=} {data=}")
    return cast(bytes, resp.content)
117
+
118
def _get_vqd(self, keywords: str) -> str:
    """POST the query to duckduckgo.com and extract the vqd value from the response body."""
    form = {"q": keywords}
    page = self._get_url("POST", "https://duckduckgo.com", data=form)
    return _extract_vqd(page, keywords)
122
+
123
def chat(self, keywords: str, model: str = "gpt-3.5") -> str:
    """Send a message to DuckDuckGo AI chat and return the reply.

    The conversation history and the vqd token are kept on the instance, so
    successive calls continue the same conversation.

    Args:
        keywords (str): The message or question to send to the AI.
        model (str): The model to use: "gpt-3.5", "claude-3-haiku", "llama-3-70b", "mixtral-8x7b".
            Defaults to "gpt-3.5".

    Returns:
        str: The response from the AI.

    Raises:
        KeyError: if ``model`` is not one of the supported names.
    """
    models = {
        "claude-3-haiku": "claude-3-haiku-20240307",
        "gpt-3.5": "gpt-3.5-turbo-0125",
        "llama-3-70b": "meta-llama/Llama-3-70b-chat-hf",
        "mixtral-8x7b": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    }
    # Resolve the model first: an unknown name must fail before any request is
    # sent and before the user message is recorded in the history.
    model_id = models[model]

    # vqd
    if not self._chat_vqd:
        resp = self.client.get("https://duckduckgo.com/duckchat/v1/status", headers={"x-vqd-accept": "1"})
        self._chat_vqd = resp.headers.get("x-vqd-4", "")

    self._chat_messages.append({"role": "user", "content": keywords})

    json_data = {
        "model": model_id,
        "messages": self._chat_messages,
    }
    resp = self.client.post(
        "https://duckduckgo.com/duckchat/v1/chat", headers={"x-vqd-4": self._chat_vqd}, json=json_data
    )
    # Each chat response carries the token to use for the next request.
    self._chat_vqd = resp.headers.get("x-vqd-4", "")

    messages = []
    for event in resp.text.split("\n\n"):
        chunk = event.strip()
        # Strip only the leading event prefix; a blanket str.replace would also
        # delete "data: " / "[DONE]" occurring inside the reply text itself.
        if chunk.startswith("data:"):
            chunk = chunk[len("data:"):].strip()
        if not chunk or chunk == "[DONE]":
            continue
        messages.append(json_loads(chunk).get("message", ""))
    result = "".join(messages)
    self._chat_messages.append({"role": "assistant", "content": result})
    return result
166
+
167
def text(
    self,
    keywords: str,
    region: str = "wt-wt",
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: str = "api",
    max_results: Optional[int] = None,
) -> List[Dict[str, str]]:
    """DuckDuckGo text search. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
        safesearch: on, moderate, off. Defaults to "moderate". Only the "api" backend uses it.
        timelimit: d, w, m, y. Defaults to None.
        backend: api, html, lite. Defaults to api.
            api - collect data from https://duckduckgo.com,
            html - collect data from https://html.duckduckgo.com,
            lite - collect data from https://lite.duckduckgo.com.
            When lxml is not installed, any other backend falls back to "api" with a warning.
        max_results: max number of results. If None, returns results only from the first response. Defaults to None.

    Returns:
        List of dictionaries with search results.

    Raises:
        ValueError: if ``backend`` is not one of "api", "html", "lite".
        WebscoutE: Base exception for webscout errors.
        RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
        TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
    """
    if LXML_AVAILABLE is False and backend != "api":
        backend = "api"
        warnings.warn("lxml is not installed. Using backend='api'.", stacklevel=2)

    if backend == "api":
        results = self._text_api(keywords, region, safesearch, timelimit, max_results)
    elif backend == "html":
        results = self._text_html(keywords, region, timelimit, max_results)
    elif backend == "lite":
        results = self._text_lite(keywords, region, timelimit, max_results)
    else:
        # Previously an unknown backend fell through to an UnboundLocalError on `results`.
        raise ValueError(f"backend must be 'api', 'html' or 'lite', got {backend!r}")
    return results
208
+
209
def _text_api(
    self,
    keywords: str,
    region: str = "wt-wt",
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    max_results: Optional[int] = None,
) -> List[Dict[str, str]]:
    """DuckDuckGo text search via links.duckduckgo.com/d.js. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
        safesearch: on, moderate, off. Defaults to "moderate".
        timelimit: d, w, m, y. Defaults to None.
        max_results: max number of results (capped at 2023). If None, returns results only
            from the first response. Defaults to None.

    Returns:
        List of dictionaries with "title", "href" and "body" keys.

    Raises:
        WebscoutE: Base exception for webscout errors.
        RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
        TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
    """
    assert keywords, "keywords is mandatory"

    vqd = self._get_vqd(keywords)

    payload = {
        "q": keywords,
        "kl": region,
        "l": region,
        "p": "",
        "s": "0",
        "df": "",
        "vqd": vqd,
        "bing_market": f"{region[3:]}-{region[:2].upper()}",
        "ex": "",
    }
    safesearch = safesearch.lower()
    if safesearch == "moderate":
        payload["ex"] = "-1"
    elif safesearch == "off":
        payload["ex"] = "-2"
    elif safesearch == "on":  # strict
        payload["p"] = "1"
    if timelimit:
        payload["df"] = timelimit

    cache = set()  # hrefs already emitted, shared by all pages
    results: List[Dict[str, str]] = []

    def _text_api_page(s: int) -> List[Dict[str, str]]:
        # Pages are fetched concurrently by the executor, so each request gets its
        # own copy of the payload instead of overwriting a shared "s" offset.
        page_payload = {**payload, "s": f"{s}"}
        resp_content = self._get_url("GET", "https://links.duckduckgo.com/d.js", params=page_payload)
        page_data = _text_extract_json(resp_content, keywords)
        page_results = []
        for row in page_data:
            href = row.get("u", None)
            if not href or href in cache or href == f"http://www.google.com/search?q={keywords}":
                continue
            cache.add(href)
            body = _normalize(row["a"])
            if body:
                page_results.append(
                    {
                        "title": _normalize(row["t"]),
                        "href": _normalize_url(href),
                        "body": body,
                    }
                )
        return page_results

    # Offsets requested: 0 for the first page, then 23, 73, 123, ... up to max_results.
    slist = [0]
    if max_results:
        max_results = min(max_results, 2023)
        slist.extend(range(23, max_results, 50))
    for r in self._executor.map(_text_api_page, slist):
        results.extend(r)

    return list(islice(results, max_results))
292
+
293
def _text_html(
    self,
    keywords: str,
    region: str = "wt-wt",
    timelimit: Optional[str] = None,
    max_results: Optional[int] = None,
) -> List[Dict[str, str]]:
    """DuckDuckGo text search via html.duckduckgo.com. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
        timelimit: d, w, m, y. Defaults to None.
        max_results: max number of results (capped at 2023). If None, returns results only
            from the first response. Defaults to None.

    Returns:
        List of dictionaries with "title", "href" and "body" keys.

    Raises:
        WebscoutE: Base exception for webscout errors.
        RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
        TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
    """
    assert keywords, "keywords is mandatory"

    payload = {
        "q": keywords,
        "s": "0",
        "o": "json",
        "api": "d.js",
        "vqd": "",
        "kl": region,
        "bing_market": region,
    }
    if timelimit:
        payload["df"] = timelimit
    # The vqd token is only requested when more than 20 results are wanted.
    if max_results and max_results > 20:
        vqd = self._get_vqd(keywords)
        payload["vqd"] = vqd

    cache = set()  # hrefs already emitted, shared by all pages
    results: List[Dict[str, str]] = []

    def _text_html_page(s: int) -> List[Dict[str, str]]:
        # Pages are fetched concurrently by the executor, so each request gets its
        # own copy of the payload instead of overwriting a shared "s" offset.
        page_payload = {**payload, "s": f"{s}"}
        resp_content = self._get_url("POST", "https://html.duckduckgo.com/html", data=page_payload)
        if b"No results." in resp_content:
            return []

        page_results = []
        tree = document_fromstring(resp_content, self.parser)
        elements = tree.xpath("//div[h2]")
        if not isinstance(elements, list):
            return []
        for e in elements:
            if not isinstance(e, _Element):
                continue
            hrefxpath = e.xpath("./a/@href")
            # An empty XPath result means "no link": skip it instead of raising IndexError.
            href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, list) else None
            if (
                not href
                or href in cache
                or href.startswith(("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain"))
            ):
                continue
            cache.add(href)
            titlexpath = e.xpath("./h2/a/text()")
            title = str(titlexpath[0]) if titlexpath and isinstance(titlexpath, list) else ""
            bodyxpath = e.xpath("./a//text()")
            body = "".join(str(x) for x in bodyxpath) if isinstance(bodyxpath, list) else ""
            page_results.append(
                {
                    "title": _normalize(title),
                    "href": _normalize_url(href),
                    "body": _normalize(body),
                }
            )
        return page_results

    # Offsets requested: 0 for the first page, then 23, 73, 123, ... up to max_results.
    slist = [0]
    if max_results:
        max_results = min(max_results, 2023)
        slist.extend(range(23, max_results, 50))
    for r in self._executor.map(_text_html_page, slist):
        results.extend(r)

    return list(islice(results, max_results))
382
+
383
def _text_lite(
    self,
    keywords: str,
    region: str = "wt-wt",
    timelimit: Optional[str] = None,
    max_results: Optional[int] = None,
) -> List[Dict[str, str]]:
    """DuckDuckGo text search via lite.duckduckgo.com. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
        timelimit: d, w, m, y. Defaults to None.
        max_results: max number of results (capped at 2023). If None, returns results only
            from the first response. Defaults to None.

    Returns:
        List of dictionaries with "title", "href" and "body" keys.

    Raises:
        WebscoutE: Base exception for webscout errors.
        RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
        TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
    """
    assert keywords, "keywords is mandatory"

    payload = {
        "q": keywords,
        "s": "0",
        "o": "json",
        "api": "d.js",
        "vqd": "",
        "kl": region,
        "bing_market": region,
    }
    if timelimit:
        payload["df"] = timelimit

    cache = set()  # hrefs already emitted, shared by all pages
    results: List[Dict[str, str]] = []

    def _text_lite_page(s: int) -> List[Dict[str, str]]:
        # Pages are fetched concurrently by the executor, so each request gets its
        # own copy of the payload instead of overwriting a shared "s" offset.
        page_payload = {**payload, "s": f"{s}"}
        resp_content = self._get_url("POST", "https://lite.duckduckgo.com/lite/", data=page_payload)
        if b"No more results." in resp_content:
            return []

        page_results = []
        tree = document_fromstring(resp_content, self.parser)
        elements = tree.xpath("//table[last()]//tr")
        if not isinstance(elements, list):
            return []

        # Rows are consumed in groups of four: row 1 is read for the link and title,
        # row 2 for the snippet, rows 3 and 4 are ignored.
        href: Optional[str] = None
        title = ""
        data = zip(cycle(range(1, 5)), elements)
        for i, e in data:
            if not isinstance(e, _Element):
                continue
            if i == 1:
                hrefxpath = e.xpath(".//a//@href")
                href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, list) else None
                if (
                    href is None
                    or href in cache
                    or href.startswith(("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain"))
                ):
                    # skip the rest of this block (i=2,3,4)
                    for _ in range(3):
                        next(data, None)
                else:
                    cache.add(href)
                    titlexpath = e.xpath(".//a//text()")
                    # An empty XPath result yields an empty title instead of raising IndexError.
                    title = str(titlexpath[0]) if titlexpath and isinstance(titlexpath, list) else ""
            elif i == 2:
                bodyxpath = e.xpath(".//td[@class='result-snippet']//text()")
                body = "".join(str(x) for x in bodyxpath) if isinstance(bodyxpath, list) else ""
                if href:
                    page_results.append(
                        {
                            "title": _normalize(title),
                            "href": _normalize_url(href),
                            "body": _normalize(body),
                        }
                    )
        return page_results

    # Offsets requested: 0 for the first page, then 23, 73, 123, ... up to max_results.
    slist = [0]
    if max_results:
        max_results = min(max_results, 2023)
        slist.extend(range(23, max_results, 50))
    for r in self._executor.map(_text_lite_page, slist):
        results.extend(r)

    return list(islice(results, max_results))
476
+
477
def images(
    self,
    keywords: str,
    region: str = "wt-wt",
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    size: Optional[str] = None,
    color: Optional[str] = None,
    type_image: Optional[str] = None,
    layout: Optional[str] = None,
    license_image: Optional[str] = None,
    max_results: Optional[int] = None,
) -> List[Dict[str, str]]:
    """DuckDuckGo images search. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
        safesearch: on, moderate, off ("on" and "moderate" send the same value). Defaults to "moderate".
        timelimit: Day, Week, Month, Year. Defaults to None.
        size: Small, Medium, Large, Wallpaper. Defaults to None.
        color: color, Monochrome, Red, Orange, Yellow, Green, Blue,
            Purple, Pink, Brown, Black, Gray, Teal, White. Defaults to None.
        type_image: photo, clipart, gif, transparent, line.
            Defaults to None.
        layout: Square, Tall, Wide. Defaults to None.
        license_image: any (All Creative Commons), Public (PublicDomain),
            Share (Free to Share and Use), ShareCommercially (Free to Share and Use Commercially),
            Modify (Free to Modify, Share, and Use), ModifyCommercially (Free to Modify, Share, and
            Use Commercially). Defaults to None.
        max_results: max number of results (capped at 500). If None, returns results only
            from the first response. Defaults to None.

    Returns:
        List of dictionaries with images search results.

    Raises:
        KeyError: if ``safesearch`` is not one of on, moderate, off.
        WebscoutE: Base exception for webscout errors.
        RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
        TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
    """
    assert keywords, "keywords is mandatory"

    vqd = self._get_vqd(keywords)

    safesearch_base = {"on": "1", "moderate": "1", "off": "-1"}
    timelimit = f"time:{timelimit}" if timelimit else ""
    size = f"size:{size}" if size else ""
    color = f"color:{color}" if color else ""
    type_image = f"type:{type_image}" if type_image else ""
    layout = f"layout:{layout}" if layout else ""
    license_image = f"license:{license_image}" if license_image else ""
    payload = {
        "l": region,
        "o": "json",
        "q": keywords,
        "vqd": vqd,
        "f": f"{timelimit},{size},{color},{type_image},{layout},{license_image}",
        "p": safesearch_base[safesearch.lower()],
    }

    cache = set()  # image URLs already emitted, shared by all pages
    results: List[Dict[str, str]] = []

    def _images_page(s: int) -> List[Dict[str, str]]:
        # Pages are fetched concurrently by the executor, so each request gets its
        # own copy of the payload instead of overwriting a shared "s" offset.
        page_payload = {**payload, "s": f"{s}"}
        resp_content = self._get_url("GET", "https://duckduckgo.com/i.js", params=page_payload)
        resp_json = json_loads(resp_content)

        page_results = []
        for row in resp_json.get("results", []):
            image_url = row.get("image")
            if image_url and image_url not in cache:
                cache.add(image_url)
                page_results.append(
                    {
                        "title": row["title"],
                        "image": _normalize_url(image_url),
                        "thumbnail": _normalize_url(row["thumbnail"]),
                        "url": _normalize_url(row["url"]),
                        "height": row["height"],
                        "width": row["width"],
                        "source": row["source"],
                    }
                )
        return page_results

    # Offsets requested: 0, then every 100 up to max_results.
    slist = [0]
    if max_results:
        max_results = min(max_results, 500)
        slist.extend(range(100, max_results, 100))
    for r in self._executor.map(_images_page, slist):
        results.extend(r)

    return list(islice(results, max_results))
574
+
575
def videos(
    self,
    keywords: str,
    region: str = "wt-wt",
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    resolution: Optional[str] = None,
    duration: Optional[str] = None,
    license_videos: Optional[str] = None,
    max_results: Optional[int] = None,
) -> List[Dict[str, str]]:
    """DuckDuckGo videos search. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
        safesearch: on, moderate, off. Defaults to "moderate".
        timelimit: d, w, m. Defaults to None.
        resolution: high, standard. Defaults to None.
        duration: short, medium, long. Defaults to None.
        license_videos: creativeCommon, youtube. Defaults to None.
        max_results: max number of results (capped at 400). If None, returns results only
            from the first response. Defaults to None.

    Returns:
        List of dictionaries with videos search results (the rows of the response's
        "results" list, de-duplicated on their "content" value).

    Raises:
        KeyError: if ``safesearch`` is not one of on, moderate, off.
        WebscoutE: Base exception for webscout errors.
        RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
        TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
    """
    assert keywords, "keywords is mandatory"

    vqd = self._get_vqd(keywords)

    safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
    timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
    resolution = f"videoDefinition:{resolution}" if resolution else ""
    duration = f"videoDuration:{duration}" if duration else ""
    license_videos = f"videoLicense:{license_videos}" if license_videos else ""
    payload = {
        "l": region,
        "o": "json",
        "q": keywords,
        "vqd": vqd,
        "f": f"{timelimit},{resolution},{duration},{license_videos}",
        "p": safesearch_base[safesearch.lower()],
    }

    cache = set()  # "content" values already emitted, shared by all pages
    results: List[Dict[str, str]] = []

    def _videos_page(s: int) -> List[Dict[str, str]]:
        # Pages are fetched concurrently by the executor, so each request gets its
        # own copy of the payload instead of overwriting a shared "s" offset.
        page_payload = {**payload, "s": f"{s}"}
        resp_content = self._get_url("GET", "https://duckduckgo.com/v.js", params=page_payload)
        resp_json = json_loads(resp_content)

        page_results = []
        for row in resp_json.get("results", []):
            if row["content"] not in cache:
                cache.add(row["content"])
                page_results.append(row)
        return page_results

    # Offsets requested: 0, then every 60 up to max_results.
    slist = [0]
    if max_results:
        max_results = min(max_results, 400)
        slist.extend(range(60, max_results, 60))
    for r in self._executor.map(_videos_page, slist):
        results.extend(r)

    return list(islice(results, max_results))
651
+
652
def news(
    self,
    keywords: str,
    region: str = "wt-wt",
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    max_results: Optional[int] = None,
) -> List[Dict[str, str]]:
    """DuckDuckGo news search. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
        safesearch: on, moderate, off. Defaults to "moderate".
        timelimit: d, w, m. Defaults to None.
        max_results: max number of results (capped at 120). If None, returns results only
            from the first response. Defaults to None.

    Returns:
        List of dictionaries with "date" (ISO 8601, UTC), "title", "body", "url",
        "image" and "source" keys.

    Raises:
        KeyError: if ``safesearch`` is not one of on, moderate, off.
        WebscoutE: Base exception for webscout errors.
        RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
        TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
    """
    assert keywords, "keywords is mandatory"

    vqd = self._get_vqd(keywords)

    safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
    payload = {
        "l": region,
        "o": "json",
        "noamp": "1",
        "q": keywords,
        "vqd": vqd,
        "p": safesearch_base[safesearch.lower()],
    }
    if timelimit:
        payload["df"] = timelimit

    cache = set()  # article URLs already emitted, shared by all pages
    results: List[Dict[str, str]] = []

    def _news_page(s: int) -> List[Dict[str, str]]:
        # Pages are fetched concurrently by the executor, so each request gets its
        # own copy of the payload instead of overwriting a shared "s" offset.
        page_payload = {**payload, "s": f"{s}"}
        resp_content = self._get_url("GET", "https://duckduckgo.com/news.js", params=page_payload)
        resp_json = json_loads(resp_content)
        page_results = []
        for row in resp_json.get("results", []):
            if row["url"] in cache:
                continue
            cache.add(row["url"])
            image_url = row.get("image", None)
            page_results.append(
                {
                    "date": datetime.fromtimestamp(row["date"], timezone.utc).isoformat(),
                    "title": row["title"],
                    "body": _normalize(row["excerpt"]),
                    "url": _normalize_url(row["url"]),
                    "image": _normalize_url(image_url),
                    "source": row["source"],
                }
            )
        return page_results

    # Offsets requested: 0, then every 30 up to max_results.
    slist = [0]
    if max_results:
        max_results = min(max_results, 120)
        slist.extend(range(30, max_results, 30))
    for r in self._executor.map(_news_page, slist):
        results.extend(r)

    return list(islice(results, max_results))
728
+
729
def answers(self, keywords: str) -> List[Dict[str, str]]:
    """DuckDuckGo instant answers. Query params: https://duckduckgo.com/params.

    Two requests are made to api.duckduckgo.com: one for "what is <keywords>", whose
    abstract (if any) becomes the first result, and one for the bare keywords, whose
    related topics are appended.

    Args:
        keywords: keywords for query,

    Returns:
        List of dictionaries with "icon", "text", "topic" and "url" keys.

    Raises:
        WebscoutE: Base exception for webscout errors.
        RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
        TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
    """
    assert keywords, "keywords is mandatory"

    api_url = "https://api.duckduckgo.com/"

    def _related_entry(item, topic_name):
        # Icon URLs arrive as site-relative paths; an absent icon becomes "".
        icon_path = item["Icon"].get("URL")
        return {
            "icon": f"https://duckduckgo.com{icon_path}" if icon_path else "",
            "text": item["Text"],
            "topic": topic_name,
            "url": item["FirstURL"],
        }

    abstract_query = {"q": f"what is {keywords}", "format": "json"}
    abstract_data = json_loads(self._get_url("GET", api_url, params=abstract_query))

    results = []
    answer = abstract_data.get("AbstractText")
    url = abstract_data.get("AbstractURL")
    if answer:
        results.append({"icon": None, "text": answer, "topic": None, "url": url})

    # related
    related_query = {"q": f"{keywords}", "format": "json"}
    related_data = json_loads(self._get_url("GET", api_url, params=related_query))

    for row in related_data.get("RelatedTopics", []):
        topic = row.get("Name")
        if topic:
            # A named row is a group: flatten its "Topics" under that name.
            for subrow in row["Topics"]:
                results.append(_related_entry(subrow, topic))
        else:
            results.append(_related_entry(row, None))

    return results
799
+
800
def suggestions(self, keywords: str, region: str = "wt-wt") -> List[Dict[str, str]]:
    """DuckDuckGo suggestions. Query params: https://duckduckgo.com/params.

    Args:
        keywords: keywords for query.
        region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".

    Returns:
        List with the items of the decoded JSON response from duckduckgo.com/ac/.

    Raises:
        WebscoutE: Base exception for webscout errors.
        RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
        TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
    """
    assert keywords, "keywords is mandatory"

    query = {"q": keywords, "kl": region}
    raw = self._get_url("GET", "https://duckduckgo.com/ac/", params=query)
    return list(json_loads(raw))
824
+
825
def maps(
    self,
    keywords: str,
    place: Optional[str] = None,
    street: Optional[str] = None,
    city: Optional[str] = None,
    county: Optional[str] = None,
    state: Optional[str] = None,
    country: Optional[str] = None,
    postalcode: Optional[str] = None,
    latitude: Optional[str] = None,
    longitude: Optional[str] = None,
    radius: int = 0,
    max_results: Optional[int] = None,
) -> List[Dict[str, str]]:
    """DuckDuckGo maps search. Query params: https://duckduckgo.com/params.

    The search area is a bounding box. It is either centred on the given
    latitude/longitude or obtained from the nominatim API for the given
    place/address fields. The box is then repeatedly split into four
    quadrants, each queried separately, until enough results are collected.

    Args:
        keywords: keywords for query
        place: if set, the other parameters are not used. Defaults to None.
        street: house number/street. Defaults to None.
        city: city of search. Defaults to None.
        county: county of search. Defaults to None.
        state: state of search. Defaults to None.
        country: country of search. Defaults to None.
        postalcode: postalcode of search. Defaults to None.
        latitude: geographic coordinate (north-south position). Defaults to None.
        longitude: geographic coordinate (east-west position); if latitude and
            longitude are set, the other parameters are not used. Defaults to None.
        radius: expand the search square by the distance in kilometers. Defaults to 0.
            When latitude and longitude are given and radius is 0, a radius of 1 is used.
        max_results: max number of results. If None, returns results only from the first response. Defaults to None.

    Returns:
        List of dictionaries with maps search results.

    Raises:
        WebscoutE: Base exception for webscout errors; also raised here when
            nominatim returns no coordinates for the given parameters.
        RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
        TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
    """
    assert keywords, "keywords is mandatory"

    vqd = self._get_vqd(keywords)

    # if longitude and latitude are specified, skip the request about bbox to the nominatim api
    if latitude and longitude:
        # accept a decimal comma as well as a decimal point
        lat_t = Decimal(latitude.replace(",", "."))
        lat_b = Decimal(latitude.replace(",", "."))
        lon_l = Decimal(longitude.replace(",", "."))
        lon_r = Decimal(longitude.replace(",", "."))
        # a single point has no area, so force a minimal search square
        if radius == 0:
            radius = 1
    # otherwise request about bbox to nominatim api
    else:
        if place:
            params = {
                "q": place,
                "polygon_geojson": "0",
                "format": "jsonv2",
            }
        else:
            params = {
                "polygon_geojson": "0",
                "format": "jsonv2",
            }
            if street:
                params["street"] = street
            if city:
                params["city"] = city
            if county:
                params["county"] = county
            if state:
                params["state"] = state
            if country:
                params["country"] = country
            if postalcode:
                params["postalcode"] = postalcode
        # request nominatim api to get coordinates box
        resp_content = self._get_url(
            "GET",
            "https://nominatim.openstreetmap.org/search.php",
            params=params,
        )
        if resp_content == b"[]":
            raise WebscoutE("maps() Coordinates are not found, check function parameters.")
        resp_json = json_loads(resp_content)
        coordinates = resp_json[0]["boundingbox"]
        lat_t, lon_l = Decimal(coordinates[1]), Decimal(coordinates[2])
        lat_b, lon_r = Decimal(coordinates[0]), Decimal(coordinates[3])

    # if a radius is specified, expand the search square
    # Degrees added per kilometre of radius; built from a string so the
    # Decimal is exact instead of inheriting binary float rounding error.
    expand = Decimal(radius) * Decimal("0.008983")
    lat_t += expand
    lat_b -= expand
    lon_l -= expand
    lon_r += expand
    logger.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")

    cache = set()  # "name address" keys of places already returned (de-duplication)
    results: List[Dict[str, str]] = []

    def _maps_page(
        bbox: Tuple[Decimal, Decimal, Decimal, Decimal],
    ) -> Optional[List[Dict[str, str]]]:
        """Query one bounding box and return its not-yet-seen places (None once the limit is reached)."""
        if max_results and len(results) >= max_results:
            return None
        lat_t, lon_l, lat_b, lon_r = bbox
        params = {
            "q": keywords,
            "vqd": vqd,
            "tg": "maps_places",
            "rt": "D",
            "mkexp": "b",
            "wiki_info": "1",
            "is_requery": "1",
            "bbox_tl": f"{lat_t},{lon_l}",
            "bbox_br": f"{lat_b},{lon_r}",
            "strict_bbox": "1",
        }
        resp_content = self._get_url("GET", "https://duckduckgo.com/local.js", params=params)
        resp_json = json_loads(resp_content)
        page_data = resp_json.get("results", [])

        page_results = []
        for res in page_data:
            r_name = f'{res["name"]} {res["address"]}'
            if r_name in cache:
                continue
            cache.add(r_name)
            result = {
                "title": res["name"],
                "address": res["address"],
                "country_code": res["country_code"],
                "url": _normalize_url(res["website"]),
                "phone": res["phone"] or "",
                "latitude": res["coordinates"]["latitude"],
                "longitude": res["coordinates"]["longitude"],
                "source": _normalize_url(res["url"]),
                "image": x.get("image", "") if (x := res["embed"]) else "",
                "desc": x.get("description", "") if (x := res["embed"]) else "",
                "hours": res["hours"] or "",
                "category": res["ddg_category"] or "",
                "facebook": f"www.facebook.com/profile.php?id={x}" if (x := res["facebook_id"]) else "",
                "instagram": f"https://www.instagram.com/{x}" if (x := res["instagram_id"]) else "",
                "twitter": f"https://twitter.com/{x}" if (x := res["twitter_id"]) else "",
            }
            page_results.append(result)
        return page_results

    # search squares (bboxes)
    start_bbox = (lat_t, lon_l, lat_b, lon_r)
    work_bboxes = [start_bbox]
    while work_bboxes:
        queue_bboxes = []  # for next iteration, at the end of the iteration work_bboxes = queue_bboxes
        tasks = []
        for bbox in work_bboxes:
            tasks.append(bbox)
            # Unpack BEFORE measuring so the size test applies to this bbox
            # and not to whichever box was handled previously.
            box_lat_t, box_lon_l, box_lat_b, box_lon_r = bbox
            # if distance between coordinates > 1, divide the square into 4 parts and save them in queue_bboxes
            if _calculate_distance(box_lat_t, box_lon_l, box_lat_b, box_lon_r) > 1:
                lat_middle = (box_lat_t + box_lat_b) / 2
                lon_middle = (box_lon_l + box_lon_r) / 2
                bbox1 = (box_lat_t, box_lon_l, lat_middle, lon_middle)
                bbox2 = (box_lat_t, lon_middle, lat_middle, box_lon_r)
                bbox3 = (lat_middle, box_lon_l, box_lat_b, lon_middle)
                bbox4 = (lat_middle, lon_middle, box_lat_b, box_lon_r)
                queue_bboxes.extend([bbox1, bbox2, bbox3, bbox4])

        # run the page requests for this batch through the executor; any
        # exception raised by a request propagates to the caller unchanged
        work_bboxes_results = []
        for r in self._executor.map(_maps_page, tasks):
            if r:
                work_bboxes_results.extend(r)

        # results is only updated after the whole batch has finished, so the
        # limit check inside _maps_page sees a stable count during a batch
        results.extend(work_bboxes_results)

        work_bboxes = queue_bboxes
        if not max_results or len(results) >= max_results or len(work_bboxes_results) == 0:
            break

    return list(islice(results, max_results))
1013
+
1014
def translate(
    self, keywords: Union[List[str], str], from_: Optional[str] = None, to: str = "en"
) -> List[Dict[str, str]]:
    """DuckDuckGo translate.

    Each keyword is sent in its own POST request to the translation
    endpoint, dispatched through ``self._executor.map``.

    Args:
        keywords: string or list of strings to translate.
        from_: translate from (defaults automatically). Defaults to None.
        to: what language to translate. Defaults to "en".

    Returns:
        List of dictionaries with translated keywords. Each dictionary is
        the decoded response for one keyword, with the source text stored
        under the "original" key.

    Raises:
        WebscoutE: Base exception for webscout errors.
        RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
        TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
    """
    assert keywords, "keywords is mandatory"

    vqd = self._get_vqd("translate")

    # query-string parameters shared by every request; "from" is only sent
    # when the caller specified a source language
    query_params = {"vqd": vqd, "query": "translate", "to": to}
    if from_:
        query_params["from"] = from_

    def _translate_one(text: str) -> Dict[str, str]:
        """Translate a single string and tag the response with the source text."""
        raw = self._get_url(
            "POST",
            "https://duckduckgo.com/translation.js",
            params=query_params,
            content=text.encode(),
        )
        translated: Dict[str, str] = json_loads(raw)
        translated["original"] = text
        return translated

    # normalise a lone string to a one-element list
    phrases = [keywords] if isinstance(keywords, str) else keywords

    # exceptions raised by any request propagate to the caller unchanged
    return list(self._executor.map(_translate_one, phrases))