import random

import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForCausalLM

# Pool of browser User-Agent strings; one is chosen at random for each
# search request.
_useragent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
]


def extract_text_from_webpage(html):
    """Return the visible text of an HTML document.

    ``<script>`` and ``<style>`` elements are removed, the remaining text is
    split into lines and double-space-separated phrases, each piece is
    stripped, empty pieces are dropped, and the rest is joined with newlines.

    Args:
        html: HTML markup as a string.

    Returns:
        The cleaned text, one non-empty phrase per line.
    """
    print("Extracting text from webpage...")
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(["script", "style"]):
        tag.extract()  # Remove scripts and styles
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    # Split on a two-space gap rather than a single space: splitting on a
    # single space would put every word on its own line and break sentences.
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    print(f"Extracted text length: {len(text)}")
    return text


def _fetch_page_text(session, link, headers, timeout, ssl_verify, max_chars):
    """Download *link* and return its visible text, cut to *max_chars*.

    Raises ``requests.exceptions.RequestException`` on any request failure
    (including a non-2xx status); the caller decides how to record it.
    """
    webpage = session.get(link, headers=headers, timeout=timeout, verify=ssl_verify)
    webpage.raise_for_status()
    visible_text = extract_text_from_webpage(webpage.text)
    if len(visible_text) > max_chars:
        visible_text = visible_text[:max_chars] + "..."
    return visible_text


def google_search(term, num_results=5, lang="en", timeout=5, safe="active",
                  ssl_verify=None, max_chars_per_page=8000):
    """Perform a Google search and return the text of the result pages.

    Args:
        term: Search query. It is passed through ``params`` and therefore
            URL-encoded by ``requests``.
        num_results: Maximum number of result entries to return.
        lang: Value sent as the ``hl`` query parameter.
        timeout: Timeout in seconds for every HTTP request made here.
        safe: Value sent as the ``safe`` query parameter.
        ssl_verify: Forwarded as ``verify`` to every request; ``None`` leaves
            the session default in place.
        max_chars_per_page: Upper bound on the characters kept from each
            page (longer text is cut and suffixed with ``"..."``).

    Returns:
        A list of at most ``num_results`` dicts with keys ``"link"`` and
        ``"text"``. ``"text"`` is ``None`` when the page could not be
        fetched; both are ``None`` when a result block had no link.
    """
    print(f"Searching for term: {term}")
    start = 0
    all_results = []

    with requests.Session() as session:
        while start < num_results:
            print(f"Fetching search results starting from: {start}")
            # Choose a random user agent
            headers = {'User-Agent': random.choice(_useragent_list)}
            print(f"Using User-Agent: {headers['User-Agent']}")
            try:
                resp = session.get(
                    url="https://www.google.com/search",
                    headers=headers,
                    params={
                        "q": term,
                        "num": num_results - start,
                        "hl": lang,
                        "start": start,
                        "safe": safe,
                    },
                    timeout=timeout,
                    verify=ssl_verify,
                )
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching search results: {e}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")
            result_block = soup.find_all("div", attrs={"class": "g"})
            if not result_block:
                print("No more results found.")
                break

            for result in result_block:
                # A page may contain more result blocks than were asked for;
                # never return more than num_results entries.
                if len(all_results) >= num_results:
                    break
                anchor = result.find("a", href=True)
                if not anchor:
                    print("No link found in result.")
                    all_results.append({"link": None, "text": None})
                    continue
                link = anchor["href"]
                print(f"Found link: {link}")
                try:
                    visible_text = _fetch_page_text(
                        session, link, headers, timeout, ssl_verify,
                        max_chars_per_page,
                    )
                except requests.exceptions.RequestException as e:
                    print(f"Error fetching or processing {link}: {e}")
                    visible_text = None
                all_results.append({"link": link, "text": visible_text})

            start += len(result_block)

    print(f"Total results fetched: {len(all_results)}")
    return all_results


# Load the Mistral-7B-Instruct model and tokenizer
model_name = 'mistralai/Mistral-7B-Instruct-v0.3'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Example usage
search_term = "How did Tesla perform in Q1 2024"
search_results = google_search(search_term, num_results=3)

# Combine text from search results to create a prompt
combined_text = "\n\n".join(result['text'] for result in search_results if result['text'])

# Tokenize the input text
inputs = tokenizer(combined_text, return_tensors="pt")

# Generate a response.
# max_new_tokens bounds only the generated continuation; max_length would
# also count the (long) prompt and leave no room to generate.
# do_sample=True is required for temperature/top_p/top_k to take effect.
outputs = model.generate(
    **inputs,
    max_new_tokens=150,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
)

# Decode the generated tokens to a readable string
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the response
print(response)