Shreyas094
committed on
Commit • 34054e0 • 1 Parent(s): 67f5e62
Create app.py
app.py ADDED
@@ -0,0 +1,116 @@
import random
import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForCausalLM

# List of user agents
_useragent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
]
# Function to extract visible text from HTML content of a webpage
def extract_text_from_webpage(html):
    print("Extracting text from webpage...")
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup(["script", "style"]):
        script.extract()  # Remove scripts and styles
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))  # split on runs of two spaces, not single spaces, so individual words stay together
    text = '\n'.join(chunk for chunk in chunks if chunk)
    print(f"Extracted text length: {len(text)}")
    return text

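# A quick illustrative check of the extractor (hypothetical input, not part
# of the original commit): scripts are stripped and double-space runs become
# line breaks, so
#   extract_text_from_webpage("<p>Hi  there</p><script>x()</script>")
# returns "Hi\nthere".
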
# Function to perform a Google search and retrieve results
def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_verify=None):
    """Performs a Google search and returns the results."""
    print(f"Searching for term: {term}")
    escaped_term = requests.utils.quote(term)  # NOTE: unused below; requests URL-encodes query params itself
    start = 0
    all_results = []
    max_chars_per_page = 8000  # Limit the number of characters from each webpage to stay under the token limit

    with requests.Session() as session:
        while start < num_results:
            print(f"Fetching search results starting from: {start}")
            try:
                # Choose a random user agent
                user_agent = random.choice(_useragent_list)
                headers = {
                    'User-Agent': user_agent
                }
                print(f"Using User-Agent: {headers['User-Agent']}")

                resp = session.get(
                    url="https://www.google.com/search",
                    headers=headers,
                    params={
                        "q": term,
                        "num": num_results - start,
                        "hl": lang,
                        "start": start,
                        "safe": safe,
                    },
                    timeout=timeout,
                    verify=ssl_verify,
                )
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching search results: {e}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")
            result_block = soup.find_all("div", attrs={"class": "g"})
            if not result_block:
                print("No more results found.")
                break
            for result in result_block:
                link = result.find("a", href=True)
                if link:
                    link = link["href"]
                    print(f"Found link: {link}")
                    try:
                        webpage = session.get(link, headers=headers, timeout=timeout)
                        webpage.raise_for_status()
                        visible_text = extract_text_from_webpage(webpage.text)
                        if len(visible_text) > max_chars_per_page:
                            visible_text = visible_text[:max_chars_per_page] + "..."
                        all_results.append({"link": link, "text": visible_text})
                    except requests.exceptions.RequestException as e:
                        print(f"Error fetching or processing {link}: {e}")
                        all_results.append({"link": link, "text": None})
                else:
                    print("No link found in result.")
                    all_results.append({"link": None, "text": None})
            start += len(result_block)
    print(f"Total results fetched: {len(all_results)}")
    return all_results

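# google_search can be exercised on its own before involving the model, e.g.
# (illustrative query, not from the original commit):
#
#   results = google_search("open source LLM benchmarks", num_results=2)
#   for r in results:
#       print(r["link"], (r["text"] or "")[:80])
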
# Load the Mistral-7B-Instruct model and tokenizer
model_name = 'mistralai/Mistral-7B-Instruct-v0.3'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

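# Loading a 7B model in full precision needs roughly 28 GB of memory; on a
# GPU one would typically pass dtype/device options instead (device_map
# requires the accelerate package), e.g.:
#
#   model = AutoModelForCausalLM.from_pretrained(
#       model_name, torch_dtype="auto", device_map="auto"
#   )
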
# Example usage
search_term = "How did Tesla perform in Q1 2024"
search_results = google_search(search_term, num_results=3)

# Combine text from search results to create a prompt
combined_text = "\n\n".join(result['text'] for result in search_results if result['text'])

# Tokenize the input text, truncating so the prompt fits comfortably in the
# model's context (2048 is a conservative cap; Mistral-7B-Instruct-v0.3
# supports a much longer context)
inputs = tokenizer(combined_text, return_tensors="pt", truncation=True, max_length=2048)

# Generate a response; do_sample=True is required for temperature/top_p/top_k
# to take effect, and max_new_tokens bounds the generated text rather than the
# total prompt-plus-output length
outputs = model.generate(**inputs, max_new_tokens=150, do_sample=True, temperature=0.7, top_p=0.9, top_k=50)

# Decode the generated tokens to a readable string
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the response
print(response)
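
# Note: the scraped text is fed to the model verbatim, so the output is plain
# continuation rather than a direct answer. A possible refinement (not part of
# this commit) would be to frame the task as an instruction via the
# tokenizer's chat template, e.g.:
#
#   messages = [{"role": "user", "content": f"Using the following search results, answer: {search_term}\n\n{combined_text}"}]
#   inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")
#   outputs = model.generate(inputs, max_new_tokens=150, do_sample=True, temperature=0.7)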