# Spaces: Running
import subprocess | |
import sys | |
import os | |
import requests | |
from PIL import Image | |
import pytesseract | |
from io import BytesIO | |
import pandas as pd | |
import json | |
from groq import Groq | |
from twilio.rest import Client | |
import logging | |
from datetime import datetime | |
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
import time | |
# Configure logging.
# FileHandler opens its target file as soon as it is constructed and raises
# FileNotFoundError when the parent directory is missing, so the 'logs'
# directory has to exist before the handler list below is built.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so logged URLs and error text are written the same
        # way regardless of the platform's default encoding.
        logging.FileHandler('logs/sms_debug.log', encoding='utf-8'),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
def setup_directories():
    """Make sure the log and data output folders exist.

    Existing folders are left untouched; a single info line is logged once
    all of them have been handled.
    """
    for folder in ('logs', 'data', 'data/images', 'data/texts', 'data/reports'):
        os.makedirs(folder, exist_ok=True)
    logger.info("Directory structure created")
class SMSSender:
    """Small wrapper around the Twilio REST client for sending text messages.

    Credentials are taken from the constructor arguments or, when those are
    omitted, from the ``TWILIO_ACCOUNT_SID``, ``TWILIO_AUTH_TOKEN`` and
    ``TWILIO_FROM_NUMBER`` environment variables, so that no secret has to be
    committed with the source file.
    """

    def __init__(self, account_sid=None, auth_token=None, from_number=None,
                 default_country_code="+91"):
        """Initialize Twilio client with credentials.

        Args:
            account_sid: Twilio account SID; defaults to ``TWILIO_ACCOUNT_SID``.
            auth_token: Twilio auth token; defaults to ``TWILIO_AUTH_TOKEN``.
            from_number: Sender number; defaults to ``TWILIO_FROM_NUMBER``.
            default_country_code: Prefix added to recipient numbers that do
                not already start with ``+``.

        Raises:
            ValueError: If any of the three credentials is missing.
        """
        self.account_sid = account_sid or os.environ.get("TWILIO_ACCOUNT_SID")
        self.auth_token = auth_token or os.environ.get("TWILIO_AUTH_TOKEN")
        self.from_number = from_number or os.environ.get("TWILIO_FROM_NUMBER")
        self.default_country_code = default_country_code

        # Validate before touching the Twilio client so a misconfiguration
        # is reported with the name of the setting that is missing.
        required = (
            ("TWILIO_ACCOUNT_SID", self.account_sid),
            ("TWILIO_AUTH_TOKEN", self.auth_token),
            ("TWILIO_FROM_NUMBER", self.from_number),
        )
        missing = [name for name, value in required if not value]
        if missing:
            raise ValueError(
                "Missing Twilio configuration: " + ", ".join(missing)
            )

        self.client = Client(self.account_sid, self.auth_token)

    @staticmethod
    def _normalize_number(to_number, default_country_code="+91"):
        """Return *to_number* as a string carrying a leading ``+`` prefix."""
        number = str(to_number).strip()
        if not number.startswith('+'):
            number = f"{default_country_code}{number}"
        return number

    def send_sms(self, to_number, message):
        """Send SMS using Twilio.

        Args:
            to_number: Recipient number; the default country code is
                prepended when it does not start with ``+``.
            message: Text body of the SMS.

        Returns:
            True when the message was handed to Twilio, False when any error
            occurred (the error is logged, never raised).
        """
        try:
            logger.info(f"Attempting to send SMS to: {to_number}")
            recipient = self._normalize_number(
                to_number, self.default_country_code
            )
            # Keep the API response in its own name instead of overwriting
            # the ``message`` argument.
            sent = self.client.messages.create(
                body=message,
                from_=self.from_number,
                to=recipient,
            )
            logger.info(f"SMS sent successfully! Message SID: {sent.sid}")
            return True
        except Exception as e:
            logger.error(f"Failed to send SMS: {str(e)}", exc_info=True)
            return False
class ScamDetector:
    """Extract text from scam-SMS screenshots, save the results and report by SMS.

    For every image URL the image is downloaded, run through Tesseract OCR and
    the raw text is cleaned up by a Groq chat completion. Results are written
    below ``<cwd>/data`` and a summary is sent through the given SMS sender.
    """

    # Recipient of the run report when none is passed to the constructor.
    DEFAULT_REPORT_NUMBER = "8140030507"
    # Groq model used to tidy the OCR output.
    GROQ_MODEL = "llama3-8b-8192"

    def __init__(self, groq_api_key, sms_sender, report_number=None):
        """Create the Groq client and prepare the output directories.

        Args:
            groq_api_key: API key passed to the Groq client.
            sms_sender: Object exposing ``send_sms(to_number, message)``.
            report_number: Recipient of the run report; defaults to
                ``DEFAULT_REPORT_NUMBER``.
        """
        self.groq_client = Groq(api_key=groq_api_key)
        self.sms_sender = sms_sender
        self.report_number = report_number or self.DEFAULT_REPORT_NUMBER
        self.base_path = os.path.join(os.getcwd(), 'data')
        setup_directories()

    def process_text_with_groq(self, text):
        """Ask Groq to clean up OCR output.

        Returns the formatted text, or the unmodified *text* when the request
        fails or the response carries no content.
        """
        try:
            prompt = (
                "\n"
                "Format the following extracted text from an SMS image.\n"
                "Keep the original content intact but improve the formatting"
                " and remove any OCR artifacts:\n"
                f"{text}\n"
            )
            completion = self.groq_client.chat.completions.create(
                model=self.GROQ_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3,
                max_tokens=1024,
                top_p=1,
                stream=False,
                stop=None,
            )
            content = completion.choices[0].message.content
            return content.strip() if content else text
        except Exception as e:
            logger.error(f"Error in Groq processing: {str(e)}")
            return text

    def _save_image(self, img):
        """Store *img* under ``data/images``; failures are logged, not raised."""
        # Microseconds are part of the name because a second-resolution stamp
        # makes images handled within the same second overwrite each other.
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
        img_path = os.path.join(self.base_path, 'images', f"image_{stamp}.png")
        try:
            img.save(img_path)
            logger.info(f"Image saved: {img_path}")
        except Exception as e:
            # Keeping a copy is best-effort; OCR can still run on the
            # in-memory image.
            logger.warning(f"Could not save image to {img_path}: {str(e)}")

    def download_and_extract_text(self, url, save_image=True):
        """Download the image at *url*, OCR it and return the cleaned text.

        Args:
            url: Address of the image to fetch.
            save_image: When true, a PNG copy is written under ``data/images``.

        Returns:
            The Groq-formatted text, or None when the download or OCR failed
            or the image contained no text.
        """
        try:
            response = requests.get(url, timeout=10)
            # Fail with the HTTP error itself instead of trying to decode an
            # error page as an image.
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))

            # Save image if requested
            if save_image:
                self._save_image(img)

            text = pytesseract.image_to_string(img).strip()
            if text:
                return self.process_text_with_groq(text)
        except Exception as e:
            logger.error(f"Error processing image from {url}: {str(e)}")
        return None

    def scrape_images(self):
        """Return the image URLs to analyse.

        The scraping implementation is not present in this file: only the
        placeholder comment below was left where the body should be, which
        made the module fail to compile. Until the real code is restored the
        method fails loudly instead of guessing at the scraping logic.

        Raises:
            NotImplementedError: Always.
        """
        # ... (rest of the scraping code remains the same)
        raise NotImplementedError(
            "ScamDetector.scrape_images has no implementation in this file; "
            "restore the scraping code before running a detection pass"
        )

    @staticmethod
    def _build_report_message(timestamp, url_count, text_count):
        """Return the SMS summary text for one run."""
        return (
            "\n"
            "Scam Detector Run Report\n"
            f"Time: {timestamp}\n"
            f"Total URLs found: {url_count}\n"
            f"Total texts extracted: {text_count}\n"
            "Files saved in local directories\n"
        )

    def process_and_save(self, image_urls):
        """Process images and save results locally.

        Args:
            image_urls: Iterable of image URLs to download and OCR.

        Returns:
            Tuple ``(url_path, text_path, csv_path)`` of the files written.
        """
        logger.info("Extracting text from images...")
        # Materialise once: the URLs are counted and iterated more than once.
        image_urls = list(image_urls)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        image_texts = []
        for position, url in enumerate(image_urls, start=1):
            logger.info(
                f"Processing image {position}/{len(image_urls)} from URL: {url}"
            )
            text = self.download_and_extract_text(url)
            if text:
                image_texts.append({'url': url, 'text': text})

        # Save files in local directories
        texts_dir = os.path.join(self.base_path, 'texts')
        reports_dir = os.path.join(self.base_path, 'reports')
        os.makedirs(texts_dir, exist_ok=True)
        os.makedirs(reports_dir, exist_ok=True)
        url_path = os.path.join(texts_dir, f'scam_urls_{timestamp}.txt')
        text_path = os.path.join(texts_dir, f'scam_texts_{timestamp}.txt')
        csv_path = os.path.join(reports_dir, f'scam_report_{timestamp}.csv')

        # Save URLs
        with open(url_path, 'w', encoding='utf-8') as f:
            f.writelines(f"{url}\n" for url in image_urls)

        # Save extracted texts
        with open(text_path, 'w', encoding='utf-8') as f:
            for item in image_texts:
                f.write(f"URL: {item['url']}\n")
                f.write(f"Text:\n{item['text']}\n")
                f.write("-" * 80 + "\n")

        # Save CSV report. Naming the columns keeps the header row in place
        # even when no text could be extracted.
        df = pd.DataFrame(image_texts, columns=['url', 'text'])
        df.to_csv(csv_path, index=False)

        # Send SMS report
        message = self._build_report_message(
            timestamp, len(image_urls), len(image_texts)
        )
        delivered = self.sms_sender.send_sms(
            to_number=self.report_number,
            message=message,
        )
        if not delivered:
            logger.warning("Run report SMS could not be sent")

        return url_path, text_path, csv_path
def main():
    """Run one detection pass: scrape, OCR, save results and send the report.

    A log file dedicated to this run is written to ``logs/`` in addition to
    the handlers configured at module level. The Groq API key is read from
    the ``GROQ_API_KEY`` environment variable so it does not have to be
    stored in the source file.

    Returns:
        0 when the run completed, 1 when any error occurred (the error is
        logged with its traceback rather than raised).
    """
    run_log_handler = None
    try:
        logger.info("Starting the scam detection process...")

        # Create timestamp for this run
        run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Setup run-specific logging. The directory is created here because
        # FileHandler fails when its parent folder does not exist.
        os.makedirs('logs', exist_ok=True)
        log_path = os.path.join('logs', f'scam_run_{run_timestamp}.log')
        run_log_handler = logging.FileHandler(log_path)
        run_log_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(run_log_handler)
        logger.info(f"Starting new detection run at {run_timestamp}")

        groq_api_key = os.environ.get("GROQ_API_KEY")
        if not groq_api_key:
            raise RuntimeError("GROQ_API_KEY environment variable is not set")

        sms_sender = SMSSender()
        detector = ScamDetector(groq_api_key=groq_api_key, sms_sender=sms_sender)

        logger.info("Starting image scraping...")
        image_urls = detector.scrape_images()
        logger.info(f"Found {len(image_urls)} unique images")

        url_path, text_path, csv_path = detector.process_and_save(image_urls)
        logger.info("Results saved locally and SMS sent!")
        logger.info(f"Output files: {url_path}, {text_path}, {csv_path}")
        logger.info("Detection run completed")
        print("Detection run completed successfully.")
        return 0
    except Exception as e:
        # exc_info=True already records the traceback through every handler,
        # including the console one.
        logger.error(f"An error occurred: {str(e)}", exc_info=True)
        return 1
    finally:
        # Detach and close the per-run handler so repeated calls neither leak
        # file descriptors nor write later runs into this run's log file.
        if run_log_handler is not None:
            logger.removeHandler(run_log_handler)
            run_log_handler.close()


if __name__ == "__main__":
    # Propagate failure to the caller through the process exit status.
    sys.exit(main())