J.A.R.V.I.S / scam_detector.py
varun324242's picture
Upload folder using huggingface_hub
fe2a0f2 verified
import subprocess
import sys
import os
import requests
from PIL import Image
import pytesseract
from io import BytesIO
import pandas as pd
import json
from groq import Groq
from twilio.rest import Client
import logging
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
# Configure logging.
# logging.FileHandler opens its target file as soon as it is constructed, so
# the "logs" directory must already exist at import time. setup_directories()
# is only invoked later (from ScamDetector.__init__), which is too late for
# this handler on a fresh checkout.
os.makedirs('logs', exist_ok=True)
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/sms_debug.log'),  # persistent debug log
        logging.StreamHandler(),                    # mirror records to the console
    ],
)
# Module-level logger shared by every function and class in this file.
logger = logging.getLogger(__name__)
def setup_directories():
    """Make sure every folder this script writes into exists.

    Paths are relative to the current working directory. Folders that are
    already present are left untouched.
    """
    required_paths = (
        'logs',
        'data',
        'data/images',
        'data/texts',
        'data/reports',
    )
    for path in required_paths:
        os.makedirs(path, exist_ok=True)
    logger.info("Directory structure created")
class SMSSender:
    """Small wrapper around the Twilio REST client for sending text messages.

    Credentials are never embedded in the source. Each value is taken from the
    matching constructor argument or, when that is omitted, from an
    environment variable:

    * ``account_sid``  -> ``TWILIO_ACCOUNT_SID``
    * ``auth_token``   -> ``TWILIO_AUTH_TOKEN``
    * ``from_number``  -> ``TWILIO_FROM_NUMBER``
    """

    # Prefix added to recipient numbers that are given without a leading "+".
    DEFAULT_COUNTRY_CODE = "+91"

    def __init__(self, account_sid=None, auth_token=None, from_number=None):
        """Resolve the Twilio configuration and create the client.

        Args:
            account_sid: Twilio account SID; falls back to ``TWILIO_ACCOUNT_SID``.
            auth_token: Twilio auth token; falls back to ``TWILIO_AUTH_TOKEN``.
            from_number: Sender number; falls back to ``TWILIO_FROM_NUMBER``.

        Raises:
            ValueError: If any of the three settings cannot be resolved.
        """
        self.account_sid = account_sid or os.environ.get("TWILIO_ACCOUNT_SID")
        self.auth_token = auth_token or os.environ.get("TWILIO_AUTH_TOKEN")
        self.from_number = from_number or os.environ.get("TWILIO_FROM_NUMBER")

        settings = (
            ("TWILIO_ACCOUNT_SID", self.account_sid),
            ("TWILIO_AUTH_TOKEN", self.auth_token),
            ("TWILIO_FROM_NUMBER", self.from_number),
        )
        missing = [name for name, value in settings if not value]
        if missing:
            # Fail early with an actionable message instead of letting the
            # first API call fail with an opaque authentication error.
            raise ValueError(
                "Missing Twilio configuration: " + ", ".join(missing)
            )

        self.client = Client(self.account_sid, self.auth_token)

    @classmethod
    def _normalize_number(cls, to_number):
        """Return ``to_number`` as a string carrying a leading "+" prefix.

        Numbers without a leading "+" get ``DEFAULT_COUNTRY_CODE`` prepended.

        Raises:
            ValueError: If ``to_number`` is ``None`` or blank.
        """
        number = "" if to_number is None else str(to_number).strip()
        if not number:
            raise ValueError("Recipient number is empty")
        if not number.startswith('+'):
            number = f"{cls.DEFAULT_COUNTRY_CODE}{number}"
        return number

    def send_sms(self, to_number, message):
        """Send ``message`` to ``to_number`` using Twilio.

        Args:
            to_number: Recipient number; see ``_normalize_number`` for how a
                missing "+" prefix is handled.
            message: Text body of the SMS.

        Returns:
            True when Twilio accepted the message, False on any failure
            (the failure is logged together with its traceback).
        """
        try:
            logger.info(f"Attempting to send SMS to: {to_number}")
            recipient = self._normalize_number(to_number)
            # Keep the API response in its own name so the ``message``
            # argument is not overwritten.
            response = self.client.messages.create(
                body=message,
                from_=self.from_number,
                to=recipient,
            )
            logger.info(f"SMS sent successfully! Message SID: {response.sid}")
            return True
        except Exception as e:
            # Sending is best-effort: report the problem and let the caller
            # decide what to do with the False result.
            logger.error(f"Failed to send SMS: {str(e)}", exc_info=True)
            return False
class ScamDetector:
    """Extract text from SMS screenshots, save the results and report by SMS.

    The detector downloads images, runs OCR on them with pytesseract, asks a
    Groq chat model to tidy the OCR output, stores URLs/texts/CSV under the
    local ``data`` directory and finally sends a short run summary through the
    supplied SMS sender.
    """

    # Groq chat model used to clean up OCR output.
    GROQ_MODEL = "llama3-8b-8192"
    # Recipient of the run summary when process_and_save() is not given one.
    DEFAULT_REPORT_NUMBER = "8140030507"

    def __init__(self, groq_api_key, sms_sender):
        """Create the Groq client and make sure the output folders exist.

        Args:
            groq_api_key: API key passed to the Groq client.
            sms_sender: Object exposing ``send_sms(to_number, message)``.
        """
        self.groq_client = Groq(api_key=groq_api_key)
        self.sms_sender = sms_sender
        self.base_path = os.path.join(os.getcwd(), 'data')
        setup_directories()

    @staticmethod
    def _build_prompt(text):
        """Return the formatting prompt sent to Groq for ``text``."""
        return (
            "\n"
            "Format the following extracted text from an SMS image.\n"
            "Keep the original content intact but improve the formatting "
            "and remove any OCR artifacts:\n"
            f"{text}\n"
        )

    @staticmethod
    def _build_report_message(timestamp, url_count, text_count):
        """Return the body of the run-summary SMS."""
        return (
            "\n"
            "Scam Detector Run Report\n"
            f"Time: {timestamp}\n"
            f"Total URLs found: {url_count}\n"
            f"Total texts extracted: {text_count}\n"
            "Files saved in local directories\n"
        )

    def process_text_with_groq(self, text):
        """Ask the Groq model to clean up OCR output.

        Args:
            text: Raw text extracted from an image.

        Returns:
            The model's reformatted text, or ``text`` unchanged when the
            request fails for any reason.
        """
        try:
            completion = self.groq_client.chat.completions.create(
                model=self.GROQ_MODEL,
                messages=[{"role": "user", "content": self._build_prompt(text)}],
                temperature=0.3,
                max_tokens=1024,
                top_p=1,
                stream=False,
                stop=None,
            )
            return completion.choices[0].message.content.strip()
        except Exception as e:
            # Formatting is an optional improvement; fall back to the raw OCR
            # text instead of losing the result.
            logger.error(f"Error in Groq processing: {str(e)}")
            return text

    def download_and_extract_text(self, url, save_image=True):
        """Download an image, optionally store it, and return its OCR text.

        Args:
            url: Address of the image to download.
            save_image: When True, a PNG copy is written to ``data/images``.

        Returns:
            The Groq-formatted text, or None when the download/OCR fails or
            the image contains no text.
        """
        try:
            response = requests.get(url, timeout=10)
            # Surface HTTP errors as such rather than as a confusing
            # "cannot identify image file" from PIL on an error page.
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))

            if save_image:
                # Microseconds keep names unique when several images are
                # processed within the same second.
                stamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
                img_path = os.path.join(
                    self.base_path, 'images', f"image_{stamp}.png"
                )
                img.save(img_path)
                logger.info(f"Image saved: {img_path}")

            text = pytesseract.image_to_string(img).strip()
            if text:
                return self.process_text_with_groq(text)
        except Exception as e:
            logger.error(f"Error processing image from {url}: {str(e)}")
        return None

    def scrape_images(self):
        """Return the image URLs to analyse.

        NOTE(review): this copy of the file contains only a placeholder
        comment where the scraping code should be, so the method cannot be
        reconstructed here. It raises instead of silently returning nothing;
        restore the original implementation before running the script.
        The caller takes ``len()`` of the result and passes it on to
        ``process_and_save``.

        Raises:
            NotImplementedError: Always, until the implementation is restored.
        """
        # ... (rest of the scraping code remains the same)
        raise NotImplementedError(
            "scrape_images: the scraping implementation is missing from this file"
        )

    def process_and_save(self, image_urls, to_number=None):
        """Process images and save results locally.

        Args:
            image_urls: Iterable of image URLs; None is treated as empty.
            to_number: Recipient of the summary SMS. Defaults to
                ``DEFAULT_REPORT_NUMBER``.

        Returns:
            Tuple ``(url_path, text_path, csv_path)`` of the files written.
        """
        # Materialise once: the URLs are iterated twice and counted, and a
        # failed scrape may hand over None.
        image_urls = list(image_urls or [])
        recipient = self.DEFAULT_REPORT_NUMBER if to_number is None else to_number

        logger.info("Extracting text from images...")
        image_texts = []
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        total = len(image_urls)

        for position, url in enumerate(image_urls, start=1):
            logger.info(f"Processing image {position}/{total} from URL: {url}")
            text = self.download_and_extract_text(url)
            if text:
                image_texts.append({'url': url, 'text': text})

        # Save files in local directories
        texts_dir = os.path.join(self.base_path, 'texts')
        url_path = os.path.join(texts_dir, f'scam_urls_{timestamp}.txt')
        text_path = os.path.join(texts_dir, f'scam_texts_{timestamp}.txt')
        csv_path = os.path.join(
            self.base_path, 'reports', f'scam_report_{timestamp}.csv'
        )

        # Save URLs (explicit encoding, consistent with the text file below)
        with open(url_path, 'w', encoding='utf-8') as f:
            f.writelines(url + '\n' for url in image_urls)

        # Save extracted texts
        with open(text_path, 'w', encoding='utf-8') as f:
            for item in image_texts:
                f.write(f"URL: {item['url']}\n")
                f.write(f"Text:\n{item['text']}\n")
                f.write("-" * 80 + "\n")

        # Save CSV report; naming the columns keeps the header row even when
        # no text could be extracted.
        df = pd.DataFrame(image_texts, columns=['url', 'text'])
        df.to_csv(csv_path, index=False)

        # Send SMS report
        self.sms_sender.send_sms(
            to_number=recipient,
            message=self._build_report_message(
                timestamp, total, len(image_texts)
            ),
        )
        return url_path, text_path, csv_path
def main():
    """Run one complete detection pass: scrape, OCR, save, notify.

    The Groq API key is read from the ``GROQ_API_KEY`` environment variable
    rather than being stored in the source. Every failure is logged with its
    traceback and swallowed, so the function always returns None.
    """
    run_log_handler = None
    try:
        logger.info("Starting the scam detection process...")

        # Create timestamp for this run
        run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Setup run-specific logging. FileHandler opens the file right away,
        # so the directory has to exist first.
        os.makedirs('logs', exist_ok=True)
        log_path = os.path.join('logs', f'scam_run_{run_timestamp}.log')
        run_log_handler = logging.FileHandler(log_path)
        run_log_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(run_log_handler)
        logger.info(f"Starting new detection run at {run_timestamp}")

        groq_api_key = os.environ.get("GROQ_API_KEY")
        if not groq_api_key:
            raise RuntimeError("GROQ_API_KEY environment variable is not set")

        sms_sender = SMSSender()
        detector = ScamDetector(groq_api_key=groq_api_key, sms_sender=sms_sender)

        logger.info("Starting image scraping...")
        # Treat a scraper that found nothing (None) like an empty result.
        image_urls = detector.scrape_images() or []
        logger.info(f"Found {len(image_urls)} unique images")

        url_path, text_path, csv_path = detector.process_and_save(image_urls)
        # Report what is actually known here: the files that were written.
        # Whether the SMS went out is logged by the sender itself.
        logger.info(f"Results saved locally: {url_path}, {text_path}, {csv_path}")
        logger.info("Detection run completed")
        print("Detection run completed successfully.")
    except Exception as e:
        # exc_info=True already emits the full traceback through the
        # configured handlers, so no separate traceback printing is needed.
        logger.error(f"An error occurred: {str(e)}", exc_info=True)
    finally:
        # Detach and close the per-run handler; otherwise repeated calls
        # accumulate handlers, duplicate log lines and leak file handles.
        if run_log_handler is not None:
            logger.removeHandler(run_log_handler)
            run_log_handler.close()
# Start a detection run only when this file is executed as a script;
# importing it as a module must not trigger one.
if __name__ == "__main__":
    main()