# Web-page residue captured together with the file (not part of the program):
# "Spaces: Running"
import asyncio
import base64
import io
import logging
import os
import time
import webbrowser

import PyPDF2
import pytesseract
import requests
from docx import Document
from fastapi import Depends, FastAPI, HTTPException, Request, WebSocket
from fastapi.security import OAuth2PasswordBearer
from PIL import Image
app = FastAPI()
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")

# Google OAuth client settings (values come from the Google Developer Console).
# Each one can be overridden through an environment variable of the same name;
# the literals are only the fallback used when the variable is not set.
# NOTE(security): the fallback client secret is committed to source control.
# It should be rotated and supplied through the environment only.
GOOGLE_CLIENT_ID = os.environ.get(
    "GOOGLE_CLIENT_ID",
    "485753721652-5uta3e18va2g6cnkldib2d68q39t4vod.apps.googleusercontent.com",
)
GOOGLE_CLIENT_SECRET = os.environ.get(
    "GOOGLE_CLIENT_SECRET",
    "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",
)
GOOGLE_REDIRECT_URI = os.environ.get(
    "GOOGLE_REDIRECT_URI",
    "https://omkar008-receipt-radar-test.hf.space/test",
)

# Configure the logger
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
async def login_google():
    """Build the Google OAuth consent URL for sign-in with read-only Gmail access.

    The URL asks for an authorization code (``response_type=code``) with the
    ``openid``, ``profile``, ``email`` and ``gmail.readonly`` scopes and
    offline access, using the module-level client id and redirect URI.

    Returns:
        dict: ``{"url_hr": <consent URL>}``.
    """
    consent_url = (
        "https://accounts.google.com/o/oauth2/auth"
        "?response_type=code"
        f"&client_id={GOOGLE_CLIENT_ID}"
        f"&redirect_uri={GOOGLE_REDIRECT_URI}"
        "&scope=openid%20profile%20email%20https://www.googleapis.com/auth/gmail.readonly"
        "&access_type=offline"
    )
    return dict(url_hr=consent_url)
async def test_google(code: str):
    """Exchange an authorization code for a token and count receipt attachments.

    Posts ``code`` to Google's token endpoint, then searches the user's Gmail
    for messages with attachments whose subject contains a receipt keyword
    (several languages) followed by the hard-coded brand ``"louis vuitton"``.
    Every attachment of every matching message is downloaded.

    Args:
        code: OAuth authorization code to exchange.

    Returns:
        dict: ``{"attachment_count": int, "attachment_content": dict}`` where
        ``attachment_content`` maps each attachment filename to the first 10
        characters of its base64 data. A later attachment with the same
        filename overwrites the earlier entry.

    Raises:
        HTTPException: 400 when the token response carries no access token.
    """
    gmail_api = "https://www.googleapis.com/gmail/v1/users/me"
    # Without a timeout a stalled Google endpoint would hang the request forever.
    timeout_s = 30
    # `requests` is blocking; every call runs in the default executor so the
    # event loop stays free to serve other clients.
    loop = asyncio.get_running_loop()

    token_form = {
        "code": code,
        "client_id": GOOGLE_CLIENT_ID,
        "client_secret": GOOGLE_CLIENT_SECRET,
        "redirect_uri": GOOGLE_REDIRECT_URI,
        "grant_type": "authorization_code",
    }

    def exchange_code():
        response = requests.post(
            "https://accounts.google.com/o/oauth2/token",
            data=token_form,
            timeout=timeout_s,
        )
        return response.json()

    token_payload = await loop.run_in_executor(None, exchange_code)
    access_token = token_payload.get("access_token")
    # The authorization code and the access token are credentials: never log them.
    if not access_token:
        raise HTTPException(
            status_code=400,
            detail="Could not exchange the authorization code for an access token",
        )

    auth_headers = {"Authorization": f"Bearer {access_token}"}

    def fetch_json(url, params=None):
        response = requests.get(
            url, headers=auth_headers, params=params, timeout=timeout_s
        )
        return response.json()

    brand_name = "louis vuitton"
    subject_terms = (
        "receipt", "receipts",
        "reçu", "reçus",
        "Quittung", "Quittungen",
        "aankoopbon", "aankoopbonnen",
        "recibo", "recibos",
        "ricevuta", "ricevute",
        "ontvangstbewijs", "ontvangstbewijzen",
    )
    user_query = (
        "subject:("
        + " OR ".join(f"({term} {brand_name})" for term in subject_terms)
        + ") has:attachment"
    )

    # Follow nextPageToken until the listing is exhausted. The query goes
    # through `params` so spaces, parentheses and accents are URL-encoded.
    messages = []
    page_token = None
    while True:
        params = {"q": user_query}
        if page_token:
            params["pageToken"] = page_token
        page = await loop.run_in_executor(
            None, fetch_json, f"{gmail_api}/messages", params
        )
        messages.extend(page.get("messages", []))
        page_token = page.get("nextPageToken")
        if not page_token:
            break

    attachment_no = 0
    data_new = {}
    for message in messages:
        message_id = message.get("id") if message else None
        if not message_id:
            continue
        logger.debug("Fetching message %s", message_id)
        message_data = await loop.run_in_executor(
            None, fetch_json, f"{gmail_api}/messages/{message_id}"
        )
        for part in message_data.get("payload", {}).get("parts", []):
            attachment_id = part.get("body", {}).get("attachmentId")
            if not attachment_id:
                continue
            attachment_data = await loop.run_in_executor(
                None,
                fetch_json,
                f"{gmail_api}/messages/{message_id}/attachments/{attachment_id}",
            )
            encoded = attachment_data.get("data")
            if not encoded:
                continue
            filename = part.get("filename", "untitled.txt")
            # Only a 10-character preview is returned to keep the response small.
            data_new[filename] = encoded[:10]
            attachment_no += 1

    return {"attachment_count": attachment_no, "attachment_content": data_new}
async def auth_google(request: Request):
    """Return every receipt attachment found in Gmail for a given brand.

    The JSON request body must carry ``access_token`` (a Google OAuth access
    token) and ``brand_name``. Gmail is searched for messages with attachments
    whose subject contains a receipt keyword (several languages) followed by
    the brand name, and each attachment of each match is downloaded.

    Args:
        request: incoming request whose JSON body holds ``access_token`` and
            ``brand_name``.

    Returns:
        dict: ``{"attachment_count": int, "attachment_content": dict}`` where
        ``attachment_content`` maps each attachment filename to its base64
        data as returned by the Gmail API. A later attachment with the same
        filename overwrites the earlier entry.

    Raises:
        HTTPException: 400 when ``access_token`` or ``brand_name`` is missing.
    """
    gmail_api = "https://www.googleapis.com/gmail/v1/users/me"
    # Without a timeout a stalled Google endpoint would hang the request forever.
    timeout_s = 30

    body = await request.json()
    access_token_new = body.get("access_token")
    brand_name = body.get("brand_name")
    # The access token is a credential: never print or log its value.
    if not access_token_new:
        raise HTTPException(status_code=400, detail="Authorization code not provided")
    if not brand_name:
        # Otherwise the search would literally look for the word "None".
        raise HTTPException(status_code=400, detail="Brand name not provided")

    auth_headers = {"Authorization": f"Bearer {access_token_new}"}

    def fetch_json(url, params=None):
        response = requests.get(
            url, headers=auth_headers, params=params, timeout=timeout_s
        )
        return response.json()

    # `requests` is blocking; every call runs in the default executor so the
    # event loop stays free to serve other clients.
    loop = asyncio.get_running_loop()

    subject_terms = (
        "receipt", "receipts",
        "reçu", "reçus",
        "Quittung", "Quittungen",
        "aankoopbon", "aankoopbonnen",
        "recibo", "recibos",
        "ricevuta", "ricevute",
        "ontvangstbewijs", "ontvangstbewijzen",
    )
    user_query = (
        "subject:("
        + " OR ".join(f"({term} {brand_name})" for term in subject_terms)
        + ") has:attachment"
    )

    # Follow nextPageToken until the listing is exhausted. The query goes
    # through `params` so spaces, parentheses and accents are URL-encoded.
    messages = []
    page_token = None
    while True:
        params = {"q": user_query}
        if page_token:
            params["pageToken"] = page_token
        page = await loop.run_in_executor(
            None, fetch_json, f"{gmail_api}/messages", params
        )
        messages.extend(page.get("messages", []))
        page_token = page.get("nextPageToken")
        if not page_token:
            break

    attachment_no = 0
    data_new = {}
    for message in messages:
        message_id = message.get("id") if message else None
        if not message_id:
            continue
        logger.debug("Fetching message %s", message_id)
        message_data = await loop.run_in_executor(
            None, fetch_json, f"{gmail_api}/messages/{message_id}"
        )
        for part in message_data.get("payload", {}).get("parts", []):
            attachment_id = part.get("body", {}).get("attachmentId")
            if not attachment_id:
                continue
            attachment_data = await loop.run_in_executor(
                None,
                fetch_json,
                f"{gmail_api}/messages/{message_id}/attachments/{attachment_id}",
            )
            encoded = attachment_data.get("data")
            if not encoded:
                continue
            filename = part.get("filename", "untitled.txt")
            # The response carries the raw base64 payload only, so the
            # attachment is neither decoded nor text-extracted here.
            data_new[filename] = encoded
            attachment_no += 1

    return {"attachment_count": attachment_no, "attachment_content": data_new}
async def send_chunked_data(
    websocket: WebSocket,
    filename: str,
    data: str,
    *,
    chunk_size: int = 1024,
    delay: float = 0.4,
):
    """Stream ``data`` over ``websocket`` in fixed-size pieces.

    Each piece is sent as the JSON message
    ``{"filename": filename, "data_chunk": <piece>}``. After the last piece
    the text frame ``"FinishedThisAttachment"`` is sent; for empty ``data``
    that frame is the only thing sent.

    Args:
        websocket: connection to send on.
        filename: name reported alongside every chunk.
        data: payload to split.
        chunk_size: maximum number of characters per chunk.
        delay: seconds to sleep after each chunk.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    for start in range(0, len(data), chunk_size):
        piece = data[start:start + chunk_size]
        await websocket.send_json({"filename": filename, "data_chunk": piece})
        # Pause between chunks to throttle the stream.
        await asyncio.sleep(delay)

    await websocket.send_text("FinishedThisAttachment")
async def extract_text_from_pdf(pdf_data):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_data: raw bytes of the PDF document.

    Returns:
        str: the extracted text of all pages joined without a separator.
    """
    with io.BytesIO(pdf_data) as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` keeps the join safe should a page yield no text at all.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
async def extract_text_from_docx(docx_data):
    """Return the text of a .docx document, one paragraph per line.

    Args:
        docx_data: raw bytes of the .docx file.

    Returns:
        str: every paragraph's text followed by a newline, in document order.
    """
    document = Document(io.BytesIO(docx_data))
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
async def extract_text_from_attachment(filename, data):
    """Extract text from an attachment, choosing the parser by file extension.

    Args:
        filename: attachment name; only its extension is inspected.
        data: raw bytes of the attachment.

    Returns:
        str: the extracted text for ``.pdf`` and ``.docx`` files, otherwise
        the literal ``"Unsupported document type"``.
    """
    # Compare case-insensitively so "SCAN.PDF" is handled like "scan.pdf";
    # a missing filename is treated as an unsupported type.
    lowered = (filename or "").lower()
    if lowered.endswith(".pdf"):
        return await extract_text_from_pdf(data)
    if lowered.endswith(".docx"):
        return await extract_text_from_docx(data)
    # Add handling for other document types if needed
    return "Unsupported document type"
async def test_websocket(websocket: WebSocket):
    """Stream the user's Gmail receipt attachments over a WebSocket.

    Protocol, in order:
      1. The connection is accepted and one text frame is read; its content
         is used as the Google access token.
      2. ``{"total_messages": <n>}`` is sent as JSON, followed by the text
         frame ``"CompletedSendingTotalMessagesLength"``. ``n`` counts the
         matching messages after keeping one message per thread.
      3. Every attachment of those messages is streamed through
         ``send_chunked_data`` as its base64 data.
      4. The text frame ``"CompletedFetchingMessages"`` is sent and the
         connection is closed.

    Args:
        websocket: the client connection.
    """
    gmail_api = "https://www.googleapis.com/gmail/v1/users/me"
    # Without a timeout a stalled Google endpoint would hang the stream forever.
    timeout_s = 30

    await websocket.accept()
    logger.info("Hi hi succefull in connecting !!")
    data = await websocket.receive_text()
    # The frame is a bearer token: record that it arrived, never its value.
    logger.info("Received access token from client")

    def fetch_json(access_token, url, params=None):
        """Blocking GET against the Gmail API; returns the decoded JSON body."""
        response = requests.get(
            url,
            headers={"Authorization": f"Bearer {access_token}"},
            params=params,
            timeout=timeout_s,
        )
        return response.json()

    def get_messages(code: str):
        """List receipt messages with attachments, keeping one per thread."""
        logger.info("entered into the get_messages")
        jobs_query = "subject:receipt OR subject:receipts has:attachment"

        # Follow nextPageToken until the listing is exhausted. The query goes
        # through `params` so it is URL-encoded.
        messages = []
        page_token = None
        while True:
            params = {"q": jobs_query}
            if page_token:
                params["pageToken"] = page_token
            gmail_data = fetch_json(code, f"{gmail_api}/messages", params)
            messages.extend(gmail_data.get("messages", []))
            page_token = gmail_data.get("nextPageToken")
            if not page_token:
                break

        # Keep only the first listed message of every thread.
        seen_thread_ids = set()
        first_per_thread = []
        for entry in messages:
            thread_id = entry["threadId"]
            if thread_id not in seen_thread_ids:
                seen_thread_ids.add(thread_id)
                first_per_thread.append(entry)
        return first_per_thread

    async def event_generator(code: str):
        """Fetch the matching messages and push their attachments to the client."""
        logger.info("entered into the event_generator")
        # `requests` is blocking; every call runs in the default executor so
        # the event loop can keep servicing this and other connections.
        loop = asyncio.get_running_loop()

        messages = await loop.run_in_executor(None, get_messages, code)
        await websocket.send_json({"total_messages": len(messages)})
        await websocket.send_text("CompletedSendingTotalMessagesLength")

        for index, message in enumerate(messages):
            logger.info("Processing message %d: %s", index, message)
            message_id = message.get("id") if message else None
            if not message_id:
                continue
            message_data = await loop.run_in_executor(
                None, fetch_json, code, f"{gmail_api}/messages/{message_id}"
            )
            for part in message_data.get("payload", {}).get("parts", []):
                attachment_id = part.get("body", {}).get("attachmentId")
                if not attachment_id:
                    continue
                attachment_data = await loop.run_in_executor(
                    None,
                    fetch_json,
                    code,
                    f"{gmail_api}/messages/{message_id}/attachments/{attachment_id}",
                )
                encoded = attachment_data.get("data")
                if not encoded:
                    continue
                filename = part.get("filename", "untitled.txt")
                logger.info(filename)
                # The extracted text is only logged, so one unreadable
                # attachment must not abort the whole stream: log and go on.
                try:
                    attachment_content = base64.urlsafe_b64decode(encoded)
                    extracted_text = await extract_text_from_attachment(
                        filename, attachment_content
                    )
                except Exception:
                    logger.exception("Could not extract text from %s", filename)
                else:
                    logger.info(extracted_text)
                await send_chunked_data(websocket, filename, encoded)

        await websocket.send_text("CompletedFetchingMessages")

    await event_generator(data)
    logger.info("Closing connection")
    await websocket.close()