Omkar008 commited on
Commit
f99443e
1 Parent(s): ea0ec6d

Update controllers/fetch_microsoft_mails.py

Browse files
Files changed (1) hide show
  1. controllers/fetch_microsoft_mails.py +58 -41
controllers/fetch_microsoft_mails.py CHANGED
@@ -62,40 +62,35 @@ def get_company_type(company_name:str)->str:
62
 
63
  async def get_messages(code: str,websocket:WebSocket,brand_name: Optional[str] = None):
64
  access_token = code
65
- g_query = f'(subject:"your order" OR subject:receipts OR subject:receipt OR subject:aankoopbon OR subject:reçu OR subject:invoice OR subject:invoices OR category:purchases) has:attachment'
66
  if brand_name is not None:
67
- g_query = f'(subject:"your order" OR subject:receipts OR subject:receipt OR subject: aankoopbon OR subject:reçu OR subject:invoice OR subject:invoices OR category:purchases OR from:{brand_name}) AND subject:{brand_name} has:attachment'
68
- page_token = None
69
  messages = []
70
- # max_results = 10
71
- # gmail_url = f"https://www.googleapis.com/gmail/v1/users/me/messages?q={jobs_query}&maxResults={max_results}"
72
- # gmail_response = requests.get(gmail_url, headers={"Authorization": f"Bearer {access_token}"})
73
- # gmail_data = gmail_response.json()
74
- # messages.append(gmail_data['messages'])
75
  def fetch_message_wrapper(message_data):
76
  message_id = message_data.get("id")
 
 
 
77
  if message_id:
78
- return fetch_message_data(access_token, message_id)
79
 
80
  return None
81
 
82
  while True:
83
- gmail_url = f"https://www.googleapis.com/gmail/v1/users/me/messages?q={g_query}"
84
- if page_token:
85
- gmail_url += f"&pageToken={page_token}"
86
-
87
- gmail_response = requests.get(gmail_url, headers={"Authorization": f"Bearer {access_token}"})
88
- gmail_data = gmail_response.json()
89
- print(len(gmail_data))
90
- print(gmail_data)
91
 
92
- if "messages" in gmail_data:
93
  with ThreadPoolExecutor(max_workers=15) as executor:
94
 
95
 
96
 
97
  futures=[executor.submit(fetch_message_wrapper, message_data) for message_data in
98
- gmail_data["messages"]]
99
  for future in futures:
100
  message = future.result()
101
  if message:
@@ -120,15 +115,15 @@ async def process_message(message:Message, websocket:WebSocket, chunk_size:int):
120
  await websocket.send_text("NEXT_MESSAGE")
121
 
122
 
123
- def fetch_message_data(access_token: str, message_id: str) -> Message:
124
- message_url = f"https://www.googleapis.com/gmail/v1/users/me/messages/{message_id}"
125
- message_response = requests.get(message_url, headers={"Authorization": f"Bearer {access_token}"})
126
- message_data = message_response.json()
127
  # print(message_data)
128
- subject = extract_subject_from_mail(message_data)
129
- company_from_mail = extract_domain_name(message_data['payload']['headers'], subject)
130
 
131
- body = extract_body_from_mail(message_data)
132
 
133
  attachments,structed_attachment_data = extract_attachments_from_mail(access_token, message_data)
134
  high_level_company_type = get_company_type(company_from_mail)
@@ -235,22 +230,44 @@ def fetch_attachment_data(access_token: str, message_id: str, attachment_id: str
235
  def extract_attachments_from_mail(access_token: str, message_data: dict) -> List[Attachment]:
236
  attachments = []
237
  structured_data = []
238
- if "payload" in message_data and "parts" in message_data["payload"]:
239
- for part in message_data["payload"]["parts"]:
240
- if "body" in part and "attachmentId" in part["body"]:
241
- attachment_id = part["body"]["attachmentId"]
242
- attachment_data = fetch_attachment_data(access_token, message_data["id"], attachment_id)
243
- filename = part.get("filename", "untitled.txt")
244
- if filename.endswith(".zip") or filename.endswith(".txt") or filename.endswith(".png") or filename.endswith(".jpg") or filename.endswith(".jpeg") or filename.endswith(".gif"):
245
- continue
246
- data = attachment_data.get("data", "")
247
- raw_text=ut.extract_text_from_attachment(filename , data)
248
- struct_data = ut.strcuture_document_data(raw_text)
249
- if struct_data:
250
- structured_data.append(struct_data)
251
-
252
- attachments.append(Attachment(attachment_len = len(attachment_data.get("data", "")),filename=filename, data=attachment_data.get("data", "")))
 
 
 
 
 
253
  return attachments,structured_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
 
255
 
256
  def extract_text(html_content: str) -> str:
 
62
 
63
  async def get_messages(code: str,websocket:WebSocket,brand_name: Optional[str] = None):
64
  access_token = code
65
+ g_query = 'subject:"your order" OR subject:"receipts" OR subject:"receipt" OR subject:"aankoopbon" OR subject:"reçu" OR subject:"invoice" OR subject:"invoices" hasattachment:yes'
66
  if brand_name is not None:
67
+ g_query = f'(subject:"your order" OR subject:"receipts" OR subject:"receipt" OR subject: "aankoopbon" OR subject:"reçu" OR subject:"invoice" OR subject:"invoices" OR from:{brand_name}) AND subject:{brand_name} has:attachment'
 
68
  messages = []
 
 
 
 
 
69
  def fetch_message_wrapper(message_data):
70
  message_id = message_data.get("id")
71
+ message_subject = message_data.get('subject')
72
+ message_body_preview = message_data.get('bodyPreview')
73
+ from_mail = message_data['from']['emailAddress']['address']
74
  if message_id:
75
+ return fetch_message_data(access_token,message_id,message_data,message_subject,message_body_preview,from_mail)
76
 
77
  return None
78
 
79
  while True:
80
+ outlook_url = f"https://graph.microsoft.com/v1.0/me/messages?q={g_query}"
81
+
82
+ outlook_response = requests.get(gmail_url, headers={"Authorization": f"Bearer {access_token}"})
83
+ outlook_data = gmail_response.json()
84
+ print(len(outlook_data))
85
+ print(outlook_data)
 
 
86
 
87
+ if "value" in gmail_data:
88
  with ThreadPoolExecutor(max_workers=15) as executor:
89
 
90
 
91
 
92
  futures=[executor.submit(fetch_message_wrapper, message_data) for message_data in
93
+ outlook_data["value"]]
94
  for future in futures:
95
  message = future.result()
96
  if message:
 
115
  await websocket.send_text("NEXT_MESSAGE")
116
 
117
 
118
+ def fetch_message_data(access_token: str,message_id:str, message_data,message_subject:str , message_body_preview,message_data,from_mail:str) -> Message:
119
+ # message_url = f"https://www.googleapis.com/gmail/v1/users/me/messages/{message_id}"
120
+ # message_response = requests.get(message_url, headers={"Authorization": f"Bearer {access_token}"})
121
+ # message_data = message_response.json()
122
  # print(message_data)
123
+ subject = message_subject
124
+ company_from_mail = from_mail
125
 
126
+ body = message_body_preview
127
 
128
  attachments,structed_attachment_data = extract_attachments_from_mail(access_token, message_data)
129
  high_level_company_type = get_company_type(company_from_mail)
 
230
  def extract_attachments_from_mail(access_token: str, message_data: dict) -> List[Attachment]:
231
  attachments = []
232
  structured_data = []
233
+ message_id = message_data.get('id')
234
+ headers = {
235
+ 'Authorization': f'Bearer {access_token}',
236
+ 'Accept': 'application/json',
237
+ }
238
+ attachments_response = requests.get(f'https://graph.microsoft.com/v1.0/me/messages/{message_id}/attachments', headers=headers)
239
+ attachments = attachments_response.json().get('value', [])
240
+
241
+ for attachment in attachments:
242
+ attachment_id = attachment['id']
243
+ filename = attachment['name']
244
+ if filename.endswith(".zip") or filename.endswith(".txt") or filename.endswith(".png") or filename.endswith(".jpg") or filename.endswith(".jpeg") or filename.endswith(".gif"):
245
+ continue
246
+ attachment_response = requests.get(f'https://graph.microsoft.com/v1.0/me/messages/{message_id}/attachments/{attachment_id}/$value', headers=headers)
247
+ data = attachment_response.content
248
+ raw_text = ut.extract_text_from_attachment(filename , data)
249
+ struct_data = ut.strcuture_document_data(raw_text)
250
+ if struct_data:
251
+ structured_data.append(struct_data)
252
+ attachments.append(Attachment(attachment_len = len(data),filename=filename, data=data))
253
  return attachments,structured_data
254
+ #Gmail data fetching below
255
+ # if "payload" in message_data and "parts" in message_data["payload"]:
256
+ # for part in message_data["payload"]["parts"]:
257
+ # if "body" in part and "attachmentId" in part["body"]:
258
+ # attachment_id = part["body"]["attachmentId"]
259
+ # attachment_data = fetch_attachment_data(access_token, message_data["id"], attachment_id)
260
+ # filename = part.get("filename", "untitled.txt")
261
+ # if filename.endswith(".zip") or filename.endswith(".txt") or filename.endswith(".png") or filename.endswith(".jpg") or filename.endswith(".jpeg") or filename.endswith(".gif"):
262
+ # continue
263
+ # data = attachment_data.get("data", "")
264
+ # raw_text=ut.extract_text_from_attachment(filename , data)
265
+ # struct_data = ut.strcuture_document_data(raw_text)
266
+ # if struct_data:
267
+ # structured_data.append(struct_data)
268
+
269
+ # attachments.append(Attachment(attachment_len = len(attachment_data.get("data", "")),filename=filename, data=attachment_data.get("data", "")))
270
+ # return attachments,structured_data
271
 
272
 
273
  def extract_text(html_content: str) -> str: