Omkar008 commited on
Commit
ea0ec6d
1 Parent(s): 4ace11e

Update controllers/fetch_microsoft_mails.py

Browse files
Files changed (1) hide show
  1. controllers/fetch_microsoft_mails.py +267 -2
controllers/fetch_microsoft_mails.py CHANGED
@@ -1,4 +1,3 @@
1
- @app.get("/emails")
2
  def get_emails(request: Request):
3
  access_token = request.session.get('access_token')
4
  if not access_token:
@@ -41,4 +40,270 @@ def get_emails(request: Request):
41
 
42
  emails.append(email_info)
43
 
44
- return {"Successfully extracted the messages!!"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  def get_emails(request: Request):
2
  access_token = request.session.get('access_token')
3
  if not access_token:
 
40
 
41
  emails.append(email_info)
42
 
43
+ return {"Successfully extracted the messages!!"}
44
+
45
+ import base64
46
+ import json
47
+ import logging
48
+ import re
49
+ from concurrent.futures import ThreadPoolExecutor
50
+ from typing import Optional, List, Dict
51
+ import requests
52
+ from bs4 import BeautifulSoup
53
+ from models.models import Message, Attachment
54
+ from fastapi import WebSocket
55
+ from services import utils as ut
56
+ import asyncio
57
+
58
+ def get_company_type(company_name:str)->str:
59
+ company_types_dict ={'ao yun': 'wines and spirit', 'ardbeg': 'wines and spirit', 'belvedere': 'wines and spirit', 'bodega numanthia': 'wines and spirit', 'chandon': 'wines and spirit', 'château cheval blanc': 'wines and spirit', "château d'yquem": 'wines and spirit', 'château galoupet': 'wines and spirit', 'cheval des andes': 'wines and spirit', 'clos19': 'wines and spirit', 'cloudy bay': 'wines and spirit', 'colgin cellars': 'wines and spirit', 'dom pérignon': 'wines and spirit', 'domaine des lambrays': 'wines and spirit', 'eminente': 'wines and spirit', 'glenmorangie': 'wines and spirit', 'hennessy': 'wines and spirit', 'joseph phelps': 'wines and spirit', 'krug': 'wines and spirit', 'mercier': 'wines and spirit', 'moët & chandon': 'wines and spirit', 'newton vineyard': 'wines and spirit', 'ruinart': 'wines and spirit', 'terrazas de los andes': 'wines and spirit', 'veuve clicquot': 'wines and spirit', 'volcan de mi tierra': 'wines and spirit', 'woodinville': 'wines and spirit' , 'berluti': 'Fashion & Leather Goods', 'celine': 'Fashion & Leather Goods', 'christian dior': 'Fashion & Leather Goods', 'emilio pucci': 'Fashion & Leather Goods', 'fendi': 'Fashion & Leather Goods', 'givenchy': 'Fashion & Leather Goods', 'kenzo': 'Fashion & Leather Goods', 'loewe': 'Fashion & Leather Goods', 'loro piana': 'Fashion & Leather Goods', 'louis vuitton': 'Fashion & Leather Goods', 'marc jacobs': 'Fashion & Leather Goods', 'moynat': 'Fashion & Leather Goods', 'patou': 'Fashion & Leather Goods', 'rimowa': 'Fashion & Leather Goods','acqua di parma': 'Perfumes & Cosmetics', 'benefit cosmetics': 'Perfumes & Cosmetics', 'cha ling': 'Perfumes & Cosmetics', 'fenty beauty by rihanna': 'Perfumes & Cosmetics', 'fresh': 'Perfumes & Cosmetics', 'givenchy parfums': 'Perfumes & Cosmetics', 'guerlain': 'Perfumes & Cosmetics', 'kenzo parfums': 'Perfumes & Cosmetics', 'kvd beauty': 'Perfumes & Cosmetics', 'loewe perfumes': 'Perfumes & Cosmetics', 'maison francis kurkdjian': 'Perfumes & Cosmetics', 'make up for ever': 'Perfumes & Cosmetics', 'officine universelle buly': 'Perfumes & Cosmetics', 'olehenriksen': 'Perfumes & Cosmetics', 'parfums christian dior': 'Perfumes & Cosmetics', 'stella by stella mccartney': 'Perfumes & Cosmetics','bulgari': 'Watches & Jewelry', 'chaumet': 'Watches & Jewelry', 'fred': 'Watches & Jewelry', 'hublot': 'Watches & Jewelry', 'repossi': 'Watches & Jewelry', 'tag heuer': 'Watches & Jewelry', 'tiffany & co.': 'Watches & Jewelry', 'zenith': 'Watches & Jewelry','24s': 'Selective retailing', 'dfs': 'Selective retailing', 'la grande epicerie de paris': 'Selective retailing', 'le bon marché rive gauche': 'Selective retailing', 'sephora': 'Selective retailing','belmond': 'Other activities', 'cheval blanc': 'Other activities', 'connaissance des arts': 'Other activities', 'cova': 'Other activities', 'investir': 'Other activities', "jardin d'acclimatation": 'Other activities', 'le parisien': 'Other activities', 'les echos': 'Other activities', 'radio classique': 'Other activities', 'royal van lent': 'Other activities'}
60
+ print(company_types_dict["louis vuitton"])
61
+ return company_types_dict.get(company_name.lower(), 'Others')
62
+
63
+ async def get_messages(code: str,websocket:WebSocket,brand_name: Optional[str] = None):
64
+ access_token = code
65
+ g_query = f'(subject:"your order" OR subject:receipts OR subject:receipt OR subject:aankoopbon OR subject:reçu OR subject:invoice OR subject:invoices OR category:purchases) has:attachment'
66
+ if brand_name is not None:
67
+ g_query = f'(subject:"your order" OR subject:receipts OR subject:receipt OR subject: aankoopbon OR subject:reçu OR subject:invoice OR subject:invoices OR category:purchases OR from:{brand_name}) AND subject:{brand_name} has:attachment'
68
+ page_token = None
69
+ messages = []
70
+ # max_results = 10
71
+ # gmail_url = f"https://www.googleapis.com/gmail/v1/users/me/messages?q={jobs_query}&maxResults={max_results}"
72
+ # gmail_response = requests.get(gmail_url, headers={"Authorization": f"Bearer {access_token}"})
73
+ # gmail_data = gmail_response.json()
74
+ # messages.append(gmail_data['messages'])
75
+ def fetch_message_wrapper(message_data):
76
+ message_id = message_data.get("id")
77
+ if message_id:
78
+ return fetch_message_data(access_token, message_id)
79
+
80
+ return None
81
+
82
+ while True:
83
+ gmail_url = f"https://www.googleapis.com/gmail/v1/users/me/messages?q={g_query}"
84
+ if page_token:
85
+ gmail_url += f"&pageToken={page_token}"
86
+
87
+ gmail_response = requests.get(gmail_url, headers={"Authorization": f"Bearer {access_token}"})
88
+ gmail_data = gmail_response.json()
89
+ print(len(gmail_data))
90
+ print(gmail_data)
91
+
92
+ if "messages" in gmail_data:
93
+ with ThreadPoolExecutor(max_workers=15) as executor:
94
+
95
+
96
+
97
+ futures=[executor.submit(fetch_message_wrapper, message_data) for message_data in
98
+ gmail_data["messages"]]
99
+ for future in futures:
100
+ message = future.result()
101
+ if message:
102
+ messages.append(message)
103
+ for message_data in messages:
104
+ await process_message(message_data,websocket,10000)
105
+
106
+ if "nextPageToken" in gmail_data:
107
+ page_token = gmail_data["nextPageToken"]
108
+ else:
109
+ break
110
+ print("printing messages")
111
+ print(messages)
112
+ return messages
113
+
114
+ async def process_message(message:Message, websocket:WebSocket, chunk_size:int):
115
+ logging.info("process_message")
116
+ if message:
117
+ message_json = message.to_json()
118
+ logging.info(f"{message_json}")
119
+ await send_message_in_chunks(websocket, message_json, chunk_size)
120
+ await websocket.send_text("NEXT_MESSAGE")
121
+
122
+
123
+ def fetch_message_data(access_token: str, message_id: str) -> Message:
124
+ message_url = f"https://www.googleapis.com/gmail/v1/users/me/messages/{message_id}"
125
+ message_response = requests.get(message_url, headers={"Authorization": f"Bearer {access_token}"})
126
+ message_data = message_response.json()
127
+ # print(message_data)
128
+ subject = extract_subject_from_mail(message_data)
129
+ company_from_mail = extract_domain_name(message_data['payload']['headers'], subject)
130
+
131
+ body = extract_body_from_mail(message_data)
132
+
133
+ attachments,structed_attachment_data = extract_attachments_from_mail(access_token, message_data)
134
+ high_level_company_type = get_company_type(company_from_mail)
135
+ # structed_attachment_data = extract_json_from_attachments(access_token , message_data)
136
+
137
+
138
+ body_len = 0
139
+ if body is not None :
140
+ body_len = len(body)
141
+
142
+ # print("subject: ")
143
+ # print(subject)
144
+ # print("company name: ")
145
+ # print(company_from_mail)
146
+ # print("Printing the body of the mail: ")
147
+ # print(body)
148
+ # print("Printing attachment Data: ")
149
+ # print(attachments)
150
+ # print("Completed this mail.")
151
+ return Message(message_id=message_id, body_len=body_len,body=body, attachments=attachments, company=company_from_mail,high_level_company_type=high_level_company_type,structured_data = structed_attachment_data)
152
+
153
+
154
+
155
+ def extract_subject_from_mail(message_data: dict) -> str:
156
+ if 'payload' in message_data and 'headers' in message_data['payload']:
157
+ headers = message_data['payload']['headers']
158
+ for header in headers:
159
+ if header['name'] == 'Subject':
160
+ return header['value']
161
+ return ""
162
+ else:
163
+ return ""
164
+
165
+
166
+ def extract_domain_name(payload: dict, subject: str) -> str:
167
+ domain_name = 'others'
168
+ for fromdata in payload:
169
+ if fromdata['name'] == 'From':
170
+ domain_name = extract_domain_from_email(fromdata['value'])
171
+ break
172
+ if 'chanel' in subject.lower():
173
+ return 'chanel'
174
+ if 'louis vuitton' in subject.lower():
175
+ return 'Louis Vuitton'
176
+ return domain_name
177
+
178
+
179
+ def extract_domain_from_email(email_string: str) -> Optional[str]:
180
+ email_address = re.search(r'[\w\.-]+@[\w\.-]+', email_string).group()
181
+ domain = email_address.split('@')[-1].split('.')[0]
182
+ if email_address and domain:
183
+ return domain
184
+ else:
185
+ return None
186
+
187
+
188
+ # def extract_body_from_mail(message_data: dict) -> str:
189
+ # body = None
190
+ # if "payload" in message_data and "parts" in message_data["payload"]:
191
+ # for part in message_data["payload"]["parts"]:
192
+ # if 'mimeType' in part and (part['mimeType'] == 'text/plain' or part['mimeType'] == 'text/html'):
193
+ # body_data = part['body'].get('data', '')
194
+ # body_base64 = base64.urlsafe_b64decode(body_data)
195
+ # body = extract_text(body_base64)
196
+ # return body
197
+
198
+
199
+ def extract_body_from_mail(message_data: dict) -> str:
200
+ body = None
201
+ if "payload" in message_data:
202
+ payload = message_data["payload"]
203
+ if "parts" in payload:
204
+ for part in payload["parts"]:
205
+ if 'mimeType' in part and (part['mimeType'] == 'text/plain' or part['mimeType'] == 'text/html'):
206
+ body_data = part['body'].get('data', '')
207
+ if body_data:
208
+ body_base64 = base64.urlsafe_b64decode(body_data)
209
+ body = extract_text(body_base64)
210
+
211
+ elif 'body' in payload:
212
+ body_data = payload['body'].get('data', '')
213
+ if body_data:
214
+ body_base64 = base64.urlsafe_b64decode(body_data)
215
+ body = extract_text(body_base64)
216
+ elif 'parts' in payload['body']:
217
+ for part in payload['body']['parts']:
218
+ if 'mimeType' in part and (part['mimeType'] == 'text/plain' or part['mimeType'] == 'text/html'):
219
+ body_data = part['body'].get('data', '')
220
+ if body_data:
221
+ body_base64 = base64.urlsafe_b64decode(body_data)
222
+ body = extract_text(body_base64)
223
+
224
+ if not body:
225
+ body = message_data.get('snippet', '')
226
+ return body
227
+
228
+
229
+ def fetch_attachment_data(access_token: str, message_id: str, attachment_id: str) -> Dict:
230
+ attachment_url = f"https://www.googleapis.com/gmail/v1/users/me/messages/{message_id}/attachments/{attachment_id}"
231
+ attachment_response = requests.get(attachment_url, headers={"Authorization": f"Bearer {access_token}"})
232
+ return attachment_response.json()
233
+
234
+
235
+ def extract_attachments_from_mail(access_token: str, message_data: dict) -> List[Attachment]:
236
+ attachments = []
237
+ structured_data = []
238
+ if "payload" in message_data and "parts" in message_data["payload"]:
239
+ for part in message_data["payload"]["parts"]:
240
+ if "body" in part and "attachmentId" in part["body"]:
241
+ attachment_id = part["body"]["attachmentId"]
242
+ attachment_data = fetch_attachment_data(access_token, message_data["id"], attachment_id)
243
+ filename = part.get("filename", "untitled.txt")
244
+ if filename.endswith(".zip") or filename.endswith(".txt") or filename.endswith(".png") or filename.endswith(".jpg") or filename.endswith(".jpeg") or filename.endswith(".gif"):
245
+ continue
246
+ data = attachment_data.get("data", "")
247
+ raw_text=ut.extract_text_from_attachment(filename , data)
248
+ struct_data = ut.strcuture_document_data(raw_text)
249
+ if struct_data:
250
+ structured_data.append(struct_data)
251
+
252
+ attachments.append(Attachment(attachment_len = len(attachment_data.get("data", "")),filename=filename, data=attachment_data.get("data", "")))
253
+ return attachments,structured_data
254
+
255
+
256
+ def extract_text(html_content: str) -> str:
257
+ if not html_content:
258
+ raise ValueError("HTML content is empty or None")
259
+
260
+ soup = BeautifulSoup(html_content, 'html.parser')
261
+ text = soup.get_text(separator=' ')
262
+ text = re.sub(r'\s+', ' ', text).strip()
263
+ return text
264
+
265
+
266
+ async def websocket_main(code: str, websocket: WebSocket,brand_name: Optional[str] = None):
267
+ access_token = code
268
+ # messages = get_messages(access_token,websocket,brand_name)
269
+ await get_messages(access_token,websocket,brand_name)
270
+ # print("websocket_main")
271
+ # print(messages)
272
+ # # logging.info(f"brand_name:{brand_name}")
273
+ # await websocket.send_json({"total_messages": len(messages)})
274
+ # print("Total Length of messages")
275
+ # print(len(messages))
276
+ # chunk_size = 100000
277
+ # i=0
278
+ # for message in messages:
279
+ # message_json = message.to_json()
280
+
281
+ # logging.info(f"{i} th message")
282
+ # i=i+1
283
+ # await send_message_in_chunks(websocket, message_json, chunk_size)
284
+ # await websocket.send_text("NEXT_MESSAGE")
285
+
286
+
287
+ await websocket.close()
288
+
289
+
290
+ async def send_message_in_chunks(websocket: WebSocket, message_json: dict, chunk_size: int):
291
+
292
+ # if message_json['attachments'] is not None :
293
+ # for attch in message_json['attachments']:
294
+ # attachment_len = attch['attachment_len']
295
+
296
+
297
+ # print(body_len)
298
+ # print(attachment_len)
299
+ # if attachment_len == 0:
300
+ # attachment_len = None
301
+ # await websocket.send_json({"body_len":body_len ,"attachment_len":attachment_len})
302
+
303
+ message_str = json.dumps(message_json)
304
+ # print("Printing message_str")
305
+ # print(message_str)
306
+ # logging.info(message_str)
307
+ # await websocket.send_json({"file_len":len(file)})
308
+ for i in range(0, len(message_str), chunk_size):
309
+ await websocket.send_text(message_str[i:i + chunk_size])