Omkar008 committed on
Commit
e6e2b35
1 Parent(s): 3df0c32

Upload get_gmail_data.py

Browse files
Files changed (1) hide show
  1. get_gmail_data.py +223 -0
get_gmail_data.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import base64
3
+ from bs4 import BeautifulSoup
4
+ import re
5
+ import jwt
6
class GmailDataExtractor:
    """Fetch receipt-like Gmail messages and extract subject, body and attachment.

    The extractor queries the Gmail REST API for messages that look like
    receipts/invoices (optionally narrowed by a subject keyword), downloads each
    message and returns its subject, plain text body and first attachment.

    Attributes:
        error: ``"Error"`` when no token was supplied to the constructor,
            otherwise ``None``. Callers are expected to check it before use.
    """

    _GMAIL_MESSAGES_URL = "https://www.googleapis.com/gmail/v1/users/me/messages"
    # Gmail search expression selecting receipt/invoice style messages.
    _RECEIPT_QUERY = (
        "(label:^smartlabel_receipt OR (subject:your AND subject:order) "
        "OR subject:receipts OR subject:receipt OR subject:invoice)"
    )
    # Seconds to wait for the API before giving up instead of hanging forever.
    _REQUEST_TIMEOUT = 30

    def __init__(self, jwt: str, user_input: str = None) -> None:
        """Store the credentials and the optional subject filter.

        Args:
            jwt: Token used to authenticate against the Gmail API. Note that
                inside this method the parameter shadows the ``jwt`` module.
            user_input: Optional keyword that must also appear in the subject.
        """
        # Always initialise every attribute so that later method calls fail
        # with a clear message rather than an AttributeError.
        self.__jwt = jwt
        self.__user_input = user_input
        self.error = "Error" if jwt is None else None
        # NOTE(review): hard-coded signing secret committed to source control.
        # It should be rotated and loaded from configuration/environment.
        self.__secret_key = 'nkldjlncbamjlklwjeklwu24898h*&#Ujnfjf34893U5HSJFBSKFSHFNSK*$*W_ 3OWU'

    def __validate_jwt_token(self):
        """Decode the stored JWT and return the access token it carries.

        Returns:
            str: The ``access_token`` claim of the verified token.

        Raises:
            ValueError: If the token is expired, fails verification or does
                not contain an ``access_token`` claim.
        """
        try:
            payload = jwt.decode(self.__jwt, self.__secret_key, algorithms=["HS256"])
        except jwt.ExpiredSignatureError as exc:
            raise ValueError("Invalid JWT token: Expired token") from exc
        except jwt.InvalidTokenError as exc:
            raise ValueError("Invalid JWT token: Token verification failed") from exc

        access_token = payload.get("access_token")
        if not access_token:
            raise ValueError("Invalid JWT token: Missing access token")
        return access_token

    def _auth_headers(self) -> dict:
        """Return the HTTP headers that authenticate a Gmail API request."""
        # NOTE: JWT validation is currently bypassed for testing purposes; the
        # value passed to the constructor is used directly as the access token.
        # Swap in self.__validate_jwt_token() here once JWTs are issued.
        return {"Authorization": f"Bearer {self.__jwt}"}

    def _build_query(self) -> str:
        """Return the Gmail search query, narrowed by the user input if given."""
        if self.__user_input is not None:
            return f"{self._RECEIPT_QUERY} AND subject:{self.__user_input}"
        return self._RECEIPT_QUERY

    def _get_json(self, url: str, description: str, params: dict = None) -> dict:
        """GET ``url`` with authentication and return the decoded JSON body.

        Args:
            url: Gmail API endpoint to call.
            description: Short noun phrase used in the error message.
            params: Optional query-string parameters (URL-encoded by requests).

        Raises:
            RuntimeError: If the request fails or returns an error status.
        """
        try:
            response = requests.get(
                url,
                headers=self._auth_headers(),
                params=params,
                timeout=self._REQUEST_TIMEOUT,
            )
            response.raise_for_status()  # Raise error if the request fails
            return response.json()
        except requests.RequestException as e:
            raise RuntimeError(f"Error fetching {description} from Gmail API: {str(e)}") from e

    def __fetch_messages(self) -> list:
        """Fetch all messages matching the receipt query, following pagination.

        Returns:
            list: The message stubs (dicts holding at least an ``id``) returned
                by the Gmail API across all result pages.

        Raises:
            RuntimeError: If there is an issue while fetching messages from the Gmail API.
        """
        # Passing the query via params lets requests URL-encode it correctly.
        params = {"q": self._build_query()}
        messages = []
        while True:
            data = self._get_json(self._GMAIL_MESSAGES_URL, "messages", params=params)
            messages.extend(data.get("messages", []))
            page_token = data.get("nextPageToken")
            if not page_token:
                break
            params["pageToken"] = page_token
        return messages

    def __fetch_message_data(self, message_id: str) -> dict:
        """Fetch a full message resource from the Gmail API.

        Args:
            message_id (str): The ID of the message to fetch.

        Returns:
            dict: Message data retrieved from the Gmail API.

        Raises:
            RuntimeError: If there is an issue while fetching message data from the Gmail API.
        """
        message_url = f"{self._GMAIL_MESSAGES_URL}/{message_id}"
        return self._get_json(message_url, "message data")

    def __fetch_attachment_data(self, message_id: str, attachment_id: str) -> dict:
        """Fetch one attachment of a message from the Gmail API.

        Args:
            message_id (str): The ID of the message containing the attachment.
            attachment_id (str): The ID of the attachment to fetch.

        Returns:
            dict: Attachment data retrieved from the Gmail API.

        Raises:
            RuntimeError: If there is an issue while fetching attachment data from the Gmail API.
        """
        attachment_url = f"{self._GMAIL_MESSAGES_URL}/{message_id}/attachments/{attachment_id}"
        return self._get_json(attachment_url, "attachment data")

    @staticmethod
    def _extract_subject(payload: dict) -> str:
        """Return the value of the Subject header of a message payload, or ''."""
        # The payload headers are a list of {"name": ..., "value": ...} dicts.
        for header in payload.get("headers") or []:
            if header.get("name", "").lower() == "subject":
                return header.get("value", "")
        return ""

    @staticmethod
    def _decode_body(data: str) -> str:
        """Decode a base64url encoded body into text; '' for empty input."""
        if not data:
            return ""
        # base64url data may arrive without '=' padding; restore it first.
        padded = data + "=" * (-len(data) % 4)
        # Replace undecodable bytes rather than failing on non UTF-8 mails.
        return base64.urlsafe_b64decode(padded).decode("utf-8", errors="replace")

    @classmethod
    def _iter_parts(cls, payload: dict):
        """Yield the payload itself and every nested MIME part, depth first."""
        yield payload
        for part in payload.get("parts") or []:
            yield from cls._iter_parts(part)

    def __process_message(self, message: dict) -> tuple:
        """Download one message and extract its relevant content.

        Args:
            message (dict): The message stub to process (must hold an ``id``).

        Returns:
            tuple: ``(subject, body, links, attachment)`` where ``subject`` and
                ``body`` are strings, ``links`` is a list of ``(text, href)``
                tuples and ``attachment`` is ``{filename: base64url_data}`` for
                the first attachment that has data, or ``None``. When the
                message has no id the result is ``(None, None, [], None)``.

        Raises:
            RuntimeError: If there is an issue while fetching data from the Gmail API.
        """
        message_id = message.get("id")
        if not message_id:
            return None, None, [], None

        message_data = self.__fetch_message_data(message_id)
        payload = message_data.get("payload") or {}
        subject = self._extract_subject(payload)

        body = ''
        links = []
        attachment = None

        for part in self._iter_parts(payload):
            if 'mimeType' not in part:
                continue

            part_body = part.get('body') or {}

            if part['mimeType'] in ('text/plain', 'text/html'):
                decoded = self._decode_body(part_body.get('data', ''))
                # Skip empty parts: the extractor rejects empty content.
                # A later text part overrides an earlier one (HTML over plain).
                if decoded.strip():
                    body, links = self.extract_text_and_links(decoded)

            attachment_id = part_body.get('attachmentId')
            if attachment is None and attachment_id:
                attachment_data = self.__fetch_attachment_data(message_id, attachment_id)
                data = attachment_data.get("data", "")
                if data:
                    # Keep the complete base64url payload of the attachment.
                    filename = part.get("filename") or "untitled.txt"
                    attachment = {filename: data}

        return subject, body, links, attachment

    @staticmethod
    def extract_text_and_links(html_content: str) -> tuple:
        """Extract the visible text and the hyperlinks from HTML content.

        Args:
            html_content (str): The HTML content to process.

        Returns:
            tuple: A tuple containing the extracted text (str) with whitespace
                collapsed, and the links as a list of ``(text, href)`` tuples.

        Raises:
            ValueError: If the input HTML content is empty or None.
        """
        if not html_content:
            raise ValueError("HTML content is empty or None")

        soup = BeautifulSoup(html_content, 'html.parser')

        # Extract text and collapse every whitespace run into a single space.
        text = re.sub(r'\s+', ' ', soup.get_text(separator=' ')).strip()

        # Extract links
        links = [(link.text, link['href']) for link in soup.find_all('a', href=True)]

        return text, links

    def extract_messages(self) -> dict:
        """Fetch and process every message matching the receipt query.

        Returns:
            dict: ``{"results": [...]}`` where each entry has the form
                ``{"subject": str, "body": str, "attachment_data": dict}`` and
                ``attachment_data`` maps the filename to base64url data (empty
                when the message has no attachment).

        Raises:
            RuntimeError: If no token was supplied or a Gmail API call fails.
        """
        print("entered the extract messages")
        if self.error is not None:
            raise RuntimeError("Error fetching messages from Gmail API: no access token provided")

        results = []
        for message in self.__fetch_messages():
            subject, body, _links, attachment_data = self.__process_message(message)

            # Normalise missing values so the output schema stays uniform.
            results.append({
                "subject": subject if subject is not None else "",
                "body": body if body is not None else "",
                "attachment_data": attachment_data if attachment_data is not None else {},
            })

        return {"results": results}
221
+
222
+ # obj = GmailDataExtractor("abcd","user_input")
223
+ # print(obj.error)