Spaces:
Sleeping
Sleeping
Update get_gmail_data.py
Browse files- get_gmail_data.py +25 -5
get_gmail_data.py
CHANGED
@@ -148,11 +148,22 @@ class GmailDataExtractor:
|
|
148 |
|
149 |
if 'payload' in message_data and 'parts' in message_data['payload']:
|
150 |
parts = message_data['payload']['parts']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
for part in parts:
|
152 |
if 'mimeType' not in part:
|
153 |
continue
|
154 |
|
155 |
mime_type = part['mimeType']
|
|
|
156 |
if mime_type == 'text/plain' or mime_type == 'text/html':
|
157 |
body_data = part['body'].get('data', '')
|
158 |
body = base64.urlsafe_b64decode(body_data).decode('utf-8')
|
@@ -166,9 +177,18 @@ class GmailDataExtractor:
|
|
166 |
|
167 |
if data:
|
168 |
# Save only the first 10 characters of the attachment data
|
169 |
-
return subject,body ,{"filename":filename , "data":data}
|
|
|
|
|
170 |
|
171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
|
173 |
def extract_text_and_links(html_content: str) -> tuple:
|
174 |
"""
|
@@ -214,14 +234,14 @@ class GmailDataExtractor:
|
|
214 |
messages = self.__fetch_messages()
|
215 |
results = []
|
216 |
for message in messages:
|
217 |
-
subject, body, attachment_data = self.__process_message(message)
|
218 |
|
219 |
""" Handling None values """
|
220 |
-
subject = subject if subject is not None else ""
|
221 |
body = body if body is not None else None
|
222 |
attachment_data = attachment_data if attachment_data is not None else {}
|
|
|
223 |
|
224 |
-
results.append({"body": body, "attachment_data": [attachment_data]})
|
225 |
|
226 |
return {"results": results}
|
227 |
|
|
|
148 |
|
149 |
if 'payload' in message_data and 'parts' in message_data['payload']:
|
150 |
parts = message_data['payload']['parts']
|
151 |
+
payload = message_data['payload']
|
152 |
+
brand_from_gmail = ''
|
153 |
+
company_from_gmail = ''
|
154 |
+
if payload['headers']['name'] == 'from':
|
155 |
+
brand_from_gmail = payload['headers']['value']
|
156 |
+
company_from_gmail = extract_domain_from_email(brand_from_gmail)
|
157 |
+
else:
|
158 |
+
company_from_gmail = None
|
159 |
+
|
160 |
+
|
161 |
for part in parts:
|
162 |
if 'mimeType' not in part:
|
163 |
continue
|
164 |
|
165 |
mime_type = part['mimeType']
|
166 |
+
|
167 |
if mime_type == 'text/plain' or mime_type == 'text/html':
|
168 |
body_data = part['body'].get('data', '')
|
169 |
body = base64.urlsafe_b64decode(body_data).decode('utf-8')
|
|
|
177 |
|
178 |
if data:
|
179 |
# Save only the first 10 characters of the attachment data
|
180 |
+
return subject,body ,{"filename":filename , "data":data} , company_from_gmail
|
181 |
+
|
182 |
+
return subject, body,None , company_from_gmail
|
183 |
|
184 |
+
def extract_domain_from_email(email):
    """Return the domain part of an e-mail address, or None if there is none.

    Accepts either a bare address ("user@example.com") or a header-style
    value with a display name ('Brand <user@example.com>'). The previous
    pattern captured everything after the first "@", so header-style input
    produced a domain with a trailing ">" (for example "example.com>").

    Args:
        email: The address or From-header value to inspect. Non-string
            values (including None) are tolerated and yield None.

    Returns:
        The domain as a string, or None when no "@<domain>" is present.
    """
    if not isinstance(email, str):
        return None

    # A domain is the run of characters after "@" up to the first delimiter
    # that cannot belong to it (whitespace, angle brackets, quotes, etc.).
    domains = re.findall(r"@([^@\s<>\"'(),;]+)", email)

    # Take the last candidate: in 'Display <addr>' form the real address
    # comes last, so an "@" inside the display name does not win.
    return domains[-1] if domains else None
|
191 |
+
|
192 |
|
193 |
def extract_text_and_links(html_content: str) -> tuple:
|
194 |
"""
|
|
|
234 |
messages = self.__fetch_messages()
|
235 |
results = []
|
236 |
for message in messages:
|
237 |
+
subject, body, attachment_data , company_name = self.__process_message(message)
|
238 |
|
239 |
""" Handling None values """
|
|
|
240 |
body = body if body is not None else None
|
241 |
attachment_data = attachment_data if attachment_data is not None else {}
|
242 |
+
company_associated = company_name if company_name is not None else None
|
243 |
|
244 |
+
results.append({"body": body, "attachment_data": [attachment_data] ,'company_associated':company_associated})
|
245 |
|
246 |
return {"results": results}
|
247 |
|