Spaces:

Omkar008
/

receipt_radar_test

Running

Omkar008 committed on Feb 7

Commit

ce3d970

•

1 Parent(s): 30fac65

Update get_gmail_data.py

Files changed (1) hide show

get_gmail_data.py CHANGED Viewed

@@ -170,8 +170,8 @@ class GmailDataExtractor:
                 if mime_type == 'text/plain' or mime_type == 'text/html':
                     body_data = part['body'].get('data', '')
-                    body = base64.urlsafe_b64decode(body_data).decode('utf-8')
-                    text= self.extract_text_and_links(body)
                 if 'body' in part and 'attachmentId' in part['body']:
                     attachment_id = part['body']['attachmentId']
@@ -186,15 +186,18 @@ class GmailDataExtractor:
         return subject, body,None , company_from_gmail
     def extract_domain_from_email(email):
-        regex = r"@(.+)$"
-        match = re.search(regex,email)
-        if match :
-            return match.group(1)
         else:
             return None
-    def extract_text_and_links(html_content: str) -> tuple:
         """
         Extracts text and links from HTML content.

                 if mime_type == 'text/plain' or mime_type == 'text/html':
                     body_data = part['body'].get('data', '')
+                    body = base64.urlsafe_b64decode(body_data)
+                    text= self.extract_text(body)
                 if 'body' in part and 'attachmentId' in part['body']:
                     attachment_id = part['body']['attachmentId']
         return subject, body,None , company_from_gmail
     def extract_domain_from_email(email):
+        # Extracting the email address using regex
+        email_address = re.search(r'[\w\.-]+@[\w\.-]+', email_string).group()
+        # Extracting the domain name from the email address
+        domain = email_address.split('@')[-1].split('.')[0]
+        if email_address and domain :
+            return domain
         else:
             return None
+    def extract_text(html_content: str) -> tuple:
         """
         Extracts text and links from HTML content.