Standard_Intelligence_Dev

Sleeping

App Files Community

heymenn committed on Apr 11

Commit

1c64a4a

•

1 Parent(s): 36947a6

Update scrape_3gpp.py

Browse files

Files changed (1) hide show

scrape_3gpp.py +19 -15

scrape_3gpp.py CHANGED Viewed

@@ -215,7 +215,7 @@ def remod_text(text):
     return text.replace("/n", "\n")
 def update_excel(data, excel_file, url):
-    new_df_columns = ["URL", "File", "Type", "Title", "Source", "Status", "Content"]
     temp_df = pd.DataFrame(data, columns=new_df_columns)
     try:
@@ -258,17 +258,18 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
     extract_directory = url.split("/")[-3] + "_extraction"
     categories = {
-        "Other": ["URL", "File", "Type", "Title", "Source", "Content"],
-        "CR": ["URL", "File", "Type", "Title", "Source", "Content"],
-        "pCR":["URL", "File", "Type", "Title", "Source", "Content"],
-        "LS": ["URL", "File", "Type", "Title", "Source", "Content"],
-        "WID": ["URL", "File", "Type", "Title", "Source", "Content"],
-        "SID": ["URL", "File", "Type", "Title", "Source", "Content"],
-        "DISCUSSION": ["URL", "File", "Type", "Title", "Source", "Content"],
-        "pdf": ["URL", "File", "Type", "Title", "Source", "Content"],
-        "ppt": ["URL", "File", "Type", "Title", "Source", "Content"],
-        "pptx": ["URL", "File", "Type", "Title", "Source", "Content"]
     }
     pourcents2=0.6
@@ -282,7 +283,7 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
         df = pd.read_excel(temp_excel)
     except Exception as e:
         print(f"Initializing a new DataFrame because: {e}")
-        df = pd.DataFrame(columns=["URL", "File", "Type", "Title", "Source", "Status", "Content"])
     for folder in os.listdir(extract_directory):
         folder_path = os.path.join(extract_directory, folder)
@@ -455,24 +456,27 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
                     # Assuming 'source' needs to be filled from the guide.xlsx mapping
                     # Placeholder for source value calculation
                     source = ""  # Update this with actual source determination logic
                     status = ""
-                    data.append([url+ "/" + folder + '.zip', folder , category, title, source,status, contenu])
                     guide_file = 'guide.xlsx'
                     if os.path.exists(guide_file):
                         # If guide.xlsx exists, proceed with operations that require it
                         try:
-                            guide_df = pd.read_excel(guide_file, usecols=['Source', 'TDoc', 'TDoc Status'])
                             # Continue with the operations that require guide.xlsx
                             # For example, reading the file, processing the data, etc.
                             tdoc_source_map = {row['TDoc']: row['Source'] for index, row in guide_df.iterrows()}
                             tdoc_status_map = {row['TDoc']: row['TDoc Status'] for index, row in guide_df.iterrows()}
                             # Update the 'Source' in your data based on matching 'Nom du fichier' with 'TDoc'
                             for item in data:
                                 nom_du_fichier = item[1]  # Assuming 'Nom du fichier' is the first item in your data list
                                 if nom_du_fichier in tdoc_source_map:
                                     item[4] = tdoc_source_map[nom_du_fichier]  # Update the 'Source' field, assuming it's the fourth item
-                                    item[5] = tdoc_status_map[nom_du_fichier]
                             # Your code that depends on guide.xlsx goes here
                         except Exception as e:

     return text.replace("/n", "\n")
 def update_excel(data, excel_file, url):
+    new_df_columns = ["URL", "File", "Type", "Title", "Source", "Related WIs", "Status", "Content"]
     temp_df = pd.DataFrame(data, columns=new_df_columns)
     try:
     extract_directory = url.split("/")[-3] + "_extraction"
+    TabCategories = ["URL", "File", "Title", "Source", "Related WIs", "Content"]
     categories = {
+        "Other": TabCategories,
+        "CR": TabCategories,
+        "pCR": TabCategories,
+        "LS": TabCategories,
+        "WID": TabCategories,
+        "SID": TabCategories,
+        "DISCUSSION": TabCategories,
+        "pdf": TabCategories,
+        "ppt": TabCategories,
+        "pptx": TabCategories
     }
     pourcents2=0.6
         df = pd.read_excel(temp_excel)
     except Exception as e:
         print(f"Initializing a new DataFrame because: {e}")
+        df = pd.DataFrame(columns=["URL", "File", "Type", "Title", "Source", "Status", "Related WIs", "Content"])
     for folder in os.listdir(extract_directory):
         folder_path = os.path.join(extract_directory, folder)
                     # Assuming 'source' needs to be filled from the guide.xlsx mapping
                     # Placeholder for source value calculation
                     source = ""  # Update this with actual source determination logic
+                    RelatedWIs = ""
                     status = ""
+                    data.append([url+ "/" + folder + '.zip', folder , category, title, source, RelatedWIs, status, contenu])
                     guide_file = 'guide.xlsx'
                     if os.path.exists(guide_file):
                         # If guide.xlsx exists, proceed with operations that require it
                         try:
+                            guide_df = pd.read_excel(guide_file, usecols=['Source', 'TDoc', 'RelatedWIs', 'TDoc Status'])
                             # Continue with the operations that require guide.xlsx
                             # For example, reading the file, processing the data, etc.
                             tdoc_source_map = {row['TDoc']: row['Source'] for index, row in guide_df.iterrows()}
+                            tdoc_relatedWIs_map = {row['TDoc']: row['Related WIs'] for index, row in guide_df.iterrows()}
                             tdoc_status_map = {row['TDoc']: row['TDoc Status'] for index, row in guide_df.iterrows()}
                             # Update the 'Source' in your data based on matching 'Nom du fichier' with 'TDoc'
                             for item in data:
                                 nom_du_fichier = item[1]  # Assuming 'Nom du fichier' is the first item in your data list
                                 if nom_du_fichier in tdoc_source_map:
                                     item[4] = tdoc_source_map[nom_du_fichier]  # Update the 'Source' field, assuming it's the fourth item
+                                    item[5] = tdoc_relatedWIs_map[nom_du_fichier]
+                                    item[6] = tdoc_status_map[nom_du_fichier]
                             # Your code that depends on guide.xlsx goes here
                         except Exception as e: