Update scrape_3gpp.py
scrape_3gpp.py  CHANGED  +7 -35
@@ -228,7 +228,7 @@ def update_excel(data, excel_file, url):
     temp_df = pd.DataFrame(data, columns=new_df_columns)
 
     try:
-        #
+        # Check if the Excel file already exists and append data to it
         if os.path.exists(excel_file):
             old_df = pd.read_excel(excel_file)
             df = pd.concat([old_df, temp_df], axis=0, ignore_index=True)
@@ -242,7 +242,7 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
 
 def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
     folder_name = 'nom provisoire'
-    temp_excel = '
+    temp_excel = 'temporaire.xlsx'
     progress(0.0,desc='Telechargement')
     result, message = scrape(url, excel_file, folder_name, status_list)
     if result:
@@ -279,7 +279,7 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
     pre_title_section = None
 
     try:
-        df = pd.read_excel(
+        df = pd.read_excel(temp_excel)
     except Exception as e:
         print(f"Initializing a new DataFrame because: {e}")
         df = pd.DataFrame(columns=["URL", "File", "Type", "Title", "Source", "Status", "Content"])
@@ -450,39 +450,11 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
             print(f"Updated after processing {processed_count} files.")
             data = []  # Clear the data list after updating
 
-
-
-
-
-    try:
-        old_df = pd.read_excel(excel_file)
-
-        # Check if 'Actions' column exists in the old DataFrame
-        if 'Actions' in old_df.columns:
-            # Assuming you want to update 'Content' in old_df for matching 'TDoc' values in 'File'
-            for index, new_row in new_df.iterrows():
-                # Find matching rows in old_df where 'TDoc' matches 'File' from new_df
-                match_indices = old_df[old_df['TDoc'] == new_row['File']].index
-                # Update 'Content' in old_df for matching rows
-                for i in match_indices:
-                    old_df.at[i, 'Content'] = new_row['Content']
-                    old_df.at[i, 'URL'] = new_row['URL']
-
-            df = old_df
-            ###placer la colonne content en 4eme position
-            # current_columns = df.columns.tolist()
-            # current_columns.remove('URL')
-            # # Insert 'Content' at the desired position
-            # new_columns_order = current_columns[:1] + ['URL'] + current_columns[3:]
-            # df = df[new_columns_order]
-        else:
-            # If 'Actions' column doesn't exist, simply concatenate the DataFrames
-            df = pd.concat([old_df, new_df], axis=0, ignore_index=True)
-    except Exception as e:
-        print("The provided excel file seems invalid:", e)
-        df = new_df
+    if data:
+        # This final call ensures that any remaining data is processed and saved.
+        update_excel(data, temp_excel, url)
+        print(f"Final update after processing all files.")
 
     file_name = url.split("/")[-2] + ".xlsx"
     # Save the updated DataFrame to Excel
-    df.to_excel(file_name, index=False)
    return file_name, "Téléchargement réussi"
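
Note on the first hunk: update_excel() appends each batch to the workbook with a read-concat-write pattern, which the new comment now spells out. The sketch below is a minimal, self-contained illustration of that pattern, assuming pandas with an xlsx engine such as openpyxl is installed; the helper name append_rows_to_excel and its fallback error handling are illustrative, not code from scrape_3gpp.py.

import os
import pandas as pd

def append_rows_to_excel(rows, excel_file, columns):
    """Append a batch of rows to excel_file, creating the file if needed.

    Minimal sketch of the read-concat-write pattern used by update_excel();
    the name and signature of this helper are illustrative only.
    """
    temp_df = pd.DataFrame(rows, columns=columns)
    try:
        # Check if the Excel file already exists and append data to it
        if os.path.exists(excel_file):
            old_df = pd.read_excel(excel_file)
            df = pd.concat([old_df, temp_df], axis=0, ignore_index=True)
        else:
            df = temp_df
    except Exception as e:
        # If the existing file cannot be read, fall back to the new rows alone
        print(f"Could not read {excel_file}, starting a new sheet: {e}")
        df = temp_df
    df.to_excel(excel_file, index=False)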
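Note on the last hunk: the commit drops the old 'Actions'/'TDoc' merge block and the direct df.to_excel() save, and instead flushes whatever is still in data once the processing loop ends, so a trailing partial batch is no longer lost before file_name is returned. Below is a minimal sketch of that accumulate-then-flush flow, reusing the append helper sketched above; BATCH_SIZE, extract_row() and process_all() are hypothetical placeholders, and the column list is the one the scraper uses for its fallback DataFrame.

# Accumulate-then-flush sketch; BATCH_SIZE, extract_row() and process_all()
# are hypothetical placeholders, not names from scrape_3gpp.py.
COLUMNS = ["URL", "File", "Type", "Title", "Source", "Status", "Content"]
BATCH_SIZE = 20  # illustrative batch size

def extract_row(path):
    # Hypothetical stand-in for the real per-file extraction logic.
    return ["", os.path.basename(path), "", "", "", "", ""]

def process_all(files, temp_excel='temporaire.xlsx'):
    data = []
    for processed_count, path in enumerate(files, start=1):
        data.append(extract_row(path))
        if processed_count % BATCH_SIZE == 0:
            # Periodic write, as in the existing loop
            append_rows_to_excel(data, temp_excel, COLUMNS)
            print(f"Updated after processing {processed_count} files.")
            data = []  # Clear the data list after updating
    if data:
        # Final flush, as added by this commit: save any remaining partial batch
        append_rows_to_excel(data, temp_excel, COLUMNS)
        print("Final update after processing all files.")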