Update scrape_3gpp.py
scrape_3gpp.py  CHANGED  +7 -35
@@ -228,7 +228,7 @@ def update_excel(data, excel_file, url):
     temp_df = pd.DataFrame(data, columns=new_df_columns)
 
     try:
-        #
+        # Check if the Excel file already exists and append data to it
         if os.path.exists(excel_file):
             old_df = pd.read_excel(excel_file)
             df = pd.concat([old_df, temp_df], axis=0, ignore_index=True)
@@ -242,7 +242,7 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
 
 def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
     folder_name = 'nom provisoire'
-    temp_excel = '
+    temp_excel = 'temporaire.xlsx'
     progress(0.0,desc='Telechargement')
     result, message = scrape(url, excel_file, folder_name, status_list)
     if result:
@@ -279,7 +279,7 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
     pre_title_section = None
 
     try:
-        df = pd.read_excel(
+        df = pd.read_excel(temp_excel)
     except Exception as e:
         print(f"Initializing a new DataFrame because: {e}")
         df = pd.DataFrame(columns=["URL", "File", "Type", "Title", "Source", "Status", "Content"])
@@ -450,39 +450,11 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
             print(f"Updated after processing {processed_count} files.")
             data = []  # Clear the data list after updating
 
-
-
-
-
-    try:
-        old_df = pd.read_excel(excel_file)
-
-        # Check if 'Actions' column exists in the old DataFrame
-        if 'Actions' in old_df.columns:
-            # Assuming you want to update 'Content' in old_df for matching 'TDoc' values in 'File'
-            for index, new_row in new_df.iterrows():
-                # Find matching rows in old_df where 'TDoc' matches 'File' from new_df
-                match_indices = old_df[old_df['TDoc'] == new_row['File']].index
-                # Update 'Content' in old_df for matching rows
-                for i in match_indices:
-                    old_df.at[i, 'Content'] = new_row['Content']
-                    old_df.at[i, 'URL'] = new_row['URL']
-
-            df = old_df
-            ###placer la colonne content en 4eme position
-            # current_columns = df.columns.tolist()
-            # current_columns.remove('URL')
-            # # Insert 'Content' at the desired position
-            # new_columns_order = current_columns[:1] + ['URL'] + current_columns[3:]
-            # df = df[new_columns_order]
-        else:
-            # If 'Actions' column doesn't exist, simply concatenate the DataFrames
-            df = pd.concat([old_df, new_df], axis=0, ignore_index=True)
-    except Exception as e:
-        print("The provided excel file seems invalid:", e)
-        df = new_df
+    if data:
+        # This final call ensures that any remaining data is processed and saved.
+        update_excel(data, temp_excel, url)
+        print(f"Final update after processing all files.")
 
     file_name = url.split("/")[-2] + ".xlsx"
     # Save the updated DataFrame to Excel
-    df.to_excel(file_name, index=False)
    return file_name, "Téléchargement réussi"
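
Note on the first hunk: update_excel() appends each batch to the workbook with a read-concat-write pattern, which the new comment now spells out. The sketch below is a minimal, self-contained illustration of that pattern, assuming pandas with an xlsx engine such as openpyxl is installed; the helper name append_rows_to_excel and its fallback error handling are illustrative, not code from scrape_3gpp.py.

import os
import pandas as pd

def append_rows_to_excel(rows, excel_file, columns):
    """Append a batch of rows to excel_file, creating the file if needed.

    Minimal sketch of the read-concat-write pattern used by update_excel();
    the name and signature of this helper are illustrative only.
    """
    temp_df = pd.DataFrame(rows, columns=columns)
    try:
        # Check if the Excel file already exists and append data to it
        if os.path.exists(excel_file):
            old_df = pd.read_excel(excel_file)
            df = pd.concat([old_df, temp_df], axis=0, ignore_index=True)
        else:
            df = temp_df
    except Exception as e:
        # If the existing file cannot be read, fall back to the new rows alone
        print(f"Could not read {excel_file}, starting a new sheet: {e}")
        df = temp_df
    df.to_excel(excel_file, index=False)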
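Note on the last hunk: the commit drops the old 'Actions'/'TDoc' merge block and the direct df.to_excel() save, and instead flushes whatever is still in data once the processing loop ends, so a trailing partial batch is no longer lost before file_name is returned. Below is a minimal sketch of that accumulate-then-flush flow, reusing the append helper sketched above; BATCH_SIZE, extract_row() and process_all() are hypothetical placeholders, and the column list is the one the scraper uses for its fallback DataFrame.

# Accumulate-then-flush sketch; BATCH_SIZE, extract_row() and process_all()
# are hypothetical placeholders, not names from scrape_3gpp.py.
COLUMNS = ["URL", "File", "Type", "Title", "Source", "Status", "Content"]
BATCH_SIZE = 20  # illustrative batch size

def extract_row(path):
    # Hypothetical stand-in for the real per-file extraction logic.
    return ["", os.path.basename(path), "", "", "", "", ""]

def process_all(files, temp_excel='temporaire.xlsx'):
    data = []
    for processed_count, path in enumerate(files, start=1):
        data.append(extract_row(path))
        if processed_count % BATCH_SIZE == 0:
            # Periodic write, as in the existing loop
            append_rows_to_excel(data, temp_excel, COLUMNS)
            print(f"Updated after processing {processed_count} files.")
            data = []  # Clear the data list after updating
    if data:
        # Final flush, as added by this commit: save any remaining partial batch
        append_rows_to_excel(data, temp_excel, COLUMNS)
        print("Final update after processing all files.")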