Update scrape_3gpp.py
scrape_3gpp.py (CHANGED, +7 -7)
@@ -67,7 +67,8 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     status_filenames = []
     df = pd.DataFrame()  # Initialize df to ensure it's always defined
 
-    if
+    # Only proceed if excel_file is not None and it exists
+    if excel_file and os.path.exists(excel_file):
         try:
             df = pd.read_excel(excel_file)
             print(f"Initial DataFrame size: {len(df)}")
@@ -76,7 +77,6 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
                 df = df[df['TDoc Status'].isin(status_list)]
                 print(f"Filtered DataFrame size: {len(df)}")
             else:
-                # If status_list is empty, consider all statuses
                 print("No filtering applied based on TDoc Status")
 
             if not df.empty:
@@ -86,13 +86,15 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
                     status_filenames = df['URL'].tolist()
                 else:
                     print("No valid 'TDoc' or 'URL' entries found.")
-
+
                 print(f"Filenames: {status_filenames}")
             else:
                 print("DataFrame is empty after filtering.")
 
         except Exception as e:
             print(f"Error reading Excel file: {e}")
+    else:
+        print("No valid excel_file path provided.")
 
     download_directory = folder_name
     if not os.path.exists(download_directory):
@@ -109,8 +111,7 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
         filename = os.path.basename(file_url)
         save_path = os.path.join(download_directory, filename)
         progress(pourcentss, desc='Downloading')
-
-        pourcentss += 0.4 / len(status_filenames)
+        pourcentss += 0.4 / len(status_filenames) if status_filenames else 1  # Adjust to prevent division by zero
         try:
             with requests.get(file_url, stream=True) as r:
                 r.raise_for_status()
@@ -119,14 +120,13 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
                         f.write(chunk)
         except requests.exceptions.HTTPError as e:
             print(f"HTTP error occurred: {file_url}: {e}")
-            # Decide how you want to handle HTTP errors (e.g., skip this file, stop the process, etc.)
 
-    # Ensure correct return value, especially if the function should indicate success/failure and the number of processed files
     return True, len(status_filenames)
 
 
 
 
+
 def extractZip(url):
     # Directory where the zip files have already been downloaded
     nom_extract = url.split("/")[-3] + "_extraction"
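For reference, the two safeguards this commit adds can be exercised on their own. The snippet below is a minimal sketch, not code from scrape_3gpp.py: safe_read and advance_progress are hypothetical helper names and the sample inputs are made up; it only illustrates the excel_file existence check and the division-by-zero guard on the progress increment.

import os
import pandas as pd

def safe_read(excel_file):
    # Mirrors the new guard: only read the workbook when a usable path is given.
    if excel_file and os.path.exists(excel_file):
        return pd.read_excel(excel_file)
    print("No valid excel_file path provided.")
    return pd.DataFrame()  # keep the DataFrame defined, as scrape() does

def advance_progress(pourcentss, status_filenames):
    # Mirrors the new increment: no ZeroDivisionError when the URL list is empty.
    return pourcentss + (0.4 / len(status_filenames) if status_filenames else 1)

print(safe_read(None).empty)                      # True: missing path is handled
print(advance_progress(0.1, []))                  # 1.1: empty list, no division by zero
print(advance_progress(0.1, ["a.zip", "b.zip"]))  # 0.1 + 0.4/2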