Update scrape_3gpp.py
scrape_3gpp.py  CHANGED  (+36 -106)
@@ -65,135 +65,65 @@ def extract_statuses(url):
Previous version (removed lines marked "-"):

 def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     filenames = []
     status_filenames = []
-
-    excel_file_path = "guide_status.xlsx"  # Hardcoded path to the Excel file

-    if os.path.exists(excel_file_path):
         try:
-            df = pd.read_excel(excel_file_path)
             print(f"Initial DataFrame size: {len(df)}")

-            if 'TDoc Status' in df.columns:
-
-
-
-
-
-
-
-
-
-
-
-
                 else:
-
-
-
-                    status_filenames = df['URL'].tolist()
-                else:
-                    print("No valid 'File' or 'URL' entries found for the filtered statuses.")
-
-                print(f"Filenames: {status_filenames}")
             else:
-                print("
-
-        except Exception as e:
-            print(f"Error reading Excel file: {e}")
-
-
-
-    if excel_file and os.path.exists(excel_file):
-        try:
-            df = pd.read_excel(excel_file)

-            # If 'Actions' in df.columns and filter based on it, and construct URLs from 'TDoc' or 'URL' columns
-            if 'Actions' in df.columns:
-                df = df[df['Actions'] == 'x']
-
-            elif 'File' in df.columns:
-                filenames = [f"{url}{row['File']}.zip" for index, row in df.iterrows()]
-            elif 'URL' in df.columns:
-                filenames = df['URL'].tolist()
         except Exception as e:
             print(f"Error reading Excel file: {e}")
-            # Optionally, handle the error or return a message if needed

-    # If no Excel file is provided or found, or if it lacks 'TDoc'/'URL', the function can still continue with predefined URLs or other logic
     download_directory = folder_name
     if not os.path.exists(download_directory):
         os.makedirs(download_directory)

     pourcentss = 0.05
-    print(f'filenames: {status_filenames}')
-    if not filenames and not status_filenames:
-        print("No Excel file provided, or no valid URLs found in the file.")
-        # You can either return here or continue with other predefined logic
-        response = requests.get(url)
-
-        # Parse the HTML content of the page
-        soup = BeautifulSoup(response.content, "html.parser")
-
-        # Find all <a> tags with an href attribute (links)
-        links = soup.find_all("a", href=True)
-
-        # Filter the links ending in ".zip"
-        zip_links = [link['href'] for link in links if link['href'].endswith('.zip')]
-
-        # Download each zip file
-        for zip_link in zip_links:
-            progress(pourcentss,desc='Downloading')
-            pourcentss+=0.4/len(df)
-            # Build the absolute URL of the zip file
-            absolute_url = urljoin(url, zip_link)
-
-            # Extract the filename from the URL
-            filename = os.path.basename(absolute_url)

-
-

-
-
                 r.raise_for_status()
                 with open(save_path, 'wb') as f:
                     for chunk in r.iter_content(chunk_size=8192):
                         f.write(chunk)

-
-
-        for file_url in status_filenames:
-            filename = os.path.basename(file_url)
-            save_path = os.path.join(download_directory, filename)
-            progress(pourcentss,desc='Downloading')
-            pourcentss+=0.4/len(df)
-            try:
-                with requests.get(file_url, stream=True) as r:
-                    r.raise_for_status()
-                    with open(save_path, 'wb') as f:
-                        for chunk in r.iter_content(chunk_size=8192):
-                            f.write(chunk)
-            except requests.exceptions.HTTPError as e:
-                print(f"skipped file: {file_url}: {e}")

-    else:
-        # Proceed with downloading files using the filenames list
-        for file_url in filenames:
-            filename = os.path.basename(file_url)
-            save_path = os.path.join(download_directory, filename)
-            progress(pourcentss,desc='Downloading')
-            pourcentss+=0.4/len(df)
-            try:
-                with requests.get(file_url, stream=True) as r:
-                    r.raise_for_status()
-                    with open(save_path, 'wb') as f:
-                        for chunk in r.iter_content(chunk_size=8192):
-                            f.write(chunk)
-            except requests.exceptions.HTTPError as e:
-                print(f"HTTP error occurred: {file_url}: {e}")
-        return False, "There is no 'Actions' column, or it is not spelled correctly; expected format: 'Actions'"
-
-    return True, len(df)
Updated version (added lines marked "+"):

 def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     filenames = []
     status_filenames = []
+    df = pd.DataFrame()  # Initialize df to ensure it's always defined

+    if os.path.exists(excel_file):
         try:
+            df = pd.read_excel(excel_file)
             print(f"Initial DataFrame size: {len(df)}")

+            if 'TDoc Status' in df.columns and status_list:
+                df = df[df['TDoc Status'].isin(status_list)]
+                print(f"Filtered DataFrame size: {len(df)}")
+            else:
+                # If status_list is empty, consider all statuses
+                print("No filtering applied based on TDoc Status")
+
+            if not df.empty:
+                if 'TDoc' in df.columns and not df['TDoc'].isnull().all():
+                    status_filenames = [f"{url}{row['TDoc']}.zip" for index, row in df.iterrows()]
+                elif 'URL' in df.columns and not df['URL'].isnull().all():
+                    status_filenames = df['URL'].tolist()
                 else:
+                    print("No valid 'TDoc' or 'URL' entries found.")
+
+                print(f"Filenames: {status_filenames}")
             else:
+                print("DataFrame is empty after filtering.")

         except Exception as e:
             print(f"Error reading Excel file: {e}")

     download_directory = folder_name
     if not os.path.exists(download_directory):
         os.makedirs(download_directory)

     pourcentss = 0.05

+    if not status_filenames:
+        print("No Excel file provided, or no valid URLs found in the file.")
+        return False, 0

+    # Proceed with downloading files using the filenames list
+    for file_url in status_filenames:
+        filename = os.path.basename(file_url)
+        save_path = os.path.join(download_directory, filename)
+        progress(pourcentss, desc='Downloading')
+        # Adjust progress calculation based on actual number of files
+        pourcentss += 0.4 / len(status_filenames)
+        try:
+            with requests.get(file_url, stream=True) as r:
                 r.raise_for_status()
                 with open(save_path, 'wb') as f:
                     for chunk in r.iter_content(chunk_size=8192):
                         f.write(chunk)
+        except requests.exceptions.HTTPError as e:
+            print(f"HTTP error occurred: {file_url}: {e}")
+            # Decide how you want to handle HTTP errors (e.g., skip this file, stop the process, etc.)

+    # Ensure correct return value, especially if the function should indicate success/failure and the number of processed files
+    return True, len(status_filenames)
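
For reference, a minimal usage sketch of the updated function, not part of this commit: the meeting URL, spreadsheet name, folder, and status values are illustrative placeholders, and a no-op callable stands in for the default gr.Progress() so the call can run outside a Gradio event.

# Hypothetical example; all argument values below are placeholders.
from scrape_3gpp import scrape

ok, n_files = scrape(
    url="https://www.3gpp.org/ftp/tsg_sa/WG2_Arch/TSGS2_example/Docs/",  # must end with "/" so f"{url}{TDoc}.zip" forms a valid link
    excel_file="guide_status.xlsx",      # needs a 'TDoc' or 'URL' column; rows are filtered on 'TDoc Status'
    folder_name="downloads",             # created if it does not exist
    status_list=["Agreed", "Approved"],  # pass an empty list to keep all statuses
    progress=lambda *args, **kwargs: None,  # stand-in for gr.Progress() when not running inside the Gradio app
)
print(f"success={ok}, candidate files={n_files}")

On success the function returns True and the number of candidate file URLs; if the spreadsheet yields no usable entries it returns False, 0.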