Update scrape_3gpp.py
scrape_3gpp.py  (+39 -44)  CHANGED
@@ -9,31 +9,6 @@ import textract
 import gradio as gr
 
 
-def count_links(url):
-    # Define common file extensions for downloadable content
-    file_extensions = ('.zip')
-
-    try:
-        # Send an HTTP request to the URL
-        response = requests.get(url)
-        response.raise_for_status()  # Raise an exception for HTTP errors
-
-        # Parse the HTML content of the page
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        # Find all <a> tags in the HTML
-        links = soup.find_all('a')
-
-        # Count the number of links that point to downloadable files
-        count = sum(1 for link in links if any(link.get('href', '').endswith(ext) for ext in file_extensions))
-
-        return count
-    except requests.RequestException as e:
-        print(f"Error fetching the page: {e}")
-        return None
-
-
-
 def browse_folder(url):
     if url.lower().endswith(('docs', 'docs/')):
         return gr.update(choices=[])
@@ -87,11 +62,11 @@ def extract_statuses(url):
     return []
 
 
-def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress()):
+def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     filenames = []
     status_filenames = []
     # Check if the excel_file argument is provided and if the file exists.
-    excel_file_path =
+    excel_file_path = "guide_status.xlsx"  # Hardcoded path to the Excel file
 
     if os.path.exists(excel_file_path):
         try:
@@ -141,7 +116,7 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress()):
     download_directory = folder_name
     if not os.path.exists(download_directory):
         os.makedirs(download_directory)
-
+
     pourcentss = 0.05
     print(f'filenames: {status_filenames}')
     if not filenames and not status_filenames:
@@ -157,11 +132,11 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress()):
 
         # Filter the links ending in ".zip"
        zip_links = [link['href'] for link in links if link['href'].endswith('.zip')]
-
+
        # Download each zip file
        for zip_link in zip_links:
            progress(pourcentss,desc='Downloading')
-            pourcentss+=0.4/count
+            pourcentss+=0.4/len(df)
            # Build the absolute URL of the zip file
            absolute_url = urljoin(url, zip_link)
 
@@ -184,7 +159,7 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress()):
            filename = os.path.basename(file_url)
            save_path = os.path.join(download_directory, filename)
            progress(pourcentss,desc='Downloading')
-            pourcentss+=0.4/count
+            pourcentss+=0.4/len(df)
            try:
                with requests.get(file_url, stream=True) as r:
                    r.raise_for_status()
@@ -210,14 +185,19 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress()):
                print(f"HTTP error occurred: {file_url}: {e}")
                return False, "Il n'y a pas de colonne action ou alors celle ci n'est pas bien écrite, format attendu: 'Actions'"
 
-    return True, "Téléchargement terminé !"
+    return True, "Téléchargement terminé !", len(df)
 
 
 
-def extractZip(folder_name):
+def extractZip(url):
     # Directory where the zip files were already downloaded
-
-
+    nom_extract = url.split("/")[-3] + "_extraction"
+    if os.path.exists(nom_extract):
+        shutil.rmtree(nom_extract)
+    extract_directory = nom_extract
+
+    download_directory = url.split("/")[-3] + "_downloads"
+    # Directory where the contents of the zip files will be extracted
 
     # Extract the contents of all zip files in the download directory
     for zip_file in os.listdir(download_directory):
@@ -233,6 +213,7 @@ def extractZip(folder_name):
         os.makedirs(extract_dir)
 
         # Extract the contents of the zip file
+
         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
             zip_ref.extractall(extract_dir)
 
@@ -242,6 +223,7 @@ def extractZip(folder_name):
 
     print("Toutes les extractions sont terminées !")
 
+
 def excel3gpp(url):
     response = requests.get(url)
     response.raise_for_status()  # This will raise an exception if there's an error
@@ -263,12 +245,16 @@ def excel3gpp(url):
     excel_response.raise_for_status()
 
     # Define the path where you want to save the file
-
-    filepath = os.path.join('path_to_save_directory', filename)  # Replace 'path_to_save_directory' with your desired path
+    # Replace 'path_to_save_directory' with your desired path
 
     # Write the content of the Excel file to a local file
     # Write the content of the Excel file to a local file named 'guide.xlsx'
-
+
+    nom_guide = 'guide.xlsx'  # Directly specify the filename
+    if os.path.exists(nom_guide):
+        os.remove(nom_guide)
+    filepath = nom_guide
+
 
     with open(filepath, 'wb') as f:
         f.write(excel_response.content)
@@ -300,24 +286,32 @@ def update_excel(data, excel_file, url):
     print(f"Error updating Excel file: {e}")
 
 def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
-
-
+    nom_download = url.split("/")[-3] + "_downloads"
+    if os.path.exists(nom_download):
+        shutil.rmtree(nom_download)
+    folder_name = nom_download
+
+    nom_status = url.split("/")[-3] + "_status.xlsx"
+    if os.path.exists(nom_status):
+        os.remove(nom_status)
+    temp_excel = nom_status
+
     progress(0.0,desc='Downloading')
-
-    result, message = scrape(url, excel_file, folder_name, status_list)
+
+    result, message, count = scrape(url, excel_file, folder_name, status_list)
     if result:
         print("Success:", message)
     else:
         return(None, message)
 
     progress(0.4,desc='Extraction')
-    extractZip(folder_name)
+    extractZip(url)
     progress(0.5,desc='Extraction 2')
     excel3gpp(url)
     progress(0.6,desc='Creating Excel File')
 
 
-    extract_directory =
+    extract_directory = url.split("/")[-3] + "_extraction"
     categories = {
         "Other": ["URL", "File", "Type", "Title", "Source", "Content"],
         "CR": ["URL", "File", "Type", "Title", "Source", "Content"],
@@ -518,3 +512,4 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
     file_name = temp_excel
     # Save the updated DataFrame to Excel
     return file_name, "Téléchargement réussi"
+
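A minimal sketch of the URL-derived naming scheme the updated code relies on, assuming a 3GPP FTP docs URL with a trailing slash (the example URL and meeting name below are hypothetical):

    # url.split("/")[-3] picks the meeting folder out of a ".../Docs/" URL.
    url = "https://www.3gpp.org/ftp/tsg_sa/WG2_Arch/TSGS2_160_Jeju/Docs/"
    parts = url.split("/")
    # With the trailing slash, parts ends in ["TSGS2_160_Jeju", "Docs", ""],
    # so parts[-3] is the meeting folder name.
    print(parts[-3] + "_downloads")     # TSGS2_160_Jeju_downloads
    print(parts[-3] + "_extraction")    # TSGS2_160_Jeju_extraction
    print(parts[-3] + "_status.xlsx")   # TSGS2_160_Jeju_status.xlsx

Note that without the trailing slash, index -3 lands on the working-group folder instead, so the names above hold only for the ".../Docs/" form that browse_folder accepts.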