MaksG committed on
Commit
4f012ae
1 Parent(s): 7854dd4

Update scrape_3gpp.py

Browse files
Files changed (1) hide show
  1. scrape_3gpp.py +21 -8
scrape_3gpp.py CHANGED
@@ -7,7 +7,7 @@ import numpy as np
7
  import zipfile
8
  import textract
9
 
10
- def scrape(url, excel_file, folder_name):
11
  filenames = []
12
  # Check if the excel_file argument is provided and if the file exists.
13
  if excel_file and os.path.exists(excel_file):
@@ -45,9 +45,15 @@ def scrape(url, excel_file, folder_name):
45
 
46
  # Filtrer les liens se terminant par ".zip"
47
  zip_links = [link['href'] for link in links if link['href'].endswith('.zip')]
48
-
 
49
  # Télécharger chaque fichier zip
50
  for zip_link in zip_links:
 
 
 
 
 
51
  # Construire l'URL absolue du fichier zip
52
  absolute_url = urljoin(url, zip_link)
53
 
@@ -85,6 +91,7 @@ def scrape(url, excel_file, folder_name):
85
  return True, "Téléchargement terminé !"
86
 
87
 
 
88
  def extractZip(folder_name):
89
  # Répertoire où les fichiers zip sont déjà téléchargés
90
  download_directory = folder_name
@@ -147,25 +154,27 @@ def excel3gpp(url):
147
  print(f'Excel file downloaded and saved as: {filepath}')
148
 
149
 
 
150
  def replace_line_breaks(text):
151
  return text.replace("\n", "/n")
152
 
153
  def remod_text(text):
154
  return text.replace("/n", "\n")
155
 
156
- def extractionPrincipale(url, excel_file=None):
157
  folder_name = url.split("/")[-2]
158
-
159
  result, message = scrape(url, excel_file, folder_name)
160
  if result:
161
  print("Success:", message)
162
  else:
163
  return(None, message)
164
 
165
-
166
  extractZip(folder_name)
 
167
  excel3gpp(url)
168
-
169
 
170
  extract_directory = folder_name +" extraction"
171
  categories = {
@@ -180,7 +189,8 @@ def extractionPrincipale(url, excel_file=None):
180
  "ppt": ["URL", "File", "Type", "Title", "Source", "Content"],
181
  "pptx": ["URL", "File", "Type", "Title", "Source", "Content"]
182
  }
183
-
 
184
  data = []
185
  errors_count = 0
186
  pre_title_section = None
@@ -188,6 +198,9 @@ def extractionPrincipale(url, excel_file=None):
188
  folder_path = os.path.join(extract_directory, folder)
189
  if os.path.isdir(folder_path):
190
  for file in os.listdir(folder_path):
 
 
 
191
  if file == "__MACOSX":
192
  continue
193
  file_path = os.path.join(folder_path, file)
@@ -340,7 +353,7 @@ def extractionPrincipale(url, excel_file=None):
340
 
341
 
342
 
343
- new_df_columns = ["URL", "File", "Type", "title", "Source", "Status", "Content"] # Create a DataFrame with the updated data
344
  new_df = pd.DataFrame(data, columns=new_df_columns)
345
  try:
346
  old_df = pd.read_excel(excel_file)
 
7
  import zipfile
8
  import textract
9
 
10
+ def scrape(url, excel_file, folder_name,progress=gr.Progress()):
11
  filenames = []
12
  # Check if the excel_file argument is provided and if the file exists.
13
  if excel_file and os.path.exists(excel_file):
 
45
 
46
  # Filtrer les liens se terminant par ".zip"
47
  zip_links = [link['href'] for link in links if link['href'].endswith('.zip')]
48
+ download_num = 0
49
+ pourcentss = 0.1
50
  # Télécharger chaque fichier zip
51
  for zip_link in zip_links:
52
+ if download_num%10 == 0:
53
+ pourcentss = pourcentss + download_num/500
54
+ progress(pourcentss,desc='Telechargement')
55
+ download_num = 0
56
+ download_num+=1
57
  # Construire l'URL absolue du fichier zip
58
  absolute_url = urljoin(url, zip_link)
59
 
 
91
  return True, "Téléchargement terminé !"
92
 
93
 
94
+
95
  def extractZip(folder_name):
96
  # Répertoire où les fichiers zip sont déjà téléchargés
97
  download_directory = folder_name
 
154
  print(f'Excel file downloaded and saved as: {filepath}')
155
 
156
 
157
+
158
  def replace_line_breaks(text):
159
  return text.replace("\n", "/n")
160
 
161
  def remod_text(text):
162
  return text.replace("/n", "\n")
163
 
164
+ def extractionPrincipale(url, excel_file=None,progress=gr.Progress()):
165
  folder_name = url.split("/")[-2]
166
+ progress(0.1,desc='Telechargement')
167
  result, message = scrape(url, excel_file, folder_name)
168
  if result:
169
  print("Success:", message)
170
  else:
171
  return(None, message)
172
 
173
+ progress(0.4,desc='Extraction')
174
  extractZip(folder_name)
175
+ progress(0.5,desc='Extraction 2')
176
  excel3gpp(url)
177
+ progress(0.6,desc='Mise en forme Excel')
178
 
179
  extract_directory = folder_name +" extraction"
180
  categories = {
 
189
  "ppt": ["URL", "File", "Type", "Title", "Source", "Content"],
190
  "pptx": ["URL", "File", "Type", "Title", "Source", "Content"]
191
  }
192
+ nouv=0
193
+ num=0.6
194
  data = []
195
  errors_count = 0
196
  pre_title_section = None
 
198
  folder_path = os.path.join(extract_directory, folder)
199
  if os.path.isdir(folder_path):
200
  for file in os.listdir(folder_path):
201
+ num=num + nouv/400
202
+ progress(num,desc='Mise en forme Excel')
203
+ nouv+=1
204
  if file == "__MACOSX":
205
  continue
206
  file_path = os.path.join(folder_path, file)
 
353
 
354
 
355
 
356
+ new_df_columns = ["URL", "File", "Type", "Title", "Source", "Status", "Content"] # Create a DataFrame with the updated data
357
  new_df = pd.DataFrame(data, columns=new_df_columns)
358
  try:
359
  old_df = pd.read_excel(excel_file)