Spaces:
Sleeping
Sleeping
Update scrape_3gpp.py
Browse files- scrape_3gpp.py +19 -15
scrape_3gpp.py
CHANGED
@@ -215,7 +215,7 @@ def remod_text(text):
|
|
215 |
return text.replace("/n", "\n")
|
216 |
|
217 |
def update_excel(data, excel_file, url):
|
218 |
-
new_df_columns = ["URL", "File", "Type", "Title", "Source", "Status", "Content"]
|
219 |
temp_df = pd.DataFrame(data, columns=new_df_columns)
|
220 |
|
221 |
try:
|
@@ -258,17 +258,18 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
258 |
|
259 |
|
260 |
extract_directory = url.split("/")[-3] + "_extraction"
|
|
|
261 |
categories = {
|
262 |
-
"Other":
|
263 |
-
"CR":
|
264 |
-
"pCR":
|
265 |
-
"LS":
|
266 |
-
"WID":
|
267 |
-
"SID":
|
268 |
-
"DISCUSSION":
|
269 |
-
"pdf":
|
270 |
-
"ppt":
|
271 |
-
"pptx":
|
272 |
}
|
273 |
|
274 |
pourcents2=0.6
|
@@ -282,7 +283,7 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
282 |
df = pd.read_excel(temp_excel)
|
283 |
except Exception as e:
|
284 |
print(f"Initializing a new DataFrame because: {e}")
|
285 |
-
df = pd.DataFrame(columns=["URL", "File", "Type", "Title", "Source", "Status", "Content"])
|
286 |
|
287 |
for folder in os.listdir(extract_directory):
|
288 |
folder_path = os.path.join(extract_directory, folder)
|
@@ -455,24 +456,27 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
455 |
# Assuming 'source' needs to be filled from the guide.xlsx mapping
|
456 |
# Placeholder for source value calculation
|
457 |
source = "" # Update this with actual source determination logic
|
|
|
458 |
status = ""
|
459 |
-
data.append([url+ "/" + folder + '.zip', folder , category, title, source,status, contenu])
|
460 |
|
461 |
guide_file = 'guide.xlsx'
|
462 |
if os.path.exists(guide_file):
|
463 |
# If guide.xlsx exists, proceed with operations that require it
|
464 |
try:
|
465 |
-
guide_df = pd.read_excel(guide_file, usecols=['Source', 'TDoc', 'TDoc Status'])
|
466 |
# Continue with the operations that require guide.xlsx
|
467 |
# For example, reading the file, processing the data, etc.
|
468 |
tdoc_source_map = {row['TDoc']: row['Source'] for index, row in guide_df.iterrows()}
|
|
|
469 |
tdoc_status_map = {row['TDoc']: row['TDoc Status'] for index, row in guide_df.iterrows()}
|
470 |
# Update the 'Source' in your data based on matching 'Nom du fichier' with 'TDoc'
|
471 |
for item in data:
|
472 |
nom_du_fichier = item[1] # File name ('File' column): the second item (index 1) of each data row
|
473 |
if nom_du_fichier in tdoc_source_map:
|
474 |
item[4] = tdoc_source_map[nom_du_fichier] # Update the 'Source' field, the fifth item (index 4) of the row
|
475 |
-
item[5] =
|
|
|
476 |
# Your code that depends on guide.xlsx goes here
|
477 |
|
478 |
except Exception as e:
|
|
|
215 |
return text.replace("/n", "\n")
|
216 |
|
217 |
def update_excel(data, excel_file, url):
|
218 |
+
new_df_columns = ["URL", "File", "Type", "Title", "Source", "Related WIs", "Status", "Content"]
|
219 |
temp_df = pd.DataFrame(data, columns=new_df_columns)
|
220 |
|
221 |
try:
|
|
|
258 |
|
259 |
|
260 |
extract_directory = url.split("/")[-3] + "_extraction"
|
261 |
+
TabCategories = ["URL", "File", "Title", "Source", "Related WIs", "Content"]
|
262 |
categories = {
|
263 |
+
"Other": TabCategories,
|
264 |
+
"CR": TabCategories,
|
265 |
+
"pCR": TabCategories,
|
266 |
+
"LS": TabCategories,
|
267 |
+
"WID": TabCategories,
|
268 |
+
"SID": TabCategories,
|
269 |
+
"DISCUSSION": TabCategories,
|
270 |
+
"pdf": TabCategories,
|
271 |
+
"ppt": TabCategories,
|
272 |
+
"pptx": TabCategories
|
273 |
}
|
274 |
|
275 |
pourcents2=0.6
|
|
|
283 |
df = pd.read_excel(temp_excel)
|
284 |
except Exception as e:
|
285 |
print(f"Initializing a new DataFrame because: {e}")
|
286 |
+
df = pd.DataFrame(columns=["URL", "File", "Type", "Title", "Source", "Status", "Related WIs", "Content"])
|
287 |
|
288 |
for folder in os.listdir(extract_directory):
|
289 |
folder_path = os.path.join(extract_directory, folder)
|
|
|
456 |
# Assuming 'source' needs to be filled from the guide.xlsx mapping
|
457 |
# Placeholder for source value calculation
|
458 |
source = "" # Update this with actual source determination logic
|
459 |
+
RelatedWIs = ""
|
460 |
status = ""
|
461 |
+
data.append([url+ "/" + folder + '.zip', folder , category, title, source, RelatedWIs, status, contenu])
|
462 |
|
463 |
guide_file = 'guide.xlsx'
|
464 |
if os.path.exists(guide_file):
|
465 |
# If guide.xlsx exists, proceed with operations that require it
|
466 |
try:
|
467 |
+
guide_df = pd.read_excel(guide_file, usecols=['Source', 'TDoc', 'RelatedWIs', 'TDoc Status'])
|
468 |
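# Continue with the operations that require guide.xlsx
# NOTE(review): usecols above requests 'RelatedWIs' but the mapping below reads row['Related WIs'];
# the two spellings cannot both match the sheet, so this block will raise and fall into the except — confirm the real column header and use it in both places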
|
469 |
# For example, reading the file, processing the data, etc.
|
470 |
tdoc_source_map = {row['TDoc']: row['Source'] for index, row in guide_df.iterrows()}
|
471 |
+
tdoc_relatedWIs_map = {row['TDoc']: row['Related WIs'] for index, row in guide_df.iterrows()}
|
472 |
tdoc_status_map = {row['TDoc']: row['TDoc Status'] for index, row in guide_df.iterrows()}
|
473 |
# Update 'Source', 'Related WIs' and 'Status' in your data by matching each row's file name (item[1]) with 'TDoc'
|
474 |
for item in data:
|
475 |
nom_du_fichier = item[1] # File name ('File' column): the second item (index 1) of each data row
|
476 |
if nom_du_fichier in tdoc_source_map:
|
477 |
item[4] = tdoc_source_map[nom_du_fichier] # Update the 'Source' field, the fifth item (index 4) of the row
|
478 |
+
item[5] = tdoc_relatedWIs_map[nom_du_fichier]
|
479 |
+
item[6] = tdoc_status_map[nom_du_fichier]
|
480 |
# Your code that depends on guide.xlsx goes here
|
481 |
|
482 |
except Exception as e:
|