Spaces:
Build error
Build error
import os | |
import shutil | |
import pandas as pd | |
from vidfetch import compress_folder, pull_from_hf | |
def download_video_links(hf_token: str, filename: str, save_dir: str):
    """Fetch one file from the ``OpenVideo/Panda-70M-Original-Links`` repo.

    The actual transfer is delegated to ``vidfetch.pull_from_hf``.

    Args:
        hf_token: Access token, forwarded unchanged to ``pull_from_hf``.
        filename: Name of the file to fetch from the repository.
        save_dir: Directory handed to ``pull_from_hf`` as the destination;
            it is created (including parents) when it does not exist yet.
    """
    # exist_ok=True replaces the exists()-then-makedirs() pair, which could
    # fail if another process created the directory between the two calls.
    os.makedirs(save_dir, exist_ok=True)

    pull_from_hf(
        hf_token=hf_token,
        hf_repo_id="OpenVideo/Panda-70M-Original-Links",
        filename=filename,
        save_dir=save_dir,
    )
def download_videos_by_csv(
    csv_file_path: str,
    save_dir: str,
    targz_filename: str,
    max_size_mb: float = 500,
):
    """Download every video listed in a CSV file and pack them into a tarball.

    The CSV must provide a ``url`` column (download link) and a ``videoID``
    column (used to build the output file name). Videos are stored under
    ``<save_dir>/<targz_filename without ".tar.gz">/download_raw`` together
    with a copy of the CSV and a ``log.txt`` report; that directory is then
    passed to ``compress_folder`` to produce ``<save_dir>/<targz_filename>``.

    Videos whose target file already exists are skipped, so an interrupted
    run can be resumed. A failed download does not abort the run; the link
    is recorded in the log instead.

    Args:
        csv_file_path: Path of the CSV file listing the videos.
        save_dir: Root directory for the downloads and the archive.
        targz_filename: File name of the resulting archive.
        max_size_mb: Files in the download directory larger than this many
            megabytes (MiB) are deleted before compression.

    Raises:
        ModuleNotFoundError: If ``youtube_dl`` cannot be imported.
    """
    try:
        import youtube_dl
    except ImportError as err:
        raise ModuleNotFoundError(
            "youtube_dl missed, please install it by ``vidfetch.package.youtube.youtube_dl_install_helper``"
        ) from err

    # path/dir
    folder_name = targz_filename.replace(".tar.gz", "")
    download_videos_dir = os.path.join(save_dir, folder_name, "download_raw")
    log_path = os.path.join(download_videos_dir, "log.txt")
    targz_path = os.path.join(save_dir, targz_filename)

    # make dirs (exist_ok avoids the exists()/makedirs() race)
    os.makedirs(download_videos_dir, exist_ok=True)

    # keep a copy of the CSV next to the videos, then read it
    csv_filename = os.path.basename(csv_file_path)
    shutil.copy(src=csv_file_path, dst=os.path.join(download_videos_dir, csv_filename))
    data = pd.read_csv(csv_file_path)
    links = data["url"].tolist()
    videos_id = data["videoID"].tolist()

    failed_links = []  # links whose download raised an error
    for link, video_id in zip(links, videos_id):
        # The first character of the ID is dropped from the file name.
        # NOTE(review): presumably the IDs carry a one-character prefix in
        # the CSV - confirm against the data.
        video_save_path = os.path.join(download_videos_dir, video_id[1:] + ".mp4")

        # check if downloaded
        if os.path.exists(video_save_path):
            continue

        # download
        ydl_opts = {
            'format': 'best',
            'quiet': False,
            'outtmpl': video_save_path,
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            try:
                ydl.download([link])
            except Exception:
                # Best effort: remember the failure and continue. Catching
                # Exception (not a bare except) keeps Ctrl-C working.
                failed_links.append(link)

    # delete files larger than the size limit
    size_limit_bytes = max_size_mb * 1024 * 1024
    deleted_videos = []
    for entry in os.listdir(download_videos_dir):
        file_path = os.path.join(download_videos_dir, entry)
        if os.path.getsize(file_path) > size_limit_bytes:
            deleted_videos.append(file_path)
            os.remove(file_path)

    # Write to log file: one entry per line, each list under its own header.
    log_lines = ['Fail to download']
    log_lines.extend(failed_links)
    log_lines.append(f'Delete videos larger than {max_size_mb:g}MB')
    log_lines.extend(deleted_videos)
    with open(log_path, 'w', encoding='utf-8') as log_file:
        log_file.write('\n'.join(log_lines) + '\n')

    compress_folder(download_videos_dir, targz_path)