ubuntu committed on
Commit
c25690f
1 Parent(s): 356ac1b

Initial Commit

Browse files
Files changed (3) hide show
  1. app.py +71 -0
  2. panda70m_downloader.py +85 -0
  3. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import shutil
4
+ import pandas as pd
5
+ import gradio as gr
6
+ from vidfetch import youtube_dl_install_helper, push_to_hf
7
+ from panda70m_downloader import download_video_links, download_videos_by_csv
8
+
9
+
10
# Directory that receives the downloaded link CSV and its per-chunk slices.
SAVE_CSV_DIR = "panda70m_csv"
# Scratch directory for downloaded videos and the archives built from them.
SAVE_VIDEOS_DIR = "panda70m_videos"


def handle(
    hf_token: str,
    filename: str,
):
    """Download the videos listed in one link CSV and upload them in chunks.

    The CSV named ``filename`` is fetched into ``SAVE_CSV_DIR``, split into
    slices of 100 rows, and each slice is downloaded, archived as a
    ``.tar.gz`` and pushed to the ``OpenVideo/Panda-70M-raw`` repository.

    Args:
        hf_token: Hugging Face token forwarded to the install helper, the
            CSV download and the upload.
        filename: Name of the link CSV to process (e.g. ``"xxx.csv"``).
    """
    # Only probe for youtube_dl here; install it on demand when it is missing.
    try:
        import youtube_dl  # noqa: F401
    except ImportError:
        youtube_dl_install_helper(hf_token=hf_token)
        import youtube_dl  # noqa: F401

    download_video_links(hf_token=hf_token, filename=filename, save_dir=SAVE_CSV_DIR)

    # Divide the .csv into slices of 100 rows and process them one by one.
    chunk_size = 100
    csv_path = os.path.join(SAVE_CSV_DIR, filename)
    data = pd.read_csv(csv_path)
    stem = filename.replace(".csv", "")
    total_rows = len(data)

    # Stepping by chunk_size (instead of iterating len // 100 times) keeps the
    # trailing partial slice, so no rows are silently skipped.
    for begin_idx in range(0, total_rows, chunk_size):
        end_idx = min(begin_idx + chunk_size, total_rows)
        part_data = data[begin_idx:end_idx]
        part_filename = stem + "_{:06d}_{:06d}.csv".format(begin_idx, end_idx)
        targz_filename = part_filename.replace(".csv", ".tar.gz")
        part_save_path = os.path.join(SAVE_CSV_DIR, part_filename)
        part_data.to_csv(part_save_path)

        download_videos_by_csv(
            csv_file_path=part_save_path,
            save_dir=SAVE_VIDEOS_DIR,
            targz_filename=targz_filename,
        )
        push_to_hf(
            hf_token=hf_token,
            hf_repo_id="OpenVideo/Panda-70M-raw",
            file_path=os.path.join(SAVE_VIDEOS_DIR, targz_filename),
            path_in_repo=targz_filename,
        )
        # Drop the scratch directory after every upload so disk usage stays
        # bounded by a single slice.
        shutil.rmtree(SAVE_VIDEOS_DIR)
48
+
49
+
50
# Minimal Gradio front end: two text inputs, a submit button that runs
# ``handle`` and a clear button that resets the inputs.
with gr.Blocks() as demo:
    gr.Markdown(
        '''
        Panda70M-Downloader
        '''
    )
    hf_token = gr.Textbox(label="HuggingFace Token")
    filename = gr.Textbox(label="csv name")

    with gr.Row():
        button = gr.Button("Submit", variant="primary")
        clear = gr.Button("Clear")

    # ``handle`` returns nothing, so there is no output component to update.
    button.click(
        handle,
        [hf_token, filename],
        outputs=None
    )
    # The Clear button previously had no handler; reset both text boxes.
    clear.click(
        lambda: ("", ""),
        inputs=None,
        outputs=[hf_token, filename]
    )


if __name__ == "__main__":
    demo.launch(debug=True)
panda70m_downloader.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import pandas as pd
4
+ from vidfetch import compress_folder, pull_from_hf
5
+
6
+
7
def download_video_links(hf_token: str, filename: str, save_dir: str):
    """Fetch one link CSV from ``OpenVideo/Panda-70M-Original-Links``.

    Args:
        hf_token: Hugging Face token passed through to ``pull_from_hf``.
        filename: Name of the file to pull from the repository.
        save_dir: Local directory for the download; created when missing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_dir, exist_ok=True)

    pull_from_hf(
        hf_token=hf_token,
        hf_repo_id="OpenVideo/Panda-70M-Original-Links",
        filename=filename,
        save_dir=save_dir,
    )
19
+
20
+
21
def download_videos_by_csv(
    csv_file_path: str,
    save_dir: str,
    targz_filename: str,
):
    """Download every video listed in a CSV and pack the result into a tarball.

    Videos are saved under ``<save_dir>/<archive stem>/download_raw`` together
    with a copy of the CSV and a ``log.txt`` listing failed links and files
    removed for exceeding the size limit. That folder is then compressed to
    ``<save_dir>/<targz_filename>``.

    Args:
        csv_file_path: CSV file providing the ``url`` and ``videoID`` columns.
        save_dir: Directory that receives the download folder and the archive.
        targz_filename: File name of the archive, ending in ``.tar.gz``.

    Raises:
        ModuleNotFoundError: If ``youtube_dl`` cannot be imported.
    """
    try:
        import youtube_dl
    except ImportError as err:
        raise ModuleNotFoundError(
            "youtube_dl missed, please install it by ``vidfetch.package.youtube.youtube_dl_install_helper``"
        ) from err

    max_size_mb = 500  # files above this size are dropped before archiving

    # path/dir
    folder_name = targz_filename.replace(".tar.gz", "")
    download_videos_dir = os.path.join(save_dir, folder_name, "download_raw")
    log_path = os.path.join(download_videos_dir, "log.txt")
    targz_path = os.path.join(save_dir, targz_filename)

    # make dirs
    os.makedirs(download_videos_dir, exist_ok=True)

    # Keep a copy of the CSV next to the videos so the archive is self-describing.
    csv_filename = os.path.basename(csv_file_path)
    shutil.copy(src=csv_file_path, dst=os.path.join(download_videos_dir, csv_filename))
    data = pd.read_csv(csv_file_path)
    links = data["url"].tolist()
    videos_id = data["videoID"].tolist()

    failed_links = []  # links whose download raised an error
    for link, video_id in zip(links, videos_id):
        # NOTE(review): the first character of videoID is dropped for the file
        # name; presumably it is a prefix in the CSV data. Confirm.
        video_save_path = os.path.join(download_videos_dir, video_id[1:] + ".mp4")
        if os.path.exists(video_save_path):
            continue  # already downloaded

        ydl_opts = {
            'format': 'best',
            'quiet': False,
            'outtmpl': video_save_path,
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            try:
                ydl.download([link])
            except Exception:
                # Best effort: remember the failure and move on to the next
                # link, while still letting KeyboardInterrupt/SystemExit through.
                failed_links.append(link)

    # delete files larger than 500MB
    delete_videos = []
    for entry in os.listdir(download_videos_dir):
        file_path = os.path.join(download_videos_dir, entry)
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)  # bytes -> MB
        if file_size_mb > max_size_mb:
            delete_videos.append(file_path)
            os.remove(file_path)

    # Write the log: each section header is followed by its own entries, one
    # per line (the deleted-file section lists the removed paths).
    log_lines = ['Fail to download']
    log_lines.extend(failed_links)
    log_lines.append('Delete videos larger than 500MB')
    log_lines.extend(delete_videos)
    with open(log_path, 'w') as log_file:
        log_file.write('\n'.join(log_lines) + '\n')

    compress_folder(download_videos_dir, targz_path)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ vidfetch
2
+ gradio
3
+ pandas