# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Chameleon License Agreement.

"""Download the tokenizer and model files and verify their MD5 checksums."""

import hashlib
import subprocess
import sys
from pathlib import Path

# Files fetched for every model, before the weight files.
_COMMON_MODEL_FILES = ["params.json", "consolidate_params.json", "checklist.chk"]

# Weight files to fetch for each supported model name (lower-case).
_MODEL_WEIGHT_FILES = {
    "7b": ["consolidated.pth"],
    "30b": [f"consolidated.{i:02}.pth" for i in range(4)],
}

# Read size used when hashing, so large files are never held in memory whole.
_HASH_CHUNK_SIZE = 1 << 20


def download_file(url: str, output_path: Path):
    """Download *url* to *output_path* by invoking ``wget``.

    ``--continue`` lets wget resume a partially downloaded file.

    Raises:
        subprocess.CalledProcessError: if ``wget`` exits with a non-zero status.
    """
    print(f"Downloading {output_path}")
    subprocess.check_call(["wget", "--continue", url, "-O", str(output_path)])


def _md5_of_file(file_path: Path) -> str:
    """Return the hex MD5 digest of *file_path*, reading it in chunks."""
    digest = hashlib.md5()
    with file_path.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: stream.read(_HASH_CHUNK_SIZE), b""):
            digest.update(chunk)
    return digest.hexdigest()


def validate_checksum(folder: Path):
    """Check every file listed in ``folder/checklist.chk`` against its MD5.

    The checklist is read as whitespace-separated ``<md5> <filename>`` pairs.
    On the first mismatch a message is printed and the process exits with
    status 1.
    """
    chks_parts = (folder / "checklist.chk").read_text().split()
    for expected_checksum, file in zip(chks_parts[::2], chks_parts[1::2]):
        file_path = folder / file
        # Stream the file: checkpoints can be far larger than available RAM.
        checksum = _md5_of_file(file_path)
        if checksum != expected_checksum:
            print(f"Checksum mismatch for {file_path}")
            sys.exit(1)


def download_tokenizer(presigned_url: str, target_folder: Path):
    """Download the tokenizer files into ``target_folder/tokenizer`` and verify them.

    The ``*`` in *presigned_url* is replaced by ``tokenizer/<filename>`` for
    each file.
    """
    tokenizer_folder = target_folder / "tokenizer"
    tokenizer_folder.mkdir(parents=True, exist_ok=True)

    for filename in [
        "text_tokenizer.json",
        "vqgan.ckpt",
        "vqgan.yaml",
        "checklist.chk",
    ]:
        download_file(
            presigned_url.replace("*", f"tokenizer/{filename}"),
            tokenizer_folder / filename,
        )

    validate_checksum(tokenizer_folder)


def download_model(presigned_url: str, target_folder: Path, model: str):
    """Download one model into ``target_folder/models/<model>`` and verify it.

    *model* must be ``"7b"`` or ``"30b"``. For any other value a message is
    printed and the process exits with status 1 before anything is created
    on disk. The ``*`` in *presigned_url* is replaced by ``<model>/<filename>``
    for each file.
    """
    weight_files = _MODEL_WEIGHT_FILES.get(model)
    if weight_files is None:
        # Reject unknown names before creating any directory.
        print(f"Unknown model: {model}")
        sys.exit(1)

    model_folder = target_folder / "models" / model
    model_folder.mkdir(parents=True, exist_ok=True)

    for filename in _COMMON_MODEL_FILES + weight_files:
        download_file(
            presigned_url.replace("*", f"{model}/{filename}"),
            model_folder / filename,
        )

    validate_checksum(model_folder)


def main():
    """Download the tokenizer, then the requested models, into ``./data``.

    The presigned URL is taken from the first command-line argument, or
    prompted for when absent. The model list is prompted for as a
    comma-separated string; an empty answer selects all models.
    """
    presigned_url = (
        sys.argv[1] if len(sys.argv) > 1 else input("Enter the URL from email: ")
    )

    target_folder = Path("./data")
    target_folder.mkdir(parents=True, exist_ok=True)

    download_tokenizer(presigned_url, target_folder)

    model_size = input(
        "Enter the list of models to download without spaces (7B,30B), or press Enter for all: "
    )
    # Normalize each entry and drop empty ones (e.g. from a trailing comma).
    requested = [name.strip().lower() for name in model_size.split(",")]
    models = [name for name in requested if name] or list(_MODEL_WEIGHT_FILES)

    # Fail fast on a bad name instead of after earlier large downloads finish.
    for model in models:
        if model not in _MODEL_WEIGHT_FILES:
            print(f"Unknown model: {model}")
            sys.exit(1)

    for model in models:
        download_model(presigned_url, target_folder, model)


if __name__ == "__main__":
    main()