import boto3
import os
import zipfile
from glob import glob
import shutil
from huggingface_hub import HfApi
import gradio as gr
from tqdm.auto import tqdm
import threading
################################################################################
# Declarations:
print("Declaring variables.")
# AWS S3 service name
service_name = 's3'
# AWS S3 bucket names
biorxiv_bucket_name = 'biorxiv-src-monthly'
medrxiv_bucket_name = 'medrxiv-src-monthly'
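# Both buckets are requester-pays, hence the RequestPayer='requester'
# arguments on the S3 calls below (the caller's AWS account is billed
# for the listing and transfer)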
# AWS region name
region_name = 'us-east-1'
# Hugging Face destination repository name
destination_repo_name = 'xml-dump-monthly'
################################################################################
print("Initiating clients.")
# Create a S3 client
s3_client = boto3.client(
service_name='s3',
region_name=region_name,
aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY')
)
paginator = s3_client.get_paginator('list_objects_v2')
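# NOTE: list_objects_v2 returns at most 1000 keys per call, so the
# paginator is needed to walk a full month's listing page by page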
# Create a Hugging Face API client
access_token = os.getenv('HF_API_KEY')
hugging_face_api = HfApi(token=access_token)
# Create the destination dataset repo (exist_ok=True makes reruns a no-op)
hugging_face_api.create_repo(
    repo_id=destination_repo_name,
    repo_type="dataset",
    private=False,
    exist_ok=True
)
# Extract the Hugging Face username to build the fully qualified repo id
username = hugging_face_api.whoami()['name']
repo_id = f"{username}/{destination_repo_name}"
################################################################################
def download_biorxiv(Prefix=""):
print("Downloading Biorxiv files.")
# Output folders for downloaded files
biorxiv_output_folder = Prefix + 'biorxiv-xml-dump'
# Create output folders if they don't exist
os.makedirs(biorxiv_output_folder, exist_ok=True)
# Gather all objects from Biorxiv bucket
biorxiv_pages = paginator.paginate(
Bucket=biorxiv_bucket_name,
RequestPayer='requester',
Prefix=Prefix
).build_full_result()
# Dowload all objects from Biorxiv bucket
for biorxiv_object in tqdm(biorxiv_pages['Contents'], desc=Prefix):
# Get the file name
file = biorxiv_object['Key']
# Check if the file is a zip file
if file.endswith(".meca"):
# Proccess the zip file
try:
# Download the file
s3_client.download_file(biorxiv_bucket_name, file, 'tmp_bio.meca', ExtraArgs={'RequestPayer':'requester'})
# Unzip meca file
with zipfile.ZipFile('tmp_bio.meca', 'r') as zip_ref:
zip_ref.extractall("tmp_bio")
# Gather the xml file
xml = glob('tmp_bio/content/*.xml')
# Copy the xml file to the output folder
shutil.copy(xml[0], biorxiv_output_folder)
# Remove the tmp_bio folder and file
shutil.rmtree('tmp_bio')
os.remove('tmp_bio.meca')
except Exception as e:
print(f"Error processing file {file}: {e}")
# Zip the output folder
shutil.make_archive(biorxiv_output_folder, 'zip', biorxiv_output_folder)
# Upload the zip files to Hugging Face
print(f"Uploading {biorxiv_output_folder}.zip to Hugging Face repo {repo_id}.")
hugging_face_api.upload_file(path_or_fileobj=f'{biorxiv_output_folder}.zip', path_in_repo=f'{biorxiv_output_folder}.zip', repo_id=repo_id, repo_type="dataset")
print("Biorxiv Done.")
# Run the bioRxiv download for the October 2024 prefix on a background thread
biorxiv_thread = threading.Thread(target=download_biorxiv, args=("Current_Content/October_2024/",))
# Start the thread
biorxiv_thread.start()
###############################################################################
# Dummy Gradio app: serving it keeps the Space process alive while the
# download thread runs in the background
def greet(name, intensity):
    return "Hello, " + name + "!" * int(intensity)
demo = gr.Interface(
    fn=greet,
    inputs=["text", "slider"],
    outputs=["text"],
)
demo.launch()