import boto3
import os
import zipfile
from glob import glob
import shutil
from huggingface_hub import HfApi
import gradio as gr
from tqdm.auto import tqdm
import threading
################################################################################
# Declarations:
print("Declaring variables.")
# AWS S3 service name
service_name = 's3'
# AWS S3 bucket names
biorxiv_bucket_name = 'biorxiv-src-monthly'
medrxiv_bucket_name = 'medrxiv-src-monthly'
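# Both buckets are requester-pays, hence the RequestPayer='requester'
# arguments on the S3 calls below (the caller's AWS account is billed
# for the listing and transfer)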
# AWS region name
region_name = 'us-east-1'
# Hugging Face destination repository name
destination_repo_name = 'xml-dump-monthly'
################################################################################
print("Initiating clients.")
# Create a S3 client
s3_client = boto3.client(
service_name='s3',
region_name=region_name,
aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY')
)
paginator = s3_client.get_paginator('list_objects_v2')
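# NOTE: list_objects_v2 returns at most 1000 keys per call, so the
# paginator is needed to walk a full month's listing page by page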
# Create a Hugging Face API client
access_token = os.getenv('HF_API_KEY')
hugging_face_api = HfApi(token=access_token)
# Create the destination dataset repo (exist_ok=True makes reruns a no-op)
hugging_face_api.create_repo(
    repo_id=destination_repo_name,
    repo_type="dataset",
    private=False,
    exist_ok=True
)
# Extract the Hugging Face username to build the fully qualified repo id
username = hugging_face_api.whoami()['name']
repo_id = f"{username}/{destination_repo_name}"
################################################################################
def download_biorxiv(Prefix=""):
print("Downloading Biorxiv files.")
# Output folders for downloaded files
biorxiv_output_folder = Prefix + 'biorxiv-xml-dump'
# Create output folders if they don't exist
os.makedirs(biorxiv_output_folder, exist_ok=True)
# Gather all objects from Biorxiv bucket
biorxiv_pages = paginator.paginate(
Bucket=biorxiv_bucket_name,
RequestPayer='requester',
Prefix=Prefix
).build_full_result()
# Dowload all objects from Biorxiv bucket
for biorxiv_object in tqdm(biorxiv_pages['Contents'], desc=Prefix):
# Get the file name
file = biorxiv_object['Key']
# Check if the file is a zip file
if file.endswith(".meca"):
# Proccess the zip file
try:
# Download the file
s3_client.download_file(biorxiv_bucket_name, file, 'tmp_bio.meca', ExtraArgs={'RequestPayer':'requester'})
# Unzip meca file
with zipfile.ZipFile('tmp_bio.meca', 'r') as zip_ref:
zip_ref.extractall("tmp_bio")
# Gather the xml file
xml = glob('tmp_bio/content/*.xml')
# Copy the xml file to the output folder
shutil.copy(xml[0], biorxiv_output_folder)
# Remove the tmp_bio folder and file
shutil.rmtree('tmp_bio')
os.remove('tmp_bio.meca')
except Exception as e:
print(f"Error processing file {file}: {e}")
# Zip the output folder
shutil.make_archive(biorxiv_output_folder, 'zip', biorxiv_output_folder)
# Upload the zip files to Hugging Face
print(f"Uploading {biorxiv_output_folder}.zip to Hugging Face repo {repo_id}.")
hugging_face_api.upload_file(path_or_fileobj=f'{biorxiv_output_folder}.zip', path_in_repo=f'{biorxiv_output_folder}.zip', repo_id=repo_id, repo_type="dataset")
print("Biorxiv Done.")
# Run the bioRxiv download for the October 2024 prefix on a background thread
biorxiv_thread = threading.Thread(target=download_biorxiv, args=("Current_Content/October_2024/",))
# Start the thread
biorxiv_thread.start()
###############################################################################
# Dummy Gradio app: serving it keeps the Space process alive while the
# download thread runs in the background
def greet(name, intensity):
    return "Hello, " + name + "!" * int(intensity)
demo = gr.Interface(
    fn=greet,
    inputs=["text", "slider"],
    outputs=["text"],
)
demo.launch()