File size: 3,922 Bytes
14a2693
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
import shutil
import tempfile
import threading
import zipfile
from glob import glob

import boto3
import gradio as gr
from huggingface_hub import HfApi
from tqdm.auto import tqdm


################################################################################

# Declarations:
print("Declaring variables.")
# AWS service identifier handed to boto3 when the client is created
service_name = 's3'

# AWS S3 bucket names
biorxiv_bucket_name = 'biorxiv-src-monthly'
medrxiv_bucket_name = 'medrxiv-src-monthly'

# AWS region name
region_name = 'us-east-1'

# Hugging Face destination repository name
destination_repo_name = 'xml-dump-monthly'

################################################################################

print("Initiating clients.")

# Create an S3 client. The service comes from the `service_name` constant
# declared above (instead of repeating the literal), and the credentials are
# read from the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables.
s3_client = boto3.client(
    service_name=service_name,
    region_name=region_name,
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY')
)
# Paginator used to list bucket objects
paginator = s3_client.get_paginator('list_objects_v2')

# Create a Hugging Face API client; the token is read from HF_API_KEY
access_token = os.getenv('HF_API_KEY')
hugging_face_api = HfApi(token=access_token)

# Create the public dataset repo; exist_ok=True tolerates a repo that is
# already there
hugging_face_api.create_repo(
    repo_id=destination_repo_name,
    repo_type="dataset",
    private=False,
    exist_ok=True
)

# Extract the Hugging Face username and build the full "<user>/<repo>" id
username = hugging_face_api.whoami()['name']
repo_id = f"{username}/{destination_repo_name}"

################################################################################

def _copy_first_xml_from_meca(meca_path, work_dir, output_folder):
    """Unzip a .meca archive and copy its first content/*.xml to output_folder.

    Args:
        meca_path: Path of the downloaded .meca (zip) file.
        work_dir: Scratch directory the archive is extracted into.
        output_folder: Existing directory that receives the XML file.

    Raises:
        FileNotFoundError: If the archive holds no ``content/*.xml`` file.
        zipfile.BadZipFile: If ``meca_path`` is not a valid zip archive.
    """
    extract_dir = os.path.join(work_dir, 'extracted')

    # Unzip meca file
    with zipfile.ZipFile(meca_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    # Gather the xml files; sorted so the pick is deterministic
    xml_files = sorted(glob(os.path.join(extract_dir, 'content', '*.xml')))
    if not xml_files:
        raise FileNotFoundError("no XML file found under content/")

    # Copy the xml file to the output folder
    shutil.copy(xml_files[0], output_folder)


def download_biorxiv(Prefix=""):
    """Collect the XML of every .meca object under Prefix and upload them zipped.

    Lists the objects of the bucket named by ``biorxiv_bucket_name`` whose key
    starts with ``Prefix``. For each key ending in ``.meca`` it downloads the
    archive, extracts it, and copies one ``content/*.xml`` file into
    ``<Prefix>biorxiv-xml-dump``. That folder is then zipped and the zip is
    uploaded to the Hugging Face dataset repo ``repo_id``.

    A failure on a single object is printed and that object is skipped; the
    remaining objects are still processed.

    Args:
        Prefix: S3 key prefix to list. It is also prepended to the local output
            folder name and to the uploaded zip's path in the repo.
    """
    print("Downloading Biorxiv files.")

    # Output folders for downloaded files
    biorxiv_output_folder = Prefix + 'biorxiv-xml-dump'

    # Create output folders if they don't exist
    os.makedirs(biorxiv_output_folder, exist_ok=True)

    # Gather all objects from Biorxiv bucket
    biorxiv_pages = paginator.paginate(
        Bucket=biorxiv_bucket_name,
        RequestPayer='requester',
        Prefix=Prefix
    ).build_full_result()

    # The 'Contents' key can be missing when nothing matches the prefix;
    # treat that as an empty listing instead of failing with KeyError.
    biorxiv_objects = biorxiv_pages.get('Contents', [])

    # Download all objects from Biorxiv bucket
    for biorxiv_object in tqdm(biorxiv_objects, desc=Prefix):

        # Get the file name
        file = biorxiv_object['Key']

        # Only .meca (zip) archives are processed
        if not file.endswith(".meca"):
            continue

        try:
            # A fresh scratch directory per object: it is removed even when
            # processing fails, so leftovers from one archive can never be
            # picked up while handling the next one, and concurrent calls do
            # not share scratch paths.
            with tempfile.TemporaryDirectory(prefix='tmp_bio_') as work_dir:
                meca_path = os.path.join(work_dir, 'tmp_bio.meca')

                # Download the file
                s3_client.download_file(biorxiv_bucket_name, file, meca_path, ExtraArgs={'RequestPayer':'requester'})

                _copy_first_xml_from_meca(meca_path, work_dir, biorxiv_output_folder)

        except Exception as e:
            # Best effort: report the failing object and move on
            print(f"Error processing file {file}: {e}")

    # Zip the output folder
    shutil.make_archive(biorxiv_output_folder, 'zip', biorxiv_output_folder)

    # Upload the zip files to Hugging Face
    print(f"Uploading {biorxiv_output_folder}.zip to Hugging Face repo {repo_id}.")
    hugging_face_api.upload_file(path_or_fileobj=f'{biorxiv_output_folder}.zip', path_in_repo=f'{biorxiv_output_folder}.zip', repo_id=repo_id, repo_type="dataset")

    print("Biorxiv Done.")



# Build a worker thread that calls download_biorxiv with the October 2024 prefix
first_thread2 = threading.Thread(
    target=download_biorxiv,
    kwargs={"Prefix": "Current_Content/October_2024/"},
)

# Kick off the worker thread
first_thread2.start()


###############################################################################

# Dummy app

def greet(name, intensity):
    """Return "Hello, <name>" followed by int(intensity) exclamation marks."""
    greeting = "Hello, " + name
    exclamations = "!" * int(intensity)
    return greeting + exclamations

# Gradio interface around greet: a "text" and a "slider" input component,
# and a single "text" output component.
demo = gr.Interface(
    fn=greet,
    inputs=["text", "slider"],
    outputs=["text"],
)

# Launch the interface.
demo.launch()