File size: 3,954 Bytes
3ef2b6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import boto3
import os
import zipfile
from glob import glob
import shutil
from huggingface_hub import HfApi
import gradio as gr
from tqdm.auto import tqdm
import threading


################################################################################

# Module-level configuration values.
print("Declaring variables.")

# AWS service name (S3)
service_name = "s3"

# AWS S3 bucket names for bioRxiv and medRxiv
biorxiv_bucket_name, medrxiv_bucket_name = (
    "biorxiv-src-monthly",
    "medrxiv-src-monthly",
)

# AWS region passed to the boto3 client
region_name = "us-east-1"

# Name of the Hugging Face dataset repository that receives the uploads
destination_repo_name = "xml-dump-monthly"

################################################################################

print("Initiating clients.")

# Create an S3 client. Credentials are read from the environment; the service
# name comes from the `service_name` constant declared above instead of a
# duplicated string literal.
s3_client = boto3.client(
    service_name=service_name,
    region_name=region_name,
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
)

# Paginator used to list bucket contents page by page
paginator = s3_client.get_paginator('list_objects_v2')

# Create a Hugging Face API client authenticated with the HF_API_KEY token
access_token = os.getenv('HF_API_KEY')
hugging_face_api = HfApi(token=access_token)

# Create the public dataset repo (exist_ok=True is passed so that a
# pre-existing repo is accepted)
hugging_face_api.create_repo(
    repo_id=destination_repo_name,
    repo_type="dataset",
    private=False,
    exist_ok=True,
)

# Extract the Hugging Face username and build the fully qualified repo id
username = hugging_face_api.whoami()['name']
repo_id = f"{username}/{destination_repo_name}"

################################################################################


################################################################################
def download_medrxiv(Prefix=""):
    """Collect the XML of every medRxiv ``.meca`` archive under *Prefix* and upload it.

    Lists the objects in the medRxiv bucket whose key starts with ``Prefix``,
    downloads each ``.meca`` archive, extracts it, copies the first XML file
    found under ``content/`` into ``<Prefix>medrxiv-xml-dump``, zips that
    folder and uploads the zip to the Hugging Face dataset repo ``repo_id``.

    Args:
        Prefix: S3 key prefix used to filter the bucket listing. It is also
            prepended to the local output folder name and to the path of the
            zip inside the repo.

    Returns:
        None.

    Side effects:
        Uses ``tmp_med.meca`` and ``tmp_med/`` in the current working
        directory as scratch space; both are removed after every archive,
        whether or not processing succeeded. A failing archive is reported
        with ``print`` and skipped.
    """
    print("Downloading Medrxiv files.")

    # Output folder for the extracted XML files
    medrxiv_output_folder = Prefix + 'medrxiv-xml-dump'

    # Gather all objects from the Medrxiv bucket
    medrxiv_pages = paginator.paginate(
        Bucket=medrxiv_bucket_name,
        RequestPayer='requester',
        Prefix=Prefix
    ).build_full_result()

    # The aggregated result carries no 'Contents' key when nothing matched the
    # prefix; treat that as "nothing to do" instead of raising KeyError.
    medrxiv_objects = medrxiv_pages.get('Contents', [])
    if not medrxiv_objects:
        print(f"No Medrxiv objects found for prefix '{Prefix}'.")
        return

    # Create the output folder if it doesn't exist
    os.makedirs(medrxiv_output_folder, exist_ok=True)

    # Download all objects from the Medrxiv bucket
    for medrxiv_object in tqdm(medrxiv_objects, desc=Prefix):

        # Get the file name
        file = medrxiv_object['Key']

        # Only .meca files (zip archives) are processed
        if not file.endswith(".meca"):
            continue

        try:
            # Download the file
            s3_client.download_file(medrxiv_bucket_name, file, 'tmp_med.meca', ExtraArgs={'RequestPayer':'requester'})

            # Unzip the meca file
            with zipfile.ZipFile('tmp_med.meca', 'r') as zip_ref:
                zip_ref.extractall("tmp_med")

            # Gather the xml file
            xml = glob('tmp_med/content/*.xml')
            if not xml:
                raise FileNotFoundError("no XML file found under content/")

            # Copy the xml file to the output folder
            shutil.copy(xml[0], medrxiv_output_folder)

        except Exception as e:
            # Best effort: report the failing archive and carry on with the rest
            print(f"Error processing file {file}: {e}")

        finally:
            # Always clear the scratch files. Leftovers from a failed archive
            # would otherwise be seen by the glob of a later iteration and the
            # wrong XML could be copied.
            shutil.rmtree('tmp_med', ignore_errors=True)
            if os.path.exists('tmp_med.meca'):
                os.remove('tmp_med.meca')

    # Zip the output folder
    shutil.make_archive(medrxiv_output_folder, 'zip', medrxiv_output_folder)

    print(f"Uploading {medrxiv_output_folder}.zip to Hugging Face repo {repo_id}.")

    hugging_face_api.upload_file(path_or_fileobj=f'{medrxiv_output_folder}.zip', path_in_repo=f'{medrxiv_output_folder}.zip', repo_id=repo_id, repo_type="dataset")

    print("Medrxiv Done.")



# Run the medRxiv download for the October 2024 prefix in its own thread so
# the statements that follow are not held up by it.
second_thread2 = threading.Thread(
    target=download_medrxiv,
    kwargs={"Prefix": "Current_Content/October_2024/"},
)
second_thread2.start()

###############################################################################

# Dummy app

def greet(name, intensity):
    """Return ``"Hello, <name>"`` followed by ``int(intensity)`` exclamation marks."""
    salutation = "Hello, " + name
    emphasis = "!" * int(intensity)
    return salutation + emphasis

# Gradio interface around `greet`: a text box and a slider as inputs, one text
# output.
demo = gr.Interface(fn=greet, inputs=["text", "slider"], outputs=["text"])

# Start serving the interface
demo.launch()