Spaces:
Paused
Paused
File size: 3,954 Bytes
3ef2b6e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import boto3
import os
import zipfile
from glob import glob
import shutil
from huggingface_hub import HfApi
import gradio as gr
from tqdm.auto import tqdm
import threading
################################################################################
# Declarations:
print("Declaring variables.")
# AWS S3 service name
# NOTE(review): unused below -- s3_client passes the literal 's3' directly
service_name = 's3'
# AWS S3 bucket names (monthly source dumps; downloads use RequestPayer='requester')
biorxiv_bucket_name = 'biorxiv-src-monthly'
medrxiv_bucket_name = 'medrxiv-src-monthly'
# AWS region name
region_name = 'us-east-1'
# Hugging Face destination repository name (dataset repo, created below)
destination_repo_name = 'xml-dump-monthly'
################################################################################
print("Initiating clients.")
# Create a S3 client; credentials are read from environment variables
s3_client = boto3.client(
    service_name='s3',
    region_name=region_name,
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY')
)
# Paginator so the full object listing can be collected across result pages
paginator = s3_client.get_paginator('list_objects_v2')
# Create a Hugging Face API client authenticated via the HF_API_KEY env var
access_token = os.getenv('HF_API_KEY')
hugging_face_api = HfApi(token=access_token)
# Create the destination dataset repo (exist_ok=True makes this a no-op on reruns)
hugging_face_api.create_repo(
    repo_id=destination_repo_name,
    repo_type="dataset",
    private=False,
    exist_ok=True
)
# Extract Hugging Face username to build the fully-qualified repo id
username = hugging_face_api.whoami()['name']
repo_id = f"{username}/{destination_repo_name}"
################################################################################
################################################################################
def download_medrxiv(Prefix=""):
    """Mirror medRxiv XML articles under *Prefix* into the Hugging Face repo.

    Downloads every ``.meca`` archive below the given S3 key prefix from the
    requester-pays medRxiv bucket, extracts each archive's article XML into a
    local folder, zips that folder, and uploads the zip to ``repo_id``.

    Parameters
    ----------
    Prefix : str
        S3 key prefix to restrict the listing (e.g.
        ``"Current_Content/October_2024/"``); also prefixes the local
        output folder name.
    """
    print("Downloading Medrxiv files.")
    # Output folder for extracted XML files
    medrxiv_output_folder = Prefix + 'medrxiv-xml-dump'
    # Create output folder if it doesn't exist
    os.makedirs(medrxiv_output_folder, exist_ok=True)
    # Gather all objects from Medrxiv bucket (transfer billed to requester)
    medrxiv_pages = paginator.paginate(
        Bucket=medrxiv_bucket_name,
        RequestPayer='requester',
        Prefix=Prefix
    ).build_full_result()
    # 'Contents' is absent when the prefix matches no objects -- default to []
    # instead of raising KeyError.
    for medrxiv_object in tqdm(medrxiv_pages.get('Contents', []), desc=Prefix):
        # Get the object key
        file = medrxiv_object['Key']
        # Only .meca archives carry article content; skip everything else
        if not file.endswith(".meca"):
            continue
        # Process the archive
        try:
            # Download the file to a fixed scratch path
            s3_client.download_file(medrxiv_bucket_name, file, 'tmp_med.meca', ExtraArgs={'RequestPayer': 'requester'})
            # Unzip the meca file (a zip container) into a scratch folder
            with zipfile.ZipFile('tmp_med.meca', 'r') as zip_ref:
                zip_ref.extractall("tmp_med")
            # Gather the xml file and copy it to the output folder
            xml = glob('tmp_med/content/*.xml')
            if xml:
                shutil.copy(xml[0], medrxiv_output_folder)
            else:
                # Archive without an article XML: report it instead of
                # raising IndexError.
                print(f"No XML found in {file}; skipping.")
        except Exception as e:
            print(f"Error processing file {file}: {e}")
        finally:
            # Always remove the scratch folder and file so a failed archive
            # cannot leak disk space or contaminate the next iteration
            # (cleanup was previously skipped whenever an error occurred).
            if os.path.isdir('tmp_med'):
                shutil.rmtree('tmp_med')
            if os.path.exists('tmp_med.meca'):
                os.remove('tmp_med.meca')
    # Zip the output folder and publish it to the Hugging Face repo
    shutil.make_archive(medrxiv_output_folder, 'zip', medrxiv_output_folder)
    print(f"Uploading {medrxiv_output_folder}.zip to Hugging Face repo {repo_id}.")
    hugging_face_api.upload_file(path_or_fileobj=f'{medrxiv_output_folder}.zip', path_in_repo=f'{medrxiv_output_folder}.zip', repo_id=repo_id, repo_type="dataset")
    print("Medrxiv Done.")
# Run the medRxiv download in a background thread so the Gradio app below can
# start serving while the (long) S3 mirror job runs.
# NOTE(review): this starts at import time and is never joined -- presumably
# intentional for a Space that must come up quickly; confirm.
second_thread2 = threading.Thread(target=download_medrxiv, args=("Current_Content/October_2024/",))
# Start thread
second_thread2.start()
###############################################################################
# Dummy app
def greet(name, intensity):
    """Return a greeting for *name* followed by int(intensity) exclamation marks."""
    exclamations = "!" * int(intensity)
    return f"Hello, {name}{exclamations}"
# Minimal Gradio UI wired to greet() -- presumably here to keep the Space
# process alive and healthy while the download thread works; confirm.
demo = gr.Interface(
    fn=greet,
    inputs=["text", "slider"],
    outputs=["text"],
)
# launch() blocks the main thread and serves the app
demo.launch()
|