m_o / app.py
bluuebunny's picture
Create app.py
3ef2b6e verified
import boto3
import os
import zipfile
from glob import glob
import shutil
from huggingface_hub import HfApi
import gradio as gr
from tqdm.auto import tqdm
import threading
################################################################################
# Declarations:
print("Declaring variables.")
# AWS S3 service name
service_name = 's3'
# AWS S3 bucket names
biorxiv_bucket_name = 'biorxiv-src-monthly'
medrxiv_bucket_name = 'medrxiv-src-monthly'
# AWS region name
region_name = 'us-east-1'
# Hugging Face destination repository name
destination_repo_name = 'xml-dump-monthly'
################################################################################
print("Initiating clients.")
# Create a S3 client
s3_client = boto3.client(
service_name='s3',
region_name=region_name,
aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY')
)
paginator = s3_client.get_paginator('list_objects_v2')
# Create a Hugging Face API client
access_token = os.getenv('HF_API_KEY')
hugging_face_api = HfApi(token=access_token)
# Create a dataset repo
hugging_face_api.create_repo(
repo_id=destination_repo_name,
repo_type="dataset",
private=False,
exist_ok=True
)
# Extract Hugging facec username
username = hugging_face_api.whoami()['name']
repo_id = f"{username}/{destination_repo_name}"
################################################################################
################################################################################
def download_medrxiv(Prefix=""):
print("Downloading Medrxiv files.")
# Output folders for downloaded files
medrxiv_output_folder = Prefix + 'medrxiv-xml-dump'
# Create output folders if they don't exist
os.makedirs(medrxiv_output_folder, exist_ok=True)
# Gather all objects from Medrxiv bucket
medrxiv_pages = paginator.paginate(
Bucket=medrxiv_bucket_name,
RequestPayer='requester',
Prefix=Prefix
).build_full_result()
# Dowload all objects from Medrxiv bucket
for medrxiv_object in tqdm(medrxiv_pages['Contents'], desc=Prefix):
# Get the file name
file = medrxiv_object['Key']
# Check if the file is a zip file
if file.endswith(".meca"):
# Proccess the zip file
try:
# Download the file
s3_client.download_file(medrxiv_bucket_name, file, 'tmp_med.meca', ExtraArgs={'RequestPayer':'requester'})
# Unzip meca file
with zipfile.ZipFile('tmp_med.meca', 'r') as zip_ref:
zip_ref.extractall("tmp_med")
# Gather the xml file
xml = glob('tmp_med/content/*.xml')
# Copy the xml file to the output folder
shutil.copy(xml[0], medrxiv_output_folder)
# Remove the tmp_med folder and file
shutil.rmtree('tmp_med')
os.remove('tmp_med.meca')
except Exception as e:
print(f"Error processing file {file}: {e}")
# Zip the output folder
shutil.make_archive(medrxiv_output_folder, 'zip', medrxiv_output_folder)
print(f"Uploading {medrxiv_output_folder}.zip to Hugging Face repo {repo_id}.")
hugging_face_api.upload_file(path_or_fileobj=f'{medrxiv_output_folder}.zip', path_in_repo=f'{medrxiv_output_folder}.zip', repo_id=repo_id, repo_type="dataset")
print("Medrxiv Done.")
# Create separate threads function
second_thread2 = threading.Thread(target=download_medrxiv, args=("Current_Content/October_2024/",))
# Start thread
second_thread2.start()
###############################################################################
# Dummy app
def greet(name, intensity):
return "Hello, " + name + "!" * int(intensity)
demo = gr.Interface(
fn=greet,
inputs=["text", "slider"],
outputs=["text"],
)
demo.launch()