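"""Build the dataframe for the Data Contamination Report Space.

Loads ``contamination_report.csv`` and enriches it with links and icons
(favicons, model URLs and dataset URLs), using JSON caches that are kept
locally and, unless disabled, mirrored to a dataset repo on the Hugging
Face Hub.
"""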
import json
import os
import filelock
import huggingface_hub
import pandas as pd
from utils import (
build_datasets_urls,
build_models_urls,
build_text_icon,
download_favicons,
get_base_url,
get_domain_name,
)
HF_ICON = "https://huggingface.co/front/assets/huggingface_logo.svg"
CROSS_ICON = "https://upload.wikimedia.org/wikipedia/commons/4/4e/Cross.png"
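# The JSON cache files below are mirrored to this Hub dataset repo; set
# DISABLE_ONLINE_CACHE to True to skip the download/upload and use only the local copies.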
DISABLE_ONLINE_CACHE = False
ONLINE_CACHE = "CONDA-Workshop/RequestCache"
def save_cache(cache_data, cache_file, initial_timestamp):
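    """Merge ``cache_data`` with the on-disk cache at ``cache_file`` and persist it.

    A file lock prevents concurrent writers from clobbering each other. If the
    file was created or modified after ``initial_timestamp``, its entries are
    merged into ``cache_data`` before writing. Unless ``DISABLE_ONLINE_CACHE``
    is set, the file is also uploaded to the ``ONLINE_CACHE`` dataset repo.
    Returns the merged dictionary.
    """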
print(f"Saving cache to {cache_file}")
# Acquire lock before reading and updating the file to prevent race conditions
with filelock.FileLock(f"{cache_file}.lock"):
# Check if the file has been modified since the initial read
current_timestamp = (
os.path.getmtime(cache_file) if os.path.exists(cache_file) else None
)
if current_timestamp is None or initial_timestamp != current_timestamp:
# File has been modified or created since initial read, re-read the file
try:
with open(cache_file, "r", encoding="utf8") as f:
# Update the dictionary with newly added entries
cache_dict = json.load(f)
# Test if cache_dict and cache_data are different
if cache_dict != cache_data:
cache_data.update(cache_dict)
except FileNotFoundError:
pass # If the file doesn't exist at this point, continue with the current dictionary
# Write the updated dictionary back to the file
with open(cache_file, "w", encoding="utf8") as f:
json.dump(cache_data, f, ensure_ascii=False, indent=4)
if not DISABLE_ONLINE_CACHE:
try:
huggingface_hub.upload_file(
repo_id=ONLINE_CACHE,
repo_type="dataset",
token=os.environ.get("TOKEN") or True,
path_in_repo=cache_file,
path_or_fileobj=cache_file,
)
except Exception as e:
print(f"Unable to upload {cache_file}: {e}")
return cache_data
def update_favicon_cache(sources):
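    """Return a mapping from base URLs to favicon URLs for ``sources``.

    The cache is read from ``favicons.json`` (first refreshed from the online
    cache repo when enabled), missing favicons are fetched with
    ``download_favicons``, and the result is persisted with ``save_cache``.
    """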
# Load the favicon dictionary if it exists
favicon_dict = {}
favicon_file_path = "favicons.json"
initial_timestamp = None
if not DISABLE_ONLINE_CACHE:
try:
huggingface_hub.hf_hub_download(
repo_id=ONLINE_CACHE,
repo_type="dataset",
token=os.environ.get("TOKEN") or True,
filename=favicon_file_path,
local_dir=os.getcwd(),
)
except Exception as e:
print(f"Unable to download favicons.json: {e}")
# Attempt to load the favicon dictionary and record its last modification time
if os.path.exists(favicon_file_path):
initial_timestamp = os.path.getmtime(favicon_file_path)
try:
with open(favicon_file_path, "r", encoding="utf8") as f:
favicon_dict = json.load(f)
except FileNotFoundError:
pass # File not found, proceed with an empty dictionary
# Determine which favicons need to be downloaded
missing_domains = [domain for domain in sources if domain not in favicon_dict]
# Download missing favicons in batch
if missing_domains:
new_favicon_urls = download_favicons(missing_domains)
favicon_dict.update(new_favicon_urls)
favicon_dict = save_cache(
cache_data=favicon_dict,
cache_file=favicon_file_path,
initial_timestamp=initial_timestamp,
)
return favicon_dict
def update_model_url_cache(models):
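    """Return a mapping from model names to their URLs.

    The cache is read from ``model_urls.json`` (first refreshed from the
    online cache repo when enabled), missing entries are resolved with
    ``build_models_urls``, and the result is persisted with ``save_cache``.
    """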
models = [x for x in models if x is not None]
models = list(set(models))
# Load the model url dictionary if it exists
model_url_dict = {}
model_url_file_path = "model_urls.json"
initial_timestamp = None
if not DISABLE_ONLINE_CACHE:
try:
huggingface_hub.hf_hub_download(
repo_id=ONLINE_CACHE,
repo_type="dataset",
token=os.environ.get("TOKEN") or True,
filename=model_url_file_path,
local_dir=os.getcwd(),
)
except Exception as e:
print(f"Unable to download model_urls.json: {e}")
# Attempt to load the model url dictionary and record its last modification time
if os.path.exists(model_url_file_path):
initial_timestamp = os.path.getmtime(model_url_file_path)
try:
with open(model_url_file_path, "r", encoding="utf8") as f:
model_url_dict = json.load(f)
except FileNotFoundError:
pass # File not found, proceed with an empty dictionary
# Determine which model urls need to be downloaded
missing_model_urls = [model for model in models if model not in model_url_dict]
# Download missing model urls in batch
if missing_model_urls:
new_model_urls = build_models_urls(missing_model_urls)
model_url_dict.update(new_model_urls)
model_url_dict = save_cache(
cache_data=model_url_dict,
cache_file=model_url_file_path,
initial_timestamp=initial_timestamp,
)
return model_url_dict
def update_dataset_url_cache(datasets):
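    """Return a mapping from dataset names to their URLs.

    The cache is read from ``dataset_urls.json`` (first refreshed from the
    online cache repo when enabled), missing entries are resolved with
    ``build_datasets_urls``, and the result is persisted with ``save_cache``.
    """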
datasets = [x for x in datasets if x is not None]
datasets = list(set(datasets))
# Load the dataset url dictionary if it exists
dataset_url_dict = {}
dataset_url_file_path = "dataset_urls.json"
initial_timestamp = None
if not DISABLE_ONLINE_CACHE:
try:
huggingface_hub.hf_hub_download(
repo_id=ONLINE_CACHE,
repo_type="dataset",
token=os.environ.get("TOKEN") or True,
filename=dataset_url_file_path,
local_dir=os.getcwd(),
)
except Exception as e:
print(f"Unable to download dataset_urls.json: {e}")
# Attempt to load the dataset url dictionary and record its last modification time
if os.path.exists(dataset_url_file_path):
initial_timestamp = os.path.getmtime(dataset_url_file_path)
try:
with open(dataset_url_file_path, "r", encoding="utf8") as f:
dataset_url_dict = json.load(f)
except FileNotFoundError:
pass # File not found, proceed with an empty dictionary
# Determine which dataset urls need to be downloaded
missing_dataset_urls = [
dataset for dataset in datasets if dataset not in dataset_url_dict
]
# Download missing dataset urls in batch
if missing_dataset_urls:
new_dataset_urls = build_datasets_urls(missing_dataset_urls)
dataset_url_dict.update(new_dataset_urls)
dataset_url_dict = save_cache(
cache_data=dataset_url_dict,
cache_file=dataset_url_file_path,
initial_timestamp=initial_timestamp,
)
return dataset_url_dict
def get_dataframe():
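    """Load ``contamination_report.csv`` and build the dataframe shown in the report.

    References, PRs, evaluation datasets and contaminated sources are turned
    into linked text/icon entries using the favicon, model URL and dataset URL
    caches, and the split percentages are converted to fractions.
    """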
# Load the contamination_report.csv file
data = pd.read_csv("contamination_report.csv", delimiter=";", header=0)
    # Update the favicon cache with the base URL of every reference
    favicon_dict = update_favicon_cache([get_base_url(x) for x in data["Reference"]])
# Update the model url dictionary
model_url_dict = update_model_url_cache(
data[data["Model or corpus"] == "model"]["Contaminated Source"]
)
# Update the dataset url dictionary
dataset_url_dict = update_dataset_url_cache(
list(data["Evaluation Dataset"])
+ list(data[data["Model or corpus"] == "corpus"]["Contaminated Source"])
)
    # Replace each reference with a linked domain name and its favicon
data["Reference"] = data["Reference"].apply(
lambda x: build_text_icon(
text=get_domain_name(x),
url=x,
icon_url=favicon_dict.get(get_base_url(x), ""),
)
)
PR_URL_FORMAT = "https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report/discussions/{}"
data["PR"] = data["PR"].apply(
lambda x: build_text_icon(
text="",
url=PR_URL_FORMAT.format(int(x)) if not pd.isna(x) else "no link",
icon_url=HF_ICON if x == x else CROSS_ICON,
)
)
data["Evaluation Dataset"] = data["Evaluation Dataset"].apply(
lambda x: build_text_icon(
text=x,
url=dataset_url_dict.get(x, ""),
icon_url=HF_ICON,
)
)
data["Evaluation Dataset"] = data.apply(
lambda x: x["Evaluation Dataset"] + f" ({x['Subset']})" if pd.notna(x["Subset"]) else x["Evaluation Dataset"],
axis=1,
)
del data["Subset"]
# For "Contaminated Source" use build_dataset_url if "Model or corpus" is "corpus" and build_model_url if "Model or corpus" is "model"
data["Contaminated Source"] = data.apply(
lambda x: build_text_icon(
text=x["Contaminated Source"] + f" ({x['Version']})" if pd.notna(x["Version"]) else x["Contaminated Source"],
url=dataset_url_dict.get(x["Contaminated Source"], "")
if x["Model or corpus"] == "corpus"
else model_url_dict.get(x["Contaminated Source"], ""),
icon_url=HF_ICON,
),
axis=1,
)
del data["Version"]
data["Train Split"] = data["Train Split"].apply(lambda x: x/100 if x else x)
data["Development Split"] = data["Development Split"].apply(lambda x: x/100 if x else x)
data["Test Split"] = data["Test Split"].apply(lambda x: x/100 if x else x)
return data
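# Minimal usage sketch for local testing (assumes contamination_report.csv sits in
# the working directory; Hub access is only needed while the online cache is enabled):
if __name__ == "__main__":
    report = get_dataframe()
    print(report.head())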