|
import pandas as pd
|
|
import numpy as np
|
|
from pathlib import Path
|
|
|
|
def _process_updates(new_data_directory, updated_embeddings_directory, embedding_size):
    """Load matching (parquet, .npy) update pairs from the given directories.

    For every ``*.parquet`` file in *new_data_directory*, look for a ``.npy``
    file with the same stem in *updated_embeddings_directory*. Pairs whose row
    counts disagree, or whose embedding width differs from *embedding_size*,
    are reported and skipped.

    Returns:
        tuple[list[pd.DataFrame], list[np.ndarray]]: the accepted DataFrames
        and their embedding arrays, in sorted filename order.
    """
    dfs_list = []
    embeddings_list = []
    for data_file in sorted(Path(new_data_directory).glob("*.parquet")):
        corresponding_embedding_file = Path(updated_embeddings_directory) / (data_file.stem + ".npy")
        if not corresponding_embedding_file.exists():
            print(f"No corresponding embedding file found for {data_file.name}")
            continue

        df = pd.read_parquet(data_file)
        new_embeddings = np.load(corresponding_embedding_file, allow_pickle=True)

        # Rows and embeddings must stay aligned one-to-one.
        if df.shape[0] != new_embeddings.shape[0]:
            print(f"Shape mismatch for {data_file.name}: DataFrame has {df.shape[0]} rows, embeddings have {new_embeddings.shape[0]} rows. Skipping.")
            continue

        # Mixed embedding widths cannot be vstacked with the existing corpus.
        if new_embeddings.shape[1] != embedding_size:
            print(f"Skipping {data_file.name} due to embedding size mismatch.")
            continue

        dfs_list.append(df)
        embeddings_list.append(new_embeddings)
    return dfs_list, embeddings_list


def _dedupe_by_title(df, embeddings):
    """Drop rows with duplicate ``title``, keeping the last occurrence.

    The same boolean mask is applied positionally to *embeddings* so the
    DataFrame and the embedding matrix stay row-aligned.

    Returns:
        tuple[pd.DataFrame, np.ndarray]: the filtered pair.
    """
    keep_mask = ~df.duplicated(subset=["title"], keep="last")
    # .to_numpy(): index the ndarray with a plain boolean array, not a
    # pandas Series (whose index may contain duplicates after concat).
    return df[keep_mask], embeddings[keep_mask.to_numpy()]


def combine_databases():
    """Merge existing BioRxiv data/embeddings with update batches and save.

    Reads the aggregated BioRxiv parquet dataset and its embedding matrix,
    appends any update batches found in the bio/med update directories
    (validated by _process_updates), removes duplicate titles keeping the
    newest row, and writes the combined outputs back to disk.

    Side effects: reads and writes parquet/npy files under hard-coded
    relative paths; prints progress to stdout.

    Raises:
        AssertionError: if a combined DataFrame and its embeddings end up
            with different row counts.
    """
    aggregated_data_path = Path("aggregated_data")
    db_update_bio_path = Path("db_update")
    # NOTE: "embaddings" spelling is the on-disk filename; do not "fix" it.
    biorxiv_embeddings_path = Path("biorxiv_ubin_embaddings.npy")
    embed_update_bio_path = Path("embed_update")

    db_update_med_path = Path("db_update_med")
    embed_update_med_path = Path("embed_update_med")

    # Load the existing BioRxiv corpus (directory of parquet files) and its
    # row-aligned embedding matrix.
    df_bio_existing = pd.read_parquet(aggregated_data_path)
    bio_embeddings_existing = np.load(biorxiv_embeddings_path, allow_pickle=True)
    print(f"Existing BioRxiv data shape: {df_bio_existing.shape}, Existing BioRxiv embeddings shape: {bio_embeddings_existing.shape}")

    # All update batches must match the existing embedding width.
    embedding_size = bio_embeddings_existing.shape[1]

    # ---------------- BioRxiv updates ----------------
    bio_dfs_list, bio_embeddings_list = _process_updates(
        db_update_bio_path, embed_update_bio_path, embedding_size
    )

    df_bio_updates = pd.concat(bio_dfs_list) if bio_dfs_list else pd.DataFrame()
    # Empty placeholder keeps the correct (0, embedding_size) shape so the
    # unconditional vstack below is always valid.
    bio_embeddings_updates = (
        np.vstack(bio_embeddings_list)
        if bio_embeddings_list
        else np.empty((0, embedding_size))
    )

    df_bio_combined = pd.concat([df_bio_existing, df_bio_updates])
    bio_embeddings_combined = np.vstack([bio_embeddings_existing, bio_embeddings_updates])

    # keep="last" prefers the freshly downloaded version of a paper.
    df_bio_combined, bio_embeddings_combined = _dedupe_by_title(
        df_bio_combined, bio_embeddings_combined
    )

    assert df_bio_combined.shape[0] == bio_embeddings_combined.shape[0], "Shape mismatch between BioRxiv DataFrame and embeddings"

    print(f"Filtered BioRxiv DataFrame shape: {df_bio_combined.shape}")
    print(f"Filtered BioRxiv embeddings shape: {bio_embeddings_combined.shape}")

    combined_biorxiv_data_path = aggregated_data_path / "combined_biorxiv_data.parquet"
    df_bio_combined.to_parquet(combined_biorxiv_data_path)
    print(f"Saved combined BioRxiv DataFrame to {combined_biorxiv_data_path}")

    combined_biorxiv_embeddings_path = "biorxiv_ubin_embaddings.npy"
    np.save(combined_biorxiv_embeddings_path, bio_embeddings_combined)
    print(f"Saved combined BioRxiv embeddings to {combined_biorxiv_embeddings_path}")

    # ---------------- MedRxiv updates ----------------
    med_dfs_list, med_embeddings_list = _process_updates(
        db_update_med_path, embed_update_med_path, embedding_size
    )

    # Bug fix: the original built an empty, column-less DataFrame here and
    # then crashed with KeyError('title') in duplicated(); skip cleanly when
    # there is nothing to combine.
    if not med_dfs_list:
        print("No MedRxiv update files found; skipping MedRxiv outputs.")
        return

    df_med_combined = pd.concat(med_dfs_list)
    med_embeddings_combined = np.vstack(med_embeddings_list)

    # Used only to name the output files; computed before dedup as before.
    last_date_in_med_database = df_med_combined['date'].max()

    df_med_combined, med_embeddings_combined = _dedupe_by_title(
        df_med_combined, med_embeddings_combined
    )

    assert df_med_combined.shape[0] == med_embeddings_combined.shape[0], "Shape mismatch between MedRxiv DataFrame and embeddings"

    print(f"Filtered MedRxiv DataFrame shape: {df_med_combined.shape}")
    print(f"Filtered MedRxiv embeddings shape: {med_embeddings_combined.shape}")

    combined_medrxiv_data_path = db_update_med_path / f"database_{last_date_in_med_database}.parquet"
    df_med_combined.to_parquet(combined_medrxiv_data_path)
    print(f"Saved combined MedRxiv DataFrame to {combined_medrxiv_data_path}")

    combined_medrxiv_embeddings_path = embed_update_med_path / f"database_{last_date_in_med_database}.npy"
    np.save(combined_medrxiv_embeddings_path, med_embeddings_combined)
    print(f"Saved combined MedRxiv embeddings to {combined_medrxiv_embeddings_path}")
|
|
|
|
# Script entry point: run the full combine pipeline only when executed
# directly, not when imported as a module.
if __name__ == "__main__":
    combine_databases()
|
|
|