Spaces:
Sleeping
Sleeping
import os | |
import tempfile | |
from io import StringIO | |
import joblib | |
import numpy as np | |
import pandas as pd | |
# page set up | |
import streamlit as st | |
from b3clf.descriptor_padel import compute_descriptors | |
from b3clf.geometry_opt import geometry_optimize | |
from b3clf.utils import ( | |
get_descriptors, | |
predict_permeability, | |
scale_descriptors, | |
select_descriptors, | |
) | |
from streamlit_ketcher import st_ketcher | |
# Configure the browser tab title and use the full page width. This call sits
# ahead of every other Streamlit element created in this script.
st.set_page_config(
    layout="wide",
    page_title="BBB Permeability Prediction with Imbalanced Learning",
)
# Load the pre-trained model and feature scaler.
@st.cache_resource
def _load_pretrained(path: str):
    """Load a joblib artifact once and reuse it on later script runs.

    The loads used to sit directly at module level, so both files were read
    from disk again each time the script executed. ``st.cache_resource``
    keeps one shared instance per ``path``; the returned object is shared,
    so callers should treat it as read-only.

    Parameters
    ----------
    path : str
        Location of the ``.joblib`` file to load.

    Returns
    -------
    object
        Whatever object was serialized into ``path``.
    """
    # NOTE: joblib.load unpickles the file, so only point it at trusted
    # artifacts shipped with the app.
    return joblib.load(path)


model = _load_pretrained("pre_trained/b3clf_knn_kmeans_SMOTE.joblib")
scaler = _load_pretrained("pre_trained/b3clf_scaler.joblib")
# Define a function to generate predictions | |
# def generate_predictions(file): | |
# # Read the input file | |
# if file.type == "text/csv": | |
# df = pd.read_csv(file) | |
# elif file.type == "chemical/x-mdl-sdfile": | |
# df = pd.read_sdf(file) | |
# else: | |
# st.error("Invalid file type. Please upload a CSV or SDF file.") | |
# return | |
# # Compute the molecular geometry, calculate the features, and perform the predictions | |
# X = df.drop("ID", axis=1) | |
# X_scaled = scaler.transform(X) | |
# y_pred_proba = model.predict_proba(X_scaled)[:, 1] | |
# y_pred = model.predict(X_scaled) | |
# # Create a DataFrame with the predictions | |
# results = pd.DataFrame({"ID": df["ID"], "B3clf_predicted_probability": y_pred_proba, "B3clf_predicted_label": y_pred}) | |
# return results | |
# Both flags hold the string "no"; nothing in the visible part of this file
# reads them.
keep_features = keep_sdf = "no"

# Display name shown in the classifier selectbox -> classifier code handed to
# `generate_predictions` as `clf`. The keys must match the selectbox options
# exactly, including their spelling.
classifiers_dict = {
    "decision trees": "dtree",
    "kNN": "knn",
    "logsistical regression": "logreg",
    "XGBoost": "xgb",
}

# Display name shown in the resampling selectbox -> resampling code handed to
# `generate_predictions` as `sampling`.
resample_methods_dict = {
    "random undersampling": "classic_RandUndersampling",
    "SMOTE": "classic_SMOTE",
    "Borderline SMOTE": "borderline_SMOTE",
    "k-means SMOTE": "kmeans_SMOTE",
    "ADASYN": "classic_ADASYN",
    "no resampling": "common",
}
def generate_predictions(
    input_fname: str,
    sep: str = r"\s+|\t+",
    clf: str = "xgb",
    sampling: str = "classic_ADASYN",
    time_per_mol: int = 120,
):
    """Run the B3clf pipeline on one input file.

    The steps are: geometry optimization into an intermediate SDF file,
    descriptor computation from that SDF, descriptor extraction / selection /
    scaling, and finally permeability prediction.

    Parameters
    ----------
    input_fname : str
        Path of the molecule file: either an SDF file with molecular
        geometries or a text file with SMILES strings.
    sep : str, optional
        Column separator pattern forwarded to ``geometry_optimize``. Written
        as a raw string so ``\\s`` is not an invalid string escape; the
        pattern matches the same text as the previous non-raw default.
    clf : str, optional
        Classifier code forwarded to ``predict_permeability`` as ``clf_str``.
    sampling : str, optional
        Resampling code forwarded to ``predict_permeability`` as
        ``sampling_str``.
    time_per_mol : int, optional
        Forwarded to ``compute_descriptors`` as ``time_per_molecule``.

    Returns
    -------
    tuple
        ``(X_features, result_df)``: the scaled descriptor table returned by
        ``scale_descriptors`` and the prediction table restricted to the
        ``ID``, ``SMILES``, ``B3clf_predicted_probability`` and
        ``B3clf_predicted_label`` columns that are present.
    """
    mol_tag = os.path.basename(input_fname).split(".")[0]

    # The intermediate SDF used to be written to the current working directory
    # under a name derived only from the input file name, so two runs with
    # same-named inputs could overwrite each other, and the file was left
    # behind whenever a later step raised. A private temporary directory
    # avoids both problems and is removed on success and on error alike.
    with tempfile.TemporaryDirectory() as work_dir:
        internal_sdf = os.path.join(work_dir, f"{mol_tag}_optimized_3d.sdf")

        # Geometry optimization: writes the optimized structures to internal_sdf
        geometry_optimize(input_fname=input_fname, output_sdf=internal_sdf, sep=sep)

        df_features = compute_descriptors(
            sdf_file=internal_sdf,
            excel_out=None,
            output_csv=None,
            timeout=None,
            time_per_molecule=time_per_mol,
        )

        # Get computed descriptors
        X_features, info_df = get_descriptors(df=df_features)

        # Select descriptors
        X_features = select_descriptors(df=X_features)

        # Scale descriptors
        X_features = scale_descriptors(df=X_features)

        # Predict with the requested classifier / resampling combination
        result_df = predict_permeability(
            clf_str=clf,
            sampling_str=sampling,
            features_df=X_features,
            info_df=info_df,
            threshold="none",
        )

    # Keep only the columns meant for display, in the order they already have
    display_cols = [
        "ID",
        "SMILES",
        "B3clf_predicted_probability",
        "B3clf_predicted_label",
    ]
    result_df = result_df[[col for col in result_df.columns if col in display_cols]]

    return X_features, result_df
# Create the Streamlit app
st.title(":blue[BBB Permeability Prediction with Imbalanced Learning]")
info_column, upload_column = st.columns(2)

# Second top-level column: sample downloads, model options and the uploader
with upload_column:
    st.subheader("Molecule Input")
    with st.container():
        # First row: SDF sample download next to the classifier choice
        sample_sdf_column, classifier_col = st.columns(2)
        with sample_sdf_column:
            with open("sample_input.sdf", "r") as file_sdf:
                st.download_button(
                    label="Download SDF sample file",
                    data=file_sdf,
                    file_name="sample_input.sdf",
                )
        with classifier_col:
            classifier = st.selectbox(
                label="Classification algorithm:",
                options=("XGBoost", "kNN", "decision trees", "logsistical regression"),
                # The option values double as lookup keys of `classifiers_dict`,
                # so the misspelled value is kept as-is and only the label the
                # user sees is corrected.
                format_func=lambda option: (
                    "logistic regression"
                    if option == "logsistical regression"
                    else option
                ),
            )

        # Second row: SMILES sample download next to the resampling choice
        sample_smiles_column, resampler_col = st.columns(2)
        with sample_smiles_column:
            with open("sample_input_smiles.csv", "r") as file_smi:
                st.download_button(
                    label="Download SMILES sample file",
                    data=file_smi,
                    file_name="sample_input_smiles.csv",
                )
        with resampler_col:
            resampler = st.selectbox(
                label="Resampling method:",
                options=(
                    "ADASYN",
                    "random undersampling",
                    "Borderline SMOTE",
                    "k-means SMOTE",
                    "SMOTE",
                    "no resampling",
                ),
            )

    # horizontal line
    st.divider()

    # `file` stays falsy until the user uploads something
    file = st.file_uploader(
        label="Upload a CSV, SDF or TXT file",
        type=["csv", "sdf", "txt"],
        help="Input molecule file and only text files are supported.",
    )

# First top-level column: short description of the package
with info_column:
    st.subheader("About `B3clf`")
    # Built from adjacent string literals so that re-indenting this call can
    # never add leading whitespace to the Markdown text.
    st.markdown(
        "\n`B3clf` is a Python package for predicting the blood-brain barrier "
        "(BBB) permeability of small molecules using imbalanced learning. "
        "Source code is available at https://github.com/theochem/B3clf."
    )

# Result area: headers are drawn right away, tables are added after an upload
feature_column, prediction_column = st.columns(2)
with feature_column:
    st.subheader("Features")
    placeholder_features = st.empty()
with prediction_column:
    st.subheader("Predictions")
# Generate predictions when the user uploads a file
if file:
    # The name comes from the client; drop any directory components before it
    # is used to build a path on disk or a download file name.
    upload_name = os.path.basename(file.name)
    download_stem = upload_name.split(".")[0]

    # The previous tempfile.mkdtemp() directory was never deleted, so every
    # processed upload left a directory behind. TemporaryDirectory removes it
    # when the block exits, including when the pipeline raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file_path = os.path.join(temp_dir, upload_name)

        # getvalue() returns the whole upload regardless of the buffer's
        # current read position, unlike read().
        with open(temp_file_path, "wb") as temp_file:
            temp_file.write(file.getvalue())

        X_features, results = generate_predictions(
            input_fname=temp_file_path,
            # raw string: "\s" is an invalid escape in a normal string literal
            sep=r"\s+|\t+",
            clf=classifiers_dict[classifier],
            sampling=resample_methods_dict[resampler],
            time_per_mol=120,
        )

    # feature table
    with feature_column:
        st.dataframe(X_features)
        st.download_button(
            "Download features as CSV",
            data=X_features.to_csv(index=True),
            file_name=download_stem + "_b3clf_features.csv",
        )

    # prediction table
    with prediction_column:
        if results is not None:
            # Display the predictions in a table
            st.dataframe(results, hide_index=True)
            # Add a button to download the predictions as a CSV file
            st.download_button(
                "Download predictions as CSV",
                data=results.to_csv(index=True),
                file_name=download_stem + "_b3clf_predictions.csv",
            )
# Inject a small stylesheet that hides Streamlit's main menu and footer.
# Background: https://github.com/streamlit/streamlit/issues/892
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
# unsafe_allow_html is needed so the <style> tag is emitted as HTML
st.markdown(body=hide_streamlit_style, unsafe_allow_html=True)