Spaces:

legend1234
/

b3clf_hf

Sleeping

App Files Files Community

legend1234 committed on Oct 9, 2023

Commit

9992ded

•

1 Parent(s): 95b3113

Refactor to use caching

Browse files

Files changed (1) hide show

app.py +128 -54

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 import tempfile
 from io import StringIO
@@ -5,12 +6,12 @@ from io import StringIO
 import joblib
 import numpy as np
 import pandas as pd
 # page set up
 import streamlit as st
 from b3clf.descriptor_padel import compute_descriptors
 from b3clf.geometry_opt import geometry_optimize
-from b3clf.utils import (get_descriptors, predict_permeability,
-                         scale_descriptors, select_descriptors)
 # from PIL import Image
 from streamlit_extras.let_it_rain import rain
 from streamlit_ketcher import st_ketcher
@@ -50,6 +51,78 @@ pandas_display_options = {
 }
 mol_features = None
 info_df = None
 # @st.cache_resource
@@ -258,7 +331,7 @@ with prediction_column:
 # Generate predictions when the user uploads a file
 if submit_job_button:
-    if file:
         temp_dir = tempfile.mkdtemp()
         # Create a temporary file path for the uploaded file
         temp_file_path = os.path.join(temp_dir, file.name)
@@ -266,59 +339,60 @@ if submit_job_button:
         with open(temp_file_path, "wb") as temp_file:
             temp_file.write(file.read())
         # mol_features, results = generate_predictions(temp_file_path)
-        mol_features, info_df, results = generate_predictions(
-            input_fname=temp_file_path,
-            sep="\s+|\t+",
-            clf=classifiers_dict[classifier],
-            sampling=resample_methods_dict[resampler],
-            time_per_mol=120,
-            mol_features=mol_features,
-            info_df=info_df,
-        )
-        # feture table
-        with feature_column:
-            selected_feature_rows = np.min(
-                [mol_features.shape[0], pandas_display_options["line_limit"]]
-            )
-            st.dataframe(mol_features.iloc[:selected_feature_rows, :], hide_index=False)
-            # placeholder_features.dataframe(mol_features, hide_index=False)
-            feature_file_name = file.name.split(".")[0] + "_b3clf_features.csv"
-            features_csv = mol_features.to_csv(index=True)
-            st.download_button(
-                "Download features as CSV",
-                data=features_csv,
-                file_name=feature_file_name,
-            )
-        # prediction table
-        with prediction_column:
-            # st.subheader("Predictions")
-            if results is not None:
-                # Display the predictions in a table
-                selected_result_rows = np.min(
-                    [results.shape[0], pandas_display_options["line_limit"]]
-                )
-                results_df_display = results.iloc[
-                    :selected_result_rows, :
-                ].style.format({"B3clf_predicted_probability": "{:.6f}".format})
-                st.dataframe(results_df_display, hide_index=True)
-                # Add a button to download the predictions as a CSV file
-                predictions_csv = results.to_csv(index=True)
-                results_file_name = file.name.split(".")[0] + "_b3clf_predictions.csv"
-                st.download_button(
-                    "Download predictions as CSV",
-                    data=predictions_csv,
-                    file_name=results_file_name,
-                )
-                # indicate the success of the job
-                # rain(
-                #     emoji="🎈",
-                #     font_size=54,
-                #     falling_speed=5,
-                #     animation_length=10,
-                # )
-                st.balloons()
 # hide footer
 # https://github.com/streamlit/streamlit/issues/892

+import itertools as it
 import os
 import tempfile
 from io import StringIO
 import joblib
 import numpy as np
 import pandas as pd
+import pkg_resources
 # page set up
 import streamlit as st
 from b3clf.descriptor_padel import compute_descriptors
 from b3clf.geometry_opt import geometry_optimize
+from b3clf.utils import get_descriptors, scale_descriptors, select_descriptors
 # from PIL import Image
 from streamlit_extras.let_it_rain import rain
 from streamlit_ketcher import st_ketcher
 }
 mol_features = None
 info_df = None
+results = None
+temp_file_path = None
+@st.cache_data
+def load_all_models():
+    """Get b3clf fitted classifier"""
+    clf_list = ["dtree", "knn", "logreg", "xgb"]
+    sampling_list = [
+        "borderline_SMOTE",
+        "classic_ADASYN",
+        "classic_RandUndersampling",
+        "classic_SMOTE",
+        "kmeans_SMOTE",
+        "common",
+    ]
+    model_dict = {}
+    package_name = "b3clf"
+    for clf_str, sampling_str in it.product(clf_list, sampling_list):
+        # joblib_fpath = os.path.join(
+        #     dirname, "pre_trained", "b3clf_{}_{}.joblib".format(clf_str, sampling_str))
+        # pred_model = joblib.load(joblib_fpath)
+        joblib_path_str = f"pre_trained/b3clf_{clf_str}_{sampling_str}.joblib"
+        with pkg_resources.resource_stream(package_name, joblib_path_str) as f:
+            pred_model = joblib.load(f)
+        model_dict[clf_str + "_" + sampling_str] = pred_model
+    return model_dict
+@st.cache_resource
+def predict_permeability(clf_str, sampling_str, mol_features, info_df, threshold="none"):
+    """Compute permeability prediction for given feature data."""
+    # load the model
+    pred_model = load_all_models()[clf_str + "_" + sampling_str]
+    # load the threshold data
+    package_name = "b3clf"
+    with pkg_resources.resource_stream(
+        package_name, "data/B3clf_thresholds.xlsx"
+    ) as f:
+        df_thres = pd.read_excel(f, index_col=0, engine="openpyxl")
+    # default threshold is 0.5
+    label_pool = np.zeros(mol_features.shape[0], dtype=int)
+    if type(mol_features) == pd.DataFrame:
+        if mol_features.index.tolist() != info_df.index.tolist():
+            raise ValueError(
+                "Features_df and Info_df do not have the same index."
+            )
+    # get predicted probabilities
+    info_df.loc[:, "B3clf_predicted_probability"] = pred_model.predict_proba(mol_features)[
+        :, 1
+    ]
+    # get predicted label from probability using the threshold
+    mask = np.greater_equal(
+        info_df["B3clf_predicted_probability"].to_numpy(),
+        # df_thres.loc[clf_str + "-" + sampling_str, threshold])
+        df_thres.loc["xgb-classic_ADASYN", threshold],
+    )
+    label_pool[mask] = 1
+    # save the predicted labels
+    info_df["B3clf_predicted_label"] = label_pool
+    info_df.reset_index(inplace=True)
+    return info_df
 # @st.cache_resource
 # Generate predictions when the user uploads a file
 if submit_job_button:
+    if file and mol_features is None and info_df is None:
         temp_dir = tempfile.mkdtemp()
         # Create a temporary file path for the uploaded file
         temp_file_path = os.path.join(temp_dir, file.name)
         with open(temp_file_path, "wb") as temp_file:
             temp_file.write(file.read())
         # mol_features, results = generate_predictions(temp_file_path)
+    mol_features, info_df, results = generate_predictions(
+        input_fname=temp_file_path,
+        sep="\s+|\t+",
+        clf=classifiers_dict[classifier],
+        sampling=resample_methods_dict[resampler],
+        time_per_mol=120,
+        mol_features=mol_features,
+        info_df=info_df,
+    )
+    st.balloons()
+# feature table
+# Rendered on every rerun, but only once features have been computed.
+with feature_column:
+    if mol_features is not None:
+        # Show at most pandas_display_options["line_limit"] rows on screen.
+        selected_feature_rows = np.min(
+            [mol_features.shape[0], pandas_display_options["line_limit"]]
+        )
+        st.dataframe(mol_features.iloc[:selected_feature_rows, :], hide_index=False)
+        # placeholder_features.dataframe(mol_features, hide_index=False)
+        # Download name: uploaded file name up to its first "." plus a suffix.
+        # NOTE(review): reads file.name, so this assumes an uploaded file is
+        # present whenever mol_features is set - confirm.
+        feature_file_name = file.name.split(".")[0] + "_b3clf_features.csv"
+        # The download contains the full table, not just the displayed rows.
+        features_csv = mol_features.to_csv(index=True)
+        st.download_button(
+            "Download features as CSV",
+            data=features_csv,
+            file_name=feature_file_name,
+        )
+# prediction table
+# Rendered on every rerun, but only once predictions are available.
+with prediction_column:
+    # st.subheader("Predictions")
+    if results is not None:
+        # Display the predictions in a table
+        # Show at most pandas_display_options["line_limit"] rows on screen.
+        selected_result_rows = np.min(
+            [results.shape[0], pandas_display_options["line_limit"]]
+        )
+        # Format the probability column with six decimal places for display.
+        results_df_display = results.iloc[
+            :selected_result_rows, :
+        ].style.format({"B3clf_predicted_probability": "{:.6f}".format})
+        st.dataframe(results_df_display, hide_index=True)
+        # Add a button to download the predictions as a CSV file
+        # The download contains all rows (with the index), unformatted.
+        predictions_csv = results.to_csv(index=True)
+        # Download name: uploaded file name up to its first "." plus a suffix.
+        # NOTE(review): reads file.name, so this assumes an uploaded file is
+        # present whenever results is set - confirm.
+        results_file_name = file.name.split(".")[0] + "_b3clf_predictions.csv"
+        st.download_button(
+            "Download predictions as CSV",
+            data=predictions_csv,
+            file_name=results_file_name,
+        )
+        # indicate the success of the job
+        # rain(
+        #     emoji="🎈",
+        #     font_size=54,
+        #     falling_speed=5,
+        #     animation_length=10,
+        # )
 # hide footer
 # https://github.com/streamlit/streamlit/issues/892