legend1234
/

B3clf

Joblib

Model card Files Files and versions Community

legend1234 committed on Oct 9, 2023

Commit

0e95800

•

1 Parent(s): d05f89f

Synced repo using 'sync_with_huggingface' Github Action

Browse files

Files changed (2) hide show

b3clf/b3clf.py +43 -28
b3clf/utils.py +14 -9

b3clf/b3clf.py CHANGED Viewed

@@ -31,26 +31,31 @@ import os
 import numpy as np
 from .descriptor_padel import compute_descriptors
 from .geometry_opt import geometry_optimize
-from .utils import (get_descriptors, predict_permeability,
-                    scale_descriptors, select_descriptors)
 __all__ = [
     "b3clf",
 ]
-def b3clf(mol_in,
-          sep="\s+|\t+",
-          clf="xgb",
-          sampling="classic_ADASYN",
-          output="B3clf_output.xlsx",
-          verbose=1,
-          random_seed=42,
-          time_per_mol=-1,
-          keep_features="no",
-          keep_sdf="no",
-          threshold="none",
-          ):
     """Use B3clf for BBB classifications with resampling strategies.
     Parameters
@@ -110,12 +115,13 @@ def b3clf(mol_in,
     geometry_optimize(input_fname=mol_in, output_sdf=internal_sdf, sep=sep)
-    _ = compute_descriptors(sdf_file=internal_sdf,
-                            excel_out=features_out,
-                            output_csv=None,
-                            timeout=None,
-                            time_per_molecule=time_per_mol,
-                            )
     # Get computed descriptors
     X_features, info_df = get_descriptors(df=features_out)
@@ -131,16 +137,25 @@ def b3clf(mol_in,
     # clf = get_clf(clf_str=clf, sampling_str=sampling)
     # Get classifier
-    result_df = predict_permeability(clf_str=clf,
-                                     sampling_str=sampling,
-                                     features_df=X_features,
-                                     info_df=info_df,
-                                     threshold=threshold)
     # Get classifier
-    display_cols = ["ID", "SMILES", "B3clf_predicted_probability", "B3clf_predicted_label"]
-    result_df = result_df[[col for col in result_df.columns.to_list() if col in display_cols]]
     if verbose != 0:
         print(result_df)

 import numpy as np
 from .descriptor_padel import compute_descriptors
 from .geometry_opt import geometry_optimize
+from .utils import (
+    get_descriptors,
+    predict_permeability,
+    scale_descriptors,
+    select_descriptors,
+)
 __all__ = [
     "b3clf",
 ]
+def b3clf(
+    mol_in,
+    sep="\s+|\t+",
+    clf="xgb",
+    sampling="classic_ADASYN",
+    output="B3clf_output.xlsx",
+    verbose=1,
+    random_seed=42,
+    time_per_mol=-1,
+    keep_features="no",
+    keep_sdf="no",
+    threshold="none",
+):
     """Use B3clf for BBB classifications with resampling strategies.
     Parameters
     geometry_optimize(input_fname=mol_in, output_sdf=internal_sdf, sep=sep)
+    _ = compute_descriptors(
+        sdf_file=internal_sdf,
+        excel_out=features_out,
+        output_csv=None,
+        timeout=None,
+        time_per_molecule=time_per_mol,
+    )
     # Get computed descriptors
     X_features, info_df = get_descriptors(df=features_out)
     # clf = get_clf(clf_str=clf, sampling_str=sampling)
     # Get classifier
+    result_df = predict_permeability(
+        clf_str=clf,
+        sampling_str=sampling,
+        mol_features=X_features,
+        info_df=info_df,
+        threshold=threshold,
+    )
     # Get classifier
+    display_cols = [
+        "ID",
+        "SMILES",
+        "B3clf_predicted_probability",
+        "B3clf_predicted_label",
+    ]
+    result_df = result_df[
+        [col for col in result_df.columns.to_list() if col in display_cols]
+    ]
     if verbose != 0:
         print(result_df)

b3clf/utils.py CHANGED Viewed

@@ -89,9 +89,9 @@ def scale_descriptors(df):
     dirname = os.path.dirname(__file__)
     filename = os.path.join(dirname, "pre_trained", "b3clf_scaler.joblib")
     b3db_scaler = load(filename)
-    df.iloc[:, :] = b3db_scaler.transform(df)
-    return df
 def get_clf(clf_str, sampling_str):
@@ -125,7 +125,9 @@ def get_clf(clf_str, sampling_str):
     return clf
-def predict_permeability(clf_str, sampling_str, features_df, info_df, threshold="none"):
     """Compute and store BBB predicted label and predicted probability to results dataframe."""
     # load the threshold data
@@ -133,18 +135,21 @@ def predict_permeability(clf_str, sampling_str, features_df, info_df, threshold=
     fpath_thres = os.path.join(dirname, "data", "B3clf_thresholds.xlsx")
     df_thres = pd.read_excel(fpath_thres, index_col=0, engine="openpyxl")
     # default threshold is 0.5
-    label_pool = np.zeros(features_df.shape[0], dtype=int)
     # get the classifier
     clf = get_clf(clf_str=clf_str, sampling_str=sampling_str)
-    if features_df.index.tolist() != info_df.index.tolist():
-        raise ValueError(
-            "Features_df and Info_df do not have the same index. Internal processing error"
-        )
     # get predicted probabilities
-    info_df.loc[:, "B3clf_predicted_probability"] = clf.predict_proba(features_df)[:, 1]
     # get predicted label from probability using the threshold
     mask = np.greater_equal(
         info_df["B3clf_predicted_probability"].to_numpy(),

     dirname = os.path.dirname(__file__)
     filename = os.path.join(dirname, "pre_trained", "b3clf_scaler.joblib")
     b3db_scaler = load(filename)
+    df_new = b3db_scaler.transform(df)
+    return df_new
 def get_clf(clf_str, sampling_str):
     return clf
+def predict_permeability(
+    clf_str, sampling_str, mol_features, info_df, threshold="none"
+):
     """Compute and store BBB predicted label and predicted probability to results dataframe."""
     # load the threshold data
     fpath_thres = os.path.join(dirname, "data", "B3clf_thresholds.xlsx")
     df_thres = pd.read_excel(fpath_thres, index_col=0, engine="openpyxl")
     # default threshold is 0.5
+    label_pool = np.zeros(mol_features.shape[0], dtype=int)
     # get the classifier
     clf = get_clf(clf_str=clf_str, sampling_str=sampling_str)
+    if type(mol_features) == pd.DataFrame:
+        if mol_features.index.tolist() != info_df.index.tolist():
+            raise ValueError(
+                "Features_df and Info_df do not have the same index. Internal processing error"
+            )
     # get predicted probabilities
+    info_df.loc[:, "B3clf_predicted_probability"] = clf.predict_proba(mol_features)[
+        :, 1
+    ]
     # get predicted label from probability using the threshold
     mask = np.greater_equal(
         info_df["B3clf_predicted_probability"].to_numpy(),