Spaces:

legend1234
/

b3clf_hf

Sleeping

App Files Files Community

b3clf_hf / utils.py

legend1234

Add utils.py

3d6fbe8 12 months ago

raw

history blame contribute delete

No virus

4.85 kB

	import itertools as it
	import os

	import joblib
	import numpy as np
	import pandas as pd
	import pkg_resources
	import streamlit as st
	from b3clf.descriptor_padel import compute_descriptors
	from b3clf.geometry_opt import geometry_optimize
	from b3clf.utils import get_descriptors, scale_descriptors, select_descriptors


	@st.cache_resource()
	def load_all_models():
	    """Load every pre-trained classifier bundled with the ``b3clf`` package.

	    One joblib file is read for each (classifier, sampling strategy) pair
	    listed below, using ``pkg_resources`` to locate it inside the package.

	    Returns
	    -------
	    dict
	        Maps ``"<classifier>_<sampling>"`` to the object returned by
	        ``joblib.load`` for that combination.
	    """
	    classifiers = ("dtree", "knn", "logreg", "xgb")
	    sampling_strategies = (
	        "borderline_SMOTE",
	        "classic_ADASYN",
	        "classic_RandUndersampling",
	        "classic_SMOTE",
	        "kmeans_SMOTE",
	        "common",
	    )

	    loaded = {}
	    for model_kind, resampler in it.product(classifiers, sampling_strategies):
	        # Resource path is relative to the installed b3clf package root.
	        resource = f"pre_trained/b3clf_{model_kind}_{resampler}.joblib"
	        with pkg_resources.resource_stream("b3clf", resource) as stream:
	            loaded[model_kind + "_" + resampler] = joblib.load(stream)

	    return loaded


	@st.cache_resource
	def predict_permeability(
	    clf_str, sampling_str, _models_dict, mol_features, info_df, threshold="none"
	):
	    """Compute permeability predictions for the given feature data.

	    Parameters
	    ----------
	    clf_str : str
	        Classifier name; joined with ``sampling_str`` as
	        ``"<clf_str>_<sampling_str>"`` to pick the model from ``_models_dict``.
	    sampling_str : str
	        Sampling strategy name (second half of the model key).
	    _models_dict : dict
	        Mapping from model key to a fitted estimator exposing
	        ``predict_proba``.
	    mol_features : pandas.DataFrame or array-like
	        Feature matrix passed to ``predict_proba``; must expose ``.shape``.
	    info_df : pandas.DataFrame
	        Per-molecule information. It is modified IN PLACE: the columns
	        ``B3clf_predicted_probability`` and ``B3clf_predicted_label`` are
	        written and the index is reset into a column.
	    threshold : str, optional
	        Column label selected from the thresholds spreadsheet.

	    Returns
	    -------
	    pandas.DataFrame
	        The same ``info_df`` object, after the in-place updates above.

	    Raises
	    ------
	    ValueError
	        If ``mol_features`` is a DataFrame whose index differs from that of
	        ``info_df``.
	    """
	    pred_model = _models_dict[clf_str + "_" + sampling_str]

	    # Load the decision thresholds shipped with the b3clf package.
	    package_name = "b3clf"
	    with pkg_resources.resource_stream(package_name, "data/B3clf_thresholds.xlsx") as f:
	        df_thres = pd.read_excel(f, index_col=0, engine="openpyxl")

	    # Every label starts at 0; rows at or above the threshold are set to 1.
	    label_pool = np.zeros(mol_features.shape[0], dtype=int)

	    # isinstance (rather than an exact type comparison) so that DataFrame
	    # subclasses are validated as well.
	    if (
	        isinstance(mol_features, pd.DataFrame)
	        and mol_features.index.tolist() != info_df.index.tolist()
	    ):
	        raise ValueError("Features_df and Info_df do not have the same index.")

	    # Probability of the class in column 1 of predict_proba's output.
	    info_df.loc[:, "B3clf_predicted_probability"] = pred_model.predict_proba(
	        mol_features
	    )[:, 1]

	    # NOTE(review): the threshold row is fixed to "xgb-classic_ADASYN" no
	    # matter which clf_str/sampling_str was requested; a per-model lookup
	    # (clf_str + "-" + sampling_str) was previously commented out here.
	    # Confirm this is intentional before changing it.
	    mask = np.greater_equal(
	        info_df["B3clf_predicted_probability"].to_numpy(),
	        df_thres.loc["xgb-classic_ADASYN", threshold],
	    )
	    label_pool[mask] = 1

	    # Store the labels and move the index into a regular column.
	    info_df["B3clf_predicted_label"] = label_pool
	    info_df.reset_index(inplace=True)

	    return info_df


	@st.cache_resource
	def generate_predictions(
	    input_fname: str = None,
	    # Same runtime value as before, written without invalid escape sequences
	    # (which raise a SyntaxWarning on Python 3.12+). The "\t" is a literal tab.
	    # NOTE(review): if this is used as a regex, the escaped pipe matches a
	    # literal "|" instead of acting as alternation - confirm that is intended.
	    sep: str = "\\s+\\|\t+",
	    clf: str = "xgb",
	    _models_dict: dict = None,
	    keep_sdf: str = "no",
	    sampling: str = "classic_ADASYN",
	    time_per_mol: int = 120,
	    mol_features: pd.DataFrame = None,
	    info_df: pd.DataFrame = None,
	):
	    """Generate permeability predictions for an input file or cached features.

	    When both ``mol_features`` and ``info_df`` are ``None`` the descriptors
	    are computed from ``input_fname``: geometry optimisation writes an
	    intermediate SDF file, descriptors are computed from it, then selected and
	    scaled. Otherwise the supplied frames are used directly.

	    Parameters
	    ----------
	    input_fname : str, optional
	        Path of the input file; required when descriptors must be computed.
	    sep : str, optional
	        Separator forwarded to ``geometry_optimize``.
	    clf : str, optional
	        Classifier name forwarded to ``predict_permeability``.
	    _models_dict : dict, optional
	        Loaded models forwarded to ``predict_permeability``.
	    keep_sdf : str, optional
	        The intermediate SDF file is deleted when this equals ``"no"``.
	    sampling : str, optional
	        Sampling strategy name forwarded to ``predict_permeability``.
	    time_per_mol : int, optional
	        Forwarded to ``compute_descriptors`` as ``time_per_molecule``.
	    mol_features : pandas.DataFrame, optional
	        Previously computed (selected and scaled) descriptors.
	    info_df : pandas.DataFrame, optional
	        Previously computed molecule information.

	    Returns
	    -------
	    tuple
	        ``(mol_features, info_df, result_df)`` where ``result_df`` holds only
	        the display columns (ID, SMILES, predicted probability and label)
	        that are present in the prediction output.
	    """
	    if mol_features is None and info_df is None:
	        mol_tag = os.path.basename(input_fname).split(".")[0]
	        internal_sdf = f"{mol_tag}_optimized_3d.sdf"

	        try:
	            # Geometry optimization
	            # Input: either an SDF file with molecular geometries or a text
	            # file with SMILES strings.
	            geometry_optimize(
	                input_fname=input_fname, output_sdf=internal_sdf, sep=sep
	            )

	            df_features = compute_descriptors(
	                sdf_file=internal_sdf,
	                excel_out=None,
	                output_csv=None,
	                timeout=None,
	                time_per_molecule=time_per_mol,
	            )

	            # Split the computed table into descriptors and molecule info.
	            mol_features, info_df = get_descriptors(df=df_features)

	            # Select descriptors
	            mol_features = select_descriptors(df=mol_features)

	            # Scale descriptors
	            mol_features.iloc[:, :] = scale_descriptors(df=mol_features)
	        finally:
	            # Clean up the intermediate file even when a step above fails, so
	            # a failed run does not leave a stale SDF behind.
	            # NOTE: the file name is derived only from the input's base name,
	            # so runs on identically named inputs share it.
	            if keep_sdf == "no" and os.path.exists(internal_sdf):
	                os.remove(internal_sdf)

	    result_df = predict_permeability(
	        clf_str=clf,
	        sampling_str=sampling,
	        _models_dict=_models_dict,
	        mol_features=mol_features,
	        info_df=info_df,
	        threshold="none",
	    )

	    # Keep only the columns meant for display, in their existing order.
	    display_cols = [
	        "ID",
	        "SMILES",
	        "B3clf_predicted_probability",
	        "B3clf_predicted_label",
	    ]
	    result_df = result_df[[col for col in result_df.columns if col in display_cols]]

	    return mol_features, info_df, result_df