import pandas as pd
import numpy as np
import xgboost as xgb
import streamlit as st
import requests
from bs4 import BeautifulSoup
from gensim.models import FastText
import joblib

# Load the trained FastText embedding model.
try:
    fasttext_model = FastText.load('fasttext_model.bin')
except FileNotFoundError:
    st.error("The FastText model file was not found. Please ensure "
             "'fasttext_model.bin' and its associated files are in the "
             "correct directory.")
    st.stop()

# Load the trained XGBoost model for the combined features.
try:
    model = joblib.load('model.pkl')
except FileNotFoundError:
    st.error("The XGBoost model file was not found. Please ensure "
             "'model.pkl' is in the correct directory.")
    st.stop()


def tokenize(text):
    """Split a string on whitespace; return an empty list for non-strings."""
    if isinstance(text, str):
        return text.split()
    return []


def embed_text(text_series, fasttext_model):
    """Average the FastText vectors of each text's in-vocabulary tokens.

    Texts with no known tokens fall back to a zero vector, so every row
    yields a fixed-size embedding.
    """
    embeddings = []
    for text in text_series:
        tokens = tokenize(text)
        vectors = [fasttext_model.wv[token] for token in tokens
                   if token in fasttext_model.wv]
        if vectors:
            embeddings.append(np.mean(vectors, axis=0))
        else:
            embeddings.append(np.zeros(fasttext_model.vector_size))
    return np.array(embeddings)


def preprocess_input(query, title, description, url, fasttext_model):
    """Embed each field separately, then concatenate into one feature row."""
    query = str(query) if pd.notna(query) else ''
    title = str(title) if pd.notna(title) else ''
    description = str(description) if pd.notna(description) else ''
    url = str(url) if pd.notna(url) else ''

    query_ft = embed_text(pd.Series([query]), fasttext_model)
    title_ft = embed_text(pd.Series([title]), fasttext_model)
    description_ft = embed_text(pd.Series([description]), fasttext_model)
    url_ft = embed_text(pd.Series([url]), fasttext_model)

    combined_features = np.hstack([query_ft, title_ft, description_ft, url_ft])
    return xgb.DMatrix(combined_features)


def extract_title_description(url):
    """Fetch a page and pull its <title> text and meta description."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/104.0.5112.81 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.title.string if soup.title else 'No title found'
        description_tag = soup.find('meta', attrs={'name': 'description'})
        description = (description_tag.get('content', 'No description found')
                       if description_tag else 'No description found')
        return title, description
    except Exception:
        return 'Error extracting title', 'Error extracting description'


def predict(query, title, description, url, fasttext_model):
    """Return (binary label, click probability) for one query/page pair."""
    dmatrix = preprocess_input(query, title, description, url, fasttext_model)
    probability = model.predict(dmatrix, validate_features=False)[0]
    binary_prediction = int(probability >= 0.5)
    return binary_prediction, probability
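# Example of exercising predict() directly, e.g. from a notebook or test;
# the query and page values below are hypothetical, for illustration only:
#
#   label, prob = predict('best trail running shoes',
#                         'Top Trail Running Shoes Reviewed',
#                         'Hands-on reviews of popular trail shoes.',
#                         'https://example.com/trail-shoes',
#                         fasttext_model)
#   # label is 0/1 (thresholded at 0.5), prob is the raw model probability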
# Streamlit interface
st.title('CTR Prediction Inference')

tab1, tab2, tab3 = st.tabs(["Single Entry", "Batch Entry", "A/B Test"])

with tab1:
    st.header('Single Entry Inference')
    query = st.text_input('Query')
    url = st.text_input('URL')
    if st.button('Predict'):
        if query and url:
            title, description = extract_title_description(url)
            st.write(f'Extracted Title: {title}')
            st.write(f'Extracted Description: {description}')
            binary_result, confidence = predict(query, title, description,
                                                url, fasttext_model)
            st.write(f'Predicted +/-: {binary_result}')
            st.write(f'Conf.: {confidence:.2%}')
            st.progress(int(confidence * 100))
        else:
            st.write('Please enter both a query and a URL.')

with tab2:
    st.header('Batch Entry Inference')
    uploaded_file = st.file_uploader("Upload CSV", type="csv")
    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file)
        required_columns = ['Query', 'Title', 'Description', 'URL']
        if set(required_columns).issubset(df.columns):
            predictions = []
            confidences = []
            for index, row in df.iterrows():
                binary_result, confidence = predict(row['Query'], row['Title'],
                                                    row['Description'],
                                                    row['URL'], fasttext_model)
                predictions.append(binary_result)
                confidences.append(confidence)
            df['+/-'] = predictions
            df['Conf.'] = [f"{conf:.2%}" for conf in confidences]
            # Move the prediction columns to the front of the frame.
            cols = ['+/-', 'Conf.'] + [col for col in df.columns
                                       if col not in ['+/-', 'Conf.']]
            df = df[cols]
            st.write(df)
            st.download_button("Download Predictions", df.to_csv(index=False),
                               "predictions.csv")
        else:
            st.write('CSV must contain Query, Title, Description, and URL columns.')

with tab3:
    st.header('A/B Test Inference')
    query = st.text_input('Query for A/B Test')
    url = st.text_input('URL for A/B Test')

    if 'step' not in st.session_state:
        st.session_state.step = 0

    # Step 0: scrape the live page to use as variant A.
    if st.button('Scrape A/B'):
        title_A, description_A = extract_title_description(url)
        st.session_state['title_A'] = title_A
        st.session_state['description_A'] = description_A
        st.session_state.step = 1

    # Step 1: let the user edit a variant B, then score both.
    if st.session_state.step == 1:
        title_B = st.text_input('Title B',
                                value=st.session_state.get('title_A', ''))
        description_B = st.text_area('Description B',
                                     value=st.session_state.get('description_A', ''))
        if st.button('Predict A/B'):
            if query and url:
                binary_result_A, confidence_A = predict(
                    query, st.session_state['title_A'],
                    st.session_state['description_A'], url, fasttext_model)
                binary_result_B, confidence_B = predict(
                    query, title_B, description_B, url, fasttext_model)
                st.write(f'Results for A: Predicted +/-: {binary_result_A}, '
                         f'Conf.: {confidence_A:.2%}')
                st.write(f'Results for B: Predicted +/-: {binary_result_B}, '
                         f'Conf.: {confidence_B:.2%}')
                if binary_result_A == 1 and binary_result_B == 0:
                    st.write("B is worse than A")
                elif binary_result_A == 0 and binary_result_B == 1:
                    st.write("B is better than A")
                else:
                    st.write("B is the same as A")
            else:
                st.write('Please enter both a query and a URL.')
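# For reference, a minimal sketch of how 'fasttext_model.bin' and 'model.pkl'
# could have been produced. The corpus, feature matrix, and hyperparameters
# below are assumptions for illustration, not the actual training pipeline;
# the function is never called, so the app itself does not retrain anything.
def _train_and_save_models_sketch():
    # Hypothetical tokenized corpus of queries/titles/descriptions.
    corpus = [['example', 'query', 'tokens'], ['example', 'title', 'tokens']]
    ft = FastText(sentences=corpus, vector_size=100, window=5,
                  min_count=1, epochs=10)
    ft.save('fasttext_model.bin')

    # X_train: rows of hstacked query/title/description/URL embeddings;
    # y_train: 0/1 click labels. Both are random placeholders here.
    X_train = np.random.rand(10, 4 * ft.vector_size)
    y_train = np.random.randint(0, 2, size=10)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    booster = xgb.train({'objective': 'binary:logistic'}, dtrain,
                        num_boost_round=100)
    joblib.dump(booster, 'model.pkl')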