import os
import zipfile
import streamlit as st
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import plotly.express as px

# Fetch the NLTK data this module relies on: the 'punkt' tokenizer models
# (used by nltk.word_tokenize) and the 'stopwords' corpus. This runs every
# time the module is imported.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' rather than
# 'punkt' for word_tokenize -- confirm against the installed version.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)

def preprocess_text(text):
    """Return *text* lowercased, tokenized, and stripped of English stopwords.

    The surviving tokens are joined back together with single spaces so the
    result can be fed straight into a bag-of-words vectorizer.
    """
    words = nltk.word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(word for word in words if word not in english_stopwords)

def get_context_files(prompt, md_files, top_n=10):
    """Build a context prompt from the files that best match *prompt*.

    Each file is scored by the dot product of its word-count vector with the
    prompt's word-count vector (after stopword removal), so every shared word
    contributes ``prompt_count * file_count`` to the score. The ``top_n``
    highest-scoring files are appended to the prompt and a bar chart of their
    scores is rendered with Streamlit.

    Args:
        prompt: The user's prompt text.
        md_files: Iterable of paths to the candidate text files.
        top_n: Maximum number of files to include (default 10).

    Returns:
        The original prompt followed by the raw content of each selected
        file, separated by blank lines. If ``md_files`` is empty the prompt
        is returned unchanged and no chart is drawn.
    """
    md_files = list(md_files)
    if not md_files:
        # Nothing to rank; CountVectorizer cannot be fitted on an empty corpus.
        return prompt

    # Read each file exactly once and keep the raw text aligned with md_files,
    # so the selected files do not have to be reopened later. UTF-8 is forced
    # because the platform default encoding varies; undecodable bytes are
    # replaced rather than aborting the whole evaluation.
    contents = []
    for path in md_files:
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            contents.append(f.read())

    processed_prompt = preprocess_text(prompt)
    processed_docs = [preprocess_text(text) for text in contents]

    vectorizer = CountVectorizer()
    try:
        file_vectors = vectorizer.fit_transform(processed_docs)
    except ValueError:
        # Raised when the vocabulary is empty (e.g. every file is blank or
        # contains only stopwords); treat that as "no file matches".
        match_counts = [0] * len(md_files)
    else:
        prompt_vector = vectorizer.transform([processed_prompt])
        match_counts = prompt_vector.dot(file_vectors.T).toarray()[0]

    # sorted() is stable, so files with equal scores keep their input order.
    ranked = sorted(
        zip(md_files, contents, match_counts),
        key=lambda item: item[2],
        reverse=True,
    )
    top = ranked[:max(top_n, 0)]

    context_prompt = '\n\n'.join([prompt] + [text for _, text, _ in top])

    # Bar chart of the scores for the selected files.
    fig = px.bar(
        x=[path for path, _, _ in top],
        y=[count for _, _, count in top],
    )
    fig.update_layout(xaxis_title='File', yaxis_title='Number of Matching Words')
    st.plotly_chart(fig)

    return context_prompt

# Streamlit app
def main():
    """Run the Streamlit UI.

    Lets the user upload a zip archive of ``.md`` files, lists the files that
    were found, and on request ranks them against a prompt via
    ``get_context_files`` and displays the resulting context prompt.
    """
    import shutil  # local import: only needed here to clear stale extractions

    st.title("Context-Aware Prompt Evaluation")

    # File upload
    uploaded_file = st.file_uploader("Upload a zip file with .md files", type="zip")
    if uploaded_file is None:
        return

    extract_dir = 'uploaded_files'

    # Start from an empty directory so files from a previous upload are not
    # mixed into this evaluation.
    # NOTE(review): the directory is shared by every session of the app;
    # concurrent users would still interfere with each other.
    shutil.rmtree(extract_dir, ignore_errors=True)

    # NOTE(review): the archive is user-supplied. zipfile strips absolute
    # paths and ".." components on extraction, but nothing here limits the
    # uncompressed size.
    try:
        with zipfile.ZipFile(uploaded_file, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
    except zipfile.BadZipFile:
        st.error("The uploaded file is not a valid zip archive.")
        return

    # Collect .md files at any depth: archives often wrap their content in a
    # top-level folder. macOS metadata ("__MACOSX" folders, "._*" files) is
    # skipped because those entries are not real markdown documents.
    md_files = []
    for root, dirs, names in os.walk(extract_dir):
        dirs[:] = [d for d in dirs if d != '__MACOSX']
        for name in names:
            if name.endswith('.md') and not name.startswith('._'):
                md_files.append(os.path.join(root, name))
    md_files.sort()

    if not md_files:
        st.warning("No .md files were found in the uploaded archive.")
        return

    # Show the list of files
    st.subheader("Uploaded Files")
    for file in md_files:
        st.write(file)

    # Prompt input. The widget key keeps the text in st.session_state across
    # reruns; `value` only supplies the initial default.
    prompt = st.text_area(
        "Enter your prompt",
        value='What are the main use cases of generative AI in healthcare that are currently unsolved?',
        key='prompt',
    )

    # Evaluate the files for the prompt
    if st.button("Evaluate"):
        context_prompt = get_context_files(prompt, md_files)
        st.subheader("Context Prompt")
        st.write(context_prompt)

# Start the app only when this file is executed as the entry script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()