"""Streamlit app that ranks uploaded Markdown files against a prompt.

Files are scored by how many single words, bigrams and trigrams (after
stop-word removal) they share with the prompt.  The best-scoring files are
appended to the prompt, with the shared terms highlighted in bold.
"""

import os
import re
import shutil
import zipfile
from functools import lru_cache

import nltk
import plotly.express as px
import streamlit as st
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('punkt')
nltk.download('stopwords')

# Directory (relative to the working directory) the uploaded zip is unpacked to.
UPLOAD_DIR = 'uploaded_files'
# Number of best-matching files used for the context prompt and the chart.
TOP_FILE_COUNT = 10
# Weights applied to bigram / trigram matches when computing the match count.
BIGRAM_WEIGHT = 4
TRIGRAM_WEIGHT = 9


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Tokenize *text* into lower-cased content words.

    Stop words are removed, and so are tokens that contain no letter or
    digit: punctuation tokens such as ',' or '?' would otherwise count as
    "matches" between almost any two texts and get highlighted everywhere.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of tokens in their original order.
    """
    stop_words = _english_stop_words()
    return [
        token
        for token in nltk.word_tokenize(text.lower())
        if token not in stop_words and any(ch.isalnum() for ch in token)
    ]


def _read_text(path):
    """Read a file as UTF-8, replacing undecodable bytes instead of failing."""
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        return f.read()


def _highlight_matches(content, matches):
    """Wrap every matched word or phrase in *content* in Markdown bold.

    All phrases are combined into one case-insensitive pattern and applied in
    a single pass, so text that has already been wrapped is never wrapped
    again.  Longer phrases are listed first so that a trigram wins over the
    bigrams and single words it contains.
    """
    phrases = [(word,) for word in matches['single_matches']]
    phrases.extend(matches['double_matches'])
    phrases.extend(matches['triple_matches'])
    if not phrases:
        return content

    phrases.sort(
        key=lambda phrase: (len(phrase), sum(len(token) for token in phrase)),
        reverse=True,
    )
    alternatives = '|'.join(
        r'[ \t]+'.join(re.escape(token) for token in phrase)
        for phrase in phrases
    )
    # The lookarounds keep a match from starting or ending inside a longer word.
    pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)', re.IGNORECASE)
    return pattern.sub(lambda found: f"**{found.group(0)}**", content)


def get_context_files(prompt, md_files):
    """Rank *md_files* against *prompt* and build an outline and context prompt.

    As a side effect, a bar chart of the match counts of the top files is
    rendered into the current Streamlit page.

    Args:
        prompt: The user's prompt text.
        md_files: Iterable of paths to the Markdown files to evaluate.

    Returns:
        A tuple ``(outline, context_prompt)``.  ``outline`` is a Markdown list
        of all files with their match counts and matched terms, best match
        first.  ``context_prompt`` is the prompt followed by the highlighted
        content of the top ``TOP_FILE_COUNT`` files.
    """
    # The prompt's n-grams do not depend on the file, so compute them once.
    prompt_tokens = preprocess_text(prompt)
    prompt_words = set(prompt_tokens)
    prompt_bigrams = set(nltk.bigrams(prompt_tokens))
    prompt_trigrams = set(nltk.trigrams(prompt_tokens))

    # Keep the raw content so the top files do not have to be read again.
    contents = {}
    file_matches = {}
    for file in md_files:
        content = _read_text(file)
        contents[file] = content
        tokens = preprocess_text(content)

        single_matches = set(tokens) & prompt_words
        double_matches = set(nltk.bigrams(tokens)) & prompt_bigrams
        triple_matches = set(nltk.trigrams(tokens)) & prompt_trigrams
        file_matches[file] = {
            'single_matches': single_matches,
            'double_matches': double_matches,
            'triple_matches': triple_matches,
            'match_count': (
                len(single_matches)
                + len(double_matches) * BIGRAM_WEIGHT
                + len(triple_matches) * TRIGRAM_WEIGHT
            ),
        }

    # Sort the files by match count, best match first.
    sorted_files = sorted(
        file_matches.items(),
        key=lambda item: item[1]['match_count'],
        reverse=True,
    )

    # Build the Markdown outline.  Matches are sorted so that the output is
    # stable between runs; detail lines are indented two spaces so that they
    # render as a nested list.
    outline_lines = ["## Outline"]
    for file, matches in sorted_files:
        outline_lines.append(f"- {file}: {matches['match_count']} matches")
        if matches['single_matches']:
            words = ', '.join(sorted(matches['single_matches']))
            outline_lines.append(f"  - Single word matches: {words}")
        if matches['double_matches']:
            pairs = ', '.join(
                ' '.join(pair) for pair in sorted(matches['double_matches'])
            )
            outline_lines.append(f"  - Double word matches: {pairs}")
        if matches['triple_matches']:
            trios = ', '.join(
                ' '.join(trio) for trio in sorted(matches['triple_matches'])
            )
            outline_lines.append(f"  - Triple word matches: {trios}")
    outline = '\n'.join(outline_lines) + '\n'

    # Concatenate the prompt and the highlighted content of the top files.
    top_files = sorted_files[:TOP_FILE_COUNT]
    prompt_parts = [prompt]
    prompt_parts.extend(
        _highlight_matches(contents[file], matches)
        for file, matches in top_files
    )
    context_prompt = '\n\n'.join(prompt_parts)

    # Plot the match counts of the top files (nothing to plot without files).
    if top_files:
        fig = px.bar(
            x=[file for file, _ in top_files],
            y=[matches['match_count'] for _, matches in top_files],
        )
        fig.update_layout(xaxis_title='File', yaxis_title='Match Count')
        st.plotly_chart(fig)

    return outline, context_prompt


def _extract_markdown_files(uploaded_file, target_dir=UPLOAD_DIR):
    """Unpack *uploaded_file* into *target_dir* and return its ``.md`` paths.

    The target directory is emptied first so that files from a previous
    upload cannot leak into the current evaluation.  The tree is searched
    recursively because archives frequently wrap their content in a folder.

    Raises:
        zipfile.BadZipFile: If the upload is not a valid zip archive.
    """
    shutil.rmtree(target_dir, ignore_errors=True)
    with zipfile.ZipFile(uploaded_file, 'r') as zip_ref:
        zip_ref.extractall(target_dir)

    md_files = []
    for root, dirs, files in os.walk(target_dir):
        # Skip the resource-fork folder macOS adds to zip archives; its
        # "._name.md" entries are binary metadata, not Markdown.
        dirs[:] = [name for name in dirs if name != '__MACOSX']
        md_files.extend(
            os.path.join(root, name) for name in files if name.endswith('.md')
        )
    return sorted(md_files)


# Streamlit app
def main():
    """Render the page: upload a zip, enter a prompt, show the evaluation."""
    st.title("Context-Aware Prompt Evaluation")

    # File upload
    uploaded_file = st.file_uploader("Upload a zip file with .md files", type="zip")
    if uploaded_file is None:
        return

    try:
        md_files = _extract_markdown_files(uploaded_file)
    except zipfile.BadZipFile:
        st.error("The uploaded file is not a valid zip archive.")
        return

    # Show the list of files
    st.subheader("Uploaded Files")
    if not md_files:
        st.warning("No .md files were found in the uploaded archive.")
        return
    for file in md_files:
        st.write(file)

    # Prompt input
    prompt = st.session_state.get(
        'prompt',
        'What are the main use cases of generative AI in healthcare that are currently unsolved?',
    )
    prompt = st.text_area("Enter your prompt", value=prompt, key='prompt')

    # Evaluate the files for the prompt
    if st.button("Evaluate"):
        outline, context_prompt = get_context_files(prompt, md_files)
        st.subheader("Outline")
        st.markdown(outline)
        st.subheader("Context Prompt")
        st.markdown(context_prompt)


if __name__ == '__main__':
    main()