Spaces:

awacke1
/

BetterThanRAGPattern

Sleeping

App Files Files Community

BetterThanRAGPattern / app.py

awacke1

Create app.py

2da2f3b verified 6 months ago

raw

history blame contribute delete

4.68 kB

	import os
	import zipfile
	import streamlit as st
	import nltk
	from nltk.corpus import stopwords
	from sklearn.feature_extraction.text import CountVectorizer
	import plotly.express as px

	# Fetch the NLTK data used below: Punkt models for nltk.word_tokenize and the
	# stopword lists for nltk.corpus.stopwords. Recent NLTK releases load the
	# tokenizer data from 'punkt_tab' rather than the pickled 'punkt' package, so
	# both are requested to keep the app working across NLTK versions.
	for resource in ('punkt', 'punkt_tab', 'stopwords'):
	    nltk.download(resource)

	def preprocess_text(text):
	    """Tokenize *text* into lowercase word tokens with stopwords removed.

	    Args:
	        text: Raw text (a prompt or the contents of a context file).

	    Returns:
	        A list of lowercase tokens in their original order, excluding
	        English stopwords and tokens that contain no letter or digit.
	    """
	    tokens = nltk.word_tokenize(text.lower())
	    stop_words = set(stopwords.words('english'))
	    # Punctuation-only tokens ("?", ",", "--") are not words; keeping them
	    # would let nearly every file "match" a prompt merely by containing the
	    # same punctuation mark.
	    return [
	        token
	        for token in tokens
	        if token not in stop_words and any(ch.isalnum() for ch in token)
	    ]

	def _highlight_matches(content, matches):
	    """Return *content* with every matched word or phrase wrapped in ``**``."""
	    import re  # function-scope import keeps this block self-contained

	    phrases = {' '.join(gram) for gram in matches['triple_matches']}
	    phrases.update(' '.join(gram) for gram in matches['double_matches'])
	    phrases.update(matches['single_matches'])
	    # Never embolden bare punctuation.
	    phrases = [p for p in phrases if any(ch.isalnum() for ch in p)]
	    if not phrases:
	        return content

	    # Longest alternatives first so a phrase wins over its component words,
	    # and a single substitution pass so already-bolded text is not re-wrapped.
	    phrases.sort(key=len, reverse=True)
	    pattern = re.compile(
	        r'(?<!\w)(?:' + '|'.join(re.escape(p) for p in phrases) + r')(?!\w)',
	        re.IGNORECASE,  # tokens are lowercased; the file text is not
	    )
	    return pattern.sub(r'**\g<0>**', content)


	def get_context_files(prompt, md_files, top_n=10):
	    """Rank files by n-gram overlap with *prompt* and build a context prompt.

	    Each file's score is ``singles + 4 * bigrams + 9 * trigrams``, where each
	    term counts the distinct n-grams (after ``preprocess_text``) that the file
	    shares with the prompt.

	    Side effect: draws a Plotly bar chart of the top scores via Streamlit.

	    Args:
	        prompt: The user's prompt text.
	        md_files: Iterable of paths of the files to evaluate.
	        top_n: Number of best-scoring files to include in the context prompt
	            and in the chart.

	    Returns:
	        A ``(outline, context_prompt)`` tuple. ``outline`` is a markdown list
	        of every file with its score and matches, best first;
	        ``context_prompt`` is the prompt followed by the contents of the top
	        files with the matched words and phrases in bold.
	    """
	    processed_prompt = preprocess_text(prompt)
	    # The prompt's n-gram sets do not change per file; build them once.
	    prompt_singles = set(processed_prompt)
	    prompt_doubles = set(nltk.bigrams(processed_prompt))
	    prompt_triples = set(nltk.trigrams(processed_prompt))

	    file_contents = {}
	    file_matches = {}
	    for file in md_files:
	        # Explicit encoding instead of the platform default; undecodable
	        # bytes in an uploaded file are replaced rather than aborting the run.
	        with open(file, 'r', encoding='utf-8', errors='replace') as f:
	            content = f.read()
	        file_contents[file] = content  # kept so top files are not re-read

	        tokens = preprocess_text(content)
	        single_matches = set(tokens) & prompt_singles
	        double_matches = set(nltk.bigrams(tokens)) & prompt_doubles
	        triple_matches = set(nltk.trigrams(tokens)) & prompt_triples
	        file_matches[file] = {
	            'single_matches': single_matches,
	            'double_matches': double_matches,
	            'triple_matches': triple_matches,
	            'match_count': (
	                len(single_matches)
	                + len(double_matches) * 4
	                + len(triple_matches) * 9
	            ),
	        }

	    # Best score first; sorted() is stable, so ties keep their input order.
	    sorted_files = sorted(
	        file_matches.items(),
	        key=lambda item: item[1]['match_count'],
	        reverse=True,
	    )

	    # Markdown outline. Matches are sorted so the text is deterministic, and
	    # sub-items are indented two spaces so they nest under their file bullet.
	    outline_parts = ["## Outline\n"]
	    for file, matches in sorted_files:
	        outline_parts.append(f"- {file}: {matches['match_count']} matches\n")
	        if matches['single_matches']:
	            singles = ', '.join(sorted(matches['single_matches']))
	            outline_parts.append(f"  - Single word matches: {singles}\n")
	        if matches['double_matches']:
	            doubles = ', '.join(' '.join(pair) for pair in sorted(matches['double_matches']))
	            outline_parts.append(f"  - Double word matches: {doubles}\n")
	        if matches['triple_matches']:
	            triples = ', '.join(' '.join(trio) for trio in sorted(matches['triple_matches']))
	            outline_parts.append(f"  - Triple word matches: {triples}\n")
	    outline = ''.join(outline_parts)

	    top_files = sorted_files[:top_n]

	    # Original prompt followed by the highlighted contents of the top files.
	    prompt_parts = [prompt]
	    for file, matches in top_files:
	        prompt_parts.append(_highlight_matches(file_contents[file], matches))
	    context_prompt = '\n\n'.join(prompt_parts)

	    # Bar chart of the match counts for the top files.
	    fig = px.bar(
	        x=[file for file, _ in top_files],
	        y=[matches['match_count'] for _, matches in top_files],
	    )
	    fig.update_layout(xaxis_title='File', yaxis_title='Match Count')
	    st.plotly_chart(fig)

	    return outline, context_prompt

	# Streamlit app
	def main():
	    """Render the Streamlit page: upload a zip of .md files and evaluate a prompt.

	    The archive is extracted into ``uploaded_files``; every ``.md`` file found
	    there (including in subfolders) is listed and, when "Evaluate" is pressed,
	    ranked against the prompt with ``get_context_files``.
	    """
	    import shutil  # function-scope import keeps this block self-contained

	    st.title("Context-Aware Prompt Evaluation")

	    # File upload
	    uploaded_file = st.file_uploader("Upload a zip file with .md files", type="zip")
	    if uploaded_file is None:
	        return

	    extract_dir = 'uploaded_files'
	    # Start from an empty directory so files left over from an earlier upload
	    # are not evaluated together with the current archive.
	    shutil.rmtree(extract_dir, ignore_errors=True)
	    try:
	        with zipfile.ZipFile(uploaded_file, 'r') as zip_ref:
	            zip_ref.extractall(extract_dir)
	    except zipfile.BadZipFile:
	        st.error("The uploaded file is not a valid zip archive.")
	        return

	    # Collect .md files recursively: archives often wrap their content in a
	    # folder. os.walk yields nothing (instead of raising) if nothing was
	    # extracted. Sorted for a stable display order.
	    md_files = []
	    for root, _dirs, names in os.walk(extract_dir):
	        md_files.extend(
	            os.path.join(root, name) for name in names if name.endswith('.md')
	        )
	    md_files.sort()

	    if not md_files:
	        st.warning("No .md files were found in the uploaded archive.")
	        return

	    # Show the list of files
	    st.subheader("Uploaded Files")
	    for file in md_files:
	        st.write(file)

	    # Prompt input
	    prompt = st.session_state.get('prompt', 'What are the main use cases of generative AI in healthcare that are currently unsolved?')
	    prompt = st.text_area("Enter your prompt", value=prompt, key='prompt')

	    # Evaluate the files for the prompt
	    if st.button("Evaluate"):
	        outline, context_prompt = get_context_files(prompt, md_files)
	        st.subheader("Outline")
	        st.markdown(outline)
	        st.subheader("Context Prompt")
	        st.markdown(context_prompt)

	# Script entry point: start the app when the file is executed directly.
	if __name__ == '__main__':
	    main()