ChatAcadamy / app.py
yangbh217's picture
Update app.py
ef40030 verified
import gradio as gr
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.llms import HuggingFaceHub
from getpass import getpass
import os
# Read the Hugging Face token from the ``hf_api`` environment variable and
# re-export it as ``HUGGINGFACEHUB_API_TOKEN`` (presumably the name the
# HuggingFaceHub client looks up -- confirm against the LangChain docs).
HUGGINGFACEHUB_API_TOKEN = os.getenv("hf_api")
if HUGGINGFACEHUB_API_TOKEN is not None:
    # os.environ accepts only str values: assigning None raises TypeError and
    # would abort the import whenever ``hf_api`` is not configured.
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN
def _collect_urls(urls, link_list_path="./shef_extracted_links.txt"):
    """Return the web page URLs to index: user/default URLs plus the link file."""
    # Drop blank lines and stray whitespace (e.g. a trailing newline in the
    # textbox) so that empty strings are never handed to the web loader.
    urls_list = [line.strip() for line in (urls or "").splitlines() if line.strip()]
    if not urls_list:
        urls_list = [
            "https://ollama.com/",
            "https://ollama.com/blog/windows-preview",
            "https://ollama.com/blog/openai-compatibility",
        ]

    # The link file is always appended; a missing file still raises
    # FileNotFoundError, exactly as before.
    with open(link_list_path, "r", encoding="utf-8") as f:
        urls_list += [line.strip() for line in f if line.strip()]
    return urls_list


def _format_docs(docs):
    """Join the text of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


def _extract_answer(output, question):
    """Return the text following the echoed ``Question: ...`` line, if any."""
    parts = output.split(f'Question: {question}')
    # When the model does not echo the prompt there is nothing to cut off;
    # return the whole output instead of raising IndexError.
    return parts[1] if len(parts) > 1 else output


def process_input(urls: str, question: str) -> str:
    """Answer *question* from web pages and a local PDF using a RAG chain.

    Args:
        urls: Newline-separated URLs to index. When empty (or only blank
            lines), a built-in default list is used. The links listed in
            ``./shef_extracted_links.txt`` are appended in either case.
        question: The question to answer from the indexed documents.

    Returns:
        The model output; if the output echoes the prompt, only the part that
        follows the ``Question: <question>`` line.

    Raises:
        FileNotFoundError: If ``./shef_extracted_links.txt`` is missing.
    """
    model_local = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.2")

    # Load every web page, then the bundled PDF, and chunk both the same way.
    docs_list = [
        doc
        for url in _collect_urls(urls)
        for doc in WebBaseLoader(url).load()
    ]
    pages = PyPDFLoader("./doc.pdf").load_and_split()
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=7500, chunk_overlap=100
    )
    doc_new = (
        text_splitter.split_documents(pages)
        + text_splitter.split_documents(docs_list)
    )

    # NOTE(review): the collection name is fixed and the store is rebuilt on
    # every call; chunks may accumulate across calls within one process --
    # verify against the Chroma client in use.
    vectorstore = Chroma.from_documents(
        documents=doc_new,
        collection_name="rag-chroma",
        embedding=embeddings.ollama.OllamaEmbeddings(model='nomic-embed-text'),
    )
    retriever = vectorstore.as_retriever()

    after_rag_template = (
        " Using the contexts below, answer the question.\n"
        "contexts:\n"
        "{context}\n"
        "Question: {question}\n"
    )
    after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
    after_rag_chain = (
        # The retriever itself must run on the question. The previous
        # ``lambda x: retriever`` only returned the retriever object, so the
        # prompt received its repr instead of any retrieved text.
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | after_rag_prompt
        | model_local
        | StrOutputParser()
    )
    return _extract_answer(after_rag_chain.invoke(question), question)
# Gradio UI: two text inputs (URLs, question) wired to process_input, with
# the answer rendered as plain text. The app starts serving on launch().
_url_box = gr.Textbox(label="Enter URLs separated by new lines")
_question_box = gr.Textbox(label="Question")

iface = gr.Interface(
    fn=process_input,
    inputs=[_url_box, _question_box],
    outputs="text",
    title="ChatAcadamy with Mistral",
    description="Enter URLs and a question to query the documents.",
)
iface.launch()