### RAG code

```python
# Embedding model builder
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor


def set_embed_model(model_name: str, chunk_size: int = 256, chunk_overlap: int = 25) -> None:
    Settings.llm = None  # disable the default LLM; generation is handled by the Chainlit client below
    Settings.embed_model = HuggingFaceEmbedding(model_name=model_name)
    Settings.chunk_size = chunk_size
    Settings.chunk_overlap = chunk_overlap


class RAGModule:
    def __init__(self,
                 llm_model: str = "MarcoAland/llama3.1-rag-indo",
                 embedding_model: str = "MarcoAland/Indo-bge-m3",
                 docs_path: str = "data",
                 top_k: int = 3,
                 similarity_cutoff: float = 0.4):
        # Define the embedding model
        set_embed_model(model_name=embedding_model)

        # Build the vector index over the documents directory
        documents = SimpleDirectoryReader(docs_path).load_data()
        index = VectorStoreIndex.from_documents(documents)
        retriever = VectorIndexRetriever(
            index=index,
            similarity_top_k=top_k,
        )

        self.top_k = top_k
        self.query_engine = RetrieverQueryEngine(
            retriever=retriever,
            node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=similarity_cutoff)]
        )

    def format_context(self, response) -> str:
        # "Jawab dengan akurat" = "Answer accurately"
        context = "Jawab dengan akurat\n\nContext:\n"
        # Iterate over the retrieved nodes; the similarity cutoff may return fewer than top_k
        for node in response.source_nodes[:self.top_k]:
            context += node.text + "\n\n"
        return context

    def query(self, query: str) -> str:
        try:
            response = self.query_engine.query(query)
            return self.format_context(response)
        except Exception:
            return ""

    def prompt(self, context: str, instruction: str) -> str:
        return f"{context}\n ### Instruksi:\n {instruction}"

    def main(self, instruction: str) -> str:
        context = self.query(query=instruction)
        return self.prompt(context=context, instruction=instruction)
```

### Chainlit code

```python
import chainlit as cl
from openai import AsyncOpenAI

RAG_Trwira = RAGModule()

# Configure the async OpenAI-compatible client pointing at the Ollama server
# (Ollama's endpoint does not validate the api_key value)
client = AsyncOpenAI(api_key="34.69.9.203", base_url="http://34.69.9.203:11434/v1")

settings = {
    "model": "MarcoAland/llama3.1-rag-indo",
    "temperature": 0.3,
    "max_tokens": 2048,
}


@cl.on_chat_start
async def start_chat():
    # Display a greeting in the UI using Markdown
    # ("Hi, my name is Mitrakara. Welcome! Ready to be your partner in the professional world.")
    await cl.Message(
        content="# Hai, namaku Mitrakara👋\n\n## Selamat datang!\n\nSiap menjadi partner dalam berkarya didunia profesional😊"
    ).send()


@cl.on_message
async def main(message: cl.Message):
    if "document:" in message.content.lower() or "documents:" in message.content.lower():
        # Prepare the prompt with retrieved document context,
        # dropping the leading "documents:" command (10 characters)
        prompt = RAG_Trwira.main(message.content[10:])
    else:
        # No document context requested
        prompt = message.content

    # Format the prompt as an OpenAI-style message list
    message_formated = [
        {"role": "user", "content": prompt}
    ]

    # Create an initial empty message to stream tokens into
    msg = cl.Message(content="")
    await msg.send()

    # Stream partial responses from the model
    stream = await client.chat.completions.create(messages=message_formated, stream=True, **settings)
    async for part in stream:
        if token := part.choices[0].delta.content or "":
            await msg.stream_token(token)

    # Finalize the message after streaming completes
    await msg.update()
```
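To sanity-check the retrieval step on its own, `RAGModule` can be exercised outside Chainlit. The sketch below is illustrative only: it assumes a `data/` directory with at least one document, that the embedding model can be downloaded from Hugging Face, and the example question is made up.

```python
# Standalone check of the retrieval pipeline (illustrative; not part of the app)
rag = RAGModule(docs_path="data", top_k=3, similarity_cutoff=0.4)
prompt = rag.main("Apa saja kebijakan cuti karyawan?")  # hypothetical instruction
print(prompt)  # retrieved context followed by "### Instruksi:" and the question
```

Assuming both snippets live in one file (say `app.py`; the filename is just an example) and the Ollama server at the `base_url` above already serves `MarcoAland/llama3.1-rag-indo`, the chatbot starts with `chainlit run app.py -w`, where `-w` reloads the app on file changes. Prefixing a chat message with `documents:` routes it through `RAGModule`, so the retrieved context is prepended to the prompt before it is sent to the model.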