import streamlit as st
import pathlib

from huggingface_hub import hf_hub_download
from langchain_community.llms import LlamaCpp
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

from langchain_core.globals import set_debug

# Log every chain step's inputs and outputs (useful while developing)
set_debug(True)


# Cache the loaded model across Streamlit reruns so it is only loaded once
@st.cache_resource()
def load_llm(repo_id, filename):
    # Create a directory for models if it doesn't exist
    models_folder = pathlib.Path("models")
    models_folder.mkdir(exist_ok=True)

    # Download the model
    model_path = hf_hub_download(
        repo_id=repo_id, filename=filename, local_dir=models_folder
    )

    # Initialize llama.cpp on the locally downloaded weights
    llm = LlamaCpp(
        model_path=model_path,
        verbose=False,
        use_mmap=True,
        use_mlock=True,  # lock the model in RAM to avoid swapping
        n_threads=4,
        n_threads_batch=4,
        n_ctx=8000,
        max_tokens=128,
        # stop=["."],
    )
    print(f"{repo_id} loaded successfully. ✅")
    return llm


# Builds and invokes the RAG chain for the current question; despite the name,
# it returns the chain's full result dict rather than streaming tokens
def response_generator(llm, messages, question, retriever):
    # System prompt setting up context for the assistant
    system_prompt = (
        "<|im_start|>system\n"
        "You are an AI assistant specializing in question-answering tasks. "
        "Utilize the provided context and past conversation to answer "
        "the current question. If the answer is unknown, clearly state that you "
        "don't know. Keep responses concise and direct."
        "\n\n"
        "Context: {context}"
        "\n<|im_end|>"
    )
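    # Note: the <|im_start|>/<|im_end|> markers above follow the ChatML chat
    # format; this assumes the loaded GGUF model expects ChatML-style prompts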

    # Prepare message history
    message_history = [("system", system_prompt)]

    # Replay the prior conversation into the prompt's message history.
    # Note: ChatPromptTemplate treats these strings as f-string templates, so
    # literal "{" or "}" characters in past messages would need to be escaped.
    for message in messages:
        if message["role"] == "user":
            message_history.append(
                ("user", "<|im_start|>user\n" + message["content"] + "\n<|im_end|>")
            )
        elif message["role"] == "assistant":
            message_history.append(
                (
                    "assistant",
                    "<|im_start|>assistant\n" + message["content"] + "\n<|im_end|>",
                )
            )

    # End with an open assistant turn so the model writes the reply as a
    # continuation of the template
    message_history.append(("assistant", "<|im_start|>assistant\n"))

    # Create prompt template with full message history
    prompt = ChatPromptTemplate.from_messages(message_history)

    # Instantiate chains for document retrieval and question answering
    question_answer_chain = create_stuff_documents_chain(llm, prompt)
    rag_chain = create_retrieval_chain(retriever, question_answer_chain)
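    # (create_retrieval_chain routes the "input" value to the retriever, stuffs
    # the retrieved documents into the prompt's {context} slot, and returns a
    # dict with "input", "context", and "answer" keys.)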

    # Invoke the RAG (retrieval-augmented generation) chain with the current
    # input; debug tracing is already enabled globally via set_debug(True)
    results = rag_chain.invoke({"input": question})

    return results
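

# --- Usage sketch (not part of the original file) ----------------------------
# A minimal, hypothetical example of wiring the helpers above into a Streamlit
# chat loop. The GGUF repo/filename and the stub retriever below are
# assumptions; a real app would use a vector-store retriever instead.
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda

# Stub retriever: a Runnable passed to create_retrieval_chain receives the
# whole input dict and must return a list of Documents
stub_retriever = RunnableLambda(
    lambda inputs: [Document(page_content="LlamaCpp runs GGUF models locally.")]
)

llm = load_llm(
    "Qwen/Qwen2-0.5B-Instruct-GGUF",  # hypothetical repo_id
    "qwen2-0_5b-instruct-q4_k_m.gguf",  # hypothetical filename
)

if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay prior turns in the chat UI
for msg in st.session_state.messages:
    with st.chat_message(msg["role"]):
        st.markdown(msg["content"])

if question := st.chat_input("Ask a question"):
    st.session_state.messages.append({"role": "user", "content": question})
    with st.chat_message("user"):
        st.markdown(question)

    results = response_generator(
        llm, st.session_state.messages, question, stub_retriever
    )
    answer = results["answer"]

    with st.chat_message("assistant"):
        st.markdown(answer)
    st.session_state.messages.append({"role": "assistant", "content": answer})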