import os
import pickle
from json import dumps, loads
import time
from typing import Any, List, Mapping, Optional
import numpy as np
import openai
import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from huggingface_hub import HfFileSystem
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, Pipeline
# prompts
from assets.prompts import custom_prompts
# llama index
from llama_index.core import (
VectorStoreIndex,
PromptTemplate,
)
from llama_index.core.llms import (
CustomLLM,
CompletionResponse,
LLMMetadata,
)
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.llms.callbacks import llm_completion_callback
from llama_index.core.base.llms.types import ChatMessage
from llama_index.core import Settings
load_dotenv()
# openai.api_key = os.getenv("OPENAI_API_KEY")
fs = HfFileSystem()
# define prompt helper
# set maximum input size
CONTEXT_WINDOW = 2048
# set number of output tokens
NUM_OUTPUT = 525
# set maximum chunk overlap ratio
CHUNK_OVERLAP_RATIO = 0.2
ANSWER_FORMAT = """
Provide the answer to the user question in the following format:
[FORMAT]
Your answer to the user question above.
Reference:
The list of references (such as page number, title, chapter, section) to the specific sections of the documents that support your answer.
[END_FORMAT]
"""
# query engine templates
QUERY_ENGINE_QA_TEMPLATE = """
We have provided context information below:
[CONTEXT]
{context_str}
[END_CONTEXT]
Given this information, please answer the following question:
[QUESTION]
{query_str}
[END_QUESTION]
"""
QUERY_ENGINE_REFINE_TEMPLATE = """
The original query is as follows:
[QUESTION]
{query_str}
[END_QUESTION]
We have provided an existing answer:
[ANSWER]
{existing_answer}
[END_ANSWER]
We have the opportunity to refine the existing answer (only if needed) with some more
context below.
[CONTEXT]
{context_msg}
[END_CONTEXT]
Given the new context, refine the original answer to include more details like references \
to the specific sections of the documents that support your answer.
Refined Answer:
"""
CHAT_ENGINE_CONTEXT_PROMPT_TEMPLATE = """
The following is a friendly conversation between a user and an AI assistant.
The assistant is talkative and provides lots of specific details from its context.
If the assistant does not know the answer to a question, it truthfully says it
does not know.
Here are the relevant documents for the context:
{context_str}
Instruction: Based on the above documents, provide a detailed answer for the user question below. \
Include references to the specific sections of the documents that support your answer. \
Answer "don't know" if not present in the document.
"""
CHAT_ENGINE_CONDENSE_PROMPT_TEMPLATE = """
Given the following conversation between a user and an AI assistant and a follow-up question from the user,
rephrase the follow-up question to be a standalone question.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:
"""
@st.cache_resource
def load_model(model_name: str):
# llm_model_name = "bigscience/bloom-560m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
pipe = pipeline(
task="text-generation",
model=model,
tokenizer=tokenizer,
# device=0, # GPU device number
# max_length=512,
do_sample=True,
top_p=0.95,
top_k=50,
temperature=0.7,
)
return pipe
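# Example call (sketch): the model id below is only the one hinted at in the
# comment above and is an assumption; any Hugging Face causal-LM checkpoint
# should work.
#     pipe = load_model("bigscience/bloom-560m")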
class OurLLM(CustomLLM):
    context_window: int = CONTEXT_WINDOW
    num_output: int = NUM_OUTPUT
    model_name: str = ""
    pipeline: Optional[Pipeline] = None
@property
def metadata(self) -> LLMMetadata:
"""Get LLM metadata."""
return LLMMetadata(
context_window=CONTEXT_WINDOW,
num_output=NUM_OUTPUT,
model_name=self.model_name,
)
# The decorator is optional, but provides observability via callbacks on the LLM calls.
@llm_completion_callback()
def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
prompt_length = len(prompt)
response = self.pipeline(prompt, max_new_tokens=NUM_OUTPUT)[0]["generated_text"]
# only return newly generated tokens
text = response[prompt_length:]
return CompletionResponse(text=text)
    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any):
        # The transformers pipeline does not stream natively, so generate the
        # full completion and yield it back word by word.
        prompt_length = len(prompt)
        text = self.pipeline(prompt, max_new_tokens=NUM_OUTPUT)[0]["generated_text"]
        response = ""
        for token in text[prompt_length:].split(" "):
            response += token + " "
            yield CompletionResponse(text=response, delta=token + " ")
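# A minimal wiring sketch (assumption, not called anywhere in this file): the
# default model id is only the one hinted at in load_model above; any Hugging
# Face causal-LM checkpoint should work.
def build_local_llm(model_name: str = "bigscience/bloom-560m") -> OurLLM:
    llm = OurLLM(model_name=model_name, pipeline=load_model(model_name))
    # Registering the LLM on Settings makes llama_index query/chat engines pick
    # it up by default instead of falling back to OpenAI.
    Settings.llm = llm
    return llm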
class LlamaCustom:
def __init__(self, model_name: str, index: VectorStoreIndex):
self.model_name = model_name
self.index = index
self.chat_mode = "condense_plus_context"
self.memory = ChatMemoryBuffer.from_defaults()
self.verbose = True
def get_response(self, query_str: str, chat_history: List[ChatMessage]):
# https://docs.llamaindex.ai/en/stable/module_guides/deploying/chat_engines/
# https://docs.llamaindex.ai/en/stable/examples/query_engine/citation_query_engine/
# https://docs.llamaindex.ai/en/stable/examples/query_engine/knowledge_graph_rag_query_engine/
query_engine = self.index.as_query_engine(
text_qa_template=PromptTemplate(QUERY_ENGINE_QA_TEMPLATE + ANSWER_FORMAT),
refine_template=PromptTemplate(
QUERY_ENGINE_REFINE_TEMPLATE
            ),  # passing ANSWER_FORMAT here does not give the desired output; an output parser from llama_index may be needed instead
verbose=self.verbose,
)
# chat_engine = self.index.as_chat_engine(
# chat_mode=self.chat_mode,
# memory=self.memory,
# context_prompt=CHAT_ENGINE_CONTEXT_PROMPT_TEMPLATE,
# condense_prompt=CHAT_ENGINE_CONDENSE_PROMPT_TEMPLATE,
# # verbose=True,
# )
response = query_engine.query(query_str)
# response = chat_engine.chat(message=query_str, chat_history=chat_history)
return str(response)
def get_stream_response(self, query_str: str, chat_history: List[ChatMessage]):
response = self.get_response(query_str=query_str, chat_history=chat_history)
for word in response.split():
yield word + " "
time.sleep(0.05)
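# Example Streamlit hookup (sketch; the helper name, the index, and the default
# model id are assumptions, and the real app is expected to build the index
# elsewhere, e.g. with VectorStoreIndex.from_documents(...)).
def render_chat(index: VectorStoreIndex, model_name: str = "bigscience/bloom-560m") -> None:
    chatbot = LlamaCustom(model_name=model_name, index=index)
    if prompt := st.chat_input("Ask a question about the documents"):
        with st.chat_message("user"):
            st.write(prompt)
        with st.chat_message("assistant"):
            # st.write_stream consumes the word-by-word generator above.
            st.write_stream(chatbot.get_stream_response(prompt, chat_history=[]))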