# -*- coding: utf-8 -*-
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM, LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
import torch
# Load the documents to index from the local ./data directory
documents = SimpleDirectoryReader("./data").load_data()
llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url='https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q5_K_M.gguf',
    # optionally, set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # set the context window below the model's maximum to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # -1 offloads all layers to the GPU; set to 0 to run on CPU only
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into the Llama 2 prompt format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
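# LlamaCPP downloads the quantized GGUF weights (since model_path is None) and
# runs them locally through llama.cpp, so no hosted inference API is needed.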
from llama_index.embeddings import HuggingFaceEmbedding

# HuggingFaceEmbedding() with no arguments loads BAAI/bge-small-en by default
# embed_model = HuggingFaceEmbedding()

# initialize our custom embedding model (BAAI/bge-small-en-v1.5)
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
service_context = ServiceContext.from_defaults(
    chunk_size=512,
    llm=llm,
    embed_model=embed_model,
)
"""Advanced RAG with Cross Encoder Reranker . Referred from: https://wandb.ai/ayush-thakur/llama-index-report/reports/Building-Advanced-Query-Engine-and-Evaluation-with-LlamaIndex-and-W-B--Vmlldzo0OTIzMjMy"""
from llama_index.indices.postprocessor import SentenceTransformerRerank

# Initialize the reranker; it keeps the top 3 chunks after rescoring
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_n=3
)
# create the query engine (baseline, without the reranker)
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()  # without reranker
def predict(input, history):
    response = query_engine.query(input)
    return str(response)
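# Note: the index, query engine, and predict() above are redefined below; the
# Gradio app ultimately uses the reranked, timed versions.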
# create the query engine with the cross-encoder reranker
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine(similarity_top_k=10, node_postprocessors=[rerank])  # retrieve 10 chunks first, then rerank down to 3
def predict(input, history):
    response = query_engine.query(input)
    return str(response)
import time
import gradio as gr

def predict(input, history):
    start_time = time.time()               # start the timer
    response = query_engine.query(input)   # process the query
    end_time = time.time()                 # stop the timer
    response_time = end_time - start_time  # time taken in seconds

    # Format the response to include the time taken
    timed_response = f"{response}\n\n(Response Time: {response_time:.2f} seconds)"
    return str(timed_response)
# Launch the Gradio chat UI
gr.ChatInterface(predict).launch(share=True, debug=True)
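# share=True exposes a temporary public Gradio link; debug=True keeps the process
# attached so errors surface in the console (useful when running in a notebook).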