import streamlit as st
import torch
from transformers import BitsAndBytesConfig
from llama_index.prompts import PromptTemplate
from llama_index.llms import HuggingFaceLLM
from llama_index import ServiceContext, VectorStoreIndex, SummaryIndex
import faiss
import time
from pypdf import PdfReader
from huggingface_hub import delete_file
from pathlib import Path
import requests
import os

st.success(
    "If you'd like to learn more about the technical details of YeCases, "
    "check out the LlamaIndex blog post: "
    "[How I built the Streamlit LLM application using LlamaIndex.]"
)

# with open("docs/news.md", "r") as f:
#     st.success(f.read())
# with open("docs/main.md", "r") as f:
#     st.info(f.read())

# Load the model in 4-bit NF4 precision (with double quantization) so it fits
# on a single consumer GPU; computation runs in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)


def messages_to_prompt(messages):
    """Convert a list of chat messages into the Zephyr prompt format."""
    prompt = ""
    for message in messages:
        if message.role == "system":
            prompt += f"<|system|>\n{message.content}\n"
        elif message.role == "user":
            prompt += f"<|user|>\n{message.content}\n"
        elif message.role == "assistant":
            prompt += f"<|assistant|>\n{message.content}\n"

    # Ensure we start with a system prompt; insert a blank one if needed.
    if not prompt.startswith("<|system|>\n"):
        prompt = "<|system|>\n\n" + prompt

    # Add the final assistant prompt so the model knows to respond.
    prompt = prompt + "<|assistant|>\n"

    return prompt


# model_name = "mistralai/Mistral-7B-v0.1"
model_name = "HuggingFaceH4/zephyr-7b-beta"

llm = HuggingFaceLLM(
    model_name=model_name,
    tokenizer_name=model_name,
    query_wrapper_prompt=PromptTemplate(
        "<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n"
    ),
    context_window=3900,
    max_new_tokens=256,
    model_kwargs={"quantization_config": quantization_config},
    # tokenizer_kwargs={},
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
    messages_to_prompt=messages_to_prompt,
    device_map="auto",
)
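
# --- Illustrative sketch (assumption, not part of the original script) ---
# The imports above pull in ServiceContext and VectorStoreIndex but this
# section never uses them, so the following shows one plausible way the
# configured `llm` could be wired into a query pipeline. The "docs" folder
# and the local embedding model name are assumptions for illustration only.
#
# from llama_index import SimpleDirectoryReader
#
# service_context = ServiceContext.from_defaults(
#     llm=llm,
#     embed_model="local:BAAI/bge-small-en-v1.5",
# )
# documents = SimpleDirectoryReader("docs").load_data()
# index = VectorStoreIndex.from_documents(
#     documents, service_context=service_context
# )
# query_engine = index.as_query_engine()
# st.write(query_engine.query("What is YeCases about?").response)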