import gradio as gr
from unsloth import FastLanguageModel
from transformers import pipeline
from langchain_huggingface.llms import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory

# Model loading configuration
max_seq_length = 2048  # Choose any! Unsloth auto-supports RoPE scaling internally.
dtype = None  # None for auto detection. Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage. Can be False.

# Load the fine-tuned Llama-3 8B instruct model with Unsloth
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Danielrahmai1991/finbro-v0.1.0-llama-3-8B-instruct-1m",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...",  # needed for gated models like meta-llama/Llama-2-7b-hf
)

# Wrap the model in a transformers text-generation pipeline and expose it to LangChain
FastLanguageModel.for_inference(model)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)

gpu_llm = HuggingFacePipeline(
    pipeline=pipe,
    batch_size=5,  # adjust as needed based on GPU memory and model size
    model_kwargs={
        "temperature": 0.75,
        "max_length": 512,
        "max_new_tokens": 256,
        "repetition_penalty": 1.15,
        "trust_remote_code": True,
    },
)

# Chain without memory: Alpaca-style prompt plus a few-shot chat prompt
alpaca_prompt_simple = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{question}

### Input:

### Response:
"""

prompt = PromptTemplate.from_template(alpaca_prompt_simple)
llm_chain_model = LLMChain(prompt=prompt, llm=gpu_llm.bind(skip_prompt=True))

examples = [
    {
        "query": "what is forex?",
        "answer": "Forex is an abbreviation for foreign exchange. It involves trading currencies from different countries with one another at the current market price.",
    },
]

example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{query}"),
        ("ai", "{answer}"),
    ]
)

few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)

# Chain with memory: the conversation history is injected into the prompt
alpaca_prompt_memory = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

{chat_history}

### Instruction:
{question}

### Input:

### Response:
"""

prompt_memory = PromptTemplate(
    input_variables=["chat_history", "question"],
    template=alpaca_prompt_memory,
)

memory = ConversationBufferMemory(memory_key="chat_history")

llm_chain_memory = LLMChain(
    llm=gpu_llm.bind(skip_prompt=True),
    prompt=prompt_memory,
    verbose=True,
    memory=memory,
)

# question = "give me a suggestion about investment"

def greet(question, model_type):
    """Route the user question to the memory or no-memory chain and return the model's answer."""
    print(f"question is {question}")
    if model_type == "With memory":
        print("With memory")
        response_of_llm = llm_chain_memory.predict(question=question)
    else:
        print("Without memory")
        query = question
        final_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", "You are a financial AI assistant."),
                few_shot_prompt,
                ("human", "{userInput}"),
            ]
        )
        messages = final_prompt.format(userInput=query)
        ai_out = llm_chain_model.invoke(messages)
        response_of_llm = ai_out["text"]
    print(f"out is: {response_of_llm}")
    return response_of_llm

# Gradio UI: a text box for the question and a dropdown to toggle conversation memory
demo = gr.Interface(
    fn=greet,
    inputs=[
        "text",
        gr.Dropdown(
            ["With memory", "Without memory"],
            label="Memory status",
            info="With memory the output is slower but keeps conversation context",
        ),
    ],
    outputs="text",
)

demo.launch(debug=True, share=True)
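
# --- Optional smoke test (a minimal sketch, not part of the original app) ---
# Assuming the model loads successfully, greet() can be exercised directly to
# check both branches; run these *before* demo.launch(), since launch(debug=True)
# blocks until the server is closed:
#
#   print(greet("what is forex?", "Without memory"))
#   print(greet("give me a suggestion about investment", "With memory"))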