import gradio as gr
from unsloth import FastLanguageModel
from transformers import pipeline
from langchain_huggingface.llms import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory

# Model loading configuration
max_seq_length = 2048  # Choose any! Unsloth auto-supports RoPE scaling internally.
dtype = None  # None for auto detection. Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage. Can be False.

# Load the fine-tuned Llama-3 8B instruct model with Unsloth
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Danielrahmai1991/finbro-v0.1.0-llama-3-8B-instruct-1m",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...",  # needed for gated models like meta-llama/Llama-2-7b-hf
)

# Wrap the model in a transformers text-generation pipeline and expose it to LangChain
FastLanguageModel.for_inference(model)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)

gpu_llm = HuggingFacePipeline(
    pipeline=pipe,
    batch_size=5,  # adjust as needed based on GPU memory and model size
    model_kwargs={
        "temperature": 0.75,
        "max_length": 512,
        "max_new_tokens": 256,
        "repetition_penalty": 1.15,
        "trust_remote_code": True,
    },
)

# Chain without memory: Alpaca-style prompt plus a few-shot chat prompt
alpaca_prompt_simple = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{question}

### Input:

### Response:
"""

prompt = PromptTemplate.from_template(alpaca_prompt_simple)
llm_chain_model = LLMChain(prompt=prompt, llm=gpu_llm.bind(skip_prompt=True))

examples = [
    {
        "query": "what is forex?",
        "answer": "Forex is an abbreviation for foreign exchange. It involves trading currencies from different countries with one another at the current market price.",
    },
]

example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{query}"),
        ("ai", "{answer}"),
    ]
)

few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)

# Chain with memory: the conversation history is injected into the prompt
alpaca_prompt_memory = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

{chat_history}

### Instruction:
{question}

### Input:

### Response:
"""

prompt_memory = PromptTemplate(
    input_variables=["chat_history", "question"],
    template=alpaca_prompt_memory,
)

memory = ConversationBufferMemory(memory_key="chat_history")

llm_chain_memory = LLMChain(
    llm=gpu_llm.bind(skip_prompt=True),
    prompt=prompt_memory,
    verbose=True,
    memory=memory,
)

# question = "give me a suggestion about investment"

def greet(question, model_type):
    """Route the user question to the memory or no-memory chain and return the model's answer."""
    print(f"question is {question}")
    if model_type == "With memory":
        print("With memory")
        response_of_llm = llm_chain_memory.predict(question=question)
    else:
        print("Without memory")
        query = question
        final_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", "You are a financial AI assistant."),
                few_shot_prompt,
                ("human", "{userInput}"),
            ]
        )
        messages = final_prompt.format(userInput=query)
        ai_out = llm_chain_model.invoke(messages)
        response_of_llm = ai_out["text"]
    print(f"out is: {response_of_llm}")
    return response_of_llm

# Gradio UI: a text box for the question and a dropdown to toggle conversation memory
demo = gr.Interface(
    fn=greet,
    inputs=[
        "text",
        gr.Dropdown(
            ["With memory", "Without memory"],
            label="Memory status",
            info="With memory the output is slower but keeps conversation context",
        ),
    ],
    outputs="text",
)

demo.launch(debug=True, share=True)
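
# --- Optional smoke test (a minimal sketch, not part of the original app) ---
# Assuming the model loads successfully, greet() can be exercised directly to
# check both branches; run these *before* demo.launch(), since launch(debug=True)
# blocks until the server is closed:
#
#   print(greet("what is forex?", "Without memory"))
#   print(greet("give me a suggestion about investment", "With memory"))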