import gradio as gr
from transformers import AutoTokenizer
from petals import AutoDistributedModelForCausalLM
import npc_data

# Choose any model available at https://health.petals.dev
model_name = "daekeun-ml/Llama-2-ko-instruct-13B"
# daekeun-ml/Llama-2-ko-instruct-13B
# quantumaikr/llama-2-70b-fb16-korean
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoDistributedModelForCausalLM.from_pretrained(model_name)


# Run the model as if it were on your computer
def chat2(id, npc, text):
    # Use the incoming message as the prompt (the original left it empty).
    inputs = tokenizer(text, return_tensors="pt")["input_ids"]
    outputs = model.generate(inputs, max_new_tokens=100)
    # Decode only the newly generated tokens, not the echoed prompt.
    response = tokenizer.decode(outputs[0, inputs.shape[1]:], skip_special_tokens=True)
    return response


def chat(id, npc, text):
    # Placeholder reply: "{npc}'s response to {text}"
    return f"{npc}'s response to {text}"


with gr.Blocks() as demo:
    gr.Interface(
        fn=chat,
        inputs=["text", "text", "text"],
        outputs="text",
        description="chat: returns the AI response. Internally creates a transaction.\n/run/predict",
    )

# Queue requests; `enable_queue` was removed from launch() in recent Gradio versions.
demo.queue(max_size=32).launch()
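
# --- Client usage sketch (not part of the app above) ---
# The Interface description advertises the /run/predict endpoint. Below is a
# minimal sketch of calling it with the official gradio_client library,
# assuming the app is running locally on the default port; the URL and the
# sample arguments ("player-1", "merchant", "hello") are placeholders.
#
# from gradio_client import Client
#
# client = Client("http://localhost:7860")  # placeholder URL (assumption)
# result = client.predict("player-1", "merchant", "hello", api_name="/predict")
# print(result)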