import gradio as gr import os import torch from transformers import AutoTokenizer, AutoModelForCausalLM, LlavaNextProcessor, LlavaNextForConditionalGeneration from PIL import Image # 获取 Hugging Face 访问令牌 hf_token = os.getenv("HF_API_TOKEN") # 定义模型名称 vqa_model_name = "llava-hf/llava-v1.6-mistral-7b-hf" language_model_name = "larry1129/WooWoof_AI_Vision_merged_16bit_3b" # 全局变量用于缓存模型和分词器 vqa_processor = None vqa_model = None language_tokenizer = None language_model = None # 初始化看图说话模型 def load_vqa_model(): global vqa_processor, vqa_model if vqa_processor is None or vqa_model is None: vqa_processor = LlavaNextProcessor.from_pretrained(vqa_model_name, use_auth_token=hf_token) vqa_model = LlavaNextForConditionalGeneration.from_pretrained( vqa_model_name, torch_dtype=torch.float16, low_cpu_mem_usage=True ).to("cuda:0") return vqa_processor, vqa_model # 初始化纯语言模型 def load_language_model(): global language_tokenizer, language_model if language_tokenizer is None or language_model is None: language_tokenizer = AutoTokenizer.from_pretrained(language_model_name, use_auth_token=hf_token) language_model = AutoModelForCausalLM.from_pretrained( language_model_name, device_map="auto", torch_dtype=torch.float16 ) language_tokenizer.pad_token = language_tokenizer.eos_token language_model.config.pad_token_id = language_tokenizer.pad_token_id language_model.eval() return language_tokenizer, language_model # 从图片生成描述 # 定义生成响应的函数,并使用 @spaces.GPU 装饰 @spaces.GPU(duration=40) # 建议将 duration 增加到 120 def generate_image_description(image): vqa_processor, vqa_model = load_vqa_model() conversation = [ { "role": "user", "content": [ {"type": "text", "text": "What is shown in this image?"}, {"type": "image"}, ], }, ] prompt = vqa_processor.apply_chat_template(conversation, add_generation_prompt=True) inputs = vqa_processor(images=image, text=prompt, return_tensors="pt").to("cuda:0") with torch.no_grad(): output = vqa_model.generate(**inputs, max_new_tokens=100) image_description = vqa_processor.decode(output[0], skip_special_tokens=True) return image_description # 使用纯语言模型生成最终回答 # 定义生成响应的函数,并使用 @spaces.GPU 装饰 @spaces.GPU(duration=40) # 建议将 duration 增加到 120 def generate_language_response(instruction, image_description): language_tokenizer, language_model = load_language_model() prompt = f"""### Instruction: {instruction} ### Input: {image_description} ### Response: """ inputs = language_tokenizer(prompt, return_tensors="pt").to(language_model.device) with torch.no_grad(): outputs = language_model.generate( input_ids=inputs["input_ids"], attention_mask=inputs.get("attention_mask"), max_new_tokens=128, temperature=0.7, top_p=0.95, do_sample=True, ) response = language_tokenizer.decode(outputs[0], skip_special_tokens=True) response = response.split("### Response:")[-1].strip() return response # 整合的 Gradio 接口函数 def process_image_and_text(image, instruction): image_description = generate_image_description(image) final_response = generate_language_response(instruction, image_description) return f"图片描述: {image_description}\n\n最终回答: {final_response}" # 创建 Gradio 界面 iface = gr.Interface( fn=process_image_and_text, inputs=[ gr.Image(type="pil", label="上传图片"), gr.Textbox(lines=2, placeholder="Instruction", label="Instruction") ], outputs="text", title="WooWoof AI - 图片和文本交互", description="输入图片并添加指令,生成基于图片描述的回答。", allow_flagging="never" ) # 启动 Gradio 接口 iface.launch()