from PIL import Image import base64 from io import BytesIO import os from openai import OpenAI import json class Captioner: def __init__(self, api_key_path = None, proxy=None, api_base="https://api.lingyiwanwu.com/v1"): # if api_key_path is None: # # try find datas/01_key.txt and ../datas/01_key.txt # cand_paths = ['datas/01_key.txt', '../datas/01_key.txt'] # flag = False # for path in cand_paths: # if os.path.exists(path): # api_key_path = path # flag = True # break # if not flag: # raise ValueError("Please provide the path to the API key file.") self.api_key = os.getenv('YI_VL_KEY') self.api_base = api_base # if proxy: # os.environ['HTTP_PROXY'] = proxy # os.environ['HTTPS_PROXY'] = proxy self.client = OpenAI( api_key=self.api_key, base_url=self.api_base ) self.history = {} self.history_file = None self.load_history() def load_access_token(self, file_path): with open(file_path, 'r') as file: return file.read().strip() def image2base64(self, image_path): # 打开图像 with Image.open(image_path) as img: # 检查图像高度是否超过480 if img.height > 480: # 计算调整后的宽度,以保持宽高比不变 aspect_ratio = img.width / img.height new_height = 480 new_width = int(new_height * aspect_ratio) img = img.resize((new_width, new_height), Image.ANTIALIAS) # 使用BytesIO在内存中保存调整大小后的图像 buffered = BytesIO() img.save(buffered, format="JPEG") buffered.seek(0) # 将图像转换为Base64编码字符串 img_base64 = "data:image/jpeg;base64," + base64.b64encode(buffered.read()).decode('utf-8') return img_base64 def load_history(self, jsonl_file_name=None): if jsonl_file_name is None: jsonl_file_name = "datas/caption_history.jsonl" self.history_file = jsonl_file_name if os.path.exists(jsonl_file_name): with open(jsonl_file_name, 'r', encoding='utf-8') as f: for line in f: data = json.loads(line) self.history[data['file_name']] = data['response'] def search_from_history(self, file_name): return self.history.get(file_name, None) def save_history(self, jsonl_file_name=None): if jsonl_file_name is None: jsonl_file_name = self.history_file if jsonl_file_name: with open(jsonl_file_name, 'w', encoding='utf-8') as f: for file_name, response in self.history.items(): json.dump({'file_name': file_name, 'response': response}, f, ensure_ascii=False) f.write('\n') # print(f"History saved to {jsonl_file_name}") def add_to_history(self, file_name, response): self.history[file_name] = response def caption(self, image_name): # Check if the caption is already in the history cached_response = self.search_from_history(image_name) if cached_response: # print("return the cache") return cached_response prompt = """Analyze the image and output in JSON format, including the following fields: - "detailed_description": A detailed description of the image content. - "major_object": Determine the main object/scene in the image based on the description, output with a simple word - "Chinese_name": 判断图片中主要物体的中文名 - "real_or_composite": Determine whether this image was taken with a camera or created/modifed by a computer, output with real or composite.""" img_base64 = self.image2base64(image_name) completion = self.client.chat.completions.create( model="yi-vision", messages=[ { "role": "user", "content": [ { "type": "text", "text": prompt }, { "type": "image_url", "image_url": { "url": img_base64 } } ] } ], stream=False ) response = completion.choices[0].message.content # Add the new response to history self.add_to_history(image_name, response) # Save history after adding the new entry self.save_history() return response if __name__ == "__main__": import os os.environ['HTTP_PROXY'] = 'http://localhost:8234' os.environ['HTTPS_PROXY'] = 'http://localhost:8234' captioner = Captioner() test_image = "temp_images/3zjz9b3l.jpg" print(captioner.caption(test_image))