def average_color(image, bbox):
    """Return the mean colour of the pixels inside ``bbox``.

    Args:
        image: Object exposing ``crop(bbox)`` whose result NumPy can convert
            to an array (for example a PIL image).
        bbox: ``(left, upper, right, lower)`` box passed to ``image.crop``.

    Returns:
        A tuple with one mean value per channel. Single-channel regions are
        expanded to three identical channels so callers can always index
        ``[0]``, ``[1]`` and ``[2]``.
    """
    pixels = np.asarray(image.crop(bbox), dtype=float)
    if pixels.ndim == 2:
        # A 2-D array has no channel axis; averaging it would yield a scalar
        # that cannot be turned into a tuple.
        pixels = np.stack([pixels] * 3, axis=-1)
    return tuple(pixels.mean(axis=(0, 1)))


def _relative_luminance(color):
    """Approximate the luminance of an RGB colour with a plain 2.2 gamma."""
    return 0.2126 * pow(color[0] / 255, 2.2) + \
        0.7152 * pow(color[1] / 255, 2.2) + \
        0.0722 * pow(color[2] / 255, 2.2)


def color_contrast(color1, color2):
    """Return the contrast ratio between two RGB colours.

    The ratio is ``(lighter + 0.05) / (darker + 0.05)`` on the approximate
    luminance of each colour, so it is symmetric and always >= 1. Only the
    first three components of each colour are read.
    """
    l1 = _relative_luminance(color1)
    l2 = _relative_luminance(color2)
    lighter, darker = (l1, l2) if l1 > l2 else (l2, l1)
    return (lighter + 0.05) / (darker + 0.05)


def text_color_for_background(background_color):
    """Pick red or blue, whichever contrasts more with ``background_color``.

    Blue is returned when both contrasts are equal.
    """
    red = (255, 0, 0)
    blue = (0, 0, 255)
    contrast_red = color_contrast(background_color, red)
    contrast_blue = color_contrast(background_color, blue)
    return red if contrast_red > contrast_blue else blue


def _text_width(font, text):
    """Measure ``text`` in pixels with whichever API the font object offers."""
    if hasattr(font, "getlength"):
        return font.getlength(text)
    if hasattr(font, "getbbox"):
        left, _, right, _ = font.getbbox(text)
        return right - left
    # Legacy fonts only provide getsize().
    width, _ = font.getsize(text)
    return width


def _draw_line(draw, line, x, y, font, image, line_height):
    """Draw one line, coloured to contrast with the area it will cover."""
    width = max(1, int(np.ceil(_text_width(font, line))))
    bg_color = average_color(image, (x, y, x + width, y + line_height))
    draw.text((x, y), line, font=font, fill=text_color_for_background(bg_color))


def draw_text(draw, text, position, font, max_width, image, line_height=40):
    """Draw word-wrapped text whose colour adapts to the background.

    Args:
        draw: Drawing object exposing ``text(xy, text, font=..., fill=...)``.
        text: Text to render; it is split on whitespace and re-flowed.
        position: ``(x, y)`` of the top-left corner of the first line.
        font: Font used both for measuring and for drawing.
        max_width: Maximum measured width of a line, in the font's units.
        image: Image sampled (through ``average_color``) under each line to
            choose the text colour.
        line_height: Vertical advance between consecutive lines.

    A single word wider than ``max_width`` is placed on a line of its own
    rather than being split.
    """
    x, y = position
    current_line = ""
    for word in text.split():
        candidate = f"{current_line} {word}".strip()
        # Always accept the first word of a line so that an over-long word
        # does not produce an empty line in front of it.
        if not current_line or _text_width(font, candidate) <= max_width:
            current_line = candidate
        else:
            _draw_line(draw, current_line, x, y, font, image, line_height)
            y += line_height
            current_line = word
    if current_line:
        _draw_line(draw, current_line, x, y, font, image, line_height)
class SeeActAgent:
    def __init__(self,
                 config_path=None,
                 save_file_dir="seeact_agent_files",
                 save_task_id=None,
                 default_task='Search for the flight status for the flight AA 3942 leaving on Jun. 10"',
                 default_website="https://www.aa.com/homePage.do",
                 input_info=["screenshot"],
                 grounding_strategy="text_choice",
                 max_auto_op=50,
                 max_continuous_no_op=5,
                 highlight=False,
                 headless=False,
                 args=[],
                 browser_app="chrome",
                 persistant=False,
                 persistant_user_path="",
                 save_video=False,
                 viewport={
                     "width": 1280,
                     "height": 960
                 },
                 tracing=False,
                 trace={
                     "screenshots": True,
                     "snapshots": True,
                     "sources": True
                 },
                 rate_limit=-1,
                 model="gpt-4-turbo",
                 temperature=0.9
                 ):
        """Assemble the configuration and create the agent's working state.

        Args:
            config_path: Optional TOML file. When given, it supplies the whole
                configuration except the ``browser`` section.
            save_file_dir, default_task, default_website: Stored under
                ``config["basic"]`` when no ``config_path`` is given.
            save_task_id: Name of the output sub-directory; a timestamp is
                used when it is falsy.
            input_info, grounding_strategy, max_auto_op, max_continuous_no_op,
                highlight: Stored under ``config["agent"]`` when no
                ``config_path`` is given.
            headless, args, browser_app, persistant, persistant_user_path,
                save_video, viewport, tracing, trace: Always stored under
                ``config["browser"]``.
            rate_limit, model, temperature: Stored under ``config["openai"]``
                when no ``config_path`` is given; that section is passed as
                keyword arguments to ``engine_factory``.

        Raises:
            FileNotFoundError: ``config_path`` does not exist.
            toml.TomlDecodeError: ``config_path`` is not valid TOML.
        """
        # Local import keeps this block self-contained; used to detach the
        # stored configuration from the shared mutable default arguments.
        import copy

        # NOTE(review): statement nesting below was reconstructed from
        # whitespace-flattened source - confirm against the original file.
        if config_path is not None:
            try:
                with open(config_path, 'r', encoding="utf-8") as config_file:
                    config = toml.load(config_file)
            except FileNotFoundError:
                print(f"Error: File '{os.path.abspath(config_path)}' not found.")
                # Re-raise: continuing without a configuration would only
                # fail a few lines later with an unrelated error.
                raise
            except toml.TomlDecodeError:
                print(f"Error: File '{os.path.abspath(config_path)}' is not a valid TOML file.")
                raise
            print(f"Configuration File Loaded - {config_path}")
        else:
            config = {
                "basic": {
                    "save_file_dir": save_file_dir,
                    "default_task": default_task,
                    "default_website": default_website
                },
                "agent": {
                    "input_info": copy.deepcopy(input_info),
                    "grounding_strategy": grounding_strategy,
                    "max_auto_op": max_auto_op,
                    "max_continuous_no_op": max_continuous_no_op,
                    "highlight": highlight
                },
                "openai": {
                    "rate_limit": rate_limit,
                    "model": model,
                    "temperature": temperature
                }
            }
        # The browser section always comes from the constructor arguments and
        # replaces any "browser" table present in a loaded file.
        config.update({
            "browser": {
                "headless": headless,
                "args": copy.deepcopy(args),
                "browser_app": browser_app,
                "persistant": persistant,
                "persistant_user_path": persistant_user_path,
                "save_video": save_video,
                "viewport": copy.deepcopy(viewport),
                "tracing": tracing,
                "trace": copy.deepcopy(trace)
            }
        })

        self.config = config
        self.complete_flag = False
        self.session_control = {
            'active_page': None,
            'context': None,
            'browser': None
        }
        self.tasks = [self.config["basic"]["default_task"]]

        if save_task_id:
            self.main_path = os.path.join(self.config["basic"]["save_file_dir"], save_task_id)
        else:
            self.main_path = os.path.join(self.config["basic"]["save_file_dir"],
                                          datetime.now().strftime("%Y%m%d_%H%M%S"))
        # An already existing output directory marks the task as complete.
        if os.path.exists(self.main_path):
            self.complete_flag = True
        os.makedirs(self.main_path, exist_ok=True)

        # Define the list of actions here
        self.action_space = ["CLICK", "PRESS ENTER", "HOVER", "SCROLL UP", "SCROLL DOWN", "NEW TAB", "CLOSE TAB",
                             "GO BACK", "GO FORWARD",
                             "TERMINATE", "SELECT", "TYPE", "GOTO", "MEMORIZE"]
        # The comma after "PRESS PAGEDOWN" matters: without it Python joins
        # the two adjacent literals into "PRESS PAGEDOWNGO BACK".
        self.no_value_op = ["CLICK", "PRESS ENTER", "HOVER", "SCROLL UP", "SCROLL DOWN", "NEW TAB", "CLOSE TAB",
                            "PRESS HOME", "PRESS END", "PRESS PAGEUP", "PRESS PAGEDOWN",
                            "GO BACK",
                            "GO FORWARD",
                            "TERMINATE", "NONE"]
        self.with_value_op = ["SELECT", "TYPE", "GOTO", "MEMORIZE", "SAY"]
        self.no_element_op = ["PRESS ENTER", "SCROLL UP", "SCROLL DOWN", "NEW TAB", "CLOSE TAB", "GO BACK", "GOTO",
                              "PRESS HOME", "PRESS END", "PRESS PAGEUP", "PRESS PAGEDOWN",
                              "GO FORWARD",
                              "TERMINATE", "NONE", "MEMORIZE", "SAY"]

        # Initialize the primary logger
        self.logger = self._setup_logger(redirect_to_dev_log=False)

        self.engine = engine_factory(**self.config['openai'])
        self.taken_actions = []
        self.prompts = self._initialize_prompts()
        self.time_step = 0
        self.valid_op = 0
        self.continuous_no_op = 0
        self.predictions = []

        # Load the LLaVA-based checkpoint from a hard-coded filesystem path.
        disable_torch_init()
        self.pixui_model_path = os.path.expanduser(
            "/fs/ess/PAS1576/boyu_gou/train_vlm/ui_llava_fine_tune/checkpoints/only-web/merged-llava-v1.5-vicuna-7b-16k-pad-no-fusion-web-80k")
        self.pixui_model_name = get_model_name_from_path(self.pixui_model_path)
        self.pixui_tokenizer, self.pixui_model, self.pixui_image_processor, self.pixui_context_len = \
            load_pretrained_model(self.pixui_model_path, None, self.pixui_model_name)
At each stage, you can see the webpage by a screenshot and know the previous actions before the current step decided by yourself that have been executed for this task through recorded history. You need to decide on the first following action to take.''', "action_space": ''' Here are the descriptions of all allowed actions: No Value Operations: - CLICK: Click on a webpage element using the mouse. - PRESS ENTER: Press the Enter key, typically to submit a form or confirm an input. - SCROLL UP: Scroll the webpage upwards by half of the window height. - SCROLL DOWN: Scroll the webpage downwards by half of the window height. - PRESS HOME: Scroll to the top of the webpage. - PRESS END: Scroll to the bottom of the webpage. - PRESS PAGEUP: Scroll up by one window height. - PRESS PAGEDOWN: Scroll down by one window height. - GO BACK: Navigate to the previous page in the browser history. - GO FORWARD: Navigate to the next page in the browser history. - TERMINATE: End the current task, typically used when the task is considered complete or requires potentially harmful actions. - NONE: Indicates that no action is necessary at this stage. Used to skip an action or wait. With Value Operations: - SELECT: Choose an option from a dropdown menu or elements), you should try directly typing the input or selecting the choice, bypassing the need for an initial click. 4. You should not attempt to create accounts, log in or do the final submission. 5. Terminate when you deem the task complete or if it requires potentially harmful actions. 6. Details of