Spaces:

VDebugger
/

VDebugger-generalist-for-VQA

Sleeping

File size: 31,392 Bytes

e20ef71

from typing import List, Union

from vision_functions import find_in_image, simple_qa, verify_property, best_text_match, compute_depth


def bool_to_yesno(bool_answer: bool) -> str:
    """Map a truthy value to the string "yes" and a falsy value to "no"."""
    if bool_answer:
        return "yes"
    return "no"


class ImagePatch:
    """A Python class containing a crop of an image centered around a particular object, as well as relevant information.

    Images are indexed as ``image[:, rows, columns]``: ``shape[1]`` is treated as the height and ``shape[2]`` as the
    width, and a crop keeps rows ``lower:upper`` and columns ``left:right``.

    Attributes
    ----------
    cropped_image : array_like
        An array-like of the cropped image taken from the image passed to the constructor.
    left : int
        An int describing the position of the left border of the crop's bounding box in the image passed to the constructor.
    lower : int
        An int describing the position of the bottom border of the crop's bounding box in the image passed to the constructor.
    right : int
        An int describing the position of the right border of the crop's bounding box in the image passed to the constructor.
    upper : int
        An int describing the position of the top border of the crop's bounding box in the image passed to the constructor.
    width : int
        Width of the crop (``cropped_image.shape[2]``).
    height : int
        Height of the crop (``cropped_image.shape[1]``).
    horizontal_center : float
        Midpoint between ``left`` and ``right``.
    vertical_center : float
        Midpoint between ``lower`` and ``upper``.

    Methods
    -------
    find(object_name: str) -> List[ImagePatch]
        Returns a list of new ImagePatch objects containing crops of the image centered around any objects found in the image matching the object_name.
    simple_query(question: str=None) -> str
        Returns the answer to a basic question asked about the image.
    exists(object_name: str) -> bool
        Returns True if the object specified by object_name is found in the image, and False otherwise.
    verify_property(object_name: str, property: str) -> bool
        Returns True if the object possesses the property, and False otherwise.
    compute_depth()
        Returns the median depth of the image crop.
    best_text_match(option_list: List[str]) -> str
        Returns the option that best matches the image.
    crop(left: int, lower: int, right: int, upper: int) -> ImagePatch
        Returns a new ImagePatch object containing a crop of the image at the given coordinates.
    """

    def __init__(self, image, left: int = None, lower: int = None, right: int = None, upper: int = None):
        """Initializes an ImagePatch object by cropping the image at the given coordinates and stores the coordinates as attributes.
        If no coordinates are provided, the image is left unmodified, and the coordinates are set to the dimensions of the image.
        If only some coordinates are provided, each missing one defaults to the corresponding image border.
        Parameters
        -------
        image : array_like
            An array-like of the original image, indexed as ``image[:, rows, columns]``.
        left : int
            An int describing the position of the left border of the crop's bounding box in the original image.
        lower : int
            An int describing the position of the bottom border of the crop's bounding box in the original image.
        right : int
            An int describing the position of the right border of the crop's bounding box in the original image.
        upper : int
            An int describing the position of the top border of the crop's bounding box in the original image.
        """
        no_box_given = left is None and lower is None and right is None and upper is None

        # Resolve every bound on its own so that a partially specified box (e.g. only ``left``)
        # still yields numeric coordinates; otherwise the center computations below fail on None.
        self.left = 0 if left is None else left
        self.lower = 0 if lower is None else lower
        self.right = image.shape[2] if right is None else right  # width
        self.upper = image.shape[1] if upper is None else upper  # height

        if no_box_given:
            # Keep the very same image object when nothing has to be cropped
            self.cropped_image = image
        else:
            self.cropped_image = image[:, self.lower:self.upper, self.left:self.right]

        self.width = self.cropped_image.shape[2]
        self.height = self.cropped_image.shape[1]

        self.horizontal_center = (self.left + self.right) / 2
        self.vertical_center = (self.lower + self.upper) / 2

    def find(self, object_name: str) -> List["ImagePatch"]:
        """Returns a list of ImagePatch objects, one per object in this crop matching object_name.
        The search itself is delegated to ``find_in_image``.
        Parameters
        -------
        object_name : str
            A string describing the name of the object to be found in the image.

        Examples
        --------
        >>> # Given an image: Find the foo.
        >>> def execute_command(image) -> List[ImagePatch]:
        >>>     image_patch = ImagePatch(image)
        >>>     foo_patches = image_patch.find("foo")
        >>>     return foo_patches
        """
        return find_in_image(self.cropped_image, object_name)

    def simple_query(self, question: str = None) -> str:
        """Returns the answer to a basic question asked about the image.
        If no question is provided, ``None`` is forwarded unchanged to ``simple_qa``, which is expected to fall back
        to "What is this?" (NOTE(review): the fallback is not visible in this file -- confirm in the backend).
        Parameters
        -------
        question : str
            A string describing the question to be asked.

        Examples
        -------
        >>> # Given an image: Which kind of animal is not eating?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     animal_patches = image_patch.find("animal")
        >>>     for animal_patch in animal_patches:
        >>>         if not animal_patch.verify_property("animal", "eating"):
        >>>             return animal_patch.simple_query("What kind of animal is eating?") # crop would include eating so keep it in the query
        >>>     # If no animal is not eating, query the image directly
        >>>     return image_patch.simple_query("Which kind of animal is not eating?")

        >>> # Given an image: What is in front of the horse?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     # contains a relation (around, next to, on, near, on top of, in front of, behind, etc), so ask directly
        >>>     return image_patch.simple_query("What is in front of the horse?")
        """
        return simple_qa(self.cropped_image, question)

    def exists(self, object_name: str) -> bool:
        """Returns True if the object specified by object_name is found in the image, and False otherwise.
        Implemented as "``find`` returned at least one patch".
        Parameters
        -------
        object_name : str
            A string describing the name of the object to be found in the image.

        Examples
        -------
        >>> # Given an image: Are there both cakes and gummy bears in the photo?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     is_cake = image_patch.exists("cake")
        >>>     is_gummy_bear = image_patch.exists("gummy bear")
        >>>     return bool_to_yesno(is_cake and is_gummy_bear)
        """
        return len(self.find(object_name)) > 0

    def verify_property(self, object_name: str, property: str) -> bool:
        """Returns True if the object possesses the property, and False otherwise.
        Differs from 'exists' in that it presupposes the existence of the object specified by object_name, instead checking whether the object possesses the property.
        Parameters
        -------
        object_name : str
            A string describing the name of the object to be found in the image.
        property : str
            A string describing the property to be checked.

        Examples
        -------
        >>> # Given an image: Do the letters have blue color?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     letters_patches = image_patch.find("letters")
        >>>     # Question assumes only one letter patch
        >>>     if len(letters_patches) == 0:
        >>>         # If no letters are found, query the image directly
        >>>         return image_patch.simple_query("Do the letters have blue color?")
        >>>     return bool_to_yesno(letters_patches[0].verify_property("letters", "blue"))
        """
        return verify_property(self.cropped_image, object_name, property)

    def compute_depth(self):
        """Returns the median depth of the image crop
        Parameters
        ----------
        Returns
        -------
        float
            the median of the depth map that the module-level ``compute_depth`` produces for this crop

        Examples
        --------
        >>> # Given an image: Find the bar furthest away.
        >>> def execute_command(image)->ImagePatch:
        >>>     image_patch = ImagePatch(image)
        >>>     bar_patches = image_patch.find("bar")
        >>>     bar_patches.sort(key=lambda bar: bar.compute_depth())
        >>>     return bar_patches[-1]
        """
        # Inside the method body the bare name resolves to the imported module-level function,
        # not to this method.
        depth_map = compute_depth(self.cropped_image)
        return depth_map.median()

    def best_text_match(self, option_list: List[str]) -> str:
        """Returns the string that best matches the image.
        Parameters
        -------
        option_list : List[str]
            A list with the names of the different options

        Examples
        -------
        >>> # Given an image: Is the cap gold or white?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     cap_patches = image_patch.find("cap")
        >>>     # Question assumes one cap patch
        >>>     if len(cap_patches) == 0:
        >>>         # If no cap is found, query the image directly
        >>>         return image_patch.simple_query("Is the cap gold or white?")
        >>>     return cap_patches[0].best_text_match(["gold", "white"])
        """
        return best_text_match(self.cropped_image, option_list)

    def crop(self, left: int, lower: int, right: int, upper: int) -> "ImagePatch":
        """Returns a new ImagePatch cropped from the current ImagePatch.
        The coordinates are interpreted relative to this patch's own crop, and the new patch
        stores them as given.
        Parameters
        -------
        left : int
            The leftmost pixel of the cropped image.
        lower : int
            The lowest pixel of the cropped image.
        right : int
            The rightmost pixel of the cropped image.
        upper : int
            The uppermost pixel of the cropped image.
        -------
        """
        return ImagePatch(self.cropped_image, left, lower, right, upper)


def best_image_match(list_patches: List[ImagePatch], content: List[str], return_index=False) -> Union[ImagePatch, int]:
    """Returns the patch most likely to contain the content.

    NOTE(review): as written, the body calls this same function again with unchanged arguments and has no
    base case. Presumably this definition is an interface stub and the real implementation is supplied
    elsewhere -- TODO confirm before calling it directly.

    Parameters
    ----------
    list_patches : List[ImagePatch]
        the candidate patches to choose from
    content : List[str]
        the object of interest
    return_index : bool
        if True, returns the index of the patch most likely to contain the object

    Returns
    -------
    ImagePatch or int
        Patch most likely to contain the object, or its index when return_index is True
    """
    return best_image_match(list_patches, content, return_index)


def distance(patch_a: "ImagePatch", patch_b: "ImagePatch") -> float:
    """
    Returns the distance between the edges of two ImagePatches. If the patches overlap, it returns a negative distance
    corresponding to the negative intersection over union.

    Only the ``left``, ``lower``, ``right`` and ``upper`` attributes of the patches are read. Patches that merely
    touch (no gap, no shared area) are at distance 0.

    Parameters
    ----------
    patch_a : ImagePatch
    patch_b : ImagePatch

    Returns
    -------
    float
        Euclidean gap between the two bounding boxes, or minus their intersection over union if they overlap.

    Examples
    --------
    # Return the qux that is closest to the foo
    >>> def execute_command(image):
    >>>     image_patch = ImagePatch(image)
    >>>     qux_patches = image_patch.find('qux')
    >>>     foo_patches = image_patch.find('foo')
    >>>     foo_patch = foo_patches[0]
    >>>     qux_patches.sort(key=lambda x: distance(x, foo_patch))
    >>>     return qux_patches[0]
    """
    # Gap between the boxes along each axis; zero on an axis where their extents overlap or touch.
    gap_x = max(patch_a.left - patch_b.right, patch_b.left - patch_a.right, 0)
    gap_y = max(patch_a.lower - patch_b.upper, patch_b.lower - patch_a.upper, 0)
    if gap_x > 0 or gap_y > 0:
        return float((gap_x ** 2 + gap_y ** 2) ** 0.5)

    # No gap on either axis: the boxes overlap (or touch), so score them by intersection over union.
    overlap_width = min(patch_a.right, patch_b.right) - max(patch_a.left, patch_b.left)
    overlap_height = min(patch_a.upper, patch_b.upper) - max(patch_a.lower, patch_b.lower)
    intersection = overlap_width * overlap_height
    area_a = (patch_a.right - patch_a.left) * (patch_a.upper - patch_a.lower)
    area_b = (patch_b.right - patch_b.left) * (patch_b.upper - patch_b.lower)
    union = area_a + area_b - intersection
    if intersection <= 0 or union <= 0:
        # Touching or degenerate (zero-area) boxes share no area
        return 0.0
    return -intersection / union


# Examples of using ImagePatch


# Given an image: What toy is wearing a shirt?
def execute_command(image) -> str:
    # "wearing" is not a relational verb, so resolve the question step by step
    question = "What toy is wearing a shirt?"
    whole_image = ImagePatch(image)
    candidates = whole_image.find("toy")
    if not candidates:
        # No toy detected: ask about the whole image instead
        return whole_image.simple_query(question)
    for candidate in candidates:
        if candidate.simple_query("Is the toy wearing a shirt?") == "yes":
            # The crop includes the shirt, so the full question still applies to it
            return candidate.simple_query(question)
    # No toy is wearing a shirt: fall back to the first detected toy
    return candidates[0].simple_query(question)


# Given an image: Who is the man staring at?
def execute_command(image) -> str:
    # The answer is the object of a relational verb ("staring at"), so query the whole image directly
    return ImagePatch(image).simple_query("Who is the man staring at?")


# Given an image: Find more visible chair.
def execute_command(image) -> ImagePatch:
    # Return the chair
    image_patch = ImagePatch(image)
    chair_patches = image_patch.find("chair")
    if len(chair_patches) == 0:
        # No chair detected: fall back to the whole image instead of failing on an empty list
        chair_patches = [image_patch]
    # Remember: return the chair
    return chair_patches[0]


# Given an image: Find lamp on the bottom.
def execute_command(image) -> ImagePatch:
    # Return the lamp
    image_patch = ImagePatch(image)
    lamp_patches = image_patch.find("lamp")
    if len(lamp_patches) == 0:
        # No lamp detected: fall back to the whole image instead of failing on an empty list
        lamp_patches = [image_patch]
    lamp_patches.sort(key=lambda lamp: lamp.vertical_center)
    # Remember: return the lamp
    return lamp_patches[0]  # Return the bottommost lamp


# Given a list of images: Does the pole that is near a building that is near a green sign and the pole that is near bushes that are near a green sign have the same material?
def execute_command(image_list) -> str:
    material_1 = None  # pole near a building that is near a green sign
    material_2 = None  # pole near bushes that are near a green sign
    for image in image_list:
        image = ImagePatch(image)
        # find the building
        building_patches = image.find("building")
        for building_patch in building_patches:
            poles = building_patch.find("pole")
            signs = building_patch.find("sign")
            greensigns = [sign for sign in signs if sign.verify_property('sign', 'green')]
            if len(poles) > 0 and len(greensigns) > 0:
                material_1 = poles[0].simple_query("What is the material of the pole?")
        # find the bush
        bushes_patches = image.find("bushes")
        for bushes_patch in bushes_patches:
            poles = bushes_patch.find("pole")
            signs = bushes_patch.find("sign")
            greensigns = [sign for sign in signs if sign.verify_property('sign', 'green')]
            if len(poles) > 0 and len(greensigns) > 0:
                material_2 = poles[0].simple_query("What is the material of the pole?")
    if material_1 is None or material_2 is None:
        # At least one of the two poles was never found, so there is nothing to compare;
        # without this guard None == None would be reported as "yes"
        return "no"
    return bool_to_yesno(material_1 == material_2)


# Given an image: Find middle kid.
def execute_command(image) -> ImagePatch:
    # Return the kid
    whole_image = ImagePatch(image)
    kids = whole_image.find("kid")
    if not kids:
        # Nothing detected: treat the whole image as the only candidate
        kids = [whole_image]
    # Order the candidates from left to right and pick the one in the middle
    left_to_right = sorted(kids, key=lambda candidate: candidate.horizontal_center)
    middle_index = len(left_to_right) // 2
    # Remember: return the kid
    return left_to_right[middle_index]


# Given an image: Is that blanket to the right of a pillow?
def execute_command(image) -> str:
    image_patch = ImagePatch(image)
    blanket_patches = image_patch.find("blanket")
    # Question assumes only one blanket patch
    if len(blanket_patches) == 0:
        # If no blanket is found, query the image directly
        return image_patch.simple_query("Is that blanket to the right of a pillow?")
    # The pillows do not depend on the blanket, so detect them once
    pillow_patches = image_patch.find("pillow")
    for blanket_patch in blanket_patches:
        for pillow_patch in pillow_patches:
            # The blanket is to the right of the pillow when its center has the larger horizontal coordinate
            if blanket_patch.horizontal_center > pillow_patch.horizontal_center:
                return "yes"
    return "no"


# Given an image: How many people are there?
def execute_command(image) -> str:
    # Count the person detections in the whole image
    detections = ImagePatch(image).find("person")
    return str(len(detections))


# Given a list of images: Is the man that is wearing dark pants driving?
def execute_command(image_list) -> str:
    for raw_image in image_list:
        for man_patch in ImagePatch(raw_image).find("man"):
            pants_patches = man_patch.find("pants")
            # Skip men without detected pants; otherwise judge by the first pants crop
            if pants_patches and pants_patches[0].verify_property("pants", "dark"):
                return man_patch.simple_query("Is this man driving?")
    # No man in dark pants was found: ask the full question about the first image
    return ImagePatch(image_list[0]).simple_query("Is the man that is wearing dark pants driving?")


# Given an image: Is there a backpack to the right of the man?
def execute_command(image) -> str:
    whole_image = ImagePatch(image)
    men = whole_image.find("man")
    if not men:
        # No man detected: ask about the whole image instead
        return whole_image.simple_query("Is there a backpack to the right of the man?")
    # Use the first detected man as the reference
    reference = men[0]
    backpacks = whole_image.find("backpack")
    # "yes" as soon as one backpack center lies to the right of the man's center;
    # an empty backpack list yields "no"
    is_to_the_right = any(pack.horizontal_center > reference.horizontal_center for pack in backpacks)
    return "yes" if is_to_the_right else "no"


# Given a list of images: What is the pizza with red tomato on it on?
def execute_command(image_list) -> str:
    for raw_image in image_list:
        for pizza_patch in ImagePatch(raw_image).find("pizza"):
            # Check every tomato crop on this pizza, then see whether any of them is red
            redness = [tomato.verify_property("tomato", "red") for tomato in pizza_patch.find("tomato")]
            if any(redness):
                return pizza_patch.simple_query("What is the pizza on?")
    # No pizza with a red tomato was found: ask the full question about the first image
    return ImagePatch(image_list[0]).simple_query("What is the pizza with red tomato on it on?")


# Given an image: Find chair to the right near the couch.
def execute_command(image) -> ImagePatch:
    # Return the chair
    image_patch = ImagePatch(image)
    chair_patches = image_patch.find("chair")
    if len(chair_patches) == 0:
        chair_patches = [image_patch]
    elif len(chair_patches) == 1:
        return chair_patches[0]
    chair_patches_right = [c for c in chair_patches if c.horizontal_center > image_patch.horizontal_center]
    if len(chair_patches_right) == 0:
        # No candidate lies right of the image center (always true for the whole-image fallback),
        # so consider every candidate instead of indexing an empty list
        chair_patches_right = chair_patches
    couch_patches = image_patch.find("couch")
    if len(couch_patches) == 0:
        couch_patches = [image_patch]
    couch_patch = couch_patches[0]
    # Among the candidates, pick the one nearest to the couch
    chair_patches_right.sort(key=lambda c: distance(c, couch_patch))
    chair_patch = chair_patches_right[0]
    # Remember: return the chair
    return chair_patch


# Given an image: Are there bagels or lemons?
def execute_command(image) -> str:
    whole_image = ImagePatch(image)
    # Run both detections up front, then answer "yes" if either object was found
    detections = [whole_image.exists(name) for name in ("bagel", "lemon")]
    return bool_to_yesno(any(detections))


# Given an image: In which part is the bread, the bottom or the top?
def execute_command(image) -> str:
    whole_image = ImagePatch(image)
    breads = whole_image.find("bread")
    if not breads:
        # No bread detected: ask about the whole image instead
        return whole_image.simple_query("In which part is the bread, the bottom or the top?")
    # Judge by the first bread crop: a vertical center below the image's own center counts as the bottom
    in_bottom_part = breads[0].vertical_center < whole_image.vertical_center
    return "bottom" if in_bottom_part else "top"


# Given an image: Find foo to bottom left.
def execute_command(image) -> ImagePatch:
    # Return the foo
    image_patch = ImagePatch(image)
    foo_patches = image_patch.find("foo")
    if len(foo_patches) == 0:
        # Nothing detected: fall back to the whole image rather than calling min() on an empty list
        foo_patches = [image_patch]
    lowermost_coordinate = min([patch.vertical_center for patch in foo_patches])
    # Keep the patches close to the bottom first, because the caption asks about vertical and then horizontal.
    # The lowest patch always qualifies, so this list is never empty.
    foo_patches_bottom = [patch for patch in foo_patches if patch.vertical_center - lowermost_coordinate < 100]
    # Then take the leftmost among them
    foo_patches_bottom.sort(key=lambda foo: foo.horizontal_center)
    foo_patch = foo_patches_bottom[0]
    # Remember: return the foo
    return foo_patch


# Given an image: Find number 17.
def execute_command(image) -> ImagePatch:
    # Return the person
    image_patch = ImagePatch(image)
    person_patches = image_patch.find("person")
    if len(person_patches) == 0:
        # No person detected: fall back to the whole image instead of failing on an empty list
        person_patches = [image_patch]
    for patch in person_patches:
        if patch.exists("17"):
            return patch
    # No candidate shows the number: default to the first one
    # Remember: return the person
    return person_patches[0]


# Given a list of images: Is the statement true? There is at least 1 image with a brown dog that is near a bicycle and is wearing a collar.
def execute_command(image_list) -> str:
    for image in image_list:
        image = ImagePatch(image)
        dog_patches = image.find("dog")
        for dog in dog_patches:
            # The statement is about a brown dog, so discard dogs of any other color first
            if not dog.verify_property("dog", "brown"):
                continue
            near_bicycle = dog.simple_query("Is the dog near a bicycle?")
            wearing_collar = dog.simple_query("Is the dog wearing a collar?")
            if near_bicycle == "yes" and wearing_collar == "yes":
                return 'yes'
    return 'no'


# Given an image: Find dog to the left of the post who is closest to girl wearing a shirt with text that says "I love you".
def execute_command(image) -> ImagePatch:
    # Return the dog
    image_patch = ImagePatch(image)
    shirt_patches = image_patch.find("shirt")
    if len(shirt_patches) == 0:
        shirt_patches = [image_patch]
    shirt_patch = best_image_match(list_patches=shirt_patches, content=["I love you shirt"])
    post_patches = image_patch.find("post")
    if len(post_patches) == 0:
        # No post detected: use the whole image as the reference instead of indexing an empty list
        post_patches = [image_patch]
    # Take the post nearest to the shirt
    post_patches.sort(key=lambda post: distance(post, shirt_patch))
    post_patch = post_patches[0]
    dog_patches = image_patch.find("dog")
    if len(dog_patches) == 0:
        # No dog detected: fall back to the whole image
        dog_patches = [image_patch]
    dogs_left_patch = [dog for dog in dog_patches if dog.left < post_patch.left]
    if len(dogs_left_patch) == 0:
        dogs_left_patch = dog_patches
    # Among the dogs to the left of the post, take the one nearest to the post
    dogs_left_patch.sort(key=lambda dog: distance(dog, post_patch))
    dog_patch = dogs_left_patch[0]
    # Remember: return the dog
    return dog_patch


# Given an image: Find balloon on the right and second from the bottom.
def execute_command(image) -> ImagePatch:
    # Return the balloon
    image_patch = ImagePatch(image)
    balloon_patches = image_patch.find("balloon")
    if len(balloon_patches) == 0:
        balloon_patches = [image_patch]
    elif len(balloon_patches) == 1:
        return balloon_patches[0]
    # "On the right": keep the balloons whose center is close to the rightmost center.
    # The rightmost balloon always qualifies, so this list is never empty.
    rightmost_coordinate = max([patch.horizontal_center for patch in balloon_patches])
    balloon_patches_right = [patch for patch in balloon_patches if rightmost_coordinate - patch.horizontal_center < 100]
    # Order from bottom to top and take the second one; if only one balloon is on the right, return it
    balloon_patches_right.sort(key=lambda p: p.vertical_center)
    balloon_patch = balloon_patches_right[min(1, len(balloon_patches_right) - 1)]
    # Remember: return the balloon
    return balloon_patch


# Given an image: Find girl in white next to man in left.
def execute_command(image) -> ImagePatch:
    # Return the girl
    image_patch = ImagePatch(image)
    girl_patches = image_patch.find("girl")
    if len(girl_patches) == 0:
        # No girl detected: fall back to the whole image instead of failing on an empty list
        girl_patches = [image_patch]
    girl_in_white_patches = [g for g in girl_patches if g.verify_property("girl", "white clothing")]
    if len(girl_in_white_patches) == 0:
        girl_in_white_patches = girl_patches
    man_patches = image_patch.find("man")
    if len(man_patches) == 0:
        # No man detected: use the whole image as the reference
        man_patches = [image_patch]
    man_patches.sort(key=lambda man: man.horizontal_center)
    leftmost_man = man_patches[0]  # First from the left
    # Take the girl nearest to the leftmost man
    girl_in_white_patches.sort(key=lambda girl: distance(girl, leftmost_man))
    girl_patch = girl_in_white_patches[0]
    # Remember: return the girl
    return girl_patch


# Given a list of images: Is the statement true? There is 1 table that is in front of woman that is wearing jacket.
def execute_command(image_list) -> str:
    for raw_image in image_list:
        for woman_patch in ImagePatch(raw_image).find("woman"):
            if woman_patch.simple_query("Is the woman wearing jacket?") != "yes":
                continue
            # The first woman wearing a jacket decides the answer
            table_count = len(woman_patch.find("table"))
            return bool_to_yesno(table_count == 1)
    # No woman wearing a jacket in any image
    return 'no'


# Given an image: Find top left.
def execute_command(image) -> ImagePatch:
    # Return the person
    image_patch = ImagePatch(image)
    # Figure out what thing the caption is referring to. We need a subject for every caption
    persons = image_patch.find("person")
    if len(persons) == 0:
        # Nothing detected: fall back to the whole image rather than calling max() on an empty list
        persons = [image_patch]
    top_all_objects = max([obj.vertical_center for obj in persons])
    # Select objects that are close to the top
    # We do this because the caption is asking first about vertical and then about horizontal.
    # The topmost object always qualifies, so this list is never empty.
    persons_top = [p for p in persons if top_all_objects - p.vertical_center < 100]
    # And after that, obtain the leftmost object among them
    persons_top.sort(key=lambda obj: obj.horizontal_center)
    person_leftmost = persons_top[0]
    # Remember: return the person
    return person_leftmost


# Given an image: What type of weather do you see in the photograph?
def execute_command(image) -> str:
    # A question about the scene as a whole: query the full image directly
    return ImagePatch(image).simple_query("What type of weather do you see in the photograph?")


# Given an image: How many orange life vests can be seen?
def execute_command(image) -> str:
    vests = ImagePatch(image).find("life vest")
    # Count the detected life vests that are verified to be orange
    orange_count = sum(1 for vest in vests if vest.verify_property('life vest', 'orange'))
    return str(orange_count)


# Given an image: What is behind the pole?
def execute_command(image) -> str:
    # The question contains a relation ("behind"), so query the whole image directly
    return ImagePatch(image).simple_query("What is behind the pole?")


# Given an image: Find second to top flower.
def execute_command(image) -> ImagePatch:
    # Return the flower
    image_patch = ImagePatch(image)
    flower_patches = image_patch.find("flower")
    if len(flower_patches) == 0:
        # No flower detected: fall back to the whole image instead of failing on an empty list
        flower_patches = [image_patch]
    # Order from bottom to top
    flower_patches.sort(key=lambda flower: flower.vertical_center)
    # Second from the top; with a single candidate there is no second one, so return that candidate
    flower_patch = flower_patches[-2] if len(flower_patches) >= 2 else flower_patches[-1]
    # Remember: return the flower
    return flower_patch


# Given an image: Find back.
def execute_command(image) -> ImagePatch:
    # Return the person
    image_patch = ImagePatch(image)
    person_patches = image_patch.find("person")
    if len(person_patches) == 0:
        # No person detected: fall back to the whole image instead of failing on an empty list
        person_patches = [image_patch]
    # Order by depth; the last entry has the largest depth, i.e. it is the one at the back
    person_patches.sort(key=lambda person: person.compute_depth())
    person_patch = person_patches[-1]
    # Remember: return the person
    return person_patch


# Given an image: Find chair at the front.
def execute_command(image) -> ImagePatch:
    # Return the chair
    image_patch = ImagePatch(image)
    chair_patches = image_patch.find("chair")
    if len(chair_patches) == 0:
        # No chair detected: fall back to the whole image instead of failing on an empty list
        chair_patches = [image_patch]
    # Order by depth; the first entry has the smallest depth, i.e. it is the one at the front
    chair_patches.sort(key=lambda chair: chair.compute_depth())
    chair_patch = chair_patches[0]
    # Remember: return the chair
    return chair_patch


# Given an image: Find white and yellow pants.
def execute_command(image) -> ImagePatch:
    # Return the person
    image_patch = ImagePatch(image)
    # Clothing always requires returning the person
    person_patches = image_patch.find("person")
    if len(person_patches) == 0:
        # No person detected: match against the whole image so the candidate list is never empty
        person_patches = [image_patch]
    person_patch = best_image_match(person_patches, ["white pants", "yellow pants"])
    # Remember: return the person
    return person_patch


# Given an image: Find cow facing the camera.
def execute_command(image) -> ImagePatch:
    # Return the cow
    whole_image = ImagePatch(image)
    # Use the detected cows, or the whole image when none was detected
    candidates = whole_image.find("cow") or [whole_image]
    # Remember: return the cow
    return best_image_match(list_patches=candidates, content=["cow facing the camera"])


# Given a list of images: Is the statement true? There is 1 image that contains exactly 3 blue papers.
def execute_command(image_list) -> str:
    matching_images = 0
    for raw_image in image_list:
        papers = ImagePatch(raw_image).find("paper")
        # Number of papers in this image that are verified to be blue
        blue_count = sum(1 for paper in papers if paper.verify_property("paper", "blue"))
        if blue_count == 3:
            matching_images += 1
    # The statement holds when exactly one image contains exactly 3 blue papers
    return bool_to_yesno(matching_images == 1)


# Given an image: Find black car just under stop sign.
def execute_command(image) -> ImagePatch:
    # Return the car
    image_patch = ImagePatch(image)
    stop_sign_patches = image_patch.find("stop sign")
    if len(stop_sign_patches) == 0:
        stop_sign_patches = [image_patch]
    stop_sign_patch = stop_sign_patches[0]
    car_patches = image_patch.find("black car")
    if len(car_patches) == 0:
        # No black car detected: fall back to the whole image
        car_patches = [image_patch]
    car_under_stop = [car for car in car_patches if car.upper < stop_sign_patch.upper]
    if len(car_under_stop) == 0:
        # No car lies under the stop sign: consider every candidate instead of indexing an empty list
        car_under_stop = car_patches
    # Find car that is closest to the stop sign: the smallest absolute vertical offset comes first
    # (a signed offset would put the farthest car below the sign first)
    car_under_stop.sort(key=lambda car: abs(car.vertical_center - stop_sign_patch.vertical_center))
    # Remember: return the car
    return car_under_stop[0]


# Given a list of images: Is there either a standing man that is holding a cell phone or a sitting man that is holding a cell phone?
def execute_command(image_list) -> str:
    for raw_image in image_list:
        for man_patch in ImagePatch(raw_image).find("man"):
            if man_patch.simple_query("Is this man holding a cell phone?") != "yes":
                continue
            # The man holds a cell phone: accept him if he is sitting or, failing that, standing
            for pose_question in ("Is this man sitting?", "Is this man standing?"):
                if man_patch.simple_query(pose_question) == "yes":
                    return 'yes'
    return 'no'


# Given an image: How many people are running while looking at their cell phone?
def execute_command(image) -> str:
    whole_image = ImagePatch(image)
    people = whole_image.find("person")
    if not people:
        # No person detected: ask about the whole image instead
        return whole_image.simple_query("How many people are running while looking at their cell phone?")
    matches = 0
    for person in people:
        # Two conditions must hold: (1) running (2) looking at cell phone.
        # The second question is only asked when the first one is answered "yes".
        is_running = person.simple_query("Is the person running?") == "yes"
        if is_running and person.simple_query("Is the person looking at cell phone?") == "yes":
            matches += 1
    return str(matches)


# Given a list of images: Does the car that is on a highway and the car that is on a street have the same color?
def execute_command(image_list) -> str:
    color_1 = None  # color of the car that is on a highway
    color_2 = None  # color of the car that is on a street
    for image in image_list:
        image = ImagePatch(image)
        car_patches = image.find("car")
        for car_patch in car_patches:
            if car_patch.simple_query("Is the car on the highway?") == "yes":
                color_1 = car_patch.simple_query("What is the color of the car?")
            elif car_patch.simple_query("Is the car on a street?") == "yes":
                color_2 = car_patch.simple_query("What is the color of the car?")
    if color_1 is None or color_2 is None:
        # At least one of the two cars was never found, so there is nothing to compare;
        # without this guard None == None would be reported as "yes"
        return "no"
    return bool_to_yesno(color_1 == color_2)


# Given a list of images: Is the statement true? There are 3 magazine that are on table.
def execute_command(image_list) -> str:
    magazines_on_table = 0
    # Count, across all images, the magazines that are reported to be on a table
    for raw_image in image_list:
        for magazine in ImagePatch(raw_image).find("magazine"):
            if magazine.simple_query("Is the magazine on a table?") == "yes":
                magazines_on_table += 1
    return bool_to_yesno(magazines_on_table == 3)


# INSERT_QUERY_HERE