# File size: 31,392 Bytes
# e20ef71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
from typing import List, Union

from vision_functions import find_in_image, simple_qa, verify_property, best_text_match, compute_depth


def bool_to_yesno(bool_answer: bool) -> str:
    """Translate a truth value into the answer string "yes" or "no"."""
    if bool_answer:
        return "yes"
    return "no"


class ImagePatch:
    """A Python class containing a crop of an image centered around a particular object, as well as relevant information.
    Attributes
    ----------
    cropped_image : array_like
        An array-like of the cropped image taken from the original image.
    left : int
        An int describing the position of the left border of the crop's bounding box in the original image.
    lower : int
        An int describing the position of the bottom border of the crop's bounding box in the original image.
    right : int
        An int describing the position of the right border of the crop's bounding box in the original image.
    upper : int
        An int describing the position of the top border of the crop's bounding box in the original image.
    width : int
        Size of the crop along its last axis.
    height : int
        Size of the crop along its second axis.
    horizontal_center : float
        Midpoint between left and right.
    vertical_center : float
        Midpoint between lower and upper.

    Methods
    -------
    find(object_name: str) -> List[ImagePatch]
        Returns a list of new ImagePatch objects containing crops of the image centered around any objects found in the image matching the object_name.
    simple_query(question: str=None) -> str
        Returns the answer to a basic question asked about the image. If no question is provided, returns the answer to "What is this?".
    exists(object_name: str) -> bool
        Returns True if the object specified by object_name is found in the image, and False otherwise.
    verify_property(object_name: str, property: str) -> bool
        Returns True if the object possesses the property, and False otherwise.
    compute_depth()->float
        Returns the median depth of the image crop.
    best_text_match(option_list: List[str]) -> str
        Returns the option that best matches the image.
    crop(left: int, lower: int, right: int, upper: int) -> ImagePatch
        Returns a new ImagePatch object containing a crop of the image at the given coordinates.
    """

    def __init__(self, image, left: int = None, lower: int = None, right: int = None, upper: int = None):
        """Initializes an ImagePatch object by cropping the image at the given coordinates and stores the coordinates as attributes.
        If no coordinates are provided, the image is left unmodified, and the coordinates are set to the dimensions of the image.
        If only some coordinates are provided, each missing one defaults to the corresponding border of the image.
        Parameters
        -------
        image : array_like
            An array-like of the original image. Its second axis is read as the height and its third axis as the width.
        left : int
            An int describing the position of the left border of the crop's bounding box in the original image.
        lower : int
            An int describing the position of the bottom border of the crop's bounding box in the original image.
        right : int
            An int describing the position of the right border of the crop's bounding box in the original image.
        upper : int
            An int describing the position of the top border of the crop's bounding box in the original image.
        """
        image_height = image.shape[1]
        image_width = image.shape[2]

        if left is None and right is None and upper is None and lower is None:
            # No bounding box at all: keep the image object itself, un-sliced
            self.cropped_image = image
            left, lower, right, upper = 0, 0, image_width, image_height
        else:
            # Fill in any missing border so the stored coordinates are always
            # numbers; otherwise the center computation below fails on None
            left = 0 if left is None else left
            lower = 0 if lower is None else lower
            right = image_width if right is None else right
            upper = image_height if upper is None else upper
            self.cropped_image = image[:, lower:upper, left:right]

        self.left = left
        self.upper = upper
        self.right = right
        self.lower = lower

        self.width = self.cropped_image.shape[2]
        self.height = self.cropped_image.shape[1]

        self.horizontal_center = (self.left + self.right) / 2
        self.vertical_center = (self.lower + self.upper) / 2

    def find(self, object_name: str) -> List["ImagePatch"]:
        """Returns a list of ImagePatch objects, one per object in this crop that matches object_name.
        The search is delegated to find_in_image on the cropped image.
        Parameters
        -------
        object_name : str
            A string describing the name of the object to be found in the image.

        Examples
        --------
        >>> # Given an image: Find the foo.
        >>> def execute_command(image) -> List[ImagePatch]:
        >>>     image_patch = ImagePatch(image)
        >>>     foo_patches = image_patch.find("foo")
        >>>     return foo_patches
        """
        return find_in_image(self.cropped_image, object_name)

    def simple_query(self, question: str = None) -> str:
        """Returns the answer to a basic question asked about the image. If no question is provided, returns the answer to "What is this?".
        Parameters
        -------
        question : str
            A string describing the question to be asked.

        Examples
        -------
        >>> # Given an image: Which kind of animal is not eating?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     animal_patches = image_patch.find("animal")
        >>>     for animal_patch in animal_patches:
        >>>         if not animal_patch.verify_property("animal", "eating"):
        >>>             return animal_patch.simple_query("What kind of animal is eating?") # crop would include eating so keep it in the query
        >>>     # If no animal is not eating, query the image directly
        >>>     return image_patch.simple_query("Which kind of animal is not eating?")

        >>> # Given an image: What is in front of the horse?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     # contains a relation (around, next to, on, near, on top of, in front of, behind, etc), so ask directly
        >>>     return image_patch.simple_query("What is in front of the horse?")
        """
        return simple_qa(self.cropped_image, question)

    def exists(self, object_name: str) -> bool:
        """Returns True if the object specified by object_name is found in the image, and False otherwise.
        Parameters
        -------
        object_name : str
            A string describing the name of the object to be found in the image.

        Examples
        -------
        >>> # Given an image: Are there both cakes and gummy bears in the photo?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     is_cake = image_patch.exists("cake")
        >>>     is_gummy_bear = image_patch.exists("gummy bear")
        >>>     return bool_to_yesno(is_cake and is_gummy_bear)
        """
        # An object exists exactly when find() reports at least one match
        return len(self.find(object_name)) > 0

    def verify_property(self, object_name: str, property: str) -> bool:
        """Returns True if the object possesses the property, and False otherwise.
        Differs from 'exists' in that it presupposes the existence of the object specified by object_name, instead checking whether the object possesses the property.
        Parameters
        -------
        object_name : str
            A string describing the name of the object to be found in the image.
        property : str
            A string describing the property to be checked.

        Examples
        -------
        >>> # Given an image: Do the letters have blue color?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     letters_patches = image_patch.find("letters")
        >>>     # Question assumes only one letter patch
        >>>     if len(letters_patches) == 0:
        >>>         # If no letters are found, query the image directly
        >>>         return image_patch.simple_query("Do the letters have blue color?")
        >>>     return bool_to_yesno(letters_patches[0].verify_property("letters", "blue"))
        """
        return verify_property(self.cropped_image, object_name, property)

    def compute_depth(self):
        """Returns the median depth of the image crop
        Parameters
        ----------
        Returns
        -------
        float
            the median depth of the image crop

        Examples
        --------
        >>> # Given an image: Find the bar furthest away.
        >>> def execute_command(image)->ImagePatch:
        >>>     image_patch = ImagePatch(image)
        >>>     bar_patches = image_patch.find("bar")
        >>>     bar_patches.sort(key=lambda bar: bar.compute_depth())
        >>>     return bar_patches[-1]
        """
        depth_map = compute_depth(self.cropped_image)
        return depth_map.median()

    def best_text_match(self, option_list: List[str]) -> str:
        """Returns the string that best matches the image.
        Parameters
        -------
        option_list : List[str]
            A list with the names of the different options

        Examples
        -------
        >>> # Given an image: Is the cap gold or white?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     cap_patches = image_patch.find("cap")
        >>>     # Question assumes one cap patch
        >>>     if len(cap_patches) == 0:
        >>>         # If no cap is found, query the image directly
        >>>         return image_patch.simple_query("Is the cap gold or white?")
        >>>     return cap_patches[0].best_text_match(["gold", "white"])
        """
        return best_text_match(self.cropped_image, option_list)

    def crop(self, left: int, lower: int, right: int, upper: int) -> "ImagePatch":
        """Returns a new ImagePatch cropped from the current ImagePatch.
        The coordinates index into this patch's own cropped image.
        Parameters
        -------
        left : int
            The leftmost pixel of the cropped image.
        lower : int
            The lowest pixel of the cropped image.
        right : int
            The rightmost pixel of the cropped image.
        upper : int
            The uppermost pixel of the cropped image.
        """
        return ImagePatch(self.cropped_image, left, lower, right, upper)


def best_image_match(list_patches: List[ImagePatch], content: List[str], return_index=False) -> Union[ImagePatch, int]:
    """Returns the patch most likely to contain the content.
    Parameters
    ----------
    list_patches : List[ImagePatch]
        the candidate patches to choose from
    content : List[str]
        the object of interest
    return_index : bool
        if True, returns the index of the patch most likely to contain the object

    Returns
    -------
    ImagePatch or int
        Patch most likely to contain the object, or its index in list_patches when return_index is True
    """
    # NOTE(review): this body calls best_image_match itself with the same
    # arguments, so as written it never terminates normally (it recurses until
    # RecursionError). Presumably the real implementation is bound to this name
    # elsewhere - TODO confirm and delegate to it explicitly.
    return best_image_match(list_patches, content, return_index)


def distance(patch_a: "ImagePatch", patch_b: "ImagePatch") -> float:
    """
    Returns the distance between the edges of two ImagePatches. If the patches overlap, it returns a negative distance
    corresponding to the negative intersection over union.

    Parameters
    ----------
    patch_a : ImagePatch
    patch_b : ImagePatch
        Only the left, lower, right and upper attributes of each patch are read.

    Returns
    -------
    float
        Euclidean gap between the two bounding boxes when they are apart; otherwise minus their
        intersection over union (0 when the boxes merely touch).

    Examples
    --------
    # Return the qux that is closest to the foo
    >>> def execute_command(image):
    >>>     image_patch = ImagePatch(image)
    >>>     qux_patches = image_patch.find('qux')
    >>>     foo_patches = image_patch.find('foo')
    >>>     foo_patch = foo_patches[0]
    >>>     qux_patches.sort(key=lambda x: distance(x, foo_patch))
    >>>     return qux_patches[0]
    """
    # Gap along each axis; zero when the boxes' projections on that axis overlap
    gap_x = max(0, patch_a.left - patch_b.right, patch_b.left - patch_a.right)
    gap_y = max(0, patch_a.lower - patch_b.upper, patch_b.lower - patch_a.upper)
    edge_distance = (gap_x ** 2 + gap_y ** 2) ** 0.5
    if edge_distance > 0:
        return edge_distance

    # The boxes touch or overlap: report the negative intersection over union
    overlap_width = min(patch_a.right, patch_b.right) - max(patch_a.left, patch_b.left)
    overlap_height = min(patch_a.upper, patch_b.upper) - max(patch_a.lower, patch_b.lower)
    intersection = max(0, overlap_width) * max(0, overlap_height)
    area_a = (patch_a.right - patch_a.left) * (patch_a.upper - patch_a.lower)
    area_b = (patch_b.right - patch_b.left) * (patch_b.upper - patch_b.lower)
    union = area_a + area_b - intersection
    if union <= 0:
        # Degenerate (zero-area) boxes: there is no meaningful overlap ratio
        return 0.0
    return -intersection / union


# Examples of using ImagePatch


# Given an image: What toy is wearing a shirt?
def execute_command(image) -> str:
    # "wearing" is not a relational verb, so each toy is inspected on its own
    whole_image = ImagePatch(image)
    question = "What toy is wearing a shirt?"
    candidates = whole_image.find("toy")
    if not candidates:
        # No toy detected: put the question to the full image instead
        return whole_image.simple_query(question)
    for candidate in candidates:
        if candidate.simple_query("Is the toy wearing a shirt?") == "yes":
            # The crop still shows the shirt, so the query keeps mentioning it
            return candidate.simple_query(question)
    # No toy was judged to wear a shirt; default to the first detection
    return candidates[0].simple_query(question)


# Given an image: Who is the man staring at?
def execute_command(image) -> str:
    # The answer is the object of a relational verb ("staring at"), so the
    # question is put to the whole image rather than to a crop
    return ImagePatch(image).simple_query("Who is the man staring at?")


# Given an image: Find more visible chair.
def execute_command(image) -> ImagePatch:
    # Return the chair
    image_patch = ImagePatch(image)
    chair_patches = image_patch.find("chair")
    if len(chair_patches) == 0:
        # No chair detected: fall back to the whole image instead of raising IndexError
        return image_patch
    # Remember: return the chair
    return chair_patches[0]


# Given an image: Find lamp on the bottom.
def execute_command(image) -> ImagePatch:
    # Return the lamp
    image_patch = ImagePatch(image)
    lamp_patches = image_patch.find("lamp")
    if len(lamp_patches) == 0:
        # No lamp detected: fall back to the whole image instead of raising IndexError
        lamp_patches = [image_patch]
    # Smaller vertical_center means closer to the bottom
    lamp_patches.sort(key=lambda lamp: lamp.vertical_center)
    # Remember: return the lamp
    return lamp_patches[0]  # Return the bottommost lamp


# Given a list of images: Does the pole that is near a building that is near a green sign and the pole that is near bushes that are near a green sign have the same material?
def execute_command(image_list) -> str:
    material_1 = None
    material_2 = None
    for image in image_list:
        image = ImagePatch(image)
        # find the building
        building_patches = image.find("building")
        for building_patch in building_patches:
            poles = building_patch.find("pole")
            signs = building_patch.find("sign")
            greensigns = [sign for sign in signs if sign.verify_property('sign', 'green')]
            if len(poles) > 0 and len(greensigns) > 0:
                material_1 = poles[0].simple_query("What is the material of the pole?")
        # find the bush
        bushes_patches = image.find("bushes")
        for bushes_patch in bushes_patches:
            poles = bushes_patch.find("pole")
            signs = bushes_patch.find("sign")
            greensigns = [sign for sign in signs if sign.verify_property('sign', 'green')]
            if len(poles) > 0 and len(greensigns) > 0:
                material_2 = poles[0].simple_query("What is the material of the pole?")
    if material_1 is None or material_2 is None:
        # At least one pole was never located; two missing answers must not
        # compare equal and produce a spurious "yes"
        return "no"
    return bool_to_yesno(material_1 == material_2)


# Given an image: Find middle kid.
def execute_command(image) -> ImagePatch:
    # The answer must be a kid patch
    whole_image = ImagePatch(image)
    kids = whole_image.find("kid")
    if not kids:
        # Nothing detected: treat the whole image as the only candidate
        kids = [whole_image]
    # Order the candidates from left to right and take the one in the middle
    kids.sort(key=lambda patch: patch.horizontal_center)
    middle_index = len(kids) // 2
    return kids[middle_index]


# Given an image: Is that blanket to the right of a pillow?
def execute_command(image) -> str:
    image_patch = ImagePatch(image)
    blanket_patches = image_patch.find("blanket")
    # Question assumes only one blanket patch
    if len(blanket_patches) == 0:
        # If no blanket is found, query the image directly
        return image_patch.simple_query("Is that blanket to the right of a pillow?")
    # The pillows do not depend on the blanket, so look them up once
    pillow_patches = image_patch.find("pillow")
    for blanket_patch in blanket_patches:
        for pillow_patch in pillow_patches:
            # "blanket to the right of a pillow" means the blanket's center lies
            # further right than the pillow's
            if blanket_patch.horizontal_center > pillow_patch.horizontal_center:
                return "yes"
    return "no"


# Given an image: How many people are there?
def execute_command(image) -> str:
    # The answer is simply the number of detected persons, as a string
    people = ImagePatch(image).find("person")
    head_count = len(people)
    return str(head_count)


# Given a list of images: Is the man that is wearing dark pants driving?
def execute_command(image_list) -> str:
    for raw_image in image_list:
        for man_patch in ImagePatch(raw_image).find("man"):
            pants_patches = man_patch.find("pants")
            # Only the first pants detection inside the man's crop is examined
            if pants_patches and pants_patches[0].verify_property("pants", "dark"):
                return man_patch.simple_query("Is this man driving?")
    # No man in dark pants was found: ask the full question about the first image
    return ImagePatch(image_list[0]).simple_query("Is the man that is wearing dark pants driving?")


# Given an image: Is there a backpack to the right of the man?
def execute_command(image) -> str:
    whole_image = ImagePatch(image)
    men = whole_image.find("man")
    if not men:
        # Without a detected man, leave the whole question to the image query
        return whole_image.simple_query("Is there a backpack to the right of the man?")
    # The question presumes a single man; the first detection is the reference
    reference_man = men[0]
    backpacks = whole_image.find("backpack")
    # With no backpacks the check below is vacuously false, giving "no"
    is_to_the_right = any(
        backpack.horizontal_center > reference_man.horizontal_center for backpack in backpacks
    )
    return "yes" if is_to_the_right else "no"


# Given a list of images: What is the pizza with red tomato on it on?
def execute_command(image_list) -> str:
    for raw_image in image_list:
        for pizza_patch in ImagePatch(raw_image).find("pizza"):
            # Every tomato on the pizza is checked (no early exit), as before
            red_flags = [
                tomato_patch.verify_property("tomato", "red")
                for tomato_patch in pizza_patch.find("tomato")
            ]
            if any(red_flags):
                return pizza_patch.simple_query("What is the pizza on?")
    # No pizza with a red tomato was found: ask the full question about the first image
    return ImagePatch(image_list[0]).simple_query("What is the pizza with red tomato on it on?")


# Given an image: Find chair to the right near the couch.
def execute_command(image) -> ImagePatch:
    # Return the chair
    image_patch = ImagePatch(image)
    chair_patches = image_patch.find("chair")
    if len(chair_patches) == 0:
        chair_patches = [image_patch]
    elif len(chair_patches) == 1:
        return chair_patches[0]
    chair_patches_right = [c for c in chair_patches if c.horizontal_center > image_patch.horizontal_center]
    if len(chair_patches_right) == 0:
        # No chair lies in the right half (always the case for the whole-image
        # fallback): consider every chair rather than indexing an empty list
        chair_patches_right = chair_patches
    couch_patches = image_patch.find("couch")
    if len(couch_patches) == 0:
        couch_patches = [image_patch]
    couch_patch = couch_patches[0]
    # Among the candidates, pick the one nearest to the couch
    chair_patches_right.sort(key=lambda c: distance(c, couch_patch))
    chair_patch = chair_patches_right[0]
    # Remember: return the chair
    return chair_patch


# Given an image: Are there bagels or lemons?
def execute_command(image) -> str:
    whole_image = ImagePatch(image)
    # Both lookups are always performed; either one being found is enough
    found = [whole_image.exists(name) for name in ("bagel", "lemon")]
    return bool_to_yesno(any(found))


# Given an image: In which part is the bread, the bottom or the top?
def execute_command(image) -> str:
    whole_image = ImagePatch(image)
    breads = whole_image.find("bread")
    if not breads:
        # No bread detected: let the image query answer the question as asked
        return whole_image.simple_query("In which part is the bread, the bottom or the top?")
    # The question presumes a single bread; compare it with the image's middle
    bread = breads[0]
    is_in_lower_half = bread.vertical_center < whole_image.vertical_center
    return "bottom" if is_in_lower_half else "top"


# Given an image: Find foo to bottom left.
def execute_command(image) -> ImagePatch:
    # Return the foo
    image_patch = ImagePatch(image)
    foo_patches = image_patch.find("foo")
    if len(foo_patches) == 0:
        # Nothing detected: min() over an empty list would raise, so fall back
        # to the whole image
        return image_patch
    lowermost_coordinate = min(patch.vertical_center for patch in foo_patches)
    # Keep the patches close to the bottom first (the lowermost patch always
    # qualifies, so this list is never empty) ...
    foo_patches_bottom = [patch for patch in foo_patches if patch.vertical_center - lowermost_coordinate < 100]
    # ... then take the leftmost among them
    foo_patches_bottom.sort(key=lambda foo: foo.horizontal_center)
    foo_patch = foo_patches_bottom[0]
    # Remember: return the foo
    return foo_patch


# Given an image: Find number 17.
def execute_command(image) -> ImagePatch:
    # Return the person
    image_patch = ImagePatch(image)
    person_patches = image_patch.find("person")
    if len(person_patches) == 0:
        # No person detected: fall back to the whole image instead of raising IndexError
        return image_patch
    for patch in person_patches:
        # The number is looked for inside each person's crop
        if patch.exists("17"):
            return patch
    # Remember: return the person
    return person_patches[0]


# Given a list of images: Is the statement true? There is at least 1 image with a brown dog that is near a bicycle and is wearing a collar.
def execute_command(image_list) -> str:
    for image in image_list:
        image = ImagePatch(image)
        dog_patches = image.find("dog")
        for dog in dog_patches:
            # The statement is about a *brown* dog, so the color is a required condition
            if not dog.verify_property("dog", "brown"):
                continue
            near_bicycle = dog.simple_query("Is the dog near a bicycle?")
            wearing_collar = dog.simple_query("Is the dog wearing a collar?")
            if near_bicycle == "yes" and wearing_collar == "yes":
                return 'yes'
    return 'no'


# Given an image: Find dog to the left of the post who is closest to girl wearing a shirt with text that says "I love you".
def execute_command(image) -> ImagePatch:
    # Return the dog
    image_patch = ImagePatch(image)
    shirt_patches = image_patch.find("shirt")
    if len(shirt_patches) == 0:
        shirt_patches = [image_patch]
    shirt_patch = best_image_match(list_patches=shirt_patches, content=["I love you shirt"])
    post_patches = image_patch.find("post")
    if len(post_patches) == 0:
        # No post detected: use the whole image as the reference
        post_patches = [image_patch]
    # The relevant post is the one nearest to the shirt
    post_patches.sort(key=lambda post: distance(post, shirt_patch))
    post_patch = post_patches[0]
    dog_patches = image_patch.find("dog")
    if len(dog_patches) == 0:
        # No dog detected: fall back to the whole image instead of raising IndexError
        return image_patch
    dogs_left_patch = [dog for dog in dog_patches if dog.left < post_patch.left]
    if len(dogs_left_patch) == 0:
        dogs_left_patch = dog_patches
    dogs_left_patch.sort(key=lambda dog: distance(dog, post_patch))
    dog_patch = dogs_left_patch[0]
    # Remember: return the dog
    return dog_patch


# Given an image: Find balloon on the right and second from the bottom.
def execute_command(image) -> ImagePatch:
    # Return the balloon
    image_patch = ImagePatch(image)
    balloon_patches = image_patch.find("balloon")
    if len(balloon_patches) == 0:
        # Nothing detected: the whole image is the only possible answer
        return image_patch
    if len(balloon_patches) == 1:
        return balloon_patches[0]
    # "On the right": keep the balloons whose center is close to the rightmost one
    rightmost_coordinate = max(patch.horizontal_center for patch in balloon_patches)
    balloon_patches_right = [patch for patch in balloon_patches if rightmost_coordinate - patch.horizontal_center < 100]
    # Order bottom to top (smaller vertical_center is lower)
    balloon_patches_right.sort(key=lambda p: p.vertical_center)
    if len(balloon_patches_right) < 2:
        # Only one balloon on the right, so there is no "second from the bottom";
        # return the one available instead of raising IndexError
        return balloon_patches_right[0]
    balloon_patch = balloon_patches_right[1]
    # Remember: return the balloon
    return balloon_patch


# Given an image: Find girl in white next to man in left.
def execute_command(image) -> ImagePatch:
    # Return the girl
    image_patch = ImagePatch(image)
    girl_patches = image_patch.find("girl")
    if len(girl_patches) == 0:
        # No girl detected: fall back to the whole image instead of raising IndexError
        return image_patch
    girl_in_white_patches = [g for g in girl_patches if g.verify_property("girl", "white clothing")]
    if len(girl_in_white_patches) == 0:
        girl_in_white_patches = girl_patches
    man_patches = image_patch.find("man")
    if len(man_patches) == 0:
        # No man to measure against: return the first candidate girl
        return girl_in_white_patches[0]
    man_patches.sort(key=lambda man: man.horizontal_center)
    leftmost_man = man_patches[0]  # First from the left
    girl_in_white_patches.sort(key=lambda girl: distance(girl, leftmost_man))
    girl_patch = girl_in_white_patches[0]
    # Remember: return the girl
    return girl_patch


# Given a list of images: Is the statement true? There is 1 table that is in front of woman that is wearing jacket.
def execute_command(image_list) -> str:
    for raw_image in image_list:
        women = ImagePatch(raw_image).find("woman")
        for woman_patch in women:
            wears_jacket = woman_patch.simple_query("Is the woman wearing jacket?") == "yes"
            if not wears_jacket:
                continue
            # The first woman found wearing a jacket decides the answer
            table_count = len(woman_patch.find("table"))
            return bool_to_yesno(table_count == 1)
    return 'no'


# Given an image: Find top left.
def execute_command(image) -> ImagePatch:
    # Return the person
    image_patch = ImagePatch(image)
    # Figure out what thing the caption is referring to. We need a subject for every caption
    persons = image_patch.find("person")
    if len(persons) == 0:
        # Nothing detected: max() over an empty list would raise, so fall back
        # to the whole image
        return image_patch
    top_all_objects = max(obj.vertical_center for obj in persons)
    # Select objects that are close to the top
    # We do this because the caption is asking first about vertical and then about horizontal
    # (the topmost person always qualifies, so this list is never empty)
    persons_top = [p for p in persons if top_all_objects - p.vertical_center < 100]
    # And after that, obtain the leftmost object among them
    persons_top.sort(key=lambda obj: obj.horizontal_center)
    person_leftmost = persons_top[0]
    # Remember: return the person
    return person_leftmost


# Given an image: What type of weather do you see in the photograph?
def execute_command(image) -> str:
    # A question about the scene as a whole goes straight to the full image
    return ImagePatch(image).simple_query("What type of weather do you see in the photograph?")


# Given an image: How many orange life vests can be seen?
def execute_command(image) -> str:
    vests = ImagePatch(image).find("life vest")
    # Keep only the detections verified as orange, then count them
    orange_vests = [vest for vest in vests if vest.verify_property('life vest', 'orange')]
    return str(len(orange_vests))


# Given an image: What is behind the pole?
def execute_command(image) -> str:
    # The question is built on a spatial relation ("behind"), so it is asked
    # of the whole image rather than of a crop
    whole_image = ImagePatch(image)
    answer = whole_image.simple_query("What is behind the pole?")
    return answer


# Given an image: Find second to top flower.
def execute_command(image) -> ImagePatch:
    # Return the flower
    image_patch = ImagePatch(image)
    flower_patches = image_patch.find("flower")
    if len(flower_patches) == 0:
        # No flower detected: fall back to the whole image instead of raising IndexError
        return image_patch
    # Order bottom to top (larger vertical_center is higher)
    flower_patches.sort(key=lambda flower: flower.vertical_center)
    if len(flower_patches) < 2:
        # A single flower has no "second to top"; return the one available
        return flower_patches[-1]
    flower_patch = flower_patches[-2]
    # Remember: return the flower
    return flower_patch


# Given an image: Find back.
def execute_command(image) -> ImagePatch:
    # Return the person
    image_patch = ImagePatch(image)
    person_patches = image_patch.find("person")
    if len(person_patches) == 0:
        # No person detected: fall back to the whole image instead of raising IndexError
        return image_patch
    # "Back" is the person with the largest median depth
    person_patches.sort(key=lambda person: person.compute_depth())
    person_patch = person_patches[-1]
    # Remember: return the person
    return person_patch


# Given an image: Find chair at the front.
def execute_command(image) -> ImagePatch:
    # Return the chair
    image_patch = ImagePatch(image)
    chair_patches = image_patch.find("chair")
    if len(chair_patches) == 0:
        # No chair detected: fall back to the whole image instead of raising IndexError
        return image_patch
    # "Front" is the chair with the smallest median depth
    chair_patches.sort(key=lambda chair: chair.compute_depth())
    chair_patch = chair_patches[0]
    # Remember: return the chair
    return chair_patch


# Given an image: Find white and yellow pants.
def execute_command(image) -> ImagePatch:
    # Return the person
    image_patch = ImagePatch(image)
    # Clothing always requires returning the person
    person_patches = image_patch.find("person")
    if len(person_patches) == 0:
        # No person detected: match against the whole image rather than an empty list
        person_patches = [image_patch]
    person_patch = best_image_match(person_patches, ["white pants", "yellow pants"])
    # Remember: return the person
    return person_patch


# Given an image: Find cow facing the camera.
def execute_command(image) -> ImagePatch:
    # The answer must be a cow patch
    whole_image = ImagePatch(image)
    candidates = whole_image.find("cow")
    if not candidates:
        # Nothing detected: the whole image becomes the only candidate
        candidates = [whole_image]
    # Let the image/text matcher choose the candidate that fits the description
    return best_image_match(list_patches=candidates, content=["cow facing the camera"])


# Given a list of images: Is the statement true? There is 1 image that contains exactly 3 blue papers.
def execute_command(image_list) -> str:
    matching_images = 0
    for raw_image in image_list:
        papers = ImagePatch(raw_image).find("paper")
        # Count the papers in this image that are verified as blue
        blue_count = sum(1 for paper in papers if paper.verify_property("paper", "blue"))
        if blue_count == 3:
            matching_images += 1
    # The statement holds only if exactly one image has exactly three blue papers
    return bool_to_yesno(matching_images == 1)


# Given an image: Find black car just under stop sign.
def execute_command(image) -> ImagePatch:
    # Return the car
    image_patch = ImagePatch(image)
    stop_sign_patches = image_patch.find("stop sign")
    if len(stop_sign_patches) == 0:
        stop_sign_patches = [image_patch]
    stop_sign_patch = stop_sign_patches[0]
    car_patches = image_patch.find("black car")
    if len(car_patches) == 0:
        # No black car detected: fall back to the whole image instead of raising IndexError
        return image_patch
    car_under_stop = [car for car in car_patches if car.upper < stop_sign_patch.upper]
    if len(car_under_stop) == 0:
        # No car qualifies as "under" the sign: consider every black car
        car_under_stop = car_patches
    # Find car that is closest to the stop sign. The absolute difference is
    # needed: cars below the sign have a negative signed difference, so sorting
    # on the signed value would put the farthest car first
    car_under_stop.sort(key=lambda car: abs(car.vertical_center - stop_sign_patch.vertical_center))
    # Remember: return the car
    return car_under_stop[0]


# Given a list of images: Is there either a standing man that is holding a cell phone or a sitting man that is holding a cell phone?
def execute_command(image_list) -> str:
    for raw_image in image_list:
        for man_patch in ImagePatch(raw_image).find("man"):
            if man_patch.simple_query("Is this man holding a cell phone?") != "yes":
                continue
            # Either posture satisfies the question; sitting is asked first and
            # standing only if the sitting check did not succeed
            for posture_question in ("Is this man sitting?", "Is this man standing?"):
                if man_patch.simple_query(posture_question) == "yes":
                    return 'yes'
    return 'no'


# Given an image: How many people are running while looking at their cell phone?
def execute_command(image) -> str:
    whole_image = ImagePatch(image)
    people = whole_image.find("person")
    if not people:
        # Nobody detected: let the image query answer the question as asked
        return whole_image.simple_query("How many people are running while looking at their cell phone?")
    matches = 0
    for person in people:
        # Both conditions must hold; the phone is only asked about for runners
        if person.simple_query("Is the person running?") != "yes":
            continue
        if person.simple_query("Is the person looking at cell phone?") == "yes":
            matches += 1
    return str(matches)


# Given a list of images: Does the car that is on a highway and the car that is on a street have the same color?
def execute_command(image_list) -> str:
    color_1 = None
    color_2 = None
    for image in image_list:
        image = ImagePatch(image)
        car_patches = image.find("car")
        for car_patch in car_patches:
            if car_patch.simple_query("Is the car on the highway?") == "yes":
                color_1 = car_patch.simple_query("What is the color of the car?")
            elif car_patch.simple_query("Is the car on a street?") == "yes":
                color_2 = car_patch.simple_query("What is the color of the car?")
    if color_1 is None or color_2 is None:
        # At least one of the two cars was never located; two missing answers
        # must not compare equal and produce a spurious "yes"
        return "no"
    return bool_to_yesno(color_1 == color_2)


# Given a list of images: Is the statement true? There are 3 magazine that are on table.
def execute_command(image_list) -> str:
    on_table_total = 0
    # Tally, across all images, the magazines judged to be on a table
    for raw_image in image_list:
        for magazine in ImagePatch(raw_image).find("magazine"):
            if magazine.simple_query("Is the magazine on a table?") == "yes":
                on_table_total += 1
    return bool_to_yesno(on_table_total == 3)


# INSERT_QUERY_HERE