{ "1034419625": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The man is dribbling a basketball.\nOption 2: A man is dunking a basketball.", "gt-answer": "Option 2: A man is dunking a basketball.", "video-llm-prediction": "Option 2: A man is dunking a basketball.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A man is dunking a basketball.\nSentence B: A video showing a person passing a basketball.", "gt-answer": "Sentence A: A man is dunking a basketball.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A man is dunking a basketball.\nCaption B: The man is shooting a basketball.", "gt-answer": "Caption A: A man is dunking a basketball.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ], "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is moving from left to right.\nSentence B: The man is moving from right to left.", "gt-answer": "Sentence A: The man is moving from left to right.", "video-llm-prediction": "Sentence B: The man is moving from right to left.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: From the camera's point of view, the man is moving from right to left.\nOption 2: The man is moving from left to right.", "gt-answer": "Option 2: The man is moving from left to right.", "video-llm-prediction": "Option 1: From the camera's point of view, the man is moving from right to left.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The man is moving from right to left.\nCaption B: The man is moving from left to right.", "gt-answer": "Caption B: The man is moving from left to right.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1034419625_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is moving from left to right.\nSentence B: The man is moving from right to left.", "gt-answer": "Sentence B: The man is moving from right to left.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: From the camera's point of view, the man is moving from right to left.\nOption 2: The man is moving from left to right.", "gt-answer": "Option 1: From the camera's point of view, the man is moving from right to left.", "video-llm-prediction": "Option 1: From the camera's point of view, the man is moving from right to left.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The man is moving from right to left.\nCaption B: The man is moving from left to right.", "gt-answer": "Caption A: The man is moving from right to left.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1094590221": { "action": [ { "question": "Which caption matches the video better?\nCaption A: A man is shooting basketball in the video.\nCaption B: A man is dunking basketball in the video.", "gt-answer": "Caption A: A man is shooting basketball in the video.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is spinning basketball.\nSentence B: A man is shooting basketball in the video.", "gt-answer": "Sentence B: A man is shooting basketball in the video.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A man is shooting basketball in the video.\nOption 2: A person is dribbling basketball in the video.", "gt-answer": "Option 1: A man is shooting basketball in the video.", "video-llm-prediction": "Option 1: A man is shooting basketball in the video.", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The basketball is moving from right to left.\nSentence B: The basketball is moving from left to right.", "gt-answer": "Sentence A: The basketball is moving from right to left.", "video-llm-prediction": "Sentence B: The basketball is moving from left to right.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The basketball is moving from right to left.\nOption 2: The basketball is moving from left to right.", "gt-answer": "Option 1: The basketball is moving from right to left.", "video-llm-prediction": "Option 2: The basketball is moving from left to right.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The basketball is moving from right to left.\nCaption B: The basketball is moving from left to right.", "gt-answer": "Caption A: The basketball is moving from right to left.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1094590221_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The basketball is moving from right to left.\nSentence B: The basketball is moving from left to right.", "gt-answer": "Sentence B: The basketball is moving from left to right.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The basketball is moving from right to left.\nOption 2: The basketball is moving from left to right.", "gt-answer": "Option 2: The basketball is moving from left to right.", "video-llm-prediction": "Option 1: The basketball is moving from right to left.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The basketball is moving from right to left.\nCaption B: The basketball is moving from left to right.", "gt-answer": "Caption B: The basketball is moving from left to right.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1098982673": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is eating and singing.\nOption 2: The woman is dancing and singing.", "gt-answer": "Option 2: The woman is dancing and singing.", "video-llm-prediction": "Option 2: The woman is dancing and singing.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The woman is sleeping and singing.\nCaption B: The woman is dancing and singing.", "gt-answer": "Caption B: The woman is dancing and singing.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is running and singing.\nSentence B: The woman is dancing and singing.", "gt-answer": "Sentence B: The woman is dancing and singing.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is playing at normal speed.\nSentence B: The video is playing in slow motion.", "gt-answer": "Sentence A: The video is playing at normal speed.", "video-llm-prediction": "Sentence B: The video is playing in slow motion.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The video is fast forwarded.\nCaption B: The video is playing at normal speed.", "gt-answer": "Caption B: The video is playing at normal speed.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing at normal speed.\nOption 2: The video is being played in reverse.", "gt-answer": "Option 1: The video is playing at normal speed.", "video-llm-prediction": "Option 1: The video is playing at normal speed.", "match_success": true, "rating": 1 } ] }, "1111657885": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The people are skiing.\nSentence B: The people are ice skating.", "gt-answer": "Sentence B: The people are ice skating.", "video-llm-prediction": "Sentence B: The people are ice skating.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The people are ice skating.\nCaption B: The people are swimming.", "gt-answer": "Caption A: The people are ice skating.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The people are ice skating.\nOption 2: A sport being played by the people is tennis.", "gt-answer": "Option 1: The people are ice skating.", "video-llm-prediction": "Option 1: The people are ice skating.", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which caption matches the video better?\nCaption A: The video is presented at a rapid pace.\nCaption B: The video has a slow motion tempo.", "gt-answer": "Caption B: The video has a slow motion tempo.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The entire video is in normal speed.\nOption 2: The video has a slow motion tempo.", "gt-answer": "Option 2: The video has a slow motion tempo.", "video-llm-prediction": "Option 2: The video has a slow motion tempo.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is paused.\nSentence B: The video has a slow motion tempo.", "gt-answer": "Sentence B: The video has a slow motion tempo.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which caption matches the video better?\nCaption A: The people are parallel to the screen.\nCaption B: The people are skating towards the camera.", "gt-answer": "Caption B: The people are skating towards the camera.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The people are skating away from the camera.\nOption 2: The people are skating towards the camera.", "gt-answer": "Option 2: The people are skating towards the camera.", "video-llm-prediction": "Option 2: The people are skating towards the camera.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The people are skating away from the camera.\nSentence B: The people are skating towards the camera.", "gt-answer": "Sentence B: The people are skating towards the camera.", "video-llm-prediction": "Sentence A: The people are skating away from the camera.", "match_success": true, "rating": 0 } ] }, "1111657885_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The people are parallel to the screen.\nCaption B: The people are skating towards the camera.", "gt-answer": "Caption A: The people are parallel to the screen.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The people are skating away from the camera.\nOption 2: The people are skating towards the camera.", "gt-answer": "Option 1: The people are skating away from the camera.", "video-llm-prediction": "Option 2: The people are skating towards the camera.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The people are skating away from the camera.\nSentence B: The people are skating towards the camera.", "gt-answer": "Sentence A: The people are skating away from the camera.", "video-llm-prediction": "Sentence B: The people are skating towards the camera.", "match_success": true, "rating": 0 } ] }, "1073873933": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: A video is showing a fight.\nOption 2: A person is eating.", "gt-answer": "Option 1: A video is showing a fight.", "video-llm-prediction": "Option 1: A video is showing a fight.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A video is showing a fight.\nCaption B: The monkey is swimming.", "gt-answer": "Caption A: A video is showing a fight.", "video-llm-prediction": "A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A video is showing a fight.\nSentence B: Someone is running.", "gt-answer": "Sentence A: A video is showing a fight.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1089810425": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The people are dancing.\nSentence B: The people are cooking.", "gt-answer": "Sentence A: The people are dancing.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The people are dancing.\nCaption B: A video showing people sleeping.", "gt-answer": "Caption A: The people are dancing.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The people are painting.\nOption 2: The people are dancing.", "gt-answer": "Option 2: The people are dancing.", "video-llm-prediction": "Option 2: The people are dancing.", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The speed of the video is normal.\nSentence B: The video is in slow motion.", "gt-answer": "Sentence A: The speed of the video is normal.", "video-llm-prediction": "Sentence B: The video is in slow motion.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video is in time-lapse.\nOption 2: The speed of the video is normal.", "gt-answer": "Option 2: The speed of the video is normal.", "video-llm-prediction": "Option 1: The video is in time-lapse.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The speed of the video is normal.\nCaption B: The tempo of the entire video is fast.", "gt-answer": "Caption A: The speed of the video is normal.", "video-llm-prediction": "Caption B: The tempo of the entire video is fast.", "match_success": true, "rating": 0 } ], "direction": [ { "question": "Which caption matches the video better?\nCaption A: The video is panning from left to right.\nCaption B: The video is panning from right to left.", "gt-answer": "Caption B: The video is panning from right to left.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The panning in the video is from left to right.\nSentence B: The panning in the video is from right to left.", "gt-answer": "Sentence B: The panning in the video is from right to left.", "video-llm-prediction": "Sentence B: The panning in the video is from right to left.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The panning in the video is from left to right.\nOption 2: The panning in the video is from right to left.", "gt-answer": "Option 2: The panning in the video is from right to left.", "video-llm-prediction": "Option 2: The panning in the video is from right to left.", "match_success": true, "rating": 1 } ] }, "1089810425_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The video is panning from left to right.\nCaption B: The video is panning from right to left.", "gt-answer": "Caption A: The video is panning from left to right.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The panning in the video is from left to right.\nSentence B: The panning in the video is from right to left.", "gt-answer": "Sentence A: The panning in the video is from left to right.", "video-llm-prediction": "Sentence B: The panning in the video is from right to left.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The panning in the video is from left to right.\nOption 2: The panning in the video is from right to left.", "gt-answer": "Option 1: The panning in the video is from left to right.", "video-llm-prediction": "Option 2: The panning in the video is from right to left.", "match_success": true, "rating": 0 } ] }, "1077507593": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The people in the video are cooking.\nSentence B: The people in the video are sleeping.", "gt-answer": "Sentence A: The people in the video are cooking.", "video-llm-prediction": "Sentence A: The people in the video are cooking.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The people in the video are cooking.\nCaption B: A person is eating in the video.", "gt-answer": "Caption A: The people in the video are cooking.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The people in the video are cooking.\nOption 2: There is dancing happening in the video.", "gt-answer": "Option 1: The people in the video are cooking.", "video-llm-prediction": "Option 1: The people in the video are cooking.", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is in slow motion.\nSentence B: The video is at normal speed.", "gt-answer": "Sentence A: The video is in slow motion.", "video-llm-prediction": "Sentence A: The video is in slow motion.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is in slow motion.\nCaption B: The video is in time-lapse.", "gt-answer": "Caption A: The video is in slow motion.", "video-llm-prediction": "Caption A: The video is in slow motion.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video is in slow motion.\nOption 2: The events in the video are happening at a fast rate.", "gt-answer": "Option 1: The video is in slow motion.", "video-llm-prediction": "Option 1: The video is in slow motion.", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The skillet is smoking, and then it is burning in fire.\nSentence B: The skillet is burning in fire, and then it is smoking.", "gt-answer": "Sentence A: The skillet is smoking, and then it is burning in fire.", "video-llm-prediction": "Sentence A: The skillet is smoking, and then it is burning in fire.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The skillet is smoking after burning in fire.\nOption 2: The skillet is smoking, and then it is burning in fire.", "gt-answer": "Option 2: The skillet is smoking, and then it is burning in fire.", "video-llm-prediction": "Option 1: The skillet is smoking after burning in fire.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The skillet is neither smoking nor burning.\nCaption B: The skillet is smoking, and then it is burning in fire.", "gt-answer": "Caption B: The skillet is smoking, and then it is burning in fire.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1107116885": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The man is swimming.\nOption 2: The man is shooting an arrow.", "gt-answer": "Option 2: The man is shooting an arrow.", "video-llm-prediction": "Option 2: The man is shooting an arrow.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A person is running.\nCaption B: The man is shooting an arrow.", "gt-answer": "Caption B: The man is shooting an arrow.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: He is playing soccer.\nSentence B: The man is shooting an arrow.", "gt-answer": "Sentence B: The man is shooting an arrow.", "video-llm-prediction": "B", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing with time-lapse effect.\nOption 2: The video is playing at a normal speed.", "gt-answer": "Option 2: The video is playing at a normal speed.", "video-llm-prediction": "Option 1: The video is playing with time-lapse effect.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The pace of the video is slow motion.\nCaption B: The video is playing at a normal speed.", "gt-answer": "Caption B: The video is playing at a normal speed.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is playing at a normal speed.\nSentence B: The video is playing at a rapid speed.", "gt-answer": "Sentence A: The video is playing at a normal speed.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1059718130": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The video is playing in time-lapse.\nCaption B: The speed of the video is slow motion.", "gt-answer": "Caption A: The video is playing in time-lapse.", "video-llm-prediction": "Caption A: The video is playing in time-lapse.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is playing in stop motion.\nSentence B: The video is playing in time-lapse.", "gt-answer": "Sentence B: The video is playing in time-lapse.", "video-llm-prediction": "Sentence B: The video is playing in time-lapse.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing in time-lapse.\nOption 2: The pace at which the video is playing is fast-forward.", "gt-answer": "Option 1: The video is playing in time-lapse.", "video-llm-prediction": "Option 1: The video is playing in time-lapse.", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The sun is rising.\nSentence B: The sun is setting.", "gt-answer": "Sentence A: The sun is rising.", "video-llm-prediction": "Sentence B: The sun is setting.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The sun is falling.\nCaption B: The sun is rising.", "gt-answer": "Caption B: The sun is rising.", "video-llm-prediction": "Caption B: The sun is rising.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The sun is moving downwards.\nOption 2: The sun is moving upwards.", "gt-answer": "Option 2: The sun is moving upwards.", "video-llm-prediction": "Option 1: The sun is moving downwards.", "match_success": true, "rating": 0 } ], "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The video's brightness is turning brighter.\nCaption B: The video's brightness is getting darker.", "gt-answer": "Caption A: The video's brightness is turning brighter.", "video-llm-prediction": "Caption A: The video's brightness is turning brighter.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video's brightness is turning brighter.\nSentence B: The attribute of brightness is remaining stable.", "gt-answer": "Sentence A: The video's brightness is turning brighter.", "video-llm-prediction": "Sentence A: The video's brightness is turning brighter.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video's brightness is turning brighter.\nOption 2: The brightness is decreasing.", "gt-answer": "Option 1: The video's brightness is turning brighter.", "video-llm-prediction": "Option 1: The video's brightness is turning brighter.", "match_success": true, "rating": 1 } ] }, "1059718130_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The sun is rising.\nSentence B: The sun is setting.", "gt-answer": "Sentence B: The sun is setting.", "video-llm-prediction": "Sentence B: The sun is setting.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The sun is falling.\nCaption B: The sun is rising.", "gt-answer": "Caption A: The sun is falling.", "video-llm-prediction": "Caption B: The sun is rising.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The sun is moving downwards.\nOption 2: The sun is moving upwards.", "gt-answer": "Option 1: The sun is moving downwards.", "video-llm-prediction": "Option 1: The sun is moving downwards.", "match_success": true, "rating": 1 } ], "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The video's brightness is turning brighter.\nCaption B: The video's brightness is getting darker.", "gt-answer": "Caption B: The video's brightness is getting darker.", "video-llm-prediction": "Caption B: The video's brightness is getting darker.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video's brightness is turning brighter.\nSentence B: The attribute of brightness is remaining stable.", "gt-answer": "Sentence B: The attribute of brightness is remaining stable.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video's brightness is turning brighter.\nOption 2: The brightness is decreasing.", "gt-answer": "Option 2: The brightness is decreasing.", "video-llm-prediction": "Option 2: The brightness is decreasing.", "match_success": true, "rating": 1 } ] }, "1083845854": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is playing at time-lapse speed.\nSentence B: The video is playing at slow motion speed.", "gt-answer": "Sentence A: The video is playing at time-lapse speed.", "video-llm-prediction": "Sentence A: The video is playing at time-lapse speed.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is playing at normal speed.\nCaption B: The video is playing at time-lapse speed.", "gt-answer": "Caption B: The video is playing at time-lapse speed.", "video-llm-prediction": "Caption B: The video is playing at time-lapse speed.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing at time-lapse speed.\nOption 2: The video is playing in fast-forward mode.", "gt-answer": "Option 1: The video is playing at time-lapse speed.", "video-llm-prediction": "Option 1: The video is playing at time-lapse speed.", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The clock hands are moving clockwise.\nOption 2: The clock hands are moving counterclockwise.", "gt-answer": "Option 1: The clock hands are moving clockwise.", "video-llm-prediction": "Option 1: The clock hands are moving clockwise.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The clock hands are moving counterclockwise.\nSentence B: The clock hands are moving clockwise.", "gt-answer": "Sentence B: The clock hands are moving clockwise.", "video-llm-prediction": "Sentence A: The clock hands are moving counterclockwise.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The clock hands are rotating counterclockwise.\nCaption B: The clock hands are rotating clockwise.", "gt-answer": "Caption B: The clock hands are rotating clockwise.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "1050618460": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The speed of the entire video is normal speed.\nSentence B: The speed of the entire video is time-lapse.", "gt-answer": "Sentence B: The speed of the entire video is time-lapse.", "video-llm-prediction": "Sentence B: The speed of the entire video is time-lapse.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The speed of the entire video is slow motion.\nCaption B: The speed of the entire video is time-lapse.", "gt-answer": "Caption B: The speed of the entire video is time-lapse.", "video-llm-prediction": "Caption B: The speed of the entire video is time-lapse.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The speed of the entire video is time-lapse.\nOption 2: The pace of the entire video is slow-paced.", "gt-answer": "Option 1: The speed of the entire video is time-lapse.", "video-llm-prediction": "Option 1: The speed of the entire video is time-lapse.", "match_success": true, "rating": 1 } ], "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The flowers are withering.\nSentence B: The flowers are turning into full bloom.", "gt-answer": "Sentence B: The flowers are turning into full bloom.", "video-llm-prediction": "Sentence B: The flowers are turning into full bloom.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The flowers are turning into full bloom.\nCaption B: The flowers are shrinking and wilting.", "gt-answer": "Caption A: The flowers are turning into full bloom.", "video-llm-prediction": "Caption A: The flowers are turning into full bloom.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The flowers are turning into full bloom.\nOption 2: The flowers are turning into buds.", "gt-answer": "Option 1: The flowers are turning into full bloom.", "video-llm-prediction": "Option 1: The flowers are turning into full bloom.", "match_success": true, "rating": 1 } ] }, "1083113449": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The speed of the video is time-lapse.\nOption 2: The speed of the video is slow motion.", "gt-answer": "Option 1: The speed of the video is time-lapse.", "video-llm-prediction": "Option 1: The speed of the video is time-lapse.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The speed of the video is time-lapse.\nSentence B: The video is in normal speed.", "gt-answer": "Sentence A: The speed of the video is time-lapse.", "video-llm-prediction": "Sentence A: The speed of the video is time-lapse.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The speed of the video is rewind.\nCaption B: The speed of the video is time-lapse.", "gt-answer": "Caption B: The speed of the video is time-lapse.", "video-llm-prediction": "Caption B: The speed of the video is time-lapse.", "match_success": true, "rating": 1 } ], "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The sky is maintaining its brightness.\nCaption B: The sky is getting darker.", "gt-answer": "Caption B: The sky is getting darker.", "video-llm-prediction": "Caption B: The sky is getting darker.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The sky is lightening up.\nOption 2: The sky is getting darker.", "gt-answer": "Option 2: The sky is getting darker.", "video-llm-prediction": "Option 2: The sky is getting darker.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The sky is getting darker.\nSentence B: The sky is brightening.", "gt-answer": "Sentence A: The sky is getting darker.", "video-llm-prediction": "Sentence A: The sky is getting darker.", "match_success": true, "rating": 1 } ] }, "1091143117": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is in normal speed.\nOption 2: The video is in time-lapse.", "gt-answer": "Option 1: The video is in normal speed.", "video-llm-prediction": "Option 1: The video is in normal speed.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is in slow motion.\nSentence B: The video is in normal speed.", "gt-answer": "Sentence B: The video is in normal speed.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The video is in normal speed.\nCaption B: The video is moving backwards.", "gt-answer": "Caption A: The video is in normal speed.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ], "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The eye is changing from closed to open.\nOption 2: The eye is blinking rapidly.", "gt-answer": "Option 1: The eye is changing from closed to open.", "video-llm-prediction": "Option 1: The eye is changing from closed to open.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The eye is changing from closed to open.\nCaption B: The eye is changing from open to closed.", "gt-answer": "Caption A: The eye is changing from closed to open.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The eye is changing from closed to open.\nSentence B: The eye is changing from open to closed.", "gt-answer": "Sentence A: The eye is changing from closed to open.", "video-llm-prediction": "Sentence A: The eye is changing from closed to open.", "match_success": true, "rating": 1 } ] }, "1100329207": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The sheep is sleeping.\nCaption B: A sheep is running.", "gt-answer": "Caption B: A sheep is running.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The sheep is eating grass.\nSentence B: A sheep is running.", "gt-answer": "Sentence B: A sheep is running.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A sheep is running.\nOption 2: The sheep is swimming.", "gt-answer": "Option 1: A sheep is running.", "video-llm-prediction": "Option 1: A sheep is running.", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The camera view is moving right.\nOption 2: The camera view is moving forward.", "gt-answer": "Option 2: The camera view is moving forward.", "video-llm-prediction": "Option 2: The camera view is moving forward.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The camera view is moving forward.\nCaption B: The camera view is moving backward.", "gt-answer": "Caption A: The camera view is moving forward.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera view is moving forward.\nSentence B: The camera view is moving left.", "gt-answer": "Sentence A: The camera view is moving forward.", "video-llm-prediction": "Sentence A: The camera view is moving forward.", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing at normal speed.\nOption 2: The video is fast-forwarded.", "gt-answer": "Option 1: The video is playing at normal speed.", "video-llm-prediction": "Option 1: The video is playing at normal speed.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is playing at normal speed.\nSentence B: The video is playing in reverse speed.", "gt-answer": "Sentence A: The video is playing at normal speed.", "video-llm-prediction": "Sentence A: The video is playing at normal speed.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is in super slow motion.\nCaption B: The video is playing at normal speed.", "gt-answer": "Caption B: The video is playing at normal speed.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1089201973": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The people in the video are spinning football.\nOption 2: The people in the video are kicking a football.", "gt-answer": "Option 2: The people in the video are kicking a football.", "video-llm-prediction": "Option 2: The people in the video are kicking a football.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The people in the video are kicking a football.\nSentence B: The people in the video are stopping football.", "gt-answer": "Sentence A: The people in the video are kicking a football.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The people in the video are throwing football.\nCaption B: The people in the video are kicking a football.", "gt-answer": "Caption B: The people in the video are kicking a football.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is playing at normal speed.\nSentence B: The video is playing in slow motion.", "gt-answer": "Sentence B: The video is playing in slow motion.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is time-lapse.\nCaption B: The video is playing in slow motion.", "gt-answer": "Caption B: The video is playing in slow motion.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing in rapid speed.\nOption 2: The video is playing in slow motion.", "gt-answer": "Option 2: The video is playing in slow motion.", "video-llm-prediction": "Option 2: The video is playing in slow motion.", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The football is moving from right to left.\nOption 2: The football is moving from left to right in the video.", "gt-answer": "Option 2: The football is moving from left to right in the video.", "video-llm-prediction": "Option 2: The football is moving from left to right in the video.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The football is moving from left to right in the video.\nSentence B: The football is moving from right to left in the video.", "gt-answer": "Sentence A: The football is moving from left to right in the video.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The football is moving upwards.\nCaption B: The football is moving from left to right in the video.", "gt-answer": "Caption B: The football is moving from left to right in the video.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1034736653": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman athletes are running.\nSentence B: The woman athletes are cycling.", "gt-answer": "Sentence A: The woman athletes are running.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The woman athletes are running.\nOption 2: The woman athletes are swimming.", "gt-answer": "Option 1: The woman athletes are running.", "video-llm-prediction": "Option 1: The woman athletes are running.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The woman athletes are running.\nCaption B: The woman athletes are dancing.", "gt-answer": "Caption A: The woman athletes are running.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman closest to the camera is moving at a normal speed.\nSentence B: The woman closest to the camera is moving at the fastest speed.", "gt-answer": "Sentence B: The woman closest to the camera is moving at the fastest speed.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: Compared to others, the woman closest to the camera is moving at the same speed.\nCaption B: The woman closest to the camera is moving at the fastest speed.", "gt-answer": "Caption B: The woman closest to the camera is moving at the fastest speed.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The woman closest to the camera is moving at the fastest speed.\nOption 2: The woman closest to the camera is moving at the slowest speed.", "gt-answer": "Option 1: The woman closest to the camera is moving at the fastest speed.", "video-llm-prediction": "Option 1: The woman closest to the camera is moving at the fastest speed.", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The woman athletes are running from right to left.\nOption 2: The woman athletes are running from left to right.", "gt-answer": "Option 2: The woman athletes are running from left to right.", "video-llm-prediction": "Option 2: The woman athletes are running from left to right.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman athletes are running from left to right.\nSentence B: The woman athletes are running from right to left.", "gt-answer": "Sentence A: The woman athletes are running from left to right.", "video-llm-prediction": "Sentence A: The woman athletes are running from left to right.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The woman athletes are running from top to bottom.\nCaption B: The woman athletes are running from left to right.", "gt-answer": "Caption B: The woman athletes are running from left to right.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1098153561": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing at a regular pace.\nOption 2: The video is playing in fast forward.", "gt-answer": "Option 2: The video is playing in fast forward.", "video-llm-prediction": "Option 1: The video is playing at a regular pace.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The video is playing in fast forward.\nCaption B: The speed of the video is normal speed.", "gt-answer": "Caption A: The video is playing in fast forward.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is in slow motion.\nSentence B: The video is playing in fast forward.", "gt-answer": "Sentence B: The video is playing in fast forward.", "video-llm-prediction": "Sentence A: The video is in slow motion.", "match_success": true, "rating": 0 } ], "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The leaves are changing color from green to yellow.\nCaption B: The leaves are changing color from yellow to green.", "gt-answer": "Caption A: The leaves are changing color from green to yellow.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The leaves are changing color from yellow to green.\nSentence B: The leaves are changing color from green to yellow.", "gt-answer": "Sentence B: The leaves are changing color from green to yellow.", "video-llm-prediction": "Sentence A: The leaves are changing color from yellow to green.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The leaves are changing color from yellow to green.\nOption 2: The leaves are changing color from green to yellow.", "gt-answer": "Option 2: The leaves are changing color from green to yellow.", "video-llm-prediction": "Option 1: The leaves are changing color from yellow to green.", "match_success": true, "rating": 0 } ] }, "1084153195": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The dolphins are diving.\nCaption B: The dolphins are swimming.", "gt-answer": "Caption B: The dolphins are swimming.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The dolphins are swimming.\nOption 2: The dolphins are sleeping.", "gt-answer": "Option 1: The dolphins are swimming.", "video-llm-prediction": "Option 1: The dolphins are swimming.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The dolphins are jumping.\nSentence B: The dolphins are swimming.", "gt-answer": "Sentence B: The dolphins are swimming.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which caption matches the video better?\nCaption A: The dolphins are hovering in one place.\nCaption B: The dolphins are swimming towards the shore.", "gt-answer": "Caption B: The dolphins are swimming towards the shore.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The dolphins are swimming away from the shore.\nSentence B: The dolphins are swimming towards the shore.", "gt-answer": "Sentence B: The dolphins are swimming towards the shore.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The dolphins are swimming towards the shore.\nOption 2: The dolphins are swimming parallel to the shore.", "gt-answer": "Option 1: The dolphins are swimming towards the shore.", "video-llm-prediction": "Option 2: The dolphins are swimming parallel to the shore.", "match_success": true, "rating": 0 } ], "speed": [ { "question": "Which caption matches the video better?\nCaption A: The rate of movement in the video is normal speed.\nCaption B: The rate of movement in the video is fast speed.", "gt-answer": "Caption A: The rate of movement in the video is normal speed.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The rate of movement in the video is no speed.\nSentence B: The rate of movement in the video is normal speed.", "gt-answer": "Sentence B: The rate of movement in the video is normal speed.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The rate of movement in the video is normal speed.\nOption 2: The rate of movement in the video is slow speed.", "gt-answer": "Option 1: The rate of movement in the video is normal speed.", "video-llm-prediction": "Option 2: The rate of movement in the video is slow speed.", "match_success": true, "rating": 0 } ], "order": [ { "question": "Which caption matches the video better?\nCaption A: The dolphins swim in circles.\nCaption B: The dolphins swim towards the shore, then get fed with fish.", "gt-answer": "Caption B: The dolphins swim towards the shore, then get fed with fish.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The dolphins swim towards the shore, then get fed with fish.\nSentence B: The dolphins get fed with fish, then swim towards the shore.", "gt-answer": "Sentence A: The dolphins swim towards the shore, then get fed with fish.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The dolphins swim towards the shore, then get fed with fish.\nOption 2: Getting fed with fish is the first event that happens to the dolphins.", "gt-answer": "Option 1: The dolphins swim towards the shore, then get fed with fish.", "video-llm-prediction": "Option 2: Getting fed with fish is the first event that happens to the dolphins.", "match_success": true, "rating": 0 } ] }, "1054717541": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: A person is kneading dough.\nSentence B: The person is cutting dough.", "gt-answer": "Sentence A: A person is kneading dough.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person is grilling dough.\nOption 2: A person is kneading dough.", "gt-answer": "Option 2: A person is kneading dough.", "video-llm-prediction": "Option 2: A person is kneading dough.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The person is throwing dough.\nCaption B: A person is kneading dough.", "gt-answer": "Caption B: A person is kneading dough.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is playing at normal speed.\nSentence B: The video is playing in slow motion.", "gt-answer": "Sentence B: The video is playing in slow motion.", "video-llm-prediction": "Sentence B: The video is playing in slow motion.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is playing in slow motion.\nCaption B: The video is playing with a time-lapse effect.", "gt-answer": "Caption A: The video is playing in slow motion.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing in slow motion.\nOption 2: The tempo of the video playback is fast.", "gt-answer": "Option 1: The video is playing in slow motion.", "video-llm-prediction": "Option 1: The video is playing in slow motion.", "match_success": true, "rating": 1 } ] }, "1081092839": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The sun is descending.\nCaption B: The sun is ascending.", "gt-answer": "Caption A: The sun is descending.", "video-llm-prediction": "Caption B: The sun is ascending.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The sun is moving upwards.\nSentence B: The sun is moving downwards.", "gt-answer": "Sentence B: The sun is moving downwards.", "video-llm-prediction": "Sentence A: The sun is moving upwards.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The sun is descending.\nOption 2: The sun is rising.", "gt-answer": "Option 1: The sun is descending.", "video-llm-prediction": "Option 2: The sun is rising.", "match_success": true, "rating": 0 } ], "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is in slow motion.\nOption 2: The video is in time-lapse.", "gt-answer": "Option 2: The video is in time-lapse.", "video-llm-prediction": "Option 2: The video is in time-lapse.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is in time-lapse.\nSentence B: The speed of the video is normal.", "gt-answer": "Sentence A: The video is in time-lapse.", "video-llm-prediction": "Sentence A: The video is in time-lapse.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is in time-lapse.\nCaption B: The video has a fast pace.", "gt-answer": "Caption A: The video is in time-lapse.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1059462749": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The person is reading a book.\nCaption B: The person is climbing down a ladder.", "gt-answer": "Caption B: The person is climbing down a ladder.", "video-llm-prediction": "B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person is cooking.\nOption 2: The person is climbing down a ladder.", "gt-answer": "Option 2: The person is climbing down a ladder.", "video-llm-prediction": "Option 2: The person is climbing down a ladder.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A person is riding a bike.\nSentence B: The person is climbing down a ladder.", "gt-answer": "Sentence B: The person is climbing down a ladder.", "video-llm-prediction": "B", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The person is moving downwards.\nSentence B: The person is moving rightwards.", "gt-answer": "Sentence A: The person is moving downwards.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person is moving downwards.\nOption 2: The person is moving upwards.", "gt-answer": "Option 1: The person is moving downwards.", "video-llm-prediction": "Option 2: The person is moving upwards.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The person is moving downwards.\nCaption B: The person is moving leftwards.", "gt-answer": "Caption A: The person is moving downwards.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is playing at normal speed.\nSentence B: The video is playing in time-lapse.", "gt-answer": "Sentence A: The video is playing at normal speed.", "video-llm-prediction": "Sentence B: The video is playing in time-lapse.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing at normal speed.\nOption 2: The video is playing slowly.", "gt-answer": "Option 1: The video is playing at normal speed.", "video-llm-prediction": "Option 1: The video is playing at normal speed.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is playing quickly.\nCaption B: The video is playing at normal speed.", "gt-answer": "Caption B: The video is playing at normal speed.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1077590324": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: A woman is cooking.\nOption 2: The woman is doing laundry.", "gt-answer": "Option 2: The woman is doing laundry.", "video-llm-prediction": "Option 2: The woman is doing laundry.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is doing laundry.\nSentence B: The woman is singing.", "gt-answer": "Sentence A: The woman is doing laundry.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The woman is doing laundry.\nCaption B: The woman is painting.", "gt-answer": "Caption A: The woman is doing laundry.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The speed of the entire video is slow motion.\nSentence B: The entire video is in normal speed.", "gt-answer": "Sentence B: The entire video is in normal speed.", "video-llm-prediction": "Sentence A: The speed of the entire video is slow motion.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The entire video is in normal speed.\nOption 2: The video is playing in high speed.", "gt-answer": "Option 1: The entire video is in normal speed.", "video-llm-prediction": "Option 1: The entire video is in normal speed.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The velocity of the video playback is low speed.\nCaption B: The entire video is in normal speed.", "gt-answer": "Caption B: The entire video is in normal speed.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which caption matches the video better?\nCaption A: The woman is putting clothes into the washing machine and then standing up to operate it.\nCaption B: The woman is sitting down to operate the washing machine and then putting clothes into it.", "gt-answer": "Caption A: The woman is putting clothes into the washing machine and then standing up to operate it.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is putting clothes into the washing machine and then standing up to operate it.\nSentence B: The woman is operating the washing machine and then putting clothes into it.", "gt-answer": "Sentence A: The woman is putting clothes into the washing machine and then standing up to operate it.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is putting clothes into the washing machine and then standing up to operate it.\nOption 2: The woman is turning off the washing machine and then getting dressed.", "gt-answer": "Option 1: The woman is putting clothes into the washing machine and then standing up to operate it.", "video-llm-prediction": "Option 1: The woman is putting clothes into the washing machine and then standing up to operate it.", "match_success": true, "rating": 1 } ] }, "1040541401": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The children are painting pictures.\nSentence B: The children are doing chemical experiments.", "gt-answer": "Sentence B: The children are doing chemical experiments.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The children are reading books.\nOption 2: The children are doing chemical experiments.", "gt-answer": "Option 2: The children are doing chemical experiments.", "video-llm-prediction": "Option 2: The children are doing chemical experiments.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The children are doing chemical experiments.\nCaption B: The children are playing soccer.", "gt-answer": "Caption A: The children are doing chemical experiments.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is in time-lapse.\nSentence B: The video is playing at normal speed.", "gt-answer": "Sentence B: The video is playing at normal speed.", "video-llm-prediction": "Sentence A: The video is in time-lapse.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing at normal speed.\nOption 2: The video is playing in super slow motion.", "gt-answer": "Option 1: The video is playing at normal speed.", "video-llm-prediction": "Option 1: The video is playing at normal speed.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is playing at normal speed.\nCaption B: The video is playing very slow.", "gt-answer": "Caption A: The video is playing at normal speed.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1059072773": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The ice cream is melting.\nCaption B: The ice cream is freezing.", "gt-answer": "Caption A: The ice cream is melting.", "video-llm-prediction": "Caption A: The ice cream is melting.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The ice cream is evaporating.\nOption 2: The ice cream is melting.", "gt-answer": "Option 2: The ice cream is melting.", "video-llm-prediction": "Option 2: The ice cream is melting.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The ice cream is melting.\nSentence B: The ice cream is solidifying.", "gt-answer": "Sentence A: The ice cream is melting.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 0 } ] }, "1064645467": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The man is cooking.\nOption 2: The man is doing exercise.", "gt-answer": "Option 2: The man is doing exercise.", "video-llm-prediction": "Option 2: The man is doing exercise.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A video showing the man reading a book.\nSentence B: The man is doing exercise.", "gt-answer": "Sentence B: The man is doing exercise.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The man is swimming.\nCaption B: The man is doing exercise.", "gt-answer": "Caption B: The man is doing exercise.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing at normal speed.\nOption 2: The video is playing in slow motion.", "gt-answer": "Option 1: The video is playing at normal speed.", "video-llm-prediction": "Option 2: The video is playing in slow motion.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The video is playing at normal speed.\nCaption B: The video is playing at a fast speed.", "gt-answer": "Caption A: The video is playing at normal speed.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The speed of the video is time-lapse.\nSentence B: The video is playing at normal speed.", "gt-answer": "Sentence B: The video is playing at normal speed.", "video-llm-prediction": "Sentence A: The video is playing at normal speed.", "match_success": false, "chatgpt-response": "Incorrect", "rating": 0 } ], "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The man takes off his jacket and then jumps up to grab the bar.\nOption 2: The man jumps up to grab the bar before taking off his jacket.", "gt-answer": "Option 1: The man takes off his jacket and then jumps up to grab the bar.", "video-llm-prediction": "Option 2: The man jumps up to grab the bar before taking off his jacket.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The man takes off his jacket and then jumps up to grab the bar.\nCaption B: The man takes off his jacket after jumping down from the bar.", "gt-answer": "Caption A: The man takes off his jacket and then jumps up to grab the bar.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man takes off his jacket and then jumps up to grab the bar.\nSentence B: The man jumps down from the bar and then puts on his jacket.", "gt-answer": "Sentence A: The man takes off his jacket and then jumps up to grab the bar.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 } ] }, "1088279109": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The elevator is rising.\nSentence B: The elevator is going down.", "gt-answer": "Sentence A: The elevator is rising.", "video-llm-prediction": "Sentence A: The elevator is rising.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The elevator is staying still.\nCaption B: The elevator is rising.", "gt-answer": "Caption B: The elevator is rising.", "video-llm-prediction": "Caption B: The elevator is rising.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The elevator is rising.\nOption 2: The elevator is not moving.", "gt-answer": "Option 1: The elevator is rising.", "video-llm-prediction": "Option 2: The elevator is not moving.", "match_success": true, "rating": 0 } ], "speed": [ { "question": "Which caption matches the video better?\nCaption A: The video is playing at normal speed.\nCaption B: The video is in slow motion.", "gt-answer": "Caption A: The video is playing at normal speed.", "video-llm-prediction": "Caption B: The video is in slow motion.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video is being played in fast forward mode.\nOption 2: The video is playing at normal speed.", "gt-answer": "Option 2: The video is playing at normal speed.", "video-llm-prediction": "Option 2: The video is playing at normal speed.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is on pause.\nSentence B: The video is playing at normal speed.", "gt-answer": "Sentence B: The video is playing at normal speed.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ] }, "1093573255": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is running in time-lapse.\nOption 2: The video is running at normal speed.", "gt-answer": "Option 1: The video is running in time-lapse.", "video-llm-prediction": "Option 1: The video is running in time-lapse.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is in slow motion.\nSentence B: The video is running in time-lapse.", "gt-answer": "Sentence B: The video is running in time-lapse.", "video-llm-prediction": "Sentence B: The video is running in time-lapse.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is running steadily.\nCaption B: The video is running in time-lapse.", "gt-answer": "Caption B: The video is running in time-lapse.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ], "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The plant is reviving.\nOption 2: The plant is wilting.", "gt-answer": "Option 1: The plant is reviving.", "video-llm-prediction": "Option 1: The plant is reviving.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The plant is decaying.\nSentence B: The plant is reviving.", "gt-answer": "Sentence B: The plant is reviving.", "video-llm-prediction": "Sentence B: The plant is reviving.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The plant is reviving.\nCaption B: The plant is wilting.", "gt-answer": "Caption A: The plant is reviving.", "video-llm-prediction": "Caption A: The plant is reviving.", "match_success": true, "rating": 1 } ] }, "1105233293": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The people in the video are swimming.\nSentence B: The people in the video are playing basketball.", "gt-answer": "Sentence B: The people in the video are playing basketball.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The people in the video are playing basketball.\nCaption B: The people in the video are dancing.", "gt-answer": "Caption A: The people in the video are playing basketball.", "video-llm-prediction": "A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The people in the video are sleeping.\nOption 2: The people in the video are playing basketball.", "gt-answer": "Option 2: The people in the video are playing basketball.", "video-llm-prediction": "Option 2: The people in the video are playing basketball.", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing at accelerated speed.\nOption 2: The video is playing at normal speed.", "gt-answer": "Option 2: The video is playing at normal speed.", "video-llm-prediction": "Option 2: The video is playing at normal speed.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is playing in reverse.\nSentence B: The video is playing at normal speed.", "gt-answer": "Sentence B: The video is playing at normal speed.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is playing at normal speed.\nCaption B: The video is in slow-motion.", "gt-answer": "Caption A: The video is playing at normal speed.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ], "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The players are passing the ball to each other and then shooting the basketball.\nSentence B: The players are shooting the basketball and then blocking the opponent.", "gt-answer": "Sentence A: The players are passing the ball to each other and then shooting the basketball.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The players are passing the ball to each other and then shooting the basketball.\nCaption B: The players are shooting the basketball and then dribbling the ball.", "gt-answer": "Caption A: The players are passing the ball to each other and then shooting the basketball.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The players are passing the ball to each other and then shooting the basketball.\nOption 2: The players are passing the ball to each other and then blocking the opponent.", "gt-answer": "Option 1: The players are passing the ball to each other and then shooting the basketball.", "video-llm-prediction": "Option 1: The players are passing the ball to each other and then shooting the basketball.", "match_success": true, "rating": 1 } ] }, "1077146132": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: A woman is tossing and turning in the bed.\nSentence B: The woman is waking up in the video.", "gt-answer": "Sentence B: The woman is waking up in the video.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The woman is falling asleep.\nCaption B: The woman is waking up in the video.", "gt-answer": "Caption B: The woman is waking up in the video.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is waking up in the video.\nOption 2: The woman is lying down on the bed.", "gt-answer": "Option 1: The woman is waking up in the video.", "video-llm-prediction": "Option 1: The woman is waking up in the video.", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which caption matches the video better?\nCaption A: The first event in the sequence is getting out of bed.\nCaption B: The first event in the sequence is opening the curtains.", "gt-answer": "Caption A: The first event in the sequence is getting out of bed.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Closing the curtains occurs before getting into bed.\nOption 2: The first event in the sequence is getting out of bed.", "gt-answer": "Option 2: The first event in the sequence is getting out of bed.", "video-llm-prediction": "Option 1: Closing the curtains occurs before getting into bed.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman closes the curtais and then gets into bed.\nSentence B: The first event in the sequence is getting out of bed.", "gt-answer": "Sentence B: The first event in the sequence is getting out of bed.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 } ], "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is turning darker.\nOption 2: The video is turning brighter.", "gt-answer": "Option 2: The video is turning brighter.", "video-llm-prediction": "Option 2: The video is turning brighter.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is staying the same brightness.\nSentence B: The video is turning brighter.", "gt-answer": "Sentence B: The video is turning brighter.", "video-llm-prediction": "Sentence B: The video is turning brighter.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is decreasing in brightness.\nCaption B: The video is turning brighter.", "gt-answer": "Caption B: The video is turning brighter.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1086688682": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The traffic lights are changing from red to yellow.\nSentence B: The traffic lights are changing from red to green.", "gt-answer": "Sentence B: The traffic lights are changing from red to green.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The traffic lights are changing from yellow to red.\nCaption B: The traffic lights are changing from red to green.", "gt-answer": "Caption B: The traffic lights are changing from red to green.", "video-llm-prediction": "Caption B: The traffic lights are changing from red to green.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The traffic lights are changing from green to red.\nOption 2: The traffic lights are changing from red to green.", "gt-answer": "Option 2: The traffic lights are changing from red to green.", "video-llm-prediction": "Option 1: The traffic lights are changing from green to red.", "match_success": true, "rating": 0 } ] }, "1096612823": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The car is sliding on ice.\nCaption B: The car is driving on the road.", "gt-answer": "Caption B: The car is driving on the road.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The car is dancing on the rooftop.\nSentence B: The car is driving on the road.", "gt-answer": "Sentence B: The car is driving on the road.", "video-llm-prediction": "Sentence B: The car is driving on the road.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The car is driving on the road.\nOption 2: The car is buried in the sand.", "gt-answer": "Option 1: The car is driving on the road.", "video-llm-prediction": "Option 1: The car is driving on the road.", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is zooming in.\nOption 2: The video is zooming out.", "gt-answer": "Option 1: The video is zooming in.", "video-llm-prediction": "Option 2: The video is zooming out.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The video is zooming in.\nCaption B: The video is rotating.", "gt-answer": "Caption A: The video is zooming in.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is zooming in.\nSentence B: The video is panning left.", "gt-answer": "Sentence A: The video is zooming in.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 } ] }, "1075484900": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The child is turning away from the man.\nSentence B: The child is turning towards the man.", "gt-answer": "Sentence A: The child is turning away from the man.", "video-llm-prediction": "Sentence B: The child is turning towards the man.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The child is turning towards the man.\nOption 2: The child is turning away from the man.", "gt-answer": "Option 2: The child is turning away from the man.", "video-llm-prediction": "Option 1: The child is turning towards the man.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The child is moving forward in relation to the man.\nCaption B: The child is turning away from the man.", "gt-answer": "Caption B: The child is turning away from the man.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1090578917": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is painting a picture.\nSentence B: The man is playing soccer.", "gt-answer": "Sentence B: The man is playing soccer.", "video-llm-prediction": "B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man is reading a book.\nOption 2: The man is playing soccer.", "gt-answer": "Option 2: The man is playing soccer.", "video-llm-prediction": "Option 2: The man is playing soccer.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The man is playing soccer.\nCaption B: The man is cooking dinner.", "gt-answer": "Caption A: The man is playing soccer.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The man stands up from the ground, then leaps to his right.\nSentence B: The man is leaping to his right and then standing up.", "gt-answer": "Sentence B: The man is leaping to his right and then standing up.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A person is standing up from the ground and then leaping to this right.\nOption 2: The man is leaping to his right and then standing up.", "gt-answer": "Option 2: The man is leaping to his right and then standing up.", "video-llm-prediction": "Option 1: A person is standing up from the ground and then leaping to this right.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The man is leaping to his right after standing up.\nCaption B: The man is leaping to his right and then standing up.", "gt-answer": "Caption B: The man is leaping to his right and then standing up.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which caption matches the video better?\nCaption A: The man is leaping to his left.\nCaption B: The man is leaping to his right.", "gt-answer": "Caption B: The man is leaping to his right.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is leaping to his right.\nSentence B: The man is leaping backward.", "gt-answer": "Sentence A: The man is leaping to his right.", "video-llm-prediction": "Sentence A: The man is leaping to his right.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man is leaping to his right.\nOption 2: The man is leaping forward.", "gt-answer": "Option 1: The man is leaping to his right.", "video-llm-prediction": "Option 2: The man is leaping forward.", "match_success": true, "rating": 0 } ] }, "1075338140": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The ball is rolling from right to left.\nSentence B: The ball is rolling from left to right.", "gt-answer": "Sentence B: The ball is rolling from left to right.", "video-llm-prediction": "Sentence B: The ball is rolling from left to right.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The ball is rolling from left to right.\nCaption B: The ball is rolling from right to left.", "gt-answer": "Caption A: The ball is rolling from left to right.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The ball is rolling from right to left.\nOption 2: The ball is rolling from left to right.", "gt-answer": "Option 2: The ball is rolling from left to right.", "video-llm-prediction": "Option 2: The ball is rolling from left to right.", "match_success": true, "rating": 1 } ] }, "1067268403": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The wind turbines are stationary.\nCaption B: The wind turbines are rotating clockwise.", "gt-answer": "Caption B: The wind turbines are rotating clockwise.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The wind turbines are rotating counterclockwise.\nSentence B: The wind turbines are rotating clockwise.", "gt-answer": "Sentence B: The wind turbines are rotating clockwise.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The wind turbines are rotating counterclockwise.\nOption 2: The wind turbines are rotating clockwise.", "gt-answer": "Option 2: The wind turbines are rotating clockwise.", "video-llm-prediction": "Option 1: The wind turbines are rotating counterclockwise.", "match_success": true, "rating": 0 } ] }, "1072594556": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The car in the center of the screen is moving faster than other cars.\nOption 2: The car in the center of the screen is moving at the same speed as other cars.", "gt-answer": "Option 1: The car in the center of the screen is moving faster than other cars.", "video-llm-prediction": "Option 1: The car in the center of the screen is moving faster than other cars.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The car in the center of the screen is moving slower than other cars.\nSentence B: The car in the center of the screen is moving faster than other cars.", "gt-answer": "Sentence B: The car in the center of the screen is moving faster than other cars.", "video-llm-prediction": "Sentence B: The car in the center of the screen is moving faster than other cars.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The speed of the car in the center of the screen is unknown.\nCaption B: The car in the center of the screen is moving faster than other cars.", "gt-answer": "Caption B: The car in the center of the screen is moving faster than other cars.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1096007487": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The 3d cloud is moving upward from the camera's point of view.\nSentence B: The 3d cloud is moving leftward from the camera's point of view.", "gt-answer": "Sentence B: The 3d cloud is moving leftward from the camera's point of view.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The 3d cloud is moving downward from the camera's point of view.\nCaption B: The 3d cloud is moving leftward from the camera's point of view.", "gt-answer": "Caption B: The 3d cloud is moving leftward from the camera's point of view.", "video-llm-prediction": "Caption B: The 3d cloud is moving leftward from the camera's point of view.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The 3d cloud is moving rightward from the camera's point of view.\nOption 2: The 3d cloud is moving leftward from the camera's point of view.", "gt-answer": "Option 2: The 3d cloud is moving leftward from the camera's point of view.", "video-llm-prediction": "Option 1: The 3d cloud is moving rightward from the camera's point of view.", "match_success": true, "rating": 0 } ] }, "1095939659": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The light is growing clockwise.\nCaption B: The light is growing downwards.", "gt-answer": "Caption A: The light is growing clockwise.", "video-llm-prediction": "Caption A: The light is growing clockwise.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The light is growing counterclockwise.\nSentence B: The light is growing clockwise.", "gt-answer": "Sentence B: The light is growing clockwise.", "video-llm-prediction": "Sentence A: The light is growing counterclockwise.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The light is growing clockwise.\nOption 2: The light is moving straight forward.", "gt-answer": "Option 1: The light is growing clockwise.", "video-llm-prediction": "Option 1: The light is growing clockwise.", "match_success": true, "rating": 1 } ] }, "1059632327": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The tennis ball is bouncing up and down.\nOption 2: The tennis ball is spinning clockwise and anticlockwise.", "gt-answer": "Option 1: The tennis ball is bouncing up and down.", "video-llm-prediction": "Option 1: The tennis ball is bouncing up and down.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The tennis ball is rolling forward and backward.\nSentence B: The tennis ball is bouncing up and down.", "gt-answer": "Sentence B: The tennis ball is bouncing up and down.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The tennis ball is bouncing left to right.\nCaption B: The tennis ball is bouncing up and down.", "gt-answer": "Caption B: The tennis ball is bouncing up and down.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1098785169": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The galaxy is spinning counterclockwise.\nCaption B: The galaxy is rotating clockwise.", "gt-answer": "Caption B: The galaxy is rotating clockwise.", "video-llm-prediction": "Caption B: The galaxy is rotating clockwise.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The galaxy's spin is counterclockwise.\nSentence B: The galaxy is rotating clockwise.", "gt-answer": "Sentence B: The galaxy is rotating clockwise.", "video-llm-prediction": "Sentence A: The galaxy's spin is counterclockwise.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The rotation of the galaxy is counterclockwise.\nOption 2: The galaxy is rotating clockwise.", "gt-answer": "Option 2: The galaxy is rotating clockwise.", "video-llm-prediction": "Option 1: The galaxy is rotating counterclockwise.", "match_success": false, "chatgpt-response": "Incorrect", "rating": 0 } ] }, "1099984801": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The clouds are moving away from the camera.\nSentence B: The clouds are moving towards the camera.", "gt-answer": "Sentence B: The clouds are moving towards the camera.", "video-llm-prediction": "Sentence B: The clouds are moving towards the camera.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The clouds are moving towards the camera.\nOption 2: The clouds are staying still.", "gt-answer": "Option 1: The clouds are moving towards the camera.", "video-llm-prediction": "Option 2: The clouds are staying still.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The clouds are moving away from the camera.\nCaption B: The clouds are moving towards the camera.", "gt-answer": "Caption B: The clouds are moving towards the camera.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is set to time-lapse speed.\nOption 2: The video is in slow motion.", "gt-answer": "Option 1: The video is set to time-lapse speed.", "video-llm-prediction": "Option 1: The video is set to time-lapse speed.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is set to time-lapse speed.\nSentence B: The video is set to real-time speed.", "gt-answer": "Sentence A: The video is set to time-lapse speed.", "video-llm-prediction": "Sentence B: The video is set to real-time speed.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The video is set to time-lapse speed.\nCaption B: The video is dilated in time.", "gt-answer": "Caption A: The video is set to time-lapse speed.", "video-llm-prediction": "Caption A: The video is set to time-lapse speed.", "match_success": true, "rating": 1 } ], "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The brightness of the video is staying the same.\nOption 2: The entire video is getting darker.", "gt-answer": "Option 2: The entire video is getting darker.", "video-llm-prediction": "Option 2: The entire video is getting darker.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The entire video is getting darker.\nCaption B: The video is getting brighter.", "gt-answer": "Caption A: The entire video is getting darker.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The entire video is getting darker.\nSentence B: The entire video is getting brighter.", "gt-answer": "Sentence A: The entire video is getting darker.", "video-llm-prediction": "Sentence A: The entire video is getting darker.", "match_success": true, "rating": 1 } ] }, "1082419672": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The movement in the video is flying upwards relative to the ship.\nOption 2: The movement in the video is flying backwards relative to the ship.", "gt-answer": "Option 2: The movement in the video is flying backwards relative to the ship.", "video-llm-prediction": "Option 1: The movement in the video is flying upwards relative to the ship.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The movement in the video is flying backwards relative to the ship.\nSentence B: The movement in the video is flying forwards relative to the ship.", "gt-answer": "Sentence A: The movement in the video is flying backwards relative to the ship.", "video-llm-prediction": "Sentence A: The movement in the video is flying backwards relative to the ship.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The movement in the video is flying backwards relative to the ship.\nCaption B: The movement in the video is flying forwards relative to the ship.", "gt-answer": "Caption A: The movement in the video is flying backwards relative to the ship.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1058985719": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The airplane is moving from right to left.\nCaption B: The airplane is flying from bottom to top.", "gt-answer": "Caption B: The airplane is flying from bottom to top.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The airplane is flying from bottom to top.\nOption 2: The airplane is flying from top to bottom.", "gt-answer": "Option 1: The airplane is flying from bottom to top.", "video-llm-prediction": "Option 2: The airplane is flying from top to bottom.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The airplane is flying from bottom to top.\nSentence B: The airplane is moving from left to right.", "gt-answer": "Sentence A: The airplane is flying from bottom to top.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1055413730": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The planet Earth is spinning to the left.\nOption 2: The planet Earth is spinning to the right.", "gt-answer": "Option 2: The planet Earth is spinning to the right.", "video-llm-prediction": "Option 2: The planet Earth is spinning to the right.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The planet Earth is spinning to the right.\nCaption B: The Earth is rotating upward.", "gt-answer": "Caption A: The planet Earth is spinning to the right.", "video-llm-prediction": "Caption A: The planet Earth is spinning to the right.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The planet Earth is spinning to the left.\nSentence B: The planet Earth is spinning to the right.", "gt-answer": "Sentence B: The planet Earth is spinning to the right.", "video-llm-prediction": "Sentence A: The planet Earth is spinning to the left.", "match_success": true, "rating": 0 } ] }, "1081684532": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The asteroids in the video are flying towards the camera.\nSentence B: The asteroids are flying away from the camera.", "gt-answer": "Sentence A: The asteroids in the video are flying towards the camera.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The asteroids in the video are flying towards the camera.\nOption 2: The asteroids are flying to the left.", "gt-answer": "Option 1: The asteroids in the video are flying towards the camera.", "video-llm-prediction": "Option 1: The asteroids in the video are flying towards the camera.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The asteroids are flying to the right.\nCaption B: The asteroids in the video are flying towards the camera.", "gt-answer": "Caption B: The asteroids in the video are flying towards the camera.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "23391142": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The people in the video are dancing.\nCaption B: The people in the video are playing badminton.", "gt-answer": "Caption B: The people in the video are playing badminton.", "video-llm-prediction": "B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The people in the video are playing badminton.\nSentence B: The people are reading books.", "gt-answer": "Sentence A: The people in the video are playing badminton.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The people in the video are playing badminton.\nOption 2: The people in the video are playing soccer.", "gt-answer": "Option 1: The people in the video are playing badminton.", "video-llm-prediction": "Option 1: The people in the video are playing badminton.", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is panning upwards.\nOption 2: The video is panning to the right.", "gt-answer": "Option 2: The video is panning to the right.", "video-llm-prediction": "Option 2: The video is panning to the right.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is zooming in.\nCaption B: The video is panning to the right.", "gt-answer": "Caption B: The video is panning to the right.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is panning to the right.\nSentence B: The video is panning to the left.", "gt-answer": "Sentence A: The video is panning to the right.", "video-llm-prediction": "Sentence A: The video is panning to the right.", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing at normal speed.\nOption 2: The video is playing in slow motion.", "gt-answer": "Option 2: The video is playing in slow motion.", "video-llm-prediction": "Option 1: The video is playing at normal speed.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The video is playing in slow motion.\nCaption B: The video is playing in time-lapse.", "gt-answer": "Caption A: The video is playing in slow motion.", "video-llm-prediction": "Caption B: The video is playing in time-lapse.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is playing in slow motion.\nSentence B: The video is playing quickly.", "gt-answer": "Sentence A: The video is playing in slow motion.", "video-llm-prediction": "Sentence A: The video is playing in slow motion.", "match_success": true, "rating": 1 } ] }, "1078368947": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is washing her face in the bathroom.\nOption 2: A woman is reading a book in the bathroom.", "gt-answer": "Option 2: A woman is reading a book in the bathroom.", "video-llm-prediction": "Option 2: A woman is reading a book in the bathroom.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A woman is reading a book in the bathroom.\nSentence B: The woman is taking a shower in the bathroom.", "gt-answer": "Sentence A: A woman is reading a book in the bathroom.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The woman is washing her hair in the bathroom.\nCaption B: A woman is reading a book in the bathroom.", "gt-answer": "Caption B: A woman is reading a book in the bathroom.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1099738787": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The concrete wall is falling apart.\nCaption B: The concrete wall is assembling together.", "gt-answer": "Caption A: The concrete wall is falling apart.", "video-llm-prediction": "Caption B: The concrete wall is assembling together.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The concrete wall is falling apart.\nOption 2: The concrete wall is growing taller.", "gt-answer": "Option 1: The concrete wall is falling apart.", "video-llm-prediction": "Option 1: The concrete wall is falling apart.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The concrete wall is assembling together.\nSentence B: The concrete wall is falling apart.", "gt-answer": "Sentence B: The concrete wall is falling apart.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ] }, "1094219575": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is played in normal speed.\nSentence B: The video is played in time-lapse mode.", "gt-answer": "Sentence B: The video is played in time-lapse mode.", "video-llm-prediction": "Sentence B: The video is played in time-lapse mode.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video is played in time-lapse mode.\nOption 2: The video is played in slow motion.", "gt-answer": "Option 1: The video is played in time-lapse mode.", "video-llm-prediction": "Option 1: The video is played in time-lapse mode.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is played in fast forwarding mode.\nCaption B: The video is played in time-lapse mode.", "gt-answer": "Caption B: The video is played in time-lapse mode.", "video-llm-prediction": "Caption B: The video is played in time-lapse mode.", "match_success": true, "rating": 1 } ], "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The flowers are blooming.\nCaption B: The flowers are wilting.", "gt-answer": "Caption B: The flowers are wilting.", "video-llm-prediction": "Caption B: The flowers are wilting.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The flowers are blooming.\nOption 2: The flowers are wilting.", "gt-answer": "Option 2: The flowers are wilting.", "video-llm-prediction": "Option 1: The flowers are blooming.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The flowers are wilting.\nSentence B: The flowers are blooming.", "gt-answer": "Sentence A: The flowers are wilting.", "video-llm-prediction": "Sentence B: The flowers are blooming.", "match_success": true, "rating": 0 } ] }, "1053758117": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The camera is standing still before a 3D digital brain.\nCaption B: The camera is zooming into a 3D digital brain.", "gt-answer": "Caption B: The camera is zooming into a 3D digital brain.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The camera is zooming out from a 3D digital brain.\nOption 2: The camera is zooming into a 3D digital brain.", "gt-answer": "Option 2: The camera is zooming into a 3D digital brain.", "video-llm-prediction": "Option 2: The camera is zooming into a 3D digital brain.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera is zooming into a 3D digital brain.\nSentence B: The camera is moving downwards.", "gt-answer": "Sentence A: The camera is zooming into a 3D digital brain.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1069717165": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The woman shakes her head, followed by nodding.\nOption 2: The woman nods her head repeatedly.", "gt-answer": "Option 1: The woman shakes her head, followed by nodding.", "video-llm-prediction": "Option 2: The woman nods her head repeatedly.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The woman shakes her head, followed by nodding.\nCaption B: After shaking her head, the woman feels sad.", "gt-answer": "Caption A: The woman shakes her head, followed by nodding.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman shakes her head, followed by nodding.\nSentence B: The woman shakes her head after nodding her head.", "gt-answer": "Sentence A: The woman shakes her head, followed by nodding.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 } ] }, "1041584314": { "order": [ { "question": "Which caption matches the video better?\nCaption A: The man's expression changes from serious to smiling.\nCaption B: The man doesn't change his expression.", "gt-answer": "Caption A: The man's expression changes from serious to smiling.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man's expression changes from serious to smiling.\nSentence B: The man starts to smile before showing a serious expression.", "gt-answer": "Sentence A: The man's expression changes from serious to smiling.", "video-llm-prediction": "Sentence A: The man's expression changes from serious to smiling.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man's expression changes from serious to smiling.\nOption 2: The man never smiles.", "gt-answer": "Option 1: The man's expression changes from serious to smiling.", "video-llm-prediction": "Option 1: The man's expression changes from serious to smiling.", "match_success": true, "rating": 1 } ] }, "1067427023": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The video is playing at normal speed.\nCaption B: The video is playing in time-lapse.", "gt-answer": "Caption B: The video is playing in time-lapse.", "video-llm-prediction": "Caption B: The video is playing in time-lapse.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is playing in slow motion.\nSentence B: The video is playing in time-lapse.", "gt-answer": "Sentence B: The video is playing in time-lapse.", "video-llm-prediction": "Sentence A: The video is playing in slow motion.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing in time-lapse.\nOption 2: The speed of the video is unknown.", "gt-answer": "Option 1: The video is playing in time-lapse.", "video-llm-prediction": "Option 1: The video is playing in time-lapse.", "match_success": true, "rating": 1 } ], "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The snow particles are melting.\nSentence B: The snow particles are solidifying.", "gt-answer": "Sentence A: The snow particles are melting.", "video-llm-prediction": "Sentence A: The snow particles are melting.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The snow particles are freezing.\nOption 2: The snow particles are melting.", "gt-answer": "Option 2: The snow particles are melting.", "video-llm-prediction": "Option 2: The snow particles are melting.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The snow particles are melting.\nCaption B: The snow particles are remaining the same.", "gt-answer": "Caption A: The snow particles are melting.", "video-llm-prediction": "Caption B: The snow particles are remaining the same.", "match_success": true, "rating": 0 } ] }, "1093948229": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The golden particles are exploding.\nSentence B: The golden particles are disappearing.", "gt-answer": "Sentence A: The golden particles are exploding.", "video-llm-prediction": "Sentence A: The golden particles are exploding.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The golden particles are exploding.\nOption 2: The golden particles are shrinking.", "gt-answer": "Option 1: The golden particles are exploding.", "video-llm-prediction": "Option 1: The golden particles are exploding.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The golden particles are contracting.\nCaption B: The golden particles are exploding.", "gt-answer": "Caption B: The golden particles are exploding.", "video-llm-prediction": "Caption B: The golden particles are exploding.", "match_success": true, "rating": 1 } ] }, "1106668103": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is staying still.\nOption 2: The movement in the video is forward.", "gt-answer": "Option 2: The movement in the video is forward.", "video-llm-prediction": "Option 2: The movement in the video is forward.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The movement in the video is forward.\nCaption B: The movement in the video is backward.", "gt-answer": "Caption A: The movement in the video is forward.", "video-llm-prediction": "Caption A: The movement in the video is forward.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The movement in the video is forward.\nSentence B: The primary motion depicted in the video is turning in circles.", "gt-answer": "Sentence A: The movement in the video is forward.", "video-llm-prediction": "Sentence A: The movement in the video is forward.", "match_success": true, "rating": 1 } ] }, "1049965795": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The light bulb is turning off.\nSentence B: The light bulb is lighting up.", "gt-answer": "Sentence B: The light bulb is lighting up.", "video-llm-prediction": "Sentence B: The light bulb is lighting up.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The light bulb is breaking.\nCaption B: The light bulb is lighting up.", "gt-answer": "Caption B: The light bulb is lighting up.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The light bulb is rapidly changing color.\nOption 2: The light bulb is lighting up.", "gt-answer": "Option 2: The light bulb is lighting up.", "video-llm-prediction": "Option 1: The light bulb is rapidly changing color.", "match_success": true, "rating": 0 } ] }, "1052364490": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The digital bar chart is growing.\nSentence B: The digital bar chart is staying in the same size.", "gt-answer": "Sentence A: The digital bar chart is growing.", "video-llm-prediction": "Sentence A: The digital bar chart is growing.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The digital bar chart is growing.\nCaption B: The digital bar chart is shrinking.", "gt-answer": "Caption A: The digital bar chart is growing.", "video-llm-prediction": "Caption B: The digital bar chart is shrinking.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The digital bar chart is decreasing.\nOption 2: The digital bar chart is growing.", "gt-answer": "Option 2: The digital bar chart is growing.", "video-llm-prediction": "Option 1: The digital bar chart is decreasing.", "match_success": true, "rating": 0 } ] }, "1080990179": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The apple is shrinking and drying out.\nCaption B: The apple is ripening.", "gt-answer": "Caption A: The apple is shrinking and drying out.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The apple is shrinking and drying out.\nSentence B: The apple is growing bigger.", "gt-answer": "Sentence A: The apple is shrinking and drying out.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The apple is becoming juicier.\nOption 2: The apple is shrinking and drying out.", "gt-answer": "Option 2: The apple is shrinking and drying out.", "video-llm-prediction": "Option 2: The apple is shrinking and drying out.", "match_success": true, "rating": 1 } ] }, "1057163434": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The chef is removing the bread from the top of the burger.\nCaption B: The chef is putting the bread on top of the burger.", "gt-answer": "Caption B: The chef is putting the bread on top of the burger.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The chef is putting the bread on top of the burger.\nOption 2: The chef is removing the bread from the top of the burger.", "gt-answer": "Option 1: The chef is putting the bread on top of the burger.", "video-llm-prediction": "Option 1: The chef is putting the bread on top of the burger.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The chef is putting the bread on top of the burger.\nSentence B: The chef is removing the bread from the top of the burger.", "gt-answer": "Sentence A: The chef is putting the bread on top of the burger.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 } ] }, "1085910917": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is drinking in the video.\nSentence B: The woman is sleeping.", "gt-answer": "Sentence A: The woman is drinking in the video.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The woman is drinking in the video.\nCaption B: A woman is dancing in the video.", "gt-answer": "Caption A: The woman is drinking in the video.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is drinking in the video.\nOption 2: She is cooking.", "gt-answer": "Option 1: The woman is drinking in the video.", "video-llm-prediction": "Option 1: The woman is drinking in the video.", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The person is turning back and then drinking a glass of water.\nSentence B: The person is drinking a glass of water and then turning back.", "gt-answer": "Sentence B: The person is drinking a glass of water and then turning back.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The person is turning back and drinking a glass of water at the same time.\nCaption B: The person is drinking a glass of water and then turning back.", "gt-answer": "Caption B: The person is drinking a glass of water and then turning back.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person is drinking a glass of water after turning back.\nOption 2: The person is drinking a glass of water and then turning back.", "gt-answer": "Option 2: The person is drinking a glass of water and then turning back.", "video-llm-prediction": "Option 2: The person is drinking a glass of water and then turning back.", "match_success": true, "rating": 1 } ] }, "1058907236": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The squirrel is sleeping.\nSentence B: The squirrel is foraging.", "gt-answer": "Sentence B: The squirrel is foraging.", "video-llm-prediction": "Sentence B: The squirrel is foraging.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The squirrel is swimming in the park.\nOption 2: The squirrel is foraging.", "gt-answer": "Option 2: The squirrel is foraging.", "video-llm-prediction": "Option 2: The squirrel is foraging.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The squirrel is foraging.\nCaption B: The squirrel is dancing in the park.", "gt-answer": "Caption A: The squirrel is foraging.", "video-llm-prediction": "A", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which caption matches the video better?\nCaption A: The correct order of events is picking up a nut followed by turning back.\nCaption B: The correct order of events is picking up a nut after turning back.", "gt-answer": "Caption A: The correct order of events is picking up a nut followed by turning back.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The correct order of events is turning back to pick up a nut.\nOption 2: The correct order of events is picking up a nut followed by turning back.", "gt-answer": "Option 2: The correct order of events is picking up a nut followed by turning back.", "video-llm-prediction": "Option 1: The correct order of events is turning back to pick up a nut.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The squirrel is turning back followed by picking up a nut.\nSentence B: The correct order of events is picking up a nut followed by turning back.", "gt-answer": "Sentence B: The correct order of events is picking up a nut followed by turning back.", "video-llm-prediction": "Sentence A: The squirrel is turning back followed by picking up a nut.", "match_success": true, "rating": 0 } ] }, "1077575504": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The pile of salt is shrinking.\nOption 2: The pile of salt is growing.", "gt-answer": "Option 2: The pile of salt is growing.", "video-llm-prediction": "Option 1: The pile of salt is shrinking.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The pile of salt is melting.\nSentence B: The pile of salt is growing.", "gt-answer": "Sentence B: The pile of salt is growing.", "video-llm-prediction": "Sentence A: The pile of salt is melting.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The pile of salt is growing.\nCaption B: The pile of salt is disappearing.", "gt-answer": "Caption A: The pile of salt is growing.", "video-llm-prediction": "Caption B: The pile of salt is disappearing.", "match_success": true, "rating": 0 } ] }, "23370985": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The ice crystal is forming.\nSentence B: The ice crystal is shattering.", "gt-answer": "Sentence A: The ice crystal is forming.", "video-llm-prediction": "Sentence A: The ice crystal is forming.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The ice crystal is melting.\nCaption B: The ice crystal is forming.", "gt-answer": "Caption B: The ice crystal is forming.", "video-llm-prediction": "Caption B: The ice crystal is forming.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The ice crystal is reducing in size.\nOption 2: The ice crystal is growing in size.", "gt-answer": "Option 2: The ice crystal is growing in size.", "video-llm-prediction": "Option 1: The ice crystal is reducing in size.", "match_success": true, "rating": 0 } ] }, "1100319395": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The circular pattern is moving counter-clockwise.\nCaption B: The circular pattern is moving clockwise.", "gt-answer": "Caption B: The circular pattern is moving clockwise.", "video-llm-prediction": "Caption B: The circular pattern is moving clockwise.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The circular pattern is moving clockwise.\nOption 2: The circular pattern is spinning counter-clockwise.", "gt-answer": "Option 1: The circular pattern is moving clockwise.", "video-llm-prediction": "Option 1: The circular pattern is moving clockwise.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The circular pattern is vibrating back and forth.\nSentence B: The circular pattern is moving clockwise.", "gt-answer": "Sentence B: The circular pattern is moving clockwise.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 } ] }, "1067268406": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: At the same speed as the camera, the car is moving.\nCaption B: The car is moving faster than the camera.", "gt-answer": "Caption B: The car is moving faster than the camera.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The car is moving slower than the camera.\nSentence B: The car is moving faster than the camera.", "gt-answer": "Sentence B: The car is moving faster than the camera.", "video-llm-prediction": "Sentence A: The car is moving slower than the camera.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The car and the camera have the same speed.\nOption 2: The car is moving faster than the camera.", "gt-answer": "Option 2: The car is moving faster than the camera.", "video-llm-prediction": "Option 1: The car and the camera have the same speed.", "match_success": true, "rating": 0 } ] }, "1092682551": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The cars are traveling at the same speed as the camera.\nOption 2: The cars are traveling slower than the camera.", "gt-answer": "Option 2: The cars are traveling slower than the camera.", "video-llm-prediction": "Option 2: The cars are traveling slower than the camera.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The cars are traveling slower than the camera.\nSentence B: The cars are traveling faster than the camera.", "gt-answer": "Sentence A: The cars are traveling slower than the camera.", "video-llm-prediction": "Sentence A: The cars are traveling slower than the camera.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The cars are traveling slower than the camera.\nCaption B: The cars and the camera have the same speed.", "gt-answer": "Caption A: The cars are traveling slower than the camera.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1101200293": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera is moving counterclockwise around the aircraft carrier.\nSentence B: The camera is moving clockwise around the aircraft carrier.", "gt-answer": "Sentence B: The camera is moving clockwise around the aircraft carrier.", "video-llm-prediction": "Sentence A: The camera is moving counterclockwise around the aircraft carrier.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The camera is moving clockwise around the aircraft carrier.\nCaption B: The camera is tilting upwardsaround the aircraft carrier.", "gt-answer": "Caption A: The camera is moving clockwise around the aircraft carrier.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The camera is moving clockwise around the aircraft carrier.\nOption 2: The camera is moving back and forth around the aircraft carrier.", "gt-answer": "Option 1: The camera is moving clockwise around the aircraft carrier.", "video-llm-prediction": "Option 1: The camera is moving clockwise around the aircraft carrier.", "match_success": true, "rating": 1 } ] }, "1072453637": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera is panning to the back of a fighter jet.\nSentence B: The camera is panning downwards around a fighter jet.", "gt-answer": "Sentence A: The camera is panning to the back of a fighter jet.", "video-llm-prediction": "Sentence A: The camera is panning to the back of a fighter jet.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The camera is panning to the back of a fighter jet.\nOption 2: The camera is panning upwards around a fighter jet.", "gt-answer": "Option 1: The camera is panning to the back of a fighter jet.", "video-llm-prediction": "Option 2: The camera is panning upwards around a fighter jet.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The camera is panning to the back of a fighter jet.\nCaption B: The camera is panning to the front of a fighter jet.", "gt-answer": "Caption A: The camera is panning to the back of a fighter jet.", "video-llm-prediction": "A", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which caption matches the video better?\nCaption A: The fighter jet first shot one missile, then shot two missiles.\nCaption B: The fighter jet shot three missiles, then shot four missiles.", "gt-answer": "Caption A: The fighter jet first shot one missile, then shot two missiles.", "video-llm-prediction": "A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The fighter jet shot two missiles.\nOption 2: The fighter jet first shot one missile, then shot two missiles.", "gt-answer": "Option 2: The fighter jet first shot one missile, then shot two missiles.", "video-llm-prediction": "Option 2: The fighter jet first shot one missile, then shot two missiles.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The fighter jet first shot one missile, then shot two missiles.\nSentence B: The fighter jet shot four missiles, then shot three missiles.", "gt-answer": "Sentence A: The fighter jet first shot one missile, then shot two missiles.", "video-llm-prediction": "Sentence A: The fighter jet first shot one missile, then shot two missiles.", "match_success": true, "rating": 1 } ] }, "3396473605": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The five stars appear from left to right.\nSentence B: The five stars disappear from right to left.", "gt-answer": "Sentence A: The five stars appear from left to right.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The five stars appear from left to right.\nOption 2: The five stars appear from top to bottom.", "gt-answer": "Option 1: The five stars appear from left to right.", "video-llm-prediction": "Option 2: The five stars appear from top to bottom.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The five stars appear from left to right.\nCaption B: The five stars appear from bottom to top.", "gt-answer": "Caption A: The five stars appear from left to right.", "video-llm-prediction": "Caption B: The five stars appear from bottom to top.", "match_success": true, "rating": 0 } ] }, "1086820415": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The large wave is moving horizontally towards the right side of the screen.\nSentence B: The large wave is moving vertically towards the bottom of the screen.", "gt-answer": "Sentence B: The large wave is moving vertically towards the bottom of the screen.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The large wave is moving vertically towards the top of the screen.\nOption 2: The large wave is moving vertically towards the bottom of the screen.", "gt-answer": "Option 2: The large wave is moving vertically towards the bottom of the screen.", "video-llm-prediction": "Option 1: The large wave is moving vertically towards the top of the screen.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The large wave is moving vertically towards the bottom of the screen.\nCaption B: The large wave is moving diagonally towards the top-left corner.", "gt-answer": "Caption A: The large wave is moving vertically towards the bottom of the screen.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1081513610": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: A man in the video is discussing something.\nOption 2: The man in the video is swimming.", "gt-answer": "Option 1: A man in the video is discussing something.", "video-llm-prediction": "Option 1: A man in the video is discussing something.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A man in the video is discussing something.\nCaption B: The man in the video is sleeping.", "gt-answer": "Caption A: A man in the video is discussing something.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man in the video is running.\nSentence B: A man in the video is discussing something.", "gt-answer": "Sentence B: A man in the video is discussing something.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ] }, "1073831006": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: Zebras are running.\nSentence B: Zebras are swimming.", "gt-answer": "Sentence A: Zebras are running.", "video-llm-prediction": "Sentence A: Zebras are running.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: Zebras are sleeping.\nCaption B: Zebras are running.", "gt-answer": "Caption B: Zebras are running.", "video-llm-prediction": "B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Zebras are walking.\nOption 2: Zebras are running.", "gt-answer": "Option 2: Zebras are running.", "video-llm-prediction": "Option 2: Zebras are running.", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which caption matches the video better?\nCaption A: The zebras are moving from right to left.\nCaption B: The zebras are moving from left to right.", "gt-answer": "Caption B: The zebras are moving from left to right.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The zebras are moving from left to right.\nOption 2: The zebras are moving from top to bottom.", "gt-answer": "Option 1: The zebras are moving from left to right.", "video-llm-prediction": "Option 1: The zebras are moving from left to right.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The zebras are moving away from the camera.\nSentence B: The zebras are moving from left to right.", "gt-answer": "Sentence B: The zebras are moving from left to right.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The video shows zebras moving in slow motion.\nSentence B: The zebras are stationary.", "gt-answer": "Sentence A: The video shows zebras moving in slow motion.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video shows zebras moving in slow motion.\nCaption B: The video is playing at normal speed.", "gt-answer": "Caption A: The video shows zebras moving in slow motion.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video shows zebras moving in slow motion.\nOption 2: The video is playing in time-lapse.", "gt-answer": "Option 1: The video shows zebras moving in slow motion.", "video-llm-prediction": "Option 1: The video shows zebras moving in slow motion.", "match_success": true, "rating": 1 } ] }, "1069338034": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The elephants are sleeping and resting.\nSentence B: The elephants are eating and drinking in the video.", "gt-answer": "Sentence B: The elephants are eating and drinking in the video.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The elephants are eating and drinking in the video.\nCaption B: They are swimming and taking a bath in the video.", "gt-answer": "Caption A: The elephants are eating and drinking in the video.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The elephants are eating and drinking in the video.\nOption 2: The elephants are eating and playing.", "gt-answer": "Option 1: The elephants are eating and drinking in the video.", "video-llm-prediction": "Option 1: The elephants are eating and drinking in the video.", "match_success": true, "rating": 1 } ] }, "1076880707": { "action": [ { "question": "Which caption matches the video better?\nCaption A: A bear is eating honey near the tree.\nCaption B: A bear is scratching against a tree.", "gt-answer": "Caption B: A bear is scratching against a tree.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The bear is climbing a tree.\nSentence B: A bear is scratching against a tree.", "gt-answer": "Sentence B: A bear is scratching against a tree.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A bear is scratching against a tree.\nOption 2: The bear is taking a nap under the tree.", "gt-answer": "Option 1: A bear is scratching against a tree.", "video-llm-prediction": "Option 1: A bear is scratching against a tree.", "match_success": true, "rating": 1 } ] }, "1092278063": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The two rows of duck toy are moving in the same direction on a conveyor belt.\nSentence B: The two rows of duck toy are moving in opposite directions on a conveyor belt.", "gt-answer": "Sentence B: The two rows of duck toy are moving in opposite directions on a conveyor belt.", "video-llm-prediction": "Sentence A: The two rows of duck toy are moving in the same direction on a conveyor belt.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The two rows of duck toy are moving in opposite directions on a conveyor belt.\nOption 2: The two rows of duck toy are not moving on a conveyor belt.", "gt-answer": "Option 1: The two rows of duck toy are moving in opposite directions on a conveyor belt.", "video-llm-prediction": "Option 1: The two rows of duck toy are moving in opposite directions on a conveyor belt.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The movement pattern of the two rows of duck toy on a conveyor belt is circular motion.\nCaption B: The two rows of duck toy are moving in opposite directions on a conveyor belt.", "gt-answer": "Caption B: The two rows of duck toy are moving in opposite directions on a conveyor belt.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ], "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The conveyor belt is moving at a super fast speed.\nOption 2: The speed of the conveyor belt is normal.", "gt-answer": "Option 2: The speed of the conveyor belt is normal.", "video-llm-prediction": "Option 1: The conveyor belt is moving at a super fast speed.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The conveyor belt is stopped.\nCaption B: The speed of the conveyor belt is normal.", "gt-answer": "Caption B: The speed of the conveyor belt is normal.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The speed of the conveyor belt is normal.\nSentence B: The conveyor belt is moving slowly.", "gt-answer": "Sentence A: The speed of the conveyor belt is normal.", "video-llm-prediction": "Sentence B: The conveyor belt is moving slowly.", "match_success": true, "rating": 0 } ] }, "1024867412": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The dog is sleeping.\nCaption B: The dog is driving a car.", "gt-answer": "Caption B: The dog is driving a car.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A dog is driving a car.\nOption 2: A video showing a dog eating.", "gt-answer": "Option 1: A dog is driving a car.", "video-llm-prediction": "Option 1: A dog is driving a car.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The dog is barking.\nSentence B: A dog is driving a car.", "gt-answer": "Sentence B: A dog is driving a car.", "video-llm-prediction": "B", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which caption matches the video better?\nCaption A: The car is taking a right turn.\nCaption B: The car is taking a left turn.", "gt-answer": "Caption A: The car is taking a right turn.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The car is going backward.\nSentence B: The car is taking a right turn.", "gt-answer": "Sentence B: The car is taking a right turn.", "video-llm-prediction": "Sentence A: The car is going backward.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The car is taking a right turn.\nOption 2: The car is moving forward.", "gt-answer": "Option 1: The car is taking a right turn.", "video-llm-prediction": "Option 2: The car is moving forward.", "match_success": true, "rating": 0 } ] }, "1076877854": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The robots are caring for seedlings.\nCaption B: The robots are watering seedlings.", "gt-answer": "Caption A: The robots are caring for seedlings.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The robots are caring for seedlings.\nOption 2: The robots are harvesting crops.", "gt-answer": "Option 1: The robots are caring for seedlings.", "video-llm-prediction": "Option 1: The robots are caring for seedlings.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The robots are plowing the soil.\nSentence B: The robots are caring for seedlings.", "gt-answer": "Sentence B: The robots are caring for seedlings.", "video-llm-prediction": "Sentence B: The robots are caring for seedlings.", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The robotic arm is moving to the left side of the screen.\nSentence B: The closest robotic arm is moving to the right side of the screen.", "gt-answer": "Sentence B: The closest robotic arm is moving to the right side of the screen.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The closest robotic arm is moving to the right side of the screen.\nCaption B: The robotic arm is moving away from the camera.", "gt-answer": "Caption A: The closest robotic arm is moving to the right side of the screen.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The robotic arm is rotating.\nOption 2: The closest robotic arm is moving to the right side of the screen.", "gt-answer": "Option 2: The closest robotic arm is moving to the right side of the screen.", "video-llm-prediction": "Option 1: The robotic arm is rotating.", "match_success": true, "rating": 0 } ] }, "1104673225": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is making salad.\nOption 2: The woman is eating salad.", "gt-answer": "Option 2: The woman is eating salad.", "video-llm-prediction": "Option 2: The woman is eating salad.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is wrapping up salad.\nSentence B: The woman is eating salad.", "gt-answer": "Sentence B: The woman is eating salad.", "video-llm-prediction": "Sentence A: The woman is wrapping up salad.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The woman is eating salad.\nCaption B: The woman is throwing away salad.", "gt-answer": "Caption A: The woman is eating salad.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1100468035": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The camera is zooming backward in the video.\nCaption B: The camera is zooming forward in the video.", "gt-answer": "Caption B: The camera is zooming forward in the video.", "video-llm-prediction": "Caption B: The camera is zooming forward in the video.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera is zooming forward in the video.\nSentence B: The camera is tracking right.", "gt-answer": "Sentence A: The camera is zooming forward in the video.", "video-llm-prediction": "Sentence A: The camera is zooming forward in the video.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The camera is panning left.\nOption 2: The camera is zooming forward in the video.", "gt-answer": "Option 2: The camera is zooming forward in the video.", "video-llm-prediction": "Option 2: The camera is zooming forward in the video.", "match_success": true, "rating": 1 } ], "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The door is gradually closing.\nCaption B: The door is gradually opening.", "gt-answer": "Caption B: The door is gradually opening.", "video-llm-prediction": "Caption B: The door is gradually opening.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The door is gradually opening.\nSentence B: The door is remaining closed.", "gt-answer": "Sentence A: The door is gradually opening.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The door is gradually opening.\nOption 2: The door is gradually closing.", "gt-answer": "Option 1: The door is gradually opening.", "video-llm-prediction": "Option 2: The door is gradually closing.", "match_success": true, "rating": 0 } ] }, "315784": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The speedometer in the center is moving faster than the one on the left.\nOption 2: The speedometer in the center is moving slower than the one on the left.", "gt-answer": "Option 1: The speedometer in the center is moving faster than the one on the left.", "video-llm-prediction": "Option 1: The speedometer in the center is moving faster than the one on the left.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The speedometer in the center is moving faster than the one on the left.\nSentence B: The speedometer in the center is stationary.", "gt-answer": "Sentence A: The speedometer in the center is moving faster than the one on the left.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The speedometer in the center is moving at the same speed as the one on the left.\nCaption B: The speedometer in the center is moving faster than the one on the left.", "gt-answer": "Caption B: The speedometer in the center is moving faster than the one on the left.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "3411806181": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: A person is cleaning an electric vehicle.\nSentence B: A person is recharging an electric vehicle.", "gt-answer": "Sentence B: A person is recharging an electric vehicle.", "video-llm-prediction": "Sentence B: A person is recharging an electric vehicle.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A person is recharging an electric vehicle.\nCaption B: A person is repairing an electric vehicle.", "gt-answer": "Caption A: A person is recharging an electric vehicle.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person is repairing an EV.\nOption 2: A person is recharging an electric vehicle.", "gt-answer": "Option 2: A person is recharging an electric vehicle.", "video-llm-prediction": "Option 2: A person is recharging an electric vehicle.", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The person is unplugging the charging gun from the car.\nOption 2: The person is plugging the charging gun into the car.", "gt-answer": "Option 2: The person is plugging the charging gun into the car.", "video-llm-prediction": "Option 1: The person is unplugging the charging gun from the car.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The person is moving the charging gun away from the car.\nSentence B: The person is plugging the charging gun into the car.", "gt-answer": "Sentence B: The person is plugging the charging gun into the car.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The person is holding the charging gun still.\nCaption B: The person is plugging the charging gun into the car.", "gt-answer": "Caption B: The person is plugging the charging gun into the car.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing in fast motion.\nOption 2: The video is playing in slow motion.", "gt-answer": "Option 2: The video is playing in slow motion.", "video-llm-prediction": "Option 2: The video is playing in slow motion.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is playing in slow motion.\nSentence B: The video is playing at normal speed.", "gt-answer": "Sentence A: The video is playing in slow motion.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The video is time-lapse.\nCaption B: The video is playing in slow motion.", "gt-answer": "Caption B: The video is playing in slow motion.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "1043054206": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The yellow car is driving slower than other cars.\nSentence B: The yellow car is driving faster than other cars.", "gt-answer": "Sentence B: The yellow car is driving faster than other cars.", "video-llm-prediction": "Sentence B: The yellow car is driving faster than other cars.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The yellow car is driving faster than other cars.\nCaption B: The yellow car is currently not in motion.", "gt-answer": "Caption A: The yellow car is driving faster than other cars.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The yellow car is traveling at an equivalent speed to other cars.\nOption 2: The yellow car is driving faster than other cars.", "gt-answer": "Option 2: The yellow car is driving faster than other cars.", "video-llm-prediction": "Option 2: The yellow car is driving faster than other cars.", "match_success": true, "rating": 1 } ] }, "1096238125": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The man and woman are lifting weights.\nCaption B: The man and woman are doing sit-ups.", "gt-answer": "Caption A: The man and woman are lifting weights.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man and woman are lifting weights.\nSentence B: The man and woman are jumping rope.", "gt-answer": "Sentence A: The man and woman are lifting weights.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man and woman are lifting weights.\nOption 2: The man and woman are doing push-ups.", "gt-answer": "Option 1: The man and woman are lifting weights.", "video-llm-prediction": "Option 1: The man and woman are lifting weights.", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which caption matches the video better?\nCaption A: The man is lifting the kettlebell at roughly the same speed as the woman.\nCaption B: The man is lifting the kettlebell at a much slower speed than the woman.", "gt-answer": "Caption A: The man is lifting the kettlebell at roughly the same speed as the woman.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man is lifting the kettlebell at roughly the same speed as the woman.\nOption 2: The man is lifting the kettlebell faster than the woman.", "gt-answer": "Option 1: The man is lifting the kettlebell at roughly the same speed as the woman.", "video-llm-prediction": "Option 1: The man is lifting the kettlebell at roughly the same speed as the woman.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is significantly faster in lifting the kettlebell compared to the woman.\nSentence B: The man is lifting the kettlebell at roughly the same speed as the woman.", "gt-answer": "Sentence B: The man is lifting the kettlebell at roughly the same speed as the woman.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 } ] }, "1103088481": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The person is selling vegetables at the grocery store.\nSentence B: A person is buying vegetables at the grocery store.", "gt-answer": "Sentence B: A person is buying vegetables at the grocery store.", "video-llm-prediction": "Sentence B: A person is buying vegetables at the grocery store.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A person is buying vegetables at the market/grocery store.\nCaption B: The person is packaging vegetables at the market/grocery store.", "gt-answer": "Caption A: A person is buying vegetables at the market/grocery store.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A person is buying vegetables at the market/grocery store.\nOption 2: The person is cooking vegetables at the market/grocery store.", "gt-answer": "Option 1: A person is buying vegetables at the market/grocery store.", "video-llm-prediction": "Option 1: A person is buying vegetables at the market/grocery store.", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: First, a black vegetable is being placed in the trolley and then a green vegetable.\nSentence B: First, a green vegetable is being placed in the trolley and then a black vegetable.", "gt-answer": "Sentence B: First, a green vegetable is being placed in the trolley and then a black vegetable.", "video-llm-prediction": "Sentence B: First, a green vegetable is being placed in the trolley and then a black vegetable.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: Place a green vegetable into the trolley and then place a black vegetable into the trolley.\nCaption B: Place a black vegetable into the trolley and then place a green vegetable into the trolley.", "gt-answer": "Caption A: Place a green vegetable into the trolley and then place a black vegetable into the trolley.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The correct order of events in the video is to place a green vegetable into the trolley and then place a black vegetable into the trolley.\nOption 2: Only a green vegetable is being placed.", "gt-answer": "Option 1: The correct order of events in the video is to place a green vegetable into the trolley and then place a black vegetable into the trolley.", "video-llm-prediction": "Option 1: The correct order of events in the video is to place a green vegetable into the trolley and then place a black vegetable into the trolley.", "match_success": true, "rating": 1 } ] }, "1055981522": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The 3d shapes are transforming from cubes into spheres.\nCaption B: The 3d shapes are transforming from spheres into cubes.", "gt-answer": "Caption A: The 3d shapes are transforming from cubes into spheres.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The 3d shapes are transforming from spheres into cubes.\nOption 2: The 3d shapes are transforming from cubes into spheres.", "gt-answer": "Option 2: The 3d shapes are transforming from cubes into spheres.", "video-llm-prediction": "Option 1: The 3d shapes are transforming from spheres into cubes.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The 3d shapes are transforming from cubes into spheres.\nSentence B: The 3d shapes are transforming from spheres into cubes.", "gt-answer": "Sentence A: The 3d shapes are transforming from cubes into spheres.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 } ] }, "1061017846": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The boat on the top of the screen is moving faster than the boat on the bottom of the screen.\nSentence B: The boat on the top of the screen travels at the same speed as the boat on the bottom of the screen.", "gt-answer": "Sentence B: The boat on the top of the screen travels at the same speed as the boat on the bottom of the screen.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The boat on the top of the screen is moving slower than the boat on the bottom of the screen.\nOption 2: The boat on the top of the screen travels at the same speed as the boat on the bottom of the screen.", "gt-answer": "Option 2: The boat on the top of the screen travels at the same speed as the boat on the bottom of the screen.", "video-llm-prediction": "Option 1: The boat on the top of the screen is moving slower than the boat on the bottom of the screen.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The boat on the top of the screen travels at the same speed as the boat on the bottom of the screen.\nCaption B: No, the boat on the top of the screen is not moving at the same speed.", "gt-answer": "Caption A: The boat on the top of the screen travels at the same speed as the boat on the bottom of the screen.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1097341567": { "order": [ { "question": "Which caption matches the video better?\nCaption A: First, the bars appear, then the arrow appears.\nCaption B: First, the arrow appears, then the bars appear.", "gt-answer": "Caption A: First, the bars appear, then the arrow appears.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The bars and the arrow appear at the same time.\nOption 2: First, the bars appear, then the arrow appears.", "gt-answer": "Option 2: First, the bars appear, then the arrow appears.", "video-llm-prediction": "Option 2: The bars and the arrow appear at the same time.", "match_success": false, "chatgpt-response": "Incorrect", "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: First, the bars appear, then the arrow appears.\nSentence B: Only the arrow appears in the video.", "gt-answer": "Sentence A: First, the bars appear, then the arrow appears.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1056433484": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The Earth moves around the Sun in a counter-clockwise direction.\nSentence B: The Earth moves around the Sun in a clockwise direction.", "gt-answer": "Sentence A: The Earth moves around the Sun in a counter-clockwise direction.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The Earth moves around the Sun in a counter-clockwise direction.\nCaption B: The Earth travels upwards in its orbit around the Sun.", "gt-answer": "Caption A: The Earth moves around the Sun in a counter-clockwise direction.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The Earth moves around the Sun in a counter-clockwise direction.\nOption 2: The Earth travels from vertically in its orbit around the Sun.", "gt-answer": "Option 1: The Earth moves around the Sun in a counter-clockwise direction.", "video-llm-prediction": "Option 1: The Earth moves around the Sun in a counter-clockwise direction.", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The earth rotates slower than the sun.\nOption 2: The earth rotates faster than the sun.", "gt-answer": "Option 2: The earth rotates faster than the sun.", "video-llm-prediction": "Option 2: The earth rotates faster than the sun.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The speed of the earth's rotation is faster than the sun.\nSentence B: The earth and sun rotate at equal speeds.", "gt-answer": "Sentence A: The speed of the earth's rotation is faster than the sun.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The speed of the earth's rotation is faster than the sun.\nCaption B: The sun rotates faster than the earth.", "gt-answer": "Caption A: The speed of the earth's rotation is faster than the sun.", "video-llm-prediction": "Caption B: The sun rotates faster than the earth.", "match_success": true, "rating": 0 } ] }, "1073914229": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The trains are moving towards each other.\nSentence B: The trains are moving away from each other.", "gt-answer": "Sentence A: The trains are moving towards each other.", "video-llm-prediction": "Sentence A: The trains are moving towards each other.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The trains are moving away from each other.\nOption 2: The trains are moving towards each other.", "gt-answer": "Option 2: The trains are moving towards each other.", "video-llm-prediction": "Option 1: The trains are moving away from each other.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The trains are moving towards each other.\nCaption B: The trains are moving away from each other.", "gt-answer": "Caption A: The trains are moving towards each other.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1099360359": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The train closer to the screen drives faster.\nSentence B: The train further away from the screen drives faster.", "gt-answer": "Sentence B: The train further away from the screen drives faster.", "video-llm-prediction": "Sentence B: The train further away from the screen drives faster.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The train further away from the screen drives faster.\nCaption B: The train further away from the screen drives slower.", "gt-answer": "Caption A: The train further away from the screen drives faster.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The two trains drive at the same speed.\nOption 2: The train further away from the screen drives faster.", "gt-answer": "Option 2: The train further away from the screen drives faster.", "video-llm-prediction": "Option 2: The train further away from the screen drives faster.", "match_success": true, "rating": 1 } ] }, "1098970151": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The cat is licking its paw.\nOption 2: The cat is waving its paw.", "gt-answer": "Option 1: The cat is licking its paw.", "video-llm-prediction": "Option 1: The cat is licking its paw.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The cat is licking its paw.\nSentence B: The cat is biting its paw.", "gt-answer": "Sentence A: The cat is licking its paw.", "video-llm-prediction": "Sentence A: The cat is licking its paw.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The cat is licking its paw.\nCaption B: The cat is licking its belly.", "gt-answer": "Caption A: The cat is licking its paw.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1081734740": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The leaf is turning from green to yellow.\nOption 2: The leaf is turning from yellow to green.", "gt-answer": "Option 1: The leaf is turning from green to yellow.", "video-llm-prediction": "Option 1: The leaf is turning from green to yellow.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The texture of the leaf is turning from green to yellow.\nCaption B: The leaf is turning from green to white.", "gt-answer": "Caption A: The texture of the leaf is turning from green to yellow.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The texture of the leaf is turning from yellow to green.\nSentence B: The texture of the leaf is turning from green to yellow.", "gt-answer": "Sentence B: The texture of the leaf is turning from green to yellow.", "video-llm-prediction": "Sentence A: The texture of the leaf is turning from yellow to green.", "match_success": true, "rating": 0 } ] }, "1093719871": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The pink flowers are turning from full blossom to withered.\nSentence B: The pink flowers are turning from withered to full blossom.", "gt-answer": "Sentence B: The pink flowers are turning from withered to full blossom.", "video-llm-prediction": "Sentence A: The pink flowers are turning from full blossom to withered.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The pink flowers are growing taller.\nOption 2: The pink flowers are turning from withered to full blossom.", "gt-answer": "Option 2: The pink flowers are turning from withered to full blossom.", "video-llm-prediction": "Option 2: The pink flowers are turning from withered to full blossom.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The pink flowers are falling off petals.\nCaption B: The pink flowers are turning from withered to full blossom.", "gt-answer": "Caption B: The pink flowers are turning from withered to full blossom.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1042819039": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is playing in slow motion.\nOption 2: The video is playing in time-lapse.", "gt-answer": "Option 2: The video is playing in time-lapse.", "video-llm-prediction": "Option 1: The video is playing in slow motion.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is playing in time-lapse.\nSentence B: The video is playing at normal speed.", "gt-answer": "Sentence A: The video is playing in time-lapse.", "video-llm-prediction": "Sentence A: The video is playing in time-lapse.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is playing in time-lapse.\nCaption B: The video is playing in slow motion.", "gt-answer": "Caption A: The video is playing in time-lapse.", "video-llm-prediction": "Caption B: The video is playing in slow motion.", "match_success": true, "rating": 0 } ], "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The street was filled with people, then it was filled with vehicles.\nOption 2: The street was filled with vehicles, then it was filled with people.", "gt-answer": "Option 1: The street was filled with people, then it was filled with vehicles.", "video-llm-prediction": "Option 2: The street was filled with vehicles, then it was filled with people.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The street was filled with vehicles, then it was filled with people.\nCaption B: The street was filled with people, then it was filled with vehicles.", "gt-answer": "Caption B: The street was filled with people, then it was filled with vehicles.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The street was filled with people, then it was filled with vehicles.\nSentence B: The street was completely empty and then filled with vehicles.", "gt-answer": "Sentence A: The street was filled with people, then it was filled with vehicles.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1093139559": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera is zooming out from three football players.\nSentence B: The camera is zooming into three football players.", "gt-answer": "Sentence B: The camera is zooming into three football players.", "video-llm-prediction": "Sentence A: The camera is zooming out from three football players.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The camera is zooming into three football players.\nCaption B: The camera is panning left.", "gt-answer": "Caption A: The camera is zooming into three football players.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The camera is zooming into three football players.\nOption 2: The camera is panning right.", "gt-answer": "Option 1: The camera is zooming into three football players.", "video-llm-prediction": "Option 1: The camera is zooming into three football players.", "match_success": true, "rating": 1 } ], "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The light on the football players is turning brighter.\nOption 2: The light on the football players is turning darker.", "gt-answer": "Option 1: The light on the football players is turning brighter.", "video-llm-prediction": "Option 2: The light on the football players is turning darker.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The light on the football players is staying the same.\nCaption B: The light on the football players is turning brighter.", "gt-answer": "Caption B: The light on the football players is turning brighter.", "video-llm-prediction": "Caption B: The light on the football players is turning brighter.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The light on the football players is dimming.\nSentence B: The light on the football players is turning brighter.", "gt-answer": "Sentence B: The light on the football players is turning brighter.", "video-llm-prediction": "Sentence B: The light on the football players is turning brighter.", "match_success": true, "rating": 1 } ] }, "1055973887": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The fire ball is getting smaller.\nOption 2: The fire ball is getting larger.", "gt-answer": "Option 2: The fire ball is getting larger.", "video-llm-prediction": "Option 2: The fire ball is getting larger.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The fire ball is shrinking in size.\nSentence B: The fire ball is getting larger.", "gt-answer": "Sentence B: The fire ball is getting larger.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The fire ball is getting larger.\nCaption B: The fire ball is getting smaller.", "gt-answer": "Caption A: The fire ball is getting larger.", "video-llm-prediction": "Caption A: The fire ball is getting larger.", "match_success": true, "rating": 1 } ] }, "1098710341": { "action": [ { "question": "Which caption matches the video better?\nCaption A: A person is pouring water into a glass.\nCaption B: A person is drinking water from a glass.", "gt-answer": "Caption A: A person is pouring water into a glass.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A person is pouring water into a glass.\nSentence B: The person is washing hands with water.", "gt-answer": "Sentence A: A person is pouring water into a glass.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Water is being spilled on the floor.\nOption 2: A person is pouring water into a glass.", "gt-answer": "Option 2: A person is pouring water into a glass.", "video-llm-prediction": "Option 2: A person is pouring water into a glass.", "match_success": true, "rating": 1 } ], "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The glass is going from empty to full of water.\nOption 2: The glass is going from full of water to empty.", "gt-answer": "Option 1: The glass is going from empty to full of water.", "video-llm-prediction": "Option 1: The glass is going from empty to full of water.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The glass is going from full of water to empty.\nSentence B: The glass is going from empty to full of water.", "gt-answer": "Sentence B: The glass is going from empty to full of water.", "video-llm-prediction": "Sentence A: The glass is going from full of water to empty.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The glass is going from empty to full of water.\nCaption B: The glass is going from full of water to empty.", "gt-answer": "Caption A: The glass is going from empty to full of water.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1062736174": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The chef is chopping steak.\nCaption B: The chef is sprinkling salt on the steak.", "gt-answer": "Caption B: The chef is sprinkling salt on the steak.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The chef is sprinkling vegetables on the steak.\nSentence B: The chef is sprinkling salt on the steak.", "gt-answer": "Sentence B: The chef is sprinkling salt on the steak.", "video-llm-prediction": "Sentence B: The chef is sprinkling salt on the steak.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The chef is sprinkling salt on the steak.\nOption 2: The chef is grilling steak.", "gt-answer": "Option 1: The chef is sprinkling salt on the steak.", "video-llm-prediction": "Option 1: The chef is sprinkling salt on the steak.", "match_success": true, "rating": 1 } ] }, "1099116143": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: White foam appears on the surface of the pink liquid.\nCaption B: White foam disappears from the surface of the pink liquid.", "gt-answer": "Caption A: White foam appears on the surface of the pink liquid.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The pink liquid is freezing.\nOption 2: White foam appears on the surface of the pink liquid.", "gt-answer": "Option 2: White foam appears on the surface of the pink liquid.", "video-llm-prediction": "Option 2: White foam appears on the surface of the pink liquid.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: White foam appears on the surface of the pink liquid.\nSentence B: The pink liquid is evaporating.", "gt-answer": "Sentence A: White foam appears on the surface of the pink liquid.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1108398787": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The person is watching videos on a smartphone device.\nOption 2: The person is operating system settings on a smartphone device.", "gt-answer": "Option 2: The person is operating system settings on a smartphone device.", "video-llm-prediction": "Option 1: The person is watching videos on a smartphone device.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The person is making a phone call using a smartphone device.\nCaption B: The person is operating system settings on a smartphone device.", "gt-answer": "Caption B: The person is operating system settings on a smartphone device.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The person is operating system settings on a smartphone device.\nSentence B: The person is chatting on a smartphone device.", "gt-answer": "Sentence A: The person is operating system settings on a smartphone device.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which caption matches the video better?\nCaption A: The person is scrolling down and then entering passwords.\nCaption B: The person is entering passwords and then scrolling down.", "gt-answer": "Caption A: The person is scrolling down and then entering passwords.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The person is scrolling down and then entering passwords.\nSentence B: The person is entering passwords and then scrolling up.", "gt-answer": "Sentence A: The person is scrolling down and then entering passwords.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person is scrolling down and then entering passwords.\nOption 2: The person is scrolling up and then entering passwords.", "gt-answer": "Option 1: The person is scrolling down and then entering passwords.", "video-llm-prediction": "Option 1: The person is scrolling down and then entering passwords.", "match_success": true, "rating": 1 } ] }, "1080591311": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The jellyfish is floating downwards.\nCaption B: The jellyfish is floating upwards.", "gt-answer": "Caption B: The jellyfish is floating upwards.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The jellyfish is moving towards the right.\nSentence B: The jellyfish is floating upwards.", "gt-answer": "Sentence B: The jellyfish is floating upwards.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The jellyfish is sinking downwards.\nOption 2: The jellyfish is floating upwards.", "gt-answer": "Option 2: The jellyfish is floating upwards.", "video-llm-prediction": "Option 1: The jellyfish is sinking downwards.", "match_success": true, "rating": 0 } ] }, "1107034223": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The bear is catching fishes in the river.\nOption 2: The bear is swallowing fishes in the river.", "gt-answer": "Option 1: The bear is catching fishes in the river.", "video-llm-prediction": "Option 2: The bear is swallowing fishes in the river.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The bear is catching fishes in the river.\nCaption B: The bear is climbing trees.", "gt-answer": "Caption A: The bear is catching fishes in the river.", "video-llm-prediction": "A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The bear is catching fishes in the river.\nSentence B: The bear is sleeping in the river.", "gt-answer": "Sentence A: The bear is catching fishes in the river.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The bear turned back and then caught a fish.\nOption 2: The bear caught a fish and then turned back.", "gt-answer": "Option 2: The bear caught a fish and then turned back.", "video-llm-prediction": "Option 2: The bear caught a fish and then turned back.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The bear caught a fish and then turned back.\nSentence B: The bear caught a fish and then climbed up a tree.", "gt-answer": "Sentence A: The bear caught a fish and then turned back.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The bear caught a fish and then turned back.\nCaption B: The bear jumped into the water and then caught a fish.", "gt-answer": "Caption A: The bear caught a fish and then turned back.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1025623079": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The skyscraper is growing taller.\nCaption B: The skyscraper is collapsing.", "gt-answer": "Caption B: The skyscraper is collapsing.", "video-llm-prediction": "Caption B: The skyscraper is collapsing.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The skyscraper is growing taller.\nOption 2: The skyscraper is collapsing.", "gt-answer": "Option 2: The skyscraper is collapsing.", "video-llm-prediction": "Option 2: The skyscraper is collapsing.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The skyscraper is standing up.\nSentence B: The skyscraper is collapsing.", "gt-answer": "Sentence B: The skyscraper is collapsing.", "video-llm-prediction": "Sentence B: The skyscraper is collapsing.", "match_success": true, "rating": 1 } ] }, "1094782389": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The dog is running towards a woman.\nCaption B: The dog is giving a high five to a woman.", "gt-answer": "Caption B: The dog is giving a high five to a woman.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The dog is barking to a woman.\nSentence B: The dog is giving a high five to a woman.", "gt-answer": "Sentence B: The dog is giving a high five to a woman.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The dog is giving a high five to a woman.\nOption 2: The dog is catching food from a woman.", "gt-answer": "Option 1: The dog is giving a high five to a woman.", "video-llm-prediction": "Option 1: The dog is giving a high five to a woman.", "match_success": true, "rating": 1 } ] }, "3411228673": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera is moving upwards along the skyscraper buildings.\nSentence B: The camera is moving downwards along the skyscraper buildings.", "gt-answer": "Sentence A: The camera is moving upwards along the skyscraper buildings.", "video-llm-prediction": "Sentence A: The camera is moving upwards along the skyscraper buildings.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The movement in the video is upwards among skyscraper buildings.\nOption 2: The camera is moving backwards.", "gt-answer": "Option 1: The movement in the video is upwards among skyscraper buildings.", "video-llm-prediction": "Option 1: The movement in the video is upwards among skyscraper buildings.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The movement in the video is upwards among skyscraper buildings.\nCaption B: The camera is moving downwards.", "gt-answer": "Caption A: The movement in the video is upwards among skyscraper buildings.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1043487715": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: Children are diving into water.\nOption 2: Children are eating.", "gt-answer": "Option 1: Children are diving into water.", "video-llm-prediction": "Option 1: Children are diving into water.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: Children can be seen diving into water in the video.\nSentence B: A group of children is sleeping.", "gt-answer": "Sentence A: Children can be seen diving into water in the video.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: Children can be seen diving into water in the video.\nCaption B: The children are hiking.", "gt-answer": "Caption A: Children can be seen diving into water in the video.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The children are diving into water and then making gestures to the camera.\nSentence B: The children are making gestures to the camera and then diving into water.", "gt-answer": "Sentence A: The children are diving into water and then making gestures to the camera.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The children are diving into water after making gestures to the camera.\nCaption B: The children are diving into water and then making gestures to the camera.", "gt-answer": "Caption B: The children are diving into water and then making gestures to the camera.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The children are making gestures to the camera first and then diving into water.\nOption 2: The children are diving into water and then making gestures to the camera.", "gt-answer": "Option 2: The children are diving into water and then making gestures to the camera.", "video-llm-prediction": "Option 1: The children are making gestures to the camera first and then diving into water.", "match_success": true, "rating": 0 } ] }, "1093737353": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera is panning to the left.\nSentence B: The camera is panning to the right.", "gt-answer": "Sentence B: The camera is panning to the right.", "video-llm-prediction": "Sentence B: The camera is panning to the right.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The camera is panning to the right.\nCaption B: The camera is tilting up.", "gt-answer": "Caption A: The camera is panning to the right.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The camera is panning to the right.\nOption 2: The camera is zooming in.", "gt-answer": "Option 1: The camera is panning to the right.", "video-llm-prediction": "Option 2: The camera is zooming in.", "match_success": true, "rating": 0 } ], "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The video shows a beaker followed by a line of test tubes.\nSentence B: The video shows a line of test tubes followed by a beaker.", "gt-answer": "Sentence B: The video shows a line of test tubes followed by a beaker.", "video-llm-prediction": "Sentence A: The video shows a beaker followed by a line of test tubes.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: In the video, a microscope and then a beaker are shown.\nOption 2: The video shows a line of test tubes followed by a beaker.", "gt-answer": "Option 2: The video shows a line of test tubes followed by a beaker.", "video-llm-prediction": "Option 1: In the video, a microscope and then a beaker are shown.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The video shows a line of test tubes followed by a beaker.\nCaption B: A beaker is shown, followed by a line of test tubes.", "gt-answer": "Caption A: The video shows a line of test tubes followed by a beaker.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1063373449": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The coronavirus is dissolving.\nOption 2: The coronavirus is enlarging.", "gt-answer": "Option 1: The coronavirus is dissolving.", "video-llm-prediction": "Option 2: The coronavirus is enlarging.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The coronavirus is dissolving under the attack of antibodies.\nSentence B: The coronavirus is assembling together.", "gt-answer": "Sentence A: The coronavirus is dissolving under the attack of antibodies.", "video-llm-prediction": "Sentence A: The coronavirus is dissolving under the attack of antibodies.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The coronavirus is dissolving under the attack of antibodies.\nCaption B: The coronavirus is assembling together.", "gt-answer": "Caption A: The coronavirus is dissolving under the attack of antibodies.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1017768724": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The person is moving leftwards.\nOption 2: The person is moving rightwards.", "gt-answer": "Option 2: The person is moving rightwards.", "video-llm-prediction": "Option 2: The person is moving rightwards.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The person is moving rightwards.\nSentence B: The person is moving upwards.", "gt-answer": "Sentence A: The person is moving rightwards.", "video-llm-prediction": "Sentence A: The person is moving rightwards.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The person is moving downwards.\nCaption B: The person is moving rightwards.", "gt-answer": "Caption B: The person is moving rightwards.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1106280325": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is playing guitar.\nOption 2: The woman is reading a book.", "gt-answer": "Option 2: The woman is reading a book.", "video-llm-prediction": "Option 2: The woman is reading a book.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The woman is reading a book.\nCaption B: The woman is hiking.", "gt-answer": "Caption A: The woman is reading a book.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is reading a book.\nSentence B: The woman is dancing.", "gt-answer": "Sentence A: The woman is reading a book.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: A body of water is shown, then a girl is shown reading a book\nOption 2: A girl is shown reading a book, then a body of water is shown.", "gt-answer": "Option 2: A girl is shown reading a book, then a body of water is shown.", "video-llm-prediction": "Option 2: A girl is shown reading a book, then a body of water is shown.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The girl and the river appear at the same time in the video.\nSentence B: In the video, a girl is shown reading a book, then a body of water is shown.", "gt-answer": "Sentence B: In the video, a girl is shown reading a book, then a body of water is shown.", "video-llm-prediction": "Sentence A: The girl and the river appear at the same time in the video.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The video shows a girl reading a book and a body of water simultaneously.\nCaption B: In the video, a girl is shown reading a book, then a body of water is shown.", "gt-answer": "Caption B: In the video, a girl is shown reading a book, then a body of water is shown.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1100117263": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The little lion is jumping onto the back of an adult lion.\nSentence B: The little lion is falling from the back of an adult lion.", "gt-answer": "Sentence B: The little lion is falling from the back of an adult lion.", "video-llm-prediction": "Sentence A: The little lion is jumping onto the back of an adult lion.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The little lion is sitting nearby an adult lion.\nCaption B: The little lion is falling from the back of an adult lion.", "gt-answer": "Caption B: The little lion is falling from the back of an adult lion.", "video-llm-prediction": "A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The little lion is falling from the back of an adult lion.\nOption 2: The little lion is sitting on the back of an adult lion.", "gt-answer": "Option 1: The little lion is falling from the back of an adult lion.", "video-llm-prediction": "Option 2: The little lion is sitting on the back of an adult lion.", "match_success": true, "rating": 0 } ] }, "15695290": { "action": [ { "question": "Which caption matches the video better?\nCaption A: A person is drawing a picture.\nCaption B: A person is playing soccer.", "gt-answer": "Caption A: A person is drawing a picture.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A person is drawing a picture.\nSentence B: The person is reading a book.", "gt-answer": "Sentence A: A person is drawing a picture.", "video-llm-prediction": "A person is drawing a picture.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A person is drawing a picture.\nOption 2: The person is dancing.", "gt-answer": "Option 1: A person is drawing a picture.", "video-llm-prediction": "Option 1: A person is drawing a picture.", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: In the video, first a house is drawn and then a family of people is drawn.\nSentence B: In the video, first a family of people is drawn and then a house is drawn.", "gt-answer": "Sentence A: In the video, first a house is drawn and then a family of people is drawn.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: In the video, first a house is drawn and then a family of people is drawn.\nCaption B: Both events, drawing the house and drawing the family, occur at the same time in the video.", "gt-answer": "Caption A: In the video, first a house is drawn and then a family of people is drawn.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The house being drawn occurs after the family of people is drawn.\nOption 2: In the video, first a house is drawn and then a family of people is drawn.", "gt-answer": "Option 2: In the video, first a house is drawn and then a family of people is drawn.", "video-llm-prediction": "Option 2: In the video, first a house is drawn and then a family of people is drawn.", "match_success": true, "rating": 1 } ] }, "1091090295": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: First, a bird is seen flying onto the back of a hippo. Then, a hippo pokes its head out of water.\nOption 2: First, a hippo pokes its head out of water. Then, a bird is seen flying onto the back of another hippo.", "gt-answer": "Option 1: First, a bird is seen flying onto the back of a hippo. Then, a hippo pokes its head out of water.", "video-llm-prediction": "Option 1", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A hippo puts its head into water before a bird flying onto the back of another hippo.\nSentence B: First, a bird is seen flying onto the back of a hippo. Then, a hippo pokes its head out of water.", "gt-answer": "Sentence B: First, a bird is seen flying onto the back of a hippo. Then, a hippo pokes its head out of water.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: First, a bird is seen flying onto the back of a hippo. Then, a hippo pokes its head out of water.\nCaption B: Simultaneously, a bird flying onto the back of a hippo and another hippo pokes its head out of water.", "gt-answer": "Caption A: First, a bird is seen flying onto the back of a hippo. Then, a hippo pokes its head out of water.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1021488277": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: A person is dropping down the pineapple.\nSentence B: A person is picking up the pineapple.", "gt-answer": "Sentence B: A person is picking up the pineapple.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The pineapple is being rotated by a person.\nOption 2: A person is picking up the pineapple.", "gt-answer": "Option 2: A person is picking up the pineapple.", "video-llm-prediction": "Option 1: The pineapple is being rotated by a person.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: A person is picking up the pineapple.\nCaption B: The pineapple is being pushed forward by a person.", "gt-answer": "Caption A: A person is picking up the pineapple.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1106899445": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: Someone is removing toothpaste from the toothbrush.\nSentence B: The person is squeezing out toothpaste on the toothbrush.", "gt-answer": "Sentence B: The person is squeezing out toothpaste on the toothbrush.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The person is squeezing out toothpaste on the toothbrush.\nCaption B: The person is brushing teeth.", "gt-answer": "Caption A: The person is squeezing out toothpaste on the toothbrush.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person is squeezing out toothpaste on the toothbrush.\nOption 2: A video showing a person wiping off toothpaste from the toothbrush.", "gt-answer": "Option 1: The person is squeezing out toothpaste on the toothbrush.", "video-llm-prediction": "Option 1: The person is squeezing out toothpaste on the toothbrush.", "match_success": true, "rating": 1 } ] }, "1067925569": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: Firstly, a cargo train is shown, secondly a container ship is shown.\nOption 2: Firstly, a container ship is shown, secondly a cargo train is shown.", "gt-answer": "Option 2: Firstly, a container ship is shown, secondly a cargo train is shown.", "video-llm-prediction": "Option 2: Firstly, a container ship is shown, secondly a cargo train is shown.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The cargo train and the container ship are shown simultaneously.\nSentence B: In the video, first a container ship is shown, then a cargo train is shown.", "gt-answer": "Sentence B: In the video, first a container ship is shown, then a cargo train is shown.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A container ship is shown, then a cargo train is shown.\nCaption B: In the video, first a container ship is shown, then a cargo train is shown.", "gt-answer": "Caption B: In the video, first a container ship is shown, then a cargo train is shown.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1062609286": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The battery charge is decreasing.\nOption 2: The battery charge is increasing.", "gt-answer": "Option 2: The battery charge is increasing.", "video-llm-prediction": "Option 1: The battery charge is decreasing.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The battery charge is staying constant.\nCaption B: The battery charge is increasing.", "gt-answer": "Caption B: The battery charge is increasing.", "video-llm-prediction": "Caption B: The battery charge is increasing.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The battery charge is decreasing.\nSentence B: The battery charge is increasing.", "gt-answer": "Sentence B: The battery charge is increasing.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 } ] }, "1100397435": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: A person is turning off the robot vacuum cleaner.\nOption 2: A person starts the robot vacuum cleaner.", "gt-answer": "Option 2: A person starts the robot vacuum cleaner.", "video-llm-prediction": "Option 2: A person starts the robot vacuum cleaner.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A person is fixing the robot vacuum cleaner.\nSentence B: A person starts the robot vacuum cleaner.", "gt-answer": "Sentence B: A person starts the robot vacuum cleaner.", "video-llm-prediction": "Sentence A: A person is fixing the robot vacuum cleaner.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: A person starts the robot vacuum cleaner.\nCaption B: A person is sweeping the floor.", "gt-answer": "Caption A: A person starts the robot vacuum cleaner.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The robot vacuum cleaner is moving to the right side of the screen.\nOption 2: The robot vacuum cleaner is moving to the left side of the screen.", "gt-answer": "Option 1: The robot vacuum cleaner is moving to the right side of the screen.", "video-llm-prediction": "Option 1: The robot vacuum cleaner is moving to the right side of the screen.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The robot vacuum cleaner is moving to the right side of the screen.\nCaption B: The robot vacuum cleaner is moving towards the screen.", "gt-answer": "Caption A: The robot vacuum cleaner is moving to the right side of the screen.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The robot vacuum cleaner is moving away from the screen.\nSentence B: The robot vacuum cleaner is moving to the right side of the screen.", "gt-answer": "Sentence B: The robot vacuum cleaner is moving to the right side of the screen.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 } ], "order": [ { "question": "Which caption matches the video better?\nCaption A: The robot vacuum cleaner starts moving and then a person enters the screen.\nCaption B: A person enters the screen and then the robot vacuum cleaner starts moving.", "gt-answer": "Caption B: A person enters the screen and then the robot vacuum cleaner starts moving.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A person enters the screen and then the robot vacuum cleaner starts moving.\nSentence B: The person starts moving before the robot vacuum cleaner.", "gt-answer": "Sentence A: A person enters the screen and then the robot vacuum cleaner starts moving.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A person enters the screen and then the robot vacuum cleaner starts moving.\nOption 2: The robot vacuum cleaner moves first, followed by a person entering the screen.", "gt-answer": "Option 1: A person enters the screen and then the robot vacuum cleaner starts moving.", "video-llm-prediction": "Option 2: The robot vacuum cleaner moves first, followed by a person entering the screen.", "match_success": true, "rating": 0 } ] }, "1108534143": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The puppy is walking into a wigwam.\nOption 2: The puppy is walking out of a wigwam.", "gt-answer": "Option 2: The puppy is walking out of a wigwam.", "video-llm-prediction": "Option 2: The puppy is walking out of a wigwam.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The puppy is staying in a wigwam.\nCaption B: The puppy is walking out of a wigwam.", "gt-answer": "Caption B: The puppy is walking out of a wigwam.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The puppy is walking out of a wigwam.\nSentence B: The puppy is walking around a wigwam.", "gt-answer": "Sentence A: The puppy is walking out of a wigwam.", "video-llm-prediction": "Sentence A: The puppy is walking out of a wigwam.", "match_success": true, "rating": 1 } ] }, "1034049020": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: A woman is painting a wall while walking in the video.\nSentence B: A woman is painting a wall while standing still in the video.", "gt-answer": "Sentence A: A woman is painting a wall while walking in the video.", "video-llm-prediction": "Sentence A: A woman is painting a wall while walking in the video.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A girl is walking and cleaning a wall.\nCaption B: A woman is painting a wall while walking in the video.", "gt-answer": "Caption B: A woman is painting a wall while walking in the video.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A woman is painting a wall while walking in the video.\nOption 2: A girl painting a wall.", "gt-answer": "Option 1: A woman is painting a wall while walking in the video.", "video-llm-prediction": "Option 1: A woman is painting a wall while walking in the video.", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which caption matches the video better?\nCaption A: The woman is turning back and then painting a wall while walking.\nCaption B: The woman is painting a wall while walking and then turning back.", "gt-answer": "Caption B: The woman is painting a wall while walking and then turning back.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The events are occurring in the order of painting a wall while walking, then turning back.\nOption 2: The order of events is turning back and painting the wall, then walking.", "gt-answer": "Option 1: The events are occurring in the order of painting a wall while walking, then turning back.", "video-llm-prediction": "Option 1: The events are occurring in the order of painting a wall while walking, then turning back.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The events are occurring in the order of painting a wall while walking, then turning back.\nSentence B: The events are happening in the order of turning back, then painting a wall while walking.", "gt-answer": "Sentence A: The events are occurring in the order of painting a wall while walking, then turning back.", "video-llm-prediction": "Sentence A: The events are occurring in the order of painting a wall while walking, then turning back.", "match_success": true, "rating": 1 } ] }, "1047368344": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The girl is jumping into the water.\nSentence B: The girl is jumping out of water.", "gt-answer": "Sentence A: The girl is jumping into the water.", "video-llm-prediction": "Sentence A: The girl is jumping into the water.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The girl is jumping around water.\nCaption B: The girl is jumping into the water.", "gt-answer": "Caption B: The girl is jumping into the water.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The girl is swimming in the water.\nOption 2: The girl is jumping into the water.", "gt-answer": "Option 2: The girl is jumping into the water.", "video-llm-prediction": "Option 2: The girl is jumping into the water.", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The girl first jumps into the water and then takes off her clothes.\nSentence B: The girl first takes off her clothes and then jumps into the water.", "gt-answer": "Sentence B: The girl first takes off her clothes and then jumps into the water.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The girl first takes off her clothes and then jumps into the water.\nCaption B: After jumping into the water, the girl takes off her clothes.", "gt-answer": "Caption A: The girl first takes off her clothes and then jumps into the water.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The girl first takes off her clothes and then jumps into the water.\nOption 2: The girl puts on her clothes after jumping into the water.", "gt-answer": "Option 1: The girl first takes off her clothes and then jumps into the water.", "video-llm-prediction": "Option 2: The girl puts on her clothes after jumping into the water.", "match_success": true, "rating": 0 } ] }, "1060403198": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The tree is decreasing in height.\nSentence B: The tree is increasing in height.", "gt-answer": "Sentence B: The tree is increasing in height.", "video-llm-prediction": "Sentence A: The tree is decreasing in height.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The tree is growing taller.\nCaption B: The tree is becoming shorter.", "gt-answer": "Caption A: The tree is growing taller.", "video-llm-prediction": "Caption A: The tree is growing taller.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The tree is decreasing in height.\nOption 2: The tree is increasing in height.", "gt-answer": "Option 2: The tree is increasing in height.", "video-llm-prediction": "Option 1: The tree is decreasing in height.", "match_success": true, "rating": 0 } ] }, "1100074677": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The person wrote something on the paper note and then stood the paper note up.\nSentence B: The person stood the paper note up and then wrote something on the paper note.", "gt-answer": "Sentence A: The person wrote something on the paper note and then stood the paper note up.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The person wrote something on the paper note and then stood the paper note up.\nCaption B: The person wrote something on the paper note and then tore it apart.", "gt-answer": "Caption A: The person wrote something on the paper note and then stood the paper note up.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person wrote something on the paper note and then stood the paper note up.\nOption 2: The person threw the paper note away and then tore it apart.", "gt-answer": "Option 1: The person wrote something on the paper note and then stood the paper note up.", "video-llm-prediction": "Option 1: The person wrote something on the paper note and then stood the paper note up.", "match_success": true, "rating": 1 } ] }, "1100350201": { "order": [ { "question": "Which caption matches the video better?\nCaption A: The little girl is drawn first, then the man is drawn.\nCaption B: The man is drawn first, then the little girl disappears.", "gt-answer": "Caption A: The little girl is drawn first, then the man is drawn.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man is drawn and then the little girl appears.\nOption 2: The little girl is drawn first, then the man is drawn.", "gt-answer": "Option 2: The little girl is drawn first, then the man is drawn.", "video-llm-prediction": "Option 2: The little girl is drawn first, then the man is drawn.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The little girl is drawn first, then the man is drawn.\nSentence B: The little girl is drawn and then the man gets erased and drawn.", "gt-answer": "Sentence A: The little girl is drawn first, then the man is drawn.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1099109715": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The number of fruits and vegetables is increasing.\nOption 2: The number of fruits and vegetables is decreasing.", "gt-answer": "Option 1: The number of fruits and vegetables is increasing.", "video-llm-prediction": "Option 1: The number of fruits and vegetables is increasing.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The number of fruits and vegetables is increasing.\nSentence B: The fruits and vegetables are becoming rotten.", "gt-answer": "Sentence A: The number of fruits and vegetables is increasing.", "video-llm-prediction": "Sentence A: The number of fruits and vegetables is increasing.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The number of fruits and vegetables is increasing.\nCaption B: The fruits and vegetables are staying the same.", "gt-answer": "Caption A: The number of fruits and vegetables is increasing.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1051211272": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The toast with cream cheese is gradually growing larger.\nSentence B: The toast with cream cheese is gradually being eaten out and disappearing.", "gt-answer": "Sentence B: The toast with cream cheese is gradually being eaten out and disappearing.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The toast with cream cheese is gradually being eaten out and disappearing.\nOption 2: The toast with cream cheese is gradually emerging piece by piece.", "gt-answer": "Option 1: The toast with cream cheese is gradually being eaten out and disappearing.", "video-llm-prediction": "Option 1: The toast with cream cheese is gradually being eaten out and disappearing.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The toast with cream cheese is gradually growing larger.\nCaption B: The toast with cream cheese is gradually being eaten out and disappearing.", "gt-answer": "Caption B: The toast with cream cheese is gradually being eaten out and disappearing.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1059815936": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The paper is changing from a wrinkled paper ball into a flattened sheet.\nSentence B: The paper is changing from a flattened sheet into a wrinkled paper ball.", "gt-answer": "Sentence A: The paper is changing from a wrinkled paper ball into a flattened sheet.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The paper is changing from a wrinkled paper ball into a flattened sheet.\nCaption B: The paper is disappearing.", "gt-answer": "Caption A: The paper is changing from a wrinkled paper ball into a flattened sheet.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The paper is changing from a wrinkled paper ball into a flattened sheet.\nOption 2: The paper is tearing apart.", "gt-answer": "Option 1: The paper is changing from a wrinkled paper ball into a flattened sheet.", "video-llm-prediction": "Option 1: The paper is changing from a wrinkled paper ball into a flattened sheet.", "match_success": true, "rating": 1 } ] }, "1086423566": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The bowl rotates and then food is added into the bowl.\nOption 2: Food is added into the bowl and then the bowl rotates.", "gt-answer": "Option 2: Food is added into the bowl and then the bowl rotates.", "video-llm-prediction": "Option 2: Food is added into the bowl and then the bowl rotates.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: In the video, food is added into the bowl first and then the bowl rotates.\nCaption B: The bowl flips upside down and then food is added into the bowl.", "gt-answer": "Caption A: In the video, food is added into the bowl first and then the bowl rotates.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: Food is added into the bowl and then it disappears.\nSentence B: In the video, food is added into the bowl first and then the bowl rotates.", "gt-answer": "Sentence B: In the video, food is added into the bowl first and then the bowl rotates.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 } ] }, "1069611487": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: First, pizza slices being taken away and then a paper box appears.\nSentence B: First, a paper box appears and then pizza slices being taken away.", "gt-answer": "Sentence B: First, a paper box appears and then pizza slices being taken away.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A pizza slice disappears before a paper box appears.\nOption 2: First, a paper box appears and then pizza slices disappear.", "gt-answer": "Option 2: First, a paper box appears and then pizza slices disappear.", "video-llm-prediction": "Option 2: First, a paper box appears and then pizza slices disappear.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: In the video, a paper box disappears before the pizza slices disappear.\nCaption B: First, a paper box appears and then pizza slices disappear.", "gt-answer": "Caption B: First, a paper box appears and then pizza slices disappear.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1083845854_reverse": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The clock hands are moving clockwise.\nOption 2: The clock hands are moving counterclockwise.", "gt-answer": "Option 2: The clock hands are moving counterclockwise.", "video-llm-prediction": "Option 1: The clock hands are moving clockwise.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The clock hands are moving counterclockwise.\nSentence B: The clock hands are moving clockwise.", "gt-answer": "Sentence A: The clock hands are moving counterclockwise.", "video-llm-prediction": "Sentence A: The clock hands are moving counterclockwise.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The clock hands are rotating counterclockwise.\nCaption B: The clock hands are rotating clockwise.", "gt-answer": "Caption A: The clock hands are rotating counterclockwise.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1100329207_reverse": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The camera view is moving right.\nOption 2: The camera view is moving backward.", "gt-answer": "Option 2: The camera view is moving backward.", "video-llm-prediction": "Option 1: The camera view is moving right.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The camera view is moving forward.\nCaption B: The camera view is moving backward.", "gt-answer": "Caption B: The camera view is moving backward.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera view is moving backward.\nSentence B: The camera view is moving left.", "gt-answer": "Sentence A: The camera view is moving backward.", "video-llm-prediction": "Sentence A: The camera view is moving backward.", "match_success": true, "rating": 1 } ] }, "1089201973_reverse": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The football is moving from right to left.\nOption 2: The football is moving from left to right in the video.", "gt-answer": "Option 1: The football is moving from right to left.", "video-llm-prediction": "Option 2: The football is moving from left to right in the video.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The football is moving from left to right in the video.\nSentence B: The football is moving from right to left in the video.", "gt-answer": "Sentence B: The football is moving from right to left in the video.", "video-llm-prediction": "Sentence A: The football is moving from left to right in the video.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The football is moving upwards.\nCaption B: The football is moving from right to left in the video.", "gt-answer": "Caption B: The football is moving from right to left in the video.", "video-llm-prediction": "Caption A: The football is moving upwards.", "match_success": true, "rating": 0 } ] }, "1034736653_reverse": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The woman athletes are running from right to left.\nOption 2: The woman athletes are running from left to right.", "gt-answer": "Option 1: The woman athletes are running from right to left.", "video-llm-prediction": "Option 2: The woman athletes are running from left to right.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman athletes are running from left to right.\nSentence B: The woman athletes are running from right to left.", "gt-answer": "Sentence B: The woman athletes are running from right to left.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The woman athletes are running from top to bottom.\nCaption B: The woman athletes are running from right to left.", "gt-answer": "Caption B: The woman athletes are running from right to left.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1084153195_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The dolphins are hovering in one place.\nCaption B: The dolphins are swimming away from the shore.", "gt-answer": "Caption B: The dolphins are swimming away from the shore.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The dolphins are swimming away from the shore.\nSentence B: The dolphins are swimming towards the shore.", "gt-answer": "Sentence A: The dolphins are swimming away from the shore.", "video-llm-prediction": "Sentence A: The dolphins are swimming away from the shore.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The dolphins are swimming away from the shore.\nOption 2: The dolphins are swimming parallel to the shore.", "gt-answer": "Option 1: The dolphins are swimming away from the shore.", "video-llm-prediction": "Option 2: The dolphins are swimming parallel to the shore.", "match_success": true, "rating": 0 } ] }, "1081092839_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The sun is descending.\nCaption B: The sun is ascending.", "gt-answer": "Caption B: The sun is ascending.", "video-llm-prediction": "Caption B: The sun is ascending.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The sun is moving upwards.\nSentence B: The sun is moving downwards.", "gt-answer": "Sentence A: The sun is moving upwards.", "video-llm-prediction": "Sentence B: The sun is moving downwards.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The sun is descending.\nOption 2: The sun is rising.", "gt-answer": "Option 2: The sun is rising.", "video-llm-prediction": "Option 2: The sun is rising.", "match_success": true, "rating": 1 } ] }, "1059462749_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The person is moving upwards.\nSentence B: The person is moving rightwards.", "gt-answer": "Sentence A: The person is moving upwards.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person is moving downwards.\nOption 2: The person is moving upwards.", "gt-answer": "Option 2: The person is moving upwards.", "video-llm-prediction": "Option 2: The person is moving upwards.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The person is moving upwards.\nCaption B: The person is moving leftwards.", "gt-answer": "Caption A: The person is moving upwards.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1088279109_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The elevator is rising.\nSentence B: The elevator is going down.", "gt-answer": "Sentence B: The elevator is going down.", "video-llm-prediction": "Sentence A: The elevator is rising.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The elevator is staying still.\nCaption B: The elevator is going down.", "gt-answer": "Caption B: The elevator is going down.", "video-llm-prediction": "Caption B: The elevator is going down.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The elevator is going down.\nOption 2: The elevator is not moving.", "gt-answer": "Option 1: The elevator is going down.", "video-llm-prediction": "Option 2: The elevator is not moving.", "match_success": true, "rating": 0 } ] }, "1096612823_reverse": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is zooming in.\nOption 2: The video is zooming out.", "gt-answer": "Option 2: The video is zooming out.", "video-llm-prediction": "Option 2: The video is zooming out.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is zooming out.\nCaption B: The video is rotating.", "gt-answer": "Caption A: The video is zooming out.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is zooming out.\nSentence B: The video is panning left.", "gt-answer": "Sentence A: The video is zooming out.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 0 } ] }, "1075484900_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The child is turning away from the man.\nSentence B: The child is turning towards the man.", "gt-answer": "Sentence B: The child is turning towards the man.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The child is turning towards the man.\nOption 2: The child is turning away from the man.", "gt-answer": "Option 1: The child is turning towards the man.", "video-llm-prediction": "Option 2: The child is turning away from the man.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The child is moving forward in relation to the man.\nCaption B: The child is turning away from the man.", "gt-answer": "Caption A: The child is moving forward in relation to the man.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1090578917_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The man is leaping to his left.\nCaption B: The man is leaping to his right.", "gt-answer": "Caption A: The man is leaping to his left.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is leaping to his left.\nSentence B: The man is leaping backward.", "gt-answer": "Sentence A: The man is leaping to his left.", "video-llm-prediction": "Sentence A: The man is leaping to his left.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man is leaping to his left.\nOption 2: The man is leaping forward.", "gt-answer": "Option 1: The man is leaping to his left.", "video-llm-prediction": "Option 2: The man is leaping forward.", "match_success": true, "rating": 0 } ] }, "1075338140_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The ball is rolling from right to left.\nSentence B: The ball is rolling from left to right.", "gt-answer": "Sentence A: The ball is rolling from right to left.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The ball is rolling from left to right.\nCaption B: The ball is rolling from right to left.", "gt-answer": "Caption B: The ball is rolling from right to left.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The ball is rolling from right to left.\nOption 2: The ball is rolling from left to right.", "gt-answer": "Option 1: The ball is rolling from right to left.", "video-llm-prediction": "Option 1: The ball is rolling from right to left.", "match_success": true, "rating": 1 } ] }, "1067268403_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The wind turbines are stationary.\nCaption B: The wind turbines are rotating counterclockwise.", "gt-answer": "Caption B: The wind turbines are rotating counterclockwise.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The wind turbines are rotating counterclockwise.\nSentence B: The wind turbines are rotating clockwise.", "gt-answer": "Sentence A: The wind turbines are rotating counterclockwise.", "video-llm-prediction": "Sentence A: The wind turbines are rotating counterclockwise.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The wind turbines are rotating counterclockwise.\nOption 2: The wind turbines are rotating clockwise.", "gt-answer": "Option 1: The wind turbines are rotating counterclockwise.", "video-llm-prediction": "Option 1: The wind turbines are rotating counterclockwise.", "match_success": true, "rating": 1 } ] }, "1096007487_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The 3d cloud is moving upward from the camera's point of view.\nSentence B: The 3d cloud is moving rightward from the camera's point of view.", "gt-answer": "Sentence B: The 3d cloud is moving rightward from the camera's point of view.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The 3d cloud is moving downward from the camera's point of view.\nCaption B: The 3d cloud is moving rightward from the camera's point of view.", "gt-answer": "Caption B: The 3d cloud is moving rightward from the camera's point of view.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The 3d cloud is moving rightward from the camera's point of view.\nOption 2: The 3d cloud is moving leftward from the camera's point of view.", "gt-answer": "Option 1: The 3d cloud is moving rightward from the camera's point of view.", "video-llm-prediction": "Option 1: The 3d cloud is moving rightward from the camera's point of view.", "match_success": true, "rating": 1 } ] }, "1095939659_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The light is growing counterclockwise.\nCaption B: The light is growing downwards.", "gt-answer": "Caption A: The light is growing counterclockwise.", "video-llm-prediction": "Caption A: The light is growing counterclockwise.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The light is growing counterclockwise.\nSentence B: The light is growing clockwise.", "gt-answer": "Sentence A: The light is growing counterclockwise.", "video-llm-prediction": "Sentence A: The light is growing counterclockwise.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The light is growing counterclockwise.\nOption 2: The light is moving straight forward.", "gt-answer": "Option 1: The light is growing counterclockwise.", "video-llm-prediction": "Option 1: The light is growing counterclockwise.", "match_success": true, "rating": 1 } ] }, "1098785169_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The galaxy is spinning counterclockwise.\nCaption B: The galaxy is rotating clockwise.", "gt-answer": "Caption A: The galaxy is spinning counterclockwise.", "video-llm-prediction": "Caption A: The galaxy is spinning counterclockwise.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The galaxy's spin is counterclockwise.\nSentence B: The galaxy is rotating clockwise.", "gt-answer": "Sentence A: The galaxy's spin is counterclockwise.", "video-llm-prediction": "Sentence A: The galaxy's spin is counterclockwise.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The rotation of the galaxy is counterclockwise.\nOption 2: The galaxy is rotating clockwise.", "gt-answer": "Option 1: The rotation of the galaxy is counterclockwise.", "video-llm-prediction": "Option 1: The galaxy is rotating counterclockwise.", "match_success": false, "chatgpt-response": "Incorrect", "rating": 0 } ] }, "1099984801_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The clouds are moving away from the camera.\nSentence B: The clouds are moving towards the camera.", "gt-answer": "Sentence A: The clouds are moving away from the camera.", "video-llm-prediction": "Sentence B: The clouds are moving towards the camera.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The clouds are moving away from the camera.\nOption 2: The clouds are staying still.", "gt-answer": "Option 1: The clouds are moving away from the camera.", "video-llm-prediction": "Option 1: The clouds are moving away from the camera.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The clouds are moving away from the camera.\nCaption B: The clouds are moving towards the camera.", "gt-answer": "Caption A: The clouds are moving away from the camera.", "video-llm-prediction": "Caption B: The clouds are moving towards the camera.", "match_success": true, "rating": 0 } ], "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The brightness of the video is staying the same.\nOption 2: The entire video is getting brighter.", "gt-answer": "Option 2: The entire video is getting brighter.", "video-llm-prediction": "Option 2: The entire video is getting brighter.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The entire video is getting darker.\nCaption B: The video is getting brighter.", "gt-answer": "Caption B: The video is getting brighter.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The entire video is getting darker.\nSentence B: The entire video is getting brighter.", "gt-answer": "Sentence B: The entire video is getting brighter.", "video-llm-prediction": "Sentence B: The entire video is getting brighter.", "match_success": true, "rating": 1 } ] }, "1082419672_reverse": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The movement in the video is flying upwards relative to the ship.\nOption 2: The movement in the video is flying forwards relative to the ship.", "gt-answer": "Option 2: The movement in the video is flying forwards relative to the ship.", "video-llm-prediction": "Option 1: The movement in the video is flying upwards relative to the ship.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The movement in the video is flying backwards relative to the ship.\nSentence B: The movement in the video is flying forwards relative to the ship.", "gt-answer": "Sentence B: The movement in the video is flying forwards relative to the ship.", "video-llm-prediction": "Sentence A: The movement in the video is flying backwards relative to the ship.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The movement in the video is flying backwards relative to the ship.\nCaption B: The movement in the video is flying forwards relative to the ship.", "gt-answer": "Caption B: The movement in the video is flying forwards relative to the ship.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1058985719_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The airplane is moving from right to left.\nCaption B: The airplane is flying from top to bottom.", "gt-answer": "Caption B: The airplane is flying from top to bottom.", "video-llm-prediction": "A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The airplane is flying from bottom to top.\nOption 2: The airplane is flying from top to bottom.", "gt-answer": "Option 2: The airplane is flying from top to bottom.", "video-llm-prediction": "Option 2: The airplane is flying from top to bottom.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The airplane is flying from top to bottom.\nSentence B: The airplane is moving from left to right.", "gt-answer": "Sentence A: The airplane is flying from top to bottom.", "video-llm-prediction": "Sentence A: The airplane is flying from top to bottom.", "match_success": true, "rating": 1 } ] }, "1055413730_reverse": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The planet Earth is spinning to the left.\nOption 2: The planet Earth is spinning to the right.", "gt-answer": "Option 1: The planet Earth is spinning to the left.", "video-llm-prediction": "Option 2: The planet Earth is spinning to the right.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The planet Earth is spinning to the left.\nCaption B: The Earth is rotating upward.", "gt-answer": "Caption A: The planet Earth is spinning to the left.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The planet Earth is spinning to the left.\nSentence B: The planet Earth is spinning to the right.", "gt-answer": "Sentence A: The planet Earth is spinning to the left.", "video-llm-prediction": "Sentence A: The planet Earth is spinning to the left.", "match_success": true, "rating": 1 } ] }, "1081684532_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The asteroids in the video are flying towards the camera.\nSentence B: The asteroids are flying away from the camera.", "gt-answer": "Sentence B: The asteroids are flying away from the camera.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The asteroids in the video are flying away from the camera.\nOption 2: The asteroids are flying to the left.", "gt-answer": "Option 1: The asteroids in the video are flying away from the camera.", "video-llm-prediction": "Option 1: The asteroids in the video are flying away from the camera.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The asteroids are flying to the right.\nCaption B: The asteroids in the video are flying away from the camera.", "gt-answer": "Caption B: The asteroids in the video are flying away from the camera.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "23391142_reverse": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is panning upwards.\nOption 2: The video is panning to the left.", "gt-answer": "Option 2: The video is panning to the left.", "video-llm-prediction": "Option 2: The video is panning to the left.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is zooming in.\nCaption B: The video is panning to the left.", "gt-answer": "Caption B: The video is panning to the left.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is panning to the right.\nSentence B: The video is panning to the left.", "gt-answer": "Sentence B: The video is panning to the left.", "video-llm-prediction": "Sentence A: The video is panning to the right.", "match_success": true, "rating": 0 } ] }, "1053758117_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The camera is standing still before a 3D digital brain.\nCaption B: The camera is zooming out from a 3D digital brain.", "gt-answer": "Caption B: The camera is zooming out from a 3D digital brain.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The camera is zooming out from a 3D digital brain.\nOption 2: The camera is zooming into a 3D digital brain.", "gt-answer": "Option 1: The camera is zooming out from a 3D digital brain.", "video-llm-prediction": "Option 2: The camera is zooming into a 3D digital brain.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera is zooming out from a 3D digital brain.\nSentence B: The camera is moving downwards.", "gt-answer": "Sentence A: The camera is zooming out from a 3D digital brain.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 } ] }, "1106668103_reverse": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is staying still.\nOption 2: The movement in the video is backward.", "gt-answer": "Option 2: The movement in the video is backward.", "video-llm-prediction": "Option 2: The movement in the video is backward.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The movement in the video is forward.\nCaption B: The movement in the video is backward.", "gt-answer": "Caption B: The movement in the video is backward.", "video-llm-prediction": "Caption A: The movement in the video is forward.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The movement in the video is backward.\nSentence B: The primary motion depicted in the video is turning in circles.", "gt-answer": "Sentence A: The movement in the video is backward.", "video-llm-prediction": "Sentence A: The movement in the video is backward.", "match_success": true, "rating": 1 } ] }, "1057163434_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The chef is removing the bread from the top of the burger.\nCaption B: The chef is putting the bread on top of the burger.", "gt-answer": "Caption A: The chef is removing the bread from the top of the burger.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The chef is putting the bread on top of the burger.\nOption 2: The chef is removing the bread from the top of the burger.", "gt-answer": "Option 2: The chef is removing the bread from the top of the burger.", "video-llm-prediction": "Option 2: The chef is removing the bread from the top of the burger.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The chef is putting the bread on top of the burger.\nSentence B: The chef is removing the bread from the top of the burger.", "gt-answer": "Sentence B: The chef is removing the bread from the top of the burger.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ] }, "1100319395_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The circular pattern is moving counter-clockwise.\nCaption B: The circular pattern is moving clockwise.", "gt-answer": "Caption A: The circular pattern is moving counter-clockwise.", "video-llm-prediction": "Caption B: The circular pattern is moving clockwise.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The circular pattern is moving clockwise.\nOption 2: The circular pattern is spinning counter-clockwise.", "gt-answer": "Option 2: The circular pattern is spinning counter-clockwise.", "video-llm-prediction": "Option 1: The circular pattern is moving clockwise.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The circular pattern is vibrating back and forth.\nSentence B: The circular pattern is moving counter-clockwise.", "gt-answer": "Sentence B: The circular pattern is moving counter-clockwise.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ] }, "1101200293_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera is moving counterclockwise around the aircraft carrier.\nSentence B: The camera is moving clockwise around the aircraft carrier.", "gt-answer": "Sentence A: The camera is moving counterclockwise around the aircraft carrier.", "video-llm-prediction": "Sentence A: The camera is moving counterclockwise around the aircraft carrier.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The camera is moving counterclockwise around the aircraft carrier.\nCaption B: The camera is tilting upwardsaround the aircraft carrier.", "gt-answer": "Caption A: The camera is moving counterclockwise around the aircraft carrier.", "video-llm-prediction": "Caption A: The camera is moving counterclockwise around the aircraft carrier.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The camera is moving counterclockwise around the aircraft carrier.\nOption 2: The camera is moving back and forth around the aircraft carrier.", "gt-answer": "Option 1: The camera is moving counterclockwise around the aircraft carrier.", "video-llm-prediction": "Option 1: The camera is moving counterclockwise around the aircraft carrier.", "match_success": true, "rating": 1 } ] }, "1072453637_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera is panning to the front of a fighter jet.\nSentence B: The camera is panning downwards around a fighter jet.", "gt-answer": "Sentence A: The camera is panning to the front of a fighter jet.", "video-llm-prediction": "Sentence A: The camera is panning to the front of a fighter jet.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The camera is panning to the front of a fighter jet.\nOption 2: The camera is panning upwards around a fighter jet.", "gt-answer": "Option 1: The camera is panning to the front of a fighter jet.", "video-llm-prediction": "Option 2: The camera is panning upwards around a fighter jet.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The camera is panning to the back of a fighter jet.\nCaption B: The camera is panning to the front of a fighter jet.", "gt-answer": "Caption B: The camera is panning to the front of a fighter jet.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "3396473605_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The five stars appear from left to right.\nSentence B: The five stars disappear from right to left.", "gt-answer": "Sentence B: The five stars disappear from right to left.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The five stars disappear from right to left.\nOption 2: The five stars appear from top to bottom.", "gt-answer": "Option 1: The five stars disappear from right to left.", "video-llm-prediction": "Option 2: The five stars appear from top to bottom.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The five stars disappear from right to left.\nCaption B: The five stars appear from bottom to top.", "gt-answer": "Caption A: The five stars disappear from right to left.", "video-llm-prediction": "Caption B: The five stars appear from bottom to top.", "match_success": true, "rating": 0 } ] }, "1086820415_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The large wave is moving horizontally towards the right side of the screen.\nSentence B: The large wave is moving vertically towards the top of the screen.", "gt-answer": "Sentence B: The large wave is moving vertically towards the top of the screen.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The large wave is moving vertically towards the top of the screen.\nOption 2: The large wave is moving vertically towards the bottom of the screen.", "gt-answer": "Option 1: The large wave is moving vertically towards the top of the screen.", "video-llm-prediction": "Option 1: The large wave is moving vertically towards the top of the screen.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The large wave is moving vertically towards the top of the screen.\nCaption B: The large wave is moving diagonally towards the top-left corner.", "gt-answer": "Caption A: The large wave is moving vertically towards the top of the screen.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1073831006_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The zebras are moving from right to left.\nCaption B: The zebras are moving from left to right.", "gt-answer": "Caption A: The zebras are moving from right to left.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The zebras are moving from right to left.\nOption 2: The zebras are moving from top to bottom.", "gt-answer": "Option 1: The zebras are moving from right to left.", "video-llm-prediction": "Option 1: The zebras are moving from right to left.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The zebras are moving away from the camera.\nSentence B: The zebras are moving from right to left.", "gt-answer": "Sentence B: The zebras are moving from right to left.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 } ] }, "1024867412_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The car is taking a right turn.\nCaption B: The car is taking a left turn.", "gt-answer": "Caption B: The car is taking a left turn.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The car is going backward.\nSentence B: The car is taking a left turn.", "gt-answer": "Sentence B: The car is taking a left turn.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The car is taking a left turn.\nOption 2: The car is moving forward.", "gt-answer": "Option 1: The car is taking a left turn.", "video-llm-prediction": "Option 2: The car is moving forward.", "match_success": true, "rating": 0 } ] }, "1076877854_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The robotic arm is moving to the left side of the screen.\nSentence B: The closest robotic arm is moving to the right side of the screen.", "gt-answer": "Sentence A: The robotic arm is moving to the left side of the screen.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The closest robotic arm is moving to the left side of the screen.\nCaption B: The robotic arm is moving away from the camera.", "gt-answer": "Caption A: The closest robotic arm is moving to the left side of the screen.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The robotic arm is rotating.\nOption 2: The closest robotic arm is moving to the left side of the screen.", "gt-answer": "Option 2: The closest robotic arm is moving to the left side of the screen.", "video-llm-prediction": "Option 2: The closest robotic arm is moving to the left side of the screen.", "match_success": true, "rating": 1 } ] }, "1100468035_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The camera is zooming backward in the video.\nCaption B: The camera is zooming forward in the video.", "gt-answer": "Caption A: The camera is zooming backward in the video.", "video-llm-prediction": "Caption B: The camera is zooming forward in the video.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera is zooming backward in the video.\nSentence B: The camera is tracking right.", "gt-answer": "Sentence A: The camera is zooming backward in the video.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The camera is panning left.\nOption 2: The camera is zooming backward in the video.", "gt-answer": "Option 2: The camera is zooming backward in the video.", "video-llm-prediction": "Option 1: The camera is panning left.", "match_success": true, "rating": 0 } ], "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The door is gradually closing.\nCaption B: The door is gradually opening.", "gt-answer": "Caption A: The door is gradually closing.", "video-llm-prediction": "Caption B: The door is gradually opening.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The door is gradually closing.\nSentence B: The door is remaining closed.", "gt-answer": "Sentence A: The door is gradually closing.", "video-llm-prediction": "Sentence A: The door is gradually closing.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The door is gradually opening.\nOption 2: The door is gradually closing.", "gt-answer": "Option 2: The door is gradually closing.", "video-llm-prediction": "Option 2: The door is gradually closing.", "match_success": true, "rating": 1 } ] }, "3411806181_reverse": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The person is unplugging the charging gun from the car.\nOption 2: The person is plugging the charging gun into the car.", "gt-answer": "Option 1: The person is unplugging the charging gun from the car.", "video-llm-prediction": "Option 2: The person is plugging the charging gun into the car.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The person is moving the charging gun away from the car.\nSentence B: The person is plugging the charging gun into the car.", "gt-answer": "Sentence A: The person is moving the charging gun away from the car.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The person is holding the charging gun still.\nCaption B: The person is unplugging the charging gun from the car.", "gt-answer": "Caption B: The person is unplugging the charging gun from the car.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1073914229_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The trains are moving towards each other.\nSentence B: The trains are moving away from each other.", "gt-answer": "Sentence B: The trains are moving away from each other.", "video-llm-prediction": "Sentence A: The trains are moving towards each other.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The trains are moving away from each other.\nOption 2: The trains are moving towards each other.", "gt-answer": "Option 1: The trains are moving away from each other.", "video-llm-prediction": "Option 2: The trains are moving towards each other.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The trains are moving towards each other.\nCaption B: The trains are moving away from each other.", "gt-answer": "Caption B: The trains are moving away from each other.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1093139559_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera is zooming out from three football players.\nSentence B: The camera is zooming into three football players.", "gt-answer": "Sentence A: The camera is zooming out from three football players.", "video-llm-prediction": "Sentence B: The camera is zooming into three football players.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The camera is zooming out from three football players.\nCaption B: The camera is panning left.", "gt-answer": "Caption A: The camera is zooming out from three football players.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The camera is zooming out from three football players.\nOption 2: The camera is panning right.", "gt-answer": "Option 1: The camera is zooming out from three football players.", "video-llm-prediction": "Option 1: The camera is zooming out from three football players.", "match_success": true, "rating": 1 } ], "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The light on the football players is turning brighter.\nOption 2: The light on the football players is turning darker.", "gt-answer": "Option 2: The light on the football players is turning darker.", "video-llm-prediction": "Option 2: The light on the football players is turning darker.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The light on the football players is staying the same.\nCaption B: The light on the football players is turning darker.", "gt-answer": "Caption B: The light on the football players is turning darker.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The light on the football players is dimming.\nSentence B: The light on the football players is turning brighter.", "gt-answer": "Sentence A: The light on the football players is dimming.", "video-llm-prediction": "Sentence B: The light on the football players is turning brighter.", "match_success": true, "rating": 0 } ] }, "1080591311_reverse": { "direction": [ { "question": "Which caption matches the video better?\nCaption A: The jellyfish is floating downwards.\nCaption B: The jellyfish is floating upwards.", "gt-answer": "Caption A: The jellyfish is floating downwards.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The jellyfish is moving towards the right.\nSentence B: The jellyfish is floating downwards.", "gt-answer": "Sentence B: The jellyfish is floating downwards.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The jellyfish is sinking downwards.\nOption 2: The jellyfish is floating upwards.", "gt-answer": "Option 1: The jellyfish is sinking downwards.", "video-llm-prediction": "Option 1: The jellyfish is sinking downwards.", "match_success": true, "rating": 1 } ] }, "3411228673_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera is moving upwards along the skyscraper buildings.\nSentence B: The camera is moving downwards along the skyscraper buildings.", "gt-answer": "Sentence B: The camera is moving downwards along the skyscraper buildings.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The movement in the video is downwards among skyscraper buildings.\nOption 2: The camera is moving backwards.", "gt-answer": "Option 1: The movement in the video is downwards among skyscraper buildings.", "video-llm-prediction": "Option 1: The movement in the video is downwards among skyscraper buildings.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The movement in the video is upwards among skyscraper buildings.\nCaption B: The camera is moving downwards.", "gt-answer": "Caption B: The camera is moving downwards.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "1093737353_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera is panning to the left.\nSentence B: The camera is panning to the right.", "gt-answer": "Sentence A: The camera is panning to the left.", "video-llm-prediction": "Sentence B: The camera is panning to the right.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The camera is panning to the left.\nCaption B: The camera is tilting up.", "gt-answer": "Caption A: The camera is panning to the left.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The camera is panning to the left.\nOption 2: The camera is zooming in.", "gt-answer": "Option 1: The camera is panning to the left.", "video-llm-prediction": "Option 2: The camera is zooming in.", "match_success": true, "rating": 0 } ] }, "1017768724_reverse": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The person is moving leftwards.\nOption 2: The person is moving rightwards.", "gt-answer": "Option 1: The person is moving leftwards.", "video-llm-prediction": "Option 2: The person is moving rightwards.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The person is moving leftwards.\nSentence B: The person is moving upwards.", "gt-answer": "Sentence A: The person is moving leftwards.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The person is moving downwards.\nCaption B: The person is moving leftwards.", "gt-answer": "Caption B: The person is moving leftwards.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1100117263_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The little lion is jumping onto the back of an adult lion.\nSentence B: The little lion is falling from the back of an adult lion.", "gt-answer": "Sentence A: The little lion is jumping onto the back of an adult lion.", "video-llm-prediction": "Sentence A: The little lion is jumping onto the back of an adult lion.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The little lion is sitting nearby an adult lion.\nCaption B: The little lion is jumping onto the back of an adult lion.", "gt-answer": "Caption B: The little lion is jumping onto the back of an adult lion.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The little lion is jumping onto the back of an adult lion.\nOption 2: The little lion is sitting on the back of an adult lion.", "gt-answer": "Option 1: The little lion is jumping onto the back of an adult lion.", "video-llm-prediction": "Option 2: The little lion is sitting on the back of an adult lion.", "match_success": true, "rating": 0 } ] }, "1021488277_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: A person is dropping down the pineapple.\nSentence B: A person is picking up the pineapple.", "gt-answer": "Sentence A: A person is dropping down the pineapple.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The pineapple is being rotated by a person.\nOption 2: A person is dropping down the pineapple.", "gt-answer": "Option 2: A person is dropping down the pineapple.", "video-llm-prediction": "Option 1: The pineapple is being rotated by a person.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: A person is dropping down the pineapple.\nCaption B: The pineapple is being pushed forward by a person.", "gt-answer": "Caption A: A person is dropping down the pineapple.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1100397435_reverse": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The robot vacuum cleaner is moving to the right side of the screen.\nOption 2: The robot vacuum cleaner is moving to the left side of the screen.", "gt-answer": "Option 2: The robot vacuum cleaner is moving to the left side of the screen.", "video-llm-prediction": "Option 1: The robot vacuum cleaner is moving to the right side of the screen.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The robot vacuum cleaner is moving to the left side of the screen.\nCaption B: The robot vacuum cleaner is moving towards the screen.", "gt-answer": "Caption A: The robot vacuum cleaner is moving to the left side of the screen.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The robot vacuum cleaner is moving away from the screen.\nSentence B: The robot vacuum cleaner is moving to the left side of the screen.", "gt-answer": "Sentence B: The robot vacuum cleaner is moving to the left side of the screen.", "video-llm-prediction": "Sentence A: The robot vacuum cleaner is moving away from the screen.", "match_success": true, "rating": 0 } ] }, "1108534143_reverse": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The puppy is walking into a wigwam.\nOption 2: The puppy is walking out of a wigwam.", "gt-answer": "Option 1: The puppy is walking into a wigwam.", "video-llm-prediction": "Option 1: The puppy is walking into a wigwam.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The puppy is staying in a wigwam.\nCaption B: The puppy is walking into a wigwam.", "gt-answer": "Caption B: The puppy is walking into a wigwam.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The puppy is walking into a wigwam.\nSentence B: The puppy is walking around a wigwam.", "gt-answer": "Sentence A: The puppy is walking into a wigwam.", "video-llm-prediction": "Sentence A: The puppy is walking into a wigwam.", "match_success": true, "rating": 1 } ] }, "1047368344_reverse": { "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The girl is jumping into the water.\nSentence B: The girl is jumping out of water.", "gt-answer": "Sentence B: The girl is jumping out of water.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The girl is jumping around water.\nCaption B: The girl is jumping out of the water.", "gt-answer": "Caption B: The girl is jumping out of the water.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The girl is swimming in the water.\nOption 2: The girl is jumping out of the water.", "gt-answer": "Option 2: The girl is jumping out of the water.", "video-llm-prediction": "Option 2: The girl is jumping out of the water.", "match_success": true, "rating": 1 } ] }, "1050618460_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The flowers are withering.\nSentence B: The flowers are turning into full bloom.", "gt-answer": "Sentence A: The flowers are withering.", "video-llm-prediction": "Sentence B: The flowers are turning into full bloom.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The flowers are turning into full bloom.\nCaption B: The flowers are shrinking and wilting.", "gt-answer": "Caption B: The flowers are shrinking and wilting.", "video-llm-prediction": "Caption A: The flowers are turning into full bloom.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The flowers are withering.\nOption 2: The flowers are turning into buds.", "gt-answer": "Option 1: The flowers are withering.", "video-llm-prediction": "Option 2: The flowers are turning into buds.", "match_success": true, "rating": 0 } ] }, "1083113449_reverse": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The sky is maintaining its brightness.\nCaption B: The sky is getting brighter.", "gt-answer": "Caption B: The sky is getting brighter.", "video-llm-prediction": "Caption B: The sky is getting brighter.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The sky is lightening up.\nOption 2: The sky is getting darker.", "gt-answer": "Option 1: The sky is lightening up.", "video-llm-prediction": "Option 2: The sky is getting darker.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The sky is getting darker.\nSentence B: The sky is brightening.", "gt-answer": "Sentence B: The sky is brightening.", "video-llm-prediction": "Sentence B: The sky is brightening.", "match_success": true, "rating": 1 } ] }, "1091143117_reverse": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The eye is changing from open to closed.\nOption 2: The eye is blinking rapidly.", "gt-answer": "Option 1: The eye is changing from open to closed.", "video-llm-prediction": "Option 1: The eye is changing from open to closed.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The eye is changing from closed to open.\nCaption B: The eye is changing from open to closed.", "gt-answer": "Caption B: The eye is changing from open to closed.", "video-llm-prediction": "Caption B: The eye is changing from open to closed.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The eye is changing from closed to open.\nSentence B: The eye is changing from open to closed.", "gt-answer": "Sentence B: The eye is changing from open to closed.", "video-llm-prediction": "Sentence A: The eye is changing from closed to open.", "match_success": true, "rating": 0 } ] }, "1098153561_reverse": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The leaves are changing color from green to yellow.\nCaption B: The leaves are changing color from yellow to green.", "gt-answer": "Caption B: The leaves are changing color from yellow to green.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The leaves are changing color from yellow to green.\nSentence B: The leaves are changing color from green to yellow.", "gt-answer": "Sentence A: The leaves are changing color from yellow to green.", "video-llm-prediction": "Sentence A: The leaves are changing color from yellow to green.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The leaves are changing color from yellow to green.\nOption 2: The leaves are changing color from green to yellow.", "gt-answer": "Option 1: The leaves are changing color from yellow to green.", "video-llm-prediction": "Option 1: The leaves are changing color from yellow to green.", "match_success": true, "rating": 1 } ] }, "1059072773_reverse": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The ice cream is melting.\nCaption B: The ice cream is freezing.", "gt-answer": "Caption B: The ice cream is freezing.", "video-llm-prediction": "Caption A: The ice cream is melting.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The ice cream is evaporating.\nOption 2: The ice cream is freezing.", "gt-answer": "Option 2: The ice cream is freezing.", "video-llm-prediction": "Option 1: The ice cream is evaporating.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The ice cream is melting.\nSentence B: The ice cream is solidifying.", "gt-answer": "Sentence B: The ice cream is solidifying.", "video-llm-prediction": "Sentence A: The ice cream is melting.", "match_success": true, "rating": 0 } ] }, "1093573255_reverse": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The plant is reviving.\nOption 2: The plant is wilting.", "gt-answer": "Option 2: The plant is wilting.", "video-llm-prediction": "Option 1: The plant is reviving.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The plant is decaying.\nSentence B: The plant is reviving.", "gt-answer": "Sentence A: The plant is decaying.", "video-llm-prediction": "Sentence B: The plant is reviving.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The plant is reviving.\nCaption B: The plant is wilting.", "gt-answer": "Caption B: The plant is wilting.", "video-llm-prediction": "Caption A: The plant is reviving.", "match_success": true, "rating": 0 } ] }, "1077146132_reverse": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is turning darker.\nOption 2: The video is turning brighter.", "gt-answer": "Option 1: The video is turning darker.", "video-llm-prediction": "Option 2: The video is turning brighter.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is staying the same brightness.\nSentence B: The video is turning darker.", "gt-answer": "Sentence B: The video is turning darker.", "video-llm-prediction": "Sentence B: The video is turning darker.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is decreasing in brightness.\nCaption B: The video is turning brighter.", "gt-answer": "Caption A: The video is decreasing in brightness.", "video-llm-prediction": "Caption A: The video is decreasing in brightness.", "match_success": true, "rating": 1 } ] }, "1086688682_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The traffic lights are changing from red to yellow.\nSentence B: The traffic lights are changing from green to red.", "gt-answer": "Sentence B: The traffic lights are changing from green to red.", "video-llm-prediction": "Sentence A: The traffic lights are changing from red to yellow.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The traffic lights are changing from yellow to red.\nCaption B: The traffic lights are changing from green to red.", "gt-answer": "Caption B: The traffic lights are changing from green to red.", "video-llm-prediction": "Caption B: The traffic lights are changing from green to red.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The traffic lights are changing from green to red.\nOption 2: The traffic lights are changing from red to green.", "gt-answer": "Option 1: The traffic lights are changing from green to red.", "video-llm-prediction": "Option 1: The traffic lights are changing from green to red.", "match_success": true, "rating": 1 } ] }, "1099738787_reverse": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The concrete wall is falling apart.\nCaption B: The concrete wall is assembling together.", "gt-answer": "Caption B: The concrete wall is assembling together.", "video-llm-prediction": "Caption B: The concrete wall is assembling together.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The concrete wall is falling apart.\nOption 2: The concrete wall is growing taller.", "gt-answer": "Option 2: The concrete wall is growing taller.", "video-llm-prediction": "Option 1: The concrete wall is falling apart.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The concrete wall is assembling together.\nSentence B: The concrete wall is falling apart.", "gt-answer": "Sentence A: The concrete wall is assembling together.", "video-llm-prediction": "Sentence B: The concrete wall is falling apart.", "match_success": true, "rating": 0 } ] }, "1094219575_reverse": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The flowers are blooming.\nCaption B: The flowers are wilting.", "gt-answer": "Caption A: The flowers are blooming.", "video-llm-prediction": "Caption B: The flowers are wilting.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The flowers are blooming.\nOption 2: The flowers are wilting.", "gt-answer": "Option 1: The flowers are blooming.", "video-llm-prediction": "Option 1: The flowers are blooming.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The flowers are wilting.\nSentence B: The flowers are blooming.", "gt-answer": "Sentence B: The flowers are blooming.", "video-llm-prediction": "Sentence B: The flowers are blooming.", "match_success": true, "rating": 1 } ] }, "1067427023_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The snow particles are melting.\nSentence B: The snow particles are solidifying.", "gt-answer": "Sentence B: The snow particles are solidifying.", "video-llm-prediction": "Sentence A: The snow particles are melting.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The snow particles are freezing.\nOption 2: The snow particles are melting.", "gt-answer": "Option 1: The snow particles are freezing.", "video-llm-prediction": "Option 2: The snow particles are melting.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The snow particles are freezing.\nCaption B: The snow particles are remaining the same.", "gt-answer": "Caption A: The snow particles are freezing.", "video-llm-prediction": "Caption A: The snow particles are freezing.", "match_success": true, "rating": 1 } ] }, "1093948229_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The golden particles are exploding.\nSentence B: The golden particles are disappearing.", "gt-answer": "Sentence B: The golden particles are disappearing.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The golden particles are exploding.\nOption 2: The golden particles are shrinking.", "gt-answer": "Option 2: The golden particles are shrinking.", "video-llm-prediction": "Option 1: The golden particles are exploding.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The golden particles are contracting.\nCaption B: The golden particles are exploding.", "gt-answer": "Caption A: The golden particles are contracting.", "video-llm-prediction": "Caption B: The golden particles are exploding.", "match_success": true, "rating": 0 } ] }, "1049965795_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The light bulb is turning off.\nSentence B: The light bulb is lighting up.", "gt-answer": "Sentence A: The light bulb is turning off.", "video-llm-prediction": "Sentence A: The light bulb is turning off.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The light bulb is breaking.\nCaption B: The light bulb is turning off.", "gt-answer": "Caption B: The light bulb is turning off.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The light bulb is rapidly changing color.\nOption 2: The light bulb is turning off.", "gt-answer": "Option 2: The light bulb is turning off.", "video-llm-prediction": "Option 1: The light bulb is rapidly changing color.", "match_success": true, "rating": 0 } ] }, "1052364490_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The digital bar chart is decreasing.\nSentence B: The digital bar chart is staying in the same size.", "gt-answer": "Sentence A: The digital bar chart is decreasing.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The digital bar chart is growing.\nCaption B: The digital bar chart is shrinking.", "gt-answer": "Caption B: The digital bar chart is shrinking.", "video-llm-prediction": "Caption B: The digital bar chart is shrinking.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The digital bar chart is decreasing.\nOption 2: The digital bar chart is growing.", "gt-answer": "Option 1: The digital bar chart is decreasing.", "video-llm-prediction": "Option 1: The digital bar chart is decreasing.", "match_success": true, "rating": 1 } ] }, "1080990179_reverse": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The apple is shrinking and drying out.\nCaption B: The apple is ripening.", "gt-answer": "Caption B: The apple is ripening.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The apple is shrinking and drying out.\nSentence B: The apple is growing bigger.", "gt-answer": "Sentence B: The apple is growing bigger.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The apple is becoming juicier.\nOption 2: The apple is shrinking and drying out.", "gt-answer": "Option 1: The apple is becoming juicier.", "video-llm-prediction": "Option 2: The apple is shrinking and drying out.", "match_success": true, "rating": 0 } ] }, "1077575504_reverse": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The pile of salt is shrinking.\nOption 2: The pile of salt is growing.", "gt-answer": "Option 1: The pile of salt is shrinking.", "video-llm-prediction": "Option 1: The pile of salt is shrinking.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The pile of salt is melting.\nSentence B: The pile of salt is shrinking.", "gt-answer": "Sentence B: The pile of salt is shrinking.", "video-llm-prediction": "Sentence B: The pile of salt is shrinking.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The pile of salt is growing.\nCaption B: The pile of salt is disappearing.", "gt-answer": "Caption B: The pile of salt is disappearing.", "video-llm-prediction": "Caption B: The pile of salt is disappearing.", "match_success": true, "rating": 1 } ] }, "23370985_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The ice crystal is melting.\nSentence B: The ice crystal is shattering.", "gt-answer": "Sentence A: The ice crystal is melting.", "video-llm-prediction": "Sentence B: The ice crystal is shattering.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The ice crystal is melting.\nCaption B: The ice crystal is forming.", "gt-answer": "Caption A: The ice crystal is melting.", "video-llm-prediction": "Caption B: The ice crystal is forming.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The ice crystal is reducing in size.\nOption 2: The ice crystal is growing in size.", "gt-answer": "Option 1: The ice crystal is reducing in size.", "video-llm-prediction": "Option 1: The ice crystal is reducing in size.", "match_success": true, "rating": 1 } ] }, "1055981522_reverse": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The 3d shapes are transforming from cubes into spheres.\nCaption B: The 3d shapes are transforming from spheres into cubes.", "gt-answer": "Caption B: The 3d shapes are transforming from spheres into cubes.", "video-llm-prediction": "Caption A: The 3d shapes are transforming from cubes into spheres.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The 3d shapes are transforming from spheres into cubes.\nOption 2: The 3d shapes are transforming from cubes into spheres.", "gt-answer": "Option 1: The 3d shapes are transforming from spheres into cubes.", "video-llm-prediction": "Option 1: The 3d shapes are transforming from spheres into cubes.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The 3d shapes are transforming from cubes into spheres.\nSentence B: The 3d shapes are transforming from spheres into cubes.", "gt-answer": "Sentence B: The 3d shapes are transforming from spheres into cubes.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 } ] }, "1081734740_reverse": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The leaf is turning from green to yellow.\nOption 2: The leaf is turning from yellow to green.", "gt-answer": "Option 2: The leaf is turning from yellow to green.", "video-llm-prediction": "Option 1: The leaf is turning from green to yellow.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The texture of the leaf is turning from green to yellow.\nCaption B: The leaf is turning from green to white.", "gt-answer": "Caption B: The leaf is turning from green to white.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The texture of the leaf is turning from yellow to green.\nSentence B: The texture of the leaf is turning from green to yellow.", "gt-answer": "Sentence A: The texture of the leaf is turning from yellow to green.", "video-llm-prediction": "Sentence A: The texture of the leaf is turning from yellow to green.", "match_success": true, "rating": 1 } ] }, "1093719871_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The pink flowers are turning from full blossom to withered.\nSentence B: The pink flowers are turning from withered to full blossom.", "gt-answer": "Sentence A: The pink flowers are turning from full blossom to withered.", "video-llm-prediction": "Sentence A: The pink flowers are turning from full blossom to withered.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The pink flowers are growing taller.\nOption 2: The pink flowers are turning from full blossom to withered.", "gt-answer": "Option 2: The pink flowers are turning from full blossom to withered.", "video-llm-prediction": "Option 2: The pink flowers are turning from full blossom to withered.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The pink flowers are falling off petals.\nCaption B: The pink flowers are turning from full blossom to withered.", "gt-answer": "Caption B: The pink flowers are turning from full blossom to withered.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1055973887_reverse": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The fire ball is getting smaller.\nOption 2: The fire ball is getting larger.", "gt-answer": "Option 1: The fire ball is getting smaller.", "video-llm-prediction": "Option 2: The fire ball is getting larger.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The fire ball is shrinking in size.\nSentence B: The fire ball is getting larger.", "gt-answer": "Sentence A: The fire ball is shrinking in size.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The fire ball is getting larger.\nCaption B: The fire ball is getting smaller.", "gt-answer": "Caption B: The fire ball is getting smaller.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "1098710341_reverse": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The glass is going from empty to full of water.\nOption 2: The glass is going from full of water to empty.", "gt-answer": "Option 2: The glass is going from full of water to empty.", "video-llm-prediction": "Option 1: The glass is going from empty to full of water.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The glass is going from full of water to empty.\nSentence B: The glass is going from empty to full of water.", "gt-answer": "Sentence A: The glass is going from full of water to empty.", "video-llm-prediction": "Sentence B: The glass is going from empty to full of water.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The glass is going from empty to full of water.\nCaption B: The glass is going from full of water to empty.", "gt-answer": "Caption B: The glass is going from full of water to empty.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "1099116143_reverse": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: White foam appears on the surface of the pink liquid.\nCaption B: White foam disappears from the surface of the pink liquid.", "gt-answer": "Caption B: White foam disappears from the surface of the pink liquid.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The pink liquid is freezing.\nOption 2: White foam disappears from the surface of the pink liquid.", "gt-answer": "Option 2: White foam disappears from the surface of the pink liquid.", "video-llm-prediction": "Option 2: White foam disappears from the surface of the pink liquid.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: White foam disappears from the surface of the pink liquid.\nSentence B: The pink liquid is evaporating.", "gt-answer": "Sentence A: White foam disappears from the surface of the pink liquid.", "video-llm-prediction": "Sentence B: The pink liquid is evaporating.", "match_success": true, "rating": 0 } ] }, "1025623079_reverse": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The skyscraper is growing taller.\nCaption B: The skyscraper is collapsing.", "gt-answer": "Caption A: The skyscraper is growing taller.", "video-llm-prediction": "Caption B: The skyscraper is collapsing.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The skyscraper is growing taller.\nOption 2: The skyscraper is collapsing.", "gt-answer": "Option 1: The skyscraper is growing taller.", "video-llm-prediction": "Option 2: The skyscraper is collapsing.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The skyscraper is standing up.\nSentence B: The skyscraper is collapsing.", "gt-answer": "Sentence A: The skyscraper is standing up.", "video-llm-prediction": "Sentence B: The skyscraper is collapsing.", "match_success": true, "rating": 0 } ] }, "1063373449_reverse": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The coronavirus is assembling together.\nOption 2: The coronavirus is decreasing in size.", "gt-answer": "Option 1: The coronavirus is assembling together.", "video-llm-prediction": "Option 2: The coronavirus is decreasing in size.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The coronavirus is dissolving under the attack of antibodies.\nSentence B: The coronavirus is assembling together.", "gt-answer": "Sentence B: The coronavirus is assembling together.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The coronavirus is dissolving under the attack of antibodies.\nCaption B: The coronavirus is assembling together.", "gt-answer": "Caption B: The coronavirus is assembling together.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "1062609286_reverse": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The battery charge is decreasing.\nOption 2: The battery charge is increasing.", "gt-answer": "Option 1: The battery charge is decreasing.", "video-llm-prediction": "Option 1: The battery charge is decreasing.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The battery charge is staying constant.\nCaption B: The battery charge is decreasing.", "gt-answer": "Caption B: The battery charge is decreasing.", "video-llm-prediction": "Caption B: The battery charge is decreasing.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The battery charge is decreasing.\nSentence B: The battery charge is increasing.", "gt-answer": "Sentence A: The battery charge is decreasing.", "video-llm-prediction": "Sentence A: The battery charge is decreasing.", "match_success": true, "rating": 1 } ] }, "1060403198_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The tree is decreasing in height.\nSentence B: The tree is increasing in height.", "gt-answer": "Sentence A: The tree is decreasing in height.", "video-llm-prediction": "Sentence A: The tree is decreasing in height.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The tree is growing taller.\nCaption B: The tree is becoming shorter.", "gt-answer": "Caption B: The tree is becoming shorter.", "video-llm-prediction": "Caption A: The tree is growing taller.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The tree is decreasing in height.\nOption 2: The tree is increasing in height.", "gt-answer": "Option 1: The tree is decreasing in height.", "video-llm-prediction": "Option 1: The tree is decreasing in height.", "match_success": true, "rating": 1 } ] }, "1099109715_reverse": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The number of fruits and vegetables is increasing.\nOption 2: The number of fruits and vegetables is decreasing.", "gt-answer": "Option 2: The number of fruits and vegetables is decreasing.", "video-llm-prediction": "Option 1: The number of fruits and vegetables is increasing.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The number of fruits and vegetables is decreasing.\nSentence B: The fruits and vegetables are becoming rotten.", "gt-answer": "Sentence A: The number of fruits and vegetables is decreasing.", "video-llm-prediction": "Sentence A: The number of fruits and vegetables is decreasing.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The number of fruits and vegetables is decreasing.\nCaption B: The fruits and vegetables are staying the same.", "gt-answer": "Caption A: The number of fruits and vegetables is decreasing.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1051211272_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The toast with cream cheese is gradually growing larger.\nSentence B: The toast with cream cheese is gradually being eaten out and disappearing.", "gt-answer": "Sentence A: The toast with cream cheese is gradually growing larger.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The toast with cream cheese is gradually being eaten out and disappearing.\nOption 2: The toast with cream cheese is gradually emerging piece by piece.", "gt-answer": "Option 2: The toast with cream cheese is gradually emerging piece by piece.", "video-llm-prediction": "Option 1: The toast with cream cheese is gradually being eaten out and disappearing.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The toast with cream cheese is gradually growing larger.\nCaption B: The toast with cream cheese is gradually being eaten out and disappearing.", "gt-answer": "Caption A: The toast with cream cheese is gradually growing larger.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1059815936_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The paper is changing from a wrinkled paper ball into a flattened sheet.\nSentence B: The paper is changing from a flattened sheet into a wrinkled paper ball.", "gt-answer": "Sentence B: The paper is changing from a flattened sheet into a wrinkled paper ball.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The paper is changing from a wrinkled paper ball into a flattened sheet.\nCaption B: The paper is disappearing.", "gt-answer": "Caption B: The paper is disappearing.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The paper is changing from a flattened sheet into a wrinkled paper ball.\nOption 2: The paper is tearing apart.", "gt-answer": "Option 1: The paper is changing from a flattened sheet into a wrinkled paper ball.", "video-llm-prediction": "Option 1: The paper is changing from a flattened sheet into a wrinkled paper ball.", "match_success": true, "rating": 1 } ] }, "1059718130_concat_0": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The sun above is rising at the same speed as the sun below.\nSentence B: The sun above is rising slower than the sun below.", "gt-answer": "Sentence A: The sun above is rising at the same speed as the sun below.", "video-llm-prediction": "Sentence A: The sun above is rising at the same speed as the sun below.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The sun above is rising at the same speed as the sun below.\nCaption B: The sun below is rising faster than the sun above.", "gt-answer": "Caption A: The sun above is rising at the same speed as the sun below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The sun above is rising at the same speed as the sun below.\nOption 2: The sun above is not rising at the same speed as the sun below.", "gt-answer": "Option 1: The sun above is rising at the same speed as the sun below.", "video-llm-prediction": "Option 2: The sun above is not rising at the same speed as the sun below.", "match_success": true, "rating": 0 } ] }, "1059718130_concat_1": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The sun above is rising faster than the sun below.\nOption 2: The sun above is rising slower than the sun below.", "gt-answer": "Option 2: The sun above is rising slower than the sun below.", "video-llm-prediction": "Option 1: The sun above is rising faster than the sun below.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: Both suns are rising at the same speed.\nCaption B: The sun above is rising slower than the sun below.", "gt-answer": "Caption B: The sun above is rising slower than the sun below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The sun below is rising at the same speed as the sun above.\nSentence B: The sun above is rising slower than the sun below.", "gt-answer": "Sentence B: The sun above is rising slower than the sun below.", "video-llm-prediction": "Sentence B: The sun above is rising slower than the sun below.", "match_success": true, "rating": 1 } ] }, "1059718130_concat_2": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The sun above is rising faster.\nSentence B: The sun above is rising at the same speed as the sun below.", "gt-answer": "Sentence A: The sun above is rising faster.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The sun above is rising faster.\nOption 2: The sun below is rising faster than the sun above.", "gt-answer": "Option 1: The sun above is rising faster.", "video-llm-prediction": "Option 2: The sun below is rising faster than the sun above.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The sun above is rising faster.\nCaption B: Both suns are rising at the same speed.", "gt-answer": "Caption A: The sun above is rising faster.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1098153561_concat_0": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The speed that the forest above changes color is faster than the forest below.\nCaption B: The speed that the forest above changes color is at the same speed as the forest below.", "gt-answer": "Caption B: The speed that the forest above changes color is at the same speed as the forest below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The speed that the forest above changes color is at the same speed as the forest below.\nOption 2: The speed that the forest above changes color is slower than the forest below.", "gt-answer": "Option 1: The speed that the forest above changes color is at the same speed as the forest below.", "video-llm-prediction": "Option 2: The speed that the forest above changes color is slower than the forest below.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The speed that the forest above changes color is at the same speed as the forest below.\nSentence B: The speed that the forest above changes color is faster than the forest below.", "gt-answer": "Sentence A: The speed that the forest above changes color is at the same speed as the forest below.", "video-llm-prediction": "Sentence A: The speed that the forest above changes color is at the same speed as the forest below.", "match_success": true, "rating": 1 } ] }, "1098153561_concat_1": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The forest above changes color faster than the forest below.\nSentence B: The forest above changes color at a slower speed than the forest below.", "gt-answer": "Sentence B: The forest above changes color at a slower speed than the forest below.", "video-llm-prediction": "Sentence A: The forest above changes color faster than the forest below.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The forest above changes color at the same speed as the forest below.\nCaption B: The forest above changes color at a slower speed than the forest below.", "gt-answer": "Caption B: The forest above changes color at a slower speed than the forest below.", "video-llm-prediction": "Caption A: The forest above changes color at the same speed as the forest below.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The forest above changes color at a slower speed than the forest below.\nOption 2: The forest above changes color at a faster speed than the forest below.", "gt-answer": "Option 1: The forest above changes color at a slower speed than the forest below.", "video-llm-prediction": "Option 2: The forest above changes color at a faster speed than the forest below.", "match_success": true, "rating": 0 } ] }, "1098153561_concat_2": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The forest above changes color faster than the forest below.\nOption 2: The forest above changes color at the same speed as the forest below.", "gt-answer": "Option 1: The forest above changes color faster than the forest below.", "video-llm-prediction": "Option 1: The forest above changes color faster than the forest below.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The forest above changes color slower than the forest below.\nCaption B: The forest above changes color faster than the forest below.", "gt-answer": "Caption B: The forest above changes color faster than the forest below.", "video-llm-prediction": "Caption B: The forest above changes color faster than the forest below.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The forest below changes color faster than the forest above.\nSentence B: The forest above changes color faster than the forest below.", "gt-answer": "Sentence B: The forest above changes color faster than the forest below.", "video-llm-prediction": "Sentence A: The forest below changes color faster than the forest above.", "match_success": true, "rating": 0 } ] }, "1059462749_concat_0": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The person above and the person below are moving at the same speed.\nCaption B: The person above is climbing faster than the person below.", "gt-answer": "Caption A: The person above and the person below are moving at the same speed.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person above and the person below are moving at the same speed.\nOption 2: The person above is climbing slower than the person below.", "gt-answer": "Option 1: The person above and the person below are moving at the same speed.", "video-llm-prediction": "Option 2: The person above is climbing slower than the person below.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The person above and the person below are moving at the same speed.\nSentence B: The person below is climbing faster than the person above.", "gt-answer": "Sentence A: The person above and the person below are moving at the same speed.", "video-llm-prediction": "Sentence B: The person below is climbing faster than the person above.", "match_success": true, "rating": 0 } ] }, "1059462749_concat_1": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The person above is climbing faster than the person below.\nCaption B: The person above is climbing slower than the person below.", "gt-answer": "Caption B: The person above is climbing slower than the person below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The person above is climbing slower than the person below.\nSentence B: The person above and below are moving at the same speed.", "gt-answer": "Sentence A: The person above is climbing slower than the person below.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person above is climbing slower than the person below.\nOption 2: The person above is moving at the same speed as the person below.", "gt-answer": "Option 1: The person above is climbing slower than the person below.", "video-llm-prediction": "Option 1: The person above is climbing slower than the person below.", "match_success": true, "rating": 1 } ] }, "1059462749_concat_2": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The person above is climbing faster than the person below.\nCaption B: The person above is climbing at the same speed as the person below.", "gt-answer": "Caption A: The person above is climbing faster than the person below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The person above is climbing faster than the person below.\nSentence B: The person above is climbing slower than the person below.", "gt-answer": "Sentence A: The person above is climbing faster than the person below.", "video-llm-prediction": "Sentence A: The person above is climbing faster than the person below.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person above is climbing faster than the person below.\nOption 2: The person below is moving faster than the person above.", "gt-answer": "Option 1: The person above is climbing faster than the person below.", "video-llm-prediction": "Option 2: The person below is moving faster than the person above.", "match_success": true, "rating": 0 } ] }, "1059072773_concat_0": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The ice cream above is melting at the same speed as the ice cream below.\nOption 2: The ice cream above is melting faster than the ice cream below.", "gt-answer": "Option 1: The ice cream above is melting at the same speed as the ice cream below.", "video-llm-prediction": "Option 2: The ice cream above is melting faster than the ice cream below.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The ice cream above is melting at the same speed as the ice cream below.\nSentence B: The melting of the ice cream above is slower than the ice cream below.", "gt-answer": "Sentence A: The ice cream above is melting at the same speed as the ice cream below.", "video-llm-prediction": "Sentence A: The ice cream above is melting at the same speed as the ice cream below.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The ice cream above is melting at the same speed as the ice cream below.\nCaption B: The melting of the ice cream above is not at the same speed as the ice cream below.", "gt-answer": "Caption A: The ice cream above is melting at the same speed as the ice cream below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1059072773_concat_1": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The ice cream above is melting faster than the ice cream below.\nCaption B: The ice cream above is melting slower than the ice cream below.", "gt-answer": "Caption B: The ice cream above is melting slower than the ice cream below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The ice cream above is melting slower than the ice cream below.\nOption 2: The ice cream above is melting at the same speed as the ice cream below.", "gt-answer": "Option 1: The ice cream above is melting slower than the ice cream below.", "video-llm-prediction": "Option 1: The ice cream above is melting slower than the ice cream below.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The ice cream above is melting slower than the ice cream below.\nSentence B: The ice cream above is melting faster than the ice cream below.", "gt-answer": "Sentence A: The ice cream above is melting slower than the ice cream below.", "video-llm-prediction": "Sentence A: The ice cream above is melting slower than the ice cream below.", "match_success": true, "rating": 1 } ] }, "1059072773_concat_2": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The ice cream above is melting at the same speed as the ice cream below.\nSentence B: The ice cream above is melting faster than the ice cream below.", "gt-answer": "Sentence B: The ice cream above is melting faster than the ice cream below.", "video-llm-prediction": "Sentence B: The ice cream above is melting faster than the ice cream below.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The ice cream above is melting faster than the ice cream below.\nOption 2: The ice cream above is melting slower than the ice cream below.", "gt-answer": "Option 1: The ice cream above is melting faster than the ice cream below.", "video-llm-prediction": "Option 1: The ice cream above is melting faster than the ice cream below.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The ice cream above is melting faster than the ice cream below.\nCaption B: The ice cream above is melting slower than the ice cream below.", "gt-answer": "Caption A: The ice cream above is melting faster than the ice cream below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1090578917_concat_0": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The man above is jumping slower than the man below.\nCaption B: The man above is jumping at the same speed as the man below.", "gt-answer": "Caption B: The man above is jumping at the same speed as the man below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man above is jumping at the same speed as the man below.\nSentence B: The man above is jumping faster than the man below.", "gt-answer": "Sentence A: The man above is jumping at the same speed as the man below.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: In comparison to the man below, the speed of the man above jumping looks faster.\nOption 2: The man above is jumping at the same speed as the man below.", "gt-answer": "Option 2: The man above is jumping at the same speed as the man below.", "video-llm-prediction": "Option 1: In comparison to the man below, the speed of the man above jumping looks faster.", "match_success": true, "rating": 0 } ] }, "1090578917_concat_1": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The man above is jumping at a slower speed than the man below.\nOption 2: The man above is jumping at the same speed as the man below.", "gt-answer": "Option 1: The man above is jumping at a slower speed than the man below.", "video-llm-prediction": "Option 1: The man above is jumping at a slower speed than the man below.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man above is jumping at a faster speed than the man below.\nSentence B: The man above is jumping at a slower speed than the man below.", "gt-answer": "Sentence B: The man above is jumping at a slower speed than the man below.", "video-llm-prediction": "Sentence A: The man above is jumping at a faster speed than the man below.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The man above is jumping at a slower speed than the man below.\nCaption B: The man above is jumping at a faster speed than the man below.", "gt-answer": "Caption A: The man above is jumping at a slower speed than the man below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1090578917_concat_2": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The man above is jumping faster than the man below.\nSentence B: The man above is jumping at the same speed as the man below.", "gt-answer": "Sentence A: The man above is jumping faster than the man below.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The man above is jumping faster than the man below.\nCaption B: The man above is jumping slower than the man below.", "gt-answer": "Caption A: The man above is jumping faster than the man below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The speed of the man above jumping looks slower.\nOption 2: The man above is jumping faster than the man below.", "gt-answer": "Option 2: The man above is jumping faster than the man below.", "video-llm-prediction": "Option 2: The man above is jumping faster than the man below.", "match_success": true, "rating": 1 } ] }, "1075338140_concat_0": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The ball above is faster than the ball below.\nSentence B: The speed of the ball above is the same as the ball below.", "gt-answer": "Sentence B: The speed of the ball above is the same as the ball below.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The speed of the ball above is the same as the ball below.\nCaption B: The ball below is slower than the ball above.", "gt-answer": "Caption A: The speed of the ball above is the same as the ball below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The speed of the ball above is the same as the ball below.\nOption 2: Both balls are moving at different speeds.", "gt-answer": "Option 1: The speed of the ball above is the same as the ball below.", "video-llm-prediction": "Option 2: Both balls are moving at different speeds.", "match_success": true, "rating": 0 } ] }, "1075338140_concat_1": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The ball above is moving slower than the ball below.\nCaption B: The ball above is moving faster than the ball below.", "gt-answer": "Caption A: The ball above is moving slower than the ball below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The ball above is moving slower than the ball below.\nOption 2: Both balls are moving at the same speed.", "gt-answer": "Option 1: The ball above is moving slower than the ball below.", "video-llm-prediction": "Option 1: The ball above is moving slower than the ball below.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The ball above is moving slower than the ball below.\nSentence B: The ball below is moving slower.", "gt-answer": "Sentence A: The ball above is moving slower than the ball below.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 0 } ] }, "1075338140_concat_2": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The ball above is moving at the same speed as the ball below.\nOption 2: The ball above is moving faster.", "gt-answer": "Option 2: The ball above is moving faster.", "video-llm-prediction": "Option 1: The ball above is moving at the same speed as the ball below.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The ball below is moving faster.\nSentence B: The ball above is moving faster.", "gt-answer": "Sentence B: The ball above is moving faster.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The ball above is moving slower than the ball below.\nCaption B: The ball above is moving faster.", "gt-answer": "Caption B: The ball above is moving faster.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1077575504_concat_0": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The speed of the pile of salt above is growing at the same speed as the pile of salt below.\nSentence B: The pile of salt above is growing at a faster speed than the pile of salt below.", "gt-answer": "Sentence A: The speed of the pile of salt above is growing at the same speed as the pile of salt below.", "video-llm-prediction": "Sentence B: The pile of salt above is growing at a faster speed than the pile of salt below.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The speed of the pile of salt above is growing at the same speed as the pile of salt below.\nCaption B: The pile of salt above is growing at a slower speed than the pile of salt below.", "gt-answer": "Caption A: The speed of the pile of salt above is growing at the same speed as the pile of salt below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The pile of salt above is growing at a slower speed than the pile of salt below.\nOption 2: The speed of the pile of salt above is growing at the same speed as the pile of salt below.", "gt-answer": "Option 2: The speed of the pile of salt above is growing at the same speed as the pile of salt below.", "video-llm-prediction": "Option 1: The pile of salt above is growing at a slower speed than the pile of salt below.", "match_success": true, "rating": 0 } ] }, "1077575504_concat_1": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The pile of salt above is growing at a slower speed than the pile of salt below.\nSentence B: The pile of salt above is growing at a faster speed than the pile of salt below.", "gt-answer": "Sentence A: The pile of salt above is growing at a slower speed than the pile of salt below.", "video-llm-prediction": "Sentence A: The pile of salt above is growing at a slower speed than the pile of salt below.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The pile of salt above is growing at a slower speed than the pile of salt below.\nCaption B: The pile of salt above is growing at the same speed as the pile of salt below.", "gt-answer": "Caption A: The pile of salt above is growing at a slower speed than the pile of salt below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The pile of salt below is growing slower than the pile of salt above.\nOption 2: The pile of salt above is growing at a slower speed than the pile of salt below.", "gt-answer": "Option 2: The pile of salt above is growing at a slower speed than the pile of salt below.", "video-llm-prediction": "Option 2: The pile of salt above is growing at a slower speed than the pile of salt below.", "match_success": true, "rating": 1 } ] }, "1077575504_concat_2": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The pile of salt above is growing at a faster speed than the pile of salt below.\nCaption B: The pile of salt above is growing at the same speed as the pile of salt below.", "gt-answer": "Caption A: The pile of salt above is growing at a faster speed than the pile of salt below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The pile of salt above is growing at a faster speed than the pile of salt below.\nOption 2: The pile of salt above is growing at a slower speed than the pile of salt below.", "gt-answer": "Option 1: The pile of salt above is growing at a faster speed than the pile of salt below.", "video-llm-prediction": "Option 1: The pile of salt above is growing at a faster speed than the pile of salt below.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The pile of salt above is growing at a slower speed than the pile of salt below.\nSentence B: The pile of salt above is growing at a faster speed than the pile of salt below.", "gt-answer": "Sentence B: The pile of salt above is growing at a faster speed than the pile of salt below.", "video-llm-prediction": "Sentence A: The pile of salt above is growing at a slower speed than the pile of salt below.", "match_success": true, "rating": 0 } ] }, "1100319395_concat_0": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The white circle above is rotating at the same speed as the white circle below.\nOption 2: The white circle above is rotating faster than the white circle below.", "gt-answer": "Option 1: The white circle above is rotating at the same speed as the white circle below.", "video-llm-prediction": "Option 1: The white circle above is rotating at the same speed as the white circle below.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The white circle above is rotating at the same speed as the white circle below.\nCaption B: The white circle above is rotating slower than the white circle below.", "gt-answer": "Caption A: The white circle above is rotating at the same speed as the white circle below.", "video-llm-prediction": "Caption B: The white circle above is rotating slower than the white circle below.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The white circle above is rotating at the same speed as the white circle below.\nSentence B: The white circle above is rotating faster than the white circle below.", "gt-answer": "Sentence A: The white circle above is rotating at the same speed as the white circle below.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1100319395_concat_1": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The white circle above is rotating faster than the white circle below.\nOption 2: The white circle above is rotating at the same speed as the white circle below.", "gt-answer": "Option 1: The white circle above is rotating faster than the white circle below.", "video-llm-prediction": "Option 1: The white circle above is rotating faster than the white circle below.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The white circle above is rotating faster than the white circle below.\nCaption B: The white circle above is rotating slower than the white circle below.", "gt-answer": "Caption A: The white circle above is rotating faster than the white circle below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The white circle above is rotating slower than the white circle below.\nSentence B: The white circle above is rotating faster than the white circle below.", "gt-answer": "Sentence B: The white circle above is rotating faster than the white circle below.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ] }, "1100319395_concat_2": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The white circle above is rotating at a slower speed than the white circle below.\nCaption B: The white circle above is rotating at the same speed as the white circle below.", "gt-answer": "Caption A: The white circle above is rotating at a slower speed than the white circle below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The white circle above is rotating at a slower speed than the white circle below.\nSentence B: The white circle above is rotating faster than the white circle below.", "gt-answer": "Sentence A: The white circle above is rotating at a slower speed than the white circle below.", "video-llm-prediction": "Sentence B: The white circle above is rotating faster than the white circle below.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The white circle above is rotating at a slower speed than the white circle below.\nOption 2: The white circle above is rotating faster than the white circle below.", "gt-answer": "Option 1: The white circle above is rotating at a slower speed than the white circle below.", "video-llm-prediction": "Option 2: The white circle above is rotating faster than the white circle below.", "match_success": true, "rating": 0 } ] }, "1086820415_concat_0": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The speed of the wave above is the same as the wave below.\nOption 2: The wave above is moving faster than the wave below.", "gt-answer": "Option 1: The speed of the wave above is the same as the wave below.", "video-llm-prediction": "Option 2: The wave above is moving faster than the wave below.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The speed of the wave above is the same as the wave below.\nSentence B: The wave above is moving slower than the wave below.", "gt-answer": "Sentence A: The speed of the wave above is the same as the wave below.", "video-llm-prediction": "Sentence A: The speed of the wave above is the same as the wave below.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The wave below is moving faster.\nCaption B: The speed of the wave above is the same as the wave below.", "gt-answer": "Caption B: The speed of the wave above is the same as the wave below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1086820415_concat_1": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The speed of the wave above is slower than the wave below.\nCaption B: The speed of the wave above is faster than the wave below.", "gt-answer": "Caption B: The speed of the wave above is faster than the wave below.", "video-llm-prediction": "Caption B: The speed of the wave above is faster than the wave below.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The speed of the wave above is faster than the wave below.\nOption 2: Both waves are moving at the same speed.", "gt-answer": "Option 1: The speed of the wave above is faster than the wave below.", "video-llm-prediction": "Option 1: The speed of the wave above is faster than the wave below.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The speed of the wave above is faster than the wave below.\nSentence B: The wave below is moving faster than the wave above.", "gt-answer": "Sentence A: The speed of the wave above is faster than the wave below.", "video-llm-prediction": "Sentence B: The wave below is moving faster than the wave above.", "match_success": true, "rating": 0 } ] }, "1086820415_concat_2": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The wave above is moving at the same speed as the wave below.\nCaption B: The wave above is moving slower than the wave below.", "gt-answer": "Caption B: The wave above is moving slower than the wave below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Both waves are moving at the same speed.\nOption 2: The wave above is moving slower than the wave below.", "gt-answer": "Option 2: The wave above is moving slower than the wave below.", "video-llm-prediction": "Option 2: The wave above is moving slower than the wave below.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The wave below is moving faster than the wave above.\nSentence B: The wave above is moving slower than the wave below.", "gt-answer": "Sentence B: The wave above is moving slower than the wave below.", "video-llm-prediction": "Sentence B: The wave above is moving slower than the wave below.", "match_success": true, "rating": 1 } ] }, "1096238125_concat_0": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The two people above are lifting the kettlebell slower than the two people below.\nCaption B: The two people above are lifting the kettlebell at the same speed as the two people below.", "gt-answer": "Caption B: The two people above are lifting the kettlebell at the same speed as the two people below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The two people above are lifting the kettlebell at the same speed as the two people below.\nOption 2: The two people above are lifting the kettlebell faster than the two people below.", "gt-answer": "Option 1: The two people above are lifting the kettlebell at the same speed as the two people below.", "video-llm-prediction": "Option 2: The two people above are lifting the kettlebell faster than the two people below.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The two people above are lifting the kettlebell at the same speed as the two people below.\nSentence B: The two people above are lifting the kettlebell faster than the two people below.", "gt-answer": "Sentence A: The two people above are lifting the kettlebell at the same speed as the two people below.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1096238125_concat_1": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The two people above are lifting the kettlebell slower than the two people below.\nOption 2: The two people above are lifting the kettlebell faster than the two people below.", "gt-answer": "Option 2: The two people above are lifting the kettlebell faster than the two people below.", "video-llm-prediction": "Option 2: The two people above are lifting the kettlebell faster than the two people below.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The two people above are lifting the kettlebell faster than the two people below.\nCaption B: The two people above are lifting the kettlebell at the same speed as the two people below.", "gt-answer": "Caption A: The two people above are lifting the kettlebell faster than the two people below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The two people above are lifting the kettlebell slower than the two people below.\nSentence B: The two people above are lifting the kettlebell faster than the two people below.", "gt-answer": "Sentence B: The two people above are lifting the kettlebell faster than the two people below.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ] }, "1096238125_concat_2": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The two people above are lifting the kettlebell slower than the two people below.\nOption 2: The two people above are lifting the kettlebell at the same speed as the two people below.", "gt-answer": "Option 1: The two people above are lifting the kettlebell slower than the two people below.", "video-llm-prediction": "Option 1: The two people above are lifting the kettlebell slower than the two people below.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The two people above are lifting the kettlebell faster than the two people below.\nCaption B: The two people above are lifting the kettlebell slower than the two people below.", "gt-answer": "Caption B: The two people above are lifting the kettlebell slower than the two people below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The two people above are lifting the kettlebell slower than the two people below.\nSentence B: The two people above are lifting the kettlebell faster than the two people below.", "gt-answer": "Sentence A: The two people above are lifting the kettlebell slower than the two people below.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1056433484_concat_0": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The speed of the earth above is the same as the speed of the earth below.\nSentence B: The speed of the earth above is slower than the earth below.", "gt-answer": "Sentence A: The speed of the earth above is the same as the speed of the earth below.", "video-llm-prediction": "Sentence B: The speed of the earth above is slower than the earth below.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The speed of the earth above is the same as the speed of the earth below.\nCaption B: The speed of the earth above is faster than the earth below.", "gt-answer": "Caption A: The speed of the earth above is the same as the speed of the earth below.", "video-llm-prediction": "Caption A: The speed of the earth above is the same as the speed of the earth below.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The speed of the earth above is the same as the speed of the earth below.\nOption 2: The speed of the earth above is different from the earth below.", "gt-answer": "Option 1: The speed of the earth above is the same as the speed of the earth below.", "video-llm-prediction": "Option 1: The speed of the earth above is the same as the speed of the earth below.", "match_success": true, "rating": 1 } ] }, "1056433484_concat_1": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The speed of the earth above is slower than the earth below.\nSentence B: The speed of the earth above is faster than the earth below.", "gt-answer": "Sentence A: The speed of the earth above is slower than the earth below.", "video-llm-prediction": "Sentence A: The speed of the earth above is slower than the earth below.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The speed of the earth above is slower than the earth below.\nCaption B: The speed of the earth above is the same as the earth below.", "gt-answer": "Caption A: The speed of the earth above is slower than the earth below.", "video-llm-prediction": "Caption A: The speed of the earth above is slower than the earth below.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The speed of the earth above is slower than the earth below.\nOption 2: The speed of the earth above is faster than the earth below.", "gt-answer": "Option 1: The speed of the earth above is slower than the earth below.", "video-llm-prediction": "Option 2: The speed of the earth above is faster than the earth below.", "match_success": true, "rating": 0 } ] }, "1056433484_concat_2": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The speed of the earth above is slower than the earth below.\nCaption B: The speed of the earth above is faster than the earth below.", "gt-answer": "Caption B: The speed of the earth above is faster than the earth below.", "video-llm-prediction": "Caption A: The speed of the earth above is slower than the earth below.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The speed of the earth above is the same as the earth below.\nOption 2: The speed of the earth above is faster than the earth below.", "gt-answer": "Option 2: The speed of the earth above is faster than the earth below.", "video-llm-prediction": "Option 2: The speed of the earth above is faster than the earth below.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The earth above rotates at the same speed as the earth below.\nSentence B: The speed of the earth above is faster than the earth below.", "gt-answer": "Sentence B: The speed of the earth above is faster than the earth below.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 } ] }, "15695290_concat_0": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The person above draws a picture more slowly than the person below.\nSentence B: The person above draws a picture at the same speed as the person below.", "gt-answer": "Sentence B: The person above draws a picture at the same speed as the person below.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person above draws a picture more rapidly than the person below.\nOption 2: The person above draws a picture at the same speed as the person below.", "gt-answer": "Option 2: The person above draws a picture at the same speed as the person below.", "video-llm-prediction": "Option 1: The person above draws a picture more rapidly than the person below.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The person above draws a picture more rapidly than the person below.\nCaption B: The person above draws a picture at the same speed as the person below.", "gt-answer": "Caption B: The person above draws a picture at the same speed as the person below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "15695290_concat_1": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The person above draws a picture at an equal speed as the person below.\nCaption B: The person above draws a picture more slowly than the person below.", "gt-answer": "Caption B: The person above draws a picture more slowly than the person below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The person above draws a picture more slowly than the person below.\nSentence B: The person above draws a picture more rapidly than the person below.", "gt-answer": "Sentence A: The person above draws a picture more slowly than the person below.", "video-llm-prediction": "Sentence B: The person above draws a picture more rapidly than the person below.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person above draws a picture more slowly than the person below.\nOption 2: The person above draws a picture faster than the person below.", "gt-answer": "Option 1: The person above draws a picture more slowly than the person below.", "video-llm-prediction": "Option 2: The person above draws a picture faster than the person below.", "match_success": true, "rating": 0 } ] }, "15695290_concat_2": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The person above is drawing a picture slower than the person below.\nSentence B: The person above is drawing a picture faster than the person below.", "gt-answer": "Sentence B: The person above is drawing a picture faster than the person below.", "video-llm-prediction": "Sentence B: The person above is drawing a picture faster than the person below.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person above is drawing a picture at the same speed as the person below.\nOption 2: The person above is drawing a picture faster than the person below.", "gt-answer": "Option 2: The person above is drawing a picture faster than the person below.", "video-llm-prediction": "Option 1: The person above is drawing a picture at the same speed as the person below.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The person above is drawing a picture faster than the person below.\nCaption B: The person above is drawing a picture more slowly than the person below.", "gt-answer": "Caption A: The person above is drawing a picture faster than the person below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1047368344_concat_0": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The girl above is running slower than the girl below.\nSentence B: The girl above is running at the same speed as the girl below.", "gt-answer": "Sentence B: The girl above is running at the same speed as the girl below.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The girl above is running faster than the girl below.\nOption 2: The girl above is running at the same speed as the girl below.", "gt-answer": "Option 2: The girl above is running at the same speed as the girl below.", "video-llm-prediction": "Option 1: The girl above is running faster than the girl below.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The girl above is running at the same speed as the girl below.\nCaption B: The speed of the girl above is different from the girl below.", "gt-answer": "Caption A: The girl above is running at the same speed as the girl below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1047368344_concat_1": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The girl above is running slower than the girl below.\nSentence B: The girl above is running at the same speed as the girl below.", "gt-answer": "Sentence A: The girl above is running slower than the girl below.", "video-llm-prediction": "Sentence A: The girl above is running slower than the girl below.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The girl above is running faster than the girl below.\nOption 2: The girl above is running slower than the girl below.", "gt-answer": "Option 2: The girl above is running slower than the girl below.", "video-llm-prediction": "Option 1: The girl above is running faster than the girl below.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The girl above is running faster than the girl below.\nCaption B: The girl above is running slower than the girl below.", "gt-answer": "Caption B: The girl above is running slower than the girl below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "1047368344_concat_2": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The girl above is running faster than the girl below.\nSentence B: The girl above is running at the same speed as the girl below.", "gt-answer": "Sentence A: The girl above is running faster than the girl below.", "video-llm-prediction": "Sentence A: The girl above is running faster than the girl below.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The girl above is running faster than the girl below.\nOption 2: The girl above is running slower than the girl below.", "gt-answer": "Option 1: The girl above is running faster than the girl below.", "video-llm-prediction": "Option 1: The girl above is running faster than the girl below.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The speed of the girl above is slower than the girl below.\nCaption B: The girl above is running faster than the girl below.", "gt-answer": "Caption B: The girl above is running faster than the girl below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1060403198_concat_0": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The tree above is growing slower than the tree below.\nSentence B: The tree above is growing at the same speed as the tree below.", "gt-answer": "Sentence B: The tree above is growing at the same speed as the tree below.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The tree above is growing faster than the tree below.\nCaption B: The tree above is growing at the same speed as the tree below.", "gt-answer": "Caption B: The tree above is growing at the same speed as the tree below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The tree above is growing at the same speed as the tree below.\nOption 2: The tree above is growing faster than the tree below.", "gt-answer": "Option 1: The tree above is growing at the same speed as the tree below.", "video-llm-prediction": "Option 2: The tree above is growing faster than the tree below.", "match_success": true, "rating": 0 } ] }, "1060403198_concat_1": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The tree above is growing at the same speed as the tree below.\nOption 2: The tree above is growing faster than the tree below.", "gt-answer": "Option 2: The tree above is growing faster than the tree below.", "video-llm-prediction": "Option 2: The tree above is growing faster than the tree below.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The tree above is growing slower than the tree below.\nCaption B: The tree above is growing faster than the tree below.", "gt-answer": "Caption B: The tree above is growing faster than the tree below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The tree above is growing slower than the tree below.\nSentence B: The tree above is growing faster than the tree below.", "gt-answer": "Sentence B: The tree above is growing faster than the tree below.", "video-llm-prediction": "Sentence A: The tree above is growing slower than the tree below.", "match_success": true, "rating": 0 } ] }, "1060403198_concat_2": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The tree above is growing at the same speed as the tree below.\nCaption B: The tree above is growing slower than the tree below.", "gt-answer": "Caption B: The tree above is growing slower than the tree below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The tree above is growing slower than the tree below.\nOption 2: The tree above is growing faster than the tree below.", "gt-answer": "Option 1: The tree above is growing slower than the tree below.", "video-llm-prediction": "Option 2: The tree above is growing faster than the tree below.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The tree above is growing faster than the tree below.\nSentence B: The tree above is growing slower than the tree below.", "gt-answer": "Sentence B: The tree above is growing slower than the tree below.", "video-llm-prediction": "Sentence A: The tree above is growing faster than the tree below.", "match_success": true, "rating": 0 } ] }, "1100058499_concat_0": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The man above is jumping rope faster than the man below.\nCaption B: The man above and the man below are jumping rope at the same speed.", "gt-answer": "Caption B: The man above and the man below are jumping rope at the same speed.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man above and the man below are jumping rope at the same speed.\nOption 2: The man above is jumping rope slower than the man below.", "gt-answer": "Option 1: The man above and the man below are jumping rope at the same speed.", "video-llm-prediction": "Option 1: The man above and the man below are jumping rope at the same speed.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man above is jumping rope faster than the man below.\nSentence B: The man above and the man below are jumping rope at the same speed.", "gt-answer": "Sentence B: The man above and the man below are jumping rope at the same speed.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 } ] }, "1100058499_concat_1": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The man above is jumping rope faster than the man below.\nCaption B: The man above is jumping rope slower than the man below.", "gt-answer": "Caption B: The man above is jumping rope slower than the man below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man above is jumping rope slower than the man below.\nSentence B: The man above and the man below are jumping rope at the same speed.", "gt-answer": "Sentence A: The man above is jumping rope slower than the man below.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man below is jumping rope slower than the man above.\nOption 2: The man above is jumping rope slower than the man below.", "gt-answer": "Option 2: The man above is jumping rope slower than the man below.", "video-llm-prediction": "Option 2: The man above is jumping rope slower than the man below.", "match_success": true, "rating": 1 } ] }, "1100058499_concat_2": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The man above is jumping rope at the same speed as the man below.\nCaption B: The man above is jumping rope faster than the man below.", "gt-answer": "Caption B: The man above is jumping rope faster than the man below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man above is jumping rope faster than the man below.\nSentence B: The man above is jumping rope slower than the man below.", "gt-answer": "Sentence A: The man above is jumping rope faster than the man below.", "video-llm-prediction": "Sentence A: The man above is jumping rope faster than the man below.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man below is jumping rope faster than the man above.\nOption 2: The man above is jumping rope faster than the man below.", "gt-answer": "Option 2: The man above is jumping rope faster than the man below.", "video-llm-prediction": "Option 2: The man above is jumping rope faster than the man below.", "match_success": true, "rating": 1 } ] }, "1092813279_concat_0": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The woman below is doing push ups slower.\nOption 2: The two women are doing push ups at the same speed.", "gt-answer": "Option 2: The two women are doing push ups at the same speed.", "video-llm-prediction": "Option 2: The two women are doing push ups at the same speed.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The woman above is doing push ups slower.\nCaption B: The two women are doing push ups at the same speed.", "gt-answer": "Caption B: The two women are doing push ups at the same speed.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman below is doing push ups faster.\nSentence B: The two women are doing push ups at the same speed.", "gt-answer": "Sentence B: The two women are doing push ups at the same speed.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 } ] }, "1092813279_concat_1": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman below is doing push ups slower.\nSentence B: The woman above is doing push ups slower.", "gt-answer": "Sentence B: The woman above is doing push ups slower.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: Two women doing push ups at the same speed.\nCaption B: The woman above is doing push ups slower.", "gt-answer": "Caption B: The woman above is doing push ups slower.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The woman above is doing push ups faster.\nOption 2: The woman above is doing push ups slower.", "gt-answer": "Option 2: The woman above is doing push ups slower.", "video-llm-prediction": "Option 2: The woman above is doing push ups slower.", "match_success": true, "rating": 1 } ] }, "1092813279_concat_2": { "speed": [ { "question": "Which caption matches the video better?\nCaption A: The woman above is doing push ups slower.\nCaption B: The woman below is doing push ups slower.", "gt-answer": "Caption B: The woman below is doing push ups slower.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman below is doing push ups slower.\nSentence B: Two women doing push ups at the same speed.", "gt-answer": "Sentence A: The woman below is doing push ups slower.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The speed of the woman above's push ups is the same as the woman below.\nOption 2: The woman below is doing push ups slower.", "gt-answer": "Option 2: The woman below is doing push ups slower.", "video-llm-prediction": "Option 2: The woman below is doing push ups slower.", "match_success": true, "rating": 1 } ] }, "1084518106_concat_0": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The truck above is driving at the same speed as the truck below.\nOption 2: The truck above is driving slower than the truck below.", "gt-answer": "Option 1: The truck above is driving at the same speed as the truck below.", "video-llm-prediction": "Option 2: The truck above is driving slower than the truck below.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The truck above is driving faster than the truck below.\nSentence B: The truck above is driving at the same speed as the truck below.", "gt-answer": "Sentence B: The truck above is driving at the same speed as the truck below.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The speed of the truck below is slower than the speed of the truck above.\nCaption B: The truck above is driving at the same speed as the truck below.", "gt-answer": "Caption B: The truck above is driving at the same speed as the truck below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1084518106_concat_1": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The truck above is driving slower than the truck below.\nOption 2: The truck above is driving faster than the truck below.", "gt-answer": "Option 2: The truck above is driving faster than the truck below.", "video-llm-prediction": "Option 1: The truck above is driving slower than the truck below.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The truck above is driving faster than the truck below.\nSentence B: Both trucks are driving at the same speed.", "gt-answer": "Sentence A: The truck above is driving faster than the truck below.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The truck above is driving faster than the truck below.\nCaption B: The truck above is driving slower than the truck below.", "gt-answer": "Caption A: The truck above is driving faster than the truck below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1084518106_concat_2": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The truck above is driving faster than the truck below.\nOption 2: The truck above is driving slower than the truck below.", "gt-answer": "Option 2: The truck above is driving slower than the truck below.", "video-llm-prediction": "Option 1: The truck above is driving faster than the truck below.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The truck above is driving slower than the truck below.\nSentence B: Both trucks are driving at the same speed.", "gt-answer": "Sentence A: The truck above is driving slower than the truck below.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The truck above is driving faster than the truck below.\nCaption B: The truck above is driving slower than the truck below.", "gt-answer": "Caption B: The truck above is driving slower than the truck below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1062609286_concat_0": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The battery charge above is increasing at a slower speed than the battery charge below.\nSentence B: The battery charge above is increasing at the same speed as the battery charge below.", "gt-answer": "Sentence B: The battery charge above is increasing at the same speed as the battery charge below.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The battery charge above is increasing at the same speed as the battery charge below.\nOption 2: The battery charge above is increasing at a faster speed than the battery charge below.", "gt-answer": "Option 1: The battery charge above is increasing at the same speed as the battery charge below.", "video-llm-prediction": "Option 2: The battery charge above is increasing at a faster speed than the battery charge below.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The battery charge above is increasing at the same speed as the battery charge below.\nCaption B: The battery charge above is increasing at a slower speed than the battery charge below.", "gt-answer": "Caption A: The battery charge above is increasing at the same speed as the battery charge below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1062609286_concat_1": { "speed": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The battery charge above is increasing at a slower speed than the battery charge below.\nSentence B: The battery charge above is increasing at a faster speed than the battery charge below.", "gt-answer": "Sentence A: The battery charge above is increasing at a slower speed than the battery charge below.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The battery charge above is increasing at the same speed as the battery charge below.\nCaption B: The battery charge above is increasing at a slower speed than the battery charge below.", "gt-answer": "Caption B: The battery charge above is increasing at a slower speed than the battery charge below.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The speed at which the battery charge above is increasing is faster than the battery charge below.\nOption 2: The battery charge above is increasing at a slower speed than the battery charge below.", "gt-answer": "Option 2: The battery charge above is increasing at a slower speed than the battery charge below.", "video-llm-prediction": "Option 2: The battery charge above is increasing at a slower speed than the battery charge below.", "match_success": true, "rating": 1 } ] }, "1062609286_concat_2": { "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The battery charge above is increasing at a faster speed than the battery charge below.\nOption 2: The battery charge above is increasing at a slower speed than the battery charge below.", "gt-answer": "Option 1: The battery charge above is increasing at a faster speed than the battery charge below.", "video-llm-prediction": "Option 1: The battery charge above is increasing at a faster speed than the battery charge below.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The battery charge above is increasing at a faster speed than the battery charge below.\nSentence B: The battery charge above is increasing at the same speed as the battery charge below.", "gt-answer": "Sentence A: The battery charge above is increasing at a faster speed than the battery charge below.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The battery charge above is increasing at a slower speed than the battery charge below.\nCaption B: The battery charge above is increasing at a faster speed than the battery charge below.", "gt-answer": "Caption B: The battery charge above is increasing at a faster speed than the battery charge below.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1111601171_reverse": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The sphere building is changing from blue to green.\nOption 2: The sphere building is changing from blue to red.", "gt-answer": "Option 1: The sphere building is changing from blue to green.", "video-llm-prediction": "Option 1: The sphere building is changing from blue to green.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The sphere building is changing from green to blue.\nCaption B: The sphere building is changing from blue to green.", "gt-answer": "Caption B: The sphere building is changing from blue to green.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The sphere building is staying in the same color.\nSentence B: The sphere building is changing from blue to green.", "gt-answer": "Sentence B: The sphere building is changing from blue to green.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ] }, "1086105089_reverse": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The building is demolishing.\nCaption B: The building is being constructed.", "gt-answer": "Caption A: The building is demolishing.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The building is demolishing.\nOption 2: The building is growing wider.", "gt-answer": "Option 1: The building is demolishing.", "video-llm-prediction": "Option 2: The building is growing wider.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The building is staying the same.\nSentence B: The building is demolishing.", "gt-answer": "Sentence B: The building is demolishing.", "video-llm-prediction": "Sentence B: The building is demolishing.", "match_success": true, "rating": 1 } ] }, "1058498242_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The 3D pixels are changing from a cuboid into the shape of a man.\nSentence B: The 3D pixels are changing from the shape of a man into a cuboid.", "gt-answer": "Sentence B: The 3D pixels are changing from the shape of a man into a cuboid.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The 3D pixels are changing from the shape of a man into a cuboid.\nCaption B: The 3D pixels are changing from a cuboid into a the shape of a man.", "gt-answer": "Caption A: The 3D pixels are changing from the shape of a man into a cuboid.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The 3D pixels are changing from the shape of a man into a cuboid.\nOption 2: The 3D pixels are changing from a sphere into a cuboid.", "gt-answer": "Option 1: The 3D pixels are changing from the shape of a man into a cuboid.", "video-llm-prediction": "Option 1: The 3D pixels are changing from the shape of a man into a cuboid.", "match_success": true, "rating": 1 } ] }, "1082268005_reverse": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The planet earth is disassembling and disappearing.\nCaption B: The planet earth is assembling from toy bricks.", "gt-answer": "Caption A: The planet earth is disassembling and disappearing.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The planet earth is splitting into two halves.\nOption 2: The planet earth is disassembling and disappearing.", "gt-answer": "Option 2: The planet earth is disassembling and disappearing.", "video-llm-prediction": "Option 2: The planet earth is disassembling and disappearing.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The planet earth is disassembling and disappearing.\nSentence B: The planet earth is assembling from toy bricks.", "gt-answer": "Sentence A: The planet earth is disassembling and disappearing.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 0 } ] }, "1070711428_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The glacier is assembling and rising from the water.\nSentence B: The glacier is breaking and falling in the water.", "gt-answer": "Sentence A: The glacier is assembling and rising from the water.", "video-llm-prediction": "Sentence A: The glacier is assembling and rising from the water.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The glacier is assembling and rising from the water.\nCaption B: The glacier is breaking and falling in the water.", "gt-answer": "Caption A: The glacier is assembling and rising from the water.", "video-llm-prediction": "Caption A: The glacier is assembling and rising from the water.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The glacier is evaporating.\nOption 2: The glacier is assembling and rising from the water.", "gt-answer": "Option 2: The glacier is assembling and rising from the water.", "video-llm-prediction": "Option 2: The glacier is assembling and rising from the water.", "match_success": true, "rating": 1 } ] }, "1057447372_reverse": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The sport stadium is turning darker.\nCaption B: The sport stadium is staying the same brightness.", "gt-answer": "Caption A: The sport stadium is turning darker.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The sport stadium is becoming brighter.\nOption 2: The sport stadium is turning darker.", "gt-answer": "Option 2: The sport stadium is turning darker.", "video-llm-prediction": "Option 2: The sport stadium is turning darker.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The sport stadium is becoming brighter.\nSentence B: The sport stadium is turning darker.", "gt-answer": "Sentence B: The sport stadium is turning darker.", "video-llm-prediction": "Sentence A: The sport stadium is becoming brighter.", "match_success": true, "rating": 0 } ] }, "1053334574_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The black ink is shrinking into the center.\nSentence B: The black ink is staying in the same shape.", "gt-answer": "Sentence A: The black ink is shrinking into the center.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The black ink is shrinking into the center.\nCaption B: The black ink is splitting in half.", "gt-answer": "Caption A: The black ink is shrinking into the center.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The black ink is expanding from the center.\nOption 2: The black ink is shrinking into the center.", "gt-answer": "Option 2: The black ink is shrinking into the center.", "video-llm-prediction": "Option 1: The black ink is expanding from the center.", "match_success": true, "rating": 0 } ] }, "1016172301_reverse": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The 3D house model is being demolished.\nCaption B: The 3D house model is being constructed.", "gt-answer": "Caption A: The 3D house model is being demolished.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The 3D house model is being demolished.\nOption 2: The 3D house model is being constructed.", "gt-answer": "Option 1: The 3D house model is being demolished.", "video-llm-prediction": "Option 2: The 3D house model is being constructed.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The 3D house model is being repaired.\nSentence B: The 3D house model is being demolished.", "gt-answer": "Sentence B: The 3D house model is being demolished.", "video-llm-prediction": "Sentence A: The 3D house model is being repaired.", "match_success": true, "rating": 0 } ] }, "1084736389_reverse": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The face is aging.\nCaption B: The face is morphing from a human into a chimpanzee.", "gt-answer": "Caption B: The face is morphing from a human into a chimpanzee.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The face is morphing from a chimpanzee into a human.\nOption 2: The face is morphing from a human into a chimpanzee.", "gt-answer": "Option 2: The face is morphing from a human into a chimpanzee.", "video-llm-prediction": "Option 1: The face is morphing from a chimpanzee into a human.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The face is morphing from a human into a chimpanzee.\nSentence B: The face is morphing from a chimpanzee into a human.", "gt-answer": "Sentence A: The face is morphing from a human into a chimpanzee.", "video-llm-prediction": "Sentence A: The face is morphing from a human into a chimpanzee.", "match_success": true, "rating": 1 } ] }, "1054494542_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The background is changing from pink to green.\nSentence B: The background is changing from blue to pink.", "gt-answer": "Sentence B: The background is changing from blue to pink.", "video-llm-prediction": "Sentence B: The background is changing from blue to pink.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The background is changing from blue to pink.\nOption 2: The background is changing from green to pink.", "gt-answer": "Option 1: The background is changing from blue to pink.", "video-llm-prediction": "Option 1: The background is changing from blue to pink.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The background is changing from blue to pink.\nCaption B: The background is changing from pink to blue.", "gt-answer": "Caption A: The background is changing from blue to pink.", "video-llm-prediction": "Caption B: The background is changing from pink to blue.", "match_success": true, "rating": 0 } ] }, "1055194940_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The insect is transforming from butterfly into pupa.\nSentence B: The insect is transforming from pupa into butterfly.", "gt-answer": "Sentence A: The insect is transforming from butterfly into pupa.", "video-llm-prediction": "Sentence A: The insect is transforming from butterfly into pupa.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The insect is transforming from butterfly into pupa.\nOption 2: The insect is transforming from pupa into butterfly.", "gt-answer": "Option 1: The insect is transforming from butterfly into pupa.", "video-llm-prediction": "Option 2: The insect is transforming from pupa into butterfly.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The insect is transforming from butterfly into pupa.\nCaption B: The insect is transforming from pupa into butterfly.", "gt-answer": "Caption A: The insect is transforming from butterfly into pupa.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1079384099_reverse": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The 3D pixel is transforming from the shape of a white square into a flower.\nCaption B: The 3D pixel is transforming from the shape of a flower into a white square.", "gt-answer": "Caption B: The 3D pixel is transforming from the shape of a flower into a white square.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The 3D pixel is transforming from a white square into the shape of a flower.\nSentence B: The 3D pixel is transforming from the shape of a flower into a white square.", "gt-answer": "Sentence B: The 3D pixel is transforming from the shape of a flower into a white square.", "video-llm-prediction": "Sentence A: The 3D pixel is transforming from a white square into the shape of a flower.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The final shape of the 3D pixel is a star.\nOption 2: The 3D pixel is transforming from the shape of a flower into a white square.", "gt-answer": "Option 2: The 3D pixel is transforming from the shape of a flower into a white square.", "video-llm-prediction": "Option 1: The final shape of the 3D pixel is a star.", "match_success": true, "rating": 0 } ] }, "1099811331_reverse": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The 3d heart is gradually disappearing.\nOption 2: The 3d heart is gradually appearing.", "gt-answer": "Option 1: The 3d heart is gradually disappearing.", "video-llm-prediction": "Option 2: The 3d heart is gradually appearing.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The 3d heart is gradually disappearing.\nCaption B: The size of the 3d heart is growing.", "gt-answer": "Caption A: The 3d heart is gradually disappearing.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The 3d heart is reducing in size.\nSentence B: The 3d heart is gradually disappearing.", "gt-answer": "Sentence B: The 3d heart is gradually disappearing.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ] }, "1093340469_reverse": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The room is staying the same brightness.\nSentence B: The room is turning brighter.", "gt-answer": "Sentence B: The room is turning brighter.", "video-llm-prediction": "Sentence A: The room is staying the same brightness.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The room is turning brighter.\nCaption B: The room is turning darker.", "gt-answer": "Caption A: The room is turning brighter.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The word that best describes the change in the room is darkening.\nOption 2: The room is turning brighter.", "gt-answer": "Option 2: The room is turning brighter.", "video-llm-prediction": "Option 1: The word that best describes the change in the room is darkening.", "match_success": true, "rating": 0 } ] }, "1111601171": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The sphere building is changing from green to blue.\nOption 2: The sphere building is changing from blue to green.", "gt-answer": "Option 1: The sphere building is changing from green to blue.", "video-llm-prediction": "Option 1: The sphere building is changing from green to blue.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The sphere building is changing from green to blue.\nSentence B: The sphere building is changing from blue to red.", "gt-answer": "Sentence A: The sphere building is changing from green to blue.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The sphere building is staying in the same color.\nCaption B: The sphere building is changing from green to blue.", "gt-answer": "Caption B: The sphere building is changing from green to blue.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1086105089": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The building is being constructed.\nSentence B: The building is demolishing.", "gt-answer": "Sentence A: The building is being constructed.", "video-llm-prediction": "Sentence A: The building is being constructed.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The building is being constructed.\nOption 2: The building is growing wider.", "gt-answer": "Option 1: The building is being constructed.", "video-llm-prediction": "Option 1: The building is being constructed.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The building is being deconstructed.\nCaption B: The building is being constructed.", "gt-answer": "Caption B: The building is being constructed.", "video-llm-prediction": "Caption B: The building is being constructed.", "match_success": true, "rating": 1 } ] }, "1058498242": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The 3D pixels are changing from a cuboid into the shape of a man.\nCaption B: The 3D pixels are changing from the shape of a man into a cuboid.", "gt-answer": "Caption A: The 3D pixels are changing from a cuboid into the shape of a man.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The 3D pixels are changing from a cuboid into the shape of a man.\nOption 2: The 3D pixels are changing from the shape of a man into a cuboid.", "gt-answer": "Option 1: The 3D pixels are changing from a cuboid into the shape of a man.", "video-llm-prediction": "Option 1: The 3D pixels are changing from a cuboid into the shape of a man.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The 3D pixels are changing from a cuboid into the shape of a man.\nSentence B: The 3D pixels are changing from the shape of a man into a cuboid.", "gt-answer": "Sentence A: The 3D pixels are changing from a cuboid into the shape of a man.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1082268005": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The planet earth is assembling from toy bricks.\nOption 2: The planet earth is disassembling and disappearing.", "gt-answer": "Option 1: The planet earth is assembling from toy bricks.", "video-llm-prediction": "Option 1: The planet earth is assembling from toy bricks.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The planet earth is splitting into two halves.\nSentence B: The planet earth is assembling from toy bricks.", "gt-answer": "Sentence B: The planet earth is assembling from toy bricks.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The planet earth is disassembling and disappearing.\nCaption B: The planet earth is assembling from toy bricks.", "gt-answer": "Caption B: The planet earth is assembling from toy bricks.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1070711428": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The glacier is assembling and rising from water.\nOption 2: The glacier is breaking and falling in water.", "gt-answer": "Option 2: The glacier is breaking and falling in water.", "video-llm-prediction": "Option 2: The glacier is breaking and falling in water.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The glacier is assembling and rising from water.\nCaption B: The glacier is breaking and falling in water.", "gt-answer": "Caption B: The glacier is breaking and falling in water.", "video-llm-prediction": "Caption A: The glacier is assembling and rising from water.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The glacier is assembling and rising from water.\nSentence B: The glacier is breaking and falling in water.", "gt-answer": "Sentence B: The glacier is breaking and falling in water.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ] }, "1057447372": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The sport stadium is turning darker.\nOption 2: The sport stadium is turning brighter.", "gt-answer": "Option 2: The sport stadium is turning brighter.", "video-llm-prediction": "Option 2: The sport stadium is turning brighter.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The brightness of the sport stadium is staying the same.\nSentence B: The sport stadium is turning brighter.", "gt-answer": "Sentence B: The sport stadium is turning brighter.", "video-llm-prediction": "Sentence A: The brightness of the sport stadium is staying the same.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The sport stadium is turning darker.\nCaption B: The sport stadium is turning brighter.", "gt-answer": "Caption B: The sport stadium is turning brighter.", "video-llm-prediction": "Caption B: The sport stadium is turning brighter.", "match_success": true, "rating": 1 } ] }, "1053334574": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The black ink is staying in the same shape.\nSentence B: The black ink is expanding from the center.", "gt-answer": "Sentence B: The black ink is expanding from the center.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The black ink is splitting in half.\nCaption B: The black ink is expanding from the center.", "gt-answer": "Caption B: The black ink is expanding from the center.", "video-llm-prediction": "Caption B: The black ink is expanding from the center.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The black ink is expanding from the center.\nOption 2: The black ink is shrinking into the center.", "gt-answer": "Option 1: The black ink is expanding from the center.", "video-llm-prediction": "Option 1: The black ink is expanding from the center.", "match_success": true, "rating": 1 } ] }, "1016172301": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The 3D house model is being deconstructed.\nSentence B: The 3D house model is being constructed.", "gt-answer": "Sentence B: The 3D house model is being constructed.", "video-llm-prediction": "Sentence A: The 3D house model is being deconstructed.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The 3D house model is being constructed.\nOption 2: The 3D house model is being demolished.", "gt-answer": "Option 1: The 3D house model is being constructed.", "video-llm-prediction": "Option 1: The 3D house model is being constructed.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The 3D house model is being repaired.\nCaption B: The 3D house model is being constructed.", "gt-answer": "Caption B: The 3D house model is being constructed.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1084736389": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The face is aging.\nCaption B: The face is morphing from a chimpanzee into a human.", "gt-answer": "Caption B: The face is morphing from a chimpanzee into a human.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The face is morphing from a chimpanzee into a human.\nSentence B: The face is morphing from a human into a chimpanzee.", "gt-answer": "Sentence A: The face is morphing from a chimpanzee into a human.", "video-llm-prediction": "Sentence A: The face is morphing from a chimpanzee into a human.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The face is morphing from a human into a chimpanzee.\nOption 2: The face is morphing from a chimpanzee into a human.", "gt-answer": "Option 2: The face is morphing from a chimpanzee into a human.", "video-llm-prediction": "Option 1: The face is morphing from a human into a chimpanzee.", "match_success": true, "rating": 0 } ] }, "1054494542": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The background is changing from pink to green.\nOption 2: The background is changing from pink to blue.", "gt-answer": "Option 2: The background is changing from pink to blue.", "video-llm-prediction": "Option 2: The background is changing from pink to blue.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The background is changing from pink to blue.\nSentence B: The background is changing from green to pink.", "gt-answer": "Sentence A: The background is changing from pink to blue.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The background is changing from pink to blue.\nCaption B: The background is changing from blue to pink.", "gt-answer": "Caption A: The background is changing from pink to blue.", "video-llm-prediction": "Caption B: The background is changing from blue to pink.", "match_success": true, "rating": 0 } ] }, "1055194940": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The insect is transforming from pupa into butterfly.\nSentence B: The insect is remaining as a butterfly.", "gt-answer": "Sentence A: The insect is transforming from pupa into butterfly.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The insect is transforming from pupa into butterfly.\nCaption B: The insect is transforming from butterfly into pupa.", "gt-answer": "Caption A: The insect is transforming from pupa into butterfly.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The insect is transforming from pupa into butterfly.\nOption 2: The insect is transforming from butterfly into pupa.", "gt-answer": "Option 1: The insect is transforming from pupa into butterfly.", "video-llm-prediction": "Option 1: The insect is transforming from pupa into butterfly.", "match_success": true, "rating": 1 } ] }, "1079384099": { "attribute_change": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The 3D pixel is transforming from the shape of a flower into a white square.\nSentence B: The 3D pixel is transforming from a white square into the shape of a flower.", "gt-answer": "Sentence B: The 3D pixel is transforming from a white square into the shape of a flower.", "video-llm-prediction": "Sentence A: The 3D pixel is transforming from the shape of a flower into a white square.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The 3D pixel is transforming from the shape of a heart into a triangle.\nCaption B: The 3D pixel is transforming from a white square into the shape of a flower.", "gt-answer": "Caption B: The 3D pixel is transforming from a white square into the shape of a flower.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The 3D pixel is transforming from a white square into the shape of a flower.\nOption 2: The 3D pixel is transforming from the shape of a flower into a white square.", "gt-answer": "Option 1: The 3D pixel is transforming from a white square into the shape of a flower.", "video-llm-prediction": "Option 1: The 3D pixel is transforming from a white square into the shape of a flower.", "match_success": true, "rating": 1 } ] }, "1099811331": { "attribute_change": [ { "question": "Which caption matches the video better?\nCaption A: The 3D heart is reducing in size.\nCaption B: The 3D heart is gradually appearing.", "gt-answer": "Caption B: The 3D heart is gradually appearing.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The 3D heart is gradually appearing.\nOption 2: The 3D heart is gradually disappearing.", "gt-answer": "Option 1: The 3D heart is gradually appearing.", "video-llm-prediction": "Option 1: The 3D heart is gradually appearing.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The 3D heart is gradually appearing.\nSentence B: The 3D heart is growing in size.", "gt-answer": "Sentence A: The 3D heart is gradually appearing.", "video-llm-prediction": "Sentence A: The 3D heart is gradually appearing.", "match_success": true, "rating": 1 } ] }, "1093340469": { "attribute_change": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The room is staying the same brightness.\nOption 2: The room is turning darker.", "gt-answer": "Option 2: The room is turning darker.", "video-llm-prediction": "Option 2: The room is turning darker.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The room is turning brighter.\nSentence B: The room is turning darker.", "gt-answer": "Sentence B: The room is turning darker.", "video-llm-prediction": "Sentence A: The room is turning brighter.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The room is turning darker.\nCaption B: The room is turning brighter.", "gt-answer": "Caption A: The room is turning darker.", "video-llm-prediction": "Caption B: The room is turning brighter.", "match_success": true, "rating": 0 } ] }, "1059072773_1059718130_0": { "order": [ { "question": "Which caption matches the video better?\nCaption A: The ice cream melts first and then the sun rises.\nCaption B: The sun is rising and the ice cream is melting at the same time.", "gt-answer": "Caption B: The sun is rising and the ice cream is melting at the same time.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The sun is rising and the ice cream is melting at the same time.\nOption 2: The ice cream melts after the sun rises.", "gt-answer": "Option 1: The sun is rising and the ice cream is melting at the same time.", "video-llm-prediction": "Option 1: The sun is rising and the ice cream is melting at the same time.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The ice cream melts first before the sun rises.\nSentence B: The sun is rising and the ice cream is melting at the same time.", "gt-answer": "Sentence B: The sun is rising and the ice cream is melting at the same time.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 } ] }, "1059072773_1059718130_1": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The ice cream melts first and then the sun rises.\nSentence B: The sun rises and the ice cream melts at the same time.", "gt-answer": "Sentence A: The ice cream melts first and then the sun rises.", "video-llm-prediction": "Sentence A: The ice cream melts first and then the sun rises.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The sun rises first and then the ice cream melts.\nCaption B: The ice cream melts first and then the sun rises.", "gt-answer": "Caption B: The ice cream melts first and then the sun rises.", "video-llm-prediction": "Caption B: The ice cream melts first and then the sun rises.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Both events happen simultaneously, the sun rises and the ice cream melts.\nOption 2: The ice cream melts first and then the sun rises.", "gt-answer": "Option 2: The ice cream melts first and then the sun rises.", "video-llm-prediction": "Option 1: Both events happen simultaneously, the sun rises and the ice cream melts.", "match_success": true, "rating": 0 } ] }, "1059072773_1059718130_2": { "order": [ { "question": "Which caption matches the video better?\nCaption A: In the video, the sun rises first and then the ice cream melts.\nCaption B: Both events happen simultaneously, the sun rises and the ice cream melts.", "gt-answer": "Caption A: In the video, the sun rises first and then the ice cream melts.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The ice cream melts before the sun rises.\nSentence B: In the video, the sun rises first and then the ice cream melts.", "gt-answer": "Sentence B: In the video, the sun rises first and then the ice cream melts.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Before the sun rose, the ice cream melted.\nOption 2: In the video, the sun rises first and then the ice cream melts.", "gt-answer": "Option 2: In the video, the sun rises first and then the ice cream melts.", "video-llm-prediction": "Option 1: Before the sun rose, the ice cream melted.", "match_success": true, "rating": 0 } ] }, "1024867412_1034049020_0": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: A girl is walking on the street while a dog is simultaneously driving.\nOption 2: A girl is walking on the street, then a dog is driving.", "gt-answer": "Option 1: A girl is walking on the street while a dog is simultaneously driving.", "video-llm-prediction": "Option 1: A girl is walking on the street while a dog is simultaneously driving.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A dog is driving, then a girl is walking on the street.\nCaption B: A girl is walking on the street while a dog is simultaneously driving.", "gt-answer": "Caption B: A girl is walking on the street while a dog is simultaneously driving.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A girl is walking on the street while a dog is simultaneously driving.\nSentence B: A girl is walking on the street, after a dog is driving.", "gt-answer": "Sentence A: A girl is walking on the street while a dog is simultaneously driving.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1024867412_1034049020_1": { "order": [ { "question": "Which caption matches the video better?\nCaption A: A girl is walking on the street, then a dog is driving in the video.\nCaption B: In the video, a dog is driving first, followed by a girl walking on the street.", "gt-answer": "Caption B: In the video, a dog is driving first, followed by a girl walking on the street.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Both events, a dog driving and a girl walking on the street, are happening at the same time in the video.\nOption 2: In the video, a dog is driving first, followed by a girl walking on the street.", "gt-answer": "Option 2: In the video, a dog is driving first, followed by a girl walking on the street.", "video-llm-prediction": "Option 1: Both events, a dog driving and a girl walking on the street, are happening at the same time in the video.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A girl is walking on the street while a dog is simultaneously driving in the video.\nSentence B: In the video, a dog is driving first, followed by a girl walking on the street.", "gt-answer": "Sentence B: In the video, a dog is driving first, followed by a girl walking on the street.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 } ] }, "1024867412_1034049020_2": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: In the video, first a girl is walking on the street, and then a dog is driving.\nSentence B: A dog is driving, and then a girl is walking on the street.", "gt-answer": "Sentence A: In the video, first a girl is walking on the street, and then a dog is driving.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: Both events of a girl walking on the street and a dog driving happen at the same time.\nCaption B: In the video, first a girl is walking on the street, and then a dog is driving.", "gt-answer": "Caption B: In the video, first a girl is walking on the street, and then a dog is driving.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A dog is driving, followed by a girl walking on the street.\nOption 2: In the video, first a girl is walking on the street, and then a dog is driving.", "gt-answer": "Option 2: In the video, first a girl is walking on the street, and then a dog is driving.", "video-llm-prediction": "Option 2: In the video, first a girl is walking on the street, and then a dog is driving.", "match_success": true, "rating": 1 } ] }, "1096238125_1098970151_0": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: Two people lifting weights and a cat licking its paw happening simultaneously.\nOption 2: Two people are lifting weights and then a cat is licking its paw.", "gt-answer": "Option 1: Two people lifting weights and a cat licking its paw happening simultaneously.", "video-llm-prediction": "Option 1: Two people lifting weights and a cat licking its paw happening simultaneously.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: Two people lifting weights and a cat licking its paw happening simultaneously.\nCaption B: A cat is licking its paw and then two people are lifting weights.", "gt-answer": "Caption A: Two people lifting weights and a cat licking its paw happening simultaneously.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: In the video, two people are lifting weights first, then a cat is licking its paw.\nSentence B: Two people lifting weights and a cat licking its paw happening simultaneously.", "gt-answer": "Sentence B: Two people lifting weights and a cat licking its paw happening simultaneously.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 } ] }, "1096238125_1098970151_1": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: A cat is licking its paw and then two people are lifting weights in the video.\nOption 2: In the video, two people are lifting weights and then a cat is licking its paw.", "gt-answer": "Option 2: In the video, two people are lifting weights and then a cat is licking its paw.", "video-llm-prediction": "Option 2: In the video, two people are lifting weights and then a cat is licking its paw.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A cat is licking its paw before two people are lifting weights in the video.\nSentence B: In the video, two people are lifting weights and then a cat is licking its paw.", "gt-answer": "Sentence B: In the video, two people are lifting weights and then a cat is licking its paw.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: In the video, two people are lifting weights and then a cat is licking its paw.\nCaption B: A cat is licking its paw and then two people are lifting weights in the video.", "gt-answer": "Caption A: In the video, two people are lifting weights and then a cat is licking its paw.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1096238125_1098970151_2": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: A cat is seen licking its paw first, followed by two people lifting weights.\nSentence B: Two people are lifting weights before a cat is seen licking its paw.", "gt-answer": "Sentence A: A cat is seen licking its paw first, followed by two people lifting weights.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Both events, two people lifting weights and a cat licking paw, happen at the same time in the video.\nOption 2: A cat is seen licking its paw first, followed by two people lifting weights.", "gt-answer": "Option 2: A cat is seen licking its paw first, followed by two people lifting weights.", "video-llm-prediction": "Option 1: Both events, two people lifting weights and a cat licking paw, happen at the same time in the video.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: Two people are lifting weights first, followed by a cat licking its paw.\nCaption B: A cat is seen licking its paw first, followed by two people lifting weights.", "gt-answer": "Caption B: A cat is seen licking its paw first, followed by two people lifting weights.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1069717165_1041584314_0": { "order": [ { "question": "Which caption matches the video better?\nCaption A: First, a woman is showing different expressions; then, a man is showing different expressions.\nCaption B: The woman and man are showing their expressions simultaneously.", "gt-answer": "Caption B: The woman and man are showing their expressions simultaneously.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man presents his expressions first, followed by the woman.\nSentence B: The woman and man are showing their expressions simultaneously.", "gt-answer": "Sentence B: The woman and man are showing their expressions simultaneously.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The woman presents her expressions first, followed by the man.\nOption 2: The woman and man are showing their expressions simultaneously.", "gt-answer": "Option 2: The woman and man are showing their expressions simultaneously.", "video-llm-prediction": "Option 1: The woman presents her expressions first, followed by the man.", "match_success": true, "rating": 0 } ] }, "1069717165_1041584314_1": { "order": [ { "question": "Which caption matches the video better?\nCaption A: First, a woman is showing different expressions; then, a man is showing different expressions.\nCaption B: Both the woman and the man are showing different expressions at the same time.", "gt-answer": "Caption A: First, a woman is showing different expressions; then, a man is showing different expressions.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: First, a woman is showing different expressions; then, a man is showing different expressions.\nSentence B: The man presents his expressions first, followed by the woman.", "gt-answer": "Sentence A: First, a woman is showing different expressions; then, a man is showing different expressions.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: First, a woman is showing different expressions; then, a man is showing different expressions.\nOption 2: In the video, the woman and man are showing their expressions simultaneously.", "gt-answer": "Option 1: First, a woman is showing different expressions; then, a man is showing different expressions.", "video-llm-prediction": "Option 1: First, a woman is showing different expressions; then, a man is showing different expressions.", "match_success": true, "rating": 1 } ] }, "1069717165_1041584314_2": { "order": [ { "question": "Which caption matches the video better?\nCaption A: First, a man is showing different expressions; then, a woman is showing different expressions.\nCaption B: Both the woman and the man are showing different expressions at the same time.", "gt-answer": "Caption A: First, a man is showing different expressions; then, a woman is showing different expressions.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The woman presents his expressions first, followed by the man.\nOption 2: First, a man is showing different expressions; then, a woman is showing different expressions.", "gt-answer": "Option 2: First, a man is showing different expressions; then, a woman is showing different expressions.", "video-llm-prediction": "Option 1: The woman presents his expressions first, followed by the man.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman and man are showing their expressions simultaneously.\nSentence B: First, a man is showing different expressions; then, a woman is showing different expressions.", "gt-answer": "Sentence B: First, a man is showing different expressions; then, a woman is showing different expressions.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 } ] }, "1055413730_1055973887_0": { "order": [ { "question": "Which caption matches the video better?\nCaption A: Fire ball expanding and earth spinning are happening together.\nCaption B: The fire ball is expanding before the earth spinning.", "gt-answer": "Caption A: Fire ball expanding and earth spinning are happening together.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Fire ball expanding and earth spinning are happening together.\nOption 2: The fire ball is expanding after the earth spinning.", "gt-answer": "Option 1: Fire ball expanding and earth spinning are happening together.", "video-llm-prediction": "Option 1: Fire ball expanding and earth spinning are happening together.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The fire ball is expanding first, followed by the earth spinning.\nSentence B: Fire ball expanding and earth spinning are happening together.", "gt-answer": "Sentence B: Fire ball expanding and earth spinning are happening together.", "video-llm-prediction": "A", "match_success": true, "rating": 0 } ] }, "1055413730_1055973887_1": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: Both events, fire ball expanding and earth spinning, happen together.\nSentence B: The earth is spinning before fire ball expanding.", "gt-answer": "Sentence B: The earth is spinning before fire ball expanding.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The earth is spinning before fire ball expanding.\nCaption B: The fire ball is expanding before the earth spinning.", "gt-answer": "Caption A: The earth is spinning before fire ball expanding.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The earth is spinning before fire ball expanding.\nOption 2: The fire ball is expanding before the earth spinning.", "gt-answer": "Option 1: The earth is spinning before fire ball expanding.", "video-llm-prediction": "Option 2: The fire ball is expanding before the earth spinning.", "match_success": true, "rating": 0 } ] }, "1055413730_1055973887_2": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The fire ball is expanding before the earth spinning.\nOption 2: Both events, fire ball expanding and earth spinning, happen together.", "gt-answer": "Option 1: The fire ball is expanding before the earth spinning.", "video-llm-prediction": "Option 2: Both events, fire ball expanding and earth spinning, happen together.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The fire ball is expanding after the earth spinning.\nCaption B: The fire ball is expanding before the earth spinning.", "gt-answer": "Caption B: The fire ball is expanding before the earth spinning.", "video-llm-prediction": "Caption B: The fire ball is expanding before the earth spinning.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: Both events, fire ball expanding and earth spinning, happen together.\nSentence B: The fire ball is expanding before the earth spinning.", "gt-answer": "Sentence B: The fire ball is expanding before the earth spinning.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 } ] }, "1058907236_1021488277_0": { "order": [ { "question": "Which caption matches the video better?\nCaption A: In the video, both a person picking up a pineapple and a squirrel foraging are happening concurrently.\nCaption B: A person is picking up a pineapple first, then a squirrel is foraging in the video.", "gt-answer": "Caption A: In the video, both a person picking up a pineapple and a squirrel foraging are happening concurrently.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: In the video, both a person picking up a pineapple and a squirrel foraging are happening concurrently.\nOption 2: A squirrel is foraging first, then a person is picking up a pineapple in the video.", "gt-answer": "Option 1: In the video, both a person picking up a pineapple and a squirrel foraging are happening concurrently.", "video-llm-prediction": "Option 1: In the video, both a person picking up a pineapple and a squirrel foraging are happening concurrently.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: In the video, both a person picking up a pineapple and a squirrel foraging are happening concurrently.\nSentence B: The events in the video are happening sequentially, with the person picking up a pineapple before the squirrel starts foraging.", "gt-answer": "Sentence A: In the video, both a person picking up a pineapple and a squirrel foraging are happening concurrently.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1058907236_1021488277_1": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: First, a squirrel is foraging; then, a person is picking up a pineapple.\nSentence B: A person is picking up a pineapple before a squirrel is foraging.", "gt-answer": "Sentence A: First, a squirrel is foraging; then, a person is picking up a pineapple.", "video-llm-prediction": "Sentence A: First, a squirrel is foraging; then, a person is picking up a pineapple.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: First, a squirrel is foraging; then, a person is picking up a pineapple.\nOption 2: Both events are happening simultaneously.", "gt-answer": "Option 1: First, a squirrel is foraging; then, a person is picking up a pineapple.", "video-llm-prediction": "Option 1: First, a squirrel is foraging; then, a person is picking up a pineapple.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: First, a squirrel is foraging; then, a person is picking up a pineapple.\nCaption B: First, a person is picking up a pineapple; then, a squirrel is foraging.", "gt-answer": "Caption A: First, a squirrel is foraging; then, a person is picking up a pineapple.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1058907236_1021488277_2": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: A person is picking up a pineapple and then a squirrel is foraging.\nOption 2: First, a squirrel is foraging; then, a person is picking up a pineapple.", "gt-answer": "Option 1: A person is picking up a pineapple and then a squirrel is foraging.", "video-llm-prediction": "Option 2: First, a squirrel is foraging; then, a person is picking up a pineapple.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A person is picking up a pineapple and then a squirrel is foraging.\nSentence B: First, a squirrel is foraging; then, a person is picking up a pineapple.", "gt-answer": "Sentence A: A person is picking up a pineapple and then a squirrel is foraging.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: Both a person picking up a pineapple and a squirrel foraging are happening concurrently.\nCaption B: A person is picking up a pineapple and then a squirrel is foraging.", "gt-answer": "Caption B: A person is picking up a pineapple and then a squirrel is foraging.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1054717541_1047368344_0": { "order": [ { "question": "Which caption matches the video better?\nCaption A: In the video, a girl is jumping into water while a person is kneading dough.\nCaption B: A person is kneading dough followed by a girl jumping into water.", "gt-answer": "Caption A: In the video, a girl is jumping into water while a person is kneading dough.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A person is kneading dough followed by a girl jumping into water.\nOption 2: In the video, a girl is jumping into water while a person is kneading dough.", "gt-answer": "Option 2: In the video, a girl is jumping into water while a person is kneading dough.", "video-llm-prediction": "Option 2: In the video, a girl is jumping into water while a person is kneading dough.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The person jumps into water after the girl kneads dough.\nSentence B: In the video, a girl is jumping into water while a person is kneading dough.", "gt-answer": "Sentence B: In the video, a girl is jumping into water while a person is kneading dough.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ] }, "1054717541_1047368344_1": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: In the video, a person is kneading dough followed by a girl jumping into water.\nOption 2: A girl jumping into water while a person kneading dough.", "gt-answer": "Option 1: In the video, a person is kneading dough followed by a girl jumping into water.", "video-llm-prediction": "Option 1: In the video, a person is kneading dough followed by a girl jumping into water.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A girl is jumping into water, followed by a person kneading dough.\nSentence B: In the video, a person is kneading dough followed by a girl jumping into water.", "gt-answer": "Sentence B: In the video, a person is kneading dough followed by a girl jumping into water.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: In the video, a person is kneading dough followed by a girl jumping into water.\nCaption B: A person is kneading dough after a girl jumping into water.", "gt-answer": "Caption A: In the video, a person is kneading dough followed by a girl jumping into water.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1054717541_1047368344_2": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: A person is kneading dough followed by a girl jumping into water in the video.\nSentence B: In the video, a girl is jumping into water first followed by a person kneading dough.", "gt-answer": "Sentence B: In the video, a girl is jumping into water first followed by a person kneading dough.", "video-llm-prediction": "Sentence A: A person is kneading dough followed by a girl jumping into water in the video.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: In the video, a girl is jumping into water first followed by a person kneading dough.\nOption 2: Both events of a girl jumping into water and a person kneading dough occur at the same time.", "gt-answer": "Option 1: In the video, a girl is jumping into water first followed by a person kneading dough.", "video-llm-prediction": "Option 1: In the video, a girl is jumping into water first followed by a person kneading dough.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A girl jumping into water while a person kneading dough is shown in the video.\nCaption B: In the video, a girl is jumping into water first followed by a person kneading dough.", "gt-answer": "Caption B: In the video, a girl is jumping into water first followed by a person kneading dough.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1089201973_1086820415_0": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: A person kicking a football and a big wave are shown simultaneously.\nSentence B: A person is kicking a football and then a big wave is shown.", "gt-answer": "Sentence A: A person kicking a football and a big wave are shown simultaneously.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A big wave is shown and then a person is kicking a football.\nCaption B: A person kicking a football and a big wave are shown simultaneously.", "gt-answer": "Caption B: A person kicking a football and a big wave are shown simultaneously.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A person kicking a football and a big wave are shown simultaneously.\nOption 2: A person is kicking a football, then a big wave is shown.", "gt-answer": "Option 1: A person kicking a football and a big wave are shown simultaneously.", "video-llm-prediction": "Option 2: A person is kicking a football, then a big wave is shown.", "match_success": true, "rating": 0 } ] }, "1089201973_1086820415_1": { "order": [ { "question": "Which caption matches the video better?\nCaption A: A person kicking a football and a big wave are shown simultaneously.\nCaption B: The order of events in the video is that a person is kicking football and a big wave is shown.", "gt-answer": "Caption B: The order of events in the video is that a person is kicking football and a big wave is shown.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The order of events in the video is that a person is kicking football and a big wave is shown.\nSentence B: A big wave is shown and then a person is kicking football.", "gt-answer": "Sentence A: The order of events in the video is that a person is kicking football and a big wave is shown.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The order of events in the video is that a person is kicking football and a big wave is shown.\nOption 2: The order of events in the video is that a big wave is shown and a person is kicking football.", "gt-answer": "Option 1: The order of events in the video is that a person is kicking football and a big wave is shown.", "video-llm-prediction": "Option 1: The order of events in the video is that a person is kicking football and a big wave is shown.", "match_success": true, "rating": 1 } ] }, "1089201973_1086820415_2": { "order": [ { "question": "Which caption matches the video better?\nCaption A: 1. A person kicking a football and a big wave are shown simultaneously.\nCaption B: The video begins with a big wave being shown, followed by a person kicking a football.", "gt-answer": "Caption B: The video begins with a big wave being shown, followed by a person kicking a football.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video begins with a big wave being shown, followed by a person kicking a football.\nSentence B: 2. A person is kicking a football before a big wave is shown.", "gt-answer": "Sentence A: The video begins with a big wave being shown, followed by a person kicking a football.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video begins with a big wave being shown, followed by a person kicking a football.\nOption 2: 3. The video begins with a person kicking a football, followed by a big wave being shown.", "gt-answer": "Option 1: The video begins with a big wave being shown, followed by a person kicking a football.", "video-llm-prediction": "Option 1: The video begins with a big wave being shown, followed by a person kicking a football.", "match_success": true, "rating": 1 } ] }, "1080990179_1086820415_0": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: First, a big wave is shown in the video, and then an apple is drying out.\nSentence B: In the video, both events of an apple drying out and a big wave are shown simultaneously.", "gt-answer": "Sentence B: In the video, both events of an apple drying out and a big wave are shown simultaneously.", "video-llm-prediction": "Sentence B: In the video, both events of an apple drying out and a big wave are shown simultaneously.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: In the video, both events of an apple drying out and a big wave are shown simultaneously.\nOption 2: The event of an apple drying out occurs after showing a big wave.", "gt-answer": "Option 1: In the video, both events of an apple drying out and a big wave are shown simultaneously.", "video-llm-prediction": "Option 1: In the video, both events of an apple drying out and a big wave are shown simultaneously.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The event of an apple drying out occurs before showing a big wave.\nCaption B: In the video, both events of an apple drying out and a big wave are shown simultaneously.", "gt-answer": "Caption B: In the video, both events of an apple drying out and a big wave are shown simultaneously.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1080990179_1086820415_1": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: An apple is drying out and then a big wave is shown.\nSentence B: A big wave is shown and then an apple is drying out.", "gt-answer": "Sentence A: An apple is drying out and then a big wave is shown.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: At the same time, an apple is drying out and a big wave is shown.\nCaption B: An apple is drying out and then a big wave is shown.", "gt-answer": "Caption B: An apple is drying out and then a big wave is shown.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: An apple is drying out and then a big wave is shown.\nOption 2: An apple is drying out before a big wave is shown.", "gt-answer": "Option 1: An apple is drying out and then a big wave is shown.", "video-llm-prediction": "Option 2: An apple is drying out before a big wave is shown.", "match_success": true, "rating": 0 } ] }, "1080990179_1086820415_2": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: In the video, a big wave is shown first and then an apple is drying out.\nOption 2: An apple is drying out and then a big wave is shown.", "gt-answer": "Option 1: In the video, a big wave is shown first and then an apple is drying out.", "video-llm-prediction": "Option 1: In the video, a big wave is shown first and then an apple is drying out.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: Both events occur simultaneously, an apple is drying out and a big wave is shown.\nSentence B: In the video, a big wave is shown first and then an apple is drying out.", "gt-answer": "Sentence B: In the video, a big wave is shown first and then an apple is drying out.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: In the video, a big wave is shown first and then an apple is drying out.\nCaption B: In the video, an apple is drying out first and then a big wave is shown.", "gt-answer": "Caption A: In the video, a big wave is shown first and then an apple is drying out.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1067268403_1064645467_0": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: Wind turbines are rotating followed by a man doing exercise.\nOption 2: A man is doing exercise while wind turbines are rotating concurrently.", "gt-answer": "Option 2: A man is doing exercise while wind turbines are rotating concurrently.", "video-llm-prediction": "Option 2: A man is doing exercise while wind turbines are rotating concurrently.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A man is doing exercise while wind turbines are rotating concurrently.\nCaption B: A man doing exercise followed by wind turbines rotating.", "gt-answer": "Caption A: A man is doing exercise while wind turbines are rotating concurrently.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A man doing exercise followed by wind turbines rotating.\nSentence B: A man is doing exercise while wind turbines are rotating concurrently.", "gt-answer": "Sentence B: A man is doing exercise while wind turbines are rotating concurrently.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ] }, "1067268403_1064645467_1": { "order": [ { "question": "Which caption matches the video better?\nCaption A: A man is doing exercise while wind turbines are rotating.\nCaption B: Wind turbines are rotating followed by a man doing exercise.", "gt-answer": "Caption B: Wind turbines are rotating followed by a man doing exercise.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A man is doing exercise before wind turbines rotating.\nSentence B: Wind turbines are rotating followed by a man doing exercise.", "gt-answer": "Sentence B: Wind turbines are rotating followed by a man doing exercise.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Wind turbines are rotating followed by a man doing exercise.\nOption 2: A man is doing exercise followed by wind turbines rotating.", "gt-answer": "Option 1: Wind turbines are rotating followed by a man doing exercise.", "video-llm-prediction": "Option 2: A man is doing exercise followed by wind turbines rotating.", "match_success": true, "rating": 0 } ] }, "1067268403_1064645467_2": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: Wind turbines are rotating followed by a man doing exercise.\nOption 2: The video shows a man doing exercise followed by wind turbines rotating.", "gt-answer": "Option 2: The video shows a man doing exercise followed by wind turbines rotating.", "video-llm-prediction": "Option 1: Wind turbines are rotating followed by a man doing exercise.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video shows a man doing exercise followed by wind turbines rotating.\nSentence B: Wind turbines are rotating followed by a man doing exercise.", "gt-answer": "Sentence A: The video shows a man doing exercise followed by wind turbines rotating.", "video-llm-prediction": "Sentence A: The video shows a man doing exercise followed by wind turbines rotating.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A man is doing exercise concurrently with wind turbines rotating.\nCaption B: The video shows a man doing exercise followed by wind turbines rotating.", "gt-answer": "Caption B: The video shows a man doing exercise followed by wind turbines rotating.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "1082268005_1063373449_0": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: Planet Earth is assembling from toy bricks and then viruses are dissolving.\nSentence B: Planet earth assembling and viruses dissolving are happening simultaneously.", "gt-answer": "Sentence B: Planet earth assembling and viruses dissolving are happening simultaneously.", "video-llm-prediction": "A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: Planet earth assembling and viruses dissolving are happening simultaneously.\nCaption B: Viruses are dissolving and then planet Earth is assembling from toy bricks.", "gt-answer": "Caption A: Planet earth assembling and viruses dissolving are happening simultaneously.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Planet earth assembling and viruses dissolving are happening simultaneously.\nOption 2: Viruses are dissolving and then planet Earth is assembling from toy bricks.", "gt-answer": "Option 1: Planet earth assembling and viruses dissolving are happening simultaneously.", "video-llm-prediction": "Option 1: Planet earth assembling and viruses dissolving are happening simultaneously.", "match_success": true, "rating": 1 } ] }, "1082268005_1063373449_1": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: Viruses are dissolving and then planet earth is assembling from toy bricks.\nOption 2: The video shows planet earth assembling from toy bricks and then viruses dissolving.", "gt-answer": "Option 2: The video shows planet earth assembling from toy bricks and then viruses dissolving.", "video-llm-prediction": "Option 2: The video shows planet earth assembling from toy bricks and then viruses dissolving.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video shows planet earth assembling from toy bricks and then viruses dissolving.\nSentence B: Planet earth assembling and viruses dissolving are happening simultaneously.", "gt-answer": "Sentence A: The video shows planet earth assembling from toy bricks and then viruses dissolving.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video shows planet earth assembling from toy bricks and then viruses dissolving.\nCaption B: Planet earth is assembling from toy bricks after viruses dissolving.", "gt-answer": "Caption A: The video shows planet earth assembling from toy bricks and then viruses dissolving.", "video-llm-prediction": "A", "match_success": true, "rating": 1 } ] }, "1082268005_1063373449_2": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: In the video, viruses are dissolving first and then planet earth is assembling from toy bricks.\nOption 2: Planet earth assembling and viruses dissolving are happening simultaneously.", "gt-answer": "Option 1: In the video, viruses are dissolving first and then planet earth is assembling from toy bricks.", "video-llm-prediction": "Option 1: In the video, viruses are dissolving first and then planet earth is assembling from toy bricks.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: Planet earth is assembling from toy bricks first and then viruses dissolving.\nSentence B: In the video, viruses are dissolving first and then planet earth is assembling from toy bricks.", "gt-answer": "Sentence B: In the video, viruses are dissolving first and then planet earth is assembling from toy bricks.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: Planet earth is assembling from toy bricks first and then viruses dissolving.\nCaption B: In the video, viruses are dissolving first and then planet earth is assembling from toy bricks.", "gt-answer": "Caption B: In the video, viruses are dissolving first and then planet earth is assembling from toy bricks.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "1060403198_1049965795_0": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: A light bulb is lighting up while the tree is growing up at the same time\nOption 2: A light bulb is lighting up and then the tree is growing up.", "gt-answer": "Option 1: A light bulb is lighting up while the tree is growing up at the same time", "video-llm-prediction": "Option 1: A light bulb is lighting up while the tree is growing up at the same time.", "match_success": false, "chatgpt-response": "Correct", "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A light bulb is lighting up while the tree is growing up at the same time\nSentence B: The tree is growing up and then a light bulb is lighting up.", "gt-answer": "Sentence A: A light bulb is lighting up while the tree is growing up at the same time", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The light bulb lights up after the tree grows.\nCaption B: A light bulb is lighting up while the tree is growing up at the same time", "gt-answer": "Caption B: A light bulb is lighting up while the tree is growing up at the same time", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1060403198_1049965795_1": { "order": [ { "question": "Which caption matches the video better?\nCaption A: The tree is growing up and then a light bulb is lighting up.\nCaption B: A light bulb is lighting up while the tree is growing up at the same time", "gt-answer": "Caption A: The tree is growing up and then a light bulb is lighting up.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A light bulb is lighting up and then the tree is growing up.\nSentence B: The tree is growing up and then a light bulb is lighting up.", "gt-answer": "Sentence B: The tree is growing up and then a light bulb is lighting up.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The tree is growing up after a light bulb is lighting up.\nOption 2: The tree is growing up and then a light bulb is lighting up.", "gt-answer": "Option 2: The tree is growing up and then a light bulb is lighting up.", "video-llm-prediction": "Option 1: The tree is growing up after a light bulb is lighting up.", "match_success": true, "rating": 0 } ] }, "1060403198_1049965795_2": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The video shows a light bulb lighting up first and then the tree growing up.\nSentence B: The tree is growing up and then a light bulb is lighting up.", "gt-answer": "Sentence A: The video shows a light bulb lighting up first and then the tree growing up.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A light bulb is lighting up while the tree is growing up at the same time\nOption 2: The video shows a light bulb lighting up first and then the tree growing up.", "gt-answer": "Option 2: The video shows a light bulb lighting up first and then the tree growing up.", "video-llm-prediction": "Option 2: The video shows a light bulb lighting up first and then the tree growing up.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video shows a light bulb lighting up first and then the tree growing up.\nCaption B: The video shows the tree growing up first and then a light bulb lighting up.", "gt-answer": "Caption A: The video shows a light bulb lighting up first and then the tree growing up.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1091143117_1094219575_0": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: In the video, first a flower is wilting and then an eys is opening.\nOption 2: Both events of an eye opening and a flower wilting happen simultaneously in the video.", "gt-answer": "Option 2: Both events of an eye opening and a flower wilting happen simultaneously in the video.", "video-llm-prediction": "Option 1: In the video, first a flower is wilting and then an eys is opening.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: An eye is opening in the video, followed by a flower wilting.\nCaption B: Both events of an eye opening and a flower wilting happen simultaneously in the video.", "gt-answer": "Caption B: Both events of an eye opening and a flower wilting happen simultaneously in the video.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: An eye is opening in the video, followed by a flower wilting.\nSentence B: Both events of an eye opening and a flower wilting happen simultaneously in the video.", "gt-answer": "Sentence B: Both events of an eye opening and a flower wilting happen simultaneously in the video.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 } ] }, "1091143117_1094219575_1": { "order": [ { "question": "Which caption matches the video better?\nCaption A: Both events of an eye opening and a flower wilting happen simultaneously.\nCaption B: In the video, an eye is opening first and then a flower is wilting.", "gt-answer": "Caption B: In the video, an eye is opening first and then a flower is wilting.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A flower is wilting first, and then an eye is opening.\nSentence B: In the video, an eye is opening first and then a flower is wilting.", "gt-answer": "Sentence B: In the video, an eye is opening first and then a flower is wilting.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: In the video, an eye is opening first and then a flower is wilting.\nOption 2: A flower is wilting first, then an eye is opening.", "gt-answer": "Option 1: In the video, an eye is opening first and then a flower is wilting.", "video-llm-prediction": "Option 2: A flower is wilting first, then an eye is opening.", "match_success": true, "rating": 0 } ] }, "1091143117_1094219575_2": { "order": [ { "question": "Which caption matches the video better?\nCaption A: In the video, a flower is wilting first, and then an eye is opening.\nCaption B: An eye is opening first, then a flower is wilting in the video.", "gt-answer": "Caption A: In the video, a flower is wilting first, and then an eye is opening.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: In the video, a flower is wilting first, and then an eye is opening.\nOption 2: Both events, a flower wilting and an eye opening, are happening simultaneously in the video.", "gt-answer": "Option 1: In the video, a flower is wilting first, and then an eye is opening.", "video-llm-prediction": "Option 2: In the video, a flower is wilting first, and then an eye is opening.", "match_success": false, "chatgpt-response": "Incorrect", "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: In the video, a flower is wilting first, and then an eye is opening.\nSentence B: An eye is opening before a flower wilts in the video.", "gt-answer": "Sentence A: In the video, a flower is wilting first, and then an eye is opening.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1086105089_1085910917_0": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: First, a building is being constructed; then, a woman is drinking water.\nOption 2: A building is being constructed while at the same time a woman is drinking water.", "gt-answer": "Option 2: A building is being constructed while at the same time a woman is drinking water.", "video-llm-prediction": "Option 2: A building is being constructed while at the same time a woman is drinking water.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A woman is drinking water happens before a building is being constructed.\nCaption B: A building is being constructed while at the same time a woman is drinking water.", "gt-answer": "Caption B: A building is being constructed while at the same time a woman is drinking water.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A building is being constructed happens before a woman is drinking water.\nSentence B: A building is being constructed while at the same time a woman is drinking water.", "gt-answer": "Sentence B: A building is being constructed while at the same time a woman is drinking water.", "video-llm-prediction": "Sentence B: A building is being constructed while at the same time a woman is drinking water.", "match_success": true, "rating": 1 } ] }, "1086105089_1085910917_1": { "order": [ { "question": "Which caption matches the video better?\nCaption A: First, a building is being constructed; then, a woman is drinking water.\nCaption B: A woman is drinking water before a building is being constructed.", "gt-answer": "Caption A: First, a building is being constructed; then, a woman is drinking water.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A building is being constructed while at the same time a woman is drinking water.\nOption 2: First, a building is being constructed; then, a woman is drinking water.", "gt-answer": "Option 2: First, a building is being constructed; then, a woman is drinking water.", "video-llm-prediction": "Option 2: First, a building is being constructed; then, a woman is drinking water.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: First, a woman is drinking water; then, a building is being constructed.\nSentence B: First, a building is being constructed; then, a woman is drinking water.", "gt-answer": "Sentence B: First, a building is being constructed; then, a woman is drinking water.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 } ] }, "1086105089_1085910917_2": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: First, a woman is drinking water; then, a building is being constructed.\nOption 2: A building is being constructed while at the same time a woman is drinking water.", "gt-answer": "Option 1: First, a woman is drinking water; then, a building is being constructed.", "video-llm-prediction": "Option 1: First, a woman is drinking water; then, a building is being constructed.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A building is being constructed happens before a woman is drinking water.\nSentence B: First, a woman is drinking water; then, a building is being constructed.", "gt-answer": "Sentence B: First, a woman is drinking water; then, a building is being constructed.", "video-llm-prediction": "Sentence B: First, a woman is drinking water; then, a building is being constructed.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: First, a building is being constructed; then, a woman is drinking water.\nCaption B: First, a woman is drinking water; then, a building is being constructed.", "gt-answer": "Caption B: First, a woman is drinking water; then, a building is being constructed.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1034736653_1043054206_0": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: Women are running first, then cars are racing.\nOption 2: Woman running and car racing happen simultaneously.", "gt-answer": "Option 2: Woman running and car racing happen simultaneously.", "video-llm-prediction": "Option 2: Woman running and car racing happen simultaneously.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: Cars are racing first, then women are running.\nCaption B: Woman running and car racing happen simultaneously.", "gt-answer": "Caption B: Woman running and car racing happen simultaneously.", "video-llm-prediction": "A", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: Women are running first, then cars are racing.\nSentence B: Woman running and car racing happen simultaneously.", "gt-answer": "Sentence B: Woman running and car racing happen simultaneously.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 } ] }, "1034736653_1043054206_1": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video shows women running first, followed by cars racing.\nOption 2: Woman running and car racing happen simultaneously.", "gt-answer": "Option 1: The video shows women running first, followed by cars racing.", "video-llm-prediction": "Option 2: The video shows woman running and car racing happening simultaneously.", "match_success": false, "chatgpt-response": "Incorrect", "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The cars are racing first, followed by women running.\nCaption B: The video shows women running first, followed by cars racing.", "gt-answer": "Caption B: The video shows women running first, followed by cars racing.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The cars are racing first, followed by women running.\nSentence B: The video shows women running first, followed by cars racing.", "gt-answer": "Sentence B: The video shows women running first, followed by cars racing.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 } ] }, "1034736653_1043054206_2": { "order": [ { "question": "Which caption matches the video better?\nCaption A: The cars are racing first, followed by women running.\nCaption B: Woman running and car racing happen simultaneously.", "gt-answer": "Caption A: The cars are racing first, followed by women running.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The cars are racing first, followed by women running.\nSentence B: The video shows women running first, followed by cars racing.", "gt-answer": "Sentence A: The cars are racing first, followed by women running.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Cars are racing after the women have started running.\nOption 2: The cars are racing first, followed by women running.", "gt-answer": "Option 2: The cars are racing first, followed by women running.", "video-llm-prediction": "Option 2: The cars are racing first, followed by women running.", "match_success": true, "rating": 1 } ] }, "1086423566_1094782389_0": { "order": [ { "question": "Which caption matches the video better?\nCaption A: Foods are added into a bowl, while at the same time a dog is giving a high five to a woman.\nCaption B: A dog is giving a high five to a woman, and then foods are added into a bowl.", "gt-answer": "Caption A: Foods are added into a bowl, while at the same time a dog is giving a high five to a woman.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Foods are added into a bowl, while at the same time a dog is giving a high five to a woman.\nOption 2: A dog is giving a high five to a woman, and then foods are added into a bowl.", "gt-answer": "Option 1: Foods are added into a bowl, while at the same time a dog is giving a high five to a woman.", "video-llm-prediction": "Option 1: Foods are added into a bowl, while at the same time a dog is giving a high five to a woman.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: Foods are added into a bowl, while at the same time a dog is giving a high five to a woman.\nSentence B: Foods are added into a bowl, and then a dog is giving a high five to a woman.", "gt-answer": "Sentence A: Foods are added into a bowl, while at the same time a dog is giving a high five to a woman.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1086423566_1094782389_1": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: Foods are added into a bowl, and then a dog is giving a high five to a woman.\nSentence B: A dog is giving a high five to a woman, and then foods are added into a bowl.", "gt-answer": "Sentence A: Foods are added into a bowl, and then a dog is giving a high five to a woman.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Simultaneously, foods are added into a bowl and a dog is giving a high five to a woman.\nOption 2: Foods are added into a bowl, and then a dog is giving a high five to a woman.", "gt-answer": "Option 2: Foods are added into a bowl, and then a dog is giving a high five to a woman.", "video-llm-prediction": "Option 2: Foods are added into a bowl, and then a dog is giving a high five to a woman.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A dog is giving a high five to a woman, and then foods are added into a bowl.\nCaption B: Foods are added into a bowl, and then a dog is giving a high five to a woman.", "gt-answer": "Caption B: Foods are added into a bowl, and then a dog is giving a high five to a woman.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "1086423566_1094782389_2": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: In the video, a dog is giving a high five to a woman, and then foods are added into a bowl.\nOption 2: Simultaneously, foods are added into a bowl and a dog is giving a high five to a woman.", "gt-answer": "Option 1: In the video, a dog is giving a high five to a woman, and then foods are added into a bowl.", "video-llm-prediction": "Option 1: In the video, a dog is giving a high five to a woman, and then foods are added into a bowl.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: Foods are added into a bowl, while at the same time a dog is giving a high five to a woman.\nCaption B: In the video, a dog is giving a high five to a woman, and then foods are added into a bowl.", "gt-answer": "Caption B: In the video, a dog is giving a high five to a woman, and then foods are added into a bowl.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: In the video, a dog is giving a high five to a woman, and then foods are added into a bowl.\nSentence B: Foods are added into a bowl, and then a dog is giving a high five to a woman.", "gt-answer": "Sentence A: In the video, a dog is giving a high five to a woman, and then foods are added into a bowl.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1053758117_1055413730_0": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The digital brain and the earth are spinning at the same time.\nSentence B: First, the digital brain is spinning and then the earth is spinning.", "gt-answer": "Sentence A: The digital brain and the earth are spinning at the same time.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The digital brain and the earth are spinning at the same time.\nOption 2: The earth is spinning first, followed by the digital brain spinning.", "gt-answer": "Option 1: The digital brain and the earth are spinning at the same time.", "video-llm-prediction": "Option 2: The earth is spinning first, followed by the digital brain spinning.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The video shows the earth spinning.\nCaption B: The digital brain and the earth are spinning at the same time.", "gt-answer": "Caption B: The digital brain and the earth are spinning at the same time.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1053758117_1055413730_1": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: First, the digital brain is spinning and then the earth is spinning.\nOption 2: The digital brain and the earth are spinning at the same time.", "gt-answer": "Option 1: First, the digital brain is spinning and then the earth is spinning.", "video-llm-prediction": "Option 2: The digital brain and the earth are spinning at the same time.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: In the video, the earth is spinning before the digital brain starts spinning.\nSentence B: First, the digital brain is spinning and then the earth is spinning.", "gt-answer": "Sentence B: First, the digital brain is spinning and then the earth is spinning.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: First, the digital brain is spinning and then the earth is spinning.\nCaption B: In the video, the earth is spinning before the digital brain starts spinning.", "gt-answer": "Caption A: First, the digital brain is spinning and then the earth is spinning.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1053758117_1055413730_2": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The digital brain and the earth are spinning simultaneously.\nSentence B: In the video, the earth is spinning before the digital brain starts spinning.", "gt-answer": "Sentence B: In the video, the earth is spinning before the digital brain starts spinning.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: In the video, the earth is spinning before the digital brain starts spinning.\nOption 2: First, the digital brain is spinning and then the earth is spinning.", "gt-answer": "Option 1: In the video, the earth is spinning before the digital brain starts spinning.", "video-llm-prediction": "Option 1: In the video, the earth is spinning before the digital brain starts spinning.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A video showing the digital brain spinning and then the earth spinning.\nCaption B: In the video, the earth is spinning before the digital brain starts spinning.", "gt-answer": "Caption B: In the video, the earth is spinning before the digital brain starts spinning.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1076877854_1073914229_0": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: Robotic arms are working, then trains are driving.\nSentence B: Trains are driving and robotic arms are working at the same time.", "gt-answer": "Sentence B: Trains are driving and robotic arms are working at the same time.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Trains are driving and robotic arms are working at the same time.\nOption 2: The robotic arms are working first, then the trains are driving.", "gt-answer": "Option 1: Trains are driving and robotic arms are working at the same time.", "video-llm-prediction": "Option 2: The robotic arms are working first, then the trains are driving.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: Trains are driving and robotic arms are working at the same time.\nCaption B: Trains are driving, then robotic arms are working.", "gt-answer": "Caption A: Trains are driving and robotic arms are working at the same time.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1076877854_1073914229_1": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The trains are driving first, and then robotic arms are working.\nSentence B: The robotic arms are working first, then the trains are driving.", "gt-answer": "Sentence B: The robotic arms are working first, then the trains are driving.", "video-llm-prediction": "Sentence B: The robotic arms are working first, then the trains are driving.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Both the robotic arms and trains are shown at the same time.\nOption 2: The robotic arms are working first, then the trains are driving.", "gt-answer": "Option 2: The robotic arms are working first, then the trains are driving.", "video-llm-prediction": "Option 2: The robotic arms are working first, then the trains are driving.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The robotic arms are working first, then the trains are driving.\nCaption B: The trains are driving first, and then robotic arms are working.", "gt-answer": "Caption A: The robotic arms are working first, then the trains are driving.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1076877854_1073914229_2": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: In the video, robotic arms are working and trains are driving at the same time.\nSentence B: First, trains are driving, then robotic arms are working.", "gt-answer": "Sentence B: First, trains are driving, then robotic arms are working.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: First, trains are driving, then robotic arms are working.\nOption 2: The robotic arms are working first, then the trains are driving.", "gt-answer": "Option 1: First, trains are driving, then robotic arms are working.", "video-llm-prediction": "Option 2: The robotic arms are working first, then the trains are driving.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The robotic arms are working first, then the trains are driving.\nCaption B: First, trains are driving, then robotic arms are working.", "gt-answer": "Caption B: First, trains are driving, then robotic arms are working.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1067268406_1067427023_0": { "order": [ { "question": "Which caption matches the video better?\nCaption A: Snow particles are melting, then a white car is driving.\nCaption B: In the video, a white car is driving while snow particles are melting simultaneously.", "gt-answer": "Caption B: In the video, a white car is driving while snow particles are melting simultaneously.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A white car is driving first, then snow particles are melting.\nSentence B: In the video, a white car is driving while snow particles are melting simultaneously.", "gt-answer": "Sentence B: In the video, a white car is driving while snow particles are melting simultaneously.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: In the video, a white car is driving while snow particles are melting simultaneously.\nOption 2: A white car is driving first, then snow particles are melting.", "gt-answer": "Option 1: In the video, a white car is driving while snow particles are melting simultaneously.", "video-llm-prediction": "Option 1: In the video, a white car is driving while snow particles are melting simultaneously.", "match_success": true, "rating": 1 } ] }, "1067268406_1067427023_1": { "order": [ { "question": "Which caption matches the video better?\nCaption A: A white car is driving first, then snow particles are melting.\nCaption B: Snow particles are melting, then a white car is driving.", "gt-answer": "Caption A: A white car is driving first, then snow particles are melting.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A white car is driving while snow particles are melting simultaneously.\nSentence B: A white car is driving first, then snow particles are melting.", "gt-answer": "Sentence B: A white car is driving first, then snow particles are melting.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Snow particles are melting, before a white car is driving.\nOption 2: A white car is driving first, then snow particles are melting.", "gt-answer": "Option 2: A white car is driving first, then snow particles are melting.", "video-llm-prediction": "Option 2: A white car is driving first, then snow particles are melting.", "match_success": true, "rating": 1 } ] }, "1067268406_1067427023_2": { "order": [ { "question": "Which caption matches the video better?\nCaption A: Snow particles are melting, then a white car is driving.\nCaption B: A white car is driving while snow particles are melting simultaneously.", "gt-answer": "Caption A: Snow particles are melting, then a white car is driving.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Snow particles are melting, then a white car is driving.\nOption 2: A white car is driving, then snow particles are melting.", "gt-answer": "Option 1: Snow particles are melting, then a white car is driving.", "video-llm-prediction": "Option 1: Snow particles are melting, then a white car is driving.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: Snow particles are melting, then a white car is driving.\nSentence B: A white car is driving first, then snow particles are melting.", "gt-answer": "Sentence A: Snow particles are melting, then a white car is driving.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 } ] }, "1086286649": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The armored vehicles are driving.\nSentence B: The armored vehicles are loading people.", "gt-answer": "Sentence A: The armored vehicles are driving.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The armored vehicles are parked.\nCaption B: The armored vehicles are driving.", "gt-answer": "Caption B: The armored vehicles are driving.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The armored vehicles are driving.\nOption 2: The armored vehicles are firing.", "gt-answer": "Option 1: The armored vehicles are driving.", "video-llm-prediction": "Option 1: The armored vehicles are driving.", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The armored vehicles are moving from right to left.\nSentence B: The armored vehicles are moving from left to right.", "gt-answer": "Sentence A: The armored vehicles are moving from right to left.", "video-llm-prediction": "Sentence A: The armored vehicles are moving from right to left.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The armored vehicles are approaching from the upper side of the screen.\nOption 2: The armored vehicles are moving from right to left.", "gt-answer": "Option 2: The armored vehicles are moving from right to left.", "video-llm-prediction": "Option 2: The armored vehicles are moving from right to left.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The armored vehicles are approaching from left to right.\nCaption B: The armored vehicles are moving from right to left.", "gt-answer": "Caption B: The armored vehicles are moving from right to left.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ], "speed": [ { "question": "Which caption matches the video better?\nCaption A: The armored vehicles are stationary.\nCaption B: The armored vehicles are moving slowly.", "gt-answer": "Caption B: The armored vehicles are moving slowly.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The armored vehicles are moving rapidly.\nOption 2: The armored vehicles are moving slowly.", "gt-answer": "Option 2: The armored vehicles are moving slowly.", "video-llm-prediction": "Option 1: The armored vehicles are moving rapidly.", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The pace of the armored vehicles is fast.\nSentence B: The armored vehicles are moving slowly.", "gt-answer": "Sentence B: The armored vehicles are moving slowly.", "video-llm-prediction": "Sentence A: The pace of the armored vehicles is fast.", "match_success": true, "rating": 0 } ] }, "1093041749": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The dragon is breathing fire.\nSentence B: A video showing a dragon flying.", "gt-answer": "Sentence A: The dragon is breathing fire.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The dragon is breathing fire.\nOption 2: The dragon is swimming.", "gt-answer": "Option 1: The dragon is breathing fire.", "video-llm-prediction": "Option 1: The dragon is breathing fire.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The dragon is breathing fire.\nCaption B: The dragon is eating food.", "gt-answer": "Caption A: The dragon is breathing fire.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The dragon is flying to the sky and then breathing fire.\nOption 2: The dragon is landing from the sky and then breathing fire.", "gt-answer": "Option 2: The dragon is landing from the sky and then breathing fire.", "video-llm-prediction": "Option 1: The dragon is flying to the sky and then breathing fire.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The dragon is breathing fire and then flying to the sky.\nCaption B: The dragon is landing from the sky and then breathing fire.", "gt-answer": "Caption B: The dragon is landing from the sky and then breathing fire.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The dragon is landing from the sky and then breathing fire.\nSentence B: The dragon is breathing fire and then landing from the sky.", "gt-answer": "Sentence A: The dragon is landing from the sky and then breathing fire.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 } ] }, "1076180768": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The knight is putting on armor.\nOption 2: The knight is drawing a sword from the sheath.", "gt-answer": "Option 2: The knight is drawing a sword from the sheath.", "video-llm-prediction": "Option 2: The knight is drawing a sword from the sheath.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The knight is drawing a sword from the sheath.\nSentence B: A video showing the knight wielding a spear.", "gt-answer": "Sentence A: The knight is drawing a sword from the sheath.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The knight is drawing a sword from the sheath.\nCaption B: The knight is shooting an arrow.", "gt-answer": "Caption A: The knight is drawing a sword from the sheath.", "video-llm-prediction": "A", "match_success": true, "rating": 1 } ] }, "3406661033": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The man is propelling himself forward in a wheelchair.\nCaption B: The man is swimming.", "gt-answer": "Caption A: The man is propelling himself forward in a wheelchair.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man is propelling himself forward in a wheelchair.\nOption 2: A video showing the man walking.", "gt-answer": "Option 1: The man is propelling himself forward in a wheelchair.", "video-llm-prediction": "Option 1: The man is propelling himself forward in a wheelchair.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is jumping.\nSentence B: The man is propelling himself forward in a wheelchair.", "gt-answer": "Sentence B: The man is propelling himself forward in a wheelchair.", "video-llm-prediction": "B", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The man stood up from the wheelchair and then handed the document to colleagues.\nOption 2: The man first propelled himself forward in the wheelchair and then handed the document to colleagues.", "gt-answer": "Option 2: The man first propelled himself forward in the wheelchair and then handed the document to colleagues.", "video-llm-prediction": "Option 2: The man first propelled himself forward in the wheelchair and then handed the document to colleagues.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The man handed the document to colleagues and then propelled himself forward in the wheelchair.\nCaption B: The man first propelled himself forward in the wheelchair and then handed the document to colleagues.", "gt-answer": "Caption B: The man first propelled himself forward in the wheelchair and then handed the document to colleagues.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man handed the document to colleagues and then propelled himself forward in the wheelchair.\nSentence B: The man first propelled himself forward in the wheelchair and then handed the document to colleagues.", "gt-answer": "Sentence B: The man first propelled himself forward in the wheelchair and then handed the document to colleagues.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ] }, "3406618583": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The woman is performing freestyle dance.\nCaption B: The woman is performing hip-hop.", "gt-answer": "Caption A: The woman is performing freestyle dance.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is performing freestyle dance.\nSentence B: The woman is doing ballet.", "gt-answer": "Sentence A: The woman is performing freestyle dance.", "video-llm-prediction": "Sentence A: The woman is performing freestyle dance.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A video showing the woman engaged in jazz dance.\nOption 2: The woman is performing freestyle dance.", "gt-answer": "Option 2: The woman is performing freestyle dance.", "video-llm-prediction": "Option 1: A video showing the woman engaged in jazz dance.", "match_success": true, "rating": 0 } ], "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The camera is panning up and down around the girl.\nOption 2: The camera is panning left and right around the girl.", "gt-answer": "Option 1: The camera is panning up and down around the girl.", "video-llm-prediction": "Option 1: The camera is panning up and down around the girl.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The camera is rotating clockwise around the girl.\nCaption B: The camera is panning up and down around the girl.", "gt-answer": "Caption B: The camera is panning up and down around the girl.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The camera is zooming in and out around the girl.\nSentence B: The camera is panning up and down around the girl.", "gt-answer": "Sentence B: The camera is panning up and down around the girl.", "video-llm-prediction": "Sentence A: The camera is zooming in and out around the girl.", "match_success": true, "rating": 0 } ] }, "3405120819": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The chef is cutting dough.\nCaption B: The chef is shaping dough.", "gt-answer": "Caption B: The chef is shaping dough.", "video-llm-prediction": "Caption B: The chef is shaping dough.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The chef is shaping dough.\nOption 2: The chef is placing dough in a dish.", "gt-answer": "Option 1: The chef is shaping dough.", "video-llm-prediction": "Option 1: The chef is shaping dough.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The chef is shaping dough.\nSentence B: The chef is mixing dough with water.", "gt-answer": "Sentence A: The chef is shaping dough.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The chef is smiling at the camera and then shaping dough.\nSentence B: The chef is shaping dough and then smiling at the camera.", "gt-answer": "Sentence B: The chef is shaping dough and then smiling at the camera.", "video-llm-prediction": "Sentence A: The chef is smiling at the camera and then shaping dough.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The chef is smiling at the camera and then shaping dough.\nOption 2: The chef is shaping dough and then smiling at the camera.", "gt-answer": "Option 2: The chef is shaping dough and then smiling at the camera.", "video-llm-prediction": "Option 2: The chef is shaping dough and then smiling at the camera.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The chef is smiling at the camera and then shaping dough.\nCaption B: The chef is shaping dough and then smiling at the camera.", "gt-answer": "Caption B: The chef is shaping dough and then smiling at the camera.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "3405088753": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is buying bread in the video.\nSentence B: The woman is cooking soup.", "gt-answer": "Sentence A: The woman is buying bread in the video.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The woman is eating an apple.\nCaption B: The woman is buying bread in the video.", "gt-answer": "Caption B: The woman is buying bread in the video.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is buying bread in the video.\nOption 2: The woman is sleeping.", "gt-answer": "Option 1: The woman is buying bread in the video.", "video-llm-prediction": "Option 1: The woman is buying bread in the video.", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The salesman is saying something to a woman and then packing up the bread for her.\nOption 2: The salesman is packing up the bread for the woman and then saying something to her.", "gt-answer": "Option 2: The salesman is packing up the bread for the woman and then saying something to her.", "video-llm-prediction": "Option 1: The salesman is saying something to a woman and then packing up the bread for her.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The salesman is saying something to a woman and then packing up the bread for her.\nCaption B: The salesman is packing up the bread for the woman and then saying something to her.", "gt-answer": "Caption B: The salesman is packing up the bread for the woman and then saying something to her.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The salesman is packing up the bread for the woman and then saying something to her.\nSentence B: The salesman is saying something to a woman and then packing up the bread for her.", "gt-answer": "Sentence A: The salesman is packing up the bread for the woman and then saying something to her.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "3401191309": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: An athletic man is taking off a prosthetic running blade.\nSentence B: The athletic man is putting on a prosthetic running blade.", "gt-answer": "Sentence A: An athletic man is taking off a prosthetic running blade.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: An athletic man is taking off a prosthetic running blade.\nCaption B: The athletic man is repairing a prosthetic running blade.", "gt-answer": "Caption A: An athletic man is taking off a prosthetic running blade.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: An athletic man is taking off a prosthetic running blade.\nOption 2: A video showing an athletic man running with a prosthetic blade.", "gt-answer": "Option 1: An athletic man is taking off a prosthetic running blade.", "video-llm-prediction": "Option 1: An athletic man is taking off a prosthetic running blade.", "match_success": true, "rating": 1 } ] }, "21218935": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is exercising.\nOption 2: The woman is cooking.", "gt-answer": "Option 1: The woman is exercising.", "video-llm-prediction": "Option 1: The woman is exercising.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A woman is sleeping.\nCaption B: The woman is exercising.", "gt-answer": "Caption B: The woman is exercising.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is exercising.\nSentence B: The woman is reading.", "gt-answer": "Sentence A: The woman is exercising.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ], "order": [ { "question": "Which caption matches the video better?\nCaption A: The woman first smiles at the camera and then does push ups.\nCaption B: The woman first does push ups and then smiles at the camera.", "gt-answer": "Caption B: The woman first does push ups and then smiles at the camera.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman first smiles at the camera and then does push ups.\nSentence B: The woman first does push ups and then smiles at the camera.", "gt-answer": "Sentence B: The woman first does push ups and then smiles at the camera.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The woman first smiles at the camera and then does push ups.\nOption 2: The woman first does push ups and then smiles at the camera.", "gt-answer": "Option 2: The woman first does push ups and then smiles at the camera.", "video-llm-prediction": "Option 1: The woman first smiles at the camera and then does push ups.", "match_success": true, "rating": 0 } ] }, "5395244": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is walking across the pebble creek.\nOption 2: The woman is canoeing across the pebble creek.", "gt-answer": "Option 1: The woman is walking across the pebble creek.", "video-llm-prediction": "Option 1: The woman is walking across the pebble creek.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is swimming across the pebble creek.\nSentence B: The woman is walking across the pebble creek.", "gt-answer": "Sentence B: The woman is walking across the pebble creek.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The woman is running across the pebble creek.\nCaption B: The woman is walking across the pebble creek.", "gt-answer": "Caption B: The woman is walking across the pebble creek.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1037628617": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: A person is pressing a keyboard key and rotating a jog dial.\nSentence B: A person is clicking button and rolling wheel on a mouse.", "gt-answer": "Sentence B: A person is clicking button and rolling wheel on a mouse.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person is tapping a touchpad and scrolling with two fingers.\nOption 2: A person is clicking button and rolling wheel on a mouse.", "gt-answer": "Option 2: A person is clicking button and rolling wheel on a mouse.", "video-llm-prediction": "Option 2: A person is clicking button and rolling wheel on a mouse.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A video showing a person's action of tapping a touchpad and scrolling with two fingers.\nCaption B: A person is clicking button and rolling wheel on a mouse.", "gt-answer": "Caption B: A person is clicking button and rolling wheel on a mouse.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "1066801717": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is cutting a bouquet of daffodils.\nOption 2: The woman is arranging a bouquet of daffodils.", "gt-answer": "Option 2: The woman is arranging a bouquet of daffodils.", "video-llm-prediction": "Option 2: The woman is arranging a bouquet of daffodils.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The woman is arranging a bouquet of daffodils.\nCaption B: A video showing a woman watering a bouquet of daffodils.", "gt-answer": "Caption A: The woman is arranging a bouquet of daffodils.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is planting a bouquet of daffodils.\nSentence B: The woman is arranging a bouquet of daffodils.", "gt-answer": "Sentence B: The woman is arranging a bouquet of daffodils.", "video-llm-prediction": "Sentence B: The woman is arranging a bouquet of daffodils.", "match_success": true, "rating": 1 } ] }, "1088428381": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is storing items in a gift box.\nSentence B: A woman is opening a gift box in the video.", "gt-answer": "Sentence B: A woman is opening a gift box in the video.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A woman is opening a gift box in the video.\nOption 2: The woman is wrapping a gift box.", "gt-answer": "Option 1: A woman is opening a gift box in the video.", "video-llm-prediction": "Option 1: A woman is opening a gift box in the video.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A woman is opening a gift box in the video.\nCaption B: The woman is delivering a gift box.", "gt-answer": "Caption A: A woman is opening a gift box in the video.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1069593640": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is putting a towel on her neck.\nSentence B: The woman is drying hair with a towel.", "gt-answer": "Sentence A: The woman is putting a towel on her neck.", "video-llm-prediction": "Sentence A: The woman is putting a towel on her neck.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The woman is putting a towel on her neck.\nCaption B: The woman is washing face with a towel.", "gt-answer": "Caption A: The woman is putting a towel on her neck.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The towel is being used to dry the woman's hair.\nOption 2: The woman is putting a towel on her neck.", "gt-answer": "Option 2: The woman is putting a towel on her neck.", "video-llm-prediction": "Option 1: The towel is being used to dry the woman's hair.", "match_success": true, "rating": 0 } ] }, "8307961": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is chatting with a woman while walking and then showing something on his tablet to her.\nSentence B: The man is walking and simultaneously showing something on his tablet to a woman.", "gt-answer": "Sentence A: The man is chatting with a woman while walking and then showing something on his tablet to her.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man is chatting with a woman while walking and then showing something on his tablet to her.\nOption 2: The man is showing something on his tablet to a woman and then chatting with her while walking.", "gt-answer": "Option 1: The man is chatting with a woman while walking and then showing something on his tablet to her.", "video-llm-prediction": "Option 1: The man is chatting with a woman while walking and then showing something on his tablet to her.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The man is showing something on his tablet to a woman and then chatting with her while walking.\nCaption B: The man is chatting with a woman while walking and then showing something on his tablet to her.", "gt-answer": "Caption B: The man is chatting with a woman while walking and then showing something on his tablet to her.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "15439453": { "action": [ { "question": "Which caption matches the video better?\nCaption A: In the video, tango dancing and playing the piano are shown.\nCaption B: The video shows tango dancing and playing the saxophone.", "gt-answer": "Caption B: The video shows tango dancing and playing the saxophone.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The video shows ballet dancing and playing the saxophone.\nOption 2: The video shows tango dancing and playing the saxophone.", "gt-answer": "Option 2: The video shows tango dancing and playing the saxophone.", "video-llm-prediction": "Option 2: The video shows tango dancing and playing the saxophone.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video depicts tango dancing and playing the violin.\nSentence B: The video shows tango dancing and playing the saxophone.", "gt-answer": "Sentence B: The video shows tango dancing and playing the saxophone.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 } ] }, "1107947873": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The doctor is giving medicine to a patient.\nSentence B: The doctor is writing a prescription.", "gt-answer": "Sentence A: The doctor is giving medicine to a patient.", "video-llm-prediction": "Sentence A: The doctor is giving medicine to a patient.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The doctor is giving medicine to a patient.\nCaption B: The doctor is performing a surgery.", "gt-answer": "Caption A: The doctor is giving medicine to a patient.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The doctor is giving medicine to a patient.\nOption 2: The doctor is checking vitals.", "gt-answer": "Option 1: The doctor is giving medicine to a patient.", "video-llm-prediction": "Option 1: The doctor is giving medicine to a patient.", "match_success": true, "rating": 1 } ] }, "1081194848": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The researcher is teaching children how to dissect a frog.\nOption 2: The researcher is teaching children how to cook a frog.", "gt-answer": "Option 1: The researcher is teaching children how to dissect a frog.", "video-llm-prediction": "Option 1: The researcher is teaching children how to dissect a frog.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The researcher is teaching children how to dissect a frog.\nSentence B: The researcher is teaching children how to feed a frog.", "gt-answer": "Sentence A: The researcher is teaching children how to dissect a frog.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The researcher is teaching children how to dissect a frog.\nCaption B: The researcher is teaching children how to catch a frog.", "gt-answer": "Caption A: The researcher is teaching children how to dissect a frog.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1091526823": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The hand is opening a book.\nSentence B: The hand is turning over pages of a book.", "gt-answer": "Sentence B: The hand is turning over pages of a book.", "video-llm-prediction": "Sentence B.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The hand is turning over pages of a book.\nCaption B: Someone is writing on a book.", "gt-answer": "Caption A: The hand is turning over pages of a book.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The hand is closing a book.\nOption 2: The hand is turning over pages of a book.", "gt-answer": "Option 2: The hand is turning over pages of a book.", "video-llm-prediction": "Option 2: The hand is turning over pages of a book.", "match_success": true, "rating": 1 } ] }, "26631100": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: First, two hand are holding the clay pot and then three hands are holding it.\nSentence B: In the video, three hands are holding the clay pot, then two hands are holding the clay pot.", "gt-answer": "Sentence B: In the video, three hands are holding the clay pot, then two hands are holding the clay pot.", "video-llm-prediction": "Sentence A: First, two hand are holding the clay pot and then three hands are holding it.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: In the video, three hands are holding the clay pot, then two hands are holding the clay pot.\nCaption B: First, two hand are holding the clay pot and then three hands are holding it.", "gt-answer": "Caption A: In the video, three hands are holding the clay pot, then two hands are holding the clay pot.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: First, two hand are holding the clay pot and then three hands are holding it.\nOption 2: In the video, three hands are holding the clay pot, then two hands are holding the clay pot.", "gt-answer": "Option 2: In the video, three hands are holding the clay pot, then two hands are holding the clay pot.", "video-llm-prediction": "Option 1: First, two hand are holding the clay pot and then three hands are holding it.", "match_success": true, "rating": 0 } ] }, "1104663381": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The man is cutting a pizza dough.\nOption 2: The man is stretching a pizza dough.", "gt-answer": "Option 2: The man is stretching a pizza dough.", "video-llm-prediction": "Option 2: The man is stretching a pizza dough.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A video showing the man folding a pizza dough.\nCaption B: The man is stretching a pizza dough.", "gt-answer": "Caption B: The man is stretching a pizza dough.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is stretching a pizza dough.\nSentence B: The man is cutting a pizza dough.", "gt-answer": "Sentence A: The man is stretching a pizza dough.", "video-llm-prediction": "Sentence A: The man is stretching a pizza dough.", "match_success": true, "rating": 1 } ], "direction": [ { "question": "Which caption matches the video better?\nCaption A: The pizza dough is rotating counter-clockwise.\nCaption B: The pizza dough is rotating clockwise.", "gt-answer": "Caption A: The pizza dough is rotating counter-clockwise.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The pizza dough is rotating counter-clockwise.\nOption 2: The pizza dough is rotating clockwise.", "gt-answer": "Option 1: The pizza dough is rotating counter-clockwise.", "video-llm-prediction": "Option 1: The pizza dough is rotating counter-clockwise.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The pizza dough is flipping up and down.\nSentence B: The pizza dough is rotating counter-clockwise.", "gt-answer": "Sentence B: The pizza dough is rotating counter-clockwise.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 } ] }, "22100041": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The woman in the video is doing an invitation gesture.\nCaption B: The woman is waving goodbye.", "gt-answer": "Caption A: The woman in the video is doing an invitation gesture.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: She is clapping her hands.\nOption 2: The woman in the video is doing an invitation gesture.", "gt-answer": "Option 2: The woman in the video is doing an invitation gesture.", "video-llm-prediction": "Option 2: The woman in the video is doing an invitation gesture.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is pointing towards something.\nSentence B: The woman in the video is doing an invitation gesture.", "gt-answer": "Sentence B: The woman in the video is doing an invitation gesture.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 } ] }, "1103623921": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The man is playing football.\nCaption B: The man is playing pool.", "gt-answer": "Caption B: The man is playing pool.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man is playing pool.\nOption 2: The man is sleeping.", "gt-answer": "Option 1: The man is playing pool.", "video-llm-prediction": "Option 1: The man is playing pool.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is dancing.\nSentence B: The man is playing pool.", "gt-answer": "Sentence B: The man is playing pool.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ], "speed": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The video is being played at a normal speed.\nOption 2: The video is being played in slow-motion.", "gt-answer": "Option 1: The video is being played at a normal speed.", "video-llm-prediction": "Option 1: The video is being played at a normal speed.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The video is being played at a normal speed.\nCaption B: The video is being fast-forwarded.", "gt-answer": "Caption A: The video is being played at a normal speed.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The video is being played at a normal speed.\nSentence B: The video is stopped.", "gt-answer": "Sentence A: The video is being played at a normal speed.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1093338031": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The man is trying to punch a computer screen.\nOption 2: The man is attempting to click a computer screen.", "gt-answer": "Option 1: The man is trying to punch a computer screen.", "video-llm-prediction": "Option 2: The man is attempting to click a computer screen.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The man is trying to punch a computer screen.\nCaption B: He is trying to fix a computer screen.", "gt-answer": "Caption A: The man is trying to punch a computer screen.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is cleaning a computer screen.\nSentence B: The man is trying to punch a computer screen.", "gt-answer": "Sentence B: The man is trying to punch a computer screen.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 } ] }, "1064310466": { "action": [ { "question": "Which caption matches the video better?\nCaption A: A man is peeling cocoa seeds.\nCaption B: The man is watering cocoa seeds.", "gt-answer": "Caption A: A man is peeling cocoa seeds.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is planting cocoa seeds.\nSentence B: A man is peeling cocoa seeds.", "gt-answer": "Sentence B: A man is peeling cocoa seeds.", "video-llm-prediction": "Sentence B: A man is peeling cocoa seeds.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A man is baking cocoa seeds.\nOption 2: A man is peeling cocoa seeds.", "gt-answer": "Option 2: A man is peeling cocoa seeds.", "video-llm-prediction": "Option 2: A man is peeling cocoa seeds.", "match_success": true, "rating": 1 } ] }, "1088497853": { "direction": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The movement in the video is upwards.\nOption 2: The movement in the video is leftwards.", "gt-answer": "Option 1: The movement in the video is upwards.", "video-llm-prediction": "Option 1: The movement in the video is upwards.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The camera is moving downwards before the stone building.\nCaption B: The movement in the video is upwards.", "gt-answer": "Caption B: The movement in the video is upwards.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The movement in the video is rightwards.\nSentence B: The movement in the video is upwards.", "gt-answer": "Sentence B: The movement in the video is upwards.", "video-llm-prediction": "Sentence B: The movement in the video is upwards.", "match_success": true, "rating": 1 } ] }, "1106087175": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is taking off a white sneaker.\nSentence B: The man is drawing on a white sneaker.", "gt-answer": "Sentence B: The man is drawing on a white sneaker.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man is tying shoelaces on a white sneaker.\nOption 2: The man is drawing on a white sneaker.", "gt-answer": "Option 2: The man is drawing on a white sneaker.", "video-llm-prediction": "Option 1: The man is tying shoelaces on a white sneaker.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The man is wearing a white sneaker.\nCaption B: The man is drawing on a white sneaker.", "gt-answer": "Caption B: The man is drawing on a white sneaker.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "13324343": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The man is tying his shoelaces and then gliding on the skateboard.\nOption 2: The man is gliding on the skateboard and then tying his shoelaces.", "gt-answer": "Option 1: The man is tying his shoelaces and then gliding on the skateboard.", "video-llm-prediction": "Option 1: The man is tying his shoelaces and then gliding on the skateboard.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is tying his shoelaces and then gliding on the skateboard.\nSentence B: The man is gliding on the skateboard and then tying his shoelaces.", "gt-answer": "Sentence A: The man is tying his shoelaces and then gliding on the skateboard.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The man is tying his shoelaces and then gliding on the skateboard.\nCaption B: A video showing a man gliding on the skateboard and then tying his shoelaces.", "gt-answer": "Caption A: The man is tying his shoelaces and then gliding on the skateboard.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1083624811": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is performing alternating leg lift plank.\nSentence B: The woman is performing push-ups.", "gt-answer": "Sentence A: The woman is performing alternating leg lift plank.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The woman is performing alternating leg lift plank.\nCaption B: The woman is performing a traditional plank hold.", "gt-answer": "Caption A: The woman is performing alternating leg lift plank.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is performing side plank variations.\nOption 2: The woman is performing alternating leg lift plank.", "gt-answer": "Option 2: The woman is performing alternating leg lift plank.", "video-llm-prediction": "Option 2: The woman is performing alternating leg lift plank.", "match_success": true, "rating": 1 } ] }, "1014976360": { "order": [ { "question": "Which caption matches the video better?\nCaption A: The man first looks at his phone and then takes off his glasses.\nCaption B: The man puts on his glasses and then looks at his phone.", "gt-answer": "Caption A: The man first looks at his phone and then takes off his glasses.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man takes off his glasses and then looks at his phone.\nOption 2: The man first looks at his phone and then takes off his glasses.", "gt-answer": "Option 2: The man first looks at his phone and then takes off his glasses.", "video-llm-prediction": "Option 2: The man first looks at his phone and then takes off his glasses.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man takes off his glasses and then looks at his phone.\nSentence B: The man first looks at his phone and then takes off his glasses.", "gt-answer": "Sentence B: The man first looks at his phone and then takes off his glasses.", "video-llm-prediction": "Sentence B: The man first looks at his phone and then takes off his glasses.", "match_success": true, "rating": 1 } ] }, "18416608": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman first operates her tablet, then samples water in a bottle.\nSentence B: The woman first samples water in a bottle, then operates her tablet.", "gt-answer": "Sentence B: The woman first samples water in a bottle, then operates her tablet.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The woman first samples water in a bottle, then operates her tablet.\nOption 2: The woman first operates her tablet, then samples water in a bottle.", "gt-answer": "Option 1: The woman first samples water in a bottle, then operates her tablet.", "video-llm-prediction": "Option 2: The woman first operates her tablet, then samples water in a bottle.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The woman first samples water in a bottle, then operates her tablet.\nCaption B: The woman first operates her tablet, then samples water in a bottle.", "gt-answer": "Caption A: The woman first samples water in a bottle, then operates her tablet.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1061351290": { "order": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is touching her pregnant belly and then measuring her belly with a measuring tape.\nOption 2: The woman is measuring her belly with a measuring tape and then touching her pregnant belly.", "gt-answer": "Option 1: The woman is touching her pregnant belly and then measuring her belly with a measuring tape.", "video-llm-prediction": "Option 1: The woman is touching her pregnant belly and then measuring her belly with a measuring tape.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The woman is touching her pregnant belly and then measuring her belly with a measuring tape.\nCaption B: The woman is taking off clothes then measuring her belly with a measuring tape.", "gt-answer": "Caption A: The woman is touching her pregnant belly and then measuring her belly with a measuring tape.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is touching her pregnant belly and then taking off clothes.\nSentence B: The woman is touching her pregnant belly and then measuring her belly with a measuring tape.", "gt-answer": "Sentence B: The woman is touching her pregnant belly and then measuring her belly with a measuring tape.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ] }, "1066238617": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The woman is practising her punches.\nCaption B: A woman is dancing.", "gt-answer": "Caption A: The woman is practising her punches.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is practising her punches.\nOption 2: She is cooking.", "gt-answer": "Option 1: The woman is practising her punches.", "video-llm-prediction": "Option 1: The woman is practising her punches.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is swimming.\nSentence B: The woman is practising her punches.", "gt-answer": "Sentence B: The woman is practising her punches.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 } ] }, "1072000366": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The girls are elbowing each other.\nOption 2: The girls are giving high fives to each other.", "gt-answer": "Option 1: The girls are elbowing each other.", "video-llm-prediction": "Option 1: The girls are elbowing each other.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: They are waving at each other.\nSentence B: The girls are elbowing each other.", "gt-answer": "Sentence B: The girls are elbowing each other.", "video-llm-prediction": "Sentence A: They are waving at each other.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: The girls are elbowing each other.\nCaption B: The girls are shaking hands.", "gt-answer": "Caption A: The girls are elbowing each other.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1065272113": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The doctor is giving the woman an injection.\nSentence B: The doctor is taking her blood pressure.", "gt-answer": "Sentence A: The doctor is giving the woman an injection.", "video-llm-prediction": "A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The doctor is giving the woman an injection.\nCaption B: The doctor is checking her temperature.", "gt-answer": "Caption A: The doctor is giving the woman an injection.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The doctor is applying a bandage.\nOption 2: The doctor is giving the woman an injection.", "gt-answer": "Option 2: The doctor is giving the woman an injection.", "video-llm-prediction": "Option 2: The doctor is giving the woman an injection.", "match_success": true, "rating": 1 } ] }, "1072971926": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is wrapping the pizza box.\nSentence B: The man is opening the pizza box.", "gt-answer": "Sentence B: The man is opening the pizza box.", "video-llm-prediction": "Sentence A: The man is wrapping the pizza box.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: A video showing a person eating the pizza.\nCaption B: The man is opening the pizza box.", "gt-answer": "Caption B: The man is opening the pizza box.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man is opening the pizza box.\nOption 2: The man is closing the pizza box.", "gt-answer": "Option 1: The man is opening the pizza box.", "video-llm-prediction": "Option 1: The man is opening the pizza box.", "match_success": true, "rating": 1 } ] }, "3399348431": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: A person is mixing the sauce with the vegetables.\nSentence B: A person is squeezing the sauce into a bowl of vegetables.", "gt-answer": "Sentence B: A person is squeezing the sauce into a bowl of vegetables.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A video showing dipping vegetables on the sauce.\nOption 2: A person is squeezing the sauce into a bowl of vegetables.", "gt-answer": "Option 2: A person is squeezing the sauce into a bowl of vegetables.", "video-llm-prediction": "Option 2: A person is squeezing the sauce into a bowl of vegetables.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A person is mixing sauce with the vegetables.\nCaption B: A person is squeezing the sauce into a bowl of vegetables.", "gt-answer": "Caption B: A person is squeezing the sauce into a bowl of vegetables.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1025137214": { "action": [ { "question": "Which caption matches the video better?\nCaption A: A man is walking.\nCaption B: A man is smoking a cigarette.", "gt-answer": "Caption B: A man is smoking a cigarette.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is talking on the phone.\nSentence B: A man is smoking a cigarette.", "gt-answer": "Sentence B: A man is smoking a cigarette.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man is reading a book.\nOption 2: A man is smoking a cigarette.", "gt-answer": "Option 2: A man is smoking a cigarette.", "video-llm-prediction": "Option 2: A man is smoking a cigarette.", "match_success": true, "rating": 1 } ] }, "1033101626": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The man is cooking.\nCaption B: The man is playing an instrument.", "gt-answer": "Caption B: The man is playing an instrument.", "video-llm-prediction": "B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man is playing an instrument.\nOption 2: A video showing the man singing.", "gt-answer": "Option 1: The man is playing an instrument.", "video-llm-prediction": "Option 1: The man is playing an instrument.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is playing an instrument.\nSentence B: The man is reading.", "gt-answer": "Sentence A: The man is playing an instrument.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "9537677": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The woman is reading a book.\nCaption B: The woman is engaged in playing a tablet.", "gt-answer": "Caption B: The woman is engaged in playing a tablet.", "video-llm-prediction": "B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is engaged in playing a tablet.\nOption 2: A video showing the woman watching TV.", "gt-answer": "Option 1: The woman is engaged in playing a tablet.", "video-llm-prediction": "Option 1: The woman is engaged in playing a tablet.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is engaged in playing a tablet.\nSentence B: The woman is dancing.", "gt-answer": "Sentence A: The woman is engaged in playing a tablet.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 } ] }, "1077388994": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is singing.\nOption 2: The woman is patting the baby.", "gt-answer": "Option 2: The woman is patting the baby.", "video-llm-prediction": "Option 2: The woman is patting the baby.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is patting the baby.\nSentence B: A person dancing.", "gt-answer": "Sentence A: The woman is patting the baby.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: She is cooking dinner.\nCaption B: The woman is patting the baby.", "gt-answer": "Caption B: The woman is patting the baby.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1037531786": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The two men are boxing.\nOption 2: The two men are dancing.", "gt-answer": "Option 1: The two men are boxing.", "video-llm-prediction": "Option 1: The two men are boxing.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The two men are cooking.\nCaption B: The two men are boxing.", "gt-answer": "Caption B: The two men are boxing.", "video-llm-prediction": "Caption B: The two men are boxing.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The two men are painting.\nSentence B: The two men are boxing.", "gt-answer": "Sentence B: The two men are boxing.", "video-llm-prediction": "B", "match_success": true, "rating": 1 } ] }, "1077275024": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The girl is jumping with a man playing ukulele.\nOption 2: The girl is clapping hands for a man playing ukulele.", "gt-answer": "Option 2: The girl is clapping hands for a man playing ukulele.", "video-llm-prediction": "Option 2: The girl is clapping hands for a man playing ukulele.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The girl is clapping hands for a man playing ukulele.\nCaption B: The girl is dancing with a man playing ukulele.", "gt-answer": "Caption A: The girl is clapping hands for a man playing ukulele.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The girl is clapping hands for a man playing ukulele.\nSentence B: The girl is sitting and watching a man play the ukulele.", "gt-answer": "Sentence A: The girl is clapping hands for a man playing ukulele.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 } ] }, "1107180099": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The women are singing.\nCaption B: The women are dancing.", "gt-answer": "Caption B: The women are dancing.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The women are dancing.\nOption 2: The women are painting.", "gt-answer": "Option 1: The women are dancing.", "video-llm-prediction": "Option 1: The women are dancing.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The women are dancing.\nSentence B: The women are cooking.", "gt-answer": "Sentence A: The women are dancing.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 } ] }, "1049434159": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The parents and daughter are cooking dinner.\nOption 2: The parents and daughter are playing video games together.", "gt-answer": "Option 2: The parents and daughter are playing video games together.", "video-llm-prediction": "Option 2: The parents and daughter are playing video games together.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The parents and daughter are playing video games together.\nCaption B: A family engagement in reading a book.", "gt-answer": "Caption A: The parents and daughter are playing video games together.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: They are hiking together.\nSentence B: The parents and daughter are playing video games together.", "gt-answer": "Sentence B: The parents and daughter are playing video games together.", "video-llm-prediction": "B", "match_success": true, "rating": 1 } ] }, "3402234027": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The cat is sleeping.\nOption 2: The cat is eating food.", "gt-answer": "Option 2: The cat is eating food.", "video-llm-prediction": "Option 2: The cat is eating food.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The cat is eating food.\nSentence B: A video showing the cat playing with a toy.", "gt-answer": "Sentence A: The cat is eating food.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The cat is swimming.\nCaption B: The cat is eating food.", "gt-answer": "Caption B: The cat is eating food.", "video-llm-prediction": "B", "match_success": true, "rating": 1 } ] }, "1056491306": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: Passengers are taking a seat on the subway.\nSentence B: Passengers are sleeping on the bed.", "gt-answer": "Sentence A: Passengers are taking a seat on the subway.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: Passengers are taking a seat on the subway.\nOption 2: Passengers are standing in a room.", "gt-answer": "Option 1: Passengers are taking a seat on the subway.", "video-llm-prediction": "Option 1: Passengers are taking a seat on the subway.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: Passengers are taking a seat on the subway.\nCaption B: Passengers are dancing on the stage.", "gt-answer": "Caption A: Passengers are taking a seat on the subway.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 } ] }, "1012085909": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The people are rolling boats on the lake.\nSentence B: The people in the video are pushing bikes.", "gt-answer": "Sentence B: The people in the video are pushing bikes.", "video-llm-prediction": "Sentence B: The people in the video are pushing bikes.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A group of people walking on the street.\nCaption B: The people in the video are pushing bikes.", "gt-answer": "Caption B: The people in the video are pushing bikes.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The people in the video are pushing bikes.\nOption 2: The people are swimming.", "gt-answer": "Option 1: The people in the video are pushing bikes.", "video-llm-prediction": "Option 1: The people in the video are pushing bikes.", "match_success": true, "rating": 1 } ] }, "1079406734": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: A man is washing the car in the video.\nOption 2: The man is riding bikes.", "gt-answer": "Option 1: A man is washing the car in the video.", "video-llm-prediction": "Option 1: A man is washing the car in the video.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A man is washing the car in the video.\nSentence B: The man is taking a shower.", "gt-answer": "Sentence A: A man is washing the car in the video.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The man is driving the car.\nCaption B: A man is washing the car in the video.", "gt-answer": "Caption B: A man is washing the car in the video.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1022115337": { "order": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The order of events is sending an air kiss first and then dancing.\nSentence B: The order of events is dancing first and then sending an air kiss.", "gt-answer": "Sentence B: The order of events is dancing first and then sending an air kiss.", "video-llm-prediction": "Sentence A: The order of events is sending an air kiss first and then dancing.", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The order of events is dancing first and then sending an air kiss.\nOption 2: The order of events is sending an air kiss first and then dancing.", "gt-answer": "Option 1: The order of events is dancing first and then sending an air kiss.", "video-llm-prediction": "Option 1: The order of events is dancing first and then sending an air kiss.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The girl is dancing first and then sending an air kiss.\nCaption B: The girl is sending an air kiss first and then dancing.", "gt-answer": "Caption A: The girl is dancing first and then sending an air kiss.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 0 } ] }, "1071214909": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: A person is drawing a cutting line on a wood surface.\nSentence B: The person is carving a design on the wood surface.", "gt-answer": "Sentence A: A person is drawing a cutting line on a wood surface.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: A video showing a person measuring the length on a wood surface.\nCaption B: A person is drawing a cutting line on a wood surface.", "gt-answer": "Caption B: A person is drawing a cutting line on a wood surface.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The person is sanding the surface of the wood.\nOption 2: A person is drawing a cutting line on a wood surface.", "gt-answer": "Option 2: A person is drawing a cutting line on a wood surface.", "video-llm-prediction": "Option 2: A person is drawing a cutting line on a wood surface.", "match_success": true, "rating": 1 } ] }, "1092065543": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is eating.\nSentence B: The man is shopping.", "gt-answer": "Sentence B: The man is shopping.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The man is shopping.\nCaption B: The man is sleeping.", "gt-answer": "Caption A: The man is shopping.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The man is running.\nOption 2: The man is shopping.", "gt-answer": "Option 2: The man is shopping.", "video-llm-prediction": "Option 2: The man is shopping.", "match_success": true, "rating": 1 } ] }, "1017063703": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The person is baking cakes.\nCaption B: A person in the video is decorating cakes.", "gt-answer": "Caption B: A person in the video is decorating cakes.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A person in the video is decorating cakes.\nSentence B: The person is cutting cakes.", "gt-answer": "Sentence A: A person in the video is decorating cakes.", "video-llm-prediction": "Sentence A: A person in the video is decorating cakes.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A video showing someone presenting cakes to guests.\nOption 2: A person in the video is decorating cakes.", "gt-answer": "Option 2: A person in the video is decorating cakes.", "video-llm-prediction": "Option 2: A person in the video is decorating cakes.", "match_success": true, "rating": 1 } ] }, "1055022287": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The man in the video is cooking.\nCaption B: The man in the video is cleaning a swimming pool.", "gt-answer": "Caption B: The man in the video is cleaning a swimming pool.", "video-llm-prediction": "B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: A person in the video is singing.\nOption 2: The man in the video is cleaning a swimming pool.", "gt-answer": "Option 2: The man in the video is cleaning a swimming pool.", "video-llm-prediction": "Option 2: The man in the video is cleaning a swimming pool.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man in the video is cleaning a swimming pool.\nSentence B: The man is playing basketball.", "gt-answer": "Sentence A: The man in the video is cleaning a swimming pool.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 1 } ] }, "1094259081": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The woman is watching a VR movie.\nSentence B: A woman is playing a VR shooting game in the video.", "gt-answer": "Sentence B: A woman is playing a VR shooting game in the video.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 0 }, { "question": "Which caption matches the video better?\nCaption A: A woman is playing a VR shooting game in the video.\nCaption B: She is listening to music using VR.", "gt-answer": "Caption A: A woman is playing a VR shooting game in the video.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The woman is working out using VR.\nOption 2: A woman is playing a VR shooting game in the video.", "gt-answer": "Option 2: A woman is playing a VR shooting game in the video.", "video-llm-prediction": "Option 1: The woman is working out using VR.", "match_success": true, "rating": 0 } ] }, "1022260408": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The squirrel is sleeping.\nCaption B: The squirrel is eating food.", "gt-answer": "Caption B: The squirrel is eating food.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: A video showing a squirrel running.\nSentence B: The squirrel is eating food.", "gt-answer": "Sentence B: The squirrel is eating food.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The squirrel is chasing its tail.\nOption 2: The squirrel is eating food.", "gt-answer": "Option 2: The squirrel is eating food.", "video-llm-prediction": "Option 2: The squirrel is eating food.", "match_success": true, "rating": 1 } ] }, "1089810969": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The little girl runs towards the woman and hugs her.\nCaption B: The little girl walks towards the woman and hugs her.", "gt-answer": "Caption A: The little girl runs towards the woman and hugs her.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The little girl runs towards the woman and jumps on her back.\nSentence B: The little girl runs towards the woman and hugs her.", "gt-answer": "Sentence B: The little girl runs towards the woman and hugs her.", "video-llm-prediction": "Sentence A", "match_success": true, "rating": 0 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The little girl runs towards the woman and talks to her.\nOption 2: The little girl runs towards the woman and hugs her.", "gt-answer": "Option 2: The little girl runs towards the woman and hugs her.", "video-llm-prediction": "Option 2: The little girl runs towards the woman and hugs her.", "match_success": true, "rating": 1 } ] }, "6165731": { "action": [ { "question": "Which sentence better captures the essence of the video?\nSentence A: The scientist is looking confused and frustrated about the experiment.\nSentence B: The scientist is observing quietly on the experiment.", "gt-answer": "Sentence A: The scientist is looking confused and frustrated about the experiment.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The scientist is looking confused and frustrated about the experiment.\nOption 2: The scientist is leisurely conducting experiments.", "gt-answer": "Option 1: The scientist is looking confused and frustrated about the experiment.", "video-llm-prediction": "Option 1: The scientist is looking confused and frustrated about the experiment.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The scientist feels excited and motivated about the experiment.\nCaption B: The scientist is looking confused and frustrated about the experiment.", "gt-answer": "Caption B: The scientist is looking confused and frustrated about the experiment.", "video-llm-prediction": "Caption A", "match_success": true, "rating": 0 } ] }, "1111469465": { "action": [ { "question": "Which description is a more suitable match for the video?\nOption 1: The man is raising his arm to look at the watch and he appears anxious.\nOption 2: The man is raising his arm to look at the watch and he looks happy.", "gt-answer": "Option 1: The man is raising his arm to look at the watch and he appears anxious.", "video-llm-prediction": "Option 1: The man is raising his arm to look at the watch and he appears anxious.", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man is raising his arm to look at the watch and he appears anxious.\nSentence B: The man is raising his arm to look at the watch and he looks calm.", "gt-answer": "Sentence A: The man is raising his arm to look at the watch and he appears anxious.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which caption matches the video better?\nCaption A: The man is putting his hands in his pockets and he looks anxious.\nCaption B: The man is raising his arm to look at the watch and he appears anxious.", "gt-answer": "Caption B: The man is raising his arm to look at the watch and he appears anxious.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 } ] }, "1050000214": { "action": [ { "question": "Which caption matches the video better?\nCaption A: The little child is running.\nCaption B: The little child is performing water jumping.", "gt-answer": "Caption B: The little child is performing water jumping.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The little child is swimming.\nSentence B: The little child is performing water jumping.", "gt-answer": "Sentence B: The little child is performing water jumping.", "video-llm-prediction": "Sentence B", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The little child is dancing.\nOption 2: The little child is performing water jumping.", "gt-answer": "Option 2: The little child is performing water jumping.", "video-llm-prediction": "Option 2: The little child is performing water jumping.", "match_success": true, "rating": 1 } ] }, "27537016": { "order": [ { "question": "Which caption matches the video better?\nCaption A: The man pulls back inside the car and then shows off his car key.\nCaption B: The man first shows off his car key, then pulls back inside the car.", "gt-answer": "Caption B: The man first shows off his car key, then pulls back inside the car.", "video-llm-prediction": "Caption B", "match_success": true, "rating": 1 }, { "question": "Which sentence better captures the essence of the video?\nSentence A: The man first shows off his car key, then pulls back inside the car.\nSentence B: After pulling back inside the car, the man shows off his car key.", "gt-answer": "Sentence A: The man first shows off his car key, then pulls back inside the car.", "video-llm-prediction": "Sentence A.", "match_success": true, "rating": 1 }, { "question": "Which description is a more suitable match for the video?\nOption 1: The correct order of actions by the man is throwing away his car key, then pulling back inside the car.\nOption 2: The man first shows off his car key, then pulls back inside the car.", "gt-answer": "Option 2: The man first shows off his car key, then pulls back inside the car.", "video-llm-prediction": "Option 1: The correct order of actions by the man is throwing away his car key, then pulling back inside the car.", "match_success": true, "rating": 0 } ] } }