Adjusted readme, changed response, removed log.
- README.md +52 −0
- handler.py +2 −4
README.md CHANGED

```diff
@@ -28,6 +28,58 @@ You can use this model for conditional and un-conditional image captioning
 
 ### Using the Pytorch model
 
+#### Running inference
+
+JSON Payload:
+<details>
+<summary> Click to expand </summary>
+
+```json
+{
+  "secret_token": "optional",
+  "inputs": {
+    "texts": [
+      [
+        "Is it a person?",
+        "What skin color?",
+        "What person wears?",
+        "Is person solo?",
+        "What is person mood?",
+        "What is person doing?"
+      ]
+    ],
+    "images": [
+      {
+        "url": "https://example.com"
+      }
+    ]
+  }
+}
+```
+</details>
+
+JSON Response:
+<details>
+<summary> Click to expand </summary>
+
+```json
+{
+  "captions": [
+    {
+      "image_results": [
+        "yes",
+        "white",
+        "naked",
+        "yes",
+        "happy",
+        "taking selfie"
+      ]
+    }
+  ]
+}
+```
+</details>
+
 #### Running the model on CPU
 
 <details>
```
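For reference, here is a minimal sketch of calling a deployed endpoint with the payload documented above, using Python's `requests`. The endpoint URL is a placeholder, and whether `secret_token` is actually required depends on the deployment.

```python
import requests

# Placeholder URL -- replace with your own deployed inference endpoint.
ENDPOINT_URL = "https://your-endpoint.example.com"

# Payload shape taken from the README above: one list of questions per image.
payload = {
    "secret_token": "optional",
    "inputs": {
        "texts": [
            ["Is it a person?", "What is person doing?"]
        ],
        "images": [
            {"url": "https://example.com"}
        ],
    },
}

response = requests.post(ENDPOINT_URL, json=payload)
response.raise_for_status()

# Expected response shape: {"captions": [{"image_results": [...]}]}
for caption in response.json()["captions"]:
    print(caption["image_results"])
```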
handler.py CHANGED

```diff
@@ -52,8 +52,6 @@ class EndpointHandler():
         image_captions = []  # Store answers for each image
 
         for question in questions:
-            print(f"Question: {question}")
-
             # Process the image and question
             processed_input = self.processor(image, question, return_tensors="pt").to(device)
 
@@ -61,10 +59,10 @@ class EndpointHandler():
             out = self.model.generate(**processed_input)
 
             # Decode the answer
-
+            answer = self.processor.batch_decode(out, skip_special_tokens=True)[0]
 
             # Add the answer to the list for the current image
-            image_captions.append(
+            image_captions.append(answer)
 
         # Store results for the current image
         results.append({"image_results": image_captions})
```
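For context, a standalone sketch of how the question loop reads after this commit. It assumes a BLIP-style VQA checkpoint from `transformers`; the checkpoint name, image loading, and the scaffolding around the loop are assumptions, and only the loop body mirrors the diff.

```python
import requests
import torch
from PIL import Image
from transformers import BlipForQuestionAnswering, BlipProcessor

# Assumed checkpoint -- the repo's handler may load a different model.
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)

# Hypothetical inputs mirroring the README payload.
image = Image.open(requests.get("https://example.com", stream=True).raw).convert("RGB")
questions = ["Is it a person?", "What is person doing?"]

results = []
image_captions = []  # Store answers for each image

for question in questions:
    # Process the image and question
    processed_input = processor(image, question, return_tensors="pt").to(device)

    # Generate the answer tokens
    out = model.generate(**processed_input)

    # Decode the answer; batch_decode returns a list of strings, [0] is the first
    answer = processor.batch_decode(out, skip_special_tokens=True)[0]

    # Add the answer to the list for the current image
    image_captions.append(answer)

# Store results for the current image
results.append({"image_results": image_captions})
print(results)  # e.g. [{"image_results": ["yes", "taking selfie"]}]
```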