from typing import Any, Dict

import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration


class EndpointHandler:
    def __init__(self, path="", vision_model="obsidian3b"):
        # Load the model in half precision and move it to the GPU when one is available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = LlavaForConditionalGeneration.from_pretrained(
            "NousResearch/Obsidian-3B-V0.5",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
        ).to(self.device)
        self.processor = AutoProcessor.from_pretrained("NousResearch/Obsidian-3B-V0.5")
    def __call__(self, data: Dict[str, Any]) -> str:
        """
        data args:
            inputs (:obj:`str`): the text prompt
            image (:obj:`PIL.Image.Image`): the input image
        Return:
            A :obj:`str`: the generated text, which will be serialized and returned
        """
        # Get inputs.
        inputs = data.pop("inputs", "")
        image = data.pop("image", None)
        # Preprocess the prompt and image, moving the tensors to the model's device.
        model_inputs = self.processor(text=inputs, images=image, return_tensors="pt").to(self.device)
        output_ids = self.model.generate(**model_inputs, do_sample=False, max_new_tokens=4096)
        return self.processor.decode(output_ids[0], skip_special_tokens=True)
        # if image:
        #     # Perform image classification using Obsidian 3b vision.
        #     image_features = self.vision.encode_image(image)
        #     image_embedding = self.vision.extract_feature(image_features)
        #     image_caption = self.vision.generate_caption(image_embedding)
        #     # Combine text and image captions.
        #     combined_captions = [inputs, image_caption]
        #     # Run text classification on combined captions.
        #     prediction = self.pipeline(combined_captions, temperature=0.33, num_beams=5, stop=[], do_sample=True)
        #     return prediction
        # else:
        #     # Run text classification on plain text input.
        #     prediction = self.pipeline(inputs, temperature=0.33, num_beams=5, stop=[], do_sample=True)
        #     return prediction
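

# A minimal local usage sketch, assuming Pillow is installed and that the prompt
# uses the LLaVA-style "<image>" placeholder; "cat.jpg" is a hypothetical example
# path, not part of the original handler.
if __name__ == "__main__":
    from PIL import Image

    handler = EndpointHandler()
    image = Image.open("cat.jpg")  # hypothetical example image
    result = handler({"inputs": "<image>\nDescribe this image.", "image": image})
    print(result)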