from typing import Any, Dict

import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration


class EndpointHandler:
    def __init__(self, path="", vision_model="obsidian3b"):
        # Load the model in half precision and move it to the GPU when one is available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = LlavaForConditionalGeneration.from_pretrained(
            "NousResearch/Obsidian-3B-V0.5",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
        ).to(self.device)
        self.processor = AutoProcessor.from_pretrained("NousResearch/Obsidian-3B-V0.5")
    def __call__(self, data: Dict[str, Any]) -> str:
        """
        data args:
            inputs (:obj:`str`): the text prompt
            image (:obj:`PIL.Image.Image`): the input image
        Return:
            A :obj:`str`: the generated text, which will be serialized and returned
        """
        # Get inputs.
        inputs = data.pop("inputs", "")
        image = data.pop("image", None)
        # Preprocess the prompt and image, moving the tensors to the model's device.
        model_inputs = self.processor(text=inputs, images=image, return_tensors="pt").to(self.device)
        output_ids = self.model.generate(**model_inputs, do_sample=False, max_new_tokens=4096)
        return self.processor.decode(output_ids[0], skip_special_tokens=True)
        # if image:
        #     # Perform image classification using Obsidian 3b vision.
        #     image_features = self.vision.encode_image(image)
        #     image_embedding = self.vision.extract_feature(image_features)
        #     image_caption = self.vision.generate_caption(image_embedding)
        #     # Combine text and image captions.
        #     combined_captions = [inputs, image_caption]
        #     # Run text classification on combined captions.
        #     prediction = self.pipeline(combined_captions, temperature=0.33, num_beams=5, stop=[], do_sample=True)
        #     return prediction
        # else:
        #     # Run text classification on plain text input.
        #     prediction = self.pipeline(inputs, temperature=0.33, num_beams=5, stop=[], do_sample=True)
        #     return prediction
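

# A minimal local usage sketch, assuming Pillow is installed and that the prompt
# uses the LLaVA-style "<image>" placeholder; "cat.jpg" is a hypothetical example
# path, not part of the original handler.
if __name__ == "__main__":
    from PIL import Image

    handler = EndpointHandler()
    image = Image.open("cat.jpg")  # hypothetical example image
    result = handler({"inputs": "<image>\nDescribe this image.", "image": image})
    print(result)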