File size: 2,577 Bytes
8091862
4833b6f
8091862
 
4833b6f
80b4326
 
 
fc15d59
6221641
80b4326
8c792ce
df74d65
 
 
 
8c792ce
df74d65
8c792ce
80b4326
e2452eb
 
80b4326
 
 
 
 
 
0da27a0
df74d65
bc2751f
1355606
80b4326
 
 
 
 
 
8c792ce
df74d65
80b4326
 
df74d65
 
 
 
 
 
 
654ca8c
 
bc2751f
2e80f2b
654ca8c
df74d65
75c9a0c
df74d65
2e80f2b
654ca8c
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import subprocess
import sys

# Install SentencePiece at startup (required by the MarianMT tokenizer used
# below for English -> Indonesian translation). Use the *running*
# interpreter's pip (sys.executable -m pip) so the package lands in the same
# environment this script executes in, and check=True so a failed install
# aborts instead of failing later with a confusing import error.
subprocess.run([sys.executable, "-m", "pip", "install", "sentencepiece"], check=True)

from PIL import Image # library untuk image
import gradio as gr # library untuk tampilan interface di huggingface
from transformers import BlipProcessor, BlipForConditionalGeneration,MarianTokenizer, MarianMTModel #library blip (image captioning) dan marian untuk translate
import torch

# Pick the compute device once: GPU when available, else CPU.
# (The original recomputed this identical value three times.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# MarianMT model for translating the English caption into Indonesian.
# Kept on CPU: generate_caption feeds it CPU tensors from the tokenizer.
translation_model_id = "Helsinki-NLP/opus-mt-en-id"
translation_model = MarianMTModel.from_pretrained(translation_model_id)
translation_tokenizer = MarianTokenizer.from_pretrained(translation_model_id)

# BLIP model for generating the English caption; moved to the selected device.
caption_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)

def generate_caption(input_image):
    """Caption an image in English with BLIP, then translate to Indonesian.

    Args:
        input_image: PIL image in any mode; converted to RGB before encoding.

    Returns:
        Tuple of ``(english_caption, indonesian_caption)`` strings.
    """
    # BLIP expects 3-channel RGB input.
    image = input_image.convert('RGB')

    # Move the pixel tensors to the same device as caption_model.
    # The original left them on CPU, which crashes whenever CUDA is in use.
    inputs = caption_processor(images=image, return_tensors="pt").to(device)

    # Pass decoding options as explicit generate() kwargs instead of stuffing
    # them into the processor's output mapping (fragile: mixes ints with
    # tensors and breaks .to(device)). Short sampled captions via top-k/top-p.
    caption_output = caption_model.generate(
        **inputs,
        max_length=20,
        num_beams=1,
        do_sample=True,
        top_k=50,
        top_p=0.95,
    )
    english_caption = caption_processor.decode(caption_output[0], skip_special_tokens=True)

    # Translate the English caption to Indonesian with MarianMT (CPU tensors,
    # matching the CPU-resident translation model).
    translation_inputs = translation_tokenizer.encode(
        english_caption, return_tensors="pt", max_length=512, truncation=True
    )
    translation_output = translation_model.generate(translation_inputs)
    indonesian_caption = translation_tokenizer.decode(translation_output[0], skip_special_tokens=True)

    return english_caption, indonesian_caption


# Gradio UI: one image input, two text outputs (English + Indonesian caption).
# gr.inputs.* / gr.outputs.* are the deprecated pre-3.x namespaces (removed in
# current Gradio); use the top-level component classes instead.
iface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Textbox(label="English caption"),
        gr.Textbox(label="Indonesian caption"),
    ],
    live=True,
)
iface.launch()