|
import re |
|
import jaconv |
|
import gradio as gr |
|
from transformers import AutoTokenizer, AutoFeatureExtractor, VisionEncoderDecoderModel |
|
from PIL import Image |
|
import torch, time |
|
import spaces |
|
|
|
# Hugging Face Hub checkpoint shared by the tokenizer, the image
# preprocessor and the OCR model.
MODEL_ID = "kha-white/manga-ocr-base"

# Text decoding and image preprocessing components.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID)

# Vision encoder-decoder model; moved to the GPU once, at import time.
model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
model.to("cuda")
|
|
|
def post_process(text):
    """Normalize decoded OCR text.

    Steps, in order: remove all whitespace, expand the single-character
    ellipsis to three ASCII dots, rewrite every run of two or more dots or
    middle dots as the same number of ASCII dots, then pass the result
    through ``jaconv.h2z`` with ASCII and digit conversion enabled.
    """
    # split() with no argument splits on any whitespace run, so joining the
    # pieces back together drops every whitespace character.
    compact = ''.join(text.split())
    expanded = compact.replace('…', '...')
    # Keep the length of each run while unifying its characters to '.'.
    dotted = re.sub(r'[・.]{2,}', lambda run: '.' * len(run.group()), expanded)
    return jaconv.h2z(dotted, ascii=True, digit=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@spaces.GPU(duration=8)
def manga_ocr(imgs):
    """Run OCR on each uploaded image and join the recognized texts.

    Args:
        imgs: Iterable of image files; each item is passed straight to
            ``PIL.Image.open``. ``None`` or an empty collection (nothing
            uploaded) is accepted.

    Returns:
        The post-processed text of every image, in input order, joined with
        ``"|||"``. An empty string when there are no images.
    """
    # With nothing uploaded the input can be None; iterating it would raise
    # TypeError. Return what an empty list would have produced instead.
    if not imgs:
        return ""

    texts = []
    for img in imgs:
        # convert() returns a new, fully loaded image, so the handle Pillow
        # opened can be released as soon as the conversion is done.
        with Image.open(img) as source:
            # Round-trip through grayscale: an RGB image with no colour info.
            rgb = source.convert('L').convert('RGB')

        pixel_values = feature_extractor(rgb, return_tensors="pt").pixel_values.to("cuda")

        # perf_counter() is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        output = model.generate(pixel_values)[0]
        print("Time taken for OCR:", time.perf_counter() - start_time)

        text = tokenizer.decode(output, skip_special_tokens=True)
        texts.append(post_process(text))

    return "|||".join(texts)
|
|
|
|
|
# Upload widget: one or more files, restricted to images.
image_upload = gr.File(file_types=["image"], file_count="multiple")

# Web UI wiring the upload widget to manga_ocr and showing its text output.
iface = gr.Interface(
    fn=manga_ocr,
    inputs=image_upload,
    outputs="text",
    title="Manga OCR",
    description="Extract Manga in lighting speed ⚡",
)

# Start serving the interface.
iface.launch()