"""Gradio demo that detects faces and predicts per-face attributes.

Faces are located with OpenCV's frontal-face Haar cascade, cropped, and fed to
several Hugging Face image classifiers (age, gender, beard, blur, ethnicity and
mask). Results are returned as JSON-serialisable dictionaries.
"""
import urllib.request

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    ViTForImageClassification,
    ViTImageProcessor,
)

torch.backends.cudnn.benchmark = True

# Download the Haar cascade definition next to the script and load it.
path = 'https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml'
urllib.request.urlretrieve(path, path.split('/')[-1])
face_cascade = cv2.CascadeClassifier('./haarcascade_frontalface_default.xml')


class Base:
    """Preprocessing settings shared by the age, gender, beard and blur models."""

    size = 224            # square side length (pixels) the face crop is resized to
    scale = 1. / 255.     # maps uint8 pixel values into [0, 1]
    # Shaped (1, 1, 1, C) so they broadcast over channels-last (N, H, W, C) batches.
    mean = np.array([.5] * 3).reshape(1, 1, 1, -1)
    std = np.array([.5] * 3).reshape(1, 1, 1, -1)
    resample = 2          # PIL resampling filter id (2 is bilinear)


class ethnicityConfig(Base):
    """Settings for the ethnicity model: same as ``Base`` but 384x384 input."""

    size = 384


class maskConfig(Base):
    """Settings for the mask model: different resampling filter and statistics."""

    resample = 3          # PIL resampling filter id (3 is bicubic)
    mean = np.array([.485] * 3).reshape(1, 1, 1, -1)
    std = np.array([.229] * 3).reshape(1, 1, 1, -1)


# Hugging Face Hub model identifiers.
AGE = "nateraw/vit-age-classifier"
GENDER = 'rizvandwiki/gender-classification-2'
ETHNICITY = 'cledoux42/Ethnicity_Test_v003'
MASK = 'DamarJati/Face-Mask-Detection'
BLUR = 'WT-MM/vit-base-blur'
BEARD = 'dima806/beard_face_image_detection'

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Models that share the ``Base`` preprocessing.
age_model = ViTForImageClassification.from_pretrained(AGE).to(device)
gender_model = ViTForImageClassification.from_pretrained(GENDER).to(device)
beard_model = ViTForImageClassification.from_pretrained(BEARD).to(device)
blur_model = ViTForImageClassification.from_pretrained(BLUR).to(device)
# Models with their own preprocessing (see ethnicityConfig / maskConfig).
ethnicity_model = ViTForImageClassification.from_pretrained(ETHNICITY).to(device)
mask_model = AutoModelForImageClassification.from_pretrained(MASK).to(device)


def normalize(data, mean, std):
    """Standardise ``data`` as ``(data - mean) / std`` and cast to float32.

    ``mean`` and ``std`` must broadcast against ``data``; the config classes in
    this file shape them as (1, 1, 1, C), i.e. for channels-last batches.
    """
    data = (data - mean) / std
    return data.astype(np.float32)


def resize(image, size=224, resample=2):
    """Resize a PIL image to ``size`` x ``size`` and return it as a NumPy array.

    Args:
        image: object exposing PIL's ``resize(size, resample=...)`` method.
        size: side length of the square output.
        resample: PIL resampling filter id.
    """
    image = image.resize((size, size), resample=resample)
    return np.array(image)


def rescale(data, scale=Base.scale):
    """Multiply ``data`` by ``scale`` (by default 1/255)."""
    return data * scale


def ParallelBatchsPredict(data, MODELS, nbatchs=16):
    """Run every model in ``MODELS`` over ``data`` in mini-batches.

    Args:
        data: preprocessed float array in channels-last layout (N, H, W, C).
        MODELS: classifiers whose output has ``.logits`` and whose ``config``
            provides ``id2label``.
        nbatchs: number of samples per forward pass.

    Returns:
        One list per model (same order as ``MODELS``), each holding the
        predicted label string for every sample in ``data``.
    """
    # The models expect channels-first input: (N, C, H, W).
    data = np.transpose(data, (0, 3, 1, 2))
    predictions = [[] for _ in MODELS]

    with torch.no_grad():
        for start in range(0, data.shape[0], nbatchs):
            batch = torch.from_numpy(data[start:start + nbatchs]).to(device)
            for labels, model in zip(predictions, MODELS):
                # argmax over raw logits; a softmax first would not change
                # which class is the largest.
                class_ids = model(batch).logits.argmax(1).tolist()
                labels.extend(model.config.id2label[i] for i in class_ids)

    return predictions


def _preprocess(rawFaces, config):
    """Resize, rescale and normalise face crops using a config class."""
    batch = np.array([
        resize(face, size=config.size, resample=config.resample)
        for face in rawFaces
    ])
    return normalize(rescale(batch), config.mean, config.std)


def AnalysisFeatures(rawFaces):
    """Predict all attributes for a list of PIL face crops.

    Returns:
        ``ages, genders, beards, blurs, ethncities, masks`` - six sequences
        aligned with ``rawFaces``. ``beards``, ``blurs`` and ``masks`` are
        booleans; the others are the models' label strings. Six empty lists
        are returned when ``rawFaces`` is empty.
    """
    if len(rawFaces) == 0:
        # Independent lists, so a caller mutating one does not affect the rest.
        return [[] for _ in range(6)]

    # Age, gender, beard and blur share one preprocessing pipeline.
    ages, genders, beards, blurs = ParallelBatchsPredict(
        _preprocess(rawFaces, Base),
        [age_model, gender_model, beard_model, blur_model],
    )
    # Ethnicity and mask each need their own preprocessing.
    ethncities = ParallelBatchsPredict(
        _preprocess(rawFaces, ethnicityConfig), [ethnicity_model]
    )[0]
    masks = ParallelBatchsPredict(
        _preprocess(rawFaces, maskConfig), [mask_model]
    )[0]

    # Collapse the binary classifiers' labels into booleans.
    beards = [beard == 'Beard' for beard in beards]
    blurs = [blur == 'blurry' for blur in blurs]
    masks = [mask == 'WithMask' for mask in masks]

    return ages, genders, beards, blurs, ethncities, masks


def frameWrapper(facesCo, ages, genders, beards, blurs, ethncities, masks):
    """Bundle the per-face predictions of one frame into a result dictionary.

    ``facesCo`` holds ``(x, y, w, h)`` boxes; the remaining arguments are
    sequences aligned with it.
    """
    return {
        'identifiedPersonCount': len(facesCo),
        'value': [
            {
                'coordinate': {'x': x, 'y': y, 'h': h, 'w': w},
                'ageGroup': age,
                'gender': gender,
                'beardPresent': beard,
                'blurOccur': blur,
                'ethncity': ethncity,
                'maskPresent': mask,
            }
            for (x, y, w, h), age, gender, beard, blur, ethncity, mask in zip(
                facesCo, ages, genders, beards, blurs, ethncities, masks
            )
        ],
    }


def postProcessed(rawfaces, maximunSize, minSize=30):
    """Drop detections whose in-image extent is not larger than ``minSize``.

    NOTE(review): the body of this function was partly lost in the source
    (the text between a ``<`` and a ``>`` was stripped). The clamping below is
    a reconstruction - confirm it against the original revision.

    Args:
        rawfaces: iterable of ``(x, y, w, h)`` boxes from the face detector.
        maximunSize: ``(height, width)`` of the image, i.e. ``image.shape[:2]``.
        minSize: minimum visible width and height, in pixels.

    Returns:
        List of the surviving ``(x, y, w, h)`` boxes as plain Python ints.
    """
    max_h, max_w = maximunSize
    faces = []
    for (x, y, w, h) in rawfaces:
        # Clamp the box to the image bounds before measuring it.
        x1, y1 = min(x, max_w), min(y, max_h)
        x2, y2 = min(x + w, max_w), min(y + h, max_h)
        if x2 - x1 > minSize and y2 - y1 > minSize:
            # Plain ints keep the final JSON payload free of NumPy scalars.
            faces.append((int(x), int(y), int(w), int(h)))
    return faces


def _crop_faces(image, boxes):
    """Cut each ``(x, y, w, h)`` box out of an RGB array as a PIL image."""
    # NumPy images are indexed [row, column], i.e. [y, x].
    return [
        Image.fromarray(image[y:y + h, x:x + w].copy(), mode='RGB')
        for (x, y, w, h) in boxes
    ]


def image_inference(image):
    """Detect faces in an RGB image array and describe each of them.

    Returns the ``frameWrapper`` dictionary for the image, or
    ``{'ErrorFound': 'ImageNotFound'}`` when no usable image was supplied.
    """
    if image is None or sum(image.shape) == 0:
        return {'ErrorFound': 'ImageNotFound'}

    image = np.asarray(image)
    # Gradio hands over RGB data, so convert from RGB (not BGR) to grayscale.
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    rawfaces = face_cascade.detectMultiScale(gray, 1.05, 5, minSize=(30, 30))
    rawfaces = postProcessed(rawfaces, image.shape[:2])

    faces = _crop_faces(image, rawfaces)
    ages, genders, beards, blurs, ethncities, masks = AnalysisFeatures(faces)

    return frameWrapper(rawfaces, ages, genders, beards, blurs, ethncities, masks)


def video_inference(video_path):
    """Detect and describe faces in every frame of a video file.

    Faces from all frames are collected first and classified in one pass, then
    the predictions are split back per frame.

    Returns:
        A list with one ``frameWrapper`` dictionary per decoded frame (empty
        when ``video_path`` is missing or no frame could be read).
    """
    if not video_path:
        return []

    global_facesCo = []   # per-frame list of face boxes
    global_faces = []     # flat list of face crops across all frames

    cap = cv2.VideoCapture(video_path)
    try:
        while cap.isOpened():
            ok, img = cap.read()
            if not ok or img is None:
                break  # end of stream or unreadable frame

            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            rawfaces = face_cascade.detectMultiScale(gray, 1.05, 6, minSize=(30, 30))

            # OpenCV decodes frames as BGR; the classifiers get RGB crops.
            image = np.asarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            rawfaces = postProcessed(rawfaces, image.shape[:2])

            global_facesCo.append(rawfaces)
            global_faces.extend(_crop_faces(image, rawfaces))
    finally:
        cap.release()

    features = AnalysisFeatures(global_faces)

    # Hand each frame its own slice of the flat prediction lists.
    total_extraction = []
    offset = 0
    for facesCo in global_facesCo:
        end = offset + len(facesCo)
        total_extraction.append(
            frameWrapper(facesCo, *(feature[offset:end] for feature in features))
        )
        offset = end

    return total_extraction


css = """
.outputJSON{
    overflow: scroll;
}
"""

imageHander = gr.Interface(
    fn=image_inference,
    inputs=gr.Image(type="numpy", sources='upload'),
    outputs=gr.JSON(elem_classes='outputJSON'),
    css=css,
)
videoHander = gr.Interface(
    fn=video_inference,
    inputs=gr.Video(sources='upload', max_length=30, include_audio=False),
    outputs='json',
)
demo = gr.TabbedInterface(
    [imageHander, videoHander],
    tab_names=['Image-to-Features', 'Video-to-Features'],
    title='Facial Feature Extraction',
)
demo.launch()