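# NOTE: "Spaces: Sleeping" appears to be page-status text captured from the
# hosting site's file viewer; it is not part of the script.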
import numpy as np | |
from tensorflow.keras.applications import ResNet50 | |
from tensorflow.keras.preprocessing import image | |
from tensorflow.keras.applications.resnet50 import preprocess_input | |
from sklearn.metrics.pairwise import cosine_similarity | |
import os | |
# Pre-trained ResNet50 used as a fixed feature extractor: the ImageNet
# classification head is left out (include_top=False) and the final feature
# maps are global-average-pooled (pooling='avg'), giving one vector per image.
model = ResNet50(
    weights='imagenet',
    include_top=False,
    pooling='avg',
)
def extract_features(img_path, model):
    """Return the flattened feature vector ``model`` predicts for one image.

    Args:
        img_path: Path of the image file to load.
        model: Object exposing ``predict``; it is called on a batch holding
            only this image.

    Returns:
        The prediction for the image, flattened to one dimension.
    """
    # Load the image resized to 224x224 and convert it to an array.
    pixels = image.img_to_array(image.load_img(img_path, target_size=(224, 224)))
    # predict() works on batches, so add a leading batch axis of size 1 before
    # applying the ResNet50 input preprocessing.
    batch = preprocess_input(np.expand_dims(pixels, axis=0))
    return model.predict(batch).flatten()
# Directory containing images
image_dir = './forward_facing'

# Extract features for all images, keyed by file name.
image_features = {}
# os.listdir() returns names in arbitrary order. Sorting makes the run
# reproducible, so the same file of a near-identical pair is always the one
# flagged as the duplicate later on.
for img_file in sorted(os.listdir(image_dir)):
    img_path = os.path.join(image_dir, img_file)
    if not os.path.isfile(img_path):
        # Subdirectories cannot be loaded as images.
        continue
    try:
        image_features[img_file] = extract_features(img_path, model)
    except OSError as exc:
        # Files that cannot be opened or decoded as images (hidden system
        # files, stray text files, truncated downloads) surface as OSError
        # subclasses; skip them instead of aborting the whole run.
        print(f"Skipping unreadable image {img_file}: {exc}")
# Convert feature dictionary to parallel lists: feature_list[k] belongs to
# file_list[k].
feature_list = list(image_features.values())
file_list = list(image_features.keys())

# Calculate the symmetric matrix of pairwise cosine similarities.
num_images = len(file_list)
if num_images:
    # A single vectorized call replaces one cosine_similarity() call per
    # pair. Features are cast to float64 so the stored values have the same
    # dtype as the np.zeros() matrix used previously.
    similarity_matrix = cosine_similarity(
        np.asarray(feature_list, dtype=np.float64)
    )
    # Self-similarity is not meaningful for duplicate detection; keep the
    # diagonal at zero.
    np.fill_diagonal(similarity_matrix, 0.0)
else:
    # cosine_similarity() rejects empty input; an empty directory simply
    # yields an empty matrix.
    similarity_matrix = np.zeros((0, 0))
# Identify duplicates
threshold = 0.9  # Pairs scoring strictly above this similarity count as duplicates

# For every pair (earlier, later) whose similarity exceeds the threshold, flag
# the later file. Only the upper triangle of the matrix is inspected.
duplicates = {
    file_list[later]
    for earlier in range(num_images)
    for later in range(earlier + 1, num_images)
    if similarity_matrix[earlier][later] > threshold
}

# Deletion is currently disabled; uncomment to remove the flagged files.
# for duplicate in duplicates:
#     os.remove(os.path.join(image_dir, duplicate))
print("Duplicate Images No => ", len(duplicates))
# print(f"Removed {len(duplicates)} duplicate images.")