图像去重
介绍
图像去重通常指的是去除完全相同的图像,即内容、颜色、尺寸、方向等都完全相同的图像。但是在实际应用中,也有相似图像去重的需求,即图像内容大致相同,而颜色、尺寸、方向等可能有所不同。因此,图像去重的对象既可以是完全一样的图像,也可以是相似的图像。
图像去重的方法有以下几种:
方法
1.哈希法:通过计算图像的散列值来识别重复图像。
2.图像比对法:通过对图像的直方图或灰度共生矩阵等特征进行比对来识别重复图像。
3.机器学习法:通过训练机器学习模型来识别重复图像,例如使用卷积神经网络(CNN)。
4.特征提取法:通过提取图像的特征,例如 SIFT 等,并将其映射到一个空间中,以识别重复图像。
这些方法的选择取决于图像去重的具体需求和数据的特征。
基于直方图进行图像比对
import cv2
import numpy as np


def compare_images(image1, image2):
    """Return the correlation between the channel-0 histograms of two images.

    Only channel 0 of each image is histogrammed (256 bins over the value
    range [0, 256)); for a colour image in OpenCV's default BGR layout that
    is the blue channel. The two histograms are compared with
    ``cv2.HISTCMP_CORREL``.

    Args:
        image1: First image array, as returned by ``cv2.imread``.
        image2: Second image array, as returned by ``cv2.imread``.

    Returns:
        The correlation score computed by ``cv2.compareHist``.
    """
    # Compute the histogram of each image
    hist1 = cv2.calcHist([image1], [0], None, [256], [0, 256])
    hist2 = cv2.calcHist([image2], [0], None, [256], [0, 256])

    # Compute the similarity of the two histograms
    similarity = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
    return similarity


def _read_image(path):
    """Load an image from *path*, raising OSError if it cannot be read."""
    image = cv2.imread(path)
    # cv2.imread signals a missing or undecodable file by returning None
    # rather than raising, which would otherwise surface later as an opaque
    # error inside cv2.calcHist.
    if image is None:
        raise OSError(f"Cannot read image: {path}")
    return image


# Read the two images
img1 = _read_image('image1.jpg')
img2 = _read_image('image2.jpg')

# Compute the similarity of the two images
similarity = compare_images(img1, img2)

# Decide from the similarity whether the images are duplicates
if similarity > 0.95:
    print("Images are duplicates")
else:
    print("Images are not duplicates")
基于哈希法
import cv2
import hashlib


def calculate_hash(image):
    """Compute a 64-bit difference hash of a BGR image.

    The image is resized to 9 columns by 8 rows and converted to grayscale.
    Bit ``row * 8 + col`` of the result is set when the pixel at
    ``(row, col)`` is strictly brighter than its right-hand neighbour.

    Args:
        image: BGR image array, as returned by ``cv2.imread``.

    Returns:
        The hash as a non-negative Python int.
    """
    # Shrink first, then drop colour
    shrunk = cv2.resize(image, (9, 8), interpolation=cv2.INTER_CUBIC)
    gray = cv2.cvtColor(shrunk, cv2.COLOR_BGR2GRAY)

    # Compare each of the first 8 columns with the column to its right;
    # flattening row by row gives bit index row * 8 + col.
    brighter_than_right = (gray[:8, :8] > gray[:8, 1:9]).flatten()
    return sum(
        1 << bit_index
        for bit_index, is_set in enumerate(brighter_than_right)
        if is_set
    )


# Read the two images
img1 = cv2.imread('image1.jpg')
img2 = cv2.imread('image2.jpg')

# Compute the hash of each image
hash1 = calculate_hash(img1)
hash2 = calculate_hash(img2)

# The images count as duplicates only when the hashes are identical
print("Images are duplicates" if hash1 == hash2 else "Images are not duplicates")
基于ORB进行图像特征提取
import cv2


def extract_features(image):
    """Detect ORB keypoints in *image* and compute their descriptors.

    Args:
        image: Image array, as returned by ``cv2.imread``.

    Returns:
        The ``(keypoints, descriptors)`` pair produced by
        ``cv2.ORB.detectAndCompute``. ``descriptors`` can be ``None`` when
        no keypoints are found.
    """
    # Extract feature points with the ORB algorithm
    orb = cv2.ORB_create()
    keypoints, descriptors = orb.detectAndCompute(image, None)
    return keypoints, descriptors


def _read_image(path):
    """Load an image from *path*, raising OSError if it cannot be read."""
    image = cv2.imread(path)
    # cv2.imread returns None instead of raising when the file is missing
    # or cannot be decoded.
    if image is None:
        raise OSError(f"Cannot read image: {path}")
    return image


# Read the two images
img1 = _read_image('image1.jpg')
img2 = _read_image('image2.jpg')

# Extract the keypoints and descriptors of each image
kp1, des1 = extract_features(img1)
kp2, des2 = extract_features(img2)

# Match the descriptors of the two images
bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
if des1 is None or des2 is None:
    # An image without keypoints (e.g. a blank one) has no descriptors;
    # treat that as "nothing matched" instead of crashing in bf.match.
    matches = []
else:
    matches = bf.match(des1, des2)

# Count the matched feature points
match_count = len(matches)

# Decide whether the images are duplicates
if match_count > 10:
    print("Images are duplicates")
else:
    print("Images are not duplicates")
基于机器学习
import cv2
import numpy as np
from sklearn.neighbors import KNeighborsClassifier


def extract_features(images):
    """Return one flattened 8x8x8 three-channel histogram per image.

    Args:
        images: Iterable of three-channel image arrays.

    Returns:
        A NumPy array holding one flattened histogram per input image.
    """
    return np.array([
        cv2.calcHist(
            [picture], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256]
        ).flatten()
        for picture in images
    ])


# Load the ten training images at a fixed 128x128 size; labels cycle 0..4
images = [
    cv2.resize(cv2.imread(f'image_{index}.jpg'), (128, 128))
    for index in range(10)
]
labels = [index % 5 for index in range(10)]

# Turn the training images into feature vectors
features = extract_features(images)

# Fit a 1-nearest-neighbour classifier on the features
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(features, labels)

# Predict the label of a new image, preprocessed the same way
new_img = cv2.resize(cv2.imread('new_image.jpg'), (128, 128))
new_features = extract_features([new_img])
predicted_label = clf.predict(new_features)
批量去重
import os
import cv2
import numpy as np


def calc_hist(image):
    """Calculate the histogram of an image.

    Args:
        image: Image array; only channel 0 is histogrammed.

    Returns:
        A flat array of 256 bin counts over the value range [0, 256).
    """
    hist = cv2.calcHist([image], [0], None, [256], [0, 256])
    return hist.flatten()


def find_duplicates(path):
    """Find duplicate images in a directory.

    Two files are reported as duplicates when their grayscale histograms are
    exactly equal. Only regular files directly inside *path* are examined;
    files that OpenCV cannot read as images are skipped.

    Args:
        path: Directory to scan.

    Returns:
        A list of ``(duplicate_path, first_seen_path)`` tuples, where
        ``first_seen_path`` is the earlier file with the same histogram.
    """
    first_seen = {}
    duplicates = []
    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # of "first seen" file, and therefore the result, deterministic.
    for filename in sorted(os.listdir(path)):
        file_path = os.path.join(path, filename)
        if not os.path.isfile(file_path):
            continue
        try:
            image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
            if image is None:
                # Not an image, or unreadable: imread reports this via None.
                continue
            hist = calc_hist(image)
        except cv2.error:
            # Best effort: skip files OpenCV fails on, but let unrelated
            # errors (and KeyboardInterrupt) propagate.
            continue
        # Raw bytes give an exact, hashable key for the histogram.
        key = hist.tobytes()
        if key in first_seen:
            duplicates.append((file_path, first_seen[key]))
        else:
            first_seen[key] = file_path
    return duplicates


dp = find_duplicates('data')
print(dp)