jjsos_JJdetection/check_and_clean_images.py

189 lines
6.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""
检查并清理没有标注文件的图片
找出 train 目录下没有对应 label 文件的图片,并删除它们
"""
import os
from pathlib import Path
def get_image_files(directory):
    """Collect the image files located directly inside ``directory``.

    Extension matching is case-insensitive, so ``photo.JPG``, ``scan.Bmp``
    and ``page.TIFF`` are all recognised. Only direct children are
    inspected; sub-directories are not descended into, and a directory whose
    name happens to end in an image extension is skipped.

    Args:
        directory: Folder to scan, as a ``str`` or ``pathlib.Path``.

    Returns:
        list[pathlib.Path]: The matching files, in whatever order
        ``Path.iterdir`` yields them (no sorting is applied).

    Raises:
        OSError: Propagated from ``Path.iterdir`` when ``directory`` does
            not exist or is not a directory.
    """
    # Lower-case entries only: each suffix is normalised with .lower()
    # before the lookup, so every capitalisation variant is covered without
    # having to enumerate them by hand.
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'}
    return [
        entry
        for entry in Path(directory).iterdir()
        if entry.is_file() and entry.suffix.lower() in image_extensions
    ]
def get_label_files(directory):
    """Collect the annotation (``.txt``) files located directly inside ``directory``.

    The extension is compared case-insensitively so that a label saved as
    ``IMG_0001.TXT`` is still recognised. Because callers use this list to
    decide which images are unlabeled (and may delete them), missing a label
    purely because of its capitalisation would be a data-loss hazard.

    Args:
        directory: Folder to scan, as a ``str`` or ``pathlib.Path``.

    Returns:
        list[pathlib.Path]: The matching files, in whatever order
        ``Path.iterdir`` yields them (no sorting is applied).

    Raises:
        OSError: Propagated from ``Path.iterdir`` when ``directory`` does
            not exist or is not a directory.
    """
    return [
        entry
        for entry in Path(directory).iterdir()
        if entry.is_file() and entry.suffix.lower() == '.txt'
    ]
def find_images_without_labels(train_dir, label_dir, dry_run=True):
    """Report, and optionally delete, images that have no annotation file.

    An image counts as labeled when the label directory contains a file
    with the same stem (file name without extension). A summary is printed
    to stdout; at most the first 20 unlabeled file names are listed.

    Args:
        train_dir: Directory holding the images.
        label_dir: Directory holding the annotation files.
        dry_run: When True (the default) nothing is deleted and a preview
            notice is printed instead. When False every unlabeled image is
            unlinked from disk.

    Returns:
        list[pathlib.Path]: The images without a matching label. In delete
        mode these are the paths that deletion was attempted on, so they may
        no longer exist. An empty list is returned when either directory is
        missing or is not a directory.
    """
    train_path = Path(train_dir)
    label_path = Path(label_dir)

    # is_dir() rather than exists(): a path that exists but is a regular
    # file would pass an exists() check and then blow up inside iterdir().
    if not train_path.is_dir():
        print(f"错误: 图片目录不存在: {train_dir}")
        return []
    if not label_path.is_dir():
        print(f"错误: 标注目录不存在: {label_dir}")
        return []

    image_files = get_image_files(train_path)
    label_files = get_label_files(label_path)
    print(f"找到 {len(image_files)} 个图片文件")
    print(f"找到 {len(label_files)} 个标注文件")
    print("-" * 60)

    # A set of stems makes each per-image membership test O(1).
    label_names = {label.stem for label in label_files}
    images_without_labels = [
        img for img in image_files if img.stem not in label_names
    ]

    if not images_without_labels:
        print("✓ 所有图片都有对应的标注文件!")
        return images_without_labels

    print(f"发现 {len(images_without_labels)} 个没有标注的图片:")
    print("-" * 60)
    # Cap the listing so a badly mismatched dataset does not flood the
    # terminal; the remainder is summarised as a count.
    for img in images_without_labels[:20]:
        print(f" {img.name}")
    if len(images_without_labels) > 20:
        print(f" ... 还有 {len(images_without_labels) - 20} 个文件")
    print("-" * 60)

    if dry_run:
        print("\n[预览模式] 未实际删除文件")
        print("要删除这些文件,请运行: python check_and_clean_images.py --delete")
        return images_without_labels

    deleted_count = 0
    failed_count = 0
    for img_file in images_without_labels:
        try:
            img_file.unlink()
        except OSError as e:
            # Best effort: report the failure and keep going so one locked
            # or vanished file does not abort the whole clean-up.
            print(f"删除失败: {img_file.name} - {e}")
            failed_count += 1
        else:
            deleted_count += 1

    print("\n删除完成:")
    print(f" 成功删除: {deleted_count} 个文件")
    if failed_count > 0:
        print(f" 删除失败: {failed_count} 个文件")
    return images_without_labels
def _print_usage(train_dir, label_dir):
    """Print the help screen, including the default image and label paths."""
    rule = "=" * 60
    usage_lines = (
        rule,
        "图片标注检查工具 - 使用说明",
        rule,
        "\n功能:",
        " 检查图片文件是否有对应的标注文件,并删除没有标注的图片",
        "\n使用方法:",
        " python check_and_clean_images.py",
        " # 预览模式:检查但不删除文件(推荐先运行)",
        "\n python check_and_clean_images.py --delete",
        " # 删除模式:删除没有标注的图片(需要确认)",
        "\n python check_and_clean_images.py --delete --force",
        " # 强制删除:直接删除,不确认(适合脚本自动化)",
        "\n参数说明:",
        " --delete, -d 启用删除模式",
        " --force, -f 跳过确认,直接删除",
        " --help, -h 显示此帮助信息",
        "\n默认路径:",
        f" 图片目录: {train_dir}",
        f" 标注目录: {label_dir}",
        "\n支持的图片格式:",
        " .jpg, .jpeg, .png, .bmp, .tiff, .tif (大小写不敏感)",
        "\n标注文件格式:",
        " .txt 文件",
        "\n文件名匹配规则:",
        " 通过文件名(不含扩展名)匹配,例如:",
        " image001.jpg ↔ image001.txt ✓",
        " image001.jpg ↔ image_001.txt ✗",
        "\n注意事项:",
        " ⚠ 删除操作不可恢复!建议先备份数据",
        " ⚠ 使用 --delete 前建议先运行预览模式检查",
        "\n详细教程:",
        " 查看文件: check_and_clean_images_使用教程.md",
        rule,
    )
    for text in usage_lines:
        print(text)


def main():
    """Command-line entry point.

    Reads flags from ``sys.argv``:

    * ``--delete`` / ``-d``: delete unlabeled images instead of previewing.
    * ``--force`` / ``-f``: skip the interactive confirmation.
    * ``--help`` / ``-h``: print the usage text and exit.

    Without ``--delete`` the tool only reports what it would remove. With
    ``--delete`` but without ``--force`` the user must type ``yes`` before
    anything is touched.
    """
    import sys

    # Default dataset locations (relative to the current working directory).
    train_dir = "datasets/handleImage/train"
    label_dir = "datasets/handleImage/label"

    has_cli_args = len(sys.argv) > 1

    def flag_given(*spellings):
        # A flag counts as given when any of its spellings is in sys.argv.
        return has_cli_args and any(name in sys.argv for name in spellings)

    if flag_given('--help', '-h'):
        _print_usage(train_dir, label_dir)
        return

    dry_run = not flag_given('--delete', '-d')
    force = flag_given('--force', '-f')

    banner = "=" * 60
    print(banner)
    print("检查没有标注的图片")
    print(banner)
    print(f"图片目录: {train_dir}")
    print(f"标注目录: {label_dir}")
    print()

    needs_confirmation = not (dry_run or force)
    if needs_confirmation:
        try:
            answer = input("⚠ 警告: 将删除没有标注的图片文件!确认继续?(yes/no): ")
        except (EOFError, KeyboardInterrupt):
            # No usable stdin (or the user hit Ctrl-C): back out safely.
            print("\n已取消(非交互式环境,请使用 --force 参数)")
            return
        if answer.lower() != 'yes':
            print("已取消")
            return

    unlabeled = find_images_without_labels(train_dir, label_dir, dry_run=dry_run)
    if dry_run and unlabeled:
        print("\n提示: 使用 --delete 参数来实际删除这些文件")
# Run the CLI only when this file is executed as a script, so importing the
# module (e.g. to reuse its helper functions) has no side effects.
if __name__ == '__main__':
    main()