189 lines
6.5 KiB
Python
189 lines
6.5 KiB
Python
"""
Check for and clean up images that have no annotation file.

Find the images under the train directory that have no corresponding
label file, and delete them.
"""
|
||
import os
|
||
from pathlib import Path
|
||
|
||
|
||
def get_image_files(directory):
    """Return the image files located directly inside ``directory``.

    A file counts as an image when its suffix, compared case-insensitively,
    is one of ``.jpg``, ``.jpeg``, ``.png``, ``.bmp``, ``.tiff`` or ``.tif``.
    Sub-directories are not searched, and directories whose name happens to
    end in an image suffix are skipped.

    Args:
        directory: Path (``str`` or ``Path``) of the directory to scan.

    Returns:
        list: ``Path`` objects of the matching files, in the order
        ``Path.iterdir`` yields them.
    """
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif'}

    # Lower-case the suffix before the lookup so that every spelling
    # (".BMP", ".TIF", ".Jpg", ...) is recognised, not just the handful of
    # upper-case variants that used to be listed explicitly.
    return [
        file
        for file in Path(directory).iterdir()
        if file.is_file() and file.suffix.lower() in image_extensions
    ]
|
||
|
||
|
||
def get_label_files(directory):
    """Collect the annotation files found directly inside ``directory``.

    Only regular files whose suffix is exactly ``.txt`` are returned;
    sub-directories are not descended into.

    Args:
        directory: Path (``str`` or ``Path``) of the directory to scan.

    Returns:
        list: ``Path`` objects of the ``.txt`` files, in ``Path.iterdir``
        order.
    """
    return [
        entry
        for entry in Path(directory).iterdir()
        if entry.is_file() and entry.suffix == '.txt'
    ]
|
||
|
||
|
||
def find_images_without_labels(train_dir, label_dir, dry_run=True):
    """Find images that have no matching annotation file; optionally delete them.

    An image is considered annotated when a label file with the same name
    minus the extension (``Path.stem``) exists in ``label_dir``. A summary,
    listing at most the first 20 unannotated images, is printed to stdout.

    Args:
        train_dir: Directory containing the images.
        label_dir: Directory containing the annotation files.
        dry_run: When True (the default) the images are only reported;
            when False they are deleted from disk.

    Returns:
        list: ``Path`` objects of the images without an annotation. The list
        is returned unchanged after a deletion run, so it may reference
        files that no longer exist. An empty list is also returned when
        either directory is missing.
    """
    train_path = Path(train_dir)
    label_path = Path(label_dir)

    # is_dir() rather than exists(): a path that exists but is a regular
    # file would pass an exists() check and then make iterdir() raise
    # NotADirectoryError further down.
    if not train_path.is_dir():
        print(f"错误: 图片目录不存在: {train_dir}")
        return []

    if not label_path.is_dir():
        print(f"错误: 标注目录不存在: {label_dir}")
        return []

    # Gather every image and every annotation file.
    image_files = get_image_files(train_path)
    label_files = get_label_files(label_path)

    print(f"找到 {len(image_files)} 个图片文件")
    print(f"找到 {len(label_files)} 个标注文件")
    print("-" * 60)

    # Annotation names without extension, as a set for O(1) membership tests.
    label_names = {f.stem for f in label_files}

    images_without_labels = [
        img_file for img_file in image_files
        if img_file.stem not in label_names
    ]

    if not images_without_labels:
        print("✓ 所有图片都有对应的标注文件!")
        return images_without_labels

    # Report the result; cap the listing so huge datasets stay readable.
    print(f"发现 {len(images_without_labels)} 个没有标注的图片:")
    print("-" * 60)
    for img in images_without_labels[:20]:
        print(f" {img.name}")
    if len(images_without_labels) > 20:
        print(f" ... 还有 {len(images_without_labels) - 20} 个文件")
    print("-" * 60)

    if dry_run:
        print("\n[预览模式] 未实际删除文件")
        print("要删除这些文件,请运行: python check_and_clean_images.py --delete")
        return images_without_labels

    deleted_count = 0
    failed_count = 0

    for img_file in images_without_labels:
        try:
            img_file.unlink()
        except OSError as e:
            # Best effort: one undeletable file (permissions, already gone,
            # ...) must not abort the whole clean-up.
            print(f"删除失败: {img_file.name} - {e}")
            failed_count += 1
        else:
            deleted_count += 1

    print("\n删除完成:")
    print(f" 成功删除: {deleted_count} 个文件")
    if failed_count > 0:
        print(f" 删除失败: {failed_count} 个文件")

    return images_without_labels
|
||
|
||
|
||
def _print_usage(train_dir, label_dir):
    """Print the command-line help text, including the default directories."""
    rule = "=" * 60
    usage_lines = (
        rule,
        "图片标注检查工具 - 使用说明",
        rule,
        "\n功能:",
        " 检查图片文件是否有对应的标注文件,并删除没有标注的图片",
        "\n使用方法:",
        " python check_and_clean_images.py",
        " # 预览模式:检查但不删除文件(推荐先运行)",
        "\n python check_and_clean_images.py --delete",
        " # 删除模式:删除没有标注的图片(需要确认)",
        "\n python check_and_clean_images.py --delete --force",
        " # 强制删除:直接删除,不确认(适合脚本自动化)",
        "\n参数说明:",
        " --delete, -d 启用删除模式",
        " --force, -f 跳过确认,直接删除",
        " --help, -h 显示此帮助信息",
        "\n默认路径:",
        f" 图片目录: {train_dir}",
        f" 标注目录: {label_dir}",
        "\n支持的图片格式:",
        " .jpg, .jpeg, .png, .bmp, .tiff, .tif (大小写不敏感)",
        "\n标注文件格式:",
        " .txt 文件",
        "\n文件名匹配规则:",
        " 通过文件名(不含扩展名)匹配,例如:",
        " image001.jpg ↔ image001.txt ✓",
        " image001.jpg ↔ image_001.txt ✗",
        "\n注意事项:",
        " ⚠ 删除操作不可恢复!建议先备份数据",
        " ⚠ 使用 --delete 前建议先运行预览模式检查",
        "\n详细教程:",
        " 查看文件: check_and_clean_images_使用教程.md",
        rule,
    )
    for line in usage_lines:
        print(line)


def main():
    """Command-line entry point.

    Recognised flags (read from ``sys.argv``):
        --delete / -d: delete the unannotated images instead of previewing.
        --force / -f: skip the interactive confirmation before deleting.
        --help / -h: print the usage text and exit.

    Without flags the script runs in preview mode against the default
    image and annotation directories.
    """
    import sys

    # Default locations of the images and their annotations.
    train_dir = "datasets/handleImage/train"
    label_dir = "datasets/handleImage/label"

    argv = sys.argv
    has_args = len(argv) > 1

    def flag_given(*names):
        # Flags only count when at least one argument was supplied.
        return has_args and any(name in argv for name in names)

    if flag_given('--help', '-h'):
        _print_usage(train_dir, label_dir)
        return

    dry_run = not flag_given('--delete', '-d')
    force = flag_given('--force', '-f')

    banner = "=" * 60
    print(banner)
    print("检查没有标注的图片")
    print(banner)
    print(f"图片目录: {train_dir}")
    print(f"标注目录: {label_dir}")
    print()

    needs_confirmation = not (dry_run or force)
    if needs_confirmation:
        try:
            answer = input("⚠ 警告: 将删除没有标注的图片文件!确认继续?(yes/no): ")
        except (EOFError, KeyboardInterrupt):
            # No usable stdin (or the user hit Ctrl-C): treat as a refusal.
            print("\n已取消(非交互式环境,请使用 --force 参数)")
            return
        if answer.lower() != 'yes':
            print("已取消")
            return

    unlabeled = find_images_without_labels(train_dir, label_dir, dry_run=dry_run)

    if dry_run and unlabeled:
        print("\n提示: 使用 --delete 参数来实际删除这些文件")
|
||
|
||
|
||
# Run the command-line interface only when this file is executed as a
# script; importing it as a module must not trigger any scan or deletion.
if __name__ == '__main__':
    main()
|
||
|