From 085c0bbf6bd97207f9b05b0179ec2568e2e60e37 Mon Sep 17 00:00:00 2001 From: Nik Date: Wed, 5 Jun 2024 16:10:11 +0200 Subject: [PATCH] add mlperf train subset of openimages (#4841) --- examples/mlperf/model_eval.py | 2 +- extra/datasets/openimages.py | 50 ++++++++++++++++++++++------------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/examples/mlperf/model_eval.py b/examples/mlperf/model_eval.py index cd9a6a5fbf..cef6d6cd37 100644 --- a/examples/mlperf/model_eval.py +++ b/examples/mlperf/model_eval.py @@ -98,7 +98,7 @@ def eval_retinanet(): from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval from contextlib import redirect_stdout - coco = COCO(openimages()) + coco = COCO(openimages('validation')) coco_eval = COCOeval(coco, iouType="bbox") coco_evalimgs, evaluated_imgs, ncats, narea = [], [], len(coco_eval.params.catIds), len(coco_eval.params.areaRng) diff --git a/extra/datasets/openimages.py b/extra/datasets/openimages.py index 360a0313e2..19ec24cfdf 100644 --- a/extra/datasets/openimages.py +++ b/extra/datasets/openimages.py @@ -1,6 +1,4 @@ -import os import sys -import math import json import numpy as np from PIL import Image @@ -13,7 +11,8 @@ import concurrent.futures BASEDIR = pathlib.Path(__file__).parent / "open-images-v6-mlperf" BUCKET_NAME = "open-images-dataset" -BBOX_ANNOTATIONS_URL = "https://storage.googleapis.com/openimages/v5/validation-annotations-bbox.csv" +TRAIN_BBOX_ANNOTATIONS_URL = "https://storage.googleapis.com/openimages/v6/oidv6-train-annotations-bbox.csv" +VALIDATION_BBOX_ANNOTATIONS_URL = "https://storage.googleapis.com/openimages/v5/validation-annotations-bbox.csv" MAP_CLASSES_URL = "https://storage.googleapis.com/openimages/v5/class-descriptions-boxable.csv" MLPERF_CLASSES = ['Airplane', 'Antelope', 'Apple', 'Backpack', 'Balloon', 'Banana', 'Barrel', 'Baseball bat', 'Baseball glove', 'Bee', 'Beer', 'Bench', 'Bicycle', @@ -55,17 +54,24 @@ MLPERF_CLASSES = ['Airplane', 'Antelope', 'Apple', 'Backpack', 'Balloon', 'Banan 'Zebra', 'Zucchini', ] -def openimages(): - ann_file = BASEDIR / "validation/labels/openimages-mlperf.json" + +def openimages(subset: str): + valid_subsets = ['train', 'validation'] + if subset not in valid_subsets: + raise ValueError(f"{subset=} must be one of {valid_subsets}") + + ann_file = BASEDIR / f"{subset}/labels/openimages-mlperf.json" + if not ann_file.is_file(): - fetch_openimages(ann_file) + fetch_openimages(ann_file, subset) + return ann_file # this slows down the conversion a lot! # maybe use https://raw.githubusercontent.com/scardine/image_size/master/get_image_size.py def extract_dims(path): return Image.open(path).size[::-1] -def export_to_coco(class_map, annotations, image_list, dataset_path, output_path, classes=MLPERF_CLASSES): +def export_to_coco(class_map, annotations, image_list, dataset_path, output_path, subset, classes=MLPERF_CLASSES): output_path.parent.mkdir(parents=True, exist_ok=True) cats = [{"id": i, "name": c, "supercategory": None} for i, c in enumerate(classes)] categories_map = pd.DataFrame([(i, c) for i, c in enumerate(classes)], columns=["category_id", "category_name"]) @@ -76,7 +82,7 @@ def export_to_coco(class_map, annotations, image_list, dataset_path, output_path annotations[["height", "width"]] = annotations.apply(lambda x: extract_dims(dataset_path / f"{x['ImageID']}.jpg"), axis=1, result_type="expand") # Images - imgs = [{"id": int(id + 1), "file_name": f"{image_id}.jpg", "height": row["height"], "width": row["width"], "license": None, "coco_url": None} + imgs = [{"id": int(id + 1), "file_name": f"{image_id}.jpg", "height": row["height"], "width": row["width"], "subset": subset, "license": None, "coco_url": None} for (id, image_id), row in (annotations.groupby(["image_id", "ImageID"]).first().iterrows()) ] @@ -100,21 +106,26 @@ def get_image_list(class_map, annotations, classes=MLPERF_CLASSES): image_ids = annotations[np.isin(annotations["LabelName"], labels)]["ImageID"].unique() return image_ids -def download_image(bucket, image_id, data_dir): +def download_image(bucket, subset, image_id, data_dir): try: - bucket.download_file(f"validation/{image_id}.jpg", f"{data_dir}/{image_id}.jpg") + bucket.download_file(f"{subset}/{image_id}.jpg", f"{data_dir}/{image_id}.jpg") except botocore.exceptions.ClientError as exception: sys.exit(f"ERROR when downloading image `validation/{image_id}`: {str(exception)}") -def fetch_openimages(output_fn): +def fetch_openimages(output_fn, subset: str): bucket = boto3.resource("s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED)).Bucket(BUCKET_NAME) - annotations_dir, data_dir = BASEDIR / "annotations", BASEDIR / "validation/data" + annotations_dir, data_dir = BASEDIR / "annotations", BASEDIR / f"{subset}/data" annotations_dir.mkdir(parents=True, exist_ok=True) data_dir.mkdir(parents=True, exist_ok=True) - annotations_fn = annotations_dir / BBOX_ANNOTATIONS_URL.split('/')[-1] - fetch(BBOX_ANNOTATIONS_URL, annotations_fn) + if subset == "train": + annotations_fn = annotations_dir / TRAIN_BBOX_ANNOTATIONS_URL.split('/')[-1] + fetch(TRAIN_BBOX_ANNOTATIONS_URL, annotations_fn) + else: # subset == validation + annotations_fn = annotations_dir / VALIDATION_BBOX_ANNOTATIONS_URL.split('/')[-1] + fetch(VALIDATION_BBOX_ANNOTATIONS_URL, annotations_fn) + annotations = pd.read_csv(annotations_fn) classmap_fn = annotations_dir / MAP_CLASSES_URL.split('/')[-1] @@ -124,16 +135,16 @@ def fetch_openimages(output_fn): image_list = get_image_list(class_map, annotations) with concurrent.futures.ThreadPoolExecutor() as executor: - futures = [executor.submit(download_image, bucket, image_id, data_dir) for image_id in image_list] + futures = [executor.submit(download_image, bucket, subset, image_id, data_dir) for image_id in image_list] for future in (t := tqdm(concurrent.futures.as_completed(futures), total=len(image_list))): t.set_description(f"Downloading images") future.result() print("Converting annotations to COCO format...") - export_to_coco(class_map, annotations, image_list, data_dir, output_fn) + export_to_coco(class_map, annotations, image_list, data_dir, output_fn, subset) -def image_load(fn): - img_folder = BASEDIR / "validation/data" +def image_load(subset, fn): + img_folder = BASEDIR / f"{subset}/data" img = Image.open(img_folder / fn).convert('RGB') import torchvision.transforms.functional as F ret = F.resize(img, size=(800, 800)) @@ -158,7 +169,8 @@ def iterate(coco, bs=8): for i in range(0, len(image_ids), bs): X, targets = [], [] for img_id in image_ids[i:i+bs]: - x, original_size = image_load(coco.loadImgs(img_id)[0]["file_name"]) + img_dict = coco.loadImgs(img_id)[0] + x, original_size = image_load(img_dict['subset'], img_dict["file_name"]) X.append(x) annotations = coco.loadAnns(coco.getAnnIds(img_id)) targets.append(prepare_target(annotations, img_id, original_size))