Source code for optical.converter.coco

"""
__author__: HashTagML
license: MIT
Created: Monday, 29th March 2021
"""

import os
import warnings
from typing import List, Union

import numpy as np
import pandas as pd

from .base import FormatSpec
from .utils import exists, get_annotation_dir, get_image_dir, read_coco


[docs]class Coco(FormatSpec):
    """Represents a COCO annotation object.

    Args:
        root (Union[str, os.PathLike]): path to root directory. Expects the ``root`` directory to have either
           of the following layouts:

           .. code-block:: bash

                root
                ├── images
                │   ├── train
                │   │   ├── 1.jpg
                │   │   ├── 2.jpg
                │   │   │   ...
                │   │   └── n.jpg
                │   ├── valid (...)
                │   └── test (...)
                │
                └── annotations
                    ├── train.json
                    ├── valid.json
                    └── test.json

            or,

            .. code-block:: bash

                root
                ├── images
                │   ├── 1.jpg
                │   ├── 2.jpg
                │   │   ...
                │   └── n.jpg
                │
                └── annotations
                    └── label.json
    """

[docs]    def __init__(self, root: Union[str, os.PathLike]):
        super().__init__(root)
        self._image_dir = get_image_dir(root)
        self._annotation_dir = get_annotation_dir(root)
        self._has_image_split = False
        assert exists(self._image_dir), "root is missing `images` directory."
        assert exists(self._annotation_dir), "root is missing `annotations` directory."
        self._find_splits()
        self._resolve_dataframe()

    def _get_class_map(self, categories: List):
        """map from category id to category name"""
        class_map = dict()
        for cat in categories:
            class_map[cat["id"]] = cat["name"]
        return class_map

    def _resolve_dataframe(self):
        split_str = []
        master_df = pd.DataFrame(
            columns=["image_id", "image_width", "image_height", "x_min", "y_min", "width", "height", "category"],
        )
        for split in self._splits:
            coco_json = self._annotation_dir / f"{split}.json"
            images, annots, cats = read_coco(coco_json)
            split_str.append([split, len(images), len(annots), len(cats)])

            class_map = self._get_class_map(cats)

            images_df = pd.DataFrame(images)
            images_df = images_df[["id", "file_name", "width", "height"]]
            images_df.rename(columns={"width": "image_width", "height": "image_height"}, inplace=True)

            instances = [(x["image_id"], x["category_id"], x["bbox"]) for x in annots]
            annots_df = pd.DataFrame(instances, columns=["image_id", "class_id", "bbox"])
            annots_df["category"] = annots_df["class_id"].map(class_map)
            annots_df[["x_min", "y_min", "width", "height"]] = pd.DataFrame(
                annots_df["bbox"].to_list(), index=annots_df.index
            )

            annots_df.drop(["bbox"], axis=1, inplace=True)

            annots_df = annots_df.merge(images_df, left_on="image_id", right_on="id", how="left")
            annots_df.drop(["id", "image_id"], axis=1, inplace=True)
            annots_df.rename(columns={"file_name": "image_id"}, inplace=True)

            null_images = annots_df["image_id"].isnull().sum()
            if null_images > 0:
                warnings.warn(
                    "Some annotations in the dataset does not have images attached to it. Ignoring those annotations"
                )
            annots_df.dropna(subset=["image_id"], inplace=True)
            annots_df["split"] = split
            split_dir = split if self._has_image_split else ""
            annots_df["image_path"] = annots_df["image_id"].map(
                lambda x: self.root.joinpath("images").joinpath(split_dir).joinpath(x)
            )

            if len(annots_df[pd.isnull(annots_df.image_id)]) > 0:
                warnings.warn(
                    "There are annotations in your dataset for which there is no matching images"
                    + f"(in split `{split}`). These annotations will be removed during any "
                    + "computation or conversion. It is recommended that you clean your dataset."
                )

            master_df = pd.concat([master_df, annots_df], ignore_index=True)

        master_df = master_df[pd.notnull(master_df.image_id)]
        for col in ["x_min", "y_min", "width", "height"]:
            master_df[col] = master_df[col].astype(np.float32)

        for col in ["image_width", "image_height", "class_id"]:
            master_df[col] = master_df[col].astype(np.int32)

        self.master_df = master_df