"""
__author__: HashTagML
license: MIT
Created: Wednesday, 31st March 2021
"""
# TODO: needs a better solution for handling TFRecords
import copy
import json
import os
import warnings
from datetime import datetime
from pathlib import Path, PosixPath
from typing import Dict, List, Optional, Union
import numpy as np
import pandas as pd
import yaml
from joblib import Parallel, delayed
from tqdm.auto import tqdm
from .utils import (
copyfile,
get_id_to_class_map,
ifnone,
write_json,
write_xml,
create_tf_example,
write_label_map,
)
class LabelEncoder:
    """Incrementally map category labels to integer codes.

    Codes are handed out in order of first appearance and never change once assigned, so the
    encoder can be fitted several times (for example once per dataset split) and still produce a
    single, collision-free mapping.
    """

    def __init__(self):
        # label -> integer code; insertion order equals ascending code order
        self._map = dict()

    def fit(self, series):
        """Register every label of ``series`` that has not been seen before.

        Args:
            series: a ``pd.Series``, or any value ``pd.Series`` can be constructed from.
        """
        if not isinstance(series, pd.Series):
            series = pd.Series(series)
        for category in series.unique().tolist():
            if category not in self._map:
                # Continue numbering after the labels already known. Using the label's position
                # within *this* call would reuse codes that an earlier fit already assigned.
                self._map[category] = len(self._map)

    def transform(self, series):
        """Return ``series`` with each label replaced by its code.

        Labels that were never fitted have no entry in the mapping and come back as missing
        values, which is how ``pd.Series.map`` treats absent keys.
        """
        if not isinstance(series, pd.Series):
            series = pd.Series(series)
        return series.map(self._map)

    def fit_transform(self, series):
        """Fit on ``series`` and return its encoded form."""
        self.fit(series)
        return self.transform(series)
def _fastcopy(src_files: Union[str, os.PathLike], dest_dir: Union[str, os.PathLike]):
    """Copy every entry of ``src_files`` into ``dest_dir`` using joblib's threading backend.

    ``src_files`` is iterated, and each item is handed to ``copyfile`` together with
    ``dest_dir``. Nothing is returned.
    """
    copy_jobs = (delayed(copyfile)(src, dest_dir) for src in src_files)
    Parallel(n_jobs=-1, backend="threading")(copy_jobs)
def write_yolo_txt(filename: str, output_dir: Union[str, os.PathLike, PosixPath], yolo_string: str):
    """Append ``yolo_string`` and a newline to ``<output_dir>/<stem of filename>.txt``.

    The file is opened in append mode, so repeated calls for the same ``filename`` accumulate
    lines instead of overwriting them.

    Args:
        filename (str): name whose stem (extension removed) names the ``.txt`` file.
        output_dir (Union[str, os.PathLike, PosixPath]): directory that receives the file.
        yolo_string (str): text to append.
    """
    label_path = Path(output_dir) / f"{Path(filename).stem}.txt"
    with label_path.open("a") as handle:
        handle.write(yolo_string + "\n")
def _makedirs(src: Union[str, os.PathLike], ext: str, dest: Optional[Union[str, os.PathLike]] = None):
    """Create the ``images`` and ``annotations`` folders of an output tree and return both paths.

    The tree is rooted at ``ifnone(dest, src, Path) / ext`` (presumably ``dest`` when it is given
    and ``src`` otherwise, converted to a ``Path`` - confirm against ``ifnone``).

    Returns:
        tuple: ``(image directory, annotation directory)``.
    """
    base_dir = ifnone(dest, src, Path) / ext
    image_dir, label_dir = base_dir / "images", base_dir / "annotations"
    for folder in (image_dir, label_dir):
        folder.mkdir(parents=True, exist_ok=True)
    return image_dir, label_dir
def convert_yolo(
    df: pd.DataFrame,
    root: Union[str, os.PathLike, PosixPath],
    copy_images: bool = False,
    save_under: Optional[str] = None,
    output_dir: Optional[Union[str, os.PathLike, PosixPath]] = None,
):
    """Write YOLO label files and a ``dataset.yaml`` from the master dataframe.

    For every split, one ``.txt`` file per image is written, holding one
    ``class x_center y_center width height`` line per box with all four geometry values divided
    by the image size. Rows lacking ``image_width`` or ``image_height`` are dropped with a warning.

    Args:
        df (pd.DataFrame): the master df.
        root (Union[str, os.PathLike, PosixPath]): root directory of the source format.
        copy_images (bool, optional): Whether to also copy the images into the output tree.
            Defaults to False.
        save_under (str, optional): Name of the folder created for the output; ``"yolo"`` is the
            fallback handed to ``ifnone``.
        output_dir (Optional[Union[str, os.PathLike, PosixPath]], optional): Forwarded to
            ``_makedirs`` as the destination base. Defaults to ``None``.
    """
    save_under = ifnone(save_under, "yolo")
    output_imagedir, output_labeldir = _makedirs(root, save_under, output_dir)

    splits = df.split.unique().tolist()
    nest_by_split = len(splits) > 1
    encoder = LabelEncoder()
    dataset = {}

    for split in splits:
        # label files go into a per-split sub-folder only when there are several splits
        label_dir = output_labeldir / split if nest_by_split else output_labeldir
        label_dir.mkdir(parents=True, exist_ok=True)

        split_df = df.query("split == @split").copy()

        # rows without image dimensions cannot be normalised
        size_missing = split_df["image_width"].isnull() | split_df["image_height"].isnull()
        hw_missing = split_df[size_missing]
        if len(hw_missing) > 0:
            warnings.warn(
                f"{hw_missing['image_id'].nunique()} has height/width information missing in split `{split}`. "
                + f"{len(hw_missing)} annotations will be removed."
            )
        split_df = split_df[~size_missing]

        # centres are computed first because they need the un-normalised box width/height
        split_df["x_center"] = (split_df["x_min"] + split_df["width"] / 2) / split_df["image_width"]
        split_df["y_center"] = (split_df["y_min"] + split_df["height"] / 2) / split_df["image_height"]
        split_df["width"] = split_df["width"] / split_df["image_width"]
        split_df["height"] = split_df["height"] / split_df["image_height"]
        split_df["class_index"] = encoder.fit_transform(split_df["category"])

        yolo_string = split_df["class_index"].astype(str)
        for field in ("x_center", "y_center", "width", "height"):
            yolo_string = yolo_string + " " + split_df[field].astype(str)
        split_df["yolo_string"] = yolo_string

        # one newline-joined block of label lines per image
        per_image = (
            split_df.groupby("image_id")["yolo_string"].agg(lambda rows: "\n".join(rows)).reset_index()
        )
        image_ids = per_image["image_id"].tolist()
        label_blocks = per_image["yolo_string"].tolist()

        dataset[split] = str(Path(root) / "images" / split)

        for image_id, label_block in tqdm(
            zip(image_ids, label_blocks), total=len(image_ids), desc=f"split: {split}"
        ):
            write_yolo_txt(image_id, label_dir, label_block)

        if copy_images:
            dest_dir = output_imagedir / split
            dest_dir.mkdir(parents=True, exist_ok=True)
            _fastcopy(split_df["image_path"].unique().tolist(), dest_dir)

    dataset["nc"] = len(encoder._map)
    dataset["names"] = list(encoder._map)

    with (Path(output_labeldir) / "dataset.yaml").open("w") as yaml_file:
        yaml.dump(dataset, yaml_file, default_flow_style=None, allow_unicode=True)
def convert_csv(
    df: pd.DataFrame,
    root: Union[str, os.PathLike, PosixPath],
    copy_images: bool = False,
    save_under: Optional[str] = None,
    output_dir: Optional[Union[str, os.PathLike, PosixPath]] = None,
):
    """Write one ``<split>.csv`` of corner-format boxes per split of the master dataframe.

    Boxes are converted from ``x_min, y_min, width, height`` to ``x_min, y_min, x_max, y_max``
    and the four corner columns are cast to ``np.int32``. The caller's dataframe is not modified.

    Args:
        df (pd.DataFrame): the master df.
        root (Union[str, os.PathLike, PosixPath]): root directory of the source format.
        copy_images (bool, optional): Whether to also copy the images into the output tree.
            Defaults to False.
        save_under (str, optional): Name of the folder created for the output; ``"csv"`` is the
            fallback handed to ``ifnone``.
        output_dir (Optional[Union[str, os.PathLike, PosixPath]], optional): Forwarded to
            ``_makedirs`` as the destination base. Defaults to ``None``.
    """
    save_under = ifnone(save_under, "csv")
    output_imagedir, output_labeldir = _makedirs(root, save_under, output_dir)

    corner_cols = ("x_min", "y_min", "x_max", "y_max")
    csv_cols = ["image_id", "image_width", "image_height", *corner_cols, "category"]

    # assign/drop/astype each return a new frame, so the input stays untouched
    boxes = df.assign(
        x_max=df["x_min"] + df["width"],
        y_max=df["y_min"] + df["height"],
    ).drop(columns=["width", "height"])
    boxes = boxes.astype({col: np.int32 for col in corner_cols})

    for split in boxes.split.unique().tolist():
        rows = boxes.query("split == @split")
        image_paths = rows["image_path"].unique().tolist()
        rows[csv_cols].to_csv(output_labeldir / f"{split}.csv", index=False)

        if copy_images:
            dest_dir = output_imagedir / split
            dest_dir.mkdir(parents=True, exist_ok=True)
            _fastcopy(image_paths, dest_dir)
def _make_coco_images(df: pd.DataFrame, image_map: Dict) -> List:
"""makes images list for coco"""
df = copy.deepcopy(df)
df.drop_duplicates(subset=["image_id"], keep="first", inplace=True)
df = (
df[["image_id", "image_height", "image_width"]]
.copy()
.rename(columns={"image_id": "file_name", "image_height": "height", "image_width": "width"})
)
df["id"] = df["file_name"].map(image_map)
df = df[["id", "file_name", "height", "width"]]
image_list = list(df.to_dict(orient="index").values())
return image_list
def _make_coco_annotations(df: pd.DataFrame, image_map: Dict) -> List:
"""makes annotation list for coco"""
df = copy.deepcopy(df)
df["bbox"] = df[["x_min", "y_min", "width", "height"]].apply(list, axis=1)
df["area"] = df["height"] * df["width"]
df.drop(["x_min", "y_min", "width", "height", "image_width", "image_height"], axis=1, inplace=True)
df["id"] = range(len(df))
df["image_id"] = df["image_id"].map(image_map)
df.rename(columns={"class_id": "category_id"}, inplace=True)
df["category_id"] = df["category_id"].astype(int)
df["segmentation"] = [[]] * len(df)
df["iscrowd"] = 0
df = df[["id", "image_id", "category_id", "bbox", "area", "segmentation", "iscrowd"]].copy()
annotation_list = list(df.to_dict(orient="index").values())
return annotation_list
def _make_coco_categories(df: pd.DataFrame) -> List:
"""makes category list for coco"""
df = copy.deepcopy(df)
df = (
df.drop_duplicates(subset=["category"], keep="first")
.sort_values("class_id")[["class_id", "category"]]
.rename(columns={"category": "name", "class_id": "id"})
)
df["id"] = df["id"].astype(int)
df["supercategory"] = "none"
category_list = list(df.to_dict(orient="index").values())
return category_list
def convert_coco(
    df: pd.DataFrame,
    root: Union[str, os.PathLike, PosixPath],
    copy_images: bool = False,
    save_under: Optional[str] = None,
    output_dir: Optional[Union[str, os.PathLike, PosixPath]] = None,
) -> None:
    """Write one COCO ``<split>.json`` per split of the master dataframe.

    Image ids are numbered over the whole dataframe (order of first appearance), so an image
    keeps the same id no matter which split file it ends up in.

    Args:
        df (pd.DataFrame): the master df.
        root (Union[str, os.PathLike, PosixPath]): root directory of the source format.
        copy_images (bool, optional): Whether to also copy the images into the output tree.
            Defaults to False.
        save_under (str, optional): Name of the folder created for the output; ``"coco"`` is the
            fallback handed to ``ifnone``.
        output_dir (Optional[Union[str, os.PathLike, PosixPath]], optional): Forwarded to
            ``_makedirs`` as the destination base. Defaults to ``None``.
    """
    save_under = ifnone(save_under, "coco")
    output_imagedir, output_labeldir = _makedirs(root, save_under, output_dir)

    # The id map depends only on the full dataframe, so build it once instead of once per split.
    images = df["image_id"].unique().tolist()
    image_map = {image: index for index, image in enumerate(images)}

    for split in df.split.unique().tolist():
        split_df = df.query("split == @split").copy()
        coco_dict = {
            "images": _make_coco_images(split_df, image_map),
            "annotations": _make_coco_annotations(split_df, image_map),
            "categories": _make_coco_categories(split_df),
        }
        write_json(coco_dict, output_labeldir / f"{split}.json")

        if copy_images:
            dest_dir = output_imagedir / split
            dest_dir.mkdir(parents=True, exist_ok=True)
            _fastcopy(split_df["image_path"].unique().tolist(), dest_dir)
def _make_manifest_data(image_info: List, grouped_info: pd.DataFrame, job_name: str, id_to_class_map: Dict):
# creating json like data for each row of df
manifest_dic = {}
manifest_dic["source-ref"] = image_info[0]
manifest_dic[f"{job_name}"] = {
"image_size": [{"width": int(image_info[1]), "depth": 3, "height": int(image_info[2])}],
}
manifest_dic[f"{job_name}-metadata"] = {
"job-name": job_name,
"class-map": id_to_class_map,
"creation-date": str(datetime.now()),
"type": "groundtruth/object-detection",
}
annotations = grouped_info[["class_id", "height", "width", "top", "left"]].to_dict("records")
# append annotations
manifest_dic[f"{job_name}"]["annotations"] = annotations
return manifest_dic
def convert_sagemaker(
    df: pd.DataFrame,
    root: Union[str, os.PathLike, PosixPath],
    copy_images: bool = False,
    save_under: Optional[str] = None,
    output_dir: Optional[Union[str, os.PathLike, PosixPath]] = None,
    job_name: str = "optical",
):
    """Write one ``<split>.manifest`` file (one JSON object per line) per split.

    Rows lacking ``image_width`` or ``image_height`` are dropped with a warning. Each remaining
    image (grouped by id, width and height) becomes one manifest line.

    Args:
        df (pd.DataFrame): the master df.
        root (Union[str, os.PathLike, PosixPath]): root directory of the source format.
        copy_images (bool, optional): Whether to also copy the images into the output tree.
            Images of a split named ``"main"`` go directly into the image folder, others into a
            sub-folder named after the split. Defaults to False.
        save_under (str, optional): Name of the folder created for the output; ``"sagemaker"``
            is the fallback handed to ``ifnone``.
        output_dir (Optional[Union[str, os.PathLike, PosixPath]], optional): Forwarded to
            ``_makedirs`` as the destination base. Defaults to ``None``.
        job_name (str): job name used for the manifest keys. Defaults to ``"optical"``.
    """
    save_under = ifnone(save_under, "sagemaker")
    output_imagedir, output_labeldir = _makedirs(root, save_under, output_dir)

    for split in df.split.unique().tolist():
        split_df = df.query("split == @split").copy()

        # rows without image dimensions are reported and dropped
        size_missing = split_df["image_width"].isnull() | split_df["image_height"].isnull()
        hw_missing = split_df[size_missing]
        if len(hw_missing) > 0:
            warnings.warn(
                f"{hw_missing['image_id'].nunique()} has height/width information missing in split `{split}`. "
                + f"{len(hw_missing)} annotations will be removed."
            )
        split_df = split_df[~size_missing].rename(columns={"y_min": "top", "x_min": "left"})

        id_to_class_map = get_id_to_class_map(split_df)
        per_image = split_df.groupby(["image_id", "image_width", "image_height"])

        with (output_labeldir / f"{split}.manifest").open("w") as manifest_file:
            for image_info, grouped_info in tqdm(per_image, total=per_image.ngroups, desc=f"split: {split}"):
                entry = _make_manifest_data(image_info, grouped_info, job_name, id_to_class_map)
                manifest_file.write(f"{json.dumps(entry)}\n")

        if not copy_images:
            continue
        if split == "main":
            dest_dir = output_imagedir
        else:
            dest_dir = output_imagedir / split
        dest_dir.mkdir(parents=True, exist_ok=True)
        _fastcopy(split_df["image_path"].unique().tolist(), dest_dir)
def _make_createml_annotation_data(dic):
""" makes createML annotations of a particular image"""
category = dic["category"]
del dic["category"]
return {"label": category, "coordinates": dic}
def convert_createml(
    df: pd.DataFrame,
    root: Union[str, os.PathLike, PosixPath],
    copy_images: bool = False,
    save_under: Optional[str] = None,
    output_dir: Optional[Union[str, os.PathLike, PosixPath]] = None,
):
    """Write one CreateML ``<split>.json`` per split of the master dataframe.

    Each file is a list of ``{"image": <image_id>, "annotations": [...]}`` entries. Rows lacking
    ``image_width`` or ``image_height`` are dropped with a warning.

    Args:
        df (pd.DataFrame): the master df.
        root (Union[str, os.PathLike, PosixPath]): root directory of the source format.
        copy_images (bool, optional): Whether to also copy the images into the output tree.
            Images of a split named ``"main"`` go directly into the image folder, others into a
            sub-folder named after the split. Defaults to False.
        save_under (str, optional): Name of the folder created for the output; ``"createml"``
            is the fallback handed to ``ifnone``.
        output_dir (Optional[Union[str, os.PathLike, PosixPath]], optional): Forwarded to
            ``_makedirs`` as the destination base. Defaults to ``None``.
    """
    save_under = ifnone(save_under, "createml")
    output_imagedir, output_labeldir = _makedirs(root, save_under, output_dir)

    splits = df.split.unique().tolist()

    for split in splits:
        split_df = df.query("split == @split").copy()

        # drop images missing width or height information
        hw_missing = split_df[pd.isnull(split_df["image_width"]) | pd.isnull(split_df["image_height"])]
        if len(hw_missing) > 0:
            warnings.warn(
                f"{hw_missing['image_id'].nunique()} has height/width information missing in split `{split}`. "
                + f"{len(hw_missing)} annotations will be removed."
            )
        split_df = split_df[pd.notnull(split_df["image_width"]) & pd.notnull(split_df["image_height"])].copy()

        # Create ML documents x/y as the centre of the box, not its top-left corner.
        # NOTE(review): confirm the package's CreateML reader uses the same convention.
        split_df["x"] = split_df["x_min"] + split_df["width"] / 2
        split_df["y"] = split_df["y_min"] + split_df["height"] / 2

        # Group by the bare column name: grouping by a one-element list makes pandas >= 2 yield
        # 1-tuples as keys, which would be written to the JSON as ["name"] instead of "name".
        grouped_split_df = split_df.groupby("image_id")

        createml_data = []
        for image_id, grouped_info in tqdm(grouped_split_df, total=grouped_split_df.ngroups, desc=f"split: {split}"):
            records = grouped_info[["category", "height", "width", "y", "x"]].to_dict("records")
            createml_data.append(
                {
                    "image": image_id,
                    "annotations": [_make_createml_annotation_data(record) for record in records],
                }
            )

        write_json(createml_data, output_labeldir / f"{split}.json")

        if copy_images:
            dest_dir = output_imagedir if split == "main" else output_imagedir / split
            dest_dir.mkdir(parents=True, exist_ok=True)
            _fastcopy(split_df["image_path"].unique().tolist(), dest_dir)
def convert_pascal(
    df: pd.DataFrame,
    root: Union[str, os.PathLike, PosixPath],
    copy_images: bool = False,
    save_under: Optional[str] = None,
    output_dir: Optional[Union[str, os.PathLike, PosixPath]] = None,
):
    """Write Pascal VOC XML annotations from the master dataframe, one call to ``write_xml`` per image.

    Boxes are converted to ``x_min, y_min, x_max, y_max`` with the corner columns cast to
    ``np.int32``. When the dataframe holds more than one split, each split's annotations go into
    their own sub-folder of the annotation directory. The caller's dataframe is not modified.

    Args:
        df (pd.DataFrame): the master df.
        root (Union[str, os.PathLike, PosixPath]): root directory of the source format.
        copy_images (bool, optional): Whether to also copy the images into the output tree.
            Defaults to False.
        save_under (Optional[str], optional): Name of the folder created for the output;
            ``"pascal"`` is the fallback handed to ``ifnone``.
        output_dir (Optional[Union[str, os.PathLike, PosixPath]], optional): Forwarded to
            ``_makedirs`` as the destination base. Defaults to ``None``.
    """
    save_under = ifnone(save_under, "pascal")
    output_imagedir, output_labeldir = _makedirs(root, save_under, output_dir)

    df = copy.deepcopy(df)
    df["x_max"] = df["x_min"] + df["width"]
    df["y_max"] = df["y_min"] + df["height"]
    df.drop(["width", "height"], axis=1, inplace=True)
    for col in ("x_min", "y_min", "x_max", "y_max"):
        df[col] = df[col].astype(np.int32)

    splits = df.split.unique().tolist()

    for split in splits:
        output_subdir = output_labeldir / split if len(splits) > 1 else output_labeldir
        output_subdir.mkdir(parents=True, exist_ok=True)

        split_df = df.query("split == @split")

        # One pass over the split instead of one query per image; sort=False keeps the images in
        # order of first appearance.
        for _, image_df in split_df.groupby("image_id", sort=False):
            # Write into the split's own folder: it was created above but previously unused, so
            # every split landed in the same directory.
            write_xml(image_df, root, output_subdir)

        if copy_images:
            dest_dir = output_imagedir / split
            dest_dir.mkdir(parents=True, exist_ok=True)
            _fastcopy(split_df["image_path"].unique().tolist(), dest_dir)
def convert_tfrecord(
    df: pd.DataFrame,
    root: Union[str, os.PathLike, PosixPath],
    has_image_split: bool = False,
    copy_images: bool = False,
    save_under: Optional[str] = None,
    output_dir: Optional[Union[str, os.PathLike, PosixPath]] = None,
) -> None:
    """Write one ``<split>.tfrecord`` per split of the master dataframe, plus a label map.

    Boxes are converted to ``x_min, y_min, x_max, y_max`` with the corner columns cast to
    ``np.int32``; each image becomes one serialized example built by ``create_tf_example``.
    The caller's dataframe is not modified. TensorFlow is imported only when this function runs.

    Args:
        df (pd.DataFrame): the master df.
        root (Union[str, os.PathLike, PosixPath]): root directory of the source format.
        has_image_split (bool, optional): Not used by this function; kept so existing callers
            keep working. Defaults to False.
        copy_images (bool, optional): Whether to also copy the images into the output tree.
            Defaults to False.
        save_under (Optional[str], optional): Name of the folder created for the output;
            ``"tfrecord"`` is the fallback handed to ``ifnone``.
        output_dir (Optional[Union[str, os.PathLike, PosixPath]], optional): Base directory
            handed to ``ifnone`` with ``root`` as the fallback. Defaults to ``None``.
    """
    import tensorflow as tf

    output_dir = ifnone(output_dir, root, Path)
    save_under = ifnone(save_under, "tfrecord")
    output_dir = output_dir / save_under
    output_imagedir = output_dir / "images"
    output_dir.mkdir(parents=True, exist_ok=True)

    df = copy.deepcopy(df)
    df["x_max"] = df["x_min"] + df["width"]
    df["y_max"] = df["y_min"] + df["height"]
    df.drop(["width", "height"], axis=1, inplace=True)
    for col in ("x_min", "y_min", "x_max", "y_max"):
        df[col] = df[col].astype(np.int32)

    splits = df.split.unique().tolist()

    for split in splits:
        split_df = df.query("split == @split")
        record_path = Path(output_dir).joinpath(f"{split}.tfrecord")

        # The context manager closes the writer even if building an example raises.
        with tf.io.TFRecordWriter(str(record_path)) as writer:
            # One pass over the split instead of one query per image; sort=False keeps the
            # images in order of first appearance.
            for _, image_df in split_df.groupby("image_id", sort=False):
                tf_example = create_tf_example(image_df, root)
                writer.write(tf_example.SerializeToString())

        if copy_images:
            dest_dir = output_imagedir / split
            dest_dir.mkdir(parents=True, exist_ok=True)
            _fastcopy(split_df["image_path"].unique().tolist(), dest_dir)

    # The label map is built from the whole dataframe, so write it once rather than once per
    # split (and, as before, not at all when there are no splits).
    if splits:
        write_label_map(get_id_to_class_map(df), output_dir)
def convert_simple_json(
    df: pd.DataFrame,
    root: Union[str, os.PathLike, PosixPath],
    copy_images: bool = False,
    save_under: Optional[str] = None,
    output_dir: Optional[Union[str, os.PathLike, PosixPath]] = None,
) -> None:
    """Write one simple-json ``<split>.json`` per split of the master dataframe.

    Each file maps an image id to the list produced by ``_create_simple_json_dict`` for that
    image's rows. A ``score`` column, when present, is passed along with the box columns.

    Args:
        df (pd.DataFrame): the master df.
        root (Union[str, os.PathLike, PosixPath]): root directory of the source format.
        copy_images (bool, optional): Whether to also copy the images into the output tree.
            Defaults to False.
        save_under (str, optional): Name of the folder created for the output; ``"simple_json"``
            is the fallback handed to ``ifnone``.
        output_dir (Optional[Union[str, os.PathLike, PosixPath]], optional): Forwarded to
            ``_makedirs`` as the destination base. Defaults to ``None``.
    """
    save_under = ifnone(save_under, "simple_json")
    output_imagedir, output_labeldir = _makedirs(root, save_under, output_dir)

    for split in df.split.unique().tolist():
        split_df = df.query("split == @split").copy()

        ann_cols = ["x_min", "y_min", "width", "height", "category"]
        if "score" in split_df.columns.to_list():
            ann_cols.append("score")

        images = split_df["image_id"].unique().tolist()
        image_groups = split_df.groupby("image_id")

        simple_json_dict = {}
        for image in tqdm(images, desc=f"split: {split}"):
            records = image_groups.get_group(image)[ann_cols].to_dict("records")
            simple_json_dict[image] = _create_simple_json_dict(records)

        write_json(simple_json_dict, output_labeldir / f"{split}.json")

        if copy_images:
            dest_dir = output_imagedir / split
            dest_dir.mkdir(parents=True, exist_ok=True)
            _fastcopy(split_df["image_path"].unique().tolist(), dest_dir)
def _create_simple_json_dict(image_anns):
"""Makes a list of annotations in simple_json format."""
simple_json_anns = []
for ann in image_anns:
ann_dict = {}
ann_dict["bbox"] = [ann["x_min"], ann["y_min"], ann["width"] + ann["x_min"], ann["height"] + ann["y_min"]]
ann_dict["classname"] = ann["category"]
if "score" in ann.keys():
ann_dict["confidence"] = ann["score"]
simple_json_anns.append(ann_dict)
return simple_json_anns