Example outputs
API Reference
explore_ground_truth_detection(ground_truth_file, output_folder, categories='categories', category_id='category_id')
Explore dataset in detection format.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
ground_truth_file |
str |
path to file in detection format. |
required |
output_folder |
str |
plotting results will be generated in this folder. If it does not exist, it will be created. |
required |
categories |
str |
name of the top level key holding information about the list of categories. |
'categories' |
category_id |
str |
name of the annotation level key holding information about the category index. |
'category_id' |
Source code in src/stages/data/explore/explore_ground_truth_detection.py
def explore_ground_truth_detection(
    ground_truth_file: str,
    output_folder: str,
    categories: str = "categories",
    category_id: str = "category_id",
) -> None:
    """Explore dataset in [detection](https://gradiant.github.io/ai-dataset-template/supported_tasks/#detection) format.

    Reads the ground truth JSON file and writes the following into
    `output_folder`:

    - `<ground_truth_file stem>_images.csv`: one row per image.
    - `<ground_truth_file stem>_annotations.csv`: one row per annotation,
      extended with the columns `category`, `file_name`, `width`, `height`,
      `abs_width` and `abs_height`.
    - `image_shape_distribution.html`: scatter plot of image width vs height.
    - `areas_per_category.html`: summed annotation `area` per category.
    - `count_per_category.html`: number of annotations per category.
    - `bounding_box_shape_distribution.html`: scatter plot of the bounding
      box sizes, colored by category.

    The file is expected to hold an `images` list (entries with `id`,
    `width`, `height` and `file_name`), an `annotations` list (entries with
    `image_id`, `bbox`, `area` and the `category_id` key) and a list of
    categories (entries with `id` and `name`) under the `categories` key.

    Args:
        ground_truth_file:
            path to file in [detection](https://gradiant.github.io/ai-dataset-template/supported_tasks/#detection) format.
        output_folder:
            plotting results will be generated in this folder.
            If it does not exist, it will be created.
        categories:
            name of the top level key holding information about the list of categories.
        category_id:
            name of the annotation level key holding information about the category index.

    Raises:
        KeyError: if an annotation references an image id or category id
            that is not declared in the file, or a required key is missing.
    """
    import pandas as pd
    from plotly import express as px

    Path(output_folder).mkdir(parents=True, exist_ok=True)

    # Use a context manager so the file handle is closed deterministically,
    # and decode explicitly as UTF-8 (the JSON interchange encoding) instead
    # of relying on the platform's locale default.
    with open(ground_truth_file, encoding="utf-8") as ground_truth_stream:
        ground_truth = json.load(ground_truth_stream)

    # Lookup tables used to resolve the ids referenced by each annotation.
    category_id_to_name = {
        category["id"]: category["name"] for category in ground_truth[categories]
    }
    image_id_to_shape = {
        image["id"]: (image["width"], image["height"])
        for image in ground_truth["images"]
    }
    image_id_to_file_name = {
        image["id"]: image["file_name"] for image in ground_truth["images"]
    }

    images_df = pd.DataFrame(ground_truth["images"])
    images_df.to_csv(
        Path(output_folder) / f"{Path(ground_truth_file).stem}_images.csv",
        index=False,
    )
    px.scatter(images_df, x="width", y="height").write_html(
        f"{output_folder}/image_shape_distribution.html"
    )

    # Derived per-annotation columns, collected in annotation order so they
    # line up row by row with the DataFrame built from the raw annotations.
    annotations_dict = defaultdict(list)
    for annotation in ground_truth["annotations"]:
        image_width, image_height = image_id_to_shape[annotation["image_id"]]
        annotations_dict["category"].append(
            category_id_to_name[annotation[category_id]]
        )
        annotations_dict["file_name"].append(
            image_id_to_file_name[annotation["image_id"]]
        )
        # bbox[2] and bbox[3] are used as the box width and height.
        annotations_dict["width"].append(annotation["bbox"][2])
        annotations_dict["height"].append(annotation["bbox"][3])
        # Despite the "abs_" prefix, these are the box sizes divided by the
        # size of the image they belong to.
        annotations_dict["abs_width"].append(annotation["bbox"][2] / image_width)
        annotations_dict["abs_height"].append(annotation["bbox"][3] / image_height)

    annotations_df = pd.DataFrame(ground_truth["annotations"])
    for column_name, column_values in annotations_dict.items():
        annotations_df[column_name] = column_values
    annotations_df.to_csv(
        Path(output_folder) / f"{Path(ground_truth_file).stem}_annotations.csv",
        index=False,
    )

    px.histogram(annotations_df, x="category", y="area", histfunc="sum").write_html(
        f"{output_folder}/areas_per_category.html"
    )
    px.histogram(annotations_df, x="category").write_html(
        f"{output_folder}/count_per_category.html"
    )
    px.scatter(
        annotations_df,
        x="abs_width",
        y="abs_height",
        color="category",
        hover_data=["file_name", "width", "height"],
    ).write_html(f"{output_folder}/bounding_box_shape_distribution.html")