Example outputs

API Reference

`explore_ground_truth_detection(ground_truth_file, output_folder, categories='categories', category_id='category_id')`

Explore dataset in detection format.

Parameters:

Name	Type	Description	Default
`ground_truth_file`	`str`	path to file in detection format.	required
`output_folder`	`str`	plotting results will be generated in this folder. If it does not exist, it will be created.	required
`categories`	`str`	name of the top level key holding information about the list of categories.	`'categories'`
`category_id`	`str`	name of the annotation level key holding information about the category index.	`'category_id'`

Source code in src/stages/data/explore/explore_ground_truth_detection.py

def explore_ground_truth_detection(
    ground_truth_file: str,
    output_folder: str,
    categories: str = "categories",
    category_id: str = "category_id",
) -> None:
    """Explore dataset in [detection](https://gradiant.github.io/ai-dataset-template/supported_tasks/#detection) format.

    The following files are written to `output_folder`
    (`<stem>` is the name of `ground_truth_file` without its extension):

    - `<stem>_images.csv`: one row per entry of the `images` list.
    - `<stem>_annotations.csv`: one row per entry of the `annotations` list,
      extended with the columns `category`, `file_name`, `width`, `height`,
      `abs_width` and `abs_height`.
    - `image_shape_distribution.html`: scatter plot of image width vs. height.
    - `areas_per_category.html`: sum of the annotation `area` per category.
    - `count_per_category.html`: number of annotations per category.
    - `bounding_box_shape_distribution.html`: scatter plot of the bounding box
      width vs. height, both divided by the size of the containing image.

    Args:
        ground_truth_file:
            path to file in [detection](https://gradiant.github.io/ai-dataset-template/supported_tasks/#detection) format.
        output_folder:
            plotting results will be generated in this folder.
            If it does not exist, it will be created.
        categories:
            name of the top level key holding information about the list of categories.
        category_id:
            name of the annotation level key holding information about the category index.

    Raises:
        KeyError: if an annotation refers to an image id or a category id
            that is not listed in the ground truth file, or if one of the
            keys read by this function is missing.
    """
    import pandas as pd
    from plotly import express as px

    # Every output goes through the same `Path` so that the CSV files and the
    # HTML plots always end up in the same directory.
    output_dir = Path(output_folder)
    output_dir.mkdir(parents=True, exist_ok=True)
    stem = Path(ground_truth_file).stem

    # Use a context manager so the file handle is closed right after parsing.
    with open(ground_truth_file) as ground_truth_fp:
        ground_truth = json.load(ground_truth_fp)

    # Lookup tables used to enrich each annotation below.
    category_id_to_name = {
        category["id"]: category["name"] for category in ground_truth[categories]
    }
    image_id_to_shape = {
        image["id"]: (image["width"], image["height"])
        for image in ground_truth["images"]
    }
    image_id_to_file_name = {
        image["id"]: image["file_name"] for image in ground_truth["images"]
    }

    images_df = pd.DataFrame(ground_truth["images"])
    images_df.to_csv(output_dir / f"{stem}_images.csv", index=False)

    px.scatter(images_df, x="width", y="height").write_html(
        str(output_dir / "image_shape_distribution.html")
    )

    # Derived per-annotation columns, collected in annotation order so they
    # line up row by row with the DataFrame built from the raw annotations.
    annotations_dict = defaultdict(list)
    for annotation in ground_truth["annotations"]:
        image_id = annotation["image_id"]
        image_width, image_height = image_id_to_shape[image_id]
        # bbox[2] and bbox[3] are used as the box width and height.
        box_width, box_height = annotation["bbox"][2], annotation["bbox"][3]

        annotations_dict["category"].append(
            category_id_to_name[annotation[category_id]]
        )
        annotations_dict["file_name"].append(image_id_to_file_name[image_id])
        annotations_dict["width"].append(box_width)
        annotations_dict["height"].append(box_height)
        # NOTE: despite the "abs_" prefix, these hold the box size divided by
        # the image size. The column names are kept because they are part of
        # the generated CSV.
        annotations_dict["abs_width"].append(box_width / image_width)
        annotations_dict["abs_height"].append(box_height / image_height)

    annotations_df = pd.DataFrame(ground_truth["annotations"])
    for column_name, column_values in annotations_dict.items():
        annotations_df[column_name] = column_values

    annotations_df.to_csv(output_dir / f"{stem}_annotations.csv", index=False)

    # The "area" column is not computed here: it comes straight from the
    # "area" key of each annotation.
    px.histogram(annotations_df, x="category", y="area", histfunc="sum").write_html(
        str(output_dir / "areas_per_category.html")
    )

    px.histogram(annotations_df, x="category").write_html(
        str(output_dir / "count_per_category.html")
    )

    px.scatter(
        annotations_df,
        x="abs_width",
        y="abs_height",
        color="category",
        hover_data=["file_name", "width", "height"],
    ).write_html(str(output_dir / "bounding_box_shape_distribution.html"))