split

Coco Split App.

The pyodi coco split app can be used to split a COCO annotations file into train and val annotation files.

There are two modes: 'random' and 'property'. The 'random' mode randomly splits the COCO file, while the 'property' mode lets you customize the split based on the properties of the images in the COCO annotations file.

Example usage:

pyodi coco random-split ./coco.json ./random_coco_split --val-percentage 0.1

pyodi coco property-split ./coco.json ./property_coco_split ./split_config.json

The split config file is a JSON file with two keys, 'discard' and 'val', each holding a dictionary. The keys of those dictionaries are the image properties that we want to match, and each value is either the regex string to match or, for human readability, a dictionary whose keys are free-form labels (you can choose whatever you want) and whose values are the regex strings.

Split config example:

{
    "discard": {
        "file_name": "people_video|crowd_video|multiple_people_video",
        "source": "Youtube People Dataset|Bad Dataset"
    },
    "val": {
        "file_name": {
            "My Val Ground Vehicle Dataset": "val_car_video|val_bus_video|val_moto_video|val_bike_video",
            "My Val Flying Vehicle Dataset": "val_plane_video|val_drone_video|val_helicopter_video"
        },
        "source": "Val Dataset"
    }
}

API REFERENCE

`property_split(annotations_file, output_filename, split_config_file)`

Split the annotations file in training and validation subsets by properties.

Parameters:

Name	Type	Description	Default
`annotations_file`	`str`	Path to annotations file.	required
`output_filename`	`str`	Output filename.	required
`split_config_file`	`str`	Path to configuration file.	required

Returns:

Type	Description
`List[str]`	Output filenames.

Source code in pyodi/apps/coco/coco_split.py

@logger.catch(reraise=True)  # noqa: C901
def property_split(
    annotations_file: str, output_filename: str, split_config_file: str,
) -> List[str]:
    """Split the annotations file in training and validation subsets by properties.

    Each image is tested against the rules of the split config in the order in
    which they appear in the config file. The first rule whose regex matches
    the image property (``re.match``, i.e. anchored at the start of the value)
    decides what happens to the image: a rule from the ``val`` section sends it
    to the validation subset, a rule from any other section (e.g. ``discard``)
    drops it. Images that match no rule go to the training subset.

    Image ids and annotation ids are renumbered from 0 independently inside
    each subset. Annotations that belong to a discarded image are dropped.

    Args:
        annotations_file: Path to annotations file.
        output_filename: Output filename prefix; ``_train.json`` and
            ``_val.json`` are appended to it.
        split_config_file: Path to configuration file.

    Returns:
        Output filenames, train first and val second.

    Raises:
        KeyError: If an image lacks a property used by a rule that is
            evaluated for it.

    """
    logger.info("Loading files...")
    with open(Path(split_config_file)) as f:
        split_config = json.load(f)

    # Flatten the human readable config into an ordered list of
    # (section, property name, compiled regex) rules. Compiling here keeps the
    # regex work out of the per-image loop and reports bad patterns early.
    rules = []
    for section, properties in split_config.items():  # sections: val / discard
        for property_name, property_value in properties.items():
            if isinstance(property_value, dict):
                # Labelled regexes are only for readability: merge them.
                property_value = "|".join(property_value.values())
            rules.append((section, property_name, re.compile(property_value)))

    with open(annotations_file) as f:
        data = json.load(f)

    train_images, val_images = [], []
    train_annotations, val_annotations = [], []

    # Map original image ids to the renumbered ids of each subset; discarded
    # images appear in neither mapping.
    old_to_new_train_ids = dict()
    old_to_new_val_ids = dict()

    logger.info("Gathering images...")
    for img in data["images"]:
        # Section of the first matching rule, or None when nothing matches.
        matched_section = next(
            (
                section
                for section, property_name, regex in rules
                if regex.match(img[property_name])
            ),
            None,
        )

        if matched_section is None:  # train
            new_id = len(train_images)
            old_to_new_train_ids[img["id"]] = new_id
            img["id"] = new_id
            train_images.append(img)
        elif matched_section == "val":
            new_id = len(val_images)
            old_to_new_val_ids[img["id"]] = new_id
            img["id"] = new_id
            val_images.append(img)
        # Any other section (e.g. "discard") drops the image.

    logger.info("Gathering annotations...")
    for ann in data["annotations"]:
        image_id = ann["image_id"]
        if image_id in old_to_new_val_ids:
            ann["image_id"] = old_to_new_val_ids[image_id]
            ann["id"] = len(val_annotations)
            val_annotations.append(ann)
        elif image_id in old_to_new_train_ids:
            ann["image_id"] = old_to_new_train_ids[image_id]
            ann["id"] = len(train_annotations)
            train_annotations.append(ann)

    logger.info("Spliting data...")
    train_split = {
        "images": train_images,
        "annotations": train_annotations,
        "info": data.get("info", {}),
        "licenses": data.get("licenses", []),
        "categories": data["categories"],
    }
    val_split = {
        "images": val_images,
        "annotations": val_annotations,
        "info": data.get("info", {}),
        "licenses": data.get("licenses", []),
        "categories": data["categories"],
    }

    logger.info("Writing splited files...")
    output_files = []
    for split_type, split in zip(["train", "val"], [train_split, val_split]):
        output_files.append(output_filename + f"_{split_type}.json")
        with open(output_files[-1], "w") as f:
            json.dump(split, f, indent=2)

    return output_files

`random_split(annotations_file, output_filename, val_percentage=0.25, seed=47)`

Split the annotations file in training and validation subsets randomly.

Parameters:

Name	Type	Description	Default
`annotations_file`	`str`	Path to annotations file.	required
`output_filename`	`str`	Output filename.	required
`val_percentage`	`float`	Fraction of images (between 0 and 1) assigned to validation. Defaults to 0.25.	`0.25`
`seed`	`int`	Seed for the random generator. Defaults to 47.	`47`

Returns:

Type	Description
`List[str]`	Output filenames.

Source code in pyodi/apps/coco/coco_split.py

@logger.catch(reraise=True)
def random_split(
    annotations_file: str,
    output_filename: str,
    val_percentage: float = 0.25,
    seed: int = 47,
) -> List[str]:
    """Split the annotations file in training and validation subsets randomly.

    A uniform random value in [0, 1) is drawn for every image; the image goes
    to the validation subset when its value is below ``val_percentage`` and to
    the training subset otherwise, so the resulting proportion is approximate.
    Annotations follow their image; annotations whose ``image_id`` is not a
    validation image are written to the training subset.

    Note that this seeds NumPy's global random generator with ``seed``.

    Args:
        annotations_file: Path to annotations file.
        output_filename: Output filename prefix; ``_train.json`` and
            ``_val.json`` are appended to it.
        val_percentage: Fraction (between 0 and 1) of validation images.
            Defaults to 0.25.
        seed: Seed for the random generator. Defaults to 47.

    Returns:
        Output filenames, train first and val second.

    """
    with open(annotations_file) as f:
        data = json.load(f)

    train_images, val_images = [], []
    # A set, because it is probed once per annotation further down.
    val_ids = set()

    np.random.seed(seed)
    rand_values = np.random.rand(len(data["images"]))

    logger.info("Gathering images...")
    for image, rand_value in zip(data["images"], rand_values):
        if rand_value < val_percentage:
            val_images.append(copy(image))
            val_ids.add(image["id"])
        else:
            train_images.append(copy(image))

    train_annotations, val_annotations = [], []

    logger.info("Gathering annotations...")
    for annotation in data["annotations"]:
        if annotation["image_id"] in val_ids:
            val_annotations.append(copy(annotation))
        else:
            train_annotations.append(copy(annotation))

    train_split = {
        "images": train_images,
        "annotations": train_annotations,
        "info": data.get("info", {}),
        "licenses": data.get("licenses", []),
        "categories": data["categories"],
    }

    val_split = {
        "images": val_images,
        "annotations": val_annotations,
        "info": data.get("info", {}),
        "licenses": data.get("licenses", []),
        "categories": data["categories"],
    }

    logger.info("Saving splits to file...")
    output_files = []
    for split_type, split in zip(["train", "val"], [train_split, val_split]):
        output_files.append(output_filename + f"_{split_type}.json")
        with open(output_files[-1], "w") as f:
            json.dump(split, f, indent=2)

    return output_files