Skip to content

property_split

property_split(annotations_file, output_matched_file, property_name, pattern_to_match, output_unmatched_file=None)

Split based on matching each image's property_name with pattern_to_match.

Parameters:

Name Type Description Default
annotations_file str

File in COCO Classification/Detection/Segmentation format.

required
output_matched_file str

File where split of matched images will be saved.

required
property_name str

name of the string property that must be present in all items of images of annotation_file.

required
pattern_to_match str

Regex pattern to match property_name

required
output_unmatched_file Optional[str]

If not None, split of unmatched images will be saved to this path.

None
Source code in src/stages/data/transform/property_split.py
def property_split(
    annotations_file: str,
    output_matched_file: str,
    property_name: str,
    pattern_to_match: str,
    output_unmatched_file: Optional[str] = None,
) -> None:
    """Split based on matching each image's `property_name` with `pattern_to_match`.

    Args:
        annotations_file:
            File in COCO Classification/Detection/Segmentation format.
        output_matched_file:
            File where split of matched images will be saved.
        property_name:
            name of the string property that must be present in all items of `images`
            of `annotation_file`.
        pattern_to_match:
            Regex pattern to match `property_name`
        output_unmatched_file:
            If not None, split of unmatched images will be saved to this path.

    """
    Path(output_matched_file).parent.mkdir(parents=True, exist_ok=True)

    logger.info(f"Loading annotations from {annotations_file} ...")
    with open(annotations_file) as anns:
        data = json.load(anns)

    output_split = {k: v for k, v in data.items() if k not in ("images", "annotations")}

    logger.info("Processing images ...")
    matched_images = []
    unmatched_images = []
    for image in data["images"]:
        if re.match(pattern_to_match, image[property_name]):
            matched_images.append(image)
        else:
            unmatched_images.append(image)
    output_split["images"] = matched_images

    if "annotations" in data.keys():
        logger.info("Processing annotations ...")
        matched_image_ids = set(x["id"] for x in output_split["images"])
        matched_annotations = []
        unmatched_annotations = []
        for annotation in data["annotations"]:
            if annotation["image_id"] in matched_image_ids:
                matched_annotations.append(annotation)
            else:
                unmatched_annotations.append(annotation)

        output_split["annotations"] = matched_annotations

    logger.info(f"Writing {output_matched_file}...")
    with open(output_matched_file, "w") as f:
        json.dump(output_split, f, indent=2)
    logger.info("Done!")

    if output_unmatched_file is not None:
        output_split["images"] = unmatched_images
        if "annotations" in data.keys():
            output_split["annotations"] = unmatched_annotations
        logger.info(f"Writing {output_unmatched_file}...")
        with open(output_unmatched_file, "w") as f:
            json.dump(output_split, f, indent=2)
        logger.info("Done!")