Coco Split App

The pyodi coco split app can be used to split a COCO annotations file into train and val annotation files.

There are two modes: 'random' and 'property'. The 'random' mode splits the COCO file at random, while the 'property' mode lets you customize the split based on properties of the images in the COCO annotations file.

Example usage:

pyodi coco random-split ./coco.json ./random_coco_split --val-percentage 0.1
pyodi coco property-split ./coco.json ./property_coco_split ./split_config.json

The split config file is a JSON file with two keys, 'discard' and 'val', both mapping to dictionaries. Each dictionary key is an image property to match, and each value is either a regex string to match against that property or, for human readability, a nested dictionary whose keys are arbitrary labels and whose values are the regex strings.

Split config example:

{
    "discard": {
        "file_name": "people_video|crowd_video|multiple_people_video",
        "source": "Youtube People Dataset|Bad Dataset"
    },
    "val": {
        "file_name": {
            "My Val Ground Vehicle Dataset": "val_car_video|val_bus_video|val_moto_video|val_bike_video",
            "My Val Flying Vehicle Dataset": "val_plane_video|val_drone_video|val_helicopter_video"
        },
        "source": "Val Dataset"
    }
}
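
Internally (see property_split below), each nested dictionary is collapsed by joining its regex values with '|', and every image property is tested with re.match, which anchors at the start of the string. A minimal sketch of the matching behavior, using hypothetical file names:

import re

# The "human readable" nested form from the config above...
val_file_names = {
    "My Val Ground Vehicle Dataset": "val_car_video|val_bus_video|val_moto_video|val_bike_video",
    "My Val Flying Vehicle Dataset": "val_plane_video|val_drone_video|val_helicopter_video",
}

# ...is flattened into a single alternation before matching.
pattern = "|".join(val_file_names.values())

# re.match anchors at the beginning of the string, so the property
# value must start with one of the alternatives.
print(bool(re.match(pattern, "val_drone_video_0001.mp4")))   # True  -> image goes to val
print(bool(re.match(pattern, "train_car_video_0001.mp4")))   # False -> image stays in train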

API REFERENCE

property_split(annotations_file, output_filename, split_config_file)

Split the annotations file into training and validation subsets by properties.

Parameters:

Name                Type   Description                   Default
annotations_file    str    Path to annotations file.     required
output_filename     str    Output filename.              required
split_config_file   str    Path to configuration file.   required

Returns:

Type        Description
List[str]   Output filenames.

Source code in pyodi/apps/coco/coco_split.py

@logger.catch(reraise=True)  # noqa: C901
def property_split(
    annotations_file: str, output_filename: str, split_config_file: str,
) -> List[str]:
    """Split the annotations file in training and validation subsets by properties.

    Args:
        annotations_file: Path to annotations file.
        output_filename: Output filename.
        split_config_file: Path to configuration file.

    Returns:
        Output filenames.

    """
    logger.info("Loading files...")
    with open(split_config_file) as f:
        split_config = json.load(f)
    split_list = []

    # Transform split_config from human readable format to a more code efficient format
    for section in split_config:  # sections: val / discard
        for property_name, property_value in split_config[section].items():
            if isinstance(property_value, dict):
                property_value = "|".join(property_value.values())
            split_list.append(
                dict(
                    split=section,
                    property_name=property_name,
                    property_regex=property_value,
                )
            )

    with open(annotations_file) as f:
        data = json.load(f)

    train_images, val_images = [], []
    train_annotations, val_annotations = [], []

    n_train_imgs, n_val_imgs = 0, 0
    n_train_anns, n_val_anns = 0, 0

    old_to_new_train_ids = dict()
    old_to_new_val_ids = dict()

    logger.info("Gathering images...")
    for img in data["images"]:

        i = 0
        while i < len(split_list) and not re.match(
            split_list[i]["property_regex"], img[split_list[i]["property_name"]]
        ):
            i += 1

        if i < len(split_list):  # discard or val
            if split_list[i]["split"] == "val":
                old_to_new_val_ids[img["id"]] = n_val_imgs
                img["id"] = n_val_imgs
                val_images.append(img)
                n_val_imgs += 1
        else:  # train
            old_to_new_train_ids[img["id"]] = n_train_imgs
            img["id"] = n_train_imgs
            train_images.append(img)
            n_train_imgs += 1

    logger.info("Gathering annotations...")
    for ann in data["annotations"]:

        if ann["image_id"] in old_to_new_val_ids:
            ann["image_id"] = old_to_new_val_ids[ann["image_id"]]
            ann["id"] = n_val_anns
            val_annotations.append(ann)
            n_val_anns += 1
        elif ann["image_id"] in old_to_new_train_ids:
            ann["image_id"] = old_to_new_train_ids[ann["image_id"]]
            ann["id"] = n_train_anns
            train_annotations.append(ann)
            n_train_anns += 1

    logger.info("Spliting data...")
    train_split = {
        "images": train_images,
        "annotations": train_annotations,
        "info": data.get("info", {}),
        "licenses": data.get("licenses", []),
        "categories": data["categories"],
    }
    val_split = {
        "images": val_images,
        "annotations": val_annotations,
        "info": data.get("info", {}),
        "licenses": data.get("licenses", []),
        "categories": data["categories"],
    }

    logger.info("Writing splited files...")
    output_files = []
    for split_type, split in zip(["train", "val"], [train_split, val_split]):
        output_files.append(output_filename + f"_{split_type}.json")
        with open(output_files[-1], "w") as f:
            json.dump(split, f, indent=2)

    return output_files
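
A minimal usage sketch from Python, mirroring the CLI example above (the paths are placeholders):

from pyodi.apps.coco.coco_split import property_split

# Writes <output_filename>_train.json and <output_filename>_val.json.
output_files = property_split(
    annotations_file="./coco.json",
    output_filename="./property_coco_split",
    split_config_file="./split_config.json",
)
print(output_files)  # ['./property_coco_split_train.json', './property_coco_split_val.json']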

random_split(annotations_file, output_filename, val_percentage=0.25, seed=47)

Split the annotations file into training and validation subsets randomly.

Parameters:

Name               Type    Description                                      Default
annotations_file   str     Path to annotations file.                        required
output_filename    str     Output filename.                                 required
val_percentage     float   Fraction of images assigned to the validation    0.25
                           set (each image is sampled independently, so
                           the actual fraction may vary). Defaults to 0.25.
seed               int     Seed for the random generator. Defaults to 47.   47

Returns:

Type        Description
List[str]   Output filenames.

Source code in pyodi/apps/coco/coco_split.py

@logger.catch(reraise=True)
def random_split(
    annotations_file: str,
    output_filename: str,
    val_percentage: float = 0.25,
    seed: int = 47,
) -> List[str]:
    """Split the annotations file in training and validation subsets randomly.

    Args:
        annotations_file: Path to annotations file.
        output_filename: Output filename.
        val_percentage: Percentage of validation images. Defaults to 0.25.
        seed: Seed for the random generator. Defaults to 47.

    Returns:
        Output filenames.

    """
    with open(annotations_file) as f:
        data = json.load(f)
    train_images, val_images, val_ids = [], [], []

    np.random.seed(seed)
    rand_values = np.random.rand(len(data["images"]))

    logger.info("Gathering images...")
    for i, image in enumerate(data["images"]):

        if rand_values[i] < val_percentage:
            val_images.append(copy(image))
            val_ids.append(image["id"])
        else:
            train_images.append(copy(image))

    train_annotations, val_annotations = [], []

    logger.info("Gathering annotations...")
    for annotation in data["annotations"]:

        if annotation["image_id"] in val_ids:
            val_annotations.append(copy(annotation))
        else:
            train_annotations.append(copy(annotation))

    train_split = {
        "images": train_images,
        "annotations": train_annotations,
        "info": data.get("info", {}),
        "licenses": data.get("licenses", []),
        "categories": data["categories"],
    }

    val_split = {
        "images": val_images,
        "annotations": val_annotations,
        "info": data.get("info", {}),
        "licenses": data.get("licenses", []),
        "categories": data["categories"],
    }

    logger.info("Saving splits to file...")
    output_files = []
    for split_type, split in zip(["train", "val"], [train_split, val_split]):
        output_files.append(output_filename + f"_{split_type}.json")
        with open(output_files[-1], "w") as f:
            json.dump(split, f, indent=2)

    return output_files
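
And the equivalent Python call for the random split (paths are placeholders; roughly val_percentage of the images end up in the validation set, since each image is assigned independently):

from pyodi.apps.coco.coco_split import random_split

output_files = random_split(
    annotations_file="./coco.json",
    output_filename="./random_coco_split",
    val_percentage=0.1,
    seed=47,
)
print(output_files)  # ['./random_coco_split_train.json', './random_coco_split_val.json']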