Commit a9d44455 authored by Owen Corrigan's avatar Owen Corrigan
Browse files

Convert fairface annotations to ndjson

parent 9f6ce804
......@@ -11,3 +11,18 @@ stages:
md5: e98b6274966ebeca27c3f0debfba5fda.dir
size: 4240651
nfiles: 2
fairface_annotations_to_ndjson:
cmd: python3 scripts/fairface_to_ndjson.py data/fairface/raw/fairface_label_train.csv
data/fairface/raw/fairface_label_val.csv data/fairface/processed/annotations.json
deps:
- path: data/fairface/raw
md5: e98b6274966ebeca27c3f0debfba5fda.dir
size: 4240651
nfiles: 2
- path: scripts/fairface_to_ndjson.py
md5: d1be196bac8fc6d1a4aa5df21a1d66be
size: 1321
outs:
- path: data/fairface/processed/annotations.json
md5: f47ca7827cd5d7c57d7291d3a941265d
size: 19132174
......@@ -5,3 +5,11 @@ stages:
- scripts/download_fairface.sh
outs:
- data/fairface/raw/
fairface_annotations_to_ndjson:
cmd: python3 scripts/fairface_to_ndjson.py data/fairface/raw/fairface_label_train.csv
data/fairface/raw/fairface_label_val.csv data/fairface/processed/annotations.json
deps:
- data/fairface/raw
- scripts/fairface_to_ndjson.py
outs:
- data/fairface/processed/annotations.json
import argparse
import datetime
import json
import math
import pandas as pd
def median_age(s: str) -> int:
    """Collapse an age-bucket label into a single representative age.

    Args:
        s: Either a closed range written as ``"<low>-<high>"`` (for example
            ``"20-29"``) or the open-ended label ``"more than 70"``.
            Leading and trailing whitespace is ignored.

    Returns:
        The midpoint of the range, rounded up to the next integer when the
        midpoint is fractional; ``75`` for the open-ended bucket.

    Raises:
        ValueError: If the label is neither the open-ended bucket nor two
            integers separated by a single ``-``.
    """
    label = s.strip()
    if label == "more than 70":
        # The open-ended bucket has no upper bound; 75 is the fixed stand-in.
        return 75
    low, high = (int(bound) for bound in label.split("-"))
    # Integer form of ceil((low + high) / 2); avoids a float round-trip.
    return (low + high + 1) // 2
if __name__ == "__main__":
    # Command-line entry point: merge the train and val annotation CSVs into
    # one newline-delimited JSON file (one JSON object per line).
    parser = argparse.ArgumentParser(
        description="Convert annotations to a single ndjson file."
    )
    parser.add_argument(
        "train", type=argparse.FileType("r"), help="train annotation file"
    )
    parser.add_argument("val", type=argparse.FileType("r"), help="val annotation file")
    parser.add_argument("out", type=argparse.FileType("w"), help="ndjson file")
    args = parser.parse_args()

    # argparse.FileType opens all three files during parsing; the context
    # manager closes them (and flushes the output) even if conversion fails.
    with args.train, args.val, args.out:
        train = pd.read_csv(args.train)
        val = pd.read_csv(args.val)

        train["legacy_splits"] = "train"
        # rename val to test for consistency
        val["legacy_splits"] = "test"

        # combine into single dataframe
        df = (
            pd.concat([train, val])
            .drop(["service_test"], axis=1)
            .reset_index(drop=True)
        )

        # Every record gets the same fixed image dimensions.
        df["image_width"] = 224
        df["image_height"] = 224

        # One UTC timestamp shared by all records. The tzinfo is dropped before
        # formatting so the text stays "YYYY-MM-DD HH:MM:SS.mmm" with no
        # "+00:00" suffix, matching what utcnow() (deprecated) used to emit.
        ingested_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
        df["ingestion_timestamp"] = ingested_at.isoformat(
            sep=" ", timespec="milliseconds"
        )

        # Replace each age-bucket label with a single integer age.
        df["age"] = df["age"].apply(median_age)

        records = df.to_dict(orient="records")
        args.out.writelines(json.dumps(record) + "\n" for record in records)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment