Skip to content

Commit

Permalink
support downloading a portion of data.
Browse files Browse the repository at this point in the history
  • Loading branch information
qfgaohao committed Nov 1, 2018
1 parent fe5c7c4 commit e0d145c
Showing 1 changed file with 23 additions and 5 deletions.
28 changes: 23 additions & 5 deletions open_images_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ def parse_args():
help="retry times when downloading.")
parser.add_argument("--filter_file", type=str, default="",
help="This file specifies the image ids you want to exclude.")
parser.add_argument('--remove_overlapped', action='store_true',
help="Remove single boxes covered by group boxes.")
return parser.parse_args()


Expand All @@ -92,13 +94,18 @@ def parse_args():
names = [e.strip() for e in args.class_names.split(",")]
class_names = []
group_filters = []
percentages = []
for name in names:
t = name.split(":")
class_names.append(t[0].strip())
if len(t) >= 2:
if len(t) >= 2 and t[1].strip():
group_filters.append(t[1].strip())
else:
group_filters.append("")
if len(t) >= 3 and t[2].strip():
percentages.append(float(t[2].strip()))
else:
percentages.append(1.0)

if not os.path.exists(args.root):
os.makedirs(args.root)
Expand Down Expand Up @@ -138,25 +145,36 @@ def parse_args():
how="inner")
if not args.include_depiction:
annotations = annotations.loc[annotations['IsDepiction'] != 1, :]
annotations = annotations.loc[~annotations['ImageID'].isin(excluded_images), :]

# TODO MAKE IT MORE EFFICIENT
#filter by IsGroupOf
filtered = []
for class_name, group_filter in zip(class_names, group_filters):
for class_name, group_filter, percentage in zip(class_names, group_filters, percentages):
sub = annotations.loc[annotations['ClassName'] == class_name, :]
if group_filter == "group":
sub = sub.loc[sub['IsGroupOf'] == 1, :]
elif group_filter == '~group':
sub = sub.loc[sub['IsGroupOf'] == 0, :]
excluded_images |= set(sub['ImageID'].sample(frac=1 - percentage))
filtered.append(sub)

annotations = pd.concat(filtered)
annotations = annotations.loc[~annotations['ImageID'].isin(excluded_images), :]


logging.warning(f"{dataset_type} data size: {annotations.shape[0]}")
if args.remove_overlapped:
images_with_group = annotations.loc[annotations['IsGroupOf'] == 1, 'ImageID']
annotations = annotations.loc[~(annotations['ImageID'].isin(set(images_with_group)) & (annotations['IsGroupOf'] == 0)), :]
annotations = annotations.sample(frac=1.0)

logging.warning(f"{dataset_type} bounding boxes size: {annotations.shape[0]}")
logging.warning("Approximate Image Stats: ")
log_counts(annotations.drop_duplicates(["ImageID", "ClassName"])["ClassName"])
logging.warning("Label distribution: ")
log_counts(annotations['ClassName'])

logging.warning(f"Shuffle dataset.")
annotations = annotations.sample(frac=1.0)


sub_annotation_file = f"{args.root}/sub-{dataset_type}-annotations-bbox.csv"
logging.warning(f"Save {dataset_type} data to {sub_annotation_file}.")
Expand Down

0 comments on commit e0d145c

Please sign in to comment.