support downloading a portion of data.

qfgaohao · Nov 1, 2018 · e0d145c · e0d145c
1 parent fe5c7c4
commit e0d145c
Showing 1 changed file with 23 additions and 5 deletions.
diff --git a/open_images_downloader.py b/open_images_downloader.py
@@ -80,6 +80,8 @@ def parse_args():
                         help="retry times when downloading.")
     parser.add_argument("--filter_file", type=str, default="",
                         help="This file specifies the image ids you want to exclude.")
+    parser.add_argument('--remove_overlapped', action='store_true',
+                        help="Remove single boxes covered by group boxes.")
     return parser.parse_args()
 
 
@@ -92,13 +94,18 @@ def parse_args():
     names = [e.strip() for e in args.class_names.split(",")]
     class_names = []
     group_filters = []
+    percentages = []
     for name in names:
         t = name.split(":")
         class_names.append(t[0].strip())
-        if len(t) >= 2:
+        if len(t) >= 2 and t[1].strip():
             group_filters.append(t[1].strip())
         else:
             group_filters.append("")
+        if len(t) >= 3 and t[2].strip():
+            percentages.append(float(t[2].strip()))
+        else:
+            percentages.append(1.0)
 
     if not os.path.exists(args.root):
         os.makedirs(args.root)
@@ -138,25 +145,36 @@ def parse_args():
                                how="inner")
         if not args.include_depiction:
             annotations = annotations.loc[annotations['IsDepiction'] != 1, :]
-        annotations = annotations.loc[~annotations['ImageID'].isin(excluded_images), :]
 
         # TODO MAKE IT MORE EFFICIENT
         #filter by IsGroupOf
         filtered = []
-        for class_name, group_filter in zip(class_names, group_filters):
+        for class_name, group_filter, percentage in zip(class_names, group_filters, percentages):
             sub = annotations.loc[annotations['ClassName'] == class_name, :]
             if group_filter == "group":
                 sub = sub.loc[sub['IsGroupOf'] == 1, :]
             elif group_filter == '~group':
                 sub = sub.loc[sub['IsGroupOf'] == 0, :]
+            excluded_images |= set(sub['ImageID'].sample(frac=1 - percentage))
             filtered.append(sub)
+
         annotations = pd.concat(filtered)
+        annotations = annotations.loc[~annotations['ImageID'].isin(excluded_images), :]
+
 
-        logging.warning(f"{dataset_type} data size: {annotations.shape[0]}")
+        if args.remove_overlapped:
+            images_with_group = annotations.loc[annotations['IsGroupOf'] == 1, 'ImageID']
+            annotations = annotations.loc[~(annotations['ImageID'].isin(set(images_with_group)) & (annotations['IsGroupOf'] == 0)), :]
+        annotations = annotations.sample(frac=1.0)
+
+        logging.warning(f"{dataset_type} bounding boxes size: {annotations.shape[0]}")
+        logging.warning("Approximate Image Stats: ")
+        log_counts(annotations.drop_duplicates(["ImageID", "ClassName"])["ClassName"])
+        logging.warning("Label distribution: ")
         log_counts(annotations['ClassName'])
 
         logging.warning(f"Shuffle dataset.")
-        annotations = annotations.sample(frac=1.0)
+
 
         sub_annotation_file = f"{args.root}/sub-{dataset_type}-annotations-bbox.csv"
         logging.warning(f"Save {dataset_type} data to {sub_annotation_file}.")