
Commit 6305421

Merged commit includes the following changes:
-- 195269567 by Zhichao Lu: Remove image summaries during train mode.
-- 195147413 by Zhichao Lu: SSDLite config for mobilenet v2.
-- 194883585 by Zhichao Lu: Simplify TPU-compatible nearest neighbor upsampling using reshape and broadcasting (see the sketch after this list).
-- 194851009 by Zhichao Lu: Include AVA v2.1 detection models in the model zoo.
-- 194292198 by Zhichao Lu: Add option to evaluate any checkpoint (without requiring write access to its directory or overwriting any existing logs there).
-- 194122420 by Zhichao Lu: Fix incorrect num_gt_boxes_per_image and num_det_boxes_per_image values; they should not be the expanded dim.
-- 193974479 by Zhichao Lu: Fix a bug in the coco evaluator.
-- 193959861 by Zhichao Lu: Read the default batch size from the config file.
-- 193737238 by Zhichao Lu: Fix data augmentation functions.
-- 193576336 by Zhichao Lu: Add support for training keypoints.
-- 193409179 by Zhichao Lu: Update protobuf requirements to 3+ in installation docs.
-- 193382651 by Zhichao Lu: Update coco evaluation metrics to allow a batch of image info rather than a single image.
-- 193244778 by Zhichao Lu: Remove deprecated batch_norm_trainable field from the ssd mobilenet v2 config.
-- 193228972 by Zhichao Lu: Make sure the final layers are also resized proportionally to conv_depth_ratio.
-- 193204364 by Zhichao Lu: Do not add batch norm parameters to the final conv2d ops that predict box encodings and class scores in the weight-shared conv box predictor. This allows us to set a proper bias and force initial predictions to be background when using focal loss.
-- 193137342 by Zhichao Lu: Add a util function to visualize a value histogram as a tf.summary.image.
-- 193119411 by Zhichao Lu: Add support for reading in logits as groundtruth labels and applying an optional temperature (scaling) before softmax, in support of distillation.
-- 193087707 by Zhichao Lu: Post-process now works again in train mode.
-- 193067658 by Zhichao Lu: Fix flakiness in testSSDRandomCropWithMultiClassScores due to randomness.
-- 192922089 by Zhichao Lu: Add option to set dropout for the classification net in the weight-shared box predictor.
-- 192850747 by Zhichao Lu: Remove an inaccurate caveat from a proto file.
-- 192837477 by Zhichao Lu: Extend to accept different ratios of conv channels.
-- 192813444 by Zhichao Lu: Add an option for one_box_for_all_classes to the box_predictor.
-- 192624207 by Zhichao Lu: Update the trainer to allow reading multiclass scores.
-- 192583425 by Zhichao Lu: Add an implementation of the Visual Relations Detection evaluation metric (per-image evaluation).
-- 192529600 by Zhichao Lu: Modify the ssd meta arch to allow the option of not adding an implicit background class.
-- 192512429 by Zhichao Lu: Refactor model_tpu_main.py files and move the continuous eval loop into model_lib.py.
-- 192494267 by Zhichao Lu: Update create_pascal_tf_record.py and create_pet_tf_record.py.
-- 192485456 by Zhichao Lu: Enforce that all eval metric ops have valid Python strings.
-- 192472546 by Zhichao Lu: Set regularize_depthwise to true in mobilenet_v1_argscope.
-- 192421843 by Zhichao Lu: Refactor Mask-RCNN to put all mask prediction code in the third stage.
-- 192320460 by Zhichao Lu: Return eval_on_train_input_fn from create_estimator_and_inputs() rather than using train_input_fn in EVAL mode (which would still apply data augmentation).
-- 192226678 by Zhichao Lu: Access TPUEstimator and CrossShardOptimizer from the tf namespace.
-- 192195514 by Zhichao Lu: Fix a test that was flaky due to randomness.
-- 192166224 by Zhichao Lu: Minor fixes to match the git repo.
-- 192147130 by Zhichao Lu: Use shape utils for assertions in the feature extractor.
-- 192132440 by Zhichao Lu: Class-agnostic masks for mask_rcnn.
-- 192006190 by Zhichao Lu: Add a learning rate summary in EVAL mode in model.py.
-- 192004845 by Zhichao Lu: Migrate away from the Experiment class, as it is now deprecated. Also, refactor into a separate model library and binaries.
-- 191957195 by Zhichao Lu: Add classification_loss and localization_loss metrics for TPU jobs.
-- 191932855 by Zhichao Lu: Add an option to skip the last striding in mobilenet. The modified network has nominal output stride 16 instead of 32.
-- 191787921 by Zhichao Lu: Add an option to override base feature extractor hyperparams in SSD models. This allows using the same set of hyperparams for the complete feature extractor (base + new layers) if desired.
-- 191743097 by Zhichao Lu: Add an attribute to the SSD model to indicate which fields in the prediction dictionary have a batch dimension. This will be useful for future video models.
-- 191668425 by Zhichao Lu: Internal change.
-- 191649512 by Zhichao Lu: Introduce two parameters in ssd.proto (freeze_batchnorm, inplace_batchnorm_update) and set up slim arg_scopes in ssd_meta_arch.py so that they apply to all batchnorm ops in the predict() method. This centralizes control of freezing and of in-place batchnorm updates.
-- 191620303 by Zhichao Lu: Modify the preprocessor to support multiclass scores.

PiperOrigin-RevId: 195269567
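Change 194883585 above replaces a TPU-incompatible resize op with pure reshape-and-broadcast arithmetic. A minimal sketch of the idea, assuming NHWC input with statically known shape (the function name and exact signature here are illustrative):

```python
import tensorflow as tf

def nearest_neighbor_upsampling(input_tensor, scale):
  """Nearest-neighbor upsampling without tf.image resize ops.

  Reshapes to [batch, height, 1, width, 1, channels], broadcasts
  against a ones tensor to replicate every pixel scale*scale times,
  then collapses back to NHWC. All ops involved are TPU compatible.
  """
  with tf.name_scope('nearest_neighbor_upsampling'):
    batch, height, width, channels = input_tensor.get_shape().as_list()
    output = tf.reshape(
        input_tensor, [batch, height, 1, width, 1, channels]) * tf.ones(
            [1, 1, scale, 1, scale, 1], dtype=input_tensor.dtype)
    return tf.reshape(output, [batch, height * scale, width * scale, channels])
```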
1 parent 5f9f6b8 commit 6305421

7 files changed: +424, -29 lines


research/object_detection/README.md

Lines changed: 9 additions & 0 deletions
@@ -90,6 +90,15 @@ reporting an issue.
 
 ## Release information
 
+### April 30, 2018
+
+We have released a Faster R-CNN detector with a ResNet-101 feature extractor trained on [AVA](https://research.google.com/ava/) v2.1.
+Compared with other commonly used object detectors, it changes the action classification loss function to a per-class sigmoid loss to handle boxes with multiple labels.
+The model, trained on the training split of AVA v2.1 for 1.5M iterations, achieves a mean AP of 11.25% over 60 classes on the validation split of AVA v2.1.
+For more details please refer to this [paper](https://arxiv.org/abs/1705.08421).
+
+<b>Thanks to contributors</b>: Chen Sun, David Ross
+
 ### April 2, 2018
 
 Supercharge your mobile phones with the next generation mobile object detector!
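The per-class sigmoid loss mentioned in the note above treats each of the 60 AVA action classes as an independent binary decision, which is what lets one person box carry several action labels at once. A self-contained sketch of the contrast with softmax (shapes and tensors below are illustrative, not from the released model):

```python
import tensorflow as tf

# Illustrative setup: 8 person boxes, 60 AVA action classes. `labels`
# is multi-hot because one box may have several simultaneous actions.
logits = tf.random_normal([8, 60])
labels = tf.cast(tf.random_uniform([8, 60]) > 0.9, tf.float32)

# Per-class sigmoid loss: each class is scored independently, so
# multiple positive labels per box are handled naturally. A softmax
# cross-entropy, by contrast, assumes exactly one true class per box.
per_class_loss = tf.nn.sigmoid_cross_entropy_with_logits(
    labels=labels, logits=logits)
loss = tf.reduce_mean(tf.reduce_sum(per_class_loss, axis=1))
```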
Lines changed: 240 additions & 0 deletions
@@ -0,0 +1,240 @@
+item {
+  name: "bend/bow (at the waist)"
+  id: 1
+}
+item {
+  name: "crouch/kneel"
+  id: 3
+}
+item {
+  name: "dance"
+  id: 4
+}
+item {
+  name: "fall down"
+  id: 5
+}
+item {
+  name: "get up"
+  id: 6
+}
+item {
+  name: "jump/leap"
+  id: 7
+}
+item {
+  name: "lie/sleep"
+  id: 8
+}
+item {
+  name: "martial art"
+  id: 9
+}
+item {
+  name: "run/jog"
+  id: 10
+}
+item {
+  name: "sit"
+  id: 11
+}
+item {
+  name: "stand"
+  id: 12
+}
+item {
+  name: "swim"
+  id: 13
+}
+item {
+  name: "walk"
+  id: 14
+}
+item {
+  name: "answer phone"
+  id: 15
+}
+item {
+  name: "carry/hold (an object)"
+  id: 17
+}
+item {
+  name: "climb (e.g., a mountain)"
+  id: 20
+}
+item {
+  name: "close (e.g., a door, a box)"
+  id: 22
+}
+item {
+  name: "cut"
+  id: 24
+}
+item {
+  name: "dress/put on clothing"
+  id: 26
+}
+item {
+  name: "drink"
+  id: 27
+}
+item {
+  name: "drive (e.g., a car, a truck)"
+  id: 28
+}
+item {
+  name: "eat"
+  id: 29
+}
+item {
+  name: "enter"
+  id: 30
+}
+item {
+  name: "hit (an object)"
+  id: 34
+}
+item {
+  name: "lift/pick up"
+  id: 36
+}
+item {
+  name: "listen (e.g., to music)"
+  id: 37
+}
+item {
+  name: "open (e.g., a window, a car door)"
+  id: 38
+}
+item {
+  name: "play musical instrument"
+  id: 41
+}
+item {
+  name: "point to (an object)"
+  id: 43
+}
+item {
+  name: "pull (an object)"
+  id: 45
+}
+item {
+  name: "push (an object)"
+  id: 46
+}
+item {
+  name: "put down"
+  id: 47
+}
+item {
+  name: "read"
+  id: 48
+}
+item {
+  name: "ride (e.g., a bike, a car, a horse)"
+  id: 49
+}
+item {
+  name: "sail boat"
+  id: 51
+}
+item {
+  name: "shoot"
+  id: 52
+}
+item {
+  name: "smoke"
+  id: 54
+}
+item {
+  name: "take a photo"
+  id: 56
+}
+item {
+  name: "text on/look at a cellphone"
+  id: 57
+}
+item {
+  name: "throw"
+  id: 58
+}
+item {
+  name: "touch (an object)"
+  id: 59
+}
+item {
+  name: "turn (e.g., a screwdriver)"
+  id: 60
+}
+item {
+  name: "watch (e.g., TV)"
+  id: 61
+}
+item {
+  name: "work on a computer"
+  id: 62
+}
+item {
+  name: "write"
+  id: 63
+}
+item {
+  name: "fight/hit (a person)"
+  id: 64
+}
+item {
+  name: "give/serve (an object) to (a person)"
+  id: 65
+}
+item {
+  name: "grab (a person)"
+  id: 66
+}
+item {
+  name: "hand clap"
+  id: 67
+}
+item {
+  name: "hand shake"
+  id: 68
+}
+item {
+  name: "hand wave"
+  id: 69
+}
+item {
+  name: "hug (a person)"
+  id: 70
+}
+item {
+  name: "kiss (a person)"
+  id: 72
+}
+item {
+  name: "lift (a person)"
+  id: 73
+}
+item {
+  name: "listen to (a person)"
+  id: 74
+}
+item {
+  name: "push (another person)"
+  id: 76
+}
+item {
+  name: "sing to (e.g., self, a person, a group)"
+  id: 77
+}
+item {
+  name: "take (an object) from (a person)"
+  id: 78
+}
+item {
+  name: "talk to (e.g., self, a person, a group)"
+  id: 79
+}
+item {
+  name: "watch (a person)"
+  id: 80
+}
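The new file above is a standard object_detection label map in protobuf text format. A hedged sketch of consuming it with the repo's label_map_util (the path is hypothetical; point it at wherever the file is saved):

```python
from object_detection.utils import label_map_util

# Hypothetical path to the label map shown above.
label_map_path = 'ava_label_map_v2.1.pbtxt'

label_map = label_map_util.load_labelmap(label_map_path)
# Ids here are sparse (1..80 with gaps), so max_num_classes must cover
# the largest id or trailing entries would be dropped.
categories = label_map_util.convert_label_map_to_categories(
    label_map, max_num_classes=80)
category_index = label_map_util.create_category_index(categories)
print(category_index[1]['name'])  # -> "bend/bow (at the waist)"
```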

research/object_detection/g3doc/detection_model_zoo.md

Lines changed: 8 additions & 1 deletion
@@ -91,7 +91,7 @@ Some remarks on frozen inference graphs:
 
 ## Kitti-trained models {#kitti-models}
 
-Model name | Speed (ms) | Pascal mAP@0.5 (ms) | Outputs
+Model name | Speed (ms) | Pascal mAP@0.5 | Outputs
 ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---: | :-------------: | :-----:
 [faster_rcnn_resnet101_kitti](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_kitti_2018_01_28.tar.gz) | 79 | 87 | Boxes
 
@@ -103,6 +103,13 @@ Model name
 [faster_rcnn_inception_resnet_v2_atrous_lowproposals_oid](http://download.tensorflow.org/models/object_detection/faster_rcnn_inception_resnet_v2_atrous_lowproposals_oid_2018_01_28.tar.gz) | 347 | | Boxes
 
 
+## AVA v2.1 trained models {#ava-models}
+
+Model name | Speed (ms) | Pascal mAP@0.5 | Outputs
+----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---: | :-------------: | :-----:
+[faster_rcnn_resnet101_ava_v2.1](http://download.tensorflow.org/models/object_detection/faster_rcnn_resnet101_ava_v2.1_2018_04_30.tar.gz) | 93 | 11 | Boxes
+
+
 [^1]: See [MSCOCO evaluation protocol](http://cocodataset.org/#detections-eval).
 [^2]: This is PASCAL mAP with a slightly different way of true positives computation: see [Open Images evaluation protocol](evaluation_protocols.md#open-images).
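Each table entry links to a tarball that, following the zoo's convention, contains a frozen inference graph. A minimal sketch of loading the AVA model for inference (the extracted directory name is assumed from the tarball name above):

```python
import tensorflow as tf

# Assumed layout after extracting the tarball linked above; zoo
# tarballs conventionally contain a frozen_inference_graph.pb.
frozen_graph_path = ('faster_rcnn_resnet101_ava_v2.1_2018_04_30/'
                     'frozen_inference_graph.pb')

graph_def = tf.GraphDef()
with tf.gfile.GFile(frozen_graph_path, 'rb') as f:
  graph_def.ParseFromString(f.read())

detection_graph = tf.Graph()
with detection_graph.as_default():
  tf.import_graph_def(graph_def, name='')
  # Standard zoo tensor names; the graph can now be run in a session.
  image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
  boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
```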

research/object_detection/model_lib.py

Lines changed: 18 additions & 19 deletions
@@ -325,16 +325,16 @@ def tpu_scaffold():
     }
 
     eval_metric_ops = None
-    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
+    if mode == tf.estimator.ModeKeys.EVAL:
       class_agnostic = (fields.DetectionResultFields.detection_classes
                         not in detections)
       groundtruth = _get_groundtruth_data(detection_model, class_agnostic)
       use_original_images = fields.InputDataFields.original_image in features
-      original_images = (
+      eval_images = (
           features[fields.InputDataFields.original_image] if use_original_images
           else features[fields.InputDataFields.image])
       eval_dict = eval_util.result_dict_for_single_example(
-          original_images[0:1],
+          eval_images[0:1],
           features[inputs.HASH_KEY][0],
           detections,
           groundtruth,
@@ -355,22 +355,21 @@ def tpu_scaffold():
         img_summary = tf.summary.image('Detections_Left_Groundtruth_Right',
                                        detection_and_groundtruth)
 
-      if mode == tf.estimator.ModeKeys.EVAL:
-        # Eval metrics on a single example.
-        eval_metrics = eval_config.metrics_set
-        if not eval_metrics:
-          eval_metrics = ['coco_detection_metrics']
-        eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
-            eval_metrics, category_index.values(), eval_dict,
-            include_metrics_per_category=False)
-        for loss_key, loss_tensor in iter(losses_dict.items()):
-          eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
-        for var in optimizer_summary_vars:
-          eval_metric_ops[var.op.name] = (var, tf.no_op())
-        if img_summary is not None:
-          eval_metric_ops['Detections_Left_Groundtruth_Right'] = (
-              img_summary, tf.no_op())
-        eval_metric_ops = {str(k): v for k, v in eval_metric_ops.iteritems()}
+      # Eval metrics on a single example.
+      eval_metrics = eval_config.metrics_set
+      if not eval_metrics:
+        eval_metrics = ['coco_detection_metrics']
+      eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
+          eval_metrics, category_index.values(), eval_dict,
+          include_metrics_per_category=False)
+      for loss_key, loss_tensor in iter(losses_dict.items()):
+        eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
+      for var in optimizer_summary_vars:
+        eval_metric_ops[var.op.name] = (var, tf.no_op())
+      if img_summary is not None:
+        eval_metric_ops['Detections_Left_Groundtruth_Right'] = (
+            img_summary, tf.no_op())
+      eval_metric_ops = {str(k): v for k, v in eval_metric_ops.iteritems()}
 
     if use_tpu:
       return tf.contrib.tpu.TPUEstimatorSpec(
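One detail worth noting in the block above: wrapping each scalar loss in tf.metrics.mean turns a per-batch value into a streaming metric that the Estimator averages over the entire eval dataset. (The final iteritems() call in the diff is Python 2 specific.) A standalone sketch of that pattern, with illustrative tensors and keys:

```python
import tensorflow as tf

# tf.metrics.mean returns (value_op, update_op); the Estimator runs
# update_op on every eval batch and reports value_op at the end, so the
# logged loss is a dataset-level average rather than a single batch.
loss_tensor = tf.constant(0.5)
eval_metric_ops = {'Losses/total_loss': tf.metrics.mean(loss_tensor)}

spec = tf.estimator.EstimatorSpec(
    mode=tf.estimator.ModeKeys.EVAL,
    loss=loss_tensor,
    eval_metric_ops=eval_metric_ops)
```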
