diff --git a/example/rcnn/README.md b/example/rcnn/README.md index 43cd054cb876..282a1aebe9a9 100644 --- a/example/rcnn/README.md +++ b/example/rcnn/README.md @@ -1,5 +1,7 @@ # Faster R-CNN in MXNet with distributed implementation and data parallelization +![example detections](https://cloud.githubusercontent.com/assets/13162287/22101032/92085dc0-de6c-11e6-9228-67e72606ddbc.png) + ## Why? There exist good implementations of Faster R-CNN yet they lack support for recent ConvNet architectures. The aim of reproducing it from scratch is to fully utilize @@ -43,9 +45,8 @@ MXNet engines and parallelization for object detection. | Faster R-CNN end-to-end | VGG16 | COCO train | COCO val | 21.2 | 22.8 | | Faster R-CNN end-to-end | ResNet-101 | COCO train | COCO val | 27.2 | 26.1 | -All reference results are from original publications. -All VOC experiments are conducted in MXNet-v0.9.1-nnvm. MXNet-v0.8 have similar results. -All COCO experiments are conducted in MXNet-v0.8. +The above experiments were conducted at [mx-rcnn](https://github.com/precedenceguo/mx-rcnn/tree/6a1ab0eec5035a10a1efb5fc8c9d6c54e101b4d0) +using [a MXNet fork, based on MXNet 0.9.1 nnvm pre-release](https://github.com/precedenceguo/mxnet/tree/simple). ## I'm Feeling Lucky * Prepare: `bash script/additional_deps.sh` @@ -56,9 +57,8 @@ All COCO experiments are conducted in MXNet-v0.8. ## Getting started See if `bash script/additional_deps.sh` will do the following for you. * Suppose `HOME` represents where this file is located. All commands, unless stated otherwise, should be started from `HOME`. - Executing scripts in `script` must also be from `HOME`. * Install python package `cython easydict matplotlib scikit-image`. -* Install MXNet Python Interface. Open `python` type `import mxnet` to confirm. +* Install MXNet version v0.9.5 or higher and MXNet Python Interface. Open `python` type `import mxnet` to confirm. * Run `make` in `HOME`. 
Command line arguments have the same meaning as in mxnet/example/image-classification. @@ -82,7 +82,7 @@ Refer to `script/vgg_voc07.sh` and other experiments for examples. ### Prepare Training Data See `bash script/get_voc.sh` and `bash script/get_coco.sh` will do the following for you. -* Make a folder `data` in `HOME`. `data` folder will be used to place the training data folder `VOCdevkit` and `coco`. +* Make a folder `data` in `HOME`. `data` folder will be used to place the training data folder `VOCdevkit` and `coco`. * Download and extract [Pascal VOC data](http://host.robots.ox.ac.uk/pascal/VOC/), place the `VOCdevkit` folder in `HOME/data`. * Download and extract [coco dataset](http://mscoco.org/dataset/), place all images to `coco/images` and annotation jsons to `data/annotations`. @@ -94,6 +94,7 @@ See `bash script/get_voc.sh` and `bash script/get_coco.sh` will do the following ### Prepare Pretrained Models See if `bash script/get_pretrained_model.sh` will do this for you. If not, * Make a folder `model` in `HOME`. `model` folder will be used to place model checkpoints along the training process. + It is recommended to set `model` as a symbolic link to somewhere else in hard disk. * Download VGG16 pretrained model `vgg16-0000.params` from [MXNet model gallery](https://github.com/dmlc/mxnet-model-gallery/blob/master/imagenet-1k-vgg.md) to `model` folder. * Download ResNet pretrained model `resnet-101-0000.params` from [ResNet](https://github.com/tornadomeet/ResNet) to `model` folder. @@ -174,7 +175,7 @@ History of this implementation is: * Faster R-CNN with end-to-end training and module testing (v4) * Faster R-CNN with accelerated training and resnet (v5) -mxnet/example/rcnn was v1, v2 and v3.5. +mxnet/example/rcnn was v1, v2, v3.5 and now v5. ## References 1. Tianqi Chen, Mu Li, Yutian Li, Min Lin, Naiyan Wang, Minjie Wang, Tianjun Xiao, Bing Xu, Chiyuan Zhang, and Zheng Zhang. 
MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems. In Neural Information Processing Systems, Workshop on Machine Learning Systems, 2015 @@ -186,3 +187,4 @@ mxnet/example/rcnn was v1, v2 and v3.5. 7. Karen Simonyan, and Andrew Zisserman. "Very deep convolutional networks for large-scale image recognition." arXiv preprint arXiv:1409.1556 (2014). 8. Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Deep Residual Learning for Image Recognition". In Computer Vision and Pattern Recognition, IEEE Conference on, 2016. 9. Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. "Microsoft COCO: Common Objects in Context" In European Conference on Computer Vision, pp. 740-755. Springer International Publishing, 2014. + diff --git a/example/rcnn/demo.py b/example/rcnn/demo.py index 9c01b48fd1bd..34ea327cffac 100644 --- a/example/rcnn/demo.py +++ b/example/rcnn/demo.py @@ -1,9 +1,9 @@ -from __future__ import print_function import argparse import os import cv2 import mxnet as mx import numpy as np +from rcnn.logger import logger from rcnn.config import config from rcnn.symbol import get_vgg_test, get_vgg_rpn_test from rcnn.io.image import resize, transform @@ -104,17 +104,18 @@ def demo_net(predictor, image_name, vis=False): boxes_this_image = [[]] + [all_boxes[j] for j in range(1, len(CLASSES))] # print results - print('class ---- [[x1, x2, y1, y2, confidence]]') + logger.info('---class---') + logger.info('[[x1, x2, y1, y2, confidence]]') for ind, boxes in enumerate(boxes_this_image): if len(boxes) > 0: - print('---------', CLASSES[ind], '---------') - print(boxes) + logger.info('---%s---' % CLASSES[ind]) + logger.info('%s' % boxes) if vis: vis_all_detection(data_dict['data'].asnumpy(), boxes_this_image, CLASSES, im_scale) else: result_file = image_name.replace('.', '_result.') - print('results saved to %s' % result_file) + logger.info('results saved to %s' % 
result_file) im = draw_all_detection(data_dict['data'].asnumpy(), boxes_this_image, CLASSES, im_scale) cv2.imwrite(result_file, im) diff --git a/example/rcnn/rcnn/core/tester.py b/example/rcnn/rcnn/core/tester.py index a99614b370b5..0ccc47df71eb 100644 --- a/example/rcnn/rcnn/core/tester.py +++ b/example/rcnn/rcnn/core/tester.py @@ -1,4 +1,3 @@ -from __future__ import print_function import cPickle import os import time @@ -6,6 +5,7 @@ import numpy as np from module import MutableModule +from rcnn.logger import logger from rcnn.config import config from rcnn.io import image from rcnn.processing.bbox_transform import bbox_pred, clip_boxes @@ -79,9 +79,9 @@ def generate_proposals(predictor, test_data, imdb, vis=False, thresh=0.): if vis: vis_all_detection(data_dict['data'].asnumpy(), [dets], ['obj'], scale) - print('generating %d/%d' % (i + 1, imdb.num_images), - 'proposal %d' % (dets.shape[0]), - 'data %.4fs net %.4fs' % (t1, t2)) + logger.info('generating %d/%d ' % (i + 1, imdb.num_images) + + 'proposal %d ' % (dets.shape[0]) + + 'data %.4fs net %.4fs' % (t1, t2)) i += 1 assert len(imdb_boxes) == imdb.num_images, 'calculations not complete' @@ -100,7 +100,7 @@ def generate_proposals(predictor, test_data, imdb, vis=False, thresh=0.): with open(full_rpn_file, 'wb') as f: cPickle.dump(original_boxes, f, cPickle.HIGHEST_PROTOCOL) - print('wrote rpn proposals to {}'.format(rpn_file)) + logger.info('wrote rpn proposals to %s' % rpn_file) return imdb_boxes @@ -189,7 +189,7 @@ def pred_eval(predictor, test_data, imdb, vis=False, thresh=1e-3): t3 = time.time() - t t = time.time() - print('testing {}/{} data {:.4f}s net {:.4f}s post {:.4f}s'.format(i, imdb.num_images, t1, t2, t3)) + logger.info('testing %d/%d data %.4fs net %.4fs post %.4fs' % (i, imdb.num_images, t1, t2, t3)) i += 1 det_file = os.path.join(imdb.cache_path, imdb.name + '_detections.pkl') diff --git a/example/rcnn/rcnn/cython/setup.py b/example/rcnn/rcnn/cython/setup.py index 330373dddb72..786460798fd2 100644 
--- a/example/rcnn/rcnn/cython/setup.py +++ b/example/rcnn/rcnn/cython/setup.py @@ -55,7 +55,13 @@ def locate_cuda(): raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) return cudaconfig -CUDA = locate_cuda() + + +# Test if CUDA could be found +try: + CUDA = locate_cuda() +except EnvironmentError: + CUDA = None # Obtain the numpy include directory. This logic works across numpy versions. @@ -123,25 +129,32 @@ def build_extensions(self): extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, include_dirs = [numpy_include] ), - Extension('gpu_nms', - ['nms_kernel.cu', 'gpu_nms.pyx'], - library_dirs=[CUDA['lib64']], - libraries=['cudart'], - language='c++', - runtime_library_dirs=[CUDA['lib64']], - # this syntax is specific to this build system - # we're only going to use certain compiler args with nvcc and not with - # gcc the implementation of this trick is in customize_compiler() below - extra_compile_args={'gcc': ["-Wno-unused-function"], - 'nvcc': ['-arch=sm_35', - '--ptxas-options=-v', - '-c', - '--compiler-options', - "'-fPIC'"]}, - include_dirs = [numpy_include, CUDA['include']] - ), ] +if CUDA is not None: + ext_modules.append( + Extension('gpu_nms', + ['nms_kernel.cu', 'gpu_nms.pyx'], + library_dirs=[CUDA['lib64']], + libraries=['cudart'], + language='c++', + runtime_library_dirs=[CUDA['lib64']], + # this syntax is specific to this build system + # we're only going to use certain compiler args with nvcc and not with + # gcc the implementation of this trick is in customize_compiler() below + extra_compile_args={'gcc': ["-Wno-unused-function"], + 'nvcc': ['-arch=sm_35', + '--ptxas-options=-v', + '-c', + '--compiler-options', + "'-fPIC'"]}, + include_dirs = [numpy_include, CUDA['include']] + ) + ) +else: + print('Skipping GPU_NMS') + + setup( name='frcnn_cython', ext_modules=ext_modules, diff --git a/example/rcnn/rcnn/dataset/coco.py b/example/rcnn/rcnn/dataset/coco.py index 8026071a90c3..00c4c41cf3ce 100644 ---
a/example/rcnn/rcnn/dataset/coco.py +++ b/example/rcnn/rcnn/dataset/coco.py @@ -1,10 +1,10 @@ -from __future__ import print_function import cPickle import cv2 import os import json import numpy as np +from ..logger import logger from imdb import IMDB # coco api @@ -38,7 +38,7 @@ def __init__(self, image_set, root_path, data_path): # load image file names self.image_set_index = self._load_image_set_index() self.num_images = len(self.image_set_index) - print('num_images', self.num_images) + logger.info('%s num_images %d' % (self.name, self.num_images)) # deal with data name view_map = {'minival2014': 'val2014', @@ -68,13 +68,13 @@ def gt_roidb(self): if os.path.exists(cache_file): with open(cache_file, 'rb') as fid: roidb = cPickle.load(fid) - print('{} gt roidb loaded from {}'.format(self.name, cache_file)) + logger.info('%s gt roidb loaded from %s' % (self.name, cache_file)) return roidb gt_roidb = [self._load_coco_annotation(index) for index in self.image_set_index] with open(cache_file, 'wb') as fid: cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL) - print('wrote gt roidb to {}'.format(cache_file)) + logger.info('%s wrote gt roidb to %s' % (self.name, cache_file)) return gt_roidb @@ -155,10 +155,10 @@ def _write_coco_results(self, detections, res_file): for cls_ind, cls in enumerate(self.classes): if cls == '__background__': continue - print('Collecting %s results (%d/%d)' % (cls, cls_ind, self.num_classes - 1)) + logger.info('collecting %s results (%d/%d)' % (cls, cls_ind, self.num_classes - 1)) coco_cat_id = self._class_to_coco_ind[cls] results.extend(self._coco_results_one_category(detections[cls_ind], coco_cat_id)) - print('Writing results json to %s' % res_file) + logger.info('writing results json to %s' % res_file) with open(res_file, 'w') as f: json.dump(results, f, sort_keys=True, indent=4) @@ -192,7 +192,7 @@ def _do_python_eval(self, res_file, res_folder): eval_file = os.path.join(res_folder, 'detections_%s_results.pkl' % self.image_set) with 
open(eval_file, 'wb') as f: cPickle.dump(coco_eval, f, cPickle.HIGHEST_PROTOCOL) - print('coco eval results saved to %s' % eval_file) + logger.info('eval results saved to %s' % eval_file) def _print_detection_metrics(self, coco_eval): IoU_lo_thresh = 0.5 @@ -214,15 +214,15 @@ def _get_thr_ind(coco_eval, thr): precision = \ coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, :, 0, 2] ap_default = np.mean(precision[precision > -1]) - print('~~~~ Mean and per-category AP @ IoU=%.2f,%.2f] ~~~~' % (IoU_lo_thresh, IoU_hi_thresh)) - print('%-15s %5.1f' % ('all', 100 * ap_default)) + logger.info('~~~~ Mean and per-category AP @ IoU=%.2f,%.2f] ~~~~' % (IoU_lo_thresh, IoU_hi_thresh)) + logger.info('%-15s %5.1f' % ('all', 100 * ap_default)) for cls_ind, cls in enumerate(self.classes): if cls == '__background__': continue # minus 1 because of __background__ precision = coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, cls_ind - 1, 0, 2] ap = np.mean(precision[precision > -1]) - print('%-15s %5.1f' % (cls, 100 * ap)) + logger.info('%-15s %5.1f' % (cls, 100 * ap)) - print('~~~~ Summary metrics ~~~~') + logger.info('~~~~ Summary metrics ~~~~') coco_eval.summarize() diff --git a/example/rcnn/rcnn/dataset/imdb.py b/example/rcnn/rcnn/dataset/imdb.py index 1ad18dbc29bc..acdcd50f8208 100644 --- a/example/rcnn/rcnn/dataset/imdb.py +++ b/example/rcnn/rcnn/dataset/imdb.py @@ -9,7 +9,7 @@ 'boxes', 'gt_classes', 'gt_overlaps', 'max_classes', 'max_overlaps', 'bbox_targets'] """ -from __future__ import print_function +from ..logger import logger import os import cPickle import numpy as np @@ -70,8 +70,8 @@ def load_rpn_data(self, full=False): rpn_file = os.path.join(self.root_path, 'rpn_data', self.name + '_full_rpn.pkl') else: rpn_file = os.path.join(self.root_path, 'rpn_data', self.name + '_rpn.pkl') - print('loading {}'.format(rpn_file)) - assert os.path.exists(rpn_file), 'rpn data not found at {}'.format(rpn_file) + assert os.path.exists(rpn_file), '%s rpn data not found at %s' % 
(self.name, rpn_file) + logger.info('%s loading rpn data from %s' % (self.name, rpn_file)) with open(rpn_file, 'rb') as f: box_list = cPickle.load(f) return box_list @@ -93,7 +93,7 @@ def rpn_roidb(self, gt_roidb, append_gt=False): :return: roidb of rpn """ if append_gt: - print('appending ground truth annotations') + logger.info('%s appending ground truth annotations' % self.name) rpn_roidb = self.load_rpn_roidb(gt_roidb) roidb = IMDB.merge_roidbs(gt_roidb, rpn_roidb) else: @@ -156,7 +156,7 @@ def append_flipped_images(self, roidb): :param roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] :return: roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] """ - print('append flipped images to roidb') + logger.info('%s append flipped images to roidb' % self.name) assert self.num_images == len(roidb) for i in range(self.num_images): roi_rec = roidb[i] @@ -211,8 +211,8 @@ def evaluate_recall(self, roidb, candidate_boxes=None, thresholds=None): area_counts.append(area_count) total_counts = float(sum(area_counts)) for area_name, area_count in zip(area_names[1:], area_counts): - print('percentage of', area_name, area_count / total_counts) - print('average number of proposal', total_counts / self.num_images) + logger.info('percentage of %s is %f' % (area_name, area_count / total_counts)) + logger.info('average number of proposal is %f' % (total_counts / self.num_images)) for area_name, area_range in zip(area_names, area_ranges): gt_overlaps = np.zeros(0) num_pos = 0 diff --git a/example/rcnn/rcnn/dataset/pascal_voc.py b/example/rcnn/rcnn/dataset/pascal_voc.py index 268399316162..2135971faadf 100644 --- a/example/rcnn/rcnn/dataset/pascal_voc.py +++ b/example/rcnn/rcnn/dataset/pascal_voc.py @@ -6,12 +6,12 @@ criterion. 
""" -from __future__ import print_function import cPickle import cv2 import os import numpy as np +from ..logger import logger from imdb import IMDB from pascal_voc_eval import voc_eval from ds_utils import unique_boxes, filter_small_boxes @@ -42,7 +42,7 @@ def __init__(self, image_set, root_path, devkit_path): self.num_classes = len(self.classes) self.image_set_index = self.load_image_set_index() self.num_images = len(self.image_set_index) - print('num_images', self.num_images) + logger.info('%s num_images %d' % (self.name, self.num_images)) self.config = {'comp_id': 'comp4', 'use_diff': False, @@ -78,13 +78,13 @@ def gt_roidb(self): if os.path.exists(cache_file): with open(cache_file, 'rb') as fid: roidb = cPickle.load(fid) - print('{} gt roidb loaded from {}'.format(self.name, cache_file)) + logger.info('%s gt roidb loaded from %s' % (self.name, cache_file)) return roidb gt_roidb = [self.load_pascal_annotation(index) for index in self.image_set_index] with open(cache_file, 'wb') as fid: cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL) - print('wrote gt roidb to {}'.format(cache_file)) + logger.info('%s wrote gt roidb to %s' % (self.name, cache_file)) return gt_roidb @@ -168,18 +168,18 @@ def selective_search_roidb(self, gt_roidb, append_gt=False): if os.path.exists(cache_file): with open(cache_file, 'rb') as fid: roidb = cPickle.load(fid) - print('{} ss roidb loaded from {}'.format(self.name, cache_file)) + logger.info('%s ss roidb loaded from %s' % (self.name, cache_file)) return roidb if append_gt: - print('appending ground truth annotations') + logger.info('%s appending ground truth annotations' % self.name) ss_roidb = self.load_selective_search_roidb(gt_roidb) roidb = IMDB.merge_roidbs(gt_roidb, ss_roidb) else: roidb = self.load_selective_search_roidb(gt_roidb) with open(cache_file, 'wb') as fid: cPickle.dump(roidb, fid, cPickle.HIGHEST_PROTOCOL) - print('wrote ss roidb to {}'.format(cache_file)) + logger.info('%s wrote ss roidb to %s' % (self.name, 
cache_file)) return roidb @@ -224,7 +224,7 @@ def write_pascal_results(self, all_boxes): for cls_ind, cls in enumerate(self.classes): if cls == '__background__': continue - print('Writing {} VOC results file'.format(cls)) + logger.info('Writing %s VOC results file' % cls) filename = self.get_result_file_template().format(cls) with open(filename, 'wt') as f: for im_ind, index in enumerate(self.image_set_index): @@ -248,7 +248,7 @@ def do_python_eval(self): aps = [] # The PASCAL VOC metric changed in 2010 use_07_metric = True if int(self.year) < 2010 else False - print('VOC07 metric? ' + ('Y' if use_07_metric else 'No')) + logger.info('VOC07 metric? ' + ('Y' if use_07_metric else 'No')) for cls_ind, cls in enumerate(self.classes): if cls == '__background__': continue @@ -256,5 +256,5 @@ def do_python_eval(self): rec, prec, ap = voc_eval(filename, annopath, imageset_file, cls, annocache, ovthresh=0.5, use_07_metric=use_07_metric) aps += [ap] - print('AP for {} = {:.4f}'.format(cls, ap)) - print('Mean AP = {:.4f}'.format(np.mean(aps))) + logger.info('AP for {} = {:.4f}'.format(cls, ap)) + logger.info('Mean AP = {:.4f}'.format(np.mean(aps))) diff --git a/example/rcnn/rcnn/dataset/pascal_voc_eval.py b/example/rcnn/rcnn/dataset/pascal_voc_eval.py index 295b866bb697..54fa12ddccd8 100644 --- a/example/rcnn/rcnn/dataset/pascal_voc_eval.py +++ b/example/rcnn/rcnn/dataset/pascal_voc_eval.py @@ -2,7 +2,7 @@ given a pascal voc imdb, compute mAP """ -from __future__ import print_function +from ..logger import logger import numpy as np import os import cPickle @@ -86,8 +86,8 @@ def voc_eval(detpath, annopath, imageset_file, classname, annocache, ovthresh=0. 
for ind, image_filename in enumerate(image_filenames): recs[image_filename] = parse_voc_rec(annopath.format(image_filename)) if ind % 100 == 0: - print('reading annotations for {:d}/{:d}'.format(ind + 1, len(image_filenames))) - print('saving annotations cache to {:s}'.format(annocache)) + logger.info('reading annotations for %d/%d' % (ind + 1, len(image_filenames))) + logger.info('saving annotations cache to %s' % annocache) with open(annocache, 'wb') as f: cPickle.dump(recs, f, protocol=cPickle.HIGHEST_PROTOCOL) else: diff --git a/example/rcnn/rcnn/io/rpn.py b/example/rcnn/rcnn/io/rpn.py index c813e4ab06f6..52fe1a50c276 100644 --- a/example/rcnn/rcnn/io/rpn.py +++ b/example/rcnn/rcnn/io/rpn.py @@ -10,10 +10,11 @@ 'bbox_weight': [batch_size, num_anchors, feat_height, feat_width]} """ -from __future__ import print_function +import logging import numpy as np import numpy.random as npr +from ..logger import logger from ..config import config from .image import get_image, tensor_vstack from ..processing.generate_anchor import generate_anchors @@ -94,23 +95,19 @@ def _unmap(data, count, inds, fill=0): ret[inds, :] = data return ret - DEBUG = False im_info = im_info[0] scales = np.array(scales, dtype=np.float32) base_anchors = generate_anchors(base_size=feat_stride, ratios=list(ratios), scales=scales) num_anchors = base_anchors.shape[0] feat_height, feat_width = feat_shape[-2:] - if DEBUG: - print('anchors:') - print(base_anchors) - print('anchor shapes:') - print(np.hstack((base_anchors[:, 2::4] - base_anchors[:, 0::4], - base_anchors[:, 3::4] - base_anchors[:, 1::4]))) - print('im_info', im_info) - print('height', feat_height, 'width', feat_width) - print('gt_boxes shape', gt_boxes.shape) - print('gt_boxes', gt_boxes) + logger.debug('anchors: %s' % base_anchors) + logger.debug('anchor shapes: %s' % np.hstack((base_anchors[:, 2::4] - base_anchors[:, 0::4], + base_anchors[:, 3::4] - base_anchors[:, 1::4]))) + logger.debug('im_info %s' % im_info) + logger.debug('height 
%d width %d' % (feat_height, feat_width)) + logger.debug('gt_boxes shape %s' % np.array(gt_boxes.shape)) + logger.debug('gt_boxes %s' % gt_boxes) # 1. generate proposals from bbox deltas and shifted anchors shift_x = np.arange(0, feat_width) * feat_stride @@ -132,14 +129,12 @@ def _unmap(data, count, inds, fill=0): (all_anchors[:, 1] >= -allowed_border) & (all_anchors[:, 2] < im_info[1] + allowed_border) & (all_anchors[:, 3] < im_info[0] + allowed_border))[0] - if DEBUG: - print('total_anchors', total_anchors) - print('inds_inside', len(inds_inside)) + logger.debug('total_anchors %d' % total_anchors) + logger.debug('inds_inside %d' % len(inds_inside)) # keep only inside anchors anchors = all_anchors[inds_inside, :] - if DEBUG: - print('anchors shape', anchors.shape) + logger.debug('anchors shape %s' % np.array(anchors.shape)) # label: 1 is positive, 0 is negative, -1 is dont care labels = np.empty((len(inds_inside),), dtype=np.float32) @@ -176,7 +171,7 @@ def _unmap(data, count, inds, fill=0): fg_inds = np.where(labels == 1)[0] if len(fg_inds) > num_fg: disable_inds = npr.choice(fg_inds, size=(len(fg_inds) - num_fg), replace=False) - if DEBUG: + if logger.level == logging.INFO: disable_inds = fg_inds[:(len(fg_inds) - num_fg)] labels[disable_inds] = -1 @@ -185,7 +180,7 @@ def _unmap(data, count, inds, fill=0): bg_inds = np.where(labels == 0)[0] if len(bg_inds) > num_bg: disable_inds = npr.choice(bg_inds, size=(len(bg_inds) - num_bg), replace=False) - if DEBUG: + if logger.level == logging.INFO: disable_inds = bg_inds[:(len(bg_inds) - num_bg)] labels[disable_inds] = -1 @@ -196,29 +191,30 @@ def _unmap(data, count, inds, fill=0): bbox_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) bbox_weights[labels == 1, :] = np.array(config.TRAIN.RPN_BBOX_WEIGHTS) - if DEBUG: + if logger.level == logging.DEBUG: _sums = bbox_targets[labels == 1, :].sum(axis=0) _squared_sums = (bbox_targets[labels == 1, :] ** 2).sum(axis=0) _counts = np.sum(labels == 1) means = _sums / 
(_counts + 1e-14) stds = np.sqrt(_squared_sums / _counts - means ** 2) - print('means', means) - print('stdevs', stds) + logger.debug('means %s' % means) + logger.debug('stdevs %s' % stds) # map up to original set of anchors labels = _unmap(labels, total_anchors, inds_inside, fill=-1) bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) bbox_weights = _unmap(bbox_weights, total_anchors, inds_inside, fill=0) - if DEBUG: - print('rpn: max max_overlaps', np.max(max_overlaps)) - print('rpn: num_positives', np.sum(labels == 1)) - print('rpn: num_negatives', np.sum(labels == 0)) + if logger.level == logging.DEBUG: + if gt_boxes.size > 0: + logger.debug('rpn: max max_overlaps %f' % np.max(max_overlaps)) + logger.debug('rpn: num_positives %f' % np.sum(labels == 1)) + logger.debug('rpn: num_negatives %f' % np.sum(labels == 0)) _fg_sum = np.sum(labels == 1) _bg_sum = np.sum(labels == 0) _count = 1 - print('rpn: num_positive avg', _fg_sum / _count) - print('rpn: num_negative avg', _bg_sum / _count) + logger.debug('rpn: num_positive avg %f' % (_fg_sum / _count)) + logger.debug('rpn: num_negative avg %f' % (_bg_sum / _count)) labels = labels.reshape((1, feat_height, feat_width, A)).transpose(0, 3, 1, 2) labels = labels.reshape((1, A * feat_height * feat_width)) diff --git a/example/rcnn/rcnn/logger.py b/example/rcnn/rcnn/logger.py new file mode 100644 index 000000000000..2806e1add180 --- /dev/null +++ b/example/rcnn/rcnn/logger.py @@ -0,0 +1,6 @@ +import logging + +# set up logger +logging.basicConfig() +logger = logging.getLogger() +logger.setLevel(logging.INFO) diff --git a/example/rcnn/rcnn/processing/bbox_regression.py b/example/rcnn/rcnn/processing/bbox_regression.py index 46969aa0ec5e..d5b48a71b754 100644 --- a/example/rcnn/rcnn/processing/bbox_regression.py +++ b/example/rcnn/rcnn/processing/bbox_regression.py @@ -2,9 +2,9 @@ This file has functions about generating bounding box regression targets """ -from __future__ import print_function import numpy 
as np +from ..logger import logger from bbox_transform import bbox_overlaps, bbox_transform from rcnn.config import config @@ -22,12 +22,13 @@ def compute_bbox_regression_targets(rois, overlaps, labels): # Sanity check if len(rois) != len(overlaps): - print('bbox regression: this should not happen') + logger.warning('bbox regression: len(rois) != len(overlaps)') # Indices of ground-truth ROIs gt_inds = np.where(overlaps == 1)[0] if len(gt_inds) == 0: - print('something wrong : zero ground truth rois') + logger.warning('bbox regression: len(gt_inds) == 0') + # Indices of examples for which we try to make predictions ex_inds = np.where(overlaps >= config.TRAIN.BBOX_REGRESSION_THRESH)[0] @@ -52,7 +53,7 @@ def add_bbox_regression_targets(roidb): :param roidb: roidb to be processed. must have gone through imdb.prepare_roidb :return: means, std variances of targets """ - print('add bounding box regression targets') + logger.info('bbox regression: add bounding box regression targets') assert len(roidb) > 0 assert 'max_classes' in roidb[0] diff --git a/example/rcnn/rcnn/processing/image_processing.py b/example/rcnn/rcnn/processing/image_processing.py deleted file mode 100644 index dafca3c15850..000000000000 --- a/example/rcnn/rcnn/processing/image_processing.py +++ /dev/null @@ -1,83 +0,0 @@ -import numpy as np -import cv2 - - -def resize(im, target_size, max_size): - """ - only resize input image to target size and return scale - :param im: BGR image input by opencv - :param target_size: one dimensional size (the short side) - :param max_size: one dimensional max size (the long side) - :return: - """ - im_shape = im.shape - im_size_min = np.min(im_shape[0:2]) - im_size_max = np.max(im_shape[0:2]) - im_scale = float(target_size) / float(im_size_min) - # prevent bigger axis from being more than max_size: - if np.round(im_scale * im_size_max) > max_size: - im_scale = float(max_size) / float(im_size_max) - im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 
interpolation=cv2.INTER_LINEAR) - return im, im_scale - - -def transform(im, pixel_means, need_mean=False): - """ - transform into mxnet tensor - subtract pixel size and transform to correct format - :param im: [height, width, channel] in BGR - :param pixel_means: [[[R, G, B pixel means]]] - :return: [batch, channel, height, width] - """ - im = im.copy() - im[:, :, (0, 1, 2)] = im[:, :, (2, 1, 0)] - im = im.astype(float) - if need_mean: - im -= pixel_means - im_tensor = im[np.newaxis, :] - # put channel first - channel_swap = (0, 3, 1, 2) - im_tensor = im_tensor.transpose(channel_swap) - return im_tensor - - -def transform_inverse(im_tensor, pixel_means): - """ - transform from mxnet im_tensor to ordinary RGB image - im_tensor is limited to one image - :param im_tensor: [batch, channel, height, width] - :param pixel_means: [[[R, G, B pixel means]]] - :return: im [height, width, channel(RGB)] - """ - assert im_tensor.shape[0] == 1 - im_tensor = im_tensor.copy() - # put channel back - channel_swap = (0, 2, 3, 1) - im_tensor = im_tensor.transpose(channel_swap) - im = im_tensor[0] - assert im.shape[2] == 3 - im += pixel_means - im = im.astype(np.uint8) - return im - - -def tensor_vstack(tensor_list, pad=0): - """ - vertically stack tensors - :param tensor_list: list of tensor to be stacked vertically - :param pad: label to pad with - :return: tensor with max shape - """ - ndim = len(tensor_list[0].shape) - if ndim == 1: - return np.hstack(tensor_list) - dimensions = [0] - for dim in range(1, ndim): - dimensions.append(max([tensor.shape[dim] for tensor in tensor_list])) - for ind, tensor in enumerate(tensor_list): - pad_shape = [(0, 0)] - for dim in range(1, ndim): - pad_shape.append((0, dimensions[dim] - tensor.shape[dim])) - tensor_list[ind] = np.lib.pad(tensor, pad_shape, 'constant', constant_values=pad) - all_tensor = np.vstack(tensor_list) - return all_tensor diff --git a/example/rcnn/rcnn/processing/nms.py b/example/rcnn/rcnn/processing/nms.py index 
cab093c51152..230139c413ec 100644 --- a/example/rcnn/rcnn/processing/nms.py +++ b/example/rcnn/rcnn/processing/nms.py @@ -1,6 +1,9 @@ import numpy as np from ..cython.cpu_nms import cpu_nms -from ..cython.gpu_nms import gpu_nms +try: + from ..cython.gpu_nms import gpu_nms +except ImportError: + gpu_nms = None def py_nms_wrapper(thresh): @@ -18,7 +21,10 @@ def _nms(dets): def gpu_nms_wrapper(thresh, device_id): def _nms(dets): return gpu_nms(dets, thresh, device_id) - return _nms + if gpu_nms is not None: + return _nms + else: + return cpu_nms_wrapper(thresh) def nms(dets, thresh): diff --git a/example/rcnn/rcnn/processing/roidb.py b/example/rcnn/rcnn/processing/roidb.py deleted file mode 100644 index 8dddc27f60c9..000000000000 --- a/example/rcnn/rcnn/processing/roidb.py +++ /dev/null @@ -1,91 +0,0 @@ -""" -roidb -basic format [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] -extended ['image', 'max_classes', 'max_overlaps', 'bbox_targets'] -""" - -from __future__ import print_function -import cv2 -import numpy as np - -from bbox_regression import compute_bbox_regression_targets -from rcnn.config import config - - -def prepare_roidb(imdb, roidb): - """ - add image path, max_classes, max_overlaps to roidb - :param imdb: image database, provide path - :param roidb: roidb - :return: None - """ - print('prepare roidb') - for i in range(len(roidb)): # image_index - roidb[i]['image'] = imdb.image_path_from_index(imdb.image_set_index[i]) - if config.TRAIN.ASPECT_GROUPING: - size = cv2.imread(roidb[i]['image']).shape - roidb[i]['height'] = size[0] - roidb[i]['width'] = size[1] - gt_overlaps = roidb[i]['gt_overlaps'].toarray() - max_overlaps = gt_overlaps.max(axis=1) - max_classes = gt_overlaps.argmax(axis=1) - roidb[i]['max_overlaps'] = max_overlaps - roidb[i]['max_classes'] = max_classes - - # background roi => background class - zero_indexes = np.where(max_overlaps == 0)[0] - assert all(max_classes[zero_indexes] == 0) - # foreground roi => foreground class - 
nonzero_indexes = np.where(max_overlaps > 0)[0] - assert all(max_classes[nonzero_indexes] != 0) - - -def add_bbox_regression_targets(roidb): - """ - given roidb, add ['bbox_targets'] and normalize bounding box regression targets - :param roidb: roidb to be processed. must have gone through imdb.prepare_roidb - :return: means, std variances of targets - """ - print('add bounding box regression targets') - assert len(roidb) > 0 - assert 'max_classes' in roidb[0] - - num_images = len(roidb) - num_classes = roidb[0]['gt_overlaps'].shape[1] - for im_i in range(num_images): - rois = roidb[im_i]['boxes'] - max_overlaps = roidb[im_i]['max_overlaps'] - max_classes = roidb[im_i]['max_classes'] - roidb[im_i]['bbox_targets'] = compute_bbox_regression_targets(rois, max_overlaps, max_classes) - - if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED: - # use fixed / precomputed means and stds instead of empirical values - means = np.tile(np.array(config.TRAIN.BBOX_MEANS), (num_classes, 1)) - stds = np.tile(np.array(config.TRAIN.BBOX_STDS), (num_classes, 1)) - else: - # compute mean, std values - class_counts = np.zeros((num_classes, 1)) + config.EPS - sums = np.zeros((num_classes, 4)) - squared_sums = np.zeros((num_classes, 4)) - for im_i in range(num_images): - targets = roidb[im_i]['bbox_targets'] - for cls in range(1, num_classes): - cls_indexes = np.where(targets[:, 0] == cls)[0] - if cls_indexes.size > 0: - class_counts[cls] += cls_indexes.size - sums[cls, :] += targets[cls_indexes, 1:].sum(axis=0) - squared_sums[cls, :] += (targets[cls_indexes, 1:] ** 2).sum(axis=0) - - means = sums / class_counts - # var(x) = E(x^2) - E(x)^2 - stds = np.sqrt(squared_sums / class_counts - means ** 2) - - # normalized targets - for im_i in range(num_images): - targets = roidb[im_i]['bbox_targets'] - for cls in range(1, num_classes): - cls_indexes = np.where(targets[:, 0] == cls)[0] - roidb[im_i]['bbox_targets'][cls_indexes, 1:] -= means[cls, :] - roidb[im_i]['bbox_targets'][cls_indexes, 1:] /= 
stds[cls, :] - - return means.ravel(), stds.ravel() diff --git a/example/rcnn/rcnn/pycocotools/UPSTREAM_REV b/example/rcnn/rcnn/pycocotools/UPSTREAM_REV index 706219b77d90..9613b145b237 100644 --- a/example/rcnn/rcnn/pycocotools/UPSTREAM_REV +++ b/example/rcnn/rcnn/pycocotools/UPSTREAM_REV @@ -1 +1 @@ -https://github.com/pdollar/coco/commit/3ac47c77ebd5a1ed4254a98b7fbf2ef4765a3574 +https://github.com/pdollar/coco/commit/336d2a27c91e3c0663d2dcf0b13574674d30f88e diff --git a/example/rcnn/rcnn/pycocotools/_mask.pyx b/example/rcnn/rcnn/pycocotools/_mask.pyx index 4e9278af2a03..1c3e127a1c05 100644 --- a/example/rcnn/rcnn/pycocotools/_mask.pyx +++ b/example/rcnn/rcnn/pycocotools/_mask.pyx @@ -10,6 +10,9 @@ __author__ = 'tsungyi' +import sys +PYTHON_VERSION = sys.version_info[0] + # import both Python-level and C-level symbols of Numpy # the API uses Numpy to interface C and Python import numpy as np @@ -38,7 +41,7 @@ cdef extern from "maskApi.h": void rlesInit( RLE **R, siz n ) void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n ) void rleDecode( const RLE *R, byte *mask, siz n ) - void rleMerge( const RLE *R, RLE *M, siz n, bint intersect ) + void rleMerge( const RLE *R, RLE *M, siz n, int intersect ) void rleArea( const RLE *R, siz n, uint *a ) void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ) void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) @@ -119,7 +122,12 @@ def _frString(rleObjs): cdef bytes py_string cdef char* c_string for i, obj in enumerate(rleObjs): - py_string = str(obj['counts']) + if PYTHON_VERSION == 2: + py_string = str(obj['counts']).encode('utf8') + elif PYTHON_VERSION == 3: + py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts'] + else: + raise Exception('Python version must be 2 or 3') c_string = py_string rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) return Rs @@ -138,10 +146,10 @@ def decode(rleObjs): cdef RLEs Rs = _frString(rleObjs) h, w, n = 
Rs._R[0].h, Rs._R[0].w, Rs._n masks = Masks(h, w, n) - rleDecode( Rs._R, masks._mask, n ); + rleDecode(Rs._R, masks._mask, n); return np.array(masks) -def merge(rleObjs, bint intersect=0): +def merge(rleObjs, intersect=0): cdef RLEs Rs = _frString(rleObjs) cdef RLEs R = RLEs(1) rleMerge(Rs._R, R._R, Rs._n, intersect) @@ -255,7 +263,7 @@ def frPoly( poly, siz h, siz w ): Rs = RLEs(n) for i, p in enumerate(poly): np_poly = np.array(p, dtype=np.double, order='F') - rleFrPoly( &Rs._R[i], np_poly.data, len(np_poly)/2, h, w ) + rleFrPoly( &Rs._R[i], np_poly.data, int(len(p)/2), h, w ) objs = _toString(Rs) return objs @@ -277,15 +285,24 @@ def frUncompressedRLE(ucRles, siz h, siz w): objs.append(_toString(Rs)[0]) return objs -def frPyObjects(pyobj, siz h, w): +def frPyObjects(pyobj, h, w): + # encode rle from a list of python objects if type(pyobj) == np.ndarray: - objs = frBbox(pyobj, h, w ) + objs = frBbox(pyobj, h, w) elif type(pyobj) == list and len(pyobj[0]) == 4: - objs = frBbox(pyobj, h, w ) + objs = frBbox(pyobj, h, w) elif type(pyobj) == list and len(pyobj[0]) > 4: - objs = frPoly(pyobj, h, w ) - elif type(pyobj) == list and type(pyobj[0]) == dict: + objs = frPoly(pyobj, h, w) + elif type(pyobj) == list and type(pyobj[0]) == dict \ + and 'counts' in pyobj[0] and 'size' in pyobj[0]: objs = frUncompressedRLE(pyobj, h, w) + # encode rle from single python object + elif type(pyobj) == list and len(pyobj) == 4: + objs = frBbox([pyobj], h, w)[0] + elif type(pyobj) == list and len(pyobj) > 4: + objs = frPoly([pyobj], h, w)[0] + elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj: + objs = frUncompressedRLE([pyobj], h, w)[0] else: raise Exception('input type is not supported.') return objs diff --git a/example/rcnn/rcnn/pycocotools/coco.py b/example/rcnn/rcnn/pycocotools/coco.py index 44158d21d5a4..ca35cc0b053b 100644 --- a/example/rcnn/rcnn/pycocotools/coco.py +++ b/example/rcnn/rcnn/pycocotools/coco.py @@ -1,5 +1,5 @@ __author__ = 'tylin' -__version__ = 
'1.0.1' +__version__ = '2.0' # Interface for accessing the Microsoft COCO dataset. # Microsoft COCO is a large image dataset designed for object detection, @@ -27,7 +27,7 @@ # loadAnns - Load anns with the specified ids. # loadCats - Load cats with the specified ids. # loadImgs - Load imgs with the specified ids. -# segToMask - Convert polygon segmentation to binary mask. +# annToMask - Convert segmentation in an annotation to binary mask. # showAnns - Display the specified annotations. # loadRes - Load algorithm results and create API for accessing them. # download - Download COCO images from mscoco.org server. @@ -37,27 +37,30 @@ # See also COCO>decodeMask, # COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, # COCO>getImgIds, COCO>loadAnns, COCO>loadCats, -# COCO>loadImgs, COCO>segToMask, COCO>showAnns +# COCO>loadImgs, COCO>annToMask, COCO>showAnns # Microsoft COCO Toolbox. version 2.0 # Data, paper, and tutorials available at: http://mscoco.org/ # Code written by Piotr Dollar and Tsung-Yi Lin, 2014. # Licensed under the Simplified BSD License [see bsd.txt] -from __future__ import print_function import json -import datetime import time import matplotlib.pyplot as plt from matplotlib.collections import PatchCollection from matplotlib.patches import Polygon import numpy as np -from skimage.draw import polygon -import urllib import copy import itertools -import mask +from . 
import mask as maskUtils import os +from collections import defaultdict +import sys +PYTHON_VERSION = sys.version_info[0] +if PYTHON_VERSION == 2: + from urllib import urlretrieve +elif PYTHON_VERSION == 3: + from urllib.request import urlretrieve class COCO: def __init__(self, annotation_file=None): @@ -68,47 +71,38 @@ def __init__(self, annotation_file=None): :return: """ # load dataset - self.dataset = {} - self.anns = [] - self.imgToAnns = {} - self.catToImgs = {} - self.imgs = {} - self.cats = {} - if annotation_file is not None: + self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict() + self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) + if not annotation_file == None: print('loading annotations into memory...') tic = time.time() dataset = json.load(open(annotation_file, 'r')) - print('Done (t=%0.2fs)'%(time.time()- tic)) + assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) + print('Done (t={:0.2f}s)'.format(time.time()- tic)) self.dataset = dataset self.createIndex() def createIndex(self): # create index print('creating index...') - anns = {} - imgToAnns = {} - catToImgs = {} - cats = {} - imgs = {} + anns, cats, imgs = {}, {}, {} + imgToAnns,catToImgs = defaultdict(list),defaultdict(list) if 'annotations' in self.dataset: - imgToAnns = {ann['image_id']: [] for ann in self.dataset['annotations']} - anns = {ann['id']: [] for ann in self.dataset['annotations']} for ann in self.dataset['annotations']: - imgToAnns[ann['image_id']] += [ann] + imgToAnns[ann['image_id']].append(ann) anns[ann['id']] = ann if 'images' in self.dataset: - imgs = {im['id']: {} for im in self.dataset['images']} for img in self.dataset['images']: imgs[img['id']] = img if 'categories' in self.dataset: - cats = {cat['id']: [] for cat in self.dataset['categories']} for cat in self.dataset['categories']: cats[cat['id']] = cat - catToImgs = {cat['id']: [] for cat in self.dataset['categories']} + + if 'annotations' 
in self.dataset and 'categories' in self.dataset: for ann in self.dataset['annotations']: - catToImgs[ann['category_id']] += [ann['image_id']] + catToImgs[ann['category_id']].append(ann['image_id']) print('index created!') @@ -125,7 +119,7 @@ def info(self): :return: """ for key, value in self.dataset['info'].items(): - print('%s: %s'%(key, value)) + print('{}: {}'.format(key, value)) def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): """ @@ -143,14 +137,13 @@ def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): anns = self.dataset['annotations'] else: if not len(imgIds) == 0: - # this can be changed by defaultdict lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] anns = list(itertools.chain.from_iterable(lists)) else: anns = self.dataset['annotations'] anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] - if iscrowd is not None: + if not iscrowd == None: ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] else: ids = [ann['id'] for ann in anns] @@ -240,39 +233,57 @@ def showAnns(self, anns): """ if len(anns) == 0: return 0 - if 'segmentation' in anns[0]: + if 'segmentation' in anns[0] or 'keypoints' in anns[0]: datasetType = 'instances' elif 'caption' in anns[0]: datasetType = 'captions' + else: + raise Exception('datasetType not supported') if datasetType == 'instances': ax = plt.gca() + ax.set_autoscale_on(False) polygons = [] color = [] for ann in anns: - c = np.random.random((1, 3)).tolist()[0] - if type(ann['segmentation']) == list: - # polygon - for seg in ann['segmentation']: - poly = np.array(seg).reshape((len(seg)/2, 2)) - polygons.append(Polygon(poly, True,alpha=0.4)) - color.append(c) - else: - # mask - t = self.imgs[ann['image_id']] - if type(ann['segmentation']['counts']) == list: - rle = 
mask.frPyObjects([ann['segmentation']], t['height'], t['width']) + c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] + if 'segmentation' in ann: + if type(ann['segmentation']) == list: + # polygon + for seg in ann['segmentation']: + poly = np.array(seg).reshape((int(len(seg)/2), 2)) + polygons.append(Polygon(poly)) + color.append(c) else: - rle = [ann['segmentation']] - m = mask.decode(rle) - img = np.ones( (m.shape[0], m.shape[1], 3) ) - if ann['iscrowd'] == 1: - color_mask = np.array([2.0,166.0,101.0])/255 - if ann['iscrowd'] == 0: - color_mask = np.random.random((1, 3)).tolist()[0] - for i in range(3): - img[:,:,i] = color_mask[i] - ax.imshow(np.dstack( (img, m*0.5) )) - p = PatchCollection(polygons, facecolors=color, edgecolors=(0,0,0,1), linewidths=3, alpha=0.4) + # mask + t = self.imgs[ann['image_id']] + if type(ann['segmentation']['counts']) == list: + rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) + else: + rle = [ann['segmentation']] + m = maskUtils.decode(rle) + img = np.ones( (m.shape[0], m.shape[1], 3) ) + if ann['iscrowd'] == 1: + color_mask = np.array([2.0,166.0,101.0])/255 + if ann['iscrowd'] == 0: + color_mask = np.random.random((1, 3)).tolist()[0] + for i in range(3): + img[:,:,i] = color_mask[i] + ax.imshow(np.dstack( (img, m*0.5) )) + if 'keypoints' in ann and type(ann['keypoints']) == list: + # turn skeleton into zero-based index + sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 + kp = np.array(ann['keypoints']) + x = kp[0::3] + y = kp[1::3] + v = kp[2::3] + for sk in sks: + if np.all(v[sk]>0): + plt.plot(x[sk],y[sk], linewidth=3, color=c) + plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2) + plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) + p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) + ax.add_collection(p) + p = PatchCollection(polygons, facecolor='none', 
edgecolors=color, linewidths=2) ax.add_collection(p) elif datasetType == 'captions': for ann in anns: @@ -286,12 +297,15 @@ def loadRes(self, resFile): """ res = COCO() res.dataset['images'] = [img for img in self.dataset['images']] - # res.dataset['info'] = copy.deepcopy(self.dataset['info']) - # res.dataset['licenses'] = copy.deepcopy(self.dataset['licenses']) - print('Loading and preparing results... ') + print('Loading and preparing results...') tic = time.time() - anns = json.load(open(resFile)) + if type(resFile) == str or type(resFile) == unicode: + anns = json.load(open(resFile)) + elif type(resFile) == np.ndarray: + anns = self.loadNumpyAnnotations(resFile) + else: + anns = resFile assert type(anns) == list, 'results in not an array of objects' annsImgIds = [ann['image_id'] for ann in anns] assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ @@ -315,18 +329,28 @@ def loadRes(self, resFile): res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) for id, ann in enumerate(anns): # now only support compressed RLE format as segmentation results - ann['area'] = mask.area([ann['segmentation']])[0] + ann['area'] = maskUtils.area(ann['segmentation']) if not 'bbox' in ann: - ann['bbox'] = mask.toBbox([ann['segmentation']])[0] + ann['bbox'] = maskUtils.toBbox(ann['segmentation']) ann['id'] = id+1 ann['iscrowd'] = 0 - print('DONE (t=%0.2fs)'%(time.time()- tic)) + elif 'keypoints' in anns[0]: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + s = ann['keypoints'] + x = s[0::3] + y = s[1::3] + x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y) + ann['area'] = (x1-x0)*(y1-y0) + ann['id'] = id + 1 + ann['bbox'] = [x0,y0,x1-x0,y1-y0] + print('DONE (t={:0.2f}s)'.format(time.time()- tic)) res.dataset['annotations'] = anns res.createIndex() return res - def download(self, tarDir=None, imgIds=[]): + def download(self, tarDir = None, imgIds = [] ): ''' Download COCO images from 
mscoco.org server. :param tarDir (str): COCO results directory name @@ -347,5 +371,58 @@ def download(self, tarDir=None, imgIds=[]): tic = time.time() fname = os.path.join(tarDir, img['file_name']) if not os.path.exists(fname): - urllib.urlretrieve(img['coco_url'], fname) - print('downloaded %d/%d images (t=%.1fs)'%(i, N, time.time()- tic)) + urlretrieve(img['coco_url'], fname) + print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)) + + def loadNumpyAnnotations(self, data): + """ + Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class} + :param data (numpy.ndarray) + :return: annotations (python nested list) + """ + print('Converting ndarray to lists...') + assert(type(data) == np.ndarray) + print(data.shape) + assert(data.shape[1] == 7) + N = data.shape[0] + ann = [] + for i in range(N): + if i % 1000000 == 0: + print('{}/{}'.format(i,N)) + ann += [{ + 'image_id' : int(data[i, 0]), + 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], + 'score' : data[i, 5], + 'category_id': int(data[i, 6]), + }] + return ann + + def annToRLE(self, ann): + """ + Convert annotation which can be polygons, uncompressed RLE to RLE. + :return: binary mask (numpy 2D array) + """ + t = self.imgs[ann['image_id']] + h, w = t['height'], t['width'] + segm = ann['segmentation'] + if type(segm) == list: + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(segm, h, w) + rle = maskUtils.merge(rles) + elif type(segm['counts']) == list: + # uncompressed RLE + rle = maskUtils.frPyObjects(segm, h, w) + else: + # rle + rle = ann['segmentation'] + return rle + + def annToMask(self, ann): + """ + Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask. 
+ :return: binary mask (numpy 2D array) + """ + rle = self.annToRLE(ann) + m = maskUtils.decode(rle) + return m \ No newline at end of file diff --git a/example/rcnn/rcnn/pycocotools/cocoeval.py b/example/rcnn/rcnn/pycocotools/cocoeval.py index 015c9f4ff8cc..a5dd1852912d 100644 --- a/example/rcnn/rcnn/pycocotools/cocoeval.py +++ b/example/rcnn/rcnn/pycocotools/cocoeval.py @@ -1,11 +1,10 @@ __author__ = 'tsungyi' -from __future__ import print_function import numpy as np import datetime import time from collections import defaultdict -import mask +import mask as maskUtils import copy class COCOeval: @@ -27,8 +26,9 @@ class COCOeval: # recThrs - [0:.01:1] R=101 recall thresholds for evaluation # areaRng - [...] A=4 object area ranges for evaluation # maxDets - [1 10 100] M=3 thresholds on max detections per image - # useSegm - [1] if true evaluate against ground-truth segments - # useCats - [1] if true use category labels for evaluation # Note: if useSegm=0 the evaluation is run on bounding boxes. + # iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints' + # iouType replaced the now DEPRECATED useSegm parameter. + # useCats - [1] if true use category labels for evaluation # Note: if useCats=0 category labels are ignored as in proposal scoring. # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified. # @@ -57,13 +57,15 @@ class COCOeval: # Data, paper, and tutorials available at: http://mscoco.org/ # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. # Licensed under the Simplified BSD License [see coco/license.txt] - def __init__(self, cocoGt=None, cocoDt=None): + def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'): ''' Initialize CocoEval using coco APIs for gt and dt :param cocoGt: coco object with ground truth annotations :param cocoDt: coco object with detection results :return: None ''' + if not iouType: + print('iouType not specified. 
use default iouType segm') self.cocoGt = cocoGt # ground truth COCO API self.cocoDt = cocoDt # detections COCO API self.params = {} # evaluation parameters @@ -71,7 +73,7 @@ def __init__(self, cocoGt=None, cocoDt=None): self.eval = {} # accumulated evaluation results self._gts = defaultdict(list) # gt for evaluation self._dts = defaultdict(list) # dt for evaluation - self.params = Params() # parameters + self.params = Params(iouType=iouType) # parameters self._paramsEval = {} # parameters for evaluation self.stats = [] # result summarization self.ious = {} # ious between all gts and dts @@ -85,28 +87,11 @@ def _prepare(self): Prepare ._gts and ._dts for evaluation based on params :return: None ''' - # - def _toMask(objs, coco): - # modify segmentation by reference - for obj in objs: - t = coco.imgs[obj['image_id']] - if type(obj['segmentation']) == list: - if type(obj['segmentation'][0]) == dict: - print('debug') - obj['segmentation'] = mask.frPyObjects(obj['segmentation'],t['height'],t['width']) - if len(obj['segmentation']) == 1: - obj['segmentation'] = obj['segmentation'][0] - else: - # an object can have multiple polygon regions - # merge them into one RLE mask - obj['segmentation'] = mask.merge(obj['segmentation']) - elif type(obj['segmentation']) == dict and type(obj['segmentation']['counts']) == list: - obj['segmentation'] = mask.frPyObjects([obj['segmentation']],t['height'],t['width'])[0] - elif type(obj['segmentation']) == dict and \ - type(obj['segmentation']['counts'] == unicode or type(obj['segmentation']['counts']) == str): - pass - else: - raise Exception('segmentation format not supported.') + def _toMask(anns, coco): + # modify ann['segmentation'] by reference + for ann in anns: + rle = coco.annToRLE(ann) + ann['segmentation'] = rle p = self.params if p.useCats: gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) @@ -115,9 +100,16 @@ def _toMask(objs, coco): 
gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds)) dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds)) - if p.useSegm: + # convert ground truth to mask if iouType == 'segm' + if p.iouType == 'segm': _toMask(gts, self.cocoGt) _toMask(dts, self.cocoDt) + # set ignore flag + for gt in gts: + gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0 + gt['ignore'] = 'iscrowd' in gt and gt['iscrowd'] + if p.iouType == 'keypoints': + gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore'] self._gts = defaultdict(list) # gt for evaluation self._dts = defaultdict(list) # dt for evaluation for gt in gts: @@ -133,8 +125,13 @@ def evaluate(self): :return: None ''' tic = time.time() - print('Running per image evaluation... ') + print('Running per image evaluation...') p = self.params + # add backward compatibility if useSegm is specified in params + if not p.useSegm is None: + p.iouType = 'segm' if p.useSegm == 1 else 'bbox' + print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType)) + print('Evaluate annotation type *{}*'.format(p.iouType)) p.imgIds = list(np.unique(p.imgIds)) if p.useCats: p.catIds = list(np.unique(p.catIds)) @@ -145,7 +142,10 @@ def evaluate(self): # loop through images, area range, max detection number catIds = p.catIds if p.useCats else [-1] - computeIoU = self.computeIoU + if p.iouType == 'segm' or p.iouType == 'bbox': + computeIoU = self.computeIoU + elif p.iouType == 'keypoints': + computeIoU = self.computeOks self.ious = {(imgId, catId): computeIoU(imgId, catId) \ for imgId in p.imgIds for catId in catIds} @@ -159,7 +159,7 @@ def evaluate(self): ] self._paramsEval = copy.deepcopy(self.params) toc = time.time() - print('DONE (t=%0.2fs).'%(toc-tic)) + print('DONE (t={:0.2f}s).'.format(toc-tic)) def computeIoU(self, imgId, catId): p = self.params @@ -171,20 +171,66 @@ def computeIoU(self, imgId, catId): dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]] if len(gt) == 0 and len(dt) ==0: return [] - 
dt = sorted(dt, key=lambda x: -x['score']) + inds = np.argsort([-d['score'] for d in dt], kind='mergesort') + dt = [dt[i] for i in inds] if len(dt) > p.maxDets[-1]: dt=dt[0:p.maxDets[-1]] - if p.useSegm: + if p.iouType == 'segm': g = [g['segmentation'] for g in gt] d = [d['segmentation'] for d in dt] - else: + elif p.iouType == 'bbox': g = [g['bbox'] for g in gt] d = [d['bbox'] for d in dt] + else: + raise Exception('unknown iouType for iou computation') # compute iou between each dt and gt region iscrowd = [int(o['iscrowd']) for o in gt] - ious = mask.iou(d,g,iscrowd) + ious = maskUtils.iou(d,g,iscrowd) + return ious + + def computeOks(self, imgId, catId): + p = self.params + # dimention here should be Nxm + gts = self._gts[imgId, catId] + dts = self._dts[imgId, catId] + inds = np.argsort([-d['score'] for d in dts], kind='mergesort') + dts = [dts[i] for i in inds] + if len(dts) > p.maxDets[-1]: + dts = dts[0:p.maxDets[-1]] + # if len(gts) == 0 and len(dts) == 0: + if len(gts) == 0 or len(dts) == 0: + return [] + ious = np.zeros((len(dts), len(gts))) + sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62,.62, 1.07, 1.07, .87, .87, .89, .89])/10.0 + vars = (sigmas * 2)**2 + k = len(sigmas) + # compute oks between each detection and ground truth object + for j, gt in enumerate(gts): + # create bounds for ignore regions(double the gt bbox) + g = np.array(gt['keypoints']) + xg = g[0::3]; yg = g[1::3]; vg = g[2::3] + k1 = np.count_nonzero(vg > 0) + bb = gt['bbox'] + x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2 + y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2 + for i, dt in enumerate(dts): + d = np.array(dt['keypoints']) + xd = d[0::3]; yd = d[1::3] + if k1>0: + # measure the per-keypoint distance if keypoints visible + dx = xd - xg + dy = yd - yg + else: + # measure minimum distance to keypoints in (x0,y0) & (x1,y1) + z = np.zeros((k)) + dx = np.max((z, x0-xd),axis=0)+np.max((z, xd-x1),axis=0) + dy = np.max((z, y0-yd),axis=0)+np.max((z, yd-y1),axis=0) + e = 
(dx**2 + dy**2) / vars / (gt['area']+np.spacing(1)) / 2 + if k1 > 0: + e=e[vg > 0] + ious[i, j] = np.sum(np.exp(-e)) / e.shape[0] return ious def evaluateImg(self, imgId, catId, aRng, maxDet): @@ -192,7 +238,6 @@ def evaluateImg(self, imgId, catId, aRng, maxDet): perform evaluation for single category and image :return: dict (single image results) ''' - # p = self.params if p.useCats: gt = self._gts[imgId,catId] @@ -204,23 +249,19 @@ def evaluateImg(self, imgId, catId, aRng, maxDet): return None for g in gt: - if 'ignore' not in g: - g['ignore'] = 0 - if g['iscrowd'] == 1 or g['ignore'] or (g['area']aRng[1]): + if g['ignore'] or (g['area']aRng[1]): g['_ignore'] = 1 else: g['_ignore'] = 0 # sort dt highest score first, sort gt ignore last - # gt = sorted(gt, key=lambda x: x['_ignore']) - gtind = [ind for (ind, g) in sorted(enumerate(gt), key=lambda (ind, g): g['_ignore']) ] - - gt = [gt[ind] for ind in gtind] - dt = sorted(dt, key=lambda x: -x['score'])[0:maxDet] + gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort') + gt = [gt[i] for i in gtind] + dtind = np.argsort([-d['score'] for d in dt], kind='mergesort') + dt = [dt[i] for i in dtind[0:maxDet]] iscrowd = [int(o['iscrowd']) for o in gt] # load computed ious - N_iou = len(self.ious[imgId, catId]) - ious = self.ious[imgId, catId][0:maxDet, np.array(gtind)] if N_iou >0 else self.ious[imgId, catId] + ious = self.ious[imgId, catId][:, gtind] if len(self.ious[imgId, catId]) > 0 else self.ious[imgId, catId] T = len(p.iouThrs) G = len(gt) @@ -245,7 +286,7 @@ def evaluateImg(self, imgId, catId, aRng, maxDet): # continue to next gt unless better match made if ious[dind,gind] < iou: continue - # match successful and best so far, store appropriately + # if match successful and best so far, store appropriately iou=ious[dind,gind] m=gind # if match made store id of match for both dt and gt @@ -278,7 +319,7 @@ def accumulate(self, p = None): :param p: input params for evaluation :return: None ''' - 
print('Accumulating evaluation results... ') + print('Accumulating evaluation results...') tic = time.time() if not self.evalImgs: print('Please run evaluate() first') @@ -306,7 +347,6 @@ def accumulate(self, p = None): m_list = [m for n, m in enumerate(p.maxDets) if m in setM] a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA] i_list = [n for n, i in enumerate(p.imgIds) if i in setI] - # K0 = len(_pe.catIds) I0 = len(_pe.imgIds) A0 = len(_pe.areaRng) # retrieve E at each category, area range, and max number of detections @@ -315,8 +355,8 @@ def accumulate(self, p = None): for a, a0 in enumerate(a_list): Na = a0*I0 for m, maxDet in enumerate(m_list): - E = [self.evalImgs[Nk+Na+i] for i in i_list] - E = filter(None, E) + E = [self.evalImgs[Nk + Na + i] for i in i_list] + E = [e for e in E if not e is None] if len(E) == 0: continue dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E]) @@ -327,8 +367,8 @@ def accumulate(self, p = None): dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds] dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds] - gtIg = np.concatenate([e['gtIgnore'] for e in E]) - npig = len([ig for ig in gtIg if ig == 0]) + gtIg = np.concatenate([e['gtIgnore'] for e in E]) + npig = np.count_nonzero(gtIg==0 ) if npig == 0: continue tps = np.logical_and( dtm, np.logical_not(dtIg) ) @@ -357,7 +397,7 @@ def accumulate(self, p = None): if pr[i] > pr[i-1]: pr[i-1] = pr[i] - inds = np.searchsorted(rc, p.recThrs) + inds = np.searchsorted(rc, p.recThrs, side='left') try: for ri, pi in enumerate(inds): q[ri] = pr[pi] @@ -367,12 +407,12 @@ def accumulate(self, p = None): self.eval = { 'params': p, 'counts': [T, R, K, A, M], - 'date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'precision': precision, 'recall': recall, } toc = time.time() - print('DONE (t=%0.2fs).'%( toc-tic )) + print('DONE 
(t={:0.2f}s).'.format( toc-tic)) def summarize(self): ''' @@ -381,15 +421,14 @@ def summarize(self): ''' def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ): p = self.params - iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6} | maxDets={:>3} ] = {}' - titleStr = 'Average Precision' if ap == 1 else 'Average Recall' - typeStr = '(AP)' if ap==1 else '(AR)' - iouStr = '%0.2f:%0.2f'%(p.iouThrs[0], p.iouThrs[-1]) if iouThr is None else '%0.2f'%(iouThr) - areaStr = areaRng - maxDetsStr = '%d'%(maxDets) - - aind = [i for i, aRng in enumerate(['all', 'small', 'medium', 'large']) if aRng == areaRng] - mind = [i for i, mDet in enumerate([1, 10, 100]) if mDet == maxDets] + iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}' + titleStr = 'Average Precision' if ap == 1 else 'Average Recall' + typeStr = '(AP)' if ap==1 else '(AR)' + iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \ + if iouThr is None else '{:0.2f}'.format(iouThr) + + aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng] + mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] if ap == 1: # dimension of precision: [TxRxKxAxM] s = self.eval['precision'] @@ -397,34 +436,56 @@ def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ): if iouThr is not None: t = np.where(iouThr == p.iouThrs)[0] s = s[t] - # areaRng s = s[:,:,:,aind,mind] else: # dimension of recall: [TxKxAxM] s = self.eval['recall'] + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] s = s[:,:,aind,mind] if len(s[s>-1])==0: mean_s = -1 else: mean_s = np.mean(s[s>-1]) - print(iStr.format(titleStr, typeStr, iouStr, areaStr, maxDetsStr, '%.3f'%(float(mean_s)))) + print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)) return mean_s - + def _summarizeDets(): + stats = np.zeros((12,)) + stats[0] = _summarize(1) + stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2]) + stats[2] = _summarize(1, iouThr=.75, 
maxDets=self.params.maxDets[2]) + stats[3] = _summarize(1, areaRng='small', maxDets=self.params.maxDets[2]) + stats[4] = _summarize(1, areaRng='medium', maxDets=self.params.maxDets[2]) + stats[5] = _summarize(1, areaRng='large', maxDets=self.params.maxDets[2]) + stats[6] = _summarize(0, maxDets=self.params.maxDets[0]) + stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) + stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) + stats[9] = _summarize(0, areaRng='small', maxDets=self.params.maxDets[2]) + stats[10] = _summarize(0, areaRng='medium', maxDets=self.params.maxDets[2]) + stats[11] = _summarize(0, areaRng='large', maxDets=self.params.maxDets[2]) + return stats + def _summarizeKps(): + stats = np.zeros((10,)) + stats[0] = _summarize(1, maxDets=20) + stats[1] = _summarize(1, maxDets=20, iouThr=.5) + stats[2] = _summarize(1, maxDets=20, iouThr=.75) + stats[3] = _summarize(1, maxDets=20, areaRng='medium') + stats[4] = _summarize(1, maxDets=20, areaRng='large') + stats[5] = _summarize(0, maxDets=20) + stats[6] = _summarize(0, maxDets=20, iouThr=.5) + stats[7] = _summarize(0, maxDets=20, iouThr=.75) + stats[8] = _summarize(0, maxDets=20, areaRng='medium') + stats[9] = _summarize(0, maxDets=20, areaRng='large') + return stats if not self.eval: raise Exception('Please run accumulate() first') - self.stats = np.zeros((12,)) - self.stats[0] = _summarize(1) - self.stats[1] = _summarize(1,iouThr=.5) - self.stats[2] = _summarize(1,iouThr=.75) - self.stats[3] = _summarize(1,areaRng='small') - self.stats[4] = _summarize(1,areaRng='medium') - self.stats[5] = _summarize(1,areaRng='large') - self.stats[6] = _summarize(0,maxDets=1) - self.stats[7] = _summarize(0,maxDets=10) - self.stats[8] = _summarize(0,maxDets=100) - self.stats[9] = _summarize(0,areaRng='small') - self.stats[10] = _summarize(0,areaRng='medium') - self.stats[11] = _summarize(0,areaRng='large') + iouType = self.params.iouType + if iouType == 'segm' or iouType == 'bbox': + summarize = _summarizeDets + 
elif iouType == 'keypoints': + summarize = _summarizeKps + self.stats = summarize() def __str__(self): self.summarize() @@ -433,13 +494,35 @@ class Params: ''' Params for coco evaluation api ''' - def __init__(self): + def setDetParams(self): + self.imgIds = [] + self.catIds = [] + # np.arange causes trouble. the data point on arange is slightly larger than the true value + self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True) + self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True) + self.maxDets = [1, 10, 100] + self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 32 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]] + self.areaRngLbl = ['all', 'small', 'medium', 'large'] + self.useCats = 1 + + def setKpParams(self): self.imgIds = [] self.catIds = [] # np.arange causes trouble. the data point on arange is slightly larger than the true value - self.iouThrs = np.linspace(.5, 0.95, np.round((0.95-.5)/.05)+1, endpoint=True) - self.recThrs = np.linspace(.0, 1.00, np.round((1.00-.0)/.01)+1, endpoint=True) - self.maxDets = [1,10,100] - self.areaRng = [ [0**2,1e5**2], [0**2, 32**2], [32**2, 96**2], [96**2, 1e5**2] ] - self.useSegm = 0 - self.useCats = 1 \ No newline at end of file + self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True) + self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True) + self.maxDets = [20] + self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]] + self.areaRngLbl = ['all', 'medium', 'large'] + self.useCats = 1 + + def __init__(self, iouType='segm'): + if iouType == 'segm' or iouType == 'bbox': + self.setDetParams() + elif iouType == 'keypoints': + self.setKpParams() + else: + raise Exception('iouType not supported') + self.iouType = iouType + # useSegm is deprecated + self.useSegm = None \ No newline at end of file diff --git a/example/rcnn/rcnn/pycocotools/mask.py b/example/rcnn/rcnn/pycocotools/mask.py index 
c00e09b6e46e..f49b8736b280 100644 --- a/example/rcnn/rcnn/pycocotools/mask.py +++ b/example/rcnn/rcnn/pycocotools/mask.py @@ -1,6 +1,6 @@ __author__ = 'tsungyi' -import _mask as _mask +import _mask # Interface for manipulating masks stored in RLE format. # @@ -73,10 +73,31 @@ # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. # Licensed under the Simplified BSD License [see coco/license.txt] -encode = _mask.encode -decode = _mask.decode iou = _mask.iou merge = _mask.merge -area = _mask.area -toBbox = _mask.toBbox frPyObjects = _mask.frPyObjects + +def encode(bimask): + if len(bimask.shape) == 3: + return _mask.encode(bimask) + elif len(bimask.shape) == 2: + h, w = bimask.shape + return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] + +def decode(rleObjs): + if type(rleObjs) == list: + return _mask.decode(rleObjs) + else: + return _mask.decode([rleObjs])[:,:,0] + +def area(rleObjs): + if type(rleObjs) == list: + return _mask.area(rleObjs) + else: + return _mask.area([rleObjs])[0] + +def toBbox(rleObjs): + if type(rleObjs) == list: + return _mask.toBbox(rleObjs) + else: + return _mask.toBbox([rleObjs])[0] \ No newline at end of file diff --git a/example/rcnn/rcnn/pycocotools/maskApi.c b/example/rcnn/rcnn/pycocotools/maskApi.c index 2b2d89116574..85e397918278 100644 --- a/example/rcnn/rcnn/pycocotools/maskApi.c +++ b/example/rcnn/rcnn/pycocotools/maskApi.c @@ -13,7 +13,7 @@ uint umax( uint a, uint b ) { return (a>b) ? 
a : b; } void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) { R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m); - if(cnts) for(siz j=0; jcnts[j]=cnts[j]; + siz j; if(cnts) for(j=0; jcnts[j]=cnts[j]; } void rleFree( RLE *R ) { @@ -21,12 +21,12 @@ void rleFree( RLE *R ) { } void rlesInit( RLE **R, siz n ) { - *R = (RLE*) malloc(sizeof(RLE)*n); - for(siz i=0; i0) { crowd=iscrowd!=NULL && iscrowd[g]; if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; } - siz ka, kb, a, b; uint c, ca, cb, ct, i, u; bool va, vb; + siz ka, kb, a, b; uint c, ca, cb, ct, i, u; int va, vb; ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0; cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1; while( ct>0 ) { @@ -95,8 +95,19 @@ void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ) { } } +void rleNms( RLE *dt, siz n, uint *keep, double thr ) { + siz i, j; double u; + for( i=0; ithr) keep[j]=0; + } + } +} + void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) { - double h, w, i, u, ga, da; siz g, d; bool crowd; + double h, w, i, u, ga, da; siz g, d; int crowd; for( g=0; gthr) keep[j]=0; + } + } +} + void rleToBbox( const RLE *R, BB bb, siz n ) { - for( siz i=0; i=dy && xs>xe) || (dxye); if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; } s = dx>=dy ? 
(double)(ye-ys)/dx : (double)(xe-xs)/dy; - if(dx>=dy) for( int d=0; d<=dx; d++ ) { + if(dx>=dy) for( d=0; d<=dx; d++ ) { t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++; - } else for( int d=0; d<=dy; d++ ) { + } else for( d=0; d<=dy; d++ ) { t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++; } } - // get points along y-boundary and downsample + /* get points along y-boundary and downsample */ free(x); free(y); k=m; m=0; double xd, yd; x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k); for( j=1; jh) yd=h; yd=ceil(yd); x[m]=(int) xd; y[m]=(int) yd; m++; } - // compute rle encoding given y-boundary points + /* compute rle encoding given y-boundary points */ k=m; a=malloc(sizeof(uint)*(k+1)); for( j=0; jm, p=0; long x; bool more; + /* Similar to LEB128 but using 6 bits/char and ascii chars 48-111. */ + siz i, m=R->m, p=0; long x; int more; char *s=malloc(sizeof(char)*m*6); for( i=0; icnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1; @@ -193,7 +215,7 @@ char* rleToString( const RLE *R ) { } void rleFrString( RLE *R, char *s, siz h, siz w ) { - siz m=0, p=0, k; long x; bool more; uint *cnts; + siz m=0, p=0, k; long x; int more; uint *cnts; while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0; while( s[p] ) { x=0; k=0; more=1; diff --git a/example/rcnn/rcnn/pycocotools/maskApi.h b/example/rcnn/rcnn/pycocotools/maskApi.h index ff16116c4781..ebc7892da382 100644 --- a/example/rcnn/rcnn/pycocotools/maskApi.h +++ b/example/rcnn/rcnn/pycocotools/maskApi.h @@ -5,7 +5,6 @@ * Licensed under the Simplified BSD License [see coco/license.txt] **************************************************************************/ #pragma once -#include typedef unsigned int uint; typedef unsigned long siz; @@ -13,43 +12,49 @@ typedef unsigned char byte; typedef double* BB; typedef struct { siz h, w, m; uint *cnts; } RLE; -// Initialize/destroy RLE. +/* Initialize/destroy RLE. */ void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); void rleFree( RLE *R ); -// Initialize/destroy RLE array. 
+/* Initialize/destroy RLE array. */ void rlesInit( RLE **R, siz n ); void rlesFree( RLE **R, siz n ); -// Encode binary masks using RLE. +/* Encode binary masks using RLE. */ void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); -// Decode binary masks encoded via RLE. +/* Decode binary masks encoded via RLE. */ void rleDecode( const RLE *R, byte *mask, siz n ); -// Compute union or intersection of encoded masks. -void rleMerge( const RLE *R, RLE *M, siz n, bool intersect ); +/* Compute union or intersection of encoded masks. */ +void rleMerge( const RLE *R, RLE *M, siz n, int intersect ); -// Compute area of encoded masks. +/* Compute area of encoded masks. */ void rleArea( const RLE *R, siz n, uint *a ); -// Compute intersection over union between masks. +/* Compute intersection over union between masks. */ void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); -// Compute intersection over union between bounding boxes. +/* Compute non-maximum suppression between bounding masks */ +void rleNms( RLE *dt, siz n, uint *keep, double thr ); + +/* Compute intersection over union between bounding boxes. */ void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); -// Get bounding boxes surrounding encoded masks. +/* Compute non-maximum suppression between bounding boxes */ +void bbNms( BB dt, siz n, uint *keep, double thr ); + +/* Get bounding boxes surrounding encoded masks. */ void rleToBbox( const RLE *R, BB bb, siz n ); -// Convert bounding boxes to encoded masks. +/* Convert bounding boxes to encoded masks. */ void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); -// Convert polygon to encoded mask. +/* Convert polygon to encoded mask. */ void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); -// Get compressed string representation of encoded mask. +/* Get compressed string representation of encoded mask. */ char* rleToString( const RLE *R ); -// Convert from compressed string representation of encoded mask. 
+/* Convert from compressed string representation of encoded mask. */ void rleFrString( RLE *R, char *s, siz h, siz w ); diff --git a/example/rcnn/rcnn/symbol/proposal.py b/example/rcnn/rcnn/symbol/proposal.py index 397030db6d7c..dd0bb15f5168 100644 --- a/example/rcnn/rcnn/symbol/proposal.py +++ b/example/rcnn/rcnn/symbol/proposal.py @@ -3,18 +3,16 @@ classification probability and bounding box prediction results, and image size and scale information. """ -from __future__ import print_function import mxnet as mx import numpy as np import numpy.random as npr from distutils.util import strtobool +from rcnn.logger import logger from rcnn.processing.bbox_transform import bbox_pred, clip_boxes from rcnn.processing.generate_anchor import generate_anchors from rcnn.processing.nms import py_nms_wrapper, cpu_nms_wrapper, gpu_nms_wrapper -DEBUG = False - class ProposalOperator(mx.operator.CustomOp): def __init__(self, feat_stride, scales, ratios, output_score, @@ -31,10 +29,8 @@ def __init__(self, feat_stride, scales, ratios, output_score, self._threshold = threshold self._rpn_min_size = rpn_min_size - if DEBUG: - print('feat_stride: {}'.format(self._feat_stride)) - print('anchors:') - print(self._anchors) + logger.debug('feat_stride: %s' % self._feat_stride) + logger.debug('anchors:\n%s' % self._anchors) def forward(self, is_train, req, in_data, out_data, aux): nms = gpu_nms_wrapper(self._threshold, in_data[0].context.device_id) @@ -64,17 +60,14 @@ def forward(self, is_train, req, in_data, out_data, aux): bbox_deltas = in_data[1].asnumpy() im_info = in_data[2].asnumpy()[0, :] - if DEBUG: - print('im_size: ({}, {})'.format(im_info[0], im_info[1])) - print('scale: {}'.format(im_info[2])) + logger.debug('im_info: %s' % im_info) # 1. 
Generate proposals from bbox_deltas and shifted anchors # use real image size instead of padded feature map sizes height, width = int(im_info[0] / self._feat_stride), int(im_info[1] / self._feat_stride) - if DEBUG: - print('score map size: {}'.format(scores.shape)) - print("resudial: {}".format((scores.shape[2] - height, scores.shape[3] - width))) + logger.debug('score map size: (%d, %d)' % (scores.shape[2], scores.shape[3])) + logger.debug('resudial: (%d, %d)' % (scores.shape[2] - height, scores.shape[3] - width)) # Enumerate all shifts shift_x = np.arange(0, width) * self._feat_stride diff --git a/example/rcnn/rcnn/symbol/proposal_target.py b/example/rcnn/rcnn/symbol/proposal_target.py index 3f28cb2cbebb..6f1a6ffbc440 100644 --- a/example/rcnn/rcnn/symbol/proposal_target.py +++ b/example/rcnn/rcnn/symbol/proposal_target.py @@ -2,15 +2,14 @@ Proposal Target Operator selects foreground and background roi and assigns label, bbox_transform to them. """ -from __future__ import print_function +import logging import mxnet as mx import numpy as np from distutils.util import strtobool +from ..logger import logger from rcnn.io.rcnn import sample_rois -DEBUG = False - class ProposalTargetOperator(mx.operator.CustomOp): def __init__(self, num_classes, batch_images, batch_rois, fg_fraction): @@ -20,7 +19,7 @@ def __init__(self, num_classes, batch_images, batch_rois, fg_fraction): self._batch_rois = batch_rois self._fg_fraction = fg_fraction - if DEBUG: + if logger.level == logging.DEBUG: self._count = 0 self._fg_num = 0 self._bg_num = 0 @@ -43,17 +42,17 @@ def forward(self, is_train, req, in_data, out_data, aux): rois, labels, bbox_targets, bbox_weights = \ sample_rois(all_rois, fg_rois_per_image, rois_per_image, self._num_classes, gt_boxes=gt_boxes) - if DEBUG: - print("labels=", labels) - print('num fg: {}'.format((labels > 0).sum())) - print('num bg: {}'.format((labels == 0).sum())) + if logger.level == logging.DEBUG: + logger.debug("labels: %s" % labels) + 
logger.debug('num fg: {}'.format((labels > 0).sum())) + logger.debug('num bg: {}'.format((labels == 0).sum())) self._count += 1 self._fg_num += (labels > 0).sum() self._bg_num += (labels == 0).sum() - print("self._count=", self._count) - print('num fg avg: {}'.format(self._fg_num / self._count)) - print('num bg avg: {}'.format(self._bg_num / self._count)) - print('ratio: {:.3f}'.format(float(self._fg_num) / float(self._bg_num))) + logger.debug("self._count: %d" % self._count) + logger.debug('num fg avg: %d' % (self._fg_num / self._count)) + logger.debug('num bg avg: %d' % (self._bg_num / self._count)) + logger.debug('ratio: %.3f' % (float(self._fg_num) / float(self._bg_num))) for ind, val in enumerate([rois, labels, bbox_targets, bbox_weights]): self.assign(out_data[ind], req[ind], val) diff --git a/example/rcnn/rcnn/tools/reeval.py b/example/rcnn/rcnn/tools/reeval.py index a2e6264942de..22e5e206f4d0 100644 --- a/example/rcnn/rcnn/tools/reeval.py +++ b/example/rcnn/rcnn/tools/reeval.py @@ -1,9 +1,9 @@ -from __future__ import print_function import argparse import cPickle import os import mxnet as mx +from ..logger import logger from ..config import config, default, generate_config from ..dataset import * @@ -39,7 +39,7 @@ def parse_args(): def main(): args = parse_args() - print('Called with argument:', args) + logger.info('Called with argument: %s' % args) reeval(args) diff --git a/example/rcnn/rcnn/tools/test_rcnn.py b/example/rcnn/rcnn/tools/test_rcnn.py index 65dca7a6d0f4..83a9fac03e67 100644 --- a/example/rcnn/rcnn/tools/test_rcnn.py +++ b/example/rcnn/rcnn/tools/test_rcnn.py @@ -1,8 +1,8 @@ -from __future__ import print_function import argparse import pprint import mxnet as mx +from ..logger import logger from ..config import config, default, generate_config from ..symbol import * from ..dataset import * @@ -99,8 +99,8 @@ def parse_args(): def main(): args = parse_args() + logger.info('Called with argument: %s' % args) ctx = mx.gpu(args.gpu) - print(args) 
test_rcnn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, ctx, args.prefix, args.epoch, args.vis, args.shuffle, args.has_rpn, args.proposal, args.thresh) diff --git a/example/rcnn/rcnn/tools/test_rpn.py b/example/rcnn/rcnn/tools/test_rpn.py index 9d0ff198e1b4..09f6af74368f 100644 --- a/example/rcnn/rcnn/tools/test_rpn.py +++ b/example/rcnn/rcnn/tools/test_rpn.py @@ -1,8 +1,8 @@ -from __future__ import print_function import argparse import pprint import mxnet as mx +from ..logger import logger from ..config import config, default, generate_config from ..symbol import * from ..dataset import * @@ -89,7 +89,7 @@ def parse_args(): def main(): args = parse_args() - print('Called with argument:', args) + logger.info('Called with argument: %s' % args) ctx = mx.gpu(args.gpu) test_rpn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, ctx, args.prefix, args.epoch, diff --git a/example/rcnn/rcnn/tools/train_rcnn.py b/example/rcnn/rcnn/tools/train_rcnn.py index 0669af047819..3f1cde380e8c 100644 --- a/example/rcnn/rcnn/tools/train_rcnn.py +++ b/example/rcnn/rcnn/tools/train_rcnn.py @@ -1,8 +1,8 @@ import argparse -import logging import pprint import mxnet as mx +from ..logger import logger from ..config import config, default, generate_config from ..symbol import * from ..core import callback, metric @@ -17,11 +17,6 @@ def train_rcnn(network, dataset, image_set, root_path, dataset_path, frequent, kvstore, work_load_list, no_flip, no_shuffle, resume, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, train_shared, lr, lr_step, proposal): - # set up logger - logging.basicConfig() - logger = logging.getLogger() - logger.setLevel(logging.INFO) - # set up config config.TRAIN.BATCH_IMAGES = 2 config.TRAIN.BATCH_ROIS = 128 @@ -36,7 +31,7 @@ def train_rcnn(network, dataset, image_set, root_path, dataset_path, input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size # print config - pprint.pprint(config) + 
logger.info(pprint.pformat(config)) # load dataset and prepare imdb for training image_sets = [iset for iset in image_set.split('+')] @@ -53,6 +48,7 @@ def train_rcnn(network, dataset, image_set, root_path, dataset_path, # infer max shape max_data_shape = [('data', (input_batch_size, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] + logger.info('providing maximum shape %s' % max_data_shape) # infer shape data_shape_dict = dict(train_data.provide_data + train_data.provide_label) @@ -60,8 +56,7 @@ def train_rcnn(network, dataset, image_set, root_path, dataset_path, arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) out_shape_dict = dict(zip(sym.list_outputs(), out_shape)) aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) - print('output shape') - pprint.pprint(out_shape_dict) + logger.info('output shape %s' % pprint.pformat(out_shape_dict)) # load and initialize params if resume: @@ -115,7 +110,7 @@ def train_rcnn(network, dataset, image_set, root_path, dataset_path, lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] - print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters) + logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters)) lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor) # optimizer optimizer_params = {'momentum': 0.9, @@ -166,7 +161,7 @@ def parse_args(): def main(): args = parse_args() - print('Called with argument:', args) + logger.info('Called with argument: %s' % args) ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')] train_rcnn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, args.frequent, args.kvstore, args.work_load_list, args.no_flip, args.no_shuffle, args.resume, diff --git a/example/rcnn/rcnn/tools/train_rpn.py 
b/example/rcnn/rcnn/tools/train_rpn.py index 2c7267ea36ef..87b92c8229ef 100644 --- a/example/rcnn/rcnn/tools/train_rpn.py +++ b/example/rcnn/rcnn/tools/train_rpn.py @@ -1,9 +1,8 @@ -from __future__ import print_function import argparse -import logging import pprint import mxnet as mx +from ..logger import logger from ..config import config, default, generate_config from ..symbol import * from ..core import callback, metric @@ -17,11 +16,6 @@ def train_rpn(network, dataset, image_set, root_path, dataset_path, frequent, kvstore, work_load_list, no_flip, no_shuffle, resume, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, train_shared, lr, lr_step): - # set up logger - logging.basicConfig() - logger = logging.getLogger() - logger.setLevel(logging.INFO) - # setup config config.TRAIN.BATCH_IMAGES = 1 @@ -34,7 +28,7 @@ def train_rpn(network, dataset, image_set, root_path, dataset_path, input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size # print config - pprint.pprint(config) + logger.info(pprint.pformat(config)) # load dataset and prepare imdb for training image_sets = [iset for iset in image_set.split('+')] @@ -53,7 +47,7 @@ def train_rpn(network, dataset, image_set, root_path, dataset_path, # infer max shape max_data_shape = [('data', (input_batch_size, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) - print('providing maximum shape', max_data_shape, max_label_shape) + logger.info('providing maximum shape %s %s' % (max_data_shape, max_label_shape)) # infer shape data_shape_dict = dict(train_data.provide_data + train_data.provide_label) @@ -61,8 +55,7 @@ def train_rpn(network, dataset, image_set, root_path, dataset_path, arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) out_shape_dict = dict(zip(sym.list_outputs(), out_shape)) aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) - print('output shape') - pprint.pprint(out_shape_dict) 
+ logger.info('output shape %s' % pprint.pformat(out_shape_dict)) # load and initialize params if resume: @@ -118,7 +111,7 @@ def train_rpn(network, dataset, image_set, root_path, dataset_path, lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] - print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters) + logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters)) lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor) # optimizer optimizer_params = {'momentum': 0.9, @@ -168,7 +161,7 @@ def parse_args(): def main(): args = parse_args() - print('Called with argument:', args) + logger.info('Called with argument: %s' % args) ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')] train_rpn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, args.frequent, args.kvstore, args.work_load_list, args.no_flip, args.no_shuffle, args.resume, diff --git a/example/rcnn/rcnn/utils/caffe_convert.py b/example/rcnn/rcnn/utils/caffe_convert.py deleted file mode 100644 index b5f0fbe27d14..000000000000 --- a/example/rcnn/rcnn/utils/caffe_convert.py +++ /dev/null @@ -1,75 +0,0 @@ -# This script will not work unless all paths are set right - -from __future__ import print_function -import os -import sys -import mxnet as mx -import numpy as np -fast_rcnn_path = None -sys.path.insert(0, os.path.join(fast_rcnn_path, 'caffe-fast-rcnn', 'python')) -sys.path.insert(0, os.path.join(fast_rcnn_path, 'lib')) -import caffe -from rcnn.symbol import get_symbol_vgg_test - -def load_model(caffeproto, caffemodel, arg_shape_dic): - def get_caffe_iter(layer_names, layers): - for layer_idx, layer in enumerate(layers): - layer_name = layer_names[layer_idx].replace('/', '_') - layer_type = layer.type - layer_blobs = layer.blobs - yield (layer_name, layer_type, 
layer_blobs) - - net_caffe = caffe.Net(caffeproto, caffemodel, caffe.TEST) - layer_names = net_caffe._layer_names - layers = net_caffe.layers - iter = '' - iter = get_caffe_iter(layer_names, layers) - first_conv = True - - arg_params = {} - for layer_name, layer_type, layer_blobs in iter: - if layer_type == 'Convolution' or layer_type == 'InnerProduct' or layer_type == 4 or layer_type == 14: - assert(len(layer_blobs) == 2) - wmat = np.array(layer_blobs[0].data).reshape(layer_blobs[0].num, layer_blobs[0].channels, layer_blobs[0].height, layer_blobs[0].width) - bias = np.array(layer_blobs[1].data) - if first_conv: - print('Swapping BGR of caffe into RGB in mxnet') - wmat[:, [0, 2], :, :] = wmat[:, [2, 0], :, :] - - assert(wmat.flags['C_CONTIGUOUS'] is True) - assert(bias.flags['C_CONTIGUOUS'] is True) - print('converting layer {0}, wmat shape = {1}, bias shape = {2}'.format(layer_name, wmat.shape, bias.shape)) - wmat = wmat.reshape((wmat.shape[0], -1)) - bias = bias.reshape((bias.shape[0], 1)) - weight_name = layer_name + "_weight" - bias_name = layer_name + "_bias" - - if weight_name not in arg_shape_dic: - print(weight_name + ' not found in arg_shape_dic.') - continue - wmat = wmat.reshape(arg_shape_dic[weight_name]) - arg_params[weight_name] = mx.nd.zeros(wmat.shape) - arg_params[weight_name][:] = wmat - - bias = bias.reshape(arg_shape_dic[bias_name]) - arg_params[bias_name] = mx.nd.zeros(bias.shape) - arg_params[bias_name][:] = bias - - if first_conv and (layer_type == 'Convolution' or layer_type == 4): - first_conv = False - - return arg_params - -proto_path = os.path.join(fast_rcnn_path, 'models', 'VGG16', 'test.prototxt') -model_path = os.path.join(fast_rcnn_path, 'data', 'fast_rcnn_models', 'vgg16_fast_rcnn_iter_40000.caffemodel') - -symbol = get_symbol_vgg_test() -arg_shapes, out_shapes, aux_shapes = symbol.infer_shape(**{'data': (1, 3, 224, 224), 'rois': (1, 5)}) -arg_shape_dic = { name: shape for name, shape in zip(symbol.list_arguments(), arg_shapes) } - 
-arg_params = load_model(proto_path, model_path, arg_shape_dic) - -model = mx.model.FeedForward(ctx=mx.cpu(), symbol=symbol, arg_params=arg_params, - aux_params={}, num_epoch=1, - learning_rate=0.01, momentum=0.9, wd=0.0001) -model.save('model/ref') diff --git a/example/rcnn/rcnn/utils/load_data.py b/example/rcnn/rcnn/utils/load_data.py index d56882a5c9d8..4700229e65af 100644 --- a/example/rcnn/rcnn/utils/load_data.py +++ b/example/rcnn/rcnn/utils/load_data.py @@ -1,5 +1,5 @@ -from __future__ import print_function import numpy as np +from ..logger import logger from ..config import config from ..dataset import * @@ -47,6 +47,6 @@ def is_valid(entry): num = len(roidb) filtered_roidb = [entry for entry in roidb if is_valid(entry)] num_after = len(filtered_roidb) - print('filtered %d roidb entries: %d -> %d' % (num - num_after, num, num_after)) + logger.info('load data: filtered %d roidb entries: %d -> %d' % (num - num_after, num, num_after)) return filtered_roidb diff --git a/example/rcnn/test.py b/example/rcnn/test.py index 708efc8c7ddb..12fe6973fbcf 100644 --- a/example/rcnn/test.py +++ b/example/rcnn/test.py @@ -1,6 +1,6 @@ -from __future__ import print_function import argparse import mxnet as mx +from rcnn.logger import logger from rcnn.config import config, default, generate_config from rcnn.tools.test_rcnn import test_rcnn @@ -31,8 +31,8 @@ def parse_args(): def main(): args = parse_args() + logger.info('Called with argument: %s' % args) ctx = mx.gpu(args.gpu) - print(args) test_rcnn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, ctx, args.prefix, args.epoch, args.vis, args.shuffle, args.has_rpn, args.proposal, args.thresh) diff --git a/example/rcnn/train_alternate.py b/example/rcnn/train_alternate.py index 991fb237d085..74f16b9980aa 100644 --- a/example/rcnn/train_alternate.py +++ b/example/rcnn/train_alternate.py @@ -1,9 +1,7 @@ -from __future__ import print_function import argparse -import logging - import mxnet as mx +from 
rcnn.logger import logger from rcnn.config import config, default, generate_config from rcnn.tools.train_rpn import train_rpn from rcnn.tools.test_rpn import test_rpn @@ -14,41 +12,36 @@ def alternate_train(args, ctx, pretrained, epoch, rpn_epoch, rpn_lr, rpn_lr_step, rcnn_epoch, rcnn_lr, rcnn_lr_step): - # set up logger - logging.basicConfig() - logger = logging.getLogger() - logger.setLevel(logging.INFO) - # basic config begin_epoch = 0 config.TRAIN.BG_THRESH_LO = 0.0 - logging.info('########## TRAIN RPN WITH IMAGENET INIT') + logger.info('########## TRAIN RPN WITH IMAGENET INIT') train_rpn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, args.frequent, args.kvstore, args.work_load_list, args.no_flip, args.no_shuffle, args.resume, ctx, pretrained, epoch, 'model/rpn1', begin_epoch, rpn_epoch, train_shared=False, lr=rpn_lr, lr_step=rpn_lr_step) - logging.info('########## GENERATE RPN DETECTION') + logger.info('########## GENERATE RPN DETECTION') image_sets = [iset for iset in args.image_set.split('+')] for image_set in image_sets: test_rpn(args.network, args.dataset, image_set, args.root_path, args.dataset_path, ctx[0], 'model/rpn1', rpn_epoch, vis=False, shuffle=False, thresh=0) - logging.info('########## TRAIN RCNN WITH IMAGENET INIT AND RPN DETECTION') + logger.info('########## TRAIN RCNN WITH IMAGENET INIT AND RPN DETECTION') train_rcnn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, args.frequent, args.kvstore, args.work_load_list, args.no_flip, args.no_shuffle, args.resume, ctx, pretrained, epoch, 'model/rcnn1', begin_epoch, rcnn_epoch, train_shared=False, lr=rcnn_lr, lr_step=rcnn_lr_step, proposal='rpn') - logging.info('########## TRAIN RPN WITH RCNN INIT') + logger.info('########## TRAIN RPN WITH RCNN INIT') train_rpn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, args.frequent, args.kvstore, args.work_load_list, args.no_flip, args.no_shuffle, args.resume, ctx, 
'model/rcnn1', rcnn_epoch, 'model/rpn2', begin_epoch, rpn_epoch, train_shared=True, lr=rpn_lr, lr_step=rpn_lr_step) - logging.info('########## GENERATE RPN DETECTION') + logger.info('########## GENERATE RPN DETECTION') image_sets = [iset for iset in args.image_set.split('+')] for image_set in image_sets: test_rpn(args.network, args.dataset, image_set, args.root_path, args.dataset_path, @@ -101,7 +94,7 @@ def parse_args(): def main(): args = parse_args() - print('Called with argument:', args) + logger.info('Called with argument: %s' % args) ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')] alternate_train(args, ctx, args.pretrained, args.pretrained_epoch, args.rpn_epoch, args.rpn_lr, args.rpn_lr_step, diff --git a/example/rcnn/train_end2end.py b/example/rcnn/train_end2end.py index ac00120131c9..b8b1c5c3a410 100644 --- a/example/rcnn/train_end2end.py +++ b/example/rcnn/train_end2end.py @@ -1,10 +1,9 @@ -from __future__ import print_function import argparse -import logging import pprint import mxnet as mx import numpy as np +from rcnn.logger import logger from rcnn.config import config, default, generate_config from rcnn.symbol import * from rcnn.core import callback, metric @@ -16,11 +15,6 @@ def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr=0.001, lr_step='5'): - # set up logger - logging.basicConfig() - logger = logging.getLogger() - logger.setLevel(logging.INFO) - # setup config config.TRAIN.BATCH_IMAGES = 1 config.TRAIN.BATCH_ROIS = 128 @@ -36,7 +30,7 @@ def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size # print config - pprint.pprint(config) + logger.info(pprint.pformat(config)) # load dataset and prepare imdb for training image_sets = [iset for iset in args.image_set.split('+')] @@ -56,7 +50,7 @@ def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, max_data_shape = [('data', (input_batch_size, 3, max([v[0] for v in 
config.SCALES]), max([v[1] for v in config.SCALES])))] max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) max_data_shape.append(('gt_boxes', (input_batch_size, 100, 5))) - print('providing maximum shape', max_data_shape, max_label_shape) + logger.info('providing maximum shape %s %s' % (max_data_shape, max_label_shape)) # infer shape data_shape_dict = dict(train_data.provide_data + train_data.provide_label) @@ -64,8 +58,7 @@ def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) out_shape_dict = dict(zip(sym.list_outputs(), out_shape)) aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) - print('output shape') - pprint.pprint(out_shape_dict) + logger.info('output shape %s' % pprint.pformat(out_shape_dict)) # load and initialize params if args.resume: @@ -127,7 +120,7 @@ def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] - print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters) + logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters)) lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor) # optimizer optimizer_params = {'momentum': 0.9, @@ -176,7 +169,7 @@ def parse_args(): def main(): args = parse_args() - print('Called with argument:', args) + logger.info('Called with argument: %s' % args) ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')] train_net(args, ctx, args.pretrained, args.pretrained_epoch, args.prefix, args.begin_epoch, args.end_epoch, lr=args.lr, lr_step=args.lr_step) diff --git a/src/operator/contrib/proposal-inl.h b/src/operator/contrib/proposal-inl.h index ed0ec826588f..686a8a354ff9 100644 --- 
a/src/operator/contrib/proposal-inl.h +++ b/src/operator/contrib/proposal-inl.h @@ -267,7 +267,7 @@ inline void _Transform(float scale, float ratio, const std::vector& base_anchor, std::vector *out_anchors) { - float w = base_anchor[2] - base_anchor[1] + 1.0f; + float w = base_anchor[2] - base_anchor[0] + 1.0f; float h = base_anchor[3] - base_anchor[1] + 1.0f; float x_ctr = base_anchor[0] + 0.5 * (w - 1.0f); float y_ctr = base_anchor[1] + 0.5 * (h - 1.0f);